segmentationCBS <- structure(function(# CBS segmentation ### The default segmentation function. This function is called via the ### fun.segmentation argument of runAbsoluteCN. The arguments are passed ### via args.segmentation. normal, ### GATK coverage file for normal sample. tumor, ### GATK coverage file for tumor sample. log.ratio, ### Copy number log-ratios, one for each exon in coverage file. plot.cnv, ### Segmentation plots. coverage.cutoff, ### Minimum coverage in both normal and tumor. sampleid=sampleid, ### Sample id, used in output files. exon.weight.file=NULL, ### Can be used to assign weights to exons. alpha=0.005, ### Alpha value for CBS, see documentation for the segment function. vcf=NULL, ### Optional VCF object with germline allelic ratios. tumor.id.in.vcf=1, ### Id of tumor in case multiple samples are stored in VCF. verbose=TRUE, ### Verbose output. ... ### Additional parameters passed to a modified version of CNV.analyze function ### of the ExomeCNV package. ) { exon.weights <- NULL if (!is.null(exon.weight.file)) { exon.weights <- read.delim(exon.weight.file, as.is=TRUE) exon.weights <- exon.weights[match(as.character(tumor[,1]), exon.weights[,1]),2] if (verbose) message("Exon weights found, will use weighted CBS.") } x <- .CNV.analyze2(normal, tumor, logR=log.ratio, plot.cnv=plot.cnv, coverage.cutoff=coverage.cutoff, sampleid=sampleid, alpha=alpha, weights=exon.weights, verbose=verbose,...) if (!is.null(vcf)) { x <- .pruneByVCF(x, vcf, tumor.id.in.vcf) } idx.enough.markers <- x$cna$output$num.mark > 1 ##value<< A list with elements xx <- list( seg=x$cna$output[idx.enough.markers,], ##<< The segmentation. size=x$cnv$size[idx.enough.markers] ##<< The size of all segments (in base pairs). ) ##end<< },ex=function() { gatk.normal.file <- system.file("extdata", "example_normal.txt", package="PureCN") gatk.tumor.file <- system.file("extdata", "example_tumor.txt", package="PureCN") vcf.file <- system.file("extdata", "example_vcf.vcf", package="PureCN") gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt", package="PureCN") # speed-up the runAbsoluteCN by using the stored grid-search. # (purecn.example.output$candidates). data(purecn.example.output) # The max.candidate.solutions is set to a very low value only to speed-up this example. # This is not a good idea for real samples. ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, gatk.tumor.file=gatk.tumor.file, vcf.file=vcf.file, sampleid='Sample1', gc.gene.file=gc.gene.file, candidates=purecn.example.output$candidates, max.candidate.solutions=2, fun.segmentation=segmentationCBS, args.segmentation=list(alpha=0.001)) }) # looks at breakpoints, and if p-value is higher than max.pval, merge unless there is evidence based # on germline SNPs .pruneByVCF <- function(x, vcf, tumor.id.in.vcf, min.size=5, max.pval=0.00001, iterations=3, debug=FALSE) { seg <- segments.p(x$cna) for (iter in 1:iterations) { seg.gr <- GRanges(seqnames=.add.chr.name(seg$chrom), IRanges(start=seg$loc.start, end=seg$loc.end)) ov <- findOverlaps(seg.gr, vcf) ar <- sapply(geno(vcf)$FA[,tumor.id.in.vcf], function(x) x[1]) ar.r <- ifelse(ar>0.5, 1-ar, ar) merged <- rep(FALSE, nrow(seg)) for (i in 2:nrow(seg)) { # don't try to merge chromosomes or very significant breakpoints if (is.na(seg$pval[i-1]) || seg$pval[i-1]<max.pval) next # don't merge when we have no germline data for segments if (!(i %in% queryHits(ov) && (i-1) %in% queryHits(ov))) next ar.i <- list(ar.r[subjectHits(ov)][queryHits(ov)==i],ar.r[subjectHits(ov)][queryHits(ov)==i-1]) if (length(ar.i[[1]]) < min.size || length(ar.i[[2]]) < min.size) next if (merged[i-1]) next p.t <- t.test(ar.i[[1]], ar.i[[2]])$p.value if (p.t>0.2) { merged[i] <- TRUE x$cna$output$seg.mean[i-1] <- weighted.mean(c(seg$seg.mean[i],seg$seg.mean[i-1]),w=c(seg$num.mark[i],seg$num.mark[i-1])) x$cnv$size[i-1] <- x$cnv$size[i]+x$cnv$size[i-1] x$cna$output$num.mark[i-1] <- seg$num.mark[i]+seg$num.mark[i-1] x$cna$output$loc.end[i-1] <- seg$loc.end[i] seg$pval[i-1] <- seg$pval[i] } if (debug) message(paste(i, "LR diff:", abs(seg$seg.mean[i]-seg$seg.mean[i-1]), "Size: ", seg$num.mark[i-1], "PV:", p.t, "PV bp:",seg$pval[i-1] , "Merged:", merged[i],"\n", sep=" ")) } x$cna$output <- x$cna$output[!merged,] x$cnv <- x$cnv[!merged,] seg <- seg[!merged,] } x } # ExomeCNV version without the x11() calls .CNV.analyze2 <- function(normal, tumor, logR=NULL, coverage.cutoff=15, normal.chrs=c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY"), normal.chr=normal.chrs, c=0.5, write.file=FALSE, file=NULL, weights=NULL, doDNAcopy=TRUE, sdundo=0.5, undo.splits="sdundo", smooth=TRUE, alpha=0.01, sampleid=NULL, plot.cnv=TRUE, verbose=TRUE) { `%+%` <- function(x,y) paste(x,y,sep="") normal.chrs = intersect(levels(normal$chr), normal.chrs) # first, do it for exons with enough coverage. MR: added less stringent cutoff in case normal looks great. these could be homozygous deletions in high purity samples well.covered.exon.idx = ((normal$average.coverage > coverage.cutoff) & (tumor$average.coverage > coverage.cutoff)) | ((normal$average.coverage > 1.5 * coverage.cutoff) & (tumor$average.coverage > 0.5 * coverage.cutoff)) #MR: fix for missing chrX/Y well.covered.exon.idx[is.na(well.covered.exon.idx)] <- FALSE if (verbose) message(paste("Removing", sum(!well.covered.exon.idx), "low coverage exons.")) if (is.null(logR)) norm.log.ratio = .calcLogRatio(normal, tumor, verbose) else norm.log.ratio = logR if (doDNAcopy) { CNA.obj = CNA(norm.log.ratio[well.covered.exon.idx], .strip.chr.name(normal$chr[well.covered.exon.idx]), (normal$probe_start[well.covered.exon.idx] + normal$probe_end[well.covered.exon.idx])/2, data.type="logratio", sampleid=sampleid) smoothed.CNA.obj = if (smooth) smooth.CNA(CNA.obj) else CNA.obj if (!is.null(weights)) { weights <- weights[well.covered.exon.idx] # MR: this shouldn't happen. In doubt, count them as maximum (assuming that poorly performing exons are down-weighted) weights[is.na(weights)] <- max(weights, na.rm=TRUE) segment.smoothed.CNA.obj = segment(smoothed.CNA.obj, undo.splits=undo.splits, undo.SD=sdundo, verbose=ifelse(verbose, 1, 0), alpha=alpha,weights=weights) } else { segment.smoothed.CNA.obj = segment(smoothed.CNA.obj, undo.splits=undo.splits, undo.SD=sdundo, verbose=ifelse(verbose, 1, 0), alpha=alpha) } if (plot.cnv) { if (write.file && !is.null(file)) png(file, width=2000, height=1000, units="px") else if (write.file) png("CNV detection for exons with > " %+% coverage.cutoff %+% " coverage.png", width=2000, height=1000, units="px") plot(segment.smoothed.CNA.obj, plot.type="s") if (write.file) dev.off() if (write.file && !is.null(file)) png("allchr." %+% file, width=2000, height=1000, units="px") else if (write.file) png("all chromosome CNV detection for exons with > " %+% coverage.cutoff %+% " coverage.png", width=2000, height=1000, units="px") plot(segment.smoothed.CNA.obj, plot.type="w") abline(h=log2(c + (1-c)*c(1,3,4,5)/2), col="purple") if (write.file) dev.off() } cnv = .get.proper.cnv.positions(normal[well.covered.exon.idx,], print(segment.smoothed.CNA.obj)) return(list(cnv=cnv, cna=segment.smoothed.CNA.obj, logR=norm.log.ratio)) } else { logR.mean = mean(norm.log.ratio[well.covered.exon.idx]) logR.sd = sd(norm.log.ratio[well.covered.exon.idx]) logR.min = min(norm.log.ratio[well.covered.exon.idx]) logR.max = max(norm.log.ratio[well.covered.exon.idx]) if (plot.cnv) { if (write.file && !is.null(file)) png(file, width=2000, height=1000, units="px") else if (write.file) png("CNV detection for exons with > " %+% coverage.cutoff %+% " coverage.noDNAcopy.png", width=2000, height=1000, units="px") par(mfrow=c(4,6)) for (chr in levels(normal$chr)) { plot((normal$probe_start[well.covered.exon.idx & normal$chr==chr] + normal$probe_end[well.covered.exon.idx & normal$chr==chr])/2, norm.log.ratio[well.covered.exon.idx & normal$chr==chr], pch="*", pc=20, ylim=c(logR.min, logR.max), main=chr, xlab="position", ylab="log ratio") abline(h=logR.mean + logR.sd, col="red") abline(h=logR.mean - logR.sd, col="red") abline(h=0, col="gray") } if (write.file) dev.off() } return(list(logR=norm.log.ratio)) } } .get.proper.cnv.positions <- function (exons, cnv) { chr.hash <- NULL data(chr.hash, envir=environment()) `%+%` <- function(x, y) paste(x, y, sep = "") order.by.chr = order(.strip.chr.name(exons$chr)) exons = exons[order.by.chr, ] cnv$chr = as.character(chr.hash[cnv$chrom, "chr"]) cnv$probe = "cnv" %+% as.character(1:nrow(cnv)) end.idx = cumsum(cnv$num.mark) start.idx = c(1, 1 + end.idx[-length(end.idx)]) cnv$probe_start = exons$probe_start[start.idx] cnv$probe_end = exons$probe_end[end.idx] cnv$size = cnv$probe_end - cnv$probe_start + 1 sum.chunk = function(i, colName) { sum(exons[start.idx[i]:end.idx[i], colName]) } cnv$targeted.base = sapply(1:nrow(cnv), sum.chunk, colName = "targeted.base") cnv$sequenced.base = sapply(1:nrow(cnv), sum.chunk, colName = "sequenced.base") cnv$coverage = sapply(1:nrow(cnv), sum.chunk, colName = "coverage") cnv$average.coverage = cnv$coverage/cnv$targeted.base cnv$base.with..10.coverage = sapply(1:nrow(cnv), sum.chunk, colName = "base.with..10.coverage") return(cnv) }