R/segmentationCBS.R
ea58a451
 segmentationCBS <-
 structure(function(# CBS segmentation
 ### The default segmentation function. This function is called via the 
 ### fun.segmentation argument of runAbsoluteCN. The arguments are passed
 ### via args.segmentation.
 normal, 
 ### GATK coverage file for normal sample.
 tumor,  
 ### GATK coverage file for tumor sample.
 log.ratio, 
 ### Copy number log-ratios, one for each exon in coverage file.
 plot.cnv, 
 ### Segmentation plots.
 coverage.cutoff, 
 ### Minimum coverage in both normal and tumor.
 sampleid=sampleid,
 ### Sample id, used in output files.
 exon.weight.file=NULL,
 ### Can be used to assign weights to exons.
 alpha=0.005,
 ### Alpha value for CBS, see documentation for the segment function.
 vcf=NULL,
 ### Optional VCF object with germline allelic ratios.
 tumor.id.in.vcf=1,
 ### Id of tumor in case multiple samples are stored in VCF.
 verbose=TRUE,
 ### Verbose output.
 ...
 ### Additional parameters passed to a modified version of CNV.analyze function
 ### of the ExomeCNV package.
 ) {
     exon.weights <- NULL
     if (!is.null(exon.weight.file)) {
         exon.weights <- read.delim(exon.weight.file, as.is=TRUE)
         exon.weights <- exon.weights[match(as.character(tumor[,1]), exon.weights[,1]),2]
         if (verbose) message("Exon weights found, will use weighted CBS.")
     }
     x <- .CNV.analyze2(normal, tumor, logR=log.ratio, plot.cnv=plot.cnv, coverage.cutoff=coverage.cutoff, sampleid=sampleid, alpha=alpha, weights=exon.weights, verbose=verbose,...) 
     if (!is.null(vcf)) {
         x <- .pruneByVCF(x, vcf, tumor.id.in.vcf)
     }
     idx.enough.markers <- x$cna$output$num.mark > 1
     ##value<< A list with elements
     xx <- list(
         seg=x$cna$output[idx.enough.markers,], ##<< The segmentation.
         size=x$cnv$size[idx.enough.markers]  ##<< The size of all segments (in base pairs).
     )
 ##end<<
 },ex=function() {
 gatk.normal.file <- system.file("extdata", "example_normal.txt", package="PureCN")
 gatk.tumor.file <- system.file("extdata", "example_tumor.txt", package="PureCN")
 vcf.file <- system.file("extdata", "example_vcf.vcf", package="PureCN")
 gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt", package="PureCN")
 
 # speed-up the runAbsoluteCN by using the stored grid-search.
 # (purecn.example.output$candidates).
 data(purecn.example.output)
 
 # The max.candidate.solutions is set to a very low value only to speed-up this example.
 # This is not a good idea for real samples.
 ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, gatk.tumor.file=gatk.tumor.file, 
    vcf.file=vcf.file, sampleid='Sample1', gc.gene.file=gc.gene.file, 
    candidates=purecn.example.output$candidates, max.candidate.solutions=2,
    fun.segmentation=segmentationCBS, args.segmentation=list(alpha=0.001))
 })    
 
 # looks at breakpoints, and if p-value is higher than max.pval, merge unless there is evidence based
 # on germline SNPs
 .pruneByVCF <- function(x, vcf, tumor.id.in.vcf, min.size=5, max.pval=0.00001, iterations=3, debug=FALSE) {
     seg <- segments.p(x$cna)
     for (iter in 1:iterations) {
44f86eab
         seg.gr <- GRanges(seqnames=.add.chr.name(seg$chrom), IRanges(start=seg$loc.start, end=seg$loc.end))
ea58a451
         ov <- findOverlaps(seg.gr, vcf)
         ar <- sapply(geno(vcf)$FA[,tumor.id.in.vcf], function(x) x[1])
         ar.r <- ifelse(ar>0.5, 1-ar, ar)
         merged <- rep(FALSE, nrow(seg))
         for (i in 2:nrow(seg)) {
             # don't try to merge chromosomes or very significant breakpoints
             if (is.na(seg$pval[i-1]) || seg$pval[i-1]<max.pval) next
             # don't merge when we have no germline data for segments    
             if (!(i %in% queryHits(ov) && (i-1) %in% queryHits(ov))) next
             ar.i <- list(ar.r[subjectHits(ov)][queryHits(ov)==i],ar.r[subjectHits(ov)][queryHits(ov)==i-1])
             if (length(ar.i[[1]]) < min.size || length(ar.i[[2]]) < min.size) next
             if (merged[i-1]) next
             
             p.t <- t.test(ar.i[[1]], ar.i[[2]])$p.value
             if (p.t>0.2) {
                 merged[i] <- TRUE
                 x$cna$output$seg.mean[i-1] <- weighted.mean(c(seg$seg.mean[i],seg$seg.mean[i-1]),w=c(seg$num.mark[i],seg$num.mark[i-1]))
                 x$cnv$size[i-1] <- x$cnv$size[i]+x$cnv$size[i-1]
                 x$cna$output$num.mark[i-1] <- seg$num.mark[i]+seg$num.mark[i-1]
                 x$cna$output$loc.end[i-1] <- seg$loc.end[i]
                 seg$pval[i-1] <- seg$pval[i]
             }
             if (debug) message(paste(i, "LR diff:", abs(seg$seg.mean[i]-seg$seg.mean[i-1]), "Size: ", seg$num.mark[i-1], "PV:", p.t, "PV bp:",seg$pval[i-1] , "Merged:", merged[i],"\n", sep=" "))
         }
         x$cna$output <- x$cna$output[!merged,]
         x$cnv <- x$cnv[!merged,]
         seg <- seg[!merged,]
     }
     x
 }
     
 
 # ExomeCNV version without the x11() calls 
 .CNV.analyze2 <-
 function(normal, tumor, logR=NULL, coverage.cutoff=15, normal.chrs=c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY"), normal.chr=normal.chrs, c=0.5, write.file=FALSE, file=NULL, weights=NULL, doDNAcopy=TRUE, sdundo=0.5, undo.splits="sdundo", smooth=TRUE, alpha=0.01, sampleid=NULL, plot.cnv=TRUE, verbose=TRUE) {
     `%+%` <- function(x,y) paste(x,y,sep="")
     normal.chrs = intersect(levels(normal$chr), normal.chrs)
 
     # first, do it for exons with enough coverage. MR: added less stringent cutoff in case normal looks great. these could be homozygous deletions in high purity samples
     well.covered.exon.idx = ((normal$average.coverage > coverage.cutoff) & (tumor$average.coverage > coverage.cutoff)) | ((normal$average.coverage > 1.5 * coverage.cutoff) &  (tumor$average.coverage > 0.5 * coverage.cutoff))
     #MR: fix for missing chrX/Y 
     well.covered.exon.idx[is.na(well.covered.exon.idx)] <- FALSE
 
     if (verbose) message(paste("Removing", sum(!well.covered.exon.idx), "low coverage exons."))
     if (is.null(logR)) norm.log.ratio = .calcLogRatio(normal, tumor, verbose)
     else norm.log.ratio = logR
 
     if (doDNAcopy) {
 
         CNA.obj = CNA(norm.log.ratio[well.covered.exon.idx], .strip.chr.name(normal$chr[well.covered.exon.idx]), (normal$probe_start[well.covered.exon.idx] + normal$probe_end[well.covered.exon.idx])/2, data.type="logratio", sampleid=sampleid)
         smoothed.CNA.obj = if (smooth) smooth.CNA(CNA.obj) else CNA.obj
         if (!is.null(weights)) { 
             weights <- weights[well.covered.exon.idx]
             # MR: this shouldn't happen. In doubt, count them as maximum (assuming that poorly performing exons are down-weighted)
             weights[is.na(weights)] <- max(weights, na.rm=TRUE)
             segment.smoothed.CNA.obj = segment(smoothed.CNA.obj, undo.splits=undo.splits, undo.SD=sdundo, verbose=ifelse(verbose, 1, 0), alpha=alpha,weights=weights)
         } else {
             segment.smoothed.CNA.obj = segment(smoothed.CNA.obj, undo.splits=undo.splits, undo.SD=sdundo, verbose=ifelse(verbose, 1, 0), alpha=alpha)
         }        
 
         if (plot.cnv) {
             if (write.file && !is.null(file)) png(file, width=2000, height=1000, units="px")
             else if (write.file) png("CNV detection for exons with > " %+% coverage.cutoff %+% " coverage.png", width=2000, height=1000, units="px")
             plot(segment.smoothed.CNA.obj, plot.type="s")
             if (write.file) dev.off()
 
             if (write.file && !is.null(file)) png("allchr." %+% file, width=2000, height=1000, units="px")
             else if (write.file) png("all chromosome CNV detection for exons with > " %+% coverage.cutoff %+% " coverage.png", width=2000, height=1000, units="px") 
             plot(segment.smoothed.CNA.obj, plot.type="w")
             abline(h=log2(c + (1-c)*c(1,3,4,5)/2), col="purple")
             if (write.file) dev.off()
         }
 
         cnv = .get.proper.cnv.positions(normal[well.covered.exon.idx,], print(segment.smoothed.CNA.obj))
 
         return(list(cnv=cnv, cna=segment.smoothed.CNA.obj, logR=norm.log.ratio))
 
     } else {
 
         logR.mean = mean(norm.log.ratio[well.covered.exon.idx])
         logR.sd = sd(norm.log.ratio[well.covered.exon.idx])
         logR.min = min(norm.log.ratio[well.covered.exon.idx])
         logR.max = max(norm.log.ratio[well.covered.exon.idx])
 
         if (plot.cnv) {
             if (write.file && !is.null(file)) png(file, width=2000, height=1000, units="px")
             else if (write.file) png("CNV detection for exons with > " %+% coverage.cutoff %+% " coverage.noDNAcopy.png", width=2000, height=1000, units="px")
             par(mfrow=c(4,6))
             for (chr in levels(normal$chr)) {
                 plot((normal$probe_start[well.covered.exon.idx & normal$chr==chr] + normal$probe_end[well.covered.exon.idx & normal$chr==chr])/2, norm.log.ratio[well.covered.exon.idx & normal$chr==chr], pch="*", pc=20, ylim=c(logR.min, logR.max), main=chr, xlab="position", ylab="log ratio")
                 abline(h=logR.mean + logR.sd, col="red")
                 abline(h=logR.mean - logR.sd, col="red")
                 abline(h=0, col="gray")
             }
             if (write.file) dev.off()
         }
 
         return(list(logR=norm.log.ratio))
 
     }
 }
 
 .get.proper.cnv.positions <- function (exons, cnv) 
 {
     chr.hash <- NULL    
     data(chr.hash, envir=environment())
     `%+%` <- function(x, y) paste(x, y, sep = "")
     order.by.chr = order(.strip.chr.name(exons$chr))
     exons = exons[order.by.chr, ]
     cnv$chr = as.character(chr.hash[cnv$chrom, "chr"])
     cnv$probe = "cnv" %+% as.character(1:nrow(cnv))
     end.idx = cumsum(cnv$num.mark)
     start.idx = c(1, 1 + end.idx[-length(end.idx)])
     cnv$probe_start = exons$probe_start[start.idx]
     cnv$probe_end = exons$probe_end[end.idx]
     cnv$size = cnv$probe_end - cnv$probe_start + 1
     sum.chunk = function(i, colName) {
         sum(exons[start.idx[i]:end.idx[i], colName])
     }
     cnv$targeted.base = sapply(1:nrow(cnv), sum.chunk, colName = "targeted.base")
     cnv$sequenced.base = sapply(1:nrow(cnv), sum.chunk, colName = "sequenced.base")
     cnv$coverage = sapply(1:nrow(cnv), sum.chunk, colName = "coverage")
     cnv$average.coverage = cnv$coverage/cnv$targeted.base
     cnv$base.with..10.coverage = sapply(1:nrow(cnv), sum.chunk, 
         colName = "base.with..10.coverage")
     return(cnv)
 }