Bioconductor Code: PureCN

Raw Blame Patch Log History
#' PSCBS segmentation
#' 
#' Alternative segmentation function using the \code{PSCBS} package.  This
#' function is called via the \code{fun.segmentation} argument of
#' \code{\link{runAbsoluteCN}}.  The arguments are passed via
#' \code{args.segmentation}.
#' 
#' 
#' @param normal GATK coverage data for normal sample.
#' @param tumor GATK coverage data for tumor sample.
#' @param log.ratio Copy number log-ratios, one for each exon in coverage file.
#' @param seg If segmentation was provided by the user, this data structure
#' will contain this segmentation. Useful for minimal segmentation functions.
#' Otherwise PureCN will re-segment the data. This segmentation function
#' ignores this user provided segmentation.
#' @param plot.cnv Segmentation plots.
#' @param min.coverage Minimum coverage in both normal and tumor.
#' @param sampleid Sample id, used in output files.
#' @param target.weight.file Can be used to assign weights to targets. NOT
#' SUPPORTED YET in segmentation. Will remove targets with weight below 1/3.
#' @param alpha Alpha value for CBS, see documentation for the \code{segment}
#' function.
#' @param undo.SD \code{undo.SD} for CBS, see documentation of the
#' \code{segment} function. If \code{NULL}, try to find a sensible default.
#' @param flavor Flavor value for PSCBS. See \code{segmentByNonPairedPSCBS}.
#' @param tauA tauA argument for PSCBS. See \code{segmentByNonPairedPSCBS}.
#' @param vcf Optional VCF object with germline allelic ratios.
#' @param tumor.id.in.vcf Id of tumor in case multiple samples are stored in
#' VCF.
#' @param normal.id.in.vcf Id of normal in in VCF. If \code{NULL}, 
#' use unpaired PSCBS.
#' @param max.segments If not \code{NULL}, try a higher \code{undo.SD} 
#' parameter if number of segments exceeds the threshold.
#' @param prune.hclust.h Height in the \code{hclust} pruning step. Increasing
#' this value will merge segments more aggressively. If \code{NULL}, try to 
#' find a sensible default.
#' @param prune.hclust.method Cluster method used in the \code{hclust} pruning
#' step. See documentation for the \code{hclust} function.
#' @param chr.hash Mapping of non-numerical chromsome names to numerical names
#' (e.g. chr1 to 1, chr2 to 2, etc.). If \code{NULL}, assume chromsomes are 
#' properly ordered.
#' @param centromeres A \code{data.frame} with centromere positions in first
#' three columns.  If not \code{NULL}, add breakpoints at centromeres. 
#' @param \dots Additional parameters passed to the 
#' \code{segmentByNonPairedPSCBS} function.
#' @return \code{data.frame} containing the segmentation.
#' @author Markus Riester
#' @references Olshen, A. B., Venkatraman, E. S., Lucito, R., Wigler, M. 
#' (2004). Circular binary segmentation for the analysis of array-based DNA 
#' copy number data. Biostatistics 5: 557-572.
#'
#' Venkatraman, E. S., Olshen, A. B. (2007). A faster circular binary 
#' segmentation algorithm for the analysis of array CGH data. Bioinformatics 
#' 23: 657-63.
#'
#' Olshen et al. (2011). Parent-specific copy number in paired tumor-normal 
#' studies using circular binary segmentation. Bioinformatics.
#' @seealso \code{\link{runAbsoluteCN}}
#' @examples
#' 
#' normal.coverage.file <- system.file("extdata", "example_normal.txt", 
#'     package="PureCN")
#' tumor.coverage.file <- system.file("extdata", "example_tumor.txt", 
#'     package="PureCN")
#' vcf.file <- system.file("extdata", "example_vcf.vcf", 
#'     package="PureCN")
#' gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt", 
#'     package="PureCN")
#' 
#' # The max.candidate.solutions, max.ploidy and test.purity parameters are set to
#' # non-default values to speed-up this example.  This is not a good idea for real
#' # samples.
#'  ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file, 
#'      tumor.coverage.file=tumor.coverage.file, vcf.file=vcf.file, genome="hg19",
#'      sampleid="Sample1", gc.gene.file=gc.gene.file,
#'      fun.segmentation=segmentationPSCBS, max.ploidy=4,
#'      test.purity=seq(0.3,0.7,by=0.05), max.candidate.solutions=1)
#' 
#' @export segmentationPSCBS
segmentationPSCBS <- function(normal, tumor, log.ratio, seg, plot.cnv, 
    min.coverage, sampleid, target.weight.file = NULL, alpha = 0.005, undo.SD =
    NULL, flavor = "tcn&dh", tauA = 0.03, vcf = NULL,
    tumor.id.in.vcf = 1, normal.id.in.vcf = NULL, max.segments = NULL,
    prune.hclust.h = NULL, prune.hclust.method = "ward.D", chr.hash = NULL,
    centromeres = NULL, ...) {

    debug <- TRUE
        
    if (!requireNamespace("PSCBS", quietly = TRUE)) {
        .stopUserError("segmentationPSCBS requires the PSCBS package.")
    }

    if (is.null(chr.hash)) chr.hash <- .getChrHash(tumor$chr)

    well.covered.exon.idx <- ((normal$average.coverage > min.coverage) &
        (tumor$average.coverage > min.coverage)) | 
        ((normal$average.coverage > 1.5 * min.coverage) &  
        (tumor$average.coverage > 0.5 * min.coverage))

    target.weights <- NULL
    if (!is.null(target.weight.file)) {
        target.weights <- read.delim(target.weight.file, as.is=TRUE)
        target.weights <- target.weights[match(as.character(tumor[,1]), 
            target.weights[,1]),2]
         flog.info("Target weights found, but currently not supported by PSCBS. %s",
            "Will simply exclude targets with low weight.")
        lowWeightTargets <- target.weights < 1/3
        well.covered.exon.idx[which(lowWeightTargets)] <- FALSE
    }

    #MR: fix for missing chrX/Y 
    well.covered.exon.idx[is.na(well.covered.exon.idx)] <- FALSE
    tumor <- tumor[well.covered.exon.idx,]
    log.ratio <- log.ratio[well.covered.exon.idx]
    exon.gr <- GRanges(seqnames=tumor$chr, 
        IRanges(start=tumor$probe_start, end=tumor$probe_end))
    ov <- findOverlaps(vcf, exon.gr)
    d.f <- cbind(tumor[subjectHits(ov),], 
        CT=2^(log.ratio+1)[subjectHits(ov)], 
        betaT=unlist(geno(vcf[queryHits(ov)])$FA[,tumor.id.in.vcf]), 
        betaN=NA,
        x=start(vcf[queryHits(ov)]) )
    
    if (!is.null(normal.id.in.vcf)) {
        d.f$betaN <- unlist(geno(vcf[queryHits(ov)])$FA[,normal.id.in.vcf])
    }
         
    d.f.2 <- cbind(tumor[-subjectHits(ov),], 
        CT=2^(log.ratio+1)[-subjectHits(ov)], betaT=NA, betaN=NA,
        x=tumor$probe_start[-subjectHits(ov)] )
    
    d.f.3 <- rbind(d.f, d.f.2)
    d.f.3 <- d.f.3[order(.strip.chr.name(d.f.3$chr, chr.hash), d.f.3$x),]
    d.f <- d.f.3
    colnames(d.f)[2] <- "chromosome"
    d.f$chromosome <- .strip.chr.name(d.f$chromosome, chr.hash)
    #if (!is.null(normal.id.in.vcf)) {
    #    seg <- PSCBS::segmentByPairedPSCBS(d.f, tauA=tauA, 
    #        flavor=flavor, ...)
    #} else {
    if (is.null(undo.SD)) {
        undo.SD <- .getSDundo(log.ratio)
        flog.info("Setting undo.SD parameter to %f.", undo.SD)
    }   
    knownSegments <- NULL
    if (!is.null(centromeres)) {
        knownSegments <- centromeres
        colnames(knownSegments) <- c("chromosome", "start", "end")
        knownSegments$length <- knownSegments$end-knownSegments$start+1
        knownSegments$chromosome <- .strip.chr.name(knownSegments$chromosome,
            chr.hash)
        knownSegments <- PSCBS::gapsToSegments(knownSegments)
    }    
    seg <- PSCBS::segmentByNonPairedPSCBS(d.f, tauA=tauA, 
        flavor=flavor, undoTCN=undo.SD, knownSegments=knownSegments, 
        min.width=3,alphaTCN=alpha, ...)
    #}    
    if (plot.cnv) PSCBS::plotTracks(seg)
    x <- .PSCBSoutput2DNAcopy(seg, sampleid)

    if (!is.null(vcf)) {
        x <- .pruneByHclust(x, vcf, tumor.id.in.vcf, h=prune.hclust.h, 
            method=prune.hclust.method, chr.hash=chr.hash)
    }
    x$cna$output
}

.PSCBSoutput2DNAcopy <- function(seg, sampleid) {
    sx <- cbind(ID=sampleid, seg$output[!is.na(seg$output$tcnMean),])
    sx <- sx[,c("ID", "chromosome", "tcnStart", "tcnEnd", "tcnNbrOfLoci", 
        "tcnMean")]
    colnames(sx) <- c("ID", "chrom", "loc.start",  "loc.end", "num.mark", 
        "seg.mean")
    sx$seg.mean <- log2(sx$seg.mean/2)
    list(cna=list(output=sx))
}