R/processMultipleSamples.R
0fda3814
 #' Multi sample normalization and segmentation
 #' 
 #' This function performs normalization and segmentation when multiple 
 #' for the same patient are available. 
 #' 
 #' 
 #' @param tumor.coverage.files Coverage data for tumor samples.
 #' @param sampleids Sample ids, used in output files.
 #' @param normalDB Database of normal samples, created with
 #' \code{\link{createNormalDatabase}}.
 #' @param num.eigen Number of eigen vectors used.
82c73cb5
 #' @param genome Genome version, for example hg19. Needed to get centromere
 #' positions.
0fda3814
 #' @param plot.cnv Segmentation plots.
fe78821a
 #' @param interval.weight.file Deprecated. For \code{normalDB} objects generated
 #' with PureCN versions older than 1.16, re-run \code{\link{createNormalDatabase}}.
82c73cb5
 #' @param min.interval.weight Can be used to ignore intervals with low weights.
33250502
 #' @param w Weight of samples. Can be used to downweight poor quality samples.
 #' If \code{NULL}, sets to inverse of median on-target duplication rate if
 #' available, otherwise does not do any weighting.
0fda3814
 #' @param max.segments If not \code{NULL}, try a higher \code{undo.SD}
 #' parameter if number of segments exceeds the threshold.
 #' @param chr.hash Mapping of non-numerical chromsome names to numerical names
 #' (e.g. chr1 to 1, chr2 to 2, etc.). If \code{NULL}, assume chromsomes are
 #' properly ordered.
 #' @param centromeres A \code{GRanges} object with centromere positions.
 #' @param ... Arguments passed to the segmentation function.
 #' @return \code{data.frame} containing the segmentation.
 #' @author Markus Riester
82c73cb5
 #' @references Nilsen G., Liestol K., Van Loo P., Vollan H., Eide M., Rueda O.,
 #' Chin S., Russell R., Baumbusch L., Caldas C., Borresen-Dale A., 
 #' Lingjaerde O. (2012). "Copynumber: Efficient algorithms for single- and
 #' multi-track copy number segmentation." BMC Genomics, 13(1), 591.
0fda3814
 #'
 #' @seealso \code{\link{runAbsoluteCN}}
 #' @examples
 #' 
82c73cb5
 #' normal1.coverage.file <- system.file("extdata", "example_normal.txt", 
0fda3814
 #'     package="PureCN")
82c73cb5
 #' normal2.coverage.file <- system.file("extdata", "example_normal2.txt", 
0fda3814
 #'     package="PureCN")
82c73cb5
 #' tumor1.coverage.file <- system.file("extdata", "example_tumor.txt",
0fda3814
 #'     package="PureCN")
82c73cb5
 #' tumor2.coverage.file <- system.file("extdata", "example_tumor2.txt",
0fda3814
 #'     package="PureCN")
82c73cb5
 #'
 #' normal.coverage.files <- c(normal1.coverage.file, normal2.coverage.file)
 #' tumor.coverage.files <- c(tumor1.coverage.file, tumor2.coverage.file)
 #'
 #' normalDB <- createNormalDatabase(normal.coverage.files)
 #'
 #' seg <- processMultipleSamples(tumor.coverage.files, 
 #'          sampleids = c("Sample1", "Sample2"),
 #'          normalDB = normalDB,
 #'          genome = "hg19")
 #'
0fda3814
 #' @export processMultipleSamples
 processMultipleSamples <- function(tumor.coverage.files, sampleids, normalDB, 
ce934788
     num.eigen = 20, genome, plot.cnv = TRUE, w = NULL,
82c73cb5
     interval.weight.file = NULL, min.interval.weight = 1/3,
     max.segments = NULL, chr.hash = NULL, centromeres = NULL, ...) {
0fda3814
 
     if (!requireNamespace("copynumber", quietly = TRUE)) {
         .stopUserError("processMultipleSamples requires the copynumber package.")
     }
     tumor.coverage.files <- normalizePath(tumor.coverage.files)
     tumors <- lapply(.readNormals(tumor.coverage.files), 
         calculateTangentNormal, normalDB, num.eigen = num.eigen)
 
     interval.weights <- NULL
     intervalsUsed <- rep(TRUE, length(tumors[[1]]))
fe78821a
     # TODO defunct in 1.18
     if (!is.null(interval.weight.file) && 
         !is.null(min.interval.weight)) {
         normalDB <- .add_weights_to_normaldb(interval.weight.file, normalDB)
     }    
     if (!is.null(normalDB$sd$weights) && !is.null(min.interval.weight)) {
        interval.weights <- normalDB$sd$weights$weights
        if (length(interval.weights) != length(tumors[[1]])) {
             # should not happen because it is checked upstream
             .stopRuntimeError("interval.weights and tumors does not align.")
        }    
        flog.info("Interval weights found, but currently not supported by copynumber. %s",
0fda3814
             "Will simply exclude intervals with low weight.")
fe78821a
        lowWeightIntervals <- interval.weights < min.interval.weight
        intervalsUsed[which(lowWeightIntervals)] <- FALSE
0fda3814
     }
     #MR: fix for missing chrX/Y 
     intervalsUsed[is.na(intervalsUsed)] <- FALSE
     if (is.null(chr.hash)) chr.hash <- .getChrHash(seqlevels(tumors[[1]]))
     intervalsUsed <- .filterIntervalsChrHash(intervalsUsed, tumors[[1]], chr.hash)
     centromeres <- .getCentromerePositions(centromeres, genome, 
             if (is.null(tumors[[1]])) NULL else seqlevelsStyle(tumors[[1]]))
 
     armLocations <- .getArmLocations(tumors[[1]], chr.hash, centromeres)
     armLocationsGr <- GRanges(armLocations)
     arms <- armLocationsGr$arm[findOverlaps(tumors[[1]], armLocationsGr, select="first")]
     lrs <- data.frame(do.call(cbind, lapply(tumors, function(x) unlist(x$log.ratio))))
     colnames(lrs) <- sampleids
     lrs <- data.frame(chrom = .strip.chr.name(seqnames(tumors[[1]]), chr.hash), 
                       pos = start(tumors[[1]]), lrs)
82c73cb5
     # ignore arms with only 2 or fewer probes
     carms <- paste0(lrs[,1], arms)
     intervalsUsed <- as.logical(intervalsUsed & complete.cases(lrs) 
                                 & !is.na(arms)) & table(carms)[carms] > 2
0fda3814
 
     lrs <- lrs[intervalsUsed,]
     arms <- arms[intervalsUsed]
82c73cb5
     lrsw <- copynumber::winsorize(lrs, arms = arms, verbose = FALSE)
33250502
     if (is.null(w)) {
         w <- 1
bd374dcb
         dupr <- vapply(tumors, function(x) 
                        median(x[x$on.target]$duplication.rate, na.rm = TRUE),
                        double(1))
33844cbb
         if (!sum(is.na(dupr)) && min(dupr, na.rm = TRUE) > 0) { 
bd374dcb
             w <- (1 / dupr)
             w <- w / max(w)
33250502
 
             flog.info("Setting weights by duplication rate. Lowest weight for %s (%.2f), heighest for %s.",
                 sampleids[which.min(w)], min(w), sampleids[which.max(w)])
         }
     }
     lrsm <- copynumber::multipcf(lrsw, arms = arms, w = w, ...)
0fda3814
     if (plot.cnv) {
         copynumber::plotHeatmap(segments = lrsm, upper.lim = 1)
82c73cb5
         copynumber::plotGenome(lrsw, segments = lrsm, onefile = TRUE)
0fda3814
     }
     idx.enough.markers <- lrsm$n.probes > 1
     rownames(lrsm) <- NULL
     lrsm[idx.enough.markers,]
     #transform to DNAcopy format
67a695ed
     m <- data.table::melt(data.table(lrsm), id.vars=1:5)
0fda3814
     m <- m[, c(6,1,3,4,5,7)]
     colnames(m) <- c("ID", "chrom", "loc.start", "loc.end", "num.mark", "seg.mean")
67a695ed
     data.frame(m)
0fda3814
 }
 
fe78821a
 .add_weights_to_normaldb <- function(interval.weight.file, normalDB = NULL) {
     flog.warn("interval.weight.file is deprecated.")
     if (!is.null(normalDB$sd$weights)) return(normalDB)
         
     interval.weights <- read.delim(interval.weight.file, as.is = TRUE)
     if (!is.null(normalDB)) {
         interval.weights <- interval.weights[match(normalDB$intervals, 
             interval.weights[,1]),]
     }    
     weights <- GRanges(interval.weights[,1],,, DataFrame(weights = interval.weights[,2]))
     if (is.null(normalDB)) normalDB <- list()
     normalDB$sd$weights <- weights
     normalDB
 }