R/createNormalDatabase.R
7d29a0be
 #' Create database of normal samples
08db496a
 #'
ec6f11d9
 #' Function to create a database of normal samples, used to normalize
 #' tumor coverages.
08db496a
 #'
 #'
 #' @param normal.coverage.files Vector with file names pointing to
7d29a0be
 #' coverage files of normal samples.
 #' @param sex \code{character(length(normal.coverage.files))} with sex for all
 #' files.  \code{F} for female, \code{M} for male. If all chromosomes are
 #' diploid, specify \code{diploid}. If \code{NULL}, determine from coverage.
8a972c87
 #' @param coverage.outliers Exclude samples with coverages below or above
 #' the specified cutoffs (fractions of the normal sample coverages median).
 #' Only for databases with more than 5 samples.
08db496a
 #' @param min.coverage Exclude intervals with coverage lower than
ec6f11d9
 #' the specified fraction of the chromosome median in the pool of normals.
0208ae29
 #' @param max.missing Exclude intervals with zero coverage in the
ec6f11d9
 #' specified fraction of normal samples.
08db496a
 #' @param low.coverage Specifies the maximum number of total reads
23cc6c14
 #' (NOT average coverage) to call a target low coverage.
da2cf7d5
 #' @param optimal.off.target.counts Used to suggest an optimal off-target
08db496a
 #' interval width (BETA).
fe78821a
 #' @param plot Diagnostics plot, useful to tune parameters.
7d29a0be
 #' @param \dots Arguments passed to the \code{prcomp} function.
 #' @return A normal database that can be used in the
ec6f11d9
 #' \code{\link{calculateTangentNormal}} function to retrieve a coverage
 #' normalization sample for a given tumor sample.
7d29a0be
 #' @author Markus Riester
ec6f11d9
 #' @seealso \code{\link{calculateTangentNormal}}
7d29a0be
 #' @examples
08db496a
 #'
4a7f4816
 #' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz",
08db496a
 #'     package = "PureCN")
4a7f4816
 #' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz",
08db496a
 #'     package = "PureCN")
7d29a0be
 #' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file)
 #' normalDB <- createNormalDatabase(normal.coverage.files)
 #' 
 #' @export createNormalDatabase
ec6f11d9
 #' @importFrom Matrix tcrossprod
7d29a0be
 createNormalDatabase <- function(normal.coverage.files, sex = NULL,
fe78821a
 coverage.outliers = c(0.25, 4), min.coverage = 0.25,
da2cf7d5
 max.missing = 0.03, low.coverage = 15,
 optimal.off.target.counts = 120, plot = FALSE, ...) {
6f30d9f0
     normal.coverage.files <- normalizePath(normal.coverage.files)
bf20092f
 
     if (any(duplicated(normal.coverage.files))) {
         flog.warn("Some normal.coverage.files duplicated.")
         normal.coverage.files <- unique(normal.coverage.files)
     }
     if (length(normal.coverage.files) < 2) {
         .stopUserError("At least 2 normal.coverage.files required.")
     }
1d2bdaac
     if (is(normal.coverage.files, "character") &&
         any(grepl("_coverage.txt", normal.coverage.files)) &&
         any(grepl("_loess.txt", normal.coverage.files))) {
         .stopUserError("normal.coverage.files with _coverage.txt and _loess.txt suffix provided. Provide either only GC-normalized or raw coverage files!")
     }
bf20092f
 
51e99c42
     normals <- .readNormals(normal.coverage.files)
08db496a
 
da2cf7d5
     if (!all(normals[[1]]$on.target)) {
         ot_w <- median(sapply(lapply(normals, function(x) width(x)[!x$on.target]), median, na.rm = TRUE))
         ot_c <- median(sapply(lapply(normals, function(x) x$counts[!x$on.target]), median, na.rm = TRUE))
c439e822
         if (ot_w < 5000) {
fd59ff58
             flog.warn("Small median off-target width (%.0f). Double check that this is correct.",
67f45934
                 ot_w)
c439e822
         }
         if (ot_c < 1) {
             flog.warn("Not enough off-target counts. Suggest re-running IntervalFile.R without --off-target.")
         } else {    
             optimal_width <- round(optimal.off.target.counts / ot_c * ot_w / 100000, digits = 1) * 100000
             flog.info("Recommended minimum off-target width is %i compared to %i currently available.",
                 round(optimal_width), round(ot_w))
fd59ff58
             if (optimal_width > 350000) {
                 flog.warn("Large minimum off-target width, your assay might not provide sufficient off-target reads to make including them useful.")
             }
c439e822
         }
da2cf7d5
     }
2f2306ec
 
08db496a
     normals.m <- do.call(cbind,
ec6f11d9
         lapply(normals, function(x) x$counts))
8a972c87
 
08db496a
     low.coverage.targets <- .warnLowCoverageTargets(normals.m, normals[[1]],
553bb0aa
         low.coverage)
ec6f11d9
     normals.m[is.na(normals.m)] <- 0
 
08db496a
     z <- apply(normals.m, 2, mean)
8a972c87
     idx.failed <- rep(FALSE, length(normals))
39254458
 
8a972c87
     if (length(normals) > 5) {
08db496a
         idx.failed <- z < median(z) * coverage.outliers[1] |
8a972c87
                       z > median(z) * coverage.outliers[2]
08db496a
         if (sum(idx.failed)) {
             flog.info("Dropping %s due to outlier coverage.",
                 paste(basename(normal.coverage.files[idx.failed]), collapse = ", "))
8a972c87
         }
         normals <- normals[!idx.failed]
08db496a
         normals.m <- normals.m[, !idx.failed, drop = FALSE]
8a972c87
         normal.coverage.files <- normal.coverage.files[!idx.failed]
     }
08db496a
     sex.determined <- sapply(normals, getSexFromCoverage)
aef10b2d
     if (is.null(sex)) {
         sex <- sex.determined
e769c0fc
     } else {
8a972c87
         if (length(sex) != length(idx.failed)) {
6f30d9f0
             .stopUserError("Length of normal.coverage.files and sex different")
8a972c87
         }
         sex <- sex[!idx.failed]
2142e4c1
         idx.sex <- sex %in% c(NA, "F", "M", "diploid")
aef10b2d
         sex[!idx.sex] <- NA
08db496a
         if (sum(!idx.sex) > 0) warning("Unexpected values in sex ignored.")
aef10b2d
         for (i in seq_along(sex.determined)) {
2142e4c1
             if (!is.na(sex.determined[i]) && sex[i] != "diploid" &&
aef10b2d
                 sex.determined[i] != sex[i]) {
08db496a
                 flog.warn("Sex mismatch in %s. Sex provided is %s, but could be %s.",
4d22136e
                     normal.coverage.files[i], sex[i], sex.determined[i])
08db496a
             }
         }
aef10b2d
     }
ec6f11d9
 
553bb0aa
 
ec6f11d9
     groups <- lapply(c(TRUE, FALSE), function(on.target) {
         idx <- normals[[1]]$on.target == on.target
         intervals <- normals[[1]][idx]
         if (length(intervals)) {
08db496a
             flog.info("Processing %s-target regions...",
                 ifelse(on.target, "on", "off"))
         }
         .standardizeNormals(normals.m[idx, ], normals[[1]][idx], min.coverage,
ec6f11d9
             max.missing, sex)
     })
08db496a
 
     # merge back some on-target and off-target interval statistics
ec6f11d9
     intervals.used <- logical(length(normals[[1]]))
     fraction.missing <- double(length(normals[[1]]))
     intervals.used[normals[[1]]$on.target] <- groups[[1]]$intervals.used
     fraction.missing[normals[[1]]$on.target] <- groups[[1]]$fraction.missing
 
     if (!is.null(groups[[2]]$intervals.used)) {
         intervals.used[!normals[[1]]$on.target] <- groups[[2]]$intervals.used
         fraction.missing[!normals[[1]]$on.target] <- groups[[2]]$fraction.missing
     }
08db496a
    
fe78821a
     normalDB <- list(
2603eb7b
         normal.coverage.files = normal.coverage.files,
         intervals = as.character(normals[[1]]),
553bb0aa
         groups = groups,
         intervals.used = intervals.used,
         sex = sex,
         low.coverage.targets = low.coverage.targets,
f6ae256d
         version = 8
ea58a451
     )
f6ae256d
     normalDB <- .calculateIntervalWeights(normalDB, normals, plot = plot)
fe78821a
     normalDB
7d29a0be
 }
 
553bb0aa
 .warnLowCoverageTargets <- function(counts, intervals, low.coverage) {
     all.low <- apply(counts, 1, max) <= low.coverage
     low.intervals <- intervals[which(all.low & intervals$on.target)]
     mcols(low.intervals) <- NULL
     n.low <- length(low.intervals)
     if (n.low > 0) {
         flog.info("%i on-target bins with low coverage in all samples.", n.low)
     }
     if (n.low > sum(intervals$on.target, na.rm = TRUE) * 0.05) {
         flog.warn("You are likely not using the correct baits file!")
     }
08db496a
     low.intervals
553bb0aa
 }
      
c439e822
 .standardizeNormals <- function(counts, intervals, min.coverage, max.missing, sex, return.counts = FALSE) {
ec6f11d9
     if (!length(intervals)) return(list(present = FALSE))
     # recalculate without dropped samples
     fcnts <- apply(counts, 2, function(x) x/sum(x))
     fcnts_interval_medians <- apply(fcnts, 1, median, na.rm=TRUE)
     fraction.missing <- apply(counts, 1, function(x)
                     sum(is.na(x)|x<=0))/ncol(counts)
 
0208ae29
     intervals.used <- .filterIntervalsCreateNormalDB(intervals, 
ec6f11d9
         fcnts_interval_medians, fraction.missing, min.coverage, max.missing) 
ea58a451
 
ec6f11d9
     fcnts <- apply(counts[intervals.used,], 2, function(x) x/sum(x))
     fcnts_interval_medians <- apply(fcnts, 1, median)
26aa6f47
     fcnts_interval_medians_F <- fcnts_interval_medians
     fcnts_interval_medians_M <- fcnts_interval_medians
     
     # do we need a sex-aware database?
     if ("F" %in% sex && "M" %in% sex) {
31ed7540
         fcnts_interval_medians_F <- apply(fcnts[, which(sex=="F"), drop = FALSE], 1, median)
         fcnts_interval_medians_M <- apply(fcnts[, which(sex=="M"), drop = FALSE], 1, median)
26aa6f47
         sex.chr <- .getSexChr(seqlevels(intervals))
         idx <- as.character(seqnames(intervals)[intervals.used]) %in% sex.chr
         fcnts_interval_medians_F[!idx] <- fcnts_interval_medians[!idx]
         fcnts_interval_medians_M[!idx] <- fcnts_interval_medians[!idx]
         fcnts_std <- fcnts
         for (i in seq(ncol(fcnts))) {
31ed7540
             if (is.null(sex[i]) || is.na(sex[i])) sex[i] <- "?"
26aa6f47
             iv <- switch(sex[i],
                 "F"=fcnts_interval_medians_F,
                 "M"=fcnts_interval_medians_M,
                 fcnts_interval_medians)
 
             fcnts_std[, i] <- fcnts_std[,i] / iv
         }
c439e822
     } else {
bb3203dc
         fcnts_std <- apply(fcnts, 2, function(x) x / fcnts_interval_medians)
26aa6f47
     }
bb3203dc
     fcnts_interval_non_zero_medians <- apply(fcnts_std, 1, function(x) median(x[x > 0]))
     fcnts_std_imp <- apply(fcnts_std, 2, function(x) { x[x <= 0] <- fcnts_interval_non_zero_medians[x <= 0]; x})
     p <- 0.001
     li <- quantile(as.vector(fcnts_std_imp), probs = c(p, 1 - p))
ec6f11d9
     fcnts_std_trunc <- fcnts_std_imp
c439e822
     fcnts_std_trunc[fcnts_std_imp < li[1]] <- li[1]
     fcnts_std_trunc[fcnts_std_imp > li[2]] <- li[2]
     fcnts_std_final <- apply(fcnts_std_trunc, 2, function(x) log2(x / median(x)))
bb3203dc
     fcnts_std_final <- fcnts_std_final - median(apply(fcnts_std_final, 2, median))
653a7a79
     s <- svd(fcnts_std_final)
ec6f11d9
 
c439e822
     ret <- list(
653a7a79
         projection = s$u,
         projection.v = s$v,
26aa6f47
         intervals.used = intervals.used,
         interval.median.coverage = list(all = fcnts_interval_medians,
                                         F = fcnts_interval_medians_F,        
                                         M = fcnts_interval_medians_M),
         fraction.missing = fraction.missing,
08db496a
         dist.t = dist(t(fcnts_std_final)),
c439e822
         present = TRUE
ec6f11d9
     )
c439e822
     if (return.counts) {
         ret$counts <- fcnts_std_final
     }
     return(ret)
ec6f11d9
 }
 
26aa6f47
 .denoiseSample <- function(x, normalDB, num.eigen, sex) {
ec6f11d9
     fcnts <- x$counts[normalDB$intervals.used]
     fcnts <- fcnts/sum(fcnts, na.rm=TRUE)
31ed7540
     if (is.null(sex) || is.na(sex)) sex <- "?"
26aa6f47
     iv <- switch(sex,
         "F"=normalDB$interval.median.coverage$F,
         "M"=normalDB$interval.median.coverage$M,
         normalDB$interval.median.coverage$all)
 
     fcnts_std <- fcnts/iv
dcf0f557
     # impute intervals which have no data in sample but in database
     # otherwise the projection will return only NA
6a01f5d7
     idxMissing <- which(fcnts_std <= 0 | is.na(fcnts_std))
     fcnts_std[idxMissing] <- 1e-9
ec6f11d9
 
     fcnts_std_final <- log2(fcnts_std/median(fcnts_std, na.rm = TRUE))
     x$log.ratio.std <- 0.
     x$log.ratio.std[!normalDB$intervals.used] <- NA
     x$log.ratio <- x$log.ratio.std
     x$log.ratio.std[normalDB$intervals.used] <- fcnts_std_final
     if (num.eigen > ncol(normalDB$projection)) num.eigen <- ncol(normalDB$projection)
     P <- normalDB$projection[,seq(1,num.eigen)]    
     x$log.ratio[normalDB$intervals.used] <- fcnts_std_final - 
         as.vector(tcrossprod(fcnts_std_final %*% P, P))
     x
 }
 
 #' Calculate tangent normal
 #' 
 #' Reimplementation of GATK4 denoising. Please cite the relevant GATK
 #' publication if you use this in a publication.
 #' 
 #' 
 #' @param tumor.coverage.file Coverage file or data  of a tumor sample.
 #' @param normalDB Database of normal samples, created with
 #' \code{\link{createNormalDatabase}}.
 #' @param num.eigen Number of eigen vectors used.
 #' @param ignore.sex If \code{FALSE}, detects sex of sample and returns best
 #' normals with matching sex.
 #' @param sex Sex of sample. If \code{NULL}, determine with
 #' \code{\link{getSexFromCoverage}} and default parameters. Valid values are
 #' \code{F} for female, \code{M} for male. If all chromosomes are diploid,
 #' specify \code{diploid}.
 #' @seealso \code{\link{createNormalDatabase}}
 #' @author Markus Riester
 #' @examples
 #'
4a7f4816
 #' tumor.coverage.file <- system.file('extdata', 'example_tumor.txt.gz', 
 #'     package = 'PureCN')
 #' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", 
 #'     package = "PureCN")
 #' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", 
 #'     package = "PureCN")
ec6f11d9
 #' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file)
 #' normalDB <- createNormalDatabase(normal.coverage.files)
 #' pool <- calculateTangentNormal(tumor.coverage.file, normalDB)
 #' 
 #' @export calculateTangentNormal
 calculateTangentNormal <- function(tumor.coverage.file, normalDB, 
                                    num.eigen = 20, ignore.sex = FALSE, 
                                    sex = NULL) {
 
     if (is.character(tumor.coverage.file)) {
         tumor  <- readCoverageFile(tumor.coverage.file)
     } else {
         tumor <- tumor.coverage.file
     }
a312ed63
     
     if (!.checkNormalDB(tumor, normalDB)) {
ec6f11d9
         .stopUserError("tumor.coverage.file and normalDB do not align.")
     }
 
26aa6f47
     if (!ignore.sex && !is.null(normalDB$sex) && 
         sum(!is.na(normalDB$sex))>0) {
         if (is.null(sex)) {
             sex <- getSexFromCoverage(tumor)
         }
         flog.info("Sample sex: %s", sex)
     }
ec6f11d9
     tumor$log.ratio <- 0.
     tumor$log.ratio.std <- 0.
26aa6f47
     denoised <- .denoiseSample(tumor[tumor$on.target], normalDB$groups[[1]], num.eigen, sex)
ec6f11d9
     ov <- findOverlaps(tumor, denoised)
     tumor$log.ratio[queryHits(ov)] <- denoised$log.ratio[subjectHits(ov)]
     tumor$log.ratio.std[queryHits(ov)] <- denoised$log.ratio.std[subjectHits(ov)]
 
     if (normalDB$groups[[2]]$present) {
26aa6f47
         denoised <- .denoiseSample(tumor[!tumor$on.target], normalDB$groups[[2]], num.eigen, sex)
ec6f11d9
         ov <- findOverlaps(tumor, denoised)
         tumor$log.ratio[queryHits(ov)] <- denoised$log.ratio[subjectHits(ov)]
         tumor$log.ratio.std[queryHits(ov)] <- denoised$log.ratio.std[subjectHits(ov)]
     }
         
     fakeNormal <- tumor
108e255d
     idxNA <- is.na(fakeNormal$log.ratio)
     fakeNormal$average.coverage[!idxNA] <- (2 ^ (log2(tumor$average.coverage) - tumor$log.ratio))[!idxNA]
ec6f11d9
     fakeNormal$coverage <- fakeNormal$average.coverage * width(fakeNormal)
     fakeNormal
 }
     
51e99c42
 .readNormals <- function(normal.coverage.files) {
167ea8ab
     normals <- lapply(normal.coverage.files, function(x) {
         if (is(x, "character")) {
             return(readCoverageFile(normalizePath(x)))
         }
         return(x)
     })
51e99c42
 
     # check that all files used the same interval file.
     for (i in seq_along(normals)) {
44326645
         if (!identical(as.character(normals[[i]]), as.character(normals[[1]]))) {
51e99c42
             .stopUserError("All normal.coverage.files must have the same ",
                 "intervals. ", normal.coverage.files[i], " is different.")
         }
     }
     normals
 }
ec6f11d9
 
0208ae29
 .filterIntervalsCreateNormalDB <- function(intervals, 
26aa6f47
 interval.median.coverage, fraction.missing,
ec6f11d9
 min.coverage, max.missing) {
     
     nBefore <- length(intervals)
     intervals.used <- is.finite(fraction.missing) 
2e76b10e
 
     key <- paste(as.character(seqnames(intervals)), intervals$on.target)
     min.coverage <- (sapply(split(interval.median.coverage, 
0208ae29
             key), median, na.rm=TRUE) * min.coverage)[key]
2e76b10e
 
26aa6f47
     intervals.used <- intervals.used & !is.na(interval.median.coverage) & 
2e76b10e
         interval.median.coverage >= min.coverage
ec6f11d9
         
     nAfter <- sum(intervals.used)
 
     if (nAfter < nBefore) {
76a87bd3
         flog.info("Removing %i intervals with low coverage in normalDB.", 
ec6f11d9
             nBefore-nAfter)
     }
 
     nBefore <- sum(intervals.used)
     intervals.used <- intervals.used & !is.na(fraction.missing) &
         fraction.missing <= max.missing
     nAfter <- sum(intervals.used)
 
     if (nAfter < nBefore) {
76a87bd3
         flog.info("Removing %i intervals with zero coverage in more than %.0f%% of normalDB.", 
ec6f11d9
             nBefore-nAfter, max.missing*100)
     }
 
4010268a
     if (nAfter < 2) {
         .stopUserError("No intervals left after coverage checks.")
     }
 
ec6f11d9
     intervals.used
 }
b5957a16
 
 .calculateIntervalWeights <- function(normalDB, normal.coverage, 
6e428672
     top.quantile = 0.7, plot = FALSE) {
b5957a16
     
     tumor.coverage <- list(poolCoverage(normal.coverage, 
         w = rep(1, length(normal.coverage)) / length(normal.coverage)))
 
6e428672
     lrs <- lapply(normal.coverage, function(x) calculateTangentNormal(x, normalDB)$log.ratio)
b5957a16
     lrs <- do.call(cbind, lrs)
 
     lrs[is.infinite(lrs)] <- NA
 
     intervals <- normal.coverage[[1]]
     mcols(intervals) <- NULL
 
     lrs.sd <- apply(lrs, 1, sd, na.rm = TRUE)
     lrs.cnt.na <- apply(lrs, 1, function(x) sum(is.na(x)))
     # get the top.quantile % of sd by chromosome and use this to normalize weight=1
     chrom <-  as.character(seqnames(intervals))
     sdCutoffByChr <- sapply(split(lrs.sd, chrom), quantile, probs = top.quantile, 
         names = FALSE, na.rm = TRUE)[chrom]
 
     zz <- sdCutoffByChr / lrs.sd
     zz[zz > 1] <- 1
     idx <- is.na(zz) | lrs.cnt.na > ncol(lrs) / 3
     zz[idx] <- min(zz, na.rm = TRUE)
 
     ret <- list(
                 log.ratios = GRanges(intervals,,, DataFrame(lrs)),
                 weights = GRanges(intervals,,, DataFrame(weights = zz)))
     
     if (plot) .plotIntervalWeights(lrs.sd, width(tumor.coverage[[1]]), 
         tumor.coverage[[1]]$on.target)
     normalDB$sd <- ret
     normalDB
 }
 
 .plotIntervalWeights <- function(lrs.sd, width, on.target) {
     par(mfrow = c(1, 2))
     plot(width[on.target], lrs.sd[on.target], ylim = c(0,2),
         xlab = "Interval Width", ylab = "log2 ratio sd.", main = "On-Target")
     if (sum(!on.target)) {
         plot(width[!on.target], lrs.sd[!on.target], col = "red", 
              ylim = c(0, 2), xlab = "Interval Width", ylab = "log2 ratio sd.",
              main = "Off-Target")
     }
 }