#' Create database of normal samples
#'
#' Function to create a database of normal samples, used to normalize
#' tumor coverages.
#'
#'
#' @param normal.coverage.files Vector with file names pointing to
#' coverage files of normal samples.
#' @param sex \code{character(length(normal.coverage.files))} with sex for all
#' files. \code{F} for female, \code{M} for male. If all chromosomes are
#' diploid, specify \code{diploid}. If \code{NULL}, determine from coverage.
#' @param coverage.outliers Exclude samples with coverages below or above
#' the specified cutoffs (fractions of the median coverage across all normal
#' samples). Only applied to databases with more than 5 samples.
#' @param min.coverage Exclude intervals with coverage lower than
#' the specified fraction of the chromosome median in the pool of normals.
#' @param max.missing Exclude intervals with zero coverage in more than the
#' specified fraction of normal samples.
#' @param low.coverage Maximum number of total reads (NOT average coverage)
#' below which a target is flagged as low coverage.
#' @param optimal.off.target.counts Used to suggest an optimal off-target
#' interval width (BETA).
#' @param plot Create a diagnostic plot, useful for tuning parameters.
#' @param \dots Additional arguments, currently unused.
#' @return A normal database that can be used in the
#' \code{\link{calculateTangentNormal}} function to retrieve a coverage
#' normalization sample for a given tumor sample.
#' @author Markus Riester
#' @seealso \code{\link{calculateTangentNormal}}
#' @examples
#'
#' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz",
#'     package = "PureCN")
#' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz",
#'     package = "PureCN")
#' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file)
#' normalDB <- createNormalDatabase(normal.coverage.files)
#'
#' @export createNormalDatabase
#' @importFrom Matrix tcrossprod
createNormalDatabase <- function(normal.coverage.files, sex = NULL,
    coverage.outliers = c(0.25, 4), min.coverage = 0.25, max.missing = 0.03,
    low.coverage = 15, optimal.off.target.counts = 120, plot = FALSE, ...) {
    normal.coverage.files <- normalizePath(normal.coverage.files)
    if (any(duplicated(normal.coverage.files))) {
        flog.warn("Some normal.coverage.files duplicated.")
        normal.coverage.files <- unique(normal.coverage.files)
    }
    if (length(normal.coverage.files) < 2) {
        .stopUserError("At least 2 normal.coverage.files required.")
    }
    if (is(normal.coverage.files, "character") &&
        any(grepl("_coverage.txt", normal.coverage.files)) &&
        any(grepl("_loess.txt", normal.coverage.files))) {
        .stopUserError("normal.coverage.files with _coverage.txt and ",
            "_loess.txt suffix provided. Provide either only GC-normalized ",
            "or raw coverage files!")
    }
    normals <- .readNormals(normal.coverage.files)
    if (!all(normals[[1]]$on.target)) {
        # sanity check the off-target intervals: median width and read counts
        ot_w <- median(sapply(lapply(normals,
            function(x) width(x)[!x$on.target]), median, na.rm = TRUE))
        ot_c <- median(sapply(lapply(normals,
            function(x) x$counts[!x$on.target]), median, na.rm = TRUE))
        if (ot_w < 5000) {
            flog.warn("Small median off-target width (%.0f). Double check that this is correct.", ot_w)
        }
        if (ot_c < 1) {
            flog.warn("Not enough off-target counts. Suggest re-running IntervalFile.R without --off-target.")
        } else {
            optimal_width <- round(optimal.off.target.counts / ot_c * ot_w /
                100000, digits = 1) * 100000
            flog.info("Recommended minimum off-target width is %.0f compared to %.0f currently available.",
                optimal_width, ot_w)
            if (optimal_width > 350000) {
                flog.warn("Large minimum off-target width, your assay might not provide sufficient off-target reads to make including them useful.")
            }
        }
    }
    normals.m <- do.call(cbind, lapply(normals, function(x) x$counts))
    low.coverage.targets <- .warnLowCoverageTargets(normals.m, normals[[1]],
        low.coverage)
    normals.m[is.na(normals.m)] <- 0
    z <- apply(normals.m, 2, mean)
    idx.failed <- rep(FALSE, length(normals))
    if (length(normals) > 5) {
        # flag samples with outlier mean coverage relative to the cohort median
        idx.failed <- z < median(z) * coverage.outliers[1] |
            z > median(z) * coverage.outliers[2]
        if (sum(idx.failed)) {
            flog.info("Dropping %s due to outlier coverage.",
                paste(basename(normal.coverage.files[idx.failed]),
                    collapse = ", "))
        }
        normals <- normals[!idx.failed]
        normals.m <- normals.m[, !idx.failed, drop = FALSE]
        normal.coverage.files <- normal.coverage.files[!idx.failed]
    }
    sex.determined <- sapply(normals, getSexFromCoverage)
    if (is.null(sex)) {
        sex <- sex.determined
    } else {
        if (length(sex) != length(idx.failed)) {
            .stopUserError("Length of normal.coverage.files and sex different")
        }
        sex <- sex[!idx.failed]
        idx.sex <- sex %in% c(NA, "F", "M", "diploid")
        sex[!idx.sex] <- NA
        if (sum(!idx.sex) > 0) warning("Unexpected values in sex ignored.")
        for (i in seq_along(sex.determined)) {
            if (!is.na(sex.determined[i]) && !is.na(sex[i]) &&
                sex[i] != "diploid" && sex.determined[i] != sex[i]) {
                flog.warn("Sex mismatch in %s. Sex provided is %s, but could be %s.",
                    normal.coverage.files[i], sex[i], sex.determined[i])
            }
        }
    }
    groups <- lapply(c(TRUE, FALSE), function(on.target) {
        idx <- normals[[1]]$on.target == on.target
        intervals <- normals[[1]][idx]
        if (length(intervals)) {
            flog.info("Processing %s-target regions...",
                ifelse(on.target, "on", "off"))
        }
        .standardizeNormals(normals.m[idx, ], normals[[1]][idx],
            min.coverage, max.missing, sex)
    })
    # merge back some on-target and off-target interval statistics
    intervals.used <- logical(length(normals[[1]]))
    fraction.missing <- double(length(normals[[1]]))
    intervals.used[normals[[1]]$on.target] <- groups[[1]]$intervals.used
    fraction.missing[normals[[1]]$on.target] <- groups[[1]]$fraction.missing
    if (!is.null(groups[[2]]$intervals.used)) {
        intervals.used[!normals[[1]]$on.target] <- groups[[2]]$intervals.used
        fraction.missing[!normals[[1]]$on.target] <- groups[[2]]$fraction.missing
    }
    normalDB <- list(
        normal.coverage.files = normal.coverage.files,
        intervals = as.character(normals[[1]]),
        groups = groups,
        intervals.used = intervals.used,
        sex = sex,
        low.coverage.targets = low.coverage.targets,
        version = 8
    )
    normalDB <- .calculateIntervalWeights(normalDB, normals, plot = plot)
    normalDB
}
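# A minimal usage sketch (guarded so it never runs at package load) of the
# sex argument and the QC fields assembled above. The "diploid" values are
# an assumption for illustration, not a statement about the example data.
if (FALSE) {
    normal.coverage.files <- c(
        system.file("extdata", "example_normal.txt.gz", package = "PureCN"),
        system.file("extdata", "example_normal2.txt.gz", package = "PureCN"))
    normalDB <- createNormalDatabase(normal.coverage.files,
        sex = c("diploid", "diploid"))
    # samples surviving the outlier filter and intervals kept after QC
    basename(normalDB$normal.coverage.files)
    summary(normalDB$intervals.used)
}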
.warnLowCoverageTargets <- function(counts, intervals, low.coverage) {
    # targets where even the best covered sample is at or below the cutoff
    all.low <- apply(counts, 1, max) <= low.coverage
    low.intervals <- intervals[which(all.low & intervals$on.target)]
    mcols(low.intervals) <- NULL
    n.low <- length(low.intervals)
    if (n.low > 0) {
        flog.info("%i on-target bins with low coverage in all samples.",
            n.low)
    }
    if (n.low > sum(intervals$on.target, na.rm = TRUE) * 0.05) {
        flog.warn("You are likely not using the correct baits file!")
    }
    low.intervals
}
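# A small numeric sketch of the flagging logic above; the count matrix and
# cutoff are invented (rows are targets, columns are samples). A target is
# flagged only if it is low in every sample.
if (FALSE) {
    counts <- rbind(c(200, 180, 220),   # well covered target -> kept
                    c(10, 12, 8),       # low in every sample -> flagged
                    c(5, 300, 250))     # low in one sample only -> kept
    low.coverage <- 15
    apply(counts, 1, max) <= low.coverage   # FALSE TRUE FALSE
}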
.standardizeNormals <- function(counts, intervals, min.coverage, max.missing,
    sex, return.counts = FALSE) {
    if (!length(intervals)) return(list(present = FALSE))
    # recalculate without dropped samples
    fcnts <- apply(counts, 2, function(x) x / sum(x))
    fcnts_interval_medians <- apply(fcnts, 1, median, na.rm = TRUE)
    fraction.missing <- apply(counts, 1,
        function(x) sum(is.na(x) | x <= 0)) / ncol(counts)
    intervals.used <- .filterIntervalsCreateNormalDB(intervals,
        fcnts_interval_medians, fraction.missing, min.coverage, max.missing)
    fcnts <- apply(counts[intervals.used, ], 2, function(x) x / sum(x))
    fcnts_interval_medians <- apply(fcnts, 1, median)
    fcnts_interval_medians_F <- fcnts_interval_medians
    fcnts_interval_medians_M <- fcnts_interval_medians
    # do we need a sex-aware database?
    if ("F" %in% sex && "M" %in% sex) {
        # use sex-specific interval medians on the sex chromosomes only
        fcnts_interval_medians_F <- apply(fcnts[, which(sex == "F"),
            drop = FALSE], 1, median)
        fcnts_interval_medians_M <- apply(fcnts[, which(sex == "M"),
            drop = FALSE], 1, median)
        sex.chr <- .getSexChr(seqlevels(intervals))
        idx <- as.character(seqnames(intervals)[intervals.used]) %in% sex.chr
        fcnts_interval_medians_F[!idx] <- fcnts_interval_medians[!idx]
        fcnts_interval_medians_M[!idx] <- fcnts_interval_medians[!idx]
        fcnts_std <- fcnts
        for (i in seq(ncol(fcnts))) {
            if (is.null(sex[i]) || is.na(sex[i])) sex[i] <- "?"
            iv <- switch(sex[i],
                "F" = fcnts_interval_medians_F,
                "M" = fcnts_interval_medians_M,
                fcnts_interval_medians)
            fcnts_std[, i] <- fcnts_std[, i] / iv
        }
    } else {
        fcnts_std <- apply(fcnts, 2, function(x) x / fcnts_interval_medians)
    }
    # impute zeros with the non-zero interval median, then truncate the
    # extreme 0.1% tails before taking log2 ratios
    fcnts_interval_non_zero_medians <- apply(fcnts_std, 1,
        function(x) median(x[x > 0]))
    fcnts_std_imp <- apply(fcnts_std, 2, function(x) {
        x[x <= 0] <- fcnts_interval_non_zero_medians[x <= 0]
        x
    })
    p <- 0.001
    li <- quantile(as.vector(fcnts_std_imp), probs = c(p, 1 - p))
    fcnts_std_trunc <- fcnts_std_imp
    fcnts_std_trunc[fcnts_std_imp < li[1]] <- li[1]
    fcnts_std_trunc[fcnts_std_imp > li[2]] <- li[2]
    fcnts_std_final <- apply(fcnts_std_trunc, 2,
        function(x) log2(x / median(x)))
    fcnts_std_final <- fcnts_std_final -
        median(apply(fcnts_std_final, 2, median))
    s <- svd(fcnts_std_final)
    ret <- list(
        projection = s$u,
        projection.v = s$v,
        intervals.used = intervals.used,
        interval.median.coverage = list(all = fcnts_interval_medians,
            F = fcnts_interval_medians_F, M = fcnts_interval_medians_M),
        fraction.missing = fraction.missing,
        dist.t = dist(t(fcnts_std_final)),
        present = TRUE
    )
    if (return.counts) {
        ret$counts <- fcnts_std_final
    }
    return(ret)
}
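# A minimal sketch of the tangent projection used in .denoiseSample below:
# the left singular vectors of the standardized normal matrix span the
# systematic assay noise, so subtracting the projection of a new sample onto
# the first k of them leaves the sample-specific signal. All names here are
# local to the sketch; the random matrices stand in for real coverage data.
if (FALSE) {
    set.seed(1)
    X <- matrix(rnorm(200 * 6), nrow = 200)   # intervals x normal samples
    y <- rnorm(200)                           # standardized tumor log-ratios
    k <- 3
    P <- svd(X)$u[, seq_len(k)]               # top k noise directions
    y_denoised <- y - as.vector(tcrossprod(y %*% P, P))
}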
.denoiseSample <- function(x, normalDB, num.eigen, sex) {
    fcnts <- x$counts[normalDB$intervals.used]
    fcnts <- fcnts / sum(fcnts, na.rm = TRUE)
    if (is.null(sex) || is.na(sex)) sex <- "?"
    iv <- switch(sex,
        "F" = normalDB$interval.median.coverage$F,
        "M" = normalDB$interval.median.coverage$M,
        normalDB$interval.median.coverage$all)
    fcnts_std <- fcnts / iv
    # impute intervals which have no data in sample but in database
    # otherwise the projection will return only NA
    idxMissing <- which(fcnts_std <= 0 | is.na(fcnts_std))
    fcnts_std[idxMissing] <- 1e-9
    fcnts_std_final <- log2(fcnts_std / median(fcnts_std, na.rm = TRUE))
    x$log.ratio.std <- 0.
    x$log.ratio.std[!normalDB$intervals.used] <- NA
    x$log.ratio <- x$log.ratio.std
    x$log.ratio.std[normalDB$intervals.used] <- fcnts_std_final
    if (num.eigen > ncol(normalDB$projection)) {
        num.eigen <- ncol(normalDB$projection)
    }
    P <- normalDB$projection[, seq(1, num.eigen)]
    # subtract the projection onto the noise subspace spanned by the first
    # num.eigen left singular vectors of the normal database
    x$log.ratio[normalDB$intervals.used] <- fcnts_std_final -
        as.vector(tcrossprod(fcnts_std_final %*% P, P))
    x
}


#' Calculate tangent normal
#'
#' Reimplementation of GATK4 denoising. Please cite the relevant GATK
#' publication if you use this in a publication.
#'
#'
#' @param tumor.coverage.file Coverage file or data of a tumor sample.
#' @param normalDB Database of normal samples, created with
#' \code{\link{createNormalDatabase}}.
#' @param num.eigen Number of eigenvectors used.
#' @param ignore.sex If \code{FALSE}, determine the sex of the sample and
#' standardize against sex-matched interval medians from the database.
#' @param sex Sex of sample. If \code{NULL}, determine with
#' \code{\link{getSexFromCoverage}} and default parameters. Valid values are
#' \code{F} for female, \code{M} for male. If all chromosomes are diploid,
#' specify \code{diploid}.
#' @return Coverage data of a denoised synthetic normal sample that can be
#' used to normalize the given tumor sample.
#' @seealso \code{\link{createNormalDatabase}}
#' @author Markus Riester
#' @examples
#'
#' tumor.coverage.file <- system.file("extdata", "example_tumor.txt.gz",
#'     package = "PureCN")
#' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz",
#'     package = "PureCN")
#' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz",
#'     package = "PureCN")
#' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file)
#' normalDB <- createNormalDatabase(normal.coverage.files)
#' pool <- calculateTangentNormal(tumor.coverage.file, normalDB)
#'
#' @export calculateTangentNormal
calculateTangentNormal <- function(tumor.coverage.file, normalDB,
    num.eigen = 20, ignore.sex = FALSE, sex = NULL) {
    if (is.character(tumor.coverage.file)) {
        tumor <- readCoverageFile(tumor.coverage.file)
    } else {
        tumor <- tumor.coverage.file
    }
    if (!.checkNormalDB(tumor, normalDB)) {
        .stopUserError("tumor.coverage.file and normalDB do not align.")
    }
    if (!ignore.sex && !is.null(normalDB$sex) &&
        sum(!is.na(normalDB$sex)) > 0) {
        if (is.null(sex)) {
            sex <- getSexFromCoverage(tumor)
        }
        flog.info("Sample sex: %s", sex)
    }
    tumor$log.ratio <- 0.
    tumor$log.ratio.std <- 0.
    # denoise on-target and (if present) off-target intervals separately
    denoised <- .denoiseSample(tumor[tumor$on.target], normalDB$groups[[1]],
        num.eigen, sex)
    ov <- findOverlaps(tumor, denoised)
    tumor$log.ratio[queryHits(ov)] <- denoised$log.ratio[subjectHits(ov)]
    tumor$log.ratio.std[queryHits(ov)] <- denoised$log.ratio.std[subjectHits(ov)]
    if (normalDB$groups[[2]]$present) {
        denoised <- .denoiseSample(tumor[!tumor$on.target],
            normalDB$groups[[2]], num.eigen, sex)
        ov <- findOverlaps(tumor, denoised)
        tumor$log.ratio[queryHits(ov)] <- denoised$log.ratio[subjectHits(ov)]
        tumor$log.ratio.std[queryHits(ov)] <- denoised$log.ratio.std[subjectHits(ov)]
    }
    # construct a synthetic normal whose coverage reproduces the denoised
    # log-ratios when compared against the tumor
    fakeNormal <- tumor
    idxNA <- is.na(fakeNormal$log.ratio)
    fakeNormal$average.coverage[!idxNA] <-
        (2 ^ (log2(tumor$average.coverage) - tumor$log.ratio))[!idxNA]
    fakeNormal$coverage <- fakeNormal$average.coverage * width(fakeNormal)
    fakeNormal
}

.readNormals <- function(normal.coverage.files) {
    normals <- lapply(normal.coverage.files, function(x) {
        if (is(x, "character")) {
            return(readCoverageFile(normalizePath(x)))
        }
        return(x)
    })
    # check that all files used the same interval file.
    for (i in seq_along(normals)) {
        if (!identical(as.character(normals[[i]]),
            as.character(normals[[1]]))) {
            .stopUserError("All normal.coverage.files must have the same ",
                "intervals. ", normal.coverage.files[i], " is different.")
        }
    }
    normals
}
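# Minimal sketch of the identity the synthetic normal above satisfies:
# log2(tumor / fakeNormal) recovers the denoised log-ratio. Purely
# illustrative numbers, not package output.
if (FALSE) {
    tumor_avg <- 120          # average coverage of one interval
    log_ratio <- 0.8          # denoised log2 ratio at that interval
    fake_avg <- 2 ^ (log2(tumor_avg) - log_ratio)
    all.equal(log2(tumor_avg / fake_avg), log_ratio)   # TRUE
}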
", normal.coverage.files[i], " is different.") } } normals } .filterIntervalsCreateNormalDB <- function(intervals, interval.median.coverage, fraction.missing, min.coverage, max.missing) { nBefore <- length(intervals) intervals.used <- is.finite(fraction.missing) key <- paste(as.character(seqnames(intervals)), intervals$on.target) min.coverage <- (sapply(split(interval.median.coverage, key), median, na.rm=TRUE) * min.coverage)[key] intervals.used <- intervals.used & !is.na(interval.median.coverage) & interval.median.coverage >= min.coverage nAfter <- sum(intervals.used) if (nAfter < nBefore) { flog.info("Removing %i intervals with low coverage in normalDB.", nBefore-nAfter) } nBefore <- sum(intervals.used) intervals.used <- intervals.used & !is.na(fraction.missing) & fraction.missing <= max.missing nAfter <- sum(intervals.used) if (nAfter < nBefore) { flog.info("Removing %i intervals with zero coverage in more than %.0f%% of normalDB.", nBefore-nAfter, max.missing*100) } if (nAfter < 2) { .stopUserError("No intervals left after coverage checks.") } intervals.used } .calculateIntervalWeights <- function(normalDB, normal.coverage, top.quantile = 0.7, plot = FALSE) { tumor.coverage <- list(poolCoverage(normal.coverage, w = rep(1, length(normal.coverage)) / length(normal.coverage))) lrs <- lapply(normal.coverage, function(x) calculateTangentNormal(x, normalDB)$log.ratio) lrs <- do.call(cbind, lrs) lrs[is.infinite(lrs)] <- NA intervals <- normal.coverage[[1]] mcols(intervals) <- NULL lrs.sd <- apply(lrs, 1, sd, na.rm = TRUE) lrs.cnt.na <- apply(lrs, 1, function(x) sum(is.na(x))) # get the top.quantile % of sd by chromosome and use this to normalize weight=1 chrom <- as.character(seqnames(intervals)) sdCutoffByChr <- sapply(split(lrs.sd, chrom), quantile, probs = top.quantile, names = FALSE, na.rm = TRUE)[chrom] zz <- sdCutoffByChr / lrs.sd zz[zz > 1] <- 1 idx <- is.na(zz) | lrs.cnt.na > ncol(lrs) / 3 zz[idx] <- min(zz, na.rm = TRUE) ret <- list( log.ratios = GRanges(intervals,,, DataFrame(lrs)), weights = GRanges(intervals,,, DataFrame(weights = zz))) if (plot) .plotIntervalWeights(lrs.sd, width(tumor.coverage[[1]]), tumor.coverage[[1]]$on.target) normalDB$sd <- ret normalDB } .plotIntervalWeights <- function(lrs.sd, width, on.target) { par(mfrow = c(1, 2)) plot(width[on.target], lrs.sd[on.target], ylim = c(0,2), xlab = "Interval Width", ylab = "log2 ratio sd.", main = "On-Target") if (sum(!on.target)) { plot(width[!on.target], lrs.sd[!on.target], col = "red", ylim = c(0, 2), xlab = "Interval Width", ylab = "log2 ratio sd.", main = "Off-Target") } }