Bioconductor Code: PureCN

R/createNormalDatabase.R

7d29a0be	#' Create database of normal samples
08db496a	#'
ec6f11d9	#' Function to create a database of normal samples, used to normalize #' tumor coverages.
08db496a	#' #' #' @param normal.coverage.files Vector with file names pointing to
7d29a0be	#' coverage files of normal samples. #' @param sex \code{character(length(normal.coverage.files))} with sex for all #' files. \code{F} for female, \code{M} for male. If all chromosomes are #' diploid, specify \code{diploid}. If \code{NULL}, determine from coverage.
8a972c87	#' @param coverage.outliers Exclude samples with coverages below or above #' the specified cutoffs (fractions of the normal sample coverages median). #' Only for databases with more than 5 samples.
08db496a	#' @param min.coverage Exclude intervals with coverage lower than
ec6f11d9	#' the specified fraction of the chromosome median in the pool of normals.
0208ae29	#' @param max.missing Exclude intervals with zero coverage in the
ec6f11d9	#' specified fraction of normal samples.
08db496a	#' @param low.coverage Specifies the maximum number of total reads
23cc6c14	#' (NOT average coverage) to call a target low coverage.
da2cf7d5	#' @param optimal.off.target.counts Used to suggest an optimal off-target
08db496a	#' interval width (BETA).
fe78821a	#' @param plot Diagnostics plot, useful to tune parameters.
7d29a0be	#' @param \dots Arguments passed to the \code{prcomp} function. #' @return A normal database that can be used in the
ec6f11d9	#' \code{\link{calculateTangentNormal}} function to retrieve a coverage #' normalization sample for a given tumor sample.
7d29a0be	#' @author Markus Riester
ec6f11d9	#' @seealso \code{\link{calculateTangentNormal}}
7d29a0be	#' @examples
08db496a	#'
4a7f4816	#' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz",
08db496a	#' package = "PureCN")
4a7f4816	#' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz",
08db496a	#' package = "PureCN")
7d29a0be	#' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) #' normalDB <- createNormalDatabase(normal.coverage.files) #' #' @export createNormalDatabase
ec6f11d9	#' @importFrom Matrix tcrossprod
7d29a0be	createNormalDatabase <- function(normal.coverage.files, sex = NULL,
fe78821a	coverage.outliers = c(0.25, 4), min.coverage = 0.25,
da2cf7d5	max.missing = 0.03, low.coverage = 15, optimal.off.target.counts = 120, plot = FALSE, ...) {
6f30d9f0	normal.coverage.files <- normalizePath(normal.coverage.files)
bf20092f	if (any(duplicated(normal.coverage.files))) { flog.warn("Some normal.coverage.files duplicated.") normal.coverage.files <- unique(normal.coverage.files) } if (length(normal.coverage.files) < 2) { .stopUserError("At least 2 normal.coverage.files required.") }
1d2bdaac	if (is(normal.coverage.files, "character") && any(grepl("_coverage.txt", normal.coverage.files)) && any(grepl("_loess.txt", normal.coverage.files))) { .stopUserError("normal.coverage.files with _coverage.txt and _loess.txt suffix provided. Provide either only GC-normalized or raw coverage files!") }
bf20092f
51e99c42	normals <- .readNormals(normal.coverage.files)
08db496a
da2cf7d5	if (!all(normals[[1]]$on.target)) { ot_w <- median(sapply(lapply(normals, function(x) width(x)[!x$on.target]), median, na.rm = TRUE)) ot_c <- median(sapply(lapply(normals, function(x) x$counts[!x$on.target]), median, na.rm = TRUE))
c439e822	if (ot_w < 5000) {
fd59ff58	flog.warn("Small median off-target width (%.0f). Double check that this is correct.",
67f45934	ot_w)
c439e822	} if (ot_c < 1) { flog.warn("Not enough off-target counts. Suggest re-running IntervalFile.R without --off-target.") } else { optimal_width <- round(optimal.off.target.counts / ot_c * ot_w / 100000, digits = 1) * 100000 flog.info("Recommended minimum off-target width is %i compared to %i currently available.", round(optimal_width), round(ot_w))
fd59ff58	if (optimal_width > 350000) { flog.warn("Large minimum off-target width, your assay might not provide sufficient off-target reads to make including them useful.") }
c439e822	}
da2cf7d5	}
2f2306ec
08db496a	normals.m <- do.call(cbind,
ec6f11d9	lapply(normals, function(x) x$counts))
8a972c87
08db496a	low.coverage.targets <- .warnLowCoverageTargets(normals.m, normals[[1]],
553bb0aa	low.coverage)
ec6f11d9	normals.m[is.na(normals.m)] <- 0
08db496a	z <- apply(normals.m, 2, mean)
8a972c87	idx.failed <- rep(FALSE, length(normals))
39254458
8a972c87	if (length(normals) > 5) {
08db496a	idx.failed <- z < median(z) * coverage.outliers[1] \|
8a972c87	z > median(z) * coverage.outliers[2]
08db496a	if (sum(idx.failed)) { flog.info("Dropping %s due to outlier coverage.", paste(basename(normal.coverage.files[idx.failed]), collapse = ", "))
8a972c87	} normals <- normals[!idx.failed]
08db496a	normals.m <- normals.m[, !idx.failed, drop = FALSE]
8a972c87	normal.coverage.files <- normal.coverage.files[!idx.failed] }
08db496a	sex.determined <- sapply(normals, getSexFromCoverage)
aef10b2d	if (is.null(sex)) { sex <- sex.determined
e769c0fc	} else {
8a972c87	if (length(sex) != length(idx.failed)) {
6f30d9f0	.stopUserError("Length of normal.coverage.files and sex different")
8a972c87	} sex <- sex[!idx.failed]
2142e4c1	idx.sex <- sex %in% c(NA, "F", "M", "diploid")
aef10b2d	sex[!idx.sex] <- NA
08db496a	if (sum(!idx.sex) > 0) warning("Unexpected values in sex ignored.")
aef10b2d	for (i in seq_along(sex.determined)) {
2142e4c1	if (!is.na(sex.determined[i]) && sex[i] != "diploid" &&
aef10b2d	sex.determined[i] != sex[i]) {
08db496a	flog.warn("Sex mismatch in %s. Sex provided is %s, but could be %s.",
4d22136e	normal.coverage.files[i], sex[i], sex.determined[i])
08db496a	} }
aef10b2d	}
ec6f11d9
553bb0aa
ec6f11d9	groups <- lapply(c(TRUE, FALSE), function(on.target) { idx <- normals[[1]]$on.target == on.target intervals <- normals[[1]][idx] if (length(intervals)) {
08db496a	flog.info("Processing %s-target regions...", ifelse(on.target, "on", "off")) } .standardizeNormals(normals.m[idx, ], normals[[1]][idx], min.coverage,
ec6f11d9	max.missing, sex) })
08db496a	# merge back some on-target and off-target interval statistics
ec6f11d9	intervals.used <- logical(length(normals[[1]])) fraction.missing <- double(length(normals[[1]])) intervals.used[normals[[1]]$on.target] <- groups[[1]]$intervals.used fraction.missing[normals[[1]]$on.target] <- groups[[1]]$fraction.missing if (!is.null(groups[[2]]$intervals.used)) { intervals.used[!normals[[1]]$on.target] <- groups[[2]]$intervals.used fraction.missing[!normals[[1]]$on.target] <- groups[[2]]$fraction.missing }
08db496a
fe78821a	normalDB <- list(
2603eb7b	normal.coverage.files = normal.coverage.files, intervals = as.character(normals[[1]]),
553bb0aa	groups = groups, intervals.used = intervals.used, sex = sex, low.coverage.targets = low.coverage.targets,
f6ae256d	version = 8
ea58a451	)
f6ae256d	normalDB <- .calculateIntervalWeights(normalDB, normals, plot = plot)
fe78821a	normalDB
7d29a0be	}
553bb0aa	.warnLowCoverageTargets <- function(counts, intervals, low.coverage) { all.low <- apply(counts, 1, max) <= low.coverage low.intervals <- intervals[which(all.low & intervals$on.target)] mcols(low.intervals) <- NULL n.low <- length(low.intervals) if (n.low > 0) { flog.info("%i on-target bins with low coverage in all samples.", n.low) } if (n.low > sum(intervals$on.target, na.rm = TRUE) * 0.05) { flog.warn("You are likely not using the correct baits file!") }
08db496a	low.intervals
553bb0aa	}
c439e822	.standardizeNormals <- function(counts, intervals, min.coverage, max.missing, sex, return.counts = FALSE) {
ec6f11d9	if (!length(intervals)) return(list(present = FALSE)) # recalculate without dropped samples fcnts <- apply(counts, 2, function(x) x/sum(x)) fcnts_interval_medians <- apply(fcnts, 1, median, na.rm=TRUE) fraction.missing <- apply(counts, 1, function(x) sum(is.na(x)\|x<=0))/ncol(counts)
0208ae29	intervals.used <- .filterIntervalsCreateNormalDB(intervals,
ec6f11d9	fcnts_interval_medians, fraction.missing, min.coverage, max.missing)
ea58a451
ec6f11d9	fcnts <- apply(counts[intervals.used,], 2, function(x) x/sum(x)) fcnts_interval_medians <- apply(fcnts, 1, median)
26aa6f47	fcnts_interval_medians_F <- fcnts_interval_medians fcnts_interval_medians_M <- fcnts_interval_medians # do we need a sex-aware database? if ("F" %in% sex && "M" %in% sex) {
31ed7540	fcnts_interval_medians_F <- apply(fcnts[, which(sex=="F"), drop = FALSE], 1, median) fcnts_interval_medians_M <- apply(fcnts[, which(sex=="M"), drop = FALSE], 1, median)
26aa6f47	sex.chr <- .getSexChr(seqlevels(intervals)) idx <- as.character(seqnames(intervals)[intervals.used]) %in% sex.chr fcnts_interval_medians_F[!idx] <- fcnts_interval_medians[!idx] fcnts_interval_medians_M[!idx] <- fcnts_interval_medians[!idx] fcnts_std <- fcnts for (i in seq(ncol(fcnts))) {
31ed7540	if (is.null(sex[i]) \|\| is.na(sex[i])) sex[i] <- "?"
26aa6f47	iv <- switch(sex[i], "F"=fcnts_interval_medians_F, "M"=fcnts_interval_medians_M, fcnts_interval_medians) fcnts_std[, i] <- fcnts_std[,i] / iv }
c439e822	} else {
bb3203dc	fcnts_std <- apply(fcnts, 2, function(x) x / fcnts_interval_medians)
26aa6f47	}
bb3203dc	fcnts_interval_non_zero_medians <- apply(fcnts_std, 1, function(x) median(x[x > 0])) fcnts_std_imp <- apply(fcnts_std, 2, function(x) { x[x <= 0] <- fcnts_interval_non_zero_medians[x <= 0]; x}) p <- 0.001 li <- quantile(as.vector(fcnts_std_imp), probs = c(p, 1 - p))
ec6f11d9	fcnts_std_trunc <- fcnts_std_imp
c439e822	fcnts_std_trunc[fcnts_std_imp < li[1]] <- li[1] fcnts_std_trunc[fcnts_std_imp > li[2]] <- li[2] fcnts_std_final <- apply(fcnts_std_trunc, 2, function(x) log2(x / median(x)))
bb3203dc	fcnts_std_final <- fcnts_std_final - median(apply(fcnts_std_final, 2, median))
653a7a79	s <- svd(fcnts_std_final)
ec6f11d9
c439e822	ret <- list(
653a7a79	projection = s$u, projection.v = s$v,
26aa6f47	intervals.used = intervals.used, interval.median.coverage = list(all = fcnts_interval_medians, F = fcnts_interval_medians_F, M = fcnts_interval_medians_M), fraction.missing = fraction.missing,
08db496a	dist.t = dist(t(fcnts_std_final)),
c439e822	present = TRUE
ec6f11d9	)
c439e822	if (return.counts) { ret$counts <- fcnts_std_final } return(ret)
ec6f11d9	}
26aa6f47	.denoiseSample <- function(x, normalDB, num.eigen, sex) {
ec6f11d9	fcnts <- x$counts[normalDB$intervals.used] fcnts <- fcnts/sum(fcnts, na.rm=TRUE)
31ed7540	if (is.null(sex) \|\| is.na(sex)) sex <- "?"
26aa6f47	iv <- switch(sex, "F"=normalDB$interval.median.coverage$F, "M"=normalDB$interval.median.coverage$M, normalDB$interval.median.coverage$all) fcnts_std <- fcnts/iv
dcf0f557	# impute intervals which have no data in sample but in database # otherwise the projection will return only NA
6a01f5d7	idxMissing <- which(fcnts_std <= 0 \| is.na(fcnts_std)) fcnts_std[idxMissing] <- 1e-9
ec6f11d9	fcnts_std_final <- log2(fcnts_std/median(fcnts_std, na.rm = TRUE)) x$log.ratio.std <- 0. x$log.ratio.std[!normalDB$intervals.used] <- NA x$log.ratio <- x$log.ratio.std x$log.ratio.std[normalDB$intervals.used] <- fcnts_std_final if (num.eigen > ncol(normalDB$projection)) num.eigen <- ncol(normalDB$projection) P <- normalDB$projection[,seq(1,num.eigen)] x$log.ratio[normalDB$intervals.used] <- fcnts_std_final - as.vector(tcrossprod(fcnts_std_final %*% P, P)) x } #' Calculate tangent normal #' #' Reimplementation of GATK4 denoising. Please cite the relevant GATK #' publication if you use this in a publication. #' #' #' @param tumor.coverage.file Coverage file or data of a tumor sample. #' @param normalDB Database of normal samples, created with #' \code{\link{createNormalDatabase}}. #' @param num.eigen Number of eigen vectors used. #' @param ignore.sex If \code{FALSE}, detects sex of sample and returns best #' normals with matching sex. #' @param sex Sex of sample. If \code{NULL}, determine with #' \code{\link{getSexFromCoverage}} and default parameters. Valid values are #' \code{F} for female, \code{M} for male. If all chromosomes are diploid, #' specify \code{diploid}. #' @seealso \code{\link{createNormalDatabase}} #' @author Markus Riester #' @examples #'
4a7f4816	#' tumor.coverage.file <- system.file('extdata', 'example_tumor.txt.gz', #' package = 'PureCN') #' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz", #' package = "PureCN") #' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz", #' package = "PureCN")
ec6f11d9	#' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file) #' normalDB <- createNormalDatabase(normal.coverage.files) #' pool <- calculateTangentNormal(tumor.coverage.file, normalDB) #' #' @export calculateTangentNormal calculateTangentNormal <- function(tumor.coverage.file, normalDB, num.eigen = 20, ignore.sex = FALSE, sex = NULL) { if (is.character(tumor.coverage.file)) { tumor <- readCoverageFile(tumor.coverage.file) } else { tumor <- tumor.coverage.file }
a312ed63	if (!.checkNormalDB(tumor, normalDB)) {
ec6f11d9	.stopUserError("tumor.coverage.file and normalDB do not align.") }
26aa6f47	if (!ignore.sex && !is.null(normalDB$sex) && sum(!is.na(normalDB$sex))>0) { if (is.null(sex)) { sex <- getSexFromCoverage(tumor) } flog.info("Sample sex: %s", sex) }
ec6f11d9	tumor$log.ratio <- 0. tumor$log.ratio.std <- 0.
26aa6f47	denoised <- .denoiseSample(tumor[tumor$on.target], normalDB$groups[[1]], num.eigen, sex)
ec6f11d9	ov <- findOverlaps(tumor, denoised) tumor$log.ratio[queryHits(ov)] <- denoised$log.ratio[subjectHits(ov)] tumor$log.ratio.std[queryHits(ov)] <- denoised$log.ratio.std[subjectHits(ov)] if (normalDB$groups[[2]]$present) {
26aa6f47	denoised <- .denoiseSample(tumor[!tumor$on.target], normalDB$groups[[2]], num.eigen, sex)
ec6f11d9	ov <- findOverlaps(tumor, denoised) tumor$log.ratio[queryHits(ov)] <- denoised$log.ratio[subjectHits(ov)] tumor$log.ratio.std[queryHits(ov)] <- denoised$log.ratio.std[subjectHits(ov)] } fakeNormal <- tumor
108e255d	idxNA <- is.na(fakeNormal$log.ratio) fakeNormal$average.coverage[!idxNA] <- (2 ^ (log2(tumor$average.coverage) - tumor$log.ratio))[!idxNA]
ec6f11d9	fakeNormal$coverage <- fakeNormal$average.coverage * width(fakeNormal) fakeNormal }
51e99c42	.readNormals <- function(normal.coverage.files) {
167ea8ab	normals <- lapply(normal.coverage.files, function(x) { if (is(x, "character")) { return(readCoverageFile(normalizePath(x))) } return(x) })
51e99c42	# check that all files used the same interval file. for (i in seq_along(normals)) {
44326645	if (!identical(as.character(normals[[i]]), as.character(normals[[1]]))) {
51e99c42	.stopUserError("All normal.coverage.files must have the same ", "intervals. ", normal.coverage.files[i], " is different.") } } normals }
ec6f11d9
0208ae29	.filterIntervalsCreateNormalDB <- function(intervals,
26aa6f47	interval.median.coverage, fraction.missing,
ec6f11d9	min.coverage, max.missing) { nBefore <- length(intervals) intervals.used <- is.finite(fraction.missing)
2e76b10e	key <- paste(as.character(seqnames(intervals)), intervals$on.target) min.coverage <- (sapply(split(interval.median.coverage,
0208ae29	key), median, na.rm=TRUE) * min.coverage)[key]
2e76b10e
26aa6f47	intervals.used <- intervals.used & !is.na(interval.median.coverage) &
2e76b10e	interval.median.coverage >= min.coverage
ec6f11d9	nAfter <- sum(intervals.used) if (nAfter < nBefore) {
76a87bd3	flog.info("Removing %i intervals with low coverage in normalDB.",
ec6f11d9	nBefore-nAfter) } nBefore <- sum(intervals.used) intervals.used <- intervals.used & !is.na(fraction.missing) & fraction.missing <= max.missing nAfter <- sum(intervals.used) if (nAfter < nBefore) {
76a87bd3	flog.info("Removing %i intervals with zero coverage in more than %.0f%% of normalDB.",
ec6f11d9	nBefore-nAfter, max.missing*100) }
4010268a	if (nAfter < 2) { .stopUserError("No intervals left after coverage checks.") }
ec6f11d9	intervals.used }
b5957a16	.calculateIntervalWeights <- function(normalDB, normal.coverage,
6e428672	top.quantile = 0.7, plot = FALSE) {
b5957a16	tumor.coverage <- list(poolCoverage(normal.coverage, w = rep(1, length(normal.coverage)) / length(normal.coverage)))
6e428672	lrs <- lapply(normal.coverage, function(x) calculateTangentNormal(x, normalDB)$log.ratio)
b5957a16	lrs <- do.call(cbind, lrs) lrs[is.infinite(lrs)] <- NA intervals <- normal.coverage[[1]] mcols(intervals) <- NULL lrs.sd <- apply(lrs, 1, sd, na.rm = TRUE) lrs.cnt.na <- apply(lrs, 1, function(x) sum(is.na(x))) # get the top.quantile % of sd by chromosome and use this to normalize weight=1 chrom <- as.character(seqnames(intervals)) sdCutoffByChr <- sapply(split(lrs.sd, chrom), quantile, probs = top.quantile, names = FALSE, na.rm = TRUE)[chrom] zz <- sdCutoffByChr / lrs.sd zz[zz > 1] <- 1 idx <- is.na(zz) \| lrs.cnt.na > ncol(lrs) / 3 zz[idx] <- min(zz, na.rm = TRUE) ret <- list( log.ratios = GRanges(intervals,,, DataFrame(lrs)), weights = GRanges(intervals,,, DataFrame(weights = zz))) if (plot) .plotIntervalWeights(lrs.sd, width(tumor.coverage[[1]]), tumor.coverage[[1]]$on.target) normalDB$sd <- ret normalDB } .plotIntervalWeights <- function(lrs.sd, width, on.target) { par(mfrow = c(1, 2)) plot(width[on.target], lrs.sd[on.target], ylim = c(0,2), xlab = "Interval Width", ylab = "log2 ratio sd.", main = "On-Target") if (sum(!on.target)) { plot(width[!on.target], lrs.sd[!on.target], col = "red", ylim = c(0, 2), xlab = "Interval Width", ylab = "log2 ratio sd.", main = "Off-Target") } }