7d29a0be |
#' Create database of normal samples
|
08db496a |
#'
|
ec6f11d9 |
#' Function to create a database of normal samples, used to normalize
#' tumor coverages.
|
08db496a |
#'
#'
#' @param normal.coverage.files Vector with file names pointing to
|
7d29a0be |
#' coverage files of normal samples.
#' @param sex \code{character(length(normal.coverage.files))} with sex for all
#' files. \code{F} for female, \code{M} for male. If all chromosomes are
#' diploid, specify \code{diploid}. If \code{NULL}, determine from coverage.
|
8a972c87 |
#' @param coverage.outliers Exclude samples with coverages below or above
#' the specified cutoffs (fractions of the normal sample coverages median).
#' Only for databases with more than 5 samples.
|
08db496a |
#' @param min.coverage Exclude intervals with coverage lower than
|
ec6f11d9 |
#' the specified fraction of the chromosome median in the pool of normals.
|
0208ae29 |
#' @param max.missing Exclude intervals with zero coverage in the
|
ec6f11d9 |
#' specified fraction of normal samples.
|
08db496a |
#' @param low.coverage Specifies the maximum number of total reads
|
23cc6c14 |
#' (NOT average coverage) to call a target low coverage.
|
da2cf7d5 |
#' @param optimal.off.target.counts Used to suggest an optimal off-target
|
08db496a |
#' interval width (BETA).
|
fe78821a |
#' @param plot Diagnostics plot, useful to tune parameters.
|
7d29a0be |
#' @param \dots Arguments passed to the \code{prcomp} function.
#' @return A normal database that can be used in the
|
ec6f11d9 |
#' \code{\link{calculateTangentNormal}} function to retrieve a coverage
#' normalization sample for a given tumor sample.
|
7d29a0be |
#' @author Markus Riester
|
ec6f11d9 |
#' @seealso \code{\link{calculateTangentNormal}}
|
7d29a0be |
#' @examples
|
08db496a |
#'
|
4a7f4816 |
#' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz",
|
08db496a |
#' package = "PureCN")
|
4a7f4816 |
#' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz",
|
08db496a |
#' package = "PureCN")
|
7d29a0be |
#' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file)
#' normalDB <- createNormalDatabase(normal.coverage.files)
#'
#' @export createNormalDatabase
|
ec6f11d9 |
#' @importFrom Matrix tcrossprod
|
7d29a0be |
createNormalDatabase <- function(normal.coverage.files, sex = NULL,
|
fe78821a |
coverage.outliers = c(0.25, 4), min.coverage = 0.25,
|
da2cf7d5 |
max.missing = 0.03, low.coverage = 15,
optimal.off.target.counts = 120, plot = FALSE, ...) {
|
6f30d9f0 |
normal.coverage.files <- normalizePath(normal.coverage.files)
|
bf20092f |
if (any(duplicated(normal.coverage.files))) {
flog.warn("Some normal.coverage.files duplicated.")
normal.coverage.files <- unique(normal.coverage.files)
}
if (length(normal.coverage.files) < 2) {
.stopUserError("At least 2 normal.coverage.files required.")
}
|
1d2bdaac |
if (is(normal.coverage.files, "character") &&
any(grepl("_coverage.txt", normal.coverage.files)) &&
any(grepl("_loess.txt", normal.coverage.files))) {
.stopUserError("normal.coverage.files with _coverage.txt and _loess.txt suffix provided. Provide either only GC-normalized or raw coverage files!")
}
|
bf20092f |
|
51e99c42 |
normals <- .readNormals(normal.coverage.files)
|
08db496a |
|
da2cf7d5 |
if (!all(normals[[1]]$on.target)) {
ot_w <- median(sapply(lapply(normals, function(x) width(x)[!x$on.target]), median, na.rm = TRUE))
ot_c <- median(sapply(lapply(normals, function(x) x$counts[!x$on.target]), median, na.rm = TRUE))
|
c439e822 |
if (ot_w < 5000) {
|
fd59ff58 |
flog.warn("Small median off-target width (%.0f). Double check that this is correct.",
|
67f45934 |
ot_w)
|
c439e822 |
}
if (ot_c < 1) {
flog.warn("Not enough off-target counts. Suggest re-running IntervalFile.R without --off-target.")
} else {
optimal_width <- round(optimal.off.target.counts / ot_c * ot_w / 100000, digits = 1) * 100000
flog.info("Recommended minimum off-target width is %i compared to %i currently available.",
round(optimal_width), round(ot_w))
|
fd59ff58 |
if (optimal_width > 350000) {
flog.warn("Large minimum off-target width, your assay might not provide sufficient off-target reads to make including them useful.")
}
|
c439e822 |
}
|
da2cf7d5 |
}
|
2f2306ec |
|
08db496a |
normals.m <- do.call(cbind,
|
ec6f11d9 |
lapply(normals, function(x) x$counts))
|
8a972c87 |
|
08db496a |
low.coverage.targets <- .warnLowCoverageTargets(normals.m, normals[[1]],
|
553bb0aa |
low.coverage)
|
ec6f11d9 |
normals.m[is.na(normals.m)] <- 0
|
08db496a |
z <- apply(normals.m, 2, mean)
|
8a972c87 |
idx.failed <- rep(FALSE, length(normals))
|
39254458 |
|
8a972c87 |
if (length(normals) > 5) {
|
08db496a |
idx.failed <- z < median(z) * coverage.outliers[1] |
|
8a972c87 |
z > median(z) * coverage.outliers[2]
|
08db496a |
if (sum(idx.failed)) {
flog.info("Dropping %s due to outlier coverage.",
paste(basename(normal.coverage.files[idx.failed]), collapse = ", "))
|
8a972c87 |
}
normals <- normals[!idx.failed]
|
08db496a |
normals.m <- normals.m[, !idx.failed, drop = FALSE]
|
8a972c87 |
normal.coverage.files <- normal.coverage.files[!idx.failed]
}
|
08db496a |
sex.determined <- sapply(normals, getSexFromCoverage)
|
aef10b2d |
if (is.null(sex)) {
sex <- sex.determined
|
e769c0fc |
} else {
|
8a972c87 |
if (length(sex) != length(idx.failed)) {
|
6f30d9f0 |
.stopUserError("Length of normal.coverage.files and sex different")
|
8a972c87 |
}
sex <- sex[!idx.failed]
|
2142e4c1 |
idx.sex <- sex %in% c(NA, "F", "M", "diploid")
|
aef10b2d |
sex[!idx.sex] <- NA
|
08db496a |
if (sum(!idx.sex) > 0) warning("Unexpected values in sex ignored.")
|
aef10b2d |
for (i in seq_along(sex.determined)) {
|
2142e4c1 |
if (!is.na(sex.determined[i]) && sex[i] != "diploid" &&
|
aef10b2d |
sex.determined[i] != sex[i]) {
|
08db496a |
flog.warn("Sex mismatch in %s. Sex provided is %s, but could be %s.",
|
4d22136e |
normal.coverage.files[i], sex[i], sex.determined[i])
|
08db496a |
}
}
|
aef10b2d |
}
|
ec6f11d9 |
|
553bb0aa |
|
ec6f11d9 |
groups <- lapply(c(TRUE, FALSE), function(on.target) {
idx <- normals[[1]]$on.target == on.target
intervals <- normals[[1]][idx]
if (length(intervals)) {
|
08db496a |
flog.info("Processing %s-target regions...",
ifelse(on.target, "on", "off"))
}
.standardizeNormals(normals.m[idx, ], normals[[1]][idx], min.coverage,
|
ec6f11d9 |
max.missing, sex)
})
|
08db496a |
# merge back some on-target and off-target interval statistics
|
ec6f11d9 |
intervals.used <- logical(length(normals[[1]]))
fraction.missing <- double(length(normals[[1]]))
intervals.used[normals[[1]]$on.target] <- groups[[1]]$intervals.used
fraction.missing[normals[[1]]$on.target] <- groups[[1]]$fraction.missing
if (!is.null(groups[[2]]$intervals.used)) {
intervals.used[!normals[[1]]$on.target] <- groups[[2]]$intervals.used
fraction.missing[!normals[[1]]$on.target] <- groups[[2]]$fraction.missing
}
|
08db496a |
|
fe78821a |
normalDB <- list(
|
2603eb7b |
normal.coverage.files = normal.coverage.files,
intervals = as.character(normals[[1]]),
|
553bb0aa |
groups = groups,
intervals.used = intervals.used,
sex = sex,
low.coverage.targets = low.coverage.targets,
|
f6ae256d |
version = 8
|
ea58a451 |
)
|
f6ae256d |
normalDB <- .calculateIntervalWeights(normalDB, normals, plot = plot)
|
fe78821a |
normalDB
|
7d29a0be |
}
|
553bb0aa |
.warnLowCoverageTargets <- function(counts, intervals, low.coverage) {
all.low <- apply(counts, 1, max) <= low.coverage
low.intervals <- intervals[which(all.low & intervals$on.target)]
mcols(low.intervals) <- NULL
n.low <- length(low.intervals)
if (n.low > 0) {
flog.info("%i on-target bins with low coverage in all samples.", n.low)
}
if (n.low > sum(intervals$on.target, na.rm = TRUE) * 0.05) {
flog.warn("You are likely not using the correct baits file!")
}
|
08db496a |
low.intervals
|
553bb0aa |
}
|
c439e822 |
.standardizeNormals <- function(counts, intervals, min.coverage, max.missing, sex, return.counts = FALSE) {
|
ec6f11d9 |
if (!length(intervals)) return(list(present = FALSE))
# recalculate without dropped samples
fcnts <- apply(counts, 2, function(x) x/sum(x))
fcnts_interval_medians <- apply(fcnts, 1, median, na.rm=TRUE)
fraction.missing <- apply(counts, 1, function(x)
sum(is.na(x)|x<=0))/ncol(counts)
|
0208ae29 |
intervals.used <- .filterIntervalsCreateNormalDB(intervals,
|
ec6f11d9 |
fcnts_interval_medians, fraction.missing, min.coverage, max.missing)
|
ea58a451 |
|
ec6f11d9 |
fcnts <- apply(counts[intervals.used,], 2, function(x) x/sum(x))
fcnts_interval_medians <- apply(fcnts, 1, median)
|
26aa6f47 |
fcnts_interval_medians_F <- fcnts_interval_medians
fcnts_interval_medians_M <- fcnts_interval_medians
# do we need a sex-aware database?
if ("F" %in% sex && "M" %in% sex) {
|
31ed7540 |
fcnts_interval_medians_F <- apply(fcnts[, which(sex=="F"), drop = FALSE], 1, median)
fcnts_interval_medians_M <- apply(fcnts[, which(sex=="M"), drop = FALSE], 1, median)
|
26aa6f47 |
sex.chr <- .getSexChr(seqlevels(intervals))
idx <- as.character(seqnames(intervals)[intervals.used]) %in% sex.chr
fcnts_interval_medians_F[!idx] <- fcnts_interval_medians[!idx]
fcnts_interval_medians_M[!idx] <- fcnts_interval_medians[!idx]
fcnts_std <- fcnts
for (i in seq(ncol(fcnts))) {
|
31ed7540 |
if (is.null(sex[i]) || is.na(sex[i])) sex[i] <- "?"
|
26aa6f47 |
iv <- switch(sex[i],
"F"=fcnts_interval_medians_F,
"M"=fcnts_interval_medians_M,
fcnts_interval_medians)
fcnts_std[, i] <- fcnts_std[,i] / iv
}
|
c439e822 |
} else {
|
bb3203dc |
fcnts_std <- apply(fcnts, 2, function(x) x / fcnts_interval_medians)
|
26aa6f47 |
}
|
bb3203dc |
fcnts_interval_non_zero_medians <- apply(fcnts_std, 1, function(x) median(x[x > 0]))
fcnts_std_imp <- apply(fcnts_std, 2, function(x) { x[x <= 0] <- fcnts_interval_non_zero_medians[x <= 0]; x})
p <- 0.001
li <- quantile(as.vector(fcnts_std_imp), probs = c(p, 1 - p))
|
ec6f11d9 |
fcnts_std_trunc <- fcnts_std_imp
|
c439e822 |
fcnts_std_trunc[fcnts_std_imp < li[1]] <- li[1]
fcnts_std_trunc[fcnts_std_imp > li[2]] <- li[2]
fcnts_std_final <- apply(fcnts_std_trunc, 2, function(x) log2(x / median(x)))
|
bb3203dc |
fcnts_std_final <- fcnts_std_final - median(apply(fcnts_std_final, 2, median))
|
653a7a79 |
s <- svd(fcnts_std_final)
|
ec6f11d9 |
|
c439e822 |
ret <- list(
|
653a7a79 |
projection = s$u,
projection.v = s$v,
|
26aa6f47 |
intervals.used = intervals.used,
interval.median.coverage = list(all = fcnts_interval_medians,
F = fcnts_interval_medians_F,
M = fcnts_interval_medians_M),
fraction.missing = fraction.missing,
|
08db496a |
dist.t = dist(t(fcnts_std_final)),
|
c439e822 |
present = TRUE
|
ec6f11d9 |
)
|
c439e822 |
if (return.counts) {
ret$counts <- fcnts_std_final
}
return(ret)
|
ec6f11d9 |
}
|
26aa6f47 |
.denoiseSample <- function(x, normalDB, num.eigen, sex) {
|
ec6f11d9 |
fcnts <- x$counts[normalDB$intervals.used]
fcnts <- fcnts/sum(fcnts, na.rm=TRUE)
|
31ed7540 |
if (is.null(sex) || is.na(sex)) sex <- "?"
|
26aa6f47 |
iv <- switch(sex,
"F"=normalDB$interval.median.coverage$F,
"M"=normalDB$interval.median.coverage$M,
normalDB$interval.median.coverage$all)
fcnts_std <- fcnts/iv
|
dcf0f557 |
# impute intervals which have no data in sample but in database
# otherwise the projection will return only NA
|
6a01f5d7 |
idxMissing <- which(fcnts_std <= 0 | is.na(fcnts_std))
fcnts_std[idxMissing] <- 1e-9
|
ec6f11d9 |
fcnts_std_final <- log2(fcnts_std/median(fcnts_std, na.rm = TRUE))
x$log.ratio.std <- 0.
x$log.ratio.std[!normalDB$intervals.used] <- NA
x$log.ratio <- x$log.ratio.std
x$log.ratio.std[normalDB$intervals.used] <- fcnts_std_final
if (num.eigen > ncol(normalDB$projection)) num.eigen <- ncol(normalDB$projection)
P <- normalDB$projection[,seq(1,num.eigen)]
x$log.ratio[normalDB$intervals.used] <- fcnts_std_final -
as.vector(tcrossprod(fcnts_std_final %*% P, P))
x
}
#' Calculate tangent normal
#'
#' Reimplementation of GATK4 denoising. Please cite the relevant GATK
#' publication if you use this in a publication.
#'
#'
#' @param tumor.coverage.file Coverage file or data of a tumor sample.
#' @param normalDB Database of normal samples, created with
#' \code{\link{createNormalDatabase}}.
#' @param num.eigen Number of eigen vectors used.
#' @param ignore.sex If \code{FALSE}, detects sex of sample and returns best
#' normals with matching sex.
#' @param sex Sex of sample. If \code{NULL}, determine with
#' \code{\link{getSexFromCoverage}} and default parameters. Valid values are
#' \code{F} for female, \code{M} for male. If all chromosomes are diploid,
#' specify \code{diploid}.
#' @seealso \code{\link{createNormalDatabase}}
#' @author Markus Riester
#' @examples
#'
|
4a7f4816 |
#' tumor.coverage.file <- system.file('extdata', 'example_tumor.txt.gz',
#' package = 'PureCN')
#' normal.coverage.file <- system.file("extdata", "example_normal.txt.gz",
#' package = "PureCN")
#' normal2.coverage.file <- system.file("extdata", "example_normal2.txt.gz",
#' package = "PureCN")
|
ec6f11d9 |
#' normal.coverage.files <- c(normal.coverage.file, normal2.coverage.file)
#' normalDB <- createNormalDatabase(normal.coverage.files)
#' pool <- calculateTangentNormal(tumor.coverage.file, normalDB)
#'
#' @export calculateTangentNormal
calculateTangentNormal <- function(tumor.coverage.file, normalDB,
num.eigen = 20, ignore.sex = FALSE,
sex = NULL) {
if (is.character(tumor.coverage.file)) {
tumor <- readCoverageFile(tumor.coverage.file)
} else {
tumor <- tumor.coverage.file
}
|
a312ed63 |
if (!.checkNormalDB(tumor, normalDB)) {
|
ec6f11d9 |
.stopUserError("tumor.coverage.file and normalDB do not align.")
}
|
26aa6f47 |
if (!ignore.sex && !is.null(normalDB$sex) &&
sum(!is.na(normalDB$sex))>0) {
if (is.null(sex)) {
sex <- getSexFromCoverage(tumor)
}
flog.info("Sample sex: %s", sex)
}
|
ec6f11d9 |
tumor$log.ratio <- 0.
tumor$log.ratio.std <- 0.
|
26aa6f47 |
denoised <- .denoiseSample(tumor[tumor$on.target], normalDB$groups[[1]], num.eigen, sex)
|
ec6f11d9 |
ov <- findOverlaps(tumor, denoised)
tumor$log.ratio[queryHits(ov)] <- denoised$log.ratio[subjectHits(ov)]
tumor$log.ratio.std[queryHits(ov)] <- denoised$log.ratio.std[subjectHits(ov)]
if (normalDB$groups[[2]]$present) {
|
26aa6f47 |
denoised <- .denoiseSample(tumor[!tumor$on.target], normalDB$groups[[2]], num.eigen, sex)
|
ec6f11d9 |
ov <- findOverlaps(tumor, denoised)
tumor$log.ratio[queryHits(ov)] <- denoised$log.ratio[subjectHits(ov)]
tumor$log.ratio.std[queryHits(ov)] <- denoised$log.ratio.std[subjectHits(ov)]
}
fakeNormal <- tumor
|
108e255d |
idxNA <- is.na(fakeNormal$log.ratio)
fakeNormal$average.coverage[!idxNA] <- (2 ^ (log2(tumor$average.coverage) - tumor$log.ratio))[!idxNA]
|
ec6f11d9 |
fakeNormal$coverage <- fakeNormal$average.coverage * width(fakeNormal)
fakeNormal
}
|
51e99c42 |
.readNormals <- function(normal.coverage.files) {
|
167ea8ab |
normals <- lapply(normal.coverage.files, function(x) {
if (is(x, "character")) {
return(readCoverageFile(normalizePath(x)))
}
return(x)
})
|
51e99c42 |
# check that all files used the same interval file.
for (i in seq_along(normals)) {
|
44326645 |
if (!identical(as.character(normals[[i]]), as.character(normals[[1]]))) {
|
51e99c42 |
.stopUserError("All normal.coverage.files must have the same ",
"intervals. ", normal.coverage.files[i], " is different.")
}
}
normals
}
|
ec6f11d9 |
|
0208ae29 |
.filterIntervalsCreateNormalDB <- function(intervals,
|
26aa6f47 |
interval.median.coverage, fraction.missing,
|
ec6f11d9 |
min.coverage, max.missing) {
nBefore <- length(intervals)
intervals.used <- is.finite(fraction.missing)
|
2e76b10e |
key <- paste(as.character(seqnames(intervals)), intervals$on.target)
min.coverage <- (sapply(split(interval.median.coverage,
|
0208ae29 |
key), median, na.rm=TRUE) * min.coverage)[key]
|
2e76b10e |
|
26aa6f47 |
intervals.used <- intervals.used & !is.na(interval.median.coverage) &
|
2e76b10e |
interval.median.coverage >= min.coverage
|
ec6f11d9 |
nAfter <- sum(intervals.used)
if (nAfter < nBefore) {
|
76a87bd3 |
flog.info("Removing %i intervals with low coverage in normalDB.",
|
ec6f11d9 |
nBefore-nAfter)
}
nBefore <- sum(intervals.used)
intervals.used <- intervals.used & !is.na(fraction.missing) &
fraction.missing <= max.missing
nAfter <- sum(intervals.used)
if (nAfter < nBefore) {
|
76a87bd3 |
flog.info("Removing %i intervals with zero coverage in more than %.0f%% of normalDB.",
|
ec6f11d9 |
nBefore-nAfter, max.missing*100)
}
|
4010268a |
if (nAfter < 2) {
.stopUserError("No intervals left after coverage checks.")
}
|
ec6f11d9 |
intervals.used
}
|
b5957a16 |
.calculateIntervalWeights <- function(normalDB, normal.coverage,
|
6e428672 |
top.quantile = 0.7, plot = FALSE) {
|
b5957a16 |
tumor.coverage <- list(poolCoverage(normal.coverage,
w = rep(1, length(normal.coverage)) / length(normal.coverage)))
|
6e428672 |
lrs <- lapply(normal.coverage, function(x) calculateTangentNormal(x, normalDB)$log.ratio)
|
b5957a16 |
lrs <- do.call(cbind, lrs)
lrs[is.infinite(lrs)] <- NA
intervals <- normal.coverage[[1]]
mcols(intervals) <- NULL
lrs.sd <- apply(lrs, 1, sd, na.rm = TRUE)
lrs.cnt.na <- apply(lrs, 1, function(x) sum(is.na(x)))
# get the top.quantile % of sd by chromosome and use this to normalize weight=1
chrom <- as.character(seqnames(intervals))
sdCutoffByChr <- sapply(split(lrs.sd, chrom), quantile, probs = top.quantile,
names = FALSE, na.rm = TRUE)[chrom]
zz <- sdCutoffByChr / lrs.sd
zz[zz > 1] <- 1
idx <- is.na(zz) | lrs.cnt.na > ncol(lrs) / 3
zz[idx] <- min(zz, na.rm = TRUE)
ret <- list(
log.ratios = GRanges(intervals,,, DataFrame(lrs)),
weights = GRanges(intervals,,, DataFrame(weights = zz)))
if (plot) .plotIntervalWeights(lrs.sd, width(tumor.coverage[[1]]),
tumor.coverage[[1]]$on.target)
normalDB$sd <- ret
normalDB
}
.plotIntervalWeights <- function(lrs.sd, width, on.target) {
par(mfrow = c(1, 2))
plot(width[on.target], lrs.sd[on.target], ylim = c(0,2),
xlab = "Interval Width", ylab = "log2 ratio sd.", main = "On-Target")
if (sum(!on.target)) {
plot(width[!on.target], lrs.sd[!on.target], col = "red",
ylim = c(0, 2), xlab = "Interval Width", ylab = "log2 ratio sd.",
main = "Off-Target")
}
}
|