# Make CMD check happy globalVariables(names=c("Gene", "LR", "chrom", "seg.id", "seg.length", "seg.mean")) # calculates the log-likelihood of a segment, given log-ratios, the # log ratio standard deviation, purity, copy number and ploidy. # for sub-clonal alterations, it uses a uniform distribution, otherwise # multiple gaussians for all tested copy numbers. .calcLlikSegment <- function(subclonal, lr, sd.seg, p, Ci, total.ploidy, max.exon.ratio) { if (subclonal) { return(.calcLlikSegmentSubClonal(lr, max.exon.ratio)) } .calcLlikSegmentClonal(lr, sd.seg, p, Ci, total.ploidy) } .calcLlikSegmentClonal <- function(lr, sd.seg, p, Ci, total.ploidy) { sum(dnorm(lr, mean = log2((p * Ci + (1 - p) * 2)/total.ploidy), sd = sd.seg, log = TRUE)) } .calcLlikSegmentSubClonal <- function(lr, max.exon.ratio) { sum(dunif( x = vapply(2^lr, function(y) min(y, max.exon.ratio), double(1)), min = 0, max = max.exon.ratio, log = TRUE)) } # Previously calculated likelihood scores do not take the segmentation into # account. This will find the most likely segment minor copy number .calcMsSegmentC <- function(yy, test.num.copy, Ci, prior.K, mapping.bias.ok, seg.id, min.variants.segment) { prior.M <- list(0.2,0.15,c(0.1,0.25),c(0.1,0.3),c(0.1,0.2,0.55)) prior.M <- c(list(1), lapply(prior.M, function(x) c(x, 1-sum(x)))) max.M <- floor(Ci/2) idx.germline <- test.num.copy+length(test.num.copy)+1 idx.somatic <- test.num.copy+1 yys <- lapply(0:max.M, function(Mi) { for (i in test.num.copy) { n.cases.germ <- ifelse(Mi==Ci-Mi,1,2) tmp <- unique(c(0, 1,Mi, Ci-Mi)) n.cases.somatic <- length(tmp) Cx <- max(i,Ci) if (i!=Mi && i!=Ci - Mi) { yy[,idx.germline[i+1]] <- yy[,idx.germline[i+1]] + log(1-prior.K) - log(Cx + 1 - n.cases.germ) # allow somatic mutations always have M=1 if (i<=1) { yy[,idx.somatic[i+1]] <- yy[,idx.somatic[i+1]] + log(prior.K) - log(n.cases.somatic) } else { yy[,idx.somatic[i+1]] <- yy[,idx.somatic[i+1]] + log(1-prior.K) - log(Cx + 1 - n.cases.somatic) } } else { yy[,idx.germline[i+1]] <- yy[,idx.germline[i+1]] + log(prior.K) -log(n.cases.germ) yy[,idx.somatic[i+1]] <- yy[,idx.somatic[i+1]] + log(prior.K) - log(n.cases.somatic) } } yy }) # if not enough variants in segment, flag flag <- sum(mapping.bias.ok) < min.variants.segment if (!sum(mapping.bias.ok)) { mapping.bias.ok <- rep(TRUE, length(mapping.bias.ok)) } likelihoodScores <- vapply(yys, function(x) sum(apply(x[mapping.bias.ok,,drop=FALSE], 1, max)),double(1)) best <- which.max(likelihoodScores) # transform and scale likelihoodScores <- exp(likelihoodScores - likelihoodScores[best]) posterior <- likelihoodScores[best]/sum(likelihoodScores, na.rm=TRUE) if (is.na(posterior) || posterior < 0.5) flag <- TRUE list(best=yys[[best]], M=test.num.copy[best], flag=flag, posterior=posterior) } # calculate likelihood scores for all possible minor copy numbers .calcMsSegment <- function(xxi, test.num.copy, opt.C, prior.K, mapping.bias.ok, seg.id, min.variants.segment) { lapply(seq_along(xxi), function(i).calcMsSegmentC(xxi[[i]], test.num.copy, c(test.num.copy, round(opt.C))[i], prior.K, mapping.bias.ok, seg.id, min.variants.segment)) } .calcSNVLLik <- function(vcf, tumor.id.in.vcf, ov, p, test.num.copy, C.likelihood, C, opt.C, median.C, snv.model, prior.somatic, mapping.bias, snv.lr, sampleid = NULL, cont.rate = 0.01, prior.K, max.coverage.vcf, non.clonal.M, model.homozygous=FALSE, error=0.001, max.mapping.bias=0.8, max.pon, min.variants.segment) { .dbeta <- function(x, shape1, shape2, log, size) dbeta(x=x, shape1=shape1, shape2=shape2, log=log) if (snv.model=="betabin") { .dbeta <- function(x, shape1, shape2, log, size) { size <- pmin(size, max.coverage.vcf) dbetabinom.ab(x=round(x*size),shape1=shape1, shape2=shape2, size=size, log=TRUE) } } prior.cont <- ifelse(prior.somatic < 0.1, cont.rate, 0) prior.somatic <- prior.somatic - (prior.cont*prior.somatic) priorHom <- if (model.homozygous) -log(3) else log(0) if (length(prior.somatic) != nrow(vcf)) { .stopRuntimeError("prior.somatic and VCF do not align.") } if (length(mapping.bias$bias) != nrow(vcf)) { .stopRuntimeError("mapping.bias and VCF do not align.") } haploid.penalty <- 0 if (median.C < 1.1) { haploid.penalty <- 2 } subclonal <- apply(C.likelihood[queryHits(ov), ], 1, which.max) == ncol(C.likelihood) seg.idx <- which(seq_len(nrow(C.likelihood)) %in% queryHits(ov)) ar_all <- unlist(geno(vcf)$FA[, tumor.id.in.vcf]) ar_all <- ar_all/mapping.bias$bias ar_all[ar_all > 1] <- 1 dp_all <- unlist(geno(vcf)$DP[, tumor.id.in.vcf]) if (snv.model!="betabin") dp_all[dp_all>max.coverage.vcf] <- max.coverage.vcf # Fit variants in all segments xx <- lapply(seg.idx, function(i) { # classify germline vs somatic idx <- subjectHits(ov)[queryHits(ov) == i] shape1 <- ar_all[idx] * dp_all[idx] + 1 shape2 <- (1 - ar_all[idx]) * dp_all[idx] + 1 mInf_all <- log(double(length(shape1))) list(vcf.ids=idx, likelihoods=lapply(seq(ncol(C.likelihood)), function(k) { Ci <- c(test.num.copy, opt.C[i])[k] priorM <- log(max(Ci,1) + 1 + haploid.penalty) skip <- test.num.copy > Ci | C.likelihood[i, k] <= 0 p.ar <- lapply(c(0, 1), function(g) { cns <- test.num.copy if (!g) cns[1] <- non.clonal.M dbetax <- (p * cns + g * (1-p)) / (p * Ci + 2 * (1-p)) l <- lapply(seq_along(test.num.copy), function(j) { if (skip[j]) return(mInf_all) .dbeta(x = dbetax[j], shape1 = shape1, shape2 = shape2, log = TRUE, size = dp_all[idx]) - priorM }) do.call(cbind,l) }) p.ar.cont.1 <- .dbeta(x = (p * Ci + 2 * (1 - p - cont.rate))/ (p * Ci + 2 * (1 - p)),shape1=shape1, shape2=shape2, log=TRUE, size=dp_all[idx]) - priorM p.ar.cont.2 <- .dbeta(x = cont.rate / (p * Ci + 2 * (1 - p)), shape1 = shape1, shape2 = shape2, log = TRUE, size = dp_all[idx]) - priorM # add prior probabilities for somatic vs germline p.ar[[1]] <- p.ar[[1]] + log(prior.somatic[idx]) p.ar[[2]] <- p.ar[[2]] + log(1 - prior.somatic[idx]) # contamination (either homozygous germline, or germline from # other sample) p.ar[[3]] <- p.ar.cont.1 + log(prior.cont[idx]) p.ar[[4]] <- p.ar.cont.2 + log(prior.cont[idx]) # homozygous state p.ar[[5]] <- dbinom(round((1-ar_all[idx])*dp_all[idx]),size=round(dp_all[idx]), prob=error/3, log=TRUE) + priorHom + log(1-prior.somatic[idx]) do.call(cbind, p.ar) })) }) tmp <- lapply(seq_along(xx),function(i) .calcMsSegment(xx[[i]]$likelihoods, test.num.copy, opt.C[seg.idx[i]], prior.K, mapping.bias.ok=mapping.bias$bias[xx[[i]]$vcf.ids]>=max.mapping.bias, seg.id=seg.idx[i], min.variants.segment)) xx <- lapply(tmp, lapply, function(x) x$best) .extractValues <- function(tmp, field) { segmentValue <- sapply(seq_along(tmp), function(i) tmp[[i]][[min(C[seg.idx[i]], max(test.num.copy))+1]][[field]]) segmentValue <- unlist(sapply(seq_along(seg.idx), function(i) rep(segmentValue[i], sum(seg.idx[i]==queryHits(ov))))) } # Get segment M's for each SNV segment.M <- .extractValues(tmp, "M") segment.Flag <- .extractValues(tmp, "flag") segment.Posterior <- .extractValues(tmp, "posterior") likelihoods <- do.call(rbind, lapply(seq_along(xx), function(i) Reduce("+", lapply(seq(ncol(C.likelihood)), function(j) exp(xx[[i]][[j]]) * C.likelihood[seg.idx[i], j])))) colnames(likelihoods) <- c(paste("SOMATIC.M", test.num.copy, sep = ""), paste("GERMLINE.M", test.num.copy, sep = ""), "GERMLINE.CONTHIGH", "GERMLINE.CONTLOW", "GERMLINE.HOMOZYGOUS") vcf.ids <- do.call(c, lapply(seg.idx, function(i) subjectHits(ov)[queryHits(ov) == i])) rownames(likelihoods) <- vcf.ids # for very high-level amplifications, all posteriors can be 0, so make sure # we get valid values here and flag those later. posteriors <- likelihoods/pmax(rowSums(likelihoods),.Machine$double.xmin) # this just adds a lot of helpful info to the SNV posteriors xx <- .extractMLSNVState(posteriors) posteriors <- cbind( as.data.frame(rowRanges(vcf[vcf.ids]), row.names=NULL)[, 1:3], ID = names(vcf[vcf.ids]), REF = as.character(ref(vcf[vcf.ids])), ALT = sapply(alt(vcf[vcf.ids]), function(x) paste(as.character(x), collapse=";")), posteriors, xx, ML.C = C[queryHits(ov)], ML.M.SEGMENT = segment.M, M.SEGMENT.POSTERIOR = segment.Posterior, M.SEGMENT.FLAGGED = segment.Flag, row.names = NULL ) posteriors$ML.AR <- (p * posteriors$ML.M + ifelse(posteriors$ML.SOMATIC, 0, 1) * (1 - p)) / (p * posteriors$ML.C + 2 * (1 - p)) posteriors$ML.AR[posteriors$ML.AR > 1] <- 1 posteriors$AR <- unlist(geno(vcf[vcf.ids])$FA[, tumor.id.in.vcf]) posteriors$AR.ADJUSTED <- posteriors$AR / mapping.bias$bias[vcf.ids] posteriors$AR.ADJUSTED[posteriors$AR.ADJUSTED>1] <- 1 posteriors$MAPPING.BIAS <- mapping.bias$bias[vcf.ids] # Extract LOH posteriors$ML.LOH <- (posteriors$ML.M == posteriors$ML.C | posteriors$ML.M == 0 | posteriors$ML.C == 1) posteriors$CN.SUBCLONAL <- subclonal posteriors$CELLFRACTION <- (posteriors$AR/posteriors$ML.M) * (p*posteriors$ML.C + 2 * (1-p)) / p posteriors$CELLFRACTION[!posteriors$ML.SOMATIC] <- NA rm.snv.posteriors <- apply(likelihoods, 1, max) idx.ignore <- rm.snv.posteriors == 0 | posteriors$MAPPING.BIAS < max.mapping.bias | posteriors$start != posteriors$end posteriors$FLAGGED <- idx.ignore posteriors$log.ratio <- snv.lr[vcf.ids] posteriors$depth <-as.numeric(geno(vcf[vcf.ids])$DP[, tumor.id.in.vcf]) posteriors$prior.somatic <- prior.somatic[vcf.ids] posteriors$prior.contamination <- prior.cont[vcf.ids] posteriors$on.target <- info(vcf[vcf.ids])$OnTarget posteriors$seg.id <- queryHits(ov) if (!is.null(mapping.bias$pon.count)) { posteriors$pon.count <- mapping.bias$pon.count[vcf.ids] idx.ignore <- idx.ignore | (posteriors$pon.count > max.pon & posteriors$prior.somatic > 0.1) posteriors$FLAGGED <- idx.ignore } # change seqnames to chr colnames(posteriors)[1] <- "chr" ret <- list( llik = sum(log(rm.snv.posteriors[!idx.ignore])) - sum(idx.ignore), likelihoods = likelihoods, posteriors = posteriors, vcf.ids = vcf.ids, posterior.contamination = 0) ret } .extractMLSNVState <- function(snv.posteriors) { l1 <- apply(snv.posteriors, 1, which.max) xx <- do.call(rbind, strsplit(colnames(snv.posteriors)[l1], "\\.")) xx[, 1] <- ifelse(xx[, 1] == "GERMLINE", FALSE, TRUE) xx[, 2] <- gsub("^M", "", xx[, 2]) xx <- as.data.frame(xx, stringsAsFactors = FALSE) xx[, 1] <- as.logical(xx[, 1]) xx[, 2] <- suppressWarnings(as.numeric(xx[, 2])) colnames(xx) <- c("ML.SOMATIC", "ML.M") # set sub-clonal (ML.M=0) to 1 xx$ML.M[xx$ML.SOMATIC & !xx$ML.M] <- 1 xx$POSTERIOR.SOMATIC <- apply(snv.posteriors[, grep("SOMATIC.M", colnames(snv.posteriors))], 1, sum, na.rm=TRUE) xx[, c(1,3,2)] } .checkFraction <- function(x, name) { if (!is.numeric(x) || length(x) !=1 || x < 0 || x > 1) { .stopUserError(name, " not within expected range or format.") } } .checkParameters <- function(test.purity, min.ploidy, max.ploidy, max.non.clonal, max.homozygous.loss, sampleid, prior.K, prior.contamination, prior.purity, iterations, min.gof, model.homozygous, interval.file, log.ratio.calibration, test.num.copy) { if (min(test.purity) <= 0 || max(test.purity) > 0.99) .stopUserError("test.purity not within expected range.") if (min.ploidy <= 0 || max.ploidy <= 2) .stopUserError("min.ploidy or max.ploidy not within expected range.") if (min(test.num.copy) < 0) .stopUserError("test.num.copy not within expected range.") if (min(test.num.copy) > 0 || max(test.num.copy)>8) flog.warn("test.num.copy outside recommended range.") .checkFraction(max.non.clonal, "max.non.clonal") .checkFraction(max.homozygous.loss[1], "max.homozygous.loss") .checkFraction(prior.K, "prior.K") .checkFraction(prior.contamination, "prior.contamination") .checkFraction(min.gof, "min.gof") tmp <- sapply(prior.purity, .checkFraction, "prior.purity") if (!is.null(sampleid) && (!is(sampleid, "character") || length(sampleid) != 1)) { .stopUserError("sampleid not a character string.") } if (abs(1-sum(prior.purity)) > 0.02) { .stopUserError("prior.purity must add to 1. Sum is ", sum(prior.purity)) } if (length(prior.purity) != length(test.purity)) { .stopUserError("prior.purity must have the same length as ", "test.purity.") } if (!is.null(interval.file) && !file.exists(interval.file)) { .stopUserError("interval.file ", interval.file, " not found.") } stopifnot(is.numeric(min.ploidy)) stopifnot(is.numeric(max.ploidy)) stopifnot(is.numeric(test.purity)) stopifnot(is.numeric(iterations)) stopifnot(is.numeric(log.ratio.calibration)) stopifnot(is.logical(model.homozygous)) if (iterations < 10 || iterations > 250) { .stopUserError("Iterations not in the expected range from 10 to 250.") } } .failedNonAberrant <- function(result, cutoffs = c(0.01, 0.005)) { xx <- split(result$seg, result$seg$C) if (length(xx) < 3) return(TRUE) xx.sum <- sort(vapply(xx, function(x) sum(x$size), double(1)), decreasing = TRUE) xx.sum <- xx.sum/sum(xx.sum) if (xx.sum[2] <= cutoffs[1] && xx.sum[3] <= cutoffs[2]) return(TRUE) FALSE } .filterDuplicatedResults <- function(results, purity.cutoff = 0.1) { if (length(results) < 2) return(results) idx.duplicated <- rep(FALSE, length(results)) for (i in seq_len(length(results)-1)) { for (j in seq(i+1,length(results))) { if (abs(results[[i]]$purity - results[[j]]$purity) < purity.cutoff && abs(results[[i]]$ploidy - results[[j]]$ploidy) / results[[i]]$ploidy < 0.1) { idx.duplicated[j] <- TRUE } } } results[!idx.duplicated] } .filterDuplicatedCandidates <- function(candidates) { if (nrow(candidates) < 2) return(candidates) idx.duplicated <- rep(FALSE, nrow(candidates)) for (i in seq_len(nrow(candidates)-1)) { for (j in seq(i+1,nrow(candidates))) { if (abs(candidates$purity[i] - candidates$purity[j]) < 0.1 && abs(candidates$tumor.ploidy[i] - candidates$tumor.ploidy[j]) / candidates$tumor.ploidy[i] < 0.1) { idx.duplicated[j] <- TRUE } } } candidates[!idx.duplicated, ] } .findLocalMinima <- function(m) { loc.min <- matrix(nrow = 0, ncol = 2) for (i in seq_len(nrow(m))) { for (j in seq_len(ncol(m))) { x <- seq(i - 1, i + 1) x <- x[x >= 1 & x <= nrow(m)] y <- seq(j - 1, j + 1) y <- y[y >= 1 & y <= ncol(m)] if (m[i, j] == max(m[x, y]) && !is.infinite(m[i, j])) loc.min <- rbind(loc.min, c(row = i, col = j)) } } loc.min } .appendComment <- function(a, b) { if (is.na(a)) return(b) paste(a, b, sep = ";") } .isRareKaryotype <- function(ploidy) { ploidy > 4.5 || ploidy < 1.5 } .flagResult <- function(result, max.non.clonal = 0.2, min.gof, use.somatic.status, model.homozygous) { result$flag_comment <- NA result$flag <- .failedNonAberrant(result) if (result$flag) { result$flag_comment <- .appendComment(result$flag_comment, "NON-ABERRANT") } if (result$fraction.subclonal > max.non.clonal*0.75) { result$flag <- TRUE result$flag_comment <- .appendComment(result$flag_comment, "POLYGENOMIC") } if (result$fraction.homozygous.loss > 0.01) { result$flag <- TRUE result$flag_comment <- .appendComment(result$flag_comment, "EXCESSIVE LOSSES") } if (.isRareKaryotype(result$ploidy)) { result$flag <- TRUE result$flag_comment <- .appendComment(result$flag_comment, "RARE KARYOTYPE") } if (result$purity < 0.3) { result$flag <- TRUE result$flag_comment <- .appendComment(result$flag_comment, "LOW PURITY") } if (result$purity > 0.9 && !model.homozygous && (!is.null(use.somatic.status) && !use.somatic.status)) { result$flag <- TRUE result$flag_comment <- .appendComment(result$flag_comment, "HIGH PURITY AND model.homozygous=FALSE") } result$GoF <- .getGoF(result) if (!is.null(result$GoF) && result$GoF < min.gof) { result$flag <- TRUE result$flag_comment <- .appendComment(result$flag_comment, paste0("POOR GOF (", round(result$GoF*100,digits=1),"%)")) } fraction.loh <- .getFractionLoh(result) # Assume that everything below 2.6 did not undergo genome duplication, which can # result in lots of LOH if (result$ploidy < 2.6 && fraction.loh > 0.5) { result$flag <- TRUE result$flag_comment <- .appendComment(result$flag_comment, "EXCESSIVE LOH") } return(result) } .getFractionLoh <- function(result) { if (is.null(result$SNV.posterior)) return(0) pp <- result$SNV.posterior$posteriors x1 <- unique(pp$seg.id[pp$ML.M.SEGMENT==0]) sum(result$seg$size[x1])/sum(result$seg$size) } .getGoF <- function(result) { if (is.null(result$SNV.posterior)) return(0) r <- result$SNV.posterior$posteriors e <- (r$ML.AR-r$AR.ADJUSTED)^2 maxDist <- 0.2^2 r2 <- max(1-mean(e,na.rm=TRUE)/maxDist,0) return(r2) } .flagResults <- function(results, max.non.clonal = 0.2, max.logr.sdev, logr.sdev, max.segments, min.gof, flag = NA, flag_comment = NA, dropout=FALSE, use.somatic.status=TRUE, model.homozygous=FALSE) { if (length(results) < 1) return(results) results <- lapply(results, .flagResult, max.non.clonal=max.non.clonal, min.gof=min.gof, use.somatic.status=use.somatic.status, model.homozygous=model.homozygous) number.segments <- nrow(results[[1]]$seg) # some global flags if (logr.sdev > max.logr.sdev) { for (i in seq_along(results)) { results[[i]]$flag <- TRUE results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment, "NOISY LOG-RATIO") } } if (number.segments > max.segments) { for (i in seq_along(results)) { results[[i]]$flag <- TRUE results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment, "NOISY SEGMENTATION") } } if (dropout) { for (i in seq_along(results)) { results[[i]]$flag <- TRUE results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment, "HIGH AT- OR GC-DROPOUT") } } if (!is.na(flag) && flag) { for (i in seq_along(results)) { results[[i]]$flag <- TRUE results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment, flag_comment) } } results } .getGeneCalls <- function(seg.adjusted, tumor, log.ratio, fun.focal, args.focal, chr.hash) { args.focal <- c(list(seg = seg.adjusted), args.focal) focal <- do.call(fun.focal, args.focal) abs.gc <- GRanges(seqnames = .add.chr.name(seg.adjusted$chrom, chr.hash), IRanges(start = seg.adjusted$loc.start, end = seg.adjusted$loc.end)) # that will otherwise mess up the log-ratio means, mins and maxs idx <- which(!is.nan(log.ratio) & is.finite(log.ratio) & tumor$Gene != ".") if (!length(idx)) return(NA) tumor <- tumor[idx] log.ratio <- log.ratio[idx] ov <- findOverlaps(tumor, abs.gc) # use funky data.table to calculate means etc. in two lines of code. dt <- data.table(as.data.frame(tumor[queryHits(ov)]), C = seg.adjusted[subjectHits(ov), "C"], seg.mean = seg.adjusted[subjectHits(ov), "seg.mean"], LR = log.ratio[queryHits(ov)], seg.id = subjectHits(ov), seg.length = seg.adjusted$size[subjectHits(ov)], focal = focal[subjectHits(ov)] ) # some targets have multipe genes assigned? if (sum(grepl(",", dt$Gene))) { dt <- dt[, list(Gene = unlist(strsplit(as.character(Gene), ",", fixed=TRUE))), by = list(seqnames, start, end, C, seg.mean, seg.id, seg.length, LR, focal)] } gene.calls <- data.frame(dt[, list(chr = seqnames[1], start = min(start), end = max(end), C = as.double(median(C)), seg.mean = median(seg.mean), seg.id = seg.id[which.min(seg.length)], .min.segid=min(seg.id), .max.segid=max(seg.id), number.targets = length(start), gene.mean = mean(LR, na.rm = TRUE), gene.min = min(LR, na.rm = TRUE), gene.max = max(LR, na.rm = TRUE), focal = focal[which.min(seg.length)]), by = Gene], row.names = 1) breakpoints <- gene.calls$.max.segid - gene.calls$.min.segid gene.calls$breakpoints <- breakpoints gene.calls } .extractCountMatrix <- function(coverages) { useCounts <- .coverageHasCounts(coverages) if (useCounts) { return(do.call(cbind, lapply(coverages, function(x) x$counts))) } # TODO: remove spring 2018 release flog.info("Coverage file does not contain read count information, %s", "using total coverage for calculating log-ratios.") do.call(cbind, lapply(coverages, function(x) x$coverage)) } .coverageHasCounts <- function(coverages) { for (i in seq_along(coverages)) if (sum(!is.na(coverages[[i]]$counts))==0) return(FALSE) return(TRUE) } .checkSymbolsChromosome <- function(tumor, blank=c("", ".")) { if (is.null(tumor$Gene)) { tumor$Gene <- "." return(tumor) } chrsPerSymbol <- lapply(split(seqnames(tumor), tumor$Gene), unique) nonUniqueSymbols <- names(chrsPerSymbol[sapply(chrsPerSymbol, length)>1]) idx <- tumor$Gene %in% nonUniqueSymbols idxBlank <- tumor$Gene %in% blank tumor$Gene <- as.character(tumor$Gene) tumor$Gene[idx] <- paste(tumor$Gene, tumor$chr, sep=".")[idx] tumor$Gene[idxBlank] <- "." tumor } .get2DPurityGrid <- function(test.purity, by=1/30) { startPurity <- max(0.1, min(test.purity)) endPurity <- min(0.99, max(test.purity)) grid <- seq(startPurity, endPurity, by=by) if (startPurity < 0.34 && endPurity > 0.35) { grid <- c(seq(startPurity, 0.34, by=1/50), seq(0.35, endPurity, by=by)) } grid } .optimizeGrid <- function(test.purity, min.ploidy, max.ploidy, test.num.copy = 0:7, exon.lrs, seg, sd.seg, li, max.exon.ratio, max.non.clonal, BPPARAM) { ploidy.grid <- seq(min.ploidy, max.ploidy, by = 0.2) if (min.ploidy < 1.8 && max.ploidy > 2.2) { ploidy.grid <- c(seq(min.ploidy, 1.8, by = 0.2), 1.9, 2, 2.1, seq(2.2, max.ploidy, by = 0.2)) } .optimizeGridPurity <- function(p) { b <- 2 * (1 - p) log.ratio.offset <- 0 lapply(ploidy.grid, function(D) { dt <- p/D llik.all <- lapply(seq_along(exon.lrs), function(i) .calcLlikSegmentExonLrs(exon.lrs[[i]], log.ratio.offset, max.exon.ratio, sd.seg, dt, b, D, test.num.copy)) subclonal <- vapply(llik.all, which.max, double(1)) == 1 subclonal.f <- length(unlist(exon.lrs[subclonal]))/length(unlist(exon.lrs)) if (subclonal.f > max.non.clonal) return(-Inf) sum(vapply(llik.all, max, double(1))) }) } if (is.null(BPPARAM)) { mm <- lapply(test.purity, .optimizeGridPurity) } else { mm <- BiocParallel::bplapply(test.purity, .optimizeGridPurity, BPPARAM = BPPARAM) } mm <- sapply(mm, function(x) unlist(x)) colnames(mm) <- test.purity rownames(mm) <- ploidy.grid if (!sum(as.vector(is.finite(mm)))) { .stopUserError("Cannot find valid purity/ploidy solution. ", "This happens when input segmentations are garbage, most likely ", "due to a catastrophic sample QC failure. Re-check standard QC ", "metrics for this sample.") } ai <- .findLocalMinima(mm) candidates <- data.frame(ploidy = as.numeric(rownames(mm)[ai[, 1]]), purity = as.numeric(colnames(mm)[ai[, 2]]), llik = mm[ai]) candidates$tumor.ploidy <- (candidates$ploidy - 2 * (1 - candidates$purity))/candidates$purity # add diploid candidate solutions in the following purity grid in # case there are none. grid <- seq(0,1,by=1/4) for (i in seq_along(grid)[-length(grid)]) { t1 <- which.min(abs(as.numeric(colnames(mm)) - grid[i])) t2 <- which.min(abs(as.numeric(colnames(mm)) - grid[i+1])) if (t2-t1 < 2) next if (sum(candidates$purity>grid[i] & candidates$purity< grid[i+1], na.rm=TRUE) < 1) next # Nothing close to diplod in this range? Then add. if (min(abs(2 - candidates$tumor.ploidy[candidates$purity>grid[i] & candidates$purity< grid[i+1]])) > 0.3) { mm.05 <- mm[, seq(t1+1, t2), drop=FALSE] # Find row most similar to normal diploid diploidRowId <- which.min(abs(2-as.numeric(row.names(mm.05)))) # assert that rownames are still what they should be if (diploidRowId != which.min(abs(2-ploidy.grid))) { .stopRuntimeError("Cannot find diploid row in grid search.") } candidates <- rbind(candidates, c(2, as.numeric(names(which.max(mm.05[diploidRowId, ]))), max(mm.05[diploidRowId, ]), 2)) # Remove again if too similar with existing candidate if (nrow(candidates) > 2 && abs(Reduce("-",tail(candidates$ploidy,2))) < 0.001 && abs(Reduce("-",tail(candidates$purity,2))) < 0.1) { candidates <- candidates[- (nrow(candidates) - 2 + which.min(tail(candidates$llik,2))),] } } } candidates <- candidates[candidates$tumor.ploidy >= 0.5, ] candidates <- .filterDuplicatedCandidates(candidates) list(all = mm, candidates = candidates[order(candidates$llik, decreasing = TRUE), ]) } .calcLlikSegmentExonLrs <- function(exon.lrs, log.ratio.offset, max.exon.ratio, sd.seg, dt, b, D, test.num.copy) { c(.calcLlikSegmentSubClonal(exon.lrs + log.ratio.offset, max.exon.ratio), vapply(test.num.copy, function(Ci) sum(dnorm(exon.lrs + log.ratio.offset, mean = log2(dt * Ci + b/D), sd = sd.seg, log = TRUE)),double(1))) } # This function is used to punish more complex copy number models # a little bit. Based on the BIC. This just counts the number of utilized # copy number states, excluding normal 2. Then multiplies by # log(number exons) .calcComplexityCopyNumber <- function(results) { if (length(results) < 2) return(0) cs <- sapply((0:7)[-3], function(i) sapply(results, function(y) sum(y$seg$size[y$seg$C == i])/sum(y$seg$size))) complexity <- apply(cs,1, function(z) sum(z>0.001)) n <- sum(results[[1]]$seg$num.mark, na.rm=TRUE) -complexity*log(n) } .rankResults <- function(results) { if (length(results) < 1) return(results) complexity <- .calcComplexityCopyNumber(results) for (i in seq_along(results)) { if (is.null(results[[i]]$SNV.posterior)) { results[[i]]$total.log.likelihood <- results[[i]]$log.likelihood } else { results[[i]]$total.log.likelihood <- results[[i]]$log.likelihood/2 + results[[i]]$SNV.posterior$llik } results[[i]]$total.log.likelihood <- results[[i]]$total.log.likelihood + complexity[i] } idx.opt <- order(sapply(results, function(z) z$total.log.likelihood), decreasing = TRUE) results <- results[idx.opt] # remove solutions with -inf likelihood score results[!is.infinite(sapply(results, function(z) z$total.log.likelihood))] } .sampleOffsetFast <- function(test.num.copy, seg, exon.lrs, sd.seg, p, C, total.ploidy, max.exon.ratio, simulated.annealing, log.ratio.calibration) { # Gibbs sample offset test.offset <- seq(sd.seg * -log.ratio.calibration, sd.seg * log.ratio.calibration, by = 0.01) test.offset <- seq(p * -log.ratio.calibration, p * log.ratio.calibration * 0.2, length.out=12) seg.ids.by.chr <- list(seq_len(nrow(seg))) lr <- lapply(seq_along(seg.ids.by.chr), function(j) { px.offset <- lapply(test.offset, function(px) vapply(seg.ids.by.chr[[j]], function(i) { b <- 2 * (1 - p) D <- total.ploidy dt <- p/D llik.all <- .calcLlikSegmentExonLrs(exon.lrs[[i]], px, max.exon.ratio, sd.seg, dt, b, D, test.num.copy) vapply(llik.all, max, double(1)) }, double(1+length(test.num.copy)))) px.offset.s <- vapply(lapply(px.offset, apply, 2, max), sum, double(1)) px.offset.s <- exp(px.offset.s - max(px.offset.s)) log.ratio.offset <- test.offset[min(which(runif(n = 1, min = 0, max = sum(px.offset.s)) <= cumsum(px.offset.s)))] }) do.call(c, lapply(seq_along(lr), function(i) rep(lr[[i]], length(seg.ids.by.chr[[i]])))) } .sampleError <- function(subclonal, seg, exon.lrs, sd.seg, p, C, total.ploidy, max.exon.ratio, simulated.annealing, iter, log.ratio.calibration, log.ratio.offset) { # Gibbs sample error test.error <- seq(sd.seg, sd.seg + sd.seg * 0.2, length.out = 5) seg.ids <- seq_len(nrow(seg)) px.error <- lapply(test.error, function(error) vapply(seg.ids, function(i) .calcLlikSegment(subclonal[i], exon.lrs[[i]] + log.ratio.offset[i], error, p, C[i], total.ploidy, max.exon.ratio), double(1)) ) px.error.s <- sapply(px.error, sum, na.rm = TRUE) if (simulated.annealing) px.error.s <- px.error.s * exp(iter/4) px.error.s <- exp(px.error.s - max(px.error.s)) error <- test.error[min(which(runif(n = 1, min = 0, max = sum(px.error.s)) <= cumsum(px.error.s)))] } .sampleOffset <- function(subclonal, seg, exon.lrs, sd.seg, p, C, total.ploidy, max.exon.ratio, simulated.annealing, iter, log.ratio.calibration = 0.25) { # Gibbs sample offset test.offset <- seq(sd.seg * -log.ratio.calibration, sd.seg * log.ratio.calibration, by = 0.01) test.offset <- seq(p * -log.ratio.calibration, p * log.ratio.calibration * 0.2, length.out=12) seg.ids.by.chr <- list(seq_len(nrow(seg))) lr <- lapply(seq_along(seg.ids.by.chr), function(j) { px.offset <- lapply(test.offset, function(px) vapply(seg.ids.by.chr[[j]], function(i) .calcLlikSegment(subclonal[i], exon.lrs[[i]] + px, sd.seg, p, C[i], total.ploidy, max.exon.ratio), double(1)) ) px.offset.s <- sapply(px.offset, sum, na.rm = TRUE) if (simulated.annealing) px.offset.s <- px.offset.s * exp(iter/4) px.offset.s <- exp(px.offset.s - max(px.offset.s)) log.ratio.offset <- test.offset[min(which(runif(n = 1, min = 0, max = sum(px.offset.s)) <= cumsum(px.offset.s)))] }) do.call(c, lapply(seq_along(lr), function(i) rep(lr[[i]], length(seg.ids.by.chr[[i]])))) } .removeOutliers <- function(x, na.rm=TRUE,...) { if (length(x) < 5) return(x) qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, ...) H <- 1.5 * IQR(x, na.rm = na.rm) if (is.na(H) || H < 0.001) return(x) # find points outside the 'boxplot' range idx <- x <= (qnt[1] - H) | x >= (qnt[2] + H) x[!idx] } .calcPuritySomaticVariants <- function(vcf, prior.somatic, tumor.id.in.vcf) { median(unlist(geno(vcf[prior.somatic > 0.5])$FA[, tumor.id.in.vcf]), na.rm = TRUE)/0.48 } .loadSegFile <- function(seg.file, sampleid, model.homozygous=FALSE, verbose=TRUE) { if (is.null(seg.file)) return(NULL) seg <- read.delim(seg.file) if (verbose) flog.info("Loaded provided segmentation file %s.", basename(seg.file)) .checkSeg(seg, sampleid, model.homozygous, verbose) } .checkSeg <- function(seg, sampleid, model.homozygous, verbose=TRUE) { required.colnames <- c("ID", "chrom", "loc.start", "loc.end", "num.mark", "seg.mean") required.colnames2 <- c("ID", "chromosome", "start", "end", "num_probes", "mean") if (ncol(seg) > length(required.colnames)) { seg <- seg[,seq_along(required.colnames)] } if (identical(colnames(seg), required.colnames2)) { colnames(seg) <- required.colnames } if (!identical(as.character(as.numeric(seg$chrom)), as.character(seg$chrom))) { flog.warn("Expecting numeric chromosome names in seg.file, assuming file is properly sorted.") seg$chrom <- .strip.chr.name(seg$chrom, .getChrHash(seg$chrom)) } # The smallest possible log-ratio is about 8 # for 0.99 purity and high ploidy. # remove artifacts with lower log-ratio if (!model.homozygous && min(seg$seg.mean, na.rm=TRUE) < -8) { nBefore <- nrow(seg) seg <- seg[which(seg$seg.mean >= -8 | seg$num.mark >= 4),] if (verbose) flog.warn("Removing %i short segments with log-ratio < -8.", nBefore-nrow(seg)) } if (!identical(colnames(seg), required.colnames)) { .stopUserError(paste("Segmentation file expected with colnames", paste(required.colnames, collapse = ", "))) } segs <- split(seg, seg$ID) matchedSeg <- match(make.names(sampleid), make.names(names(segs))) if (length(segs)==1) { if (!is.null(sampleid) && is.na(matchedSeg)) { flog.warn("Provided sampleid (%s) does not match %s found in %s", sampleid, names(segs)[1], "segmentation.") } matchedSeg <- 1 } else if (is.null(sampleid)) { .stopUserError("seg.file contains multiple samples and sampleid missing.") } else if (is.na(matchedSeg)) { .stopUserError("seg.file contains multiple samples and sampleid does not match any.") } else { seg <- segs[[matchedSeg]] } seg$seg.mean <- seg$seg.mean - weighted.mean(seg$seg.mean, seg$num.mark, na.rm = TRUE) seg } .createFakeLogRatios <- function(tumor, seg.file, sampleid, chr.hash, model.homozygous=FALSE, max.logr.sdev) { if (!is.null(tumor$log.ratio)) { # calculate log.ratio sd in chunks of size 25 to estimate the # segmented sd .robustSd <- function(d, size = 25) median( sapply(split(d, ceiling(seq_along(d) / size)), sd, na.rm = TRUE), na.rm = TRUE) if (.robustSd(tumor$log.ratio) < max.logr.sdev) { flog.info("Found log2-ratio in tumor coverage data.") return(tumor$log.ratio) } else { flog.info("Provided log2-ratio looks too noisy, using segmentation only.") } } if (is(seg.file, "character")) { seg <- .loadSegFile(seg.file, sampleid, model.homozygous, verbose=FALSE) } else { seg <-.checkSeg(seg.file, sampleid, model.homozygous, verbose=FALSE) } seg.gr <- GRanges(seqnames = .add.chr.name(seg$chrom, chr.hash), IRanges(start = round(seg$loc.start), end = seg$loc.end)) ov <- findOverlaps(tumor, seg.gr) log.ratio <- seg$seg.mean[subjectHits(ov)] # sanity check, so that every exon has exactly one segment log-ratio log.ratio <- log.ratio[match(seq_len(length(tumor)), queryHits(ov))] mean.log.ratio <- mean(subset(log.ratio, !is.infinite(log.ratio)), na.rm = TRUE) # calibrate log.ratio <- log.ratio - mean.log.ratio log.ratio } .strip.chr.name <- function(ls, chr.hash) { x <- chr.hash[as.character(ls), 2] x[is.na(x)] <- as.numeric(ls[is.na(x)]) x } .add.chr.name <- function(ls, chr.hash) { x <- as.character(chr.hash$chr[match(ls, chr.hash$number)]) x[is.na(x)] <- ls[is.na(x)] x } .getChrHash <- function(ls) { ls <- unique(ls) chr.hash <- genomeStyles(species="Homo_sapiens") chr.hash <- chr.hash[!chr.hash$circular,] id <- which.max(apply(chr.hash[,-(1:3)],2,function(x) sum(ls %in%x)))+3 chr.hash <- data.frame(chr=chr.hash[,id], number=seq_len(nrow(chr.hash)), row.names=chr.hash[,id]) if (sum(!ls %in% chr.hash[,1]) == 0) return(chr.hash) data.frame(chr=as.factor(ls), number=seq_along(ls), row.names=ls) } .stopUserError <- function(...) { msg <- paste(c(...), collapse="") msg <- paste0(msg, "\n\nThis is most likely a user error due to", " invalid input data or parameters (PureCN ", packageVersion("PureCN"), ").") flog.fatal(paste(strwrap(msg),"\n")) stop(paste(strwrap(msg),"\n"), call.= FALSE) } .stopRuntimeError <- function(...) { msg <- paste(c(...), collapse="") msg <- paste0(msg, "\n\nThis runtime error might be caused by", " invalid input data or parameters. Please report bug (PureCN ", packageVersion("PureCN"), ").") flog.fatal(paste(strwrap(msg),"\n")) stop(paste(strwrap(msg),"\n"), call.= FALSE) } .logHeader <- function(l) { flog.info(strrep("-", 60)) flog.info("PureCN %s", as.character(packageVersion("PureCN"))) flog.info(strrep("-", 60)) # which arguments are printable in a single line? idxSupported <- sapply(l, function(x) class(eval.parent(x))) %in% c("character", "numeric", "NULL", "list", "logical") & sapply(l, function(x) object.size(eval.parent(x))) < 1024 idxSmall <- sapply(l[idxSupported], function(x) length(unlist(eval.parent(x)))) < 12 idxSupported[idxSupported] <- idxSmall l <- c(l[idxSupported],lapply(l[!idxSupported], function(x) "<data>")) argsStrings <- paste(sapply(seq_along(l), function(i) paste0("-", names(l)[i], " ", paste(eval.parent(l[[i]]),collapse=","))), collapse=" ") flog.info("Arguments: %s", argsStrings) } .logFooter <- function() { flog.info("Done.") flog.info(strrep("-", 60)) } .calcGCmetric <- function(gc_bias, coverage, on.target) { idx <- which(coverage$on.target==on.target) if (!length(idx)) return(NA) gcbins <- split(coverage[idx]$average.coverage, gc_bias[idx] < 0.5) mean(gcbins[[1]], na.rm=TRUE) / mean(gcbins[[2]], na.rm=TRUE) } .checkGCBias <- function(normal, tumor, max.dropout, on.target=TRUE) { gcMetricNormal <- .calcGCmetric(tumor$gc_bias, normal, on.target) gcMetricTumor <- .calcGCmetric(tumor$gc_bias, tumor, on.target) if (is.na(gcMetricTumor)) return(FALSE) flog.info("AT/GC dropout%s: %.2f (tumor), %.2f (normal). ", ifelse(on.target,""," (off-target regions)"), gcMetricTumor, gcMetricNormal) if (gcMetricNormal < max.dropout[1] || gcMetricNormal > max.dropout[2] || gcMetricTumor < max.dropout[1] || gcMetricTumor > max.dropout[2]) { flog.warn("High GC-bias in normal or tumor. Is data GC-normalized?") return(TRUE) } return(FALSE) } .gcGeneToCoverage <- function(interval.file, min.coverage) { gc.data <- readCoverageFile(interval.file) gc.data$average.coverage <- min.coverage gc.data$coverage <- min.coverage * width(gc.data) gc.data } .getCentromerePositions <- function(centromeres, genome, style=NULL) { if (is.null(centromeres)) { data(centromeres, envir = environment()) if (genome %in% names(centromeres)) { centromeres <- centromeres[[genome]] if (!is.null(style)) seqlevelsStyle(centromeres) <- style[1] } else { centromeres <- NULL } } centromeres } .checkArgs <- function(args, fname) { dups <- duplicated(names(args)) if (sum(dups)) { args <- args[!dups] flog.warn("Duplicated arguments in %s", fname) } args } .calcFractionBalanced <- function(p) { sum(p$ML.C - p$ML.M.SEGMENT == p$ML.M.SEGMENT, na.rm=TRUE)/nrow(p) } # function to adjust log-ratios to segment mean .postprocessLogRatios <- function(exon.lrs, seg.mean) { exon.lrs <- lapply(seq_along(exon.lrs), function(i) { if (length(exon.lrs[[i]]) > 3) return(exon.lrs[[i]]) exon.lrs[[i]] - mean(exon.lrs[[i]]) + seg.mean[i] }) return(exon.lrs) } .estimateContamination <- function(pp, max.mapping.bias=NULL, min.fraction.chromosomes=0.8) { if (is.null(max.mapping.bias)) max.mapping.bias=0 idx <- pp$GERMLINE.CONTHIGH + pp$GERMLINE.CONTLOW > 0.5 & pp$MAPPING.BIAS >= max.mapping.bias if (!length(which(idx))) return(0) df <- data.frame( chr=pp$chr[idx], AR=sapply(pp$AR.ADJUSTED[idx], function(x) ifelse(x>0.5, 1-x,x)), HIGHLOW=ifelse(pp$GERMLINE.CONTHIGH>pp$GERMLINE.CONTLOW, "HIGH", "LOW")[idx] ) # take the chromosome median and then average. the low count # might be biased in case contamination rate is < AR cutoff estimatedRate <- weighted.mean( sapply(split(df$AR, df$HIGHLOW), median), sapply(split(df$AR, df$HIGHLOW), length) ) fractionChrs <- sum(unique(pp$chr) %in% df$chr)/length(unique(pp$chr)) estimatedRate <- if (fractionChrs >= min.fraction.chromosomes) estimatedRate else 0 estimatedRate }