... | ... |
@@ -2,8 +2,8 @@ Package: PureCN |
2 | 2 |
Type: Package |
3 | 3 |
Title: Copy number calling and SNV classification using |
4 | 4 |
targeted short read sequencing |
5 |
-Version: 1.23.18 |
|
6 |
-Date: 2021-08-20 |
|
5 |
+Version: 1.23.19 |
|
6 |
+Date: 2021-08-21 |
|
7 | 7 |
Authors@R: c(person("Markus", "Riester", |
8 | 8 |
role = c("aut", "cre"), |
9 | 9 |
email = "[email protected]", |
... | ... |
@@ -43,6 +43,7 @@ Imports: |
43 | 43 |
VGAM, |
44 | 44 |
tools, |
45 | 45 |
methods, |
46 |
+ mclust, |
|
46 | 47 |
rhdf5, |
47 | 48 |
Matrix |
48 | 49 |
Suggests: |
... | ... |
@@ -139,6 +139,9 @@ importFrom(graphics,strwidth) |
139 | 139 |
importFrom(graphics,symbols) |
140 | 140 |
importFrom(graphics,text) |
141 | 141 |
importFrom(gridExtra,grid.arrange) |
142 |
+importFrom(mclust,Mclust) |
|
143 |
+importFrom(mclust,emControl) |
|
144 |
+importFrom(mclust,mclustBIC) |
|
142 | 145 |
importFrom(methods,is) |
143 | 146 |
importFrom(rhdf5,H5Fopen) |
144 | 147 |
importFrom(rtracklayer,import) |
... | ... |
@@ -5,6 +5,9 @@ NEW FEATURES |
5 | 5 |
|
6 | 6 |
o Report median absolute pairwise difference (MAPD) of tumor vs normal log2 |
7 | 7 |
ratios in runAbsoluteCN |
8 |
+ o Improved mapping bias estimates: variants with insufficient information |
|
9 |
+ for position-specific fits (default 3-6 heterozygous variants) |
|
10 |
+ are clustered and assigned to the most similar fit |
|
8 | 11 |
|
9 | 12 |
SIGNIFICANT USER-VISIBLE CHANGES |
10 | 13 |
|
... | ... |
@@ -25,6 +28,7 @@ SIGNIFICANT USER-VISIBLE CHANGES |
25 | 28 |
finding noisy samples or batches in normal databases |
26 | 29 |
o Do not error out readCurationFile when CSV is missing and directory |
27 | 30 |
is not writable when re-generating it (#196) |
31 |
+ o Add segmentation parameters as attributes to segmentation data.frame |
|
28 | 32 |
|
29 | 33 |
BUGFIXES |
30 | 34 |
|
... | ... |
@@ -10,9 +10,12 @@ |
10 | 10 |
#' @param min.normals Minimum number of normals with heterozygous SNP for |
11 | 11 |
#' calculating position-specific mapping bias. |
12 | 12 |
#' @param min.normals.betafit Minimum number of normals with heterozygous SNP |
13 |
-#' fitting a beta distribution |
|
13 |
+#' fitting a beta binomial distribution |
|
14 |
+#' @param min.normals.assign.betafit Minimum number of normals with |
|
15 |
+#' heterozygous SNPs to assign to a beta binomal fit cluster |
|
14 | 16 |
#' @param min.median.coverage.betafit Minimum median coverage of normals with |
15 | 17 |
#' heterozygous SNP for fitting a beta distribution |
18 |
+#' @param num.betafit.clusters Maximum number of beta binomial fit clusters |
|
16 | 19 |
#' @param yieldSize See \code{TabixFile} |
17 | 20 |
#' @param genome See \code{readVcf} |
18 | 21 |
#' @return A \code{GRanges} object with mapping bias and number of normal |
... | ... |
@@ -28,12 +31,15 @@ |
28 | 31 |
#' @importFrom GenomicRanges GRangesList |
29 | 32 |
#' @importFrom VGAM vglm Coef betabinomial dbetabinom |
30 | 33 |
#' @importFrom data.table rbindlist |
34 |
+#' @importFrom mclust Mclust mclustBIC emControl |
|
31 | 35 |
#' @export calculateMappingBiasVcf |
32 | 36 |
calculateMappingBiasVcf <- function(normal.panel.vcf.file, |
33 | 37 |
min.normals = 1, |
34 | 38 |
min.normals.betafit = 7, |
39 |
+ min.normals.assign.betafit = 3, |
|
35 | 40 |
min.median.coverage.betafit = 5, |
36 |
- yieldSize = 5000, genome) { |
|
41 |
+ num.betafit.clusters = 9, |
|
42 |
+ yieldSize = 50000, genome) { |
|
37 | 43 |
tab <- TabixFile(normal.panel.vcf.file, yieldSize = yieldSize) |
38 | 44 |
open(tab) |
39 | 45 |
param <- ScanVcfParam(geno = c("AD"), fixed = "ALT", info = NA) |
... | ... |
@@ -48,7 +54,10 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, |
48 | 54 |
mappingBias <- .calculateMappingBias(nvcf = vcf_yield, |
49 | 55 |
min.normals = min.normals, |
50 | 56 |
min.normals.betafit = min.normals.betafit, |
51 |
- min.median.coverage.betafit = min.median.coverage.betafit) |
|
57 |
+ min.normals.assign.betafit = min.normals.assign.betafit, |
|
58 |
+ min.median.coverage.betafit = min.median.coverage.betafit, |
|
59 |
+ num.betafit.clusters = num.betafit.clusters |
|
60 |
+ ) |
|
52 | 61 |
if (length(ret)) { |
53 | 62 |
ret <- append(ret, GRangesList(mappingBias)) |
54 | 63 |
} else { |
... | ... |
@@ -61,7 +70,9 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, |
61 | 70 |
attr(bias, "normal.panel.vcf.file") <- normal.panel.vcf.file |
62 | 71 |
attr(bias, "min.normals") <- min.normals |
63 | 72 |
attr(bias, "min.normals.betafit") <- min.normals.betafit |
73 |
+ attr(bias, "min.normals.assign.betafit") <- min.normals.assign.betafit |
|
64 | 74 |
attr(bias, "min.median.coverage.betafit") <- min.median.coverage.betafit |
75 |
+ attr(bias, "num.betafit.clusters") <- num.betafit.clusters |
|
65 | 76 |
attr(bias, "genome") <- genome |
66 | 77 |
bias |
67 | 78 |
} |
... | ... |
@@ -78,8 +89,11 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, |
78 | 89 |
#' calculating position-specific mapping bias. |
79 | 90 |
#' @param min.normals.betafit Minimum number of normals with heterozygous SNP |
80 | 91 |
#' fitting a beta distribution |
92 |
+#' @param min.normals.assign.betafit Minimum number of normals with |
|
93 |
+#' heterozygous SNPs to assign to a beta binomal fit cluster |
|
81 | 94 |
#' @param min.median.coverage.betafit Minimum median coverage of normals with |
82 | 95 |
#' heterozygous SNP for fitting a beta distribution |
96 |
+#' @param num.betafit.clusters Maximum number of beta binomial fit clusters |
|
83 | 97 |
#' @param AF.info.field Field in the \code{workspace} that stores the allelic |
84 | 98 |
#' fraction |
85 | 99 |
#' @return A \code{GRanges} object with mapping bias and number of normal |
... | ... |
@@ -104,7 +118,9 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, |
104 | 118 |
calculateMappingBiasGatk4 <- function(workspace, reference.genome, |
105 | 119 |
min.normals = 1, |
106 | 120 |
min.normals.betafit = 7, |
121 |
+ min.normals.assign.betafit = 3, |
|
107 | 122 |
min.median.coverage.betafit = 5, |
123 |
+ num.betafit.clusters = 9, |
|
108 | 124 |
AF.info.field = "AF") { |
109 | 125 |
|
110 | 126 |
if (!requireNamespace("genomicsdb", quietly = TRUE) || |
... | ... |
@@ -160,7 +176,9 @@ calculateMappingBiasGatk4 <- function(workspace, reference.genome, |
160 | 176 |
alt = m_alt, ref = m_ref, gr = gr, |
161 | 177 |
min.normals = min.normals, |
162 | 178 |
min.normals.betafit = min.normals.betafit, |
179 |
+ min.normals.assign.betafit = min.normals.assign.betafit, |
|
163 | 180 |
min.median.coverage.betafit = min.median.coverage.betafit, |
181 |
+ num.betafit.clusters = num.betafit.clusters, |
|
164 | 182 |
verbose = TRUE |
165 | 183 |
) |
166 | 184 |
attr(bias, "workspace") <- workspace |
... | ... |
@@ -187,7 +205,9 @@ calculateMappingBiasGatk4 <- function(workspace, reference.genome, |
187 | 205 |
|
188 | 206 |
.calculateMappingBias <- function(nvcf, alt = NULL, ref = NULL, gr = NULL, |
189 | 207 |
min.normals, min.normals.betafit = 7, |
208 |
+ min.normals.assign.betafit = 3, |
|
190 | 209 |
min.median.coverage.betafit = 5, |
210 |
+ num.betafit.clusters = 9, |
|
191 | 211 |
verbose = FALSE) { |
192 | 212 |
if (!is.null(nvcf)) { |
193 | 213 |
if (ncol(nvcf) < 2) { |
... | ... |
@@ -240,6 +260,7 @@ calculateMappingBiasGatk4 <- function(workspace, reference.genome, |
240 | 260 |
gr$pon.count <- ponCntHits |
241 | 261 |
gr$mu <- x[5, ] |
242 | 262 |
gr$rho <- x[6, ] |
263 |
+ gr <- .clusterFa(gr, fa, alt, ref, min.normals.assign.betafit, num.betafit.clusters) |
|
243 | 264 |
gr <- gr[order(gr$pon.count, decreasing = TRUE)] |
244 | 265 |
gr <- sort(gr) |
245 | 266 |
gr$triallelic <- FALSE |
... | ... |
@@ -290,3 +311,27 @@ calculateMappingBiasGatk4 <- function(workspace, reference.genome, |
290 | 311 |
# get the alt allelic fraction for all SNPs |
291 | 312 |
apply(x, 2, function(y) y[2] / sum(head(y, 2))) |
292 | 313 |
} |
314 |
+ |
|
315 |
+.clusterFa <- function(gr, fa, alt, ref, min.normals.assign.betafit = 3, num.betafit.clusters = 9) { |
|
316 |
+ idx <- !is.na(gr$mu) |
|
317 |
+ if (sum(idx) < num.betafit.clusters) return(gr) |
|
318 |
+ flog.info("Clustering beta binomial fits...") |
|
319 |
+ fit <- Mclust(mcols(gr)[idx, c("mu", "rho")], G = seq_len(num.betafit.clusters)) |
|
320 |
+ x <- sapply(seq_len(nrow(fa)), function(i) { |
|
321 |
+ idx <- !is.na(fa[i, ]) & fa[i, ] > 0.05 & fa[i, ] < 0.9 |
|
322 |
+ if (!sum(idx) >= min.normals.assign.betafit) return(NA) |
|
323 |
+ ll <- apply(fit$parameters$mean, 2, function(m) { |
|
324 |
+ sum(dbetabinom(x = alt[i, idx], prob = m[1], |
|
325 |
+ size = alt[i, idx] + ref[i, idx], |
|
326 |
+ rho = m[2], log = TRUE)) |
|
327 |
+ }) |
|
328 |
+ if (is.infinite(max(ll))) return(NA) |
|
329 |
+ which.max(ll) |
|
330 |
+ }) |
|
331 |
+ idxa <- is.na(gr$mu) & !is.na(x) |
|
332 |
+ flog.info("Assigning (%i/%i) variants a clustered beta binomal fit.", |
|
333 |
+ sum(idxa), length(gr)) |
|
334 |
+ gr$mu[idxa] <- fit$parameters$mean[1,x[idxa]] |
|
335 |
+ gr$rho[idxa] <- fit$parameters$mean[2,x[idxa]] |
|
336 |
+ return(gr) |
|
337 |
+} |
... | ... |
@@ -146,6 +146,14 @@ segmentationPSCBS <- function(normal, tumor, log.ratio, seg, plot.cnv, |
146 | 146 |
min.width = 3,alphaTCN = alpha, ...) |
147 | 147 |
try(flog.debug("Kappa: %f", PSCBS::estimateKappa(segPSCBS)), silent = TRUE) |
148 | 148 |
seg <- .PSCBSoutput2DNAcopy(segPSCBS, sampleid) |
149 |
+ if (flog.threshold() == "DEBUG") { |
|
150 |
+ flog.debug("Re-running segmentation without undo.SD...") |
|
151 |
+ segusd <- PSCBS::segmentByNonPairedPSCBS(input, tauA = tauA, |
|
152 |
+ flavor = flavor, undoTCN = 0, knownSegments = knownSegments, |
|
153 |
+ min.width = 3,alphaTCN = alpha, ...) |
|
154 |
+ segusd <- .PSCBSoutput2DNAcopy(segusd, sampleid) |
|
155 |
+ attr(seg, "Debug.undo.SD") <- segusd |
|
156 |
+ } |
|
149 | 157 |
if (undo.SD <= 0 || is.null(max.segments) || nrow(seg) < max.segments) break |
150 | 158 |
flog.info("Found %i segments, exceeding max.segments threshold of %i.", |
151 | 159 |
nrow(seg), max.segments) |
... | ... |
@@ -162,6 +170,17 @@ segmentationPSCBS <- function(normal, tumor, log.ratio, seg, plot.cnv, |
162 | 170 |
} |
163 | 171 |
seg <- .addAverageWeights(seg, weight.flag.pvalue, tumor, chr.hash) |
164 | 172 |
seg <- .fixBreakpointsInBaits(tumor, log.ratio, seg, chr.hash) |
173 |
+ attr(seg, "PSCBS.Args") <- list( |
|
174 |
+ alpha = alpha, |
|
175 |
+ boost.on.target.max.size = boost.on.target.max.size, |
|
176 |
+ flavor = flavor, |
|
177 |
+ iterations = try.again + 1, |
|
178 |
+ min.logr.sdev = min.logr.sdev, |
|
179 |
+ prune.hclust.h = prune.hclust.h, |
|
180 |
+ prune.hclust.method = prune.hclust.method, |
|
181 |
+ tauA = tauA, |
|
182 |
+ undo.SD = undo.SD |
|
183 |
+ ) |
|
165 | 184 |
seg |
166 | 185 |
} |
167 | 186 |
|
... | ... |
@@ -9,7 +9,9 @@ calculateMappingBiasGatk4( |
9 | 9 |
reference.genome, |
10 | 10 |
min.normals = 1, |
11 | 11 |
min.normals.betafit = 7, |
12 |
+ min.normals.assign.betafit = 3, |
|
12 | 13 |
min.median.coverage.betafit = 5, |
14 |
+ num.betafit.clusters = 9, |
|
13 | 15 |
AF.info.field = "AF" |
14 | 16 |
) |
15 | 17 |
} |
... | ... |
@@ -24,9 +26,14 @@ calculating position-specific mapping bias.} |
24 | 26 |
\item{min.normals.betafit}{Minimum number of normals with heterozygous SNP |
25 | 27 |
fitting a beta distribution} |
26 | 28 |
|
29 |
+\item{min.normals.assign.betafit}{Minimum number of normals with |
|
30 |
+heterozygous SNPs to assign to a beta binomal fit cluster} |
|
31 |
+ |
|
27 | 32 |
\item{min.median.coverage.betafit}{Minimum median coverage of normals with |
28 | 33 |
heterozygous SNP for fitting a beta distribution} |
29 | 34 |
|
35 |
+\item{num.betafit.clusters}{Maximum number of beta binomial fit clusters} |
|
36 |
+ |
|
30 | 37 |
\item{AF.info.field}{Field in the \code{workspace} that stores the allelic |
31 | 38 |
fraction} |
32 | 39 |
} |
... | ... |
@@ -8,8 +8,10 @@ calculateMappingBiasVcf( |
8 | 8 |
normal.panel.vcf.file, |
9 | 9 |
min.normals = 1, |
10 | 10 |
min.normals.betafit = 7, |
11 |
+ min.normals.assign.betafit = 3, |
|
11 | 12 |
min.median.coverage.betafit = 5, |
12 |
- yieldSize = 5000, |
|
13 |
+ num.betafit.clusters = 9, |
|
14 |
+ yieldSize = 50000, |
|
13 | 15 |
genome |
14 | 16 |
) |
15 | 17 |
} |
... | ... |
@@ -22,11 +24,16 @@ Needs to be compressed and indexed with bgzip and tabix, respectively.} |
22 | 24 |
calculating position-specific mapping bias.} |
23 | 25 |
|
24 | 26 |
\item{min.normals.betafit}{Minimum number of normals with heterozygous SNP |
25 |
-fitting a beta distribution} |
|
27 |
+fitting a beta binomial distribution} |
|
28 |
+ |
|
29 |
+\item{min.normals.assign.betafit}{Minimum number of normals with |
|
30 |
+heterozygous SNPs to assign to a beta binomal fit cluster} |
|
26 | 31 |
|
27 | 32 |
\item{min.median.coverage.betafit}{Minimum median coverage of normals with |
28 | 33 |
heterozygous SNP for fitting a beta distribution} |
29 | 34 |
|
35 |
+\item{num.betafit.clusters}{Maximum number of beta binomial fit clusters} |
|
36 |
+ |
|
30 | 37 |
\item{yieldSize}{See \code{TabixFile}} |
31 | 38 |
|
32 | 39 |
\item{genome}{See \code{readVcf}} |