Browse code

First major update to mapping bias estimation code.

Markus Riester authored on 23/08/2021 02:30:10
Showing 7 changed files

... ...
@@ -2,8 +2,8 @@ Package: PureCN
2 2
 Type: Package
3 3
 Title: Copy number calling and SNV classification using
4 4
     targeted short read sequencing
5
-Version: 1.23.18
6
-Date: 2021-08-20
5
+Version: 1.23.19
6
+Date: 2021-08-21
7 7
 Authors@R: c(person("Markus", "Riester",
8 8
                     role = c("aut", "cre"),
9 9
                     email = "[email protected]",
... ...
@@ -43,6 +43,7 @@ Imports:
43 43
     VGAM,
44 44
     tools,
45 45
     methods,
46
+    mclust,
46 47
     rhdf5,
47 48
     Matrix
48 49
 Suggests:
... ...
@@ -139,6 +139,9 @@ importFrom(graphics,strwidth)
139 139
 importFrom(graphics,symbols)
140 140
 importFrom(graphics,text)
141 141
 importFrom(gridExtra,grid.arrange)
142
+importFrom(mclust,Mclust)
143
+importFrom(mclust,emControl)
144
+importFrom(mclust,mclustBIC)
142 145
 importFrom(methods,is)
143 146
 importFrom(rhdf5,H5Fopen)
144 147
 importFrom(rtracklayer,import)
... ...
@@ -5,6 +5,9 @@ NEW FEATURES
5 5
 
6 6
     o Report median absolute pairwise difference (MAPD) of tumor vs normal log2
7 7
       ratios in runAbsoluteCN
8
+    o Improved mapping bias estimates: variants with insufficient information
9
+      for position-specific fits (default 3-6 heterozygous variants) 
10
+      are clustered and assigned to the most similar fit  
8 11
 
9 12
 SIGNIFICANT USER-VISIBLE CHANGES
10 13
 
... ...
@@ -25,6 +28,7 @@ SIGNIFICANT USER-VISIBLE CHANGES
25 28
       finding noisy samples or batches in normal databases  
26 29
     o Do not error out readCurationFile when CSV is missing and directory
27 30
       is not writable when re-generating it (#196)
31
+    o Add segmentation parameters as attributes to segmentation data.frame  
28 32
       
29 33
 BUGFIXES
30 34
 
... ...
@@ -10,9 +10,12 @@
10 10
 #' @param min.normals Minimum number of normals with heterozygous SNP for
11 11
 #' calculating position-specific mapping bias.
12 12
 #' @param min.normals.betafit Minimum number of normals with heterozygous SNP
13
-#' fitting a beta distribution
13
+#' fitting a beta binomial distribution
14
+#' @param min.normals.assign.betafit Minimum number of normals with 
15
+#' heterozygous SNPs to assign to a beta binomal fit cluster
14 16
 #' @param min.median.coverage.betafit Minimum median coverage of normals with
15 17
 #' heterozygous SNP for fitting a beta distribution
18
+#' @param num.betafit.clusters Maximum number of beta binomial fit clusters
16 19
 #' @param yieldSize See \code{TabixFile}
17 20
 #' @param genome See \code{readVcf}
18 21
 #' @return A \code{GRanges} object with mapping bias and number of normal
... ...
@@ -28,12 +31,15 @@
28 31
 #' @importFrom GenomicRanges GRangesList
29 32
 #' @importFrom VGAM vglm Coef betabinomial dbetabinom
30 33
 #' @importFrom data.table rbindlist
34
+#' @importFrom mclust Mclust mclustBIC emControl
31 35
 #' @export calculateMappingBiasVcf
32 36
 calculateMappingBiasVcf <- function(normal.panel.vcf.file,
33 37
                                     min.normals = 1,
34 38
                                     min.normals.betafit = 7,
39
+                                    min.normals.assign.betafit = 3,
35 40
                                     min.median.coverage.betafit = 5,
36
-                                    yieldSize = 5000, genome) {
41
+                                    num.betafit.clusters = 9,
42
+                                    yieldSize = 50000, genome) {
37 43
     tab <- TabixFile(normal.panel.vcf.file, yieldSize = yieldSize)
38 44
     open(tab)
39 45
     param <- ScanVcfParam(geno = c("AD"), fixed = "ALT", info = NA)
... ...
@@ -48,7 +54,10 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file,
48 54
         mappingBias <- .calculateMappingBias(nvcf = vcf_yield,
49 55
             min.normals = min.normals,
50 56
             min.normals.betafit = min.normals.betafit,
51
-            min.median.coverage.betafit = min.median.coverage.betafit)
57
+            min.normals.assign.betafit = min.normals.assign.betafit,
58
+            min.median.coverage.betafit = min.median.coverage.betafit,
59
+            num.betafit.clusters = num.betafit.clusters
60
+        )
52 61
         if (length(ret)) {
53 62
             ret <- append(ret, GRangesList(mappingBias))
54 63
         } else {
... ...
@@ -61,7 +70,9 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file,
61 70
     attr(bias, "normal.panel.vcf.file") <- normal.panel.vcf.file
62 71
     attr(bias, "min.normals") <- min.normals
63 72
     attr(bias, "min.normals.betafit") <- min.normals.betafit
73
+    attr(bias, "min.normals.assign.betafit") <- min.normals.assign.betafit
64 74
     attr(bias, "min.median.coverage.betafit") <- min.median.coverage.betafit
75
+    attr(bias, "num.betafit.clusters") <- num.betafit.clusters
65 76
     attr(bias, "genome") <- genome
66 77
     bias
67 78
 }
... ...
@@ -78,8 +89,11 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file,
78 89
 #' calculating position-specific mapping bias.
79 90
 #' @param min.normals.betafit Minimum number of normals with heterozygous SNP
80 91
 #' fitting a beta distribution
92
+#' @param min.normals.assign.betafit Minimum number of normals with 
93
+#' heterozygous SNPs to assign to a beta binomal fit cluster
81 94
 #' @param min.median.coverage.betafit Minimum median coverage of normals with
82 95
 #' heterozygous SNP for fitting a beta distribution
96
+#' @param num.betafit.clusters Maximum number of beta binomial fit clusters
83 97
 #' @param AF.info.field Field in the \code{workspace} that stores the allelic 
84 98
 #' fraction
85 99
 #' @return A \code{GRanges} object with mapping bias and number of normal
... ...
@@ -104,7 +118,9 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file,
104 118
 calculateMappingBiasGatk4 <- function(workspace, reference.genome,
105 119
                                     min.normals = 1,
106 120
                                     min.normals.betafit = 7,
121
+                                    min.normals.assign.betafit = 3,
107 122
                                     min.median.coverage.betafit = 5,
123
+                                    num.betafit.clusters = 9,
108 124
                                     AF.info.field = "AF") {
109 125
 
110 126
     if (!requireNamespace("genomicsdb", quietly = TRUE) || 
... ...
@@ -160,7 +176,9 @@ calculateMappingBiasGatk4 <- function(workspace, reference.genome,
160 176
         alt = m_alt, ref = m_ref, gr = gr,
161 177
         min.normals = min.normals,
162 178
         min.normals.betafit = min.normals.betafit,
179
+        min.normals.assign.betafit = min.normals.assign.betafit,
163 180
         min.median.coverage.betafit = min.median.coverage.betafit,
181
+        num.betafit.clusters = num.betafit.clusters,
164 182
         verbose = TRUE
165 183
     )
166 184
     attr(bias, "workspace") <- workspace
... ...
@@ -187,7 +205,9 @@ calculateMappingBiasGatk4 <- function(workspace, reference.genome,
187 205
     
188 206
 .calculateMappingBias <- function(nvcf, alt = NULL, ref = NULL, gr = NULL,
189 207
                                   min.normals, min.normals.betafit = 7,
208
+                                  min.normals.assign.betafit = 3,
190 209
                                   min.median.coverage.betafit = 5,
210
+                                  num.betafit.clusters = 9,
191 211
                                   verbose = FALSE) {
192 212
     if (!is.null(nvcf)) {
193 213
         if (ncol(nvcf) < 2) {
... ...
@@ -240,6 +260,7 @@ calculateMappingBiasGatk4 <- function(workspace, reference.genome,
240 260
     gr$pon.count <- ponCntHits
241 261
     gr$mu <- x[5, ]
242 262
     gr$rho <- x[6, ]
263
+    gr <- .clusterFa(gr, fa, alt, ref, min.normals.assign.betafit, num.betafit.clusters)
243 264
     gr <- gr[order(gr$pon.count, decreasing = TRUE)]
244 265
     gr <- sort(gr)
245 266
     gr$triallelic <- FALSE
... ...
@@ -290,3 +311,27 @@ calculateMappingBiasGatk4 <- function(workspace, reference.genome,
290 311
     # get the alt allelic fraction for all SNPs
291 312
     apply(x, 2, function(y) y[2] / sum(head(y, 2)))
292 313
 }
314
+
315
+.clusterFa <- function(gr, fa, alt, ref, min.normals.assign.betafit = 3, num.betafit.clusters = 9) {
316
+    idx <- !is.na(gr$mu)
317
+    if (sum(idx) < num.betafit.clusters) return(gr) 
318
+    flog.info("Clustering beta binomial fits...")    
319
+    fit <- Mclust(mcols(gr)[idx, c("mu", "rho")], G = seq_len(num.betafit.clusters))
320
+    x <- sapply(seq_len(nrow(fa)), function(i) {
321
+        idx <- !is.na(fa[i, ]) & fa[i, ] > 0.05 & fa[i, ] < 0.9
322
+        if (!sum(idx) >= min.normals.assign.betafit) return(NA)
323
+        ll <- apply(fit$parameters$mean, 2, function(m) {
324
+                sum(dbetabinom(x = alt[i, idx], prob = m[1],
325
+                   size = alt[i, idx] + ref[i, idx],
326
+                   rho = m[2], log = TRUE))
327
+        })
328
+        if (is.infinite(max(ll))) return(NA)
329
+        which.max(ll)
330
+    })
331
+    idxa <- is.na(gr$mu) & !is.na(x)
332
+    flog.info("Assigning (%i/%i) variants a clustered beta binomal fit.",
333
+        sum(idxa), length(gr))
334
+    gr$mu[idxa] <- fit$parameters$mean[1,x[idxa]]
335
+    gr$rho[idxa] <- fit$parameters$mean[2,x[idxa]]
336
+    return(gr)
337
+}    
... ...
@@ -146,6 +146,14 @@ segmentationPSCBS <- function(normal, tumor, log.ratio, seg, plot.cnv,
146 146
             min.width = 3,alphaTCN = alpha, ...)
147 147
         try(flog.debug("Kappa: %f", PSCBS::estimateKappa(segPSCBS)), silent = TRUE)
148 148
         seg <- .PSCBSoutput2DNAcopy(segPSCBS, sampleid)
149
+        if (flog.threshold() == "DEBUG") {
150
+            flog.debug("Re-running segmentation without undo.SD...") 
151
+            segusd <- PSCBS::segmentByNonPairedPSCBS(input, tauA = tauA,
152
+                flavor = flavor, undoTCN = 0, knownSegments = knownSegments,
153
+                min.width = 3,alphaTCN = alpha, ...)
154
+            segusd <- .PSCBSoutput2DNAcopy(segusd, sampleid)
155
+            attr(seg, "Debug.undo.SD") <- segusd
156
+        }
149 157
         if (undo.SD <= 0 || is.null(max.segments) || nrow(seg) < max.segments) break
150 158
         flog.info("Found %i segments, exceeding max.segments threshold of %i.",
151 159
             nrow(seg), max.segments)
... ...
@@ -162,6 +170,17 @@ segmentationPSCBS <- function(normal, tumor, log.ratio, seg, plot.cnv,
162 170
     }
163 171
     seg <- .addAverageWeights(seg, weight.flag.pvalue, tumor, chr.hash)
164 172
     seg <- .fixBreakpointsInBaits(tumor, log.ratio, seg, chr.hash)
173
+    attr(seg, "PSCBS.Args") <- list(
174
+        alpha = alpha,
175
+        boost.on.target.max.size = boost.on.target.max.size,
176
+        flavor = flavor, 
177
+        iterations = try.again + 1,
178
+        min.logr.sdev = min.logr.sdev,
179
+        prune.hclust.h = prune.hclust.h,
180
+        prune.hclust.method = prune.hclust.method,
181
+        tauA = tauA,
182
+        undo.SD = undo.SD
183
+    )
165 184
     seg
166 185
 }
167 186
 
... ...
@@ -9,7 +9,9 @@ calculateMappingBiasGatk4(
9 9
   reference.genome,
10 10
   min.normals = 1,
11 11
   min.normals.betafit = 7,
12
+  min.normals.assign.betafit = 3,
12 13
   min.median.coverage.betafit = 5,
14
+  num.betafit.clusters = 9,
13 15
   AF.info.field = "AF"
14 16
 )
15 17
 }
... ...
@@ -24,9 +26,14 @@ calculating position-specific mapping bias.}
24 26
 \item{min.normals.betafit}{Minimum number of normals with heterozygous SNP
25 27
 fitting a beta distribution}
26 28
 
29
+\item{min.normals.assign.betafit}{Minimum number of normals with 
30
+heterozygous SNPs to assign to a beta binomal fit cluster}
31
+
27 32
 \item{min.median.coverage.betafit}{Minimum median coverage of normals with
28 33
 heterozygous SNP for fitting a beta distribution}
29 34
 
35
+\item{num.betafit.clusters}{Maximum number of beta binomial fit clusters}
36
+
30 37
 \item{AF.info.field}{Field in the \code{workspace} that stores the allelic 
31 38
 fraction}
32 39
 }
... ...
@@ -8,8 +8,10 @@ calculateMappingBiasVcf(
8 8
   normal.panel.vcf.file,
9 9
   min.normals = 1,
10 10
   min.normals.betafit = 7,
11
+  min.normals.assign.betafit = 3,
11 12
   min.median.coverage.betafit = 5,
12
-  yieldSize = 5000,
13
+  num.betafit.clusters = 9,
14
+  yieldSize = 50000,
13 15
   genome
14 16
 )
15 17
 }
... ...
@@ -22,11 +24,16 @@ Needs to be compressed and indexed with bgzip and tabix, respectively.}
22 24
 calculating position-specific mapping bias.}
23 25
 
24 26
 \item{min.normals.betafit}{Minimum number of normals with heterozygous SNP
25
-fitting a beta distribution}
27
+fitting a beta binomial distribution}
28
+
29
+\item{min.normals.assign.betafit}{Minimum number of normals with 
30
+heterozygous SNPs to assign to a beta binomal fit cluster}
26 31
 
27 32
 \item{min.median.coverage.betafit}{Minimum median coverage of normals with
28 33
 heterozygous SNP for fitting a beta distribution}
29 34
 
35
+\item{num.betafit.clusters}{Maximum number of beta binomial fit clusters}
36
+
30 37
 \item{yieldSize}{See \code{TabixFile}}
31 38
 
32 39
 \item{genome}{See \code{readVcf}}