Browse code

Initial support for GATK4 GenomicsDBImport (#6).

Markus Riester authored on 05/07/2020 04:28:39
Showing 10 changed files

... ...
@@ -2,8 +2,8 @@ Package: PureCN
2 2
 Type: Package
3 3
 Title: Copy number calling and SNV classification using
4 4
     targeted short read sequencing
5
-Version: 1.19.3
6
-Date: 2020-06-24
5
+Version: 1.19.4
6
+Date: 2020-07-04
7 7
 Authors@R: c(person("Markus", "Riester",
8 8
                     role = c("aut", "cre"),
9 9
                     email = "[email protected]",
... ...
@@ -54,8 +54,11 @@ Suggests:
54 54
     knitr,
55 55
     optparse,
56 56
     org.Hs.eg.db,
57
+    jsonlite,
57 58
     rmarkdown,
58 59
     testthat
60
+Enhances:
61
+    genomicsdb    
59 62
 VignetteBuilder: knitr
60 63
 License: Artistic-2.0
61 64
 URL: https://github.com/lima1/PureCN
... ...
@@ -6,6 +6,7 @@ export(calculateBamCoverageByInterval)
6 6
 export(calculateGCContentByInterval)
7 7
 export(calculateIntervalWeights)
8 8
 export(calculateLogRatio)
9
+export(calculateMappingBiasGatk4)
9 10
 export(calculateMappingBiasVcf)
10 11
 export(calculatePowerDetectSomatic)
11 12
 export(calculateTangentNormal)
... ...
@@ -52,6 +53,7 @@ importFrom(GenomeInfoDb,"seqlengths<-")
52 53
 importFrom(GenomeInfoDb,"seqlevels<-")
53 54
 importFrom(GenomeInfoDb,"seqlevelsStyle<-")
54 55
 importFrom(GenomeInfoDb,genomeStyles)
56
+importFrom(GenomeInfoDb,rankSeqlevels)
55 57
 importFrom(GenomeInfoDb,seqlengths)
56 58
 importFrom(GenomeInfoDb,seqlevelsInUse)
57 59
 importFrom(GenomeInfoDb,seqlevelsStyle)
... ...
@@ -85,6 +87,7 @@ importFrom(VGAM,dbetabinom)
85 87
 importFrom(VGAM,dbetabinom.ab)
86 88
 importFrom(VGAM,vglm)
87 89
 importFrom(data.table,data.table)
90
+importFrom(data.table,dcast)
88 91
 importFrom(data.table,fread)
89 92
 importFrom(data.table,fwrite)
90 93
 importFrom(futile.logger,appender.tee)
... ...
@@ -1,6 +1,11 @@
1 1
 Changes in version 1.20.0
2 2
 -------------------------
3 3
 
4
+NEW FEATURES
5
+
6
+    o Support for GATK4 GenomicsDB import for mapping bias calculation
7
+    
8
+
4 9
 SIGNIFICANT USER-VISIBLE CHANGES
5 10
 
6 11
     o We now check if POP_AF or POPAF is -log10 scaled as new Mutect2 versions
... ...
@@ -27,7 +27,8 @@
27 27
 #' @importFrom GenomicRanges GRangesList
28 28
 #' @importFrom VGAM vglm Coef betabinomial dbetabinom
29 29
 #' @export calculateMappingBiasVcf
30
-calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2,
30
+calculateMappingBiasVcf <- function(normal.panel.vcf.file,
31
+                                    min.normals = 2,
31 32
                                     min.normals.betafit = 7,
32 33
                                     min.median.coverage.betafit = 5,
33 34
                                     yieldSize = 5000, genome) {
... ...
@@ -42,8 +43,10 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2,
42 43
         if (!(cntStep %% 10)) {
43 44
             flog.info("Position %s:%i", as.character(seqnames(vcf_yield)[1]), start(vcf_yield)[1])
44 45
         }
45
-        mappingBias <- .calculateMappingBias(vcf_yield, min.normals, 
46
-            min.normals.betafit, min.median.coverage.betafit)
46
+        mappingBias <- .calculateMappingBias(nvcf = vcf_yield,
47
+            min.normals = min.normals, 
48
+            min.normals.betafit = min.normals.betafit,
49
+            min.median.coverage.betafit = min.median.coverage.betafit)
47 50
         ret <- append(ret, GRangesList(mappingBias))
48 51
         cntVar <- cntVar + yieldSize
49 52
         cntStep <- cntStep + 1
... ...
@@ -57,16 +60,131 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2,
57 60
     bias
58 61
 }
59 62
 
60
-.calculateMappingBias <- function(nvcf, min.normals, min.normals.betafit = 7,
63
+#' Calculate Mapping Bias from GATK4 GenomicsDB
64
+#'
65
+#' Function calculate mapping bias for each variant in the provided
66
+#' panel of normals GenomicsDB.
67
+#'
68
+#'
69
+#' @param workspace Path to the GenomicsDB created by \code{GenomicsDBImport}
70
+#' @param reference.genome Reference FASTA file.
71
+#' @param min.normals Minimum number of normals with heterozygous SNP for
72
+#' calculating position-specific mapping bias. 
73
+#' @param min.normals.betafit Minimum number of normals with heterozygous SNP
74
+#' fitting a beta distribution
75
+#' @param min.median.coverage.betafit Minimum median coverage of normals with
76
+#' heterozygous SNP for fitting a beta distribution
77
+#' @return A \code{GRanges} object with mapping bias and number of normal
78
+#' samples with this variant.
79
+#' @author Markus Riester
80
+#' @examples
81
+#'
82
+#' normal.panel.vcf <- system.file("extdata", "normalpanel.vcf.gz", package="PureCN")
83
+#' bias <- calculateMappingBiasVcf(normal.panel.vcf, genome = "h19")
84
+#' saveRDS(bias, "mapping_bias.rds")
85
+#'
86
+#' @export calculateMappingBiasGatk4
87
+#' @importFrom data.table dcast
88
+#' @importFrom GenomeInfoDb rankSeqlevels
89
+calculateMappingBiasGatk4 <- function(workspace, reference.genome,
90
+                                    min.normals = 2,
91
+                                    min.normals.betafit = 7,
92
+                                    min.median.coverage.betafit = 5) {
93
+
94
+    if (!requireNamespace("genomicsdb", quietly = TRUE) || 
95
+        !requireNamespace("jsonlite", quietly = TRUE)
96
+        ) {
97
+        .stopUserError("Install the genomicsdb and jsonlite R packages for GenomicsDB import.")
98
+    }
99
+    workspace <- normalizePath(workspace, mustWork = TRUE)
100
+
101
+    db <- genomicsdb::connect(workspace = workspace,
102
+        vid_mapping_file = file.path(workspace, "vidmap.json"),
103
+        callset_mapping_file=file.path(workspace, "callset.json"),
104
+        reference_genome = reference.genome,
105
+        c("DP", "AD", "AF"))
106
+
107
+    jcallset <- jsonlite::read_json(file.path(workspace, "callset.json"))
108
+    jvidmap <- jsonlite::read_json(file.path(workspace, "vidmap.json"))
109
+    
110
+    # get all available arrays
111
+    arrays <- sapply(dir(workspace, full.names=TRUE), file.path, "genomicsdb_meta_dir")
112
+    arrays <- basename(names(arrays)[which(file.exists(arrays))])
113
+    # get offsets and lengths
114
+    contigs <- sapply(arrays, function(ary) strsplit(ary, "\\$")[[1]][1])
115
+    contigs <- jvidmap$contigs[match(contigs, sapply(jvidmap$contigs, function(x) x$name))]
116
+    idx <- order(rankSeqlevels(sapply(contigs, function(x) x$name)))
117
+
118
+    bias <- lapply(idx, function(i) {
119
+        c_offset <- as.numeric(contigs[[i]]$tiledb_column_offset)
120
+        c_length <- as.numeric(contigs[[i]]$length)
121
+
122
+        flog.info("Processing %s (offset %.0f, length %.0f)...",
123
+            arrays[i], c_offset, c_length)
124
+        query <- data.table(genomicsdb::query_variant_calls(db, 
125
+            array = arrays[i], 
126
+            column_ranges = list(c(c_offset, c_offset + c_length)),
127
+            row_ranges = list(range(sapply(jcallset$callsets,
128
+                function(x) x$row_idx)))))
129
+
130
+        parsed_ad <- .parseADGenomicsDb(query)
131
+        .calculateMappingBias(nvcf = NULL, 
132
+            alt = parsed_ad$alt, 
133
+            ref = parsed_ad$ref,
134
+            gr = parsed_ad$gr,
135
+            min.normals = min.normals,
136
+            min.normals.betafit = min.normals.betafit,
137
+            min.median.coverage.betafit = min.median.coverage.betafit
138
+        )
139
+    })
140
+    genomicsdb::disconnect(db)
141
+    bias <- unlist(GRangesList(bias))
142
+    attr(bias, "workspace") <- workspace
143
+    attr(bias, "min.normals") <- min.normals
144
+    attr(bias, "min.normals.betafit") <- min.normals.betafit
145
+    attr(bias, "min.median.coverage.betafit") <- min.median.coverage.betafit
146
+    return(bias)
147
+}
148
+
149
+.parseADGenomicsDb <- function(query) {
150
+    ref <-  dcast(query, CHROM+POS+END+REF+ALT~SAMPLE, value.var = "AD")
151
+    af <-  dcast(query, CHROM+POS+END+REF+ALT~SAMPLE, value.var = "AF")
152
+    gr <- GRanges(seqnames = ref$CHROM, IRanges(start = ref$POS, end = ref$END))
153
+    genomic_change <- paste0(as.character(gr), "_", ref$REF, ">", ref$ALT)
154
+    ref <- as.matrix(ref[,-(1:5)])
155
+    af <- as.matrix(af[,-(1:5)])
156
+    alt <- round(ref/(1-af)-ref)
157
+    rownames(ref) <- genomic_change
158
+    rownames(af) <- genomic_change
159
+    rownames(alt) <- genomic_change
160
+    list(ref = ref, alt = alt, gr = gr)
161
+}
162
+    
163
+.calculateMappingBias <- function(nvcf, alt = NULL, ref = NULL, gr = NULL, 
164
+                                  min.normals, min.normals.betafit = 7,
61 165
                                   min.median.coverage.betafit = 5) {
62
-    if (ncol(nvcf) < 2) {
63
-        .stopUserError("The normal.panel.vcf.file contains only a single sample.")
166
+    if (!is.null(nvcf)) {
167
+        if (ncol(nvcf) < 2) {
168
+            .stopUserError("The normal.panel.vcf.file contains only a single sample.")
169
+        }
170
+        # TODO: deal with tri-allelic sites
171
+        alt <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][2])
172
+        ref <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][1])
173
+        fa  <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][2]/sum(x[[1]]))
174
+        rownames(alt) <-  as.character(rowRanges(nvcf))
175
+        gr <- rowRanges(nvcf)
176
+    } else {
177
+        if (is.null(alt) || is.null(ref) || is.null(gr) || ncol(ref) != ncol(alt)) {
178
+            .stopRuntimeError("Either nvcf or valid alt and ref required.")
179
+        }
180
+        if (ncol(alt) < 2) {
181
+            .stopUserError("The normal.panel.vcf.file contains only a single sample.")
182
+        }
183
+        fa <- alt / (ref + alt)
64 184
     }
65
-    # TODO: deal with tri-allelic sites
66
-    alt <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][2])
67
-    ref <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][1])
68
-    fa  <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][2]/sum(x[[1]]))
69
-    x <- sapply(seq_len(nrow(nvcf)), function(i) {
185
+    ponCntHits <- apply(alt,1,function(x) sum(!is.na(x)))
186
+            
187
+    x <- sapply(seq_len(nrow(fa)), function(i) {
70 188
         idx <- !is.na(fa[i,]) & fa[i,] > 0.05 & fa[i,] < 0.9
71 189
         shapes <- c(NA, NA)
72 190
         if (!sum(idx) >= min.normals) return(c(0, 0, 0, 0, shapes))
... ...
@@ -78,7 +196,7 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2,
78 196
                 ref[i,idx]) ~ 1, betabinomial, trace = FALSE)))
79 197
             if (class(fit) == "try-error") {
80 198
                 flog.warn("Could not fit beta binomial dist for %s (%s).",
81
-                    as.character(rowRanges(nvcf[i])),
199
+                    rownames(alt)[i],
82 200
                     paste0(round(fa[i, idx], digits = 3), collapse=","))
83 201
             } else {    
84 202
                 shapes <- Coef(fit)
... ...
@@ -89,16 +207,13 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2,
89 207
     # Add an average "normal" SNP (average coverage and allelic fraction > 0.4)
90 208
     # as empirical prior
91 209
     psMappingBias <- .adjustEmpBayes(x[1:4,]) * 2
92
-    ponCntHits <- apply(geno(nvcf)$AD, 1, function(x)
93
-        sum(!is.na(unlist(x))) / 2)
94 210
 
95
-    tmp <- rowRanges(nvcf)
96
-    mcols(tmp) <- NULL
97
-    tmp$bias <- psMappingBias
98
-    tmp$pon.count <- ponCntHits
99
-    tmp$mu <- x[5,]
100
-    tmp$rho <- x[6,]
101
-    tmp
211
+    mcols(gr) <- NULL
212
+    gr$bias <- psMappingBias
213
+    gr$pon.count <- ponCntHits
214
+    gr$mu <- x[5,]
215
+    gr$rho <- x[6,]
216
+    gr
102 217
 }
103 218
  
104 219
 .readNormalPanelVcfLarge <- function(vcf, normal.panel.vcf.file,
... ...
@@ -84,7 +84,8 @@ normal.panel.vcf.file = NULL, min.normals = 2, smooth = TRUE, smooth.n = 5) {
84 84
             flog.warn("setMappingBiasVcf: no hits in %s.", mapping.bias.file)
85 85
             return(data.frame(bias = tmp, mu = NA, rho = NA))
86 86
         }
87
-        mappingBias <- .calculateMappingBias(nvcf, min.normals)
87
+        mappingBias <- .calculateMappingBias(nvcf = nvcf,
88
+            min.normals = min.normals)
88 89
     }
89 90
     .annotateMappingBias(tmp, vcf, mappingBias, max.bias, smooth, smooth.n)
90 91
 }
... ...
@@ -60,7 +60,11 @@ if (!is.null(opt$normal_panel)) {
60 60
     } else {
61 61
         suppressPackageStartupMessages(library(PureCN))
62 62
         flog.info("Creating mapping bias database.")
63
-        bias <- calculateMappingBiasVcf(opt$normal_panel, genome = genome)
63
+        if (file.exists(file.path(opt$normal_panel, "callset.json"))) {
64
+            bias <- calculateMappingBiasGatk4(opt$normal_panel, genome)
65
+        } else {
66
+            bias <- calculateMappingBiasVcf(opt$normal_panel, genome = genome)
67
+        }
64 68
         saveRDS(bias, file = output.file)
65 69
     }
66 70
 }
67 71
new file mode 100644
68 72
Binary files /dev/null and b/inst/extdata/gatk4_pon_db.tgz differ
69 73
new file mode 100644
... ...
@@ -0,0 +1,46 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/calculateMappingBiasVcf.R
3
+\name{calculateMappingBiasGatk4}
4
+\alias{calculateMappingBiasGatk4}
5
+\title{Calculate Mapping Bias from GATK4 GenomicsDB}
6
+\usage{
7
+calculateMappingBiasGatk4(
8
+  workspace,
9
+  reference.genome,
10
+  min.normals = 2,
11
+  min.normals.betafit = 7,
12
+  min.median.coverage.betafit = 5
13
+)
14
+}
15
+\arguments{
16
+\item{workspace}{Path to the GenomicsDB created by \code{GenomicsDBImport}}
17
+
18
+\item{reference.genome}{Reference FASTA file.}
19
+
20
+\item{min.normals}{Minimum number of normals with heterozygous SNP for
21
+calculating position-specific mapping bias.}
22
+
23
+\item{min.normals.betafit}{Minimum number of normals with heterozygous SNP
24
+fitting a beta distribution}
25
+
26
+\item{min.median.coverage.betafit}{Minimum median coverage of normals with
27
+heterozygous SNP for fitting a beta distribution}
28
+}
29
+\value{
30
+A \code{GRanges} object with mapping bias and number of normal
31
+samples with this variant.
32
+}
33
+\description{
34
+Function calculate mapping bias for each variant in the provided
35
+panel of normals GenomicsDB.
36
+}
37
+\examples{
38
+
39
+normal.panel.vcf <- system.file("extdata", "normalpanel.vcf.gz", package="PureCN")
40
+bias <- calculateMappingBiasVcf(normal.panel.vcf, genome = "h19")
41
+saveRDS(bias, "mapping_bias.rds")
42
+
43
+}
44
+\author{
45
+Markus Riester
46
+}
... ...
@@ -52,3 +52,16 @@ test_that("Precomputed mapping bias matches", {
52 52
     vcf.single.file <- system.file("extdata", "example_single.vcf.gz", package = "PureCN")
53 53
     expect_error(calculateMappingBiasVcf(vcf.single.file), "only a single sample")
54 54
 })
55
+
56
+test_that("GenomicsDB import works", {
57
+    skip_if_not(requireNamespace("genomicsdb"), "genomicsdb required")
58
+    skip_if_not(requireNamespace("jsonlite"), "jsonlite required")
59
+    resources_file <- system.file("extdata", "gatk4_pon_db.tgz", 
60
+        package = "PureCN")
61
+    tmp_dir <- tempdir()
62
+    untar(resources_file, exdir = tmp_dir)
63
+    workspace <- file.path(tmp_dir, "gatk4_pon_db")
64
+    bias <- calculateMappingBiasGatk4(workspace, "hg19")
65
+    expect_equal(2101, length(bias))
66
+    unlink(tmp_dir, recursive=TRUE)
67
+})
... ...
@@ -449,10 +449,12 @@ Important recommendations:
449 449
 ## Recommended _GATK4_ usage
450 450
 
451 451
 ```
452
-# Recommended: Provide a normal panel VCF to remove mapping biases, pre-compute 
452
+# Recommended: Provide a normal panel GenomicsDB to remove mapping biases, pre-compute 
453 453
 # position-specific bias for much faster runtimes with large panels
454 454
 # This needs to be done only once for each assay
455
-Rscript $PURECN/NormalDB.R --outdir $OUT_REF --normal_panel $NORMAL_PANEL \
455
+# Requires the genomicsdb R package
456
+Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
457
+    --normal_panel $GENOMICSDB-WORKSPACE-PATH/pon_db \
456 458
     --assay agilent_v6 --genome hg19 --force
457 459
 
458 460
 Rscript $PURECN/PureCN.R --out $OUT/$SAMPLEID  \