... | ... |
@@ -2,8 +2,8 @@ Package: PureCN |
2 | 2 |
Type: Package |
3 | 3 |
Title: Copy number calling and SNV classification using |
4 | 4 |
targeted short read sequencing |
5 |
-Version: 1.19.3 |
|
6 |
-Date: 2020-06-24 |
|
5 |
+Version: 1.19.4 |
|
6 |
+Date: 2020-07-04 |
|
7 | 7 |
Authors@R: c(person("Markus", "Riester", |
8 | 8 |
role = c("aut", "cre"), |
9 | 9 |
email = "[email protected]", |
... | ... |
@@ -54,8 +54,11 @@ Suggests: |
54 | 54 |
knitr, |
55 | 55 |
optparse, |
56 | 56 |
org.Hs.eg.db, |
57 |
+ jsonlite, |
|
57 | 58 |
rmarkdown, |
58 | 59 |
testthat |
60 |
+Enhances: |
|
61 |
+ genomicsdb |
|
59 | 62 |
VignetteBuilder: knitr |
60 | 63 |
License: Artistic-2.0 |
61 | 64 |
URL: https://github.com/lima1/PureCN |
... | ... |
@@ -6,6 +6,7 @@ export(calculateBamCoverageByInterval) |
6 | 6 |
export(calculateGCContentByInterval) |
7 | 7 |
export(calculateIntervalWeights) |
8 | 8 |
export(calculateLogRatio) |
9 |
+export(calculateMappingBiasGatk4) |
|
9 | 10 |
export(calculateMappingBiasVcf) |
10 | 11 |
export(calculatePowerDetectSomatic) |
11 | 12 |
export(calculateTangentNormal) |
... | ... |
@@ -52,6 +53,7 @@ importFrom(GenomeInfoDb,"seqlengths<-") |
52 | 53 |
importFrom(GenomeInfoDb,"seqlevels<-") |
53 | 54 |
importFrom(GenomeInfoDb,"seqlevelsStyle<-") |
54 | 55 |
importFrom(GenomeInfoDb,genomeStyles) |
56 |
+importFrom(GenomeInfoDb,rankSeqlevels) |
|
55 | 57 |
importFrom(GenomeInfoDb,seqlengths) |
56 | 58 |
importFrom(GenomeInfoDb,seqlevelsInUse) |
57 | 59 |
importFrom(GenomeInfoDb,seqlevelsStyle) |
... | ... |
@@ -85,6 +87,7 @@ importFrom(VGAM,dbetabinom) |
85 | 87 |
importFrom(VGAM,dbetabinom.ab) |
86 | 88 |
importFrom(VGAM,vglm) |
87 | 89 |
importFrom(data.table,data.table) |
90 |
+importFrom(data.table,dcast) |
|
88 | 91 |
importFrom(data.table,fread) |
89 | 92 |
importFrom(data.table,fwrite) |
90 | 93 |
importFrom(futile.logger,appender.tee) |
... | ... |
@@ -1,6 +1,11 @@ |
1 | 1 |
Changes in version 1.20.0 |
2 | 2 |
------------------------- |
3 | 3 |
|
4 |
+NEW FEATURES |
|
5 |
+ |
|
6 |
+ o Support for GATK4 GenomicsDB import for mapping bias calculation |
|
7 |
+ |
|
8 |
+ |
|
4 | 9 |
SIGNIFICANT USER-VISIBLE CHANGES |
5 | 10 |
|
6 | 11 |
o We now check if POP_AF or POPAF is -log10 scaled as new Mutect2 versions |
... | ... |
@@ -27,7 +27,8 @@ |
27 | 27 |
#' @importFrom GenomicRanges GRangesList |
28 | 28 |
#' @importFrom VGAM vglm Coef betabinomial dbetabinom |
29 | 29 |
#' @export calculateMappingBiasVcf |
30 |
-calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2, |
|
30 |
+calculateMappingBiasVcf <- function(normal.panel.vcf.file, |
|
31 |
+ min.normals = 2, |
|
31 | 32 |
min.normals.betafit = 7, |
32 | 33 |
min.median.coverage.betafit = 5, |
33 | 34 |
yieldSize = 5000, genome) { |
... | ... |
@@ -42,8 +43,10 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2, |
42 | 43 |
if (!(cntStep %% 10)) { |
43 | 44 |
flog.info("Position %s:%i", as.character(seqnames(vcf_yield)[1]), start(vcf_yield)[1]) |
44 | 45 |
} |
45 |
- mappingBias <- .calculateMappingBias(vcf_yield, min.normals, |
|
46 |
- min.normals.betafit, min.median.coverage.betafit) |
|
46 |
+ mappingBias <- .calculateMappingBias(nvcf = vcf_yield, |
|
47 |
+ min.normals = min.normals, |
|
48 |
+ min.normals.betafit = min.normals.betafit, |
|
49 |
+ min.median.coverage.betafit = min.median.coverage.betafit) |
|
47 | 50 |
ret <- append(ret, GRangesList(mappingBias)) |
48 | 51 |
cntVar <- cntVar + yieldSize |
49 | 52 |
cntStep <- cntStep + 1 |
... | ... |
@@ -57,16 +60,131 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2, |
57 | 60 |
bias |
58 | 61 |
} |
59 | 62 |
|
60 |
-.calculateMappingBias <- function(nvcf, min.normals, min.normals.betafit = 7, |
|
63 |
+#' Calculate Mapping Bias from GATK4 GenomicsDB |
|
64 |
+#' |
|
65 |
+#' Function calculate mapping bias for each variant in the provided |
|
66 |
+#' panel of normals GenomicsDB. |
|
67 |
+#' |
|
68 |
+#' |
|
69 |
+#' @param workspace Path to the GenomicsDB created by \code{GenomicsDBImport} |
|
70 |
+#' @param reference.genome Reference FASTA file. |
|
71 |
+#' @param min.normals Minimum number of normals with heterozygous SNP for |
|
72 |
+#' calculating position-specific mapping bias. |
|
73 |
+#' @param min.normals.betafit Minimum number of normals with heterozygous SNP |
|
74 |
+#' fitting a beta distribution |
|
75 |
+#' @param min.median.coverage.betafit Minimum median coverage of normals with |
|
76 |
+#' heterozygous SNP for fitting a beta distribution |
|
77 |
+#' @return A \code{GRanges} object with mapping bias and number of normal |
|
78 |
+#' samples with this variant. |
|
79 |
+#' @author Markus Riester |
|
80 |
+#' @examples |
|
81 |
+#' |
|
82 |
+#' normal.panel.vcf <- system.file("extdata", "normalpanel.vcf.gz", package="PureCN") |
|
83 |
+#' bias <- calculateMappingBiasVcf(normal.panel.vcf, genome = "h19") |
|
84 |
+#' saveRDS(bias, "mapping_bias.rds") |
|
85 |
+#' |
|
86 |
+#' @export calculateMappingBiasGatk4 |
|
87 |
+#' @importFrom data.table dcast |
|
88 |
+#' @importFrom GenomeInfoDb rankSeqlevels |
|
89 |
+calculateMappingBiasGatk4 <- function(workspace, reference.genome, |
|
90 |
+ min.normals = 2, |
|
91 |
+ min.normals.betafit = 7, |
|
92 |
+ min.median.coverage.betafit = 5) { |
|
93 |
+ |
|
94 |
+ if (!requireNamespace("genomicsdb", quietly = TRUE) || |
|
95 |
+ !requireNamespace("jsonlite", quietly = TRUE) |
|
96 |
+ ) { |
|
97 |
+ .stopUserError("Install the genomicsdb and jsonlite R packages for GenomicsDB import.") |
|
98 |
+ } |
|
99 |
+ workspace <- normalizePath(workspace, mustWork = TRUE) |
|
100 |
+ |
|
101 |
+ db <- genomicsdb::connect(workspace = workspace, |
|
102 |
+ vid_mapping_file = file.path(workspace, "vidmap.json"), |
|
103 |
+ callset_mapping_file=file.path(workspace, "callset.json"), |
|
104 |
+ reference_genome = reference.genome, |
|
105 |
+ c("DP", "AD", "AF")) |
|
106 |
+ |
|
107 |
+ jcallset <- jsonlite::read_json(file.path(workspace, "callset.json")) |
|
108 |
+ jvidmap <- jsonlite::read_json(file.path(workspace, "vidmap.json")) |
|
109 |
+ |
|
110 |
+ # get all available arrays |
|
111 |
+ arrays <- sapply(dir(workspace, full.names=TRUE), file.path, "genomicsdb_meta_dir") |
|
112 |
+ arrays <- basename(names(arrays)[which(file.exists(arrays))]) |
|
113 |
+ # get offsets and lengths |
|
114 |
+ contigs <- sapply(arrays, function(ary) strsplit(ary, "\\$")[[1]][1]) |
|
115 |
+ contigs <- jvidmap$contigs[match(contigs, sapply(jvidmap$contigs, function(x) x$name))] |
|
116 |
+ idx <- order(rankSeqlevels(sapply(contigs, function(x) x$name))) |
|
117 |
+ |
|
118 |
+ bias <- lapply(idx, function(i) { |
|
119 |
+ c_offset <- as.numeric(contigs[[i]]$tiledb_column_offset) |
|
120 |
+ c_length <- as.numeric(contigs[[i]]$length) |
|
121 |
+ |
|
122 |
+ flog.info("Processing %s (offset %.0f, length %.0f)...", |
|
123 |
+ arrays[i], c_offset, c_length) |
|
124 |
+ query <- data.table(genomicsdb::query_variant_calls(db, |
|
125 |
+ array = arrays[i], |
|
126 |
+ column_ranges = list(c(c_offset, c_offset + c_length)), |
|
127 |
+ row_ranges = list(range(sapply(jcallset$callsets, |
|
128 |
+ function(x) x$row_idx))))) |
|
129 |
+ |
|
130 |
+ parsed_ad <- .parseADGenomicsDb(query) |
|
131 |
+ .calculateMappingBias(nvcf = NULL, |
|
132 |
+ alt = parsed_ad$alt, |
|
133 |
+ ref = parsed_ad$ref, |
|
134 |
+ gr = parsed_ad$gr, |
|
135 |
+ min.normals = min.normals, |
|
136 |
+ min.normals.betafit = min.normals.betafit, |
|
137 |
+ min.median.coverage.betafit = min.median.coverage.betafit |
|
138 |
+ ) |
|
139 |
+ }) |
|
140 |
+ genomicsdb::disconnect(db) |
|
141 |
+ bias <- unlist(GRangesList(bias)) |
|
142 |
+ attr(bias, "workspace") <- workspace |
|
143 |
+ attr(bias, "min.normals") <- min.normals |
|
144 |
+ attr(bias, "min.normals.betafit") <- min.normals.betafit |
|
145 |
+ attr(bias, "min.median.coverage.betafit") <- min.median.coverage.betafit |
|
146 |
+ return(bias) |
|
147 |
+} |
|
148 |
+ |
|
149 |
+.parseADGenomicsDb <- function(query) { |
|
150 |
+ ref <- dcast(query, CHROM+POS+END+REF+ALT~SAMPLE, value.var = "AD") |
|
151 |
+ af <- dcast(query, CHROM+POS+END+REF+ALT~SAMPLE, value.var = "AF") |
|
152 |
+ gr <- GRanges(seqnames = ref$CHROM, IRanges(start = ref$POS, end = ref$END)) |
|
153 |
+ genomic_change <- paste0(as.character(gr), "_", ref$REF, ">", ref$ALT) |
|
154 |
+ ref <- as.matrix(ref[,-(1:5)]) |
|
155 |
+ af <- as.matrix(af[,-(1:5)]) |
|
156 |
+ alt <- round(ref/(1-af)-ref) |
|
157 |
+ rownames(ref) <- genomic_change |
|
158 |
+ rownames(af) <- genomic_change |
|
159 |
+ rownames(alt) <- genomic_change |
|
160 |
+ list(ref = ref, alt = alt, gr = gr) |
|
161 |
+} |
|
162 |
+ |
|
163 |
+.calculateMappingBias <- function(nvcf, alt = NULL, ref = NULL, gr = NULL, |
|
164 |
+ min.normals, min.normals.betafit = 7, |
|
61 | 165 |
min.median.coverage.betafit = 5) { |
62 |
- if (ncol(nvcf) < 2) { |
|
63 |
- .stopUserError("The normal.panel.vcf.file contains only a single sample.") |
|
166 |
+ if (!is.null(nvcf)) { |
|
167 |
+ if (ncol(nvcf) < 2) { |
|
168 |
+ .stopUserError("The normal.panel.vcf.file contains only a single sample.") |
|
169 |
+ } |
|
170 |
+ # TODO: deal with tri-allelic sites |
|
171 |
+ alt <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][2]) |
|
172 |
+ ref <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][1]) |
|
173 |
+ fa <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][2]/sum(x[[1]])) |
|
174 |
+ rownames(alt) <- as.character(rowRanges(nvcf)) |
|
175 |
+ gr <- rowRanges(nvcf) |
|
176 |
+ } else { |
|
177 |
+ if (is.null(alt) || is.null(ref) || is.null(gr) || ncol(ref) != ncol(alt)) { |
|
178 |
+ .stopRuntimeError("Either nvcf or valid alt and ref required.") |
|
179 |
+ } |
|
180 |
+ if (ncol(alt) < 2) { |
|
181 |
+ .stopUserError("The normal.panel.vcf.file contains only a single sample.") |
|
182 |
+ } |
|
183 |
+ fa <- alt / (ref + alt) |
|
64 | 184 |
} |
65 |
- # TODO: deal with tri-allelic sites |
|
66 |
- alt <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][2]) |
|
67 |
- ref <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][1]) |
|
68 |
- fa <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][2]/sum(x[[1]])) |
|
69 |
- x <- sapply(seq_len(nrow(nvcf)), function(i) { |
|
185 |
+ ponCntHits <- apply(alt,1,function(x) sum(!is.na(x))) |
|
186 |
+ |
|
187 |
+ x <- sapply(seq_len(nrow(fa)), function(i) { |
|
70 | 188 |
idx <- !is.na(fa[i,]) & fa[i,] > 0.05 & fa[i,] < 0.9 |
71 | 189 |
shapes <- c(NA, NA) |
72 | 190 |
if (!sum(idx) >= min.normals) return(c(0, 0, 0, 0, shapes)) |
... | ... |
@@ -78,7 +196,7 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2, |
78 | 196 |
ref[i,idx]) ~ 1, betabinomial, trace = FALSE))) |
79 | 197 |
if (class(fit) == "try-error") { |
80 | 198 |
flog.warn("Could not fit beta binomial dist for %s (%s).", |
81 |
- as.character(rowRanges(nvcf[i])), |
|
199 |
+ rownames(alt)[i], |
|
82 | 200 |
paste0(round(fa[i, idx], digits = 3), collapse=",")) |
83 | 201 |
} else { |
84 | 202 |
shapes <- Coef(fit) |
... | ... |
@@ -89,16 +207,13 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2, |
89 | 207 |
# Add an average "normal" SNP (average coverage and allelic fraction > 0.4) |
90 | 208 |
# as empirical prior |
91 | 209 |
psMappingBias <- .adjustEmpBayes(x[1:4,]) * 2 |
92 |
- ponCntHits <- apply(geno(nvcf)$AD, 1, function(x) |
|
93 |
- sum(!is.na(unlist(x))) / 2) |
|
94 | 210 |
|
95 |
- tmp <- rowRanges(nvcf) |
|
96 |
- mcols(tmp) <- NULL |
|
97 |
- tmp$bias <- psMappingBias |
|
98 |
- tmp$pon.count <- ponCntHits |
|
99 |
- tmp$mu <- x[5,] |
|
100 |
- tmp$rho <- x[6,] |
|
101 |
- tmp |
|
211 |
+ mcols(gr) <- NULL |
|
212 |
+ gr$bias <- psMappingBias |
|
213 |
+ gr$pon.count <- ponCntHits |
|
214 |
+ gr$mu <- x[5,] |
|
215 |
+ gr$rho <- x[6,] |
|
216 |
+ gr |
|
102 | 217 |
} |
103 | 218 |
|
104 | 219 |
.readNormalPanelVcfLarge <- function(vcf, normal.panel.vcf.file, |
... | ... |
@@ -84,7 +84,8 @@ normal.panel.vcf.file = NULL, min.normals = 2, smooth = TRUE, smooth.n = 5) { |
84 | 84 |
flog.warn("setMappingBiasVcf: no hits in %s.", mapping.bias.file) |
85 | 85 |
return(data.frame(bias = tmp, mu = NA, rho = NA)) |
86 | 86 |
} |
87 |
- mappingBias <- .calculateMappingBias(nvcf, min.normals) |
|
87 |
+ mappingBias <- .calculateMappingBias(nvcf = nvcf, |
|
88 |
+ min.normals = min.normals) |
|
88 | 89 |
} |
89 | 90 |
.annotateMappingBias(tmp, vcf, mappingBias, max.bias, smooth, smooth.n) |
90 | 91 |
} |
... | ... |
@@ -60,7 +60,11 @@ if (!is.null(opt$normal_panel)) { |
60 | 60 |
} else { |
61 | 61 |
suppressPackageStartupMessages(library(PureCN)) |
62 | 62 |
flog.info("Creating mapping bias database.") |
63 |
- bias <- calculateMappingBiasVcf(opt$normal_panel, genome = genome) |
|
63 |
+ if (file.exists(file.path(opt$normal_panel, "callset.json"))) { |
|
64 |
+ bias <- calculateMappingBiasGatk4(opt$normal_panel, genome) |
|
65 |
+ } else { |
|
66 |
+ bias <- calculateMappingBiasVcf(opt$normal_panel, genome = genome) |
|
67 |
+ } |
|
64 | 68 |
saveRDS(bias, file = output.file) |
65 | 69 |
} |
66 | 70 |
} |
69 | 73 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,46 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/calculateMappingBiasVcf.R |
|
3 |
+\name{calculateMappingBiasGatk4} |
|
4 |
+\alias{calculateMappingBiasGatk4} |
|
5 |
+\title{Calculate Mapping Bias from GATK4 GenomicsDB} |
|
6 |
+\usage{ |
|
7 |
+calculateMappingBiasGatk4( |
|
8 |
+ workspace, |
|
9 |
+ reference.genome, |
|
10 |
+ min.normals = 2, |
|
11 |
+ min.normals.betafit = 7, |
|
12 |
+ min.median.coverage.betafit = 5 |
|
13 |
+) |
|
14 |
+} |
|
15 |
+\arguments{ |
|
16 |
+\item{workspace}{Path to the GenomicsDB created by \code{GenomicsDBImport}} |
|
17 |
+ |
|
18 |
+\item{reference.genome}{Reference FASTA file.} |
|
19 |
+ |
|
20 |
+\item{min.normals}{Minimum number of normals with heterozygous SNP for |
|
21 |
+calculating position-specific mapping bias.} |
|
22 |
+ |
|
23 |
+\item{min.normals.betafit}{Minimum number of normals with heterozygous SNP |
|
24 |
+fitting a beta distribution} |
|
25 |
+ |
|
26 |
+\item{min.median.coverage.betafit}{Minimum median coverage of normals with |
|
27 |
+heterozygous SNP for fitting a beta distribution} |
|
28 |
+} |
|
29 |
+\value{ |
|
30 |
+A \code{GRanges} object with mapping bias and number of normal |
|
31 |
+samples with this variant. |
|
32 |
+} |
|
33 |
+\description{ |
|
34 |
+Function calculate mapping bias for each variant in the provided |
|
35 |
+panel of normals GenomicsDB. |
|
36 |
+} |
|
37 |
+\examples{ |
|
38 |
+ |
|
39 |
+normal.panel.vcf <- system.file("extdata", "normalpanel.vcf.gz", package="PureCN") |
|
40 |
+bias <- calculateMappingBiasVcf(normal.panel.vcf, genome = "h19") |
|
41 |
+saveRDS(bias, "mapping_bias.rds") |
|
42 |
+ |
|
43 |
+} |
|
44 |
+\author{ |
|
45 |
+Markus Riester |
|
46 |
+} |
... | ... |
@@ -52,3 +52,16 @@ test_that("Precomputed mapping bias matches", { |
52 | 52 |
vcf.single.file <- system.file("extdata", "example_single.vcf.gz", package = "PureCN") |
53 | 53 |
expect_error(calculateMappingBiasVcf(vcf.single.file), "only a single sample") |
54 | 54 |
}) |
55 |
+ |
|
56 |
+test_that("GenomicsDB import works", { |
|
57 |
+ skip_if_not(requireNamespace("genomicsdb"), "genomicsdb required") |
|
58 |
+ skip_if_not(requireNamespace("jsonlite"), "jsonlite required") |
|
59 |
+ resources_file <- system.file("extdata", "gatk4_pon_db.tgz", |
|
60 |
+ package = "PureCN") |
|
61 |
+ tmp_dir <- tempdir() |
|
62 |
+ untar(resources_file, exdir = tmp_dir) |
|
63 |
+ workspace <- file.path(tmp_dir, "gatk4_pon_db") |
|
64 |
+ bias <- calculateMappingBiasGatk4(workspace, "hg19") |
|
65 |
+ expect_equal(2101, length(bias)) |
|
66 |
+ unlink(tmp_dir, recursive=TRUE) |
|
67 |
+}) |
... | ... |
@@ -449,10 +449,12 @@ Important recommendations: |
449 | 449 |
## Recommended _GATK4_ usage |
450 | 450 |
|
451 | 451 |
``` |
452 |
-# Recommended: Provide a normal panel VCF to remove mapping biases, pre-compute |
|
452 |
+# Recommended: Provide a normal panel GenomicsDB to remove mapping biases, pre-compute |
|
453 | 453 |
# position-specific bias for much faster runtimes with large panels |
454 | 454 |
# This needs to be done only once for each assay |
455 |
-Rscript $PURECN/NormalDB.R --outdir $OUT_REF --normal_panel $NORMAL_PANEL \ |
|
455 |
+# Requires the genomicsdb R package |
|
456 |
+Rscript $PURECN/NormalDB.R --outdir $OUT_REF \ |
|
457 |
+ --normal_panel $GENOMICSDB-WORKSPACE-PATH/pon_db \ |
|
456 | 458 |
--assay agilent_v6 --genome hg19 --force |
457 | 459 |
|
458 | 460 |
Rscript $PURECN/PureCN.R --out $OUT/$SAMPLEID \ |