Issue 320
... | ... |
@@ -693,6 +693,9 @@ function(vcf, tumor.id.in.vcf, allowed = 0.05) { |
693 | 693 |
.filterVcfByBQ <- function(vcf, tumor.id.in.vcf, min.base.quality) { |
694 | 694 |
n.vcf.before.filter <- .countVariants(vcf) |
695 | 695 |
idx <- .getBQFromVcf(vcf, tumor.id.in.vcf, base.quality.offset = 0) < min.base.quality |
696 |
+ if (sum(idx, na.rm = TRUE) / length(idx) > 0.5) { |
|
697 |
+ flog.warn("Many variants removed by min.base.quality (%i) filter. You might want to lower the cutoff.", min.base.quality) |
|
698 |
+ } |
|
696 | 699 |
vcf <- .removeVariants(vcf, idx, "BQ", na.rm = FALSE) |
697 | 700 |
flog.info("Removing %i low quality variants with non-offset BQ < %i.", |
698 | 701 |
n.vcf.before.filter - .countVariants(vcf), min.base.quality) |
... | ... |
@@ -42,6 +42,9 @@ option_list <- list( |
42 | 42 |
make_option(c("--base-quality-offset"), action = "store", type = "integer", |
43 | 43 |
default = formals(PureCN::filterVcfBasic)$base.quality.offset, |
44 | 44 |
help = "VCF Filter: Subtracts the specified value from the base quality score [default %default]"), |
45 |
+ make_option(c("--min-base-quality"), action = "store", type = "integer", |
|
46 |
+ default = formals(PureCN::filterVcfBasic)$min.base.quality, |
|
47 |
+ help = "VCF Filter: Minimum base quality for variants. Set to 0 to turn off filter [default %default]"), |
|
45 | 48 |
make_option(c("--min-supporting-reads"), action = "store", type = "integer", |
46 | 49 |
default = formals(PureCN::filterVcfBasic)$min.supporting.reads, |
47 | 50 |
help = "VCF Filter: Instead of calculating the min. number of supporting reads, use specified one [default %default]"), |
... | ... |
@@ -177,14 +180,14 @@ alias_list <- list( |
177 | 180 |
"maxhomozygousloss" = "max-homozygous-loss", |
178 | 181 |
"speedupheuristics" = "speedup-heuristics", |
179 | 182 |
"outvcf" = "out-vcf" |
180 |
-) |
|
183 |
+) |
|
181 | 184 |
replace_alias <- function(x, deprecated = TRUE) { |
182 | 185 |
idx <- match(x, paste0("--", names(alias_list))) |
183 | 186 |
if (any(!is.na(idx))) { |
184 | 187 |
replaced <- paste0("--", alias_list[na.omit(idx)]) |
185 | 188 |
x[!is.na(idx)] <- replaced |
186 | 189 |
if (deprecated) { |
187 |
- flog.warn("Deprecated arguments, use %s instead.", paste(replaced, collapse=" ")) |
|
190 |
+ flog.warn("Deprecated arguments, use %s instead.", paste(replaced, collapse = " ")) |
|
188 | 191 |
} |
189 | 192 |
} |
190 | 193 |
return(x) |
... | ... |
@@ -253,9 +256,9 @@ if (Sys.getenv("PURECN_DEBUG") != "") { |
253 | 256 |
} |
254 | 257 |
|
255 | 258 |
.checkFileList <- function(file) { |
256 |
- files <- read.delim(file, as.is = TRUE, header = FALSE)[,1] |
|
259 |
+ files <- read.delim(file, as.is = TRUE, header = FALSE)[, 1] |
|
257 | 260 |
numExists <- sum(file.exists(files), na.rm = TRUE) |
258 |
- if (numExists < length(files)) { |
|
261 |
+ if (numExists < length(files)) { |
|
259 | 262 |
stop("File not exists in file ", file) |
260 | 263 |
} |
261 | 264 |
files |
... | ... |
@@ -397,6 +400,7 @@ if (file.exists(file.rds) && !opt$force) { |
397 | 400 |
af.range = af.range, stats.file = opt$stats_file, |
398 | 401 |
min.supporting.reads = opt$min_supporting_reads, |
399 | 402 |
base.quality.offset = opt$base_quality_offset, |
403 |
+ min.base.quality = opt$min_base_quality, |
|
400 | 404 |
ignore = mutect.ignore, |
401 | 405 |
interval.padding = opt$interval_padding), |
402 | 406 |
args.filterIntervals = list( |
... | ... |
@@ -133,3 +133,10 @@ test_that("issue 249", { |
133 | 133 |
vcf.bq <- PureCN:::.readAndCheckVcf(vcf.bq, "hg19") |
134 | 134 |
expect_equal(NA, geno(vcf.bq)$BQ[[2, 1]]) |
135 | 135 |
}) |
136 |
+ |
|
137 |
+test_that("issue 320", { |
|
138 |
+ # set random BQ to NA |
|
139 |
+ expect_output(x <- filterVcfBasic(vcf, use.somatic.status = FALSE, min.base.quality = 34), |
|
140 |
+ "Many variants removed by min.base.quality") |
|
141 |
+ expect_equal(34, min(sapply(geno(x$vcf)$BQ[,1], function(x) x[1]))) |
|
142 |
+}) |
... | ... |
@@ -207,7 +207,10 @@ Important recommendations: |
207 | 207 |
to presence in germline databases. _Mutect 1.1.7_ automatically calls SNPs, |
208 | 208 |
but _Mutect 2_ does not. Make sure to run _Mutect 2_ with |
209 | 209 |
`--genotype-germline-sites true --genotype-pon-sites true`. You will not get |
210 |
- usuable output without those flags. |
|
210 |
+ usuable output without those flags. Since _Mutect 2_ from _GATK 4.2.0+_, |
|
211 |
+ average base quality scores can be very low and variants will be too |
|
212 |
+ aggressively removed by _PureCN_. You will need to set `--min-base-quality |
|
213 |
+ 20` in _PureCN.R_ to keep them. |
|
211 | 214 |
|
212 | 215 |
- Run the variant caller with a 50-75 base pair interval padding to increase |
213 | 216 |
the number of heterozygous SNPs (for example `--interval_padding` and |
... | ... |
@@ -524,7 +527,9 @@ Important recommendations: |
524 | 527 |
with 50-100bp interval padding or no interval file at all. Also check |
525 | 528 |
that the interval file was generated using the baits coordinates, not the |
526 | 529 |
targets (the baits BED file should have a more even size distribution, |
527 |
- e.g. 120bp and multiples of it). |
|
530 |
+ e.g. 120bp and multiples of it). If many variants are removed by the |
|
531 |
+ default 25 base quality feature, you might be using _Mutect 2_ and need |
|
532 |
+ to re-run _PureCN.R_ with `--min-base-quality 20`. |
|
528 | 533 |
|
529 | 534 |
- "Initial testing for significant sample cross-contamination" in the log |
530 | 535 |
file should not have many false positives, i.e. should be "unlikely" for |