Browse code

Merge pull request #326 from lima1/issue_320

Issue 320

Markus Riester authored on 23/09/2023 23:38:46 • GitHub committed on 23/09/2023 23:38:46
Showing 4 changed files

... ...
@@ -693,6 +693,9 @@ function(vcf, tumor.id.in.vcf, allowed = 0.05) {
693 693
 .filterVcfByBQ <- function(vcf, tumor.id.in.vcf, min.base.quality) {
694 694
     n.vcf.before.filter <- .countVariants(vcf)
695 695
     idx <- .getBQFromVcf(vcf, tumor.id.in.vcf, base.quality.offset = 0) < min.base.quality
696
+    if (sum(idx, na.rm = TRUE) / length(idx) > 0.5) {
697
+        flog.warn("Many variants removed by min.base.quality (%i) filter. You might want to lower the cutoff.", min.base.quality)
698
+    }
696 699
     vcf <- .removeVariants(vcf, idx, "BQ", na.rm = FALSE)
697 700
     flog.info("Removing %i low quality variants with non-offset BQ < %i.",
698 701
         n.vcf.before.filter - .countVariants(vcf), min.base.quality)
... ...
@@ -42,6 +42,9 @@ option_list <- list(
42 42
     make_option(c("--base-quality-offset"), action = "store", type = "integer",
43 43
         default = formals(PureCN::filterVcfBasic)$base.quality.offset,
44 44
         help = "VCF Filter: Subtracts the specified value from the base quality score [default %default]"),
45
+    make_option(c("--min-base-quality"), action = "store", type = "integer",
46
+        default = formals(PureCN::filterVcfBasic)$min.base.quality,
47
+        help = "VCF Filter: Minimum base quality for variants. Set to 0 to turn off filter [default %default]"),
45 48
     make_option(c("--min-supporting-reads"), action = "store", type = "integer",
46 49
         default = formals(PureCN::filterVcfBasic)$min.supporting.reads,
47 50
         help = "VCF Filter: Instead of calculating the min. number of supporting reads, use specified one [default %default]"),
... ...
@@ -177,14 +180,14 @@ alias_list <- list(
177 180
     "maxhomozygousloss" = "max-homozygous-loss",
178 181
     "speedupheuristics" = "speedup-heuristics",
179 182
     "outvcf" = "out-vcf"
180
-)    
183
+)
181 184
 replace_alias <- function(x, deprecated = TRUE) {
182 185
     idx <- match(x, paste0("--", names(alias_list)))
183 186
     if (any(!is.na(idx))) {
184 187
         replaced <- paste0("--", alias_list[na.omit(idx)])
185 188
         x[!is.na(idx)] <- replaced
186 189
         if (deprecated) {
187
-            flog.warn("Deprecated arguments, use %s instead.", paste(replaced, collapse=" "))
190
+            flog.warn("Deprecated arguments, use %s instead.", paste(replaced, collapse = " "))
188 191
         }
189 192
     }
190 193
     return(x)
... ...
@@ -253,9 +256,9 @@ if (Sys.getenv("PURECN_DEBUG") != "") {
253 256
 }
254 257
 
255 258
 .checkFileList <- function(file) {
256
-    files <- read.delim(file, as.is = TRUE, header = FALSE)[,1]
259
+    files <- read.delim(file, as.is = TRUE, header = FALSE)[, 1]
257 260
     numExists <- sum(file.exists(files), na.rm = TRUE)
258
-    if (numExists < length(files)) { 
261
+    if (numExists < length(files)) {
259 262
         stop("File not exists in file ", file)
260 263
     }
261 264
     files
... ...
@@ -397,6 +400,7 @@ if (file.exists(file.rds) && !opt$force) {
397 400
                 af.range = af.range, stats.file = opt$stats_file,
398 401
                 min.supporting.reads = opt$min_supporting_reads,
399 402
                 base.quality.offset = opt$base_quality_offset,
403
+                min.base.quality = opt$min_base_quality,
400 404
                 ignore = mutect.ignore,
401 405
                 interval.padding = opt$interval_padding),
402 406
             args.filterIntervals = list(
... ...
@@ -133,3 +133,10 @@ test_that("issue 249", {
133 133
     vcf.bq <- PureCN:::.readAndCheckVcf(vcf.bq, "hg19")
134 134
     expect_equal(NA, geno(vcf.bq)$BQ[[2, 1]])
135 135
 })
136
+
137
+test_that("issue 320", {
138
+    # set random BQ to NA
139
+    expect_output(x <- filterVcfBasic(vcf, use.somatic.status = FALSE, min.base.quality = 34),
140
+        "Many variants removed by min.base.quality")
141
+    expect_equal(34, min(sapply(geno(x$vcf)$BQ[,1], function(x) x[1])))
142
+})
... ...
@@ -207,7 +207,10 @@ Important recommendations:
207 207
   to presence in germline databases. _Mutect 1.1.7_ automatically calls SNPs, 
208 208
   but _Mutect 2_ does not. Make sure to run _Mutect 2_ with
209 209
   `--genotype-germline-sites true --genotype-pon-sites true`. You will not get
210
-  usuable output without those flags.
210
+  usuable output without those flags. Since _Mutect 2_ from _GATK 4.2.0+_,
211
+  average base quality scores can be very low and variants will be too
212
+  aggressively removed by _PureCN_.  You will need to set `--min-base-quality
213
+  20` in _PureCN.R_ to keep them.
211 214
 
212 215
 - Run the variant caller with a 50-75 base pair interval padding to increase
213 216
   the number of heterozygous SNPs (for example `--interval_padding` and
... ...
@@ -524,7 +527,9 @@ Important recommendations:
524 527
       with 50-100bp interval padding or no interval file at all. Also check
525 528
       that the interval file was generated using the baits coordinates, not the
526 529
       targets (the baits BED file should have a more even size distribution,
527
-      e.g. 120bp and multiples of it).
530
+      e.g. 120bp and multiples of it). If many variants are removed by the
531
+      default 25 base quality feature, you might be using _Mutect 2_ and need
532
+      to re-run _PureCN.R_ with `--min-base-quality 20`.
528 533
     
529 534
     - "Initial testing for significant sample cross-contamination" in the log
530 535
       file should not have many false positives, i.e. should be "unlikely" for