Browse code

Updated documentation and comments: --max-allele-counts, dbSNP membership, COSMIC counts

Dmitriy Drichel authored on 15/09/2023 10:12:50
Showing 11 changed files

... ...
@@ -69,4 +69,4 @@ biocViews: CopyNumberVariation, Software, Sequencing,
69 69
     VariantAnnotation, VariantDetection, Coverage, ImmunoOncology
70 70
 NeedsCompilation: no
71 71
 ByteCompile: yes
72
-RoxygenNote: 7.2.3
72
+RoxygenNote: 7.2.3.9000
... ...
@@ -17,7 +17,7 @@
17 17
 #' which potentially have allelic fractions smaller than 1 due to artifacts or
18 18
 #' contamination. If a matched normal is available, this value is ignored,
19 19
 #' because homozygosity can be confirmed in the normal.
20
-#' @param contamination.range Count variants in dbSNP with allelic fraction
20
+#' @param contamination.range Count variants in germline databases with allelic fraction
21 21
 #' in the specified range. If the number of these putative contamination
22 22
 #' variants exceeds an expected value and if they are found on almost all
23 23
 #' chromosomes, the sample is flagged as potentially contaminated and extra
... ...
@@ -49,7 +49,7 @@
49 49
 #' the specified size in bp. Requires \code{target.granges}.
50 50
 #' @param DB.info.flag Flag in INFO of VCF that marks presence in common
51 51
 #' germline databases. Defaults to \code{DB} that may contain somatic variants
52
-#' if it is from an unfiltered dbSNP VCF.
52
+#' if it is from an unfiltered germline database.
53 53
 #' @return A list with elements \item{vcf}{The filtered \code{CollapsedVCF}
54 54
 #' object.} \item{flag}{A flag (\code{logical(1)}) if problems were
55 55
 #' identified.} \item{flag_comment}{A comment describing the flagging.}
... ...
@@ -160,7 +160,7 @@ interval.padding = 50, DB.info.flag = "DB") {
160 160
             ifelse(fractionContaminated > minFractionContaminated, "maybe", "unlikely"))
161 161
     }
162 162
 
163
-    # do we have many low allelic fraction calls that are in dbSNP on basically
163
+    # do we have many low allelic fraction calls that are in germline databases on basically
164 164
     # all chromosomes? then we found some contamination
165 165
     if (sum(runLength(seqnames(rowRanges(vcf[idx]))) > 3) >= 20 &&
166 166
         fractionContaminated >= minFractionContaminated) {
... ...
@@ -223,7 +223,7 @@ interval.padding = 50, DB.info.flag = "DB") {
223 223
     }
224 224
     if (!is.null(info(vcf)[[DB.info.flag]]) &&
225 225
         sum(info(vcf)[[DB.info.flag]]) < nrow(vcf) / 2) {
226
-        flog.warn("Less than half of variants in dbSNP. Make sure that VCF %s",
226
+        flog.warn("Less than half of variants are likely somatic. Make sure that VCF %s",
227 227
             "contains both germline and somatic variants.")
228 228
     }
229 229
 
... ...
@@ -501,7 +501,7 @@ function(vcf, tumor.id.in.vcf, allowed = 0.05) {
501 501
     } else if (!is.null(info(vcf)[[POPAF.info.field]]) &&
502 502
         max(unlist(info(vcf)[[POPAF.info.field]]), na.rm = TRUE) > 0.05) {
503 503
         if (max(unlist(info(vcf)[[POPAF.info.field]]), na.rm = TRUE) > 1.1) {
504
-            flog.info("Maximum of POPAP INFO is > 1, assuming -log10 scaled values")
504
+            flog.info("Maximum of POPAF INFO is > 1, assuming -log10 scaled values")
505 505
             db <- info(vcf)[[POPAF.info.field]] < -log10(min.pop.af)
506 506
         } else {
507 507
             db <- info(vcf)[[POPAF.info.field]] > min.pop.af
... ...
@@ -523,7 +523,7 @@ function(vcf, tumor.id.in.vcf, allowed = 0.05) {
523 523
     newInfo <- DataFrame(
524 524
         Number = 0,
525 525
         Type = "Flag",
526
-        Description = "dbSNP Membership",
526
+        Description = "Likely somatic status, based on SOMATIC or Cosmic.CNT info fields, population allele frequency, or dbSNP membership",
527 527
         row.names = DB.info.flag)
528 528
     info(header(vcf)) <- rbind(info(header(vcf)), newInfo)
529 529
     info(vcf)[[DB.info.flag]] <- db
... ...
@@ -170,7 +170,7 @@ ss) {
170 170
 
171 171
 .getAFPlotGroups <- function(r, single.mode) {
172 172
     if (single.mode) {
173
-        groupLevels <- c("dbSNP/germline", "dbSNP/somatic", "novel/somatic",
173
+        groupLevels <- c("dbSNP or POPAF/germline", "dbSNP or POPAF/somatic", "novel/somatic",
174 174
             "novel/germline", "COSMIC/germline", "COSMIC/somatic", "contamination")
175 175
         r$group <- groupLevels[1]
176 176
         r$group[r$prior.somatic < 0.1 & r$ML.SOMATIC] <- groupLevels[2]
... ...
@@ -566,7 +566,7 @@ ss) {
566 566
                 y=r$ML.AR[r$ML.SOMATIC][idx.labels],
567 567
                 labels=scatter.labels[idx.labels])
568 568
         }
569
-        idxSomatic <- !grepl("germline|dbSNP|contamination", as.character(r$group))
569
+        idxSomatic <- !grepl("germline|dbSNP or POPAF|contamination", as.character(r$group))
570 570
         if (sum(idxSomatic)) {
571 571
             colSomatic <- mycol.palette$color[match(names(sort(table(r$group[idxSomatic]),
572 572
                 decreasing=TRUE)[1]), mycol.palette$group)]
... ...
@@ -80,7 +80,7 @@ readAllelicCountsFile <- function(file, format, zero=NULL) {
80 80
     info(header(vcf)) <- DataFrame(
81 81
         Number = "0",
82 82
         Type = "Flag",
83
-        Description = "dbSNP Membership",
83
+        Description = "Likely somatic status, based on SOMATIC or Cosmic.CNT info fields, population allele frequency, or dbSNP membership",
84 84
         row.names = "DB")
85 85
 
86 86
     geno(header(vcf)) <- DataFrame(
... ...
@@ -3,10 +3,10 @@
3 3
 #' This function takes as input tumor and normal control coverage data and
4 4
 #' a VCF containing allelic fractions of germline variants and somatic
5 5
 #' mutations. Normal control does not need to be from the same patient.
6
-#' In case VCF does not contain somatic status, it should contain dbSNP and
7
-#' optionally COSMIC annotation. Returns purity and ploidy combinations,
8
-#' sorted by likelihood score. Provides copy number and LOH data, by both
9
-#' gene and genomic region.
6
+#' In case VCF does not contain somatic status, it should contain either
7
+#' dbSNP or population allele frequencies, and  optionally COSMIC annotation.
8
+#' Returns purity and ploidy combinations, sorted by likelihood score.
9
+#' Provides copy number and LOH data, by both gene and genomic region.
10 10
 #'
11 11
 #'
12 12
 #' @param normal.coverage.file Coverage file of normal control (optional
... ...
@@ -29,7 +29,7 @@
29 29
 #' @param vcf.file VCF file.
30 30
 #' Optional, but typically needed to select between local optima of similar
31 31
 #' likelihood. Can also be a \code{CollapsedVCF}, read with the \code{readVcf}
32
-#' function.  Requires a DB info flag for dbSNP membership. The default
32
+#' function.  Requires a DB info flag for likely somatic status. The default
33 33
 #' \code{fun.setPriorVcf} function will also look for a Cosmic.CNT slot (see
34 34
 #' \code{cosmic.vcf.file}), containing the hits in the COSMIC database. Again,
35 35
 #' do not expect very useful results without a VCF file.
... ...
@@ -131,10 +131,10 @@
131 131
 #' likelihood score calculation. Note that bias is reported on an inverse
132 132
 #' scale; a variant with mapping bias of 1 has no bias.
133 133
 #' @param max.pon Exclude variants found more than \code{max.pon} times in
134
-#' pool of normals and not in dbSNP. Requires \code{mapping.bias.file} in
135
-#' \code{\link{setMappingBiasVcf}}. Should be set to a value high enough
134
+#' pool of normals and not in germline databases. Requires \code{mapping.bias.file}
135
+#' in \code{\link{setMappingBiasVcf}}. Should be set to a value high enough
136 136
 #' to be much more likely an artifact and not a true germline variant not
137
-#' present in dbSNP.
137
+#' present in germline databases.
138 138
 #' @param min.variants.segment Flag segments with fewer variants. The
139 139
 #' minor copy number estimation is not reliable with insufficient variants.
140 140
 #' @param iterations Maximum number of iterations in the Simulated Annealing
... ...
@@ -189,7 +189,7 @@
189 189
 #' and indexed with bgzip and tabix, respectively.
190 190
 #' @param DB.info.flag Flag in INFO of VCF that marks presence in common
191 191
 #' germline databases. Defaults to \code{DB} that may contain somatic variants
192
-#' if it is from an unfiltered dbSNP VCF.
192
+#' if it is from an unfiltered germline database.
193 193
 #' @param POPAF.info.field As alternative to a flag, use an info field that
194 194
 #' contains population allele frequencies. The \code{DB} info flag has priority
195 195
 #' over this field when both exist.
... ...
@@ -8,21 +8,22 @@
8 8
 #' function from the VariantAnnotation package.
9 9
 #' @param prior.somatic Prior probabilities for somatic mutations. First value
10 10
 #' is for the case when no matched normals are available and the variant is not
11
-#' in dbSNP (second value). Third value is for variants with MuTect somatic
12
-#' call. Different from 1, because somatic mutations in segments of copy number
13
-#' 0 have 0 probability and artifacts can thus have dramatic influence on
11
+#' in germline databases (second value). Third value is for variants with MuTect
12
+#' somatic call. Different from 1, because somatic mutations in segments of copy
13
+#' number 0 have 0 probability and artifacts can thus have dramatic influence on
14 14
 #' likelihood score. Forth value is for variants not labeled as somatic by
15 15
 #' MuTect. Last two values are optional, if vcf contains a flag Cosmic.CNT, it
16
-#' will set the prior probability for variants with CNT > 2 to the first of
16
+#' will set the prior probability for variants with CNT > 6 to the first of
17 17
 #' those values in case of no matched normal available (0.995 default).  Final
18
-#' value is for the case that variant is in both dbSNP and COSMIC > 2.
18
+#' value is for the case that variant is in both germline databases and
19
+#' COSMIC count > 6.
19 20
 #' @param tumor.id.in.vcf Id of tumor in case multiple samples are stored in
20 21
 #' VCF.
21 22
 #' @param min.cosmic.cnt Minimum number of hits in the COSMIC database to 
22 23
 #' call variant as likely somatic.
23 24
 #' @param DB.info.flag Flag in INFO of VCF that marks presence in common
24 25
 #' germline databases. Defaults to \code{DB} that may contain somatic variants
25
-#' if it is from an unfiltered dbSNP VCF.
26
+#' if it is from an unfiltered germline database.
26 27
 #' @param Cosmic.CNT.info.field Info field containing hits in the Cosmic database
27 28
 #' @return The \code{vcf} with \code{numeric(nrow(vcf))} vector with the
28 29
 #' prior probability of somatic status for each variant in the 
... ...
@@ -59,14 +60,14 @@ setPriorVcf <- function(vcf, prior.somatic = c(0.5, 0.0005, 0.999, 0.0001,
59 60
          if (!is.null(info(vcf)[[Cosmic.CNT.info.field]])) {
60 61
              flog.info("Found COSMIC annotation in VCF. Requiring %i hits.", 
61 62
                 min.cosmic.cnt)
62
-             flog.info("Setting somatic prior probabilities for hits to %f or to %f if in both COSMIC and dbSNP.", 
63
+             flog.info("Setting somatic prior probabilities for hits to %f or to %f if in both COSMIC and likely somatic based on dbSNP membership or population allele frequency.", 
63 64
                 tmp[5], tmp[6])
64 65
 
65 66
              prior.somatic[which(info(vcf)[[Cosmic.CNT.info.field]] >= min.cosmic.cnt)] <- tmp[5]
66 67
              prior.somatic[which(info(vcf)[[Cosmic.CNT.info.field]] >= min.cosmic.cnt & 
67 68
                 info(vcf)[[DB.info.flag]])] <- tmp[6]
68 69
          } else {
69
-             flog.info("Setting somatic prior probabilities for dbSNP hits to %f or to %f otherwise.", 
70
+             flog.info("Setting somatic prior probabilities for likely somatic hits to %f or to %f otherwise.", 
70 71
                 tmp[2], tmp[1])
71 72
          }      
72 73
     }     
... ...
@@ -102,7 +102,7 @@ option_list <- list(
102 102
         help = "Maximum considered ploidy [default %default]"),
103 103
     make_option(c("--max-copy-number"), action = "store", type = "double",
104 104
         default =  max(eval(formals(PureCN::runAbsoluteCN)$test.num.copy)),
105
-        help = "Maximum allele-specific integer copy number [default %default]"),
105
+        help = "Maximum allele-specific integer copy number, only used for fitting allele-specific copy numbers. Higher copy numbers might still be inferred and reported [default %default]"),
106 106
     make_option(c("--post-optimize"), action = "store_true", default = FALSE,
107 107
         help = "Post-optimization [default %default]"),
108 108
     make_option(c("--bootstrap-n"), action = "store", type = "integer", default = 0,
... ...
@@ -44,7 +44,7 @@ which potentially have allelic fractions smaller than 1 due to artifacts or
44 44
 contamination. If a matched normal is available, this value is ignored,
45 45
 because homozygosity can be confirmed in the normal.}
46 46
 
47
-\item{contamination.range}{Count variants in dbSNP with allelic fraction
47
+\item{contamination.range}{Count variants in germline databases with allelic fraction
48 48
 in the specified range. If the number of these putative contamination
49 49
 variants exceeds an expected value and if they are found on almost all
50 50
 chromosomes, the sample is flagged as potentially contaminated and extra
... ...
@@ -87,7 +87,7 @@ the specified size in bp. Requires \code{target.granges}.}
87 87
 
88 88
 \item{DB.info.flag}{Flag in INFO of VCF that marks presence in common
89 89
 germline databases. Defaults to \code{DB} that may contain somatic variants
90
-if it is from an unfiltered dbSNP VCF.}
90
+if it is from an unfiltered germline database.}
91 91
 }
92 92
 \value{
93 93
 A list with elements \item{vcf}{The filtered \code{CollapsedVCF}
... ...
@@ -98,7 +98,7 @@ deviation, used to model likelihood of sub-clonal copy number events.}
98 98
 \item{vcf.file}{VCF file.
99 99
 Optional, but typically needed to select between local optima of similar
100 100
 likelihood. Can also be a \code{CollapsedVCF}, read with the \code{readVcf}
101
-function.  Requires a DB info flag for dbSNP membership. The default
101
+function.  Requires a DB info flag for likely somatic status. The default
102 102
 \code{fun.setPriorVcf} function will also look for a Cosmic.CNT slot (see
103 103
 \code{cosmic.vcf.file}), containing the hits in the COSMIC database. Again,
104 104
 do not expect very useful results without a VCF file.}
... ...
@@ -233,10 +233,10 @@ likelihood score calculation. Note that bias is reported on an inverse
233 233
 scale; a variant with mapping bias of 1 has no bias.}
234 234
 
235 235
 \item{max.pon}{Exclude variants found more than \code{max.pon} times in
236
-pool of normals and not in dbSNP. Requires \code{mapping.bias.file} in
237
-\code{\link{setMappingBiasVcf}}. Should be set to a value high enough
236
+pool of normals and not in germline databases. Requires \code{mapping.bias.file}
237
+in \code{\link{setMappingBiasVcf}}. Should be set to a value high enough
238 238
 to be much more likely an artifact and not a true germline variant not
239
-present in dbSNP.}
239
+present in germline databases.}
240 240
 
241 241
 \item{iterations}{Maximum number of iterations in the Simulated Annealing
242 242
 copy number fit optimization. Note that this an integer optimization problem
... ...
@@ -308,7 +308,7 @@ and indexed with bgzip and tabix, respectively.}
308 308
 
309 309
 \item{DB.info.flag}{Flag in INFO of VCF that marks presence in common
310 310
 germline databases. Defaults to \code{DB} that may contain somatic variants
311
-if it is from an unfiltered dbSNP VCF.}
311
+if it is from an unfiltered germline database.}
312 312
 
313 313
 \item{POPAF.info.field}{As alternative to a flag, use an info field that
314 314
 contains population allele frequencies. The \code{DB} info flag has priority
... ...
@@ -354,10 +354,10 @@ input data.}
354 354
 This function takes as input tumor and normal control coverage data and
355 355
 a VCF containing allelic fractions of germline variants and somatic
356 356
 mutations. Normal control does not need to be from the same patient.
357
-In case VCF does not contain somatic status, it should contain dbSNP and
358
-optionally COSMIC annotation. Returns purity and ploidy combinations,
359
-sorted by likelihood score. Provides copy number and LOH data, by both
360
-gene and genomic region.
357
+In case VCF does not contain somatic status, it should contain either
358
+dbSNP or population allele frequencies, and  optionally COSMIC annotation.
359
+Returns purity and ploidy combinations, sorted by likelihood score.
360
+Provides copy number and LOH data, by both gene and genomic region.
361 361
 }
362 362
 \examples{
363 363
 
... ...
@@ -19,14 +19,15 @@ function from the VariantAnnotation package.}
19 19
 
20 20
 \item{prior.somatic}{Prior probabilities for somatic mutations. First value
21 21
 is for the case when no matched normals are available and the variant is not
22
-in dbSNP (second value). Third value is for variants with MuTect somatic
23
-call. Different from 1, because somatic mutations in segments of copy number
24
-0 have 0 probability and artifacts can thus have dramatic influence on
22
+in germline databases (second value). Third value is for variants with MuTect
23
+somatic call. Different from 1, because somatic mutations in segments of copy
24
+number 0 have 0 probability and artifacts can thus have dramatic influence on
25 25
 likelihood score. Forth value is for variants not labeled as somatic by
26 26
 MuTect. Last two values are optional, if vcf contains a flag Cosmic.CNT, it
27
-will set the prior probability for variants with CNT > 2 to the first of
27
+will set the prior probability for variants with CNT > 6 to the first of
28 28
 those values in case of no matched normal available (0.995 default).  Final
29
-value is for the case that variant is in both dbSNP and COSMIC > 2.}
29
+value is for the case that variant is in both germline databases and
30
+COSMIC count > 6.}
30 31
 
31 32
 \item{tumor.id.in.vcf}{Id of tumor in case multiple samples are stored in
32 33
 VCF.}
... ...
@@ -36,7 +37,7 @@ call variant as likely somatic.}
36 37
 
37 38
 \item{DB.info.flag}{Flag in INFO of VCF that marks presence in common
38 39
 germline databases. Defaults to \code{DB} that may contain somatic variants
39
-if it is from an unfiltered dbSNP VCF.}
40
+if it is from an unfiltered germline database.}
40 41
 
41 42
 \item{Cosmic.CNT.info.field}{Info field containing hits in the Cosmic database}
42 43
 }
... ...
@@ -947,7 +947,7 @@ NOISY SEGMENTATION & More than \Rcode{max.segments} \\
947 947
 NON-ABERRANT & $\geq 99 $\% of genome has identical copy number and $\geq 0.5$\% has second most common state \\
948 948
 POLYGENOMIC & $\geq 0.75 \times$ \Rcode{max.non.clonal} fraction of the genome in sub-clonal state \\
949 949
 POOR GOF & GoF $<$ \Rcode{min.gof} \\
950
-POTENTIAL SAMPLE CONTAMINATION & Significant portion of dbSNP variants potentially cross-contaminated \\
950
+POTENTIAL SAMPLE CONTAMINATION & Significant portion of variants present in germline databases are potentially cross-contaminated \\
951 951
 RARE KARYOTYPE & Ploidy $< 1.5$ or  $> 4.5$  \\
952 952
 \bottomrule
953 953
 \end{tabular}
... ...
@@ -1115,12 +1115,12 @@ If a matched normal is not available, it is also helpful to provide
1115 1115
 \Rcode{cosmic.vcf.file} (or via a \Rcode{Cosmic.CNT} INFO field in the VCF).
1116 1116
 While this has limited effect on purity and ploidy estimation due the sparsity
1117 1117
 of hotspot mutations, it often helps in the manual curation to compare how well
1118
-high confidence germline (dbSNP) vs. somatic (COSMIC) variants fit a particular
1119
-purity/ploidy combination. 
1118
+high confidence germline (based on dbSNP membership or POPAF) vs. somatic (COSMIC)
1119
+variants fit a particular purity/ploidy combination. 
1120 1120
 
1121 1121
 For variant classification (Section~\ref{predictsomatic}), providing COSMIC
1122
-annotation also avoids that hotspot mutations with dbSNP id get assigned a very
1123
-low prior probability of being somatic.
1122
+annotation also avoids that hotspot mutations in germline databases get assigned
1123
+a very low prior probability of being somatic.
1124 1124
 
1125 1125
 \subsection{ExAC and gnomAD annotation}
1126 1126