... | ... |
@@ -17,7 +17,7 @@ |
17 | 17 |
#' which potentially have allelic fractions smaller than 1 due to artifacts or |
18 | 18 |
#' contamination. If a matched normal is available, this value is ignored, |
19 | 19 |
#' because homozygosity can be confirmed in the normal. |
20 |
-#' @param contamination.range Count variants in dbSNP with allelic fraction |
|
20 |
+#' @param contamination.range Count variants in germline databases with allelic fraction |
|
21 | 21 |
#' in the specified range. If the number of these putative contamination |
22 | 22 |
#' variants exceeds an expected value and if they are found on almost all |
23 | 23 |
#' chromosomes, the sample is flagged as potentially contaminated and extra |
... | ... |
@@ -49,7 +49,7 @@ |
49 | 49 |
#' the specified size in bp. Requires \code{target.granges}. |
50 | 50 |
#' @param DB.info.flag Flag in INFO of VCF that marks presence in common |
51 | 51 |
#' germline databases. Defaults to \code{DB} that may contain somatic variants |
52 |
-#' if it is from an unfiltered dbSNP VCF. |
|
52 |
+#' if it is from an unfiltered germline database. |
|
53 | 53 |
#' @return A list with elements \item{vcf}{The filtered \code{CollapsedVCF} |
54 | 54 |
#' object.} \item{flag}{A flag (\code{logical(1)}) if problems were |
55 | 55 |
#' identified.} \item{flag_comment}{A comment describing the flagging.} |
... | ... |
@@ -160,7 +160,7 @@ interval.padding = 50, DB.info.flag = "DB") { |
160 | 160 |
ifelse(fractionContaminated > minFractionContaminated, "maybe", "unlikely")) |
161 | 161 |
} |
162 | 162 |
|
163 |
- # do we have many low allelic fraction calls that are in dbSNP on basically |
|
163 |
+ # do we have many low allelic fraction calls that are in germline databases on basically |
|
164 | 164 |
# all chromosomes? then we found some contamination |
165 | 165 |
if (sum(runLength(seqnames(rowRanges(vcf[idx]))) > 3) >= 20 && |
166 | 166 |
fractionContaminated >= minFractionContaminated) { |
... | ... |
@@ -223,7 +223,7 @@ interval.padding = 50, DB.info.flag = "DB") { |
223 | 223 |
} |
224 | 224 |
if (!is.null(info(vcf)[[DB.info.flag]]) && |
225 | 225 |
sum(info(vcf)[[DB.info.flag]]) < nrow(vcf) / 2) { |
226 |
- flog.warn("Less than half of variants in dbSNP. Make sure that VCF %s", |
|
226 |
+ flog.warn("Less than half of variants are likely somatic. Make sure that VCF %s", |
|
227 | 227 |
"contains both germline and somatic variants.") |
228 | 228 |
} |
229 | 229 |
|
... | ... |
@@ -501,7 +501,7 @@ function(vcf, tumor.id.in.vcf, allowed = 0.05) { |
501 | 501 |
} else if (!is.null(info(vcf)[[POPAF.info.field]]) && |
502 | 502 |
max(unlist(info(vcf)[[POPAF.info.field]]), na.rm = TRUE) > 0.05) { |
503 | 503 |
if (max(unlist(info(vcf)[[POPAF.info.field]]), na.rm = TRUE) > 1.1) { |
504 |
- flog.info("Maximum of POPAP INFO is > 1, assuming -log10 scaled values") |
|
504 |
+ flog.info("Maximum of POPAF INFO is > 1, assuming -log10 scaled values") |
|
505 | 505 |
db <- info(vcf)[[POPAF.info.field]] < -log10(min.pop.af) |
506 | 506 |
} else { |
507 | 507 |
db <- info(vcf)[[POPAF.info.field]] > min.pop.af |
... | ... |
@@ -523,7 +523,7 @@ function(vcf, tumor.id.in.vcf, allowed = 0.05) { |
523 | 523 |
newInfo <- DataFrame( |
524 | 524 |
Number = 0, |
525 | 525 |
Type = "Flag", |
526 |
- Description = "dbSNP Membership", |
|
526 |
+ Description = "Likely somatic status, based on SOMATIC or Cosmic.CNT info fields, population allele frequency, or dbSNP membership", |
|
527 | 527 |
row.names = DB.info.flag) |
528 | 528 |
info(header(vcf)) <- rbind(info(header(vcf)), newInfo) |
529 | 529 |
info(vcf)[[DB.info.flag]] <- db |
... | ... |
@@ -170,7 +170,7 @@ ss) { |
170 | 170 |
|
171 | 171 |
.getAFPlotGroups <- function(r, single.mode) { |
172 | 172 |
if (single.mode) { |
173 |
- groupLevels <- c("dbSNP/germline", "dbSNP/somatic", "novel/somatic", |
|
173 |
+ groupLevels <- c("dbSNP or POPAF/germline", "dbSNP or POPAF/somatic", "novel/somatic", |
|
174 | 174 |
"novel/germline", "COSMIC/germline", "COSMIC/somatic", "contamination") |
175 | 175 |
r$group <- groupLevels[1] |
176 | 176 |
r$group[r$prior.somatic < 0.1 & r$ML.SOMATIC] <- groupLevels[2] |
... | ... |
@@ -566,7 +566,7 @@ ss) { |
566 | 566 |
y=r$ML.AR[r$ML.SOMATIC][idx.labels], |
567 | 567 |
labels=scatter.labels[idx.labels]) |
568 | 568 |
} |
569 |
- idxSomatic <- !grepl("germline|dbSNP|contamination", as.character(r$group)) |
|
569 |
+ idxSomatic <- !grepl("germline|dbSNP or POPAF|contamination", as.character(r$group)) |
|
570 | 570 |
if (sum(idxSomatic)) { |
571 | 571 |
colSomatic <- mycol.palette$color[match(names(sort(table(r$group[idxSomatic]), |
572 | 572 |
decreasing=TRUE)[1]), mycol.palette$group)] |
... | ... |
@@ -80,7 +80,7 @@ readAllelicCountsFile <- function(file, format, zero=NULL) { |
80 | 80 |
info(header(vcf)) <- DataFrame( |
81 | 81 |
Number = "0", |
82 | 82 |
Type = "Flag", |
83 |
- Description = "dbSNP Membership", |
|
83 |
+ Description = "Likely somatic status, based on SOMATIC or Cosmic.CNT info fields, population allele frequency, or dbSNP membership", |
|
84 | 84 |
row.names = "DB") |
85 | 85 |
|
86 | 86 |
geno(header(vcf)) <- DataFrame( |
... | ... |
@@ -3,10 +3,10 @@ |
3 | 3 |
#' This function takes as input tumor and normal control coverage data and |
4 | 4 |
#' a VCF containing allelic fractions of germline variants and somatic |
5 | 5 |
#' mutations. Normal control does not need to be from the same patient. |
6 |
-#' In case VCF does not contain somatic status, it should contain dbSNP and |
|
7 |
-#' optionally COSMIC annotation. Returns purity and ploidy combinations, |
|
8 |
-#' sorted by likelihood score. Provides copy number and LOH data, by both |
|
9 |
-#' gene and genomic region. |
|
6 |
+#' In case VCF does not contain somatic status, it should contain either |
|
7 |
+#' dbSNP or population allele frequencies, and optionally COSMIC annotation. |
|
8 |
+#' Returns purity and ploidy combinations, sorted by likelihood score. |
|
9 |
+#' Provides copy number and LOH data, by both gene and genomic region. |
|
10 | 10 |
#' |
11 | 11 |
#' |
12 | 12 |
#' @param normal.coverage.file Coverage file of normal control (optional |
... | ... |
@@ -29,7 +29,7 @@ |
29 | 29 |
#' @param vcf.file VCF file. |
30 | 30 |
#' Optional, but typically needed to select between local optima of similar |
31 | 31 |
#' likelihood. Can also be a \code{CollapsedVCF}, read with the \code{readVcf} |
32 |
-#' function. Requires a DB info flag for dbSNP membership. The default |
|
32 |
+#' function. Requires a DB info flag for likely somatic status. The default |
|
33 | 33 |
#' \code{fun.setPriorVcf} function will also look for a Cosmic.CNT slot (see |
34 | 34 |
#' \code{cosmic.vcf.file}), containing the hits in the COSMIC database. Again, |
35 | 35 |
#' do not expect very useful results without a VCF file. |
... | ... |
@@ -131,10 +131,10 @@ |
131 | 131 |
#' likelihood score calculation. Note that bias is reported on an inverse |
132 | 132 |
#' scale; a variant with mapping bias of 1 has no bias. |
133 | 133 |
#' @param max.pon Exclude variants found more than \code{max.pon} times in |
134 |
-#' pool of normals and not in dbSNP. Requires \code{mapping.bias.file} in |
|
135 |
-#' \code{\link{setMappingBiasVcf}}. Should be set to a value high enough |
|
134 |
+#' pool of normals and not in germline databases. Requires \code{mapping.bias.file} |
|
135 |
+#' in \code{\link{setMappingBiasVcf}}. Should be set to a value high enough |
|
136 | 136 |
#' to be much more likely an artifact and not a true germline variant not |
137 |
-#' present in dbSNP. |
|
137 |
+#' present in germline databases. |
|
138 | 138 |
#' @param min.variants.segment Flag segments with fewer variants. The |
139 | 139 |
#' minor copy number estimation is not reliable with insufficient variants. |
140 | 140 |
#' @param iterations Maximum number of iterations in the Simulated Annealing |
... | ... |
@@ -189,7 +189,7 @@ |
189 | 189 |
#' and indexed with bgzip and tabix, respectively. |
190 | 190 |
#' @param DB.info.flag Flag in INFO of VCF that marks presence in common |
191 | 191 |
#' germline databases. Defaults to \code{DB} that may contain somatic variants |
192 |
-#' if it is from an unfiltered dbSNP VCF. |
|
192 |
+#' if it is from an unfiltered germline database. |
|
193 | 193 |
#' @param POPAF.info.field As alternative to a flag, use an info field that |
194 | 194 |
#' contains population allele frequencies. The \code{DB} info flag has priority |
195 | 195 |
#' over this field when both exist. |
... | ... |
@@ -8,21 +8,22 @@ |
8 | 8 |
#' function from the VariantAnnotation package. |
9 | 9 |
#' @param prior.somatic Prior probabilities for somatic mutations. First value |
10 | 10 |
#' is for the case when no matched normals are available and the variant is not |
11 |
-#' in dbSNP (second value). Third value is for variants with MuTect somatic |
|
12 |
-#' call. Different from 1, because somatic mutations in segments of copy number |
|
13 |
-#' 0 have 0 probability and artifacts can thus have dramatic influence on |
|
11 |
+#' in germline databases (second value). Third value is for variants with MuTect |
|
12 |
+#' somatic call. Different from 1, because somatic mutations in segments of copy |
|
13 |
+#' number 0 have 0 probability and artifacts can thus have dramatic influence on |
|
14 | 14 |
#' likelihood score. Forth value is for variants not labeled as somatic by |
15 | 15 |
#' MuTect. Last two values are optional, if vcf contains a flag Cosmic.CNT, it |
16 |
-#' will set the prior probability for variants with CNT > 2 to the first of |
|
16 |
+#' will set the prior probability for variants with CNT > 6 to the first of |
|
17 | 17 |
#' those values in case of no matched normal available (0.995 default). Final |
18 |
-#' value is for the case that variant is in both dbSNP and COSMIC > 2. |
|
18 |
+#' value is for the case that variant is in both germline databases and |
|
19 |
+#' COSMIC count > 6. |
|
19 | 20 |
#' @param tumor.id.in.vcf Id of tumor in case multiple samples are stored in |
20 | 21 |
#' VCF. |
21 | 22 |
#' @param min.cosmic.cnt Minimum number of hits in the COSMIC database to |
22 | 23 |
#' call variant as likely somatic. |
23 | 24 |
#' @param DB.info.flag Flag in INFO of VCF that marks presence in common |
24 | 25 |
#' germline databases. Defaults to \code{DB} that may contain somatic variants |
25 |
-#' if it is from an unfiltered dbSNP VCF. |
|
26 |
+#' if it is from an unfiltered germline database. |
|
26 | 27 |
#' @param Cosmic.CNT.info.field Info field containing hits in the Cosmic database |
27 | 28 |
#' @return The \code{vcf} with \code{numeric(nrow(vcf))} vector with the |
28 | 29 |
#' prior probability of somatic status for each variant in the |
... | ... |
@@ -59,14 +60,14 @@ setPriorVcf <- function(vcf, prior.somatic = c(0.5, 0.0005, 0.999, 0.0001, |
59 | 60 |
if (!is.null(info(vcf)[[Cosmic.CNT.info.field]])) { |
60 | 61 |
flog.info("Found COSMIC annotation in VCF. Requiring %i hits.", |
61 | 62 |
min.cosmic.cnt) |
62 |
- flog.info("Setting somatic prior probabilities for hits to %f or to %f if in both COSMIC and dbSNP.", |
|
63 |
+ flog.info("Setting somatic prior probabilities for hits to %f or to %f if in both COSMIC and likely somatic based on dbSNP membership or population allele frequency.", |
|
63 | 64 |
tmp[5], tmp[6]) |
64 | 65 |
|
65 | 66 |
prior.somatic[which(info(vcf)[[Cosmic.CNT.info.field]] >= min.cosmic.cnt)] <- tmp[5] |
66 | 67 |
prior.somatic[which(info(vcf)[[Cosmic.CNT.info.field]] >= min.cosmic.cnt & |
67 | 68 |
info(vcf)[[DB.info.flag]])] <- tmp[6] |
68 | 69 |
} else { |
69 |
- flog.info("Setting somatic prior probabilities for dbSNP hits to %f or to %f otherwise.", |
|
70 |
+ flog.info("Setting somatic prior probabilities for likely somatic hits to %f or to %f otherwise.", |
|
70 | 71 |
tmp[2], tmp[1]) |
71 | 72 |
} |
72 | 73 |
} |
... | ... |
@@ -102,7 +102,7 @@ option_list <- list( |
102 | 102 |
help = "Maximum considered ploidy [default %default]"), |
103 | 103 |
make_option(c("--max-copy-number"), action = "store", type = "double", |
104 | 104 |
default = max(eval(formals(PureCN::runAbsoluteCN)$test.num.copy)), |
105 |
- help = "Maximum allele-specific integer copy number [default %default]"), |
|
105 |
+ help = "Maximum allele-specific integer copy number, only used for fitting allele-specific copy numbers. Higher copy numbers might still be inferred and reported [default %default]"), |
|
106 | 106 |
make_option(c("--post-optimize"), action = "store_true", default = FALSE, |
107 | 107 |
help = "Post-optimization [default %default]"), |
108 | 108 |
make_option(c("--bootstrap-n"), action = "store", type = "integer", default = 0, |
... | ... |
@@ -44,7 +44,7 @@ which potentially have allelic fractions smaller than 1 due to artifacts or |
44 | 44 |
contamination. If a matched normal is available, this value is ignored, |
45 | 45 |
because homozygosity can be confirmed in the normal.} |
46 | 46 |
|
47 |
-\item{contamination.range}{Count variants in dbSNP with allelic fraction |
|
47 |
+\item{contamination.range}{Count variants in germline databases with allelic fraction |
|
48 | 48 |
in the specified range. If the number of these putative contamination |
49 | 49 |
variants exceeds an expected value and if they are found on almost all |
50 | 50 |
chromosomes, the sample is flagged as potentially contaminated and extra |
... | ... |
@@ -87,7 +87,7 @@ the specified size in bp. Requires \code{target.granges}.} |
87 | 87 |
|
88 | 88 |
\item{DB.info.flag}{Flag in INFO of VCF that marks presence in common |
89 | 89 |
germline databases. Defaults to \code{DB} that may contain somatic variants |
90 |
-if it is from an unfiltered dbSNP VCF.} |
|
90 |
+if it is from an unfiltered germline database.} |
|
91 | 91 |
} |
92 | 92 |
\value{ |
93 | 93 |
A list with elements \item{vcf}{The filtered \code{CollapsedVCF} |
... | ... |
@@ -98,7 +98,7 @@ deviation, used to model likelihood of sub-clonal copy number events.} |
98 | 98 |
\item{vcf.file}{VCF file. |
99 | 99 |
Optional, but typically needed to select between local optima of similar |
100 | 100 |
likelihood. Can also be a \code{CollapsedVCF}, read with the \code{readVcf} |
101 |
-function. Requires a DB info flag for dbSNP membership. The default |
|
101 |
+function. Requires a DB info flag for likely somatic status. The default |
|
102 | 102 |
\code{fun.setPriorVcf} function will also look for a Cosmic.CNT slot (see |
103 | 103 |
\code{cosmic.vcf.file}), containing the hits in the COSMIC database. Again, |
104 | 104 |
do not expect very useful results without a VCF file.} |
... | ... |
@@ -233,10 +233,10 @@ likelihood score calculation. Note that bias is reported on an inverse |
233 | 233 |
scale; a variant with mapping bias of 1 has no bias.} |
234 | 234 |
|
235 | 235 |
\item{max.pon}{Exclude variants found more than \code{max.pon} times in |
236 |
-pool of normals and not in dbSNP. Requires \code{mapping.bias.file} in |
|
237 |
-\code{\link{setMappingBiasVcf}}. Should be set to a value high enough |
|
236 |
+pool of normals and not in germline databases. Requires \code{mapping.bias.file} |
|
237 |
+in \code{\link{setMappingBiasVcf}}. Should be set to a value high enough |
|
238 | 238 |
to be much more likely an artifact and not a true germline variant not |
239 |
-present in dbSNP.} |
|
239 |
+present in germline databases.} |
|
240 | 240 |
|
241 | 241 |
\item{iterations}{Maximum number of iterations in the Simulated Annealing |
242 | 242 |
copy number fit optimization. Note that this an integer optimization problem |
... | ... |
@@ -308,7 +308,7 @@ and indexed with bgzip and tabix, respectively.} |
308 | 308 |
|
309 | 309 |
\item{DB.info.flag}{Flag in INFO of VCF that marks presence in common |
310 | 310 |
germline databases. Defaults to \code{DB} that may contain somatic variants |
311 |
-if it is from an unfiltered dbSNP VCF.} |
|
311 |
+if it is from an unfiltered germline database.} |
|
312 | 312 |
|
313 | 313 |
\item{POPAF.info.field}{As alternative to a flag, use an info field that |
314 | 314 |
contains population allele frequencies. The \code{DB} info flag has priority |
... | ... |
@@ -354,10 +354,10 @@ input data.} |
354 | 354 |
This function takes as input tumor and normal control coverage data and |
355 | 355 |
a VCF containing allelic fractions of germline variants and somatic |
356 | 356 |
mutations. Normal control does not need to be from the same patient. |
357 |
-In case VCF does not contain somatic status, it should contain dbSNP and |
|
358 |
-optionally COSMIC annotation. Returns purity and ploidy combinations, |
|
359 |
-sorted by likelihood score. Provides copy number and LOH data, by both |
|
360 |
-gene and genomic region. |
|
357 |
+In case VCF does not contain somatic status, it should contain either |
|
358 |
+dbSNP or population allele frequencies, and optionally COSMIC annotation. |
|
359 |
+Returns purity and ploidy combinations, sorted by likelihood score. |
|
360 |
+Provides copy number and LOH data, by both gene and genomic region. |
|
361 | 361 |
} |
362 | 362 |
\examples{ |
363 | 363 |
|
... | ... |
@@ -19,14 +19,15 @@ function from the VariantAnnotation package.} |
19 | 19 |
|
20 | 20 |
\item{prior.somatic}{Prior probabilities for somatic mutations. First value |
21 | 21 |
is for the case when no matched normals are available and the variant is not |
22 |
-in dbSNP (second value). Third value is for variants with MuTect somatic |
|
23 |
-call. Different from 1, because somatic mutations in segments of copy number |
|
24 |
-0 have 0 probability and artifacts can thus have dramatic influence on |
|
22 |
+in germline databases (second value). Third value is for variants with MuTect |
|
23 |
+somatic call. Different from 1, because somatic mutations in segments of copy |
|
24 |
+number 0 have 0 probability and artifacts can thus have dramatic influence on |
|
25 | 25 |
likelihood score. Forth value is for variants not labeled as somatic by |
26 | 26 |
MuTect. Last two values are optional, if vcf contains a flag Cosmic.CNT, it |
27 |
-will set the prior probability for variants with CNT > 2 to the first of |
|
27 |
+will set the prior probability for variants with CNT > 6 to the first of |
|
28 | 28 |
those values in case of no matched normal available (0.995 default). Final |
29 |
-value is for the case that variant is in both dbSNP and COSMIC > 2.} |
|
29 |
+value is for the case that variant is in both germline databases and |
|
30 |
+COSMIC count > 6.} |
|
30 | 31 |
|
31 | 32 |
\item{tumor.id.in.vcf}{Id of tumor in case multiple samples are stored in |
32 | 33 |
VCF.} |
... | ... |
@@ -36,7 +37,7 @@ call variant as likely somatic.} |
36 | 37 |
|
37 | 38 |
\item{DB.info.flag}{Flag in INFO of VCF that marks presence in common |
38 | 39 |
germline databases. Defaults to \code{DB} that may contain somatic variants |
39 |
-if it is from an unfiltered dbSNP VCF.} |
|
40 |
+if it is from an unfiltered germline database.} |
|
40 | 41 |
|
41 | 42 |
\item{Cosmic.CNT.info.field}{Info field containing hits in the Cosmic database} |
42 | 43 |
} |
... | ... |
@@ -947,7 +947,7 @@ NOISY SEGMENTATION & More than \Rcode{max.segments} \\ |
947 | 947 |
NON-ABERRANT & $\geq 99 $\% of genome has identical copy number and $\geq 0.5$\% has second most common state \\ |
948 | 948 |
POLYGENOMIC & $\geq 0.75 \times$ \Rcode{max.non.clonal} fraction of the genome in sub-clonal state \\ |
949 | 949 |
POOR GOF & GoF $<$ \Rcode{min.gof} \\ |
950 |
-POTENTIAL SAMPLE CONTAMINATION & Significant portion of dbSNP variants potentially cross-contaminated \\ |
|
950 |
+POTENTIAL SAMPLE CONTAMINATION & Significant portion of variants present in germline databases are potentially cross-contaminated \\ |
|
951 | 951 |
RARE KARYOTYPE & Ploidy $< 1.5$ or $> 4.5$ \\ |
952 | 952 |
\bottomrule |
953 | 953 |
\end{tabular} |
... | ... |
@@ -1115,12 +1115,12 @@ If a matched normal is not available, it is also helpful to provide |
1115 | 1115 |
\Rcode{cosmic.vcf.file} (or via a \Rcode{Cosmic.CNT} INFO field in the VCF). |
1116 | 1116 |
While this has limited effect on purity and ploidy estimation due the sparsity |
1117 | 1117 |
of hotspot mutations, it often helps in the manual curation to compare how well |
1118 |
-high confidence germline (dbSNP) vs. somatic (COSMIC) variants fit a particular |
|
1119 |
-purity/ploidy combination. |
|
1118 |
+high confidence germline (based on dbSNP membership or POPAF) vs. somatic (COSMIC) |
|
1119 |
+variants fit a particular purity/ploidy combination. |
|
1120 | 1120 |
|
1121 | 1121 |
For variant classification (Section~\ref{predictsomatic}), providing COSMIC |
1122 |
-annotation also avoids that hotspot mutations with dbSNP id get assigned a very |
|
1123 |
-low prior probability of being somatic. |
|
1122 |
+annotation also avoids that hotspot mutations in germline databases get assigned |
|
1123 |
+a very low prior probability of being somatic. |
|
1124 | 1124 |
|
1125 | 1125 |
\subsection{ExAC and gnomAD annotation} |
1126 | 1126 |
|