... | ... |
@@ -2,8 +2,8 @@ Package: PureCN |
2 | 2 |
Type: Package |
3 | 3 |
Title: Copy number calling and SNV classification using |
4 | 4 |
targeted short read sequencing |
5 |
-Version: 1.19.5 |
|
6 |
-Date: 2020-07-06 |
|
5 |
+Version: 1.19.6 |
|
6 |
+Date: 2020-07-11 |
|
7 | 7 |
Authors@R: c(person("Markus", "Riester", |
8 | 8 |
role = c("aut", "cre"), |
9 | 9 |
email = "[email protected]", |
... | ... |
@@ -4,6 +4,8 @@ Changes in version 1.20.0 |
4 | 4 |
NEW FEATURES |
5 | 5 |
|
6 | 6 |
o Support for GATK4 GenomicsDB import for mapping bias calculation |
7 |
+ o Added --additionaltumors to PureCN.R to provide coverage files |
|
8 |
+ from additional biopsies from the same patient when available |
|
7 | 9 |
|
8 | 10 |
SIGNIFICANT USER-VISIBLE CHANGES |
9 | 11 |
|
... | ... |
@@ -20,6 +22,9 @@ SIGNIFICANT USER-VISIBLE CHANGES |
20 | 22 |
o Made calculateIntervalWeights defunct |
21 | 23 |
o Changed default of min.normals in calculateMappingBiasVcf/Gatk4 to 1 |
22 | 24 |
from 2 |
25 |
+ o Changed default of --signature_databases to |
|
26 |
+ "signatures.exome.cosmic.v3.may2019" (v3 instead of v2) |
|
27 |
+ o Now warn if recommended -funsegmentation is not used |
|
23 | 28 |
|
24 | 29 |
BUGFIXES |
25 | 30 |
|
... | ... |
@@ -303,7 +303,12 @@ calculateTangentNormal <- function(tumor.coverage.file, normalDB, |
303 | 303 |
} |
304 | 304 |
|
305 | 305 |
.readNormals <- function(normal.coverage.files) { |
306 |
- normals <- lapply(normal.coverage.files, readCoverageFile) |
|
306 |
+ normals <- lapply(normal.coverage.files, function(x) { |
|
307 |
+ if (is(x, "character")) { |
|
308 |
+ return(readCoverageFile(normalizePath(x))) |
|
309 |
+ } |
|
310 |
+ return(x) |
|
311 |
+ }) |
|
307 | 312 |
|
308 | 313 |
# check that all files used the same interval file. |
309 | 314 |
for (i in seq_along(normals)) { |
... | ... |
@@ -61,7 +61,6 @@ processMultipleSamples <- function(tumor.coverage.files, sampleids, normalDB, |
61 | 61 |
if (!requireNamespace("copynumber", quietly = TRUE)) { |
62 | 62 |
.stopUserError("processMultipleSamples requires the copynumber package.") |
63 | 63 |
} |
64 |
- tumor.coverage.files <- normalizePath(tumor.coverage.files) |
|
65 | 64 |
tumors <- lapply(.readNormals(tumor.coverage.files), |
66 | 65 |
calculateTangentNormal, normalDB, num.eigen = num.eigen) |
67 | 66 |
|
... | ... |
@@ -84,7 +83,10 @@ processMultipleSamples <- function(tumor.coverage.files, sampleids, normalDB, |
84 | 83 |
intervalsUsed <- .filterIntervalsChrHash(intervalsUsed, tumors[[1]], chr.hash) |
85 | 84 |
centromeres <- .getCentromerePositions(centromeres, genome, |
86 | 85 |
if (is.null(tumors[[1]])) NULL else seqlevelsStyle(tumors[[1]])) |
87 |
- |
|
86 |
+ if (is.null(centromeres)) { |
|
87 |
+ .stopUserError("Cannot find centromeres for ", genome, |
|
88 |
+ ". Provide them manually or select a supported genome.") |
|
89 |
+ } |
|
88 | 90 |
armLocations <- .getArmLocations(tumors[[1]], chr.hash, centromeres) |
89 | 91 |
armLocationsGr <- GRanges(armLocations) |
90 | 92 |
arms <- armLocationsGr$arm[findOverlaps(tumors[[1]], armLocationsGr, select="first")] |
... | ... |
@@ -21,7 +21,7 @@ option_list <- list( |
21 | 21 |
make_option(c("--signatures"), action = "store_true", default = FALSE, |
22 | 22 |
help="Attempt the deconstruction of COSMIC signatures (requires deconstructSigs package)"), |
23 | 23 |
make_option(c("--signature_databases"), action = "store", type = "character", |
24 |
- default = "signatures.cosmic", |
|
24 |
+ default = "signatures.exome.cosmic.v3.may2019", |
|
25 | 25 |
help = "Use the specified signature databases provided by deconstrucSigs. To test multiple databases, provide them : separated [%default]."), |
26 | 26 |
make_option(c("--out"), action = "store", type = "character", |
27 | 27 |
default = NULL, |
... | ... |
@@ -24,6 +24,8 @@ option_list <- list( |
24 | 24 |
default = NULL, help = "Input: Segmentation file"), |
25 | 25 |
make_option(c("--logratiofile"), action = "store", type = "character", |
26 | 26 |
default = NULL, help = "Input: Log2 copy number ratio file"), |
27 |
+ make_option(c("--additionaltumors"), action = "store", type = "character", |
|
28 |
+ default = NULL, help = "Input: tumor coverages from additional biopsies from the SAME patient, GC-normalized"), |
|
27 | 29 |
make_option(c("--sex"), action = "store", type = "character", |
28 | 30 |
default = formals(PureCN::runAbsoluteCN)$sex[[2]], |
29 | 31 |
help = "Input: Sex of sample. ? (detect), diplod (non-diploid chromosomes removed), F or M [default %default]"), |
... | ... |
@@ -181,6 +183,16 @@ if (Sys.getenv("PURECN_DEBUG") != "") { |
181 | 183 |
debug <- TRUE |
182 | 184 |
} |
183 | 185 |
|
186 |
+.checkFileList <- function(file) { |
|
187 |
+ files <- read.delim(file, as.is = TRUE, header = FALSE)[,1] |
|
188 |
+ numExists <- sum(file.exists(files), na.rm = TRUE) |
|
189 |
+ if (numExists < length(files)) { |
|
190 |
+ stop("File not exists in file ", file) |
|
191 |
+ } |
|
192 |
+ files |
|
193 |
+} |
|
194 |
+ |
|
195 |
+ |
|
184 | 196 |
### Run PureCN ---------------------------------------------------------------- |
185 | 197 |
|
186 | 198 |
if (file.exists(file.rds) && !opt$force) { |
... | ... |
@@ -190,7 +202,6 @@ if (file.exists(file.rds) && !opt$force) { |
190 | 202 |
if (is.null(sampleid)) sampleid <- ret$input$sampleid |
191 | 203 |
} else { |
192 | 204 |
if (!is.null(opt$normaldb)) { |
193 |
- #if (!is.null(seg.file)) stop("normalDB and segfile do not work together.") |
|
194 | 205 |
normalDB <- readRDS(opt$normaldb) |
195 | 206 |
if (!is.null(normal.coverage.file)) { |
196 | 207 |
flog.warn("Both --normal and --normalDB provided. normalDB will NOT be used for coverage denoising. You probably do not want this.") |
... | ... |
@@ -213,21 +224,46 @@ if (file.exists(file.rds) && !opt$force) { |
213 | 224 |
file.log <- paste0(out, ".log") |
214 | 225 |
|
215 | 226 |
pdf(paste0(out, "_segmentation.pdf"), width = 10, height = 11) |
227 |
+ if (!is.null(opt$additionaltumors)) { |
|
228 |
+ if (!is.null(seg.file)) { |
|
229 |
+ stop("--additionaltumors overwrites --segfile") |
|
230 |
+ } |
|
231 |
+ seg.file <- paste0(out, "_multisample.seg") |
|
232 |
+ if (grepl(".list$", opt$additionaltumors)) { |
|
233 |
+ additional.tumors <- .checkFileList(opt$additionaltumors) |
|
234 |
+ } else { |
|
235 |
+ additional.tumors <- opt$additionaltumors |
|
236 |
+ } |
|
237 |
+ multi.seg <- processMultipleSamples( |
|
238 |
+ c(list(tumor.coverage.file), as.list(additional.tumors)), |
|
239 |
+ sampleids = c(sampleid, paste(sampleid, |
|
240 |
+ seq_along(additional.tumors) + 1, sep = "_")), |
|
241 |
+ normalDB = normalDB, genome = opt$genome, verbose = debug) |
|
242 |
+ write.table(multi.seg, seg.file, row.names = FALSE, sep = "\t") |
|
243 |
+ } |
|
216 | 244 |
af.range <- c(opt$minaf, 1 - opt$minaf) |
217 | 245 |
test.purity <- seq(opt$minpurity, opt$maxpurity, by = 0.01) |
218 | 246 |
|
247 |
+ uses.recommended.fun <- FALSE |
|
248 |
+ recommended.fun <- if (is.null(seg.file)) "PSCBS" else "Hclust" |
|
219 | 249 |
fun.segmentation <- segmentationCBS |
220 | 250 |
if (opt$funsegmentation != "CBS") { |
221 | 251 |
if (opt$funsegmentation == "PSCBS") { |
222 | 252 |
fun.segmentation <- segmentationPSCBS |
253 |
+ if (is.null(seg.file)) uses.recommended.fun <- TRUE |
|
223 | 254 |
} else if (opt$funsegmentation == "Hclust") { |
224 | 255 |
fun.segmentation <- segmentationHclust |
256 |
+ if (!is.null(seg.file)) uses.recommended.fun <- TRUE |
|
225 | 257 |
} else if (opt$funsegmentation == "none") { |
226 | 258 |
fun.segmentation <- function(seg, ...) seg |
227 | 259 |
} else { |
228 | 260 |
stop("Unknown segmentation function") |
229 | 261 |
} |
230 | 262 |
} |
263 |
+ if (!uses.recommended.fun) { |
|
264 |
+ flog.warn("Recommended to provide --funsegmentation %s.", recommended.fun) |
|
265 |
+ } |
|
266 |
+ |
|
231 | 267 |
mutect.ignore <- eval(formals(PureCN::filterVcfMuTect)$ignore) |
232 | 268 |
if (opt$error < formals(PureCN::runAbsoluteCN)$error && !is.null(opt$statsfile)) { |
233 | 269 |
flog.info("Low specified error, will keep fstar_tumor_lod flagged variants") |
... | ... |
@@ -255,6 +291,7 @@ if (file.exists(file.rds) && !opt$force) { |
255 | 291 |
} |
256 | 292 |
log.ratio <- log.ratio$log.ratio |
257 | 293 |
} |
294 |
+ |
|
258 | 295 |
ret <- runAbsoluteCN(normal.coverage.file = normal.coverage.file, |
259 | 296 |
tumor.coverage.file = tumor.coverage.file, vcf.file = opt$vcf, |
260 | 297 |
sampleid = sampleid, plot.cnv = TRUE, |
... | ... |
@@ -20,6 +20,12 @@ test_that("example output correct", { |
20 | 20 |
normalDB = normalDB, |
21 | 21 |
genome = "hg19") |
22 | 22 |
expect_equal(c("Sample1", "Sample2"), levels(seg[,1])) |
23 |
+ seg2 <- processMultipleSamples( |
|
24 |
+ list(tumor.coverage.files[1],readCoverageFile(tumor.coverage.files[2])), |
|
25 |
+ sampleids = c("Sample1", "Sample2"), |
|
26 |
+ normalDB = normalDB, |
|
27 |
+ genome = "hg38", plot.cnv = FALSE) |
|
28 |
+ expect_equal(c("Sample1", "Sample2"), levels(seg2[,1])) |
|
23 | 29 |
seg.file <- tempfile(fileext = ".seg") |
24 | 30 |
write.table(seg, seg.file, row.names = FALSE, sep = "\t") |
25 | 31 |
vcf.file <- system.file("extdata", "example.vcf.gz", package = "PureCN") |
... | ... |
@@ -30,4 +36,8 @@ test_that("example output correct", { |
30 | 36 |
genome = "hg19", min.ploidy = 1.5, max.ploidy = 2.1, |
31 | 37 |
test.purity = seq(0.4, 0.7, by = 0.05), sampleid = "Sample1") |
32 | 38 |
expect_equal(0.65, ret$results[[1]]$purity) |
39 |
+ expect_error(processMultipleSamples(tumor.coverage.files, |
|
40 |
+ sampleids = c("Sample1", "Sample2"), |
|
41 |
+ normalDB = normalDB, |
|
42 |
+ genome = "hg20"), "centromere") |
|
33 | 43 |
}) |
... | ... |
@@ -28,11 +28,9 @@ library(BiocStyle) |
28 | 28 |
|
29 | 29 |
## Update from previous stable versions |
30 | 30 |
|
31 |
-`r Biocpkg("PureCN")` is fully backward compatible with input generated by |
|
32 |
-versions 1.10, 1.12, 1.14, 1.16 and 1.18. However, 1.16 slightly changed the |
|
33 |
-mapping bias database and incorporated the interval weights into the database directly. |
|
34 |
-Simply re-create this RDS file to take advantage of the new features in case |
|
35 |
-you upgrade from earlier versions: |
|
31 |
+`r Biocpkg("PureCN")` is backward compatible with input generated by |
|
32 |
+versions 1.16 and 1.18. For versions 1.8 to 1.14, please re-run `NormalDB.R` |
|
33 |
+(see also below): |
|
36 | 34 |
|
37 | 35 |
``` |
38 | 36 |
$ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \ |
... | ... |
@@ -40,14 +38,10 @@ $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \ |
40 | 38 |
--genome hg19 --normal_panel $NORMAL_PANEL --assay agilent_v6 |
41 | 39 |
``` |
42 | 40 |
|
43 |
-`r Biocpkg("PureCN")` 1.10 introduced a completely new normal database format with |
|
44 |
-several important improvements such as not splitting the database by sample sex |
|
45 |
-for the normalization of autosomes. It is therefore necessary (not only recommended) |
|
46 |
-to re-run the `NormalDB.R` when upgrading from version 1.8. |
|
47 |
- |
|
48 | 41 |
For upgrades from version 1.6, we highly recommend starting from scratch |
49 | 42 |
following this tutorial. |
50 | 43 |
|
44 |
+ |
|
51 | 45 |
## Installation |
52 | 46 |
|
53 | 47 |
For the command line scripts described in this tutorial, we will need to |
... | ... |
@@ -240,18 +234,20 @@ $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \ |
240 | 234 |
--coveragefiles example_normal.list \ |
241 | 235 |
--genome hg19 --assay agilent_v6 |
242 | 236 |
|
243 |
-# When normal panel VCF is available (highly recommended for unmatched samples) |
|
237 |
+# When normal panel VCF is available (highly recommended for |
|
238 |
+# unmatched samples) |
|
244 | 239 |
$ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \ |
245 | 240 |
--coveragefiles example_normal.list \ |
246 |
- --genome hg19 --normal_panel $NORMAL_PANEL \ |
|
241 |
+ --normal_panel $NORMAL_PANEL \ |
|
242 |
+ --genome hg19 \ |
|
247 | 243 |
--assay agilent_v6 |
248 | 244 |
|
249 | 245 |
# For a Mutect2/GATK4 normal panel GenomicsDB (experimental) |
250 | 246 |
$ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \ |
251 | 247 |
--coveragefiles example_normal.list \ |
252 |
- --genome hg19 $GENOMICSDB-WORKSPACE-PATH/pon_db \ |
|
248 |
+ --normal_panel $GENOMICSDB-WORKSPACE-PATH/pon_db \ |
|
249 |
+ --genome hg19 \ |
|
253 | 250 |
--assay agilent_v6 |
254 |
- |
|
255 | 251 |
``` |
256 | 252 |
|
257 | 253 |
Important recommendations: |
... | ... |
@@ -461,9 +457,9 @@ Important recommendations: |
461 | 457 |
## Recommended _GATK4_ usage |
462 | 458 |
|
463 | 459 |
``` |
464 |
-# Recommended: Provide a normal panel GenomicsDB to remove mapping biases, |
|
465 |
-# pre-compute position-specific bias for much faster runtimes with large panels |
|
466 |
-# This needs to be done only once for each assay. |
|
460 |
+# Recommended: Provide a normal panel GenomicsDB to remove mapping |
|
461 |
+# biases, pre-compute position-specific bias for much faster runtimes |
|
462 |
+# with large panels. This needs to be done only once for each assay. |
|
467 | 463 |
Rscript $PURECN/NormalDB.R --outdir $OUT_REF \ |
468 | 464 |
--normal_panel $GENOMICSDB-WORKSPACE-PATH/pon_db \ |
469 | 465 |
--assay agilent_v6 --genome hg19 --force |
... | ... |
@@ -474,8 +470,7 @@ Rscript $PURECN/PureCN.R --out $OUT/$SAMPLEID \ |
474 | 470 |
--logratiofile $OUT/$SAMPLEID/${SAMPLEID}.denoisedCR.tsv \ |
475 | 471 |
--segfile $OUT/$SAMPLEID/${SAMPLEID}.modelFinal.seg \ |
476 | 472 |
--mappingbiasfile $OUT_REF/mapping_bias_agilent_v6_hg19.rds \ |
477 |
- --vcf ${SAMPLEID}_mutect.vcf \ |
|
478 |
- --statsfile ${SAMPLEID}_mutect_stats.txt \ |
|
473 |
+ --vcf ${SAMPLEID}_mutect2_filtered.vcf \ |
|
479 | 474 |
--snpblacklist hg19_simpleRepeats.bed \ |
480 | 475 |
--genome hg19 \ |
481 | 476 |
--funsegmentation Hclust \ |
... | ... |
@@ -495,17 +490,16 @@ and mutational signatures. |
495 | 490 |
grep CALLABLE ${SAMPLEID}_callable_status.bed > \ |
496 | 491 |
${SAMPLEID}_callable_status_filtered.bed |
497 | 492 |
|
498 |
-# Only count mutations in callable regions, also subtract what was ignored |
|
499 |
-# in PureCN.R via --snpblacklist, like simple repeats, from the mutation per |
|
500 |
-# megabase calculation |
|
493 |
+# Only count mutations in callable regions, also subtract what was |
|
494 |
+# ignored in PureCN.R via --snpblacklist, like simple repeats, from the |
|
495 |
+# mutation per megabase calculation |
|
501 | 496 |
# Also search for the COSMIC mutation signatures |
502 | 497 |
# (http://cancer.sanger.ac.uk/cosmic/signatures) |
503 | 498 |
Rscript $PureCN/Dx.R --out $OUT/$SAMPLEID/$SAMPLEID \ |
504 | 499 |
--rds $OUT/SAMPLEID/${SAMPLEID}.rds \ |
505 | 500 |
--callable ${SAMPLEID}_callable_status_filtered.bed \ |
506 | 501 |
--exclude hg19_simpleRepeats.bed \ |
507 |
- --signatures \ |
|
508 |
- --signature_databases signatures.exome.cosmic.v3.may2019 |
|
502 |
+ --signatures |
|
509 | 503 |
|
510 | 504 |
# Restrict mutation burden calculation to coding sequences |
511 | 505 |
Rscript $PureCN/FilterCallableLoci.R --genome hg19 \ |
... | ... |
@@ -595,6 +589,7 @@ Argument name | Corresponding PureCN argument | PureCN function |
595 | 589 |
`--normaldb` | `normalDB` (serialized with `saveRDS`) | `calculateTangentNormal`, `filterTargets` |
596 | 590 |
`--segfile` | `seg.file` | `runAbsoluteCN` |
597 | 591 |
`--logratiofile` | `log.ratio` | `runAbsoluteCN` |
592 |
+`--additionaltumors` | `tumor.coverage.files` | `processMultipleSamples` |
|
598 | 593 |
`--sex` | `sex` | `runAbsoluteCN` |
599 | 594 |
`--genome` | `genome` | `runAbsoluteCN` |
600 | 595 |
`--intervals` | `interval.file` | `runAbsoluteCN` |