Browse code

Support for processMultipleSamples in PureCN.R via --additionaltumors.

Markus Riester authored on 11/07/2020 21:07:06
Showing 8 changed files

... ...
@@ -2,8 +2,8 @@ Package: PureCN
2 2
 Type: Package
3 3
 Title: Copy number calling and SNV classification using
4 4
     targeted short read sequencing
5
-Version: 1.19.5
6
-Date: 2020-07-06
5
+Version: 1.19.6
6
+Date: 2020-07-11
7 7
 Authors@R: c(person("Markus", "Riester",
8 8
                     role = c("aut", "cre"),
9 9
                     email = "[email protected]",
... ...
@@ -4,6 +4,8 @@ Changes in version 1.20.0
4 4
 NEW FEATURES
5 5
 
6 6
     o Support for GATK4 GenomicsDB import for mapping bias calculation
7
+    o Added --additionaltumors to PureCN.R to provide coverage files
8
+      from additional biopsies from the same patient when available
7 9
     
8 10
 SIGNIFICANT USER-VISIBLE CHANGES
9 11
 
... ...
@@ -20,6 +22,9 @@ SIGNIFICANT USER-VISIBLE CHANGES
20 22
     o Made calculateIntervalWeights defunct
21 23
     o Changed default of min.normals in calculateMappingBiasVcf/Gatk4 to 1
22 24
       from 2
25
+    o Changed default of --signature_databases to 
26
+      "signatures.exome.cosmic.v3.may2019" (v3 instead of v2)
27
+    o Now warn if recommended  -funsegmentation is not used  
23 28
 
24 29
 BUGFIXES
25 30
 
... ...
@@ -303,7 +303,12 @@ calculateTangentNormal <- function(tumor.coverage.file, normalDB,
303 303
 }
304 304
     
305 305
 .readNormals <- function(normal.coverage.files) {
306
-    normals <- lapply(normal.coverage.files, readCoverageFile)
306
+    normals <- lapply(normal.coverage.files, function(x) {
307
+        if (is(x, "character")) {
308
+            return(readCoverageFile(normalizePath(x)))
309
+        }
310
+        return(x)
311
+    })
307 312
 
308 313
     # check that all files used the same interval file.
309 314
     for (i in seq_along(normals)) {
... ...
@@ -61,7 +61,6 @@ processMultipleSamples <- function(tumor.coverage.files, sampleids, normalDB,
61 61
     if (!requireNamespace("copynumber", quietly = TRUE)) {
62 62
         .stopUserError("processMultipleSamples requires the copynumber package.")
63 63
     }
64
-    tumor.coverage.files <- normalizePath(tumor.coverage.files)
65 64
     tumors <- lapply(.readNormals(tumor.coverage.files), 
66 65
         calculateTangentNormal, normalDB, num.eigen = num.eigen)
67 66
 
... ...
@@ -84,7 +83,10 @@ processMultipleSamples <- function(tumor.coverage.files, sampleids, normalDB,
84 83
     intervalsUsed <- .filterIntervalsChrHash(intervalsUsed, tumors[[1]], chr.hash)
85 84
     centromeres <- .getCentromerePositions(centromeres, genome, 
86 85
             if (is.null(tumors[[1]])) NULL else seqlevelsStyle(tumors[[1]]))
87
-
86
+    if (is.null(centromeres)) {
87
+        .stopUserError("Cannot find centromeres for ", genome,
88
+            ". Provide them manually or select a supported genome.")
89
+    }    
88 90
     armLocations <- .getArmLocations(tumors[[1]], chr.hash, centromeres)
89 91
     armLocationsGr <- GRanges(armLocations)
90 92
     arms <- armLocationsGr$arm[findOverlaps(tumors[[1]], armLocationsGr, select="first")]
... ...
@@ -21,7 +21,7 @@ option_list <- list(
21 21
     make_option(c("--signatures"), action = "store_true", default = FALSE, 
22 22
         help="Attempt the deconstruction of COSMIC signatures (requires deconstructSigs package)"),
23 23
     make_option(c("--signature_databases"), action = "store", type = "character", 
24
-        default = "signatures.cosmic", 
24
+        default = "signatures.exome.cosmic.v3.may2019", 
25 25
         help = "Use the specified signature databases provided by deconstrucSigs. To test multiple databases, provide them : separated [%default]."),
26 26
     make_option(c("--out"), action = "store", type = "character",
27 27
         default = NULL,
... ...
@@ -24,6 +24,8 @@ option_list <- list(
24 24
                 default = NULL, help = "Input: Segmentation file"),
25 25
     make_option(c("--logratiofile"), action = "store", type = "character",
26 26
                 default = NULL, help = "Input: Log2 copy number ratio file"),
27
+    make_option(c("--additionaltumors"), action = "store", type = "character",
28
+                default = NULL, help = "Input: tumor coverages from additional biopsies from the SAME patient, GC-normalized"),
27 29
     make_option(c("--sex"), action = "store", type = "character",
28 30
         default = formals(PureCN::runAbsoluteCN)$sex[[2]],
29 31
         help = "Input: Sex of sample. ? (detect), diplod (non-diploid chromosomes removed), F or M [default %default]"),
... ...
@@ -181,6 +183,16 @@ if (Sys.getenv("PURECN_DEBUG") != "") {
181 183
     debug <- TRUE
182 184
 }
183 185
 
186
+.checkFileList <- function(file) {
187
+    files <- read.delim(file, as.is = TRUE, header = FALSE)[,1]
188
+    numExists <- sum(file.exists(files), na.rm = TRUE)
189
+    if (numExists < length(files)) { 
190
+        stop("File not exists in file ", file)
191
+    }
192
+    files
193
+}
194
+
195
+
184 196
 ### Run PureCN ----------------------------------------------------------------
185 197
 
186 198
 if (file.exists(file.rds) && !opt$force) {
... ...
@@ -190,7 +202,6 @@ if (file.exists(file.rds) && !opt$force) {
190 202
     if (is.null(sampleid)) sampleid <- ret$input$sampleid
191 203
 } else {
192 204
     if (!is.null(opt$normaldb)) {
193
-        #if (!is.null(seg.file)) stop("normalDB and segfile do not work together.")
194 205
         normalDB <- readRDS(opt$normaldb)
195 206
         if (!is.null(normal.coverage.file)) {
196 207
             flog.warn("Both --normal and --normalDB provided. normalDB will NOT be used for coverage denoising. You probably do not want this.")
... ...
@@ -213,21 +224,46 @@ if (file.exists(file.rds) && !opt$force) {
213 224
     file.log <- paste0(out, ".log")
214 225
 
215 226
     pdf(paste0(out, "_segmentation.pdf"), width = 10, height = 11)
227
+    if (!is.null(opt$additionaltumors)) {
228
+        if (!is.null(seg.file)) {
229
+            stop("--additionaltumors overwrites --segfile")
230
+        }
231
+        seg.file <- paste0(out, "_multisample.seg")
232
+        if (grepl(".list$", opt$additionaltumors)) {
233
+            additional.tumors <- .checkFileList(opt$additionaltumors)
234
+        } else {
235
+            additional.tumors <- opt$additionaltumors 
236
+        }
237
+        multi.seg <- processMultipleSamples(
238
+            c(list(tumor.coverage.file), as.list(additional.tumors)),
239
+            sampleids = c(sampleid, paste(sampleid,
240
+                seq_along(additional.tumors) + 1, sep = "_")),
241
+            normalDB = normalDB, genome = opt$genome, verbose = debug)
242
+        write.table(multi.seg, seg.file, row.names = FALSE, sep = "\t")
243
+    }    
216 244
     af.range <- c(opt$minaf, 1 - opt$minaf)
217 245
     test.purity <- seq(opt$minpurity, opt$maxpurity, by = 0.01)
218 246
 
247
+    uses.recommended.fun <- FALSE
248
+    recommended.fun <- if (is.null(seg.file)) "PSCBS" else "Hclust"
219 249
     fun.segmentation <- segmentationCBS
220 250
     if (opt$funsegmentation != "CBS") {
221 251
         if (opt$funsegmentation == "PSCBS") {
222 252
             fun.segmentation <- segmentationPSCBS
253
+            if (is.null(seg.file)) uses.recommended.fun <- TRUE
223 254
         } else if (opt$funsegmentation == "Hclust") {
224 255
             fun.segmentation <- segmentationHclust
256
+            if (!is.null(seg.file)) uses.recommended.fun <- TRUE
225 257
         } else if (opt$funsegmentation == "none") {
226 258
             fun.segmentation <- function(seg, ...) seg
227 259
         } else {
228 260
             stop("Unknown segmentation function")
229 261
         }
230 262
     }
263
+    if (!uses.recommended.fun) {
264
+        flog.warn("Recommended to provide --funsegmentation %s.", recommended.fun)
265
+    }
266
+
231 267
     mutect.ignore <- eval(formals(PureCN::filterVcfMuTect)$ignore)
232 268
     if (opt$error < formals(PureCN::runAbsoluteCN)$error && !is.null(opt$statsfile)) {
233 269
         flog.info("Low specified error, will keep fstar_tumor_lod flagged variants")
... ...
@@ -255,6 +291,7 @@ if (file.exists(file.rds) && !opt$force) {
255 291
         }
256 292
         log.ratio <- log.ratio$log.ratio
257 293
     }    
294
+    
258 295
     ret <- runAbsoluteCN(normal.coverage.file = normal.coverage.file,
259 296
             tumor.coverage.file = tumor.coverage.file, vcf.file = opt$vcf,
260 297
             sampleid = sampleid, plot.cnv = TRUE,
... ...
@@ -20,6 +20,12 @@ test_that("example output correct", {
20 20
 			 normalDB = normalDB,
21 21
 			 genome = "hg19")
22 22
     expect_equal(c("Sample1", "Sample2"), levels(seg[,1]))
23
+	seg2 <- processMultipleSamples(
24
+             list(tumor.coverage.files[1],readCoverageFile(tumor.coverage.files[2])),
25
+			 sampleids = c("Sample1", "Sample2"),
26
+			 normalDB = normalDB,
27
+			 genome = "hg38", plot.cnv = FALSE)
28
+    expect_equal(c("Sample1", "Sample2"), levels(seg2[,1]))
23 29
     seg.file <- tempfile(fileext = ".seg")
24 30
     write.table(seg, seg.file, row.names = FALSE, sep = "\t")
25 31
     vcf.file <- system.file("extdata", "example.vcf.gz", package = "PureCN")
... ...
@@ -30,4 +36,8 @@ test_that("example output correct", {
30 36
         genome = "hg19", min.ploidy = 1.5, max.ploidy = 2.1, 
31 37
         test.purity = seq(0.4, 0.7, by = 0.05), sampleid = "Sample1")
32 38
     expect_equal(0.65, ret$results[[1]]$purity)
39
+	expect_error(processMultipleSamples(tumor.coverage.files,
40
+			 sampleids = c("Sample1", "Sample2"),
41
+			 normalDB = normalDB,
42
+			 genome = "hg20"), "centromere")
33 43
 })
... ...
@@ -28,11 +28,9 @@ library(BiocStyle)
28 28
 
29 29
 ## Update from previous stable versions
30 30
 
31
-`r Biocpkg("PureCN")` is fully backward compatible with input generated by
32
-versions 1.10, 1.12, 1.14, 1.16 and 1.18. However, 1.16 slightly changed the
33
-mapping bias database and incorporated the interval weights into the database directly.
34
-Simply re-create this RDS file to take advantage of the new features in case
35
-you upgrade from earlier versions:
31
+`r Biocpkg("PureCN")` is backward compatible with input generated by
32
+versions 1.16 and 1.18. For versions 1.8 to 1.14, please re-run `NormalDB.R`
33
+(see also below): 
36 34
 
37 35
 ```
38 36
 $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
... ...
@@ -40,14 +38,10 @@ $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
40 38
     --genome hg19 --normal_panel $NORMAL_PANEL --assay agilent_v6
41 39
 ```
42 40
 
43
-`r Biocpkg("PureCN")` 1.10 introduced a completely new normal database format with
44
-several important improvements such as not splitting the database by sample sex
45
-for the normalization of autosomes. It is therefore necessary (not only recommended)
46
-to re-run the `NormalDB.R` when upgrading from version 1.8. 
47
-
48 41
 For upgrades from version 1.6, we highly recommend starting from scratch
49 42
 following this tutorial.
50 43
 
44
+
51 45
 ## Installation
52 46
 
53 47
 For the command line scripts described in this tutorial, we will need to 
... ...
@@ -240,18 +234,20 @@ $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
240 234
     --coveragefiles example_normal.list \
241 235
     --genome hg19 --assay agilent_v6
242 236
 
243
-# When normal panel VCF is available (highly recommended for unmatched samples)
237
+# When normal panel VCF is available (highly recommended for
238
+# unmatched samples)
244 239
 $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
245 240
     --coveragefiles example_normal.list \
246
-    --genome hg19 --normal_panel $NORMAL_PANEL \
241
+    --normal_panel $NORMAL_PANEL \
242
+    --genome hg19 \
247 243
     --assay agilent_v6
248 244
 
249 245
 # For a Mutect2/GATK4 normal panel GenomicsDB (experimental)
250 246
 $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
251 247
     --coveragefiles example_normal.list \
252
-    --genome hg19 $GENOMICSDB-WORKSPACE-PATH/pon_db \
248
+    --normal_panel $GENOMICSDB-WORKSPACE-PATH/pon_db \
249
+    --genome hg19 \
253 250
     --assay agilent_v6
254
-
255 251
 ```
256 252
 
257 253
 Important recommendations:
... ...
@@ -461,9 +457,9 @@ Important recommendations:
461 457
 ## Recommended _GATK4_ usage
462 458
 
463 459
 ```
464
-# Recommended: Provide a normal panel GenomicsDB to remove mapping biases,
465
-# pre-compute position-specific bias for much faster runtimes with large panels
466
-# This needs to be done only once for each assay. 
460
+# Recommended: Provide a normal panel GenomicsDB to remove mapping
461
+# biases, pre-compute position-specific bias for much faster runtimes
462
+# with large panels. This needs to be done only once for each assay. 
467 463
 Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
468 464
     --normal_panel $GENOMICSDB-WORKSPACE-PATH/pon_db \
469 465
     --assay agilent_v6 --genome hg19 --force
... ...
@@ -474,8 +470,7 @@ Rscript $PURECN/PureCN.R --out $OUT/$SAMPLEID  \
474 470
     --logratiofile $OUT/$SAMPLEID/${SAMPLEID}.denoisedCR.tsv \
475 471
     --segfile $OUT/$SAMPLEID/${SAMPLEID}.modelFinal.seg \
476 472
     --mappingbiasfile $OUT_REF/mapping_bias_agilent_v6_hg19.rds \
477
-    --vcf ${SAMPLEID}_mutect.vcf \
478
-    --statsfile ${SAMPLEID}_mutect_stats.txt \
473
+    --vcf ${SAMPLEID}_mutect2_filtered.vcf \
479 474
     --snpblacklist hg19_simpleRepeats.bed \
480 475
     --genome hg19 \
481 476
     --funsegmentation Hclust \
... ...
@@ -495,17 +490,16 @@ and mutational signatures.
495 490
 grep CALLABLE ${SAMPLEID}_callable_status.bed > \ 
496 491
     ${SAMPLEID}_callable_status_filtered.bed
497 492
 
498
-# Only count mutations in callable regions, also subtract what was ignored
499
-# in PureCN.R via --snpblacklist, like simple repeats, from the mutation per 
500
-# megabase calculation
493
+# Only count mutations in callable regions, also subtract what was
494
+# ignored in PureCN.R via --snpblacklist, like simple repeats, from the
495
+# mutation per megabase calculation
501 496
 # Also search for the COSMIC mutation signatures
502 497
 # (http://cancer.sanger.ac.uk/cosmic/signatures)
503 498
 Rscript $PureCN/Dx.R --out $OUT/$SAMPLEID/$SAMPLEID \
504 499
     --rds $OUT/SAMPLEID/${SAMPLEID}.rds \
505 500
     --callable ${SAMPLEID}_callable_status_filtered.bed \
506 501
     --exclude hg19_simpleRepeats.bed \
507
-    --signatures \
508
-    --signature_databases signatures.exome.cosmic.v3.may2019
502
+    --signatures 
509 503
 
510 504
 # Restrict mutation burden calculation to coding sequences
511 505
 Rscript $PureCN/FilterCallableLoci.R --genome hg19 \
... ...
@@ -595,6 +589,7 @@ Argument name          | Corresponding PureCN argument | PureCN function
595 589
 `--normaldb`         | `normalDB` (serialized with `saveRDS`) | `calculateTangentNormal`, `filterTargets` 
596 590
 `--segfile`          | `seg.file`           | `runAbsoluteCN`  
597 591
 `--logratiofile`     | `log.ratio`          | `runAbsoluteCN`  
592
+`--additionaltumors` | `tumor.coverage.files` | `processMultipleSamples`  
598 593
 `--sex`              | `sex`                | `runAbsoluteCN`  
599 594
 `--genome`           | `genome`             | `runAbsoluteCN`  
600 595
 `--intervals`        | `interval.file`      | `runAbsoluteCN`