Browse code

Initial support for CNVkit coverage data; started converting --outdir in command line tools to GATK-like --out for more file naming flexibility; Minor changes to quick vignette.

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/PureCN@129450 bc3139a8-67e5-0310-9ffc-ced21a209358

Markus Riester authored on 08/05/2017 03:30:36
Showing 14 changed files

... ...
@@ -2,7 +2,7 @@ Package: PureCN
2 2
 Type: Package
3 3
 Title: Copy number calling and SNV classification using
4 4
     targeted short read sequencing
5
-Version: 1.7.6
5
+Version: 1.7.7
6 6
 Date: 2017-05-05
7 7
 Authors@R: c(person("Markus", "Riester", role=c("aut", "cre"),
8 8
         email="[email protected]"),
... ...
@@ -36,6 +36,7 @@ Imports:
36 36
     futile.logger,
37 37
     VGAM,
38 38
     edgeR,
39
+    tools,
39 40
     limma
40 41
 Suggests:
41 42
     PSCBS,
... ...
@@ -118,6 +118,7 @@ importFrom(stats,predict)
118 118
 importFrom(stats,runif)
119 119
 importFrom(stats,t.test)
120 120
 importFrom(stats,weighted.mean)
121
+importFrom(tools,file_ext)
121 122
 importFrom(utils,data)
122 123
 importFrom(utils,head)
123 124
 importFrom(utils,object.size)
... ...
@@ -3,15 +3,16 @@ Changes in version 1.8.0
3 3
 
4 4
 - Added mutation burden calculation
5 5
 - Code cleanups (switch to GRanges where possible)
6
+- Added support for CNVkit coverage files (*.cnn, *.cnr)
6 7
 
7 8
 API CHANGES
8 9
 
10
+- New function: callMutationBurden
9 11
 - Defunct functions: createSNPBlacklist, getDiploid, autoCurateResults
10 12
 - Minor changes: 
11
-    - Remove gc.data from filterTargets since gc_bias is now added to tumor 
13
+    - Removed gc.data from filterTargets since gc_bias is now added to tumor 
12 14
       coverage
13 15
  
14
- 
15 16
 PLANNED FEATURES FOR 1.8
16 17
 
17 18
 - Better sample summary statistics, like mutation burden, chromosomal 
... ...
@@ -16,7 +16,8 @@
16 16
 #' @param fun.countMutation Function that can be used to filter the
17 17
 #' input VCF further for filtering, for example to only keep missense
18 18
 #' mutations. Expects a \code{logical} vector indicating whether variant
19
-#' should be counted (\code{TRUE}) or not (\code{FALSE}).
19
+#' should be counted (\code{TRUE}) or not (\code{FALSE}). Default
20
+#' is to keep only single nucleotide variants.
20 21
 #' @param callable \code{GRanges} object with callable genomic regions,
21 22
 #' for example obtained by \sQuote{GATK CallableLoci} BED file, imported 
22 23
 #' with \code{rtracklayer}.
... ...
@@ -49,7 +50,8 @@
49 50
 #' 
50 51
 #' @export callMutationBurden
51 52
 callMutationBurden <- function(res, id = 1, remove.flagged = TRUE, 
52
-    min.prior.somatic=0.1, min.cellfraction=0, fun.countMutation=NULL,
53
+    min.prior.somatic=0.1, min.cellfraction=0, 
54
+    fun.countMutation=function(vcf) width(vcf)==1,
53 55
     callable=NULL, exclude=NULL) {
54 56
 
55 57
     if (is.null(res$input$vcf)) {
... ...
@@ -7,6 +7,8 @@
7 7
 #' @param file Target coverage file.
8 8
 #' @param format File format. If missing, derived from the file 
9 9
 #' extension. Currently only GATK DepthofCoverage format supported.
10
+#' @param zero Start position is 0-based. Default is \code{FALSE}
11
+#' for GATK, \code{TRUE} for BED file based intervals.
10 12
 #' @return A \code{data.frame} with the parsed coverage information.
11 13
 #' @author Markus Riester
12 14
 #' @seealso \code{\link{calculateBamCoverageByInterval}}
... ...
@@ -16,14 +18,24 @@
16 18
 #'     package="PureCN")
17 19
 #' coverage <- readCoverageFile(tumor.coverage.file)
18 20
 #' 
21
+#' @importFrom tools file_ext
19 22
 #' @export readCoverageFile
20
-readCoverageFile <- function(file, format) {
21
-    if (missing(format)) format <- "GATK"
22
-    targetCoverage <- .readCoverageGatk(file)
23
+readCoverageFile <- function(file, format, zero=NULL) {
24
+    if (missing(format)) format <- .getFormat(file)
25
+    if (format %in% c("cnn", "cnr")) {    
26
+        targetCoverage <- .readCoverageCnn(file, zero)
27
+    } else {
28
+        targetCoverage <- .readCoverageGatk(file, zero)
29
+    }
23 30
     .checkLowCoverage(targetCoverage)
24 31
     .checkIntervals(targetCoverage)
25 32
 }
26 33
 
34
+.getFormat <- function(file) {
35
+    ext <- file_ext(file)
36
+    if (ext %in% c("cnn", "cnr")) return("cnn")
37
+    "GATK"    
38
+}    
27 39
 
28 40
 #' Read GATK coverage files
29 41
 #' 
... ...
@@ -44,18 +56,37 @@ readCoverageGatk <- function(file) {
44 56
     readCoverageFile(file, format="GATK")
45 57
 }
46 58
 
47
-.readCoverageGatk <- function(file) {
59
+.readCoverageGatk <- function(file, zero) {
60
+    if (!is.null(zero)) flog.warn("zero ignored for GATK coverage files.")
48 61
     inputCoverage <- utils::read.table(file, header = TRUE)
49 62
     if (is.null(inputCoverage$total_coverage)) inputCoverage$total_coverage <- NA
50 63
     if (is.null(inputCoverage$average_coverage)) inputCoverage$average_coverage <- NA
64
+    if (is.null(inputCoverage$ontarget)) inputCoverage$ontarget <- TRUE
51 65
 
52 66
     targetCoverage <- GRanges(inputCoverage$Target, 
53 67
         coverage=inputCoverage$total_coverage, 
54
-        average.coverage=inputCoverage$average_coverage)
68
+        average.coverage=inputCoverage$average_coverage,
69
+        ontarget=inputCoverage$ontarget)
55 70
 
56 71
     targetCoverage
57 72
 }
58 73
 
74
+.readCoverageCnn <- function(file, zero) {
75
+    if (is.null(zero)) zero <- TRUE
76
+    inputCoverage <- utils::read.table(file, header = TRUE)
77
+    if (zero) inputCoverage$start <- inputCoverage$start + 1
78
+    targetCoverage <- GRanges(inputCoverage)    
79
+    targetCoverage$coverage <- targetCoverage$depth * width(targetCoverage)
80
+    targetCoverage$average.coverage <- targetCoverage$depth
81
+    targetCoverage$ontarget <- TRUE
82
+    targetCoverage$depth <- NULL
83
+    targetCoverage$Gene <- targetCoverage$gene
84
+    targetCoverage$ontarget[which(targetCoverage$Gene=="Background")] <- FALSE
85
+    targetCoverage$gene <- NULL
86
+    targetCoverage$log2 <- NULL
87
+    targetCoverage
88
+}
89
+
59 90
 .checkIntervals <- function(coverageGr) {
60 91
     if (min(width(coverageGr))<2) {
61 92
         flog.warn("Coverage data contains single nucleotide intervals.")
... ...
@@ -10,7 +10,7 @@ spec <- matrix(c(
10 10
 'rds',          'r', 1, "character",
11 11
 'callable',     'a', 1, "character",
12 12
 'exclude',      'b', 1, "character",
13
-'outdir',       'o', 1, "character"
13
+'out',          'o', 1, "character"
14 14
 ), byrow=TRUE, ncol=4)
15 15
 opt <- getopt(spec)
16 16
 
... ...
@@ -31,9 +31,6 @@ if (is.null(infileRds)) stop("Need --rds")
31 31
 infileRds <- normalizePath(infileRds, mustWork=TRUE)
32 32
 
33 33
 # Parse outdir
34
-outdir <- opt$outdir
35
-if (is.null(outdir)) outdir <- dirname(infileRds)
36
-outdir <- normalizePath(outdir, mustWork=TRUE)
37 34
 
38 35
 # Parse both BED files restricting covered region
39 36
 callableFile <- opt$callable
... ...
@@ -42,7 +39,7 @@ if (!is.null(callableFile)) {
42 39
     suppressPackageStartupMessages(library(rtracklayer))
43 40
     callableFile <- normalizePath(callableFile, mustWork=TRUE)
44 41
     flog.info("Reading %s...", callableFile)
45
-    callable <- import(callableFile)
42
+    callable <- GRanges(import(callableFile))
46 43
 }
47 44
 
48 45
 excludeFile <- opt$exclude
... ...
@@ -51,7 +48,7 @@ if (!is.null(excludeFile)) {
51 48
     suppressPackageStartupMessages(library(rtracklayer))
52 49
     excludeFile <- normalizePath(excludeFile, mustWork=TRUE)
53 50
     flog.info("Reading %s...", excludeFile)
54
-    exclude <- import(excludeFile)
51
+    exclude <- GRanges(import(excludeFile))
55 52
 }
56 53
 
57 54
 
... ...
@@ -60,12 +57,28 @@ if (!is.null(excludeFile)) {
60 57
 flog.info("Loading PureCN...")
61 58
 suppressPackageStartupMessages(library(PureCN))
62 59
 
63
-flog.info("Reading %s...", infileRds)
64 60
 res <- readCurationFile(infileRds)
65 61
 sampleid <- res$input$sampleid
66
-file.suffix <- ""
67 62
 
68
-outfileMb <- file.path(outdir, paste0(sampleid, file.suffix, '_mutation_burden.csv'))
63
+.getOutPrefix <- function(opt, infile, sampleid) {
64
+    out <- opt[["out"]]
65
+    if (is.null(out)) {
66
+        if (!is.null(infile) && file.exists(infile)) {
67
+            outdir <- dirname(infile)
68
+            prefix <- sampleid
69
+        } else {
70
+            stop("Need --out")
71
+        }    
72
+    } else {
73
+        outdir <- dirname(out)
74
+        prefix <- basename(out)
75
+    }    
76
+    outdir <- normalizePath(outdir, mustWork=TRUE)
77
+    outPrefix <- file.path(outdir, prefix)
78
+}
79
+outPrefix <- .getOutPrefix(opt, infileRds, sampleid)
80
+     
81
+outfileMb <- paste0(outPrefix, '_mutation_burden.csv')
69 82
 
70 83
 force <- !is.null(opt$force)
71 84
 
... ...
@@ -30,7 +30,7 @@ spec <- matrix(c(
30 30
 'modelhomozygous','y', 0, "logical",
31 31
 'model',          'q', 1, "character",
32 32
 'funsegmentation','w', 1, "character",
33
-'outdir',         'o', 1, "character",
33
+'outdir',            'o', 1, "character",
34 34
 'outvcf',         'u', 0, "logical",
35 35
 'twopass',        'T', 0, "logical",
36 36
 'sampleid',       'i', 1, "character"
... ...
@@ -78,7 +78,7 @@ outvcf <- !is.null(opt$outvcf)
78 78
 pool <- opt$pool
79 79
 model.homozygous <- !is.null(opt$modelhomozygous)
80 80
 file.rds <- opt$rds
81
-file.suffix <- ''
81
+sampleidExtension <- if (is.null(opt$extension)) '' else opt$extension
82 82
 
83 83
 if (!is.null(file.rds) && file.exists(file.rds)) {
84 84
     if (is.null(outdir)) outdir <- dirname(file.rds)
... ...
@@ -86,7 +86,7 @@ if (!is.null(file.rds) && file.exists(file.rds)) {
86 86
     if (is.null(sampleid)) stop("Need --sampleid.")
87 87
     if (is.null(genome)) stop("Need --genome")
88 88
     genome <- as.character(genome)
89
-    file.rds <- file.path(outdir, paste0(sampleid, file.suffix, '.rds'))
89
+    file.rds <- file.path(outdir, paste0(sampleid, sampleidExtension, '.rds'))
90 90
     if (is.null(seg.file)) {
91 91
         tumor.coverage.file <- normalizePath(tumor.coverage.file, 
92 92
             mustWork=TRUE)
... ...
@@ -159,9 +159,9 @@ if (file.exists(file.rds) && !force) {
159 159
     }
160 160
     normal.coverage.file <- .getNormalCoverage(normal.coverage.file)
161 161
         
162
-    file.log <- file.path(outdir, paste0(sampleid, file.suffix, '.log'))
162
+    file.log <- file.path(outdir, paste0(sampleid, sampleidExtension, '.log'))
163 163
 
164
-    pdf(paste(outdir,"/", sampleid, file.suffix, '_segmentation.pdf', sep=''), 
164
+    pdf(paste(outdir,"/", sampleid, sampleidExtension, '_segmentation.pdf', sep=''), 
165 165
         width=10, height=11)
166 166
     af.range = c(0.03, 0.97)
167 167
     if (!is.null(opt$minaf)) {
... ...
@@ -232,43 +232,43 @@ if (file.exists(file.rds) && !force) {
232 232
 ### Create output files -------------------------------------------------------
233 233
 
234 234
 createCurationFile(file.rds)
235
-file.pdf <- file.path(outdir, paste0(sampleid, file.suffix, '.pdf'))
235
+file.pdf <- file.path(outdir, paste0(sampleid, sampleidExtension, '.pdf'))
236 236
 pdf(file.pdf, width=10, height=11)
237 237
 plotAbs(ret, type='all')
238 238
 dev.off()
239 239
 
240
-file.png <- file.path(outdir, paste0(sampleid, file.suffix, '_contamination.png'))
240
+file.png <- file.path(outdir, paste0(sampleid, sampleidExtension, '_contamination.png'))
241 241
 png(file.png, width=800)
242 242
 plotAbs(ret,1, type='contamination')
243 243
 dev.off()
244 244
 
245 245
 if (outvcf) {
246
-    file.vcf <- file.path(outdir, paste0(sampleid, file.suffix, '.vcf'))
246
+    file.vcf <- file.path(outdir, paste0(sampleid, sampleidExtension, '.vcf'))
247 247
     vcfanno <- predictSomatic(ret, return.vcf=TRUE, 
248 248
         vcf.field.prefix="PureCN.")
249 249
     writeVcf(vcfanno, file=file.vcf)    
250 250
 } else {
251
-    file.csv <- file.path(outdir, paste0(sampleid, file.suffix, '_variants.csv'))
251
+    file.csv <- file.path(outdir, paste0(sampleid, sampleidExtension, '_variants.csv'))
252 252
     write.csv(cbind(Sampleid=sampleid, predictSomatic(ret)), file=file.csv, 
253 253
         row.names=FALSE, quote=FALSE)
254 254
 }    
255 255
 
256
-file.loh <- file.path(outdir, paste0(sampleid, file.suffix, '_loh.csv'))
256
+file.loh <- file.path(outdir, paste0(sampleid, sampleidExtension, '_loh.csv'))
257 257
 write.csv(cbind(Sampleid=sampleid, callLOH(ret)), file=file.loh, 
258 258
     row.names=FALSE, quote=FALSE)
259 259
 
260
-file.genes <- file.path(outdir, paste0(sampleid, file.suffix, '_genes.csv'))
260
+file.genes <- file.path(outdir, paste0(sampleid, sampleidExtension, '_genes.csv'))
261 261
 allAlterations <- callAlterations(ret, all.genes=TRUE)
262 262
 
263 263
 write.csv(cbind(Sampleid=sampleid, gene.symbol=rownames(allAlterations), 
264 264
     allAlterations), row.names=FALSE, file=file.genes, quote=FALSE)
265 265
 
266
-file.seg <- file.path(outdir, paste0(sampleid, file.suffix, '_dnacopy.txt'))
266
+file.seg <- file.path(outdir, paste0(sampleid, sampleidExtension, '_dnacopy.txt'))
267 267
 write.table(ret$results[[1]]$seg, file=file.seg, sep="\t", quote=FALSE, 
268 268
     row.names=FALSE)
269 269
 
270 270
 if (!is.null(ret$input$vcf)) {
271
-    file.pdf <- file.path(outdir, paste0(sampleid, file.suffix, '_chromosomes.pdf'))
271
+    file.pdf <- file.path(outdir, paste0(sampleid, sampleidExtension, '_chromosomes.pdf'))
272 272
     pdf(file.pdf, width=9, height=10)
273 273
     vcf <- ret$input$vcf[ret$results[[1]]$SNV.posterior$vcf.ids]
274 274
     chromosomes <- seqlevelsInUse(vcf)
275 275
new file mode 100644
... ...
@@ -0,0 +1,5 @@
1
+chromosome	start	end	gene	depth	log2
2
+chr1	762097	762270	LINC00115	174.89	7.45031
3
+chr1	861281	861490	SAMD11	28.9043	4.85321
4
+chr1	865591	865791	SAMD11	51.26	5.67976
5
+chr1	866325	866498	SAMD11	14	3.80735
0 6
new file mode 100644
... ...
@@ -0,0 +1,6 @@
1
+chromosome	start	end	gene	log2	depth	weight
2
+chr1	10500	68590	Background	0.55584	0.70587	0.466868
3
+chr1	70509	176917	Background	0.235896	1.02411	0.482562
4
+chr1	227917	267219	Background	0.163203	0.387996	0.408305
5
+chr1	318219	367158	Background	0.375418	1.42616	0.424955
6
+chr1	367658	367893	.	0.68569	17.617	0.310347
... ...
@@ -9,4 +9,20 @@ test_readCoverageFile <- function() {
9 9
     checkEquals(3, length(coverage))
10 10
     checkEqualsNumeric(c(1216042, 1216606, 1216791), start(coverage))
11 11
     checkEqualsNumeric(c(1216050, 1216678, 1217991), end(coverage))
12
+
13
+    coverageFile <- system.file("extdata", "example_normal3.cnn", package="PureCN")
14
+    coverage <- readCoverageFile(coverageFile)
15
+    checkEquals(4, length(coverage))
16
+    checkEqualsNumeric(c(762097, 861281, 865591, 866325)+1, start(coverage))
17
+    checkEqualsNumeric(c(762270, 861490, 865791, 866498), end(coverage))
18
+    coverage <- readCoverageFile(coverageFile, zero=FALSE)
19
+    checkEquals(4, length(coverage))
20
+    checkEqualsNumeric(c(762097, 861281, 865591, 866325), start(coverage))
21
+    checkEqualsNumeric(c(762270, 861490, 865791, 866498), end(coverage))
22
+    coverageFile <- system.file("extdata", "example_normal4.cnr", package="PureCN")
23
+    coverage <- readCoverageFile(coverageFile)
24
+    checkEquals(5, length(coverage))
25
+    checkEqualsNumeric(c(10500, 70509, 227917, 318219, 367658)+1, start(coverage))
26
+    checkEqualsNumeric(c(68590, 176917, 267219, 367158, 367893), end(coverage))
27
+    checkEquals(c(FALSE, FALSE, FALSE, FALSE, TRUE), coverage$ontarget)
12 28
 }    
... ...
@@ -5,8 +5,9 @@
5 5
 \title{Call mutation burden}
6 6
 \usage{
7 7
 callMutationBurden(res, id = 1, remove.flagged = TRUE,
8
-  min.prior.somatic = 0.1, min.cellfraction = 0, fun.countMutation = NULL,
9
-  callable = NULL, exclude = NULL)
8
+  min.prior.somatic = 0.1, min.cellfraction = 0,
9
+  fun.countMutation = function(vcf) width(vcf) == 1, callable = NULL,
10
+  exclude = NULL)
10 11
 }
11 12
 \arguments{
12 13
 \item{res}{Return object of the \code{\link{runAbsoluteCN}} function.}
... ...
@@ -27,7 +28,8 @@ with very low allelic fraction.}
27 28
 \item{fun.countMutation}{Function that can be used to filter the
28 29
 input VCF further for filtering, for example to only keep missense
29 30
 mutations. Expects a \code{logical} vector indicating whether variant
30
-should be counted (\code{TRUE}) or not (\code{FALSE}).}
31
+should be counted (\code{TRUE}) or not (\code{FALSE}). Default
32
+is to keep only single nucleotide variants.}
31 33
 
32 34
 \item{callable}{\code{GRanges} object with callable genomic regions,
33 35
 for example obtained by \sQuote{GATK CallableLoci} BED file, imported 
... ...
@@ -4,13 +4,16 @@
4 4
 \alias{readCoverageFile}
5 5
 \title{Read coverage file}
6 6
 \usage{
7
-readCoverageFile(file, format)
7
+readCoverageFile(file, format, zero = NULL)
8 8
 }
9 9
 \arguments{
10 10
 \item{file}{Target coverage file.}
11 11
 
12 12
 \item{format}{File format. If missing, derived from the file 
13 13
 extension. Currently only GATK DepthofCoverage format supported.}
14
+
15
+\item{zero}{Start position is 0-based. Default is \code{FALSE}
16
+for GATK, \code{TRUE} for BED file based intervals.}
14 17
 }
15 18
 \value{
16 19
 A \code{data.frame} with the parsed coverage information.
... ...
@@ -1205,14 +1205,14 @@ Dx.R extracts copy number and mutation metrics from PureCN.R output.
1205 1205
 
1206 1206
 \begin{verbatim}
1207 1207
 # Basic output
1208
-Rscript Dx.R --outdir $OUT --rds Sample1_purecn.rds 
1208
+Rscript Dx.R --out ${OUT}/Sample1 --rds Sample1_purecn.rds 
1209 1209
 
1210 1210
 # Provide a BED file with callable regions, for examples obtained by
1211 1211
 # GATK CallableLoci. Useful to calculate mutations per megabase and
1212 1212
 # to exclude low quality regions.
1213 1213
 grep CALLABLE Sample1_callable_status.bed > \ 
1214 1214
     Sample1_callable_status_filtered.bed
1215
-Rscript Dx.R --outdir $OUT  --rds Sample1_purecn.rds \
1215
+Rscript Dx.R --out ${OUT}/Sample1  --rds Sample1_purecn.rds \
1216 1216
     --callable Sample1_callable_status_filtered.bed
1217 1217
 \end{verbatim}
1218 1218
 
... ...
@@ -1228,7 +1228,7 @@ Argument name  & Corresponding PureCN argument & PureCN function \\
1228 1228
 --rds -r      & file.rds & \Rfunction{readCurationFile}   \\
1229 1229
 --callable -a & callable & \Rfunction{callMutationBurden} \\
1230 1230
 --exclude -b  & exclude  & \Rfunction{callMutationBurden} \\
1231
+--out -o      & & \\
1231 1232
 \bottomrule
1232 1233
 \end{tabular}
1233 1234
 \end{table*}
... ...
@@ -25,13 +25,15 @@ vignette.
25 25
 
26 26
 \subsection*{Prepare environment and files}
27 27
 
28
-Get the path to command line scripts in R:
28
+\begin{itemize}
29
+
30
+\item Get the path to command line scripts in R:
29 31
 
30 32
 <<paths>>=
31 33
 system.file("extdata", package="PureCN")
32 34
 @ 
33 35
 
34
-Store this path in an environment variable, for example in BASH:
36
+\item Store this path in an environment variable, for example in BASH:
35 37
 
36 38
 \begin{verbatim}
37 39
 $ export PURECN="/path/to/PureCN/extdata"
... ...
@@ -39,11 +41,11 @@ $ Rscript $PURECN/PureCN.R --help
39 41
 Usage: /path/to/PureCN/inst/extdata/PureCN.R [-[-help|h]] ...
40 42
 \end{verbatim}
41 43
 
42
-Generate a basic interval file from a BED file containing target coordinates:
44
+\item Generate a basic interval file from a BED file containing target coordinates:
43 45
 
44 46
 \begin{verbatim}
45
-$ Rscript $PURECN/IntervalFile.R --infile ex_intervals.bed \ 
46
-    --fasta ex_reference.fa --outfile ex_gcgene.txt
47
+$ Rscript $PURECN/IntervalFile.R --infile baits_hg19.bed \ 
48
+    --fasta hg19.fa --outfile baits_hg19_gcgene.txt
47 49
 \end{verbatim}
48 50
 
49 51
 Internally, this script uses \Biocpkg{rtracklayer} to parse the
... ...
@@ -53,6 +55,8 @@ See the main vignette how to add gene symbols to the interval file. Symbols are
53 55
 necessary to obtain gene-level copy number and LOH calls. For a test run, you
54 56
 will not need this.
55 57
 
58
+\end{itemize}
59
+
56 60
 \subsection*{Run PureCN with third-party segmentation}
57 61
 
58 62
 If you already have a segmentation from third-party tools (for example CNVkit,
... ...
@@ -62,8 +66,8 @@ EXCAVATOR2). For a test run:
62 66
 Rscript $PURECN/PureCN.R --outdir $OUT/$SAMPLEID  \
63 67
     --sampleid $SAMPLEID \
64 68
     --segfile $OUT/$SAMPLEID/${SAMPLEID}.cnvkit.seg \
65
-    --vcf $VCF_FILE \
66
-    --genome hg19 --gcgene ex_gcgene.txt 
69
+    --vcf ${SAMPLEID}_mutect.vcf \
70
+    --genome hg19 --gcgene baits_hg19_gcgene.txt 
67 71
 \end{verbatim}
68 72
 
69 73
 The main VCF (\Rcode{--vcf}) is ideally created by \software{MuTect} 1.1.7.
... ...
@@ -77,12 +81,12 @@ and genome:
77 81
 \begin{verbatim}
78 82
 Rscript $PURECN/PureCN.R --outdir $OUT/$SAMPLEID  \
79 83
     --sampleid $SAMPLEID \
80
-    --segfile $OUT/$SAMPLEID/${SAMPLEID}.cnvkit.seg \
84
+    --segfile $OUT/$SAMPLEID/${SAMPLEID}_cnvkit.seg \
81 85
     --normal_panel $NORMAL_PANEL \
82
-    --vcf $VCF_FILE \
83
-    --statsfile $VCF_STATS_FILE \
86
+    --vcf ${SAMPLEID}_mutect.vcf \
87
+    --statsfile ${SAMPLEID}_mutect_stats.txt \
84 88
     --snpblacklist hg19_simpleRepeats.bed \
85
-    --genome hg19 --gcgene ex_gcgene.txt \
89
+    --genome hg19 --gcgene baits_hg19_gcgene.txt \
86 90
 #    --funsegmentation none \
87 91
     --force --postoptimize  
88 92
 \end{verbatim}
... ...
@@ -105,7 +109,8 @@ This results in a significant runtime increase for whole-exome data.
105 109
 
106 110
 The following describes \Biocpkg{PureCN} runs with internal copy number
107 111
 normalization and segmentation. Provided are again minimal examples for
108
-test runs.
112
+test runs. See the main vignette how to get optimal results in production
113
+pipelines.
109 114
 
110 115
 \subsubsection*{Coverage}
111 116
 
... ...
@@ -114,13 +119,13 @@ For each sample, tumor and normal:
114 119
 \begin{verbatim}
115 120
 # From a BAM file 
116 121
 $ Rscript $PURECN/Coverage.R --outdir $OUT/$SAMPLEID \ 
117
-    --bam example.bam \
118
-    --gcgene ex_gcgene.txt
122
+    --bam ${SAMPLEID}.bam \
123
+    --gcgene baits_hg19_gcgene.txt
119 124
 
120 125
 # From a GATK DepthOfCoverage file
121 126
 Rscript $PURECN/Coverage.R --outdir $OUT/$SAMPLEID \
122
-    --gatkcoverage example_tumor.txt \ 
123
-    --gcgene  ex_gcgene.txt
127
+    --gatkcoverage ${SAMPLEID}.coverage.sample_interval_summary \ 
128
+    --gcgene  baits_hg19_gcgene.txt
124 129
 \end{verbatim}
125 130
 
126 131
 \subsubsection*{NormalDB}
... ...
@@ -142,17 +147,23 @@ $ Rscript $PURECN/NormalDB.R --outdir $OUT \
142 147
 \begin{verbatim}
143 148
 cd $OUT/$SAMPLEID
144 149
 # From GC-normalized coverage data
145
-$ Rscript $PURECN/PureCN.R --outdir . --tumor example_tumor_loess.txt \ 
146
-    --normal example_normal_loess.txt --vcf $VCF_FILE -i $SAMPLEID \
147
-    --genome hg19 --gcgene  ex_gcgene.txt 
150
+$ Rscript $PURECN/PureCN.R --outdir . --tumor ${SAMPLEID}_coverage_loess.txt \ 
151
+    --normal ${SAMPLEID_NORMAL}_coverage_loess.txt \ 
152
+    --sampleid $SAMPLEID \
153
+    --vcf ${SAMPLEID}_mutect.vcf \
154
+    --genome hg19 \
155
+    --gcgene baits_hg19_gcgene.txt 
148 156
 
149 157
 # Without a matched normal    
150
-$ Rscript $PURECN/PureCN.R --outdir . --tumor example_tumor_loess.txt \ 
151
-    --normaldb ../normalDB_hg19.rds --pool 5 --vcf example_vcf.vcf -i $SAMPLEID \
152
-    --genome hg19 --gcgene example_gc.gene.file.txt 
158
+$ Rscript $PURECN/PureCN.R --outdir . --tumor ${SAMPLEID}_coverage_loess.txt \ 
159
+    --normaldb ../normalDB_hg19.rds \
160
+    --sampleid $SAMPLEID \
161
+    --vcf ${SAMPLEID}_mutect.vcf \
162
+    --pool 5 \
163
+    --genome hg19 --gcgene baits_hg19_gcgene.txt 
153 164
 
154 165
 # Recreate output after manual curation of Sample_purecn.csv
155
-$ Rscript $PURECN/PureCN.R --rds Sample1_purecn.rds
166
+$ Rscript $PURECN/PureCN.R --rds ${SAMPLEID}_purecn.rds
156 167
 \end{verbatim}
157 168
 
158 169
 \subsection*{Session Info}