git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/PureCN@129450 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -2,7 +2,7 @@ Package: PureCN |
2 | 2 |
Type: Package |
3 | 3 |
Title: Copy number calling and SNV classification using |
4 | 4 |
targeted short read sequencing |
5 |
-Version: 1.7.6 |
|
5 |
+Version: 1.7.7 |
|
6 | 6 |
Date: 2017-05-05 |
7 | 7 |
Authors@R: c(person("Markus", "Riester", role=c("aut", "cre"), |
8 | 8 |
email="[email protected]"), |
... | ... |
@@ -36,6 +36,7 @@ Imports: |
36 | 36 |
futile.logger, |
37 | 37 |
VGAM, |
38 | 38 |
edgeR, |
39 |
+ tools, |
|
39 | 40 |
limma |
40 | 41 |
Suggests: |
41 | 42 |
PSCBS, |
... | ... |
@@ -3,15 +3,16 @@ Changes in version 1.8.0 |
3 | 3 |
|
4 | 4 |
- Added mutation burden calculation |
5 | 5 |
- Code cleanups (switch to GRanges where possible) |
6 |
+- Added support for CNVkit coverage files (*.cnn, *.cnr) |
|
6 | 7 |
|
7 | 8 |
API CHANGES |
8 | 9 |
|
10 |
+- New function: callMutationBurden |
|
9 | 11 |
- Defunct functions: createSNPBlacklist, getDiploid, autoCurateResults |
10 | 12 |
- Minor changes: |
11 |
- - Remove gc.data from filterTargets since gc_bias is now added to tumor |
|
13 |
+ - Removed gc.data from filterTargets since gc_bias is now added to tumor |
|
12 | 14 |
coverage |
13 | 15 |
|
14 |
- |
|
15 | 16 |
PLANNED FEATURES FOR 1.8 |
16 | 17 |
|
17 | 18 |
- Better sample summary statistics, like mutation burden, chromosomal |
... | ... |
@@ -16,7 +16,8 @@ |
16 | 16 |
#' @param fun.countMutation Function that can be used to filter the |
17 | 17 |
#' input VCF further for filtering, for example to only keep missense |
18 | 18 |
#' mutations. Expects a \code{logical} vector indicating whether variant |
19 |
-#' should be counted (\code{TRUE}) or not (\code{FALSE}). |
|
19 |
+#' should be counted (\code{TRUE}) or not (\code{FALSE}). Default |
|
20 |
+#' is to keep only single nucleotide variants. |
|
20 | 21 |
#' @param callable \code{GRanges} object with callable genomic regions, |
21 | 22 |
#' for example obtained by \sQuote{GATK CallableLoci} BED file, imported |
22 | 23 |
#' with \code{rtracklayer}. |
... | ... |
@@ -49,7 +50,8 @@ |
49 | 50 |
#' |
50 | 51 |
#' @export callMutationBurden |
51 | 52 |
callMutationBurden <- function(res, id = 1, remove.flagged = TRUE, |
52 |
- min.prior.somatic=0.1, min.cellfraction=0, fun.countMutation=NULL, |
|
53 |
+ min.prior.somatic=0.1, min.cellfraction=0, |
|
54 |
+ fun.countMutation=function(vcf) width(vcf)==1, |
|
53 | 55 |
callable=NULL, exclude=NULL) { |
54 | 56 |
|
55 | 57 |
if (is.null(res$input$vcf)) { |
... | ... |
@@ -7,6 +7,8 @@ |
7 | 7 |
#' @param file Target coverage file. |
8 | 8 |
#' @param format File format. If missing, derived from the file |
9 | 9 |
#' extension. Currently only GATK DepthofCoverage format supported. |
10 |
+#' @param zero Start position is 0-based. Default is \code{FALSE} |
|
11 |
+#' for GATK, \code{TRUE} for BED file based intervals. |
|
10 | 12 |
#' @return A \code{data.frame} with the parsed coverage information. |
11 | 13 |
#' @author Markus Riester |
12 | 14 |
#' @seealso \code{\link{calculateBamCoverageByInterval}} |
... | ... |
@@ -16,14 +18,24 @@ |
16 | 18 |
#' package="PureCN") |
17 | 19 |
#' coverage <- readCoverageFile(tumor.coverage.file) |
18 | 20 |
#' |
21 |
+#' @importFrom tools file_ext |
|
19 | 22 |
#' @export readCoverageFile |
20 |
-readCoverageFile <- function(file, format) { |
|
21 |
- if (missing(format)) format <- "GATK" |
|
22 |
- targetCoverage <- .readCoverageGatk(file) |
|
23 |
+readCoverageFile <- function(file, format, zero=NULL) { |
|
24 |
+ if (missing(format)) format <- .getFormat(file) |
|
25 |
+ if (format %in% c("cnn", "cnr")) { |
|
26 |
+ targetCoverage <- .readCoverageCnn(file, zero) |
|
27 |
+ } else { |
|
28 |
+ targetCoverage <- .readCoverageGatk(file, zero) |
|
29 |
+ } |
|
23 | 30 |
.checkLowCoverage(targetCoverage) |
24 | 31 |
.checkIntervals(targetCoverage) |
25 | 32 |
} |
26 | 33 |
|
34 |
+.getFormat <- function(file) { |
|
35 |
+ ext <- file_ext(file) |
|
36 |
+ if (ext %in% c("cnn", "cnr")) return("cnn") |
|
37 |
+ "GATK" |
|
38 |
+} |
|
27 | 39 |
|
28 | 40 |
#' Read GATK coverage files |
29 | 41 |
#' |
... | ... |
@@ -44,18 +56,37 @@ readCoverageGatk <- function(file) { |
44 | 56 |
readCoverageFile(file, format="GATK") |
45 | 57 |
} |
46 | 58 |
|
47 |
-.readCoverageGatk <- function(file) { |
|
59 |
+.readCoverageGatk <- function(file, zero) { |
|
60 |
+ if (!is.null(zero)) flog.warn("zero ignored for GATK coverage files.") |
|
48 | 61 |
inputCoverage <- utils::read.table(file, header = TRUE) |
49 | 62 |
if (is.null(inputCoverage$total_coverage)) inputCoverage$total_coverage <- NA |
50 | 63 |
if (is.null(inputCoverage$average_coverage)) inputCoverage$average_coverage <- NA |
64 |
+ if (is.null(inputCoverage$ontarget)) inputCoverage$ontarget <- TRUE |
|
51 | 65 |
|
52 | 66 |
targetCoverage <- GRanges(inputCoverage$Target, |
53 | 67 |
coverage=inputCoverage$total_coverage, |
54 |
- average.coverage=inputCoverage$average_coverage) |
|
68 |
+ average.coverage=inputCoverage$average_coverage, |
|
69 |
+ ontarget=inputCoverage$ontarget) |
|
55 | 70 |
|
56 | 71 |
targetCoverage |
57 | 72 |
} |
58 | 73 |
|
74 |
+.readCoverageCnn <- function(file, zero) { |
|
75 |
+ if (is.null(zero)) zero <- TRUE |
|
76 |
+ inputCoverage <- utils::read.table(file, header = TRUE) |
|
77 |
+ if (zero) inputCoverage$start <- inputCoverage$start + 1 |
|
78 |
+ targetCoverage <- GRanges(inputCoverage) |
|
79 |
+ targetCoverage$coverage <- targetCoverage$depth * width(targetCoverage) |
|
80 |
+ targetCoverage$average.coverage <- targetCoverage$depth |
|
81 |
+ targetCoverage$ontarget <- TRUE |
|
82 |
+ targetCoverage$depth <- NULL |
|
83 |
+ targetCoverage$Gene <- targetCoverage$gene |
|
84 |
+ targetCoverage$ontarget[which(targetCoverage$Gene=="Background")] <- FALSE |
|
85 |
+ targetCoverage$gene <- NULL |
|
86 |
+ targetCoverage$log2 <- NULL |
|
87 |
+ targetCoverage |
|
88 |
+} |
|
89 |
+ |
|
59 | 90 |
.checkIntervals <- function(coverageGr) { |
60 | 91 |
if (min(width(coverageGr))<2) { |
61 | 92 |
flog.warn("Coverage data contains single nucleotide intervals.") |
... | ... |
@@ -10,7 +10,7 @@ spec <- matrix(c( |
10 | 10 |
'rds', 'r', 1, "character", |
11 | 11 |
'callable', 'a', 1, "character", |
12 | 12 |
'exclude', 'b', 1, "character", |
13 |
-'outdir', 'o', 1, "character" |
|
13 |
+'out', 'o', 1, "character" |
|
14 | 14 |
), byrow=TRUE, ncol=4) |
15 | 15 |
opt <- getopt(spec) |
16 | 16 |
|
... | ... |
@@ -31,9 +31,6 @@ if (is.null(infileRds)) stop("Need --rds") |
31 | 31 |
infileRds <- normalizePath(infileRds, mustWork=TRUE) |
32 | 32 |
|
33 | 33 |
# Parse outdir |
34 |
-outdir <- opt$outdir |
|
35 |
-if (is.null(outdir)) outdir <- dirname(infileRds) |
|
36 |
-outdir <- normalizePath(outdir, mustWork=TRUE) |
|
37 | 34 |
|
38 | 35 |
# Parse both BED files restricting covered region |
39 | 36 |
callableFile <- opt$callable |
... | ... |
@@ -42,7 +39,7 @@ if (!is.null(callableFile)) { |
42 | 39 |
suppressPackageStartupMessages(library(rtracklayer)) |
43 | 40 |
callableFile <- normalizePath(callableFile, mustWork=TRUE) |
44 | 41 |
flog.info("Reading %s...", callableFile) |
45 |
- callable <- import(callableFile) |
|
42 |
+ callable <- GRanges(import(callableFile)) |
|
46 | 43 |
} |
47 | 44 |
|
48 | 45 |
excludeFile <- opt$exclude |
... | ... |
@@ -51,7 +48,7 @@ if (!is.null(excludeFile)) { |
51 | 48 |
suppressPackageStartupMessages(library(rtracklayer)) |
52 | 49 |
excludeFile <- normalizePath(excludeFile, mustWork=TRUE) |
53 | 50 |
flog.info("Reading %s...", excludeFile) |
54 |
- exclude <- import(excludeFile) |
|
51 |
+ exclude <- GRanges(import(excludeFile)) |
|
55 | 52 |
} |
56 | 53 |
|
57 | 54 |
|
... | ... |
@@ -60,12 +57,28 @@ if (!is.null(excludeFile)) { |
60 | 57 |
flog.info("Loading PureCN...") |
61 | 58 |
suppressPackageStartupMessages(library(PureCN)) |
62 | 59 |
|
63 |
-flog.info("Reading %s...", infileRds) |
|
64 | 60 |
res <- readCurationFile(infileRds) |
65 | 61 |
sampleid <- res$input$sampleid |
66 |
-file.suffix <- "" |
|
67 | 62 |
|
68 |
-outfileMb <- file.path(outdir, paste0(sampleid, file.suffix, '_mutation_burden.csv')) |
|
63 |
+.getOutPrefix <- function(opt, infile, sampleid) { |
|
64 |
+ out <- opt[["out"]] |
|
65 |
+ if (is.null(out)) { |
|
66 |
+ if (!is.null(infile) && file.exists(infile)) { |
|
67 |
+ outdir <- dirname(infile) |
|
68 |
+ prefix <- sampleid |
|
69 |
+ } else { |
|
70 |
+ stop("Need --out") |
|
71 |
+ } |
|
72 |
+ } else { |
|
73 |
+ outdir <- dirname(out) |
|
74 |
+ prefix <- basename(out) |
|
75 |
+ } |
|
76 |
+ outdir <- normalizePath(outdir, mustWork=TRUE) |
|
77 |
+ outPrefix <- file.path(outdir, prefix) |
|
78 |
+} |
|
79 |
+outPrefix <- .getOutPrefix(opt, infileRds, sampleid) |
|
80 |
+ |
|
81 |
+outfileMb <- paste0(outPrefix, '_mutation_burden.csv') |
|
69 | 82 |
|
70 | 83 |
force <- !is.null(opt$force) |
71 | 84 |
|
... | ... |
@@ -30,7 +30,7 @@ spec <- matrix(c( |
30 | 30 |
'modelhomozygous','y', 0, "logical", |
31 | 31 |
'model', 'q', 1, "character", |
32 | 32 |
'funsegmentation','w', 1, "character", |
33 |
-'outdir', 'o', 1, "character", |
|
33 |
+'outdir', 'o', 1, "character", |
|
34 | 34 |
'outvcf', 'u', 0, "logical", |
35 | 35 |
'twopass', 'T', 0, "logical", |
36 | 36 |
'sampleid', 'i', 1, "character" |
... | ... |
@@ -78,7 +78,7 @@ outvcf <- !is.null(opt$outvcf) |
78 | 78 |
pool <- opt$pool |
79 | 79 |
model.homozygous <- !is.null(opt$modelhomozygous) |
80 | 80 |
file.rds <- opt$rds |
81 |
-file.suffix <- '' |
|
81 |
+sampleidExtension <- if (is.null(opt$extension)) '' else opt$extension |
|
82 | 82 |
|
83 | 83 |
if (!is.null(file.rds) && file.exists(file.rds)) { |
84 | 84 |
if (is.null(outdir)) outdir <- dirname(file.rds) |
... | ... |
@@ -86,7 +86,7 @@ if (!is.null(file.rds) && file.exists(file.rds)) { |
86 | 86 |
if (is.null(sampleid)) stop("Need --sampleid.") |
87 | 87 |
if (is.null(genome)) stop("Need --genome") |
88 | 88 |
genome <- as.character(genome) |
89 |
- file.rds <- file.path(outdir, paste0(sampleid, file.suffix, '.rds')) |
|
89 |
+ file.rds <- file.path(outdir, paste0(sampleid, sampleidExtension, '.rds')) |
|
90 | 90 |
if (is.null(seg.file)) { |
91 | 91 |
tumor.coverage.file <- normalizePath(tumor.coverage.file, |
92 | 92 |
mustWork=TRUE) |
... | ... |
@@ -159,9 +159,9 @@ if (file.exists(file.rds) && !force) { |
159 | 159 |
} |
160 | 160 |
normal.coverage.file <- .getNormalCoverage(normal.coverage.file) |
161 | 161 |
|
162 |
- file.log <- file.path(outdir, paste0(sampleid, file.suffix, '.log')) |
|
162 |
+ file.log <- file.path(outdir, paste0(sampleid, sampleidExtension, '.log')) |
|
163 | 163 |
|
164 |
- pdf(paste(outdir,"/", sampleid, file.suffix, '_segmentation.pdf', sep=''), |
|
164 |
+ pdf(paste(outdir,"/", sampleid, sampleidExtension, '_segmentation.pdf', sep=''), |
|
165 | 165 |
width=10, height=11) |
166 | 166 |
af.range = c(0.03, 0.97) |
167 | 167 |
if (!is.null(opt$minaf)) { |
... | ... |
@@ -232,43 +232,43 @@ if (file.exists(file.rds) && !force) { |
232 | 232 |
### Create output files ------------------------------------------------------- |
233 | 233 |
|
234 | 234 |
createCurationFile(file.rds) |
235 |
-file.pdf <- file.path(outdir, paste0(sampleid, file.suffix, '.pdf')) |
|
235 |
+file.pdf <- file.path(outdir, paste0(sampleid, sampleidExtension, '.pdf')) |
|
236 | 236 |
pdf(file.pdf, width=10, height=11) |
237 | 237 |
plotAbs(ret, type='all') |
238 | 238 |
dev.off() |
239 | 239 |
|
240 |
-file.png <- file.path(outdir, paste0(sampleid, file.suffix, '_contamination.png')) |
|
240 |
+file.png <- file.path(outdir, paste0(sampleid, sampleidExtension, '_contamination.png')) |
|
241 | 241 |
png(file.png, width=800) |
242 | 242 |
plotAbs(ret,1, type='contamination') |
243 | 243 |
dev.off() |
244 | 244 |
|
245 | 245 |
if (outvcf) { |
246 |
- file.vcf <- file.path(outdir, paste0(sampleid, file.suffix, '.vcf')) |
|
246 |
+ file.vcf <- file.path(outdir, paste0(sampleid, sampleidExtension, '.vcf')) |
|
247 | 247 |
vcfanno <- predictSomatic(ret, return.vcf=TRUE, |
248 | 248 |
vcf.field.prefix="PureCN.") |
249 | 249 |
writeVcf(vcfanno, file=file.vcf) |
250 | 250 |
} else { |
251 |
- file.csv <- file.path(outdir, paste0(sampleid, file.suffix, '_variants.csv')) |
|
251 |
+ file.csv <- file.path(outdir, paste0(sampleid, sampleidExtension, '_variants.csv')) |
|
252 | 252 |
write.csv(cbind(Sampleid=sampleid, predictSomatic(ret)), file=file.csv, |
253 | 253 |
row.names=FALSE, quote=FALSE) |
254 | 254 |
} |
255 | 255 |
|
256 |
-file.loh <- file.path(outdir, paste0(sampleid, file.suffix, '_loh.csv')) |
|
256 |
+file.loh <- file.path(outdir, paste0(sampleid, sampleidExtension, '_loh.csv')) |
|
257 | 257 |
write.csv(cbind(Sampleid=sampleid, callLOH(ret)), file=file.loh, |
258 | 258 |
row.names=FALSE, quote=FALSE) |
259 | 259 |
|
260 |
-file.genes <- file.path(outdir, paste0(sampleid, file.suffix, '_genes.csv')) |
|
260 |
+file.genes <- file.path(outdir, paste0(sampleid, sampleidExtension, '_genes.csv')) |
|
261 | 261 |
allAlterations <- callAlterations(ret, all.genes=TRUE) |
262 | 262 |
|
263 | 263 |
write.csv(cbind(Sampleid=sampleid, gene.symbol=rownames(allAlterations), |
264 | 264 |
allAlterations), row.names=FALSE, file=file.genes, quote=FALSE) |
265 | 265 |
|
266 |
-file.seg <- file.path(outdir, paste0(sampleid, file.suffix, '_dnacopy.txt')) |
|
266 |
+file.seg <- file.path(outdir, paste0(sampleid, sampleidExtension, '_dnacopy.txt')) |
|
267 | 267 |
write.table(ret$results[[1]]$seg, file=file.seg, sep="\t", quote=FALSE, |
268 | 268 |
row.names=FALSE) |
269 | 269 |
|
270 | 270 |
if (!is.null(ret$input$vcf)) { |
271 |
- file.pdf <- file.path(outdir, paste0(sampleid, file.suffix, '_chromosomes.pdf')) |
|
271 |
+ file.pdf <- file.path(outdir, paste0(sampleid, sampleidExtension, '_chromosomes.pdf')) |
|
272 | 272 |
pdf(file.pdf, width=9, height=10) |
273 | 273 |
vcf <- ret$input$vcf[ret$results[[1]]$SNV.posterior$vcf.ids] |
274 | 274 |
chromosomes <- seqlevelsInUse(vcf) |
0 | 6 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,6 @@ |
1 |
+chromosome start end gene log2 depth weight |
|
2 |
+chr1 10500 68590 Background 0.55584 0.70587 0.466868 |
|
3 |
+chr1 70509 176917 Background 0.235896 1.02411 0.482562 |
|
4 |
+chr1 227917 267219 Background 0.163203 0.387996 0.408305 |
|
5 |
+chr1 318219 367158 Background 0.375418 1.42616 0.424955 |
|
6 |
+chr1 367658 367893 . 0.68569 17.617 0.310347 |
... | ... |
@@ -9,4 +9,20 @@ test_readCoverageFile <- function() { |
9 | 9 |
checkEquals(3, length(coverage)) |
10 | 10 |
checkEqualsNumeric(c(1216042, 1216606, 1216791), start(coverage)) |
11 | 11 |
checkEqualsNumeric(c(1216050, 1216678, 1217991), end(coverage)) |
12 |
+ |
|
13 |
+ coverageFile <- system.file("extdata", "example_normal3.cnn", package="PureCN") |
|
14 |
+ coverage <- readCoverageFile(coverageFile) |
|
15 |
+ checkEquals(4, length(coverage)) |
|
16 |
+ checkEqualsNumeric(c(762097, 861281, 865591, 866325)+1, start(coverage)) |
|
17 |
+ checkEqualsNumeric(c(762270, 861490, 865791, 866498), end(coverage)) |
|
18 |
+ coverage <- readCoverageFile(coverageFile, zero=FALSE) |
|
19 |
+ checkEquals(4, length(coverage)) |
|
20 |
+ checkEqualsNumeric(c(762097, 861281, 865591, 866325), start(coverage)) |
|
21 |
+ checkEqualsNumeric(c(762270, 861490, 865791, 866498), end(coverage)) |
|
22 |
+ coverageFile <- system.file("extdata", "example_normal4.cnr", package="PureCN") |
|
23 |
+ coverage <- readCoverageFile(coverageFile) |
|
24 |
+ checkEquals(5, length(coverage)) |
|
25 |
+ checkEqualsNumeric(c(10500, 70509, 227917, 318219, 367658)+1, start(coverage)) |
|
26 |
+ checkEqualsNumeric(c(68590, 176917, 267219, 367158, 367893), end(coverage)) |
|
27 |
+ checkEquals(c(FALSE, FALSE, FALSE, FALSE, TRUE), coverage$ontarget) |
|
12 | 28 |
} |
... | ... |
@@ -5,8 +5,9 @@ |
5 | 5 |
\title{Call mutation burden} |
6 | 6 |
\usage{ |
7 | 7 |
callMutationBurden(res, id = 1, remove.flagged = TRUE, |
8 |
- min.prior.somatic = 0.1, min.cellfraction = 0, fun.countMutation = NULL, |
|
9 |
- callable = NULL, exclude = NULL) |
|
8 |
+ min.prior.somatic = 0.1, min.cellfraction = 0, |
|
9 |
+ fun.countMutation = function(vcf) width(vcf) == 1, callable = NULL, |
|
10 |
+ exclude = NULL) |
|
10 | 11 |
} |
11 | 12 |
\arguments{ |
12 | 13 |
\item{res}{Return object of the \code{\link{runAbsoluteCN}} function.} |
... | ... |
@@ -27,7 +28,8 @@ with very low allelic fraction.} |
27 | 28 |
\item{fun.countMutation}{Function that can be used to filter the |
28 | 29 |
input VCF further for filtering, for example to only keep missense |
29 | 30 |
mutations. Expects a \code{logical} vector indicating whether variant |
30 |
-should be counted (\code{TRUE}) or not (\code{FALSE}).} |
|
31 |
+should be counted (\code{TRUE}) or not (\code{FALSE}). Default |
|
32 |
+is to keep only single nucleotide variants.} |
|
31 | 33 |
|
32 | 34 |
\item{callable}{\code{GRanges} object with callable genomic regions, |
33 | 35 |
for example obtained by \sQuote{GATK CallableLoci} BED file, imported |
... | ... |
@@ -4,13 +4,16 @@ |
4 | 4 |
\alias{readCoverageFile} |
5 | 5 |
\title{Read coverage file} |
6 | 6 |
\usage{ |
7 |
-readCoverageFile(file, format) |
|
7 |
+readCoverageFile(file, format, zero = NULL) |
|
8 | 8 |
} |
9 | 9 |
\arguments{ |
10 | 10 |
\item{file}{Target coverage file.} |
11 | 11 |
|
12 | 12 |
\item{format}{File format. If missing, derived from the file |
13 | 13 |
extension. Currently only GATK DepthofCoverage format supported.} |
14 |
+ |
|
15 |
+\item{zero}{Start position is 0-based. Default is \code{FALSE} |
|
16 |
+for GATK, \code{TRUE} for BED file based intervals.} |
|
14 | 17 |
} |
15 | 18 |
\value{ |
16 | 19 |
A \code{data.frame} with the parsed coverage information. |
... | ... |
@@ -1205,14 +1205,14 @@ Dx.R extracts copy number and mutation metrics from PureCN.R output. |
1205 | 1205 |
|
1206 | 1206 |
\begin{verbatim} |
1207 | 1207 |
# Basic output |
1208 |
-Rscript Dx.R --outdir $OUT --rds Sample1_purecn.rds |
|
1208 |
+Rscript Dx.R --out ${OUT}/Sample1 --rds Sample1_purecn.rds |
|
1209 | 1209 |
|
1210 | 1210 |
# Provide a BED file with callable regions, for examples obtained by |
1211 | 1211 |
# GATK CallableLoci. Useful to calculate mutations per megabase and |
1212 | 1212 |
# to exclude low quality regions. |
1213 | 1213 |
grep CALLABLE Sample1_callable_status.bed > \ |
1214 | 1214 |
Sample1_callable_status_filtered.bed |
1215 |
-Rscript Dx.R --outdir $OUT --rds Sample1_purecn.rds \ |
|
1215 |
+Rscript Dx.R --out ${OUT}/Sample1 --rds Sample1_purecn.rds \ |
|
1216 | 1216 |
--callable Sample1_callable_status_filtered.bed |
1217 | 1217 |
\end{verbatim} |
1218 | 1218 |
|
... | ... |
@@ -1228,7 +1228,7 @@ Argument name & Corresponding PureCN argument & PureCN function \\ |
1228 | 1228 |
--rds -r & file.rds & \Rfunction{readCurationFile} \\ |
1229 | 1229 |
--callable -a & callable & \Rfunction{callMutationBurden} \\ |
1230 | 1230 |
--exclude -b & exclude & \Rfunction{callMutationBurden} \\ |
1231 |
+--out -o & & \\ |
|
1231 | 1232 |
\bottomrule |
1232 | 1233 |
\end{tabular} |
1233 | 1234 |
\end{table*} |
... | ... |
@@ -25,13 +25,15 @@ vignette. |
25 | 25 |
|
26 | 26 |
\subsection*{Prepare environment and files} |
27 | 27 |
|
28 |
-Get the path to command line scripts in R: |
|
28 |
+\begin{itemize} |
|
29 |
+ |
|
30 |
+\item Get the path to command line scripts in R: |
|
29 | 31 |
|
30 | 32 |
<<paths>>= |
31 | 33 |
system.file("extdata", package="PureCN") |
32 | 34 |
@ |
33 | 35 |
|
34 |
-Store this path in an environment variable, for example in BASH: |
|
36 |
+\item Store this path in an environment variable, for example in BASH: |
|
35 | 37 |
|
36 | 38 |
\begin{verbatim} |
37 | 39 |
$ export PURECN="/path/to/PureCN/extdata" |
... | ... |
@@ -39,11 +41,11 @@ $ Rscript $PURECN/PureCN.R --help |
39 | 41 |
Usage: /path/to/PureCN/inst/extdata/PureCN.R [-[-help|h]] ... |
40 | 42 |
\end{verbatim} |
41 | 43 |
|
42 |
-Generate a basic interval file from a BED file containing target coordinates: |
|
44 |
+\item Generate a basic interval file from a BED file containing target coordinates: |
|
43 | 45 |
|
44 | 46 |
\begin{verbatim} |
45 |
-$ Rscript $PURECN/IntervalFile.R --infile ex_intervals.bed \ |
|
46 |
- --fasta ex_reference.fa --outfile ex_gcgene.txt |
|
47 |
+$ Rscript $PURECN/IntervalFile.R --infile baits_hg19.bed \ |
|
48 |
+ --fasta hg19.fa --outfile baits_hg19_gcgene.txt |
|
47 | 49 |
\end{verbatim} |
48 | 50 |
|
49 | 51 |
Internally, this script uses \Biocpkg{rtracklayer} to parse the |
... | ... |
@@ -53,6 +55,8 @@ See the main vignette how to add gene symbols to the interval file. Symbols are |
53 | 55 |
necessary to obtain gene-level copy number and LOH calls. For a test run, you |
54 | 56 |
will not need this. |
55 | 57 |
|
58 |
+\end{itemize} |
|
59 |
+ |
|
56 | 60 |
\subsection*{Run PureCN with third-party segmentation} |
57 | 61 |
|
58 | 62 |
If you already have a segmentation from third-party tools (for example CNVkit, |
... | ... |
@@ -62,8 +66,8 @@ EXCAVATOR2). For a test run: |
62 | 66 |
Rscript $PURECN/PureCN.R --outdir $OUT/$SAMPLEID \ |
63 | 67 |
--sampleid $SAMPLEID \ |
64 | 68 |
--segfile $OUT/$SAMPLEID/${SAMPLEID}.cnvkit.seg \ |
65 |
- --vcf $VCF_FILE \ |
|
66 |
- --genome hg19 --gcgene ex_gcgene.txt |
|
69 |
+ --vcf ${SAMPLEID}_mutect.vcf \ |
|
70 |
+ --genome hg19 --gcgene baits_hg19_gcgene.txt |
|
67 | 71 |
\end{verbatim} |
68 | 72 |
|
69 | 73 |
The main VCF (\Rcode{--vcf}) is ideally created by \software{MuTect} 1.1.7. |
... | ... |
@@ -77,12 +81,12 @@ and genome: |
77 | 81 |
\begin{verbatim} |
78 | 82 |
Rscript $PURECN/PureCN.R --outdir $OUT/$SAMPLEID \ |
79 | 83 |
--sampleid $SAMPLEID \ |
80 |
- --segfile $OUT/$SAMPLEID/${SAMPLEID}.cnvkit.seg \ |
|
84 |
+ --segfile $OUT/$SAMPLEID/${SAMPLEID}_cnvkit.seg \ |
|
81 | 85 |
--normal_panel $NORMAL_PANEL \ |
82 |
- --vcf $VCF_FILE \ |
|
83 |
- --statsfile $VCF_STATS_FILE \ |
|
86 |
+ --vcf ${SAMPLEID}_mutect.vcf \ |
|
87 |
+ --statsfile ${SAMPLEID}_mutect_stats.txt \ |
|
84 | 88 |
--snpblacklist hg19_simpleRepeats.bed \ |
85 |
- --genome hg19 --gcgene ex_gcgene.txt \ |
|
89 |
+ --genome hg19 --gcgene baits_hg19_gcgene.txt \ |
|
86 | 90 |
# --funsegmentation none \ |
87 | 91 |
--force --postoptimize |
88 | 92 |
\end{verbatim} |
... | ... |
@@ -105,7 +109,8 @@ This results in a significant runtime increase for whole-exome data. |
105 | 109 |
|
106 | 110 |
The following describes \Biocpkg{PureCN} runs with internal copy number |
107 | 111 |
normalization and segmentation. Provided are again minimal examples for |
108 |
-test runs. |
|
112 |
+test runs. See the main vignette how to get optimal results in production |
|
113 |
+pipelines. |
|
109 | 114 |
|
110 | 115 |
\subsubsection*{Coverage} |
111 | 116 |
|
... | ... |
@@ -114,13 +119,13 @@ For each sample, tumor and normal: |
114 | 119 |
\begin{verbatim} |
115 | 120 |
# From a BAM file |
116 | 121 |
$ Rscript $PURECN/Coverage.R --outdir $OUT/$SAMPLEID \ |
117 |
- --bam example.bam \ |
|
118 |
- --gcgene ex_gcgene.txt |
|
122 |
+ --bam ${SAMPLEID}.bam \ |
|
123 |
+ --gcgene baits_hg19_gcgene.txt |
|
119 | 124 |
|
120 | 125 |
# From a GATK DepthOfCoverage file |
121 | 126 |
Rscript $PURECN/Coverage.R --outdir $OUT/$SAMPLEID \ |
122 |
- --gatkcoverage example_tumor.txt \ |
|
123 |
- --gcgene ex_gcgene.txt |
|
127 |
+ --gatkcoverage ${SAMPLEID}.coverage.sample_interval_summary \ |
|
128 |
+ --gcgene baits_hg19_gcgene.txt |
|
124 | 129 |
\end{verbatim} |
125 | 130 |
|
126 | 131 |
\subsubsection*{NormalDB} |
... | ... |
@@ -142,17 +147,23 @@ $ Rscript $PURECN/NormalDB.R --outdir $OUT \ |
142 | 147 |
\begin{verbatim} |
143 | 148 |
cd $OUT/$SAMPLEID |
144 | 149 |
# From GC-normalized coverage data |
145 |
-$ Rscript $PURECN/PureCN.R --outdir . --tumor example_tumor_loess.txt \ |
|
146 |
- --normal example_normal_loess.txt --vcf $VCF_FILE -i $SAMPLEID \ |
|
147 |
- --genome hg19 --gcgene ex_gcgene.txt |
|
150 |
+$ Rscript $PURECN/PureCN.R --outdir . --tumor ${SAMPLEID}_coverage_loess.txt \ |
|
151 |
+ --normal ${SAMPLEID_NORMAL}_coverage_loess.txt \ |
|
152 |
+ --sampleid $SAMPLEID \ |
|
153 |
+ --vcf ${SAMPLEID}_mutect.vcf \ |
|
154 |
+ --genome hg19 \ |
|
155 |
+ --gcgene baits_hg19_gcgene.txt |
|
148 | 156 |
|
149 | 157 |
# Without a matched normal |
150 |
-$ Rscript $PURECN/PureCN.R --outdir . --tumor example_tumor_loess.txt \ |
|
151 |
- --normaldb ../normalDB_hg19.rds --pool 5 --vcf example_vcf.vcf -i $SAMPLEID \ |
|
152 |
- --genome hg19 --gcgene example_gc.gene.file.txt |
|
158 |
+$ Rscript $PURECN/PureCN.R --outdir . --tumor ${SAMPLEID}_coverage_loess.txt \ |
|
159 |
+ --normaldb ../normalDB_hg19.rds \ |
|
160 |
+ --sampleid $SAMPLEID \ |
|
161 |
+ --vcf ${SAMPLEID}_mutect.vcf \ |
|
162 |
+ --pool 5 \ |
|
163 |
+ --genome hg19 --gcgene baits_hg19_gcgene.txt |
|
153 | 164 |
|
154 | 165 |
# Recreate output after manual curation of Sample_purecn.csv |
155 |
-$ Rscript $PURECN/PureCN.R --rds Sample1_purecn.rds |
|
166 |
+$ Rscript $PURECN/PureCN.R --rds ${SAMPLEID}_purecn.rds |
|
156 | 167 |
\end{verbatim} |
157 | 168 |
|
158 | 169 |
\subsection*{Session Info} |