git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/PureCN@116349 bc3139a8-67e5-0310-9ffc-ced21a209358
... | ... |
@@ -2,16 +2,15 @@ Package: PureCN |
2 | 2 |
Type: Package |
3 | 3 |
Title: Estimating tumor purity, ploidy, LOH, and SNV status using |
4 | 4 |
hybrid capture NGS data |
5 |
-Version: 0.99.6 |
|
5 |
+Version: 0.99.7 |
|
6 | 6 |
Date: 2016-03-30 |
7 | 7 |
Author: Markus Riester |
8 | 8 |
Maintainer: Markus Riester <[email protected]> |
9 | 9 |
Description: This package estimates tumor purity, copy number, loss of |
10 |
- heterozygosity (LOH), and status of short nucleotide variants |
|
11 |
- (SNVs). PureCN is designed for hybrid capture next generation |
|
12 |
- sequencing (NGS) data, integrates well with standard somatic |
|
13 |
- variant detection pipelines, and has support for tumor samples |
|
14 |
- without matching normal samples. |
|
10 |
+ heterozygosity (LOH), and status of short nucleotide variants (SNVs). |
|
11 |
+ PureCN is designed for hybrid capture next generation sequencing (NGS) |
|
12 |
+ data, integrates well with standard somatic variant detection pipelines, |
|
13 |
+ and has support for tumor samples without matching normal samples. |
|
15 | 14 |
Depends: R (>= 3.3), DNAcopy, VariantAnnotation (>= 1.14.1) |
16 | 15 |
Imports: GenomicRanges (>= 1.20.3), IRanges (>= 2.2.1), RColorBrewer, |
17 | 16 |
S4Vectors, data.table, grDevices, graphics, stats, utils, |
... | ... |
@@ -22,3 +21,4 @@ License: Artistic-2.0 |
22 | 21 |
biocViews: CopyNumberVariation, Software, Sequencing, |
23 | 22 |
VariantAnnotation, VariantDetection |
24 | 23 |
NeedsCompilation: no |
24 |
+Packaged: 2016-04-14 21:52:04 UTC; riestma1 |
... | ... |
@@ -86,11 +86,6 @@ max.exon.ratio) { |
86 | 86 |
Mi + g * (1 - p))/(p * lr.C[j] + 2 * (1 - p)), sd = sd.ar, log = TRUE) - |
87 | 87 |
log(lr.C[j] + 1), -Inf)))) |
88 | 88 |
} else { |
89 |
- # p.ar <- lapply(c(0,1), function(g) lapply(1:length(ar_all), function(j) |
|
90 |
- # sapply(test.num.copy, function(Mi) ifelse(Mi <= lr.C[j], dbeta(x = (p * Mi + g |
|
91 |
- # * (1 - p))/(p * lr.C[j] + 2 * (1 - p)), shape1=ar_all[j]*dp_all[j]+1, |
|
92 |
- # shape2=(1-ar_all[j])*dp_all[j]+1, log = TRUE)-log(lr.C[j]+1+haploid.penalty) , |
|
93 |
- # -Inf)))) |
|
94 | 89 |
p.ar <- lapply(c(0, 1), function(g) lapply(1:length(ar_all), function(j) sapply(test.num.copy, |
95 | 90 |
function(Mi) ifelse(Mi <= lr.C[j], dbeta(x = (p * Mi + g * (1 - |
96 | 91 |
p))/(p * lr.C[j] + 2 * (1 - p)), shape1 = ar_all[j] * dp_all[j] + |
... | ... |
@@ -265,7 +260,7 @@ max.exon.ratio) { |
265 | 260 |
return(result) |
266 | 261 |
} |
267 | 262 |
|
268 |
-.flagResults <- function(results, max.non.clonal = 0.2, max.logr.sdev, logr.sdev, |
|
263 |
+.flagResults <- function(results, max.non.clonal = 0.2, max.logr.sdev, logr.sdev, max.segments, |
|
269 | 264 |
flag = NA, flag_comment = NA) { |
270 | 265 |
results <- lapply(results, .flagResult, max.non.clonal = max.non.clonal) |
271 | 266 |
|
... | ... |
@@ -273,6 +268,7 @@ max.exon.ratio) { |
273 | 268 |
# results[[2]]$total.log.likelihood ) / abs( results[[1]]$total.log.likelihood ) |
274 | 269 |
# if (ldiff < 0.1) { results[[1]]$flag <- TRUE results[[1]]$flag_comment <- |
275 | 270 |
# 'SIMILAR TO SECOND MOST LIKELY OPTIMUM' return(results) } |
271 |
+ number.segments <- nrow(results[[1]]$seg) |
|
276 | 272 |
|
277 | 273 |
if (logr.sdev > max.logr.sdev) { |
278 | 274 |
for (i in 1:length(results)) { |
... | ... |
@@ -281,6 +277,14 @@ max.exon.ratio) { |
281 | 277 |
"NOISY LOG-RATIO") |
282 | 278 |
} |
283 | 279 |
} |
280 |
+ |
|
281 |
+ if (number.segments > max.segments) { |
|
282 |
+ for (i in 1:length(results)) { |
|
283 |
+ results[[i]]$flag <- TRUE |
|
284 |
+ results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment, |
|
285 |
+ "NOISY SEGMENTATION") |
|
286 |
+ } |
|
287 |
+ } |
|
284 | 288 |
|
285 | 289 |
# some global flags |
286 | 290 |
if (!is.na(flag) && flag) { |
... | ... |
@@ -519,4 +523,10 @@ max.exon.ratio) { |
519 | 523 |
data(chr.hash, envir = environment()) |
520 | 524 |
chr.hash[as.character(ls), 2] |
521 | 525 |
} |
526 |
+.add.chr.name <- function(ls) { |
|
527 |
+ chr.hash <- NULL |
|
528 |
+ data(chr.hash, envir = environment()) |
|
529 |
+ as.character(chr.hash$chr[match(ls, chr.hash$number)]) |
|
530 |
+} |
|
531 |
+ |
|
522 | 532 |
|
... | ... |
@@ -1,10 +1,13 @@ |
1 | 1 |
createNormalDatabase <- structure(function( |
2 |
-### Function to create a database of normal samples, used to find a good match for tumor copy number normalization. |
|
2 |
+### Function to create a database of normal samples, used to find |
|
3 |
+### a good match for tumor copy number normalization. |
|
3 | 4 |
gatk.normal.files, |
4 |
-### Vector with file names pointing to GATK coverage files of normal samples. |
|
5 |
+### Vector with file names pointing to GATK coverage files |
|
6 |
+### of normal samples. |
|
5 | 7 |
... |
6 | 8 |
### Arguments passed to the prcomp function. |
7 | 9 |
) { |
10 |
+ gatk.normal.files <- normalizePath(gatk.normal.files) |
|
8 | 11 |
normals <- lapply(gatk.normal.files, readCoverageGatk) |
9 | 12 |
normals.m <- do.call(cbind, lapply(normals, function(x) x$average.coverage)) |
10 | 13 |
idx <- complete.cases(normals.m) |
... | ... |
@@ -15,7 +18,8 @@ gatk.normal.files, |
15 | 18 |
exons.used=idx, |
16 | 19 |
coverage=apply(normals.m, 2, mean, na.rm=TRUE), |
17 | 20 |
exon.median.coverage=apply(normals.m,1,median, na.rm=TRUE), |
18 |
- exon.log2.sd.coverage=apply(log2(normals.m+1),1,sd, na.rm=TRUE) |
|
21 |
+ exon.log2.sd.coverage=apply(log2(normals.m+1),1,sd, na.rm=TRUE), |
|
22 |
+ sex=sapply(normals,getSexFromCoverage) |
|
19 | 23 |
) |
20 | 24 |
### A normal database that can be used in the findBestNormal function to |
21 | 25 |
### retrieve good matching normal samples for a given tumor sample. |
... | ... |
@@ -27,11 +31,13 @@ normalDB <- createNormalDatabase(gatk.normal.files) |
27 | 31 |
}) |
28 | 32 |
|
29 | 33 |
createExonWeightFile <- structure(function(# Calculate exon weights |
30 |
-### Creates an exon weight file useful for segmentation, by down-weighting unreliable exons. |
|
34 |
+### Creates an exon weight file useful for segmentation, by |
|
35 |
+### down-weighting unreliable exons. |
|
31 | 36 |
gatk.tumor.files, |
32 | 37 |
### A small number (1-3) of GATK tumor coverage samples. |
33 | 38 |
gatk.normal.files, |
34 |
-### A large number of GATK normal coverage samples (>20) to estimate exon log-ratio standard deviations. |
|
39 |
+### A large number of GATK normal coverage samples (>20) |
|
40 |
+### to estimate exon log-ratio standard deviations. |
|
35 | 41 |
exon.weight.file |
36 | 42 |
### Output filename. |
37 | 43 |
) { |
... | ... |
@@ -51,7 +57,7 @@ exon.weight.file |
51 | 57 |
ret <- data.frame(Target=tumor.coverage[[1]][,1], Weights=zz) |
52 | 58 |
|
53 | 59 |
write.table(ret, file=exon.weight.file,row.names=FALSE, quote=FALSE, sep="\t") |
54 |
- ret |
|
60 |
+ invisible(ret) |
|
55 | 61 |
###A data.frame with exon weights. |
56 | 62 |
}, ex=function() { |
57 | 63 |
exon.weight.file <- "exon_weights.txt" |
... | ... |
@@ -7,13 +7,16 @@ normalDB, |
7 | 7 |
pcs=1:3, |
8 | 8 |
### Principal components to use for distance calculation. |
9 | 9 |
num.normals=1, |
10 |
-## Return the num.normals best normals. |
|
10 |
+### Return the num.normals best normals. |
|
11 |
+ignore.sex=FALSE, |
|
12 |
+### If FALSE, detects sex of sample returns a best normal |
|
13 |
+### with matching sex. |
|
11 | 14 |
verbose=TRUE |
15 |
+### Verbose output. |
|
12 | 16 |
) { |
13 | 17 |
if (is.character(gatk.tumor.file)) { |
14 | 18 |
tumor <- readCoverageGatk(gatk.tumor.file) |
15 | 19 |
} else { |
16 |
- if (verbose) message("gatk.tumor.file does not appear to be a filename, assuming it is valid GATK coverage data.") |
|
17 | 20 |
tumor <- gatk.tumor.file |
18 | 21 |
} |
19 | 22 |
x <- t(tumor[normalDB$exons.used,"average.coverage", drop=FALSE]) |
... | ... |
@@ -22,8 +25,22 @@ verbose=TRUE |
22 | 25 |
idx.pcs <- pcs |
23 | 26 |
idx.pcs <- idx.pcs[idx.pcs %in% 1:ncol(normalDB$pca$x)] |
24 | 27 |
|
25 |
- best.match <- order(sapply(1:length(normalDB$gatk.normal.files), function(i) dist( rbind(predict(normalDB$pca, x)[1,idx.pcs], predict(normalDB$pca)[i,idx.pcs]))[1])) |
|
26 |
- normalDB$gatk.normal.files[head(best.match, num.normals)] |
|
28 |
+ idx.normals <- seq_along(normalDB$gatk.normal.files) |
|
29 |
+ |
|
30 |
+ if (!ignore.sex && !is.null(normalDB$sex) && sum(!is.na(normalDB$sex))>0) { |
|
31 |
+ sex <- getSexFromCoverage(tumor, verbose=FALSE) |
|
32 |
+ if (verbose) message(paste("Sex of sample:", sex)) |
|
33 |
+ if (!is.na(sex)) { |
|
34 |
+ idx.normals <- which(normalDB$sex == sex) |
|
35 |
+ } |
|
36 |
+ if (length(idx.normals) < 2) { |
|
37 |
+ warning(paste("Not enough samples of sex", sex, "in database. Ignoring sex.")) |
|
38 |
+ idx.normals <- seq_along(normalDB$gatk.normal.files) |
|
39 |
+ } |
|
40 |
+ } |
|
41 |
+ |
|
42 |
+ best.match <- order(sapply(idx.normals, function(i) dist( rbind(predict(normalDB$pca, x)[1,idx.pcs], predict(normalDB$pca)[i,idx.pcs]))[1])) |
|
43 |
+ normalDB$gatk.normal.files[idx.normals][head(best.match, num.normals)] |
|
27 | 44 |
### Return the filename of the best matching normal |
28 | 45 |
},ex=function() { |
29 | 46 |
gatk.normal.file <- system.file("extdata", "example_normal.txt", package="PureCN") |
30 | 47 |
new file mode 100755 |
... | ... |
@@ -0,0 +1,42 @@ |
1 |
+getSexFromCoverage <- function(# Get sample sex from coverage |
|
2 |
+### This function determines the sex of a sample by the coverage |
|
3 |
+### ratio of chrX and chrY. |
|
4 |
+gatk.coverage, |
|
5 |
+### GATK coverage file or data read with readCoverageGatk. |
|
6 |
+min.ratio=20, |
|
7 |
+### Min chrX/chrY coverage ratio to call sample as female. |
|
8 |
+verbose=TRUE |
|
9 |
+### Verbose output. |
|
10 |
+) { |
|
11 |
+ if (is.character(gatk.coverage)) { |
|
12 |
+ x <- readCoverageGatk(gatk.coverage) |
|
13 |
+ } else { |
|
14 |
+ x <- gatk.coverage |
|
15 |
+ } |
|
16 |
+ |
|
17 |
+ sex.chr <- .getSexChr(x) |
|
18 |
+ xx <- split(x$average.coverage, x$chr) |
|
19 |
+ avg.coverage <- sapply(xx, mean, na.rm=TRUE) |
|
20 |
+ if (is.na(avg.coverage[sex.chr[1]]) || is.na(avg.coverage[sex.chr[2]]) ) { |
|
21 |
+ if (verbose) message("Allosome coverage appears to be missing, cannot determine sex.") |
|
22 |
+ return(NA) |
|
23 |
+ } |
|
24 |
+ |
|
25 |
+ autosome.ratio <- mean(avg.coverage[-match(sex.chr, names(avg.coverage))], na.rm=TRUE)/(avg.coverage[sex.chr[1]]+0.0001) |
|
26 |
+ if (autosome.ratio > 5) { |
|
27 |
+ if (verbose) message("Allosome coverage very low, cannot determine sex.") |
|
28 |
+ return(NA) |
|
29 |
+ } |
|
30 |
+ XY.ratio <- avg.coverage[sex.chr[1]]/ (avg.coverage[sex.chr[2]]+ 0.0001) |
|
31 |
+ if (XY.ratio > min.ratio) return("F") |
|
32 |
+ return("M") |
|
33 |
+### Returns "M" for male, "F" for female, or NULL if unknown. |
|
34 |
+} |
|
35 |
+ |
|
36 |
+.getSexChr <- function(gatk.coverage) { |
|
37 |
+ if ("chrX" %in% gatk.coverage$chr) { |
|
38 |
+ return(c("chrX", "chrY")) |
|
39 |
+ } |
|
40 |
+ return(as.character(23:24)) |
|
41 |
+} |
|
42 |
+ |
... | ... |
@@ -1,35 +1,67 @@ |
1 | 1 |
runAbsoluteCN <- |
2 |
-structure(function(# Run our implementation of ABSOLUTE |
|
3 |
-### This function takes as input tumor and normal control coverage and allelic fractions of germline variants and somatic mutations. |
|
4 |
-### Coverage data is provied in GATK DepthOfCoverage format, allelic fraction in VCF format (e.g. obtained by MuTect). |
|
5 |
-### Normal control does not need to be matched (from the same patient). In case VCF does not contain somatic status, it should contain |
|
6 |
-### dbSNP and optionally COSMIC annotation. |
|
2 |
+structure(function(# Run PureCN implementation of ABSOLUTE |
|
3 |
+### This function takes as input tumor and normal control coverage and |
|
4 |
+### allelic fractions of germline variants and somatic mutations. |
|
5 |
+### Coverage data is provied in GATK DepthOfCoverage format, allelic fraction |
|
6 |
+### in VCF format (e.g. obtained by MuTect). Normal control does not need to |
|
7 |
+### be matched (from the same patient). In case VCF does not contain somatic |
|
8 |
+### status, it should contain dbSNP and optionally COSMIC annotation. |
|
7 | 9 |
gatk.normal.file=NULL, |
8 |
-### GATK coverage file of normal control (optional if log.ratio is provided - then it will be only used to filter low coverage exons). Should be already GC-normalized. |
|
10 |
+### GATK coverage file of normal control (optional if |
|
11 |
+### log.ratio is provided - then it will be only used to filter low coverage |
|
12 |
+### exons). Should be already GC-normalized. Needs to be either a file name |
|
13 |
+### or data read with the readCoverageGatk function. |
|
9 | 14 |
gatk.tumor.file, |
10 |
-### GATK coverage file of tumor. Should be already GC-normalized. |
|
15 |
+### GATK coverage file of tumor. Should be already |
|
16 |
+### GC-normalized. Needs to be either a file name or data read with the |
|
17 |
+### readCoverageGatk function. |
|
11 | 18 |
log.ratio=NULL, |
12 |
-### Copy number log-ratios for all exons in the coverage files. If NULL, calculated based on coverage files. |
|
19 |
+### Copy number log-ratios for all exons in the coverage files. |
|
20 |
+### If NULL, calculated based on coverage files. |
|
13 | 21 |
seg.file=NULL, |
14 |
-### Segmented data. Optional, to support matched SNP6 data. If null, use coverage files or log.ratio to segment the data. |
|
22 |
+### Segmented data. Optional, to support matched SNP6 data. |
|
23 |
+### If null, use coverage files or log.ratio to segment the data. |
|
15 | 24 |
seg.file.sdev=0.4, |
16 |
-### If seg.file provided, the log-ratio standard deviation, used to model likelihood of sub-clonal copy number events. |
|
25 |
+### If seg.file provided, the log-ratio standard deviation, |
|
26 |
+### used to model likelihood of sub-clonal copy number events. |
|
17 | 27 |
vcf.file=NULL, |
18 |
-### VCF file, tested with MuTect output files. Optional, but typically needed to select between local optima of similar likelihood. Can also be a CollapsedVCF, read with the readVcf function. Requires a DB info flag for dbSNP membership. The default fun.setPriorVcf function will also look for a Cosmic.CNT slot, containing the hits in the COSMIC database. Again, do not expect very useful results without a VCF file. |
|
28 |
+### VCF file, tested with MuTect output files. Optional, but |
|
29 |
+### typically needed to select between local optima of similar likelihood. Can |
|
30 |
+### also be a CollapsedVCF, read with the readVcf function. Requires a DB info |
|
31 |
+### flag for dbSNP membership. The default fun.setPriorVcf function will also |
|
32 |
+### look for a Cosmic.CNT slot, containing the hits in the COSMIC database. |
|
33 |
+### Again, do not expect very useful results without a VCF file. |
|
19 | 34 |
genome="hg19", |
20 | 35 |
### Genome version, required for the readVcf function. |
36 |
+sex=c("?","F","M"), |
|
37 |
+### Sex of sample. If ?, detect. |
|
21 | 38 |
fun.filterVcf=filterVcfMuTect, |
22 |
-### Function for filtering variants. Expected output is a list with elements vcf (CollapsedVCF), flag (TRUE/FALSE) and flag_comment (string). The flags will be added to the output data and can be used to warn users, for example when samples look too noisy. Default filter will remove variants flagged by MuTect, but will keep germline variants. If ran in matched normal mode, it will by default use somatic status of variants and filter non-somatic calls with allelic fraction significantly different from 0.5 in normal. |
|
39 |
+### Function for filtering variants. Expected output is a |
|
40 |
+### list with elements vcf (CollapsedVCF), flag (TRUE/FALSE) and flag_comment |
|
41 |
+### (string). The flags will be added to the output data and can be used to |
|
42 |
+### warn users, for example when samples look too noisy. Default filter will |
|
43 |
+### remove variants flagged by MuTect, but will keep germline variants. If |
|
44 |
+### ran in matched normal mode, it will by default use somatic status of |
|
45 |
+### variants and filter non-somatic calls with allelic fraction significantly |
|
46 |
+### different from 0.5 in normal. |
|
23 | 47 |
args.filterVcf=list(), |
24 |
-### Arguments for variant filtering function. Arguments vcf, tumor.id.in.vcf, coverage.cutoff and verbose are required in the filter function and are automatically set (do NOT set them here again). |
|
48 |
+### Arguments for variant filtering function. Arguments |
|
49 |
+### vcf, tumor.id.in.vcf, coverage.cutoff and verbose are required in the |
|
50 |
+### filter function and are automatically set (do NOT set them here again). |
|
25 | 51 |
fun.setPriorVcf=setPriorVcf, |
26 |
-### Function to set prior for somatic status for each variant in the VCF. |
|
52 |
+### Function to set prior for somatic status for each |
|
53 |
+### variant in the VCF. |
|
27 | 54 |
args.setPriorVcf=list(), |
28 | 55 |
### Arguments for somatic prior function. |
29 | 56 |
fun.segmentation=segmentationCBS, |
30 |
-### Function for segmenting the copy number log-ratios. Expected return value is a list with elements seg (the segmentation) and size (the size in bp for all segments). |
|
57 |
+### Function for segmenting the copy number log-ratios. |
|
58 |
+### Expected return value is a list with elements seg (the segmentation) and |
|
59 |
+### size (the size in bp for all segments). |
|
31 | 60 |
args.segmentation=list(), |
32 |
-### Arguments for segmentation function. Arguments normal, tumor, log.ratio, plot.cnv, coverage.cutoff, sampleid, vcf, tumor.id.in.vcf, verbose are required in the segmentation function and automatically set (do NOT set them here again). |
|
61 |
+### Arguments for segmentation function. Arguments |
|
62 |
+### normal, tumor, log.ratio, plot.cnv, coverage.cutoff, sampleid, vcf, |
|
63 |
+### tumor.id.in.vcf, verbose are required in the segmentation function and |
|
64 |
+### automatically set (do NOT set them here again). |
|
33 | 65 |
fun.focal=findFocal, |
34 | 66 |
### Function for identifying focal amplifications. |
35 | 67 |
args.focal=list(), |
... | ... |
@@ -41,37 +73,72 @@ min.ploidy=1, |
41 | 73 |
max.ploidy=6, |
42 | 74 |
### Maximum ploidy to be considered. |
43 | 75 |
test.num.copy=0:7, |
44 |
-### Copy numbers tested in the grid search. Note that focal amplifications can have much higher copy numbers, but they will be labeled as subclonal (because they do not fit the integer copy numbers). |
|
76 |
+### Copy numbers tested in the grid search. Note that focal |
|
77 |
+### amplifications can have much higher copy numbers, but they will be labeled |
|
78 |
+### as subclonal (because they do not fit the integer copy numbers). |
|
45 | 79 |
test.purity=seq(0.05,0.95,by=0.01), |
46 | 80 |
### Considered tumor purity values. |
47 | 81 |
prior.purity=rep(1,length(test.purity))/length(test.purity), |
48 |
-### Priors for purity if they are available. Only change when you know what you are doing. |
|
82 |
+### Priors for purity if they are available. Only change |
|
83 |
+### when you know what you are doing. |
|
49 | 84 |
max.candidate.solutions=15, |
50 |
-### Number of local optima considered in optimization and variant fitting steps. If there are too many local optima, it will use specified number of top candidate solutions, but will also include all optima close to diploid, because silent genomes have often lots of local optima. |
|
85 |
+### Number of local optima considered in optimization |
|
86 |
+### and variant fitting steps. If there are too many local optima, it will use |
|
87 |
+### specified number of top candidate solutions, but will also include all |
|
88 |
+### optima close to diploid, because silent genomes have often lots of local |
|
89 |
+### optima. |
|
51 | 90 |
candidates=NULL, |
52 |
-### Candidates to optimize from a previous run (return.object$candidates). If NULL, do 2D grid search and find local optima. |
|
91 |
+### Candidates to optimize from a previous run |
|
92 |
+### (return.object$candidates). |
|
93 |
+### If NULL, do 2D grid search and find local optima. |
|
53 | 94 |
coverage.cutoff=15, |
54 |
-### Minimum exon coverage in both normal and tumor. Exons with lower coverage are ingored. The cutoff choice depends on the expected purity and overall coverage. High purity samples might need a lower cutoff to call homozygous deletions. If an exon.weigh.file (below) is NOT specified, it is recommended to set a higher cutoff (e.g. 20) to remove noise from unreliable exon measurements. |
|
95 |
+### Minimum exon coverage in both normal and tumor. Exons |
|
96 |
+### with lower coverage are ingored. The cutoff choice depends on the expected |
|
97 |
+### purity and overall coverage. High purity samples might need a lower cutoff |
|
98 |
+### to call homozygous deletions. If an exon.weigh.file (below) is NOT |
|
99 |
+### specified, it is recommended to set a higher cutoff (e.g. 20) to remove |
|
100 |
+### noise from unreliable exon measurements. |
|
55 | 101 |
max.non.clonal=0.2, |
56 |
-### Maximum genomic fraction assigned to a subclonal copy number state. |
|
102 |
+### Maximum genomic fraction assigned to a subclonal copy |
|
103 |
+### number state. |
|
104 |
+max.homozygous.loss=0.1, |
|
105 |
+### Maximum genomic fraction assigned to homozygous loss. |
|
106 |
+### This is set to a fairly high default value to not exclude correct |
|
107 |
+### solutions, especially in noisy segmentations. |
|
57 | 108 |
iterations=30, |
58 |
-### Maximum number of iterations in the Simulated Annealing copy number fit optimization. |
|
109 |
+### Maximum number of iterations in the Simulated Annealing copy |
|
110 |
+### number fit optimization. |
|
59 | 111 |
log.ratio.calibration=0.25, |
60 |
-### re-calibrate log-ratios in the window sd(log.ratio)*log.ratio.calibration. |
|
112 |
+### re-calibrate log-ratios in the window |
|
113 |
+### sd(log.ratio)*log.ratio.calibration. |
|
61 | 114 |
gc.gene.file=NULL, |
62 |
-### GC gene file, which assigns GC content and gene symbols to each exon in the coverage files. Used for generating gene level calls. First column in format CHR:START-END. Second column GC content (0 to 1). Third column gene symbol. |
|
115 |
+### GC gene file, which assigns GC content and gene symbols |
|
116 |
+### to each exon in the coverage files. Used for generating gene level calls. |
|
117 |
+### First column in format CHR:START-END. Second column GC content (0 to 1). |
|
118 |
+### Third column gene symbol. |
|
63 | 119 |
filter.lowhigh.gc.exons=0.001, |
64 |
-### Quantile q (defines lower q and upper 1-q) for removing exons with outlier GC profile. Assuming that GC correction might not have been worked on those. Requires gc.gene.file. |
|
120 |
+### Quantile q (defines lower q and upper 1-q) |
|
121 |
+### for removing exons with outlier GC profile. Assuming that GC correction |
|
122 |
+### might not have been worked on those. Requires gc.gene.file. |
|
65 | 123 |
filter.targeted.base=4, |
66 |
-### Exclude exons with targeted base (size) smaller than this cutoff. |
|
124 |
+### Exclude exons with targeted base (size) smaller |
|
125 |
+### than this cutoff. |
|
67 | 126 |
max.logr.sdev=0.75, |
68 |
-### Flag noisy samples with segment log-ratio standard deviation larger than this. Assay specific and needs to be calibrated. |
|
127 |
+### Flag noisy samples with segment log-ratio standard deviation |
|
128 |
+### larger than this. Assay specific and needs to be calibrated. |
|
129 |
+max.segments=200, |
|
130 |
+### Flag noisy samples with a large number of segments. Assay |
|
131 |
+### specific and needs to be calibrated. |
|
69 | 132 |
plot.cnv=TRUE, |
70 | 133 |
### Generate segmentation plots. |
71 | 134 |
verbose=TRUE, |
72 | 135 |
### Verbose output. |
73 | 136 |
post.optimize=FALSE, |
74 |
-### Optimize purity using final SCNA-fit and SNVs. This might take a long time when lots of SNVs need to be fitted, but will result in a slightly more accurate purity, especially for rather silent genomes or very low purities. Otherwise, it will just use the purity determined via the SCNA-fit. |
|
137 |
+### Optimize purity using final SCNA-fit and SNVs. This |
|
138 |
+### might take a long time when lots of SNVs need to be fitted, but will |
|
139 |
+### typically result in a slightly more accurate purity, especially for rather |
|
140 |
+### silent genomes or very low purities. Otherwise, it will just use the |
|
141 |
+### purity determined via the SCNA-fit. |
|
75 | 142 |
... |
76 | 143 |
### Additional parameters passed to the segmentation function. |
77 | 144 |
) { |
... | ... |
@@ -132,6 +199,29 @@ post.optimize=FALSE, |
132 | 199 |
if (is.null(gatk.normal.file)) normal <- tumor |
133 | 200 |
if (!is.null(seg.file)) stop("Provide either log.ratio or seg.file, not both.") |
134 | 201 |
} |
202 |
+ sex <- match.arg(sex) |
|
203 |
+ sex.chr <- .getSexChr(tumor) |
|
204 |
+ if (sex =="?") { |
|
205 |
+ sex.tumor <- getSexFromCoverage(tumor, verbose=FALSE) |
|
206 |
+ sex.normal <- getSexFromCoverage(normal, verbose=FALSE) |
|
207 |
+ if (is.na(sex.tumor)) { |
|
208 |
+ tumor <- .removeChr(tumor, remove.chrs=sex.chr) |
|
209 |
+ } |
|
210 |
+ if (is.na(sex.normal)) { |
|
211 |
+ normal <- .removeChr(normal, remove.chrs=sex.chr) |
|
212 |
+ } |
|
213 |
+ if (!identical(sex.tumor, sex.normal)) warning(paste("Sex tumor/normal mismatch: tumor =", sex.tumor, "normal =", sex.normal)) |
|
214 |
+ sex <- sex.tumor |
|
215 |
+ if (is.na(sex)) sex = "?" |
|
216 |
+ } |
|
217 |
+ if (sex=="M") { |
|
218 |
+ tumor <- .removeChr(tumor, remove.chrs=sex.chr) |
|
219 |
+ } else if (sex=="F") { |
|
220 |
+ tumor <- .removeChr(tumor, remove.chrs=sex.chr[2]) |
|
221 |
+ } |
|
222 |
+ |
|
223 |
+ if (verbose) message(paste("Sex of sample:", sex)) |
|
224 |
+ |
|
135 | 225 |
# NA's in log.ratio confuse the CBS function |
136 | 226 |
idx <- !is.na(log.ratio) & !is.infinite(log.ratio) |
137 | 227 |
log.ratio <- log.ratio[idx] |
... | ... |
@@ -171,6 +261,7 @@ post.optimize=FALSE, |
171 | 261 |
vcf.germline <- NULL |
172 | 262 |
tumor.id.in.vcf <- NULL |
173 | 263 |
prior.somatic <- NULL |
264 |
+ vcf.filtering <-list(flag=FALSE, flag_comment="") |
|
174 | 265 |
|
175 | 266 |
if (!is.null(vcf.file)) { |
176 | 267 |
if (verbose) message("Loading VCF...") |
... | ... |
@@ -358,6 +449,10 @@ post.optimize=FALSE, |
358 | 449 |
# set probability to zero if ploidy is not within requested range |
359 | 450 |
log.prior.ploidy <- log(ifelse(ploidy<min.ploidy | ploidy>max.ploidy,0,1)) |
360 | 451 |
if (iter > 1) p.rij <- p.rij+log.prior.ploidy |
452 |
+ |
|
453 |
+ frac.homozygous.loss <- sapply(test.num.copy, function(Ci) (sum(li[-i] * ifelse(C[-i]==0,1,0)) + li[i] * ifelse(Ci==0,1,0))/sum(li) ) |
|
454 |
+ log.prior.homozygous.loss <- log(ifelse(frac.homozygous.loss > max.homozygous.loss,0,1)) |
|
455 |
+ if (iter > 1) p.rij <- p.rij+log.prior.homozygous.loss |
|
361 | 456 |
|
362 | 457 |
# model sub clonal state with a uniform distribution |
363 | 458 |
p.rij <- c(p.rij, .calcLlikSegmentSubClonal( exon.lrs[[i]]+log.ratio.offset[i], max.exon.ratio)) |
... | ... |
@@ -443,7 +538,7 @@ post.optimize=FALSE, |
443 | 538 |
SNV.posterior <- res.snvllik[[idx]] |
444 | 539 |
} |
445 | 540 |
|
446 |
- list(log.likelihood=llik, purity=p, ploidy=weighted.mean(C,li), total.ploidy=total.ploidy, seg=seg.adjusted, C.posterior=data.frame(C.posterior/rowSums(C.posterior), ML.C=C, ML.Subclonal=subclonal), SNV.posterior=SNV.posterior, fraction.subclonal=subclonal.f, gene.calls=gene.calls, log.ratio.offset=log.ratio.offset, failed=FALSE) |
|
541 |
+ list(log.likelihood=llik, purity=p, ploidy=weighted.mean(C,li), total.ploidy=total.ploidy, seg=seg.adjusted, C.posterior=data.frame(C.posterior/rowSums(C.posterior), ML.C=C, ML.Subclonal=subclonal), SNV.posterior=SNV.posterior, fraction.subclonal=subclonal.f, fraction.homozygous.loss=sum(li[which(C<0.01)])/sum(li), gene.calls=gene.calls, log.ratio.offset=log.ratio.offset, failed=FALSE) |
|
447 | 542 |
} |
448 | 543 |
|
449 | 544 |
results <- lapply(1:nrow(candidate.solutions$candidates),.optimizeSolution) |
... | ... |
@@ -451,7 +546,7 @@ post.optimize=FALSE, |
451 | 546 |
|
452 | 547 |
results <- .rankResults(results) |
453 | 548 |
results <- .filterDuplicatedResults(results) |
454 |
- results <- .flagResults(results, max.non.clonal=max.non.clonal, max.logr.sdev=max.logr.sdev, logr.sdev=sd.seg, flag=vcf.filtering$flag, flag_comment=vcf.filtering$flag_comment) |
|
549 |
+ results <- .flagResults(results, max.non.clonal=max.non.clonal, max.logr.sdev=max.logr.sdev, logr.sdev=sd.seg, max.segments=max.segments, flag=vcf.filtering$flag, flag_comment=vcf.filtering$flag_comment) |
|
455 | 550 |
if (!is.null(gc.gene.file)) { |
456 | 551 |
# Add LOH calls to gene level calls |
457 | 552 |
for (i in 1:length(results)) { |
... | ... |
@@ -459,30 +554,37 @@ post.optimize=FALSE, |
459 | 554 |
} |
460 | 555 |
} |
461 | 556 |
|
462 |
- for (i in 1:length(results)) { |
|
463 |
- results[[i]]$SNV.posterior$beta.model$posteriors <- .getVariantCallsLOH( results[[i]] ) |
|
557 |
+ if (!is.null(vcf.file)) { |
|
558 |
+ for (i in 1:length(results)) { |
|
559 |
+ results[[i]]$SNV.posterior$beta.model$posteriors <- .getVariantCallsLOH( results[[i]] ) |
|
560 |
+ } |
|
464 | 561 |
} |
465 | 562 |
##value<< A list with elements |
466 | 563 |
list( |
467 | 564 |
candidates=candidate.solutions, ##<< Results of the grid search. |
468 | 565 |
results=results, ##<< All local optima, sorted by final rank. |
469 |
- input=list(tumor=gatk.tumor.file, normal=gatk.normal.file, log.ratio=data.frame(probe=normal[,1], log.ratio=log.ratio), log.ratio.sdev=sd.seg, vcf=vcf, sampleid=sampleid) ##<< The input data. |
|
566 |
+ input=list(tumor=gatk.tumor.file, normal=gatk.normal.file, log.ratio=data.frame(probe=normal[,1], log.ratio=log.ratio), log.ratio.sdev=sd.seg, vcf=vcf, sampleid=sampleid, sex=sex) ##<< The input data. |
|
470 | 567 |
) |
471 | 568 |
##end<< |
472 | 569 |
},ex=function(){ |
473 |
-gatk.normal.file <- system.file("extdata", "example_normal.txt", package="PureCN") |
|
474 |
-gatk.tumor.file <- system.file("extdata", "example_tumor.txt", package="PureCN") |
|
475 |
-vcf.file <- system.file("extdata", "example_vcf.vcf", package="PureCN") |
|
476 |
-gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt", package="PureCN") |
|
477 |
- |
|
478 |
-# Speed-up the runAbsoluteCN by using the stored grid-search |
|
570 |
+gatk.normal.file <- system.file("extdata", "example_normal.txt", |
|
571 |
+ package="PureCN") |
|
572 |
+gatk.tumor.file <- system.file("extdata", "example_tumor.txt", |
|
573 |
+ package="PureCN") |
|
574 |
+vcf.file <- system.file("extdata", "example_vcf.vcf", |
|
575 |
+ package="PureCN") |
|
576 |
+gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt", |
|
577 |
+ package="PureCN") |
|
578 |
+ |
|
579 |
+# Speed-up the runAbsoluteCN call by using the stored grid-search |
|
479 | 580 |
# (purecn.example.output$candidates). |
480 | 581 |
data(purecn.example.output) |
481 | 582 |
|
482 | 583 |
# The max.candidate.solutions parameter is set to a very low value only to |
483 | 584 |
# speed-up this example. This is not a good idea for real samples. |
484 | 585 |
|
485 |
-ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, gatk.tumor.file=gatk.tumor.file, |
|
486 |
- candidates=purecn.example.output$candidates, max.candidate.solutions=2, |
|
586 |
+ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, |
|
587 |
+ gatk.tumor.file=gatk.tumor.file, |
|
588 |
+ candidates=purecn.example.output$candidates, max.candidate.solutions=2, |
|
487 | 589 |
vcf.file=vcf.file, sampleid='Sample1', gc.gene.file=gc.gene.file) |
488 | 590 |
}) |
... | ... |
@@ -69,7 +69,7 @@ ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, gatk.tumor.file=gatk.tumo |
69 | 69 |
.pruneByVCF <- function(x, vcf, tumor.id.in.vcf, min.size=5, max.pval=0.00001, iterations=3, debug=FALSE) { |
70 | 70 |
seg <- segments.p(x$cna) |
71 | 71 |
for (iter in 1:iterations) { |
72 |
- seg.gr <- GRanges(seqnames=paste("chr", seg$chrom,sep=""), IRanges(start=seg$loc.start, end=seg$loc.end)) |
|
72 |
+ seg.gr <- GRanges(seqnames=.add.chr.name(seg$chrom), IRanges(start=seg$loc.start, end=seg$loc.end)) |
|
73 | 73 |
ov <- findOverlaps(seg.gr, vcf) |
74 | 74 |
ar <- sapply(geno(vcf)$FA[,tumor.id.in.vcf], function(x) x[1]) |
75 | 75 |
ar.r <- ifelse(ar>0.5, 1-ar, ar) |
76 | 76 |
deleted file mode 100755 |
... | ... |
@@ -1,119 +0,0 @@ |
1 |
-<h1 id="quick-start">Quick Start</h1> |
|
2 |
-<h2 id="basic-input-files">Basic Input Files</h2> |
|
3 |
-<p>You will need coverage files in GATK DepthOfCoverage format for tumor and a control sample:</p> |
|
4 |
-<pre><code>Target total_coverage average_coverage Sample1_total_cvg Sample1_mean_cvg Sample1_granular_Q1 Sample1_granular_median Sample1_granular_Q3 Sample1_._above_15 |
|
5 |
-chr1:69091-70009 0 0 0 0 1 1 1 0 |
|
6 |
-chr1:367659-368598 6358 9.25121153819759 6358 9.25121153819759 1 7 13 11.6 |
|
7 |
-chr1:621096-622035 6294 9.16910019318401 6294 9.16910019318401 1 7 12 9.5</code></pre> |
|
8 |
-<p>PureCN will look for the columns "Target", "total_coverage", and "average_coverage". The algorithm works best when the coverage data is already GC normalized. The steps below will NOT GC normalize.</p> |
|
9 |
-<p>You will also need a VCF file with both somatic and germline variants, for example as obtained by MuTect.</p> |
|
10 |
-<h2 id="first-run">First Run</h2> |
|
11 |
-<p>In R:</p> |
|
12 |
-<pre><code>library(PureCN) |
|
13 |
-ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, |
|
14 |
- gatk.tumor.file=gatk.tumor.file, |
|
15 |
- vcf.file=vcf.file)</code></pre> |
|
16 |
-<p>To obtain gene level copy number calls, we need an exon-to-gene mapping file. This is provided together with GC content via the gc.gene.file argument. The expected format:</p> |
|
17 |
-<pre><code>Target gc_bias Gene |
|
18 |
-chr1:69091-70009 0.427638737758433 OR4F5 |
|
19 |
-chr1:367659-368598 0.459574468085106 OR4F29 |
|
20 |
-chr1:621096-622035 0.459574468085106 OR4F3</code></pre> |
|
21 |
-<h2 id="pool-of-normals">Pool of Normals</h2> |
|
22 |
-<p>If a pool of normal samples is available, we can use the coverage data to estimate the expected variance in coverage per exon. This will help the segmentation algorithm to filter noise:</p> |
|
23 |
-<pre><code>gatk.normal.files <- dir("poolofnormals", |
|
24 |
- pattern="coverage.sample_interval_summary", full.names=TRUE) |
|
25 |
-createExonWeightFile(gatk.normal.files[1:2], gatk.normal.files[-(1:2)], "exon_weights.txt")</code></pre> |
|
26 |
-<p>This will calculate log-ratios for all normals using the first two as tumor and the remaining ones as normal. This estimates then exon level coverage standard deviations.</p> |
|
27 |
-<p>We can also use the pool of normal to find SNPs with biased allelic fractions (significantly different from 0.5 for heterozygous SNPs):</p> |
|
28 |
-<pre><code>mutect.normal.files <- dir("poolofnormals", pattern="vcf$", full.names=TRUE) |
|
29 |
-snp.blacklist <- createSNPBlacklist(mutect.normal.files) |
|
30 |
-write.csv(snp.bl, file="SNP_blacklist.csv") |
|
31 |
-write.csv(snp.bl, file="SNP_blacklist_segmented.csv", row.names=FALSE, quote=FALSE)</code></pre> |
|
32 |
-<p>Finally, we can use the pool of normals to find normal samples for log-ratio calculation, especially when no matched normal sample is available:</p> |
|
33 |
-<pre><code>if (!file.exists("normalDB.rds")) { |
|
34 |
- normalDB <- createNormalDatabase(gatk.normal.files) |
|
35 |
- saveRDS(normalDB, "normalDB.rds") |
|
36 |
-} else { |
|
37 |
- normalDB <- readRDS("normalDBe4.rds") |
|
38 |
-} |
|
39 |
- |
|
40 |
-# get the best 4 normals and average them |
|
41 |
-gatk.normal.file <- findBestNormal(gatk.tumor.file, normalDB, num.normals=4) |
|
42 |
-pool <- pool.coverage2(lapply(gatk.normal.file, read.coverage.gatk), remove.chrs=c('chrX', 'chrY')) |
|
43 |
- |
|
44 |
-# get the best normal |
|
45 |
-gatk.normal.file <- findBestNormal(gatk.tumor.file, normalDB)</code></pre> |
|
46 |
-<h2 id="vcf-artifact-filtering">VCF Artifact Filtering</h2> |
|
47 |
-<p>If MuTect was run in matched normal mode, then all germline variants are rejected, that means we cannot just filter by the KEEP/REJECT MuTect flags. The filterVcfMuTect function optionally reads the MuTect stats file and will keep germline variants, while removing potential artifacts. Otherwise it will use only the filters based on read depths as defined in filterVcfBasic.</p> |
|
48 |
-<h2 id="recommended-run">Recommended Run</h2> |
|
49 |
-<p>Finally, we can run PureCN with all that information:</p> |
|
50 |
-<pre><code> ret <-runAbsoluteCN(gatk.normal.file=pool, gatk.tumor.file=gatk.tumor.file, |
|
51 |
- vcf.file=vcf.file, sampleid='Sample1', gc.gene.file=gc.gene.file, |
|
52 |
- args.filterVcf=list(snp.blacklist=snp.blacklist, stats.file=mutect.stats.file), |
|
53 |
- args.segmentation=list(exon.weight.file=exon.weight.file), |
|
54 |
- post.optimize=TRUE)</code></pre> |
|
55 |
-<p>The post.optimize flag will increase the runtime by a lot, but might be worth it if the copy number profile is not very clean. This is recommended with lower coverage datasets or if there are significant capture biases.</p> |
|
56 |
-<p>We also need to create a few output files:</p> |
|
57 |
-<pre><code>file.rds <- 'Sample1_PureCN.rds' |
|
58 |
-saveRDS(ret, file=file.rds) |
|
59 |
-pdf('Sample1_PureCN.pdf'), width=10, height=12) |
|
60 |
-plotAbs(ret, type='all') |
|
61 |
-dev.off()</code></pre> |
|
62 |
-<h2 id="manual-curation">Manual Curation</h2> |
|
63 |
-<p>For prediction of SNV status (germline vs somatic, sub-clonal vs. clonal, homozyous vs heterozygous), it is important that both purity and ploidy are correct. We provide functionality for curating results:</p> |
|
64 |
-<pre><code>createCurationFile(file.rds) </code></pre> |
|
65 |
-<p>This will generate a CSV file, in which the correct purity and ploidy values can be manually entered. It also contains a column "Curated", which should be set to TRUE, otherwise the file will be overwritten when re-run.</p> |
|
66 |
-<p>Then in R, the correct solution (closest to the combination in the CSV file) can be loaded with the readCurationFile function:</p> |
|
67 |
-<pre><code>ret <- readCurationFile(file.rds)</code></pre> |
|
68 |
-<h2 id="custom-segmentation">Custom Segmentation</h2> |
|
69 |
-<p>By default, we will use DNAcopy to segment the log-ratio. You can easily change that to your favorite method and the segmentationCBS function can serve as an example.</p> |
|
70 |
-<pre><code>segmentationCBS <- function (normal, tumor, log.ratio, plot.cnv, coverage.cutoff, |
|
71 |
- sampleid = sampleid, exon.weight.file = NULL, alpha = 0.005, |
|
72 |
- vcf = NULL, tumor.id.in.vcf = 1, verbose = TRUE, ...) </code></pre> |
|
73 |
-<p>It is also possible to provide segmented file, which we however only recommend when matched SNP6 data is available (otherwise it is better to customize the segmentation function as described above). The expected file format is:</p> |
|
74 |
-<pre><code>ID chrom loc.start loc.end num.mark seg.mean |
|
75 |
-Sample1 1 61723 5773942 2681 0.125406444072723 |
|
76 |
-Sample1 1 5774674 5785170 10 -0.756511807441712</code></pre> |
|
77 |
-<h2 id="output">Output</h2> |
|
78 |
-<p>The plotAbs() call above will generate the main plots shown in the manuscript. The R data file (file.rds) contains gene level copy number calls, SNV status and LOH calls. The purity/ploidy combinations are sorted by likelihood at stored in ret$results.</p> |
|
79 |
-<pre><code>names(ret) |
|
80 |
-head(ret$results[[1]]$gene.calls, 3) |
|
81 |
-chr start end C seg.mean seg.id number.exons gene.mean gene.min |
|
82 |
-TP73 chr1 3598833 3649721 3 0.3434 1 14 0.1755614 -0.3861084 |
|
83 |
-CHD5 chr1 6161904 6240126 3 0.3434 1 44 0.3059672 -0.8812580 |
|
84 |
-MTOR chr1 11166594 11319542 3 0.3434 1 62 0.2360036 -0.1346406 |
|
85 |
-gene.max focal num.snps.loh.segment percentage.loh.in.loh.segment |
|
86 |
-TP73 0.6437573 FALSE 0 0 |
|
87 |
-CHD5 1.5984124 FALSE 49 0 |
|
88 |
-MTOR 0.6705342 FALSE 49 0</code></pre> |
|
89 |
-<p>This data.frame also contains gene level LOH information. The SNV posteriors:</p> |
|
90 |
-<pre><code>head(ret$results[[1]]$SNV.posterior$beta.model$posteriors, 3) |
|
91 |
- seqnames start end SOMATIC.M0 SOMATIC.M1 SOMATIC.M2 |
|
92 |
-rs6696489 chr1 6162054 6162054 0 9.155037e-04 2.938312e-07 |
|
93 |
-rs59788818 chr1 6171992 6171992 0 2.364795e-05 3.397817e-07 |
|
94 |
-rs2843493 chr1 6184092 6184092 0 9.362524e-16 2.714454e-04 |
|
95 |
- SOMATIC.M3 SOMATIC.M4 SOMATIC.M5 SOMATIC.M6 SOMATIC.M7 |
|
96 |
-rs6696489 6.592308e-21 2.743271e-76 2.002534e-167 3.562023e-275 0 |
|
97 |
-rs59788818 3.491208e-34 1.711087e-93 7.197813e-188 2.275374e-298 0 |
|
98 |
-rs2843493 1.037518e-11 5.167773e-67 3.034251e-158 3.725864e-266 0 |
|
99 |
- GERMLINE.M0 GERMLINE.M1 GERMLINE.M2 GERMLINE.M3 GERMLINE.M4 |
|
100 |
-rs6696489 1.723074e-04 9.989057e-01 6.216658e-06 3.302211e-27 1.067221e-82 |
|
101 |
-rs59788818 5.836546e-23 9.999757e-01 2.683928e-07 1.233256e-51 2.340805e-111 |
|
102 |
-rs2843493 8.802336e-49 1.023147e-07 9.997285e-01 2.798399e-18 3.718923e-74 |
|
103 |
- GERMLINE.M5 GERMLINE.M6 GERMLINE.M7 GERMLINE.CONTHIGH |
|
104 |
-rs6696489 6.678185e-174 1.070845e-281 0 3.060961e-62 |
|
105 |
-rs59788818 5.523414e-206 1.183050e-316 0 8.253575e-127 |
|
106 |
-rs2843493 9.760218e-166 6.969108e-274 0 5.841728e-59 |
|
107 |
- GERMLINE.CONTLOW ML.SOMATIC ML.M ML.C ML.AR AR |
|
108 |
-rs6696489 2.105422e-22 FALSE 1 3 0.3571429 0.2803790 |
|
109 |
-rs59788818 3.106132e-86 FALSE 1 3 0.3571429 0.3957373 |
|
110 |
-rs2843493 4.644636e-138 FALSE 2 3 0.6428571 0.6259540 |
|
111 |
- CN.Subclonal Log.Ratio Prior.Somatic Prior.Contamination ML.LOH |
|
112 |
-rs6696489 FALSE 1.5984124 0.0004950495 0.01 FALSE |
|
113 |
-rs59788818 FALSE 0.2600908 0.0004950495 0.01 FALSE |
|
114 |
-rs2843493 FALSE 0.7559721 0.0004950495 0.01 FALSE |
|
115 |
- num.snps.loh.segment percentage.loh.in.loh.segment |
|
116 |
-rs6696489 49 0 |
|
117 |
-rs59788818 49 0 |
|
118 |
-rs2843493 49 0</code></pre> |
|
119 |
-<p>This lists all posterior probabilities for all possible SNV states.</p> |
120 | 0 |
deleted file mode 100755 |
... | ... |
@@ -1,191 +0,0 @@ |
1 |
-# Quick Start |
|
2 |
- |
|
3 |
-## Basic Input Files |
|
4 |
- |
|
5 |
-You will need coverage files in GATK DepthOfCoverage format for tumor and a control sample: |
|
6 |
- |
|
7 |
- Target total_coverage average_coverage Sample1_total_cvg Sample1_mean_cvg Sample1_granular_Q1 Sample1_granular_median Sample1_granular_Q3 Sample1_._above_15 |
|
8 |
- chr1:69091-70009 0 0 0 0 1 1 1 0 |
|
9 |
- chr1:367659-368598 6358 9.25121153819759 6358 9.25121153819759 1 7 13 11.6 |
|
10 |
- chr1:621096-622035 6294 9.16910019318401 6294 9.16910019318401 1 7 12 9.5 |
|
11 |
- |
|
12 |
-PureCN will look for the columns "Target", "total_coverage", and |
|
13 |
-"average_coverage". The algorithm works best when the coverage data is already |
|
14 |
-GC normalized. The steps below will NOT GC normalize. |
|
15 |
- |
|
16 |
-You will also need a VCF file with both somatic and germline variants, for |
|
17 |
-example as obtained by MuTect. |
|
18 |
- |
|
19 |
-## First Run |
|
20 |
- |
|
21 |
-In R: |
|
22 |
- |
|
23 |
- library(PureCN) |
|
24 |
- ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, |
|
25 |
- gatk.tumor.file=gatk.tumor.file, |
|
26 |
- vcf.file=vcf.file) |
|
27 |
- |
|
28 |
-To obtain gene level copy number calls, we need an exon-to-gene mapping file. |
|
29 |
-This is provided together with GC content via the gc.gene.file argument. The |
|
30 |
-expected format: |
|
31 |
- |
|
32 |
- Target gc_bias Gene |
|
33 |
- chr1:69091-70009 0.427638737758433 OR4F5 |
|
34 |
- chr1:367659-368598 0.459574468085106 OR4F29 |
|
35 |
- chr1:621096-622035 0.459574468085106 OR4F3 |
|
36 |
- |
|
37 |
-## Pool of Normals |
|
38 |
- |
|
39 |
-If a pool of normal samples is available, we can use the coverage data to |
|
40 |
-estimate the expected variance in coverage per exon. This will help the |
|
41 |
-segmentation algorithm to filter noise: |
|
42 |
- |
|
43 |
- gatk.normal.files <- dir("poolofnormals", |
|
44 |
- pattern="coverage.sample_interval_summary", full.names=TRUE) |
|
45 |
- createExonWeightFile(gatk.normal.files[1:2], gatk.normal.files[-(1:2)], "exon_weights.txt") |
|
46 |
- |
|
47 |
-This will calculate log-ratios for all normals using the first two as tumor and |
|
48 |
-the remaining ones as normal. This estimates then exon level coverage standard |
|
49 |
-deviations. |
|
50 |
- |
|
51 |
-We can also use the pool of normal to find SNPs with biased allelic fractions |
|
52 |
-(significantly different from 0.5 for heterozygous SNPs): |
|
53 |
- |
|
54 |
- mutect.normal.files <- dir("poolofnormals", pattern="vcf$", full.names=TRUE) |
|
55 |
- snp.blacklist <- createSNPBlacklist(mutect.normal.files) |
|
56 |
- write.csv(snp.bl, file="SNP_blacklist.csv") |
|
57 |
- write.csv(snp.bl, file="SNP_blacklist_segmented.csv", row.names=FALSE, quote=FALSE) |
|
58 |
- |
|
59 |
-Finally, we can use the pool of normals to find normal samples for log-ratio |
|
60 |
-calculation, especially when no matched normal sample is available: |
|
61 |
- |
|
62 |
- if (!file.exists("normalDB.rds")) { |
|
63 |
- normalDB <- createNormalDatabase(gatk.normal.files) |
|
64 |
- saveRDS(normalDB, "normalDB.rds") |
|
65 |
- } else { |
|
66 |
- normalDB <- readRDS("normalDBe4.rds") |
|
67 |
- } |
|
68 |
- |
|
69 |
- # get the best 4 normals and average them |
|
70 |
- gatk.normal.file <- findBestNormal(gatk.tumor.file, normalDB, num.normals=4) |
|
71 |
- pool <- poolCoverage(lapply(gatk.normal.file, read.coverage.gatk), remove.chrs=c('chrX', 'chrY')) |
|
72 |
- |
|
73 |
- # get the best normal |
|
74 |
- gatk.normal.file <- findBestNormal(gatk.tumor.file, normalDB) |
|
75 |
- |
|
76 |
-## VCF Artifact Filtering |
|
77 |
- |
|
78 |
-If MuTect was run in matched normal mode, then all germline variants are |
|
79 |
-rejected, that means we cannot just filter by the PASS/REJECT MuTect flags. The |
|
80 |
-filterVcfMuTect function optionally reads the MuTect stats file and will keep |
|
81 |
-germline variants, while removing potential artifacts. Otherwise it will use |
|
82 |
-only the filters based on read depths as defined in filterVcfBasic. |
|
83 |
- |
|
84 |
-## Recommended Run |
|
85 |
- |
|
86 |
-Finally, we can run PureCN with all that information: |
|
87 |
- |
|
88 |
- ret <-runAbsoluteCN(gatk.normal.file=pool, gatk.tumor.file=gatk.tumor.file, |
|
89 |
- vcf.file=vcf.file, sampleid='Sample1', gc.gene.file=gc.gene.file, |
|
90 |
- args.filterVcf=list(snp.blacklist=snp.blacklist, stats.file=mutect.stats.file), |
|
91 |
- args.segmentation=list(exon.weight.file=exon.weight.file), |
|
92 |
- post.optimize=TRUE) |
|
93 |
- |
|
94 |
-The post.optimize flag will increase the runtime by a lot, but might be worth |
|
95 |
-it if the copy number profile is not very clean. This is recommended with lower |
|
96 |
-coverage datasets or if there are significant capture biases. For high quality |
|
97 |
-whole exome data, this is typically not necessary. |
|
98 |
- |
|
99 |
-We also need to create a few output files: |
|
100 |
- |
|
101 |
- file.rds <- 'Sample1_PureCN.rds' |
|
102 |
- saveRDS(ret, file=file.rds) |
|
103 |
- pdf('Sample1_PureCN.pdf', width=10, height=12) |
|
104 |
- plotAbs(ret, type='all') |
|
105 |
- dev.off() |
|
106 |
- |
|
107 |
-## Manual Curation |
|
108 |
- |
|
109 |
-For prediction of SNV status (germline vs somatic, sub-clonal vs. clonal, |
|
110 |
-homozyous vs heterozygous), it is important that both purity and ploidy are |
|
111 |
-correct. We provide functionality for curating results: |
|
112 |
- |
|
113 |
- createCurationFile(file.rds) |
|
114 |
- |
|
115 |
-This will generate a CSV file, in which the correct purity and ploidy values |
|
116 |
-can be manually entered. It also contains a column "Curated", which should be |
|
117 |
-set to TRUE, otherwise the file will be overwritten when re-run. |
|
118 |
- |
|
119 |
-Then in R, the correct solution (closest to the combination in the CSV file) |
|
120 |
-can be loaded with the readCurationFile function: |
|
121 |
- |
|
122 |
- ret <- readCurationFile(file.rds) |
|
123 |
- |
|
124 |
-## Custom Segmentation |
|
125 |
- |
|
126 |
-By default, we will use DNAcopy to segment the log-ratio. You can easily change |
|
127 |
-that to your favorite method and the segmentationCBS function can serve as an |
|
128 |
-example. |
|
129 |
- |
|
130 |
- segmentationCBS <- function (normal, tumor, log.ratio, plot.cnv, coverage.cutoff, |
|
131 |
- sampleid = sampleid, exon.weight.file = NULL, alpha = 0.005, |
|
132 |
- vcf = NULL, tumor.id.in.vcf = 1, verbose = TRUE, ...) |
|
133 |
- |
|
134 |
-It is also possible to provide segmented file, which we however only recommend |
|
135 |
-when matched SNP6 data is available (otherwise it is better to customize the |
|
136 |
-segmentation function as described above). The expected file format is: |
|
137 |
- |
|
138 |
- ID chrom loc.start loc.end num.mark seg.mean |
|
139 |
- Sample1 1 61723 5773942 2681 0.125406444072723 |
|
140 |
- Sample1 1 5774674 5785170 10 -0.756511807441712 |
|
141 |
- |
|
142 |
-## Output |
|
143 |
- |
|
144 |
-The plotAbs() call above will generate the main plots shown in the manuscript. The R data file (file.rds) contains gene level copy number calls, SNV status and LOH calls. |
|
145 |
-The purity/ploidy combinations are sorted by likelihood at stored in ret$results. |
|
146 |
- |
|
147 |
- names(ret) |
|
148 |
- head(ret$results[[1]]$gene.calls, 3) |
|
149 |
- chr start end C seg.mean seg.id number.exons gene.mean gene.min |
|
150 |
- TP73 chr1 3598833 3649721 3 0.3434 1 14 0.1755614 -0.3861084 |
|
151 |
- CHD5 chr1 6161904 6240126 3 0.3434 1 44 0.3059672 -0.8812580 |
|
152 |
- MTOR chr1 11166594 11319542 3 0.3434 1 62 0.2360036 -0.1346406 |
|
153 |
- gene.max focal num.snps.loh.segment percentage.loh.in.loh.segment |
|
154 |
- TP73 0.6437573 FALSE 0 0 |
|
155 |
- CHD5 1.5984124 FALSE 49 0 |
|
156 |
- MTOR 0.6705342 FALSE 49 0 |
|
157 |
- |
|
158 |
-This data.frame also contains gene level LOH information. The SNV posteriors: |
|
159 |
- |
|
160 |
- head(ret$results[[1]]$SNV.posterior$beta.model$posteriors, 3) |
|
161 |
- seqnames start end SOMATIC.M0 SOMATIC.M1 SOMATIC.M2 |
|
162 |
- rs6696489 chr1 6162054 6162054 0 9.155037e-04 2.938312e-07 |
|
163 |
- rs59788818 chr1 6171992 6171992 0 2.364795e-05 3.397817e-07 |
|
164 |
- rs2843493 chr1 6184092 6184092 0 9.362524e-16 2.714454e-04 |
|
165 |
- SOMATIC.M3 SOMATIC.M4 SOMATIC.M5 SOMATIC.M6 SOMATIC.M7 |
|
166 |
- rs6696489 6.592308e-21 2.743271e-76 2.002534e-167 3.562023e-275 0 |
|
167 |
- rs59788818 3.491208e-34 1.711087e-93 7.197813e-188 2.275374e-298 0 |
|
168 |
- rs2843493 1.037518e-11 5.167773e-67 3.034251e-158 3.725864e-266 0 |
|
169 |
- GERMLINE.M0 GERMLINE.M1 GERMLINE.M2 GERMLINE.M3 GERMLINE.M4 |
|
170 |
- rs6696489 1.723074e-04 9.989057e-01 6.216658e-06 3.302211e-27 1.067221e-82 |
|
171 |
- rs59788818 5.836546e-23 9.999757e-01 2.683928e-07 1.233256e-51 2.340805e-111 |
|
172 |
- rs2843493 8.802336e-49 1.023147e-07 9.997285e-01 2.798399e-18 3.718923e-74 |
|
173 |
- GERMLINE.M5 GERMLINE.M6 GERMLINE.M7 GERMLINE.CONTHIGH |
|
174 |
- rs6696489 6.678185e-174 1.070845e-281 0 3.060961e-62 |
|
175 |
- rs59788818 5.523414e-206 1.183050e-316 0 8.253575e-127 |
|
176 |
- rs2843493 9.760218e-166 6.969108e-274 0 5.841728e-59 |
|
177 |
- GERMLINE.CONTLOW ML.SOMATIC ML.M ML.C ML.AR AR |
|
178 |
- rs6696489 2.105422e-22 FALSE 1 3 0.3571429 0.2803790 |
|
179 |
- rs59788818 3.106132e-86 FALSE 1 3 0.3571429 0.3957373 |
|
180 |
- rs2843493 4.644636e-138 FALSE 2 3 0.6428571 0.6259540 |
|
181 |
- CN.Subclonal Log.Ratio Prior.Somatic Prior.Contamination ML.LOH |
|
182 |
- rs6696489 FALSE 1.5984124 0.0004950495 0.01 FALSE |
|
183 |
- rs59788818 FALSE 0.2600908 0.0004950495 0.01 FALSE |
|
184 |
- rs2843493 FALSE 0.7559721 0.0004950495 0.01 FALSE |
|
185 |
- num.snps.loh.segment percentage.loh.in.loh.segment |
|
186 |
- rs6696489 49 0 |
|
187 |
- rs59788818 49 0 |
|
188 |
- rs2843493 49 0 |
|
189 |
- |
|
190 |
-This lists all posterior probabilities for all possible SNV states. |
|
191 |
- |
... | ... |
@@ -1,12 +1,14 @@ |
1 | 1 |
\name{createExonWeightFile} |
2 | 2 |
\alias{createExonWeightFile} |
3 | 3 |
\title{Calculate exon weights} |
4 |
-\description{Creates an exon weight file useful for segmentation, by down-weighting unreliable exons.} |
|
4 |
+\description{Creates an exon weight file useful for segmentation, by |
|
5 |
+down-weighting unreliable exons.} |
|
5 | 6 |
\usage{createExonWeightFile(gatk.tumor.files, gatk.normal.files, |
6 | 7 |
exon.weight.file)} |
7 | 8 |
\arguments{ |
8 | 9 |
\item{gatk.tumor.files}{A small number (1-3) of GATK tumor coverage samples.} |
9 |
- \item{gatk.normal.files}{A large number of GATK normal coverage samples (>20) to estimate exon log-ratio standard deviations.} |
|
10 |
+ \item{gatk.normal.files}{A large number of GATK normal coverage samples (>20) |
|
11 |
+to estimate exon log-ratio standard deviations.} |
|
10 | 12 |
\item{exon.weight.file}{Output filename.} |
11 | 13 |
} |
12 | 14 |
|
... | ... |
@@ -1,10 +1,12 @@ |
1 | 1 |
\name{createNormalDatabase} |
2 | 2 |
\alias{createNormalDatabase} |
3 | 3 |
\title{createNormalDatabase} |
4 |
-\description{Function to create a database of normal samples, used to find a good match for tumor copy number normalization.} |
|
4 |
+\description{Function to create a database of normal samples, used to find |
|
5 |
+a good match for tumor copy number normalization.} |
|
5 | 6 |
\usage{createNormalDatabase(gatk.normal.files, ...)} |
6 | 7 |
\arguments{ |
7 |
- \item{gatk.normal.files}{Vector with file names pointing to GATK coverage files of normal samples. } |
|
8 |
+ \item{gatk.normal.files}{Vector with file names pointing to GATK coverage files |
|
9 |
+of normal samples. } |
|
8 | 10 |
\item{\dots}{Arguments passed to the prcomp function.} |
9 | 11 |
} |
10 | 12 |
|
... | ... |
@@ -3,15 +3,15 @@ |
3 | 3 |
\title{findBestNormal} |
4 | 4 |
\description{Function to find the best matching normal for a provided tumor sample.} |
5 | 5 |
\usage{findBestNormal(gatk.tumor.file, normalDB, pcs = 1:3, |
6 |
- num.normals = 1, verbose = TRUE)} |
|
6 |
+ num.normals = 1, ignore.sex = FALSE, verbose = TRUE)} |
|
7 | 7 |
\arguments{ |
8 | 8 |
\item{gatk.tumor.file}{GATK coverage file of a tumor sample.} |
9 | 9 |
\item{normalDB}{Database of normal samples, created with createNormalDatabase().} |
10 | 10 |
\item{pcs}{Principal components to use for distance calculation.} |
11 |
- \item{num.normals}{ |
|
12 |
-} |
|
13 |
- \item{verbose}{ |
|
14 |
-} |
|
11 |
+ \item{num.normals}{Return the num.normals best normals.} |
|
12 |
+ \item{ignore.sex}{If FALSE, detects sex of sample returns a best normal |
|
13 |
+with matching sex.} |
|
14 |
+ \item{verbose}{Verbose output.} |
|
15 | 15 |
} |
16 | 16 |
|
17 | 17 |
\value{Return the filename of the best matching normal } |
18 | 18 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,20 @@ |
1 |
+\name{getSexFromCoverage} |
|
2 |
+\alias{getSexFromCoverage} |
|
3 |
+\title{Get sample sex from coverage} |
|
4 |
+\description{This function determines the sex of a sample by the coverage |
|
5 |
+ratio of chrX and chrY.} |
|
6 |
+\usage{getSexFromCoverage(gatk.coverage, min.ratio = 20, verbose = TRUE)} |
|
7 |
+\arguments{ |
|
8 |
+ \item{gatk.coverage}{GATK coverage file or data read with readCoverageGatk.} |
|
9 |
+ \item{min.ratio}{Min chrX/chrY coverage ratio to call sample as female.} |
|
10 |
+ \item{verbose}{Verbose output.} |
|
11 |
+} |
|
12 |
+ |
|
13 |
+\value{Returns "M" for male, "F" for female, or NULL if unknown. } |
|
14 |
+ |
|
15 |
+\author{Markus Riester} |
|
16 |
+ |
|
17 |
+ |
|
18 |
+ |
|
19 |
+ |
|
20 |
+ |
... | ... |
@@ -1,13 +1,16 @@ |
1 | 1 |
\name{runAbsoluteCN} |
2 | 2 |
\alias{runAbsoluteCN} |
3 |
-\title{Run our implementation of ABSOLUTE} |
|
4 |
-\description{This function takes as input tumor and normal control coverage and allelic fractions of germline variants and somatic mutations. |
|
5 |
-Coverage data is provied in GATK DepthOfCoverage format, allelic fraction in VCF format (e.g. obtained by MuTect). |
|
6 |
-Normal control does not need to be matched (from the same patient). In case VCF does not contain somatic status, it should contain |
|
7 |
-dbSNP and optionally COSMIC annotation.} |
|
3 |
+\title{Run PureCN implementation of ABSOLUTE} |
|
4 |
+\description{This function takes as input tumor and normal control coverage and |
|
5 |
+allelic fractions of germline variants and somatic mutations. |
|
6 |
+Coverage data is provied in GATK DepthOfCoverage format, allelic fraction |
|
7 |
+in VCF format (e.g. obtained by MuTect). Normal control does not need to |
|
8 |
+be matched (from the same patient). In case VCF does not contain somatic |
|
9 |
+status, it should contain dbSNP and optionally COSMIC annotation.} |
|
8 | 10 |
\usage{runAbsoluteCN(gatk.normal.file = NULL, gatk.tumor.file, |
9 | 11 |
log.ratio = NULL, seg.file = NULL, seg.file.sdev = 0.4, |
10 |
- vcf.file = NULL, genome = "hg19", fun.filterVcf = filterVcfMuTect, |
|
12 |
+ vcf.file = NULL, genome = "hg19", sex = c("?", |
|
13 |
+ "F", "M"), fun.filterVcf = filterVcfMuTect, |
|
11 | 14 |
args.filterVcf = list(), fun.setPriorVcf = setPriorVcf, |
12 | 15 |
args.setPriorVcf = list(), fun.segmentation = segmentationCBS, |
13 | 16 |
args.segmentation = list(), fun.focal = findFocal, |
... | ... |
@@ -15,46 +18,109 @@ dbSNP and optionally COSMIC annotation.} |
15 | 18 |
max.ploidy = 6, test.num.copy = 0:7, test.purity = seq(0.05, |
16 | 19 |
0.95, by = 0.01), prior.purity = rep(1, length(test.purity))/length(test.purity), |
17 | 20 |
max.candidate.solutions = 15, candidates = NULL, |
18 |
- coverage.cutoff = 15, max.non.clonal = 0.2, iterations = 30, |
|
19 |
- log.ratio.calibration = 0.25, gc.gene.file = NULL, |
|
20 |
- filter.lowhigh.gc.exons = 0.001, filter.targeted.base = 4, |
|
21 |
- max.logr.sdev = 0.75, plot.cnv = TRUE, verbose = TRUE, |
|
21 |
+ coverage.cutoff = 15, max.non.clonal = 0.2, max.homozygous.loss = 0.1, |
|
22 |
+ iterations = 30, log.ratio.calibration = 0.25, |
|
23 |
+ gc.gene.file = NULL, filter.lowhigh.gc.exons = 0.001, |
|
24 |
+ filter.targeted.base = 4, max.logr.sdev = 0.75, |
|
25 |
+ max.segments = 200, plot.cnv = TRUE, verbose = TRUE, |
|
22 | 26 |
post.optimize = FALSE, ...)} |
23 | 27 |
\arguments{ |
24 |
- \item{gatk.normal.file}{GATK coverage file of normal control (optional if log.ratio is provided - then it will be only used to filter low coverage exons). Should be already GC-normalized.} |
|
25 |
- \item{gatk.tumor.file}{GATK coverage file of tumor. Should be already GC-normalized.} |
|
26 |
- \item{log.ratio}{Copy number log-ratios for all exons in the coverage files. If NULL, calculated based on coverage files.} |
|
27 |
- \item{seg.file}{Segmented data. Optional, to support matched SNP6 data. If null, use coverage files or log.ratio to segment the data. } |
|
28 |
- \item{seg.file.sdev}{If seg.file provided, the log-ratio standard deviation, used to model likelihood of sub-clonal copy number events.} |
|
29 |
- \item{vcf.file}{VCF file, tested with MuTect output files. Optional, but typically needed to select between local optima of similar likelihood. Can also be a CollapsedVCF, read with the readVcf function. Requires a DB info flag for dbSNP membership. The default fun.setPriorVcf function will also look for a Cosmic.CNT slot, containing the hits in the COSMIC database. Again, do not expect very useful results without a VCF file.} |
|
28 |
+ \item{gatk.normal.file}{GATK coverage file of normal control (optional if |
|
29 |
+log.ratio is provided - then it will be only used to filter low coverage |
|
30 |
+exons). Should be already GC-normalized. Needs to be either a file name |
|
31 |
+or data read with the readCoverageGatk function.} |
|
32 |
+ \item{gatk.tumor.file}{GATK coverage file of tumor. Should be already |
|
33 |
+GC-normalized. Needs to be either a file name or data read with the |
|
34 |
+readCoverageGatk function.} |
|
35 |
+ \item{log.ratio}{Copy number log-ratios for all exons in the coverage files. |
|
36 |
+If NULL, calculated based on coverage files.} |
|
37 |
+ \item{seg.file}{Segmented data. Optional, to support matched SNP6 data. |
|
38 |
+If null, use coverage files or log.ratio to segment the data. } |
|
39 |
+ \item{seg.file.sdev}{If seg.file provided, the log-ratio standard deviation, |
|
40 |
+used to model likelihood of sub-clonal copy number events.} |
|
41 |
+ \item{vcf.file}{VCF file, tested with MuTect output files. Optional, but |
|
42 |
+typically needed to select between local optima of similar likelihood. Can |
|
43 |
+also be a CollapsedVCF, read with the readVcf function. Requires a DB info |
|
44 |
+flag for dbSNP membership. The default fun.setPriorVcf function will also |
|
45 |
+look for a Cosmic.CNT slot, containing the hits in the COSMIC database. |
|
46 |
+Again, do not expect very useful results without a VCF file.} |
|
30 | 47 |
\item{genome}{Genome version, required for the readVcf function.} |
31 |
- \item{fun.filterVcf}{Function for filtering variants. Expected output is a list with elements vcf (CollapsedVCF), flag (TRUE/FALSE) and flag_comment (string). The flags will be added to the output data and can be used to warn users, for example when samples look too noisy. Default filter will remove variants flagged by MuTect, but will keep germline variants. If ran in matched normal mode, it will by default use somatic status of variants and filter non-somatic calls with allelic fraction significantly different from 0.5 in normal. } |
|
32 |
- \item{args.filterVcf}{Arguments for variant filtering function. Arguments vcf, tumor.id.in.vcf, coverage.cutoff and verbose are required in the filter function and are automatically set (do NOT set them here again).} |
|
33 |
- \item{fun.setPriorVcf}{Function to set prior for somatic status for each variant in the VCF.} |
|
48 |
+ \item{sex}{Sex of sample. If ?, detect.} |
|
49 |
+ \item{fun.filterVcf}{Function for filtering variants. Expected output is a |
|
50 |
+list with elements vcf (CollapsedVCF), flag (TRUE/FALSE) and flag_comment |
|
51 |
+(string). The flags will be added to the output data and can be used to |
|
52 |
+warn users, for example when samples look too noisy. Default filter will |
|
53 |
+remove variants flagged by MuTect, but will keep germline variants. If |
|
54 |
+ran in matched normal mode, it will by default use somatic status of |
|
55 |
+variants and filter non-somatic calls with allelic fraction significantly |
|
56 |
+different from 0.5 in normal. } |
|
57 |
+ \item{args.filterVcf}{Arguments for variant filtering function. Arguments |
|
58 |
+vcf, tumor.id.in.vcf, coverage.cutoff and verbose are required in the |
|
59 |
+filter function and are automatically set (do NOT set them here again).} |
|
60 |
+ \item{fun.setPriorVcf}{Function to set prior for somatic status for each |
|
61 |
+variant in the VCF.} |
|
34 | 62 |
\item{args.setPriorVcf}{Arguments for somatic prior function.} |
35 |
- \item{fun.segmentation}{Function for segmenting the copy number log-ratios. Expected return value is a list with elements seg (the segmentation) and size (the size in bp for all segments).} |
|
36 |
- \item{args.segmentation}{Arguments for segmentation function. Arguments normal, tumor, log.ratio, plot.cnv, coverage.cutoff, sampleid, vcf, tumor.id.in.vcf, verbose are required in the segmentation function and automatically set (do NOT set them here again).} |
|
63 |
+ \item{fun.segmentation}{Function for segmenting the copy number log-ratios. |
|
64 |
+Expected return value is a list with elements seg (the segmentation) and |
|
65 |
+size (the size in bp for all segments).} |
|
66 |
+ \item{args.segmentation}{Arguments for segmentation function. Arguments |
|
67 |
+normal, tumor, log.ratio, plot.cnv, coverage.cutoff, sampleid, vcf, |
|
68 |
+tumor.id.in.vcf, verbose are required in the segmentation function and |
|
69 |
+automatically set (do NOT set them here again).} |
|
37 | 70 |
\item{fun.focal}{Function for identifying focal amplifications.} |
38 | 71 |
\item{args.focal}{Arguments for focal amplification function.} |
39 | 72 |
\item{sampleid}{Sample id, provided in output files etc.} |
40 | 73 |
\item{min.ploidy}{Minimum ploidy to be considered.} |
41 | 74 |
\item{max.ploidy}{Maximum ploidy to be considered.} |
42 |
- \item{test.num.copy}{Copy numbers tested in the grid search. Note that focal amplifications can have much higher copy numbers, but they will be labeled as subclonal (because they do not fit the integer copy numbers).} |
|
75 |
+ \item{test.num.copy}{Copy numbers tested in the grid search. Note that focal |
|
76 |
+amplifications can have much higher copy numbers, but they will be labeled |
|
77 |
+as subclonal (because they do not fit the integer copy numbers).} |
|
43 | 78 |
\item{test.purity}{Considered tumor purity values. } |
44 |
- \item{prior.purity}{Priors for purity if they are available. Only change when you know what you are doing.} |
|
45 |
- \item{max.candidate.solutions}{Number of local optima considered in optimization and variant fitting steps. If there are too many local optima, it will use specified number of top candidate solutions, but will also include all optima close to diploid, because silent genomes have often lots of local optima.} |
|
46 |
- \item{candidates}{Candidates to optimize from a previous run (return.object$candidates). If NULL, do 2D grid search and find local optima. } |
|
47 |
- \item{coverage.cutoff}{Minimum exon coverage in both normal and tumor. Exons with lower coverage are ingored. The cutoff choice depends on the expected purity and overall coverage. High purity samples might need a lower cutoff to call homozygous deletions. If an exon.weigh.file (below) is NOT specified, it is recommended to set a higher cutoff (e.g. 20) to remove noise from unreliable exon measurements. } |
|
48 |
- \item{max.non.clonal}{Maximum genomic fraction assigned to a subclonal copy number state.} |
|
49 |
- \item{iterations}{Maximum number of iterations in the Simulated Annealing copy number fit optimization.} |
|
50 |
- \item{log.ratio.calibration}{re-calibrate log-ratios in the window sd(log.ratio)*log.ratio.calibration.} |
|
51 |
- \item{gc.gene.file}{GC gene file, which assigns GC content and gene symbols to each exon in the coverage files. Used for generating gene level calls. First column in format CHR:START-END. Second column GC content (0 to 1). Third column gene symbol.} |
|
52 |
- \item{filter.lowhigh.gc.exons}{Quantile q (defines lower q and upper 1-q) for removing exons with outlier GC profile. Assuming that GC correction might not have been worked on those. Requires gc.gene.file.} |
|
53 |
- \item{filter.targeted.base}{Exclude exons with targeted base (size) smaller than this cutoff.} |
|
54 |
- \item{max.logr.sdev}{Flag noisy samples with segment log-ratio standard deviation larger than this. Assay specific and needs to be calibrated.} |
|
79 |
+ \item{prior.purity}{Priors for purity if they are available. Only change |
|
80 |
+when you know what you are doing.} |
|
81 |
+ \item{max.candidate.solutions}{Number of local optima considered in optimization |
|
82 |
+and variant fitting steps. If there are too many local optima, it will use |
|
83 |
+specified number of top candidate solutions, but will also include all |
|
84 |
+optima close to diploid, because silent genomes have often lots of local |
|
85 |
+optima.} |
|
86 |
+ \item{candidates}{Candidates to optimize from a previous run |
|
87 |
+(return.object$candidates). |
|
88 |
+If NULL, do 2D grid search and find local optima. } |
|
89 |
+ \item{coverage.cutoff}{Minimum exon coverage in both normal and tumor. Exons |
|
90 |
+with lower coverage are ingored. The cutoff choice depends on the expected |
|
91 |
+purity and overall coverage. High purity samples might need a lower cutoff |
|
92 |
+to call homozygous deletions. If an exon.weigh.file (below) is NOT |
|
93 |
+specified, it is recommended to set a higher cutoff (e.g. 20) to remove |
|
94 |
+noise from unreliable exon measurements. } |
|
95 |
+ \item{max.non.clonal}{Maximum genomic fraction assigned to a subclonal copy |
|
96 |
+number state.} |
|
97 |
+ \item{max.homozygous.loss}{Maximum genomic fraction assigned to homozygous loss. |
|
98 |
+This is set to a fairly high default value to not exclude correct |
|
99 |
+solutions, especially in noisy segmentations. } |
|
100 |
+ \item{iterations}{Maximum number of iterations in the Simulated Annealing copy |
|
101 |
+number fit optimization.} |
|
102 |
+ \item{log.ratio.calibration}{re-calibrate log-ratios in the window |
|
103 |
+sd(log.ratio)*log.ratio.calibration.} |
|
104 |
+ \item{gc.gene.file}{GC gene file, which assigns GC content and gene symbols |
|
105 |
+to each exon in the coverage files. Used for generating gene level calls. |
|
106 |
+First column in format CHR:START-END. Second column GC content (0 to 1). |
|
107 |
+Third column gene symbol.} |
|
108 |
+ \item{filter.lowhigh.gc.exons}{Quantile q (defines lower q and upper 1-q) |
|
109 |
+for removing exons with outlier GC profile. Assuming that GC correction |
|
110 |
+might not have been worked on those. Requires gc.gene.file.} |
|
111 |
+ \item{filter.targeted.base}{Exclude exons with targeted base (size) smaller |
|
112 |
+than this cutoff.} |
|
113 |
+ \item{max.logr.sdev}{Flag noisy samples with segment log-ratio standard deviation |
|
114 |
+larger than this. Assay specific and needs to be calibrated.} |
|
115 |
+ \item{max.segments}{Flag noisy samples with a large number of segments. Assay |
|
116 |
+specific and needs to be calibrated.} |
|
55 | 117 |
\item{plot.cnv}{Generate segmentation plots.} |
56 | 118 |
\item{verbose}{Verbose output.} |
57 |
- \item{post.optimize}{Optimize purity using final SCNA-fit and SNVs. This might take a long time when lots of SNVs need to be fitted, but will result in a slightly more accurate purity, especially for rather silent genomes or very low purities. Otherwise, it will just use the purity determined via the SCNA-fit.} |
|
119 |
+ \item{post.optimize}{Optimize purity using final SCNA-fit and SNVs. This |
|
120 |
+might take a long time when lots of SNVs need to be fitted, but will |
|
121 |
+typically result in a slightly more accurate purity, especially for rather |
|
122 |
+silent genomes or very low purities. Otherwise, it will just use the |
|
123 |
+purity determined via the SCNA-fit.} |
|
58 | 124 |
\item{\dots}{Additional parameters passed to the segmentation function.} |
59 | 125 |
} |
60 | 126 |
|
... | ... |
@@ -69,19 +135,24 @@ dbSNP and optionally COSMIC annotation.} |
69 | 135 |
|
70 | 136 |
|
71 | 137 |
\examples{ |
72 |
-gatk.normal.file <- system.file("extdata", "example_normal.txt", package="PureCN") |
|
73 |
-gatk.tumor.file <- system.file("extdata", "example_tumor.txt", package="PureCN") |
|
74 |
-vcf.file <- system.file("extdata", "example_vcf.vcf", package="PureCN") |
|
75 |
-gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt", package="PureCN") |
|
138 |
+gatk.normal.file <- system.file("extdata", "example_normal.txt", |
|
139 |
+ package="PureCN") |
|
140 |
+gatk.tumor.file <- system.file("extdata", "example_tumor.txt", |
|
141 |
+ package="PureCN") |
|
142 |
+vcf.file <- system.file("extdata", "example_vcf.vcf", |
|
143 |
+ package="PureCN") |
|
144 |
+gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt", |
|
145 |
+ package="PureCN") |
|
76 | 146 |
|
77 |
-# Speed-up the runAbsoluteCN by using the stored grid-search |
|
147 |
+# Speed-up the runAbsoluteCN call by using the stored grid-search |
|
78 | 148 |
# (purecn.example.output$candidates). |
79 | 149 |
data(purecn.example.output) |
80 | 150 |
|
81 | 151 |
# The max.candidate.solutions parameter is set to a very low value only to |
82 | 152 |
# speed-up this example. This is not a good idea for real samples. |
83 | 153 |
|
84 |
-ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, gatk.tumor.file=gatk.tumor.file, |
|
85 |
- candidates=purecn.example.output$candidates, max.candidate.solutions=2, |
|
154 |
+ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, |
|
155 |
+ gatk.tumor.file=gatk.tumor.file, |
|
156 |
+ candidates=purecn.example.output$candidates, max.candidate.solutions=2, |
|
86 | 157 |
vcf.file=vcf.file, sampleid='Sample1', gc.gene.file=gc.gene.file) |
87 | 158 |
} |
... | ... |
@@ -61,9 +61,10 @@ following steps on GC corrected data. All other assay-specific biases such as |
61 | 61 |
mappability or exon length are mitigated by using a control sample (GC bias is |
62 | 62 |
commonly library specific). |
63 | 63 |
|
64 |
-PureCN currently assumes a completely diploid normal genome, this means the user |
|
65 |
-is expected to remove any non-diploid chromosomes (i.e. XY for males and Y for |
|
66 |
-females). |
|
64 |
+PureCN currently assumes a completely diploid normal genome. PureCN tries to |
|
65 |
+detect the sex of samples by calculating the coverage ratio of chromosomes X |
|
66 |
+and Y and will then remove any non-diploid chromosomes (i.e. XY for males and Y |
|
67 |
+for females). |
|
67 | 68 |
|
68 | 69 |
We load a few example files: |
69 | 70 |
<<example_files>>= |