
added support for chromosome X

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/PureCN@116349 bc3139a8-67e5-0310-9ffc-ced21a209358

Markus Riester authored on 15/04/2016 13:36:59
Showing 16 changed files

... ...
@@ -2,16 +2,15 @@ Package: PureCN
2 2
 Type: Package
3 3
 Title: Estimating tumor purity, ploidy, LOH, and SNV status using
4 4
         hybrid capture NGS data
5
-Version: 0.99.6
5
+Version: 0.99.7
6 6
 Date: 2016-03-30
7 7
 Author: Markus Riester
8 8
 Maintainer: Markus Riester <[email protected]>
9 9
 Description: This package estimates tumor purity, copy number, loss of
10
-        heterozygosity (LOH), and status of short nucleotide variants
11
-        (SNVs). PureCN is designed for hybrid capture next generation
12
-        sequencing (NGS) data, integrates well with standard somatic
13
-        variant detection pipelines, and has support for tumor samples
14
-        without matching normal samples.
10
+    heterozygosity (LOH), and status of short nucleotide variants (SNVs). 
11
+    PureCN is designed for hybrid capture next generation sequencing (NGS) 
12
+    data, integrates well with standard somatic variant detection pipelines, 
13
+    and has support for tumor samples without matching normal samples.
15 14
 Depends: R (>= 3.3), DNAcopy, VariantAnnotation (>= 1.14.1)
16 15
 Imports: GenomicRanges (>= 1.20.3), IRanges (>= 2.2.1), RColorBrewer,
17 16
         S4Vectors, data.table, grDevices, graphics, stats, utils,
... ...
@@ -22,3 +21,4 @@ License: Artistic-2.0
22 21
 biocViews: CopyNumberVariation, Software, Sequencing,
23 22
         VariantAnnotation, VariantDetection
24 23
 NeedsCompilation: no
24
+Packaged: 2016-04-14 21:52:04 UTC; riestma1
... ...
@@ -17,6 +17,7 @@ export(findBestNormal)
17 17
 export(plotBestNormal)
18 18
 export(predictSomatic)
19 19
 export(plotAbs)
20
+export(getSexFromCoverage)
20 21
 import(VariantAnnotation)
21 22
 import(DNAcopy)
22 23
 importFrom("data.table", "data.table")
... ...
@@ -86,11 +86,6 @@ max.exon.ratio) {
86 86
                     Mi + g * (1 - p))/(p * lr.C[j] + 2 * (1 - p)), sd = sd.ar, log = TRUE) - 
87 87
                     log(lr.C[j] + 1), -Inf))))
88 88
             } else {
89
-                # p.ar <- lapply(c(0,1), function(g) lapply(1:length(ar_all), function(j)
90
-                # sapply(test.num.copy, function(Mi) ifelse(Mi <= lr.C[j], dbeta(x = (p * Mi + g
91
-                # * (1 - p))/(p * lr.C[j] + 2 * (1 - p)), shape1=ar_all[j]*dp_all[j]+1,
92
-                # shape2=(1-ar_all[j])*dp_all[j]+1, log = TRUE)-log(lr.C[j]+1+haploid.penalty) ,
93
-                # -Inf))))
94 89
                 p.ar <- lapply(c(0, 1), function(g) lapply(1:length(ar_all), function(j) sapply(test.num.copy, 
95 90
                   function(Mi) ifelse(Mi <= lr.C[j], dbeta(x = (p * Mi + g * (1 - 
96 91
                     p))/(p * lr.C[j] + 2 * (1 - p)), shape1 = ar_all[j] * dp_all[j] + 
... ...
@@ -265,7 +260,7 @@ max.exon.ratio) {
265 260
     return(result)
266 261
 }
267 262
 
268
-.flagResults <- function(results, max.non.clonal = 0.2, max.logr.sdev, logr.sdev, 
263
+.flagResults <- function(results, max.non.clonal = 0.2, max.logr.sdev, logr.sdev, max.segments,
269 264
     flag = NA, flag_comment = NA) {
270 265
     results <- lapply(results, .flagResult, max.non.clonal = max.non.clonal)
271 266
     
... ...
@@ -273,6 +268,7 @@ max.exon.ratio) {
273 268
     # results[[2]]$total.log.likelihood ) / abs( results[[1]]$total.log.likelihood )
274 269
     # if (ldiff < 0.1) { results[[1]]$flag <- TRUE results[[1]]$flag_comment <-
275 270
     # 'SIMILAR TO SECOND MOST LIKELY OPTIMUM' return(results) }
271
+    number.segments <- nrow(results[[1]]$seg)
276 272
     
277 273
     if (logr.sdev > max.logr.sdev) {
278 274
         for (i in 1:length(results)) {
... ...
@@ -281,6 +277,14 @@ max.exon.ratio) {
281 277
                 "NOISY LOG-RATIO")
282 278
         }
283 279
     }
280
+
281
+    if (number.segments > max.segments) {
282
+        for (i in 1:length(results)) {
283
+            results[[i]]$flag <- TRUE
284
+            results[[i]]$flag_comment <- .appendComment(results[[i]]$flag_comment, 
285
+                "NOISY SEGMENTATION")
286
+        }
287
+    }
284 288
     
285 289
     # some global flags
286 290
     if (!is.na(flag) && flag) {
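The segment-count check is applied inside the internal .flagResults helper, so it reaches users through the per-solution flag fields of the runAbsoluteCN return value. A minimal sketch of how one might inspect these flags (assuming `ret` was produced as in the package example; field names follow the code above):

    # Sketch: QC flags of the top-ranked solution. flag_comment will include
    # "NOISY SEGMENTATION" when the number of segments exceeds max.segments.
    ret$results[[1]]$flag
    ret$results[[1]]$flag_comment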
... ...
@@ -519,4 +523,10 @@ max.exon.ratio) {
519 523
     data(chr.hash, envir = environment())
520 524
     chr.hash[as.character(ls), 2]
521 525
 }
526
+.add.chr.name <- function(ls) {
527
+    chr.hash <- NULL
528
+    data(chr.hash, envir = environment())
529
+    as.character(chr.hash$chr[match(ls, chr.hash$number)])
530
+}
531
+    
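The new .add.chr.name helper is the inverse of .strip.chr.name above: it maps integer chromosome indices back to chromosome names via the packaged chr.hash data set. A toy illustration of the lookup, assuming chr.hash has columns `chr` and `number` as implied by the code (the real object is loaded with data(chr.hash)):

    # Hypothetical mini chr.hash with the assumed layout.
    chr.hash <- data.frame(chr=c("chr1", "chrX", "chrY"), number=c(1, 23, 24),
        row.names=c("chr1", "chrX", "chrY"), stringsAsFactors=FALSE)
    as.character(chr.hash$chr[match(c(23, 24), chr.hash$number)])
    # [1] "chrX" "chrY"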
522 532
  
... ...
@@ -1,10 +1,13 @@
1 1
 createNormalDatabase <- structure(function(
2
-### Function to create a database of normal samples, used to find a good match for tumor copy number normalization.
2
+### Function to create a database of normal samples, used to find 
3
+### a good match for tumor copy number normalization.
3 4
 gatk.normal.files,
4
-### Vector with file names pointing to GATK coverage files of normal samples. 
5
+### Vector with file names pointing to GATK coverage files 
6
+### of normal samples. 
5 7
 ...
6 8
 ### Arguments passed to the prcomp function.
7 9
 ) {
10
+    gatk.normal.files <- normalizePath(gatk.normal.files)
8 11
     normals <- lapply(gatk.normal.files, readCoverageGatk)
9 12
     normals.m <- do.call(cbind, lapply(normals, function(x) x$average.coverage))
10 13
     idx <- complete.cases(normals.m)
... ...
@@ -15,7 +18,8 @@ gatk.normal.files,
15 18
         exons.used=idx, 
16 19
         coverage=apply(normals.m, 2, mean, na.rm=TRUE), 
17 20
         exon.median.coverage=apply(normals.m,1,median, na.rm=TRUE),
18
-        exon.log2.sd.coverage=apply(log2(normals.m+1),1,sd, na.rm=TRUE)
21
+        exon.log2.sd.coverage=apply(log2(normals.m+1),1,sd, na.rm=TRUE),
22
+        sex=sapply(normals,getSexFromCoverage)
19 23
     )
20 24
 ### A normal database that can be used in the findBestNormal function to 
21 25
 ### retrieve good matching normal samples for a given tumor sample.
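The database now also stores the sex of every normal (determined with getSexFromCoverage), which findBestNormal can use for sex-matched selection. A brief sketch using the example coverage file shipped with the package (sex may come back NA if the example targets do not cover the allosomes):

    gatk.normal.files <- system.file("extdata", "example_normal.txt",
        package="PureCN")
    normalDB <- createNormalDatabase(gatk.normal.files)
    normalDB$sex   # one entry per normal; NA when sex could not be determined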
... ...
@@ -27,11 +31,13 @@ normalDB <- createNormalDatabase(gatk.normal.files)
27 31
 })    
28 32
 
29 33
 createExonWeightFile <- structure(function(# Calculate exon weights
30
-### Creates an exon weight file useful for segmentation, by down-weighting unreliable exons.
34
+### Creates an exon weight file useful for segmentation, by 
35
+### down-weighting unreliable exons.
31 36
 gatk.tumor.files, 
32 37
 ### A small number (1-3) of GATK tumor coverage samples.
33 38
 gatk.normal.files,
34
-### A large number of GATK normal coverage samples (>20) to estimate exon log-ratio standard deviations.
39
+### A large number of GATK normal coverage samples (>20) 
40
+### to estimate exon log-ratio standard deviations.
35 41
 exon.weight.file
36 42
 ### Output filename.
37 43
 ) {
... ...
@@ -51,7 +57,7 @@ exon.weight.file
51 57
     ret <- data.frame(Target=tumor.coverage[[1]][,1], Weights=zz)
52 58
 
53 59
     write.table(ret, file=exon.weight.file,row.names=FALSE, quote=FALSE, sep="\t")
54
-    ret
60
+    invisible(ret)
55 61
 ###A data.frame with exon weights.
56 62
 }, ex=function() {
57 63
 exon.weight.file <- "exon_weights.txt"
... ...
@@ -7,13 +7,16 @@ normalDB,
7 7
 pcs=1:3,
8 8
 ### Principal components to use for distance calculation.
9 9
 num.normals=1,
10
-## Return the num.normals best normals.
10
+### Return the num.normals best normals.
11
+ignore.sex=FALSE,
12
+### If FALSE, detects the sex of the sample and returns a best normal 
13
+### with matching sex.
11 14
 verbose=TRUE
15
+### Verbose output.
12 16
 ) {
13 17
     if (is.character(gatk.tumor.file)) {
14 18
         tumor  <- readCoverageGatk(gatk.tumor.file)
15 19
     } else {
16
-        if (verbose) message("gatk.tumor.file does not appear to be a filename, assuming it is valid GATK coverage data.")
17 20
         tumor <- gatk.tumor.file
18 21
     }    
19 22
     x <- t(tumor[normalDB$exons.used,"average.coverage", drop=FALSE])
... ...
@@ -22,8 +25,22 @@ verbose=TRUE
22 25
     idx.pcs <- pcs
23 26
     idx.pcs <- idx.pcs[idx.pcs %in% 1:ncol(normalDB$pca$x)]
24 27
 
25
-    best.match <- order(sapply(1:length(normalDB$gatk.normal.files), function(i) dist(  rbind(predict(normalDB$pca, x)[1,idx.pcs], predict(normalDB$pca)[i,idx.pcs]))[1]))
26
-    normalDB$gatk.normal.files[head(best.match, num.normals)]
28
+    idx.normals <- seq_along(normalDB$gatk.normal.files)
29
+
30
+    if (!ignore.sex && !is.null(normalDB$sex) && sum(!is.na(normalDB$sex))>0) {
31
+        sex <- getSexFromCoverage(tumor, verbose=FALSE)
32
+        if (verbose) message(paste("Sex of sample:", sex))
33
+        if (!is.na(sex)) {
34
+            idx.normals <- which(normalDB$sex == sex)
35
+        }
36
+        if (length(idx.normals) < 2) { 
37
+            warning(paste("Not enough samples of sex", sex, "in database. Ignoring sex."))
38
+            idx.normals <- seq_along(normalDB$gatk.normal.files)
39
+        }    
40
+    }    
41
+
42
+    best.match <- order(sapply(idx.normals, function(i) dist(  rbind(predict(normalDB$pca, x)[1,idx.pcs], predict(normalDB$pca)[i,idx.pcs]))[1]))
43
+    normalDB$gatk.normal.files[idx.normals][head(best.match, num.normals)]
27 44
 ### Return the filename of the best matching normal    
28 45
 },ex=function() {
29 46
 gatk.normal.file <- system.file("extdata", "example_normal.txt", package="PureCN")
30 47
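With these changes, findBestNormal restricts the candidate normals to those of the same sex as the tumor whenever the database carries sex annotations; ignore.sex=TRUE restores the previous behavior. A hedged sketch with the packaged example data (with only one normal in the database, the sex filter falls back with a warning):

    gatk.normal.files <- system.file("extdata", "example_normal.txt",
        package="PureCN")
    gatk.tumor.file <- system.file("extdata", "example_tumor.txt",
        package="PureCN")
    normalDB <- createNormalDatabase(gatk.normal.files)
    findBestNormal(gatk.tumor.file, normalDB)                   # sex-matched if possible
    findBestNormal(gatk.tumor.file, normalDB, ignore.sex=TRUE)  # previous behavior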
new file mode 100755
... ...
@@ -0,0 +1,42 @@
1
+getSexFromCoverage <- function(# Get sample sex from coverage
2
+### This function determines the sex of a sample by the coverage 
3
+### ratio of chrX and chrY.
4
+gatk.coverage, 
5
+### GATK coverage file or data read with readCoverageGatk.
6
+min.ratio=20,
7
+### Min chrX/chrY coverage ratio to call sample as female.
8
+verbose=TRUE
9
+### Verbose output.
10
+) {
11
+    if (is.character(gatk.coverage)) {
12
+        x <- readCoverageGatk(gatk.coverage)
13
+    } else {
14
+        x <- gatk.coverage
15
+    }
16
+
17
+    sex.chr <- .getSexChr(x)
18
+    xx <- split(x$average.coverage, x$chr)
19
+    avg.coverage <- sapply(xx, mean, na.rm=TRUE)
20
+    if (is.na(avg.coverage[sex.chr[1]]) || is.na(avg.coverage[sex.chr[2]]) ) {
21
+        if (verbose) message("Allosome coverage appears to be missing, cannot determine sex.")
22
+        return(NA)
23
+    }    
24
+
25
+    autosome.ratio <- mean(avg.coverage[-match(sex.chr, names(avg.coverage))], na.rm=TRUE)/(avg.coverage[sex.chr[1]]+0.0001)
26
+    if (autosome.ratio > 5) { 
27
+        if (verbose) message("Allosome coverage very low, cannot determine sex.")
28
+        return(NA)
29
+    }
30
+    XY.ratio <- avg.coverage[sex.chr[1]]/ (avg.coverage[sex.chr[2]]+ 0.0001)
31
+    if (XY.ratio > min.ratio) return("F")
32
+    return("M")    
33
+### Returns "M" for male, "F" for female, or NULL if unknown.    
34
+}
35
+
36
+.getSexChr <- function(gatk.coverage) {
37
+    if ("chrX" %in% gatk.coverage$chr) {
38
+        return(c("chrX", "chrY"))
39
+    }
40
+    return(as.character(23:24))    
41
+}
42
+       
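The newly exported getSexFromCoverage can also be called directly on a GATK coverage file or on data read with readCoverageGatk. A minimal sketch with the packaged example file (it returns NA when the coverage does not include chrX/chrY, as may be the case for truncated test data):

    library(PureCN)
    gatk.tumor.file <- system.file("extdata", "example_tumor.txt",
        package="PureCN")
    getSexFromCoverage(gatk.tumor.file)                  # "M", "F", or NA
    getSexFromCoverage(gatk.tumor.file, min.ratio=25)    # stricter female call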
... ...
@@ -1,35 +1,67 @@
1 1
 runAbsoluteCN <-
2
-structure(function(# Run our implementation of ABSOLUTE
3
-### This function takes as input tumor and normal control coverage and allelic fractions of germline variants and somatic mutations.
4
-### Coverage data is provied in GATK DepthOfCoverage format, allelic fraction in VCF format (e.g. obtained by MuTect). 
5
-### Normal control does not need to be matched (from the same patient). In case VCF does not contain somatic status, it should contain
6
-### dbSNP and optionally COSMIC annotation.
2
+structure(function(# Run the PureCN implementation of ABSOLUTE
3
+### This function takes as input tumor and normal control coverage and 
4
+### allelic fractions of germline variants and somatic mutations.
5
+### Coverage data is provided in GATK DepthOfCoverage format, allelic fraction 
6
+### in VCF format (e.g. obtained by MuTect). Normal control does not need to 
7
+### be matched (from the same patient). In case VCF does not contain somatic 
8
+### status, it should contain dbSNP and optionally COSMIC annotation.
7 9
 gatk.normal.file=NULL, 
8
-### GATK coverage file of normal control (optional if log.ratio is provided - then it will be only used to filter low coverage exons). Should be already GC-normalized.
10
+### GATK coverage file of normal control (optional if 
11
+### log.ratio is provided - then it will be only used to filter low coverage 
12
+### exons). Should be already GC-normalized. Needs to be either a file name 
13
+### or data read with the readCoverageGatk function.
9 14
 gatk.tumor.file, 
10
-### GATK coverage file of tumor. Should be already GC-normalized.
15
+### GATK coverage file of tumor. Should be already 
16
+### GC-normalized. Needs to be either a file name or data read with the 
17
+### readCoverageGatk function.
11 18
 log.ratio=NULL, 
12
-### Copy number log-ratios for all exons in the coverage files. If NULL, calculated based on coverage files.
19
+### Copy number log-ratios for all exons in the coverage files. 
20
+### If NULL, calculated based on coverage files.
13 21
 seg.file=NULL,
14
-### Segmented data. Optional, to support matched SNP6 data. If null, use coverage files or log.ratio to segment the data.  
22
+### Segmented data. Optional, to support matched SNP6 data. 
23
+### If null, use coverage files or log.ratio to segment the data.  
15 24
 seg.file.sdev=0.4,
16
-### If seg.file provided, the log-ratio standard deviation, used to model likelihood of sub-clonal copy number events.
25
+### If seg.file provided, the log-ratio standard deviation, 
26
+### used to model likelihood of sub-clonal copy number events.
17 27
 vcf.file=NULL, 
18
-### VCF file, tested with MuTect output files.  Optional, but typically needed to select between local optima of similar likelihood. Can also be a CollapsedVCF, read with the readVcf function. Requires a DB info flag for dbSNP membership. The default fun.setPriorVcf function will also look for a Cosmic.CNT slot, containing the hits in the COSMIC database. Again, do not expect very useful results without a VCF file.
28
+### VCF file, tested with MuTect output files.  Optional, but
29
+### typically needed to select between local optima of similar likelihood. Can 
30
+### also be a CollapsedVCF, read with the readVcf function. Requires a DB info 
31
+### flag for dbSNP membership. The default fun.setPriorVcf function will also
32
+### look for a Cosmic.CNT slot, containing the hits in the COSMIC database.
33
+### Again, do not expect very useful results without a VCF file.
19 34
 genome="hg19",
20 35
 ### Genome version, required for the readVcf function.
36
+sex=c("?","F","M"),
37
+### Sex of sample. If "?", the sex is inferred from the coverage data.
21 38
 fun.filterVcf=filterVcfMuTect, 
22
-### Function for filtering variants. Expected output is a list with elements vcf (CollapsedVCF), flag (TRUE/FALSE) and flag_comment (string). The flags will be added to the output data and can be used to warn users, for example when samples look too noisy. Default filter will remove variants flagged by MuTect, but will keep germline variants. If ran in matched normal mode, it will by default use somatic status of variants and filter non-somatic calls with allelic fraction significantly different from 0.5 in normal. 
39
+### Function for filtering variants. Expected output is a 
40
+### list with elements vcf (CollapsedVCF), flag (TRUE/FALSE) and flag_comment 
41
+### (string). The flags will be added to the output data and can be used to 
42
+### warn users, for example when samples look too noisy. Default filter will 
43
+### remove variants flagged by MuTect, but will keep germline variants. If 
44
+### ran in matched normal mode, it will by default use somatic status of 
45
+### variants and filter non-somatic calls with allelic fraction significantly 
46
+### different from 0.5 in normal. 
23 47
 args.filterVcf=list(),
24
-### Arguments for variant filtering function. Arguments vcf, tumor.id.in.vcf, coverage.cutoff and verbose are required in the filter function and are automatically set (do NOT set them here again).
48
+### Arguments for variant filtering function. Arguments 
49
+### vcf, tumor.id.in.vcf, coverage.cutoff and verbose are required in the 
50
+### filter function and are automatically set (do NOT set them here again).
25 51
 fun.setPriorVcf=setPriorVcf,
26
-### Function to set prior for somatic status for each variant in the VCF.
52
+### Function to set prior for somatic status for each 
53
+### variant in the VCF.
27 54
 args.setPriorVcf=list(),
28 55
 ### Arguments for somatic prior function.
29 56
 fun.segmentation=segmentationCBS, 
30
-### Function for segmenting the copy number log-ratios. Expected return value is a list with elements seg (the segmentation) and size (the size in bp for all segments).
57
+### Function for segmenting the copy number log-ratios. 
58
+### Expected return value is a list with elements seg (the segmentation) and 
59
+### size (the size in bp for all segments).
31 60
 args.segmentation=list(),
32
-### Arguments for segmentation function. Arguments normal, tumor, log.ratio, plot.cnv, coverage.cutoff, sampleid, vcf, tumor.id.in.vcf, verbose are required in the segmentation function and automatically set (do NOT set them here again).
61
+### Arguments for segmentation function. Arguments 
62
+### normal, tumor, log.ratio, plot.cnv, coverage.cutoff, sampleid, vcf, 
63
+### tumor.id.in.vcf, verbose are required in the segmentation function and 
64
+### automatically set (do NOT set them here again).
33 65
 fun.focal=findFocal,
34 66
 ### Function for identifying focal amplifications.
35 67
 args.focal=list(),
... ...
@@ -41,37 +73,72 @@ min.ploidy=1,
41 73
 max.ploidy=6, 
42 74
 ### Maximum ploidy to be considered.
43 75
 test.num.copy=0:7, 
44
-### Copy numbers tested in the grid search. Note that focal amplifications can have much higher copy numbers, but they will be labeled as subclonal (because they do not fit the integer copy numbers).
76
+### Copy numbers tested in the grid search. Note that focal 
77
+### amplifications can have much higher copy numbers, but they will be labeled
78
+### as subclonal (because they do not fit the integer copy numbers).
45 79
 test.purity=seq(0.05,0.95,by=0.01), 
46 80
 ### Considered tumor purity values. 
47 81
 prior.purity=rep(1,length(test.purity))/length(test.purity), 
48
-### Priors for purity if they are available. Only change when you know what you are doing.
82
+### Priors for purity if they are available. Only change 
83
+### when you know what you are doing.
49 84
 max.candidate.solutions=15, 
50
-### Number of local optima considered in optimization and variant fitting steps. If there are too many local optima, it will use specified number of top candidate solutions, but will also include all optima close to diploid, because silent genomes have often lots of local optima.
85
+### Number of local optima considered in optimization 
86
+### and variant fitting steps. If there are too many local optima, it will use
87
+### specified number of top candidate solutions, but will also include all 
88
+### optima close to diploid, because silent genomes have often lots of local 
89
+### optima.
51 90
 candidates=NULL, 
52
-### Candidates to optimize from a previous run (return.object$candidates). If NULL, do 2D grid search and find local optima. 
91
+### Candidates to optimize from a previous run 
92
+### (return.object$candidates). 
93
+### If NULL, do 2D grid search and find local optima. 
53 94
 coverage.cutoff=15, 
54
-### Minimum exon coverage in both normal and tumor. Exons with lower coverage are ingored. The cutoff choice depends on the expected purity and overall coverage. High purity samples might need a lower cutoff to call homozygous deletions. If an exon.weigh.file (below) is NOT specified, it is recommended to set a higher cutoff (e.g. 20) to remove noise from unreliable exon measurements. 
95
+### Minimum exon coverage in both normal and tumor. Exons
96
+### with lower coverage are ignored. The cutoff choice depends on the expected
97
+### purity and overall coverage. High purity samples might need a lower cutoff
98
+### to call homozygous deletions. If an exon.weight.file (below) is NOT 
99
+### specified, it is recommended to set a higher cutoff (e.g. 20) to remove 
100
+### noise from unreliable exon measurements. 
55 101
 max.non.clonal=0.2, 
56
-### Maximum genomic fraction assigned to a subclonal copy number state.
102
+### Maximum genomic fraction assigned to a subclonal copy 
103
+### number state.
104
+max.homozygous.loss=0.1,
105
+### Maximum genomic fraction assigned to homozygous loss. 
106
+### This is set to a fairly high default value so as not to exclude correct
107
+### solutions, especially in noisy segmentations. 
57 108
 iterations=30, 
58
-### Maximum number of iterations in the Simulated Annealing copy number fit optimization.
109
+### Maximum number of iterations in the Simulated Annealing copy 
110
+### number fit optimization.
59 111
 log.ratio.calibration=0.25,
60
+### Re-calibrate log-ratios in the window 
112
+### re-calibrate log-ratios in the window 
113
+### sd(log.ratio)*log.ratio.calibration.
61 114
 gc.gene.file=NULL, 
62
-### GC gene file, which assigns GC content and gene symbols to each exon in the coverage files. Used for generating gene level calls. First column in format CHR:START-END. Second column GC content (0 to 1). Third column gene symbol.
115
+### GC gene file, which assigns GC content and gene symbols 
116
+### to each exon in the coverage files. Used for generating gene level calls. 
117
+### First column in format CHR:START-END. Second column GC content (0 to 1). 
118
+### Third column gene symbol.
63 119
 filter.lowhigh.gc.exons=0.001,
64
-### Quantile q (defines lower q and upper 1-q) for removing exons with outlier GC profile. Assuming that GC correction might not have been worked on those. Requires gc.gene.file.
120
+### Quantile q (defines lower q and upper 1-q) 
121
+### for removing exons with outlier GC profile. Assuming that GC correction 
122
+### might not have been worked on those. Requires gc.gene.file.
65 123
 filter.targeted.base=4,
66
-### Exclude exons with targeted base (size) smaller than this cutoff.
124
+### Exclude exons with targeted base (size) smaller 
125
+### than this cutoff.
67 126
 max.logr.sdev=0.75,
68
-### Flag noisy samples with segment log-ratio standard deviation larger than this. Assay specific and needs to be calibrated.
127
+### Flag noisy samples with segment log-ratio standard deviation 
128
+### larger than this. Assay specific and needs to be calibrated.
129
+max.segments=200,
130
+### Flag noisy samples with a large number of segments. Assay 
131
+### specific and needs to be calibrated.
69 132
 plot.cnv=TRUE, 
70 133
 ### Generate segmentation plots.
71 134
 verbose=TRUE, 
72 135
 ### Verbose output.
73 136
 post.optimize=FALSE,
74
-### Optimize purity using final SCNA-fit and SNVs. This might take a long time when lots of SNVs need to be fitted, but will result in a slightly more accurate purity, especially for rather silent genomes or very low purities. Otherwise, it will just use the purity determined via the SCNA-fit.
137
+### Optimize purity using final SCNA-fit and SNVs. This 
138
+### might take a long time when lots of SNVs need to be fitted, but will 
139
+### typically result in a slightly more accurate purity, especially for rather
140
+### silent genomes or very low purities. Otherwise, it will just use the 
141
+### purity determined via the SCNA-fit.
75 142
 ... 
76 143
 ### Additional parameters passed to the segmentation function.
77 144
 ) {
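The sketch below shows how the new arguments slot into a typical call; it mirrors the package example at the end of this file, and the values given for the new parameters are simply their defaults, listed only for illustration (input objects as defined in that example):

    ret <- runAbsoluteCN(gatk.normal.file=gatk.normal.file,
        gatk.tumor.file=gatk.tumor.file, vcf.file=vcf.file,
        sampleid="Sample1", gc.gene.file=gc.gene.file,
        sex="?",                   # infer sex from coverage (new)
        max.segments=200,          # flag solutions with noisy segmentation (new)
        max.homozygous.loss=0.1)   # cap genomic fraction in homozygous loss (new)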
... ...
@@ -132,6 +199,29 @@ post.optimize=FALSE,
132 199
         if (is.null(gatk.normal.file)) normal <- tumor
133 200
         if (!is.null(seg.file)) stop("Provide either log.ratio or seg.file, not both.") 
134 201
     }        
202
+    sex <- match.arg(sex)
203
+    sex.chr <- .getSexChr(tumor)
204
+    if (sex =="?") {
205
+        sex.tumor <- getSexFromCoverage(tumor, verbose=FALSE)
206
+        sex.normal <- getSexFromCoverage(normal, verbose=FALSE)
207
+        if (is.na(sex.tumor)) {
208
+            tumor <- .removeChr(tumor, remove.chrs=sex.chr)
209
+        }    
210
+        if (is.na(sex.normal)) {
211
+            normal <- .removeChr(normal, remove.chrs=sex.chr)
212
+        }    
213
+        if (!identical(sex.tumor, sex.normal)) warning(paste("Sex tumor/normal mismatch: tumor =", sex.tumor, "normal =", sex.normal))  
214
+        sex <- sex.tumor    
215
+        if (is.na(sex)) sex = "?"
216
+    } 
217
+    if (sex=="M") {
218
+        tumor <- .removeChr(tumor, remove.chrs=sex.chr)
219
+    } else if (sex=="F") {
220
+        tumor <- .removeChr(tumor, remove.chrs=sex.chr[2])
221
+    }       
222
+          
223
+    if (verbose) message(paste("Sex of sample:", sex))
224
+          
135 225
     # NA's in log.ratio confuse the CBS function
136 226
     idx <- !is.na(log.ratio) & !is.infinite(log.ratio)
137 227
     log.ratio <- log.ratio[idx]    
... ...
@@ -171,6 +261,7 @@ post.optimize=FALSE,
171 261
     vcf.germline <- NULL
172 262
     tumor.id.in.vcf <- NULL
173 263
     prior.somatic <- NULL
264
+    vcf.filtering <- list(flag=FALSE, flag_comment="")
174 265
 
175 266
     if (!is.null(vcf.file)) {
176 267
         if (verbose) message("Loading VCF...")
... ...
@@ -358,6 +449,10 @@ post.optimize=FALSE,
358 449
                 # set probability to zero if ploidy is not within requested range 
359 450
                 log.prior.ploidy <- log(ifelse(ploidy<min.ploidy | ploidy>max.ploidy,0,1))
360 451
                 if (iter > 1) p.rij <- p.rij+log.prior.ploidy
452
+                
453
+                frac.homozygous.loss <- sapply(test.num.copy, function(Ci) (sum(li[-i] * ifelse(C[-i]==0,1,0)) + li[i] * ifelse(Ci==0,1,0))/sum(li) )
454
+                log.prior.homozygous.loss <- log(ifelse(frac.homozygous.loss > max.homozygous.loss,0,1))
455
+                if (iter > 1) p.rij <- p.rij+log.prior.homozygous.loss
361 456
 
362 457
                 # model sub clonal state with a uniform distribution
363 458
                 p.rij <- c(p.rij,  .calcLlikSegmentSubClonal( exon.lrs[[i]]+log.ratio.offset[i], max.exon.ratio))
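A small worked illustration of the new prior term (toy numbers, not package output): any candidate copy number that would push the genomic fraction in homozygous loss above max.homozygous.loss receives a log-prior of -Inf and is effectively excluded.

    frac.homozygous.loss <- c(0.02, 0.08, 0.15)
    log(ifelse(frac.homozygous.loss > 0.1, 0, 1))
    # [1]    0    0 -Inf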
... ...
@@ -443,7 +538,7 @@ post.optimize=FALSE,
443 538
             SNV.posterior <- res.snvllik[[idx]]
444 539
         }
445 540
 
446
-        list(log.likelihood=llik, purity=p, ploidy=weighted.mean(C,li), total.ploidy=total.ploidy, seg=seg.adjusted, C.posterior=data.frame(C.posterior/rowSums(C.posterior), ML.C=C, ML.Subclonal=subclonal), SNV.posterior=SNV.posterior, fraction.subclonal=subclonal.f, gene.calls=gene.calls, log.ratio.offset=log.ratio.offset, failed=FALSE)
541
+        list(log.likelihood=llik, purity=p, ploidy=weighted.mean(C,li), total.ploidy=total.ploidy, seg=seg.adjusted, C.posterior=data.frame(C.posterior/rowSums(C.posterior), ML.C=C, ML.Subclonal=subclonal), SNV.posterior=SNV.posterior, fraction.subclonal=subclonal.f, fraction.homozygous.loss=sum(li[which(C<0.01)])/sum(li), gene.calls=gene.calls, log.ratio.offset=log.ratio.offset, failed=FALSE)
447 542
     }
448 543
 
449 544
     results <- lapply(1:nrow(candidate.solutions$candidates),.optimizeSolution)
... ...
@@ -451,7 +546,7 @@ post.optimize=FALSE,
451 546
 
452 547
     results <- .rankResults(results)
453 548
     results <- .filterDuplicatedResults(results)
454
-    results <- .flagResults(results, max.non.clonal=max.non.clonal, max.logr.sdev=max.logr.sdev, logr.sdev=sd.seg, flag=vcf.filtering$flag, flag_comment=vcf.filtering$flag_comment)  
549
+    results <- .flagResults(results, max.non.clonal=max.non.clonal, max.logr.sdev=max.logr.sdev, logr.sdev=sd.seg, max.segments=max.segments, flag=vcf.filtering$flag, flag_comment=vcf.filtering$flag_comment)  
455 550
     if (!is.null(gc.gene.file)) {
456 551
         # Add LOH calls to gene level calls 
457 552
         for (i in 1:length(results)) {
... ...
@@ -459,30 +554,37 @@ post.optimize=FALSE,
459 554
         }
460 555
     }    
461 556
 
462
-    for (i in 1:length(results)) {
463
-        results[[i]]$SNV.posterior$beta.model$posteriors <- .getVariantCallsLOH( results[[i]] )
557
+    if (!is.null(vcf.file)) {
558
+        for (i in 1:length(results)) {
559
+            results[[i]]$SNV.posterior$beta.model$posteriors <- .getVariantCallsLOH( results[[i]] )
560
+        }
464 561
     }
465 562
     ##value<< A list with elements
466 563
     list(
467 564
         candidates=candidate.solutions, ##<< Results of the grid search.
468 565
         results=results, ##<< All local optima, sorted by final rank.
469
-        input=list(tumor=gatk.tumor.file, normal=gatk.normal.file, log.ratio=data.frame(probe=normal[,1], log.ratio=log.ratio), log.ratio.sdev=sd.seg, vcf=vcf, sampleid=sampleid) ##<< The input data.
566
+        input=list(tumor=gatk.tumor.file, normal=gatk.normal.file, log.ratio=data.frame(probe=normal[,1], log.ratio=log.ratio), log.ratio.sdev=sd.seg, vcf=vcf, sampleid=sampleid, sex=sex) ##<< The input data.
470 567
         )
471 568
 ##end<<
472 569
 },ex=function(){
473
-gatk.normal.file <- system.file("extdata", "example_normal.txt", package="PureCN")
474
-gatk.tumor.file <- system.file("extdata", "example_tumor.txt", package="PureCN")
475
-vcf.file <- system.file("extdata", "example_vcf.vcf", package="PureCN")
476
-gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt", package="PureCN")
477
-
478
-# Speed-up the runAbsoluteCN by using the stored grid-search 
570
+gatk.normal.file <- system.file("extdata", "example_normal.txt", 
571
+    package="PureCN")
572
+gatk.tumor.file <- system.file("extdata", "example_tumor.txt", 
573
+    package="PureCN")
574
+vcf.file <- system.file("extdata", "example_vcf.vcf", 
575
+    package="PureCN")
576
+gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt", 
577
+    package="PureCN")
578
+
579
+# Speed-up the runAbsoluteCN call by using the stored grid-search 
479 580
 # (purecn.example.output$candidates).
480 581
 data(purecn.example.output)
481 582
 
482 583
 # The max.candidate.solutions parameter is set to a very low value only to
483 584
 # speed-up this example.  This is not a good idea for real samples.
484 585
 
485
-ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, gatk.tumor.file=gatk.tumor.file, 
486
-   candidates=purecn.example.output$candidates, max.candidate.solutions=2,
586
+ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, 
587
+    gatk.tumor.file=gatk.tumor.file, 
588
+    candidates=purecn.example.output$candidates, max.candidate.solutions=2,
487 589
     vcf.file=vcf.file, sampleid='Sample1', gc.gene.file=gc.gene.file)
488 590
 })    
... ...
@@ -69,7 +69,7 @@ ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, gatk.tumor.file=gatk.tumo
69 69
 .pruneByVCF <- function(x, vcf, tumor.id.in.vcf, min.size=5, max.pval=0.00001, iterations=3, debug=FALSE) {
70 70
     seg <- segments.p(x$cna)
71 71
     for (iter in 1:iterations) {
72
-        seg.gr <- GRanges(seqnames=paste("chr", seg$chrom,sep=""), IRanges(start=seg$loc.start, end=seg$loc.end))
72
+        seg.gr <- GRanges(seqnames=.add.chr.name(seg$chrom), IRanges(start=seg$loc.start, end=seg$loc.end))
73 73
         ov <- findOverlaps(seg.gr, vcf)
74 74
         ar <- sapply(geno(vcf)$FA[,tumor.id.in.vcf], function(x) x[1])
75 75
         ar.r <- ifelse(ar>0.5, 1-ar, ar)
76 76
deleted file mode 100755
... ...
@@ -1,119 +0,0 @@
1
-<h1 id="quick-start">Quick Start</h1>
2
-<h2 id="basic-input-files">Basic Input Files</h2>
3
-<p>You will need coverage files in GATK DepthOfCoverage format for tumor and a control sample:</p>
4
-<pre><code>Target  total_coverage  average_coverage    Sample1_total_cvg   Sample1_mean_cvg    Sample1_granular_Q1 Sample1_granular_median Sample1_granular_Q3 Sample1_._above_15
5
-chr1:69091-70009    0   0   0   0   1   1   1   0
6
-chr1:367659-368598  6358    9.25121153819759    6358    9.25121153819759    1   7   13  11.6
7
-chr1:621096-622035  6294    9.16910019318401    6294    9.16910019318401    1   7   12  9.5</code></pre>
8
-<p>PureCN will look for the columns &quot;Target&quot;, &quot;total_coverage&quot;, and &quot;average_coverage&quot;. The algorithm works best when the coverage data is already GC normalized. The steps below will NOT GC normalize.</p>
9
-<p>You will also need a VCF file with both somatic and germline variants, for example as obtained by MuTect.</p>
10
-<h2 id="first-run">First Run</h2>
11
-<p>In R:</p>
12
-<pre><code>library(PureCN)
13
-ret &lt;-runAbsoluteCN(gatk.normal.file=gatk.normal.file, 
14
-                    gatk.tumor.file=gatk.tumor.file, 
15
-                    vcf.file=vcf.file)</code></pre>
16
-<p>To obtain gene level copy number calls, we need an exon-to-gene mapping file. This is provided together with GC content via the gc.gene.file argument. The expected format:</p>
17
-<pre><code>Target  gc_bias Gene
18
-chr1:69091-70009    0.427638737758433   OR4F5
19
-chr1:367659-368598  0.459574468085106   OR4F29
20
-chr1:621096-622035  0.459574468085106   OR4F3</code></pre>
21
-<h2 id="pool-of-normals">Pool of Normals</h2>
22
-<p>If a pool of normal samples is available, we can use the coverage data to estimate the expected variance in coverage per exon. This will help the segmentation algorithm to filter noise:</p>
23
-<pre><code>gatk.normal.files &lt;- dir(&quot;poolofnormals&quot;, 
24
-    pattern=&quot;coverage.sample_interval_summary&quot;, full.names=TRUE) 
25
-createExonWeightFile(gatk.normal.files[1:2], gatk.normal.files[-(1:2)], &quot;exon_weights.txt&quot;)</code></pre>
26
-<p>This will calculate log-ratios for all normals using the first two as tumor and the remaining ones as normal. This estimates then exon level coverage standard deviations.</p>
27
-<p>We can also use the pool of normal to find SNPs with biased allelic fractions (significantly different from 0.5 for heterozygous SNPs):</p>
28
-<pre><code>mutect.normal.files &lt;- dir(&quot;poolofnormals&quot;, pattern=&quot;vcf$&quot;, full.names=TRUE) 
29
-snp.blacklist &lt;- createSNPBlacklist(mutect.normal.files)
30
-write.csv(snp.bl, file=&quot;SNP_blacklist.csv&quot;)
31
-write.csv(snp.bl, file=&quot;SNP_blacklist_segmented.csv&quot;, row.names=FALSE, quote=FALSE)</code></pre>
32
-<p>Finally, we can use the pool of normals to find normal samples for log-ratio calculation, especially when no matched normal sample is available:</p>
33
-<pre><code>if (!file.exists(&quot;normalDB.rds&quot;)) {
34
-    normalDB &lt;- createNormalDatabase(gatk.normal.files)
35
-    saveRDS(normalDB, &quot;normalDB.rds&quot;)
36
-} else {
37
-    normalDB &lt;- readRDS(&quot;normalDBe4.rds&quot;)
38
-}
39
-
40
-# get the best 4 normals and average them
41
-gatk.normal.file &lt;- findBestNormal(gatk.tumor.file, normalDB, num.normals=4)
42
-pool &lt;- pool.coverage2(lapply(gatk.normal.file, read.coverage.gatk), remove.chrs=c(&#39;chrX&#39;, &#39;chrY&#39;))
43
-
44
-# get the best normal
45
-gatk.normal.file &lt;- findBestNormal(gatk.tumor.file, normalDB)</code></pre>
46
-<h2 id="vcf-artifact-filtering">VCF Artifact Filtering</h2>
47
-<p>If MuTect was run in matched normal mode, then all germline variants are rejected, that means we cannot just filter by the KEEP/REJECT MuTect flags. The filterVcfMuTect function optionally reads the MuTect stats file and will keep germline variants, while removing potential artifacts. Otherwise it will use only the filters based on read depths as defined in filterVcfBasic.</p>
48
-<h2 id="recommended-run">Recommended Run</h2>
49
-<p>Finally, we can run PureCN with all that information:</p>
50
-<pre><code>  ret &lt;-runAbsoluteCN(gatk.normal.file=pool, gatk.tumor.file=gatk.tumor.file, 
51
-    vcf.file=vcf.file, sampleid=&#39;Sample1&#39;, gc.gene.file=gc.gene.file, 
52
-    args.filterVcf=list(snp.blacklist=snp.blacklist, stats.file=mutect.stats.file), 
53
-    args.segmentation=list(exon.weight.file=exon.weight.file), 
54
-    post.optimize=TRUE)</code></pre>
55
-<p>The post.optimize flag will increase the runtime by a lot, but might be worth it if the copy number profile is not very clean. This is recommended with lower coverage datasets or if there are significant capture biases.</p>
56
-<p>We also need to create a few output files:</p>
57
-<pre><code>file.rds &lt;- &#39;Sample1_PureCN.rds&#39;
58
-saveRDS(ret, file=file.rds)
59
-pdf(&#39;Sample1_PureCN.pdf&#39;), width=10, height=12)
60
-plotAbs(ret, type=&#39;all&#39;)
61
-dev.off()</code></pre>
62
-<h2 id="manual-curation">Manual Curation</h2>
63
-<p>For prediction of SNV status (germline vs somatic, sub-clonal vs. clonal, homozyous vs heterozygous), it is important that both purity and ploidy are correct. We provide functionality for curating results:</p>
64
-<pre><code>createCurationFile(file.rds) </code></pre>
65
-<p>This will generate a CSV file, in which the correct purity and ploidy values can be manually entered. It also contains a column &quot;Curated&quot;, which should be set to TRUE, otherwise the file will be overwritten when re-run.</p>
66
-<p>Then in R, the correct solution (closest to the combination in the CSV file) can be loaded with the readCurationFile function:</p>
67
-<pre><code>ret &lt;- readCurationFile(file.rds)</code></pre>
68
-<h2 id="custom-segmentation">Custom Segmentation</h2>
69
-<p>By default, we will use DNAcopy to segment the log-ratio. You can easily change that to your favorite method and the segmentationCBS function can serve as an example.</p>
70
-<pre><code>segmentationCBS &lt;- function (normal, tumor, log.ratio, plot.cnv, coverage.cutoff, 
71
-    sampleid = sampleid, exon.weight.file = NULL, alpha = 0.005, 
72
-    vcf = NULL, tumor.id.in.vcf = 1, verbose = TRUE, ...) </code></pre>
73
-<p>It is also possible to provide segmented file, which we however only recommend when matched SNP6 data is available (otherwise it is better to customize the segmentation function as described above). The expected file format is:</p>
74
-<pre><code>ID  chrom   loc.start   loc.end num.mark    seg.mean
75
-Sample1   1   61723   5773942 2681    0.125406444072723
76
-Sample1   1   5774674 5785170 10  -0.756511807441712</code></pre>
77
-<h2 id="output">Output</h2>
78
-<p>The plotAbs() call above will generate the main plots shown in the manuscript. The R data file (file.rds) contains gene level copy number calls, SNV status and LOH calls. The purity/ploidy combinations are sorted by likelihood at stored in ret$results.</p>
79
-<pre><code>names(ret)
80
-head(ret$results[[1]]$gene.calls, 3)
81
-chr    start      end C seg.mean seg.id number.exons gene.mean   gene.min
82
-TP73 chr1  3598833  3649721 3   0.3434      1           14 0.1755614 -0.3861084
83
-CHD5 chr1  6161904  6240126 3   0.3434      1           44 0.3059672 -0.8812580
84
-MTOR chr1 11166594 11319542 3   0.3434      1           62 0.2360036 -0.1346406
85
-gene.max focal num.snps.loh.segment percentage.loh.in.loh.segment
86
-TP73 0.6437573 FALSE                    0                             0
87
-CHD5 1.5984124 FALSE                   49                             0
88
-MTOR 0.6705342 FALSE                   49                             0</code></pre>
89
-<p>This data.frame also contains gene level LOH information. The SNV posteriors:</p>
90
-<pre><code>head(ret$results[[1]]$SNV.posterior$beta.model$posteriors, 3)
91
-           seqnames   start     end SOMATIC.M0   SOMATIC.M1   SOMATIC.M2
92
-rs6696489      chr1 6162054 6162054          0 9.155037e-04 2.938312e-07
93
-rs59788818     chr1 6171992 6171992          0 2.364795e-05 3.397817e-07
94
-rs2843493      chr1 6184092 6184092          0 9.362524e-16 2.714454e-04
95
-             SOMATIC.M3   SOMATIC.M4    SOMATIC.M5    SOMATIC.M6 SOMATIC.M7
96
-rs6696489  6.592308e-21 2.743271e-76 2.002534e-167 3.562023e-275          0
97
-rs59788818 3.491208e-34 1.711087e-93 7.197813e-188 2.275374e-298          0
98
-rs2843493  1.037518e-11 5.167773e-67 3.034251e-158 3.725864e-266          0
99
-            GERMLINE.M0  GERMLINE.M1  GERMLINE.M2  GERMLINE.M3   GERMLINE.M4
100
-rs6696489  1.723074e-04 9.989057e-01 6.216658e-06 3.302211e-27  1.067221e-82
101
-rs59788818 5.836546e-23 9.999757e-01 2.683928e-07 1.233256e-51 2.340805e-111
102
-rs2843493  8.802336e-49 1.023147e-07 9.997285e-01 2.798399e-18  3.718923e-74
103
-             GERMLINE.M5   GERMLINE.M6 GERMLINE.M7 GERMLINE.CONTHIGH
104
-rs6696489  6.678185e-174 1.070845e-281           0      3.060961e-62
105
-rs59788818 5.523414e-206 1.183050e-316           0     8.253575e-127
106
-rs2843493  9.760218e-166 6.969108e-274           0      5.841728e-59
107
-           GERMLINE.CONTLOW ML.SOMATIC ML.M ML.C     ML.AR        AR
108
-rs6696489      2.105422e-22      FALSE    1    3 0.3571429 0.2803790
109
-rs59788818     3.106132e-86      FALSE    1    3 0.3571429 0.3957373
110
-rs2843493     4.644636e-138      FALSE    2    3 0.6428571 0.6259540
111
-           CN.Subclonal Log.Ratio Prior.Somatic Prior.Contamination ML.LOH
112
-rs6696489         FALSE 1.5984124  0.0004950495                0.01  FALSE
113
-rs59788818        FALSE 0.2600908  0.0004950495                0.01  FALSE
114
-rs2843493         FALSE 0.7559721  0.0004950495                0.01  FALSE
115
-           num.snps.loh.segment percentage.loh.in.loh.segment
116
-rs6696489                    49                             0
117
-rs59788818                   49                             0
118
-rs2843493                    49                             0</code></pre>
119
-<p>This lists all posterior probabilities for all possible SNV states.</p>
120 0
deleted file mode 100755
... ...
@@ -1,191 +0,0 @@
1
-# Quick Start
2
-
3
-## Basic Input Files 
4
-
5
-You will need coverage files in GATK DepthOfCoverage format for tumor and a control sample:
6
-
7
-    Target  total_coverage  average_coverage    Sample1_total_cvg   Sample1_mean_cvg    Sample1_granular_Q1 Sample1_granular_median Sample1_granular_Q3 Sample1_._above_15
8
-    chr1:69091-70009    0   0   0   0   1   1   1   0
9
-    chr1:367659-368598  6358    9.25121153819759    6358    9.25121153819759    1   7   13  11.6
10
-    chr1:621096-622035  6294    9.16910019318401    6294    9.16910019318401    1   7   12  9.5
11
-
12
-PureCN will look for the columns "Target", "total_coverage", and
13
-"average_coverage". The algorithm works best when the coverage data is already
14
-GC normalized. The steps below will NOT GC normalize.
15
-
16
-You will also need a VCF file with both somatic and germline variants, for
17
-example as obtained by MuTect.
18
-
19
-## First Run
20
-
21
-In R:
22
-
23
-    library(PureCN)
24
-    ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, 
25
-                        gatk.tumor.file=gatk.tumor.file, 
26
-                        vcf.file=vcf.file)
27
-
28
-To obtain gene level copy number calls, we need an exon-to-gene mapping file.
29
-This is provided together with GC content via the gc.gene.file argument. The
30
-expected format:
31
-
32
-    Target	gc_bias	Gene
33
-    chr1:69091-70009	0.427638737758433	OR4F5
34
-    chr1:367659-368598	0.459574468085106	OR4F29
35
-    chr1:621096-622035	0.459574468085106	OR4F3
36
-
37
-## Pool of Normals
38
-
39
-If a pool of normal samples is available, we can use the coverage data to
40
-estimate the expected variance in coverage per exon. This will help the
41
-segmentation algorithm to filter noise:
42
-
43
-    gatk.normal.files <- dir("poolofnormals", 
44
-        pattern="coverage.sample_interval_summary", full.names=TRUE) 
45
-    createExonWeightFile(gatk.normal.files[1:2], gatk.normal.files[-(1:2)], "exon_weights.txt")
46
-
47
-This will calculate log-ratios for all normals using the first two as tumor and
48
-the remaining ones as normal. This estimates then exon level coverage standard
49
-deviations.
50
-
51
-We can also use the pool of normal to find SNPs with biased allelic fractions
52
-(significantly different from 0.5 for heterozygous SNPs):
53
-
54
-    mutect.normal.files <- dir("poolofnormals", pattern="vcf$", full.names=TRUE) 
55
-    snp.blacklist <- createSNPBlacklist(mutect.normal.files)
56
-    write.csv(snp.bl, file="SNP_blacklist.csv")
57
-    write.csv(snp.bl, file="SNP_blacklist_segmented.csv", row.names=FALSE, quote=FALSE)
58
-
59
-Finally, we can use the pool of normals to find normal samples for log-ratio
60
-calculation, especially when no matched normal sample is available:
61
-
62
-    if (!file.exists("normalDB.rds")) {
63
-        normalDB <- createNormalDatabase(gatk.normal.files)
64
-        saveRDS(normalDB, "normalDB.rds")
65
-    } else {
66
-        normalDB <- readRDS("normalDBe4.rds")
67
-    }
68
-    
69
-    # get the best 4 normals and average them
70
-    gatk.normal.file <- findBestNormal(gatk.tumor.file, normalDB, num.normals=4)
71
-    pool <- poolCoverage(lapply(gatk.normal.file, read.coverage.gatk), remove.chrs=c('chrX', 'chrY'))
72
-    
73
-    # get the best normal
74
-    gatk.normal.file <- findBestNormal(gatk.tumor.file, normalDB)
75
-
76
-## VCF Artifact Filtering
77
-
78
-If MuTect was run in matched normal mode, then all germline variants are
79
-rejected, that means we cannot just filter by the PASS/REJECT MuTect flags. The
80
-filterVcfMuTect function optionally reads the MuTect stats file and will keep
81
-germline variants, while removing potential artifacts. Otherwise it will use
82
-only the filters based on read depths as defined in filterVcfBasic.
83
-
84
-## Recommended Run
85
-
86
-Finally, we can run PureCN with all that information:
87
-
88
-      ret <-runAbsoluteCN(gatk.normal.file=pool, gatk.tumor.file=gatk.tumor.file, 
89
-        vcf.file=vcf.file, sampleid='Sample1', gc.gene.file=gc.gene.file, 
90
-        args.filterVcf=list(snp.blacklist=snp.blacklist, stats.file=mutect.stats.file), 
91
-        args.segmentation=list(exon.weight.file=exon.weight.file), 
92
-        post.optimize=TRUE)
93
-
94
-The post.optimize flag will increase the runtime by a lot, but might be worth
95
-it if the copy number profile is not very clean. This is recommended with lower
96
-coverage datasets or if there are significant capture biases. For high quality
97
-whole exome data, this is typically not necessary.
98
-
99
-We also need to create a few output files:
100
-
101
-    file.rds <- 'Sample1_PureCN.rds'
102
-    saveRDS(ret, file=file.rds)
103
-    pdf('Sample1_PureCN.pdf', width=10, height=12)
104
-    plotAbs(ret, type='all')
105
-    dev.off()
106
-
107
-## Manual Curation
108
-
109
-For prediction of SNV status (germline vs somatic, sub-clonal vs. clonal,
110
-homozyous vs heterozygous), it is important that both purity and ploidy are
111
-correct. We provide functionality for curating results:
112
-
113
-    createCurationFile(file.rds) 
114
-
115
-This will generate a CSV file, in which the correct purity and ploidy values
116
-can be manually entered. It also contains a column "Curated", which should be
117
-set to TRUE, otherwise the file will be overwritten when re-run.
118
-
119
-Then in R, the correct solution (closest to the combination in the CSV file)
120
-can be loaded with the readCurationFile function:
121
-
122
-    ret <- readCurationFile(file.rds)
123
-    
124
-## Custom Segmentation
125
-
126
-By default, we will use DNAcopy to segment the log-ratio. You can easily change
127
-that to your favorite method and the segmentationCBS function can serve as an
128
-example.
129
-
130
-    segmentationCBS <- function (normal, tumor, log.ratio, plot.cnv, coverage.cutoff, 
131
-        sampleid = sampleid, exon.weight.file = NULL, alpha = 0.005, 
132
-        vcf = NULL, tumor.id.in.vcf = 1, verbose = TRUE, ...) 
133
-
134
-It is also possible to provide segmented file, which we however only recommend
135
-when matched SNP6 data is available (otherwise it is better to customize the
136
-segmentation function as described above). The expected file format is:
137
-
138
-    ID  chrom   loc.start   loc.end num.mark    seg.mean
139
-    Sample1   1   61723   5773942 2681    0.125406444072723
140
-    Sample1   1   5774674 5785170 10  -0.756511807441712
141
-
142
-## Output
143
-
144
-The plotAbs() call above will generate the main plots shown in the manuscript. The R data file (file.rds) contains gene level copy number calls, SNV status and LOH calls.
145
-The purity/ploidy combinations are sorted by likelihood at stored in ret$results.
146
-
147
-    names(ret)
148
-    head(ret$results[[1]]$gene.calls, 3)
149
-    chr    start      end C seg.mean seg.id number.exons gene.mean   gene.min
150
-    TP73 chr1  3598833  3649721 3   0.3434      1           14 0.1755614 -0.3861084
151
-    CHD5 chr1  6161904  6240126 3   0.3434      1           44 0.3059672 -0.8812580
152
-    MTOR chr1 11166594 11319542 3   0.3434      1           62 0.2360036 -0.1346406
153
-    gene.max focal num.snps.loh.segment percentage.loh.in.loh.segment
154
-    TP73 0.6437573 FALSE                    0                             0
155
-    CHD5 1.5984124 FALSE                   49                             0
156
-    MTOR 0.6705342 FALSE                   49                             0
157
-
158
-This data.frame also contains gene level LOH information. The SNV posteriors:
159
-
160
-    head(ret$results[[1]]$SNV.posterior$beta.model$posteriors, 3)
161
-               seqnames   start     end SOMATIC.M0   SOMATIC.M1   SOMATIC.M2
162
-    rs6696489      chr1 6162054 6162054          0 9.155037e-04 2.938312e-07
163
-    rs59788818     chr1 6171992 6171992          0 2.364795e-05 3.397817e-07
164
-    rs2843493      chr1 6184092 6184092          0 9.362524e-16 2.714454e-04
165
-                 SOMATIC.M3   SOMATIC.M4    SOMATIC.M5    SOMATIC.M6 SOMATIC.M7
166
-    rs6696489  6.592308e-21 2.743271e-76 2.002534e-167 3.562023e-275          0
167
-    rs59788818 3.491208e-34 1.711087e-93 7.197813e-188 2.275374e-298          0
168
-    rs2843493  1.037518e-11 5.167773e-67 3.034251e-158 3.725864e-266          0
169
-                GERMLINE.M0  GERMLINE.M1  GERMLINE.M2  GERMLINE.M3   GERMLINE.M4
170
-    rs6696489  1.723074e-04 9.989057e-01 6.216658e-06 3.302211e-27  1.067221e-82
171
-    rs59788818 5.836546e-23 9.999757e-01 2.683928e-07 1.233256e-51 2.340805e-111
172
-    rs2843493  8.802336e-49 1.023147e-07 9.997285e-01 2.798399e-18  3.718923e-74
173
-                 GERMLINE.M5   GERMLINE.M6 GERMLINE.M7 GERMLINE.CONTHIGH
174
-    rs6696489  6.678185e-174 1.070845e-281           0      3.060961e-62
175
-    rs59788818 5.523414e-206 1.183050e-316           0     8.253575e-127
176
-    rs2843493  9.760218e-166 6.969108e-274           0      5.841728e-59
177
-               GERMLINE.CONTLOW ML.SOMATIC ML.M ML.C     ML.AR        AR
178
-    rs6696489      2.105422e-22      FALSE    1    3 0.3571429 0.2803790
179
-    rs59788818     3.106132e-86      FALSE    1    3 0.3571429 0.3957373
180
-    rs2843493     4.644636e-138      FALSE    2    3 0.6428571 0.6259540
181
-               CN.Subclonal Log.Ratio Prior.Somatic Prior.Contamination ML.LOH
182
-    rs6696489         FALSE 1.5984124  0.0004950495                0.01  FALSE
183
-    rs59788818        FALSE 0.2600908  0.0004950495                0.01  FALSE
184
-    rs2843493         FALSE 0.7559721  0.0004950495                0.01  FALSE
185
-               num.snps.loh.segment percentage.loh.in.loh.segment
186
-    rs6696489                    49                             0
187
-    rs59788818                   49                             0
188
-    rs2843493                    49                             0
189
-
190
-This lists all posterior probabilities for all possible SNV states.
191
-
... ...
@@ -1,12 +1,14 @@
1 1
 \name{createExonWeightFile}
2 2
 \alias{createExonWeightFile}
3 3
 \title{Calculate exon weights}
4
-\description{Creates an exon weight file useful for segmentation, by down-weighting unreliable exons.}
4
+\description{Creates an exon weight file useful for segmentation, by 
5
+down-weighting unreliable exons.}
5 6
 \usage{createExonWeightFile(gatk.tumor.files, gatk.normal.files, 
6 7
     exon.weight.file)}
7 8
 \arguments{
8 9
   \item{gatk.tumor.files}{A small number (1-3) of GATK tumor coverage samples.}
9
-  \item{gatk.normal.files}{A large number of GATK normal coverage samples (>20) to estimate exon log-ratio standard deviations.}
10
+  \item{gatk.normal.files}{A large number of GATK normal coverage samples (>20) 
11
+to estimate exon log-ratio standard deviations.}
10 12
   \item{exon.weight.file}{Output filename.}
11 13
 }
12 14
 
... ...
@@ -1,10 +1,12 @@
1 1
 \name{createNormalDatabase}
2 2
 \alias{createNormalDatabase}
3 3
 \title{createNormalDatabase}
4
-\description{Function to create a database of normal samples, used to find a good match for tumor copy number normalization.}
4
+\description{Function to create a database of normal samples, used to find 
5
+a good match for tumor copy number normalization.}
5 6
 \usage{createNormalDatabase(gatk.normal.files, ...)}
6 7
 \arguments{
7
-  \item{gatk.normal.files}{Vector with file names pointing to GATK coverage files of normal samples. }
8
+  \item{gatk.normal.files}{Vector with file names pointing to GATK coverage files 
9
+of normal samples. }
8 10
   \item{\dots}{Arguments passed to the prcomp function.}
9 11
 }
10 12
 
... ...
@@ -3,15 +3,15 @@
3 3
 \title{findBestNormal}
4 4
 \description{Function to find the best matching normal for a provided tumor sample.}
5 5
 \usage{findBestNormal(gatk.tumor.file, normalDB, pcs = 1:3, 
6
-    num.normals = 1, verbose = TRUE)}
6
+    num.normals = 1, ignore.sex = FALSE, verbose = TRUE)}
7 7
 \arguments{
8 8
   \item{gatk.tumor.file}{GATK coverage file of a tumor sample.}
9 9
   \item{normalDB}{Database of normal samples, created with createNormalDatabase().}
10 10
   \item{pcs}{Principal components to use for distance calculation.}
11
-  \item{num.normals}{
12
-}
13
-  \item{verbose}{
14
-}
11
+  \item{num.normals}{Return the num.normals best normals.}
12
+  \item{ignore.sex}{If FALSE, detects the sex of the sample and returns a best normal 
13
+with matching sex.}
14
+  \item{verbose}{Verbose output.}
15 15
 }
16 16
 
17 17
 \value{Return the filename of the best matching normal    }
18 18
new file mode 100644
... ...
@@ -0,0 +1,20 @@
1
+\name{getSexFromCoverage}
2
+\alias{getSexFromCoverage}
3
+\title{Get sample sex from coverage}
4
+\description{This function determines the sex of a sample by the coverage 
5
+ratio of chrX and chrY.}
6
+\usage{getSexFromCoverage(gatk.coverage, min.ratio = 20, verbose = TRUE)}
7
+\arguments{
8
+  \item{gatk.coverage}{GATK coverage file or data read with readCoverageGatk.}
9
+  \item{min.ratio}{Min chrX/chrY coverage ratio to call sample as female.}
10
+  \item{verbose}{Verbose output.}
11
+}
12
+
13
+\value{Returns "M" for male, "F" for female, or NA if unknown.    }
14
+
15
+\author{Markus Riester}
16
+
17
+
18
+
19
+
20
+
... ...
@@ -1,13 +1,16 @@
1 1
 \name{runAbsoluteCN}
2 2
 \alias{runAbsoluteCN}
3
-\title{Run our implementation of ABSOLUTE}
4
-\description{This function takes as input tumor and normal control coverage and allelic fractions of germline variants and somatic mutations.
5
-Coverage data is provied in GATK DepthOfCoverage format, allelic fraction in VCF format (e.g. obtained by MuTect). 
6
-Normal control does not need to be matched (from the same patient). In case VCF does not contain somatic status, it should contain
7
-dbSNP and optionally COSMIC annotation.}
3
+\title{Run the PureCN implementation of ABSOLUTE}
4
+\description{This function takes as input tumor and normal control coverage and 
5
+allelic fractions of germline variants and somatic mutations.
6
+Coverage data is provided in GATK DepthOfCoverage format, allelic fraction 
7
+in VCF format (e.g. obtained by MuTect). Normal control does not need to 
8
+be matched (from the same patient). In case VCF does not contain somatic 
9
+status, it should contain dbSNP and optionally COSMIC annotation.}
8 10
 \usage{runAbsoluteCN(gatk.normal.file = NULL, gatk.tumor.file, 
9 11
     log.ratio = NULL, seg.file = NULL, seg.file.sdev = 0.4, 
10
-    vcf.file = NULL, genome = "hg19", fun.filterVcf = filterVcfMuTect, 
12
+    vcf.file = NULL, genome = "hg19", sex = c("?", 
13
+        "F", "M"), fun.filterVcf = filterVcfMuTect, 
11 14
     args.filterVcf = list(), fun.setPriorVcf = setPriorVcf, 
12 15
     args.setPriorVcf = list(), fun.segmentation = segmentationCBS, 
13 16
     args.segmentation = list(), fun.focal = findFocal, 
... ...
@@ -15,46 +18,109 @@ dbSNP and optionally COSMIC annotation.}
15 18
     max.ploidy = 6, test.num.copy = 0:7, test.purity = seq(0.05, 
16 19
         0.95, by = 0.01), prior.purity = rep(1, length(test.purity))/length(test.purity), 
17 20
     max.candidate.solutions = 15, candidates = NULL, 
18
-    coverage.cutoff = 15, max.non.clonal = 0.2, iterations = 30, 
19
-    log.ratio.calibration = 0.25, gc.gene.file = NULL, 
20
-    filter.lowhigh.gc.exons = 0.001, filter.targeted.base = 4, 
21
-    max.logr.sdev = 0.75, plot.cnv = TRUE, verbose = TRUE, 
21
+    coverage.cutoff = 15, max.non.clonal = 0.2, max.homozygous.loss = 0.1, 
22
+    iterations = 30, log.ratio.calibration = 0.25, 
23
+    gc.gene.file = NULL, filter.lowhigh.gc.exons = 0.001, 
24
+    filter.targeted.base = 4, max.logr.sdev = 0.75, 
25
+    max.segments = 200, plot.cnv = TRUE, verbose = TRUE, 
22 26
     post.optimize = FALSE, ...)}
23 27
 \arguments{
24
-  \item{gatk.normal.file}{GATK coverage file of normal control (optional if log.ratio is provided - then it will be only used to filter low coverage exons). Should be already GC-normalized.}
25
-  \item{gatk.tumor.file}{GATK coverage file of tumor. Should be already GC-normalized.}
26
-  \item{log.ratio}{Copy number log-ratios for all exons in the coverage files. If NULL, calculated based on coverage files.}
27
-  \item{seg.file}{Segmented data. Optional, to support matched SNP6 data. If null, use coverage files or log.ratio to segment the data.  }
28
-  \item{seg.file.sdev}{If seg.file provided, the log-ratio standard deviation, used to model likelihood of sub-clonal copy number events.}
29
-  \item{vcf.file}{VCF file, tested with MuTect output files.  Optional, but typically needed to select between local optima of similar likelihood. Can also be a CollapsedVCF, read with the readVcf function. Requires a DB info flag for dbSNP membership. The default fun.setPriorVcf function will also look for a Cosmic.CNT slot, containing the hits in the COSMIC database. Again, do not expect very useful results without a VCF file.}
28
+  \item{gatk.normal.file}{GATK coverage file of normal control (optional if 
29
+log.ratio is provided - then it will be only used to filter low coverage 
30
+exons). Should be already GC-normalized. Needs to be either a file name 
31
+or data read with the readCoverageGatk function.}
32
+  \item{gatk.tumor.file}{GATK coverage file of tumor. Should be already 
33
+GC-normalized. Needs to be either a file name or data read with the 
34
+readCoverageGatk function.}
35
+  \item{log.ratio}{Copy number log-ratios for all exons in the coverage files. 
36
+If NULL, calculated based on coverage files.}
37
+  \item{seg.file}{Segmented data. Optional, to support matched SNP6 data. 
38
+If null, use coverage files or log.ratio to segment the data.  }
39
+  \item{seg.file.sdev}{If seg.file provided, the log-ratio standard deviation, 
40
+used to model likelihood of sub-clonal copy number events.}
41
+  \item{vcf.file}{VCF file, tested with MuTect output files.  Optional, but
42
+typically needed to select between local optima of similar likelihood. Can 
43
+also be a CollapsedVCF, read with the readVcf function. Requires a DB info 
44
+flag for dbSNP membership. The default fun.setPriorVcf function will also
45
+look for a Cosmic.CNT slot, containing the hits in the COSMIC database.
46
+Again, do not expect very useful results without a VCF file.}
30 47
   \item{genome}{Genome version, required for the readVcf function.}
31
-  \item{fun.filterVcf}{Function for filtering variants. Expected output is a list with elements vcf (CollapsedVCF), flag (TRUE/FALSE) and flag_comment (string). The flags will be added to the output data and can be used to warn users, for example when samples look too noisy. Default filter will remove variants flagged by MuTect, but will keep germline variants. If ran in matched normal mode, it will by default use somatic status of variants and filter non-somatic calls with allelic fraction significantly different from 0.5 in normal. }
32
-  \item{args.filterVcf}{Arguments for variant filtering function. Arguments vcf, tumor.id.in.vcf, coverage.cutoff and verbose are required in the filter function and are automatically set (do NOT set them here again).}
33
-  \item{fun.setPriorVcf}{Function to set prior for somatic status for each variant in the VCF.}
48
+  \item{sex}{Sex of sample. If "?", attempts to detect it from coverage.}
49
+  \item{fun.filterVcf}{Function for filtering variants. Expected output is a 
50
+list with elements vcf (CollapsedVCF), flag (TRUE/FALSE) and flag_comment 
51
+(string). The flags will be added to the output data and can be used to 
52
+warn users, for example when samples look too noisy. Default filter will 
53
+remove variants flagged by MuTect, but will keep germline variants. If 
54
+ran in matched normal mode, it will by default use somatic status of 
55
+variants and filter non-somatic calls with allelic fraction significantly 
56
+different from 0.5 in normal. }
57
+  \item{args.filterVcf}{Arguments for variant filtering function. Arguments 
58
+vcf, tumor.id.in.vcf, coverage.cutoff and verbose are required in the 
59
+filter function and are automatically set (do NOT set them here again).}
60
+  \item{fun.setPriorVcf}{Function to set prior for somatic status for each 
61
+variant in the VCF.}
34 62
   \item{args.setPriorVcf}{Arguments for somatic prior function.}
35
-  \item{fun.segmentation}{Function for segmenting the copy number log-ratios. Expected return value is a list with elements seg (the segmentation) and size (the size in bp for all segments).}
36
-  \item{args.segmentation}{Arguments for segmentation function. Arguments normal, tumor, log.ratio, plot.cnv, coverage.cutoff, sampleid, vcf, tumor.id.in.vcf, verbose are required in the segmentation function and automatically set (do NOT set them here again).}
63
+  \item{fun.segmentation}{Function for segmenting the copy number log-ratios. 
64
+Expected return value is a list with elements seg (the segmentation) and 
65
+size (the size in bp for all segments).}
66
+  \item{args.segmentation}{Arguments for segmentation function. Arguments 
67
+normal, tumor, log.ratio, plot.cnv, coverage.cutoff, sampleid, vcf, 
68
+tumor.id.in.vcf, verbose are required in the segmentation function and 
69
+automatically set (do NOT set them here again).}
37 70
   \item{fun.focal}{Function for identifying focal amplifications.}
38 71
   \item{args.focal}{Arguments for focal amplification function.}
39 72
   \item{sampleid}{Sample id, provided in output files etc.}
40 73
   \item{min.ploidy}{Minimum ploidy to be considered.}
41 74
   \item{max.ploidy}{Maximum ploidy to be considered.}
42
-  \item{test.num.copy}{Copy numbers tested in the grid search. Note that focal amplifications can have much higher copy numbers, but they will be labeled as subclonal (because they do not fit the integer copy numbers).}
75
+  \item{test.num.copy}{Copy numbers tested in the grid search. Note that focal 
76
+amplifications can have much higher copy numbers, but they will be labeled
77
+as subclonal (because they do not fit the integer copy numbers).}
43 78
   \item{test.purity}{Considered tumor purity values. }
44
-  \item{prior.purity}{Priors for purity if they are available. Only change when you know what you are doing.}
45
-  \item{max.candidate.solutions}{Number of local optima considered in optimization and variant fitting steps. If there are too many local optima, it will use specified number of top candidate solutions, but will also include all optima close to diploid, because silent genomes have often lots of local optima.}
46
-  \item{candidates}{Candidates to optimize from a previous run (return.object$candidates). If NULL, do 2D grid search and find local optima. }
47
-  \item{coverage.cutoff}{Minimum exon coverage in both normal and tumor. Exons with lower coverage are ingored. The cutoff choice depends on the expected purity and overall coverage. High purity samples might need a lower cutoff to call homozygous deletions. If an exon.weigh.file (below) is NOT specified, it is recommended to set a higher cutoff (e.g. 20) to remove noise from unreliable exon measurements. }
48
-  \item{max.non.clonal}{Maximum genomic fraction assigned to a subclonal copy number state.}
49
-  \item{iterations}{Maximum number of iterations in the Simulated Annealing copy number fit optimization.}
50
-  \item{log.ratio.calibration}{re-calibrate log-ratios in the window sd(log.ratio)*log.ratio.calibration.}
51
-  \item{gc.gene.file}{GC gene file, which assigns GC content and gene symbols to each exon in the coverage files. Used for generating gene level calls. First column in format CHR:START-END. Second column GC content (0 to 1). Third column gene symbol.}
52
-  \item{filter.lowhigh.gc.exons}{Quantile q (defines lower q and upper 1-q) for removing exons with outlier GC profile. Assuming that GC correction might not have been worked on those. Requires gc.gene.file.}
53
-  \item{filter.targeted.base}{Exclude exons with targeted base (size) smaller than this cutoff.}
54
-  \item{max.logr.sdev}{Flag noisy samples with segment log-ratio standard deviation larger than this. Assay specific and needs to be calibrated.}
79
+  \item{prior.purity}{Priors for purity if they are available. Only change 
80
+when you know what you are doing.}
81
+  \item{max.candidate.solutions}{Number of local optima considered in optimization 
82
+and variant fitting steps. If there are too many local optima, it will use
83
+the specified number of top candidate solutions, but will also include all 
84
+optima close to diploid, because silent genomes often have many local 
85
+optima.}
86
+  \item{candidates}{Candidates to optimize from a previous run 
87
+(return.object$candidates). 
88
+If NULL, do 2D grid search and find local optima. }
89
+  \item{coverage.cutoff}{Minimum exon coverage in both normal and tumor. Exons
90
+with lower coverage are ignored. The cutoff choice depends on the expected
91
+purity and overall coverage. High purity samples might need a lower cutoff
92
+to call homozygous deletions. If an exon.weight.file (below) is NOT 
93
+specified, it is recommended to set a higher cutoff (e.g. 20) to remove 
94
+noise from unreliable exon measurements. }
95
+  \item{max.non.clonal}{Maximum genomic fraction assigned to a subclonal copy 
96
+number state.}
97
+  \item{max.homozygous.loss}{Maximum genomic fraction assigned to homozygous loss. 
98
+This is set to a fairly high default value to not exclude correct
99
+solutions, especially in noisy segmentations. }
100
+  \item{iterations}{Maximum number of iterations in the Simulated Annealing copy 
101
+number fit optimization.}
102
+  \item{log.ratio.calibration}{Re-calibrate log-ratios within a window of 
103
+sd(log.ratio)*log.ratio.calibration.}
104
+  \item{gc.gene.file}{GC gene file, which assigns GC content and gene symbols 
105
+to each exon in the coverage files. Used for generating gene level calls. 
106
+First column in format CHR:START-END. Second column GC content (0 to 1). 
107
+Third column gene symbol.}
108
+  \item{filter.lowhigh.gc.exons}{Quantile q (defines lower q and upper 1-q) 
109
+for removing exons with outlier GC profile. Assuming that GC correction 
110
+might not have worked well on those. Requires gc.gene.file.}
111
+  \item{filter.targeted.base}{Exclude exons with targeted base (size) smaller 
112
+than this cutoff.}
113
+  \item{max.logr.sdev}{Flag noisy samples with segment log-ratio standard deviation 
114
+larger than this. Assay specific and needs to be calibrated.}
115
+  \item{max.segments}{Flag noisy samples with a large number of segments. Assay 
116
+specific and needs to be calibrated.}
55 117
   \item{plot.cnv}{Generate segmentation plots.}
56 118
   \item{verbose}{Verbose output.}
57
-  \item{post.optimize}{Optimize purity using final SCNA-fit and SNVs. This might take a long time when lots of SNVs need to be fitted, but will result in a slightly more accurate purity, especially for rather silent genomes or very low purities. Otherwise, it will just use the purity determined via the SCNA-fit.}
119
+  \item{post.optimize}{Optimize purity using final SCNA-fit and SNVs. This 
120
+might take a long time when lots of SNVs need to be fitted, but will 
121
+typically result in a slightly more accurate purity, especially for rather
122
+silent genomes or very low purities. Otherwise, it will just use the 
123
+purity determined via the SCNA-fit.}
58 124
   \item{\dots}{Additional parameters passed to the segmentation function.}
59 125
 }
60 126
 
... ...
@@ -69,19 +135,24 @@ dbSNP and optionally COSMIC annotation.}
69 135
 
70 136
 
71 137
 \examples{
72
-gatk.normal.file <- system.file("extdata", "example_normal.txt", package="PureCN")
73
-gatk.tumor.file <- system.file("extdata", "example_tumor.txt", package="PureCN")
74
-vcf.file <- system.file("extdata", "example_vcf.vcf", package="PureCN")
75
-gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt", package="PureCN")
138
+gatk.normal.file <- system.file("extdata", "example_normal.txt", 
139
+    package="PureCN")
140
+gatk.tumor.file <- system.file("extdata", "example_tumor.txt", 
141
+    package="PureCN")
142
+vcf.file <- system.file("extdata", "example_vcf.vcf", 
143
+    package="PureCN")
144
+gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt", 
145
+    package="PureCN")
76 146
 
77
-# Speed-up the runAbsoluteCN by using the stored grid-search 
147
+# Speed up the runAbsoluteCN call by using the stored grid-search 
78 148
 # (purecn.example.output$candidates).
79 149
 data(purecn.example.output)
80 150
 
81 151
 # The max.candidate.solutions parameter is set to a very low value only to
82 152
 # speed-up this example.  This is not a good idea for real samples.
83 153
 
84
-ret <-runAbsoluteCN(gatk.normal.file=gatk.normal.file, gatk.tumor.file=gatk.tumor.file, 
85
-   candidates=purecn.example.output$candidates, max.candidate.solutions=2,
154
+ret <- runAbsoluteCN(gatk.normal.file=gatk.normal.file, 
155
+    gatk.tumor.file=gatk.tumor.file, 
156
+    candidates=purecn.example.output$candidates, max.candidate.solutions=2,
86 157
     vcf.file=vcf.file, sampleid='Sample1', gc.gene.file=gc.gene.file)
87 158
 }
... ...
@@ -61,9 +61,10 @@ following steps on GC corrected data. All other assay-specific biases such as
61 61
 mappability or exon length are mitigated by using a control sample (GC bias is
62 62
 commonly library specific).
63 63
 
64
-PureCN currently assumes a completely diploid normal genome, this means the user
65
-is expected to remove any non-diploid chromosomes (i.e. XY for males and Y for
66
-females).
64
+PureCN currently assumes a completely diploid normal genome. It tries to
65
+detect the sex of samples by calculating the coverage ratio of chromosomes X
66
+and Y and will then remove any non-diploid chromosomes (i.e. XY for males and Y
67
+for females).
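
If automatic detection is not desired, the sex can instead be passed directly
to runAbsoluteCN via its new sex argument; a minimal sketch, reusing the
example data from the runAbsoluteCN help page and assuming a female sample:

library(PureCN)
gatk.normal.file <- system.file("extdata", "example_normal.txt",
    package="PureCN")
gatk.tumor.file <- system.file("extdata", "example_tumor.txt",
    package="PureCN")
vcf.file <- system.file("extdata", "example_vcf.vcf",
    package="PureCN")
gc.gene.file <- system.file("extdata", "example_gc.gene.file.txt",
    package="PureCN")
data(purecn.example.output)
# sex="F" skips the coverage-based detection; only chrY is then
# treated as non-diploid and removed
ret <- runAbsoluteCN(gatk.normal.file=gatk.normal.file,
    gatk.tumor.file=gatk.tumor.file, sex="F",
    candidates=purecn.example.output$candidates, max.candidate.solutions=2,
    vcf.file=vcf.file, sampleid='Sample1', gc.gene.file=gc.gene.file)
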
67 68
 
68 69
 We load a few example files:
69 70
 <<example_files>>=