Browse code

Added support for gzipped output files (closes #106).

Markus Riester authored on 06/10/2019 21:44:29
Showing 10 changed files

... ...
@@ -2,8 +2,8 @@ Package: PureCN
2 2
 Type: Package
3 3
 Title: Copy number calling and SNV classification using
4 4
     targeted short read sequencing
5
-Version: 1.15.22
6
-Date: 2019-10-05
5
+Version: 1.15.23
6
+Date: 2019-10-06
7 7
 Authors@R: c(person("Markus", "Riester",
8 8
                     role = c("aut", "cre"),
9 9
                     email = "[email protected]",
... ...
@@ -86,6 +86,7 @@ importFrom(VGAM,dbetabinom.ab)
86 86
 importFrom(VGAM,vglm)
87 87
 importFrom(data.table,data.table)
88 88
 importFrom(data.table,fread)
89
+importFrom(data.table,fwrite)
89 90
 importFrom(futile.logger,appender.tee)
90 91
 importFrom(futile.logger,flog.appender)
91 92
 importFrom(futile.logger,flog.debug)
... ...
@@ -166,4 +167,3 @@ importFrom(utils,read.csv)
166 167
 importFrom(utils,read.delim)
167 168
 importFrom(utils,tail)
168 169
 importFrom(utils,write.csv)
169
-importFrom(utils,write.table)
... ...
@@ -33,6 +33,9 @@ SIGNIFICANT USER-VISIBLE CHANGES
33 33
       weights when available  
34 34
     o Changed default of min.target.width in preprocessIntervals from 10 to 100
35 35
       (#73)  
36
+    o replaced write.table with data.table::fwrite to automatically support
37
+      producing gzipped output (requires data.table 1.12.4, #106)
38
+    o Coverage.R now gzips BAM file coverage (requires data.table 1.12.4, #106)
36 39
 
37 40
 BUGFIXES
38 41
 
... ...
@@ -83,7 +83,7 @@ calculateBamCoverageByInterval <- function(bam.file, interval.file,
83 83
         on_target = intervalGr$on.target,
84 84
         duplication_rate = intervalGr$duplication.rate
85 85
     )
86
-    write.table(tmp, file = output.file, row.names = FALSE, quote = FALSE)
86
+    fwrite(tmp, file = output.file, row.names = FALSE, quote = FALSE)
87 87
 }
88 88
 
89 89
 .filterDuplicates <- function(x) {
... ...
@@ -88,7 +88,7 @@ old_method = FALSE) {
88 88
             Target = as.character(ret$weights), 
89 89
             weights = ret$weights$weights)
90 90
 
91
-        write.table(ret_output, file = interval.weight.file, row.names = FALSE, 
91
+        fwrite(ret_output, file = interval.weight.file, row.names = FALSE, 
92 92
                     quote = FALSE, sep = "\t")
93 93
     }
94 94
     if (plot) .plotIntervalWeights(lrs.sd, width(tumor.coverage[[1]]), 
... ...
@@ -42,7 +42,7 @@ globalVariables(names=c("..level.."))
42 42
 #'             coord_trans
43 43
 #' @importFrom gridExtra grid.arrange
44 44
 #' @importFrom stats loess lm predict
45
-#' @importFrom utils write.table
45
+#' @importFrom data.table fwrite
46 46
 correctCoverageBias <- function(coverage.file, interval.file,
47 47
 output.file = NULL, plot.bias = FALSE, plot.max.density = 50000, 
48 48
 output.qc.file = NULL) {
... ...
@@ -105,7 +105,8 @@ output.qc.file = NULL) {
105 105
     colnames(qc)[5:10] <- paste0("mom.", c("raw", "raw", "post.gc", "post.gc", 
106 106
                              "post.reptiming", "post.reptiming"),
107 107
                              ".", rep(c("ontarget", "offtarget"),3))
108
-    write.table(qc, file = output.qc.file, row.names = FALSE, quote = FALSE)
108
+    fwrite(qc, file = output.qc.file, row.names = FALSE, quote = FALSE,
109
+        sep = " ")
109 110
 }
110 111
     
111 112
 .createCoverageGgplot <- function(raw, normalized, plot.max.density, x, log = FALSE) {
... ...
@@ -320,7 +320,7 @@ calculateGCContentByInterval <- function() {
320 320
         Gene=interval.gr$Gene,
321 321
         on_target=interval.gr$on.target
322 322
     )    
323
-    write.table(tmp, file=output.file, row.names=FALSE, quote=FALSE, sep="\t")
323
+    fwrite(tmp, file = output.file, row.names = FALSE, quote = FALSE, sep = "\t")
324 324
 }
325 325
 
326 326
 .checkSeqlengths <- function(ref, x) {
... ...
@@ -129,10 +129,10 @@ processMultipleSamples <- function(tumor.coverage.files, sampleids, normalDB,
129 129
     rownames(lrsm) <- NULL
130 130
     lrsm[idx.enough.markers,]
131 131
     #transform to DNAcopy format
132
-    m <- data.table::melt(lrsm, id.vars=1:5)
132
+    m <- data.table::melt(data.table(lrsm), id.vars=1:5)
133 133
     m <- m[, c(6,1,3,4,5,7)]
134 134
     colnames(m) <- c("ID", "chrom", "loc.start", "loc.end", "num.mark", "seg.mean")
135
-    m
135
+    data.frame(m)
136 136
 }
137 137
 
138 138
 .add_weights_to_normaldb <- function(interval.weight.file, normalDB = NULL) {
... ...
@@ -72,6 +72,14 @@ interval.file <- normalizePath(interval.file, mustWork = TRUE)
72 72
     files
73 73
 }
74 74
 
75
+checkDataTableVersion <- function() {
76
+    if (compareVersion(package.version("data.table"), "1.12.4") < 0) {
77
+        flog.fatal("data.table package is outdated. >= 1.12.4 required")
78
+        q(status = 0)
79
+    }
80
+    return(TRUE)
81
+}        
82
+
75 83
 getCoverageBams <- function(bamFiles, indexFiles, outdir, interval.file, 
76 84
     force = FALSE, keep.duplicates = FALSE, removemapq0 = FALSE) {
77 85
 
... ...
@@ -83,7 +91,8 @@ getCoverageBams <- function(bamFiles, indexFiles, outdir, interval.file,
83 91
 
84 92
     .getCoverageBam <- function(bam.file, index.file = NULL, outdir,
85 93
         interval.file, force) {
86
-        output.file <- file.path(outdir,  gsub(".bam$", "_coverage.txt", 
94
+        checkDataTableVersion()
95
+        output.file <- file.path(outdir,  gsub(".bam$", "_coverage.txt.gz", 
87 96
             basename(bam.file)))
88 97
         futile.logger::flog.info("Processing %s...", output.file)
89 98
         if (!is.null(index.file)) {
... ...
@@ -160,10 +169,11 @@ if (!is.null(bam.file)) {
160 169
 
161 170
 ### GC-normalize coverage -----------------------------------------------------
162 171
 .gcNormalize <- function(gatk.coverage, interval.file, outdir, force) {
163
-    output.file <- file.path(outdir,  gsub(".txt$|_interval_summary",
164
-        "_loess.txt", basename(gatk.coverage)))
165
-    outpng.file <- sub("txt$", "png", output.file)
166
-    output.qc.file <- sub(".txt$", "_qc.txt", output.file)
172
+    checkDataTableVersion()
173
+    output.file <- file.path(outdir,  gsub(".txt$|.txt.gz$|_interval_summary",
174
+        "_loess.txt.gz", basename(gatk.coverage)))
175
+    outpng.file <- sub("txt.gz$", "png", output.file)
176
+    output.qc.file <- sub(".txt.gz$", "_qc.txt", output.file)
167 177
 
168 178
     if (file.exists(output.file) && !force) {
169 179
         flog.info("%s exists. Skipping... (--force will overwrite)", output.file)
... ...
@@ -227,7 +227,7 @@ To build a normal database for coverage normalization, copy the paths to all
227 227
 GC-normalized normal coverage files in a single text file, line-by-line:
228 228
 
229 229
 ```
230
-ls -a normal*loess.txt | cat > example_normal.list
230
+ls -a normal*loess.txt.gz | cat > example_normal.list
231 231
 
232 232
 # From already GC-normalized files
233 233
 $ Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
... ...
@@ -277,7 +277,7 @@ mkdir $OUT/$SAMPLEID
277 277
 
278 278
 # Without a matched normal (minimal test run)
279 279
 $ Rscript $PURECN/PureCN.R --out $OUT/$SAMPLEID \
280
-    --tumor $OUT/$SAMPLEID/${SAMPLEID}_coverage_loess.txt \
280
+    --tumor $OUT/$SAMPLEID/${SAMPLEID}_coverage_loess.txt.gz \
281 281
     --sampleid $SAMPLEID \
282 282
     --vcf ${SAMPLEID}_mutect.vcf \
283 283
     --normaldb $OUT_REF/normalDB_hg19.rds \
... ...
@@ -286,7 +286,7 @@ $ Rscript $PURECN/PureCN.R --out $OUT/$SAMPLEID \
286 286
 
287 287
 # Production pipeline run
288 288
 $ Rscript $PURECN/PureCN.R --out $OUT/$SAMPLEID \
289
-    --tumor $OUT/$SAMPLEID/${SAMPLEID}_coverage_loess.txt \
289
+    --tumor $OUT/$SAMPLEID/${SAMPLEID}_coverage_loess.txt.gz \
290 290
     --sampleid $SAMPLEID \
291 291
     --vcf ${SAMPLEID}_mutect.vcf \
292 292
     --statsfile ${SAMPLEID}_mutect_stats.txt \
... ...
@@ -300,8 +300,8 @@ $ Rscript $PURECN/PureCN.R --out $OUT/$SAMPLEID \
300 300
 # With a matched normal (test run; for production pipelines we recommend the
301 301
 # unmatched workflow described above)
302 302
 $ Rscript $PURECN/PureCN.R --out $OUT/$SAMPLEID \
303
-    --tumor $OUT/$SAMPLEID/${SAMPLEID}_coverage_loess.txt \
304
-    --normal $OUT/$SAMPLEID/${SAMPLEID_NORMAL}_coverage_loess.txt \
303
+    --tumor $OUT/$SAMPLEID/${SAMPLEID}_coverage_loess.txt.gz \
304
+    --normal $OUT/$SAMPLEID/${SAMPLEID_NORMAL}_coverage_loess.txt.gz \
305 305
     --sampleid $SAMPLEID \
306 306
     --vcf ${SAMPLEID}_mutect.vcf \
307 307
     --normaldb $OUT_REF/normalDB_hg19.rds \