... | ... |
@@ -2,8 +2,8 @@ Package: PureCN |
2 | 2 |
Type: Package |
3 | 3 |
Title: Copy number calling and SNV classification using |
4 | 4 |
targeted short read sequencing |
5 |
-Version: 1.9.18 |
|
6 |
-Date: 2018-01-14 |
|
5 |
+Version: 1.9.19 |
|
6 |
+Date: 2018-01-15 |
|
7 | 7 |
Authors@R: c(person("Markus", "Riester", role=c("aut", "cre"), |
8 | 8 |
email="[email protected]"), |
9 | 9 |
person("Angad P.", "Singh", role="aut")) |
... | ... |
@@ -40,6 +40,7 @@ Imports: |
40 | 40 |
VGAM, |
41 | 41 |
edgeR, |
42 | 42 |
tools, |
43 |
+ rhdf5, |
|
43 | 44 |
limma |
44 | 45 |
Suggests: |
45 | 46 |
PSCBS, |
... | ... |
@@ -115,6 +115,7 @@ importFrom(graphics,strwidth) |
115 | 115 |
importFrom(graphics,symbols) |
116 | 116 |
importFrom(graphics,text) |
117 | 117 |
importFrom(gridExtra,grid.arrange) |
118 |
+importFrom(rhdf5,H5Fopen) |
|
118 | 119 |
importFrom(rtracklayer,import) |
119 | 120 |
importFrom(stats,C) |
120 | 121 |
importFrom(stats,aggregate) |
... | ... |
@@ -16,6 +16,7 @@ |
16 | 16 |
#' @param index.file The bai index. This is expected without the .bai file |
17 | 17 |
#' suffix, see \code{?scanBam}. |
18 | 18 |
#' @param keep.duplicates Keep or remove duplicated reads. |
19 |
+#' @param ... Additional parameters passed to \code{ScanBamParam}. |
|
19 | 20 |
#' @return Returns total and average coverage by intervals. |
20 | 21 |
#' @author Markus Riester |
21 | 22 |
#' @seealso \code{\link{preprocessIntervals} |
... | ... |
@@ -37,7 +38,8 @@ |
37 | 38 |
#' @importFrom Rsamtools headerTabix ScanBamParam scanBamFlag |
38 | 39 |
#' scanBam scanFa scanFaIndex TabixFile |
39 | 40 |
calculateBamCoverageByInterval <- function(bam.file, interval.file, |
40 |
- output.file = NULL, index.file = bam.file, keep.duplicates = FALSE) { |
|
41 |
+ output.file = NULL, index.file = bam.file, keep.duplicates = FALSE, |
|
42 |
+ ...) { |
|
41 | 43 |
intervalGr <- readCoverageFile(interval.file) |
42 | 44 |
|
43 | 45 |
param <- ScanBamParam(what = c("pos", "qwidth", "flag"), |
... | ... |
@@ -46,7 +48,8 @@ calculateBamCoverageByInterval <- function(bam.file, interval.file, |
46 | 48 |
isNotPassingQualityControls = FALSE, |
47 | 49 |
isSecondaryAlignment = FALSE, |
48 | 50 |
isDuplicate = NA |
49 |
- ) |
|
51 |
+ ), |
|
52 |
+ ... |
|
50 | 53 |
) |
51 | 54 |
|
52 | 55 |
xAll <- scanBam(bam.file, index = index.file, param = param) |
... | ... |
@@ -5,10 +5,12 @@ |
5 | 5 |
#' |
6 | 6 |
#' @param file Target coverage file. |
7 | 7 |
#' @param format File format. If missing, derived from the file |
8 |
-#' extension. Currently only GATK DepthofCoverage and CNVkit formats |
|
9 |
-#' supported. |
|
8 |
+#' extension. Currently GATK3 DepthofCoverage, GATK4 CollectFragmentCounts |
|
9 |
+#' (hdf5), and CNVkit formats supported. |
|
10 | 10 |
#' @param zero Start position is 0-based. Default is \code{FALSE} |
11 | 11 |
#' for GATK, \code{TRUE} for BED file based intervals. |
12 |
+#' @param read.length For output formats which do not provide both counts |
|
13 |
+#' and total coverages, approximate them using the specified read length. |
|
12 | 14 |
#' @return A \code{data.frame} with the parsed coverage information. |
13 | 15 |
#' @author Markus Riester |
14 | 16 |
#' @seealso \code{\link{calculateBamCoverageByInterval}} |
... | ... |
@@ -19,13 +21,16 @@ |
19 | 21 |
#' coverage <- readCoverageFile(tumor.coverage.file) |
20 | 22 |
#' |
21 | 23 |
#' @importFrom tools file_ext |
24 |
+#' @importFrom rhdf5 H5Fopen |
|
22 | 25 |
#' @export readCoverageFile |
23 |
-readCoverageFile <- function(file, format, zero=NULL) { |
|
26 |
+readCoverageFile <- function(file, format, zero=NULL, read.length = 100) { |
|
24 | 27 |
if (missing(format)) format <- .getFormat(file) |
25 | 28 |
if (format %in% c("cnn", "cnr")) { |
26 | 29 |
targetCoverage <- .readCoverageCnn(file, zero, format) |
30 |
+ } else if (format %in% c("hdf5")) { |
|
31 |
+ targetCoverage <- .readCoverageGatk4(file, zero, format, read.length) |
|
27 | 32 |
} else { |
28 |
- targetCoverage <- .readCoverageGatk(file, zero) |
|
33 |
+ targetCoverage <- .readCoverageGatk3(file, zero) |
|
29 | 34 |
} |
30 | 35 |
.checkLowCoverage(targetCoverage) |
31 | 36 |
.checkIntervals(targetCoverage) |
... | ... |
@@ -34,10 +39,11 @@ readCoverageFile <- function(file, format, zero=NULL) { |
34 | 39 |
.getFormat <- function(file) { |
35 | 40 |
ext <- file_ext(file) |
36 | 41 |
if (ext %in% c("cnn", "cnr")) return(ext) |
37 |
- "GATK" |
|
38 |
-} |
|
42 |
+ if (ext %in% c("hdf5", "h5")) return("hdf5") |
|
43 |
+ "GATK" |
|
44 |
+} |
|
39 | 45 |
|
40 |
-.readCoverageGatk <- function(file, zero) { |
|
46 |
+.readCoverageGatk3 <- function(file, zero) { |
|
41 | 47 |
if (!is.null(zero)) flog.warn("zero ignored for GATK coverage files.") |
42 | 48 |
inputCoverage <- utils::read.table(file, header = TRUE) |
43 | 49 |
if (is.null(inputCoverage$total_coverage)) inputCoverage$total_coverage <- NA |
... | ... |
@@ -55,6 +61,20 @@ readCoverageFile <- function(file, format, zero=NULL) { |
55 | 61 |
targetCoverage |
56 | 62 |
} |
57 | 63 |
|
64 |
+.readCoverageGatk4 <- function(file, zero, format, read.length) { |
|
65 |
+ if (!is.null(zero)) flog.warn("zero ignored for GATK coverage files.") |
|
66 |
+ x <- H5Fopen(file) |
|
67 |
+ intervals <- data.frame(x$intervals$transposed_index_start_end) |
|
68 |
+ intervals[, 1] <- x$intervals$indexed_contig_names[intervals[, 1] + 1] |
|
69 |
+ targetCoverage <- GRanges(intervals[, 1], IRanges(intervals[, 2], intervals[, 3])) |
|
70 |
+ targetCoverage$counts <- x$counts$values[,1] |
|
71 |
+ # TODO |
|
72 |
+ targetCoverage$coverage <- targetCoverage$counts * read.length * 2 |
|
73 |
+ targetCoverage <- .addAverageCoverage(targetCoverage) |
|
74 |
+ targetCoverage$on.target <- TRUE |
|
75 |
+ targetCoverage |
|
76 |
+} |
|
77 |
+ |
|
58 | 78 |
.addAverageCoverage <- function(x) { |
59 | 79 |
x$average.coverage <- x$coverage/width(x) |
60 | 80 |
x |
... | ... |
@@ -20,6 +20,8 @@ option_list <- list( |
20 | 20 |
make_option(c("--keepduplicates"), action = "store_true", |
21 | 21 |
default = formals(PureCN::calculateBamCoverageByInterval)$keep.duplicates, |
22 | 22 |
help = "Count reads marked as duplicates [default %default]"), |
23 |
+ make_option(c("--removemapq0"), action = "store_true", default = FALSE, |
|
24 |
+ help = "Not count reads marked with mapping quality 0 [default %default]"), |
|
23 | 25 |
make_option(c("--outdir"), action = "store", type = "character", |
24 | 26 |
default = NULL, |
25 | 27 |
help = "Output directory to which results should be written"), |
... | ... |
@@ -73,7 +75,7 @@ interval.file <- normalizePath(interval.file, mustWork = TRUE) |
73 | 75 |
} |
74 | 76 |
|
75 | 77 |
getCoverageBams <- function(bamFiles, indexFiles, outdir, interval.file, |
76 |
- force = FALSE, cpu = 1, keep.duplicates = FALSE) { |
|
78 |
+ force = FALSE, cpu = 1, keep.duplicates = FALSE, removemapq0 = FALSE) { |
|
77 | 79 |
|
78 | 80 |
bamFiles <- bamFiles |
79 | 81 |
indexFiles <- indexFiles |
... | ... |
@@ -102,7 +104,8 @@ getCoverageBams <- function(bamFiles, indexFiles, outdir, interval.file, |
102 | 104 |
} else { |
103 | 105 |
PureCN::calculateBamCoverageByInterval(bam.file = bam.file, |
104 | 106 |
interval.file = interval.file, output.file = output.file, |
105 |
- index.file = index.file, keep.duplicates = keep.duplicates) |
|
107 |
+ index.file = index.file, keep.duplicates = keep.duplicates, |
|
108 |
+ mapqFilter = if (removemapq0) 1 else NA) |
|
106 | 109 |
} |
107 | 110 |
output.file |
108 | 111 |
} |
... | ... |
@@ -121,7 +124,8 @@ getCoverageBams <- function(bamFiles, indexFiles, outdir, interval.file, |
121 | 124 |
coverageFiles <- NULL |
122 | 125 |
indexFiles <- NULL |
123 | 126 |
|
124 |
-flog.info("Loading PureCN...") |
|
127 |
+flog.info("Loading PureCN %s...", Biobase::package.version("PureCN")) |
|
128 |
+ |
|
125 | 129 |
suppressPackageStartupMessages(library(PureCN)) |
126 | 130 |
|
127 | 131 |
if (!is.null(bam.file)) { |
... | ... |
@@ -142,7 +146,7 @@ if (!is.null(bam.file)) { |
142 | 146 |
} |
143 | 147 |
|
144 | 148 |
coverageFiles <- getCoverageBams(bamFiles, indexFiles, outdir, |
145 |
- interval.file, force, opt$cpu, opt$keepduplicates) |
|
149 |
+ interval.file, force, opt$cpu, opt$keepduplicates, opt$removemapq0) |
|
146 | 150 |
} |
147 | 151 |
|
148 | 152 |
### GC-normalize coverage ----------------------------------------------------- |
... | ... |
@@ -58,7 +58,7 @@ if (is.null(genome)) stop("Need --genome") |
58 | 58 |
file.path(outdir, paste0(prefix, assay, genome, suffix)) |
59 | 59 |
} |
60 | 60 |
|
61 |
-flog.info("Loading PureCN...") |
|
61 |
+flog.info("Loading PureCN %s...", Biobase::package.version("PureCN")) |
|
62 | 62 |
if (length(coverageFiles)) { |
63 | 63 |
output.file <- .getFileName(outdir,"normalDB",".rds", assay, genome) |
64 | 64 |
if (file.exists(output.file) && !opt$force) { |
... | ... |
@@ -149,7 +149,7 @@ if (!is.null(file.rds) && file.exists(file.rds)) { |
149 | 149 |
|
150 | 150 |
normalizePath(dirname(out), mustWork = TRUE) |
151 | 151 |
|
152 |
-flog.info("Loading PureCN...") |
|
152 |
+flog.info("Loading PureCN %s...", Biobase::package.version("PureCN")) |
|
153 | 153 |
suppressPackageStartupMessages(library(PureCN)) |
154 | 154 |
library(futile.logger) |
155 | 155 |
|
... | ... |
@@ -5,7 +5,7 @@ |
5 | 5 |
\title{Function to calculate coverage from BAM file} |
6 | 6 |
\usage{ |
7 | 7 |
calculateBamCoverageByInterval(bam.file, interval.file, output.file = NULL, |
8 |
- index.file = bam.file, keep.duplicates = FALSE) |
|
8 |
+ index.file = bam.file, keep.duplicates = FALSE, ...) |
|
9 | 9 |
} |
10 | 10 |
\arguments{ |
11 | 11 |
\item{bam.file}{Filename of a BAM file.} |
... | ... |
@@ -20,6 +20,8 @@ the \code{\link{readCoverageFile}} function.} |
20 | 20 |
suffix, see \code{?scanBam}.} |
21 | 21 |
|
22 | 22 |
\item{keep.duplicates}{Keep or remove duplicated reads.} |
23 |
+ |
|
24 |
+\item{...}{Additional parameters passed to \code{ScanBamParam}.} |
|
23 | 25 |
} |
24 | 26 |
\value{ |
25 | 27 |
Returns total and average coverage by intervals. |
... | ... |
@@ -4,17 +4,20 @@ |
4 | 4 |
\alias{readCoverageFile} |
5 | 5 |
\title{Read coverage file} |
6 | 6 |
\usage{ |
7 |
-readCoverageFile(file, format, zero = NULL) |
|
7 |
+readCoverageFile(file, format, zero = NULL, read.length = 100) |
|
8 | 8 |
} |
9 | 9 |
\arguments{ |
10 | 10 |
\item{file}{Target coverage file.} |
11 | 11 |
|
12 | 12 |
\item{format}{File format. If missing, derived from the file |
13 |
-extension. Currently only GATK DepthofCoverage and CNVkit formats |
|
14 |
-supported.} |
|
13 |
+extension. Currently GATK3 DepthofCoverage, GATK4 CollectFragmentCounts |
|
14 |
+(hdf5), and CNVkit formats supported.} |
|
15 | 15 |
|
16 | 16 |
\item{zero}{Start position is 0-based. Default is \code{FALSE} |
17 | 17 |
for GATK, \code{TRUE} for BED file based intervals.} |
18 |
+ |
|
19 |
+\item{read.length}{For output formats which do not provide both counts |
|
20 |
+and total coverages, approximate them using the specified read length.} |
|
18 | 21 |
} |
19 | 22 |
\value{ |
20 | 23 |
A \code{data.frame} with the parsed coverage information. |
... | ... |
@@ -53,3 +53,16 @@ test_that("CNVkit *cnr example data is parsed correctly", { |
53 | 53 |
expect_equal(coverage$on.target, c(FALSE, FALSE, FALSE, FALSE, |
54 | 54 |
TRUE)) |
55 | 55 |
}) |
56 |
+ |
|
57 |
+test_that("GATK4 *hdf5 example data is parsed correctly", { |
|
58 |
+ coverageFile <- system.file("extdata", "example_normal5.hdf5", |
|
59 |
+ package = "PureCN") |
|
60 |
+ coverage <- readCoverageFile(coverageFile) |
|
61 |
+ expect_equal(length(coverage), 10) |
|
62 |
+ expect_equal(head(start(coverage)), |
|
63 |
+ c(3598833, 3599562, 3607444, 3624039, 3638537, 3639872)) |
|
64 |
+ expect_equal(head(coverage$counts), |
|
65 |
+ c(127, 305, 78, 699, 566, 344)) |
|
66 |
+ |
|
67 |
+ expect_equal(head(coverage$on.target), rep(TRUE, 6)) |
|
68 |
+}) |