Browse code

Added support for GATK4 CollectFragmentCounts in readCoverageFile.

Markus Riester authored on 16/01/2018 04:56:56
Showing 11 changed files

... ...
@@ -2,8 +2,8 @@ Package: PureCN
2 2
 Type: Package
3 3
 Title: Copy number calling and SNV classification using
4 4
     targeted short read sequencing
5
-Version: 1.9.18
6
-Date: 2018-01-14
5
+Version: 1.9.19
6
+Date: 2018-01-15
7 7
 Authors@R: c(person("Markus", "Riester", role=c("aut", "cre"),
8 8
         email="[email protected]"),
9 9
     person("Angad P.", "Singh", role="aut"))
... ...
@@ -40,6 +40,7 @@ Imports:
40 40
     VGAM,
41 41
     edgeR,
42 42
     tools,
43
+    rhdf5,
43 44
     limma
44 45
 Suggests:
45 46
     PSCBS,
... ...
@@ -115,6 +115,7 @@ importFrom(graphics,strwidth)
115 115
 importFrom(graphics,symbols)
116 116
 importFrom(graphics,text)
117 117
 importFrom(gridExtra,grid.arrange)
118
+importFrom(rhdf5,H5Fopen)
118 119
 importFrom(rtracklayer,import)
119 120
 importFrom(stats,C)
120 121
 importFrom(stats,aggregate)
... ...
@@ -16,6 +16,7 @@
16 16
 #' @param index.file The bai index. This is expected without the .bai file
17 17
 #' suffix, see \code{?scanBam}.
18 18
 #' @param keep.duplicates Keep or remove duplicated reads.
19
+#' @param ... Additional parameters passed to \code{ScanBamParam}.
19 20
 #' @return Returns total and average coverage by intervals.
20 21
 #' @author Markus Riester
21 22
 #' @seealso \code{\link{preprocessIntervals}
... ...
@@ -37,7 +38,8 @@
37 38
 #' @importFrom Rsamtools headerTabix ScanBamParam scanBamFlag
38 39
 #'             scanBam scanFa scanFaIndex TabixFile
39 40
 calculateBamCoverageByInterval <- function(bam.file, interval.file,
40
-    output.file = NULL, index.file = bam.file, keep.duplicates = FALSE) {
41
+    output.file = NULL, index.file = bam.file, keep.duplicates = FALSE,
42
+    ...) {
41 43
     intervalGr <- readCoverageFile(interval.file)
42 44
 
43 45
     param <- ScanBamParam(what = c("pos", "qwidth", "flag"),
... ...
@@ -46,7 +48,8 @@ calculateBamCoverageByInterval <- function(bam.file, interval.file,
46 48
                              isNotPassingQualityControls = FALSE,
47 49
                              isSecondaryAlignment = FALSE,
48 50
                              isDuplicate = NA
49
-                     )
51
+                     ),
52
+                ...
50 53
              )
51 54
 
52 55
     xAll <- scanBam(bam.file, index = index.file, param = param)
... ...
@@ -5,10 +5,12 @@
5 5
 #' 
6 6
 #' @param file Target coverage file.
7 7
 #' @param format File format. If missing, derived from the file 
8
-#' extension. Currently only GATK DepthofCoverage and CNVkit formats 
9
-#' supported.
8
+#' extension. Currently GATK3 DepthofCoverage, GATK4 CollectFragmentCounts 
9
+#' (hdf5), and CNVkit formats supported.
10 10
 #' @param zero Start position is 0-based. Default is \code{FALSE}
11 11
 #' for GATK, \code{TRUE} for BED file based intervals.
12
+#' @param read.length For output formats which do not provide both counts 
13
+#' and total coverages, approximate them using the specified read length.
12 14
 #' @return A \code{data.frame} with the parsed coverage information.
13 15
 #' @author Markus Riester
14 16
 #' @seealso \code{\link{calculateBamCoverageByInterval}}
... ...
@@ -19,13 +21,16 @@
19 21
 #' coverage <- readCoverageFile(tumor.coverage.file)
20 22
 #' 
21 23
 #' @importFrom tools file_ext
24
+#' @importFrom rhdf5 H5Fopen
22 25
 #' @export readCoverageFile
23
-readCoverageFile <- function(file, format, zero=NULL) {
26
+readCoverageFile <- function(file, format, zero=NULL, read.length = 100) {
24 27
     if (missing(format)) format <- .getFormat(file)
25 28
     if (format %in% c("cnn", "cnr")) {
26 29
         targetCoverage <- .readCoverageCnn(file, zero, format)
30
+    } else if (format %in% c("hdf5")) {
31
+        targetCoverage <- .readCoverageGatk4(file, zero, format, read.length)
27 32
     } else {
28
-        targetCoverage <- .readCoverageGatk(file, zero)
33
+        targetCoverage <- .readCoverageGatk3(file, zero)
29 34
     }
30 35
     .checkLowCoverage(targetCoverage)
31 36
     .checkIntervals(targetCoverage)
... ...
@@ -34,10 +39,11 @@ readCoverageFile <- function(file, format, zero=NULL) {
34 39
 .getFormat <- function(file) {
35 40
     ext <- file_ext(file)
36 41
     if (ext %in% c("cnn", "cnr")) return(ext)
37
-    "GATK"    
38
-}    
42
+    if (ext %in% c("hdf5", "h5")) return("hdf5")
43
+    "GATK"
44
+}
39 45
 
40
-.readCoverageGatk <- function(file, zero) {
46
+.readCoverageGatk3 <- function(file, zero) {
41 47
     if (!is.null(zero)) flog.warn("zero ignored for GATK coverage files.")
42 48
     inputCoverage <- utils::read.table(file, header = TRUE)
43 49
     if (is.null(inputCoverage$total_coverage)) inputCoverage$total_coverage <- NA
... ...
@@ -55,6 +61,20 @@ readCoverageFile <- function(file, format, zero=NULL) {
55 61
     targetCoverage
56 62
 }
57 63
 
64
+.readCoverageGatk4 <- function(file, zero, format, read.length) {
65
+    if (!is.null(zero)) flog.warn("zero ignored for GATK coverage files.")
66
+    x <- H5Fopen(file)
67
+    intervals <- data.frame(x$intervals$transposed_index_start_end)
68
+    intervals[, 1] <- x$intervals$indexed_contig_names[intervals[, 1] + 1]
69
+    targetCoverage <- GRanges(intervals[, 1], IRanges(intervals[, 2], intervals[, 3]))
70
+    targetCoverage$counts <- x$counts$values[,1]
71
+    # TODO
72
+    targetCoverage$coverage <- targetCoverage$counts * read.length * 2
73
+    targetCoverage <- .addAverageCoverage(targetCoverage)
74
+    targetCoverage$on.target <- TRUE
75
+    targetCoverage
76
+}
77
+
58 78
 .addAverageCoverage <- function(x) {
59 79
     x$average.coverage <- x$coverage/width(x)
60 80
     x
... ...
@@ -20,6 +20,8 @@ option_list <- list(
20 20
     make_option(c("--keepduplicates"), action = "store_true", 
21 21
         default = formals(PureCN::calculateBamCoverageByInterval)$keep.duplicates, 
22 22
         help = "Count reads marked as duplicates [default %default]"),
23
+    make_option(c("--removemapq0"), action = "store_true", default = FALSE, 
24
+        help = "Not count reads marked with mapping quality 0 [default %default]"),
23 25
     make_option(c("--outdir"), action = "store", type = "character", 
24 26
         default = NULL,
25 27
         help = "Output directory to which results should be written"),
... ...
@@ -73,7 +75,7 @@ interval.file <- normalizePath(interval.file, mustWork = TRUE)
73 75
 }
74 76
 
75 77
 getCoverageBams <- function(bamFiles, indexFiles, outdir, interval.file, 
76
-    force = FALSE, cpu = 1, keep.duplicates = FALSE) {
78
+    force = FALSE, cpu = 1, keep.duplicates = FALSE, removemapq0 = FALSE) {
77 79
 
78 80
     bamFiles <- bamFiles
79 81
     indexFiles <- indexFiles
... ...
@@ -102,7 +104,8 @@ getCoverageBams <- function(bamFiles, indexFiles, outdir, interval.file,
102 104
         } else {
103 105
             PureCN::calculateBamCoverageByInterval(bam.file = bam.file,
104 106
                 interval.file = interval.file, output.file = output.file,
105
-                index.file = index.file, keep.duplicates = keep.duplicates)
107
+                index.file = index.file, keep.duplicates = keep.duplicates,
108
+                mapqFilter = if (removemapq0) 1 else NA)
106 109
         }
107 110
         output.file
108 111
     }
... ...
@@ -121,7 +124,8 @@ getCoverageBams <- function(bamFiles, indexFiles, outdir, interval.file,
121 124
 coverageFiles <- NULL
122 125
 indexFiles <- NULL
123 126
 
124
-flog.info("Loading PureCN...")
127
+flog.info("Loading PureCN %s...", Biobase::package.version("PureCN"))
128
+
125 129
 suppressPackageStartupMessages(library(PureCN))
126 130
     
127 131
 if (!is.null(bam.file)) {
... ...
@@ -142,7 +146,7 @@ if (!is.null(bam.file)) {
142 146
     }    
143 147
 
144 148
     coverageFiles <- getCoverageBams(bamFiles, indexFiles, outdir, 
145
-        interval.file, force, opt$cpu, opt$keepduplicates) 
149
+        interval.file, force, opt$cpu, opt$keepduplicates, opt$removemapq0) 
146 150
 }
147 151
 
148 152
 ### GC-normalize coverage -----------------------------------------------------
... ...
@@ -58,7 +58,7 @@ if (is.null(genome)) stop("Need --genome")
58 58
     file.path(outdir, paste0(prefix, assay, genome, suffix))
59 59
 }
60 60
 
61
-flog.info("Loading PureCN...")
61
+flog.info("Loading PureCN %s...", Biobase::package.version("PureCN"))
62 62
 if (length(coverageFiles)) {
63 63
     output.file <- .getFileName(outdir,"normalDB",".rds", assay, genome)
64 64
     if (file.exists(output.file) && !opt$force) {
... ...
@@ -149,7 +149,7 @@ if (!is.null(file.rds) && file.exists(file.rds)) {
149 149
     
150 150
 normalizePath(dirname(out), mustWork = TRUE)
151 151
 
152
-flog.info("Loading PureCN...")
152
+flog.info("Loading PureCN %s...", Biobase::package.version("PureCN"))
153 153
 suppressPackageStartupMessages(library(PureCN))
154 154
 library(futile.logger)
155 155
 
156 156
new file mode 100755
157 157
Binary files /dev/null and b/inst/extdata/example_normal5.hdf5 differ
... ...
@@ -5,7 +5,7 @@
5 5
 \title{Function to calculate coverage from BAM file}
6 6
 \usage{
7 7
 calculateBamCoverageByInterval(bam.file, interval.file, output.file = NULL,
8
-  index.file = bam.file, keep.duplicates = FALSE)
8
+  index.file = bam.file, keep.duplicates = FALSE, ...)
9 9
 }
10 10
 \arguments{
11 11
 \item{bam.file}{Filename of a BAM file.}
... ...
@@ -20,6 +20,8 @@ the \code{\link{readCoverageFile}} function.}
20 20
 suffix, see \code{?scanBam}.}
21 21
 
22 22
 \item{keep.duplicates}{Keep or remove duplicated reads.}
23
+
24
+\item{...}{Additional parameters passed to \code{ScanBamParam}.}
23 25
 }
24 26
 \value{
25 27
 Returns total and average coverage by intervals.
... ...
@@ -4,17 +4,20 @@
4 4
 \alias{readCoverageFile}
5 5
 \title{Read coverage file}
6 6
 \usage{
7
-readCoverageFile(file, format, zero = NULL)
7
+readCoverageFile(file, format, zero = NULL, read.length = 100)
8 8
 }
9 9
 \arguments{
10 10
 \item{file}{Target coverage file.}
11 11
 
12 12
 \item{format}{File format. If missing, derived from the file 
13
-extension. Currently only GATK DepthofCoverage and CNVkit formats 
14
-supported.}
13
+extension. Currently GATK3 DepthofCoverage, GATK4 CollectFragmentCounts 
14
+(hdf5), and CNVkit formats supported.}
15 15
 
16 16
 \item{zero}{Start position is 0-based. Default is \code{FALSE}
17 17
 for GATK, \code{TRUE} for BED file based intervals.}
18
+
19
+\item{read.length}{For output formats which do not provide both counts 
20
+and total coverages, approximate them using the specified read length.}
18 21
 }
19 22
 \value{
20 23
 A \code{data.frame} with the parsed coverage information.
... ...
@@ -53,3 +53,16 @@ test_that("CNVkit *cnr example data is parsed correctly", {
53 53
     expect_equal(coverage$on.target, c(FALSE, FALSE, FALSE, FALSE, 
54 54
         TRUE))
55 55
 })
56
+
57
+test_that("GATK4 *hdf5 example data is parsed correctly", {
58
+    coverageFile <- system.file("extdata", "example_normal5.hdf5", 
59
+        package = "PureCN")
60
+    coverage <- readCoverageFile(coverageFile)
61
+    expect_equal(length(coverage), 10)
62
+    expect_equal(head(start(coverage)), 
63
+        c(3598833, 3599562, 3607444, 3624039, 3638537, 3639872))
64
+    expect_equal(head(coverage$counts), 
65
+        c(127, 305, 78, 699, 566, 344))
66
+
67
+    expect_equal(head(coverage$on.target), rep(TRUE, 6))
68
+})