Browse code

Initial support for GATK4 copy number output (#6)

Markus Riester authored on 10/08/2019 21:05:42
Showing 12 changed files

... ...
@@ -2,8 +2,8 @@ Package: PureCN
2 2
 Type: Package
3 3
 Title: Copy number calling and SNV classification using
4 4
     targeted short read sequencing
5
-Version: 1.15.2
6
-Date: 2019-06-21
5
+Version: 1.15.3
6
+Date: 2019-08-10
7 7
 Authors@R: c(person("Markus", "Riester", role=c("aut", "cre"),
8 8
         email="[email protected]"),
9 9
     person("Angad P.", "Singh", role="aut"))
... ...
@@ -33,6 +33,7 @@ export(preprocessIntervals)
33 33
 export(processMultipleSamples)
34 34
 export(readCoverageFile)
35 35
 export(readCurationFile)
36
+export(readLogRatioFile)
36 37
 export(runAbsoluteCN)
37 38
 export(segmentationCBS)
38 39
 export(segmentationHclust)
... ...
@@ -5,6 +5,8 @@ NEW FEATURES
5 5
 
6 6
     o flag segments in poor quality regions
7 7
     o Dx.R now reports chromosomal instability scores
8
+    o Added readLogRatioFile function to read GATK4 DenoiseReadCounts
9
+      output files containing log2 tumor/normal ratios
8 10
 
9 11
 BUGFIXES
10 12
 
... ...
@@ -1099,3 +1099,20 @@ c(test.num.copy, round(opt.C))[i], prior.K, mapping.bias.ok, seg.id, min.variant
1099 1099
     return(c(ccf_point, ccf_lower, ccf_upper))
1100 1100
 }
1101 1101
 
1102
+.getExonLrs <- function(seg.gr, tumor, log.ratio, idx = NULL) {
1103
+    if (!is.null(idx)) {
1104
+        tumor <- tumor[idx]
1105
+        log.ratio <- log.ratio[idx]
1106
+    }    
1107
+    ov.se <- findOverlaps(seg.gr, tumor)
1108
+    exon.lrs <- lapply(seq_len(length(seg.gr)), function(i) log.ratio[subjectHits(ov.se)[queryHits(ov.se) == 
1109
+        i]])
1110
+    exon.lrs <- lapply(exon.lrs, function(x) subset(x, !is.na(x) & !is.infinite(x)))
1111
+    # estimate stand. dev. for target logR within targets. this will be used as proxy
1112
+    # for sample error.
1113
+    targetsPerSegment <- vapply(exon.lrs, length, integer(1))
1114
+    if (!sum(targetsPerSegment > 50, na.rm = TRUE) && !is.null(idx)) {
1115
+        .stopRuntimeError("Only tiny segments.")
1116
+    }
1117
+    exon.lrs
1118
+}
... ...
@@ -64,7 +64,7 @@ readCoverageFile <- function(file, format, zero=NULL, read.length = 100) {
64 64
 
65 65
 .readCoverageGatk4 <- function(file, zero, format, read.length) {
66 66
     if (!is.null(zero)) flog.warn("zero ignored for GATK coverage files.")
67
-    x <- H5Fopen(file)
67
+    x <- H5Fopen(file, flags = "H5F_ACC_RDONLY")
68 68
     intervals <- data.frame(x$intervals$transposed_index_start_end)
69 69
     intervals[, 1] <- x$intervals$indexed_contig_names[intervals[, 1] + 1]
70 70
     targetCoverage <- GRanges(intervals[, 1], IRanges(intervals[, 2], intervals[, 3]))
71 71
new file mode 100755
... ...
@@ -0,0 +1,30 @@
1
+#' Read file containing interval-level log2 tumor/normal ratios 
2
+#' 
3
+#' Read log2 ratio file produced by external tools like The Genome Analysis 
4
+#' Toolkit version 4.
5
+#' 
6
+#' @param file Log2 coverage file.
7
+#' @param format File format. If missing, derived from the file 
8
+#' extension. Currently GATK4 DenoiseReadCounts format supported.
9
+#' @param zero Start position is 0-based. Default is \code{FALSE}
10
+#' for GATK, \code{TRUE} for BED file based intervals.
11
+#' @return A \code{GRange} with the log2 ratio.
12
+#' @author Markus Riester
13
+#' @examples
14
+#' 
15
+#' tumor.coverage.file <- system.file("extdata", "example_tumor.txt", 
16
+#'     package="PureCN")
17
+#' coverage <- readCoverageFile(tumor.coverage.file)
18
+#' 
19
+#' @export readLogRatioFile
20
+readLogRatioFile <- function(file, format, zero=NULL) {
21
+    if (missing(format)) format <- "GATK4"
22
+    if (format == "GATK4") return(.readLogRatioFileGATK4(file, zero = FALSE))
23
+}
24
+
25
+.readLogRatioFileGATK4 <- function(file, zero = FALSE) {
26
+    x <- read.delim(file, comment.char = "@", as.is = TRUE)
27
+    gr <- GRanges(x[,1], IRanges(start = x[,2], end = x[,3]))
28
+    gr$log.ratio <- x[,4]
29
+    gr
30
+}
... ...
@@ -619,22 +619,20 @@ runAbsoluteCN <- function(normal.coverage.file = NULL,
619 619
     }
620 620
     
621 621
     # get target log-ratios for all segments
622
-    ov.se <- findOverlaps(seg.gr, tumor)
623
-    exon.lrs <- lapply(seq_len(nrow(seg)), function(i) log.ratio[subjectHits(ov.se)[queryHits(ov.se) == 
624
-        i]])
625
-    exon.lrs <- lapply(exon.lrs, function(x) subset(x, !is.na(x) & !is.infinite(x)))
626
-    
627
-    # estimate stand. dev. for target logR within targets. this will be used as proxy
628
-    # for sample error.
629
-    targetsPerSegment <- vapply(exon.lrs, length, integer(1))
630
-    
631
-    if (!sum(targetsPerSegment > 50, na.rm = TRUE)) {
632
-        .stopRuntimeError("Only tiny segments.")
633
-    }
622
+    exon.lrs <- .getExonLrs(seg.gr, tumor, log.ratio)
634 623
     
635 624
     sd.seg <- max(median(vapply(exon.lrs, sd, double(1)), na.rm = TRUE),
636 625
                   min.logr.sdev)
637
-   
626
+
627
+    if (sum(!tumor$on.target, na.rm = TRUE)) {
628
+        sd.seg.ontarget <- median(vapply(
629
+            .getExonLrs(seg.gr, tumor, log.ratio, idx = tumor$on.target), 
630
+            sd, double(1)), na.rm = TRUE)
631
+
632
+        sd.seg.offtarget <- median(vapply(
633
+            .getExonLrs(seg.gr, tumor, log.ratio, idx = !tumor$on.target), 
634
+            sd, double(1)), na.rm = TRUE)
635
+    } 
638 636
     # if user provided seg file, then we do not have access to the log-ratios and
639 637
     # need to use the user provided noise estimate also, don't do outlier smoothing
640 638
     # when we use already segmented data
... ...
@@ -669,6 +667,12 @@ runAbsoluteCN <- function(normal.coverage.file = NULL,
669 667
     if (sum(li < 0) > 0) 
670 668
         .stopRuntimeError("Some segments have negative size.")
671 669
     flog.info("Mean standard deviation of log-ratios: %.2f", sd.seg)
670
+    if (sum(!tumor$on.target, na.rm = TRUE)) {
671
+        flog.info("Mean standard deviation of on-target log-ratios only: %.2f",
672
+            sd.seg.ontarget)
673
+        flog.info("Mean standard deviation of off-target log-ratios only: %.2f",
674
+            sd.seg.offtarget)
675
+    }    
672 676
     log.ratio.offset <- rep(0, nrow(seg))
673 677
     
674 678
     flog.info("2D-grid search of purity and ploidy...")
... ...
@@ -20,6 +20,8 @@ option_list <- list(
20 20
         help = "Input: NormalDB.rds file. Generated by NormalDB.R."),
21 21
     make_option(c("--segfile"), action = "store", type = "character",
22 22
                 default = NULL, help = "Input: Segmentation file"),
23
+    make_option(c("--logratiofile"), action = "store", type = "character",
24
+                default = NULL, help = "Input: Log2 copy number ratio file"),
23 25
     make_option(c("--sex"), action = "store", type = "character",
24 26
         default = formals(PureCN::runAbsoluteCN)$sex[[2]],
25 27
         help = "Input: Sex of sample. ? (detect), diplod (non-diploid chromosomes removed), F or M [default %default]"),
... ...
@@ -136,6 +138,7 @@ if (!is.null(snp.blacklist)) {
136 138
 }
137 139
 
138 140
 seg.file <- opt$segfile
141
+log.ratio <- opt$logratiofile
139 142
 normalDB <- opt$normaldb
140 143
 sampleid <- opt$sampleid
141 144
 out <- opt[["out"]]
... ...
@@ -180,7 +183,6 @@ if (file.exists(file.rds) && !opt$force) {
180 183
     ret <- readCurationFile(file.rds)
181 184
     if (is.null(sampleid)) sampleid <- ret$input$sampleid
182 185
 } else {
183
-    tumor.coverage.file.orig <- tumor.coverage.file
184 186
     if (!is.null(normalDB)) {
185 187
         #if (!is.null(seg.file)) stop("normalDB and segfile do not work together.")
186 188
         normalDB <- readRDS(normalDB)
... ...
@@ -195,7 +197,8 @@ if (file.exists(file.rds) && !opt$force) {
195 197
                 normal.coverage.file <- calculateTangentNormal(tumor.coverage.file,
196 198
                     normalDB)
197 199
             }
198
-        } else if (is.null(normal.coverage.file) && is.null(seg.file)) {
200
+        } else if (is.null(normal.coverage.file) && is.null(seg.file) &&
201
+                   is.null(log.ratio)) {
199 202
             stop("Need either normalDB or normal.coverage.file")
200 203
         }
201 204
         normal.coverage.file
... ...
@@ -234,11 +237,20 @@ if (file.exists(file.rds) && !opt$force) {
234 237
         BPPARAM <- bpparam()
235 238
         flog.info("Using default BiocParallel backend. You can change the default in your ~/.Rprofile file.") 
236 239
     }
240
+    if (!is.null(log.ratio)) {
241
+        flog.info("Reading %s...", log.ratio)
242
+        log.ratio <- readLogRatioFile(log.ratio)
243
+        flog.info("Reading %s...", tumor.coverage.file)
244
+        tumor.coverage.file <- readCoverageFile(tumor.coverage.file)
245
+        tumor.coverage.file <- subsetByOverlaps(tumor.coverage.file, log.ratio)
246
+        log.ratio <- log.ratio$log.ratio
247
+    }    
237 248
     ret <- runAbsoluteCN(normal.coverage.file = normal.coverage.file,
238 249
             tumor.coverage.file = tumor.coverage.file, vcf.file = opt$vcf,
239 250
             sampleid = sampleid, plot.cnv = TRUE,
240 251
             interval.file = opt$intervals,
241 252
             genome = opt$genome, seg.file = seg.file,
253
+            log.ratio = log.ratio,
242 254
             test.purity = test.purity,
243 255
             test.num.copy = seq(0, opt$maxcopynumber),
244 256
             sex = opt$sex,
245 257
new file mode 100755
246 258
Binary files /dev/null and b/inst/extdata/example_gatk4_denoised_cr.tsv.gz differ
247 259
new file mode 100644
... ...
@@ -0,0 +1,34 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/readLogRatioFile.R
3
+\name{readLogRatioFile}
4
+\alias{readLogRatioFile}
5
+\title{Read file containing interval-level log2 tumor/normal ratios}
6
+\usage{
7
+readLogRatioFile(file, format, zero = NULL)
8
+}
9
+\arguments{
10
+\item{file}{Log2 coverage file.}
11
+
12
+\item{format}{File format. If missing, derived from the file 
13
+extension. Currently GATK4 DenoiseReadCounts format supported.}
14
+
15
+\item{zero}{Start position is 0-based. Default is \code{FALSE}
16
+for GATK, \code{TRUE} for BED file based intervals.}
17
+}
18
+\value{
19
+A \code{GRange} with the log2 ratio.
20
+}
21
+\description{
22
+Read log2 ratio file produced by external tools like The Genome Analysis 
23
+Toolkit version 4.
24
+}
25
+\examples{
26
+
27
+tumor.coverage.file <- system.file("extdata", "example_tumor.txt", 
28
+    package="PureCN")
29
+coverage <- readCoverageFile(tumor.coverage.file)
30
+
31
+}
32
+\author{
33
+Markus Riester
34
+}
0 35
new file mode 100644
... ...
@@ -0,0 +1,10 @@
1
+context("readLogRatioFile")
2
+
3
+test_that("Example data matches", {
4
+    logratio.file <- system.file("extdata", "example_gatk4_denoised_cr.tsv.gz", 
5
+        package = "PureCN")
6
+    logratio <- readLogRatioFile(logratio.file)
7
+    expect_equal(21, length(logratio))
8
+    expect_equal(0.109473, logratio$log.ratio[1], tolerance = .00001)
9
+    expect_equal(-0.185664, logratio$log.ratio[21], tolerance = .00001)
10
+})
... ...
@@ -552,6 +552,7 @@ Argument name      & Corresponding PureCN argument & PureCN function \\
552 552
 --normaldb         & normalDB (serialized with \Rfunction{saveRDS}) & 
553 553
     \Rfunction{calculateTangentNormal}, \Rfunction{filterTargets} \\
554 554
 --segfile          & seg.file           & \Rfunction{runAbsoluteCN}  \\
555
+--logratiofile     & log.ratio          & \Rfunction{runAbsoluteCN}  \\
555 556
 --sex              & sex                & \Rfunction{runAbsoluteCN}  \\
556 557
 --genome           & genome             & \Rfunction{runAbsoluteCN}  \\
557 558
 --intervals        & interval.file      & \Rfunction{runAbsoluteCN}   \\