... | ... |
@@ -2,8 +2,8 @@ Package: PureCN |
2 | 2 |
Type: Package |
3 | 3 |
Title: Copy number calling and SNV classification using |
4 | 4 |
targeted short read sequencing |
5 |
-Version: 1.15.2 |
|
6 |
-Date: 2019-06-21 |
|
5 |
+Version: 1.15.3 |
|
6 |
+Date: 2019-08-10 |
|
7 | 7 |
Authors@R: c(person("Markus", "Riester", role=c("aut", "cre"), |
8 | 8 |
email="[email protected]"), |
9 | 9 |
person("Angad P.", "Singh", role="aut")) |
... | ... |
@@ -1099,3 +1099,20 @@ c(test.num.copy, round(opt.C))[i], prior.K, mapping.bias.ok, seg.id, min.variant |
1099 | 1099 |
return(c(ccf_point, ccf_lower, ccf_upper)) |
1100 | 1100 |
} |
1101 | 1101 |
|
1102 |
+.getExonLrs <- function(seg.gr, tumor, log.ratio, idx = NULL) { |
|
1103 |
+ if (!is.null(idx)) { |
|
1104 |
+ tumor <- tumor[idx] |
|
1105 |
+ log.ratio <- log.ratio[idx] |
|
1106 |
+ } |
|
1107 |
+ ov.se <- findOverlaps(seg.gr, tumor) |
|
1108 |
+ exon.lrs <- lapply(seq_len(length(seg.gr)), function(i) log.ratio[subjectHits(ov.se)[queryHits(ov.se) == |
|
1109 |
+ i]]) |
|
1110 |
+ exon.lrs <- lapply(exon.lrs, function(x) subset(x, !is.na(x) & !is.infinite(x))) |
|
1111 |
+ # estimate stand. dev. for target logR within targets. this will be used as proxy |
|
1112 |
+ # for sample error. |
|
1113 |
+ targetsPerSegment <- vapply(exon.lrs, length, integer(1)) |
|
1114 |
+ if (!sum(targetsPerSegment > 50, na.rm = TRUE) && !is.null(idx)) { |
|
1115 |
+ .stopRuntimeError("Only tiny segments.") |
|
1116 |
+ } |
|
1117 |
+ exon.lrs |
|
1118 |
+} |
... | ... |
@@ -64,7 +64,7 @@ readCoverageFile <- function(file, format, zero=NULL, read.length = 100) { |
64 | 64 |
|
65 | 65 |
.readCoverageGatk4 <- function(file, zero, format, read.length) { |
66 | 66 |
if (!is.null(zero)) flog.warn("zero ignored for GATK coverage files.") |
67 |
- x <- H5Fopen(file) |
|
67 |
+ x <- H5Fopen(file, flags = "H5F_ACC_RDONLY") |
|
68 | 68 |
intervals <- data.frame(x$intervals$transposed_index_start_end) |
69 | 69 |
intervals[, 1] <- x$intervals$indexed_contig_names[intervals[, 1] + 1] |
70 | 70 |
targetCoverage <- GRanges(intervals[, 1], IRanges(intervals[, 2], intervals[, 3])) |
71 | 71 |
new file mode 100755 |
... | ... |
@@ -0,0 +1,30 @@ |
1 |
+#' Read file containing interval-level log2 tumor/normal ratios |
|
2 |
+#' |
|
3 |
+#' Read log2 ratio file produced by external tools like The Genome Analysis |
|
4 |
+#' Toolkit version 4. |
|
5 |
+#' |
|
6 |
+#' @param file Log2 coverage file. |
|
7 |
+#' @param format File format. If missing, derived from the file |
|
8 |
+#' extension. Currently GATK4 DenoiseReadCounts format supported. |
|
9 |
+#' @param zero Start position is 0-based. Default is \code{FALSE} |
|
10 |
+#' for GATK, \code{TRUE} for BED file based intervals. |
|
11 |
+#' @return A \code{GRange} with the log2 ratio. |
|
12 |
+#' @author Markus Riester |
|
13 |
+#' @examples |
|
14 |
+#' |
|
15 |
+#' tumor.coverage.file <- system.file("extdata", "example_tumor.txt", |
|
16 |
+#' package="PureCN") |
|
17 |
+#' coverage <- readCoverageFile(tumor.coverage.file) |
|
18 |
+#' |
|
19 |
+#' @export readLogRatioFile |
|
20 |
+readLogRatioFile <- function(file, format, zero=NULL) { |
|
21 |
+ if (missing(format)) format <- "GATK4" |
|
22 |
+ if (format == "GATK4") return(.readLogRatioFileGATK4(file, zero = FALSE)) |
|
23 |
+} |
|
24 |
+ |
|
25 |
+.readLogRatioFileGATK4 <- function(file, zero = FALSE) { |
|
26 |
+ x <- read.delim(file, comment.char = "@", as.is = TRUE) |
|
27 |
+ gr <- GRanges(x[,1], IRanges(start = x[,2], end = x[,3])) |
|
28 |
+ gr$log.ratio <- x[,4] |
|
29 |
+ gr |
|
30 |
+} |
... | ... |
@@ -619,22 +619,20 @@ runAbsoluteCN <- function(normal.coverage.file = NULL, |
619 | 619 |
} |
620 | 620 |
|
621 | 621 |
# get target log-ratios for all segments |
622 |
- ov.se <- findOverlaps(seg.gr, tumor) |
|
623 |
- exon.lrs <- lapply(seq_len(nrow(seg)), function(i) log.ratio[subjectHits(ov.se)[queryHits(ov.se) == |
|
624 |
- i]]) |
|
625 |
- exon.lrs <- lapply(exon.lrs, function(x) subset(x, !is.na(x) & !is.infinite(x))) |
|
626 |
- |
|
627 |
- # estimate stand. dev. for target logR within targets. this will be used as proxy |
|
628 |
- # for sample error. |
|
629 |
- targetsPerSegment <- vapply(exon.lrs, length, integer(1)) |
|
630 |
- |
|
631 |
- if (!sum(targetsPerSegment > 50, na.rm = TRUE)) { |
|
632 |
- .stopRuntimeError("Only tiny segments.") |
|
633 |
- } |
|
622 |
+ exon.lrs <- .getExonLrs(seg.gr, tumor, log.ratio) |
|
634 | 623 |
|
635 | 624 |
sd.seg <- max(median(vapply(exon.lrs, sd, double(1)), na.rm = TRUE), |
636 | 625 |
min.logr.sdev) |
637 |
- |
|
626 |
+ |
|
627 |
+ if (sum(!tumor$on.target, na.rm = TRUE)) { |
|
628 |
+ sd.seg.ontarget <- median(vapply( |
|
629 |
+ .getExonLrs(seg.gr, tumor, log.ratio, idx = tumor$on.target), |
|
630 |
+ sd, double(1)), na.rm = TRUE) |
|
631 |
+ |
|
632 |
+ sd.seg.offtarget <- median(vapply( |
|
633 |
+ .getExonLrs(seg.gr, tumor, log.ratio, idx = !tumor$on.target), |
|
634 |
+ sd, double(1)), na.rm = TRUE) |
|
635 |
+ } |
|
638 | 636 |
# if user provided seg file, then we do not have access to the log-ratios and |
639 | 637 |
# need to use the user provided noise estimate also, don't do outlier smoothing |
640 | 638 |
# when we use already segmented data |
... | ... |
@@ -669,6 +667,12 @@ runAbsoluteCN <- function(normal.coverage.file = NULL, |
669 | 667 |
if (sum(li < 0) > 0) |
670 | 668 |
.stopRuntimeError("Some segments have negative size.") |
671 | 669 |
flog.info("Mean standard deviation of log-ratios: %.2f", sd.seg) |
670 |
+ if (sum(!tumor$on.target, na.rm = TRUE)) { |
|
671 |
+ flog.info("Mean standard deviation of on-target log-ratios only: %.2f", |
|
672 |
+ sd.seg.ontarget) |
|
673 |
+ flog.info("Mean standard deviation of off-target log-ratios only: %.2f", |
|
674 |
+ sd.seg.offtarget) |
|
675 |
+ } |
|
672 | 676 |
log.ratio.offset <- rep(0, nrow(seg)) |
673 | 677 |
|
674 | 678 |
flog.info("2D-grid search of purity and ploidy...") |
... | ... |
@@ -20,6 +20,8 @@ option_list <- list( |
20 | 20 |
help = "Input: NormalDB.rds file. Generated by NormalDB.R."), |
21 | 21 |
make_option(c("--segfile"), action = "store", type = "character", |
22 | 22 |
default = NULL, help = "Input: Segmentation file"), |
23 |
+ make_option(c("--logratiofile"), action = "store", type = "character", |
|
24 |
+ default = NULL, help = "Input: Log2 copy number ratio file"), |
|
23 | 25 |
make_option(c("--sex"), action = "store", type = "character", |
24 | 26 |
default = formals(PureCN::runAbsoluteCN)$sex[[2]], |
25 | 27 |
help = "Input: Sex of sample. ? (detect), diplod (non-diploid chromosomes removed), F or M [default %default]"), |
... | ... |
@@ -136,6 +138,7 @@ if (!is.null(snp.blacklist)) { |
136 | 138 |
} |
137 | 139 |
|
138 | 140 |
seg.file <- opt$segfile |
141 |
+log.ratio <- opt$logratiofile |
|
139 | 142 |
normalDB <- opt$normaldb |
140 | 143 |
sampleid <- opt$sampleid |
141 | 144 |
out <- opt[["out"]] |
... | ... |
@@ -180,7 +183,6 @@ if (file.exists(file.rds) && !opt$force) { |
180 | 183 |
ret <- readCurationFile(file.rds) |
181 | 184 |
if (is.null(sampleid)) sampleid <- ret$input$sampleid |
182 | 185 |
} else { |
183 |
- tumor.coverage.file.orig <- tumor.coverage.file |
|
184 | 186 |
if (!is.null(normalDB)) { |
185 | 187 |
#if (!is.null(seg.file)) stop("normalDB and segfile do not work together.") |
186 | 188 |
normalDB <- readRDS(normalDB) |
... | ... |
@@ -195,7 +197,8 @@ if (file.exists(file.rds) && !opt$force) { |
195 | 197 |
normal.coverage.file <- calculateTangentNormal(tumor.coverage.file, |
196 | 198 |
normalDB) |
197 | 199 |
} |
198 |
- } else if (is.null(normal.coverage.file) && is.null(seg.file)) { |
|
200 |
+ } else if (is.null(normal.coverage.file) && is.null(seg.file) && |
|
201 |
+ is.null(log.ratio)) { |
|
199 | 202 |
stop("Need either normalDB or normal.coverage.file") |
200 | 203 |
} |
201 | 204 |
normal.coverage.file |
... | ... |
@@ -234,11 +237,20 @@ if (file.exists(file.rds) && !opt$force) { |
234 | 237 |
BPPARAM <- bpparam() |
235 | 238 |
flog.info("Using default BiocParallel backend. You can change the default in your ~/.Rprofile file.") |
236 | 239 |
} |
240 |
+ if (!is.null(log.ratio)) { |
|
241 |
+ flog.info("Reading %s...", log.ratio) |
|
242 |
+ log.ratio <- readLogRatioFile(log.ratio) |
|
243 |
+ flog.info("Reading %s...", tumor.coverage.file) |
|
244 |
+ tumor.coverage.file <- readCoverageFile(tumor.coverage.file) |
|
245 |
+ tumor.coverage.file <- subsetByOverlaps(tumor.coverage.file, log.ratio) |
|
246 |
+ log.ratio <- log.ratio$log.ratio |
|
247 |
+ } |
|
237 | 248 |
ret <- runAbsoluteCN(normal.coverage.file = normal.coverage.file, |
238 | 249 |
tumor.coverage.file = tumor.coverage.file, vcf.file = opt$vcf, |
239 | 250 |
sampleid = sampleid, plot.cnv = TRUE, |
240 | 251 |
interval.file = opt$intervals, |
241 | 252 |
genome = opt$genome, seg.file = seg.file, |
253 |
+ log.ratio = log.ratio, |
|
242 | 254 |
test.purity = test.purity, |
243 | 255 |
test.num.copy = seq(0, opt$maxcopynumber), |
244 | 256 |
sex = opt$sex, |
247 | 259 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,34 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/readLogRatioFile.R |
|
3 |
+\name{readLogRatioFile} |
|
4 |
+\alias{readLogRatioFile} |
|
5 |
+\title{Read file containing interval-level log2 tumor/normal ratios} |
|
6 |
+\usage{ |
|
7 |
+readLogRatioFile(file, format, zero = NULL) |
|
8 |
+} |
|
9 |
+\arguments{ |
|
10 |
+\item{file}{Log2 coverage file.} |
|
11 |
+ |
|
12 |
+\item{format}{File format. If missing, derived from the file |
|
13 |
+extension. Currently GATK4 DenoiseReadCounts format supported.} |
|
14 |
+ |
|
15 |
+\item{zero}{Start position is 0-based. Default is \code{FALSE} |
|
16 |
+for GATK, \code{TRUE} for BED file based intervals.} |
|
17 |
+} |
|
18 |
+\value{ |
|
19 |
+A \code{GRange} with the log2 ratio. |
|
20 |
+} |
|
21 |
+\description{ |
|
22 |
+Read log2 ratio file produced by external tools like The Genome Analysis |
|
23 |
+Toolkit version 4. |
|
24 |
+} |
|
25 |
+\examples{ |
|
26 |
+ |
|
27 |
+tumor.coverage.file <- system.file("extdata", "example_tumor.txt", |
|
28 |
+ package="PureCN") |
|
29 |
+coverage <- readCoverageFile(tumor.coverage.file) |
|
30 |
+ |
|
31 |
+} |
|
32 |
+\author{ |
|
33 |
+Markus Riester |
|
34 |
+} |
0 | 35 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,10 @@ |
1 |
+context("readLogRatioFile") |
|
2 |
+ |
|
3 |
+test_that("Example data matches", { |
|
4 |
+ logratio.file <- system.file("extdata", "example_gatk4_denoised_cr.tsv.gz", |
|
5 |
+ package = "PureCN") |
|
6 |
+ logratio <- readLogRatioFile(logratio.file) |
|
7 |
+ expect_equal(21, length(logratio)) |
|
8 |
+ expect_equal(0.109473, logratio$log.ratio[1], tolerance = .00001) |
|
9 |
+ expect_equal(-0.185664, logratio$log.ratio[21], tolerance = .00001) |
|
10 |
+}) |
... | ... |
@@ -552,6 +552,7 @@ Argument name & Corresponding PureCN argument & PureCN function \\ |
552 | 552 |
--normaldb & normalDB (serialized with \Rfunction{saveRDS}) & |
553 | 553 |
\Rfunction{calculateTangentNormal}, \Rfunction{filterTargets} \\ |
554 | 554 |
--segfile & seg.file & \Rfunction{runAbsoluteCN} \\ |
555 |
+--logratiofile & log.ratio & \Rfunction{runAbsoluteCN} \\ |
|
555 | 556 |
--sex & sex & \Rfunction{runAbsoluteCN} \\ |
556 | 557 |
--genome & genome & \Rfunction{runAbsoluteCN} \\ |
557 | 558 |
--intervals & interval.file & \Rfunction{runAbsoluteCN} \\ |