... | ... |
@@ -36,6 +36,7 @@ export(readLogRatioFile) |
36 | 36 |
export(readSegmentationFile) |
37 | 37 |
export(runAbsoluteCN) |
38 | 38 |
export(segmentationCBS) |
39 |
+export(segmentationGATK4) |
|
39 | 40 |
export(segmentationHclust) |
40 | 41 |
export(segmentationPSCBS) |
41 | 42 |
export(setMappingBiasVcf) |
... | ... |
@@ -163,6 +164,7 @@ importFrom(stats,runif) |
163 | 164 |
importFrom(stats,t.test) |
164 | 165 |
importFrom(stats,weighted.mean) |
165 | 166 |
importFrom(tools,file_ext) |
167 |
+importFrom(utils,compareVersion) |
|
166 | 168 |
importFrom(utils,data) |
167 | 169 |
importFrom(utils,head) |
168 | 170 |
importFrom(utils,object.size) |
... | ... |
@@ -7,6 +7,8 @@ NEW FEATURES |
7 | 7 |
minimizes noise while keeping the resolution as high as possible |
8 | 8 |
o Added support for GATK4 CollectAllelicCounts output as alternative |
9 | 9 |
to Mutect |
10 |
+ o Added segmentationGATK4 to used GATK4's segmentation function |
|
11 |
+ ModelSegments |
|
10 | 12 |
|
11 | 13 |
SIGNIFICANT USER-VISIBLE CHANGES |
12 | 14 |
|
... | ... |
@@ -47,8 +47,8 @@ readLogRatioFile <- function(file, format, zero = NULL) { |
47 | 47 |
gr |
48 | 48 |
} |
49 | 49 |
|
50 |
-.writeLogRatioFileGATK4 <- function(x, file) { |
|
51 |
- gr <- x$input$log.ratio |
|
50 |
+.writeLogRatioFileGATK4 <- function(x, id = 1, file) { |
|
51 |
+ gr <- x$log.ratio |
|
52 | 52 |
if (is.null(gr$log.ratio)) { |
53 | 53 |
.stopRuntimeError("log.ratio NULL in .writeLogRatioFileGATK4") |
54 | 54 |
} |
... | ... |
@@ -59,7 +59,7 @@ readLogRatioFile <- function(file, format, zero = NULL) { |
59 | 59 |
LOG2_COPY_RATIO = gr$log.ratio |
60 | 60 |
) |
61 | 61 |
con <- file(file, open = "w") |
62 |
- .writeGATKHeader(x$input$vcf, id = 1, con, "log-ratio") |
|
62 |
+ .writeGATKHeader(x$vcf, id, con, "log-ratio") |
|
63 | 63 |
write.table(output, con, row.names = FALSE, quote = FALSE, sep = "\t") |
64 | 64 |
close(con) |
65 | 65 |
invisible(output) |
... | ... |
@@ -73,5 +73,6 @@ readLogRatioFile <- function(file, format, zero = NULL) { |
73 | 73 |
sl <- seqlengths(vcf) |
74 | 74 |
writeLines(paste("@SQ", paste0("SN:",names(sl)), paste0("LN:", sl), sep="\t"), con) |
75 | 75 |
} |
76 |
- writeLines(paste("@RG", "ID:PureCN", paste0("SM:", samples(header(vcf))[id]), sep="\t"), con) |
|
76 |
+ sampleid <- .getSampleIdFromVcf(vcf, id) |
|
77 |
+ writeLines(paste("@RG", "ID:PureCN", paste0("SM:", sampleid), sep="\t"), con) |
|
77 | 78 |
} |
... | ... |
@@ -35,13 +35,17 @@ readSegmentationFile <- function(seg.file, sampleid, model.homozygous = FALSE, |
35 | 35 |
return("DNAcopy") |
36 | 36 |
} |
37 | 37 |
.convertSegGATK4 <- function(seg, sampleid) { |
38 |
+ lr_col_id <- "LOG2_COPY_RATIO_POSTERIOR_50" |
|
39 |
+ if (is.null(seg[[lr_col_id]])) { |
|
40 |
+ lr_col_id <- "MEAN_LOG2_COPY_RATIO" |
|
41 |
+ } |
|
38 | 42 |
data.frame( |
39 | 43 |
ID = sampleid, |
40 | 44 |
chrom = seg$CONTIG, |
41 | 45 |
loc.start = seg$START, |
42 | 46 |
loc.end = seg$END, |
43 | 47 |
num.mark = seg$NUM_POINTS_COPY_RATIO, |
44 |
- seg.mean = seg$LOG2_COPY_RATIO_POSTERIOR_50 |
|
48 |
+ seg.mean = seg[[lr_col_id]] |
|
45 | 49 |
) |
46 | 50 |
} |
47 | 51 |
.checkSeg <- function(seg, sampleid, model.homozygous, verbose=TRUE) { |
48 | 52 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,117 @@ |
1 |
+#' GATK4 ModelSegments segmentation function |
|
2 |
+#' |
|
3 |
+#' A wrapper for GATK4s ModelSegmentation function, useful when normalization |
|
4 |
+#' is performed with other tools than GATK4, for example PureCN. |
|
5 |
+#' This function is called via the |
|
6 |
+#' \code{fun.segmentation} argument of \code{\link{runAbsoluteCN}}. The |
|
7 |
+#' arguments are passed via \code{args.segmentation}. |
|
8 |
+#' |
|
9 |
+#' |
|
10 |
+#' @param normal Coverage data for normal sample. Ignored in this function. |
|
11 |
+#' @param tumor Coverage data for tumor sample. |
|
12 |
+#' @param log.ratio Copy number log-ratios, one for each exon in coverage file. |
|
13 |
+#' @param seg If segmentation was provided by the user, this data structure |
|
14 |
+#' will contain this segmentation. Useful for minimal segmentation functions. |
|
15 |
+#' Otherwise PureCN will re-segment the data. This segmentation function |
|
16 |
+#' ignores this user provided segmentation. |
|
17 |
+#' @param vcf Optional \code{CollapsedVCF} object with germline allelic ratios. |
|
18 |
+#' @param tumor.id.in.vcf Id of tumor in case multiple samples are stored in |
|
19 |
+#' VCF. |
|
20 |
+#' @param normal.id.in.vcf Id of normal in in VCF. Currently not used. |
|
21 |
+#' @param prune.hclust.h Ignored in this function. |
|
22 |
+#' @param prune.hclust.method Ignored in this function. |
|
23 |
+#' @param additional.cmd.args \code{character(1)}. By default, |
|
24 |
+#' \code{ModelSegments} is called with default parameters. Provide additional |
|
25 |
+#' arguments here. |
|
26 |
+#' @param chr.hash Not needed here since \code{ModelSegments} does not |
|
27 |
+#' require numbered chromosome names. |
|
28 |
+#' @param ... Currently unused arguments provided to other segmentation |
|
29 |
+#' functions. |
|
30 |
+#' @return \code{data.frame} containing the segmentation. |
|
31 |
+#' @author Markus Riester |
|
32 |
+#' |
|
33 |
+#' @seealso \code{\link{runAbsoluteCN}} |
|
34 |
+#' @examples |
|
35 |
+#' |
|
36 |
+#' normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", |
|
37 |
+#' package="PureCN") |
|
38 |
+#' tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", |
|
39 |
+#' package="PureCN") |
|
40 |
+#' vcf.file <- system.file("extdata", "example.vcf.gz", |
|
41 |
+#' package="PureCN") |
|
42 |
+#' |
|
43 |
+#' # The max.candidate.solutions, max.ploidy and test.purity parameters are set to |
|
44 |
+#' # non-default values to speed-up this example. This is not a good idea for real |
|
45 |
+#' # samples. |
|
46 |
+#'\dontrun{ |
|
47 |
+#' ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file, |
|
48 |
+#' tumor.coverage.file=tumor.coverage.file, vcf.file=vcf.file, |
|
49 |
+#' sampleid="Sample1", genome="hg19", |
|
50 |
+#' fun.segmentation = segmentationGATK4, max.ploidy=4, |
|
51 |
+#' test.purity=seq(0.3,0.7,by=0.05), max.candidate.solutions=1) |
|
52 |
+#'} |
|
53 |
+#' |
|
54 |
+#' @importFrom utils compareVersion |
|
55 |
+#' @export segmentationGATK4 |
|
56 |
+segmentationGATK4 <- function(normal, tumor, log.ratio, seg, |
|
57 |
+ vcf = NULL, tumor.id.in.vcf = 1, normal.id.in.vcf = NULL, |
|
58 |
+ prune.hclust.h = NULL, prune.hclust.method = NULL, |
|
59 |
+ chr.hash = NULL, ...) { |
|
60 |
+ |
|
61 |
+ min.version <- "4.1.7.0" |
|
62 |
+ if (.checkGATK4Version(min.version) < 0) { |
|
63 |
+ .stopUserError("Unable to find a recent (>= %s) gatk binary.", |
|
64 |
+ min.version) |
|
65 |
+ } |
|
66 |
+ output.vcf.file <- NULL |
|
67 |
+ if (!is.null(vcf)) { |
|
68 |
+ output.vcf.file <- tempfile(fileext = ".tsv") |
|
69 |
+ .writeAllelicCountsFileGatk(vcf, tumor.id.in.vcf, output.vcf.file) |
|
70 |
+ } |
|
71 |
+ if (length(log.ratio) != length(tumor)) { |
|
72 |
+ .stopRuntimeError("tumor and log.ratio do not align in segmentationGATK4.") |
|
73 |
+ } |
|
74 |
+ output.lr.file <- tempfile(fileext = ".tsv") |
|
75 |
+ # following .writeLogRatioFileGATK4 needs the GRanges information, so |
|
76 |
+ # simply update it (should be the same though!) |
|
77 |
+ tumor$log.ratio <- log.ratio |
|
78 |
+ .writeLogRatioFileGATK4(list(vcf=vcf, log.ratio = tumor), tumor.id.in.vcf, output.lr.file) |
|
79 |
+ output.dir <- tempdir() |
|
80 |
+ sampleid <- .getSampleIdFromVcf(vcf, tumor.id.in.vcf) |
|
81 |
+ args <- paste("ModelSegments", |
|
82 |
+ "--allelic-counts", output.vcf.file, |
|
83 |
+ "--denoised-copy-ratios", output.lr.file, |
|
84 |
+ "--output-prefix", "tumor", |
|
85 |
+ "-O", output.dir) |
|
86 |
+ output <- try(system2("gatk", args, stderr = TRUE, stdout = TRUE)) |
|
87 |
+ if (is(output, "try-error")) { |
|
88 |
+ .stopUserError("GATK4 ModelSegments runtime error: ", |
|
89 |
+ output, "\nArguments: ", args) |
|
90 |
+ } |
|
91 |
+ seg <- readSegmentationFile(file.path(output.dir, "tumor.modelFinal.seg"), sampleid) |
|
92 |
+ idx.enough.markers <- seg$num.mark > 1 |
|
93 |
+ rownames(seg) <- NULL |
|
94 |
+ file.remove(output.vcf.file) |
|
95 |
+ file.remove(output.lr.file) |
|
96 |
+ seg[idx.enough.markers,] |
|
97 |
+} |
|
98 |
+ |
|
99 |
+.getSampleIdFromVcf <- function(vcf, tumor.id.in.vcf) { |
|
100 |
+ if (is(tumor.id.in.vcf, "character") && |
|
101 |
+ tumor.id.in.vcf %in% samples(header(vcf))) return(tumor.id.in.vcf) |
|
102 |
+ if (is(tumor.id.in.vcf, "numeric") && |
|
103 |
+ length(samples(header(vcf))) <= tumor.id.in.vcf) { |
|
104 |
+ return(samples(header(vcf))[tumor.id.in.vcf]) |
|
105 |
+ } |
|
106 |
+ return(NA) |
|
107 |
+} |
|
108 |
+ |
|
109 |
+.checkGATK4Version <- function(min.version) { |
|
110 |
+ output <- try(system2("gatk", "--version", stdout=TRUE), silent = TRUE) |
|
111 |
+ if (is(output, "try-error")) { |
|
112 |
+ flog.warn("Cannot find gatk binary in path.") |
|
113 |
+ return(-1) |
|
114 |
+ } |
|
115 |
+ prefix <- "The Genome Analysis Toolkit (GATK) v" |
|
116 |
+ compareVersion(gsub(prefix, "", output[grep(prefix, output, fixed = TRUE)], fixed = TRUE), min.version) |
|
117 |
+} |
... | ... |
@@ -6,7 +6,7 @@ |
6 | 6 |
#' \code{args.segmentation}. |
7 | 7 |
#' |
8 | 8 |
#' |
9 |
-#' @param normal Coverage data for normal sample. |
|
9 |
+#' @param normal Coverage data for normal sample. Ignored in this function. |
|
10 | 10 |
#' @param tumor Coverage data for tumor sample. |
11 | 11 |
#' @param log.ratio Copy number log-ratios, one for each exon in coverage file. |
12 | 12 |
#' @param seg If segmentation was provided by the user, this data structure |
... | ... |
@@ -58,7 +58,7 @@ option_list <- list( |
58 | 58 |
default = formals(PureCN::filterIntervals)$min.total.counts, |
59 | 59 |
help = "Interval Filter: Keep only intervals with at least that many counts in both tumor and (tanget) normal [default %default]"), |
60 | 60 |
make_option(c("--funsegmentation"), action = "store", type = "character", default = "CBS", |
61 |
- help = "Segmentation: Algorithm. CBS, PSCBS, Hclust, or none [default %default]"), |
|
61 |
+ help = "Segmentation: Algorithm. CBS, PSCBS, GATK4, Hclust, or none [default %default]"), |
|
62 | 62 |
make_option(c("--alpha"), action = "store", type = "double", |
63 | 63 |
default = formals(PureCN::segmentationCBS)$alpha, |
64 | 64 |
help = "Segmentation: significance of breakpoints [default %default]"), |
... | ... |
@@ -260,6 +260,8 @@ if (file.exists(file.rds) && !opt$force) { |
260 | 260 |
} else if (opt$funsegmentation == "Hclust") { |
261 | 261 |
fun.segmentation <- segmentationHclust |
262 | 262 |
if (!is.null(seg.file)) uses.recommended.fun <- TRUE |
263 |
+ } else if (opt$funsegmentation == "GATK4") { |
|
264 |
+ fun.segmentation <- segmentationGATK4 |
|
263 | 265 |
} else if (opt$funsegmentation == "none") { |
264 | 266 |
fun.segmentation <- function(seg, ...) seg |
265 | 267 |
} else { |
... | ... |
@@ -297,7 +299,7 @@ if (file.exists(file.rds) && !opt$force) { |
297 | 299 |
log.ratio <- log.ratio$log.ratio |
298 | 300 |
} |
299 | 301 |
vcf <- opt$vcf |
300 |
- if (!is.null(vcf) && file_ext(vcf) == "tsv") { |
|
302 |
+ if (!is.null(vcf) && tools::file_ext(vcf) == "tsv") { |
|
301 | 303 |
flog.info("*.tsv file provided for --vcf, assuming GATK4 CollectAllelicCounts format") |
302 | 304 |
vcf <- readAllelicCountsFile(vcf) |
303 | 305 |
} |
304 | 306 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,90 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/segmentationGATK4.R |
|
3 |
+\name{segmentationGATK4} |
|
4 |
+\alias{segmentationGATK4} |
|
5 |
+\title{GATK4 ModelSegments segmentation function} |
|
6 |
+\usage{ |
|
7 |
+segmentationGATK4( |
|
8 |
+ normal, |
|
9 |
+ tumor, |
|
10 |
+ log.ratio, |
|
11 |
+ seg, |
|
12 |
+ vcf = NULL, |
|
13 |
+ tumor.id.in.vcf = 1, |
|
14 |
+ normal.id.in.vcf = NULL, |
|
15 |
+ prune.hclust.h = NULL, |
|
16 |
+ prune.hclust.method = NULL, |
|
17 |
+ chr.hash = NULL, |
|
18 |
+ ... |
|
19 |
+) |
|
20 |
+} |
|
21 |
+\arguments{ |
|
22 |
+\item{normal}{Coverage data for normal sample. Ignored in this function.} |
|
23 |
+ |
|
24 |
+\item{tumor}{Coverage data for tumor sample.} |
|
25 |
+ |
|
26 |
+\item{log.ratio}{Copy number log-ratios, one for each exon in coverage file.} |
|
27 |
+ |
|
28 |
+\item{seg}{If segmentation was provided by the user, this data structure |
|
29 |
+will contain this segmentation. Useful for minimal segmentation functions. |
|
30 |
+Otherwise PureCN will re-segment the data. This segmentation function |
|
31 |
+ignores this user provided segmentation.} |
|
32 |
+ |
|
33 |
+\item{vcf}{Optional \code{CollapsedVCF} object with germline allelic ratios.} |
|
34 |
+ |
|
35 |
+\item{tumor.id.in.vcf}{Id of tumor in case multiple samples are stored in |
|
36 |
+VCF.} |
|
37 |
+ |
|
38 |
+\item{normal.id.in.vcf}{Id of normal in in VCF. Currently not used.} |
|
39 |
+ |
|
40 |
+\item{prune.hclust.h}{Ignored in this function.} |
|
41 |
+ |
|
42 |
+\item{prune.hclust.method}{Ignored in this function.} |
|
43 |
+ |
|
44 |
+\item{chr.hash}{Not needed here since \code{ModelSegments} does not |
|
45 |
+require numbered chromosome names.} |
|
46 |
+ |
|
47 |
+\item{...}{Currently unused arguments provided to other segmentation |
|
48 |
+functions.} |
|
49 |
+ |
|
50 |
+\item{additional.cmd.args}{\code{character(1)}. By default, |
|
51 |
+\code{ModelSegments} is called with default parameters. Provide additional |
|
52 |
+arguments here.} |
|
53 |
+} |
|
54 |
+\value{ |
|
55 |
+\code{data.frame} containing the segmentation. |
|
56 |
+} |
|
57 |
+\description{ |
|
58 |
+A wrapper for GATK4s ModelSegmentation function, useful when normalization |
|
59 |
+is performed with other tools than GATK4, for example PureCN. |
|
60 |
+This function is called via the |
|
61 |
+\code{fun.segmentation} argument of \code{\link{runAbsoluteCN}}. The |
|
62 |
+arguments are passed via \code{args.segmentation}. |
|
63 |
+} |
|
64 |
+\examples{ |
|
65 |
+ |
|
66 |
+normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", |
|
67 |
+ package="PureCN") |
|
68 |
+tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", |
|
69 |
+ package="PureCN") |
|
70 |
+vcf.file <- system.file("extdata", "example.vcf.gz", |
|
71 |
+ package="PureCN") |
|
72 |
+ |
|
73 |
+# The max.candidate.solutions, max.ploidy and test.purity parameters are set to |
|
74 |
+# non-default values to speed-up this example. This is not a good idea for real |
|
75 |
+# samples. |
|
76 |
+\dontrun{ |
|
77 |
+ ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file, |
|
78 |
+ tumor.coverage.file=tumor.coverage.file, vcf.file=vcf.file, |
|
79 |
+ sampleid="Sample1", genome="hg19", |
|
80 |
+ fun.segmentation = segmentationGATK4, max.ploidy=4, |
|
81 |
+ test.purity=seq(0.3,0.7,by=0.05), max.candidate.solutions=1) |
|
82 |
+} |
|
83 |
+ |
|
84 |
+} |
|
85 |
+\seealso{ |
|
86 |
+\code{\link{runAbsoluteCN}} |
|
87 |
+} |
|
88 |
+\author{ |
|
89 |
+Markus Riester |
|
90 |
+} |
... | ... |
@@ -16,17 +16,17 @@ test_that("Example data matches", { |
16 | 16 |
}) |
17 | 17 |
|
18 | 18 |
test_that("parsing -> writing -> parsing works", { |
19 |
- x <- purecn.example.output |
|
19 |
+ x <- purecn.example.output$input |
|
20 | 20 |
y <- x |
21 |
- y$input$log.ratio$log.ratio <- NULL |
|
21 |
+ y$log.ratio$log.ratio <- NULL |
|
22 | 22 |
output.file <- tempfile(fileext = ".tsv") |
23 | 23 |
expect_error( |
24 |
- PureCN:::.writeLogRatioFileGATK4(y, output.file), |
|
24 |
+ PureCN:::.writeLogRatioFileGATK4(y, 1, output.file), |
|
25 | 25 |
"log.ratio NULL" |
26 | 26 |
) |
27 |
- PureCN:::.writeLogRatioFileGATK4(x, output.file) |
|
27 |
+ PureCN:::.writeLogRatioFileGATK4(x, 1, output.file) |
|
28 | 28 |
z <- readLogRatioFile(output.file) |
29 |
- expect_equivalent(x$input$log.ratio$log.ratio, z$log.ratio) |
|
29 |
+ expect_equivalent(x$log.ratio$log.ratio, z$log.ratio) |
|
30 | 30 |
file.remove(output.file) |
31 | 31 |
}) |
32 | 32 |
|
... | ... |
@@ -1,5 +1,12 @@ |
1 | 1 |
context("segmentation") |
2 | 2 |
|
3 |
+normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", |
|
4 |
+ package="PureCN") |
|
5 |
+tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", |
|
6 |
+ package="PureCN") |
|
7 |
+vcf.file <- system.file("extdata", "example.vcf.gz", |
|
8 |
+ package="PureCN") |
|
9 |
+ |
|
3 | 10 |
test_that("Precomputed boudaries are correct", { |
4 | 11 |
data(purecn.DNAcopy.bdry) |
5 | 12 |
alpha <- formals(segmentationCBS)$alpha |
... | ... |
@@ -10,3 +17,18 @@ test_that("Precomputed boudaries are correct", { |
10 | 17 |
sbdry <- getbdry(eta, nperm, max.ones) |
11 | 18 |
expect_equal(purecn.DNAcopy.bdry, sbdry) |
12 | 19 |
}) |
20 |
+ |
|
21 |
+ |
|
22 |
+test_that("GATK4 wrapper works for example data.", { |
|
23 |
+ skip_if_not(PureCN:::.checkGATK4Version("4.1.7.0") >= 0, "gatk binary > 4.1.7.0 required") |
|
24 |
+ |
|
25 |
+ ret <-runAbsoluteCN(normal.coverage.file = normal.coverage.file, |
|
26 |
+ tumor.coverage.file = tumor.coverage.file, vcf.file = vcf.file, |
|
27 |
+ sampleid = "Sample1", genome = "hg19", |
|
28 |
+ fun.segmentation = segmentationGATK4, max.ploidy = 4, |
|
29 |
+ test.purity = seq(0.3, 0.7, by = 0.05), |
|
30 |
+ max.candidate.solutions = 1, plot.cnv = FALSE) |
|
31 |
+ |
|
32 |
+ expect_equal(0.65, ret$results[[1]]$purity, tolerance = 0.02) |
|
33 |
+ expect_equal(1.62, ret$results[[1]]$ploidy, tolerance = 0.2) |
|
34 |
+}) |