Browse code

Added GATK4 ModelSegments wrapper function.

Markus Riester authored on 02/05/2021 22:02:41
Showing 12 changed files

... ...
@@ -2,7 +2,7 @@ Package: PureCN
2 2
 Type: Package
3 3
 Title: Copy number calling and SNV classification using
4 4
     targeted short read sequencing
5
-Version: 1.21.13
5
+Version: 1.21.14
6 6
 Date: 2021-05-02
7 7
 Authors@R: c(person("Markus", "Riester",
8 8
                     role = c("aut", "cre"),
... ...
@@ -36,6 +36,7 @@ export(readLogRatioFile)
36 36
 export(readSegmentationFile)
37 37
 export(runAbsoluteCN)
38 38
 export(segmentationCBS)
39
+export(segmentationGATK4)
39 40
 export(segmentationHclust)
40 41
 export(segmentationPSCBS)
41 42
 export(setMappingBiasVcf)
... ...
@@ -163,6 +164,7 @@ importFrom(stats,runif)
163 164
 importFrom(stats,t.test)
164 165
 importFrom(stats,weighted.mean)
165 166
 importFrom(tools,file_ext)
167
+importFrom(utils,compareVersion)
166 168
 importFrom(utils,data)
167 169
 importFrom(utils,head)
168 170
 importFrom(utils,object.size)
... ...
@@ -7,6 +7,8 @@ NEW FEATURES
7 7
       minimizes noise while keeping the resolution as high as possible
8 8
     o Added support for GATK4 CollectAllelicCounts output as alternative
9 9
       to Mutect  
10
+    o Added segmentationGATK4 to used GATK4's segmentation function 
11
+      ModelSegments 
10 12
 
11 13
 SIGNIFICANT USER-VISIBLE CHANGES
12 14
 
... ...
@@ -47,8 +47,8 @@ readLogRatioFile <- function(file, format, zero = NULL) {
47 47
     gr
48 48
 }
49 49
 
50
-.writeLogRatioFileGATK4 <- function(x, file) {
51
-    gr <- x$input$log.ratio
50
+.writeLogRatioFileGATK4 <- function(x, id = 1, file) {
51
+    gr <- x$log.ratio
52 52
     if (is.null(gr$log.ratio)) {
53 53
         .stopRuntimeError("log.ratio NULL in .writeLogRatioFileGATK4")
54 54
     }
... ...
@@ -59,7 +59,7 @@ readLogRatioFile <- function(file, format, zero = NULL) {
59 59
         LOG2_COPY_RATIO = gr$log.ratio
60 60
     )
61 61
     con <- file(file, open = "w")
62
-    .writeGATKHeader(x$input$vcf, id = 1, con, "log-ratio")
62
+    .writeGATKHeader(x$vcf, id, con, "log-ratio")
63 63
     write.table(output, con, row.names = FALSE, quote = FALSE, sep = "\t")
64 64
     close(con)
65 65
     invisible(output)
... ...
@@ -73,5 +73,6 @@ readLogRatioFile <- function(file, format, zero = NULL) {
73 73
         sl <- seqlengths(vcf)
74 74
         writeLines(paste("@SQ", paste0("SN:",names(sl)), paste0("LN:", sl), sep="\t"), con)
75 75
     }
76
-    writeLines(paste("@RG", "ID:PureCN", paste0("SM:", samples(header(vcf))[id]), sep="\t"), con)
76
+    sampleid <- .getSampleIdFromVcf(vcf, id)
77
+    writeLines(paste("@RG", "ID:PureCN", paste0("SM:", sampleid), sep="\t"), con)
77 78
 }    
... ...
@@ -35,13 +35,17 @@ readSegmentationFile <- function(seg.file, sampleid, model.homozygous = FALSE,
35 35
     return("DNAcopy")
36 36
 }
37 37
 .convertSegGATK4 <- function(seg, sampleid) {
38
+    lr_col_id <- "LOG2_COPY_RATIO_POSTERIOR_50"
39
+    if (is.null(seg[[lr_col_id]])) {
40
+        lr_col_id <- "MEAN_LOG2_COPY_RATIO"
41
+    }
38 42
     data.frame(
39 43
         ID = sampleid,
40 44
         chrom = seg$CONTIG,
41 45
         loc.start = seg$START,
42 46
         loc.end = seg$END,
43 47
         num.mark = seg$NUM_POINTS_COPY_RATIO,
44
-        seg.mean = seg$LOG2_COPY_RATIO_POSTERIOR_50
48
+        seg.mean = seg[[lr_col_id]]
45 49
     )
46 50
 }    
47 51
 .checkSeg <- function(seg, sampleid, model.homozygous, verbose=TRUE) {
48 52
new file mode 100644
... ...
@@ -0,0 +1,117 @@
1
+#' GATK4 ModelSegments segmentation function
2
+#' 
3
+#' A wrapper for GATK4s ModelSegmentation function, useful when normalization
4
+#' is performed with other tools than GATK4, for example PureCN.
5
+#' This function is called via the
6
+#' \code{fun.segmentation} argument of \code{\link{runAbsoluteCN}}.  The
7
+#' arguments are passed via \code{args.segmentation}.
8
+#' 
9
+#' 
10
+#' @param normal Coverage data for normal sample. Ignored in this function.
11
+#' @param tumor Coverage data for tumor sample.
12
+#' @param log.ratio Copy number log-ratios, one for each exon in coverage file.
13
+#' @param seg If segmentation was provided by the user, this data structure
14
+#' will contain this segmentation. Useful for minimal segmentation functions.
15
+#' Otherwise PureCN will re-segment the data. This segmentation function
16
+#' ignores this user provided segmentation.
17
+#' @param vcf Optional \code{CollapsedVCF} object with germline allelic ratios.
18
+#' @param tumor.id.in.vcf Id of tumor in case multiple samples are stored in
19
+#' VCF.
20
+#' @param normal.id.in.vcf Id of normal in in VCF. Currently not used.
21
+#' @param prune.hclust.h Ignored in this function.
22
+#' @param prune.hclust.method Ignored in this function.
23
+#' @param additional.cmd.args \code{character(1)}. By default,
24
+#' \code{ModelSegments} is called with default parameters. Provide additional 
25
+#' arguments here.
26
+#' @param chr.hash Not needed here since \code{ModelSegments} does not
27
+#' require numbered chromosome names.
28
+#' @param ... Currently unused arguments provided to other segmentation
29
+#' functions.
30
+#' @return \code{data.frame} containing the segmentation.
31
+#' @author Markus Riester
32
+#'
33
+#' @seealso \code{\link{runAbsoluteCN}}
34
+#' @examples
35
+#' 
36
+#' normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", 
37
+#'     package="PureCN")
38
+#' tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", 
39
+#'     package="PureCN")
40
+#' vcf.file <- system.file("extdata", "example.vcf.gz",
41
+#'     package="PureCN")
42
+#' 
43
+#' # The max.candidate.solutions, max.ploidy and test.purity parameters are set to
44
+#' # non-default values to speed-up this example.  This is not a good idea for real
45
+#' # samples.
46
+#'\dontrun{
47
+#'  ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file, 
48
+#'      tumor.coverage.file=tumor.coverage.file, vcf.file=vcf.file, 
49
+#'      sampleid="Sample1",  genome="hg19",
50
+#'      fun.segmentation = segmentationGATK4, max.ploidy=4,
51
+#'      test.purity=seq(0.3,0.7,by=0.05), max.candidate.solutions=1)
52
+#'}
53
+#' 
54
+#' @importFrom utils compareVersion
55
+#' @export segmentationGATK4
56
+segmentationGATK4 <- function(normal, tumor, log.ratio, seg, 
57
+    vcf = NULL, tumor.id.in.vcf = 1, normal.id.in.vcf = NULL,
58
+    prune.hclust.h = NULL, prune.hclust.method = NULL,
59
+    chr.hash = NULL, ...) {
60
+
61
+    min.version <- "4.1.7.0"
62
+    if (.checkGATK4Version(min.version) < 0) {
63
+        .stopUserError("Unable to find a recent (>= %s) gatk binary.",
64
+            min.version)
65
+    }    
66
+    output.vcf.file <- NULL    
67
+    if (!is.null(vcf)) {
68
+        output.vcf.file <- tempfile(fileext = ".tsv")
69
+        .writeAllelicCountsFileGatk(vcf, tumor.id.in.vcf, output.vcf.file)
70
+    } 
71
+    if (length(log.ratio) != length(tumor)) {
72
+        .stopRuntimeError("tumor and log.ratio do not align in segmentationGATK4.")
73
+    }
74
+    output.lr.file <- tempfile(fileext = ".tsv")
75
+    # following .writeLogRatioFileGATK4 needs the GRanges information, so
76
+    # simply update it (should be the same though!)
77
+    tumor$log.ratio <- log.ratio
78
+    .writeLogRatioFileGATK4(list(vcf=vcf, log.ratio = tumor), tumor.id.in.vcf, output.lr.file)
79
+    output.dir <- tempdir()
80
+    sampleid <- .getSampleIdFromVcf(vcf, tumor.id.in.vcf)
81
+    args <- paste("ModelSegments",
82
+              "--allelic-counts", output.vcf.file,
83
+              "--denoised-copy-ratios", output.lr.file,
84
+              "--output-prefix", "tumor",
85
+              "-O", output.dir)
86
+    output <- try(system2("gatk", args, stderr = TRUE, stdout = TRUE))
87
+    if (is(output, "try-error")) {
88
+        .stopUserError("GATK4 ModelSegments runtime error: ",
89
+            output, "\nArguments: ", args)
90
+    }
91
+    seg <- readSegmentationFile(file.path(output.dir, "tumor.modelFinal.seg"), sampleid)
92
+    idx.enough.markers <- seg$num.mark > 1
93
+    rownames(seg) <- NULL
94
+    file.remove(output.vcf.file)
95
+    file.remove(output.lr.file)
96
+    seg[idx.enough.markers,]
97
+}
98
+
99
+.getSampleIdFromVcf <- function(vcf, tumor.id.in.vcf) {
100
+    if (is(tumor.id.in.vcf, "character") &&
101
+        tumor.id.in.vcf %in% samples(header(vcf))) return(tumor.id.in.vcf)
102
+    if (is(tumor.id.in.vcf, "numeric") &&
103
+        length(samples(header(vcf))) <= tumor.id.in.vcf) {
104
+        return(samples(header(vcf))[tumor.id.in.vcf])
105
+    }
106
+    return(NA)    
107
+}    
108
+
109
+.checkGATK4Version <- function(min.version) {
110
+    output <- try(system2("gatk", "--version", stdout=TRUE), silent = TRUE)
111
+    if (is(output, "try-error")) {
112
+        flog.warn("Cannot find gatk binary in path.")
113
+        return(-1)
114
+    }
115
+    prefix <- "The Genome Analysis Toolkit (GATK) v"    
116
+    compareVersion(gsub(prefix, "", output[grep(prefix, output, fixed = TRUE)], fixed = TRUE), min.version)
117
+}    
... ...
@@ -6,7 +6,7 @@
6 6
 #' \code{args.segmentation}.
7 7
 #' 
8 8
 #' 
9
-#' @param normal Coverage data for normal sample.
9
+#' @param normal Coverage data for normal sample. Ignored in this function.
10 10
 #' @param tumor Coverage data for tumor sample.
11 11
 #' @param log.ratio Copy number log-ratios, one for each exon in coverage file.
12 12
 #' @param seg If segmentation was provided by the user, this data structure
... ...
@@ -58,7 +58,7 @@ option_list <- list(
58 58
         default = formals(PureCN::filterIntervals)$min.total.counts,
59 59
         help = "Interval Filter: Keep only intervals with at least that many counts in both tumor and (tanget) normal [default %default]"),
60 60
     make_option(c("--funsegmentation"), action = "store", type = "character", default = "CBS",
61
-        help = "Segmentation: Algorithm. CBS, PSCBS, Hclust, or none [default %default]"),
61
+        help = "Segmentation: Algorithm. CBS, PSCBS, GATK4, Hclust, or none [default %default]"),
62 62
     make_option(c("--alpha"), action = "store", type = "double",
63 63
         default = formals(PureCN::segmentationCBS)$alpha,
64 64
         help = "Segmentation: significance of breakpoints [default %default]"),
... ...
@@ -260,6 +260,8 @@ if (file.exists(file.rds) && !opt$force) {
260 260
         } else if (opt$funsegmentation == "Hclust") {
261 261
             fun.segmentation <- segmentationHclust
262 262
             if (!is.null(seg.file)) uses.recommended.fun <- TRUE
263
+        } else if (opt$funsegmentation == "GATK4") {
264
+            fun.segmentation <- segmentationGATK4
263 265
         } else if (opt$funsegmentation == "none") {
264 266
             fun.segmentation <- function(seg, ...) seg
265 267
         } else {
... ...
@@ -297,7 +299,7 @@ if (file.exists(file.rds) && !opt$force) {
297 299
         log.ratio <- log.ratio$log.ratio
298 300
     }    
299 301
     vcf <- opt$vcf
300
-    if (!is.null(vcf) && file_ext(vcf) == "tsv") {
302
+    if (!is.null(vcf) && tools::file_ext(vcf) == "tsv") {
301 303
         flog.info("*.tsv file provided for --vcf, assuming GATK4 CollectAllelicCounts format")
302 304
         vcf <- readAllelicCountsFile(vcf)
303 305
     }    
304 306
new file mode 100644
... ...
@@ -0,0 +1,90 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/segmentationGATK4.R
3
+\name{segmentationGATK4}
4
+\alias{segmentationGATK4}
5
+\title{GATK4 ModelSegments segmentation function}
6
+\usage{
7
+segmentationGATK4(
8
+  normal,
9
+  tumor,
10
+  log.ratio,
11
+  seg,
12
+  vcf = NULL,
13
+  tumor.id.in.vcf = 1,
14
+  normal.id.in.vcf = NULL,
15
+  prune.hclust.h = NULL,
16
+  prune.hclust.method = NULL,
17
+  chr.hash = NULL,
18
+  ...
19
+)
20
+}
21
+\arguments{
22
+\item{normal}{Coverage data for normal sample. Ignored in this function.}
23
+
24
+\item{tumor}{Coverage data for tumor sample.}
25
+
26
+\item{log.ratio}{Copy number log-ratios, one for each exon in coverage file.}
27
+
28
+\item{seg}{If segmentation was provided by the user, this data structure
29
+will contain this segmentation. Useful for minimal segmentation functions.
30
+Otherwise PureCN will re-segment the data. This segmentation function
31
+ignores this user provided segmentation.}
32
+
33
+\item{vcf}{Optional \code{CollapsedVCF} object with germline allelic ratios.}
34
+
35
+\item{tumor.id.in.vcf}{Id of tumor in case multiple samples are stored in
36
+VCF.}
37
+
38
+\item{normal.id.in.vcf}{Id of normal in in VCF. Currently not used.}
39
+
40
+\item{prune.hclust.h}{Ignored in this function.}
41
+
42
+\item{prune.hclust.method}{Ignored in this function.}
43
+
44
+\item{chr.hash}{Not needed here since \code{ModelSegments} does not
45
+require numbered chromosome names.}
46
+
47
+\item{...}{Currently unused arguments provided to other segmentation
48
+functions.}
49
+
50
+\item{additional.cmd.args}{\code{character(1)}. By default,
51
+\code{ModelSegments} is called with default parameters. Provide additional 
52
+arguments here.}
53
+}
54
+\value{
55
+\code{data.frame} containing the segmentation.
56
+}
57
+\description{
58
+A wrapper for GATK4s ModelSegmentation function, useful when normalization
59
+is performed with other tools than GATK4, for example PureCN.
60
+This function is called via the
61
+\code{fun.segmentation} argument of \code{\link{runAbsoluteCN}}.  The
62
+arguments are passed via \code{args.segmentation}.
63
+}
64
+\examples{
65
+
66
+normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", 
67
+    package="PureCN")
68
+tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", 
69
+    package="PureCN")
70
+vcf.file <- system.file("extdata", "example.vcf.gz",
71
+    package="PureCN")
72
+
73
+# The max.candidate.solutions, max.ploidy and test.purity parameters are set to
74
+# non-default values to speed-up this example.  This is not a good idea for real
75
+# samples.
76
+\dontrun{
77
+ ret <-runAbsoluteCN(normal.coverage.file=normal.coverage.file, 
78
+     tumor.coverage.file=tumor.coverage.file, vcf.file=vcf.file, 
79
+     sampleid="Sample1",  genome="hg19",
80
+     fun.segmentation = segmentationGATK4, max.ploidy=4,
81
+     test.purity=seq(0.3,0.7,by=0.05), max.candidate.solutions=1)
82
+}
83
+
84
+}
85
+\seealso{
86
+\code{\link{runAbsoluteCN}}
87
+}
88
+\author{
89
+Markus Riester
90
+}
... ...
@@ -29,7 +29,7 @@ segmentationPSCBS(
29 29
 )
30 30
 }
31 31
 \arguments{
32
-\item{normal}{Coverage data for normal sample.}
32
+\item{normal}{Coverage data for normal sample. Ignored in this function.}
33 33
 
34 34
 \item{tumor}{Coverage data for tumor sample.}
35 35
 
... ...
@@ -16,17 +16,17 @@ test_that("Example data matches", {
16 16
 })
17 17
 
18 18
 test_that("parsing -> writing -> parsing works", {
19
-    x <- purecn.example.output
19
+    x <- purecn.example.output$input
20 20
     y <- x
21
-    y$input$log.ratio$log.ratio <- NULL
21
+    y$log.ratio$log.ratio <- NULL
22 22
     output.file <- tempfile(fileext = ".tsv")
23 23
     expect_error(
24
-        PureCN:::.writeLogRatioFileGATK4(y, output.file),
24
+        PureCN:::.writeLogRatioFileGATK4(y, 1, output.file),
25 25
         "log.ratio NULL"      
26 26
     )
27
-    PureCN:::.writeLogRatioFileGATK4(x, output.file)
27
+    PureCN:::.writeLogRatioFileGATK4(x, 1, output.file)
28 28
     z <- readLogRatioFile(output.file)
29
-    expect_equivalent(x$input$log.ratio$log.ratio, z$log.ratio)
29
+    expect_equivalent(x$log.ratio$log.ratio, z$log.ratio)
30 30
     file.remove(output.file)
31 31
 })
32 32
 
... ...
@@ -1,5 +1,12 @@
1 1
 context("segmentation")
2 2
 
3
+normal.coverage.file <- system.file("extdata", "example_normal_tiny.txt", 
4
+    package="PureCN")
5
+tumor.coverage.file <- system.file("extdata", "example_tumor_tiny.txt", 
6
+    package="PureCN")
7
+vcf.file <- system.file("extdata", "example.vcf.gz",
8
+    package="PureCN")
9
+
3 10
 test_that("Precomputed boudaries are correct", {
4 11
     data(purecn.DNAcopy.bdry)
5 12
     alpha <- formals(segmentationCBS)$alpha
... ...
@@ -10,3 +17,18 @@ test_that("Precomputed boudaries are correct", {
10 17
     sbdry <- getbdry(eta, nperm, max.ones)
11 18
     expect_equal(purecn.DNAcopy.bdry, sbdry)
12 19
 })
20
+
21
+
22
+test_that("GATK4 wrapper works for example data.", {
23
+   skip_if_not(PureCN:::.checkGATK4Version("4.1.7.0") >= 0, "gatk binary > 4.1.7.0 required")
24
+ 
25
+    ret <-runAbsoluteCN(normal.coverage.file = normal.coverage.file, 
26
+        tumor.coverage.file = tumor.coverage.file, vcf.file = vcf.file, 
27
+        sampleid = "Sample1",  genome = "hg19",
28
+        fun.segmentation = segmentationGATK4, max.ploidy = 4,
29
+        test.purity = seq(0.3, 0.7, by = 0.05),
30
+        max.candidate.solutions = 1, plot.cnv = FALSE)
31
+    
32
+    expect_equal(0.65, ret$results[[1]]$purity, tolerance = 0.02)
33
+    expect_equal(1.62, ret$results[[1]]$ploidy, tolerance = 0.2)
34
+})