Bioconductor Code: PureCN

Browse code

Initial support for GATK4 GenomicsDBImport (#6).

Markus Riester authored on 05/07/2020 04:28:39
Showing 10 changed files

DESCRIPTION index 45eb241..65d8b51 100644
NAMESPACE index 1130a39..f42baf2 100755
NEWS index 4e357f1..d6adcc6 100755
R/calculateMappingBiasVcf.R index 0e873ad..1c614b0 100644
R/setMappingBiasVcf.R index f276fbf..3c8bd61 100644
inst/extdata/NormalDB.R index 9b7509f..96b566d 100644
inst/extdata/gatk4_pon_db.tgz index 0000000..61bdd6e
man/calculateMappingBiasGatk4.Rd index 0000000..be59e45
tests/testthat/test_setMappingBiasVcf.R index 4946998..ffc97b8 100644
vignettes/Quick.Rmd index e032213..0e43599 100755

History View file @ 4635a93

@@ -2,8 +2,8 @@ Package: PureCN
                      Type: Package
                      Title: Copy number calling and SNV classification using
                          targeted short read sequencing
                     -Version: 1.19.3
                     -Date: 2020-06-24
                     +Version: 1.19.4
                     +Date: 2020-07-04
                      Authors@R: c(person("Markus", "Riester",
                                          role = c("aut", "cre"),
                                          email = "[email protected]",
@@ -54,8 +54,11 @@ Suggests:
                          knitr,
                          optparse,
                          org.Hs.eg.db,
                     +    jsonlite,
                          rmarkdown,
                          testthat
                     +Enhances:
                     +    genomicsdb
                      VignetteBuilder: knitr
                      License: Artistic-2.0
                      URL: https://github.com/lima1/PureCN

NAMESPACE

History View file @ 4635a93

@@ -6,6 +6,7 @@ export(calculateBamCoverageByInterval)
                      export(calculateGCContentByInterval)
                      export(calculateIntervalWeights)
                      export(calculateLogRatio)
                     +export(calculateMappingBiasGatk4)
                      export(calculateMappingBiasVcf)
                      export(calculatePowerDetectSomatic)
                      export(calculateTangentNormal)
@@ -52,6 +53,7 @@ importFrom(GenomeInfoDb,"seqlengths<-")
                      importFrom(GenomeInfoDb,"seqlevels<-")
                      importFrom(GenomeInfoDb,"seqlevelsStyle<-")
                      importFrom(GenomeInfoDb,genomeStyles)
                     +importFrom(GenomeInfoDb,rankSeqlevels)
                      importFrom(GenomeInfoDb,seqlengths)
                      importFrom(GenomeInfoDb,seqlevelsInUse)
                      importFrom(GenomeInfoDb,seqlevelsStyle)
@@ -85,6 +87,7 @@ importFrom(VGAM,dbetabinom)
                      importFrom(VGAM,dbetabinom.ab)
                      importFrom(VGAM,vglm)
                      importFrom(data.table,data.table)
                     +importFrom(data.table,dcast)
                      importFrom(data.table,fread)
                      importFrom(data.table,fwrite)
                      importFrom(futile.logger,appender.tee)

NEWS

History View file @ 4635a93

@@ -1,6 +1,11 @@
                      Changes in version 1.20.0
                      -------------------------
                     +NEW FEATURES
+                    +
                     +    o Support for GATK4 GenomicsDB import for mapping bias calculation
+                    +
+                    +
                      SIGNIFICANT USER-VISIBLE CHANGES
                          o We now check if POP_AF or POPAF is -log10 scaled as new Mutect2 versions

R/calculateMappingBiasVcf.R

History View file @ 4635a93

@@ -27,7 +27,8 @@
                      #' @importFrom GenomicRanges GRangesList
                      #' @importFrom VGAM vglm Coef betabinomial dbetabinom
                      #' @export calculateMappingBiasVcf
                     -calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2,
                     +calculateMappingBiasVcf <- function(normal.panel.vcf.file,
                     +                                    min.normals = 2,
                                                          min.normals.betafit = 7,
                                                          min.median.coverage.betafit = 5,
                                                          yieldSize = 5000, genome) {
@@ -42,8 +43,10 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2,
                              if (!(cntStep %% 10)) {
                                  flog.info("Position %s:%i", as.character(seqnames(vcf_yield)[1]), start(vcf_yield)[1])
+                             }
                     -        mappingBias <- .calculateMappingBias(vcf_yield, min.normals,
                     -            min.normals.betafit, min.median.coverage.betafit)
                     +        mappingBias <- .calculateMappingBias(nvcf = vcf_yield,
                     +            min.normals = min.normals,
                     +            min.normals.betafit = min.normals.betafit,
                     +            min.median.coverage.betafit = min.median.coverage.betafit)
                              ret <- append(ret, GRangesList(mappingBias))
                              cntVar <- cntVar + yieldSize
                              cntStep <- cntStep + 1
@@ -57,16 +60,131 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2,
                          bias
+                     }
                     -.calculateMappingBias <- function(nvcf, min.normals, min.normals.betafit = 7,
                     +#' Calculate Mapping Bias from GATK4 GenomicsDB
                     +#'
                     +#' Calculates the mapping bias for each variant in the provided
                     +#' panel of normals GenomicsDB.
                     +#'
                     +#'
                     +#' @param workspace Path to the GenomicsDB created by \code{GenomicsDBImport}
                     +#' @param reference.genome Reference FASTA file.
                     +#' @param min.normals Minimum number of normals with heterozygous SNP for
                     +#' calculating position-specific mapping bias.
                     +#' @param min.normals.betafit Minimum number of normals with heterozygous SNP
                     +#' fitting a beta distribution
                     +#' @param min.median.coverage.betafit Minimum median coverage of normals with
                     +#' heterozygous SNP for fitting a beta distribution
                     +#' @return A \code{GRanges} object with mapping bias and number of normal
                     +#' samples with this variant.
                     +#' @author Markus Riester
                     +#' @examples
                     +#'
                     +#' \dontrun{
                     +#' resources_file <- system.file("extdata", "gatk4_pon_db.tgz",
                     +#'     package = "PureCN")
                     +#' tmp_dir <- tempfile("gatk4_pon_db_")
                     +#' untar(resources_file, exdir = tmp_dir)
                     +#' workspace <- file.path(tmp_dir, "gatk4_pon_db")
                     +#' bias <- calculateMappingBiasGatk4(workspace, "hg19")
                     +#' saveRDS(bias, "mapping_bias.rds")
                     +#' unlink(tmp_dir, recursive = TRUE)
                     +#' }
                     +#'
                     +#' @export calculateMappingBiasGatk4
                     +#' @importFrom data.table dcast
                     +#' @importFrom GenomeInfoDb rankSeqlevels
                     +calculateMappingBiasGatk4 <- function(workspace, reference.genome,
                     +                                    min.normals = 2,
                     +                                    min.normals.betafit = 7,
                     +                                    min.median.coverage.betafit = 5) {
                     +
                     +    # Both packages are optional dependencies, so check at call time.
                     +    if (!requireNamespace("genomicsdb", quietly = TRUE) ||
                     +        !requireNamespace("jsonlite", quietly = TRUE)
                     +        ) {
                     +        .stopUserError("Install the genomicsdb and jsonlite R packages for GenomicsDB import.")
                     +    }
                     +    workspace <- normalizePath(workspace, mustWork = TRUE)
                     +    vidmap.file <- file.path(workspace, "vidmap.json")
                     +    callset.file <- file.path(workspace, "callset.json")
                     +
                     +    db <- genomicsdb::connect(workspace = workspace,
                     +        vid_mapping_file = vidmap.file,
                     +        callset_mapping_file = callset.file,
                     +        reference_genome = reference.genome,
                     +        c("DP", "AD", "AF"))
                     +    # Release the connection even when a query or the bias calculation
                     +    # below fails part-way through.
                     +    on.exit(genomicsdb::disconnect(db), add = TRUE)
                     +
                     +    jcallset <- jsonlite::read_json(callset.file)
                     +    jvidmap <- jsonlite::read_json(vidmap.file)
                     +
                     +    # get all available arrays: workspace entries that contain a
                     +    # "genomicsdb_meta_dir"
                     +    entries <- dir(workspace, full.names = TRUE)
                     +    arrays <- basename(entries[file.exists(
                     +        file.path(entries, "genomicsdb_meta_dir"))])
                     +    if (!length(arrays)) {
                     +        .stopUserError(paste0("No GenomicsDB arrays found in ", workspace, "."))
                     +    }
                     +
                     +    # get offsets and lengths: the contig name is the part of the array
                     +    # name before the first "$"; look it up in the vidmap contig list
                     +    array.contigs <- vapply(strsplit(arrays, "$", fixed = TRUE),
                     +        function(x) x[1], character(1))
                     +    vidmap.names <- vapply(jvidmap$contigs, function(x) x$name,
                     +        character(1))
                     +    contigs <- jvidmap$contigs[match(array.contigs, vidmap.names)]
                     +    # process arrays in canonical chromosome order
                     +    idx <- order(rankSeqlevels(vapply(contigs, function(x) x$name,
                     +        character(1))))
                     +
                     +    # the queried row range spans all callsets and does not depend on the
                     +    # contig, so compute it once
                     +    row.range <- range(unlist(lapply(jcallset$callsets,
                     +        function(x) x$row_idx)))
                     +
                     +    bias <- lapply(idx, function(i) {
                     +        c_offset <- as.numeric(contigs[[i]]$tiledb_column_offset)
                     +        c_length <- as.numeric(contigs[[i]]$length)
                     +
                     +        flog.info("Processing %s (offset %.0f, length %.0f)...",
                     +            arrays[i], c_offset, c_length)
                     +        query <- data.table(genomicsdb::query_variant_calls(db,
                     +            array = arrays[i],
                     +            column_ranges = list(c(c_offset, c_offset + c_length)),
                     +            row_ranges = list(row.range)))
                     +
                     +        parsed_ad <- .parseADGenomicsDb(query)
                     +        .calculateMappingBias(nvcf = NULL,
                     +            alt = parsed_ad$alt,
                     +            ref = parsed_ad$ref,
                     +            gr = parsed_ad$gr,
                     +            min.normals = min.normals,
                     +            min.normals.betafit = min.normals.betafit,
                     +            min.median.coverage.betafit = min.median.coverage.betafit
                     +        )
                     +    })
                     +    bias <- unlist(GRangesList(bias))
                     +    # record the provenance of the result
                     +    attr(bias, "workspace") <- workspace
                     +    attr(bias, "min.normals") <- min.normals
                     +    attr(bias, "min.normals.betafit") <- min.normals.betafit
                     +    attr(bias, "min.median.coverage.betafit") <- min.median.coverage.betafit
                     +    bias
                     +}
+                    +
                     +# Reshape a GenomicsDB variant-call query (long format, one row per
                     +# variant and sample) into variant x sample matrices of reference and
                     +# alternative counts, plus the matching genomic ranges.
                     +# Returns list(ref, alt, gr); matrix rows are named "<range>_<REF>><ALT>".
                     +.parseADGenomicsDb <- function(query) {
                     +    # one row per variant, one column per sample
                     +    cast_formula <- CHROM + POS + END + REF + ALT ~ SAMPLE
                     +    ref_wide <- dcast(query, cast_formula, value.var = "AD")
                     +    af_wide <- dcast(query, cast_formula, value.var = "AF")
                     +
                     +    gr <- GRanges(seqnames = ref_wide$CHROM,
                     +        IRanges(start = ref_wide$POS, end = ref_wide$END))
                     +    variant_ids <- paste0(as.character(gr), "_", ref_wide$REF, ">",
                     +        ref_wide$ALT)
                     +
                     +    # drop the five identifier columns, keeping only the sample columns
                     +    ref <- as.matrix(ref_wide[,-(1:5)])
                     +    af <- as.matrix(af_wide[,-(1:5)])
                     +    # AD is used as the reference count here (presumably a single value
                     +    # per call - TODO confirm); the alt count is recovered from the
                     +    # allelic fraction, i.e. af = alt / (ref + alt) solved for alt
                     +    alt <- round(ref/(1-af)-ref)
                     +
                     +    rownames(ref) <- variant_ids
                     +    rownames(af) <- variant_ids
                     +    rownames(alt) <- variant_ids
                     +    list(ref = ref, alt = alt, gr = gr)
                     +}
+                    +
                     +.calculateMappingBias <- function(nvcf, alt = NULL, ref = NULL, gr = NULL,
                     +                                  min.normals, min.normals.betafit = 7,
                                                        min.median.coverage.betafit = 5) {
                     -    if (ncol(nvcf) < 2) {
                     -        .stopUserError("The normal.panel.vcf.file contains only a single sample.")
                     +    if (!is.null(nvcf)) {
                     +        if (ncol(nvcf) < 2) {
                     +            .stopUserError("The normal.panel.vcf.file contains only a single sample.")
                     +        }
                     +        # TODO: deal with tri-allelic sites
                     +        alt <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][2])
                     +        ref <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][1])
                     +        fa  <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][2]/sum(x[[1]]))
                     +        rownames(alt) <-  as.character(rowRanges(nvcf))
                     +        gr <- rowRanges(nvcf)
                     +    } else {
                     +        if (is.null(alt) || is.null(ref) || is.null(gr) || ncol(ref) != ncol(alt)) {
                     +            .stopRuntimeError("Either nvcf or valid alt and ref required.")
                     +        }
                     +        if (ncol(alt) < 2) {
                     +            .stopUserError("The normal.panel.vcf.file contains only a single sample.")
                     +        }
                     +        fa <- alt / (ref + alt)
+                         }
                     -    # TODO: deal with tri-allelic sites
                     -    alt <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][2])
                     -    ref <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][1])
                     -    fa  <- apply(geno(nvcf)$AD, c(1,2), function(x) x[[1]][2]/sum(x[[1]]))
                     -    x <- sapply(seq_len(nrow(nvcf)), function(i) {
                     +    ponCntHits <- apply(alt,1,function(x) sum(!is.na(x)))
+                    +
                     +    x <- sapply(seq_len(nrow(fa)), function(i) {
                              idx <- !is.na(fa[i,]) & fa[i,] > 0.05 & fa[i,] < 0.9
                              shapes <- c(NA, NA)
                              if (!sum(idx) >= min.normals) return(c(0, 0, 0, 0, shapes))
@@ -78,7 +196,7 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2,
                                      ref[i,idx]) ~ 1, betabinomial, trace = FALSE)))
                                  if (class(fit) == "try-error") {
                                      flog.warn("Could not fit beta binomial dist for %s (%s).",
                     -                    as.character(rowRanges(nvcf[i])),
                     +                    rownames(alt)[i],
                                          paste0(round(fa[i, idx], digits = 3), collapse=","))
                                  } else {
                                      shapes <- Coef(fit)
@@ -89,16 +207,13 @@ calculateMappingBiasVcf <- function(normal.panel.vcf.file, min.normals = 2,
                          # Add an average "normal" SNP (average coverage and allelic fraction > 0.4)
                          # as empirical prior
                          psMappingBias <- .adjustEmpBayes(x[1:4,]) * 2
                     -    ponCntHits <- apply(geno(nvcf)$AD, 1, function(x)
                     -        sum(!is.na(unlist(x))) / 2)
                     -    tmp <- rowRanges(nvcf)
                     -    mcols(tmp) <- NULL
                     -    tmp$bias <- psMappingBias
                     -    tmp$pon.count <- ponCntHits
                     -    tmp$mu <- x[5,]
                     -    tmp$rho <- x[6,]
                     -    tmp
                     +    mcols(gr) <- NULL
                     +    gr$bias <- psMappingBias
                     +    gr$pon.count <- ponCntHits
                     +    gr$mu <- x[5,]
                     +    gr$rho <- x[6,]
                     +    gr
+                     }
                      .readNormalPanelVcfLarge <- function(vcf, normal.panel.vcf.file,

R/setMappingBiasVcf.R

History View file @ 4635a93

@@ -84,7 +84,8 @@ normal.panel.vcf.file = NULL, min.normals = 2, smooth = TRUE, smooth.n = 5) {
             flog.warn("setMappingBiasVcf: no hits in %s.", mapping.bias.file)
             return(data.frame(bias = tmp, mu = NA, rho = NA))
         }
-        mappingBias <- .calculateMappingBias(nvcf, min.normals)
+        mappingBias <- .calculateMappingBias(nvcf = nvcf,
+            min.normals = min.normals)
     }
     .annotateMappingBias(tmp, vcf, mappingBias, max.bias, smooth, smooth.n)
 }

inst/extdata/NormalDB.R

History View file @ 4635a93

@@ -60,7 +60,11 @@ if (!is.null(opt$normal_panel)) {
                          } else {
                              suppressPackageStartupMessages(library(PureCN))
                              flog.info("Creating mapping bias database.")
                     -        bias <- calculateMappingBiasVcf(opt$normal_panel, genome = genome)
                     +        if (file.exists(file.path(opt$normal_panel, "callset.json"))) {
                     +            bias <- calculateMappingBiasGatk4(opt$normal_panel, genome)
                     +        } else {
                     +            bias <- calculateMappingBiasVcf(opt$normal_panel, genome = genome)
                     +        }
                              saveRDS(bias, file = output.file)
+                         }
+                     }

inst/extdata/gatk4_pon_db.tgz

History View file @ 4635a93

67	71	new file mode 100644
68	72	Binary files /dev/null and b/inst/extdata/gatk4_pon_db.tgz differ

man/calculateMappingBiasGatk4.Rd

History View file @ 4635a93

                     new file mode 100644
@@ -0,0 +1,46 @@
                     +% Generated by roxygen2: do not edit by hand
                     +% Please edit documentation in R/calculateMappingBiasVcf.R
                     +\name{calculateMappingBiasGatk4}
                     +\alias{calculateMappingBiasGatk4}
                     +\title{Calculate Mapping Bias from GATK4 GenomicsDB}
                     +\usage{
                     +calculateMappingBiasGatk4(
                     +  workspace,
                     +  reference.genome,
                     +  min.normals = 2,
                     +  min.normals.betafit = 7,
                     +  min.median.coverage.betafit = 5
                     +)
                     +}
                     +\arguments{
                     +\item{workspace}{Path to the GenomicsDB created by \code{GenomicsDBImport}}
+                    +
                     +\item{reference.genome}{Reference FASTA file.}
+                    +
                     +\item{min.normals}{Minimum number of normals with heterozygous SNP for
                     +calculating position-specific mapping bias.}
+                    +
                     +\item{min.normals.betafit}{Minimum number of normals with heterozygous SNP
                     +fitting a beta distribution}
+                    +
                     +\item{min.median.coverage.betafit}{Minimum median coverage of normals with
                     +heterozygous SNP for fitting a beta distribution}
                     +}
                     +\value{
                     +A \code{GRanges} object with mapping bias and number of normal
                     +samples with this variant.
                     +}
                     +\description{
                     +Calculates the mapping bias for each variant in the provided
                     +panel of normals GenomicsDB.
                     +}
                     +\examples{
+                    +
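                     +\dontrun{
                     +resources_file <- system.file("extdata", "gatk4_pon_db.tgz",
                     +    package = "PureCN")
                     +tmp_dir <- tempfile("gatk4_pon_db_")
                     +untar(resources_file, exdir = tmp_dir)
                     +workspace <- file.path(tmp_dir, "gatk4_pon_db")
                     +bias <- calculateMappingBiasGatk4(workspace, "hg19")
                     +saveRDS(bias, "mapping_bias.rds")
                     +unlink(tmp_dir, recursive = TRUE)
                     +}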
+                    +
                     +}
                     +\author{
                     +Markus Riester
                     +}

tests/testthat/test_setMappingBiasVcf.R

History View file @ 4635a93

@@ -52,3 +52,16 @@ test_that("Precomputed mapping bias matches", {
                          vcf.single.file <- system.file("extdata", "example_single.vcf.gz", package = "PureCN")
                          expect_error(calculateMappingBiasVcf(vcf.single.file), "only a single sample")
                      })
+                    +
                     +# Checks that a mapping bias GRanges can be computed from the bundled
                     +# example GenomicsDB workspace; skipped when the optional packages are
                     +# not installed.
                     +test_that("GenomicsDB import works", {
                     +    skip_if_not_installed("genomicsdb")
                     +    skip_if_not_installed("jsonlite")
                     +    resources_file <- system.file("extdata", "gatk4_pon_db.tgz",
                     +        package = "PureCN")
                     +    # Extract into a dedicated sub-directory: unlinking tempdir() itself
                     +    # would delete the session temp directory that later tests rely on.
                     +    tmp_dir <- tempfile("gatk4_pon_db_")
                     +    dir.create(tmp_dir)
                     +    untar(resources_file, exdir = tmp_dir)
                     +    workspace <- file.path(tmp_dir, "gatk4_pon_db")
                     +    bias <- calculateMappingBiasGatk4(workspace, "hg19")
                     +    expect_equal(length(bias), 2101)
                     +    unlink(tmp_dir, recursive = TRUE)
                     +})

vignettes/Quick.Rmd

History View file @ 4635a93

@@ -449,10 +449,12 @@ Important recommendations:
                      ## Recommended _GATK4_ usage
                      ```
                     -# Recommended: Provide a normal panel VCF to remove mapping biases, pre-compute
                     +# Recommended: Provide a normal panel GenomicsDB to remove mapping biases, pre-compute
                      # position-specific bias for much faster runtimes with large panels
                      # This needs to be done only once for each assay
                     -Rscript $PURECN/NormalDB.R --outdir $OUT_REF --normal_panel $NORMAL_PANEL \
                     +# Requires the genomicsdb R package
                     +Rscript $PURECN/NormalDB.R --outdir $OUT_REF \
                     +    --normal_panel $GENOMICSDB_WORKSPACE_PATH/pon_db \
                          --assay agilent_v6 --genome hg19 --force
                      Rscript $PURECN/PureCN.R --out $OUT/$SAMPLEID  \

...	...	@@ -84,7 +84,8 @@ normal.panel.vcf.file = NULL, min.normals = 2, smooth = TRUE, smooth.n = 5) {
84	84	flog.warn("setMappingBiasVcf: no hits in %s.", mapping.bias.file)
85	85	return(data.frame(bias = tmp, mu = NA, rho = NA))
86	86	}
87		- mappingBias <- .calculateMappingBias(nvcf, min.normals)
	87	+ mappingBias <- .calculateMappingBias(nvcf = nvcf,
	88	+ min.normals = min.normals)
88	89	}
89	90	.annotateMappingBias(tmp, vcf, mappingBias, max.bias, smooth, smooth.n)
90	91	}