Bioconductor Code: PureCN

Raw Blame Patch Log History
#' Function to calculate coverage from BAM file
#'
#' Takes a BAM file and an interval file as input and returns coverage for each
#' interval. Coverage should be then GC-normalized using the
#' \code{\link{correctCoverageBias}} function before determining purity and
#' ploidy with \code{\link{runAbsoluteCN}}. Uses the \code{scanBam} function
#' and applies low quality, duplicate reads as well as secondary alignment
#' filters.
#'
#'
#' @param bam.file Filename of a BAM file.
#' @param interval.file File specifying the intervals. Interval is expected in
#' first column in format CHR:START-END. The \code{gc.gene.file} can be used.
#' @param output.file Optionally, write minimal coverage file. Can be read with
#' the \code{\link{readCoverageFile}} function.
#' @param index.file The bai index. This is expected without the .bai file
#' suffix, see \code{?scanBam}.
#' @param keep.duplicates Keep or remove duplicated reads.
#' @return Returns total and average coverage by intervals.
#' @author Markus Riester
#' @seealso \code{\link{preprocessIntervals}
#' \link{correctCoverageBias} \link{runAbsoluteCN}}
#' @examples
#'
#' bam.file <- system.file("extdata", "ex1.bam", package = "PureCN",
#'     mustWork = TRUE)
#' interval.file <- system.file("extdata", "ex1_intervals.txt",
#'     package = "PureCN", mustWork = TRUE)
#'
#' # Calculate raw coverage from BAM file. These need to be corrected for
#' # GC-bias using the correctCoverageBias function before determining purity
#' # and ploidy.
#' coverage <- calculateBamCoverageByInterval(bam.file = bam.file,
#'     interval.file = interval.file)
#'
#' @export calculateBamCoverageByInterval
#' @importFrom Rsamtools headerTabix ScanBamParam scanBamFlag
#'             scanBam scanFa scanFaIndex TabixFile
calculateBamCoverageByInterval <- function(bam.file, interval.file,
    output.file = NULL, index.file = bam.file, keep.duplicates = FALSE) {
    intervalGr <- readCoverageFile(interval.file)

    param <- ScanBamParam(what = c("pos", "qwidth", "flag"),
                which = intervalGr,
                flag = scanBamFlag(isUnmappedQuery = FALSE,
                             isNotPassingQualityControls = FALSE,
                             isSecondaryAlignment = FALSE,
                             isDuplicate = NA
                     )
             )

    xAll <- scanBam(bam.file, index = index.file, param = param)
    xDupFiltered <- .filterDuplicates(xAll)

    x <- xDupFiltered
    if (keep.duplicates) x <- xAll

    intervalGr$coverage <- sapply(seq_along(x), function(i)
        sum(coverage(IRanges(x[[i]][["pos"]], width = x[[i]][["qwidth"]]),
            shift = -start(intervalGr)[i], width = width(intervalGr)[i])))

    intervalGr$average.coverage <- intervalGr$coverage / width(intervalGr)

    intervalGr$counts <- as.numeric(sapply(x, function(y) length(y$pos)))
    intervalGr$duplication.rate <- 1 -
        sapply(xDupFiltered, function(y) length(y$pos)) /
        sapply(xAll, function(y) length(y$pos))

    if (!is.null(output.file)) {
        .writeCoverage(intervalGr, output.file)
    }
    invisible(intervalGr)
}

.writeCoverage <- function(intervalGr, output.file) {
    tmp <- data.frame(
        Target = as.character(intervalGr),
        total_coverage = intervalGr$coverage,
        counts = intervalGr$counts,
        on_target = intervalGr$on.target,
        duplication_rate = intervalGr$duplication.rate
    )
    write.table(tmp, file = output.file, row.names = FALSE, quote = FALSE)
}

.filterDuplicates <- function(x) {
    lapply(x, function(y) {
        idx <- y$flag < 1024
        lapply(y, function(z) z[idx])
    })
}