R/param_estimator.R
ec432b6b
 countPhredScores <- function(qual) {
aac4a752
     qual <- as.character(qual)
     qual <- lapply(qual, function(x) unlist(strsplit(x, "")))
     qual <- lapply(qual, function(x) {names(x) <- seq(1, length(x)); return(x)})
     qual <- as.data.frame(do.call("bind_rows", qual))
ec432b6b
     qualProb <- apply(qual, 2, table)
973b1537
     qualProb <- as.matrix(do.call("bind_rows", qualProb))
     qualProb <- qualProb[, order(phred2ASCIIOffset(colnames(qualProb)))]
aac4a752
     qualProb[is.na(qualProb)] <- 0
     return(qualProb)
 }
 
 
 findReads <- function(bamFile,
                       tag = character(0),
                       what=scanBamWhat(),
                       which=IRangesList(),
                       isSupplementary=NA,
                       reverseComplement=NA,
                       mapqFilter=0) {
 
     f <- scanBamFlag(isDuplicate = FALSE,
b3c94ceb
                      isSupplementaryAlignment = isSupplementary)
aac4a752
 
     p <- ScanBamParam(flag = f,
                       tag = tag,
                       simpleCigar = FALSE,
                       reverseComplement = reverseComplement,
                       what = what,
                       which = which,
                       mapqFilter = mapqFilter)
 
     bam <- scanBam(bamFile, param = p)[[1]]
     return(bam)
 }
 
 
ec432b6b
 calcPhredScoreProfile <- function(bamFilePath, mapqFilter=0, maxFileSize=1,
                                   targetRegions=NULL, subsampleRatio=NA,
                                   subsampleRegionLength=1e+5,
                                   disableSubsampling=FALSE, threads=1) {
7a6787f4
     
     if(missing(bamFilePath)) stop("bamFilePath is required")
     
aac4a752
     bamFile <- BamFile(bamFilePath)
     seqInfo <- scanBamHeader(bamFile)$targets
973b1537
     
b3c94ceb
     colnames(targetRegions) <- c("chr", "start", "end")
     targetRegions <- targetRegions[targetRegions$chr %in% names(seqInfo), ]
     
973b1537
     if (length(targetRegions) != 0 & is(targetRegions, "GenomicRanges")) {
         targetRegions <- data.frame(seqnames(targetRegions), 
                                     start(targetRegions), 
                                     end(targetRegions))
     }
     
aac4a752
     if (file.info(bamFilePath)$size / 1e+9 <= maxFileSize |
         disableSubsampling) {
         if (length(targetRegions) == 0) {
             regions <- GRanges(names(seqInfo), IRanges(1, seqInfo))
ec432b6b
             regions <- 
                 unlist(slidingWindows(regions, width=50000, step = 50000))
aac4a752
         } else {
             colnames(targetRegions) <- c("chr", "start", "end")
             regions <- GRanges(targetRegions$chr, IRanges(targetRegions$start,
                                                           targetRegions$end))
         }
         if (file.info(bamFilePath)$size / 1e+9 <= maxFileSize) {
             message("BAM filesize smaller than ",
                     maxFileSize,
                     " gigabytes, calculate Phred quality score profile ",
                     "based on all reads.")
         } else {
             message("Subsampling disabled, ",
                     "calculate Phred quality score profile based on all reads.")
         }
     } else {
         if (is.na(subsampleRatio)) {
             subsampleRatio <-
                 maxFileSize * 1e+9 / file.info(bamFilePath)$size
         }
973b1537
         message("BAM filesize greater than ",
                 maxFileSize,
                 " gigabytes, calculation based on subsampled reads ",
                 "(subsample ratio: ",
                 subsampleRatio, ").")
aac4a752
         if (length(targetRegions) == 0) {
             nRegions <- seqInfo * subsampleRatio / subsampleRegionLength
             seqInfo <- seqInfo[nRegions >= 1]
             nRegions <- ceiling(nRegions[nRegions >= 1])
             if (length(seqInfo) == 0) {
                 stop("subsampleRegionLength is too large ",
                      "with this subsampleRatio.")
             }
             subsampleRegions <- GRanges()
             for (i in seq_along(seqInfo)) {
                 startPos <- sample(seq(1, seqInfo[i], subsampleRegionLength),
                                    nRegions[i], replace = FALSE)
                 subsampleRegion <-
                     GRanges(seqnames = names(seqInfo[i]),
                             ranges = IRanges(startPos,
                                              startPos +
                                                  subsampleRegionLength - 1))
 
                 suppressWarnings(subsampleRegions <-
                                      c(subsampleRegions, subsampleRegion))
             }
             regions <- sort(subsampleRegions)
         } else {
             regions <- GRanges(targetRegions$chr, IRanges(targetRegions$start,
                                                           targetRegions$end))
             regions <- regions[sample(seq_along(regions),
                                       round(length(regions) * subsampleRatio))]
         }
     }
b3c94ceb
     
ec432b6b
     message("Get reads from BAM file in ", length(regions), " regions.", 
973b1537
             "\nUsing ", threads, " thread(s).")
aac4a752
     cl <- makeCluster(threads)
     registerDoParallel(cl)
 
     readInfo <- foreach(i = seq_along(regions),
                         .combine = "c",
                         .inorder = TRUE,
                         .verbose = FALSE,
ec432b6b
                         .errorhandling = "remove",
aac4a752
                         .packages = c("Biostrings", "dplyr",
                                       "GenomicRanges", "Rsamtools"),
ec432b6b
                         .export = c("findReads", "countPhredScores")
aac4a752
     ) %dopar% {
ec432b6b
             temp <- findReads(bamFile,
                               what = "qual",
                               which = regions[i],
                               reverseComplement=TRUE,
                               mapqFilter = mapqFilter)
             if (length(temp$qual) > 1) {
                 temp$qual <- countPhredScores(temp$qual)
                 return(temp)
             }
aac4a752
     }
     stopCluster(cl)
ec432b6b
     quals <- unname(readInfo[names(readInfo) == "qual"])
     if (length(quals) == 0) {
aac4a752
         stop("No reads found.")
     }
ec432b6b
     readLen <- max(unlist(lapply(quals, nrow)))
     PhredScores <- unique(unlist(lapply(quals, colnames)))
     PhredScores <- PhredScores[order(phred2ASCIIOffset(PhredScores))]
     for (i in seq_along(quals)) {
         qual <- quals[[i]]
         PhredScoreDiff <- setdiff(PhredScores, colnames(qual))
         if (length(PhredScoreDiff) > 0 ) {
             temp = matrix(0, nrow = nrow(qual), ncol = length(PhredScoreDiff))
             colnames(temp) <- PhredScoreDiff
             quals[[i]] <- cbind(qual, temp)
             quals[[i]] <- quals[[i]][, PhredScores]
         }
         rowDiff <- readLen - nrow(qual)
         if (rowDiff > 0) {
             temp = matrix(0, nrow = rowDiff, ncol = length(PhredScores))
             quals[[i]] <- rbind(quals[[i]], temp)
         }
aac4a752
     }
ec432b6b
     qualProb <- Reduce("+", quals)
     qualProb <- qualProb / rowSums(qualProb)
     return(qualProb)
aac4a752
 }