Bioconductor Code: SimFFPE

Raw Blame Patch Log History
readSimFFPE <- function(sourceSeq, referencePath, PhredScoreProfile, outFile,
                        coverage, readLen=150, meanInsertLen=250, 
                        sdInsertLen=80, enzymeCut=FALSE, chimericProp=0.1, 
                        sameChrProp=0.43, windowLen=5000, adjChimProp=0.63, 
                        sameStrandProp=0.65, meanLogSRCRLen=1.8, 
                        sdLogSRCRLen=0.55, maxSRCRLen=32,
                        meanLogSRCRDist=4.7, sdLogSRCRDist=0.35, 
                        distWinLen=5000, spikeWidth = 1500, 
                        betaShape1=0.5, betaShape2=0.5, sameTarRegionProb=0,
                        adjFactor=1.65, distFactor=1.65, 
                        chimMutRate=0.003, noiseRate=0.0015,
                        highNoiseRate=0.08, highNoiseProp=0.01,
                        pairedEnd=TRUE, prefix="SimFFPE", threads=1,
                        adjChimeric=TRUE, distChimeric=TRUE,
                        normalReads=TRUE, overWrite=FALSE) {
    
    if(missing(sourceSeq)) stop("sourceSeq is required")
    if(missing(referencePath)) stop("referencePath is required")
    if(missing(PhredScoreProfile)) stop("PhredScoreProfile is required")
    if(missing(outFile)) stop("outFile is required")
    if(missing(coverage)) stop("coverage is required")
    if(max(betaShape1, betaShape2) > 1) {
        stop ("betaShape1 and betaShape2 should not be greater than 1")
    }
    if(min(betaShape1, betaShape2) <= 0) {
        stop ("betaShape1 and betaShape2 should be greater than 0")
    }
    
    if (nrow(PhredScoreProfile) != readLen) {
        stop("The number of rows in PhredScoreProfile should be the same as", 
             " the input readLen")
    }
    
    if (overWrite) {
        overWriteFastq(outFile = outFile, pairedEnd = pairedEnd)
    }
    
    covFFPE <- coverage * chimericProp
    covNorm <- coverage * (1 - chimericProp)
    adjChimProp <- sameChrProp * adjChimProp
    sameStrandProp <- sameStrandProp * 1.05
    
    reference <- readDNAStringSet(referencePath)
    reference <- reference[width(reference) >= distWinLen]
    
    names(reference) <- unlist(lapply(names(reference),
                                      function(x) strsplit(x, " ")[[1]][1]))
    names(sourceSeq) <- unlist(lapply(names(sourceSeq),
                                      function(x) strsplit(x, " ")[[1]][1]))
    nReadsLocal <- 0
    nReadsDistant <- 0
    nReadsNorm <- 0
    
    cl <- makeCluster(threads)
    registerDoParallel(cl)
    message("Using ", threads, " thread(s) for simulation.")
    
    for (i in seq_along(sourceSeq)) {
        chr <- names(sourceSeq)[i]
        message("Simulating FFPE reads on the chromosome ", chr, "...")
        fullSeq <- sourceSeq[[i]]
        
        if (adjChimeric) {
            startPoses <- seq(1, length(fullSeq), by=windowLen)
            endPoses <- startPoses + windowLen - 1
            endPoses[length(endPoses)] <- length(fullSeq)
            if(pairedEnd) {
                nSeedsAll <- round(covFFPE * adjChimProp  * adjFactor * 
                                       length(fullSeq) / readLen / 2)
            } else {
                enzymeCut <- FALSE
                nSeedsAll <- round(covFFPE * adjChimProp  * adjFactor * 
                                       length(fullSeq) / readLen)
            }
            
            nSeedsRegional <- floor(nSeedsAll / length(startPoses))
            nSeeds <- rep(nSeedsRegional, length(startPoses))
            restSeeds <- nSeedsAll - sum(nSeeds)
            nSeeds[length(startPoses)] <- 
                round(nSeedsRegional / windowLen * 
                          (length(fullSeq) - startPoses[length(startPoses)]+1))
            tmp <- sample(seq_along(startPoses), restSeeds)
            nSeeds[tmp] <- nSeeds[tmp] + 1

            startPos <- NULL
            endPos <- NULL
            nSeed <- NULL

            simReads <-
                foreach(startPos = startPoses,
                        endPos = endPoses,
                        nSeed = nSeeds,
                        .combine = "c",
                        .inorder = TRUE,
                        .verbose = FALSE,
                        .errorhandling    = "remove",
                        .packages = c("Biostrings"),
                        .export = c("generateRegionalChimericReads",
                                    "generateRegionalChimericSeqs",
                                    "generateEnzymicCutSeq",
                                    "addRandomMutation",
                                    "addNoise",
                                    "generateReads",
                                    "rtruncnorm")
                ) %dopar% {
                    generateRegionalChimericReads(
                        seq = fullSeq[startPos:endPos],
                        nSeed = nSeed,
                        meanLogSRCRLen = meanLogSRCRLen,
                        sdLogSRCRLen = sdLogSRCRLen,
                        maxSRCRLen=maxSRCRLen,
                        meanLogSRCRDist = meanLogSRCRDist,
                        sdLogSRCRDist = sdLogSRCRDist,
                        meanInsertLen = meanInsertLen,
                        sdInsertLen = sdInsertLen,
                        enzymeCut = enzymeCut,
                        readLen = readLen,
                        chimMutRate = chimMutRate,
                        noiseRate = noiseRate,
                        highNoiseRate = highNoiseRate,
                        highNoiseProp = highNoiseProp,
                        pairedEnd = pairedEnd,
                        sameStrandProp = sameStrandProp)
                }
            
            readsToFastq(simReads = simReads,
                         PhredScoreProfile = PhredScoreProfile,
                         prefix = prefix,
                         prefix2 = "adjChimeric",
                         chr = chr,
                         pairedEnd = pairedEnd,
                         outFile = outFile,
                         threads = threads)
            message("Generated ", length(simReads), " adjacent chimeric reads ",
                    "on chromosome ", chr, ".")
            nReadsLocal <- nReadsLocal + length(simReads)
            simReads <- NULL
        }
        
        if (distChimeric) {
            
            allChr <- names(reference)
            totalLen <- do.call("sum", lapply(reference, length))
            chrProb <- unlist(lapply(reference, length)) / totalLen
            tmpProb <- (sameChrProp - adjChimProp) / (1 - adjChimProp)
            chrProb[chr] <- 0
            chrProb <- chrProb / sum(chrProb) * (1 - tmpProb)
            chrProb[chr] <- tmpProb
            
            if (pairedEnd) {
                nSeedDistant <- round(covFFPE * (1 - adjChimProp) * distFactor *
                                          length(fullSeq) / readLen / 2)
            } else {
                nSeedDistant <- round(covFFPE * (1 - adjChimProp) * distFactor *
                                          length(fullSeq) / readLen)
            }
            
            
            if (nSeedDistant > 1e+4) {
                distWindow <- round(length(fullSeq) / (nSeedDistant / 1e+4))
                nTimes <- ceiling(length(fullSeq) / distWindow / threads)
                nSeedDistant <- 
                    round(nSeedDistant *  distWindow / length(fullSeq))
            } else {
                distWindow <- length(fullSeq)
                nTimes <- 1
            }
            
            allStartPos <- seq(1, length(fullSeq), by = distWindow)
            nReadsDistantChr <- 0
            firstID <- 1
            for (j in seq_len(nTimes)) {
                tmp <- min(j * threads, length(allStartPos))
                startPos <- allStartPos[((j - 1) * threads + 1):tmp]
                simReads <-
                    foreach(startPos = startPos,
                            .combine = "c",
                            .inorder = TRUE,
                            .verbose = FALSE,
                            .packages = "Biostrings",
                            .errorhandling = "remove",
                            .export = c("generateDistantChimericReads",
                                        "generateDistantChimericSeqs",
                                        "generateReads",
                                        "addRandomMutation",
                                        "addNoise",
                                        "generateTargetSeqs",
                                        "rbeta", "round",
                                        "rtruncnorm")
                    ) %dopar% {
                        generateDistantChimericReads(
                            fullSeq = fullSeq,
                            referencePath = referencePath,
                            startPos = startPos,
                            windowLen = distWindow,
                            distWinLen = distWinLen,
                            nSeed = nSeedDistant,
                            meanLogSRCRLen = meanLogSRCRLen,
                            sdLogSRCRLen = sdLogSRCRLen,
                            maxSRCRLen = maxSRCRLen,
                            meanInsertLen = meanInsertLen,
                            sdInsertLen = sdInsertLen,
                            readLen = readLen,
                            allChr = allChr,
                            chrProb = chrProb,
                            chimMutRate = chimMutRate,
                            noiseRate = noiseRate,
                            highNoiseRate = highNoiseRate,
                            highNoiseProp = highNoiseProp,
                            pairedEnd = pairedEnd,
                            spikeWidth = spikeWidth,
                            betaShape1 = betaShape1,
                            betaShape2 = betaShape2,
                            sameTarRegionProb = sameTarRegionProb,
                            sameStrandProp= 0.5)
                    }
                
                readsToFastq(simReads = simReads,
                             PhredScoreProfile = PhredScoreProfile,
                             prefix = prefix,
                             prefix2 = "distChimeric",
                             chr = chr,
                             firstID = firstID,
                             pairedEnd = pairedEnd,
                             outFile = outFile,
                             threads = threads)
                if (pairedEnd) {
                    firstID = firstID + length(simReads) / 2
                } else {
                    firstID = firstID + length(simReads)
                }
                nReadsDistantChr <- nReadsDistantChr + length(simReads)
                simReads <- NULL
            }
            message("Generated ", nReadsDistantChr,
                    " distant chimeric reads ",
                    "on chromosome ", chr, ".")
            nReadsDistant <- nReadsDistant + nReadsDistantChr
        }
        
        if (normalReads) {
            if (pairedEnd) {
                nSeqNorm <- round(covNorm * length(fullSeq) / readLen / 2)
            } else {
                nSeqNorm <- round(covNorm * length(fullSeq) / readLen)
            }
            if (nSeqNorm > 5e+6) {
                nSeqPerTime <- c(rep(5e+6, floor(nSeqNorm / 5e+6)), 
                                 nSeqNorm %% 5e+6)
                nTimes <- ceiling(nSeqNorm / 5e+6)
            } else {
                nSeqPerTime <- nSeqNorm
                nTimes <- 1
            }
            nReadsNormChr <- 0
            firstID <- 1
            for (j in seq_len(nTimes)) {
                
                nSeqPerCluster <- round(nSeqPerTime[j] / threads)
                
                simReads <- foreach(i = seq_len(threads),
                                    .combine = "c",
                                    .inorder = TRUE,
                                    .verbose = FALSE,
                                    #.errorhandling  = "remove",
                                    .packages = c("Biostrings"),
                                    .export = c("addNoise",
                                                "rtruncnorm",
                                                "generateNormalReads")
                ) %dopar% {
                    generateNormalReads(
                        fullSeq = fullSeq,
                        nSeq = nSeqPerCluster,
                        meanInsertLen = meanInsertLen,
                        sdInsertLen = sdInsertLen,
                        readLen = readLen,
                        noiseRate = noiseRate,
                        highNoiseRate = highNoiseRate,
                        highNoiseProp = highNoiseProp,
                        pairedEnd = pairedEnd)
                }
                
                readsToFastq(simReads = simReads,
                             PhredScoreProfile = PhredScoreProfile,
                             prefix = prefix,
                             prefix2 = "Normal",
                             chr = chr,
                             firstID = firstID,
                             pairedEnd = pairedEnd,
                             outFile = outFile,
                             threads = threads)
                if (pairedEnd) {
                    firstID = firstID + length(simReads) / 2
                } else {
                    firstID = firstID + length(simReads)
                }
                nReadsNormChr <- nReadsNormChr + length(simReads)
                simReads <- NULL
            }
            message("Generated ", nReadsNormChr, " normal reads ",
                    "on chromosome ", chr, ".")
            nReadsNorm <- nReadsNorm + nReadsNormChr
        }
    }
    
    stopCluster(cl)
    
    message("In totoal ", nReadsLocal, " adjacent chimeric reads, ",
            nReadsDistant, " distant chimeric reads, ",
            nReadsNorm, " normal reads were generated.",
            "\nAlltogether ", nReadsLocal + nReadsDistant + nReadsNorm,
            " reads were generated. ", nReadsLocal + nReadsDistant, 
            " reads (", round((nReadsLocal + nReadsDistant) / 
                                  (nReadsLocal + nReadsDistant + nReadsNorm), 
                              4) * 100, 
            "%, adjusted by chimericProp) are artifact chimeric reads.",
            "\nOf all artifact chimeric reads, ", nReadsLocal, " reads (",
            round(nReadsLocal / (nReadsLocal + nReadsDistant), 4) * 100,
            "%, adjusted by sameChrProp * adjChimProp)",
            " are adjacent chimeric reads.",
            "\nSimulation done.")
}