R/readSimFFPE_WGS.R
ec432b6b
 readSimFFPE <- function(sourceSeq, referencePath, PhredScoreProfile, outFile,
                         coverage, readLen=150, meanInsertLen=250, 
b3c94ceb
                         sdInsertLen=80, enzymeCut=FALSE, chimericProp=0.1, 
                         sameChrProp=0.43, windowLen=5000, adjChimProp=0.63, 
                         sameStrandProp=0.65, meanLogSRCRLen=1.8, 
                         sdLogSRCRLen=0.55, maxSRCRLen=32,
                         meanLogSRCRDist=4.7, sdLogSRCRDist=0.35, 
                         distWinLen=5000, spikeWidth = 1500, 
ec432b6b
                         betaShape1=0.5, betaShape2=0.5, sameTarRegionProb=0,
b3c94ceb
                         adjFactor=1.65, distFactor=1.65, 
                         chimMutRate=0.003, noiseRate=0.0015,
                         highNoiseRate=0.08, highNoiseProp=0.01,
aac4a752
                         pairedEnd=TRUE, prefix="SimFFPE", threads=1,
b3c94ceb
                         adjChimeric=TRUE, distChimeric=TRUE,
aac4a752
                         normalReads=TRUE, overWrite=FALSE) {
973b1537
     
7a6787f4
     if(missing(sourceSeq)) stop("sourceSeq is required")
ec432b6b
     if(missing(referencePath)) stop("referencePath is required")
7a6787f4
     if(missing(PhredScoreProfile)) stop("PhredScoreProfile is required")
     if(missing(outFile)) stop("outFile is required")
     if(missing(coverage)) stop("coverage is required")
ec432b6b
     if(max(betaShape1, betaShape2) > 1) {
         stop ("betaShape1 and betaShape2 should not be greater than 1")
     }
     if(min(betaShape1, betaShape2) <= 0) {
         stop ("betaShape1 and betaShape2 should be greater than 0")
     }
7a6787f4
     
973b1537
     if (nrow(PhredScoreProfile) != readLen) {
         stop("The number of rows in PhredScoreProfile should be the same as", 
              " the input readLen")
     }
     
aac4a752
     if (overWrite) {
         overWriteFastq(outFile = outFile, pairedEnd = pairedEnd)
     }
ec432b6b
     
b3c94ceb
     covFFPE <- coverage * chimericProp
     covNorm <- coverage * (1 - chimericProp)
     adjChimProp <- sameChrProp * adjChimProp
     sameStrandProp <- sameStrandProp * 1.05
ec432b6b
     
     reference <- readDNAStringSet(referencePath)
b3c94ceb
     reference <- reference[width(reference) >= distWinLen]
ec432b6b
     
aac4a752
     names(reference) <- unlist(lapply(names(reference),
                                       function(x) strsplit(x, " ")[[1]][1]))
     names(sourceSeq) <- unlist(lapply(names(sourceSeq),
                                       function(x) strsplit(x, " ")[[1]][1]))
     nReadsLocal <- 0
     nReadsDistant <- 0
     nReadsNorm <- 0
ec432b6b
     
aac4a752
     cl <- makeCluster(threads)
     registerDoParallel(cl)
973b1537
     message("Using ", threads, " thread(s) for simulation.")
ec432b6b
     
aac4a752
     for (i in seq_along(sourceSeq)) {
         chr <- names(sourceSeq)[i]
973b1537
         message("Simulating FFPE reads on the chromosome ", chr, "...")
aac4a752
         fullSeq <- sourceSeq[[i]]
b3c94ceb
         
         if (adjChimeric) {
             startPoses <- seq(1, length(fullSeq), by=windowLen)
             endPoses <- startPoses + windowLen - 1
             endPoses[length(endPoses)] <- length(fullSeq)
             if(pairedEnd) {
                 nSeedsAll <- round(covFFPE * adjChimProp  * adjFactor * 
                                        length(fullSeq) / readLen / 2)
             } else {
                 enzymeCut <- FALSE
                 nSeedsAll <- round(covFFPE * adjChimProp  * adjFactor * 
                                        length(fullSeq) / readLen)
             }
             
             nSeedsRegional <- floor(nSeedsAll / length(startPoses))
             nSeeds <- rep(nSeedsRegional, length(startPoses))
             restSeeds <- nSeedsAll - sum(nSeeds)
             nSeeds[length(startPoses)] <- 
                 round(nSeedsRegional / windowLen * 
                           (length(fullSeq) - startPoses[length(startPoses)]+1))
             tmp <- sample(seq_along(startPoses), restSeeds)
             nSeeds[tmp] <- nSeeds[tmp] + 1
 
aac4a752
             startPos <- NULL
b3c94ceb
             endPos <- NULL
             nSeed <- NULL
 
aac4a752
             simReads <-
b3c94ceb
                 foreach(startPos = startPoses,
                         endPos = endPoses,
                         nSeed = nSeeds,
aac4a752
                         .combine = "c",
                         .inorder = TRUE,
                         .verbose = FALSE,
973b1537
                         .errorhandling    = "remove",
aac4a752
                         .packages = c("Biostrings"),
                         .export = c("generateRegionalChimericReads",
                                     "generateRegionalChimericSeqs",
                                     "generateEnzymicCutSeq",
                                     "addRandomMutation",
                                     "addNoise",
                                     "generateReads",
                                     "rtruncnorm")
                 ) %dopar% {
                     generateRegionalChimericReads(
b3c94ceb
                         seq = fullSeq[startPos:endPos],
                         nSeed = nSeed,
                         meanLogSRCRLen = meanLogSRCRLen,
                         sdLogSRCRLen = sdLogSRCRLen,
                         maxSRCRLen=maxSRCRLen,
                         meanLogSRCRDist = meanLogSRCRDist,
                         sdLogSRCRDist = sdLogSRCRDist,
aac4a752
                         meanInsertLen = meanInsertLen,
                         sdInsertLen = sdInsertLen,
                         enzymeCut = enzymeCut,
                         readLen = readLen,
ec432b6b
                         chimMutRate = chimMutRate,
aac4a752
                         noiseRate = noiseRate,
                         highNoiseRate = highNoiseRate,
b3c94ceb
                         highNoiseProp = highNoiseProp,
aac4a752
                         pairedEnd = pairedEnd,
b3c94ceb
                         sameStrandProp = sameStrandProp)
aac4a752
                 }
ec432b6b
             
aac4a752
             readsToFastq(simReads = simReads,
                          PhredScoreProfile = PhredScoreProfile,
                          prefix = prefix,
b3c94ceb
                          prefix2 = "adjChimeric",
aac4a752
                          chr = chr,
                          pairedEnd = pairedEnd,
                          outFile = outFile,
                          threads = threads)
b3c94ceb
             message("Generated ", length(simReads), " adjacent chimeric reads ",
973b1537
                     "on chromosome ", chr, ".")
aac4a752
             nReadsLocal <- nReadsLocal + length(simReads)
ec432b6b
             simReads <- NULL
aac4a752
         }
ec432b6b
         
b3c94ceb
         if (distChimeric) {
             
             allChr <- names(reference)
             totalLen <- do.call("sum", lapply(reference, length))
             chrProb <- unlist(lapply(reference, length)) / totalLen
             tmpProb <- (sameChrProp - adjChimProp) / (1 - adjChimProp)
             chrProb[chr] <- 0
             chrProb <- chrProb / sum(chrProb) * (1 - tmpProb)
             chrProb[chr] <- tmpProb
             
aac4a752
             if (pairedEnd) {
b3c94ceb
                 nSeedDistant <- round(covFFPE * (1 - adjChimProp) * distFactor *
ec432b6b
                                           length(fullSeq) / readLen / 2)
aac4a752
             } else {
b3c94ceb
                 nSeedDistant <- round(covFFPE * (1 - adjChimProp) * distFactor *
ec432b6b
                                           length(fullSeq) / readLen)
aac4a752
             }
ec432b6b
             
b3c94ceb
             
             if (nSeedDistant > 1e+4) {
                 distWindow <- round(length(fullSeq) / (nSeedDistant / 1e+4))
ec432b6b
                 nTimes <- ceiling(length(fullSeq) / distWindow / threads)
                 nSeedDistant <- 
                     round(nSeedDistant *  distWindow / length(fullSeq))
             } else {
                 distWindow <- length(fullSeq)
                 nTimes <- 1
             }
             
             allStartPos <- seq(1, length(fullSeq), by = distWindow)
             nReadsDistantChr <- 0
             firstID <- 1
             for (j in seq_len(nTimes)) {
                 tmp <- min(j * threads, length(allStartPos))
                 startPos <- allStartPos[((j - 1) * threads + 1):tmp]
                 simReads <-
                     foreach(startPos = startPos,
                             .combine = "c",
                             .inorder = TRUE,
                             .verbose = FALSE,
                             .packages = "Biostrings",
                             .errorhandling = "remove",
                             .export = c("generateDistantChimericReads",
                                         "generateDistantChimericSeqs",
                                         "generateReads",
                                         "addRandomMutation",
                                         "addNoise",
                                         "generateTargetSeqs",
                                         "rbeta", "round",
                                         "rtruncnorm")
                     ) %dopar% {
                         generateDistantChimericReads(
                             fullSeq = fullSeq,
                             referencePath = referencePath,
                             startPos = startPos,
                             windowLen = distWindow,
b3c94ceb
                             distWinLen = distWinLen,
ec432b6b
                             nSeed = nSeedDistant,
b3c94ceb
                             meanLogSRCRLen = meanLogSRCRLen,
                             sdLogSRCRLen = sdLogSRCRLen,
                             maxSRCRLen = maxSRCRLen,
ec432b6b
                             meanInsertLen = meanInsertLen,
                             sdInsertLen = sdInsertLen,
                             readLen = readLen,
                             allChr = allChr,
                             chrProb = chrProb,
                             chimMutRate = chimMutRate,
                             noiseRate = noiseRate,
                             highNoiseRate = highNoiseRate,
b3c94ceb
                             highNoiseProp = highNoiseProp,
ec432b6b
                             pairedEnd = pairedEnd,
                             spikeWidth = spikeWidth,
                             betaShape1 = betaShape1,
                             betaShape2 = betaShape2,
                             sameTarRegionProb = sameTarRegionProb,
b3c94ceb
                             sameStrandProp= 0.5)
ec432b6b
                     }
                 
                 readsToFastq(simReads = simReads,
                              PhredScoreProfile = PhredScoreProfile,
                              prefix = prefix,
b3c94ceb
                              prefix2 = "distChimeric",
ec432b6b
                              chr = chr,
                              firstID = firstID,
                              pairedEnd = pairedEnd,
                              outFile = outFile,
                              threads = threads)
                 if (pairedEnd) {
                     firstID = firstID + length(simReads) / 2
                 } else {
                     firstID = firstID + length(simReads)
aac4a752
                 }
ec432b6b
                 nReadsDistantChr <- nReadsDistantChr + length(simReads)
                 simReads <- NULL
             }
             message("Generated ", nReadsDistantChr,
973b1537
                     " distant chimeric reads ",
                     "on chromosome ", chr, ".")
ec432b6b
             nReadsDistant <- nReadsDistant + nReadsDistantChr
aac4a752
         }
ec432b6b
         
aac4a752
         if (normalReads) {
             if (pairedEnd) {
                 nSeqNorm <- round(covNorm * length(fullSeq) / readLen / 2)
             } else {
                 nSeqNorm <- round(covNorm * length(fullSeq) / readLen)
             }
ec432b6b
             if (nSeqNorm > 5e+6) {
                 nSeqPerTime <- c(rep(5e+6, floor(nSeqNorm / 5e+6)), 
                                  nSeqNorm %% 5e+6)
                 nTimes <- ceiling(nSeqNorm / 5e+6)
             } else {
                 nSeqPerTime <- nSeqNorm
                 nTimes <- 1
aac4a752
             }
ec432b6b
             nReadsNormChr <- 0
             firstID <- 1
             for (j in seq_len(nTimes)) {
                 
                 nSeqPerCluster <- round(nSeqPerTime[j] / threads)
                 
                 simReads <- foreach(i = seq_len(threads),
                                     .combine = "c",
                                     .inorder = TRUE,
                                     .verbose = FALSE,
                                     #.errorhandling  = "remove",
                                     .packages = c("Biostrings"),
                                     .export = c("addNoise",
                                                 "rtruncnorm",
                                                 "generateNormalReads")
                 ) %dopar% {
                     generateNormalReads(
                         fullSeq = fullSeq,
                         nSeq = nSeqPerCluster,
                         meanInsertLen = meanInsertLen,
                         sdInsertLen = sdInsertLen,
                         readLen = readLen,
                         noiseRate = noiseRate,
                         highNoiseRate = highNoiseRate,
b3c94ceb
                         highNoiseProp = highNoiseProp,
ec432b6b
                         pairedEnd = pairedEnd)
                 }
                 
                 readsToFastq(simReads = simReads,
                              PhredScoreProfile = PhredScoreProfile,
                              prefix = prefix,
                              prefix2 = "Normal",
                              chr = chr,
                              firstID = firstID,
                              pairedEnd = pairedEnd,
                              outFile = outFile,
                              threads = threads)
                 if (pairedEnd) {
                     firstID = firstID + length(simReads) / 2
                 } else {
                     firstID = firstID + length(simReads)
                 }
                 nReadsNormChr <- nReadsNormChr + length(simReads)
                 simReads <- NULL
             }
             message("Generated ", nReadsNormChr, " normal reads ",
973b1537
                     "on chromosome ", chr, ".")
ec432b6b
             nReadsNorm <- nReadsNorm + nReadsNormChr
aac4a752
         }
     }
ec432b6b
     
aac4a752
     stopCluster(cl)
ec432b6b
     
b3c94ceb
     message("In totoal ", nReadsLocal, " adjacent chimeric reads, ",
973b1537
             nReadsDistant, " distant chimeric reads, ",
             nReadsNorm, " normal reads were generated.",
             "\nAlltogether ", nReadsLocal + nReadsDistant + nReadsNorm,
b3c94ceb
             " reads were generated. ", nReadsLocal + nReadsDistant, 
             " reads (", round((nReadsLocal + nReadsDistant) / 
                                   (nReadsLocal + nReadsDistant + nReadsNorm), 
                               4) * 100, 
             "%, adjusted by chimericProp) are artifact chimeric reads.",
             "\nOf all artifact chimeric reads, ", nReadsLocal, " reads (",
             round(nReadsLocal / (nReadsLocal + nReadsDistant), 4) * 100,
             "%, adjusted by sameChrProp * adjChimProp)",
             " are adjacent chimeric reads.",
             "\nSimulation done.")
aac4a752
 }