Bioconductor Code: FastqCleaner

Raw Blame Patch Log History
#' Remove left and right full and partial patterns
#' @param subject \code{\link[Biostrings:DNAString-class]{DNAString}} or 
#' \code{\link[Biostrings:XStringSet-class]{DNAStringSet}} object
#' @param Lpattern 5' pattern,
#'  \code{\link[Biostrings:DNAString-class]{DNAString}} object
#' @param Rpattern 3' pattern,
#'  \code{\link[Biostrings:DNAString-class]{DNAString}} object
#' @param error_rate Error rate (value in [0, 1]). 
#' The error rate is the proportion of mismatches allowed between
#' the adapter and the aligned portion of the subject.
#' For a given adapter A, the number of allowed mismatches between each 
#' subsequence s of A and the subject is computed as: error_rate * L_s,
#' where L_s is the length of the subsequence s.
#' @param with.indels Allow indels? 
#' @param anchored Can the adapter or partial adapter be within 
#' the sequence? (anchored = FALSE)
#' or only in the terminal regions of the sequence? (anchored = TRUE).
#' Default TRUE (trim only flanking regions)
#' @param fixed Parameter passed to 
#' \code{\link[Biostrings]{trimLRPatterns}} 
#' Default 'subject',  ambiguities in the pattern only are interpreted 
#' as wildcard. See the argument fixed in 
#' \code{\link[Biostrings]{trimLRPatterns}} 
#' @param ranges Return ranges? Default FALSE
#' @param checks Perform internal checks? Default TRUE
#' @param min_match_flank Do not trim in flanks of the subject,
#'  if a match has min_match_flank of less length. Default 1L 
#'  (only trim with >=2 coincidences in a flank match)
#' @param ... additional parameters passed to
#' \code{\link[Biostrings]{trimLRPatterns}} 
#' @return  Edited \code{\link[Biostrings:DNAString-class]{DNAString}} or 
#' \code{\link[Biostrings:XStringSet-class]{DNAStringSet}} object
#' @description This set of programs are internal, 
#' and the function adapter_filter is recommended for trimming. 
#' The programs can remove adapters and partial 
#' adapters from 3' and 5'. The adapters can be anchored or not. 
#' When indels are allowed, the error rate consists in the edit distance. 
#' IUPAC simbols are allowed. The methods use the 
#' \code{\link[Biostrings]{trimLRPatterns}} function
#'  of the \pkg{Biostrings} package, with some additions 
#'  to take into account e.g., partial adaptors.
#'  IUPAC symbols are allowed in all the cases. The present function 
#' also removes partial adapters, without the need of additional steps
#'  (for example, creating a padded adapter with 'Ns', etc). 
#' A similar result to the output of \code{\link[Biostrings]{trimLRPatterns}} 
#' can be obtained with the option anchored = TRUE.
#' When several matches are found, the function removes the subsequence 
#' that starts in the first match when cutRseq is used, or ends 
#' in the last match when cutLseq is used.
#' 
#' @examples
#' library(Biostrings)
#' 
#' subject <- DNAStringSet(c('ATCATGCCATCATGAT',
#' 'CATGATATTA', 'TCATG', 'AAAAAA', 'AGGTCATG'))
#' 
#' Lpattern <- Rpattern <- 'TCATG'
#' 
#' FastqCleaner:::cutLseq(subject, Lpattern)
#' FastqCleaner:::cutLseq(subject, Lpattern, ranges = TRUE)
#' FastqCleaner:::cutRseq(subject, Rpattern)
#' 
#' 
#' FastqCleaner:::cutLseq(subject, Lpattern, anchored = FALSE)
#' FastqCleaner:::cutLseq(subject, Lpattern,  error_rate = 0.2)
#' FastqCleaner:::cutLseq(subject, Lpattern,  error_rate = 0.2, 
#' with.indels = TRUE)
#' 
#' @author Leandro Roser \email{learoser@@gmail.com}
#' @rdname matching
#' @keywords internal


cutRseq <- function(subject, Rpattern, 
                    with.indels = FALSE,
                    fixed = "subject", 
                    error_rate = 0.2, 
                    anchored = TRUE, 
                    ranges = FALSE,
                    checks = TRUE, 
                    min_match_flank = 2L, 
                    ...) {
    
   Rpattern <- DNAString(Rpattern)
  
   if (error_rate > 1 || error_rate < 0) {
    stop("error_rate must be a number between 0 and 1")
   }

   if (checks) {
    
       if (!is(Rpattern, "DNAString")) {
           stop("Rpattern must be a character string or a DNAString object")
       }
 
       csub <- class(subject)
       if (csub != "DNAStringSet") {
           stop("subject must be a DNAString or DNAStringSet object")
       }
    
       if (csub == "DNAString") {
           subject <- as(subject[[1]], "DNAStringSet")
       }
    
   }
  
   p <- length(Rpattern)
   s_width <- width(subject)
   s <- max(width(subject))
  
   if(error_rate > 0) {
       flank_seq <- as.integer(seq_len(p) * error_rate)
   } else {
       flank_seq <- rep(0, length(seq_len(p)))
   }
  
  
   if (min_match_flank >= 1L) {
       if (p > min_match_flank) {
           flank_seq[seq_len(min_match_flank)] <- -1
       } else {
           return(subject)
       }
   }
 
   if(!anchored) {
       Rpattern <- as.character(Rpattern)
       maxlen <-  max(width(subject)) - nchar(Rpattern)
       if(maxlen > 0) {
           Rpattern <- paste0(Rpattern,  paste(rep("N",maxlen), collapse = ""))
       }
       Rpattern <- DNAString(Rpattern)
       flank_seq <- c(flank_seq, rep(0,maxlen))
   }
  
   out <- trimLRPatterns(Rpattern = Rpattern,
                       subject = subject, 
                       max.Rmismatch = flank_seq, 
                       with.Rindels = with.indels, 
                       Rfixed = fixed, 
                       ...)
  
  
   if (ranges) {
       out <- IRanges::IRanges(start = rep(1, length(out)), end = width(out))
   } 
  
   out
}

#' Remove left and right full and partial patterns
#' @rdname matching

cutLseq <- function(subject, Lpattern,
                    with.indels = FALSE, 
                    fixed = "subject", 
                    error_rate = 0.2, 
                    anchored = TRUE, 
                    ranges = FALSE, 
                    min_match_flank = 3L, 
                    checks = TRUE, ...) {


   Lpattern <- DNAString(Lpattern)

   if (checks) {
       if (!is(Lpattern, "DNAString")) {
           stop("Rpattern must be a character string or a DNAString object")
       }

       csub <- class(subject)
       if (csub != "DNAStringSet") {
           stop("subject must be a DNAStringSet object")
       }

       if (csub == "DNAString") {
           subject <- as(subject[[1]], "DNAStringSet")
       }
   }

   p <- length(Lpattern)
   s_width <- width(subject)
   s <- max(width(subject))


   if(error_rate > 0) {
       flank_seq <- as.integer(seq_len(p) * error_rate)
   } else {
       flank_seq <- rep(0, length(seq_len(p)))
   }


   if (min_match_flank >= 1L) {
       if (p > min_match_flank) {
           flank_seq[seq_len(min_match_flank)] <- -1
       } else {
           return(subject)
   }
   }


   if(!anchored) {
       Lpattern <- as.character(Lpattern)
       maxlen <-  max(width(subject)) - nchar(Lpattern)
       if(maxlen > 0) {
           Lpattern <- paste0(paste(rep("N",maxlen), collapse = ""), Lpattern)
       }
       Lpattern <- DNAString(Lpattern)
       flank_seq <- c(flank_seq, rep(0,maxlen))
  }


   out <- trimLRPatterns(Lpattern = Lpattern,
                       subject = subject, 
                       max.Lmismatch = flank_seq, 
                       with.Lindels = with.indels, 
                       Lfixed = fixed, 
                       ...)


   if (ranges) {
       out <- IRanges::IRanges(start = rep(1, length(out)), end = width(out))
   } 

   out
}