#' Create random sequences #' @description Create a #' \code{\link[Biostrings:XStringSet-class]{DNAStringSet}} object #' with random sequences #' @param slength Number of sequences #' @param swidth Width of the sequences #' @param nuc Create sequences of DNA (nucleotides = c('A', 'C', 'G', 'T')) or #' RNA (nucleotides = c('A, 'C', 'G', 'U'))?. Default: 'DNA' #' @param prob A vector of four probability values used #' to set the frequency of the nucleotides 'A', 'C', 'G', 'T', for DNA, #' or 'A', 'C', 'G', 'U', for RNA. For example = c(0.25, 0.25, 0.5, 0). #' Default is = c(0.25, 0.25, 0.25, 0.25) (equiprobability for the 4 bases). #' If the sum of the probabilities is > 1, the values will be nomalized #' to the range [0, 1]. #' @return \code{\link[Biostrings:XStringSet-class]{DNAStringSet}} object #' #' @examples #' #' # For reproducible examples, make a call to set.seed before #' # running each random function #' #' set.seed(10) #' s1 <- random_seq(slength = 10, swidth = 20) #' s1 #' #' set.seed(10) #' s2 <- random_seq(slength = 10, swidth = 20, #' prob = c(0.6, 0.1, 0.3, 0)) #' s2 #' #' @author Leandro Roser \email{learoser@@gmail.com} #' @export random_seq <- function(slength, swidth, nuc = c("DNA", "RNA"), prob = c(0.25, 0.25, 0.25, 0.25)) { nuc <- match.arg(nuc) if (length(prob) > 4 || !is.numeric(prob) || any(prob < 0)) { stop("prob must be a numeric vector of length 4") } if (any(swidth < 0) || length(slength) != 1) { stop("swidth must be > 1; slength must be of length 1") } if (length(swidth) > 1) { if (length(swidth) != slength) stop("length(swidth) and slength must be equal") } else { swidth <- rep(swidth, slength) } if (nuc == "DNA") { samplefrom <- c("A", "C", "G", "T") } else if (nuc == "RNA") { samplefrom <- c("A", "C", "G", "U") } prob <- prob/sum(prob) out <- list() for (i in seq_len(slength)) { samp <- sample(samplefrom, size = swidth[i], replace = TRUE, prob = prob) out[[i]] <- paste(samp, collapse = "") } DNAStringSet(do.call("c", out)) } #' Create random qualities for a given encoding #' @param qual quality range for the sequences. It must be a range included #' in the selected encoding: #' #' 'Sanger' = [0, 40] #' #' 'Illumina1.8' = [0, 41] #' #' 'Illumina1.5' = [0, 40] #' #' 'Illumina1.3' = [3, 40] #' #' 'Solexa' = [-5, 40] #' #' example: for a range from 20 to 30 in Sanger encoding, #' pass the argument = c(20, 30) #' @param slength number of sequences #' @param swidth width of the sequences #' @param encod sequence encoding #' @param prob a vector of range = range(qual), with probabilities to set #' the frequency of each quality value. Default is equiprobability. #' If the sum of the probabilities is > 1, the values will be nomalized #' to the range [0, 1]. #' @description Create a \code{\link[Biostrings:XStringSet-class]{BStringSet}} #' object #' with random qualities #' @return \code{\link[Biostrings:XStringSet-class]{BStringSet}} object #' @examples #' #' q <- random_qual(30, 20) #' q #' #' @author Leandro Roser \email{learoser@@gmail.com} #' @export random_qual <- function(slength, swidth, qual = NULL, encod = c("Sanger", "Illumina1.8", "Illumina1.5", "Illumina1.3", "Solexa"), prob = NULL) { encod <- match.arg(encod) if (any(swidth < 0) || length(slength) != 1) { stop("swidth must be > 1; slength must be of length 1") } if (length(swidth) > 1) { if (length(swidth) != slength) stop("length(swidth) and slength must be equal") } else { swidth <- rep(swidth, slength) } selection <- switch(encod, Sanger = 1, Illumina1.8 = 2, Illumina1.5 = 3, Illumina1.3 = 4, Solexa = 5, stop("Invalid encoding")) seqrange <- list(0:40, 0:41, 0:40, 3:40, -5:40) seqrange <- seqrange[[selection]] if (!is.null(qual)) { if (!all(qual %in% seqrange)) { stop(paste0("Quality values out of range. The values must be in the range [", min(seqrange), ",", max(seqrange), "]")) } } else { qual <- seqrange } my_scores <- seq(min(qual), max(qual)) + c(33, 33, 33, 64, 64)[selection] samplefrom <- vapply(my_scores, int2asc, character(1)) lscores <- length(my_scores) if (is.null(prob)) { prob <- rep(1, lscores)/lscores } else { if (length(prob) == lscores || !is.numeric(prob) || any(prob < 0)) { stop(paste0("prob must be a positive numeric vector of length swidth (", swidth, ")")) } } out <- list() for (i in seq_len(slength)) { out[[i]] <- paste(sample(samplefrom, size = swidth[i], replace = TRUE, prob = prob), collapse = "") } BStringSet(do.call("c", out)) } #' Create fastq/sequences/qualities with uniform width #' @param input input to edit #' @param type type of the input: 'fastq' (ShortReadQ), #' 'sequence' (DNAStringSet), 'quality' (BStringset) #' @return ShortReadQ object or character vector with sequences #' or qualities, with uniform widht (padded with Ns or \}) #' @keywords internal create_uniform_width <- function(input, type = c("fastq", "sequence", "quality")) { if (type == "fastq") { seqs <- sread(input) qual <- PhredQuality(quality(quality(input))) ids <- id(input) } check_width <- length(unique(width(input))) != 1 if (check_width) { input_width <- width(input) mean_width <- max(input_width) which_below <- which(input_width < mean_width) howmany <- mean_width - input_width[which_below] if (type == "fastq") { to_paste_s <- cpp_create_stringvec(howmany, "N") # cpp source seqs[which_below] <- paste0(seqs[which_below], to_paste_s) to_paste_q <- cpp_create_stringvec(howmany, "}") qual[which_below] <- paste0(qual[which_below], to_paste_q) out <- ShortReadQ(sread = seqs, quality = qual, id = ids) attr(out, "editedWidth") <- TRUE return(out) } else if (type == "sequence") { to_paste_s <- cpp_create_stringvec(howmany, "N") input[which_below] <- paste0(input[which_below], to_paste_s) attr(input, "editedWidth") <- TRUE return(input) } else if (type == "quality") { to_paste_q <- cpp_create_stringvec(howmany, "}") input[which_below] <- paste0(input[which_below], to_paste_q) attr(input, "editedWidth") <- TRUE return(input) } } else { attr(input, "editedWidth") <- FALSE return(input) } } #' Inject a letter in a set of sequences at random positions #' @param my_seq character vector with sequences to inject #' @param how_many_seqs How many sequences pick to inject Ns. #' An interval [min_s, max_s] with min_s minimum #' and max_s maximum sequences can be passed. In this case, #' a value is picked from the interval. #' If NULL, a random value within the interval [1, length(my_seq)] is picked. #' @param how_many_letters How many times inject the letter #' in the i sequences that are going to be injected. An interval [min_i max_i] #' can be passed. In this case, a value is randomly #' picked for each sequence i. This value represents the number of times #' that the letter will be injected in the sequence i. #' If NULL, a random value within the interval [1, width(my_seq[i])] #' is picked for each sequence i. #' @param letter Letter to inject. Default: 'N' #' @return character vector #' @examples #' #' # For reproducible examples, make a call to set.seed before #' # running each random function #' #' set.seed(10) #' s <- random_seq(slength = 10, swidth = 20) #' #' set.seed(10) #' s <- inject_letter_random(s, how_many_seqs = 1:30, how_many= 2:10) #' #' @author Leandro Roser \email{learoser@@gmail.com} #' @export inject_letter_random <- function(my_seq, how_many_seqs = NULL, how_many_letters = NULL, letter = "N") { if (!is.null(how_many_seqs)) { if (any(how_many_seqs < 0)) stop("how_many_seqs must be > 0") } if (!is.null(how_many_letters)) { if (any(how_many_letters < 0)) stop("how_many_letters must be > 0") } if (is.null(how_many_seqs)) { how_many_seqs <- sample(length(my_seq), 1) } else { how_many_seqs <- c(max(min(how_many_seqs), 1), min(max(how_many_seqs), length(my_seq))) how_many_seqs <- sample(how_many_seqs[1]:how_many_seqs[2], size = 1) } inject_to <- sample(seq_along(my_seq), size = how_many_seqs) for (i in inject_to) { this_seq <- strsplit(as.character(my_seq[i]), "")[[1]] this_seq_len <- length(this_seq) if (is.null(how_many_letters)) { this_how_many <- sample(seq_len(this_seq_len), size = 1) } else { this_how_many <- c(max(min(how_many_letters), 1), min(max(how_many_letters), this_seq_len)) this_how_many <- sample(this_how_many[1]:this_how_many[2], 1) } positions <- sample(this_seq_len, size = this_how_many) this_seq[positions] <- letter this_seq <- paste(this_seq, collapse = "") my_seq[i] <- this_seq } my_seq } #' Create sequences names #' @param n Number of reads #' @param base_name Base name for strings #' @param sep Character separing base names and the read number. Default: '_ #' @description Create \code{\link[Biostrings:XStringSet-class]{BStringSet}} #' object with names #' @return \code{\link[Biostrings:XStringSet-class]{BStringSet}} object #' @examples #' snames <- seq_names(10) #' snames #' snames2 <- seq_names(10, base_name = 's', sep = '.') #' snames2 #' @export seq_names <- function(n, base_name = "s", sep = "_") { BStringSet(paste0(base_name, sep, seq_len(n))) } #' Create a named object with random sequences and qualities #' @param n number of sequences #' @param widths width of the sequences #' @param random_widths width must be picked at random from the #' passed parameter 'widths', considering the value #' as an interval where any integer can be picked. Default TRUE. #' Otherwise, widths are picked only from the vector passed. #' @param replace sample widths with replacement? Default TRUE. #' @param nuc create sequences of DNA (nucleotides = c('A', 'C', 'G', 'T')) or #' RNA (nucleotides = c('A, 'C', 'G', 'U'))?. Default: 'DNA' #' @param len_prob vector with probabilities for each width value. #' Default NULL (equiprobability) #' @param seq_prob a vector of four probabilities values to set #' the frequency of the nucleotides 'A', 'C', 'G', 'T', for DNA, #' or 'A', 'C', 'G', 'U', for RNA. For example = c(0.25, 0.25, 0.5, 0). #' Default is = c(0.25, 0.25, 0.25, 0.25) (equiprobability for the 4 bases). #' If the sum of the probabilities is > 1, the values will be nomalized #' to the range [0, 1]. #' @param q_prob a vector of range = range(qual), with probabilities to set #' the frequency of each quality value. Default is equiprobability. #' If the sum of the probabilities is > 1, the values will be nomalized #' to the range [0, 1]. #' @param qual quality range for the sequences. It must be a range included #' in the selected encoding: #' #' 'Sanger' = [0, 40] #' #' 'Illumina1.8' = [0, 41] #' #' 'Illumina1.5' = [0, 40] #' #' 'Illumina1.3' = [3, 40] #' #' 'Solexa' = [-5, 40] #' #' example: for a range from 20 to 30 in Sanger encoding, #' pass the argument = c(20, 30) #' @param encod sequence encoding #' @param base_name Base name for strings #' @param sep Character separing base names and the read number. Default: '_' #' @return \code{\link[ShortRead:ShortReadQ-class]{ShortReadQ}} object #' @description Create a \code{\link[ShortRead:ShortReadQ-class]{ShortReadQ}} #' object with random sequences and qualities #' @examples #' #' # For reproducible examples, make a call to set.seed before #' # running each random function #' #' set.seed(10) #' s1 <- random_seq(slength = 10, swidth = 20) #' s1 #' #' set.seed(10) #' s2 <- random_seq(slength = 10, swidth = 20, #' prob = c(0.6, 0.1, 0.3, 0)) #' s2 #' #' @author Leandro Roser \email{learoser@@gmail.com} #' @export random_length <- function(n, widths, random_widths = TRUE, replace = TRUE, len_prob = NULL, seq_prob = c(0.25, 0.25, 0.25, 0.25), q_prob = NULL, nuc = c("DNA", "RNA"), qual = NULL, encod = c("Sanger", "Illumina1.8", "Illumina1.5", "Illumina1.3", "Solexa"), base_name = "s", sep = "_") { if (any(widths < 1)) { stop("lengths must be > 1") } if (random_widths) { widths <- sample(min(widths):max(widths), n, replace = replace) } else { widths <- sample(widths, size = n, replace = replace, prob = len_prob) } s_out <- random_seq(slength = n, swidth = widths, nuc = nuc, prob = seq_prob) q_out <- random_qual(slength = n, swidth = widths, qual = qual, encod = encod, prob = q_prob) n_out <- seq_names(n = n, base_name = base_name, sep = sep) ShortReadQ(s_out, q_out, n_out) }