Browse code

Added Poisson & NB capabilities when simulating sequences

Stevie Ped authored on 10/03/2025 13:19:19
Showing 1 changed files
... ...
@@ -12,6 +12,8 @@ simSeq(
12 12
   prob = rep(0.25, 4),
13 13
   shape1 = 1,
14 14
   shape2 = 1,
15
+  rate = NULL,
16
+  theta = NULL,
15 17
   as = "DNAStringSet",
16 18
   ...
17 19
 )
... ...
@@ -25,10 +27,19 @@ simSeq(
25 27
 
26 28
 \item{nt}{Nucleotides to include}
27 29
 
28
-\item{prob}{Sampling probablities for each nucleotide}
30
+\item{prob}{Sampling probabilities for each nucleotide}
29 31
 
30 32
 \item{shape1, shape2}{Passed to \link[VGAM]{rbetabinom.ab}}
31 33
 
34
+\item{rate}{The expected rate of motifs per sequence. Equivalent to
35
+\eqn{ \lambda } in \link[stats]{rpois}. If set to NULL, all sequences will
36
+be simulated with a single motif; otherwise a Poisson distribution will be used}
37
+
38
+\item{theta}{Overdispersion parameter passed to \link[MASS]{rnegbin}.
39
+If set to NULL, the rate parameter will be passed to \link[stats]{rpois}.
40
+However, if this value is set, the rate and theta parameters are passed to
41
+\link[MASS]{rnegbin} to simulate overdispersed counts}
42
+
32 43
 \item{as}{ObjectClass to return objects as. Defaults to DNAStringSet, but
33 44
 other viable options may include 'character', 'CharacterList' or any
34 45
 other class from which a character vector may be coerced.}
... ...
@@ -52,9 +63,18 @@ If a PWM/PFM is supplied, the shape parameters are first passed to
52 63
 \link[VGAM]{rbetabinom.ab} to determine the random positions the motif will
53 64
 be placed, with the default parameters representing a discrete uniform
54 65
 distribution.
55
-Once positions for the TFBM have been selected, nucleotides will be randomly
56
-sampled using the probabilities provided in the PWM and these motifs will be
57
-placed at the randomly sample positions
66
+
67
+The sequences to have a motif inserted will be selected, along with the
68
+number of motifs, using the rate and theta parameters.
69
+If both are NULL, every sequence will have a single motif inserted.
70
+If the rate is > 0 and theta is NULL, sequences will be selected to have
71
+motifs inserted using a Poisson distribution.
72
+If theta is also provided, sequences will be selected to contain motifs
73
+using a negative binomial distribution.
74
+
75
+Once positions and sequences for the TFBM have been selected, nucleotides
76
+will be randomly sampled using the probabilities provided in the PWM and
77
+these motifs will be placed at the randomly sampled positions.
58 78
 }
59 79
 \examples{
60 80
 ## Randomly generate 10x50nt sequences without any TFBMs present
Browse code

Added ComplexUpset to Suggests

Stevie Ped authored on 08/03/2025 03:35:00
Showing 1 changed files
... ...
@@ -66,7 +66,9 @@ sim_seq <- simSeq(10, width = 20, pfm = ex_pfm$ESR1)
66 66
 sim_seq
67 67
 ## The position of the motif within each sequence is included in the mcols
68 68
 mcols(sim_seq)
69
+
69 70
 ## Use this to extract the random motifs from the random sequences
71
+library(IRanges)
70 72
 i <- mcols(sim_seq)$pos + cumsum(width(sim_seq)) - width(sim_seq)
71 73
 Views(unlist(sim_seq), start = i, width = 10)
72 74
 
Browse code

Added simSeq

Stevie Ped authored on 12/01/2025 08:01:23
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,74 @@
1
+% Generated by roxygen2: do not edit by hand
2
+% Please edit documentation in R/simSeq.R
3
+\name{simSeq}
4
+\alias{simSeq}
5
+\title{Simulate sequences using optional TFBMs}
6
+\usage{
7
+simSeq(
8
+  n,
9
+  width,
10
+  pfm = NULL,
11
+  nt = c("A", "C", "G", "T"),
12
+  prob = rep(0.25, 4),
13
+  shape1 = 1,
14
+  shape2 = 1,
15
+  as = "DNAStringSet",
16
+  ...
17
+)
18
+}
19
+\arguments{
20
+\item{n}{The number of sequences to simulate}
21
+
22
+\item{width}{Width of sequences to simulate}
23
+
24
+\item{pfm}{Probability Weight/Frequency Matrix}
25
+
26
+\item{nt}{Nucleotides to include}
27
+
28
+\item{prob}{Sampling probablities for each nucleotide}
29
+
30
+\item{shape1, shape2}{Passed to \link[VGAM]{rbetabinom.ab}}
31
+
32
+\item{as}{ObjectClass to return objects as. Defaults to DNAStringSet, but
33
+other viable options may include 'character', 'CharacterList' or any
34
+other class from which a character vector may be coerced.}
35
+
36
+\item{...}{Not used}
37
+}
38
+\value{
39
+By default a DNAStringSet will be returned.
40
+If possible, the position of any randomly sampled motifs will be included
41
+in the mcols element of the returned object.
42
+}
43
+\description{
44
+Simulate a set of fixed-width sequences using optional TFBMs
45
+}
46
+\details{
47
+Using the nucleotides and probabilities provided, a set of sequences can be
48
+simulated. By default, this will effectively be a set of 'background'
49
+sequences, with letters effectively chosen at random.
50
+
51
+If a PWM/PFM is supplied, the shape parameters are first passed to
52
+\link[VGAM]{rbetabinom.ab} to determine the random positions the motif will
53
+be placed, with the default parameters representing a discrete uniform
54
+distribution.
55
+Once positions for the TFBM have been selected, nucleotides will be randomly
56
+sampled using the probabilities provided in the PWM and these motifs will be
57
+placed at the randomly sample positions
58
+}
59
+\examples{
60
+## Randomly generate 10x50nt sequences without any TFBMs present
61
+simSeq(10, 50)
62
+
63
+## Now place a motif at random positions
64
+data('ex_pfm')
65
+sim_seq <- simSeq(10, width = 20, pfm = ex_pfm$ESR1)
66
+sim_seq
67
+## The position of the motif within each sequence is included in the mcols
68
+mcols(sim_seq)
69
+## Use this to extract the random motifs from the random sequences
70
+i <- mcols(sim_seq)$pos + cumsum(width(sim_seq)) - width(sim_seq)
71
+Views(unlist(sim_seq), start = i, width = 10)
72
+
73
+
74
+}