... | ... |
@@ -12,6 +12,8 @@ simSeq( |
12 | 12 |
prob = rep(0.25, 4), |
13 | 13 |
shape1 = 1, |
14 | 14 |
shape2 = 1, |
15 |
+ rate = NULL, |
|
16 |
+ theta = NULL, |
|
15 | 17 |
as = "DNAStringSet", |
16 | 18 |
... |
17 | 19 |
) |
... | ... |
@@ -25,10 +27,19 @@ simSeq( |
25 | 27 |
|
26 | 28 |
\item{nt}{Nucleotides to include} |
27 | 29 |
|
28 |
-\item{prob}{Sampling probablities for each nucleotide} |
|
30 |
+\item{prob}{Sampling probabilities for each nucleotide} |
|
29 | 31 |
|
30 | 32 |
\item{shape1, shape2}{Passed to \link[VGAM]{rbetabinom.ab}} |
31 | 33 |
|
34 |
+\item{rate}{The expected rate of motifs per sequence. This is equivalent to |
|
35 |
+\eqn{ \lambda } in \link[stats]{rpois}. If set to NULL, all sequences will |
|
36 |
+be simulated with a single motif, otherwise a Poisson distribution will be used} |
|
37 |
+ |
|
38 |
+\item{theta}{Overdispersion parameter passed to \link[MASS]{rnegbin}. |
|
39 |
+If set to NULL the rate parameter will be passed to \link[stats]{rpois}. |
|
40 |
+However, if this value is set, the rate and theta parameters are passed to |
|
41 |
+\link[MASS]{rnegbin} to simulate overdispersed counts} |
|
42 |
+ |
|
32 | 43 |
\item{as}{ObjectClass to return objects as. Defaults to DNAStringSet, but |
33 | 44 |
other viable options may include 'character', 'CharacterList' or any |
34 | 45 |
other class from which a character vector may be coerced.} |
... | ... |
@@ -52,9 +63,18 @@ If a PWM/PFM is supplied, the shape parameters are first passed to |
52 | 63 |
\link[VGAM]{rbetabinom.ab} to determine the random positions the motif will |
53 | 64 |
be placed, with the default parameters representing a discrete uniform |
54 | 65 |
distribution. |
55 |
-Once positions for the TFBM have been selected, nucleotides will be randomly |
|
56 |
-sampled using the probabilities provided in the PWM and these motifs will be |
|
57 |
-placed at the randomly sample positions |
|
66 |
+ |
|
67 |
+The sequences to have a motif inserted will be selected, along with the |
|
68 |
+number of motifs, using the rate and theta parameters. |
|
69 |
+If both are NULL, every sequence will have a single motif inserted. |
|
70 |
+If the rate is > 0 and theta is NULL, sequences will be selected to have |
|
71 |
+motifs inserted using a Poisson distribution. |
|
72 |
+If theta is also provided, sequences will be selected to contain motifs |
|
73 |
+using a negative binomial distribution. |
|
74 |
+ |
|
75 |
+Once positions and sequences for the TFBM have been selected, nucleotides |
|
76 |
+will be randomly sampled using the probabilities provided in the PWM and |
|
77 |
+these motifs will be placed at the randomly sampled positions. |
|
58 | 78 |
} |
59 | 79 |
\examples{ |
60 | 80 |
## Randomly generate 10x50nt sequences without any TFBMs present |
... | ... |
@@ -66,7 +66,9 @@ sim_seq <- simSeq(10, width = 20, pfm = ex_pfm$ESR1) |
66 | 66 |
sim_seq |
67 | 67 |
## The position of the motif within each sequence is included in the mcols |
68 | 68 |
mcols(sim_seq) |
69 |
+ |
|
69 | 70 |
## Use this to extract the random motifs from the random sequences |
71 |
+library(IRanges) |
|
70 | 72 |
i <- mcols(sim_seq)$pos + cumsum(width(sim_seq)) - width(sim_seq) |
71 | 73 |
Views(unlist(sim_seq), start = i, width = 10) |
72 | 74 |
|
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,74 @@ |
1 |
+% Generated by roxygen2: do not edit by hand |
|
2 |
+% Please edit documentation in R/simSeq.R |
|
3 |
+\name{simSeq} |
|
4 |
+\alias{simSeq} |
|
5 |
+\title{Simulate sequences using optional TFBMs} |
|
6 |
+\usage{ |
|
7 |
+simSeq( |
|
8 |
+ n, |
|
9 |
+ width, |
|
10 |
+ pfm = NULL, |
|
11 |
+ nt = c("A", "C", "G", "T"), |
|
12 |
+ prob = rep(0.25, 4), |
|
13 |
+ shape1 = 1, |
|
14 |
+ shape2 = 1, |
|
15 |
+ as = "DNAStringSet", |
|
16 |
+ ... |
|
17 |
+) |
|
18 |
+} |
|
19 |
+\arguments{ |
|
20 |
+\item{n}{The number of sequences to simulate} |
|
21 |
+ |
|
22 |
+\item{width}{Width of sequences to simulate} |
|
23 |
+ |
|
24 |
+\item{pfm}{Probability Weight/Frequency Matrix} |
|
25 |
+ |
|
26 |
+\item{nt}{Nucleotides to include} |
|
27 |
+ |
|
28 |
+\item{prob}{Sampling probabilities for each nucleotide} |
|
29 |
+ |
|
30 |
+\item{shape1, shape2}{Passed to \link[VGAM]{rbetabinom.ab}} |
|
31 |
+ |
|
32 |
+\item{as}{ObjectClass to return objects as. Defaults to DNAStringSet, but |
|
33 |
+other viable options may include 'character', 'CharacterList' or any |
|
34 |
+other class from which a character vector may be coerced.} |
|
35 |
+ |
|
36 |
+\item{...}{Not used} |
|
37 |
+} |
|
38 |
+\value{ |
|
39 |
+By default a DNAStringSet will be returned. |
|
40 |
+If possible, the position of any randomly sampled motifs will be included |
|
41 |
+in the mcols element of the returned object. |
|
42 |
+} |
|
43 |
+\description{ |
|
44 |
+Simulate a set of fixed-width sequences using optional TFBMs |
|
45 |
+} |
|
46 |
+\details{ |
|
47 |
+Using the nucleotides and probabilities provided, a set of sequences can be |
|
48 |
+simulated. By default, this will effectively be a set of 'background' |
|
49 |
+sequences, with letters effectively chosen at random. |
|
50 |
+ |
|
51 |
+If a PWM/PFM is supplied, the shape parameters are first passed to |
|
52 |
+\link[VGAM]{rbetabinom.ab} to determine the random positions the motif will |
|
53 |
+be placed, with the default parameters representing a discrete uniform |
|
54 |
+distribution. |
|
55 |
+Once positions for the TFBM have been selected, nucleotides will be randomly |
|
56 |
+sampled using the probabilities provided in the PWM and these motifs will be |
|
57 |
+placed at the randomly sampled positions |
|
58 |
+} |
|
59 |
+\examples{ |
|
60 |
+## Randomly generate 10x50nt sequences without any TFBMs present |
|
61 |
+simSeq(10, 50) |
|
62 |
+ |
|
63 |
+## Now place a motif at random positions |
|
64 |
+data('ex_pfm') |
|
65 |
+sim_seq <- simSeq(10, width = 20, pfm = ex_pfm$ESR1) |
|
66 |
+sim_seq |
|
67 |
+## The position of the motif within each sequence is included in the mcols |
|
68 |
+mcols(sim_seq) |
|
69 |
+## Use this to extract the random motifs from the random sequences |
|
70 |
+i <- mcols(sim_seq)$pos + cumsum(width(sim_seq)) - width(sim_seq) |
|
71 |
+Views(unlist(sim_seq), start = i, width = 10) |
|
72 |
+ |
|
73 |
+ |
|
74 |
+} |