... | ... |
@@ -6,12 +6,27 @@ |
6 | 6 |
#' across 2 experimental conditions from a real scRNA-seq data set. |
7 | 7 |
#' |
8 | 8 |
#' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. |
9 |
-#' @param nc,ns,nk # of cells, samples and clusters to simulate. |
|
10 |
-#' By default, \code{ns = NULL} will simulated as many samples as |
|
11 |
-#' available in the reference to avoid duplicated reference samples. |
|
9 |
+#' @param ng number of genes to simulate. Importantly, for the library sizes |
|
10 |
+#' computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, |
|
11 |
+#' the number of simulated genes should match with the number of genes |
|
12 |
+#' in the reference. To simulate a reduced number of genes, e.g. for |
|
13 |
+#' testing and development purposes, please set \code{force = TRUE}. |
|
14 |
+#' @param nc number of cells to simulate. |
|
15 |
+#' @param nk number of clusters to simulate; defaults to the number |
|
16 |
+#' of available reference clusters (\code{nlevels(x$cluster_id)}). |
|
17 |
+#' @param ns number of samples to simulate; defaults to as many as |
|
18 |
+#' available in the reference to avoid duplicated reference samples. |
|
19 |
+#' Specifically, the number of samples will be set to |
|
20 |
+#' \code{n = nlevels(x$sample_id)} when \code{dd = FALSE}, |
|
21 |
+#' \code{n} per group when \code{dd, paired = TRUE}, and |
|
22 |
+#' \code{floor(n/2)} per group when \code{dd = TRUE, paired = FALSE}. |
|
23 |
+#' When a larger number samples should be simulated, set \code{force = TRUE}. |
|
12 | 24 |
#' @param probs a list of length 3 containing probabilities of a cell belonging |
13 | 25 |
#' to each cluster, sample, and group, respectively. List elements must be |
14 | 26 |
#' NULL (equal probabilities) or numeric values in [0, 1] that sum to 1. |
27 |
+#' @param dd whether or not to simulate differential distributions; if TRUE, |
|
28 |
+#' two groups are simulated and \code{ns} corresponds to the number of |
|
29 |
+#' samples per group, else one group with \code{ns} samples is simulated. |
|
15 | 30 |
#' @param p_dd numeric vector of length 6. |
16 | 31 |
#' Specifies the probability of a gene being |
17 | 32 |
#' EE, EP, DE, DP, DM, or DB, respectively. |
... | ... |
@@ -39,17 +54,12 @@ |
39 | 54 |
#' of the branches lengths separating them. |
40 | 55 |
#' @param phylo_pars vector of length 2 providing the parameters that control |
41 | 56 |
#' the number of type genes. Passed to an exponential PDF (see details). |
42 |
-#' |
|
43 |
-#' @param ng # of genes to simulate. Importantly, for the library sizes |
|
44 |
-#' computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, |
|
45 |
-#' the number of simulated genes should match with the number of genes |
|
46 |
-#' in the reference. To simulate a reduced number of genes, e.g. for |
|
47 |
-#' testing and development purposes, please set \code{force = TRUE}. |
|
48 |
-#' @param force logical specifying whether to force |
|
49 |
-#' simulation despite \code{ng != nrow(x)}. |
|
57 |
+#' @param force logical specifying whether to force simulation |
|
58 |
+#' when \code{ng} and/or \code{ns} don't match the number of |
|
59 |
+#' available reference genes and samples, respectively. |
|
50 | 60 |
#' |
51 | 61 |
#' @details The simulation of type genes can be performed in 2 ways; |
52 |
-#' (1) via \code{p_type} to simulate independant clusters, OR |
|
62 |
+#' (1) via \code{p_type} to simulate independent clusters, OR |
|
53 | 63 |
#' (2) via \code{phylo_tree} to simulate a hierarchical cluster structure. |
54 | 64 |
#' |
55 | 65 |
#' For (1), a subset of \code{p_type} \% of genes are selected per cluster |
... | ... |
@@ -89,11 +99,11 @@ |
89 | 99 |
#' \item{\code{args}}{a list of the function call's input arguments.}}}} |
90 | 100 |
#' |
91 | 101 |
#' @examples |
92 |
-#' data(sce) |
|
102 |
+#' data(example_sce) |
|
93 | 103 |
#' library(SingleCellExperiment) |
94 | 104 |
#' |
95 | 105 |
#' # prep. SCE for simulation |
96 |
-#' ref <- prepSim(sce) |
|
106 |
+#' ref <- prepSim(example_sce) |
|
97 | 107 |
#' |
98 | 108 |
#' # simulate data |
99 | 109 |
#' (sim <- simData(ref, nc = 200, |
... | ... |
@@ -179,11 +189,17 @@ simData <- function(x, |
179 | 189 |
if (is.null(x$cluster_id)) { |
180 | 190 |
x$cluster_id <- factor("foo") |
181 | 191 |
no_k <- TRUE |
182 |
- } else no_k <- FALSE |
|
192 |
+ } else { |
|
193 |
+ x$cluster_id <- droplevels(factor(x$cluster_id)) |
|
194 |
+ no_k <- nlevels(x$cluster_id) == 1 |
|
195 |
+ } |
|
183 | 196 |
if (is.null(x$sample_id)) { |
184 | 197 |
x$sample_id <- factor("foo") |
185 | 198 |
no_s <- TRUE |
186 |
- } else no_s <- FALSE |
|
199 |
+ } else { |
|
200 |
+ x$sample_id <- droplevels(factor(x$sample_id)) |
|
201 |
+ no_s <- nlevels(x$sample_id) == 1 |
|
202 |
+ } |
|
187 | 203 |
|
188 | 204 |
# store all input arguments to be returned in final output |
189 | 205 |
args <- c(as.list(environment())) |
... | ... |
@@ -192,12 +208,14 @@ simData <- function(x, |
192 | 208 |
.check_sce(x, req_group = FALSE) |
193 | 209 |
args_tmp <- .check_args_simData(as.list(environment())) |
194 | 210 |
nk <- args$nk <- args_tmp$nk |
195 |
- ns <- args$ns <- args_tmp$ns |
|
196 |
- |
|
211 |
+ |
|
197 | 212 |
# reference IDs |
198 | 213 |
nk0 <- length(kids0 <- set_names(levels(x$cluster_id))) |
199 | 214 |
ns0 <- length(sids0 <- set_names(levels(x$sample_id))) |
200 | 215 |
|
216 |
+ # get number of samples to simulate |
|
217 |
+ ns <- .get_ns(ns0, ns, dd, paired, force) |
|
218 |
+ |
|
201 | 219 |
# simulation IDs |
202 | 220 |
nk <- length(kids <- set_names(paste0("cluster", seq_len(nk)))) |
203 | 221 |
sids <- set_names(paste0("sample", seq_len(ns))) |
... | ... |
@@ -211,8 +229,13 @@ simData <- function(x, |
211 | 229 |
ref_sids <- cbind(ref_sids, ref_sids) |
212 | 230 |
} else { |
213 | 231 |
# draw reference samples at random for each group |
214 |
- sidsA <- sample(sids0, ns, ns > ns0) |
|
215 |
- sidsB <- sample(setdiff(sids0, sidsA), ns, ns > ns0) |
|
232 |
+ sidsA <- sample(sids0, ns, force && ns > ns0) |
|
233 |
+ if (force) { |
|
234 |
+ sidsB <- sample(sids0, ns, ns > ns0) |
|
235 |
+ } else { |
|
236 |
+ sidsB <- setdiff(sids0, sidsA) |
|
237 |
+ sidsB <- sample(sidsB, ns) |
|
238 |
+ } |
|
216 | 239 |
ref_sids <- cbind(sidsA, sidsB) |
217 | 240 |
} |
218 | 241 |
dimnames(ref_sids) <- list(sids, gids) |
... | ... |
@@ -299,6 +299,7 @@ simData <- function(x, |
299 | 299 |
bs$cluster_id <- cbind(0, bs$cluster_id) |
300 | 300 |
names(bs$cluster_id) <- kids0 |
301 | 301 |
} |
302 |
+ os <- x$offset |
|
302 | 303 |
b0 <- bs$beta0 |
303 | 304 |
ds <- rowData(x)$disp |
304 | 305 |
|
... | ... |
@@ -337,8 +338,8 @@ simData <- function(x, |
337 | 338 |
ds_kc <- ds[gs0] |
338 | 339 |
lfc_kc <- lfc[[c, k]] |
339 | 340 |
bs_ksc <- exp(rowSums(bs_ks[gs0, , drop = FALSE])) |
340 |
- ms_g1 <- outer(bs_ksc, exp(x$offset[cs_g1]), "*") |
|
341 |
- ms_g2 <- outer(bs_ksc, exp(x$offset[cs_g2]), "*") |
|
341 |
+ ms_g1 <- outer(bs_ksc, exp(os[cs_g1]), "*") |
|
342 |
+ ms_g2 <- outer(bs_ksc, exp(os[cs_g2]), "*") |
|
342 | 343 |
|
343 | 344 |
re <- .sim(c, cs_g1, cs_g2, ms_g1, ms_g2, ds_kc, lfc_kc, p_ep, p_dp, p_dm) |
344 | 345 |
y[gi, unlist(ci)] <- re$cs |
... | ... |
@@ -7,6 +7,8 @@ |
7 | 7 |
#' |
8 | 8 |
#' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. |
9 | 9 |
#' @param nc,ns,nk # of cells, samples and clusters to simulate. |
10 |
+#' By default, \code{ns = NULL} will simulated as many samples as |
|
11 |
+#' available in the reference to avoid duplicated reference samples. |
|
10 | 12 |
#' @param probs a list of length 3 containing probabilities of a cell belonging |
11 | 13 |
#' to each cluster, sample, and group, respectively. List elements must be |
12 | 14 |
#' NULL (equal probabilities) or numeric values in [0, 1] that sum to 1. |
... | ... |
@@ -157,12 +159,14 @@ |
157 | 159 |
#' @importFrom S4Vectors split unfactor |
158 | 160 |
#' @export |
159 | 161 |
|
160 |
-simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
|
161 |
- probs = NULL, p_dd = diag(6)[1, ], paired = FALSE, |
|
162 |
+simData <- function(x, |
|
163 |
+ ng = nrow(x), nc = ncol(x), |
|
164 |
+ ns = NULL, nk = NULL, probs = NULL, |
|
165 |
+ dd = TRUE, p_dd = diag(6)[1, ], paired = FALSE, |
|
162 | 166 |
p_ep = 0.5, p_dp = 0.3, p_dm = 0.5, |
163 | 167 |
p_type = 0, lfc = 2, rel_lfc = NULL, |
164 | 168 |
phylo_tree = NULL, phylo_pars = c(ifelse(is.null(phylo_tree), 0, 0.1), 3), |
165 |
- ng = nrow(x), force = FALSE) { |
|
169 |
+ force = FALSE) { |
|
166 | 170 |
|
167 | 171 |
# throughout this code... |
168 | 172 |
# k: cluster ID |
... | ... |
@@ -171,6 +175,16 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
171 | 175 |
# c: DD category |
172 | 176 |
# 0: reference |
173 | 177 |
|
178 |
+ # add mock cluster/sample ID if missing |
|
179 |
+ if (is.null(x$cluster_id)) { |
|
180 |
+ x$cluster_id <- factor("foo") |
|
181 |
+ no_k <- TRUE |
|
182 |
+ } else no_k <- FALSE |
|
183 |
+ if (is.null(x$sample_id)) { |
|
184 |
+ x$sample_id <- factor("foo") |
|
185 |
+ no_s <- TRUE |
|
186 |
+ } else no_s <- FALSE |
|
187 |
+ |
|
174 | 188 |
# store all input arguments to be returned in final output |
175 | 189 |
args <- c(as.list(environment())) |
176 | 190 |
|
... | ... |
@@ -178,6 +192,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
178 | 192 |
.check_sce(x, req_group = FALSE) |
179 | 193 |
args_tmp <- .check_args_simData(as.list(environment())) |
180 | 194 |
nk <- args$nk <- args_tmp$nk |
195 |
+ ns <- args$ns <- args_tmp$ns |
|
181 | 196 |
|
182 | 197 |
# reference IDs |
183 | 198 |
nk0 <- length(kids0 <- set_names(levels(x$cluster_id))) |
... | ... |
@@ -190,17 +205,18 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
190 | 205 |
|
191 | 206 |
# sample reference clusters & samples |
192 | 207 |
ref_kids <- setNames(sample(kids0, nk, nk > nk0), kids) |
193 |
- if (paired) { |
|
208 |
+ if (!dd || paired) { |
|
194 | 209 |
# use same set of reference samples for both groups |
195 | 210 |
ref_sids <- sample(sids0, ns, ns > ns0) |
196 |
- ref_sids <- replicate(length(gids), ref_sids) |
|
211 |
+ ref_sids <- cbind(ref_sids, ref_sids) |
|
197 | 212 |
} else { |
198 | 213 |
# draw reference samples at random for each group |
199 |
- ref_sids <- replicate(length(gids), |
|
200 |
- sample(sids0, ns, ns > ns0)) |
|
214 |
+ sidsA <- sample(sids0, ns, ns > ns0) |
|
215 |
+ sidsB <- sample(setdiff(sids0, sidsA), ns, ns > ns0) |
|
216 |
+ ref_sids <- cbind(sidsA, sidsB) |
|
201 | 217 |
} |
202 | 218 |
dimnames(ref_sids) <- list(sids, gids) |
203 |
- |
|
219 |
+ |
|
204 | 220 |
if (is.null(rel_lfc)) |
205 | 221 |
rel_lfc <- rep(1, nk) |
206 | 222 |
if (is.null(names(rel_lfc))) { |
... | ... |
@@ -274,14 +290,17 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
274 | 290 |
}), vector("list", length(cats))) |
275 | 291 |
|
276 | 292 |
# compute NB parameters |
277 |
- m <- lapply(sids0, function(s) { |
|
278 |
- b <- paste0("beta.", s) |
|
279 |
- b <- exp(rowData(x)[[b]]) |
|
280 |
- m <- outer(b, exp(x$offset), "*") |
|
281 |
- dimnames(m) <- dimnames(x); m |
|
282 |
- }) |
|
283 |
- d <- rowData(x)$dispersion |
|
284 |
- names(d) <- rownames(x) |
|
293 |
+ bs <- rowData(x)$beta |
|
294 |
+ if (!is.null(bs$sample_id)) { |
|
295 |
+ bs$sample_id <- cbind(0, bs$sample_id) |
|
296 |
+ names(bs$sample_id) <- sids0 |
|
297 |
+ } |
|
298 |
+ if (!is.null(bs$cluster_id)) { |
|
299 |
+ bs$cluster_id <- cbind(0, bs$cluster_id) |
|
300 |
+ names(bs$cluster_id) <- kids0 |
|
301 |
+ } |
|
302 |
+ b0 <- bs$beta0 |
|
303 |
+ ds <- rowData(x)$disp |
|
285 | 304 |
|
286 | 305 |
# initialize list of depth two to store |
287 | 306 |
# simulation means in each cluster & group |
... | ... |
@@ -292,13 +311,18 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
292 | 311 |
# run simulation ----------------------------------------------------------- |
293 | 312 |
for (k in kids) { |
294 | 313 |
for (s in sids) { |
314 |
+ # get output cell indices |
|
315 |
+ ci <- cs_idx[[k]][[s]] |
|
316 |
+ |
|
295 | 317 |
# get reference samples, clusters & cells |
296 | 318 |
s0 <- ref_sids[s, ] |
297 | 319 |
k0 <- ref_kids[k] |
298 | 320 |
cs0 <- cs_by_ks[[k0]][s0] |
299 | 321 |
|
300 |
- # get output cell indices |
|
301 |
- ci <- cs_idx[[k]][[s]] |
|
322 |
+ # get NB parameters |
|
323 |
+ bs_ks <- cbind(b0, |
|
324 |
+ bs$cluster_id[[k0]], |
|
325 |
+ bs$sample_id[[s0[1]]]) |
|
302 | 326 |
|
303 | 327 |
for (c in cats[n_dd[, k] != 0]) { |
304 | 328 |
# sample cells to simulate from |
... | ... |
@@ -310,12 +334,13 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
310 | 334 |
gi <- gs_idx[[c, k]] |
311 | 335 |
|
312 | 336 |
# get NB parameters |
313 |
- m_g1 <- m[[s0[[1]]]][gs0, cs_g1, drop = FALSE] |
|
314 |
- m_g2 <- m[[s0[[2]]]][gs0, cs_g2, drop = FALSE] |
|
315 |
- d_kc <- d[gs0] |
|
337 |
+ ds_kc <- ds[gs0] |
|
316 | 338 |
lfc_kc <- lfc[[c, k]] |
339 |
+ bs_ksc <- exp(rowSums(bs_ks[gs0, , drop = FALSE])) |
|
340 |
+ ms_g1 <- outer(bs_ksc, exp(x$offset[cs_g1]), "*") |
|
341 |
+ ms_g2 <- outer(bs_ksc, exp(x$offset[cs_g2]), "*") |
|
317 | 342 |
|
318 |
- re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc, p_ep, p_dp, p_dm) |
|
343 |
+ re <- .sim(c, cs_g1, cs_g2, ms_g1, ms_g2, ds_kc, lfc_kc, p_ep, p_dp, p_dm) |
|
319 | 344 |
y[gi, unlist(ci)] <- re$cs |
320 | 345 |
|
321 | 346 |
for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse( |
... | ... |
@@ -331,7 +356,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
331 | 356 |
category = rep.int(rep(cats, nk), c(n_dd)), |
332 | 357 |
logFC = unlist(lfc), |
333 | 358 |
sim_gene = unlist(gs_by_kc), |
334 |
- sim_disp = d[unlist(gs_by_kc)]) %>% |
|
359 |
+ sim_disp = ds[unlist(gs_by_kc)]) %>% |
|
335 | 360 |
mutate_at("gene", as.character) |
336 | 361 |
# add true simulation means |
337 | 362 |
sim_mean <- sim_mean %>% |
... | ... |
@@ -348,14 +373,22 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
348 | 373 |
# construct SCE ------------------------------------------------------------ |
349 | 374 |
# cell metadata including group, sample, cluster IDs |
350 | 375 |
cd$group_id <- droplevels(cd$group_id) |
351 |
- cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = ".")) |
|
376 |
+ cd$sample_id <- if (dd) { |
|
377 |
+ factor(paste(cd$sample_id, cd$group_id, sep = ".")) |
|
378 |
+ } else factor(cd$sample_id) |
|
352 | 379 |
m <- match(levels(cd$sample_id), cd$sample_id) |
353 | 380 |
gids <- cd$group_id[m] |
354 | 381 |
o <- order(gids) |
355 | 382 |
sids <- levels(cd$sample_id)[o] |
356 | 383 |
cd <- cd %>% |
357 |
- mutate_at("cluster_id", factor, levels = kids) %>% |
|
358 |
- mutate_at("sample_id", factor, levels = sids) |
|
384 |
+ mutate_at("sample_id", factor, levels = sids) %>% |
|
385 |
+ mutate_at("cluster_id", factor, levels = kids) |
|
386 |
+ if (!dd) { |
|
387 |
+ cd$group_id <- NULL |
|
388 |
+ ref_sids <- ref_sids[, 1] |
|
389 |
+ } |
|
390 |
+ if (no_s) cd$sample_id <- NULL |
|
391 |
+ if (no_k) cd$cluster_id <- NULL |
|
359 | 392 |
# gene metadata storing gene classes & specificities |
360 | 393 |
rd <- DataFrame(class = factor(class, |
361 | 394 |
levels = c("state", "shared", "type"))) |
... | ... |
@@ -13,7 +13,7 @@ |
13 | 13 |
#' @param p_dd numeric vector of length 6. |
14 | 14 |
#' Specifies the probability of a gene being |
15 | 15 |
#' EE, EP, DE, DP, DM, or DB, respectively. |
16 |
-#' @param paired logial specifying whether a paired design should |
|
16 |
+#' @param paired logical specifying whether a paired design should |
|
17 | 17 |
#' be simulated (both groups use the same set of reference samples) |
18 | 18 |
#' or not (reference samples are drawn at random). |
19 | 19 |
#' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells |
... | ... |
@@ -21,8 +21,8 @@ |
21 | 21 |
#' @param p_type numeric. Probability of EE/EP gene being a type-gene. |
22 | 22 |
#' If a gene is of class "type" in a given cluster, a unique mean |
23 | 23 |
#' will be used for that gene in the respective cluster. |
24 |
-#' @param lfc numeric value to use as mean logFC |
|
25 |
-#' for DE, DP, DM, and DB type of genes. |
|
24 |
+#' @param lfc numeric value to use as mean logFC |
|
25 |
+#' (logarithm base 2) for DE, DP, DM, and DB type of genes. |
|
26 | 26 |
#' @param rel_lfc numeric vector of relative logFCs for each cluster. |
27 | 27 |
#' Should be of length \code{nlevels(x$cluster_id)} with |
28 | 28 |
#' \code{levels(x$cluster_id)} as names. |
... | ... |
@@ -32,22 +32,11 @@ |
32 | 32 |
#' \href{http://evolution.genetics.washington.edu/phylip/newicktree.html}{here}. |
33 | 33 |
#' The distance between the nodes, except for the original branch, will be |
34 | 34 |
#' translated in the number of shared genes between the clusters belonging to |
35 |
-#' these nodes (this relation is controlled with \code{phylo_pars}). The distance |
|
36 |
-#' between two clusters is defined as the sum of the branches lengths |
|
37 |
-#' separating them. |
|
35 |
+#' these nodes (this relation is controlled with \code{phylo_pars}). |
|
36 |
+#' The distance between two clusters is defined as the sum |
|
37 |
+#' of the branches lengths separating them. |
|
38 | 38 |
#' @param phylo_pars vector of length 2 providing the parameters that control |
39 |
-#' the number of type genes. Passed to an exponential's PDF: |
|
40 |
-#' \code{N = #genes x gamma1 * e^(-gamma2 x dist)}, |
|
41 |
-#' |
|
42 |
-#' \itemize{ |
|
43 |
-#' \item \code{gamma1} is the parameter that controls the percentage of shared |
|
44 |
-#' genes between the nodes. By default 0.1 if a tree is given, meaning that |
|
45 |
-#' maximum 10% of the genes can be used as type genes (if gamma2 = 0). |
|
46 |
-#' However it's advised to tune it depending on the input \code{prep_sce}. |
|
47 |
-#' \code{gamma2} is the 'penalty' of increasing the distance between clusters |
|
48 |
-#' (\code{dist}, defined by \code{phylo_tree}), applied on the number of |
|
49 |
-#' shared genes. Default to 3. |
|
50 |
-#' } |
|
39 |
+#' the number of type genes. Passed to an exponential PDF (see details). |
|
51 | 40 |
#' |
52 | 41 |
#' @param ng # of genes to simulate. Importantly, for the library sizes |
53 | 42 |
#' computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, |
... | ... |
@@ -58,13 +47,45 @@ |
58 | 47 |
#' simulation despite \code{ng != nrow(x)}. |
59 | 48 |
#' |
60 | 49 |
#' @details The simulation of type genes can be performed in 2 ways; |
61 |
-#' (1) by defining \code{p_type} and thus simulating independant clusters, OR |
|
62 |
-#' (2) by defining both \code{phylo_tree} and \code{phylo_pars}, which will |
|
63 |
-#' simulate a hierarchical structure between the clusters. |
|
50 |
+#' (1) via \code{p_type} to simulate independant clusters, OR |
|
51 |
+#' (2) via \code{phylo_tree} to simulate a hierarchical cluster structure. |
|
52 |
+#' |
|
53 |
+#' For (1), a subset of \code{p_type} \% of genes are selected per cluster |
|
54 |
+#' to use a different references genes than the remainder of clusters, |
|
55 |
+#' giving rise to cluster-specific NB means for count sampling. |
|
56 |
+#' |
|
57 |
+#' For (2), the number of shared/type genes at each node |
|
58 |
+#' are given by \code{a*G*e^(-b*d)}, where \itemize{ |
|
59 |
+#' \item{\code{a} -- controls the percentage of shared genes between nodes. |
|
60 |
+#' By default, at most 10\% of the genes are reserved as type genes |
|
61 |
+#' (when \code{b} = 0). However, it is advised to tune this parameter |
|
62 |
+#' depending on the input \code{prep_sce}.} |
|
63 |
+#' \item{\code{b} -- determines how the number of shared genes |
|
64 |
+#' decreases with increasing distance d between clusters |
|
65 |
+#' (defined through \code{phylo_tree}).}} |
|
64 | 66 |
#' |
65 | 67 |
#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} |
66 |
-#' containing multiple clusters & samples across 2 groups. |
|
67 |
-#' |
|
68 |
+#' containing multiple clusters & samples across 2 groups |
|
69 |
+#' as well as the following metadata: \describe{ |
|
70 |
+#' \item{cell metadata (\code{colData(.)})}{a \code{DataFrame} containing, |
|
71 |
+#' containing, for each cell, it's cluster, sample, and group ID.} |
|
72 |
+#' \item{gene metadata (\code{rowData(.)})}{a \code{DataFrame} containing, |
|
73 |
+#' for each gene, it's \code{class} (one of "state", "type", "none") and |
|
74 |
+#' specificity (\code{specs}; NA for genes of type "state", otherwise |
|
75 |
+#' a character vector of clusters that share the given gene).} |
|
76 |
+#' \item{experiment metadata (\code{metadata(.)})}{ |
|
77 |
+#' \describe{ |
|
78 |
+#' \item{\code{experiment_info}}{a \code{data.frame} |
|
79 |
+#' summarizing the experimental design.} |
|
80 |
+#' \item{\code{n_cells}}{the number of cells for each sample.} |
|
81 |
+#' \item{\code{gene_info}}{a \code{data.frame} containing, for each gene |
|
82 |
+#' in each cluster, it's differential distribution \code{category}, |
|
83 |
+#' mean \code{logFC} (NA for genes for categories "ee" and "ep"), |
|
84 |
+#' gene used as reference (\code{sim_gene}), dispersion \code{sim_disp}, |
|
85 |
+#' and simulation means for each group \code{sim_mean.A/B}.} |
|
86 |
+#' \item{\code{ref_sids/kidskids}}{the sample/cluster IDs used as reference.} |
|
87 |
+#' \item{\code{args}}{a list of the function call's input arguments.}}}} |
|
88 |
+#' |
|
68 | 89 |
#' @examples |
69 | 90 |
#' data(sce) |
70 | 91 |
#' library(SingleCellExperiment) |
... | ... |
@@ -73,7 +94,7 @@ |
73 | 94 |
#' ref <- prepSim(sce) |
74 | 95 |
#' |
75 | 96 |
#' # simulate data |
76 |
-#' (sim <- simData(ref, nc = 10, |
|
97 |
+#' (sim <- simData(ref, nc = 200, |
|
77 | 98 |
#' p_dd = c(0.9, 0, 0.1, 0, 0, 0), |
78 | 99 |
#' ng = 100, force = TRUE, |
79 | 100 |
#' probs = list(NULL, NULL, c(1, 0)))) |
... | ... |
@@ -116,7 +137,7 @@ |
116 | 137 |
#' # view information about shared 'type' genes |
117 | 138 |
#' table(rowData(sim)$class) |
118 | 139 |
#' |
119 |
-#' @author Helena L Crowell |
|
140 |
+#' @author Helena L Crowell & Anthony Sonrel |
|
120 | 141 |
#' |
121 | 142 |
#' @references |
122 | 143 |
#' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, |
... | ... |
@@ -316,10 +337,10 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
316 | 337 |
sim_mean <- sim_mean %>% |
317 | 338 |
map(bind_cols) %>% |
318 | 339 |
bind_rows(.id = "cluster_id") %>% |
319 |
- mutate_at("cluster_id", factor) %>% |
|
320 | 340 |
mutate(gene = rep(gs, nk)) |
321 | 341 |
gi <- full_join(gi, sim_mean, by = c("gene", "cluster_id")) %>% |
322 |
- rename("sim_mean.A" = "A", "sim_mean.B" = "B") |
|
342 |
+ rename("sim_mean.A" = "A", "sim_mean.B" = "B") %>% |
|
343 |
+ mutate_at("cluster_id", factor) |
|
323 | 344 |
# reorder |
324 | 345 |
o <- order(as.numeric(gsub("[a-z]", "", gi$gene))) |
325 | 346 |
gi <- gi[o, ]; rownames(gi) <- NULL |
... | ... |
@@ -332,14 +353,16 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
332 | 353 |
gids <- cd$group_id[m] |
333 | 354 |
o <- order(gids) |
334 | 355 |
sids <- levels(cd$sample_id)[o] |
335 |
- ei <- data.frame(sample_id = sids, group_id = gids[o]) |
|
336 |
- cd <- cd %>% mutate_at("sample_id", factor, levels = sids) |
|
356 |
+ cd <- cd %>% |
|
357 |
+ mutate_at("cluster_id", factor, levels = kids) %>% |
|
358 |
+ mutate_at("sample_id", factor, levels = sids) |
|
337 | 359 |
# gene metadata storing gene classes & specificities |
338 | 360 |
rd <- DataFrame(class = factor(class, |
339 | 361 |
levels = c("state", "shared", "type"))) |
340 | 362 |
rd$specs <- as.list(specs) |
341 | 363 |
# simulation metadata including used reference samples/cluster, |
342 | 364 |
# list of input arguments, and simulated genes' metadata |
365 |
+ ei <- data.frame(sample_id = sids, group_id = gids[o]) |
|
343 | 366 |
md <- list( |
344 | 367 |
experiment_info = ei, |
345 | 368 |
n_cells = table(cd$sample_id), |
... | ... |
@@ -109,7 +109,8 @@ |
109 | 109 |
#' plot(read.dendrogram(text = phylo_tree)) |
110 | 110 |
#' |
111 | 111 |
#' # simulate clusters accordingly |
112 |
-#' sim <- simData(ref, phylo_tree = phylo_tree, |
|
112 |
+#' sim <- simData(ref, |
|
113 |
+#' phylo_tree = phylo_tree, |
|
113 | 114 |
#' phylo_pars = c(0.1, 3), |
114 | 115 |
#' ng = 500, force = TRUE) |
115 | 116 |
#' # view information about shared 'type' genes |
... | ... |
@@ -35,11 +35,9 @@ |
35 | 35 |
#' these nodes (this relation is controlled with \code{phylo_pars}). The distance |
36 | 36 |
#' between two clusters is defined as the sum of the branches lengths |
37 | 37 |
#' separating them. |
38 |
-#' @param phylo_pars vector of length 2, defining the parameters that control the |
|
39 |
-#' number of type genes. It is passed to an adaptation of the |
|
40 |
-#' exponential's PDF: |
|
41 |
-#' |
|
42 |
-#' \code{N = Ngenes x gamma1 * e^(-gamma2 x dist)} , |
|
38 |
+#' @param phylo_pars vector of length 2 providing the parameters that control |
|
39 |
+#' the number of type genes. Passed to an exponential's PDF: |
|
40 |
+#' \code{N = #genes x gamma1 * e^(-gamma2 x dist)}, |
|
43 | 41 |
#' |
44 | 42 |
#' \itemize{ |
45 | 43 |
#' \item \code{gamma1} is the parameter that controls the percentage of shared |
... | ... |
@@ -59,11 +57,10 @@ |
59 | 57 |
#' @param force logical specifying whether to force |
60 | 58 |
#' simulation despite \code{ng != nrow(x)}. |
61 | 59 |
#' |
62 |
-#' @details |
|
63 |
-#' The simulation of type genes can be performed in 2 ways; (1) by defining |
|
64 |
-#' \code{p_type} and thus simulating independant clusters, OR (2) by defining both |
|
65 |
-#' \code{phylo_tree} and \code{phylo_pars}, which will simulate a hierarchical structure |
|
66 |
-#' between the clusters. Only one of the options is allowed. |
|
60 |
+#' @details The simulation of type genes can be performed in 2 ways; |
|
61 |
+#' (1) by defining \code{p_type} and thus simulating independant clusters, OR |
|
62 |
+#' (2) by defining both \code{phylo_tree} and \code{phylo_pars}, which will |
|
63 |
+#' simulate a hierarchical structure between the clusters. |
|
67 | 64 |
#' |
68 | 65 |
#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} |
69 | 66 |
#' containing multiple clusters & samples across 2 groups. |
... | ... |
@@ -88,7 +85,7 @@ |
88 | 85 |
#' table(gi$category) |
89 | 86 |
#' |
90 | 87 |
#' # unbalanced sample sizes |
91 |
-#' sim <- simData(ref, nc = 100, |
|
88 |
+#' sim <- simData(ref, nc = 100, ns = 2, |
|
92 | 89 |
#' probs = list(NULL, c(0.25, 0.75), NULL), |
93 | 90 |
#' ng = 10, force = TRUE) |
94 | 91 |
#' table(sim$sample_id) |
... | ... |
@@ -142,7 +139,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
142 | 139 |
probs = NULL, p_dd = diag(6)[1, ], paired = FALSE, |
143 | 140 |
p_ep = 0.5, p_dp = 0.3, p_dm = 0.5, |
144 | 141 |
p_type = 0, lfc = 2, rel_lfc = NULL, |
145 |
- phylo_tree = NULL, phylo_pars = c(0, 3), |
|
142 |
+ phylo_tree = NULL, phylo_pars = c(ifelse(is.null(phylo_tree), 0, 0.1), 3), |
|
146 | 143 |
ng = nrow(x), force = FALSE) { |
147 | 144 |
|
148 | 145 |
# throughout this code... |
... | ... |
@@ -152,9 +149,6 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
152 | 149 |
# c: DD category |
153 | 150 |
# 0: reference |
154 | 151 |
|
155 |
- # default shared p if phylo tree given |
|
156 |
- if (missing(phylo_pars)) phylo_pars[1] <- ifelse(is.null(phylo_tree), 0, 0.1) |
|
157 |
- |
|
158 | 152 |
# store all input arguments to be returned in final output |
159 | 153 |
args <- c(as.list(environment())) |
160 | 154 |
|
... | ... |
@@ -35,23 +35,20 @@ |
35 | 35 |
#' these nodes (this relation is controlled with \code{phylo_pars}). The distance |
36 | 36 |
#' between two clusters is defined as the sum of the branches lengths |
37 | 37 |
#' separating them. |
38 |
-#' @param phylo_pars list of length 2, defining the parameters that control the |
|
39 |
-#' number of shared/ specific type-genes; \itemize{ |
|
40 |
-#' \item The first element of the list is a numeric vector of length 2. |
|
41 |
-#' It defines the number of shared genes as an adaptation of the |
|
38 |
+#' @param phylo_pars vector of length 2, defining the parameters that control the |
|
39 |
+#' number of type genes. It is passed to an adaptation of the |
|
42 | 40 |
#' exponential's PDF: |
43 | 41 |
#' |
44 | 42 |
#' \code{N = Ngenes x gamma1 * e^(-gamma2 x dist)} , |
45 | 43 |
#' |
46 |
-#' where \code{gamma1} is the parameter that controls the percentage of shared genes |
|
47 |
-#' between the nodes. By default 0.2, but it's advised to tune it depending |
|
48 |
-#' on the input \code{prep_sce}. |
|
49 |
-#' \code{gamma2} is the 'penalty' of increasing |
|
50 |
-#' the distance between clusters (\code{dist}, defined by \code{phylo_tree}), |
|
51 |
-#' applied on the number of shared genes. Default to -3. |
|
52 |
-#' \item The second element can be a single numeric or a list of length nk. |
|
53 |
-#' It is an equivalent of \code{p_type} for each leaf (i.e. cluster) that is applied |
|
54 |
-#' after the determination of 'shared type genes'. |
|
44 |
+#' \itemize{ |
|
45 |
+#' \item \code{gamma1} is the parameter that controls the percentage of shared |
|
46 |
+#' genes between the nodes. By default 0.1 if a tree is given, meaning that |
|
47 |
+#' maximum 10% of the genes can be used as type genes (if gamma2 = 0). |
|
48 |
+#' However it's advised to tune it depending on the input \code{prep_sce}. |
|
49 |
+#' \code{gamma2} is the 'penalty' of increasing the distance between clusters |
|
50 |
+#' (\code{dist}, defined by \code{phylo_tree}), applied on the number of |
|
51 |
+#' shared genes. Default to 3. |
|
55 | 52 |
#' } |
56 | 53 |
#' |
57 | 54 |
#' @param ng # of genes to simulate. Importantly, for the library sizes |
... | ... |
@@ -116,7 +113,7 @@ |
116 | 113 |
#' |
117 | 114 |
#' # simulate clusters accordingly |
118 | 115 |
#' sim <- simData(ref, phylo_tree = phylo_tree, |
119 |
-#' phylo_pars = list(c(0.1, 3), 0.1), |
|
116 |
+#' phylo_pars = c(0.1, 3), |
|
120 | 117 |
#' ng = 500, force = TRUE) |
121 | 118 |
#' # view information about shared 'type' genes |
122 | 119 |
#' table(rowData(sim)$class) |
... | ... |
@@ -145,7 +142,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
145 | 142 |
probs = NULL, p_dd = diag(6)[1, ], paired = FALSE, |
146 | 143 |
p_ep = 0.5, p_dp = 0.3, p_dm = 0.5, |
147 | 144 |
p_type = 0, lfc = 2, rel_lfc = NULL, |
148 |
- phylo_tree = NULL, phylo_pars = list(c(0, 3), 0), |
|
145 |
+ phylo_tree = NULL, phylo_pars = c(0, 3), |
|
149 | 146 |
ng = nrow(x), force = FALSE) { |
150 | 147 |
|
151 | 148 |
# throughout this code... |
... | ... |
@@ -155,6 +152,9 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
155 | 152 |
# c: DD category |
156 | 153 |
# 0: reference |
157 | 154 |
|
155 |
+ # default shared p if phylo tree given |
|
156 |
+ if (missing(phylo_pars)) phylo_pars[1] <- ifelse(is.null(phylo_tree), 0, 0.1) |
|
157 |
+ |
|
158 | 158 |
# store all input arguments to be returned in final output |
159 | 159 |
args <- c(as.list(environment())) |
160 | 160 |
|
... | ... |
@@ -160,40 +160,12 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
160 | 160 |
|
161 | 161 |
# check validity of input arguments |
162 | 162 |
.check_sce(x, req_group = FALSE) |
163 |
- .check_args_simData(as.list(environment())) |
|
164 |
- if (!force && ng != nrow(x)) |
|
165 |
- stop("Number of simulated genes should match with reference,\n", |
|
166 |
- "but 'ng != nrow(x)'; please specify 'force = TRUE' if\n", |
|
167 |
- "simulation should be forced regardlessly (see '?simData').") |
|
168 |
- if (!is.null(phylo_tree) && p_type != 0) |
|
169 |
- stop("Only one of 'p_type' and 'phylo_tree' can be provided.\n", |
|
170 |
- "Please see the 'Details' section of '?simData'.") |
|
171 |
- if (!length(phylo_pars[[2]]) %in% c(1, nk)) |
|
172 |
- stop("The second element of 'phylo_pars' should be correspond\n", |
|
173 |
- " to the number of clusters ('nk') or of length 1.") |
|
174 |
- if (!is.null(phylo_tree) && phylo_pars[[1]][1] == 0) |
|
175 |
- warning("'phylo_pars[[1]][1]' has been set to 0;\n", |
|
176 |
- "'phylo_tree' argument will be ignored.") |
|
177 |
- if (!is.null(phylo_tree) && all(phylo_pars[[2]] == 0)) |
|
178 |
- warning("'phylo_pars[[2]]' has been set to 0;\n", |
|
179 |
- "type genes for individual clusters won't be simulated.") |
|
163 |
+ args_tmp <- .check_args_simData(as.list(environment())) |
|
164 |
+ nk <- args$nk <- args_tmp$nk |
|
180 | 165 |
|
181 | 166 |
# reference IDs |
182 | 167 |
nk0 <- length(kids0 <- set_names(levels(x$cluster_id))) |
183 | 168 |
ns0 <- length(sids0 <- set_names(levels(x$sample_id))) |
184 |
- |
|
185 |
- # assure number of simulated clusters |
|
186 |
- # matches with specified phylogeny |
|
187 |
- if (!is.null(phylo_tree)) { |
|
188 |
- kids_phylo <- .get_clusters_from_phylo(phylo_tree) |
|
189 |
- nk_phylo <- length(kids_phylo) |
|
190 |
- ns_phylo <- as.numeric(gsub("[a-z]", "", kids_phylo)) |
|
191 |
- if (!all(sort(ns_phylo) == seq_len(nk_phylo))) |
|
192 |
- stop("Some clusters appear to be missing from 'phylo_tree';\n", |
|
193 |
- "please make sure all clusters up to ", |
|
194 |
- dQuote(kids_phylo[which.max(ns_phylo)]), " are present.") |
|
195 |
- if (nk_phylo != nk) args$nk <- nk <- nk_phylo |
|
196 |
- } |
|
197 | 169 |
|
198 | 170 |
# simulation IDs |
199 | 171 |
nk <- length(kids <- set_names(paste0("cluster", seq_len(nk)))) |
... | ... |
@@ -244,7 +216,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
244 | 216 |
gs_idx <- .sample_gene_inds(gs, n_dd) |
245 | 217 |
|
246 | 218 |
# for ea. cluster, sample set of genes to simulate from |
247 |
- gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs) |
|
219 |
+ gs_by_k <- setNames(sample(rownames(x), ng, ng > nrow(x)), gs) |
|
248 | 220 |
gs_by_k <- replicate(nk, gs_by_k) |
249 | 221 |
colnames(gs_by_k) <- kids |
250 | 222 |
|
... | ... |
@@ -18,7 +18,7 @@ |
18 | 18 |
#' or not (reference samples are drawn at random). |
19 | 19 |
#' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells |
20 | 20 |
#' to be shifted to a different expression state in one group (see details). |
21 |
-#' @param p_type numeric. Probaility of EE/EP gene being a type-gene. |
|
21 |
+#' @param p_type numeric. Probability of EE/EP gene being a type-gene. |
|
22 | 22 |
#' If a gene is of class "type" in a given cluster, a unique mean |
23 | 23 |
#' will be used for that gene in the respective cluster. |
24 | 24 |
#' @param lfc numeric value to use as mean logFC |
... | ... |
@@ -28,19 +28,32 @@ |
28 | 28 |
#' \code{levels(x$cluster_id)} as names. |
29 | 29 |
#' Defaults to factor of 1 for all clusters. |
30 | 30 |
#' @param phylo_tree newick tree text representing cluster relations |
31 |
-#' and their relative distance. If a tree is given, the distance between |
|
32 |
-#' the clusters will be translated in the number of shared genes |
|
33 |
-#' (this relation is controlled with \code{phylo_pars}). The distance |
|
31 |
+#' and their relative distance. An explanation of the syntax can be found |
|
32 |
+#' \href{http://evolution.genetics.washington.edu/phylip/newicktree.html}{here}. |
|
33 |
+#' The distance between the nodes, except for the original branch, will be |
|
34 |
+#' translated in the number of shared genes between the clusters belonging to |
|
35 |
+#' these nodes (this relation is controlled with \code{phylo_pars}). The distance |
|
34 | 36 |
#' between two clusters is defined as the sum of the branches lengths |
35 | 37 |
#' separating them. |
36 |
-#' @param phylo_pars numeric vector of length 2. Defines the number of shared |
|
37 |
-#' genes as an adaptation of the exponential's PDF: |
|
38 |
-#' N = Ngenes x gamma1 * e^(-gamma2 x dist) , |
|
39 |
-#' where gamma1 is the parameter that controls the percentage of shared genes. |
|
40 |
-#' By default, it corresponds to \code{p_type} but it's advised to tune it |
|
41 |
-#' depending on the input prep_sce. gamma2 is the 'penalty' of increasing |
|
42 |
-#' the distance between clusters, applied on the number of shared genes. |
|
43 |
-#' Default to -3. |
|
38 |
+#' @param phylo_pars list of length 2, defining the parameters that control the |
|
39 |
+#' number of shared/ specific type-genes; \itemize{ |
|
40 |
+#' \item The first element of the list is a numeric vector of length 2. |
|
41 |
+#' It defines the number of shared genes as an adaptation of the |
|
42 |
+#' exponential's PDF: |
|
43 |
+#' |
|
44 |
+#' \code{N = Ngenes x gamma1 * e^(-gamma2 x dist)} , |
|
45 |
+#' |
|
46 |
+#' where \code{gamma1} is the parameter that controls the percentage of shared genes |
|
47 |
+#' between the nodes. By default 0.2, but it's advised to tune it depending |
|
48 |
+#' on the input \code{prep_sce}. |
|
49 |
+#' \code{gamma2} is the 'penalty' of increasing |
|
50 |
+#' the distance between clusters (\code{dist}, defined by \code{phylo_tree}), |
|
51 |
+#' applied on the number of shared genes. Default to -3. |
|
52 |
+#' \item The second element can be a single numeric or a list of length nk. |
|
53 |
+#' It is an equivalent of \code{p_type} for each leaf (i.e. cluster) that is applied |
|
54 |
+#' after the determination of 'shared type genes'. |
|
55 |
+#' } |
|
56 |
+#' |
|
44 | 57 |
#' @param ng # of genes to simulate. Importantly, for the library sizes |
45 | 58 |
#' computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, |
46 | 59 |
#' the number of simulated genes should match with the number of genes |
... | ... |
@@ -49,6 +62,12 @@ |
49 | 62 |
#' @param force logical specifying whether to force |
50 | 63 |
#' simulation despite \code{ng != nrow(x)}. |
51 | 64 |
#' |
65 |
+#' @details |
|
66 |
+#' The simulation of type genes can be performed in 2 ways; (1) by defining |
|
67 |
+#' \code{p_type} and thus simulating independant clusters, OR (2) by defining both |
|
68 |
+#' \code{phylo_tree} and \code{phylo_pars}, which will simulate a hierarchical structure |
|
69 |
+#' between the clusters. Only one of the options is allowed. |
|
70 |
+#' |
|
52 | 71 |
#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} |
53 | 72 |
#' containing multiple clusters & samples across 2 groups. |
54 | 73 |
#' |
... | ... |
@@ -96,9 +115,11 @@ |
96 | 115 |
#' plot(read.dendrogram(text = phylo_tree)) |
97 | 116 |
#' |
98 | 117 |
#' # simulate clusters accordingly |
99 |
-#' sim <- simData(sce, phylo_tree = phylo_tree, ng = 500, force = TRUE) |
|
118 |
+#' sim <- simData(ref, phylo_tree = phylo_tree, |
|
119 |
+#' phylo_pars = list(c(0.1, 3), 0.1), |
|
120 |
+#' ng = 500, force = TRUE) |
|
100 | 121 |
#' # view information about shared 'type' genes |
101 |
-#' table(metadata(sim)$gene_info$shared_class) |
|
122 |
+#' table(rowData(sim)$class) |
|
102 | 123 |
#' |
103 | 124 |
#' @author Helena L Crowell |
104 | 125 |
#' |
... | ... |
@@ -124,7 +145,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
124 | 145 |
probs = NULL, p_dd = diag(6)[1, ], paired = FALSE, |
125 | 146 |
p_ep = 0.5, p_dp = 0.3, p_dm = 0.5, |
126 | 147 |
p_type = 0, lfc = 2, rel_lfc = NULL, |
127 |
- phylo_tree = NULL, phylo_pars = c(0.2, 3), |
|
148 |
+ phylo_tree = NULL, phylo_pars = list(c(0, 3), 0), |
|
128 | 149 |
ng = nrow(x), force = FALSE) { |
129 | 150 |
|
130 | 151 |
# throughout this code... |
... | ... |
@@ -144,6 +165,18 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
144 | 165 |
stop("Number of simulated genes should match with reference,\n", |
145 | 166 |
"but 'ng != nrow(x)'; please specify 'force = TRUE' if\n", |
146 | 167 |
"simulation should be forced regardlessly (see '?simData').") |
168 |
+ if (!is.null(phylo_tree) && p_type != 0) |
|
169 |
+ stop("Only one of 'p_type' and 'phylo_tree' can be provided.\n", |
|
170 |
+ "Please see the 'Details' section of '?simData'.") |
|
171 |
+ if (!length(phylo_pars[[2]]) %in% c(1, nk)) |
|
172 |
+ stop("The second element of 'phylo_pars' should be correspond\n", |
|
173 |
+ " to the number of clusters ('nk') or of length 1.") |
|
174 |
+ if (!is.null(phylo_tree) && phylo_pars[[1]][1] == 0) |
|
175 |
+ warning("'phylo_pars[[1]][1]' has been set to 0;\n", |
|
176 |
+ "'phylo_tree' argument will be ignored.") |
|
177 |
+ if (!is.null(phylo_tree) && all(phylo_pars[[2]] == 0)) |
|
178 |
+ warning("'phylo_pars[[2]]' has been set to 0;\n", |
|
179 |
+ "type genes for individual clusters won't be simulated.") |
|
147 | 180 |
|
148 | 181 |
# reference IDs |
149 | 182 |
nk0 <- length(kids0 <- set_names(levels(x$cluster_id))) |
... | ... |
@@ -221,28 +254,6 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
221 | 254 |
gs_by_k <- res$gs_by_k |
222 | 255 |
class <- res$class |
223 | 256 |
specs <- res$specs |
224 |
- if (p_type != 0) { |
|
225 |
- # update gene indices 'gs_idx' to avoid imputing |
|
226 |
- # exclusive type genes in shared type genes |
|
227 |
- gs_idx_tmp <- gs_idx |
|
228 |
- for (c in c("ee", "ep")) |
|
229 |
- for (k in kids) { |
|
230 |
- u <- gs_idx[[c, k]] |
|
231 |
- gs_idx_tmp[[c, k]] <- u[!u %in% res$used] |
|
232 |
- } |
|
233 |
- # inpute cluster-specific type genes |
|
234 |
- res <- .impute_type_genes(x, gs_by_k, gs_idx_tmp, p_type) |
|
235 |
- gs_by_k <- res$gs_by_k |
|
236 |
- # update genes classes & specificities |
|
237 |
- is_type <- res$class == "type" |
|
238 |
- # spot checks; any type-gene should not be of class 'shared' |
|
239 |
- # and cluster-specificities should be unassigned at this point |
|
240 |
- stopifnot( |
|
241 |
- all(class[is_type] == "state"), |
|
242 |
- all(is.na(unlist(specs[is_type])))) |
|
243 |
- class[is_type] <- "type" |
|
244 |
- specs[is_type] <- res$specs[is_type] |
|
245 |
- } |
|
246 | 257 |
# otherwise, simply impute type-genes w/o phylogeny |
247 | 258 |
} else if (p_type != 0) { |
248 | 259 |
res <- .impute_type_genes(x, gs_by_k, gs_idx, p_type) |
... | ... |
@@ -159,7 +159,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
159 | 159 |
stop("Some clusters appear to be missing from 'phylo_tree';\n", |
160 | 160 |
"please make sure all clusters up to ", |
161 | 161 |
dQuote(kids_phylo[which.max(ns_phylo)]), " are present.") |
162 |
- if (nk_phylo != nk) nk <- nk_phylo |
|
162 |
+ if (nk_phylo != nk) args$nk <- nk <- nk_phylo |
|
163 | 163 |
} |
164 | 164 |
|
165 | 165 |
# simulation IDs |
... | ... |
@@ -212,7 +212,8 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
212 | 212 |
|
213 | 213 |
# for ea. cluster, sample set of genes to simulate from |
214 | 214 |
gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs) |
215 |
- gs_by_k <- set_colnames(replicate(nk, gs_by_k), kids) |
|
215 |
+ gs_by_k <- replicate(nk, gs_by_k) |
|
216 |
+ colnames(gs_by_k) <- kids |
|
216 | 217 |
|
217 | 218 |
# when 'phylo_tree' is specified, induce hierarchical cluster structure |
218 | 219 |
if (!is.null(phylo_tree)) { |
... | ... |
@@ -343,7 +344,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
343 | 344 |
rename("sim_mean.A" = "A", "sim_mean.B" = "B") |
344 | 345 |
# reorder |
345 | 346 |
o <- order(as.numeric(gsub("[a-z]", "", gi$gene))) |
346 |
- gi <- set_rownames(gi[o, ], NULL) |
|
347 |
+ gi <- gi[o, ]; rownames(gi) <- NULL |
|
347 | 348 |
|
348 | 349 |
# construct SCE ------------------------------------------------------------ |
349 | 350 |
# cell metadata including group, sample, cluster IDs |
... | ... |
@@ -6,7 +6,7 @@ |
6 | 6 |
#' across 2 experimental conditions from a real scRNA-seq data set. |
7 | 7 |
#' |
8 | 8 |
#' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. |
9 |
-#' @param ng,nc,ns,nk # of genes, cells, samples and clusters to simulate. |
|
9 |
+#' @param nc,ns,nk # of cells, samples and clusters to simulate. |
|
10 | 10 |
#' @param probs a list of length 3 containing probabilities of a cell belonging |
11 | 11 |
#' to each cluster, sample, and group, respectively. List elements must be |
12 | 12 |
#' NULL (equal probabilities) or numeric values in [0, 1] that sum to 1. |
... | ... |
@@ -27,13 +27,13 @@ |
27 | 27 |
#' Should be of length \code{nlevels(x$cluster_id)} with |
28 | 28 |
#' \code{levels(x$cluster_id)} as names. |
29 | 29 |
#' Defaults to factor of 1 for all clusters. |
30 |
-#' @param cells_phylo newick tree text representing cluster relations |
|
30 |
+#' @param phylo_tree newick tree text representing cluster relations |
|
31 | 31 |
#' and their relative distance. If a tree is given, the distance between |
32 | 32 |
#' the clusters will be translated in the number of shared genes |
33 |
-#' (this relation is controlled with \code{params_dist}). The distance |
|
33 |
+#' (this relation is controlled with \code{phylo_pars}). The distance |
|
34 | 34 |
#' between two clusters is defined as the sum of the branches lengths |
35 | 35 |
#' separating them. |
36 |
-#' @param params_dist numeric vector of length 2. Defines the number of shared |
|
36 |
+#' @param phylo_pars numeric vector of length 2. Defines the number of shared |
|
37 | 37 |
#' genes as an adaptation of the exponential's PDF: |
38 | 38 |
#' N = Ngenes x gamma1 * e^(-gamma2 x dist) , |
39 | 39 |
#' where gamma1 is the parameter that controls the percentage of shared genes. |
... | ... |
@@ -41,6 +41,13 @@ |
41 | 41 |
#' depending on the input prep_sce. gamma2 is the 'penalty' of increasing |
42 | 42 |
#' the distance between clusters, applied on the number of shared genes. |
43 | 43 |
#' Default to -3. |
44 |
+#' @param ng # of genes to simulate. Importantly, for the library sizes |
|
45 |
+#' computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, |
|
46 |
+#' the number of simulated genes should match with the number of genes |
|
47 |
+#' in the reference. To simulate a reduced number of genes, e.g. for |
|
48 |
+#' testing and development purposes, please set \code{force = TRUE}. |
|
49 |
+#' @param force logical specifying whether to force |
|
50 |
+#' simulation despite \code{ng != nrow(x)}. |
|
44 | 51 |
#' |
45 | 52 |
#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} |
46 | 53 |
#' containing multiple clusters & samples across 2 groups. |
... | ... |
@@ -50,11 +57,13 @@ |
50 | 57 |
#' library(SingleCellExperiment) |
51 | 58 |
#' |
52 | 59 |
#' # prep. SCE for simulation |
53 |
-#' sce <- prepSim(sce) |
|
60 |
+#' ref <- prepSim(sce) |
|
54 | 61 |
#' |
55 | 62 |
#' # simulate data |
56 |
-#' (sim <- simData(sce, ng = 100, nc = 10, |
|
57 |
-#' p_dd = c(0.9, 0, 0.1, 0, 0, 0))) |
|
63 |
+#' (sim <- simData(ref, nc = 10, |
|
64 |
+#' p_dd = c(0.9, 0, 0.1, 0, 0, 0), |
|
65 |
+#' ng = 100, force = TRUE, |
|
66 |
+#' probs = list(NULL, NULL, c(1, 0)))) |
|
58 | 67 |
#' |
59 | 68 |
#' # simulation metadata |
60 | 69 |
#' head(gi <- metadata(sim)$gene_info) |
... | ... |
@@ -63,30 +72,32 @@ |
63 | 72 |
#' table(gi$category) |
64 | 73 |
#' |
65 | 74 |
#' # unbalanced sample sizes |
66 |
-#' sim <- simData(sce, ng = 10, nc = 100, |
|
67 |
-#' probs = list(NULL, c(0.25, 0.75), NULL)) |
|
75 |
+#' sim <- simData(ref, nc = 100, |
|
76 |
+#' probs = list(NULL, c(0.25, 0.75), NULL), |
|
77 |
+#' ng = 10, force = TRUE) |
|
68 | 78 |
#' table(sim$sample_id) |
69 | 79 |
#' |
70 | 80 |
#' # one group only |
71 |
-#' sim <- simData(sce, ng = 10, nc = 100, |
|
72 |
-#' probs = list(NULL, NULL, c(1, 0))) |
|
81 |
+#' sim <- simData(ref, nc = 100, |
|
82 |
+#' probs = list(NULL, NULL, c(1, 0)), |
|
83 |
+#' ng = 10, force = TRUE) |
|
73 | 84 |
#' levels(sim$group_id) |
74 | 85 |
#' |
75 |
-#' |
|
76 |
-#' # Hierarchical cell-type relations: |
|
77 |
-#' # we first define a phylogram representing the relations between 3 clusters |
|
78 |
-#' cells_phylo <- "(('cluster1':0.1, 'cluster2':0.1):0.4,'cluster3':0.5);" |
|
79 |
-#' # to verify the syntax and the relation, we can plot it |
|
86 |
+#' # HIERARCHICAL CLUSTER STRUCTURE |
|
87 |
+#' # define phylogram specifying cluster relations |
|
88 |
+#' phylo_tree <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster3':0.5);" |
|
89 |
+#' # verify syntax & visualize relations |
|
80 | 90 |
#' library(phylogram) |
81 |
-#' dend <- read.dendrogram(text = cells_phylo) |
|
82 |
-#' plot(dend) |
|
83 |
-#' # More complex relations are also possible |
|
84 |
-#' cells_phylo2 <- "(('cluster1':0.4, 'cluster2':0.4):0.4, ('cluster3':0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);" |
|
85 |
-#' dend2 <- read.dendrogram(text = cells_phylo2) |
|
86 |
-#' plot(dend2) |
|
87 |
-#' # simulate clusters based on these distances |
|
88 |
-#' sim <- simData(sce_, nk = 3, cells_phylo = cells_phylo) |
|
89 |
-#' # the information about the shared 'type' genes are kept in the metadata |
|
91 |
+#' plot(read.dendrogram(text = phylo_tree)) |
|
92 |
+#' |
|
93 |
+#' # let's use a more complex phylogeny |
|
94 |
+#' phylo_tree <- "(('cluster1':0.4,'cluster2':0.4):0.4,('cluster3': |
|
95 |
+#' 0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);" |
|
96 |
+#' plot(read.dendrogram(text = phylo_tree)) |
|
97 |
+#' |
|
98 |
+#' # simulate clusters accordingly |
|
99 |
+#' sim <- simData(sce, phylo_tree = phylo_tree, ng = 500, force = TRUE) |
|
100 |
+#' # view information about shared 'type' genes |
|
90 | 101 |
#' table(metadata(sim)$gene_info$shared_class) |
91 | 102 |
#' |
92 | 103 |
#' @author Helena L Crowell |
... | ... |
@@ -102,19 +113,19 @@ |
102 | 113 |
#' @importFrom data.table data.table |
103 | 114 |
#' @importFrom dplyr mutate_all mutate_at |
104 | 115 |
#' @importFrom edgeR DGEList estimateDisp glmFit |
105 |
-#' @importFrom magrittr set_colnames set_rownames |
|
106 | 116 |
#' @importFrom purrr modify_depth set_names |
107 | 117 |
#' @importFrom stats model.matrix rgamma setNames |
108 | 118 |
#' @importFrom SingleCellExperiment SingleCellExperiment |
109 | 119 |
#' @importFrom SummarizedExperiment colData |
110 |
-#' @importFrom S4Vectors split |
|
120 |
+#' @importFrom S4Vectors split unfactor |
|
111 | 121 |
#' @export |
112 | 122 |
|
113 |
-simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
|
123 |
+simData <- function(x, nc = 2e3, ns = 3, nk = 3, |
|
114 | 124 |
probs = NULL, p_dd = diag(6)[1, ], paired = FALSE, |
115 | 125 |
p_ep = 0.5, p_dp = 0.3, p_dm = 0.5, |
116 | 126 |
p_type = 0, lfc = 2, rel_lfc = NULL, |
117 |
- cells_phylo = NULL, params_dist = c(0.2, 3) ) { |
|
127 |
+ phylo_tree = NULL, phylo_pars = c(0.2, 3), |
|
128 |
+ ng = nrow(x), force = FALSE) { |
|
118 | 129 |
|
119 | 130 |
# throughout this code... |
120 | 131 |
# k: cluster ID |
... | ... |
@@ -123,13 +134,33 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
123 | 134 |
# c: DD category |
124 | 135 |
# 0: reference |
125 | 136 |
|
137 |
+ # store all input arguments to be returned in final output |
|
138 |
+ args <- c(as.list(environment())) |
|
139 |
+ |
|
126 | 140 |
# check validity of input arguments |
127 | 141 |
.check_sce(x, req_group = FALSE) |
128 | 142 |
.check_args_simData(as.list(environment())) |
143 |
+ if (!force && ng != nrow(x)) |
|
144 |
+ stop("Number of simulated genes should match with reference,\n", |
|
145 |
+ "but 'ng != nrow(x)'; please specify 'force = TRUE' if\n", |
|
146 |
+ "simulation should be forced regardlessly (see '?simData').") |
|
129 | 147 |
|
130 | 148 |
# reference IDs |
131 | 149 |
nk0 <- length(kids0 <- set_names(levels(x$cluster_id))) |
132 | 150 |
ns0 <- length(sids0 <- set_names(levels(x$sample_id))) |
151 |
+ |
|
152 |
+ # assure number of simulated clusters |
|
153 |
+ # matches with specified phylogeny |
|
154 |
+ if (!is.null(phylo_tree)) { |
|
155 |
+ kids_phylo <- .get_clusters_from_phylo(phylo_tree) |
|
156 |
+ nk_phylo <- length(kids_phylo) |
|
157 |
+ ns_phylo <- as.numeric(gsub("[a-z]", "", kids_phylo)) |
|
158 |
+ if (!all(sort(ns_phylo) == seq_len(nk_phylo))) |
|
159 |
+ stop("Some clusters appear to be missing from 'phylo_tree';\n", |
|
160 |
+ "please make sure all clusters up to ", |
|
161 |
+ dQuote(kids_phylo[which.max(ns_phylo)]), " are present.") |
|
162 |
+ if (nk_phylo != nk) nk <- nk_phylo |
|
163 |
+ } |
|
133 | 164 |
|
134 | 165 |
# simulation IDs |
135 | 166 |
nk <- length(kids <- set_names(paste0("cluster", seq_len(nk)))) |
... | ... |
@@ -183,56 +214,55 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
183 | 214 |
gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs) |
184 | 215 |
gs_by_k <- set_colnames(replicate(nk, gs_by_k), kids) |
185 | 216 |
|
186 |
- if (length(cells_phylo) > 0) { |
|
187 |
- out <- .impute_shared_type_genes(x, gs_by_k, gs_idx, cells_phylo, |
|
188 |
- params_dist) |
|
189 |
- gs_by_k <- out[[1]] |
|
190 |
- used_tg <- out[[2]] |
|
191 |
- shared <- out[[3]] |
|
192 |
- |
|
217 |
+ # when 'phylo_tree' is specified, induce hierarchical cluster structure |
|
218 |
+ if (!is.null(phylo_tree)) { |
|
219 |
+ res <- .impute_shared_type_genes(x, gs_by_k, gs_idx, phylo_tree, phylo_pars) |
|
220 |
+ gs_by_k <- res$gs_by_k |
|
221 |
+ class <- res$class |
|
222 |
+ specs <- res$specs |
|
193 | 223 |
if (p_type != 0) { |
194 |
- |
|
195 |
- ## update gs_idx to avoid having type genes in the shared type genes |
|
196 |
- gs_idx_red <- gs_idx |
|
197 |
- ee_ep_id <- seq(1, 6 * length(kids) - 5, 6) |
|
198 |
- ee_ep_id <- c(ee_ep_id, ee_ep_id + 1) |
|
199 |
- for (i in ee_ep_id) gs_idx_red[[i]] <- (gs_idx[[i]])[-which(gs_idx[[i]] %in% used_tg)] |
|
200 |
- type_info <- gs_by_k |
|
201 |
- gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx_red, p_type) |
|
202 |
- |
|
203 |
- ## keep gene type info |
|
204 |
- type_info[type_info == gs_by_k] <- NA |
|
205 |
- type_info[!is.na(type_info)] <- "type" |
|
206 |
- type_info[is.na(type_info)] <- "state" |
|
207 |
- |
|
208 |
- } else { |
|
209 |
- type_info <- matrix("state", ng, nk) |
|
224 |
+ # update gene indices 'gs_idx' to avoid imputing |
|
225 |
+ # exclusive type genes in shared type genes |
|
226 |
+ gs_idx_tmp <- gs_idx |
|
227 |
+ for (c in c("ee", "ep")) |
|
228 |
+ for (k in kids) { |
|
229 |
+ u <- gs_idx[[c, k]] |
|
230 |
+ gs_idx_tmp[[c, k]] <- u[!u %in% res$used] |
|
231 |
+ } |
|
232 |
+ # inpute cluster-specific type genes |
|
233 |
+ res <- .impute_type_genes(x, gs_by_k, gs_idx_tmp, p_type) |
|
234 |
+ gs_by_k <- res$gs_by_k |
|
235 |
+ # update genes classes & specificities |
|
236 |
+ is_type <- res$class == "type" |
|
237 |
+ # spot checks; any type-gene should not be of class 'shared' |
|
238 |
+ # and cluster-specificities should be unassigned at this point |
|
239 |
+ stopifnot( |
|
240 |
+ all(class[is_type] == "state"), |
|
241 |
+ all(is.na(unlist(specs[is_type])))) |
|
242 |
+ class[is_type] <- "type" |
|
243 |
+ specs[is_type] <- res$specs[is_type] |
|
210 | 244 |
} |
245 |
+ # otherwise, simply impute type-genes w/o phylogeny |
|
246 |
+ } else if (p_type != 0) { |
|
247 |
+ res <- .impute_type_genes(x, gs_by_k, gs_idx, p_type) |
|
248 |
+ stopifnot(!any(res$class == "shared")) |
|
249 |
+ gs_by_k <- res$gs_by_k |
|
250 |
+ class <- res$class |
|
251 |
+ specs <- res$specs |
|
211 | 252 |
} else { |
212 |
- shared <- matrix("state", ng, nk) |
|
253 |
+ class <- rep("state", ng) |
|
254 |
+ specs <- rep(NA, ng) |
|
255 |
+ names(class) <- names(specs) <- gs |
|
213 | 256 |
} |
214 |
- |
|
215 |
- # impute type-genes |
|
216 |
- if (p_type != 0 & length(cells_phylo) == 0) { |
|
217 |
- type_info <- gs_by_k |
|
218 |
- gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type) |
|
219 |
- |
|
220 |
- ## keep gene type info |
|
221 |
- type_info[type_info == gs_by_k] <- NA |
|
222 |
- type_info[!is.na(type_info)] <- "type" |
|
223 |
- type_info[is.na(type_info)] <- "state" |
|
224 |
- } |
|
225 | 257 |
|
226 | 258 |
# split by cluster & categroy |
227 | 259 |
gs_by_k <- split(gs_by_k, col(gs_by_k)) |
228 | 260 |
gs_by_k <- setNames(map(gs_by_k, set_names, gs), kids) |
229 |
- |
|
230 | 261 |
gs_by_kc <- lapply(kids, function(k) |
231 | 262 |
lapply(unfactor(cats), function(c) |
232 | 263 |
gs_by_k[[k]][gs_idx[[c, k]]])) |
233 | 264 |
|
234 | 265 |
# sample logFCs |
235 |
- lfc0 <- lfc |
|
236 | 266 |
lfc <- vapply(kids, function(k) |
237 | 267 |
lapply(unfactor(cats), function(c) { |
238 | 268 |
n <- n_dd[c, k] |
... | ... |
@@ -253,10 +283,13 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
253 | 283 |
d <- rowData(x)$dispersion |
254 | 284 |
names(d) <- rownames(x) |
255 | 285 |
|
286 |
+ # initialize list of depth two to store |
|
287 |
+ # simulation means in each cluster & group |
|
256 | 288 |
sim_mean <- lapply(kids, function(k) |
257 | 289 |
lapply(gids, function(g) |
258 | 290 |
setNames(numeric(ng), gs))) |
259 | 291 |
|
292 |
+ # run simulation ----------------------------------------------------------- |
|
260 | 293 |
for (k in kids) { |
261 | 294 |
for (s in sids) { |
262 | 295 |
# get reference samples, clusters & cells |
... | ... |
@@ -265,7 +298,7 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
265 | 298 |
cs0 <- cs_by_ks[[k0]][s0] |
266 | 299 |
|
267 | 300 |
# get output cell indices |
268 |
- ci <- unlist(cs_idx[[k]][[s]]) |
|
301 |
+ ci <- cs_idx[[k]][[s]] |
|
269 | 302 |
|
270 | 303 |
for (c in cats[n_dd[, k] != 0]) { |
271 | 304 |
# sample cells to simulate from |
... | ... |
@@ -283,19 +316,13 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
283 | 316 |
lfc_kc <- lfc[[c, k]] |
284 | 317 |
|
285 | 318 |
re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc, p_ep, p_dp, p_dm) |
286 |
- y[gi, ci] <- re$cs |
|
319 |
+ y[gi, unlist(ci)] <- re$cs |
|
287 | 320 |
|
288 | 321 |
for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse( |
289 | 322 |
is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]] |
290 | 323 |
} |
291 | 324 |
} |
292 | 325 |
} |
293 |
- sim_mean <- sim_mean %>% |
|
294 |
- map(bind_cols) %>% |
|
295 |
- bind_rows(.id = "cluster_id") %>% |
|
296 |
- mutate_at("cluster_id", factor) %>% |
|
297 |
- mutate(gene = rep(gs, nk)) |
|
298 |
- |
|
299 | 326 |
# construct gene metadata table storing ------------------------------------ |
300 | 327 |
# gene | cluster_id | category | logFC, gene, disp, mean used for sim. |
301 | 328 |
gi <- data.frame( |
... | ... |
@@ -307,30 +334,19 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
307 | 334 |
sim_disp = d[unlist(gs_by_kc)]) %>% |
308 | 335 |
mutate_at("gene", as.character) |
309 | 336 |
# add true simulation means |
337 |
+ sim_mean <- sim_mean %>% |
|
338 |
+ map(bind_cols) %>% |
|
339 |
+ bind_rows(.id = "cluster_id") %>% |
|
340 |
+ mutate_at("cluster_id", factor) %>% |
|
341 |
+ mutate(gene = rep(gs, nk)) |
|
310 | 342 |
gi <- full_join(gi, sim_mean, by = c("gene", "cluster_id")) %>% |
311 | 343 |
rename("sim_mean.A" = "A", "sim_mean.B" = "B") |
312 | 344 |
# reorder |
313 | 345 |
o <- order(as.numeric(gsub("[a-z]", "", gi$gene))) |
314 | 346 |
gi <- set_rownames(gi[o, ], NULL) |
315 |
- # add gene infos |
|
316 |
- gi$class <- c(t(type_info)) |
|
317 |
- gi$shared_class <- c(t(shared)) |
|
318 |
- # parameters |
|
319 |
- params <- list( |
|
320 |
- ng = ng, |
|
321 |
- nc = nc, |
|
322 |
- ns = ns, |
|
323 |
- nk = nk, |
|
324 |
- probs = probs, |
|
325 |
- p_dd = p_dd, |
|
326 |
- p_type = p_type, |
|
327 |
- lfc = lfc0, |
|
328 |
- rel_lfc = rel_lfc, |
|
329 |
- cells_phylo = cells_phylo, |
|
330 |
- params_dist = params_dist |
|
331 |
- ) |
|
332 | 347 |
|
333 |
- # construct SCE |
|
348 |
+ # construct SCE ------------------------------------------------------------ |
|
349 |
+ # cell metadata including group, sample, cluster IDs |
|
334 | 350 |
cd$group_id <- droplevels(cd$group_id) |
335 | 351 |
cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = ".")) |
336 | 352 |
m <- match(levels(cd$sample_id), cd$sample_id) |
... | ... |
@@ -339,16 +355,21 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
339 | 355 |
sids <- levels(cd$sample_id)[o] |
340 | 356 |
ei <- data.frame(sample_id = sids, group_id = gids[o]) |
341 | 357 |
cd <- cd %>% mutate_at("sample_id", factor, levels = sids) |
342 |
- |
|
358 |
+ # gene metadata storing gene classes & specificities |
|
359 |
+ rd <- DataFrame(class = factor(class, |
|
360 |
+ levels = c("state", "shared", "type"))) |
|
361 |
+ rd$specs <- as.list(specs) |
|
362 |
+ # simulation metadata including used reference samples/cluster, |
|
363 |
+ # list of input arguments, and simulated genes' metadata |
|
343 | 364 |
md <- list( |
344 | 365 |
experiment_info = ei, |
345 | 366 |
n_cells = table(cd$sample_id), |
346 | 367 |
gene_info = gi, |
347 | 368 |
ref_sids = ref_sids, |
348 | 369 |
ref_kids = ref_kids, |
349 |
- parameters = params) |
|
350 |
- |
|
370 |
+ args = args) |
|
371 |
+ # return SCE |
|
351 | 372 |
SingleCellExperiment( |
352 | 373 |
assays = list(counts = as.matrix(y)), |
353 |
- colData = cd, metadata = md) |
|
374 |
+ colData = cd, rowData = rd, metadata = md) |
|
354 | 375 |
} |
... | ... |
@@ -27,6 +27,20 @@ |
27 | 27 |
#' Should be of length \code{nlevels(x$cluster_id)} with |
28 | 28 |
#' \code{levels(x$cluster_id)} as names. |
29 | 29 |
#' Defaults to factor of 1 for all clusters. |
30 |
+#' @param cells_phylo newick tree text representing cluster relations |
|
31 |
+#' and their relative distance. If a tree is given, the distance between |
|
32 |
+#' the clusters will be translated in the number of shared genes |
|
33 |
+#' (this relation is controlled with \code{params_dist}). The distance |
|
34 |
+#' between two clusters is defined as the sum of the branches lengths |
|
35 |
+#' separating them. |
|
36 |
+#' @param params_dist numeric vector of length 2. Defines the number of shared |
|
37 |
+#' genes as an adaptation of the exponential's PDF: |
|
38 |
+#' N = Ngenes x gamma1 * e^(-gamma2 x dist) , |
|
39 |
+#' where gamma1 is the parameter that controls the percentage of shared genes. |
|
40 |
+#' By default, it corresponds to \code{p_type} but it's advised to tune it |
|
41 |
+#' depending on the input prep_sce. gamma2 is the 'penalty' of increasing |
|
42 |
+#' the distance between clusters, applied on the number of shared genes. |
|
43 |
+#' Default to -3. |
|
30 | 44 |
#' |
31 | 45 |
#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} |
32 | 46 |
#' containing multiple clusters & samples across 2 groups. |
... | ... |
@@ -57,7 +71,24 @@ |
57 | 71 |
#' sim <- simData(sce, ng = 10, nc = 100, |
58 | 72 |
#' probs = list(NULL, NULL, c(1, 0))) |
59 | 73 |
#' levels(sim$group_id) |
60 |
-#' |
|
74 |
+#' |
|
75 |
+#' |
|
76 |
+#' # Hierarchical cell-type relations: |
|
77 |
+#' # we first define a phylogram representing the relations between 3 clusters |
|
78 |
+#' cells_phylo <- "(('cluster1':0.1, 'cluster2':0.1):0.4,'cluster3':0.5);" |
|
79 |
+#' # to verify the syntax and the relation, we can plot it |
|
80 |
+#' library(phylogram) |
|
81 |
+#' dend <- read.dendrogram(text = cells_phylo) |
|
82 |
+#' plot(dend) |
|
83 |
+#' # More complex relations are also possible |
|
84 |
+#' cells_phylo2 <- "(('cluster1':0.4, 'cluster2':0.4):0.4, ('cluster3':0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);" |
|
85 |
+#' dend2 <- read.dendrogram(text = cells_phylo2) |
|
86 |
+#' plot(dend2) |
|
87 |
+#' # simulate clusters based on these distances |
|
88 |
+#' sim <- simData(sce_, nk = 3, cells_phylo = cells_phylo) |
|
89 |
+#' # the information about the shared 'type' genes are kept in the metadata |
|
90 |
+#' table(metadata(sim)$gene_info$shared_class) |
|
91 |
+#' |
|
61 | 92 |
#' @author Helena L Crowell |
62 | 93 |
#' |
63 | 94 |
#' @references |
... | ... |
@@ -82,7 +113,8 @@ |
82 | 113 |
simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
83 | 114 |
probs = NULL, p_dd = diag(6)[1, ], paired = FALSE, |
84 | 115 |
p_ep = 0.5, p_dp = 0.3, p_dm = 0.5, |
85 |
- p_type = 0, lfc = 2, rel_lfc = NULL) { |
|
116 |
+ p_type = 0, lfc = 2, rel_lfc = NULL, |
|
117 |
+ cells_phylo = NULL, params_dist = c(0.2, 3) ) { |
|
86 | 118 |
|
87 | 119 |
# throughout this code... |
88 | 120 |
# k: cluster ID |
... | ... |
@@ -151,9 +183,45 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
151 | 183 |
gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs) |
152 | 184 |
gs_by_k <- set_colnames(replicate(nk, gs_by_k), kids) |
153 | 185 |
|
186 |
+ if (length(cells_phylo) > 0) { |
|
187 |
+ out <- .impute_shared_type_genes(x, gs_by_k, gs_idx, cells_phylo, |
|
188 |
+ params_dist) |
|
189 |
+ gs_by_k <- out[[1]] |
|
190 |
+ used_tg <- out[[2]] |
|
191 |
+ shared <- out[[3]] |
|
192 |
+ |
|
193 |
+ if (p_type != 0) { |
|
194 |
+ |
|
195 |
+ ## update gs_idx to avoid having type genes in the shared type genes |
|
196 |
+ gs_idx_red <- gs_idx |
|
197 |
+ ee_ep_id <- seq(1, 6 * length(kids) - 5, 6) |
|
198 |
+ ee_ep_id <- c(ee_ep_id, ee_ep_id + 1) |
|
199 |
+ for (i in ee_ep_id) gs_idx_red[[i]] <- (gs_idx[[i]])[-which(gs_idx[[i]] %in% used_tg)] |
|
200 |
+ type_info <- gs_by_k |
|
201 |
+ gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx_red, p_type) |
|
202 |
+ |
|
203 |
+ ## keep gene type info |
|
204 |
+ type_info[type_info == gs_by_k] <- NA |
|
205 |
+ type_info[!is.na(type_info)] <- "type" |
|
206 |
+ type_info[is.na(type_info)] <- "state" |
|
207 |
+ |
|
208 |
+ } else { |
|
209 |
+ type_info <- matrix("state", ng, nk) |
|
210 |
+ } |
|
211 |
+ } else { |
|
212 |
+ shared <- matrix("state", ng, nk) |
|
213 |
+ } |
|
214 |
+ |
|
154 | 215 |
# impute type-genes |
155 |
- if (p_type != 0) |
|
216 |
+ if (p_type != 0 & length(cells_phylo) == 0) { |
|
217 |
+ type_info <- gs_by_k |
|
156 | 218 |
gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type) |
219 |
+ |
|
220 |
+ ## keep gene type info |
|
221 |
+ type_info[type_info == gs_by_k] <- NA |
|
222 |
+ type_info[!is.na(type_info)] <- "type" |
|
223 |
+ type_info[is.na(type_info)] <- "state" |
|
224 |
+ } |
|
157 | 225 |
|
158 | 226 |
# split by cluster & categroy |
159 | 227 |
gs_by_k <- split(gs_by_k, col(gs_by_k)) |
... | ... |
@@ -164,6 +232,7 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
164 | 232 |
gs_by_k[[k]][gs_idx[[c, k]]])) |
165 | 233 |
|
166 | 234 |
# sample logFCs |
235 |
+ lfc0 <- lfc |
|
167 | 236 |
lfc <- vapply(kids, function(k) |
168 | 237 |
lapply(unfactor(cats), function(c) { |
169 | 238 |
n <- n_dd[c, k] |
... | ... |
@@ -243,6 +312,23 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
243 | 312 |
# reorder |
244 | 313 |
o <- order(as.numeric(gsub("[a-z]", "", gi$gene))) |
245 | 314 |
gi <- set_rownames(gi[o, ], NULL) |
315 |
+ # add gene infos |
|
316 |
+ gi$class <- c(t(type_info)) |
|
317 |
+ gi$shared_class <- c(t(shared)) |
|
318 |
+ # parameters |
|
319 |
+ params <- list( |
|
320 |
+ ng = ng, |
|
321 |
+ nc = nc, |
|
322 |
+ ns = ns, |
|
323 |
+ nk = nk, |
|
324 |
+ probs = probs, |
|
325 |
+ p_dd = p_dd, |
|
326 |
+ p_type = p_type, |
|
327 |
+ lfc = lfc0, |
|
328 |
+ rel_lfc = rel_lfc, |
|
329 |
+ cells_phylo = cells_phylo, |
|
330 |
+ params_dist = params_dist |
|
331 |
+ ) |
|
246 | 332 |
|
247 | 333 |
# construct SCE |
248 | 334 |
cd$group_id <- droplevels(cd$group_id) |
... | ... |
@@ -259,7 +345,8 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
259 | 345 |
n_cells = table(cd$sample_id), |
260 | 346 |
gene_info = gi, |
261 | 347 |
ref_sids = ref_sids, |
262 |
- ref_kids = ref_kids) |
|
348 |
+ ref_kids = ref_kids, |
|
349 |
+ parameters = params) |
|
263 | 350 |
|
264 | 351 |
SingleCellExperiment( |
265 | 352 |
assays = list(counts = as.matrix(y)), |
... | ... |
@@ -13,6 +13,9 @@ |
13 | 13 |
#' @param p_dd numeric vector of length 6. |
14 | 14 |
#' Specifies the probability of a gene being |
15 | 15 |
#' EE, EP, DE, DP, DM, or DB, respectively. |
16 |
+#' @param paired logial specifying whether a paired design should |
|
17 |
+#' be simulated (both groups use the same set of reference samples) |
|
18 |
+#' or not (reference samples are drawn at random). |
|
16 | 19 |
#' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells |
17 | 20 |
#' to be shifted to a different expression state in one group (see details). |
18 | 21 |
#' @param p_type numeric. Probaility of EE/EP gene being a type-gene. |
... | ... |
@@ -77,7 +80,7 @@ |
77 | 80 |
#' @export |
78 | 81 |
|
79 | 82 |
simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
80 |
- probs = NULL, p_dd = diag(6)[1, ], |
|
83 |
+ probs = NULL, p_dd = diag(6)[1, ], paired = FALSE, |
|
81 | 84 |
p_ep = 0.5, p_dp = 0.3, p_dm = 0.5, |
82 | 85 |
p_type = 0, lfc = 2, rel_lfc = NULL) { |
83 | 86 |
|
... | ... |
@@ -103,15 +106,24 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
103 | 106 |
|
104 | 107 |
# sample reference clusters & samples |
105 | 108 |
ref_kids <- setNames(sample(kids0, nk, nk > nk0), kids) |
106 |
- ref_sids <- vapply(gids, function(g) |
|
107 |
- setNames(sample(sids0, ns, ns > ns0), |
|
108 |
- paste0("sample", seq_len(ns))), |
|
109 |
- character(ns)) |
|
109 |
+ if (paired) { |
|
110 |
+ # use same set of reference samples for both groups |
|
111 |
+ ref_sids <- sample(sids0, ns, ns > ns0) |
|
112 |
+ ref_sids <- replicate(length(gids), ref_sids) |
|
113 |
+ } else { |
|
114 |
+ # draw reference samples at random for each group |
|
115 |
+ ref_sids <- replicate(length(gids), |
|
116 |
+ sample(sids0, ns, ns > ns0)) |
|
117 |
+ } |
|
118 |
+ dimnames(ref_sids) <- list(sids, gids) |
|
110 | 119 |
|
111 | 120 |
if (is.null(rel_lfc)) |
112 | 121 |
rel_lfc <- rep(1, nk) |
113 |
- if (is.null(names(rel_lfc))) |
|
122 |
+ if (is.null(names(rel_lfc))) { |
|
114 | 123 |
names(rel_lfc) <- kids |
124 |
+ } else { |
|
125 |
+ stopifnot(names(rel_lfc) %in% kids0) |
|
126 |
+ } |
|
115 | 127 |
|
116 | 128 |
# initialize count matrix |
117 | 129 |
gs <- paste0("gene", seq_len(ng)) |
... | ... |
@@ -245,7 +257,9 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
245 | 257 |
md <- list( |
246 | 258 |
experiment_info = ei, |
247 | 259 |
n_cells = table(cd$sample_id), |
248 |
- gene_info = gi) |
|
260 |
+ gene_info = gi, |
|
261 |
+ ref_sids = ref_sids, |
|
262 |
+ ref_kids = ref_kids) |
|
249 | 263 |
|
250 | 264 |
SingleCellExperiment( |
251 | 265 |
assays = list(counts = as.matrix(y)), |
... | ... |
@@ -36,8 +36,7 @@ |
36 | 36 |
#' sce <- prepSim(sce) |
37 | 37 |
#' |
38 | 38 |
#' # simulate data |
39 |
-#' (sim <- simData(sce, |
|
40 |
-#' n_genes = 100, n_cells = 10, |
|
39 |
+#' (sim <- simData(sce, ng = 100, nc = 10, |
|
41 | 40 |
#' p_dd = c(0.9, 0, 0.1, 0, 0, 0))) |
42 | 41 |
#' |
43 | 42 |
#' # simulation metadata |
... | ... |
@@ -47,14 +46,12 @@ |
47 | 46 |
#' table(gi$category) |
48 | 47 |
#' |
49 | 48 |
#' # unbalanced sample sizes |
50 |
-#' sim <- simData(sce, |
|
51 |
-#' n_genes = 10, n_cells = 100, |
|
49 |
+#' sim <- simData(sce, ng = 10, nc = 100, |
|
52 | 50 |
#' probs = list(NULL, c(0.25, 0.75), NULL)) |
53 | 51 |
#' table(sim$sample_id) |
54 | 52 |
#' |
55 | 53 |
#' # one group only |
56 |
-#' sim <- simData(sce, |
|
57 |
-#' n_genes = 10, n_cells = 100, |
|
54 |
+#' sim <- simData(sce, ng = 10, nc = 100, |
|
58 | 55 |
#' probs = list(NULL, NULL, c(1, 0))) |
59 | 56 |
#' levels(sim$group_id) |
60 | 57 |
#' |
... | ... |
@@ -236,6 +233,7 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
236 | 233 |
gi <- set_rownames(gi[o, ], NULL) |
237 | 234 |
|
238 | 235 |
# construct SCE |
236 |
+ cd$group_id <- droplevels(cd$group_id) |
|
239 | 237 |
cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = ".")) |
240 | 238 |
m <- match(levels(cd$sample_id), cd$sample_id) |
241 | 239 |
gids <- cd$group_id[m] |
... | ... |
@@ -13,6 +13,8 @@ |
13 | 13 |
#' @param p_dd numeric vector of length 6. |
14 | 14 |
#' Specifies the probability of a gene being |
15 | 15 |
#' EE, EP, DE, DP, DM, or DB, respectively. |
16 |
+#' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells |
|
17 |
+#' to be shifted to a different expression state in one group (see details). |
|
16 | 18 |
#' @param p_type numeric. Probaility of EE/EP gene being a type-gene. |
17 | 19 |
#' If a gene is of class "type" in a given cluster, a unique mean |
18 | 20 |
#' will be used for that gene in the respective cluster. |
... | ... |
@@ -78,8 +80,9 @@ |
78 | 80 |
#' @export |
79 | 81 |
|
80 | 82 |
simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
81 |
- probs = NULL, p_dd = diag(6)[1, ], p_type = 0, |
|
82 |
- lfc = 2, rel_lfc = NULL) { |
|
83 |
+ probs = NULL, p_dd = diag(6)[1, ], |
|
84 |
+ p_ep = 0.5, p_dp = 0.3, p_dm = 0.5, |
|
85 |
+ p_type = 0, lfc = 2, rel_lfc = NULL) { |
|
83 | 86 |
|
84 | 87 |
# throughout this code... |
85 | 88 |
# k: cluster ID |
... | ... |
@@ -201,7 +204,7 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
201 | 204 |
d_kc <- d[gs0] |
202 | 205 |
lfc_kc <- lfc[[c, k]] |
203 | 206 |
|
204 |
- re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc) |
|
207 |
+ re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc, p_ep, p_dp, p_dm) |
|
205 | 208 |
y[gi, ci] <- re$cs |
206 | 209 |
|
207 | 210 |
for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse( |
... | ... |
@@ -6,9 +6,7 @@ |
6 | 6 |
#' across 2 experimental conditions from a real scRNA-seq data set. |
7 | 7 |
#' |
8 | 8 |
#' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. |
9 |
-#' @param n_genes # of genes to simulate. |
|
10 |
-#' @param n_cells # of cells to simulate. |
|
11 |
-#' Either a single numeric or a range to sample from. |
|
9 |
+#' @param ng,nc,ns,nk # of genes, cells, samples and clusters to simulate. |
|
12 | 10 |
#' @param probs a list of length 3 containing probabilities of a cell belonging |
13 | 11 |
#' to each cluster, sample, and group, respectively. List elements must be |
14 | 12 |
#' NULL (equal probabilities) or numeric values in [0, 1] that sum to 1. |
... | ... |
@@ -79,7 +77,7 @@ |
79 | 77 |
#' @importFrom S4Vectors split |
80 | 78 |
#' @export |
81 | 79 |
|
82 |
-simData <- function(x, n_genes = 500, n_cells = 300, |
|
80 |
+simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3, |
|
83 | 81 |
probs = NULL, p_dd = diag(6)[1, ], p_type = 0, |
84 | 82 |
lfc = 2, rel_lfc = NULL) { |
85 | 83 |
|
... | ... |
@@ -88,15 +86,27 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
88 | 86 |
# s: sample ID |
89 | 87 |
# g: group ID |
90 | 88 |
# c: DD category |
89 |
+ # 0: reference |
|
91 | 90 |
|
92 | 91 |
# check validity of input arguments |
93 | 92 |
.check_sce(x, req_group = FALSE) |
94 | 93 |
.check_args_simData(as.list(environment())) |
95 | 94 |
|
96 |
- kids <- set_names(levels(x$cluster_id)) |
|
97 |
- sids <- set_names(levels(x$sample_id)) |
|
95 |
+ # reference IDs |
|
96 |
+ nk0 <- length(kids0 <- set_names(levels(x$cluster_id))) |
|
97 |
+ ns0 <- length(sids0 <- set_names(levels(x$sample_id))) |
|
98 |
+ |
|
99 |
+ # simulation IDs |
|
100 |
+ nk <- length(kids <- set_names(paste0("cluster", seq_len(nk)))) |
|
101 |
+ sids <- set_names(paste0("sample", seq_len(ns))) |
|
98 | 102 |
gids <- set_names(c("A", "B")) |
99 |
- nk <- length(kids) |
|
103 |
+ |
|
104 |
+ # sample reference clusters & samples |
|
105 |
+ ref_kids <- setNames(sample(kids0, nk, nk > nk0), kids) |
|
106 |
+ ref_sids <- vapply(gids, function(g) |
|
107 |
+ setNames(sample(sids0, ns, ns > ns0), |
|
108 |
+ paste0("sample", seq_len(ns))), |
|
109 |
+ character(ns)) |
|
100 | 110 |
|
101 | 111 |
if (is.null(rel_lfc)) |
102 | 112 |
rel_lfc <- rep(1, nk) |
... | ... |
@@ -104,15 +114,15 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
104 | 114 |
names(rel_lfc) <- kids |
105 | 115 |
|
106 | 116 |
# initialize count matrix |
107 |
- gs <- paste0("gene", seq_len(n_genes)) |
|
108 |
- cs <- paste0("cell", seq_len(n_cells)) |
|
109 |
- y <- matrix(0, n_genes, n_cells, dimnames = list(gs, cs)) |
|
117 |
+ gs <- paste0("gene", seq_len(ng)) |
|
118 |
+ cs <- paste0("cell", seq_len(nc)) |
|
119 |
+ y <- matrix(0, ng, nc, dimnames = list(gs, cs)) |
|
110 | 120 |
|
111 | 121 |
# sample cell metadata |
112 | 122 |
cd <- .sample_cell_md( |
113 |
- n = n_cells, probs = probs, |
|
114 |
- ids = list(kids, sids, gids)) %>% |
|
115 |
- set_rownames(cs) |
|
123 |
+ n = nc, probs = probs, |
|
124 |
+ ids = list(kids, sids, gids)) |
|
125 |
+ rownames(cd) <- cs |
|
116 | 126 |
cs_idx <- .split_cells(cd, by = colnames(cd)) |
117 | 127 |
n_cs <- modify_depth(cs_idx, -1, length) |
118 | 128 |
|
... | ... |
@@ -120,14 +130,14 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
120 | 130 |
cs_by_ks <- .split_cells(x) |
121 | 131 |
|
122 | 132 |
# sample nb. of genes to simulate per category & gene indices |
123 |
- n_dd <- replicate(nk, |
|
124 |
- table(sample(factor(cats, levels = cats), n_genes, TRUE, p_dd))) %>% |
|
125 |
- set_colnames(kids) |
|
133 |
+ n_dd <- table(sample(cats, ng, TRUE, p_dd)) |
|
134 |
+ n_dd <- replicate(nk, n_dd) |
|
135 |
+ colnames(n_dd) <- kids |
|
126 | 136 |
gs_idx <- .sample_gene_inds(gs, n_dd) |
127 | 137 |
|
128 | 138 |
# for ea. cluster, sample set of genes to simulate from |
129 |
- gs_by_k <- setNames(sample(rownames(x), n_genes, TRUE), gs) |
|
130 |
- gs_by_k <- replicate(nk, gs_by_k) %>% set_colnames(kids) |
|
139 |
+ gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs) |
|
140 |
+ gs_by_k <- set_colnames(replicate(nk, gs_by_k), kids) |
|
131 | 141 |
|
132 | 142 |
# impute type-genes |
133 | 143 |
if (p_type != 0) |
... | ... |
@@ -135,74 +145,70 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
135 | 145 |
|
136 | 146 |
# split by cluster & categroy |
137 | 147 |
gs_by_k <- split(gs_by_k, col(gs_by_k)) |
138 |
- gs_by_k <- map(gs_by_k, set_names, gs) |
|
139 |
- names(gs_by_k) <- kids |
|
148 |
+ gs_by_k <- setNames(map(gs_by_k, set_names, gs), kids) |
|
140 | 149 |
|
141 | 150 |
gs_by_kc <- lapply(kids, function(k) |
142 |
- lapply(cats, function(c) |
|
143 |
- gs_by_k[[k]][gs_idx[[c, k]]]) %>% |
|
144 |
- set_names(cats)) |
|
151 |
+ lapply(unfactor(cats), function(c) |
|
152 |
+ gs_by_k[[k]][gs_idx[[c, k]]])) |
|
145 | 153 |
|
146 | 154 |
# sample logFCs |
147 | 155 |
lfc <- vapply(kids, function(k) |
148 |
- lapply(cats, function(c) { |
|
156 |
+ lapply(unfactor(cats), function(c) { |
|
149 | 157 |
n <- n_dd[c, k] |
150 | 158 |
if (c == "ee") return(rep(NA, n)) |
151 | 159 |
signs <- sample(c(-1, 1), n, TRUE) |
152 | 160 |
lfcs <- rgamma(n, 4, 4/lfc) * signs |
153 | 161 |
names(lfcs) <- gs_by_kc[[k]][[c]] |
154 | 162 |
lfcs * rel_lfc[k] |
155 |
- }), vector("list", length(cats))) %>% |
|
156 |
- set_rownames(cats) |
|
157 |
- |
|
163 |
+ }), vector("list", length(cats))) |
|
164 |
+ |
|
158 | 165 |
# compute NB parameters |
159 |
- o <- exp(colData(x)$offset) |
|
160 |
- m <- lapply(sids, function(s) { |
|
161 |
- cn <- paste("beta", s, sep = ".") |
|
162 |
- k <- grep(cn, names(rowData(x))) |
|
163 |
- b <- exp(rowData(x)[[k]]) |
|
164 |
- m <- vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>% |
|
165 |
- set_rownames(rownames(x)) %>% |
|
166 |
- set_colnames(colnames(x)) |
|
166 |
+ m <- lapply(sids0, function(s) { |
|
167 |
+ b <- paste0("beta.", s) |
|
168 |
+ b <- exp(rowData(x)[[b]]) |
|
169 |
+ m <- outer(b, exp(x$offset), "*") |
|
170 |
+ dimnames(m) <- dimnames(x); m |
|
167 | 171 |
}) |
168 |
- d <- rowData(x)$dispersion %>% |
|
169 |
- set_names(rownames(x)) |
|
172 |
+ d <- rowData(x)$dispersion |
|
173 |
+ names(d) <- rownames(x) |
|
170 | 174 |
|
171 | 175 |
sim_mean <- lapply(kids, function(k) |
172 |
- lapply(gids, function(g) |
|
173 |
- setNames(numeric(n_genes), rownames(y)))) |
|
176 |
+ lapply(gids, function(g) |
|
177 |
+ setNames(numeric(ng), gs))) |
|
178 |
+ |
|
174 | 179 |
for (k in kids) { |
175 | 180 |
for (s in sids) { |
181 |
+ # get reference samples, clusters & cells |
|
182 |
+ s0 <- ref_sids[s, ] |
|
183 |
+ k0 <- ref_kids[k] |
|
184 |
+ cs0 <- cs_by_ks[[k0]][s0] |
|
185 |
+ |
|
186 |
+ # get output cell indices |
|
187 |
+ ci <- unlist(cs_idx[[k]][[s]]) |
|
188 |
+ |
|
176 | 189 |
for (c in cats[n_dd[, k] != 0]) { |
177 |
- gs_kc <- gs_by_kc[[k]][[c]] |
|
178 |
- cs_ks <- cs_by_ks[[k]][[s]] |
|
190 |
+ # sample cells to simulate from |
|
191 |
+ cs_g1 <- sample(cs0[[1]], n_cs[[k]][[s]][[1]], TRUE) |
|
192 |
+ cs_g2 <- sample(cs0[[2]], n_cs[[k]][[s]][[2]], TRUE) |
|
179 | 193 |
|
180 |
- g1 <- cs_idx[[k]][[s]]$A |
|
181 |
- g2 <- cs_idx[[k]][[s]]$B |
|
194 |
+ # get reference genes & output gene indices |
|
195 |
+ gs0 <- gs_by_kc[[k]][[c]] |
|
196 |
+ gi <- gs_idx[[c, k]] |
|
182 | 197 |
|
183 |
- ng1 <- length(g1) |
|
184 |
- ng2 <- length(g2) |
|
185 |
- |
|
186 |
- cs_g1 <- sample(cs_ks, ng1, replace = TRUE) |
|
187 |
- cs_g2 <- sample(cs_ks, ng2, replace = TRUE) |
|
188 |
- |
|
189 |
- m_g1 <- m[[s]][gs_kc, cs_g1, drop = FALSE] |
|
190 |
- m_g2 <- m[[s]][gs_kc, cs_g2, drop = FALSE] |
|
191 |
- d_kc <- d[gs_kc] |
|
198 |
+ # get NB parameters |
|
199 |
+ m_g1 <- m[[s0[[1]]]][gs0, cs_g1, drop = FALSE] |
|
200 |
+ m_g2 <- m[[s0[[2]]]][gs0, cs_g2, drop = FALSE] |
|
201 |
+ d_kc <- d[gs0] |
|
192 | 202 |
lfc_kc <- lfc[[c, k]] |
193 | 203 |
|
194 |
- gidx <- gs_idx[[c, k]] |
|
195 |
- cidx <- c(g1, g2) |
|
196 |
- |
|
197 | 204 |
re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc) |
198 |
- y[gidx, cidx] <- re$cs |
|
205 |
+ y[gi, ci] <- re$cs |
|
199 | 206 |
|
200 |
- for (g in c("A", "B")) sim_mean[[k]][[g]][gidx] <- |
|
201 |
- ifelse(is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]] |
|
207 |
+ for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse( |
|
208 |
+ is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]] |
|
202 | 209 |
} |
203 | 210 |
} |
204 | 211 |
} |
205 |
- |
|
206 | 212 |
sim_mean <- sim_mean %>% |
207 | 213 |
map(bind_cols) %>% |
208 | 214 |
bind_rows(.id = "cluster_id") %>% |
... | ... |
@@ -224,7 +230,7 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
224 | 230 |
rename("sim_mean.A" = "A", "sim_mean.B" = "B") |
225 | 231 |
# reorder |
226 | 232 |
o <- order(as.numeric(gsub("[a-z]", "", gi$gene))) |
227 |
- gi <- gi[o, ] %>% set_rownames(NULL) |
|
233 |
+ gi <- set_rownames(gi[o, ], NULL) |
|
228 | 234 |
|
229 | 235 |
# construct SCE |
230 | 236 |
cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = ".")) |
... | ... |
@@ -242,6 +248,5 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
242 | 248 |
|
243 | 249 |
SingleCellExperiment( |
244 | 250 |
assays = list(counts = as.matrix(y)), |
245 |
- colData = cd, |
|
246 |
- metadata = md) |
|
251 |
+ colData = cd, metadata = md) |
|
247 | 252 |
} |
... | ... |
@@ -161,10 +161,9 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
161 | 161 |
cn <- paste("beta", s, sep = ".") |
162 | 162 |
k <- grep(cn, names(rowData(x))) |
163 | 163 |
b <- exp(rowData(x)[[k]]) |
164 |
- vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>% |
|
164 |
+ m <- vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>% |
|
165 | 165 |
set_rownames(rownames(x)) %>% |
166 |
- set_colnames(colnames(x)) %>% |
|
167 |
- round |
|
166 |
+ set_colnames(colnames(x)) |
|
168 | 167 |
}) |
169 | 168 |
d <- rowData(x)$dispersion %>% |
170 | 169 |
set_names(rownames(x)) |
... | ... |
@@ -71,7 +71,7 @@ |
71 | 71 |
#' @importFrom dplyr mutate_all mutate_at |
72 | 72 |
#' @importFrom edgeR DGEList estimateDisp glmFit |
73 | 73 |
#' @importFrom magrittr set_colnames set_rownames |
74 |
-#' @importFrom purrr modify_at set_names |
|
74 |
+#' @importFrom purrr modify_depth set_names |
|
75 | 75 |
#' @importFrom stats model.matrix rgamma setNames |
76 | 76 |
#' @importFrom SingleCellExperiment SingleCellExperiment |
77 | 77 |
#' @importFrom SummarizedExperiment colData |
... | ... |
@@ -30,9 +30,32 @@ |
30 | 30 |
#' |
31 | 31 |
#' @examples |
32 | 32 |
#' data(sce) |
33 |
-#' simData(sce, |
|
34 |
-#' n_genes = 10, n_cells = 10, |
|
35 |
-#' p_dd = diag(6)[1, ]) |
|
33 |
+#' |
|
34 |
+#' # prep. SCE for simulation |
|
35 |
+#' sce <- prepSim(sce) |
|
36 |
+#' |
|
37 |
+#' # simulate data |
|
38 |
+#' (sim <- simData(sce, |
|
39 |
+#' n_genes = 100, n_cells = 10, |
|
40 |
+#' p_dd = c(0.9, 0, 0.1, 0, 0, 0))) |
|
41 |
+#' |
|
42 |
+#' # simulation metadata |
|
43 |
+#' head(gi <- metadata(sim)$gene_info) |
|
44 |
+#' |
|
45 |
+#' # should be ~10% DE |
|
46 |
+#' table(gi$category) |
|
47 |
+#' |
|
48 |
+#' # unbalanced sample sizes |
|
49 |
+#' sim <- simData(sce, |
|
50 |
+#' n_genes = 10, n_cells = 100, |
|
51 |
+#' probs = list(NULL, c(0.1, 0.3, 0.6), NULL)) |
|
52 |
+#' table(sim$sample_id) |
|
53 |
+#' |
|
54 |
+#' # one group only |
|
55 |
+#' sim <- simData(sce, |
|
56 |
+#' n_genes = 10, n_cells = 100, |
|
57 |
+#' probs = list(NULL, NULL, c(1, 0))) |
|
58 |
+#' levels(sim$group_id) |
|
36 | 59 |
#' |
37 | 60 |
#' @author Helena L Crowell |
38 | 61 |
#' |
... | ... |
@@ -53,7 +76,6 @@ |
53 | 76 |
#' @importFrom SingleCellExperiment SingleCellExperiment |
54 | 77 |
#' @importFrom SummarizedExperiment colData |
55 | 78 |
#' @importFrom S4Vectors split |
56 |
-#' @importFrom tibble column_to_rownames |
|
57 | 79 |
#' @export |
58 | 80 |
|
59 | 81 |
simData <- function(x, n_genes = 500, n_cells = 300, |
... | ... |
@@ -63,35 +85,22 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
63 | 85 |
# throughout this code... |
64 | 86 |
# k: cluster ID |
65 | 87 |
# s: sample ID |
66 |
- # c: gene category |
|
88 |
+ # g: group ID |
|
89 |
+ # c: DD category |
|
67 | 90 |
|
68 | 91 |
# check validity of input arguments |
69 | 92 |
.check_sce(x, req_group = FALSE) |
70 |
- stopifnot(is.numeric(n_genes), length(n_genes) == 1) |
|
71 |
- stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2) |
|
72 |
- stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1) |
|
73 |
- stopifnot(is.numeric(p_type), length(p_type) == 1, p_type >= 0) |
|
74 |
- stopifnot(is.numeric(lfc), is.numeric(lfc), lfc > 1) |
|
75 |
- |
|
76 |
- kids <- levels(x$cluster_id) |
|
77 |
- sids <- levels(x$sample_id) |
|
78 |
- gids <- c("A", "B") |
|
79 |
- names(kids) <- kids |
|
80 |
- names(sids) <- sids |
|
81 |
- names(gids) <- gids |
|
93 |
+ .check_args_simData(as.list(environment())) |
|
94 |
+ |
|
95 |
+ kids <- set_names(levels(x$cluster_id)) |
|
96 |
+ sids <- set_names(levels(x$sample_id)) |
|
97 |
+ gids <- set_names(c("A", "B")) |
|
82 | 98 |
nk <- length(kids) |
83 | 99 |
|
84 |
- if (is.null(rel_lfc)) { |
|
100 |
+ if (is.null(rel_lfc)) |
|
85 | 101 |
rel_lfc <- rep(1, nk) |
102 |
+ if (is.null(names(rel_lfc))) |
|
86 | 103 |
names(rel_lfc) <- kids |
87 |
- } else { |
|
88 |
- stopifnot(is.numeric(rel_lfc), length(rel_lfc) == nk, rel_lfc >= 0) |
|
89 |
- if (is.null(names(rel_lfc))) { |
|
90 |
- names(rel_lfc) <- kids |
|
91 |
- } else { |
|
92 |
- stopifnot(setequal(names(rel_lfc), kids)) |
|
93 |
- } |
|
94 |
- } |
|
95 | 104 |
|
96 | 105 |
# initialize count matrix |
97 | 106 |
gs <- paste0("gene", seq_len(n_genes)) |
... | ... |
@@ -25,8 +25,6 @@ |
25 | 25 |
#' \code{levels(x$cluster_id)} as names. |
26 | 26 |
#' Defaults to factor of 1 for all clusters. |
27 | 27 |
#' |
28 |
-#' |
|
29 |
-#' |
|
30 | 28 |
#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} |
31 | 29 |
#' containing multiple clusters & samples across 2 groups. |
32 | 30 |
#' |
... | ... |
@@ -35,6 +33,16 @@ |
35 | 33 |
#' simData(sce, |
36 | 34 |
#' n_genes = 10, n_cells = 10, |
37 | 35 |
#' p_dd = diag(6)[1, ]) |
36 |
+#' |
|
37 |
+#' @author Helena L Crowell |
|
38 |
+#' |
|
39 |
+#' @references |
|
40 |
+#' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, |
|
41 |
+#' Collin, L, Raposo, C, Malhotra, D & Robinson, MD: |
|
42 |
+#' On the discovery of population-specific state transitions from |
|
43 |
+#' multi-sample multi-condition single-cell RNA sequencing data. |
|
44 |
+#' \emph{bioRxiv} \strong{713412} (2018). |
|
45 |
+#' doi: \url{https://doi.org/10.1101/713412} |
|
38 | 46 |
#' |
39 | 47 |
#' @importFrom data.table data.table |
40 | 48 |
#' @importFrom dplyr mutate_all mutate_at |
... | ... |
@@ -46,7 +54,6 @@ |
46 | 54 |
#' @importFrom SummarizedExperiment colData |
47 | 55 |
#' @importFrom S4Vectors split |
48 | 56 |
#' @importFrom tibble column_to_rownames |
49 |
-#' |
|
50 | 57 |
#' @export |
51 | 58 |
|
52 | 59 |
simData <- function(x, n_genes = 500, n_cells = 300, |
... | ... |
@@ -117,8 +117,10 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
117 | 117 |
gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type) |
118 | 118 |
|
119 | 119 |
# split by cluster & categroy |
120 |
- gs_by_k <- gs_by_k %>% split(col(.)) %>% |
|
121 |
- set_names(kids) %>% map(set_names, gs) |
|
120 |
+ gs_by_k <- split(gs_by_k, col(gs_by_k)) |
|
121 |
+ gs_by_k <- map(gs_by_k, set_names, gs) |
|
122 |
+ names(gs_by_k) <- kids |
|
123 |
+ |
|
122 | 124 |
gs_by_kc <- lapply(kids, function(k) |
123 | 125 |
lapply(cats, function(c) |
124 | 126 |
gs_by_k[[k]][gs_idx[[c, k]]]) %>% |
... | ... |
@@ -144,7 +144,8 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
144 | 144 |
b <- exp(rowData(x)[[k]]) |
145 | 145 |
vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>% |
146 | 146 |
set_rownames(rownames(x)) %>% |
147 |
- set_colnames(colnames(x)) |
|
147 |
+ set_colnames(colnames(x)) %>% |
|
148 |
+ round |
|
148 | 149 |
}) |
149 | 150 |
d <- rowData(x)$dispersion %>% |
150 | 151 |
set_names(rownames(x)) |
... | ... |
@@ -177,8 +177,9 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
177 | 177 |
|
178 | 178 |
re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc) |
179 | 179 |
y[gidx, cidx] <- re$cs |
180 |
- sim_mean[[k]]$A[gidx] <- re$ms$A |
|
181 |
- sim_mean[[k]]$B[gidx] <- re$ms$B |
|
180 |
+ |
|
181 |
+ for (g in c("A", "B")) sim_mean[[k]][[g]][gidx] <- |
|
182 |
+ ifelse(is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]] |
|
182 | 183 |
} |
183 | 184 |
} |
184 | 185 |
} |
... | ... |
@@ -94,7 +94,8 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
94 | 94 |
# sample cell metadata |
95 | 95 |
cd <- .sample_cell_md( |
96 | 96 |
n = n_cells, probs = probs, |
97 |
- ids = list(kids, sids, gids)) %>% set_rownames(cs) |
|
97 |
+ ids = list(kids, sids, gids)) %>% |
|
98 |
+ set_rownames(cs) |
|
98 | 99 |
cs_idx <- .split_cells(cd, by = colnames(cd)) |
99 | 100 |
n_cs <- modify_depth(cs_idx, -1, length) |
100 | 101 |
|
... | ... |
@@ -15,11 +15,17 @@ |
15 | 15 |
#' @param p_dd numeric vector of length 6. |
16 | 16 |
#' Specifies the probability of a gene being |
17 | 17 |
#' EE, EP, DE, DP, DM, or DB, respectively. |
18 |
-#' @param fc numeric value to use as mean logFC |
|
19 |
-#' for DE, DP, DM, and DB type of genes. |
|
20 | 18 |
#' @param p_type numeric. Probaility of EE/EP gene being a type-gene. |
21 | 19 |
#' If a gene is of class "type" in a given cluster, a unique mean |
22 | 20 |
#' will be used for that gene in the respective cluster. |
21 |
+#' @param lfc numeric value to use as mean logFC |
|
22 |
+#' for DE, DP, DM, and DB type of genes. |
|
23 |
+#' @param rel_lfc numeric vector of relative logFCs for each cluster. |
|
24 |
+#' Should be of length \code{nlevels(x$cluster_id)} with |
|
25 |
+#' \code{levels(x$cluster_id)} as names. |
|
26 |
+#' Defaults to factor of 1 for all clusters. |
|
27 |
+#' |
|
28 |
+#' |
|
23 | 29 |
#' |
24 | 30 |
#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} |
25 | 31 |
#' containing multiple clusters & samples across 2 groups. |
... | ... |
@@ -45,7 +51,7 @@ |
45 | 51 |
|
46 | 52 |
simData <- function(x, n_genes = 500, n_cells = 300, |
47 | 53 |
probs = NULL, p_dd = diag(6)[1, ], p_type = 0, |
48 |
- fc = 2, rel_fc = NULL) { |
|
54 |
+ lfc = 2, rel_lfc = NULL) { |
|
49 | 55 |
|
50 | 56 |
# throughout this code... |
51 | 57 |
# k: cluster ID |
... | ... |
@@ -58,7 +64,7 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
58 | 64 |
stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2) |
59 | 65 |
stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1) |
60 | 66 |
stopifnot(is.numeric(p_type), length(p_type) == 1, p_type >= 0) |
61 |
- stopifnot(is.numeric(fc), is.numeric(fc), fc > 1) |
|
67 |
+ stopifnot(is.numeric(lfc), is.numeric(lfc), lfc > 1) |
|
62 | 68 |
|
63 | 69 |
kids <- levels(x$cluster_id) |
64 | 70 |
sids <- levels(x$sample_id) |
... | ... |
@@ -67,10 +73,17 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
67 | 73 |
names(sids) <- sids |
68 | 74 |
names(gids) <- gids |
69 | 75 |
nk <- length(kids) |
70 |
- if (is.null(rel_fc)) { |
|
71 |
- rel_fc <- rep(1, nk) |
|
76 |
+ |
|
77 |
+ if (is.null(rel_lfc)) { |
|
78 |
+ rel_lfc <- rep(1, nk) |
|
79 |
+ names(rel_lfc) <- kids |
|
72 | 80 |
} else { |
73 |
- stopifnot(is.numeric(rel_fc), length(rel_fc) == nk, rel_fc >= 0) |
|
81 |
+ stopifnot(is.numeric(rel_lfc), length(rel_lfc) == nk, rel_lfc >= 0) |
|
82 |
+ if (is.null(names(rel_lfc))) { |
|
83 |
+ names(rel_lfc) <- kids |
|
84 |
+ } else { |
|
85 |
+ stopifnot(setequal(names(rel_lfc), kids)) |
|
86 |
+ } |
|
74 | 87 |
} |
75 | 88 |
|
76 | 89 |
# initialize count matrix |
... | ... |
@@ -116,9 +129,9 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
116 | 129 |
n <- n_dd[c, k] |
117 | 130 |
if (c == "ee") return(rep(NA, n)) |
118 | 131 |
signs <- sample(c(-1, 1), n, TRUE) |
119 |
- lfc <- rgamma(n, 4, 4/fc) * signs |
|
120 |
- names(lfc) <- gs_by_kc[[k]][[c]] |
|
121 |
- return(lfc) |
|
132 |
+ lfcs <- rgamma(n, 4, 4/lfc) * signs |
|
133 |
+ names(lfcs) <- gs_by_kc[[k]][[c]] |
|
134 |
+ lfcs * rel_lfc[k] |
|
122 | 135 |
}), vector("list", length(cats))) %>% |
123 | 136 |
set_rownames(cats) |
124 | 137 |
|
... | ... |
@@ -17,6 +17,9 @@ |
17 | 17 |
#' EE, EP, DE, DP, DM, or DB, respectively. |
18 | 18 |
#' @param fc numeric value to use as mean logFC |
19 | 19 |
#' for DE, DP, DM, and DB type of genes. |
20 |
+#' @param p_type numeric. Probaility of EE/EP gene being a type-gene. |
|
21 |
+#' If a gene is of class "type" in a given cluster, a unique mean |
|
22 |
+#' will be used for that gene in the respective cluster. |
|
20 | 23 |
#' |
21 | 24 |
#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} |
22 | 25 |
#' containing multiple clusters & samples across 2 groups. |
... | ... |
@@ -41,7 +44,8 @@ |
41 | 44 |
#' @export |
42 | 45 |
|
43 | 46 |
simData <- function(x, n_genes = 500, n_cells = 300, |
44 |
- probs = NULL, p_dd = diag(6)[1, ], fc = 2) { |
|
47 |
+ probs = NULL, p_dd = diag(6)[1, ], p_type = 0, |
|
48 |
+ fc = 2, rel_fc = NULL) { |
|
45 | 49 |
|
46 | 50 |
# throughout this code... |
47 | 51 |
# k: cluster ID |
... | ... |
@@ -53,6 +57,7 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
53 | 57 |
stopifnot(is.numeric(n_genes), length(n_genes) == 1) |
54 | 58 |
stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2) |
55 | 59 |
stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1) |
60 |
+ stopifnot(is.numeric(p_type), length(p_type) == 1, p_type >= 0) |
|
56 | 61 |
stopifnot(is.numeric(fc), is.numeric(fc), fc > 1) |
57 | 62 |
|
58 | 63 |
kids <- levels(x$cluster_id) |
... | ... |
@@ -62,6 +67,11 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
62 | 67 |
names(sids) <- sids |
63 | 68 |
names(gids) <- gids |
64 | 69 |
nk <- length(kids) |
70 |
+ if (is.null(rel_fc)) { |
|
71 |
+ rel_fc <- rep(1, nk) |
|
72 |
+ } else { |
|
73 |
+ stopifnot(is.numeric(rel_fc), length(rel_fc) == nk, rel_fc >= 0) |
|
74 |
+ } |
|
65 | 75 |
|
66 | 76 |
# initialize count matrix |
67 | 77 |
gs <- paste0("gene", seq_len(n_genes)) |
... | ... |
@@ -86,10 +96,15 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
86 | 96 |
|
87 | 97 |
# for ea. cluster, sample set of genes to simulate from |
88 | 98 |
gs_by_k <- setNames(sample(rownames(x), n_genes, TRUE), gs) |
89 |
- gs_by_k <- replicate(nk, gs_by_k, simplify = FALSE) %>% set_names(kids) |
|
90 |
- #gs_by_k <- replicate(nk, |
|
91 |
- # setNames(sample(rownames(x), n_genes, TRUE), gs), |
|
92 |
- # simplify = FALSE) %>% set_names(kids) |
|
99 |
+ gs_by_k <- replicate(nk, gs_by_k) %>% set_colnames(kids) |
|
100 |
+ |
|
101 |
+ # impute type-genes |
|
102 |
+ if (p_type != 0) |
|
103 |
+ gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type) |
|
104 |
+ |
|
105 |
+ # split by cluster & categroy |
|
106 |
+ gs_by_k <- gs_by_k %>% split(col(.)) %>% |
|
107 |
+ set_names(kids) %>% map(set_names, gs) |
|
93 | 108 |
gs_by_kc <- lapply(kids, function(k) |
94 | 109 |
lapply(cats, function(c) |
95 | 110 |
gs_by_k[[k]][gs_idx[[c, k]]]) %>% |
... | ... |
@@ -108,12 +123,17 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
108 | 123 |
set_rownames(cats) |
109 | 124 |
|
110 | 125 |
# compute NB parameters |
111 |
- b <- exp(rowData(x)$beta) |
|
112 | 126 |
o <- exp(colData(x)$offset) |
113 |
- m <- vapply(o, function(l) b*l, numeric(nrow(x))) |
|
114 |
- dimnames(m) <- dimnames(x) |
|
115 |
- d <- rowData(x)$dispersion |
|
116 |
- names(d) <- rownames(x) |
|
127 |
+ m <- lapply(sids, function(s) { |
|
128 |
+ cn <- paste("beta", s, sep = ".") |
|
129 |
+ k <- grep(cn, names(rowData(x))) |
|
130 |
+ b <- exp(rowData(x)[[k]]) |
|
131 |
+ vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>% |
|
132 |
+ set_rownames(rownames(x)) %>% |
|
133 |
+ set_colnames(colnames(x)) |
|
134 |
+ }) |
|
135 |
+ d <- rowData(x)$dispersion %>% |
|
136 |
+ set_names(rownames(x)) |
|
117 | 137 |
|
118 | 138 |
sim_mean <- lapply(kids, function(k) |
119 | 139 |
lapply(gids, function(g) |
... | ... |
@@ -133,24 +153,30 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
133 | 153 |
cs_g1 <- sample(cs_ks, ng1, replace = TRUE) |
134 | 154 |
cs_g2 <- sample(cs_ks, ng2, replace = TRUE) |
135 | 155 |
|
136 |
- m_g1 <- m[gs_kc, cs_g1, drop = FALSE] |
|
137 |
- m_g2 <- m[gs_kc, cs_g2, drop = FALSE] |
|
156 |
+ m_g1 <- m[[s]][gs_kc, cs_g1, drop = FALSE] |
|
157 |
+ m_g2 <- m[[s]][gs_kc, cs_g2, drop = FALSE] |
|
138 | 158 |
d_kc <- d[gs_kc] |
139 | 159 |
lfc_kc <- lfc[[c, k]] |
140 | 160 |
|
141 | 161 |
gidx <- gs_idx[[c, k]] |
142 | 162 |
cidx <- c(g1, g2) |
143 | 163 |
|
144 |
- counts <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d = d_kc, lfc = lfc_kc) |
|
145 |
- y[gidx, cidx] <- counts |
|
146 |
- sim_mean[[k]]$A[gidx] <- rowMeans(m_g1) # ... * lfc ?? |
|
147 |
- sim_mean[[k]]$B[gidx] <- rowMeans(m_g2) |
|
164 |
+ re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc) |
|
165 |
+ y[gidx, cidx] <- re$cs |
|
166 |
+ sim_mean[[k]]$A[gidx] <- re$ms$A |
|
167 |
+ sim_mean[[k]]$B[gidx] <- re$ms$B |
|
148 | 168 |
} |
149 | 169 |
} |
150 | 170 |
} |
151 | 171 |
|
152 |
- # construct gene metadata table storing |
|
153 |
- # gene | cluster_id | category | logFC |
|
172 |
+ sim_mean <- sim_mean %>% |
|
173 |
+ map(bind_cols) %>% |
|
174 |
+ bind_rows(.id = "cluster_id") %>% |
|
175 |
+ mutate_at("cluster_id", factor) %>% |
|
176 |
+ mutate(gene = rep(gs, nk)) |
|
177 |
+ |
|
178 |
+ # construct gene metadata table storing ------------------------------------ |
|
179 |
+ # gene | cluster_id | category | logFC, gene, disp, mean used for sim. |
|
154 | 180 |
gi <- data.frame( |
155 | 181 |
gene = unlist(gs_idx), |
156 | 182 |
cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)), |
... | ... |
@@ -159,15 +185,13 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
159 | 185 |
sim_gene = unlist(gs_by_kc), |
160 | 186 |
sim_disp = d[unlist(gs_by_kc)]) %>% |
161 | 187 |
mutate_at("gene", as.character) |
188 |
+ # add true simulation means |
|
189 |
+ gi <- full_join(gi, sim_mean, by = c("gene", "cluster_id")) %>% |
|
190 |
+ rename("sim_mean.A" = "A", "sim_mean.B" = "B") |
|
191 |
+ # reorder |
|
162 | 192 |
o <- order(as.numeric(gsub("[a-z]", "", gi$gene))) |
163 | 193 |
gi <- gi[o, ] %>% set_rownames(NULL) |
164 | 194 |
|
165 |
- a <- unlist(map_depth(sim_mean, 1, "A")) |
|
166 |
- b <- unlist(map_depth(sim_mean, 1, "B")) |
|
167 |
- o <- order(as.numeric(gsub(".*\\.[a-z]+", "", names(a)))) |
|
168 |
- gi$sim_mean.A <- a[o] |
|
169 |
- gi$sim_mean.B <- b[o] |
|
170 |
- |
|
171 | 195 |
# construct SCE |
172 | 196 |
cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = ".")) |
173 | 197 |
m <- match(levels(cd$sample_id), cd$sample_id) |
... | ... |
@@ -187,21 +211,3 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
187 | 211 |
colData = cd, |
188 | 212 |
metadata = md) |
189 | 213 |
} |
190 |
- |
|
191 |
- |
|
192 |
- |
|
193 |
- |
|
194 |
- |
|
195 |
- |
|
196 |
- |
|
197 |
- |
|
198 |
- |
|
199 |
- |
|
200 |
- |
|
201 |
- |
|
202 |
- |
|
203 |
- |
|
204 |
- |
|
205 |
- |
|
206 |
- |
|
207 |
- |
... | ... |
@@ -22,8 +22,8 @@ |
22 | 22 |
#' containing multiple clusters & samples across 2 groups. |
23 | 23 |
#' |
24 | 24 |
#' @examples |
25 |
-#' data(kang) |
|
26 |
-#' simData(kang, |
|
25 |
+#' data(sce) |
|
26 |
+#' simData(sce, |
|
27 | 27 |
#' n_genes = 10, n_cells = 10, |
28 | 28 |
#' p_dd = diag(6)[1, ]) |
29 | 29 |
#' |
... | ... |
@@ -49,14 +49,14 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
49 | 49 |
# c: gene category |
50 | 50 |
|
51 | 51 |
# check validity of input arguments |
52 |
- stopifnot(is(x, "SingleCellExperiment")) |
|
52 |
+ .check_sce(x, req_group = FALSE) |
|
53 | 53 |
stopifnot(is.numeric(n_genes), length(n_genes) == 1) |
54 | 54 |
stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2) |
55 | 55 |
stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1) |
56 | 56 |
stopifnot(is.numeric(fc), is.numeric(fc), fc > 1) |
57 | 57 |
|
58 |
- kids <- levels(colData(x)$cluster_id) |
|
59 |
- sids <- levels(colData(x)$sample_id) |
|
58 |
+ kids <- levels(x$cluster_id) |
|
59 |
+ sids <- levels(x$sample_id) |
|
60 | 60 |
gids <- c("A", "B") |
61 | 61 |
names(kids) <- kids |
62 | 62 |
names(sids) <- sids |
... | ... |
@@ -70,7 +70,7 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
70 | 70 |
|
71 | 71 |
# sample cell metadata |
72 | 72 |
cd <- .sample_cell_md( |
73 |
- n = n_cells, probs = NULL, |
|
73 |
+ n = n_cells, probs = probs, |
|
74 | 74 |
ids = list(kids, sids, gids)) %>% set_rownames(cs) |
75 | 75 |
cs_idx <- .split_cells(cd, by = colnames(cd)) |
76 | 76 |
n_cs <- modify_depth(cs_idx, -1, length) |
... | ... |
@@ -115,6 +115,9 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
115 | 115 |
d <- rowData(x)$dispersion |
116 | 116 |
names(d) <- rownames(x) |
117 | 117 |
|
118 |
+ sim_mean <- lapply(kids, function(k) |
|
119 |
+ lapply(gids, function(g) |
|
120 |
+ setNames(numeric(n_genes), rownames(y)))) |
|
118 | 121 |
for (k in kids) { |
119 | 122 |
for (s in sids) { |
120 | 123 |
for (c in cats[n_dd[, k] != 0]) { |
... | ... |
@@ -135,8 +138,13 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
135 | 138 |
d_kc <- d[gs_kc] |
136 | 139 |
lfc_kc <- lfc[[c, k]] |
137 | 140 |
|
141 |
+ gidx <- gs_idx[[c, k]] |
|
142 |
+ cidx <- c(g1, g2) |
|
143 |
+ |
|
138 | 144 |
counts <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d = d_kc, lfc = lfc_kc) |
139 |
- y[gs_idx[[c, k]], c(g1, g2)] <- counts |
|
145 |
+ y[gidx, cidx] <- counts |
|
146 |
+ sim_mean[[k]]$A[gidx] <- rowMeans(m_g1) # ... * lfc ?? |
|
147 |
+ sim_mean[[k]]$B[gidx] <- rowMeans(m_g2) |
|
140 | 148 |
} |
141 | 149 |
} |
142 | 150 |
} |
... | ... |
@@ -147,11 +155,19 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
147 | 155 |
gene = unlist(gs_idx), |
148 | 156 |
cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)), |
149 | 157 |
category = rep.int(rep(cats, nk), c(n_dd)), |
150 |
- logFC = unlist(lfc)) %>% |
|
158 |
+ logFC = unlist(lfc), |
|
159 |
+ sim_gene = unlist(gs_by_kc), |
|
160 |
+ sim_disp = d[unlist(gs_by_kc)]) %>% |
|
151 | 161 |
mutate_at("gene", as.character) |
152 | 162 |
o <- order(as.numeric(gsub("[a-z]", "", gi$gene))) |
153 | 163 |
gi <- gi[o, ] %>% set_rownames(NULL) |
154 | 164 |
|
165 |
+ a <- unlist(map_depth(sim_mean, 1, "A")) |
|
166 |
+ b <- unlist(map_depth(sim_mean, 1, "B")) |
|
167 |
+ o <- order(as.numeric(gsub(".*\\.[a-z]+", "", names(a)))) |
|
168 |
+ gi$sim_mean.A <- a[o] |
|
169 |
+ gi$sim_mean.B <- b[o] |
|
170 |
+ |
|
155 | 171 |
# construct SCE |
156 | 172 |
cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = ".")) |
157 | 173 |
m <- match(levels(cd$sample_id), cd$sample_id) |
... | ... |
@@ -9,7 +9,9 @@ |
9 | 9 |
#' @param n_genes # of genes to simulate. |
10 | 10 |
#' @param n_cells # of cells to simulate. |
11 | 11 |
#' Either a single numeric or a range to sample from. |
12 |
-#' @param ns nb. of genes common to 1, 2, ..., all clusters. |
|
12 |
+#' @param probs a list of length 3 containing probabilities of a cell belonging |
|
13 |
+#' to each cluster, sample, and group, respectively. List elements must be |
|
14 |
+#' NULL (equal probabilities) or numeric values in [0, 1] that sum to 1. |
|
13 | 15 |
#' @param p_dd numeric vector of length 6. |
14 | 16 |
#' Specifies the probability of a gene being |
15 | 17 |
#' EE, EP, DE, DP, DM, or DB, respectively. |
... | ... |
@@ -23,7 +25,7 @@ |
23 | 25 |
#' data(kang) |
24 | 26 |
#' simData(kang, |
25 | 27 |
#' n_genes = 10, n_cells = 10, |
26 |
-#' p_dd = c(1,0,0,0,0,0)) |
|
28 |
+#' p_dd = diag(6)[1, ]) |
|
27 | 29 |
#' |
28 | 30 |
#' @importFrom data.table data.table |
29 | 31 |
#' @importFrom dplyr mutate_all mutate_at |
... | ... |
@@ -82,12 +82,12 @@ simData <- function(x, n_genes = 500, n_cells = 300, |
82 | 82 |
set_colnames(kids) |
83 | 83 |
gs_idx <- .sample_gene_inds(gs, n_dd) |
84 | 84 |
|
85 |
- # for ea. cluster, sample unique set of genes to simulate from |
|
86 |
- #tmp <- setNames(sample(rownames(x), n_genes, TRUE), gs) |
|
87 |
- #gs_by_k <- replicate(nk, tmp, simplify = FALSE) %>% set_names(kids) |
|
88 |
- gs_by_k <- replicate(nk, |
|
89 |
- setNames(sample(rownames(x), n_genes, TRUE), gs), |
|
90 |
- simplify = FALSE) %>% set_names(kids) |
|
85 |
+ # for ea. cluster, sample set of genes to simulate from |
|
86 |
+ gs_by_k <- setNames(sample(rownames(x), n_genes, TRUE), gs) |
|
87 |
+ gs_by_k <- replicate(nk, gs_by_k, simplify = FALSE) %>% set_names(kids) |
|
88 |
+ #gs_by_k <- replicate(nk, |
|
89 |
+ # setNames(sample(rownames(x), n_genes, TRUE), gs), |
|
90 |
+ # simplify = FALSE) %>% set_names(kids) |
|
91 | 91 |
gs_by_kc <- lapply(kids, function(k) |
92 | 92 |
lapply(cats, function(c) |
93 | 93 |
gs_by_k[[k]][gs_idx[[c, k]]]) %>% |
... | ... |
@@ -35,11 +35,11 @@ |
35 | 35 |
#' @importFrom SummarizedExperiment colData |
36 | 36 |
#' @importFrom S4Vectors split |
37 | 37 |
#' @importFrom tibble column_to_rownames |
38 |
-#' @importFrom zeallot %<-% |
|
39 | 38 |
#' |
40 | 39 |
#' @export |
41 | 40 |
|
42 |
-simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6)[1, ], fc = 2) { |
|
41 |
+simData <- function(x, n_genes = 500, n_cells = 300, |
|
42 |
+ probs = NULL, p_dd = diag(6)[1, ], fc = 2) { |
|
43 | 43 |
|
44 | 44 |
# throughout this code... |
45 | 45 |
# k: cluster ID |
... | ... |
@@ -83,6 +83,8 @@ simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6 |
83 | 83 |
gs_idx <- .sample_gene_inds(gs, n_dd) |
84 | 84 |
|
85 | 85 |
# for ea. cluster, sample unique set of genes to simulate from |
86 |
+ #tmp <- setNames(sample(rownames(x), n_genes, TRUE), gs) |
|
87 |
+ #gs_by_k <- replicate(nk, tmp, simplify = FALSE) %>% set_names(kids) |
|
86 | 88 |
gs_by_k <- replicate(nk, |
87 | 89 |
setNames(sample(rownames(x), n_genes, TRUE), gs), |
88 | 90 |
simplify = FALSE) %>% set_names(kids) |
... | ... |
@@ -28,7 +28,8 @@ |
28 | 28 |
#' @importFrom data.table data.table |
29 | 29 |
#' @importFrom dplyr mutate_all mutate_at |
30 | 30 |
#' @importFrom edgeR DGEList estimateDisp glmFit |
31 |
-#' @importFrom purrr modify_at |
|
31 |
+#' @importFrom magrittr set_colnames set_rownames |
|
32 |
+#' @importFrom purrr modify_at set_names |
|
32 | 33 |
#' @importFrom stats model.matrix rgamma setNames |
33 | 34 |
#' @importFrom SingleCellExperiment SingleCellExperiment |
34 | 35 |
#' @importFrom SummarizedExperiment colData |
... | ... |
@@ -142,8 +142,6 @@ simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6 |
142 | 142 |
gene = unlist(gs_idx), |
143 | 143 |
cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)), |
144 | 144 |
category = rep.int(rep(cats, nk), c(n_dd)), |
145 |
- # mean = , |
|
146 |
- # disp = , |
|
147 | 145 |
logFC = unlist(lfc)) %>% |
148 | 146 |
mutate_at("gene", as.character) |
149 | 147 |
o <- order(as.numeric(gsub("[a-z]", "", gi$gene))) |
... | ... |
@@ -161,7 +159,7 @@ simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6 |
161 | 159 |
md <- list( |
162 | 160 |
experiment_info = ei, |
163 | 161 |
n_cells = table(cd$sample_id), |
164 |
- gene_info = gi, sim_genes = gs_in) |
|
162 |
+ gene_info = gi) |
|
165 | 163 |
|
166 | 164 |
SingleCellExperiment( |
167 | 165 |
assays = list(counts = as.matrix(y)), |
... | ... |
@@ -125,8 +125,8 @@ simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6 |
125 | 125 |
cs_g1 <- sample(cs_ks, ng1, replace = TRUE) |
126 | 126 |
cs_g2 <- sample(cs_ks, ng2, replace = TRUE) |
127 | 127 |
|
128 |
- m_g1 <- m[gs_kc, cs_g1] |
|
129 |
- m_g2 <- m[gs_kc, cs_g2] |
|
128 |
+ m_g1 <- m[gs_kc, cs_g1, drop = FALSE] |
|
129 |
+ m_g2 <- m[gs_kc, cs_g2, drop = FALSE] |
|
130 | 130 |
d_kc <- d[gs_kc] |
131 | 131 |
lfc_kc <- lfc[[c, k]] |
132 | 132 |
|
... | ... |
@@ -15,25 +15,35 @@ |
15 | 15 |
#' EE, EP, DE, DP, DM, or DB, respectively. |
16 | 16 |
#' @param fc numeric value to use as mean logFC |
17 | 17 |
#' for DE, DP, DM, and DB type of genes. |
18 |
-#' @param seed random seed. |
|
18 |
+#' |
|
19 |
+#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} |
|
20 |
+#' containing multiple clusters & samples across 2 groups. |
|
19 | 21 |
#' |
20 | 22 |
#' @examples |
21 | 23 |
#' data(kang) |
22 | 24 |
#' simData(kang, |
23 | 25 |
#' n_genes = 10, n_cells = 10, |
24 |
-#' p_dd = c(1,0,0,0,0,0), seed = 1) |
|
26 |
+#' p_dd = c(1,0,0,0,0,0)) |
|
25 | 27 |
#' |
26 | 28 |
#' @importFrom data.table data.table |
29 |
+#' @importFrom dplyr mutate_all mutate_at |
|
27 | 30 |
#' @importFrom edgeR DGEList estimateDisp glmFit |
31 |
+#' @importFrom purrr modify_at |
|
28 | 32 |
#' @importFrom stats model.matrix rgamma setNames |
29 | 33 |
#' @importFrom SingleCellExperiment SingleCellExperiment |
30 | 34 |
#' @importFrom SummarizedExperiment colData |
31 | 35 |
#' @importFrom S4Vectors split |
36 |
+#' @importFrom tibble column_to_rownames |
|
32 | 37 |
#' @importFrom zeallot %<-% |
33 | 38 |
#' |
34 | 39 |
#' @export |
35 | 40 |
|
36 |
-simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
|
41 |
+simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6)[1, ], fc = 2) { |
|
42 |
+ |
|
43 |
+ # throughout this code... |
|
44 |
+ # k: cluster ID |
|
45 |
+ # s: sample ID |
|
46 |
+ # c: gene category |
|
37 | 47 |
|
38 | 48 |
# check validity of input arguments |
39 | 49 |
stopifnot(is(x, "SingleCellExperiment")) |
... | ... |
@@ -41,157 +51,138 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
41 | 51 |
stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2) |
42 | 52 |
stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1) |
43 | 53 |
stopifnot(is.numeric(fc), is.numeric(fc), fc > 1) |
44 |
- stopifnot(is.numeric(seed), length(seed) == 1) |
|
45 | 54 |
|
46 |
- cluster_ids <- levels(colData(x)$cluster_id) |
|
47 |
- sample_ids <- levels(colData(x)$sample_id) |
|
48 |
- n_clusters <- length(cluster_ids) |
|
49 |
- n_samples <- length(sample_ids) |
|
50 |
- |
|
51 |
- # split cells by cluster-sample |
|
52 |
- dt <- data.table( |
|
53 |
- cell = colnames(x), |
|
54 |
- cluster_id = colData(x)$cluster_id, |
|
55 |
- sample_id = colData(x)$sample_id) |
|
56 |
- dt_split <- split(dt, |
|
57 |
- by = c("cluster_id", "sample_id"), |
|
58 |
- keep.by = FALSE, flatten = FALSE) |
|
59 |
- cells_by_cluster_sample <- sapply(dt_split, sapply, "[[", "cell") |
|
60 |
- |
|
61 |
- # sample nb. of cells to simulate per cluster-sample |
|
62 |
- if (length(n_cells) == 1) { |
|
63 |
- n_cells <- list(rep(n_cells, 2)) |
|
64 |
- } else { |
|
65 |
- n_cells <- replicate(n_clusters * n_samples, |
|
66 |
- list(sample(n_cells[1]:n_cells[2], 2))) |
|
67 |
- } |
|
68 |
- n_cells <- matrix(n_cells, |
|
69 |
- nrow = n_samples, ncol = n_clusters, |
|
70 |
- dimnames = list(sample_ids, cluster_ids)) |
|
55 |
+ kids <- levels(colData(x)$cluster_id) |
|
56 |
+ sids <- levels(colData(x)$sample_id) |
|
57 |
+ gids <- c("A", "B") |
|
58 |
+ names(kids) <- kids |
|
59 |
+ names(sids) <- sids |
|
60 |
+ names(gids) <- gids |
|
61 |
+ nk <- length(kids) |
|
71 | 62 |
|
72 | 63 |
# initialize count matrix |
73 |
- y <- matrix(0, |
|
74 |
- nrow = n_genes, |
|
75 |
- ncol = sum(unlist(n_cells)), |
|
76 |
- dimnames = list( |
|
77 |
- paste0("gene", seq_len(n_genes)), |
|
78 |
- paste0("cell", seq_len(sum(unlist(n_cells)))))) |
|
79 |
- |
|
80 |
- # sample nb. of genes to simulate per category |
|
81 |
- ndd <- replicate(n_clusters, { |
|
82 |
- ns <- sample(cats, n_genes, replace = TRUE, prob = p_dd) |
|
83 |
- factor(ns, levels = cats) |
|
84 |
- }, simplify = FALSE) |
|
85 |
- ndd <- sapply(ndd, table) |
|
86 |
- colnames(ndd) <- cluster_ids |
|
87 |
- |
|
88 |
- # sample gene indices |
|
89 |
- is <- sapply(cluster_ids, function(c, gs = rownames(y)) |
|
90 |
- sapply(cats, function(cat) { |
|
91 |
- n <- ndd[cat, c] |
|
92 |
- x <- sample(gs, n) |
|
93 |
- gs <<- setdiff(gs, x) |
|
94 |
- return(x) })) |
|
95 |
- |
|
96 |
- # sample cell indices |
|
97 |
- cs <- colnames(y) |
|
98 |
- js <- sapply(cluster_ids, function(c) |
|
99 |
- setNames(lapply(sample_ids, function(s) |
|
100 |
- lapply(n_cells[[s, c]], function(n) { |
|
101 |
- x <- sample(cs, n) |
|
102 |
- cs <<- setdiff(cs, x) |
|
103 |
- return(x) })), sample_ids)) |
|
104 |
- |
|
105 |
- # sample genes to simulate from |
|
106 |
- gs <- replicate(n_clusters, sample(rownames(x), n_genes, replace = TRUE)) |
|
107 |
- rownames(gs) <- rownames(y) |
|
108 |
- colnames(gs) <- cluster_ids |
|
109 |
- |
|
110 |
- # sample fold-changes |
|
111 |
- lfcs <- sapply(cluster_ids, function(k) |
|
112 |
- sapply(cats, function(c) { |
|
113 |
- n <- ndd[c, k] |
|
64 |
+ gs <- paste0("gene", seq_len(n_genes)) |
|
65 |
+ cs <- paste0("cell", seq_len(n_cells)) |
|
66 |
+ y <- matrix(0, n_genes, n_cells, dimnames = list(gs, cs)) |
|
67 |
+ |
|
68 |
+ # sample cell metadata |
|
69 |
+ cd <- .sample_cell_md( |
|
70 |
+ n = n_cells, probs = NULL, |
|
71 |
+ ids = list(kids, sids, gids)) %>% set_rownames(cs) |
|
72 |
+ cs_idx <- .split_cells(cd, by = colnames(cd)) |
|
73 |
+ n_cs <- modify_depth(cs_idx, -1, length) |
|
74 |
+ |
|
75 |
+ # split input cells by cluster-sample |
|
76 |
+ cs_by_ks <- .split_cells(x) |
|
77 |
+ |
|
78 |
+ # sample nb. of genes to simulate per category & gene indices |
|
79 |
+ n_dd <- replicate(nk, |
|
80 |
+ table(sample(factor(cats, levels = cats), n_genes, TRUE, p_dd))) %>% |
|
81 |
+ set_colnames(kids) |
|
82 |
+ gs_idx <- .sample_gene_inds(gs, n_dd) |
|
83 |
+ |
|
84 |
+ # for ea. cluster, sample unique set of genes to simulate from |
|
85 |
+ gs_by_k <- replicate(nk, |
|
86 |
+ setNames(sample(rownames(x), n_genes, TRUE), gs), |
|
87 |
+ simplify = FALSE) %>% set_names(kids) |
|
88 |
+ gs_by_kc <- lapply(kids, function(k) |
|
89 |
+ lapply(cats, function(c) |
|
90 |
+ gs_by_k[[k]][gs_idx[[c, k]]]) %>% |
|
91 |
+ set_names(cats)) |
|
92 |
+ |
|
93 |
+ # sample logFCs |
|
94 |
+ lfc <- vapply(kids, function(k) |
|
95 |
+ lapply(cats, function(c) { |
|
96 |
+ n <- n_dd[c, k] |
|
114 | 97 |
if (c == "ee") return(rep(NA, n)) |
115 |
- signs <- sample(c(-1, 1), size = n, replace = TRUE) |
|
116 |
- lfcs <- rgamma(n, 4, 4 / fc) * signs |
|
117 |
- names(lfcs) <- gs[is[[c, k]], k] |
|
118 |
- return(lfcs) |
|
119 |
- })) |
|
120 |
- |
|
121 |
- for (k in cluster_ids) { |
|
122 |
- # get NB parameters |
|
123 |
- m <- rowData(x)[gs[, k], ]$beta |
|
124 |
- d <- rowData(x)[gs[, k], ]$dispersion |
|
125 |
- names(m) <- names(d) <- gs[, k] |
|
126 |
- |
|
127 |
- for (s in sample_ids) { |
|
128 |
- # cells to simulate from |
|
129 |
- cs <- cells_by_cluster_sample[[s, k]] |
|
130 |
- |
|
131 |
- # compute mus |
|
132 |
- o <- setNames(colData(x)[cs, ]$offset, cs) |
|
133 |
- mu <- sapply(exp(o), "*", exp(m)) |
|
134 |
- |
|
135 |
- # get cell indices & nb. of cells by group |
|
136 |
- ng1 <- length(g1 <- js[[s, k]][[1]]) |
|
137 |
- ng2 <- length(g2 <- js[[s, k]][[2]]) |
|
138 |
- |
|
139 |
- # simulate data |
|
140 |
- for (c in cats) |
|
141 |
- if (ndd[c, k] > 0) y[is[[c, k]], c(g1, g2)] <- |
|
142 |
- simdd(c, gs[is[[c, k]], k], cs, ng1, ng2, mu, d, lfcs[[c, k]]) |
|
98 |
+ signs <- sample(c(-1, 1), n, TRUE) |
|
99 |
+ lfc <- rgamma(n, 4, 4/fc) * signs |
|
100 |
+ names(lfc) <- gs_by_kc[[k]][[c]] |
|
101 |
+ return(lfc) |
|
102 |
+ }), vector("list", length(cats))) %>% |
|
103 |
+ set_rownames(cats) |
|
104 |
+ |
|
105 |
+ # compute NB parameters |
|
106 |
+ b <- exp(rowData(x)$beta) |
|
107 |
+ o <- exp(colData(x)$offset) |
|
108 |
+ m <- vapply(o, function(l) b*l, numeric(nrow(x))) |
|
109 |
+ dimnames(m) <- dimnames(x) |
|
110 |
+ d <- rowData(x)$dispersion |
|
111 |
+ names(d) <- rownames(x) |
|
112 |
+ |
|
113 |
+ for (k in kids) { |
|
114 |
+ for (s in sids) { |
|
115 |
+ for (c in cats[n_dd[, k] != 0]) { |
|
116 |
+ gs_kc <- gs_by_kc[[k]][[c]] |
|
117 |
+ cs_ks <- cs_by_ks[[k]][[s]] |
|
118 |
+ |
|
119 |
+ g1 <- cs_idx[[k]][[s]]$A |
|
120 |
+ g2 <- cs_idx[[k]][[s]]$B |
|
121 |
+ |
|
122 |
+ ng1 <- length(g1) |
|
123 |
+ ng2 <- length(g2) |
|
124 |
+ |
|
125 |
+ cs_g1 <- sample(cs_ks, ng1, replace = TRUE) |
|
126 |
+ cs_g2 <- sample(cs_ks, ng2, replace = TRUE) |
|
127 |
+ |
|
128 |
+ m_g1 <- m[gs_kc, cs_g1] |
|
129 |
+ m_g2 <- m[gs_kc, cs_g2] |
|
130 |
+ d_kc <- d[gs_kc] |
|
131 |
+ lfc_kc <- lfc[[c, k]] |
|
132 |
+ |
|
133 |
+ counts <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d = d_kc, lfc = lfc_kc) |
|
134 |
+ y[gs_idx[[c, k]], c(g1, g2)] <- counts |
|
135 |
+ } |
|
143 | 136 |
} |
144 | 137 |
} |
145 | 138 |
|
146 |
- # construct SCE |
|
147 |
- gi <- do.call(rbind, lapply(cluster_ids, function(k) |
|
148 |
- do.call(rbind, lapply(cats, function(c) if (ndd[c, k] != 0) |
|
149 |
- data.frame( |
|
150 |
- gene = is[[c, k]], cluster_id = k, |
|
151 |
- category = c, logFC = lfcs[[c, k]]))))) |
|
152 |
- gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ] |
|
153 |
- gi$category <- factor(gi$category, levels = ddSingleCell:::cats) |
|
154 |
- rownames(gi) <- NULL |
|
139 |
+ # construct gene metadata table storing |
|
140 |
+ # gene | cluster_id | category | logFC |
|
141 |
+ gi <- data.frame( |
|
142 |
+ gene = unlist(gs_idx), |
|
143 |
+ cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)), |
|
144 |
+ category = rep.int(rep(cats, nk), c(n_dd)), |
|
145 |
+ # mean = , |
|
146 |
+ # disp = , |
|
147 |
+ logFC = unlist(lfc)) %>% |
|
148 |
+ mutate_at("gene", as.character) |
|
149 |
+ o <- order(as.numeric(gsub("[a-z]", "", gi$gene))) |
|
150 |
+ gi <- gi[o, ] %>% set_rownames(NULL) |
|
155 | 151 |
|
156 |
- col_data <- do.call(rbind, lapply(cluster_ids, function(c) |
|
157 |
- do.call(rbind, lapply(sample_ids, function(s) |
|
158 |
- data.frame( |
|
159 |
- row.names = 1, |
|
160 |
- unlist(js[[s, c]]), |
|
161 |
- cluster_id = c, sample_id = s, |
|
162 |
- group_id = rep.int(c("A", "B"), n_cells[[s, c]])))))) |
|
163 |
- col_data <- col_data[colnames(y), ] |
|
164 |
- col_data$sample_id <- factor(paste(col_data$group_id, col_data$sample_id, sep = ".")) |
|
152 |
+ # construct SCE |
|
153 |
+ cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = ".")) |
|
154 |
+ m <- match(levels(cd$sample_id), cd$sample_id) |
|
155 |
+ gids <- cd$group_id[m] |
|
156 |
+ o <- order(gids) |
|
157 |
+ sids <- levels(cd$sample_id)[o] |
|
158 |
+ ei <- data.frame(sample_id = sids, group_id = gids[o]) |
|
159 |
+ cd <- cd %>% mutate_at("sample_id", factor, levels = sids) |
|
165 | 160 |
|
166 |
- sample_id <- levels(col_data$sample_id) |
|
167 |
- group_id <- gsub("(A|B)[.].*", "\\1", sample_id) |
|
168 |
- ei <- data.frame(sample_id, group_id) |
|
169 | 161 |
md <- list( |
170 | 162 |
experiment_info = ei, |
171 |
- n_cells = table(col_data$sample_id), |
|
172 |
- gene_info = gi, sim_genes = gs) |
|
163 |
+ n_cells = table(cd$sample_id), |
|
164 |
+ gene_info = gi, sim_genes = gs_in) |
|
173 | 165 |
|
174 | 166 |
SingleCellExperiment( |
175 |
- assays = list(counts = y), |
|
176 |
- colData = col_data, |
|
167 |
+ assays = list(counts = as.matrix(y)), |
|
168 |
+ colData = cd, |
|
177 | 169 |
metadata = md) |
178 | 170 |
} |
179 |
- |
|
180 |
- |
|
181 |
- |
|
182 |
- |
|
183 |
- |
|
184 |
- |
|
185 |
- |
|
186 |
- |
|
187 |
- |
|
188 |
- |
|
189 |
- |
|
190 |
- |
|
191 |
- |
|
192 |
- |
|
193 |
- |
|
194 |
- |
|
195 |
- |
|
196 |
- |
|
197 |
- |
|
198 | 171 |
\ No newline at end of file |
172 |
+ |
|
173 |
+ |
|
174 |
+ |
|
175 |
+ |
|
176 |
+ |
|
177 |
+ |
|
178 |
+ |
|
179 |
+ |
|
180 |
+ |
|
181 |
+ |
|
182 |
+ |
|
183 |
+ |
|
184 |
+ |
|
185 |
+ |
|
186 |
+ |
|
187 |
+ |
|
188 |
+ |
|
189 |
+ |
... | ... |
@@ -13,6 +13,8 @@ |
13 | 13 |
#' @param p_dd numeric vector of length 6. |
14 | 14 |
#' Specifies the probability of a gene being |
15 | 15 |
#' EE, EP, DE, DP, DM, or DB, respectively. |
16 |
+#' @param fc numeric value to use as mean logFC |
|
17 |
+#' for DE, DP, DM, and DB type of genes. |
|
16 | 18 |
#' @param seed random seed. |
17 | 19 |
#' |
18 | 20 |
#' @examples |
... | ... |
@@ -21,10 +23,11 @@ |
21 | 23 |
#' n_genes = 10, n_cells = 10, |
22 | 24 |
#' p_dd = c(1,0,0,0,0,0), seed = 1) |
23 | 25 |
#' |
24 |
-#' @import SingleCellExperiment |
|
25 | 26 |
#' @importFrom data.table data.table |
26 | 27 |
#' @importFrom edgeR DGEList estimateDisp glmFit |
27 |
-#' @importFrom stats model.matrix rnbinom setNames |
|
28 |
+#' @importFrom stats model.matrix rgamma setNames |
|
29 |
+#' @importFrom SingleCellExperiment SingleCellExperiment |
|
30 |
+#' @importFrom SummarizedExperiment colData |
|
28 | 31 |
#' @importFrom S4Vectors split |
29 | 32 |
#' @importFrom zeallot %<-% |
30 | 33 |
#' |
... | ... |
@@ -33,7 +36,7 @@ |
33 | 36 |
simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
34 | 37 |
|
35 | 38 |
# check validity of input arguments |
36 |
- stopifnot(class(x) == "SingleCellExperiment") |
|
39 |
+ stopifnot(is(x, "SingleCellExperiment")) |
|
37 | 40 |
stopifnot(is.numeric(n_genes), length(n_genes) == 1) |
38 | 41 |
stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2) |
39 | 42 |
stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1) |
... | ... |
@@ -108,35 +111,35 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
108 | 111 |
lfcs <- sapply(cluster_ids, function(k) |
109 | 112 |
sapply(cats, function(c) { |
110 | 113 |
n <- ndd[c, k] |
111 |
- if (c %in% c("ee", "ep")) return(rep(NA, n)) |
|
114 |
+ if (c == "ee") return(rep(NA, n)) |
|
112 | 115 |
signs <- sample(c(-1, 1), size = n, replace = TRUE) |
113 | 116 |
lfcs <- rgamma(n, 4, 4 / fc) * signs |
114 | 117 |
names(lfcs) <- gs[is[[c, k]], k] |
115 | 118 |
return(lfcs) |
116 | 119 |
})) |
117 | 120 |
|
118 |
- for (c in cluster_ids) { |
|
121 |
+ for (k in cluster_ids) { |
|
119 | 122 |
# get NB parameters |
120 |
- m <- rowData(x)[gs[, c], ]$beta |
|
121 |
- d <- rowData(x)[gs[, c], ]$dispersion |
|
122 |
- names(m) <- names(d) <- gs[, c] |
|
123 |
+ m <- rowData(x)[gs[, k], ]$beta |
|
124 |
+ d <- rowData(x)[gs[, k], ]$dispersion |
|
125 |
+ names(m) <- names(d) <- gs[, k] |
|
123 | 126 |
|
124 | 127 |
for (s in sample_ids) { |
125 | 128 |
# cells to simulate from |
126 |
- cs <- cells_by_cluster_sample[[s, c]] |
|
129 |
+ cs <- cells_by_cluster_sample[[s, k]] |
|
127 | 130 |
|
128 | 131 |
# compute mus |
129 | 132 |
o <- setNames(colData(x)[cs, ]$offset, cs) |
130 | 133 |
mu <- sapply(exp(o), "*", exp(m)) |
131 | 134 |
|
132 | 135 |
# get cell indices & nb. of cells by group |
133 |
- ng1 <- length(g1 <- js[[s, c]][[1]]) |
|
134 |
- ng2 <- length(g2 <- js[[s, c]][[2]]) |
|
136 |
+ ng1 <- length(g1 <- js[[s, k]][[1]]) |
|
137 |
+ ng2 <- length(g2 <- js[[s, k]][[2]]) |
|
135 | 138 |
|
136 | 139 |
# simulate data |
137 |
- for (cat in cats) |
|
138 |
- if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <- |
|
139 |
- simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, lfcs[[cat, c]]) |
|
140 |
+ for (c in cats) |
|
141 |
+ if (ndd[c, k] > 0) y[is[[c, k]], c(g1, g2)] <- |
|
142 |
+ simdd(c, gs[is[[c, k]], k], cs, ng1, ng2, mu, d, lfcs[[c, k]]) |
|
140 | 143 |
} |
141 | 144 |
} |
142 | 145 |
|
... | ... |
@@ -156,13 +159,13 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
156 | 159 |
row.names = 1, |
157 | 160 |
unlist(js[[s, c]]), |
158 | 161 |
cluster_id = c, sample_id = s, |
159 |
- group = rep.int(c("A", "B"), n_cells[[s, c]])))))) |
|
162 |
+ group_id = rep.int(c("A", "B"), n_cells[[s, c]])))))) |
|
160 | 163 |
col_data <- col_data[colnames(y), ] |
161 |
- col_data$sample_id <- factor(paste(col_data$group, col_data$sample_id, sep = ".")) |
|
164 |
+ col_data$sample_id <- factor(paste(col_data$group_id, col_data$sample_id, sep = ".")) |
|
162 | 165 |
|
163 | 166 |
sample_id <- levels(col_data$sample_id) |
164 |
- group <- gsub("(A|B)[.].*", "\\1", sample_id) |
|
165 |
- ei <- data.frame(sample_id, group) |
|
167 |
+ group_id <- gsub("(A|B)[.].*", "\\1", sample_id) |
|
168 |
+ ei <- data.frame(sample_id, group_id) |
|
166 | 169 |
md <- list( |
167 | 170 |
experiment_info = ei, |
168 | 171 |
n_cells = table(col_data$sample_id), |
... | ... |
@@ -105,13 +105,14 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
105 | 105 |
colnames(gs) <- cluster_ids |
106 | 106 |
|
107 | 107 |
# sample fold-changes |
108 |
- fcs <- sapply(cluster_ids, function(k) |
|
108 |
+ lfcs <- sapply(cluster_ids, function(k) |
|
109 | 109 |
sapply(cats, function(c) { |
110 | 110 |
n <- ndd[c, k] |
111 |
+ if (c %in% c("ee", "ep")) return(rep(NA, n)) |
|
111 | 112 |
signs <- sample(c(-1, 1), size = n, replace = TRUE) |
112 |
- fcs <- 2 ^ ( rgamma(n, 4, 4 / fc) * signs ) |
|
113 |
- names(fcs) <- gs[is[[c, k]], k] |
|
114 |
- return(fcs) |
|
113 |
+ lfcs <- rgamma(n, 4, 4 / fc) * signs |
|
114 |
+ names(lfcs) <- gs[is[[c, k]], k] |
|
115 |
+ return(lfcs) |
|
115 | 116 |
})) |
116 | 117 |
|
117 | 118 |
for (c in cluster_ids) { |
... | ... |
@@ -135,14 +136,16 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
135 | 136 |
# simulate data |
136 | 137 |
for (cat in cats) |
137 | 138 |
if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <- |
138 |
- simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fcs[[cat, c]]) |
|
139 |
+ simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, lfcs[[cat, c]]) |
|
139 | 140 |
} |
140 | 141 |
} |
141 | 142 |
|
142 | 143 |
# construct SCE |
143 |
- gi <- do.call(rbind, lapply(cluster_ids, function(c) |
|
144 |
- do.call(rbind, lapply(cats, function(cat) if (ndd[cat, c] != 0) |
|
145 |
- data.frame(gene = is[[cat, c]], cluster_id = c, category = cat))))) |
|
144 |
+ gi <- do.call(rbind, lapply(cluster_ids, function(k) |
|
145 |
+ do.call(rbind, lapply(cats, function(c) if (ndd[c, k] != 0) |
|
146 |
+ data.frame( |
|
147 |
+ gene = is[[c, k]], cluster_id = k, |
|
148 |
+ category = c, logFC = lfcs[[c, k]]))))) |
|
146 | 149 |
gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ] |
147 | 150 |
gi$category <- factor(gi$category, levels = ddSingleCell:::cats) |
148 | 151 |
rownames(gi) <- NULL |
... | ... |
@@ -109,8 +109,9 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
109 | 109 |
sapply(cats, function(c) { |
110 | 110 |
n <- ndd[c, k] |
111 | 111 |
signs <- sample(c(-1, 1), size = n, replace = TRUE) |
112 |
- fcs <- 2 ^ ( rgamma(n, 4 , 4 / fc) * signs ) |
|
113 |
- setNames(fcs, gs[is[[c, k]], k]) |
|
112 |
+ fcs <- 2 ^ ( rgamma(n, 4, 4 / fc) * signs ) |
|
113 |
+ names(fcs) <- gs[is[[c, k]], k] |
|
114 |
+ return(fcs) |
|
114 | 115 |
})) |
115 | 116 |
|
116 | 117 |
for (c in cluster_ids) { |
... | ... |
@@ -32,11 +32,12 @@ |
32 | 32 |
|
33 | 33 |
simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
34 | 34 |
|
35 |
+ # check validity of input arguments |
|
35 | 36 |
stopifnot(class(x) == "SingleCellExperiment") |
36 | 37 |
stopifnot(is.numeric(n_genes), length(n_genes) == 1) |
37 | 38 |
stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2) |
38 | 39 |
stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1) |
39 |
- stopifnot(is.numeric(fc), is.numeric(fc)) |
|
40 |
+ stopifnot(is.numeric(fc), is.numeric(fc), fc > 1) |
|
40 | 41 |
stopifnot(is.numeric(seed), length(seed) == 1) |
41 | 42 |
|
42 | 43 |
cluster_ids <- levels(colData(x)$cluster_id) |
... | ... |
@@ -47,11 +48,12 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
47 | 48 |
# split cells by cluster-sample |
48 | 49 |
dt <- data.table( |
49 | 50 |
cell = colnames(x), |
50 |
- data.frame(colData(x))) |
|
51 |
- cells <- dt %>% split( |
|
51 |
+ cluster_id = colData(x)$cluster_id, |
|
52 |
+ sample_id = colData(x)$sample_id) |
|
53 |
+ dt_split <- split(dt, |
|
52 | 54 |
by = c("cluster_id", "sample_id"), |
53 | 55 |
keep.by = FALSE, flatten = FALSE) |
54 |
- cells <- sapply(cells, sapply, "[[", "cell") |
|
56 |
+ cells_by_cluster_sample <- sapply(dt_split, sapply, "[[", "cell") |
|
55 | 57 |
|
56 | 58 |
# sample nb. of cells to simulate per cluster-sample |
57 | 59 |
if (length(n_cells) == 1) { |
... | ... |
@@ -98,10 +100,19 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
98 | 100 |
return(x) })), sample_ids)) |
99 | 101 |
|
100 | 102 |
# sample genes to simulate from |
101 |
- gs <- replicate(n_clusters, sample(rownames(x), n_genes)) |
|
103 |
+ gs <- replicate(n_clusters, sample(rownames(x), n_genes, replace = TRUE)) |
|
102 | 104 |
rownames(gs) <- rownames(y) |
103 | 105 |
colnames(gs) <- cluster_ids |
104 | 106 |
|
107 |
+ # sample fold-changes |
|
108 |
+ fcs <- sapply(cluster_ids, function(k) |
|
109 |
+ sapply(cats, function(c) { |
|
110 |
+ n <- ndd[c, k] |
|
111 |
+ signs <- sample(c(-1, 1), size = n, replace = TRUE) |
|
112 |
+ fcs <- 2 ^ ( rgamma(n, 4 , 4 / fc) * signs ) |
|
113 |
+ setNames(fcs, gs[is[[c, k]], k]) |
|
114 |
+ })) |
|
115 |
+ |
|
105 | 116 |
for (c in cluster_ids) { |
106 | 117 |
# get NB parameters |
107 | 118 |
m <- rowData(x)[gs[, c], ]$beta |
... | ... |
@@ -110,7 +121,7 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
110 | 121 |
|
111 | 122 |
for (s in sample_ids) { |
112 | 123 |
# cells to simulate from |
113 |
- cs <- cells[[s, c]] |
|
124 |
+ cs <- cells_by_cluster_sample[[s, c]] |
|
114 | 125 |
|
115 | 126 |
# compute mus |
116 | 127 |
o <- setNames(colData(x)[cs, ]$offset, cs) |
... | ... |
@@ -123,16 +134,16 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
123 | 134 |
# simulate data |
124 | 135 |
for (cat in cats) |
125 | 136 |
if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <- |
126 |
- simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fc) |
|
137 |
+ simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fcs[[cat, c]]) |
|
127 | 138 |
} |
128 | 139 |
} |
129 | 140 |
|
130 | 141 |
# construct SCE |
131 | 142 |
gi <- do.call(rbind, lapply(cluster_ids, function(c) |
132 | 143 |
do.call(rbind, lapply(cats, function(cat) if (ndd[cat, c] != 0) |
133 |
- data.frame(genes = is[[cat, c]], cluster_id = c, category = cat))))) |
|
144 |
+ data.frame(gene = is[[cat, c]], cluster_id = c, category = cat))))) |
|
134 | 145 |
gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ] |
135 |
- levels(gi$category) <- c("ee", "ep", "de", "dp", "dm", "db") |
|
146 |
+ gi$category <- factor(gi$category, levels = ddSingleCell:::cats) |
|
136 | 147 |
rownames(gi) <- NULL |
137 | 148 |
|
138 | 149 |
col_data <- do.call(rbind, lapply(cluster_ids, function(c) |
... | ... |
@@ -16,21 +16,28 @@ |
16 | 16 |
#' @param seed random seed. |
17 | 17 |
#' |
18 | 18 |
#' @examples |
19 |
-#' data(kang_se, kang_fit) |
|
20 |
-#' simDD(kang_se, kang_fit, |
|
19 |
+#' data(kang) |
|
20 |
+#' simData(kang, |
|
21 | 21 |
#' n_genes = 10, n_cells = 10, |
22 | 22 |
#' p_dd = c(1,0,0,0,0,0), seed = 1) |
23 | 23 |
#' |
24 | 24 |
#' @import SingleCellExperiment |
25 | 25 |
#' @importFrom data.table data.table |
26 | 26 |
#' @importFrom edgeR DGEList estimateDisp glmFit |
27 |
-#' @importFrom stats model.matrix rnbinom |
|
27 |
+#' @importFrom stats model.matrix rnbinom setNames |
|
28 | 28 |
#' @importFrom S4Vectors split |
29 | 29 |
#' @importFrom zeallot %<-% |
30 | 30 |
#' |
31 | 31 |
#' @export |
32 | 32 |
|
33 |
-simData <- function(x, n_genes, n_cells, p_dd) { |
|
33 |
+simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) { |
|
34 |
+ |
|
35 |
+ stopifnot(class(x) == "SingleCellExperiment") |
|
36 |
+ stopifnot(is.numeric(n_genes), length(n_genes) == 1) |
|
37 |
+ stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2) |
|
38 |
+ stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1) |
|
39 |
+ stopifnot(is.numeric(fc), is.numeric(fc)) |
|
40 |
+ stopifnot(is.numeric(seed), length(seed) == 1) |
|
34 | 41 |
|
35 | 42 |
cluster_ids <- levels(colData(x)$cluster_id) |
36 | 43 |
sample_ids <- levels(colData(x)$sample_id) |
... | ... |
@@ -97,19 +104,16 @@ simData <- function(x, n_genes, n_cells, p_dd) { |
97 | 104 |
|
98 | 105 |
for (c in cluster_ids) { |
99 | 106 |
# get NB parameters |
100 |
- m <- rowData(x)$mean[gs[, c]] |
|
101 |
- d <- rowData(x)$dispersion[gs[, c]] |
|
102 |
- |
|
103 |
- # get gene indices & nb. of genes by category |
|
104 |
- c(iee, iep, ide, idp, idm, idb) %<-% sapply(cats, function(i) is[i, c]) |
|
105 |
- c(nee, nep, nde, ndp, ndm, ndb) %<-% vapply(cats, function(i) ndd[i, c], numeric(1)) |
|
107 |
+ m <- rowData(x)[gs[, c], ]$beta |
|
108 |
+ d <- rowData(x)[gs[, c], ]$dispersion |
|
109 |
+ names(m) <- names(d) <- gs[, c] |
|
106 | 110 |
|
107 | 111 |
for (s in sample_ids) { |
108 | 112 |
# cells to simulate from |
109 | 113 |
cs <- cells[[s, c]] |
110 | 114 |
|
111 | 115 |
# compute mus |
112 |
- o <- colData(x)$offset[cs] |
|
116 |
+ o <- setNames(colData(x)[cs, ]$offset, cs) |
|
113 | 117 |
mu <- sapply(exp(o), "*", exp(m)) |
114 | 118 |
|
115 | 119 |
# get cell indices & nb. of cells by group |
... | ... |
@@ -117,12 +121,9 @@ simData <- function(x, n_genes, n_cells, p_dd) { |
117 | 121 |
ng2 <- length(g2 <- js[[s, c]][[2]]) |
118 | 122 |
|
119 | 123 |
# simulate data |
120 |
- if (nee > 0) y[iee, c(g1, g2)] <- simdd("ee", gs[iee, c], cs, ng1, ng2, mu, d) |
|
121 |
- if (nep > 0) y[iep, c(g1, g2)] <- simdd("ep", gs[iep, c], cs, ng1, ng2, mu, d) |
|
122 |
- if (nde > 0) y[ide, c(g1, g2)] <- simdd("de", gs[ide, c], cs, ng1, ng2, mu, d) |
|
123 |
- if (ndp > 0) y[idp, c(g1, g2)] <- simdd("dp", gs[idp, c], cs, ng1, ng2, mu, d) |
|
124 |
- if (ndm > 0) y[idm, c(g1, g2)] <- simdd("dm", gs[idm, c], cs, ng1, ng2, mu, d) |
|
125 |
- if (ndb > 0) y[idb, c(g1, g2)] <- simdd("db", gs[idb, c], cs, ng1, ng2, mu, d) |
|
124 |
+ for (cat in cats) |
|
125 |
+ if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <- |
|
126 |
+ simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fc) |
|
126 | 127 |
} |
127 | 128 |
} |
128 | 129 |
|
1 | 1 |
new file mode 100755 |
... | ... |
@@ -0,0 +1,178 @@ |
1 |
+#' simData |
|
2 |
+#' |
|
3 |
+#' Simulation of complex scRNA-seq data |
|
4 |
+#' |
|
5 |
+#' \code{simData} simulates multiple clusters and samples |
|
6 |
+#' across 2 experimental conditions from a real scRNA-seq data set. |
|
7 |
+#' |
|
8 |
+#' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}. |
|
9 |
+#' @param n_genes # of genes to simulate. |
|
10 |
+#' @param n_cells # of cells to simulate. |
|
11 |
+#' Either a single numeric or a range to sample from. |
|
12 |
+#' @param ns nb. of genes common to 1, 2, ..., all clusters. |
|
13 |
+#' @param p_dd numeric vector of length 6. |
|
14 |
+#' Specifies the probability of a gene being |
|
15 |
+#' EE, EP, DE, DP, DM, or DB, respectively. |
|
16 |
+#' @param seed random seed. |
|
17 |
+#' |
|
18 |
+#' @examples |
|
19 |
+#' data(kang_se, kang_fit) |
|
20 |
+#' simDD(kang_se, kang_fit, |
|
21 |
+#' n_genes = 10, n_cells = 10, |
|
22 |
+#' p_dd = c(1,0,0,0,0,0), seed = 1) |
|
23 |
+#' |
|
24 |
+#' @import SingleCellExperiment |
|
25 |
+#' @importFrom data.table data.table |
|
26 |
+#' @importFrom edgeR DGEList estimateDisp glmFit |
|
27 |
+#' @importFrom stats model.matrix rnbinom |
|
28 |
+#' @importFrom S4Vectors split |
|
29 |
+#' @importFrom zeallot %<-% |
|
30 |
+#' |
|
31 |
+#' @export |
|
32 |
+ |
|
33 |
+simData <- function(x, n_genes, n_cells, p_dd) { |
|
34 |
+ |
|
35 |
+ cluster_ids <- levels(colData(x)$cluster_id) |
|
36 |
+ sample_ids <- levels(colData(x)$sample_id) |
|
37 |
+ n_clusters <- length(cluster_ids) |
|
38 |
+ n_samples <- length(sample_ids) |
|
39 |
+ |
|
40 |
+ # split cells by cluster-sample |
|
41 |
+ dt <- data.table( |
|
42 |
+ cell = colnames(x), |
|
43 |
+ data.frame(colData(x))) |
|
44 |
+ cells <- dt %>% split( |
|
45 |
+ by = c("cluster_id", "sample_id"), |
|
46 |
+ keep.by = FALSE, flatten = FALSE) |
|
47 |
+ cells <- sapply(cells, sapply, "[[", "cell") |
|
48 |
+ |
|
49 |
+ # sample nb. of cells to simulate per cluster-sample |
|
50 |
+ if (length(n_cells) == 1) { |
|
51 |
+ n_cells <- list(rep(n_cells, 2)) |
|
52 |
+ } else { |
|
53 |
+ n_cells <- replicate(n_clusters * n_samples, |
|
54 |
+ list(sample(n_cells[1]:n_cells[2], 2))) |
|
55 |
+ } |
|
56 |
+ n_cells <- matrix(n_cells, |
|
57 |
+ nrow = n_samples, ncol = n_clusters, |
|
58 |
+ dimnames = list(sample_ids, cluster_ids)) |
|
59 |
+ |
|
60 |
+ # initialize count matrix |
|
61 |
+ y <- matrix(0, |
|
62 |
+ nrow = n_genes, |
|
63 |
+ ncol = sum(unlist(n_cells)), |
|
64 |
+ dimnames = list( |
|
65 |
+ paste0("gene", seq_len(n_genes)), |
|
66 |
+ paste0("cell", seq_len(sum(unlist(n_cells)))))) |
|
67 |
+ |
|
68 |
+ # sample nb. of genes to simulate per category |
|
69 |
+ ndd <- replicate(n_clusters, { |
|
70 |
+ ns <- sample(cats, n_genes, replace = TRUE, prob = p_dd) |
|
71 |
+ factor(ns, levels = cats) |
|
72 |
+ }, simplify = FALSE) |
|
73 |
+ ndd <- sapply(ndd, table) |
|
74 |
+ colnames(ndd) <- cluster_ids |
|
75 |
+ |
|
76 |
+ # sample gene indices |
|
77 |
+ is <- sapply(cluster_ids, function(c, gs = rownames(y)) |
|
78 |
+ sapply(cats, function(cat) { |
|
79 |
+ n <- ndd[cat, c] |
|
80 |
+ x <- sample(gs, n) |
|
81 |
+ gs <<- setdiff(gs, x) |
|
82 |
+ return(x) })) |
|
83 |
+ |
|
84 |
+ # sample cell indices |
|
85 |
+ cs <- colnames(y) |
|
86 |
+ js <- sapply(cluster_ids, function(c) |
|
87 |
+ setNames(lapply(sample_ids, function(s) |
|
88 |
+ lapply(n_cells[[s, c]], function(n) { |
|
89 |
+ x <- sample(cs, n) |
|
90 |
+ cs <<- setdiff(cs, x) |
|
91 |
+ return(x) })), sample_ids)) |
|
92 |
+ |
|
93 |
+ # sample genes to simulate from |
|
94 |
+ gs <- replicate(n_clusters, sample(rownames(x), n_genes)) |
|
95 |
+ rownames(gs) <- rownames(y) |
|
96 |
+ colnames(gs) <- cluster_ids |
|
97 |
+ |
|
98 |
+ for (c in cluster_ids) { |
|
99 |
+ # get NB parameters |
|
100 |
+ m <- rowData(x)$mean[gs[, c]] |
|
101 |
+ d <- rowData(x)$dispersion[gs[, c]] |
|
102 |
+ |
|
103 |
+ # get gene indices & nb. of genes by category |
|
104 |
+ c(iee, iep, ide, idp, idm, idb) %<-% sapply(cats, function(i) is[i, c]) |
|
105 |
+ c(nee, nep, nde, ndp, ndm, ndb) %<-% vapply(cats, function(i) ndd[i, c], numeric(1)) |
|
106 |
+ |
|
107 |
+ for (s in sample_ids) { |
|
108 |
+ # cells to simulate from |
|
109 |
+ cs <- cells[[s, c]] |
|
110 |
+ |
|
111 |
+ # compute mus |
|
112 |
+ o <- colData(x)$offset[cs] |
|
113 |
+ mu <- sapply(exp(o), "*", exp(m)) |
|
114 |
+ |
|
115 |
+ # get cell indices & nb. of cells by group |
|
116 |
+ ng1 <- length(g1 <- js[[s, c]][[1]]) |
|
117 |
+ ng2 <- length(g2 <- js[[s, c]][[2]]) |
|
118 |
+ |
|
119 |
+ # simulate data |
|
120 |
+ if (nee > 0) y[iee, c(g1, g2)] <- simdd("ee", gs[iee, c], cs, ng1, ng2, mu, d) |
|
121 |
+ if (nep > 0) y[iep, c(g1, g2)] <- simdd("ep", gs[iep, c], cs, ng1, ng2, mu, d) |
|
122 |
+ if (nde > 0) y[ide, c(g1, g2)] <- simdd("de", gs[ide, c], cs, ng1, ng2, mu, d) |
|
123 |
+ if (ndp > 0) y[idp, c(g1, g2)] <- simdd("dp", gs[idp, c], cs, ng1, ng2, mu, d) |
|
124 |
+ if (ndm > 0) y[idm, c(g1, g2)] <- simdd("dm", gs[idm, c], cs, ng1, ng2, mu, d) |
|
125 |
+ if (ndb > 0) y[idb, c(g1, g2)] <- simdd("db", gs[idb, c], cs, ng1, ng2, mu, d) |
|
126 |
+ } |
|
127 |
+ } |
|
128 |
+ |
|
129 |
+ # construct SCE |
|
130 |
+ gi <- do.call(rbind, lapply(cluster_ids, function(c) |
|
131 |
+ do.call(rbind, lapply(cats, function(cat) if (ndd[cat, c] != 0) |
|
132 |
+ data.frame(genes = is[[cat, c]], cluster_id = c, category = cat))))) |
|
133 |
+ gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ] |
|
134 |
+ levels(gi$category) <- c("ee", "ep", "de", "dp", "dm", "db") |
|
135 |
+ rownames(gi) <- NULL |
|
136 |
+ |
|
137 |
+ col_data <- do.call(rbind, lapply(cluster_ids, function(c) |
|
138 |
+ do.call(rbind, lapply(sample_ids, function(s) |
|
139 |
+ data.frame( |
|
140 |
+ row.names = 1, |
|
141 |
+ unlist(js[[s, c]]), |
|
142 |
+ cluster_id = c, sample_id = s, |
|
143 |
+ group = rep.int(c("A", "B"), n_cells[[s, c]])))))) |
|
144 |
+ col_data <- col_data[colnames(y), ] |
|
145 |
+ col_data$sample_id <- factor(paste(col_data$group, col_data$sample_id, sep = ".")) |
|
146 |
+ |
|
147 |
+ sample_id <- levels(col_data$sample_id) |
|
148 |
+ group <- gsub("(A|B)[.].*", "\\1", sample_id) |
|
149 |
+ ei <- data.frame(sample_id, group) |
|
150 |
+ md <- list( |
|
151 |
+ experiment_info = ei, |
|
152 |
+ n_cells = table(col_data$sample_id), |
|
153 |
+ gene_info = gi, sim_genes = gs) |
|
154 |
+ |
|
155 |
+ SingleCellExperiment( |
|
156 |
+ assays = list(counts = y), |
|
157 |
+ colData = col_data, |
|
158 |
+ metadata = md) |
|
159 |
+} |
|
160 |
+ |
|
161 |
+ |
|
162 |
+ |
|
163 |
+ |
|
164 |
+ |
|
165 |
+ |
|
166 |
+ |
|
167 |
+ |
|
168 |
+ |
|
169 |
+ |
|
170 |
+ |
|
171 |
+ |
|
172 |
+ |
|
173 |
+ |
|
174 |
+ |
|
175 |
+ |
|
176 |
+ |
|
177 |
+ |
|
178 |
+ |
|
0 | 179 |
\ No newline at end of file |