Browse code

rename example data from sce -> example_sce

HelenaLC authored on 03/05/2021 07:05:43
Showing 1 changed files
... ...
@@ -6,12 +6,27 @@
6 6
 #' across 2 experimental conditions from a real scRNA-seq data set.
7 7
 #' 
8 8
 #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
9
-#' @param nc,ns,nk # of cells, samples and clusters to simulate. 
10
-#'   By default, \code{ns = NULL} will simulated as many samples as 
11
-#'   available in the reference to avoid duplicated reference samples.
9
+#' @param ng number of genes to simulate. Importantly, for the library sizes 
10
+#'   computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, 
11
+#'   the number of simulated genes should match with the number of genes 
12
+#'   in the reference. To simulate a reduced number of genes, e.g. for 
13
+#'   testing and development purposes, please set \code{force = TRUE}.
14
+#' @param nc number of cells to simulate.
15
+#' @param nk number of clusters to simulate; defaults to the number
16
+#'   of available reference clusters (\code{nlevels(x$cluster_id)}).  
17
+#' @param ns number of samples to simulate; defaults to as many as
18
+#'   available in the reference to avoid duplicated reference samples. 
19
+#'   Specifically, the number of samples will be set to 
20
+#'   \code{n = nlevels(x$sample_id)} when \code{dd = FALSE}, 
21
+#'   \code{n} per group  when \code{dd, paired = TRUE}, and 
22
+#'   \code{floor(n/2)} per group when \code{dd = TRUE, paired = FALSE}.
23
+#'   When a larger number samples should be simulated, set \code{force = TRUE}.
12 24
 #' @param probs a list of length 3 containing probabilities of a cell belonging
13 25
 #'   to each cluster, sample, and group, respectively. List elements must be 
14 26
 #'   NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.
27
+#' @param dd whether or not to simulate differential distributions; if TRUE, 
28
+#'   two groups are simulated and \code{ns} corresponds to the number of 
29
+#'   samples per group, else one group with \code{ns} samples is simulated.
15 30
 #' @param p_dd numeric vector of length 6.
16 31
 #'   Specifies the probability of a gene being
17 32
 #'   EE, EP, DE, DP, DM, or DB, respectively.
... ...
@@ -39,17 +54,12 @@
39 54
 #'   of the branches lengths separating them. 
40 55
 #' @param phylo_pars vector of length 2 providing the parameters that control 
41 56
 #'   the number of type genes. Passed to an exponential PDF (see details).
42
-#'   
43
-#' @param ng # of genes to simulate. Importantly, for the library sizes 
44
-#'   computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, 
45
-#'   the number of simulated genes should match with the number of genes 
46
-#'   in the reference. To simulate a reduced number of genes, e.g. for 
47
-#'   testing and development purposes, please set \code{force = TRUE}.
48
-#' @param force logical specifying whether to force 
49
-#'   simulation despite \code{ng != nrow(x)}.
57
+#' @param force logical specifying whether to force simulation 
58
+#'   when \code{ng} and/or \code{ns} don't match the number of 
59
+#'   available reference genes and samples, respectively.
50 60
 #'   
51 61
 #' @details The simulation of type genes can be performed in 2 ways; 
52
-#'   (1) via \code{p_type} to simulate independant clusters, OR 
62
+#'   (1) via \code{p_type} to simulate independent clusters, OR 
53 63
 #'   (2) via \code{phylo_tree} to simulate a hierarchical cluster structure.
54 64
 #'   
55 65
 #'   For (1), a subset of \code{p_type} \% of genes are selected per cluster
... ...
@@ -89,11 +99,11 @@
89 99
 #'   \item{\code{args}}{a list of the function call's input arguments.}}}}
90 100
 #'   
91 101
 #' @examples
92
-#' data(sce)
102
+#' data(example_sce)
93 103
 #' library(SingleCellExperiment)
94 104
 #' 
95 105
 #' # prep. SCE for simulation
96
-#' ref <- prepSim(sce)
106
+#' ref <- prepSim(example_sce)
97 107
 #' 
98 108
 #' # simulate data
99 109
 #' (sim <- simData(ref, nc = 200,
... ...
@@ -179,11 +189,17 @@ simData <- function(x,
179 189
     if (is.null(x$cluster_id)) {
180 190
         x$cluster_id <- factor("foo")
181 191
         no_k <- TRUE
182
-    } else no_k <- FALSE
192
+    } else {
193
+        x$cluster_id <- droplevels(factor(x$cluster_id))
194
+        no_k <- nlevels(x$cluster_id) == 1
195
+    }
183 196
     if (is.null(x$sample_id)) {
184 197
         x$sample_id <- factor("foo")
185 198
         no_s <- TRUE
186
-    } else no_s <- FALSE
199
+    } else {
200
+        x$sample_id <- droplevels(factor(x$sample_id))
201
+        no_s <- nlevels(x$sample_id) == 1
202
+    }
187 203
     
188 204
     # store all input arguments to be returned in final output
189 205
     args <- c(as.list(environment()))
... ...
@@ -192,12 +208,14 @@ simData <- function(x,
192 208
     .check_sce(x, req_group = FALSE)
193 209
     args_tmp <- .check_args_simData(as.list(environment()))
194 210
     nk <- args$nk <- args_tmp$nk
195
-    ns <- args$ns <- args_tmp$ns
196
-    
211
+
197 212
     # reference IDs
198 213
     nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
199 214
     ns0 <- length(sids0 <- set_names(levels(x$sample_id)))
200 215
     
216
+    # get number of samples to simulate
217
+    ns <- .get_ns(ns0, ns, dd, paired, force)
218
+    
201 219
     # simulation IDs
202 220
     nk <- length(kids <- set_names(paste0("cluster", seq_len(nk))))
203 221
     sids <- set_names(paste0("sample", seq_len(ns)))
... ...
@@ -211,8 +229,13 @@ simData <- function(x,
211 229
         ref_sids <- cbind(ref_sids, ref_sids)
212 230
     } else {
213 231
         # draw reference samples at random for each group
214
-        sidsA <- sample(sids0, ns, ns > ns0)
215
-        sidsB <- sample(setdiff(sids0, sidsA), ns, ns > ns0)
232
+        sidsA <- sample(sids0, ns, force && ns > ns0)
233
+        if (force) {
234
+            sidsB <- sample(sids0, ns, ns > ns0)
235
+        } else {
236
+            sidsB <- setdiff(sids0, sidsA)
237
+            sidsB <- sample(sidsB, ns)
238
+        }
216 239
         ref_sids <- cbind(sidsA, sidsB)
217 240
     }
218 241
     dimnames(ref_sids) <- list(sids, gids)
Browse code

assure rows/cols have names

HelenaLC authored on 16/04/2021 13:10:57
Showing 1 changed files
... ...
@@ -299,6 +299,7 @@ simData <- function(x,
299 299
         bs$cluster_id <- cbind(0, bs$cluster_id)
300 300
         names(bs$cluster_id) <- kids0
301 301
     }
302
+    os <- x$offset
302 303
     b0 <- bs$beta0
303 304
     ds <- rowData(x)$disp
304 305
     
... ...
@@ -337,8 +338,8 @@ simData <- function(x,
337 338
                 ds_kc <- ds[gs0]
338 339
                 lfc_kc <- lfc[[c, k]]
339 340
                 bs_ksc <- exp(rowSums(bs_ks[gs0, , drop = FALSE]))
340
-                ms_g1 <- outer(bs_ksc, exp(x$offset[cs_g1]), "*")
341
-                ms_g2 <- outer(bs_ksc, exp(x$offset[cs_g2]), "*")
341
+                ms_g1 <- outer(bs_ksc, exp(os[cs_g1]), "*")
342
+                ms_g2 <- outer(bs_ksc, exp(os[cs_g2]), "*")
342 343
                 
343 344
                 re <- .sim(c, cs_g1, cs_g2, ms_g1, ms_g2, ds_kc, lfc_kc, p_ep, p_dp, p_dm)
344 345
                 y[gi, unlist(ci)] <- re$cs
Browse code

fix sim to work w no groups / samples / clusters

HelenaLC authored on 16/04/2021 12:35:32
Showing 1 changed files
... ...
@@ -7,6 +7,8 @@
7 7
 #' 
8 8
 #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
9 9
 #' @param nc,ns,nk # of cells, samples and clusters to simulate. 
10
+#'   By default, \code{ns = NULL} will simulated as many samples as 
11
+#'   available in the reference to avoid duplicated reference samples.
10 12
 #' @param probs a list of length 3 containing probabilities of a cell belonging
11 13
 #'   to each cluster, sample, and group, respectively. List elements must be 
12 14
 #'   NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.
... ...
@@ -157,12 +159,14 @@
157 159
 #' @importFrom S4Vectors split unfactor
158 160
 #' @export
159 161
 
160
-simData <- function(x, nc = 2e3, ns = 3, nk = 3,
161
-    probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
162
+simData <- function(x, 
163
+    ng = nrow(x), nc = ncol(x), 
164
+    ns = NULL, nk = NULL, probs = NULL, 
165
+    dd = TRUE, p_dd = diag(6)[1, ], paired = FALSE,
162 166
     p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
163 167
     p_type = 0, lfc = 2, rel_lfc = NULL, 
164 168
     phylo_tree = NULL, phylo_pars = c(ifelse(is.null(phylo_tree), 0, 0.1), 3),
165
-    ng = nrow(x), force = FALSE) {
169
+    force = FALSE) {
166 170
     
167 171
     # throughout this code...
168 172
     # k: cluster ID
... ...
@@ -171,6 +175,16 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
171 175
     # c: DD category
172 176
     # 0: reference
173 177
     
178
+    # add mock cluster/sample ID if missing
179
+    if (is.null(x$cluster_id)) {
180
+        x$cluster_id <- factor("foo")
181
+        no_k <- TRUE
182
+    } else no_k <- FALSE
183
+    if (is.null(x$sample_id)) {
184
+        x$sample_id <- factor("foo")
185
+        no_s <- TRUE
186
+    } else no_s <- FALSE
187
+    
174 188
     # store all input arguments to be returned in final output
175 189
     args <- c(as.list(environment()))
176 190
     
... ...
@@ -178,6 +192,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
178 192
     .check_sce(x, req_group = FALSE)
179 193
     args_tmp <- .check_args_simData(as.list(environment()))
180 194
     nk <- args$nk <- args_tmp$nk
195
+    ns <- args$ns <- args_tmp$ns
181 196
     
182 197
     # reference IDs
183 198
     nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
... ...
@@ -190,17 +205,18 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
190 205
     
191 206
     # sample reference clusters & samples
192 207
     ref_kids <- setNames(sample(kids0, nk, nk > nk0), kids)
193
-    if (paired) { 
208
+    if (!dd || paired) {
194 209
         # use same set of reference samples for both groups
195 210
         ref_sids <- sample(sids0, ns, ns > ns0)
196
-        ref_sids <- replicate(length(gids), ref_sids)
211
+        ref_sids <- cbind(ref_sids, ref_sids)
197 212
     } else {
198 213
         # draw reference samples at random for each group
199
-        ref_sids <- replicate(length(gids), 
200
-            sample(sids0, ns, ns > ns0))
214
+        sidsA <- sample(sids0, ns, ns > ns0)
215
+        sidsB <- sample(setdiff(sids0, sidsA), ns, ns > ns0)
216
+        ref_sids <- cbind(sidsA, sidsB)
201 217
     }
202 218
     dimnames(ref_sids) <- list(sids, gids)
203
-    
219
+
204 220
     if (is.null(rel_lfc)) 
205 221
         rel_lfc <- rep(1, nk)
206 222
     if (is.null(names(rel_lfc))) {
... ...
@@ -274,14 +290,17 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
274 290
         }), vector("list", length(cats)))
275 291
 
276 292
     # compute NB parameters
277
-    m <- lapply(sids0, function(s) {
278
-        b <- paste0("beta.", s)
279
-        b <- exp(rowData(x)[[b]])
280
-        m <- outer(b, exp(x$offset), "*")
281
-        dimnames(m) <- dimnames(x); m
282
-    })
283
-    d <- rowData(x)$dispersion 
284
-    names(d) <- rownames(x)
293
+    bs <- rowData(x)$beta
294
+    if (!is.null(bs$sample_id)) {
295
+        bs$sample_id <- cbind(0, bs$sample_id)
296
+        names(bs$sample_id) <- sids0
297
+    }
298
+    if (!is.null(bs$cluster_id)) {
299
+        bs$cluster_id <- cbind(0, bs$cluster_id)
300
+        names(bs$cluster_id) <- kids0
301
+    }
302
+    b0 <- bs$beta0
303
+    ds <- rowData(x)$disp
285 304
     
286 305
     # initialize list of depth two to store 
287 306
     # simulation means in each cluster & group
... ...
@@ -292,13 +311,18 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
292 311
     # run simulation -----------------------------------------------------------
293 312
     for (k in kids) {
294 313
         for (s in sids) {
314
+            # get output cell indices
315
+            ci <- cs_idx[[k]][[s]]
316
+            
295 317
             # get reference samples, clusters & cells
296 318
             s0 <- ref_sids[s, ]
297 319
             k0 <- ref_kids[k]
298 320
             cs0 <- cs_by_ks[[k0]][s0]
299 321
             
300
-            # get output cell indices
301
-            ci <- cs_idx[[k]][[s]]
322
+            # get NB parameters
323
+            bs_ks <- cbind(b0,
324
+                bs$cluster_id[[k0]],
325
+                bs$sample_id[[s0[1]]])
302 326
             
303 327
             for (c in cats[n_dd[, k] != 0]) {
304 328
                 # sample cells to simulate from
... ...
@@ -310,12 +334,13 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
310 334
                 gi <- gs_idx[[c, k]]
311 335
                 
312 336
                 # get NB parameters
313
-                m_g1 <- m[[s0[[1]]]][gs0, cs_g1, drop = FALSE]
314
-                m_g2 <- m[[s0[[2]]]][gs0, cs_g2, drop = FALSE]
315
-                d_kc <- d[gs0]
337
+                ds_kc <- ds[gs0]
316 338
                 lfc_kc <- lfc[[c, k]]
339
+                bs_ksc <- exp(rowSums(bs_ks[gs0, , drop = FALSE]))
340
+                ms_g1 <- outer(bs_ksc, exp(x$offset[cs_g1]), "*")
341
+                ms_g2 <- outer(bs_ksc, exp(x$offset[cs_g2]), "*")
317 342
                 
318
-                re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc, p_ep, p_dp, p_dm)
343
+                re <- .sim(c, cs_g1, cs_g2, ms_g1, ms_g2, ds_kc, lfc_kc, p_ep, p_dp, p_dm)
319 344
                 y[gi, unlist(ci)] <- re$cs
320 345
                 
321 346
                 for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse(
... ...
@@ -331,7 +356,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
331 356
         category = rep.int(rep(cats, nk), c(n_dd)),
332 357
         logFC = unlist(lfc),
333 358
         sim_gene = unlist(gs_by_kc),
334
-        sim_disp = d[unlist(gs_by_kc)]) %>% 
359
+        sim_disp = ds[unlist(gs_by_kc)]) %>% 
335 360
         mutate_at("gene", as.character)
336 361
     # add true simulation means
337 362
     sim_mean <- sim_mean %>%
... ...
@@ -348,14 +373,22 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
348 373
     # construct SCE ------------------------------------------------------------
349 374
     # cell metadata including group, sample, cluster IDs
350 375
     cd$group_id <- droplevels(cd$group_id)
351
-    cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
376
+    cd$sample_id <- if (dd) {
377
+        factor(paste(cd$sample_id, cd$group_id, sep = "."))
378
+    } else factor(cd$sample_id)
352 379
     m <- match(levels(cd$sample_id), cd$sample_id)
353 380
     gids <- cd$group_id[m]
354 381
     o <- order(gids)
355 382
     sids <- levels(cd$sample_id)[o]
356 383
     cd <- cd %>% 
357
-        mutate_at("cluster_id", factor, levels = kids) %>% 
358
-        mutate_at("sample_id", factor, levels = sids) 
384
+        mutate_at("sample_id", factor, levels = sids) %>% 
385
+        mutate_at("cluster_id", factor, levels = kids)
386
+    if (!dd) {
387
+        cd$group_id <- NULL
388
+        ref_sids <- ref_sids[, 1]
389
+    }
390
+    if (no_s) cd$sample_id <- NULL
391
+    if (no_k) cd$cluster_id <- NULL
359 392
     # gene metadata storing gene classes & specificities
360 393
     rd <- DataFrame(class = factor(class, 
361 394
         levels = c("state", "shared", "type")))
Browse code

add log base to dox

HelenaLC authored on 10/06/2020 07:04:11
Showing 1 changed files
... ...
@@ -13,7 +13,7 @@
13 13
 #' @param p_dd numeric vector of length 6.
14 14
 #'   Specifies the probability of a gene being
15 15
 #'   EE, EP, DE, DP, DM, or DB, respectively.
16
-#' @param paired logial specifying whether a paired design should 
16
+#' @param paired logical specifying whether a paired design should 
17 17
 #'   be simulated (both groups use the same set of reference samples) 
18 18
 #'   or not (reference samples are drawn at random).
19 19
 #' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells
... ...
@@ -21,8 +21,8 @@
21 21
 #' @param p_type numeric. Probability of EE/EP gene being a type-gene.
22 22
 #'   If a gene is of class "type" in a given cluster, a unique mean 
23 23
 #'   will be used for that gene in the respective cluster.
24
-#' @param lfc numeric value to use as mean logFC
25
-#'   for DE, DP, DM, and DB type of genes.
24
+#' @param lfc numeric value to use as mean logFC 
25
+#'   (logarithm base 2) for DE, DP, DM, and DB type of genes.
26 26
 #' @param rel_lfc numeric vector of relative logFCs for each cluster. 
27 27
 #'   Should be of length \code{nlevels(x$cluster_id)} with 
28 28
 #'   \code{levels(x$cluster_id)} as names. 
Browse code

add details to documentation

HelenaLC authored on 28/03/2020 12:25:31
Showing 1 changed files
... ...
@@ -32,22 +32,11 @@
32 32
 #'   \href{http://evolution.genetics.washington.edu/phylip/newicktree.html}{here}. 
33 33
 #'   The distance between the nodes, except for the original branch, will be 
34 34
 #'   translated in the number of shared genes between the clusters belonging to 
35
-#'   these nodes (this relation is controlled with \code{phylo_pars}). The distance 
36
-#'   between two clusters is defined as the sum of the branches lengths 
37
-#'   separating them. 
35
+#'   these nodes (this relation is controlled with \code{phylo_pars}). 
36
+#'   The distance between two clusters is defined as the sum 
37
+#'   of the branches lengths separating them. 
38 38
 #' @param phylo_pars vector of length 2 providing the parameters that control 
39
-#'   the number of type genes. Passed to an exponential's PDF:
40
-#'   \code{N = #genes x gamma1 * e^(-gamma2 x dist)},
41
-#'   
42
-#'   \itemize{
43
-#'   \item  \code{gamma1} is the parameter that controls the percentage of shared 
44
-#'   genes between the nodes. By default 0.1 if a tree is given, meaning that 
45
-#'   maximum 10% of the genes can be used as type genes (if gamma2 = 0). 
46
-#'   However it's advised to tune it depending on the input \code{prep_sce}. 
47
-#'   \code{gamma2} is the 'penalty' of increasing the distance between clusters 
48
-#'   (\code{dist}, defined by \code{phylo_tree}), applied on the number of 
49
-#'   shared genes. Default to 3. 
50
-#'   }
39
+#'   the number of type genes. Passed to an exponential PDF (see details).
51 40
 #'   
52 41
 #' @param ng # of genes to simulate. Importantly, for the library sizes 
53 42
 #'   computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, 
... ...
@@ -58,13 +47,45 @@
58 47
 #'   simulation despite \code{ng != nrow(x)}.
59 48
 #'   
60 49
 #' @details The simulation of type genes can be performed in 2 ways; 
61
-#'   (1) by defining \code{p_type} and thus simulating independant clusters, OR 
62
-#'   (2) by defining both \code{phylo_tree} and \code{phylo_pars}, which will 
63
-#'   simulate a hierarchical structure between the clusters.
50
+#'   (1) via \code{p_type} to simulate independant clusters, OR 
51
+#'   (2) via \code{phylo_tree} to simulate a hierarchical cluster structure.
52
+#'   
53
+#'   For (1), a subset of \code{p_type} \% of genes are selected per cluster
54
+#'   to use a different references genes than the remainder of clusters,
55
+#'   giving rise to cluster-specific NB means for count sampling.
56
+#'   
57
+#'   For (2), the number of shared/type genes at each node 
58
+#'   are given by \code{a*G*e^(-b*d)}, where \itemize{
59
+#'   \item{\code{a} -- controls the percentage of shared genes between nodes.
60
+#'   By default, at most 10\% of the genes are reserved as type genes
61
+#'   (when \code{b} = 0). However, it is advised to tune this parameter 
62
+#'   depending on the input \code{prep_sce}.}
63
+#'   \item{\code{b} -- determines how the number of shared genes 
64
+#'   decreases with increasing distance d between clusters 
65
+#'   (defined through \code{phylo_tree}).}}
64 66
 #'  
65 67
 #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
66
-#'   containing multiple clusters & samples across 2 groups.
67
-#' 
68
+#'   containing multiple clusters & samples across 2 groups 
69
+#'   as well as the following metadata: \describe{
70
+#'   \item{cell metadata (\code{colData(.)})}{a \code{DataFrame} containing,
71
+#'   containing, for each cell, it's cluster, sample, and group ID.}
72
+#'   \item{gene metadata (\code{rowData(.)})}{a \code{DataFrame} containing, 
73
+#'   for each gene, it's \code{class} (one of "state", "type", "none") and 
74
+#'   specificity (\code{specs}; NA for genes of type "state", otherwise 
75
+#'   a character vector of clusters that share the given gene).}
76
+#'   \item{experiment metadata (\code{metadata(.)})}{
77
+#'   \describe{
78
+#'   \item{\code{experiment_info}}{a \code{data.frame} 
79
+#'   summarizing the experimental design.}
80
+#'   \item{\code{n_cells}}{the number of cells for each sample.}
81
+#'   \item{\code{gene_info}}{a \code{data.frame} containing, for each gene
82
+#'   in each cluster, it's differential distribution \code{category},
83
+#'   mean \code{logFC} (NA for genes for categories "ee" and "ep"),
84
+#'   gene used as reference (\code{sim_gene}), dispersion \code{sim_disp},
85
+#'   and simulation means for each group \code{sim_mean.A/B}.}
86
+#'   \item{\code{ref_sids/kidskids}}{the sample/cluster IDs used as reference.}
87
+#'   \item{\code{args}}{a list of the function call's input arguments.}}}}
88
+#'   
68 89
 #' @examples
69 90
 #' data(sce)
70 91
 #' library(SingleCellExperiment)
... ...
@@ -73,7 +94,7 @@
73 94
 #' ref <- prepSim(sce)
74 95
 #' 
75 96
 #' # simulate data
76
-#' (sim <- simData(ref, nc = 10,
97
+#' (sim <- simData(ref, nc = 200,
77 98
 #'   p_dd = c(0.9, 0, 0.1, 0, 0, 0),
78 99
 #'   ng = 100, force = TRUE,
79 100
 #'   probs = list(NULL, NULL, c(1, 0))))
... ...
@@ -116,7 +137,7 @@
116 137
 #' # view information about shared 'type' genes
117 138
 #' table(rowData(sim)$class)
118 139
 #' 
119
-#' @author Helena L Crowell
140
+#' @author Helena L Crowell & Anthony Sonrel
120 141
 #' 
121 142
 #' @references 
122 143
 #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 
... ...
@@ -316,10 +337,10 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
316 337
     sim_mean <- sim_mean %>%
317 338
         map(bind_cols) %>% 
318 339
         bind_rows(.id = "cluster_id") %>% 
319
-        mutate_at("cluster_id", factor) %>% 
320 340
         mutate(gene = rep(gs, nk))
321 341
     gi <- full_join(gi, sim_mean, by = c("gene", "cluster_id")) %>% 
322
-        rename("sim_mean.A" = "A", "sim_mean.B" = "B")
342
+        rename("sim_mean.A" = "A", "sim_mean.B" = "B") %>% 
343
+        mutate_at("cluster_id", factor)
323 344
     # reorder
324 345
     o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
325 346
     gi <- gi[o, ]; rownames(gi) <- NULL
... ...
@@ -332,14 +353,16 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
332 353
     gids <- cd$group_id[m]
333 354
     o <- order(gids)
334 355
     sids <- levels(cd$sample_id)[o]
335
-    ei <- data.frame(sample_id = sids, group_id = gids[o])
336
-    cd <- cd %>% mutate_at("sample_id", factor, levels = sids)
356
+    cd <- cd %>% 
357
+        mutate_at("cluster_id", factor, levels = kids) %>% 
358
+        mutate_at("sample_id", factor, levels = sids) 
337 359
     # gene metadata storing gene classes & specificities
338 360
     rd <- DataFrame(class = factor(class, 
339 361
         levels = c("state", "shared", "type")))
340 362
     rd$specs <- as.list(specs)
341 363
     # simulation metadata including used reference samples/cluster, 
342 364
     # list of input arguments, and simulated genes' metadata
365
+    ei <- data.frame(sample_id = sids, group_id = gids[o])
343 366
     md <- list(
344 367
         experiment_info = ei,
345 368
         n_cells = table(cd$sample_id),
Browse code

reformat example code

HelenaLC authored on 25/03/2020 08:48:41
Showing 1 changed files
... ...
@@ -109,7 +109,8 @@
109 109
 #' plot(read.dendrogram(text = phylo_tree))
110 110
 #' 
111 111
 #' # simulate clusters accordingly
112
-#' sim <- simData(ref, phylo_tree = phylo_tree, 
112
+#' sim <- simData(ref, 
113
+#'   phylo_tree = phylo_tree, 
113 114
 #'   phylo_pars = c(0.1, 3), 
114 115
 #'   ng = 500, force = TRUE)
115 116
 #' # view information about shared 'type' genes
Browse code

code simplifications

HelenaLC authored on 20/03/2020 07:39:07
Showing 1 changed files
... ...
@@ -35,11 +35,9 @@
35 35
 #'   these nodes (this relation is controlled with \code{phylo_pars}). The distance 
36 36
 #'   between two clusters is defined as the sum of the branches lengths 
37 37
 #'   separating them. 
38
-#' @param phylo_pars vector of length 2, defining the parameters that control the 
39
-#'   number of type genes. It is passed to an adaptation of the 
40
-#'   exponential's PDF:
41
-#'   
42
-#'   \code{N = Ngenes x gamma1 * e^(-gamma2 x dist)} ,
38
+#' @param phylo_pars vector of length 2 providing the parameters that control 
39
+#'   the number of type genes. Passed to an exponential's PDF:
40
+#'   \code{N = #genes x gamma1 * e^(-gamma2 x dist)},
43 41
 #'   
44 42
 #'   \itemize{
45 43
 #'   \item  \code{gamma1} is the parameter that controls the percentage of shared 
... ...
@@ -59,11 +57,10 @@
59 57
 #' @param force logical specifying whether to force 
60 58
 #'   simulation despite \code{ng != nrow(x)}.
61 59
 #'   
62
-#' @details 
63
-#'  The simulation of type genes can be performed in 2 ways; (1) by defining 
64
-#'  \code{p_type} and thus simulating independant clusters, OR (2) by defining both 
65
-#'  \code{phylo_tree} and \code{phylo_pars}, which will simulate a hierarchical structure 
66
-#'  between the clusters. Only one of the options is allowed. 
60
+#' @details The simulation of type genes can be performed in 2 ways; 
61
+#'   (1) by defining \code{p_type} and thus simulating independant clusters, OR 
62
+#'   (2) by defining both \code{phylo_tree} and \code{phylo_pars}, which will 
63
+#'   simulate a hierarchical structure between the clusters.
67 64
 #'  
68 65
 #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
69 66
 #'   containing multiple clusters & samples across 2 groups.
... ...
@@ -88,7 +85,7 @@
88 85
 #' table(gi$category)
89 86
 #' 
90 87
 #' # unbalanced sample sizes
91
-#' sim <- simData(ref, nc = 100,
88
+#' sim <- simData(ref, nc = 100, ns = 2,
92 89
 #'   probs = list(NULL, c(0.25, 0.75), NULL),
93 90
 #'   ng = 10, force = TRUE)
94 91
 #' table(sim$sample_id)
... ...
@@ -142,7 +139,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
142 139
     probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
143 140
     p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
144 141
     p_type = 0, lfc = 2, rel_lfc = NULL, 
145
-    phylo_tree = NULL, phylo_pars = c(0, 3),
142
+    phylo_tree = NULL, phylo_pars = c(ifelse(is.null(phylo_tree), 0, 0.1), 3),
146 143
     ng = nrow(x), force = FALSE) {
147 144
     
148 145
     # throughout this code...
... ...
@@ -152,9 +149,6 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
152 149
     # c: DD category
153 150
     # 0: reference
154 151
     
155
-    # default shared p if phylo tree given
156
-    if (missing(phylo_pars)) phylo_pars[1] <- ifelse(is.null(phylo_tree), 0, 0.1)
157
-    
158 152
     # store all input arguments to be returned in final output
159 153
     args <- c(as.list(environment()))
160 154
     
Browse code

phylo_tree distances now affects the downstream node (also applied to final leaves).

Thony authored on 25/02/2020 14:38:40
Showing 1 changed files
... ...
@@ -35,23 +35,20 @@
35 35
 #'   these nodes (this relation is controlled with \code{phylo_pars}). The distance 
36 36
 #'   between two clusters is defined as the sum of the branches lengths 
37 37
 #'   separating them. 
38
-#' @param phylo_pars list of length 2, defining the parameters that control the 
39
-#'   number of shared/ specific type-genes; \itemize{
40
-#'   \item The first element of the list is a numeric vector of length 2. 
41
-#'   It defines the number of shared genes as an adaptation of the 
38
+#' @param phylo_pars vector of length 2, defining the parameters that control the 
39
+#'   number of type genes. It is passed to an adaptation of the 
42 40
 #'   exponential's PDF:
43 41
 #'   
44 42
 #'   \code{N = Ngenes x gamma1 * e^(-gamma2 x dist)} ,
45 43
 #'   
46
-#'   where \code{gamma1} is the parameter that controls the percentage of shared genes
47
-#'   between the nodes. By default 0.2, but it's advised to tune it depending 
48
-#'   on the input \code{prep_sce}. 
49
-#'   \code{gamma2} is the 'penalty' of increasing 
50
-#'   the distance between clusters (\code{dist}, defined by \code{phylo_tree}),
51
-#'   applied on the number of shared genes. Default to -3. 
52
-#'   \item The second element can be a single numeric or a list of length nk. 
53
-#'   It is an equivalent of \code{p_type} for each leaf (i.e. cluster) that is applied 
54
-#'   after the determination of 'shared type genes'. 
44
+#'   \itemize{
45
+#'   \item  \code{gamma1} is the parameter that controls the percentage of shared 
46
+#'   genes between the nodes. By default 0.1 if a tree is given, meaning that 
47
+#'   maximum 10% of the genes can be used as type genes (if gamma2 = 0). 
48
+#'   However it's advised to tune it depending on the input \code{prep_sce}. 
49
+#'   \code{gamma2} is the 'penalty' of increasing the distance between clusters 
50
+#'   (\code{dist}, defined by \code{phylo_tree}), applied on the number of 
51
+#'   shared genes. Default to 3. 
55 52
 #'   }
56 53
 #'   
57 54
 #' @param ng # of genes to simulate. Importantly, for the library sizes 
... ...
@@ -116,7 +113,7 @@
116 113
 #' 
117 114
 #' # simulate clusters accordingly
118 115
 #' sim <- simData(ref, phylo_tree = phylo_tree, 
119
-#'   phylo_pars = list(c(0.1, 3), 0.1), 
116
+#'   phylo_pars = c(0.1, 3), 
120 117
 #'   ng = 500, force = TRUE)
121 118
 #' # view information about shared 'type' genes
122 119
 #' table(rowData(sim)$class)
... ...
@@ -145,7 +142,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
145 142
     probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
146 143
     p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
147 144
     p_type = 0, lfc = 2, rel_lfc = NULL, 
148
-    phylo_tree = NULL, phylo_pars = list(c(0, 3), 0),
145
+    phylo_tree = NULL, phylo_pars = c(0, 3),
149 146
     ng = nrow(x), force = FALSE) {
150 147
     
151 148
     # throughout this code...
... ...
@@ -155,6 +152,9 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
155 152
     # c: DD category
156 153
     # 0: reference
157 154
     
155
+    # default shared p if phylo tree given
156
+    if (missing(phylo_pars)) phylo_pars[1] <- ifelse(is.null(phylo_tree), 0, 0.1)
157
+    
158 158
     # store all input arguments to be returned in final output
159 159
     args <- c(as.list(environment()))
160 160
     
Browse code

up phylo

HelenaLC authored on 15/02/2020 14:09:19
Showing 1 changed files
... ...
@@ -160,40 +160,12 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
160 160
     
161 161
     # check validity of input arguments
162 162
     .check_sce(x, req_group = FALSE)
163
-    .check_args_simData(as.list(environment()))
164
-    if (!force && ng != nrow(x))
165
-        stop("Number of simulated genes should match with reference,\n", 
166
-            "but 'ng != nrow(x)'; please specify 'force = TRUE' if\n", 
167
-            "simulation should be forced regardlessly (see '?simData').")
168
-    if (!is.null(phylo_tree) && p_type != 0)
169
-        stop("Only one of 'p_type' and 'phylo_tree' can be provided.\n", 
170
-             "Please see the 'Details' section of '?simData'.")
171
-    if (!length(phylo_pars[[2]]) %in% c(1, nk))
172
-        stop("The second element of 'phylo_pars' should be correspond\n", 
173
-             " to the number of clusters ('nk') or of length 1.")
174
-    if (!is.null(phylo_tree) && phylo_pars[[1]][1] == 0)
175
-        warning("'phylo_pars[[1]][1]' has been set to 0;\n", 
176
-                "'phylo_tree' argument will be ignored.")
177
-    if (!is.null(phylo_tree) && all(phylo_pars[[2]] == 0))
178
-        warning("'phylo_pars[[2]]' has been set to 0;\n", 
179
-                "type genes for individual clusters won't be simulated.")
163
+    args_tmp <- .check_args_simData(as.list(environment()))
164
+    nk <- args$nk <- args_tmp$nk
180 165
     
181 166
     # reference IDs
182 167
     nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
183 168
     ns0 <- length(sids0 <- set_names(levels(x$sample_id)))
184
-
185
-    # assure number of simulated clusters
186
-    # matches with specified phylogeny
187
-    if (!is.null(phylo_tree)) {
188
-        kids_phylo <- .get_clusters_from_phylo(phylo_tree)
189
-        nk_phylo <- length(kids_phylo)
190
-        ns_phylo <- as.numeric(gsub("[a-z]", "", kids_phylo))
191
-        if (!all(sort(ns_phylo) == seq_len(nk_phylo)))
192
-            stop("Some clusters appear to be missing from 'phylo_tree';\n",
193
-                "please make sure all clusters up to ", 
194
-                dQuote(kids_phylo[which.max(ns_phylo)]), " are present.")
195
-        if (nk_phylo != nk) args$nk <- nk <- nk_phylo
196
-    }
197 169
     
198 170
     # simulation IDs
199 171
     nk <- length(kids <- set_names(paste0("cluster", seq_len(nk))))
... ...
@@ -244,7 +216,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
244 216
     gs_idx <- .sample_gene_inds(gs, n_dd)
245 217
     
246 218
     # for ea. cluster, sample set of genes to simulate from
247
-    gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs)
219
+    gs_by_k <- setNames(sample(rownames(x), ng, ng > nrow(x)), gs)
248 220
     gs_by_k <- replicate(nk, gs_by_k)
249 221
     colnames(gs_by_k) <- kids
250 222
 
Browse code

fix hierarchical sim data

Thony authored on 05/02/2020 14:43:35
Showing 1 changed files
... ...
@@ -18,7 +18,7 @@
18 18
 #'   or not (reference samples are drawn at random).
19 19
 #' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells
20 20
 #'   to be shifted to a different expression state in one group (see details).
21
-#' @param p_type numeric. Probaility of EE/EP gene being a type-gene.
21
+#' @param p_type numeric. Probability of EE/EP gene being a type-gene.
22 22
 #'   If a gene is of class "type" in a given cluster, a unique mean 
23 23
 #'   will be used for that gene in the respective cluster.
24 24
 #' @param lfc numeric value to use as mean logFC
... ...
@@ -28,19 +28,32 @@
28 28
 #'   \code{levels(x$cluster_id)} as names. 
29 29
 #'   Defaults to factor of 1 for all clusters.
30 30
 #' @param phylo_tree newick tree text representing cluster relations 
31
-#'   and their relative distance. If a tree is given, the distance between 
32
-#'   the clusters will be translated in the number of shared genes 
33
-#'   (this relation is controlled with \code{phylo_pars}). The distance 
31
+#'   and their relative distance. An explanation of the syntax can be found 
32
+#'   \href{http://evolution.genetics.washington.edu/phylip/newicktree.html}{here}. 
33
+#'   The distance between the nodes, except for the original branch, will be 
34
+#'   translated in the number of shared genes between the clusters belonging to 
35
+#'   these nodes (this relation is controlled with \code{phylo_pars}). The distance 
34 36
 #'   between two clusters is defined as the sum of the branches lengths 
35 37
 #'   separating them. 
36
-#' @param phylo_pars numeric vector of length 2. Defines the number of shared 
37
-#'   genes as an adaptation of the exponential's PDF:
38
-#'   N = Ngenes x gamma1 * e^(-gamma2 x dist) ,
39
-#'   where gamma1 is the parameter that controls the percentage of shared genes. 
40
-#'   By default, it corresponds to \code{p_type} but it's advised to tune it 
41
-#'   depending on the input prep_sce. gamma2 is the 'penalty' of increasing 
42
-#'   the distance between clusters, applied on the number of shared genes. 
43
-#'   Default to -3. 
38
+#' @param phylo_pars list of length 2, defining the parameters that control the 
39
+#'   number of shared/ specific type-genes; \itemize{
40
+#'   \item The first element of the list is a numeric vector of length 2. 
41
+#'   It defines the number of shared genes as an adaptation of the 
42
+#'   exponential's PDF:
43
+#'   
44
+#'   \code{N = Ngenes x gamma1 * e^(-gamma2 x dist)} ,
45
+#'   
46
+#'   where \code{gamma1} is the parameter that controls the percentage of shared genes
47
+#'   between the nodes. By default 0.2, but it's advised to tune it depending 
48
+#'   on the input \code{prep_sce}. 
49
+#'   \code{gamma2} is the 'penalty' of increasing 
50
+#'   the distance between clusters (\code{dist}, defined by \code{phylo_tree}),
51
+#'   applied on the number of shared genes. Default to -3. 
52
+#'   \item The second element can be a single numeric or a list of length nk. 
53
+#'   It is an equivalent of \code{p_type} for each leaf (i.e. cluster) that is applied 
54
+#'   after the determination of 'shared type genes'. 
55
+#'   }
56
+#'   
44 57
 #' @param ng # of genes to simulate. Importantly, for the library sizes 
45 58
 #'   computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, 
46 59
 #'   the number of simulated genes should match with the number of genes 
... ...
@@ -49,6 +62,12 @@
49 62
 #' @param force logical specifying whether to force 
50 63
 #'   simulation despite \code{ng != nrow(x)}.
51 64
 #'   
65
+#' @details 
66
+#'  The simulation of type genes can be performed in 2 ways; (1) by defining 
67
+#'  \code{p_type} and thus simulating independant clusters, OR (2) by defining both 
68
+#'  \code{phylo_tree} and \code{phylo_pars}, which will simulate a hierarchical structure 
69
+#'  between the clusters. Only one of the options is allowed. 
70
+#'  
52 71
 #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
53 72
 #'   containing multiple clusters & samples across 2 groups.
54 73
 #' 
... ...
@@ -96,9 +115,11 @@
96 115
 #' plot(read.dendrogram(text = phylo_tree))
97 116
 #' 
98 117
 #' # simulate clusters accordingly
99
-#' sim <- simData(sce, phylo_tree = phylo_tree, ng = 500, force = TRUE)
118
+#' sim <- simData(ref, phylo_tree = phylo_tree, 
119
+#'   phylo_pars = list(c(0.1, 3), 0.1), 
120
+#'   ng = 500, force = TRUE)
100 121
 #' # view information about shared 'type' genes
101
-#' table(metadata(sim)$gene_info$shared_class)
122
+#' table(rowData(sim)$class)
102 123
 #' 
103 124
 #' @author Helena L Crowell
104 125
 #' 
... ...
@@ -124,7 +145,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
124 145
     probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
125 146
     p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
126 147
     p_type = 0, lfc = 2, rel_lfc = NULL, 
127
-    phylo_tree = NULL, phylo_pars = c(0.2, 3),
148
+    phylo_tree = NULL, phylo_pars = list(c(0, 3), 0),
128 149
     ng = nrow(x), force = FALSE) {
129 150
     
130 151
     # throughout this code...
... ...
@@ -144,6 +165,18 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
144 165
         stop("Number of simulated genes should match with reference,\n", 
145 166
             "but 'ng != nrow(x)'; please specify 'force = TRUE' if\n", 
146 167
             "simulation should be forced regardlessly (see '?simData').")
168
+    if (!is.null(phylo_tree) && p_type != 0)
169
+        stop("Only one of 'p_type' and 'phylo_tree' can be provided.\n", 
170
+             "Please see the 'Details' section of '?simData'.")
171
+    if (!length(phylo_pars[[2]]) %in% c(1, nk))
172
+        stop("The second element of 'phylo_pars' should be correspond\n", 
173
+             " to the number of clusters ('nk') or of length 1.")
174
+    if (!is.null(phylo_tree) && phylo_pars[[1]][1] == 0)
175
+        warning("'phylo_pars[[1]][1]' has been set to 0;\n", 
176
+                "'phylo_tree' argument will be ignored.")
177
+    if (!is.null(phylo_tree) && all(phylo_pars[[2]] == 0))
178
+        warning("'phylo_pars[[2]]' has been set to 0;\n", 
179
+                "type genes for individual clusters won't be simulated.")
147 180
     
148 181
     # reference IDs
149 182
     nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
... ...
@@ -221,28 +254,6 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
221 254
         gs_by_k <- res$gs_by_k
222 255
         class  <- res$class
223 256
         specs <- res$specs
224
-        if (p_type != 0) {
225
-            # update gene indices 'gs_idx' to avoid imputing
226
-            # exclusive type genes in shared type genes
227
-            gs_idx_tmp <- gs_idx
228
-            for (c in c("ee", "ep"))
229
-                for (k in kids) {
230
-                    u <- gs_idx[[c, k]]
231
-                    gs_idx_tmp[[c, k]] <- u[!u %in% res$used]
232
-                }
233
-            # inpute cluster-specific type genes 
234
-            res <- .impute_type_genes(x, gs_by_k, gs_idx_tmp, p_type)
235
-            gs_by_k <- res$gs_by_k
236
-            # update genes classes & specificities
237
-            is_type <- res$class == "type"
238
-            # spot checks; any type-gene should not be of class 'shared'
239
-            # and cluster-specificities should be unassigned at this point
240
-            stopifnot(
241
-                all(class[is_type] == "state"),
242
-                all(is.na(unlist(specs[is_type]))))
243
-            class[is_type] <- "type"
244
-            specs[is_type] <- res$specs[is_type]
245
-        }
246 257
     # otherwise, simply impute type-genes w/o phylogeny
247 258
     } else if (p_type != 0) {
248 259
         res <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)
Browse code

update phylo sim

HelenaLC authored on 03/02/2020 10:36:01
Showing 1 changed files
... ...
@@ -159,7 +159,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
159 159
             stop("Some clusters appear to be missing from 'phylo_tree';\n",
160 160
                 "please make sure all clusters up to ", 
161 161
                 dQuote(kids_phylo[which.max(ns_phylo)]), " are present.")
162
-        if (nk_phylo != nk) nk <- nk_phylo
162
+        if (nk_phylo != nk) args$nk <- nk <- nk_phylo
163 163
     }
164 164
     
165 165
     # simulation IDs
... ...
@@ -212,7 +212,8 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
212 212
     
213 213
     # for ea. cluster, sample set of genes to simulate from
214 214
     gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs)
215
-    gs_by_k <- set_colnames(replicate(nk, gs_by_k), kids)
215
+    gs_by_k <- replicate(nk, gs_by_k)
216
+    colnames(gs_by_k) <- kids
216 217
 
217 218
     # when 'phylo_tree' is specified, induce hierarchical cluster structure
218 219
     if (!is.null(phylo_tree)) {                                  
... ...
@@ -343,7 +344,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
343 344
         rename("sim_mean.A" = "A", "sim_mean.B" = "B")
344 345
     # reorder
345 346
     o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
346
-    gi <- set_rownames(gi[o, ], NULL)
347
+    gi <- gi[o, ]; rownames(gi) <- NULL
347 348
     
348 349
     # construct SCE ------------------------------------------------------------
349 350
     # cell metadata including group, sample, cluster IDs
Browse code

avoid dependency on magrittr&tibble solely for set_row/colnames&add_column

HelenaLC authored on 02/02/2020 08:37:26
Showing 1 changed files
... ...
@@ -6,7 +6,7 @@
6 6
 #' across 2 experimental conditions from a real scRNA-seq data set.
7 7
 #' 
8 8
 #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
9
-#' @param ng,nc,ns,nk # of genes, cells, samples and clusters to simulate. 
9
+#' @param nc,ns,nk # of cells, samples and clusters to simulate. 
10 10
 #' @param probs a list of length 3 containing probabilities of a cell belonging
11 11
 #'   to each cluster, sample, and group, respectively. List elements must be 
12 12
 #'   NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.
... ...
@@ -27,13 +27,13 @@
27 27
 #'   Should be of length \code{nlevels(x$cluster_id)} with 
28 28
 #'   \code{levels(x$cluster_id)} as names. 
29 29
 #'   Defaults to factor of 1 for all clusters.
30
-#' @param cells_phylo newick tree text representing cluster relations 
30
+#' @param phylo_tree newick tree text representing cluster relations 
31 31
 #'   and their relative distance. If a tree is given, the distance between 
32 32
 #'   the clusters will be translated in the number of shared genes 
33
-#'   (this relation is controlled with \code{params_dist}). The distance 
33
+#'   (this relation is controlled with \code{phylo_pars}). The distance 
34 34
 #'   between two clusters is defined as the sum of the branches lengths 
35 35
 #'   separating them. 
36
-#' @param params_dist numeric vector of length 2. Defines the number of shared 
36
+#' @param phylo_pars numeric vector of length 2. Defines the number of shared 
37 37
 #'   genes as an adaptation of the exponential's PDF:
38 38
 #'   N = Ngenes x gamma1 * e^(-gamma2 x dist) ,
39 39
 #'   where gamma1 is the parameter that controls the percentage of shared genes. 
... ...
@@ -41,6 +41,13 @@
41 41
 #'   depending on the input prep_sce. gamma2 is the 'penalty' of increasing 
42 42
 #'   the distance between clusters, applied on the number of shared genes. 
43 43
 #'   Default to -3. 
44
+#' @param ng # of genes to simulate. Importantly, for the library sizes 
45
+#'   computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense, 
46
+#'   the number of simulated genes should match with the number of genes 
47
+#'   in the reference. To simulate a reduced number of genes, e.g. for 
48
+#'   testing and development purposes, please set \code{force = TRUE}.
49
+#' @param force logical specifying whether to force 
50
+#'   simulation despite \code{ng != nrow(x)}.
44 51
 #'   
45 52
 #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
46 53
 #'   containing multiple clusters & samples across 2 groups.
... ...
@@ -50,11 +57,13 @@
50 57
 #' library(SingleCellExperiment)
51 58
 #' 
52 59
 #' # prep. SCE for simulation
53
-#' sce <- prepSim(sce)
60
+#' ref <- prepSim(sce)
54 61
 #' 
55 62
 #' # simulate data
56
-#' (sim <- simData(sce, ng = 100, nc = 10,
57
-#'   p_dd = c(0.9, 0, 0.1, 0, 0, 0)))
63
+#' (sim <- simData(ref, nc = 10,
64
+#'   p_dd = c(0.9, 0, 0.1, 0, 0, 0),
65
+#'   ng = 100, force = TRUE,
66
+#'   probs = list(NULL, NULL, c(1, 0))))
58 67
 #' 
59 68
 #' # simulation metadata
60 69
 #' head(gi <- metadata(sim)$gene_info)
... ...
@@ -63,30 +72,32 @@
63 72
 #' table(gi$category)
64 73
 #' 
65 74
 #' # unbalanced sample sizes
66
-#' sim <- simData(sce, ng = 10, nc = 100,
67
-#'   probs = list(NULL, c(0.25, 0.75), NULL))
75
+#' sim <- simData(ref, nc = 100,
76
+#'   probs = list(NULL, c(0.25, 0.75), NULL),
77
+#'   ng = 10, force = TRUE)
68 78
 #' table(sim$sample_id)
69 79
 #' 
70 80
 #' # one group only
71
-#' sim <- simData(sce, ng = 10, nc = 100,
72
-#'   probs = list(NULL, NULL, c(1, 0)))
81
+#' sim <- simData(ref, nc = 100,
82
+#'   probs = list(NULL, NULL, c(1, 0)),
83
+#'   ng = 10, force = TRUE)
73 84
 #' levels(sim$group_id)
74 85
 #' 
75
-#' 
76
-#' # Hierarchical cell-type relations: 
77
-#' # we first define a phylogram representing the relations between 3 clusters
78
-#' cells_phylo <- "(('cluster1':0.1, 'cluster2':0.1):0.4,'cluster3':0.5);"
79
-#' # to verify the syntax and the relation, we can plot it
86
+#' # HIERARCHICAL CLUSTER STRUCTURE
87
+#' # define phylogram specifying cluster relations
88
+#' phylo_tree <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster3':0.5);"
89
+#' # verify syntax & visualize relations
80 90
 #' library(phylogram)
81
-#' dend <- read.dendrogram(text = cells_phylo)
82
-#' plot(dend)
83
-#' # More complex relations are also possible
84
-#' cells_phylo2 <- "(('cluster1':0.4, 'cluster2':0.4):0.4, ('cluster3':0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);"
85
-#' dend2 <- read.dendrogram(text = cells_phylo2)
86
-#' plot(dend2)
87
-#' # simulate clusters based on these distances
88
-#' sim <- simData(sce_, nk = 3, cells_phylo = cells_phylo)
89
-#' # the information about the shared 'type' genes are kept in the metadata
91
+#' plot(read.dendrogram(text = phylo_tree))
92
+#' 
93
+#' # let's use a more complex phylogeny
94
+#' phylo_tree <- "(('cluster1':0.4,'cluster2':0.4):0.4,('cluster3':
95
+#'   0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);"
96
+#' plot(read.dendrogram(text = phylo_tree))
97
+#' 
98
+#' # simulate clusters accordingly
99
+#' sim <- simData(sce, phylo_tree = phylo_tree, ng = 500, force = TRUE)
100
+#' # view information about shared 'type' genes
90 101
 #' table(metadata(sim)$gene_info$shared_class)
91 102
 #' 
92 103
 #' @author Helena L Crowell
... ...
@@ -102,19 +113,19 @@
102 113
 #' @importFrom data.table data.table
103 114
 #' @importFrom dplyr mutate_all mutate_at
104 115
 #' @importFrom edgeR DGEList estimateDisp glmFit
105
-#' @importFrom magrittr set_colnames set_rownames
106 116
 #' @importFrom purrr modify_depth set_names
107 117
 #' @importFrom stats model.matrix rgamma setNames
108 118
 #' @importFrom SingleCellExperiment SingleCellExperiment
109 119
 #' @importFrom SummarizedExperiment colData
110
-#' @importFrom S4Vectors split
120
+#' @importFrom S4Vectors split unfactor
111 121
 #' @export
112 122
 
113
-simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
123
+simData <- function(x, nc = 2e3, ns = 3, nk = 3,
114 124
     probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
115 125
     p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
116 126
     p_type = 0, lfc = 2, rel_lfc = NULL, 
117
-    cells_phylo = NULL, params_dist = c(0.2, 3) ) {
127
+    phylo_tree = NULL, phylo_pars = c(0.2, 3),
128
+    ng = nrow(x), force = FALSE) {
118 129
     
119 130
     # throughout this code...
120 131
     # k: cluster ID
... ...
@@ -123,13 +134,33 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
123 134
     # c: DD category
124 135
     # 0: reference
125 136
     
137
+    # store all input arguments to be returned in final output
138
+    args <- c(as.list(environment()))
139
+    
126 140
     # check validity of input arguments
127 141
     .check_sce(x, req_group = FALSE)
128 142
     .check_args_simData(as.list(environment()))
143
+    if (!force && ng != nrow(x))
144
+        stop("Number of simulated genes should match with reference,\n", 
145
+            "but 'ng != nrow(x)'; please specify 'force = TRUE' if\n", 
146
+            "simulation should be forced regardlessly (see '?simData').")
129 147
     
130 148
     # reference IDs
131 149
     nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
132 150
     ns0 <- length(sids0 <- set_names(levels(x$sample_id)))
151
+
152
+    # assure number of simulated clusters
153
+    # matches with specified phylogeny
154
+    if (!is.null(phylo_tree)) {
155
+        kids_phylo <- .get_clusters_from_phylo(phylo_tree)
156
+        nk_phylo <- length(kids_phylo)
157
+        ns_phylo <- as.numeric(gsub("[a-z]", "", kids_phylo))
158
+        if (!all(sort(ns_phylo) == seq_len(nk_phylo)))
159
+            stop("Some clusters appear to be missing from 'phylo_tree';\n",
160
+                "please make sure all clusters up to ", 
161
+                dQuote(kids_phylo[which.max(ns_phylo)]), " are present.")
162
+        if (nk_phylo != nk) nk <- nk_phylo
163
+    }
133 164
     
134 165
     # simulation IDs
135 166
     nk <- length(kids <- set_names(paste0("cluster", seq_len(nk))))
... ...
@@ -183,56 +214,55 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
183 214
     gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs)
184 215
     gs_by_k <- set_colnames(replicate(nk, gs_by_k), kids)
185 216
 
186
-    if (length(cells_phylo) > 0) {                                  
187
-        out <- .impute_shared_type_genes(x, gs_by_k, gs_idx, cells_phylo, 
188
-                                         params_dist)
189
-        gs_by_k <- out[[1]]
190
-        used_tg <- out[[2]]
191
-        shared  <- out[[3]]
192
-        
217
+    # when 'phylo_tree' is specified, induce hierarchical cluster structure
218
+    if (!is.null(phylo_tree)) {                                  
219
+        res <- .impute_shared_type_genes(x, gs_by_k, gs_idx, phylo_tree, phylo_pars)
220
+        gs_by_k <- res$gs_by_k
221
+        class  <- res$class
222
+        specs <- res$specs
193 223
         if (p_type != 0) {
194
-            
195
-            ## update gs_idx to avoid having type genes in the shared type genes
196
-            gs_idx_red <- gs_idx
197
-            ee_ep_id <- seq(1, 6 * length(kids) - 5, 6)
198
-            ee_ep_id <- c(ee_ep_id, ee_ep_id + 1)
199
-            for (i in ee_ep_id) gs_idx_red[[i]] <- (gs_idx[[i]])[-which(gs_idx[[i]] %in% used_tg)]
200
-            type_info <- gs_by_k
201
-            gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx_red, p_type)
202
-            
203
-            ## keep gene type info 
204
-            type_info[type_info == gs_by_k] <- NA
205
-            type_info[!is.na(type_info)] <- "type"
206
-            type_info[is.na(type_info)] <- "state"
207
-            
208
-        } else {
209
-            type_info <- matrix("state", ng, nk)
224
+            # update gene indices 'gs_idx' to avoid imputing
225
+            # exclusive type genes in shared type genes
226
+            gs_idx_tmp <- gs_idx
227
+            for (c in c("ee", "ep"))
228
+                for (k in kids) {
229
+                    u <- gs_idx[[c, k]]
230
+                    gs_idx_tmp[[c, k]] <- u[!u %in% res$used]
231
+                }
232
+            # inpute cluster-specific type genes 
233
+            res <- .impute_type_genes(x, gs_by_k, gs_idx_tmp, p_type)
234
+            gs_by_k <- res$gs_by_k
235
+            # update genes classes & specificities
236
+            is_type <- res$class == "type"
237
+            # spot checks; any type-gene should not be of class 'shared'
238
+            # and cluster-specificities should be unassigned at this point
239
+            stopifnot(
240
+                all(class[is_type] == "state"),
241
+                all(is.na(unlist(specs[is_type]))))
242
+            class[is_type] <- "type"
243
+            specs[is_type] <- res$specs[is_type]
210 244
         }
245
+    # otherwise, simply impute type-genes w/o phylogeny
246
+    } else if (p_type != 0) {
247
+        res <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)
248
+        stopifnot(!any(res$class == "shared"))
249
+        gs_by_k <- res$gs_by_k
250
+        class <- res$class
251
+        specs <- res$specs
211 252
     } else {
212
-        shared <- matrix("state", ng, nk)
253
+        class <- rep("state", ng)
254
+        specs <- rep(NA, ng)
255
+        names(class) <- names(specs) <- gs
213 256
     }
214
-    
215
-    # impute type-genes
216
-    if (p_type != 0 & length(cells_phylo) == 0)  {
217
-        type_info <- gs_by_k
218
-        gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)
219
-        
220
-        ## keep gene type info 
221
-        type_info[type_info == gs_by_k] <- NA
222
-        type_info[!is.na(type_info)] <- "type"
223
-        type_info[is.na(type_info)] <- "state"
224
-    }                       
225 257
 
226 258
     # split by cluster & categroy
227 259
     gs_by_k <- split(gs_by_k, col(gs_by_k))
228 260
     gs_by_k <- setNames(map(gs_by_k, set_names, gs), kids)
229
-    
230 261
     gs_by_kc <- lapply(kids, function(k) 
231 262
         lapply(unfactor(cats), function(c) 
232 263
             gs_by_k[[k]][gs_idx[[c, k]]])) 
233 264
     
234 265
     # sample logFCs
235
-    lfc0 <- lfc 
236 266
     lfc <- vapply(kids, function(k) 
237 267
         lapply(unfactor(cats), function(c) { 
238 268
             n <- n_dd[c, k]
... ...
@@ -253,10 +283,13 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
253 283
     d <- rowData(x)$dispersion 
254 284
     names(d) <- rownames(x)
255 285
     
286
+    # initialize list of depth two to store 
287
+    # simulation means in each cluster & group
256 288
     sim_mean <- lapply(kids, function(k) 
257 289
         lapply(gids, function(g) 
258 290
             setNames(numeric(ng), gs)))
259 291
     
292
+    # run simulation -----------------------------------------------------------
260 293
     for (k in kids) {
261 294
         for (s in sids) {
262 295
             # get reference samples, clusters & cells
... ...
@@ -265,7 +298,7 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
265 298
             cs0 <- cs_by_ks[[k0]][s0]
266 299
             
267 300
             # get output cell indices
268
-            ci <- unlist(cs_idx[[k]][[s]])
301
+            ci <- cs_idx[[k]][[s]]
269 302
             
270 303
             for (c in cats[n_dd[, k] != 0]) {
271 304
                 # sample cells to simulate from
... ...
@@ -283,19 +316,13 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
283 316
                 lfc_kc <- lfc[[c, k]]
284 317
                 
285 318
                 re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc, p_ep, p_dp, p_dm)
286
-                y[gi, ci] <- re$cs
319
+                y[gi, unlist(ci)] <- re$cs
287 320
                 
288 321
                 for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse(
289 322
                     is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]]
290 323
             }
291 324
         }
292 325
     }
293
-    sim_mean <- sim_mean %>%
294
-        map(bind_cols) %>% 
295
-        bind_rows(.id = "cluster_id") %>% 
296
-        mutate_at("cluster_id", factor) %>% 
297
-        mutate(gene = rep(gs, nk))
298
-    
299 326
     # construct gene metadata table storing ------------------------------------
300 327
     # gene | cluster_id | category | logFC, gene, disp, mean used for sim.
301 328
     gi <- data.frame(
... ...
@@ -307,30 +334,19 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
307 334
         sim_disp = d[unlist(gs_by_kc)]) %>% 
308 335
         mutate_at("gene", as.character)
309 336
     # add true simulation means
337
+    sim_mean <- sim_mean %>%
338
+        map(bind_cols) %>% 
339
+        bind_rows(.id = "cluster_id") %>% 
340
+        mutate_at("cluster_id", factor) %>% 
341
+        mutate(gene = rep(gs, nk))
310 342
     gi <- full_join(gi, sim_mean, by = c("gene", "cluster_id")) %>% 
311 343
         rename("sim_mean.A" = "A", "sim_mean.B" = "B")
312 344
     # reorder
313 345
     o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
314 346
     gi <- set_rownames(gi[o, ], NULL)
315
-    # add gene infos                          
316
-    gi$class <- c(t(type_info))
317
-    gi$shared_class <- c(t(shared))
318
-    # parameters                           
319
-    params <- list(
320
-        ng = ng, 
321
-        nc = nc, 
322
-        ns = ns, 
323
-        nk = nk, 
324
-        probs = probs, 
325
-        p_dd = p_dd, 
326
-        p_type = p_type, 
327
-        lfc = lfc0, 
328
-        rel_lfc = rel_lfc, 
329
-        cells_phylo = cells_phylo, 
330
-        params_dist = params_dist
331
-    )
332 347
     
333
-    # construct SCE
348
+    # construct SCE ------------------------------------------------------------
349
+    # cell metadata including group, sample, cluster IDs
334 350
     cd$group_id <- droplevels(cd$group_id)
335 351
     cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
336 352
     m <- match(levels(cd$sample_id), cd$sample_id)
... ...
@@ -339,16 +355,21 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
339 355
     sids <- levels(cd$sample_id)[o]
340 356
     ei <- data.frame(sample_id = sids, group_id = gids[o])
341 357
     cd <- cd %>% mutate_at("sample_id", factor, levels = sids)
342
-    
358
+    # gene metadata storing gene classes & specificities
359
+    rd <- DataFrame(class = factor(class, 
360
+        levels = c("state", "shared", "type")))
361
+    rd$specs <- as.list(specs)
362
+    # simulation metadata including used reference samples/cluster, 
363
+    # list of input arguments, and simulated genes' metadata
343 364
     md <- list(
344 365
         experiment_info = ei,
345 366
         n_cells = table(cd$sample_id),
346 367
         gene_info = gi,
347 368
         ref_sids = ref_sids,
348 369
         ref_kids = ref_kids, 
349
-        parameters = params)
350
-    
370
+        args = args)
371
+    # return SCE 
351 372
     SingleCellExperiment(
352 373
         assays = list(counts = as.matrix(y)),
353
-        colData = cd, metadata = md)
374
+        colData = cd, rowData = rd, metadata = md)
354 375
 }
Browse code

add hierarchical clusters simulation

Thony authored on 30/01/2020 20:46:21
Showing 1 changed files
... ...
@@ -27,6 +27,20 @@
27 27
 #'   Should be of length \code{nlevels(x$cluster_id)} with 
28 28
 #'   \code{levels(x$cluster_id)} as names. 
29 29
 #'   Defaults to factor of 1 for all clusters.
30
+#' @param cells_phylo newick tree text representing cluster relations 
31
+#'   and their relative distance. If a tree is given, the distance between 
32
+#'   the clusters will be translated in the number of shared genes 
33
+#'   (this relation is controlled with \code{params_dist}). The distance 
34
+#'   between two clusters is defined as the sum of the branches lengths 
35
+#'   separating them. 
36
+#' @param params_dist numeric vector of length 2. Defines the number of shared 
37
+#'   genes as an adaptation of the exponential's PDF:
38
+#'   N = Ngenes x gamma1 * e^(-gamma2 x dist) ,
39
+#'   where gamma1 is the parameter that controls the percentage of shared genes. 
40
+#'   By default, it corresponds to \code{p_type} but it's advised to tune it 
41
+#'   depending on the input prep_sce. gamma2 is the 'penalty' of increasing 
42
+#'   the distance between clusters, applied on the number of shared genes. 
43
+#'   Default to -3. 
30 44
 #'   
31 45
 #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
32 46
 #'   containing multiple clusters & samples across 2 groups.
... ...
@@ -57,7 +71,24 @@
57 71
 #' sim <- simData(sce, ng = 10, nc = 100,
58 72
 #'   probs = list(NULL, NULL, c(1, 0)))
59 73
 #' levels(sim$group_id)
60
-#'     
74
+#' 
75
+#' 
76
+#' # Hierarchical cell-type relations: 
77
+#' # we first define a phylogram representing the relations between 3 clusters
78
+#' cells_phylo <- "(('cluster1':0.1, 'cluster2':0.1):0.4,'cluster3':0.5);"
79
+#' # to verify the syntax and the relation, we can plot it
80
+#' library(phylogram)
81
+#' dend <- read.dendrogram(text = cells_phylo)
82
+#' plot(dend)
83
+#' # More complex relations are also possible
84
+#' cells_phylo2 <- "(('cluster1':0.4, 'cluster2':0.4):0.4, ('cluster3':0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);"
85
+#' dend2 <- read.dendrogram(text = cells_phylo2)
86
+#' plot(dend2)
87
+#' # simulate clusters based on these distances
88
+#' sim <- simData(sce_, nk = 3, cells_phylo = cells_phylo)
89
+#' # the information about the shared 'type' genes are kept in the metadata
90
+#' table(metadata(sim)$gene_info$shared_class)
91
+#' 
61 92
 #' @author Helena L Crowell
62 93
 #' 
63 94
 #' @references 
... ...
@@ -82,7 +113,8 @@
82 113
 simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
83 114
     probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
84 115
     p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
85
-    p_type = 0, lfc = 2, rel_lfc = NULL) {
116
+    p_type = 0, lfc = 2, rel_lfc = NULL, 
117
+    cells_phylo = NULL, params_dist = c(0.2, 3) ) {
86 118
     
87 119
     # throughout this code...
88 120
     # k: cluster ID
... ...
@@ -151,9 +183,45 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
151 183
     gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs)
152 184
     gs_by_k <- set_colnames(replicate(nk, gs_by_k), kids)
153 185
 
186
+    if (length(cells_phylo) > 0) {                                  
187
+        out <- .impute_shared_type_genes(x, gs_by_k, gs_idx, cells_phylo, 
188
+                                         params_dist)
189
+        gs_by_k <- out[[1]]
190
+        used_tg <- out[[2]]
191
+        shared  <- out[[3]]
192
+        
193
+        if (p_type != 0) {
194
+            
195
+            ## update gs_idx to avoid having type genes in the shared type genes
196
+            gs_idx_red <- gs_idx
197
+            ee_ep_id <- seq(1, 6 * length(kids) - 5, 6)
198
+            ee_ep_id <- c(ee_ep_id, ee_ep_id + 1)
199
+            for (i in ee_ep_id) gs_idx_red[[i]] <- (gs_idx[[i]])[-which(gs_idx[[i]] %in% used_tg)]
200
+            type_info <- gs_by_k
201
+            gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx_red, p_type)
202
+            
203
+            ## keep gene type info 
204
+            type_info[type_info == gs_by_k] <- NA
205
+            type_info[!is.na(type_info)] <- "type"
206
+            type_info[is.na(type_info)] <- "state"
207
+            
208
+        } else {
209
+            type_info <- matrix("state", ng, nk)
210
+        }
211
+    } else {
212
+        shared <- matrix("state", ng, nk)
213
+    }
214
+    
154 215
     # impute type-genes
155
-    if (p_type != 0)
216
+    if (p_type != 0 & length(cells_phylo) == 0)  {
217
+        type_info <- gs_by_k
156 218
         gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)
219
+        
220
+        ## keep gene type info 
221
+        type_info[type_info == gs_by_k] <- NA
222
+        type_info[!is.na(type_info)] <- "type"
223
+        type_info[is.na(type_info)] <- "state"
224
+    }                       
157 225
 
158 226
     # split by cluster & categroy
159 227
     gs_by_k <- split(gs_by_k, col(gs_by_k))
... ...
@@ -164,6 +232,7 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
164 232
             gs_by_k[[k]][gs_idx[[c, k]]])) 
165 233
     
166 234
     # sample logFCs
235
+    lfc0 <- lfc 
167 236
     lfc <- vapply(kids, function(k) 
168 237
         lapply(unfactor(cats), function(c) { 
169 238
             n <- n_dd[c, k]
... ...
@@ -243,6 +312,23 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
243 312
     # reorder
244 313
     o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
245 314
     gi <- set_rownames(gi[o, ], NULL)
315
+    # add gene infos                          
316
+    gi$class <- c(t(type_info))
317
+    gi$shared_class <- c(t(shared))
318
+    # parameters                           
319
+    params <- list(
320
+        ng = ng, 
321
+        nc = nc, 
322
+        ns = ns, 
323
+        nk = nk, 
324
+        probs = probs, 
325
+        p_dd = p_dd, 
326
+        p_type = p_type, 
327
+        lfc = lfc0, 
328
+        rel_lfc = rel_lfc, 
329
+        cells_phylo = cells_phylo, 
330
+        params_dist = params_dist
331
+    )
246 332
     
247 333
     # construct SCE
248 334
     cd$group_id <- droplevels(cd$group_id)
... ...
@@ -259,7 +345,8 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
259 345
         n_cells = table(cd$sample_id),
260 346
         gene_info = gi,
261 347
         ref_sids = ref_sids,
262
-        ref_kids = ref_kids)
348
+        ref_kids = ref_kids, 
349
+        parameters = params)
263 350
     
264 351
     SingleCellExperiment(
265 352
         assays = list(counts = as.matrix(y)),
Browse code

dream fix

HelenaLC authored on 30/01/2020 13:33:29
Showing 1 changed files
... ...
@@ -13,6 +13,9 @@
13 13
 #' @param p_dd numeric vector of length 6.
14 14
 #'   Specifies the probability of a gene being
15 15
 #'   EE, EP, DE, DP, DM, or DB, respectively.
16
+#' @param paired logial specifying whether a paired design should 
17
+#'   be simulated (both groups use the same set of reference samples) 
18
+#'   or not (reference samples are drawn at random).
16 19
 #' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells
17 20
 #'   to be shifted to a different expression state in one group (see details).
18 21
 #' @param p_type numeric. Probaility of EE/EP gene being a type-gene.
... ...
@@ -77,7 +80,7 @@
77 80
 #' @export
78 81
 
79 82
 simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
80
-    probs = NULL, p_dd = diag(6)[1, ], 
83
+    probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
81 84
     p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
82 85
     p_type = 0, lfc = 2, rel_lfc = NULL) {
83 86
     
... ...
@@ -103,15 +106,24 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
103 106
     
104 107
     # sample reference clusters & samples
105 108
     ref_kids <- setNames(sample(kids0, nk, nk > nk0), kids)
106
-    ref_sids <- vapply(gids, function(g)
107
-        setNames(sample(sids0, ns, ns > ns0), 
108
-            paste0("sample", seq_len(ns))),
109
-        character(ns))
109
+    if (paired) { 
110
+        # use same set of reference samples for both groups
111
+        ref_sids <- sample(sids0, ns, ns > ns0)
112
+        ref_sids <- replicate(length(gids), ref_sids)
113
+    } else {
114
+        # draw reference samples at random for each group
115
+        ref_sids <- replicate(length(gids), 
116
+            sample(sids0, ns, ns > ns0))
117
+    }
118
+    dimnames(ref_sids) <- list(sids, gids)
110 119
     
111 120
     if (is.null(rel_lfc)) 
112 121
         rel_lfc <- rep(1, nk)
113
-    if (is.null(names(rel_lfc))) 
122
+    if (is.null(names(rel_lfc))) {
114 123
         names(rel_lfc) <- kids
124
+    } else {
125
+        stopifnot(names(rel_lfc) %in% kids0)
126
+    }
115 127
     
116 128
     # initialize count matrix
117 129
     gs <- paste0("gene", seq_len(ng))
... ...
@@ -245,7 +257,9 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
245 257
     md <- list(
246 258
         experiment_info = ei,
247 259
         n_cells = table(cd$sample_id),
248
-        gene_info = gi)
260
+        gene_info = gi,
261
+        ref_sids = ref_sids,
262
+        ref_kids = ref_kids)
249 263
     
250 264
     SingleCellExperiment(
251 265
         assays = list(counts = as.matrix(y)),
Browse code

bug fix when only 1 group is simulated

HelenaLC authored on 26/11/2019 10:36:46
Showing 1 changed files
... ...
@@ -36,8 +36,7 @@
36 36
 #' sce <- prepSim(sce)
37 37
 #' 
38 38
 #' # simulate data
39
-#' (sim <- simData(sce,
40
-#'   n_genes = 100, n_cells = 10,
39
+#' (sim <- simData(sce, ng = 100, nc = 10,
41 40
 #'   p_dd = c(0.9, 0, 0.1, 0, 0, 0)))
42 41
 #' 
43 42
 #' # simulation metadata
... ...
@@ -47,14 +46,12 @@
47 46
 #' table(gi$category)
48 47
 #' 
49 48
 #' # unbalanced sample sizes
50
-#' sim <- simData(sce,
51
-#'   n_genes = 10, n_cells = 100,
49
+#' sim <- simData(sce, ng = 10, nc = 100,
52 50
 #'   probs = list(NULL, c(0.25, 0.75), NULL))
53 51
 #' table(sim$sample_id)
54 52
 #' 
55 53
 #' # one group only
56
-#' sim <- simData(sce,
57
-#'   n_genes = 10, n_cells = 100,
54
+#' sim <- simData(sce, ng = 10, nc = 100,
58 55
 #'   probs = list(NULL, NULL, c(1, 0)))
59 56
 #' levels(sim$group_id)
60 57
 #'     
... ...
@@ -236,6 +233,7 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
236 233
     gi <- set_rownames(gi[o, ], NULL)
237 234
     
238 235
     # construct SCE
236
+    cd$group_id <- droplevels(cd$group_id)
239 237
     cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
240 238
     m <- match(levels(cd$sample_id), cd$sample_id)
241 239
     gids <- cd$group_id[m]
Browse code

add parameter to vary percentage of cells to shift for EP, DP, and DM genes

HelenaLC authored on 19/11/2019 14:36:38
Showing 1 changed files
... ...
@@ -13,6 +13,8 @@
13 13
 #' @param p_dd numeric vector of length 6.
14 14
 #'   Specifies the probability of a gene being
15 15
 #'   EE, EP, DE, DP, DM, or DB, respectively.
16
+#' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells
17
+#'   to be shifted to a different expression state in one group (see details).
16 18
 #' @param p_type numeric. Probaility of EE/EP gene being a type-gene.
17 19
 #'   If a gene is of class "type" in a given cluster, a unique mean 
18 20
 #'   will be used for that gene in the respective cluster.
... ...
@@ -78,8 +80,9 @@
78 80
 #' @export
79 81
 
80 82
 simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
81
-    probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
82
-    lfc = 2, rel_lfc = NULL) {
83
+    probs = NULL, p_dd = diag(6)[1, ], 
84
+    p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
85
+    p_type = 0, lfc = 2, rel_lfc = NULL) {
83 86
     
84 87
     # throughout this code...
85 88
     # k: cluster ID
... ...
@@ -201,7 +204,7 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
201 204
                 d_kc <- d[gs0]
202 205
                 lfc_kc <- lfc[[c, k]]
203 206
                 
204
-                re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc)
207
+                re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc, p_ep, p_dp, p_dm)
205 208
                 y[gi, ci] <- re$cs
206 209
                 
207 210
                 for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse(
Browse code

fix sample pairing

HelenaLC authored on 15/11/2019 11:58:09
Showing 1 changed files
... ...
@@ -6,9 +6,7 @@
6 6
 #' across 2 experimental conditions from a real scRNA-seq data set.
7 7
 #' 
8 8
 #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
9
-#' @param n_genes # of genes to simulate. 
10
-#' @param n_cells # of cells to simulate. 
11
-#'   Either a single numeric or a range to sample from.
9
+#' @param ng,nc,ns,nk # of genes, cells, samples and clusters to simulate. 
12 10
 #' @param probs a list of length 3 containing probabilities of a cell belonging
13 11
 #'   to each cluster, sample, and group, respectively. List elements must be 
14 12
 #'   NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.
... ...
@@ -79,7 +77,7 @@
79 77
 #' @importFrom S4Vectors split
80 78
 #' @export
81 79
 
82
-simData <- function(x, n_genes = 500, n_cells = 300, 
80
+simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
83 81
     probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
84 82
     lfc = 2, rel_lfc = NULL) {
85 83
     
... ...
@@ -88,15 +86,27 @@ simData <- function(x, n_genes = 500, n_cells = 300,
88 86
     # s: sample ID
89 87
     # g: group ID
90 88
     # c: DD category
89
+    # 0: reference
91 90
     
92 91
     # check validity of input arguments
93 92
     .check_sce(x, req_group = FALSE)
94 93
     .check_args_simData(as.list(environment()))
95 94
     
96
-    kids <- set_names(levels(x$cluster_id))
97
-    sids <- set_names(levels(x$sample_id))
95
+    # reference IDs
96
+    nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
97
+    ns0 <- length(sids0 <- set_names(levels(x$sample_id)))
98
+    
99
+    # simulation IDs
100
+    nk <- length(kids <- set_names(paste0("cluster", seq_len(nk))))
101
+    sids <- set_names(paste0("sample", seq_len(ns)))
98 102
     gids <- set_names(c("A", "B"))
99
-    nk <- length(kids)
103
+    
104
+    # sample reference clusters & samples
105
+    ref_kids <- setNames(sample(kids0, nk, nk > nk0), kids)
106
+    ref_sids <- vapply(gids, function(g)
107
+        setNames(sample(sids0, ns, ns > ns0), 
108
+            paste0("sample", seq_len(ns))),
109
+        character(ns))
100 110
     
101 111
     if (is.null(rel_lfc)) 
102 112
         rel_lfc <- rep(1, nk)
... ...
@@ -104,15 +114,15 @@ simData <- function(x, n_genes = 500, n_cells = 300,
104 114
         names(rel_lfc) <- kids
105 115
     
106 116
     # initialize count matrix
107
-    gs <- paste0("gene", seq_len(n_genes))
108
-    cs <- paste0("cell", seq_len(n_cells))
109
-    y <- matrix(0, n_genes, n_cells, dimnames = list(gs, cs))
117
+    gs <- paste0("gene", seq_len(ng))
118
+    cs <- paste0("cell", seq_len(nc))
119
+    y <- matrix(0, ng, nc, dimnames = list(gs, cs))
110 120
     
111 121
     # sample cell metadata
112 122
     cd <- .sample_cell_md(
113
-        n = n_cells, probs = probs,
114
-        ids = list(kids, sids, gids)) %>% 
115
-        set_rownames(cs)
123
+        n = nc, probs = probs,
124
+        ids = list(kids, sids, gids))
125
+    rownames(cd) <- cs
116 126
     cs_idx <- .split_cells(cd, by = colnames(cd))
117 127
     n_cs <- modify_depth(cs_idx, -1, length)
118 128
     
... ...
@@ -120,14 +130,14 @@ simData <- function(x, n_genes = 500, n_cells = 300,
120 130
     cs_by_ks <- .split_cells(x)
121 131
     
122 132
     # sample nb. of genes to simulate per category & gene indices
123
-    n_dd <- replicate(nk, 
124
-        table(sample(factor(cats, levels = cats), n_genes, TRUE, p_dd))) %>% 
125
-        set_colnames(kids)
133
+    n_dd <- table(sample(cats, ng, TRUE, p_dd))
134
+    n_dd <- replicate(nk, n_dd)
135
+    colnames(n_dd) <- kids
126 136
     gs_idx <- .sample_gene_inds(gs, n_dd)
127 137
     
128 138
     # for ea. cluster, sample set of genes to simulate from
129
-    gs_by_k <- setNames(sample(rownames(x), n_genes, TRUE), gs)
130
-    gs_by_k <- replicate(nk, gs_by_k) %>% set_colnames(kids)
139
+    gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs)
140
+    gs_by_k <- set_colnames(replicate(nk, gs_by_k), kids)
131 141
 
132 142
     # impute type-genes
133 143
     if (p_type != 0)
... ...
@@ -135,74 +145,70 @@ simData <- function(x, n_genes = 500, n_cells = 300,
135 145
 
136 146
     # split by cluster & categroy
137 147
     gs_by_k <- split(gs_by_k, col(gs_by_k))
138
-    gs_by_k <- map(gs_by_k, set_names, gs)
139
-    names(gs_by_k) <- kids
148
+    gs_by_k <- setNames(map(gs_by_k, set_names, gs), kids)
140 149
     
141 150
     gs_by_kc <- lapply(kids, function(k) 
142
-        lapply(cats, function(c)
143
-            gs_by_k[[k]][gs_idx[[c, k]]]) %>% 
144
-            set_names(cats))
151
+        lapply(unfactor(cats), function(c) 
152
+            gs_by_k[[k]][gs_idx[[c, k]]])) 
145 153
     
146 154
     # sample logFCs
147 155
     lfc <- vapply(kids, function(k) 
148
-        lapply(cats, function(c) { 
156
+        lapply(unfactor(cats), function(c) { 
149 157
             n <- n_dd[c, k]
150 158
             if (c == "ee") return(rep(NA, n))
151 159
             signs <- sample(c(-1, 1), n, TRUE)
152 160
             lfcs <- rgamma(n, 4, 4/lfc) * signs
153 161
             names(lfcs) <- gs_by_kc[[k]][[c]]
154 162
             lfcs * rel_lfc[k]
155
-        }), vector("list", length(cats))) %>% 
156
-        set_rownames(cats)
157
-    
163
+        }), vector("list", length(cats)))
164
+
158 165
     # compute NB parameters
159
-    o <- exp(colData(x)$offset)
160
-    m <- lapply(sids, function(s) {
161
-        cn <- paste("beta", s, sep = ".")
162
-        k <- grep(cn, names(rowData(x)))
163
-        b <- exp(rowData(x)[[k]])
164
-        m <- vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>% 
165
-            set_rownames(rownames(x)) %>% 
166
-            set_colnames(colnames(x))
166
+    m <- lapply(sids0, function(s) {
167
+        b <- paste0("beta.", s)
168
+        b <- exp(rowData(x)[[b]])
169
+        m <- outer(b, exp(x$offset), "*")
170
+        dimnames(m) <- dimnames(x); m
167 171
     })
168
-    d <- rowData(x)$dispersion %>% 
169
-        set_names(rownames(x))
172
+    d <- rowData(x)$dispersion 
173
+    names(d) <- rownames(x)
170 174
     
171 175
     sim_mean <- lapply(kids, function(k) 
172
-        lapply(gids, function(g)
173
-            setNames(numeric(n_genes), rownames(y))))
176
+        lapply(gids, function(g) 
177
+            setNames(numeric(ng), gs)))
178
+    
174 179
     for (k in kids) {
175 180
         for (s in sids) {
181
+            # get reference samples, clusters & cells
182
+            s0 <- ref_sids[s, ]
183
+            k0 <- ref_kids[k]
184
+            cs0 <- cs_by_ks[[k0]][s0]
185
+            
186
+            # get output cell indices
187
+            ci <- unlist(cs_idx[[k]][[s]])
188
+            
176 189
             for (c in cats[n_dd[, k] != 0]) {
177
-                gs_kc <- gs_by_kc[[k]][[c]]
178
-                cs_ks <- cs_by_ks[[k]][[s]]
190
+                # sample cells to simulate from
191
+                cs_g1 <- sample(cs0[[1]], n_cs[[k]][[s]][[1]], TRUE)
192
+                cs_g2 <- sample(cs0[[2]], n_cs[[k]][[s]][[2]], TRUE)
179 193
                 
180
-                g1 <- cs_idx[[k]][[s]]$A
181
-                g2 <- cs_idx[[k]][[s]]$B
194
+                # get reference genes & output gene indices
195
+                gs0 <- gs_by_kc[[k]][[c]] 
196
+                gi <- gs_idx[[c, k]]
182 197
                 
183
-                ng1 <- length(g1)
184
-                ng2 <- length(g2) 
185
-                
186
-                cs_g1 <- sample(cs_ks, ng1, replace = TRUE)
187
-                cs_g2 <- sample(cs_ks, ng2, replace = TRUE)
188
-                
189
-                m_g1 <- m[[s]][gs_kc, cs_g1, drop = FALSE]
190
-                m_g2 <- m[[s]][gs_kc, cs_g2, drop = FALSE]
191
-                d_kc <- d[gs_kc]
198
+                # get NB parameters
199
+                m_g1 <- m[[s0[[1]]]][gs0, cs_g1, drop = FALSE]
200
+                m_g2 <- m[[s0[[2]]]][gs0, cs_g2, drop = FALSE]
201
+                d_kc <- d[gs0]
192 202
                 lfc_kc <- lfc[[c, k]]
193 203
                 
194
-                gidx <- gs_idx[[c, k]]
195
-                cidx <- c(g1, g2)
196
-                
197 204
                 re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc)
198
-                y[gidx, cidx] <- re$cs
205
+                y[gi, ci] <- re$cs
199 206
                 
200
-                for (g in c("A", "B")) sim_mean[[k]][[g]][gidx] <- 
201
-                    ifelse(is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]]
207
+                for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse(
208
+                    is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]]
202 209
             }
203 210
         }
204 211
     }
205
-    
206 212
     sim_mean <- sim_mean %>%
207 213
         map(bind_cols) %>% 
208 214
         bind_rows(.id = "cluster_id") %>% 
... ...
@@ -224,7 +230,7 @@ simData <- function(x, n_genes = 500, n_cells = 300,
224 230
         rename("sim_mean.A" = "A", "sim_mean.B" = "B")
225 231
     # reorder
226 232
     o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
227
-    gi <- gi[o, ] %>% set_rownames(NULL)
233
+    gi <- set_rownames(gi[o, ], NULL)
228 234
     
229 235
     # construct SCE
230 236
     cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
... ...
@@ -242,6 +248,5 @@ simData <- function(x, n_genes = 500, n_cells = 300,
242 248
     
243 249
     SingleCellExperiment(
244 250
         assays = list(counts = as.matrix(y)),
245
-        colData = cd, 
246
-        metadata = md)
251
+        colData = cd, metadata = md)
247 252
 }
Browse code

rmv rounding of means

HelenaLC authored on 11/11/2019 11:13:02
Showing 1 changed files
... ...
@@ -161,10 +161,9 @@ simData <- function(x, n_genes = 500, n_cells = 300,
161 161
         cn <- paste("beta", s, sep = ".")
162 162
         k <- grep(cn, names(rowData(x)))
163 163
         b <- exp(rowData(x)[[k]])
164
-        vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>% 
164
+        m <- vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>% 
165 165
             set_rownames(rownames(x)) %>% 
166
-            set_colnames(colnames(x)) %>% 
167
-            round
166
+            set_colnames(colnames(x))
168 167
     })
169 168
     d <- rowData(x)$dispersion %>% 
170 169
         set_names(rownames(x))
Browse code

lfc default 1 > 2

HelenaLC authored on 12/08/2019 15:23:53
Showing 1 changed files
... ...
@@ -81,7 +81,7 @@
81 81
 
82 82
 simData <- function(x, n_genes = 500, n_cells = 300, 
83 83
     probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
84
-    lfc = 1, rel_lfc = NULL) {
84
+    lfc = 2, rel_lfc = NULL) {
85 85
     
86 86
     # throughout this code...
87 87
     # k: cluster ID
Browse code

Merge branch 'devel' of https://github.com/HelenaLC/muscat into devel

HelenaLC authored on 09/08/2019 13:42:08
Showing 0 changed files
Browse code

set default lfc to 1

HelenaLC authored on 09/08/2019 13:41:35
Showing 1 changed files
... ...
@@ -81,7 +81,7 @@
81 81
 
82 82
 simData <- function(x, n_genes = 500, n_cells = 300, 
83 83
     probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
84
-    lfc = 2, rel_lfc = NULL) {
84
+    lfc = 1, rel_lfc = NULL) {
85 85
     
86 86
     # throughout this code...
87 87
     # k: cluster ID
Browse code

fix examples using new example SCE

HelenaLC authored on 06/08/2019 16:10:56
Showing 1 changed files
... ...
@@ -49,7 +49,7 @@
49 49
 #' # unbalanced sample sizes
50 50
 #' sim <- simData(sce,
51 51
 #'   n_genes = 10, n_cells = 100,
52
-#'   probs = list(NULL, c(0.1, 0.3, 0.6), NULL))
52
+#'   probs = list(NULL, c(0.25, 0.75), NULL))
53 53
 #' table(sim$sample_id)
54 54
 #' 
55 55
 #' # one group only
Browse code

load sce for examples

HelenaLC authored on 05/08/2019 13:20:31
Showing 1 changed files
... ...
@@ -30,6 +30,7 @@
30 30
 #' 
31 31
 #' @examples
32 32
 #' data(sce)
33
+#' library(SingleCellExperiment)
33 34
 #' 
34 35
 #' # prep. SCE for simulation
35 36
 #' sce <- prepSim(sce)
Browse code

fix imports

Helena Lucia Crowell authored on 05/08/2019 12:42:47
Showing 1 changed files
... ...
@@ -71,7 +71,7 @@
71 71
 #' @importFrom dplyr mutate_all mutate_at
72 72
 #' @importFrom edgeR DGEList estimateDisp glmFit
73 73
 #' @importFrom magrittr set_colnames set_rownames
74
-#' @importFrom purrr modify_at set_names
74
+#' @importFrom purrr modify_depth set_names
75 75
 #' @importFrom stats model.matrix rgamma setNames
76 76
 #' @importFrom SingleCellExperiment SingleCellExperiment
77 77
 #' @importFrom SummarizedExperiment colData
Browse code

add wrappers for validity checks of function arguments

Helena Lucia Crowell authored on 05/08/2019 11:11:16
Showing 1 changed files
... ...
@@ -30,9 +30,32 @@
30 30
 #' 
31 31
 #' @examples
32 32
 #' data(sce)
33
-#' simData(sce,
34
-#'     n_genes = 10, n_cells = 10,
35
-#'     p_dd = diag(6)[1, ])
33
+#' 
34
+#' # prep. SCE for simulation
35
+#' sce <- prepSim(sce)
36
+#' 
37
+#' # simulate data
38
+#' (sim <- simData(sce,
39
+#'   n_genes = 100, n_cells = 10,
40
+#'   p_dd = c(0.9, 0, 0.1, 0, 0, 0)))
41
+#' 
42
+#' # simulation metadata
43
+#' head(gi <- metadata(sim)$gene_info)
44
+#' 
45
+#' # should be ~10% DE  
46
+#' table(gi$category)
47
+#' 
48
+#' # unbalanced sample sizes
49
+#' sim <- simData(sce,
50
+#'   n_genes = 10, n_cells = 100,
51
+#'   probs = list(NULL, c(0.1, 0.3, 0.6), NULL))
52
+#' table(sim$sample_id)
53
+#' 
54
+#' # one group only
55
+#' sim <- simData(sce,
56
+#'   n_genes = 10, n_cells = 100,
57
+#'   probs = list(NULL, NULL, c(1, 0)))
58
+#' levels(sim$group_id)
36 59
 #'     
37 60
 #' @author Helena L Crowell
38 61
 #' 
... ...
@@ -53,7 +76,6 @@
53 76
 #' @importFrom SingleCellExperiment SingleCellExperiment
54 77
 #' @importFrom SummarizedExperiment colData
55 78
 #' @importFrom S4Vectors split
56
-#' @importFrom tibble column_to_rownames
57 79
 #' @export
58 80
 
59 81
 simData <- function(x, n_genes = 500, n_cells = 300, 
... ...
@@ -63,35 +85,22 @@ simData <- function(x, n_genes = 500, n_cells = 300,
63 85
     # throughout this code...
64 86
     # k: cluster ID
65 87
     # s: sample ID
66
-    # c: gene category
88
+    # g: group ID
89
+    # c: DD category
67 90
     
68 91
     # check validity of input arguments
69 92
     .check_sce(x, req_group = FALSE)
70
-    stopifnot(is.numeric(n_genes), length(n_genes) == 1)
71
-    stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
72
-    stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
73
-    stopifnot(is.numeric(p_type), length(p_type) == 1, p_type >= 0)
74
-    stopifnot(is.numeric(lfc), is.numeric(lfc), lfc > 1)
75
-    
76
-    kids <- levels(x$cluster_id)
77
-    sids <- levels(x$sample_id)
78
-    gids <- c("A", "B")
79
-    names(kids) <- kids
80
-    names(sids) <- sids
81
-    names(gids) <- gids
93
+    .check_args_simData(as.list(environment()))
94
+    
95
+    kids <- set_names(levels(x$cluster_id))
96
+    sids <- set_names(levels(x$sample_id))
97
+    gids <- set_names(c("A", "B"))
82 98
     nk <- length(kids)
83 99
     
84
-    if (is.null(rel_lfc)) {
100
+    if (is.null(rel_lfc)) 
85 101
         rel_lfc <- rep(1, nk)
102
+    if (is.null(names(rel_lfc))) 
86 103
         names(rel_lfc) <- kids
87
-    } else {
88
-        stopifnot(is.numeric(rel_lfc), length(rel_lfc) == nk, rel_lfc >= 0)
89
-        if (is.null(names(rel_lfc))) {
90
-            names(rel_lfc) <- kids
91
-        } else {
92
-            stopifnot(setequal(names(rel_lfc), kids))
93
-        }
94
-    }
95 104
     
96 105
     # initialize count matrix
97 106
     gs <- paste0("gene", seq_len(n_genes))
Browse code

remove my email!!

HelenaLC authored on 02/08/2019 14:19:13
Showing 1 changed files
... ...
@@ -25,8 +25,6 @@
25 25
 #'   \code{levels(x$cluster_id)} as names. 
26 26
 #'   Defaults to factor of 1 for all clusters.
27 27
 #'   
28
-#'   
29
-#' 
30 28
 #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
31 29
 #'   containing multiple clusters & samples across 2 groups.
32 30
 #' 
... ...
@@ -35,6 +33,16 @@
35 33
 #' simData(sce,
36 34
 #'     n_genes = 10, n_cells = 10,
37 35
 #'     p_dd = diag(6)[1, ])
36
+#'     
37
+#' @author Helena L Crowell
38
+#' 
39
+#' @references 
40
+#' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 
41
+#' Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 
42
+#' On the discovery of population-specific state transitions from 
43
+#' multi-sample multi-condition single-cell RNA sequencing data. 
44
+#' \emph{bioRxiv} \strong{713412} (2018). 
45
+#' doi: \url{https://doi.org/10.1101/713412}
38 46
 #' 
39 47
 #' @importFrom data.table data.table
40 48
 #' @importFrom dplyr mutate_all mutate_at
... ...
@@ -46,7 +54,6 @@
46 54
 #' @importFrom SummarizedExperiment colData
47 55
 #' @importFrom S4Vectors split
48 56
 #' @importFrom tibble column_to_rownames
49
-#' 
50 57
 #' @export
51 58
 
52 59
 simData <- function(x, n_genes = 500, n_cells = 300, 
Browse code

merge plger branch mm-glmm methods; pass BiocCheck

HelenaLC authored on 03/07/2019 15:01:55
Showing 1 changed files
... ...
@@ -117,8 +117,10 @@ simData <- function(x, n_genes = 500, n_cells = 300,
117 117
         gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)
118 118
 
119 119
     # split by cluster & categroy
120
-    gs_by_k <- gs_by_k %>% split(col(.)) %>% 
121
-        set_names(kids) %>% map(set_names, gs)
120
+    gs_by_k <- split(gs_by_k, col(gs_by_k))
121
+    gs_by_k <- map(gs_by_k, set_names, gs)
122
+    names(gs_by_k) <- kids
123
+    
122 124
     gs_by_kc <- lapply(kids, function(k) 
123 125
         lapply(cats, function(c)
124 126
             gs_by_k[[k]][gs_idx[[c, k]]]) %>% 
Browse code

Merge branch 'devel'

HelenaLC authored on 02/07/2019 08:22:56
Showing 0 changed files
Browse code

merge devel

HelenaLC authored on 02/07/2019 08:22:01
Showing 1 changed files
... ...
@@ -144,7 +144,8 @@ simData <- function(x, n_genes = 500, n_cells = 300,
144 144
         b <- exp(rowData(x)[[k]])
145 145
         vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>% 
146 146
             set_rownames(rownames(x)) %>% 
147
-            set_colnames(colnames(x))
147
+            set_colnames(colnames(x)) %>% 
148
+            round
148 149
     })
149 150
     d <- rowData(x)$dispersion %>% 
150 151
         set_names(rownames(x))
Browse code

bug fix for simulating 1 group only

HelenaLC authored on 17/06/2019 11:10:54
Showing 1 changed files
... ...
@@ -177,8 +177,9 @@ simData <- function(x, n_genes = 500, n_cells = 300,
177 177
                 
178 178
                 re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc)
179 179
                 y[gidx, cidx] <- re$cs
180
-                sim_mean[[k]]$A[gidx] <- re$ms$A
181
-                sim_mean[[k]]$B[gidx] <- re$ms$B
180
+                
181
+                for (g in c("A", "B")) sim_mean[[k]][[g]][gidx] <- 
182
+                    ifelse(is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]]
182 183
             }
183 184
         }
184 185
     }
Browse code

reformat code

HelenaLC authored on 15/05/2019 10:32:01
Showing 1 changed files
... ...
@@ -94,7 +94,8 @@ simData <- function(x, n_genes = 500, n_cells = 300,
94 94
     # sample cell metadata
95 95
     cd <- .sample_cell_md(
96 96
         n = n_cells, probs = probs,
97
-        ids = list(kids, sids, gids)) %>% set_rownames(cs)
97
+        ids = list(kids, sids, gids)) %>% 
98
+        set_rownames(cs)
98 99
     cs_idx <- .split_cells(cd, by = colnames(cd))
99 100
     n_cs <- modify_depth(cs_idx, -1, length)
100 101
     
Browse code

add cluster-specific mean logFC shift

HelenaLC authored on 01/04/2019 18:50:57
Showing 1 changed files
... ...
@@ -15,11 +15,17 @@
15 15
 #' @param p_dd numeric vector of length 6.
16 16
 #'   Specifies the probability of a gene being
17 17
 #'   EE, EP, DE, DP, DM, or DB, respectively.
18
-#' @param fc numeric value to use as mean logFC
19
-#'   for DE, DP, DM, and DB type of genes.
20 18
 #' @param p_type numeric. Probaility of EE/EP gene being a type-gene.
21 19
 #'   If a gene is of class "type" in a given cluster, a unique mean 
22 20
 #'   will be used for that gene in the respective cluster.
21
+#' @param lfc numeric value to use as mean logFC
22
+#'   for DE, DP, DM, and DB type of genes.
23
+#' @param rel_lfc numeric vector of relative logFCs for each cluster. 
24
+#'   Should be of length \code{nlevels(x$cluster_id)} with 
25
+#'   \code{levels(x$cluster_id)} as names. 
26
+#'   Defaults to factor of 1 for all clusters.
27
+#'   
28
+#'   
23 29
 #' 
24 30
 #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
25 31
 #'   containing multiple clusters & samples across 2 groups.
... ...
@@ -45,7 +51,7 @@
45 51
 
46 52
 simData <- function(x, n_genes = 500, n_cells = 300, 
47 53
     probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
48
-    fc = 2, rel_fc = NULL) {
54
+    lfc = 2, rel_lfc = NULL) {
49 55
     
50 56
     # throughout this code...
51 57
     # k: cluster ID
... ...
@@ -58,7 +64,7 @@ simData <- function(x, n_genes = 500, n_cells = 300,
58 64
     stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
59 65
     stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
60 66
     stopifnot(is.numeric(p_type), length(p_type) == 1, p_type >= 0)
61
-    stopifnot(is.numeric(fc), is.numeric(fc), fc > 1)
67
+    stopifnot(is.numeric(lfc), is.numeric(lfc), lfc > 1)
62 68
     
63 69
     kids <- levels(x$cluster_id)
64 70
     sids <- levels(x$sample_id)
... ...
@@ -67,10 +73,17 @@ simData <- function(x, n_genes = 500, n_cells = 300,
67 73
     names(sids) <- sids
68 74
     names(gids) <- gids
69 75
     nk <- length(kids)
70
-    if (is.null(rel_fc)) {
71
-        rel_fc <- rep(1, nk)
76
+    
77
+    if (is.null(rel_lfc)) {
78
+        rel_lfc <- rep(1, nk)
79
+        names(rel_lfc) <- kids
72 80
     } else {
73
-        stopifnot(is.numeric(rel_fc), length(rel_fc) == nk, rel_fc >= 0)
81
+        stopifnot(is.numeric(rel_lfc), length(rel_lfc) == nk, rel_lfc >= 0)
82
+        if (is.null(names(rel_lfc))) {
83
+            names(rel_lfc) <- kids
84
+        } else {
85
+            stopifnot(setequal(names(rel_lfc), kids))
86
+        }
74 87
     }
75 88
     
76 89
     # initialize count matrix
... ...
@@ -116,9 +129,9 @@ simData <- function(x, n_genes = 500, n_cells = 300,
116 129
             n <- n_dd[c, k]
117 130
             if (c == "ee") return(rep(NA, n))
118 131
             signs <- sample(c(-1, 1), n, TRUE)
119
-            lfc <- rgamma(n, 4, 4/fc) * signs
120
-            names(lfc) <- gs_by_kc[[k]][[c]]
121
-            return(lfc)
132
+            lfcs <- rgamma(n, 4, 4/lfc) * signs
133
+            names(lfcs) <- gs_by_kc[[k]][[c]]
134
+            lfcs * rel_lfc[k]
122 135
         }), vector("list", length(cats))) %>% 
123 136
         set_rownames(cats)
124 137
     
Browse code

code cleaning, update simulation

HelenaLC authored on 28/03/2019 10:21:29
Showing 1 changed files
... ...
@@ -17,6 +17,9 @@
17 17
 #'   EE, EP, DE, DP, DM, or DB, respectively.
18 18
 #' @param fc numeric value to use as mean logFC
19 19
 #'   for DE, DP, DM, and DB type of genes.
20
+#' @param p_type numeric. Probaility of EE/EP gene being a type-gene.
21
+#'   If a gene is of class "type" in a given cluster, a unique mean 
22
+#'   will be used for that gene in the respective cluster.
20 23
 #' 
21 24
 #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
22 25
 #'   containing multiple clusters & samples across 2 groups.
... ...
@@ -41,7 +44,8 @@
41 44
 #' @export
42 45
 
43 46
 simData <- function(x, n_genes = 500, n_cells = 300, 
44
-    probs = NULL, p_dd = diag(6)[1, ], fc = 2) {
47
+    probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
48
+    fc = 2, rel_fc = NULL) {
45 49
     
46 50
     # throughout this code...
47 51
     # k: cluster ID
... ...
@@ -53,6 +57,7 @@ simData <- function(x, n_genes = 500, n_cells = 300,
53 57
     stopifnot(is.numeric(n_genes), length(n_genes) == 1)
54 58
     stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
55 59
     stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
60
+    stopifnot(is.numeric(p_type), length(p_type) == 1, p_type >= 0)
56 61
     stopifnot(is.numeric(fc), is.numeric(fc), fc > 1)
57 62
     
58 63
     kids <- levels(x$cluster_id)
... ...
@@ -62,6 +67,11 @@ simData <- function(x, n_genes = 500, n_cells = 300,
62 67
     names(sids) <- sids
63 68
     names(gids) <- gids
64 69
     nk <- length(kids)
70
+    if (is.null(rel_fc)) {
71
+        rel_fc <- rep(1, nk)
72
+    } else {
73
+        stopifnot(is.numeric(rel_fc), length(rel_fc) == nk, rel_fc >= 0)
74
+    }
65 75
     
66 76
     # initialize count matrix
67 77
     gs <- paste0("gene", seq_len(n_genes))
... ...
@@ -86,10 +96,15 @@ simData <- function(x, n_genes = 500, n_cells = 300,
86 96
     
87 97
     # for ea. cluster, sample set of genes to simulate from
88 98
     gs_by_k <- setNames(sample(rownames(x), n_genes, TRUE), gs)
89
-    gs_by_k <- replicate(nk, gs_by_k, simplify = FALSE) %>% set_names(kids)
90
-    #gs_by_k <- replicate(nk, 
91
-    #    setNames(sample(rownames(x), n_genes, TRUE), gs),
92
-    #    simplify = FALSE) %>% set_names(kids)
99
+    gs_by_k <- replicate(nk, gs_by_k) %>% set_colnames(kids)
100
+
101
+    # impute type-genes
102
+    if (p_type != 0)
103
+        gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)
104
+
105
+    # split by cluster & categroy
106
+    gs_by_k <- gs_by_k %>% split(col(.)) %>% 
107
+        set_names(kids) %>% map(set_names, gs)
93 108
     gs_by_kc <- lapply(kids, function(k) 
94 109
         lapply(cats, function(c)
95 110
             gs_by_k[[k]][gs_idx[[c, k]]]) %>% 
... ...
@@ -108,12 +123,17 @@ simData <- function(x, n_genes = 500, n_cells = 300,
108 123
         set_rownames(cats)
109 124
     
110 125
     # compute NB parameters
111
-    b <- exp(rowData(x)$beta)
112 126
     o <- exp(colData(x)$offset)
113
-    m <- vapply(o, function(l) b*l, numeric(nrow(x)))
114
-    dimnames(m) <- dimnames(x)
115
-    d <- rowData(x)$dispersion
116
-    names(d) <- rownames(x)
127
+    m <- lapply(sids, function(s) {
128
+        cn <- paste("beta", s, sep = ".")
129
+        k <- grep(cn, names(rowData(x)))
130
+        b <- exp(rowData(x)[[k]])
131
+        vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>% 
132
+            set_rownames(rownames(x)) %>% 
133
+            set_colnames(colnames(x))
134
+    })
135
+    d <- rowData(x)$dispersion %>% 
136
+        set_names(rownames(x))
117 137
     
118 138
     sim_mean <- lapply(kids, function(k) 
119 139
         lapply(gids, function(g)
... ...
@@ -133,24 +153,30 @@ simData <- function(x, n_genes = 500, n_cells = 300,
133 153
                 cs_g1 <- sample(cs_ks, ng1, replace = TRUE)
134 154
                 cs_g2 <- sample(cs_ks, ng2, replace = TRUE)
135 155
                 
136
-                m_g1 <- m[gs_kc, cs_g1, drop = FALSE]
137
-                m_g2 <- m[gs_kc, cs_g2, drop = FALSE]
156
+                m_g1 <- m[[s]][gs_kc, cs_g1, drop = FALSE]
157
+                m_g2 <- m[[s]][gs_kc, cs_g2, drop = FALSE]
138 158
                 d_kc <- d[gs_kc]
139 159
                 lfc_kc <- lfc[[c, k]]
140 160
                 
141 161
                 gidx <- gs_idx[[c, k]]
142 162
                 cidx <- c(g1, g2)
143 163
                 
144
-                counts <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d = d_kc, lfc = lfc_kc)
145
-                y[gidx, cidx] <- counts
146
-                sim_mean[[k]]$A[gidx] <- rowMeans(m_g1) # ... * lfc ??
147
-                sim_mean[[k]]$B[gidx] <- rowMeans(m_g2)
164
+                re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc)
165
+                y[gidx, cidx] <- re$cs
166
+                sim_mean[[k]]$A[gidx] <- re$ms$A
167
+                sim_mean[[k]]$B[gidx] <- re$ms$B
148 168
             }
149 169
         }
150 170
     }
151 171
     
152
-    # construct gene metadata table storing
153
-    # gene | cluster_id | category | logFC
172
+    sim_mean <- sim_mean %>%
173
+        map(bind_cols) %>% 
174
+        bind_rows(.id = "cluster_id") %>% 
175
+        mutate_at("cluster_id", factor) %>% 
176
+        mutate(gene = rep(gs, nk))
177
+    
178
+    # construct gene metadata table storing ------------------------------------
179
+    # gene | cluster_id | category | logFC, gene, disp, mean used for sim.
154 180
     gi <- data.frame(
155 181
         gene = unlist(gs_idx),
156 182
         cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)),
... ...
@@ -159,15 +185,13 @@ simData <- function(x, n_genes = 500, n_cells = 300,
159 185
         sim_gene = unlist(gs_by_kc),
160 186
         sim_disp = d[unlist(gs_by_kc)]) %>% 
161 187
         mutate_at("gene", as.character)
188
+    # add true simulation means
189
+    gi <- full_join(gi, sim_mean, by = c("gene", "cluster_id")) %>% 
190
+        rename("sim_mean.A" = "A", "sim_mean.B" = "B")
191
+    # reorder
162 192
     o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
163 193
     gi <- gi[o, ] %>% set_rownames(NULL)
164 194
     
165
-    a <- unlist(map_depth(sim_mean, 1, "A"))
166
-    b <- unlist(map_depth(sim_mean, 1, "B"))
167
-    o <- order(as.numeric(gsub(".*\\.[a-z]+", "", names(a))))
168
-    gi$sim_mean.A <- a[o]
169
-    gi$sim_mean.B <- b[o]
170
-    
171 195
     # construct SCE
172 196
     cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
173 197
     m <- match(levels(cd$sample_id), cd$sample_id)
... ...
@@ -187,21 +211,3 @@ simData <- function(x, n_genes = 500, n_cells = 300,
187 211
         colData = cd, 
188 212
         metadata = md)
189 213
 }
190
-
191
-
192
-
193
-
194
-
195
-
196
-
197
-
198
-
199
-
200
-
201
-
202
-
203
-
204
-
205
-
206
-
207
-
Browse code

add gs used for sim to output sce

HelenaLC authored on 12/03/2019 08:53:17
Showing 1 changed files
... ...
@@ -22,8 +22,8 @@
22 22
 #'   containing multiple clusters & samples across 2 groups.
23 23
 #' 
24 24
 #' @examples
25
-#' data(kang)
26
-#' simData(kang,
25
+#' data(sce)
26
+#' simData(sce,
27 27
 #'     n_genes = 10, n_cells = 10,
28 28
 #'     p_dd = diag(6)[1, ])
29 29
 #' 
... ...
@@ -49,14 +49,14 @@ simData <- function(x, n_genes = 500, n_cells = 300,
49 49
     # c: gene category
50 50
     
51 51
     # check validity of input arguments
52
-    stopifnot(is(x, "SingleCellExperiment"))
52
+    .check_sce(x, req_group = FALSE)
53 53
     stopifnot(is.numeric(n_genes), length(n_genes) == 1)
54 54
     stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
55 55
     stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
56 56
     stopifnot(is.numeric(fc), is.numeric(fc), fc > 1)
57 57
     
58
-    kids <- levels(colData(x)$cluster_id)
59
-    sids <- levels(colData(x)$sample_id)
58
+    kids <- levels(x$cluster_id)
59
+    sids <- levels(x$sample_id)
60 60
     gids <- c("A", "B")
61 61
     names(kids) <- kids
62 62
     names(sids) <- sids
... ...
@@ -70,7 +70,7 @@ simData <- function(x, n_genes = 500, n_cells = 300,
70 70
     
71 71
     # sample cell metadata
72 72
     cd <- .sample_cell_md(
73
-        n = n_cells, probs = NULL,
73
+        n = n_cells, probs = probs,
74 74
         ids = list(kids, sids, gids)) %>% set_rownames(cs)
75 75
     cs_idx <- .split_cells(cd, by = colnames(cd))
76 76
     n_cs <- modify_depth(cs_idx, -1, length)
... ...
@@ -115,6 +115,9 @@ simData <- function(x, n_genes = 500, n_cells = 300,
115 115
     d <- rowData(x)$dispersion
116 116
     names(d) <- rownames(x)
117 117
     
118
+    sim_mean <- lapply(kids, function(k) 
119
+        lapply(gids, function(g)
120
+            setNames(numeric(n_genes), rownames(y))))
118 121
     for (k in kids) {
119 122
         for (s in sids) {
120 123
             for (c in cats[n_dd[, k] != 0]) {
... ...
@@ -135,8 +138,13 @@ simData <- function(x, n_genes = 500, n_cells = 300,
135 138
                 d_kc <- d[gs_kc]
136 139
                 lfc_kc <- lfc[[c, k]]
137 140
                 
141
+                gidx <- gs_idx[[c, k]]
142
+                cidx <- c(g1, g2)
143
+                
138 144
                 counts <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d = d_kc, lfc = lfc_kc)
139
-                y[gs_idx[[c, k]], c(g1, g2)] <- counts
145
+                y[gidx, cidx] <- counts
146
+                sim_mean[[k]]$A[gidx] <- rowMeans(m_g1) # ... * lfc ??
147
+                sim_mean[[k]]$B[gidx] <- rowMeans(m_g2)
140 148
             }
141 149
         }
142 150
     }
... ...
@@ -147,11 +155,19 @@ simData <- function(x, n_genes = 500, n_cells = 300,
147 155
         gene = unlist(gs_idx),
148 156
         cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)),
149 157
         category = rep.int(rep(cats, nk), c(n_dd)),
150
-        logFC = unlist(lfc)) %>% 
158
+        logFC = unlist(lfc),
159
+        sim_gene = unlist(gs_by_kc),
160
+        sim_disp = d[unlist(gs_by_kc)]) %>% 
151 161
         mutate_at("gene", as.character)
152 162
     o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
153 163
     gi <- gi[o, ] %>% set_rownames(NULL)
154 164
     
165
+    a <- unlist(map_depth(sim_mean, 1, "A"))
166
+    b <- unlist(map_depth(sim_mean, 1, "B"))
167
+    o <- order(as.numeric(gsub(".*\\.[a-z]+", "", names(a))))
168
+    gi$sim_mean.A <- a[o]
169
+    gi$sim_mean.B <- b[o]
170
+    
155 171
     # construct SCE
156 172
     cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
157 173
     m <- match(levels(cd$sample_id), cd$sample_id)
Browse code

add vignette

HelenaLC authored on 26/02/2019 11:13:34
Showing 1 changed files
... ...
@@ -9,7 +9,9 @@
9 9
 #' @param n_genes # of genes to simulate. 
10 10
 #' @param n_cells # of cells to simulate. 
11 11
 #'   Either a single numeric or a range to sample from.
12
-#' @param ns nb. of genes common to 1, 2, ..., all clusters.
12
+#' @param probs a list of length 3 containing probabilities of a cell belonging
13
+#'   to each cluster, sample, and group, respectively. List elements must be 
14
+#'   NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.
13 15
 #' @param p_dd numeric vector of length 6.
14 16
 #'   Specifies the probability of a gene being
15 17
 #'   EE, EP, DE, DP, DM, or DB, respectively.
... ...
@@ -23,7 +25,7 @@
23 25
 #' data(kang)
24 26
 #' simData(kang,
25 27
 #'     n_genes = 10, n_cells = 10,
26
-#'     p_dd = c(1,0,0,0,0,0))
28
+#'     p_dd = diag(6)[1, ])
27 29
 #' 
28 30
 #' @importFrom data.table data.table
29 31
 #' @importFrom dplyr mutate_all mutate_at
Browse code

add limma-voom

HelenaLC authored on 18/02/2019 09:05:33
Showing 1 changed files
... ...
@@ -82,12 +82,12 @@ simData <- function(x, n_genes = 500, n_cells = 300,
82 82
         set_colnames(kids)
83 83
     gs_idx <- .sample_gene_inds(gs, n_dd)
84 84
     
85
-    # for ea. cluster, sample unique set of genes to simulate from
86
-    #tmp <- setNames(sample(rownames(x), n_genes, TRUE), gs)
87
-    #gs_by_k <- replicate(nk, tmp, simplify = FALSE) %>% set_names(kids)
88
-    gs_by_k <- replicate(nk, 
89
-        setNames(sample(rownames(x), n_genes, TRUE), gs),
90
-        simplify = FALSE) %>% set_names(kids)
85
+    # for ea. cluster, sample set of genes to simulate from
86
+    gs_by_k <- setNames(sample(rownames(x), n_genes, TRUE), gs)
87
+    gs_by_k <- replicate(nk, gs_by_k, simplify = FALSE) %>% set_names(kids)
88
+    #gs_by_k <- replicate(nk, 
89
+    #    setNames(sample(rownames(x), n_genes, TRUE), gs),
90
+    #    simplify = FALSE) %>% set_names(kids)
91 91
     gs_by_kc <- lapply(kids, function(k) 
92 92
         lapply(cats, function(c)
93 93
             gs_by_k[[k]][gs_idx[[c, k]]]) %>% 
Browse code

update unit-test for runMAST()

HelenaLC authored on 11/02/2019 07:41:21
Showing 1 changed files
... ...
@@ -35,11 +35,11 @@
35 35
 #' @importFrom SummarizedExperiment colData
36 36
 #' @importFrom S4Vectors split
37 37
 #' @importFrom tibble column_to_rownames
38
-#' @importFrom zeallot %<-%
39 38
 #' 
40 39
 #' @export
41 40
 
42
-simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6)[1, ], fc = 2) {
41
+simData <- function(x, n_genes = 500, n_cells = 300, 
42
+    probs = NULL, p_dd = diag(6)[1, ], fc = 2) {
43 43
     
44 44
     # throughout this code...
45 45
     # k: cluster ID
... ...
@@ -83,6 +83,8 @@ simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6
83 83
     gs_idx <- .sample_gene_inds(gs, n_dd)
84 84
     
85 85
     # for ea. cluster, sample unique set of genes to simulate from
86
+    #tmp <- setNames(sample(rownames(x), n_genes, TRUE), gs)
87
+    #gs_by_k <- replicate(nk, tmp, simplify = FALSE) %>% set_names(kids)
86 88
     gs_by_k <- replicate(nk, 
87 89
         setNames(sample(rownames(x), n_genes, TRUE), gs),
88 90
         simplify = FALSE) %>% set_names(kids)
Browse code

all things SE

HelenaLC authored on 06/02/2019 11:32:43
Showing 1 changed files
... ...
@@ -28,7 +28,8 @@
28 28
 #' @importFrom data.table data.table
29 29
 #' @importFrom dplyr mutate_all mutate_at
30 30
 #' @importFrom edgeR DGEList estimateDisp glmFit
31
-#' @importFrom purrr modify_at
31
+#' @importFrom magrittr set_colnames set_rownames
32
+#' @importFrom purrr modify_at set_names
32 33
 #' @importFrom stats model.matrix rgamma setNames
33 34
 #' @importFrom SingleCellExperiment SingleCellExperiment
34 35
 #' @importFrom SummarizedExperiment colData
Browse code

add devel branch

HelenaLC authored on 05/02/2019 05:36:12
Showing 1 changed files
... ...
@@ -142,8 +142,6 @@ simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6
142 142
         gene = unlist(gs_idx),
143 143
         cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)),
144 144
         category = rep.int(rep(cats, nk), c(n_dd)),
145
-        # mean = ,
146
-        # disp = ,
147 145
         logFC = unlist(lfc)) %>% 
148 146
         mutate_at("gene", as.character)
149 147
     o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
... ...
@@ -161,7 +159,7 @@ simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6
161 159
     md <- list(
162 160
         experiment_info = ei,
163 161
         n_cells = table(cd$sample_id),
164
-        gene_info = gi, sim_genes = gs_in)
162
+        gene_info = gi)
165 163
     
166 164
     SingleCellExperiment(
167 165
         assays = list(counts = as.matrix(y)),
Browse code

add drop=FALSE

HelenaLC authored on 31/01/2019 16:45:38
Showing 1 changed files
... ...
@@ -125,8 +125,8 @@ simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6
125 125
                 cs_g1 <- sample(cs_ks, ng1, replace = TRUE)
126 126
                 cs_g2 <- sample(cs_ks, ng2, replace = TRUE)
127 127
                 
128
-                m_g1 <- m[gs_kc, cs_g1]
129
-                m_g2 <- m[gs_kc, cs_g2]
128
+                m_g1 <- m[gs_kc, cs_g1, drop = FALSE]
129
+                m_g2 <- m[gs_kc, cs_g2, drop = FALSE]
130 130
                 d_kc <- d[gs_kc]
131 131
                 lfc_kc <- lfc[[c, k]]
132 132
                 
Browse code

add unit-tested helpers

HelenaLC authored on 31/01/2019 13:53:22
Showing 1 changed files
... ...
@@ -15,25 +15,35 @@
15 15
 #'   EE, EP, DE, DP, DM, or DB, respectively.
16 16
 #' @param fc numeric value to use as mean logFC
17 17
 #'   for DE, DP, DM, and DB type of genes.
18
-#' @param seed random seed. 
18
+#' 
19
+#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
20
+#'   containing multiple clusters & samples across 2 groups.
19 21
 #' 
20 22
 #' @examples
21 23
 #' data(kang)
22 24
 #' simData(kang,
23 25
 #'     n_genes = 10, n_cells = 10,
24
-#'     p_dd = c(1,0,0,0,0,0), seed = 1)
26
+#'     p_dd = c(1,0,0,0,0,0))
25 27
 #' 
26 28
 #' @importFrom data.table data.table
29
+#' @importFrom dplyr mutate_all mutate_at
27 30
 #' @importFrom edgeR DGEList estimateDisp glmFit
31
+#' @importFrom purrr modify_at
28 32
 #' @importFrom stats model.matrix rgamma setNames
29 33
 #' @importFrom SingleCellExperiment SingleCellExperiment
30 34
 #' @importFrom SummarizedExperiment colData
31 35
 #' @importFrom S4Vectors split
36
+#' @importFrom tibble column_to_rownames
32 37
 #' @importFrom zeallot %<-%
33 38
 #' 
34 39
 #' @export
35 40
 
36
-simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
41
+simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6)[1, ], fc = 2) {
42
+    
43
+    # throughout this code...
44
+    # k: cluster ID
45
+    # s: sample ID
46
+    # c: gene category
37 47
     
38 48
     # check validity of input arguments
39 49
     stopifnot(is(x, "SingleCellExperiment"))
... ...
@@ -41,157 +51,138 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
41 51
     stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
42 52
     stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
43 53
     stopifnot(is.numeric(fc), is.numeric(fc), fc > 1)
44
-    stopifnot(is.numeric(seed), length(seed) == 1)
45 54
     
46
-    cluster_ids <- levels(colData(x)$cluster_id)
47
-    sample_ids <- levels(colData(x)$sample_id)
48
-    n_clusters <- length(cluster_ids)
49
-    n_samples <- length(sample_ids)
50
-    
51
-    # split cells by cluster-sample
52
-    dt <- data.table(
53
-        cell = colnames(x), 
54
-        cluster_id = colData(x)$cluster_id,
55
-        sample_id = colData(x)$sample_id)
56
-    dt_split <- split(dt,
57
-        by = c("cluster_id", "sample_id"), 
58
-        keep.by = FALSE, flatten = FALSE)
59
-    cells_by_cluster_sample <- sapply(dt_split, sapply, "[[", "cell")
60
-    
61
-    # sample nb. of cells to simulate per cluster-sample
62
-    if (length(n_cells) == 1) {
63
-        n_cells <- list(rep(n_cells, 2))
64
-    } else {
65
-        n_cells <- replicate(n_clusters * n_samples, 
66
-            list(sample(n_cells[1]:n_cells[2], 2)))
67
-    }
68
-    n_cells <- matrix(n_cells, 
69
-        nrow = n_samples, ncol = n_clusters, 
70
-        dimnames = list(sample_ids, cluster_ids))
55
+    kids <- levels(colData(x)$cluster_id)
56
+    sids <- levels(colData(x)$sample_id)
57
+    gids <- c("A", "B")
58
+    names(kids) <- kids
59
+    names(sids) <- sids
60
+    names(gids) <- gids
61
+    nk <- length(kids)
71 62
     
72 63
     # initialize count matrix
73
-    y <- matrix(0, 
74
-        nrow = n_genes, 
75
-        ncol = sum(unlist(n_cells)),
76
-        dimnames = list(
77
-            paste0("gene", seq_len(n_genes)),
78
-            paste0("cell", seq_len(sum(unlist(n_cells))))))
79
-    
80
-    # sample nb. of genes to simulate per category
81
-    ndd <- replicate(n_clusters, {
82
-        ns <- sample(cats, n_genes, replace = TRUE, prob = p_dd)
83
-        factor(ns, levels = cats) 
84
-    }, simplify = FALSE)
85
-    ndd <- sapply(ndd, table)
86
-    colnames(ndd) <- cluster_ids
87
-    
88
-    # sample gene indices
89
-    is <- sapply(cluster_ids, function(c, gs = rownames(y))
90
-        sapply(cats, function(cat) { 
91
-            n <- ndd[cat, c]
92
-            x <- sample(gs, n)
93
-            gs <<- setdiff(gs, x)
94
-            return(x) }))
95
-    
96
-    # sample cell indices
97
-    cs <- colnames(y)
98
-    js <- sapply(cluster_ids, function(c) 
99
-        setNames(lapply(sample_ids, function(s)
100
-            lapply(n_cells[[s, c]], function(n) {
101
-                x <- sample(cs, n)
102
-                cs <<- setdiff(cs, x)
103
-                return(x) })), sample_ids))
104
-    
105
-    # sample genes to simulate from
106
-    gs <- replicate(n_clusters, sample(rownames(x), n_genes, replace = TRUE))
107
-    rownames(gs) <- rownames(y)
108
-    colnames(gs) <- cluster_ids
109
-    
110
-    # sample fold-changes
111
-    lfcs <- sapply(cluster_ids, function(k) 
112
-        sapply(cats, function(c) { 
113
-            n <- ndd[c, k]
64
+    gs <- paste0("gene", seq_len(n_genes))
65
+    cs <- paste0("cell", seq_len(n_cells))
66
+    y <- matrix(0, n_genes, n_cells, dimnames = list(gs, cs))
67
+    
68
+    # sample cell metadata
69
+    cd <- .sample_cell_md(
70
+        n = n_cells, probs = NULL,
71
+        ids = list(kids, sids, gids)) %>% set_rownames(cs)
72
+    cs_idx <- .split_cells(cd, by = colnames(cd))
73
+    n_cs <- modify_depth(cs_idx, -1, length)
74
+    
75
+    # split input cells by cluster-sample
76
+    cs_by_ks <- .split_cells(x)
77
+    
78
+    # sample nb. of genes to simulate per category & gene indices
79
+    n_dd <- replicate(nk, 
80
+        table(sample(factor(cats, levels = cats), n_genes, TRUE, p_dd))) %>% 
81
+        set_colnames(kids)
82
+    gs_idx <- .sample_gene_inds(gs, n_dd)
83
+    
84
+    # for ea. cluster, sample unique set of genes to simulate from
85
+    gs_by_k <- replicate(nk, 
86
+        setNames(sample(rownames(x), n_genes, TRUE), gs),
87
+        simplify = FALSE) %>% set_names(kids)
88
+    gs_by_kc <- lapply(kids, function(k) 
89
+        lapply(cats, function(c)
90
+            gs_by_k[[k]][gs_idx[[c, k]]]) %>% 
91
+            set_names(cats))
92
+    
93
+    # sample logFCs
94
+    lfc <- vapply(kids, function(k) 
95
+        lapply(cats, function(c) { 
96
+            n <- n_dd[c, k]
114 97
             if (c == "ee") return(rep(NA, n))
115
-            signs <- sample(c(-1, 1), size = n, replace = TRUE)
116
-            lfcs <- rgamma(n, 4, 4 / fc) * signs
117
-            names(lfcs) <- gs[is[[c, k]], k]
118
-            return(lfcs)
119
-    }))
120
-    
121
-    for (k in cluster_ids) {
122
-        # get NB parameters
123
-        m <- rowData(x)[gs[, k], ]$beta
124
-        d <- rowData(x)[gs[, k], ]$dispersion
125
-        names(m) <- names(d) <- gs[, k]
126
-        
127
-        for (s in sample_ids) {
128
-            # cells to simulate from
129
-            cs <- cells_by_cluster_sample[[s, k]]
130
-            
131
-            # compute mus
132
-            o <- setNames(colData(x)[cs, ]$offset, cs)
133
-            mu <- sapply(exp(o), "*", exp(m))
134
-            
135
-            # get cell indices & nb. of cells by group
136
-            ng1 <- length(g1 <- js[[s, k]][[1]])
137
-            ng2 <- length(g2 <- js[[s, k]][[2]])
138
-            
139
-            # simulate data
140
-            for (c in cats)
141
-                if (ndd[c, k] > 0) y[is[[c, k]], c(g1, g2)] <- 
142
-                simdd(c, gs[is[[c, k]], k], cs, ng1, ng2, mu, d, lfcs[[c, k]])
98
+            signs <- sample(c(-1, 1), n, TRUE)
99
+            lfc <- rgamma(n, 4, 4/fc) * signs
100
+            names(lfc) <- gs_by_kc[[k]][[c]]
101
+            return(lfc)
102
+        }), vector("list", length(cats))) %>% 
103
+        set_rownames(cats)
104
+    
105
+    # compute NB parameters
106
+    b <- exp(rowData(x)$beta)
107
+    o <- exp(colData(x)$offset)
108
+    m <- vapply(o, function(l) b*l, numeric(nrow(x)))
109
+    dimnames(m) <- dimnames(x)
110
+    d <- rowData(x)$dispersion
111
+    names(d) <- rownames(x)
112
+    
113
+    for (k in kids) {
114
+        for (s in sids) {
115
+            for (c in cats[n_dd[, k] != 0]) {
116
+                gs_kc <- gs_by_kc[[k]][[c]]
117
+                cs_ks <- cs_by_ks[[k]][[s]]
118
+                
119
+                g1 <- cs_idx[[k]][[s]]$A
120
+                g2 <- cs_idx[[k]][[s]]$B
121
+                
122
+                ng1 <- length(g1)
123
+                ng2 <- length(g2) 
124
+                
125
+                cs_g1 <- sample(cs_ks, ng1, replace = TRUE)
126
+                cs_g2 <- sample(cs_ks, ng2, replace = TRUE)
127
+                
128
+                m_g1 <- m[gs_kc, cs_g1]
129
+                m_g2 <- m[gs_kc, cs_g2]
130
+                d_kc <- d[gs_kc]
131
+                lfc_kc <- lfc[[c, k]]
132
+                
133
+                counts <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d = d_kc, lfc = lfc_kc)
134
+                y[gs_idx[[c, k]], c(g1, g2)] <- counts
135
+            }
143 136
         }
144 137
     }
145 138
     
146
-    # construct SCE
147
-    gi <- do.call(rbind, lapply(cluster_ids, function(k)
148
-        do.call(rbind, lapply(cats, function(c) if (ndd[c, k] != 0)
149
-            data.frame(
150
-                gene = is[[c, k]], cluster_id = k, 
151
-                category = c, logFC = lfcs[[c, k]])))))
152
-    gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ]
153
-    gi$category <- factor(gi$category, levels = ddSingleCell:::cats)
154
-    rownames(gi) <- NULL
139
+    # construct gene metadata table storing
140
+    # gene | cluster_id | category | logFC
141
+    gi <- data.frame(
142
+        gene = unlist(gs_idx),
143
+        cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)),
144
+        category = rep.int(rep(cats, nk), c(n_dd)),
145
+        # mean = ,
146
+        # disp = ,
147
+        logFC = unlist(lfc)) %>% 
148
+        mutate_at("gene", as.character)
149
+    o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
150
+    gi <- gi[o, ] %>% set_rownames(NULL)
155 151
     
156
-    col_data <- do.call(rbind, lapply(cluster_ids, function(c) 
157
-        do.call(rbind, lapply(sample_ids, function(s) 
158
-            data.frame(
159
-                row.names = 1,
160
-                unlist(js[[s, c]]), 
161
-                cluster_id = c, sample_id = s, 
162
-                group_id = rep.int(c("A", "B"), n_cells[[s, c]]))))))
163
-    col_data <- col_data[colnames(y), ]
164
-    col_data$sample_id <- factor(paste(col_data$group_id, col_data$sample_id, sep = "."))
152
+    # construct SCE
153
+    cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
154
+    m <- match(levels(cd$sample_id), cd$sample_id)
155
+    gids <- cd$group_id[m]
156
+    o <- order(gids)
157
+    sids <- levels(cd$sample_id)[o]
158
+    ei <- data.frame(sample_id = sids, group_id = gids[o])
159
+    cd <- cd %>% mutate_at("sample_id", factor, levels = sids)
165 160
     
166
-    sample_id <- levels(col_data$sample_id)
167
-    group_id <- gsub("(A|B)[.].*", "\\1", sample_id)
168
-    ei <- data.frame(sample_id, group_id)
169 161
     md <- list(
170 162
         experiment_info = ei,
171
-        n_cells = table(col_data$sample_id),
172
-        gene_info = gi, sim_genes = gs)
163
+        n_cells = table(cd$sample_id),
164
+        gene_info = gi, sim_genes = gs_in)
173 165
     
174 166
     SingleCellExperiment(
175
-        assays = list(counts = y),
176
-        colData = col_data, 
167
+        assays = list(counts = as.matrix(y)),
168
+        colData = cd, 
177 169
         metadata = md)
178 170
 }
179
-    
180
-    
181
-    
182
-    
183
-    
184
-    
185
-    
186
-    
187
-    
188
-    
189
-    
190
-    
191
-    
192
-    
193
-    
194
-    
195
-    
196
-    
197
-    
198 171
\ No newline at end of file
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
Browse code

sapply>vapply

HelenaLC authored on 23/01/2019 16:20:14
Showing 1 changed files
... ...
@@ -13,6 +13,8 @@
13 13
 #' @param p_dd numeric vector of length 6.
14 14
 #'   Specifies the probability of a gene being
15 15
 #'   EE, EP, DE, DP, DM, or DB, respectively.
16
+#' @param fc numeric value to use as mean logFC
17
+#'   for DE, DP, DM, and DB type of genes.
16 18
 #' @param seed random seed. 
17 19
 #' 
18 20
 #' @examples
... ...
@@ -21,10 +23,11 @@
21 23
 #'     n_genes = 10, n_cells = 10,
22 24
 #'     p_dd = c(1,0,0,0,0,0), seed = 1)
23 25
 #' 
24
-#' @import SingleCellExperiment 
25 26
 #' @importFrom data.table data.table
26 27
 #' @importFrom edgeR DGEList estimateDisp glmFit
27
-#' @importFrom stats model.matrix rnbinom setNames
28
+#' @importFrom stats model.matrix rgamma setNames
29
+#' @importFrom SingleCellExperiment SingleCellExperiment
30
+#' @importFrom SummarizedExperiment colData
28 31
 #' @importFrom S4Vectors split
29 32
 #' @importFrom zeallot %<-%
30 33
 #' 
... ...
@@ -33,7 +36,7 @@
33 36
 simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
34 37
     
35 38
     # check validity of input arguments
36
-    stopifnot(class(x) == "SingleCellExperiment")
39
+    stopifnot(is(x, "SingleCellExperiment"))
37 40
     stopifnot(is.numeric(n_genes), length(n_genes) == 1)
38 41
     stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
39 42
     stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
... ...
@@ -108,35 +111,35 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
108 111
     lfcs <- sapply(cluster_ids, function(k) 
109 112
         sapply(cats, function(c) { 
110 113
             n <- ndd[c, k]
111
-            if (c %in% c("ee", "ep")) return(rep(NA, n))
114
+            if (c == "ee") return(rep(NA, n))
112 115
             signs <- sample(c(-1, 1), size = n, replace = TRUE)
113 116
             lfcs <- rgamma(n, 4, 4 / fc) * signs
114 117
             names(lfcs) <- gs[is[[c, k]], k]
115 118
             return(lfcs)
116 119
     }))
117 120
     
118
-    for (c in cluster_ids) {
121
+    for (k in cluster_ids) {
119 122
         # get NB parameters
120
-        m <- rowData(x)[gs[, c], ]$beta
121
-        d <- rowData(x)[gs[, c], ]$dispersion
122
-        names(m) <- names(d) <- gs[, c]
123
+        m <- rowData(x)[gs[, k], ]$beta
124
+        d <- rowData(x)[gs[, k], ]$dispersion
125
+        names(m) <- names(d) <- gs[, k]
123 126
         
124 127
         for (s in sample_ids) {
125 128
             # cells to simulate from
126
-            cs <- cells_by_cluster_sample[[s, c]]
129
+            cs <- cells_by_cluster_sample[[s, k]]
127 130
             
128 131
             # compute mus
129 132
             o <- setNames(colData(x)[cs, ]$offset, cs)
130 133
             mu <- sapply(exp(o), "*", exp(m))
131 134
             
132 135
             # get cell indices & nb. of cells by group
133
-            ng1 <- length(g1 <- js[[s, c]][[1]])
134
-            ng2 <- length(g2 <- js[[s, c]][[2]])
136
+            ng1 <- length(g1 <- js[[s, k]][[1]])
137
+            ng2 <- length(g2 <- js[[s, k]][[2]])
135 138
             
136 139
             # simulate data
137
-            for (cat in cats)
138
-                if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <- 
139
-                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, lfcs[[cat, c]])
140
+            for (c in cats)
141
+                if (ndd[c, k] > 0) y[is[[c, k]], c(g1, g2)] <- 
142
+                simdd(c, gs[is[[c, k]], k], cs, ng1, ng2, mu, d, lfcs[[c, k]])
140 143
         }
141 144
     }
142 145
     
... ...
@@ -156,13 +159,13 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
156 159
                 row.names = 1,
157 160
                 unlist(js[[s, c]]), 
158 161
                 cluster_id = c, sample_id = s, 
159
-                group = rep.int(c("A", "B"), n_cells[[s, c]]))))))
162
+                group_id = rep.int(c("A", "B"), n_cells[[s, c]]))))))
160 163
     col_data <- col_data[colnames(y), ]
161
-    col_data$sample_id <- factor(paste(col_data$group, col_data$sample_id, sep = "."))
164
+    col_data$sample_id <- factor(paste(col_data$group_id, col_data$sample_id, sep = "."))
162 165
     
163 166
     sample_id <- levels(col_data$sample_id)
164
-    group <- gsub("(A|B)[.].*", "\\1", sample_id)
165
-    ei <- data.frame(sample_id, group)
167
+    group_id <- gsub("(A|B)[.].*", "\\1", sample_id)
168
+    ei <- data.frame(sample_id, group_id)
166 169
     md <- list(
167 170
         experiment_info = ei,
168 171
         n_cells = table(col_data$sample_id),
Browse code

add logFC columns to metadata

HelenaLC authored on 15/01/2019 12:40:41
Showing 1 changed files
... ...
@@ -105,13 +105,14 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
105 105
     colnames(gs) <- cluster_ids
106 106
     
107 107
     # sample fold-changes
108
-    fcs <- sapply(cluster_ids, function(k) 
108
+    lfcs <- sapply(cluster_ids, function(k) 
109 109
         sapply(cats, function(c) { 
110 110
             n <- ndd[c, k]
111
+            if (c %in% c("ee", "ep")) return(rep(NA, n))
111 112
             signs <- sample(c(-1, 1), size = n, replace = TRUE)
112
-            fcs <- 2 ^ ( rgamma(n, 4, 4 / fc) * signs )
113
-            names(fcs) <- gs[is[[c, k]], k]
114
-            return(fcs)
113
+            lfcs <- rgamma(n, 4, 4 / fc) * signs
114
+            names(lfcs) <- gs[is[[c, k]], k]
115
+            return(lfcs)
115 116
     }))
116 117
     
117 118
     for (c in cluster_ids) {
... ...
@@ -135,14 +136,16 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
135 136
             # simulate data
136 137
             for (cat in cats)
137 138
                 if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <- 
138
-                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fcs[[cat, c]])
139
+                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, lfcs[[cat, c]])
139 140
         }
140 141
     }
141 142
     
142 143
     # construct SCE
143
-    gi <- do.call(rbind, lapply(cluster_ids, function(c)
144
-        do.call(rbind, lapply(cats, function(cat) if (ndd[cat, c] != 0)
145
-            data.frame(gene = is[[cat, c]], cluster_id = c, category = cat)))))
144
+    gi <- do.call(rbind, lapply(cluster_ids, function(k)
145
+        do.call(rbind, lapply(cats, function(c) if (ndd[c, k] != 0)
146
+            data.frame(
147
+                gene = is[[c, k]], cluster_id = k, 
148
+                category = c, logFC = lfcs[[c, k]])))))
146 149
     gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ]
147 150
     gi$category <- factor(gi$category, levels = ddSingleCell:::cats)
148 151
     rownames(gi) <- NULL
Browse code

fix usage of FCs

HelenaLC authored on 15/01/2019 12:30:50
Showing 1 changed files
... ...
@@ -109,8 +109,9 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
109 109
         sapply(cats, function(c) { 
110 110
             n <- ndd[c, k]
111 111
             signs <- sample(c(-1, 1), size = n, replace = TRUE)
112
-            fcs <- 2 ^ ( rgamma(n, 4 , 4 / fc) * signs )
113
-            setNames(fcs, gs[is[[c, k]], k]) 
112
+            fcs <- 2 ^ ( rgamma(n, 4, 4 / fc) * signs )
113
+            names(fcs) <- gs[is[[c, k]], k]
114
+            return(fcs)
114 115
     }))
115 116
     
116 117
     for (c in cluster_ids) {
Browse code

major revision of simulation

HelenaLC authored on 14/01/2019 15:21:44
Showing 1 changed files
... ...
@@ -32,11 +32,12 @@
32 32
 
33 33
 simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
34 34
     
35
+    # check validity of input arguments
35 36
     stopifnot(class(x) == "SingleCellExperiment")
36 37
     stopifnot(is.numeric(n_genes), length(n_genes) == 1)
37 38
     stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
38 39
     stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
39
-    stopifnot(is.numeric(fc), is.numeric(fc))
40
+    stopifnot(is.numeric(fc), is.numeric(fc), fc > 1)
40 41
     stopifnot(is.numeric(seed), length(seed) == 1)
41 42
     
42 43
     cluster_ids <- levels(colData(x)$cluster_id)
... ...
@@ -47,11 +48,12 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
47 48
     # split cells by cluster-sample
48 49
     dt <- data.table(
49 50
         cell = colnames(x), 
50
-        data.frame(colData(x)))
51
-    cells <- dt %>% split(
51
+        cluster_id = colData(x)$cluster_id,
52
+        sample_id = colData(x)$sample_id)
53
+    dt_split <- split(dt,
52 54
         by = c("cluster_id", "sample_id"), 
53 55
         keep.by = FALSE, flatten = FALSE)
54
-    cells <- sapply(cells, sapply, "[[", "cell")
56
+    cells_by_cluster_sample <- sapply(dt_split, sapply, "[[", "cell")
55 57
     
56 58
     # sample nb. of cells to simulate per cluster-sample
57 59
     if (length(n_cells) == 1) {
... ...
@@ -98,10 +100,19 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
98 100
                 return(x) })), sample_ids))
99 101
     
100 102
     # sample genes to simulate from
101
-    gs <- replicate(n_clusters, sample(rownames(x), n_genes))
103
+    gs <- replicate(n_clusters, sample(rownames(x), n_genes, replace = TRUE))
102 104
     rownames(gs) <- rownames(y)
103 105
     colnames(gs) <- cluster_ids
104 106
     
107
+    # sample fold-changes
108
+    fcs <- sapply(cluster_ids, function(k) 
109
+        sapply(cats, function(c) { 
110
+            n <- ndd[c, k]
111
+            signs <- sample(c(-1, 1), size = n, replace = TRUE)
112
+            fcs <- 2 ^ ( rgamma(n, 4 , 4 / fc) * signs )
113
+            setNames(fcs, gs[is[[c, k]], k]) 
114
+    }))
115
+    
105 116
     for (c in cluster_ids) {
106 117
         # get NB parameters
107 118
         m <- rowData(x)[gs[, c], ]$beta
... ...
@@ -110,7 +121,7 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
110 121
         
111 122
         for (s in sample_ids) {
112 123
             # cells to simulate from
113
-            cs <- cells[[s, c]]
124
+            cs <- cells_by_cluster_sample[[s, c]]
114 125
             
115 126
             # compute mus
116 127
             o <- setNames(colData(x)[cs, ]$offset, cs)
... ...
@@ -123,16 +134,16 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
123 134
             # simulate data
124 135
             for (cat in cats)
125 136
                 if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <- 
126
-                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fc)
137
+                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fcs[[cat, c]])
127 138
         }
128 139
     }
129 140
     
130 141
     # construct SCE
131 142
     gi <- do.call(rbind, lapply(cluster_ids, function(c)
132 143
         do.call(rbind, lapply(cats, function(cat) if (ndd[cat, c] != 0)
133
-            data.frame(genes = is[[cat, c]], cluster_id = c, category = cat)))))
144
+            data.frame(gene = is[[cat, c]], cluster_id = c, category = cat)))))
134 145
     gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ]
135
-    levels(gi$category) <- c("ee", "ep", "de", "dp", "dm", "db")
146
+    gi$category <- factor(gi$category, levels = ddSingleCell:::cats)
136 147
     rownames(gi) <- NULL
137 148
     
138 149
     col_data <- do.call(rbind, lapply(cluster_ids, function(c) 
Browse code

re-write of aggregateData()

HelenaLC authored on 08/01/2019 11:05:05
Showing 1 changed files
... ...
@@ -16,21 +16,28 @@
16 16
 #' @param seed random seed. 
17 17
 #' 
18 18
 #' @examples
19
-#' data(kang_se, kang_fit)
20
-#' simDD(kang_se, kang_fit,
19
+#' data(kang)
20
+#' simData(kang,
21 21
 #'     n_genes = 10, n_cells = 10,
22 22
 #'     p_dd = c(1,0,0,0,0,0), seed = 1)
23 23
 #' 
24 24
 #' @import SingleCellExperiment 
25 25
 #' @importFrom data.table data.table
26 26
 #' @importFrom edgeR DGEList estimateDisp glmFit
27
-#' @importFrom stats model.matrix rnbinom
27
+#' @importFrom stats model.matrix rnbinom setNames
28 28
 #' @importFrom S4Vectors split
29 29
 #' @importFrom zeallot %<-%
30 30
 #' 
31 31
 #' @export
32 32
 
33
-simData <- function(x, n_genes, n_cells, p_dd) {
33
+simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
34
+    
35
+    stopifnot(class(x) == "SingleCellExperiment")
36
+    stopifnot(is.numeric(n_genes), length(n_genes) == 1)
37
+    stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
38
+    stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
39
+    stopifnot(is.numeric(fc), is.numeric(fc))
40
+    stopifnot(is.numeric(seed), length(seed) == 1)
34 41
     
35 42
     cluster_ids <- levels(colData(x)$cluster_id)
36 43
     sample_ids <- levels(colData(x)$sample_id)
... ...
@@ -97,19 +104,16 @@ simData <- function(x, n_genes, n_cells, p_dd) {
97 104
     
98 105
     for (c in cluster_ids) {
99 106
         # get NB parameters
100
-        m <- rowData(x)$mean[gs[, c]]
101
-        d <- rowData(x)$dispersion[gs[, c]]
102
-        
103
-        # get gene indices & nb. of genes by category
104
-        c(iee, iep, ide, idp, idm, idb) %<-% sapply(cats, function(i) is[i, c])
105
-        c(nee, nep, nde, ndp, ndm, ndb) %<-% vapply(cats, function(i) ndd[i, c], numeric(1))
107
+        m <- rowData(x)[gs[, c], ]$beta
108
+        d <- rowData(x)[gs[, c], ]$dispersion
109
+        names(m) <- names(d) <- gs[, c]
106 110
         
107 111
         for (s in sample_ids) {
108 112
             # cells to simulate from
109 113
             cs <- cells[[s, c]]
110 114
             
111 115
             # compute mus
112
-            o <- colData(x)$offset[cs]
116
+            o <- setNames(colData(x)[cs, ]$offset, cs)
113 117
             mu <- sapply(exp(o), "*", exp(m))
114 118
             
115 119
             # get cell indices & nb. of cells by group
... ...
@@ -117,12 +121,9 @@ simData <- function(x, n_genes, n_cells, p_dd) {
117 121
             ng2 <- length(g2 <- js[[s, c]][[2]])
118 122
             
119 123
             # simulate data
120
-            if (nee > 0) y[iee, c(g1, g2)] <- simdd("ee", gs[iee, c], cs, ng1, ng2, mu, d)
121
-            if (nep > 0) y[iep, c(g1, g2)] <- simdd("ep", gs[iep, c], cs, ng1, ng2, mu, d)
122
-            if (nde > 0) y[ide, c(g1, g2)] <- simdd("de", gs[ide, c], cs, ng1, ng2, mu, d)
123
-            if (ndp > 0) y[idp, c(g1, g2)] <- simdd("dp", gs[idp, c], cs, ng1, ng2, mu, d)
124
-            if (ndm > 0) y[idm, c(g1, g2)] <- simdd("dm", gs[idm, c], cs, ng1, ng2, mu, d)
125
-            if (ndb > 0) y[idb, c(g1, g2)] <- simdd("db", gs[idb, c], cs, ng1, ng2, mu, d)
124
+            for (cat in cats)
125
+                if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <- 
126
+                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fc)
126 127
         }
127 128
     }
128 129
     
Browse code

add prepData() & aggregateData(), major revision of run_edgeR()

HelenaLC authored on 26/11/2018 20:59:35
Showing 1 changed files
1 1
new file mode 100755
... ...
@@ -0,0 +1,178 @@
1
+#' simData
2
+#' 
3
+#' Simulation of complex scRNA-seq data 
4
+#' 
5
+#' \code{simData} simulates multiple clusters and samples 
6
+#' across 2 experimental conditions from a real scRNA-seq data set.
7
+#' 
8
+#' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
9
+#' @param n_genes # of genes to simulate. 
10
+#' @param n_cells # of cells to simulate. 
11
+#'   Either a single numeric or a range to sample from.
12
+#' @param ns nb. of genes common to 1, 2, ..., all clusters.
13
+#' @param p_dd numeric vector of length 6.
14
+#'   Specifies the probability of a gene being
15
+#'   EE, EP, DE, DP, DM, or DB, respectively.
16
+#' @param seed random seed. 
17
+#' 
18
+#' @examples
19
+#' data(kang_se, kang_fit)
20
+#' simDD(kang_se, kang_fit,
21
+#'     n_genes = 10, n_cells = 10,
22
+#'     p_dd = c(1,0,0,0,0,0), seed = 1)
23
+#' 
24
+#' @import SingleCellExperiment 
25
+#' @importFrom data.table data.table
26
+#' @importFrom edgeR DGEList estimateDisp glmFit
27
+#' @importFrom stats model.matrix rnbinom
28
+#' @importFrom S4Vectors split
29
+#' @importFrom zeallot %<-%
30
+#' 
31
+#' @export
32
+
33
+simData <- function(x, n_genes, n_cells, p_dd) {
34
+    
35
+    cluster_ids <- levels(colData(x)$cluster_id)
36
+    sample_ids <- levels(colData(x)$sample_id)
37
+    n_clusters <- length(cluster_ids)
38
+    n_samples <- length(sample_ids)
39
+    
40
+    # split cells by cluster-sample
41
+    dt <- data.table(
42
+        cell = colnames(x), 
43
+        data.frame(colData(x)))
44
+    cells <- dt %>% split(
45
+        by = c("cluster_id", "sample_id"), 
46
+        keep.by = FALSE, flatten = FALSE)
47
+    cells <- sapply(cells, sapply, "[[", "cell")
48
+    
49
+    # sample nb. of cells to simulate per cluster-sample
50
+    if (length(n_cells) == 1) {
51
+        n_cells <- list(rep(n_cells, 2))
52
+    } else {
53
+        n_cells <- replicate(n_clusters * n_samples, 
54
+            list(sample(n_cells[1]:n_cells[2], 2)))
55
+    }
56
+    n_cells <- matrix(n_cells, 
57
+        nrow = n_samples, ncol = n_clusters, 
58
+        dimnames = list(sample_ids, cluster_ids))
59
+    
60
+    # initialize count matrix
61
+    y <- matrix(0, 
62
+        nrow = n_genes, 
63
+        ncol = sum(unlist(n_cells)),
64
+        dimnames = list(
65
+            paste0("gene", seq_len(n_genes)),
66
+            paste0("cell", seq_len(sum(unlist(n_cells))))))
67
+    
68
+    # sample nb. of genes to simulate per category
69
+    ndd <- replicate(n_clusters, {
70
+        ns <- sample(cats, n_genes, replace = TRUE, prob = p_dd)
71
+        factor(ns, levels = cats) 
72
+    }, simplify = FALSE)
73
+    ndd <- sapply(ndd, table)
74
+    colnames(ndd) <- cluster_ids
75
+    
76
+    # sample gene indices
77
+    is <- sapply(cluster_ids, function(c, gs = rownames(y))
78
+        sapply(cats, function(cat) { 
79
+            n <- ndd[cat, c]
80
+            x <- sample(gs, n)
81
+            gs <<- setdiff(gs, x)
82
+            return(x) }))
83
+    
84
+    # sample cell indices
85
+    cs <- colnames(y)
86
+    js <- sapply(cluster_ids, function(c) 
87
+        setNames(lapply(sample_ids, function(s)
88
+            lapply(n_cells[[s, c]], function(n) {
89
+                x <- sample(cs, n)
90
+                cs <<- setdiff(cs, x)
91
+                return(x) })), sample_ids))
92
+    
93
+    # sample genes to simulate from
94
+    gs <- replicate(n_clusters, sample(rownames(x), n_genes))
95
+    rownames(gs) <- rownames(y)
96
+    colnames(gs) <- cluster_ids
97
+    
98
+    for (c in cluster_ids) {
99
+        # get NB parameters
100
+        m <- rowData(x)$mean[gs[, c]]
101
+        d <- rowData(x)$dispersion[gs[, c]]
102
+        
103
+        # get gene indices & nb. of genes by category
104
+        c(iee, iep, ide, idp, idm, idb) %<-% sapply(cats, function(i) is[i, c])
105
+        c(nee, nep, nde, ndp, ndm, ndb) %<-% vapply(cats, function(i) ndd[i, c], numeric(1))
106
+        
107
+        for (s in sample_ids) {
108
+            # cells to simulate from
109
+            cs <- cells[[s, c]]
110
+            
111
+            # compute mus
112
+            o <- colData(x)$offset[cs]
113
+            mu <- sapply(exp(o), "*", exp(m))
114
+            
115
+            # get cell indices & nb. of cells by group
116
+            ng1 <- length(g1 <- js[[s, c]][[1]])
117
+            ng2 <- length(g2 <- js[[s, c]][[2]])
118
+            
119
+            # simulate data
120
+            if (nee > 0) y[iee, c(g1, g2)] <- simdd("ee", gs[iee, c], cs, ng1, ng2, mu, d)
121
+            if (nep > 0) y[iep, c(g1, g2)] <- simdd("ep", gs[iep, c], cs, ng1, ng2, mu, d)
122
+            if (nde > 0) y[ide, c(g1, g2)] <- simdd("de", gs[ide, c], cs, ng1, ng2, mu, d)
123
+            if (ndp > 0) y[idp, c(g1, g2)] <- simdd("dp", gs[idp, c], cs, ng1, ng2, mu, d)
124
+            if (ndm > 0) y[idm, c(g1, g2)] <- simdd("dm", gs[idm, c], cs, ng1, ng2, mu, d)
125
+            if (ndb > 0) y[idb, c(g1, g2)] <- simdd("db", gs[idb, c], cs, ng1, ng2, mu, d)
126
+        }
127
+    }
128
+    
129
+    # construct SCE
130
+    gi <- do.call(rbind, lapply(cluster_ids, function(c)
131
+        do.call(rbind, lapply(cats, function(cat) if (ndd[cat, c] != 0)
132
+            data.frame(genes = is[[cat, c]], cluster_id = c, category = cat)))))
133
+    gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ]
134
+    levels(gi$category) <- c("ee", "ep", "de", "dp", "dm", "db")
135
+    rownames(gi) <- NULL
136
+    
137
+    col_data <- do.call(rbind, lapply(cluster_ids, function(c) 
138
+        do.call(rbind, lapply(sample_ids, function(s) 
139
+            data.frame(
140
+                row.names = 1,
141
+                unlist(js[[s, c]]), 
142
+                cluster_id = c, sample_id = s, 
143
+                group = rep.int(c("A", "B"), n_cells[[s, c]]))))))
144
+    col_data <- col_data[colnames(y), ]
145
+    col_data$sample_id <- factor(paste(col_data$group, col_data$sample_id, sep = "."))
146
+    
147
+    sample_id <- levels(col_data$sample_id)
148
+    group <- gsub("(A|B)[.].*", "\\1", sample_id)
149
+    ei <- data.frame(sample_id, group)
150
+    md <- list(
151
+        experiment_info = ei,
152
+        n_cells = table(col_data$sample_id),
153
+        gene_info = gi, sim_genes = gs)
154
+    
155
+    SingleCellExperiment(
156
+        assays = list(counts = y),
157
+        colData = col_data, 
158
+        metadata = md)
159
+}
160
+    
161
+    
162
+    
163
+    
164
+    
165
+    
166
+    
167
+    
168
+    
169
+    
170
+    
171
+    
172
+    
173
+    
174
+    
175
+    
176
+    
177
+    
178
+    
0 179
\ No newline at end of file