Bioconductor Code: muscat

Browse code

rename example data from sce -> example_sce

HelenaLC authored on 03/05/2021 07:05:43
Showing 1 changed files

@@ -6,12 +6,27 @@
                                      #' across 2 experimental conditions from a real scRNA-seq data set.
                                      #'
                                      #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
                                     -#' @param nc,ns,nk # of cells, samples and clusters to simulate.
                                     -#'   By default, \code{ns = NULL} will simulated as many samples as
                                     -#'   available in the reference to avoid duplicated reference samples.
                                     +#' @param ng number of genes to simulate. Importantly, for the library sizes
                                     +#'   computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense,
                                     +#'   the number of simulated genes should match with the number of genes
                                     +#'   in the reference. To simulate a reduced number of genes, e.g. for
                                     +#'   testing and development purposes, please set \code{force = TRUE}.
                                     +#' @param nc number of cells to simulate.
                                     +#' @param nk number of clusters to simulate; defaults to the number
                                     +#'   of available reference clusters (\code{nlevels(x$cluster_id)}).
                                     +#' @param ns number of samples to simulate; defaults to as many as
                                     +#'   available in the reference to avoid duplicated reference samples.
                                     +#'   Specifically, the number of samples will be set to
                                     +#'   \code{n = nlevels(x$sample_id)} when \code{dd = FALSE},
                                     +#'   \code{n} per group  when \code{dd, paired = TRUE}, and
                                     +#'   \code{floor(n/2)} per group when \code{dd = TRUE, paired = FALSE}.
                                     +#'   When a larger number of samples should be simulated, set \code{force = TRUE}.
                                      #' @param probs a list of length 3 containing probabilities of a cell belonging
                                      #'   to each cluster, sample, and group, respectively. List elements must be
                                      #'   NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.
                                     +#' @param dd whether or not to simulate differential distributions; if TRUE,
                                     +#'   two groups are simulated and \code{ns} corresponds to the number of
                                     +#'   samples per group, else one group with \code{ns} samples is simulated.
                                      #' @param p_dd numeric vector of length 6.
                                      #'   Specifies the probability of a gene being
                                      #'   EE, EP, DE, DP, DM, or DB, respectively.
@@ -39,17 +54,12 @@
                                      #'   of the branch lengths separating them.
                                      #' @param phylo_pars vector of length 2 providing the parameters that control
                                      #'   the number of type genes. Passed to an exponential PDF (see details).
                                     -#'
                                     -#' @param ng # of genes to simulate. Importantly, for the library sizes
                                     -#'   computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense,
                                     -#'   the number of simulated genes should match with the number of genes
                                     -#'   in the reference. To simulate a reduced number of genes, e.g. for
                                     -#'   testing and development purposes, please set \code{force = TRUE}.
                                     -#' @param force logical specifying whether to force
                                     -#'   simulation despite \code{ng != nrow(x)}.
                                     +#' @param force logical specifying whether to force simulation
                                     +#'   when \code{ng} and/or \code{ns} don't match the number of
                                     +#'   available reference genes and samples, respectively.
                                      #'
                                      #' @details The simulation of type genes can be performed in 2 ways;
                                     -#'   (1) via \code{p_type} to simulate independant clusters, OR
                                     +#'   (1) via \code{p_type} to simulate independent clusters, OR
                                      #'   (2) via \code{phylo_tree} to simulate a hierarchical cluster structure.
                                      #'
                                      #'   For (1), a subset of \code{p_type} \% of genes are selected per cluster
@@ -89,11 +99,11 @@
                                      #'   \item{\code{args}}{a list of the function call's input arguments.}}}}
                                      #'
                                      #' @examples
                                     -#' data(sce)
                                     +#' data(example_sce)
                                      #' library(SingleCellExperiment)
                                      #'
                                      #' # prep. SCE for simulation
                                     -#' ref <- prepSim(sce)
                                     +#' ref <- prepSim(example_sce)
                                      #'
                                      #' # simulate data
                                      #' (sim <- simData(ref, nc = 200,
@@ -179,11 +189,17 @@ simData <- function(x,
                                          if (is.null(x$cluster_id)) {
                                              x$cluster_id <- factor("foo")
                                              no_k <- TRUE
                                     -    } else no_k <- FALSE
                                     +    } else {
                                     +        x$cluster_id <- droplevels(factor(x$cluster_id))
                                     +        no_k <- nlevels(x$cluster_id) == 1
                                     +    }
                                          if (is.null(x$sample_id)) {
                                              x$sample_id <- factor("foo")
                                              no_s <- TRUE
                                     -    } else no_s <- FALSE
                                     +    } else {
                                     +        x$sample_id <- droplevels(factor(x$sample_id))
                                     +        no_s <- nlevels(x$sample_id) == 1
                                     +    }
                                          # store all input arguments to be returned in final output
                                          args <- c(as.list(environment()))
@@ -192,12 +208,14 @@ simData <- function(x,
                                          .check_sce(x, req_group = FALSE)
                                          args_tmp <- .check_args_simData(as.list(environment()))
                                          nk <- args$nk <- args_tmp$nk
                                     -    ns <- args$ns <- args_tmp$ns
+                                    -
+                                    +
                                          # reference IDs
                                          nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
                                          ns0 <- length(sids0 <- set_names(levels(x$sample_id)))
                                     +    # get number of samples to simulate
                                     +    ns <- .get_ns(ns0, ns, dd, paired, force)
+                                    +
                                          # simulation IDs
                                          nk <- length(kids <- set_names(paste0("cluster", seq_len(nk))))
                                          sids <- set_names(paste0("sample", seq_len(ns)))
@@ -211,8 +229,13 @@ simData <- function(x,
                                              ref_sids <- cbind(ref_sids, ref_sids)
                                          } else {
                                              # draw reference samples at random for each group
                                     -        sidsA <- sample(sids0, ns, ns > ns0)
                                     -        sidsB <- sample(setdiff(sids0, sidsA), ns, ns > ns0)
                                     +        sidsA <- sample(sids0, ns, force && ns > ns0)
                                     +        if (force) {
                                     +            sidsB <- sample(sids0, ns, ns > ns0)
                                     +        } else {
                                     +            sidsB <- setdiff(sids0, sidsA)
                                     +            sidsB <- sample(sidsB, ns)
                                     +        }
                                              ref_sids <- cbind(sidsA, sidsB)
+                                         }
                                          dimnames(ref_sids) <- list(sids, gids)

Browse code

assure rows/cols have names

HelenaLC authored on 16/04/2021 13:10:57
Showing 1 changed files

R/simData.R

History View file @ 267b5ec

@@ -299,6 +299,7 @@ simData <- function(x,
                                              bs$cluster_id <- cbind(0, bs$cluster_id)
                                              names(bs$cluster_id) <- kids0
+                                         }
                                     +    os <- x$offset
                                          b0 <- bs$beta0
                                          ds <- rowData(x)$disp
@@ -337,8 +338,8 @@ simData <- function(x,
                                                      ds_kc <- ds[gs0]
                                                      lfc_kc <- lfc[[c, k]]
                                                      bs_ksc <- exp(rowSums(bs_ks[gs0, , drop = FALSE]))
                                     -                ms_g1 <- outer(bs_ksc, exp(x$offset[cs_g1]), "*")
                                     -                ms_g2 <- outer(bs_ksc, exp(x$offset[cs_g2]), "*")
                                     +                ms_g1 <- outer(bs_ksc, exp(os[cs_g1]), "*")
                                     +                ms_g2 <- outer(bs_ksc, exp(os[cs_g2]), "*")
                                                      re <- .sim(c, cs_g1, cs_g2, ms_g1, ms_g2, ds_kc, lfc_kc, p_ep, p_dp, p_dm)
                                                      y[gi, unlist(ci)] <- re$cs

Browse code

fix sim to work w no groups / samples / clusters

HelenaLC authored on 16/04/2021 12:35:32
Showing 1 changed files

R/simData.R

History View file @ 6c6ac72

@@ -7,6 +7,8 @@
                                      #'
                                      #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
                                      #' @param nc,ns,nk # of cells, samples and clusters to simulate.
                                     +#'   By default, \code{ns = NULL} will simulated as many samples as
                                     +#'   available in the reference to avoid duplicated reference samples.
                                      #' @param probs a list of length 3 containing probabilities of a cell belonging
                                      #'   to each cluster, sample, and group, respectively. List elements must be
                                      #'   NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.
@@ -157,12 +159,14 @@
                                      #' @importFrom S4Vectors split unfactor
                                      #' @export
                                     -simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                     -    probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
                                     +simData <- function(x,
                                     +    ng = nrow(x), nc = ncol(x),
                                     +    ns = NULL, nk = NULL, probs = NULL,
                                     +    dd = TRUE, p_dd = diag(6)[1, ], paired = FALSE,
                                          p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
                                          p_type = 0, lfc = 2, rel_lfc = NULL,
                                          phylo_tree = NULL, phylo_pars = c(ifelse(is.null(phylo_tree), 0, 0.1), 3),
                                     -    ng = nrow(x), force = FALSE) {
                                     +    force = FALSE) {
                                          # throughout this code...
                                          # k: cluster ID
@@ -171,6 +175,16 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          # c: DD category
                                          # 0: reference
                                     +    # add mock cluster/sample ID if missing
                                     +    if (is.null(x$cluster_id)) {
                                     +        x$cluster_id <- factor("foo")
                                     +        no_k <- TRUE
                                     +    } else no_k <- FALSE
                                     +    if (is.null(x$sample_id)) {
                                     +        x$sample_id <- factor("foo")
                                     +        no_s <- TRUE
                                     +    } else no_s <- FALSE
+                                    +
                                          # store all input arguments to be returned in final output
                                          args <- c(as.list(environment()))
@@ -178,6 +192,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          .check_sce(x, req_group = FALSE)
                                          args_tmp <- .check_args_simData(as.list(environment()))
                                          nk <- args$nk <- args_tmp$nk
                                     +    ns <- args$ns <- args_tmp$ns
                                          # reference IDs
                                          nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
@@ -190,17 +205,18 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          # sample reference clusters & samples
                                          ref_kids <- setNames(sample(kids0, nk, nk > nk0), kids)
                                     -    if (paired) {
                                     +    if (!dd || paired) {
                                              # use same set of reference samples for both groups
                                              ref_sids <- sample(sids0, ns, ns > ns0)
                                     -        ref_sids <- replicate(length(gids), ref_sids)
                                     +        ref_sids <- cbind(ref_sids, ref_sids)
                                          } else {
                                              # draw reference samples at random for each group
                                     -        ref_sids <- replicate(length(gids),
                                     -            sample(sids0, ns, ns > ns0))
                                     +        sidsA <- sample(sids0, ns, ns > ns0)
                                     +        sidsB <- sample(setdiff(sids0, sidsA), ns, ns > ns0)
                                     +        ref_sids <- cbind(sidsA, sidsB)
+                                         }
                                          dimnames(ref_sids) <- list(sids, gids)
+                                    -
+                                    +
                                          if (is.null(rel_lfc))
                                              rel_lfc <- rep(1, nk)
                                          if (is.null(names(rel_lfc))) {
@@ -274,14 +290,17 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                              }), vector("list", length(cats)))
                                          # compute NB parameters
                                     -    m <- lapply(sids0, function(s) {
                                     -        b <- paste0("beta.", s)
                                     -        b <- exp(rowData(x)[[b]])
                                     -        m <- outer(b, exp(x$offset), "*")
                                     -        dimnames(m) <- dimnames(x); m
                                     -    })
                                     -    d <- rowData(x)$dispersion
                                     -    names(d) <- rownames(x)
                                     +    bs <- rowData(x)$beta
                                     +    if (!is.null(bs$sample_id)) {
                                     +        bs$sample_id <- cbind(0, bs$sample_id)
                                     +        names(bs$sample_id) <- sids0
                                     +    }
                                     +    if (!is.null(bs$cluster_id)) {
                                     +        bs$cluster_id <- cbind(0, bs$cluster_id)
                                     +        names(bs$cluster_id) <- kids0
                                     +    }
                                     +    b0 <- bs$beta0
                                     +    ds <- rowData(x)$disp
                                          # initialize list of depth two to store
                                          # simulation means in each cluster & group
@@ -292,13 +311,18 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          # run simulation -----------------------------------------------------------
                                          for (k in kids) {
                                              for (s in sids) {
                                     +            # get output cell indices
                                     +            ci <- cs_idx[[k]][[s]]
+                                    +
                                                  # get reference samples, clusters & cells
                                                  s0 <- ref_sids[s, ]
                                                  k0 <- ref_kids[k]
                                                  cs0 <- cs_by_ks[[k0]][s0]
                                     -            # get output cell indices
                                     -            ci <- cs_idx[[k]][[s]]
                                     +            # get NB parameters
                                     +            bs_ks <- cbind(b0,
                                     +                bs$cluster_id[[k0]],
                                     +                bs$sample_id[[s0[1]]])
                                                  for (c in cats[n_dd[, k] != 0]) {
                                                      # sample cells to simulate from
@@ -310,12 +334,13 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                                      gi <- gs_idx[[c, k]]
                                                      # get NB parameters
                                     -                m_g1 <- m[[s0[[1]]]][gs0, cs_g1, drop = FALSE]
                                     -                m_g2 <- m[[s0[[2]]]][gs0, cs_g2, drop = FALSE]
                                     -                d_kc <- d[gs0]
                                     +                ds_kc <- ds[gs0]
                                                      lfc_kc <- lfc[[c, k]]
                                     +                bs_ksc <- exp(rowSums(bs_ks[gs0, , drop = FALSE]))
                                     +                ms_g1 <- outer(bs_ksc, exp(x$offset[cs_g1]), "*")
                                     +                ms_g2 <- outer(bs_ksc, exp(x$offset[cs_g2]), "*")
                                     -                re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc, p_ep, p_dp, p_dm)
                                     +                re <- .sim(c, cs_g1, cs_g2, ms_g1, ms_g2, ds_kc, lfc_kc, p_ep, p_dp, p_dm)
                                                      y[gi, unlist(ci)] <- re$cs
                                                      for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse(
@@ -331,7 +356,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                              category = rep.int(rep(cats, nk), c(n_dd)),
                                              logFC = unlist(lfc),
                                              sim_gene = unlist(gs_by_kc),
                                     -        sim_disp = d[unlist(gs_by_kc)]) %>%
                                     +        sim_disp = ds[unlist(gs_by_kc)]) %>%
                                              mutate_at("gene", as.character)
                                          # add true simulation means
                                          sim_mean <- sim_mean %>%
@@ -348,14 +373,22 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          # construct SCE ------------------------------------------------------------
                                          # cell metadata including group, sample, cluster IDs
                                          cd$group_id <- droplevels(cd$group_id)
                                     -    cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
                                     +    cd$sample_id <- if (dd) {
                                     +        factor(paste(cd$sample_id, cd$group_id, sep = "."))
                                     +    } else factor(cd$sample_id)
                                          m <- match(levels(cd$sample_id), cd$sample_id)
                                          gids <- cd$group_id[m]
                                          o <- order(gids)
                                          sids <- levels(cd$sample_id)[o]
                                          cd <- cd %>%
                                     -        mutate_at("cluster_id", factor, levels = kids) %>%
                                     -        mutate_at("sample_id", factor, levels = sids)
                                     +        mutate_at("sample_id", factor, levels = sids) %>%
                                     +        mutate_at("cluster_id", factor, levels = kids)
                                     +    if (!dd) {
                                     +        cd$group_id <- NULL
                                     +        ref_sids <- ref_sids[, 1]
                                     +    }
                                     +    if (no_s) cd$sample_id <- NULL
                                     +    if (no_k) cd$cluster_id <- NULL
                                          # gene metadata storing gene classes & specificities
                                          rd <- DataFrame(class = factor(class,
                                              levels = c("state", "shared", "type")))

Browse code

add log base to dox

HelenaLC authored on 10/06/2020 07:04:11
Showing 1 changed files

R/simData.R

History View file @ d2cca23

@@ -13,7 +13,7 @@
                                      #' @param p_dd numeric vector of length 6.
                                      #'   Specifies the probability of a gene being
                                      #'   EE, EP, DE, DP, DM, or DB, respectively.
                                     -#' @param paired logial specifying whether a paired design should
                                     +#' @param paired logical specifying whether a paired design should
                                      #'   be simulated (both groups use the same set of reference samples)
                                      #'   or not (reference samples are drawn at random).
                                      #' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells
@@ -21,8 +21,8 @@
                                      #' @param p_type numeric. Probability of EE/EP gene being a type-gene.
                                      #'   If a gene is of class "type" in a given cluster, a unique mean
                                      #'   will be used for that gene in the respective cluster.
                                     -#' @param lfc numeric value to use as mean logFC
                                     -#'   for DE, DP, DM, and DB type of genes.
                                     +#' @param lfc numeric value to use as mean logFC
                                     +#'   (logarithm base 2) for DE, DP, DM, and DB type of genes.
                                      #' @param rel_lfc numeric vector of relative logFCs for each cluster.
                                      #'   Should be of length \code{nlevels(x$cluster_id)} with
                                      #'   \code{levels(x$cluster_id)} as names.

Browse code

add details to documentation

HelenaLC authored on 28/03/2020 12:25:31
Showing 1 changed files

R/simData.R

History View file @ 891c676

@@ -32,22 +32,11 @@
                                      #'   \href{http://evolution.genetics.washington.edu/phylip/newicktree.html}{here}.
                                      #'   The distance between the nodes, except for the original branch, will be
                                      #'   translated in the number of shared genes between the clusters belonging to
                                     -#'   these nodes (this relation is controlled with \code{phylo_pars}). The distance
                                     -#'   between two clusters is defined as the sum of the branches lengths
                                     -#'   separating them.
                                     +#'   these nodes (this relation is controlled with \code{phylo_pars}).
                                     +#'   The distance between two clusters is defined as the sum
                                     +#'   of the branch lengths separating them.
                                      #' @param phylo_pars vector of length 2 providing the parameters that control
                                     -#'   the number of type genes. Passed to an exponential's PDF:
                                     -#'   \code{N = #genes x gamma1 * e^(-gamma2 x dist)},
                                     -#'
                                     -#'   \itemize{
                                     -#'   \item  \code{gamma1} is the parameter that controls the percentage of shared
                                     -#'   genes between the nodes. By default 0.1 if a tree is given, meaning that
                                     -#'   maximum 10% of the genes can be used as type genes (if gamma2 = 0).
                                     -#'   However it's advised to tune it depending on the input \code{prep_sce}.
                                     -#'   \code{gamma2} is the 'penalty' of increasing the distance between clusters
                                     -#'   (\code{dist}, defined by \code{phylo_tree}), applied on the number of
                                     -#'   shared genes. Default to 3.
                                     -#'   }
                                     +#'   the number of type genes. Passed to an exponential PDF (see details).
                                      #'
                                      #' @param ng # of genes to simulate. Importantly, for the library sizes
                                      #'   computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense,
@@ -58,13 +47,45 @@
                                      #'   simulation despite \code{ng != nrow(x)}.
                                      #'
                                      #' @details The simulation of type genes can be performed in 2 ways;
                                     -#'   (1) by defining \code{p_type} and thus simulating independant clusters, OR
                                     -#'   (2) by defining both \code{phylo_tree} and \code{phylo_pars}, which will
                                     -#'   simulate a hierarchical structure between the clusters.
                                     +#'   (1) via \code{p_type} to simulate independant clusters, OR
                                     +#'   (2) via \code{phylo_tree} to simulate a hierarchical cluster structure.
                                     +#'
                                     +#'   For (1), a subset of \code{p_type} \% of genes are selected per cluster
                                     +#'   to use different reference genes than the remainder of clusters,
                                     +#'   giving rise to cluster-specific NB means for count sampling.
                                     +#'
                                     +#'   For (2), the number of shared/type genes at each node
                                     +#'   is given by \code{a*G*e^(-b*d)}, where \itemize{
                                     +#'   \item{\code{a} -- controls the percentage of shared genes between nodes.
                                     +#'   By default, at most 10\% of the genes are reserved as type genes
                                     +#'   (when \code{b} = 0). However, it is advised to tune this parameter
                                     +#'   depending on the input \code{prep_sce}.}
                                     +#'   \item{\code{b} -- determines how the number of shared genes
                                     +#'   decreases with increasing distance d between clusters
                                     +#'   (defined through \code{phylo_tree}).}}
                                      #'
                                      #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
                                     -#'   containing multiple clusters & samples across 2 groups.
                                     -#'
                                     +#'   containing multiple clusters & samples across 2 groups
                                     +#'   as well as the following metadata: \describe{
                                     +#'   \item{cell metadata (\code{colData(.)})}{a \code{DataFrame} containing,
                                     +#'   for each cell, its cluster, sample, and group ID.}
                                     +#'   \item{gene metadata (\code{rowData(.)})}{a \code{DataFrame} containing,
                                     +#'   for each gene, its \code{class} (one of "state", "type", "none") and
                                     +#'   specificity (\code{specs}; NA for genes of type "state", otherwise
                                     +#'   a character vector of clusters that share the given gene).}
                                     +#'   \item{experiment metadata (\code{metadata(.)})}{
                                     +#'   \describe{
                                     +#'   \item{\code{experiment_info}}{a \code{data.frame}
                                     +#'   summarizing the experimental design.}
                                     +#'   \item{\code{n_cells}}{the number of cells for each sample.}
                                     +#'   \item{\code{gene_info}}{a \code{data.frame} containing, for each gene
                                     +#'   in each cluster, its differential distribution \code{category},
                                     +#'   mean \code{logFC} (NA for genes of categories "ee" and "ep"),
                                     +#'   gene used as reference (\code{sim_gene}), dispersion \code{sim_disp},
                                     +#'   and simulation means for each group \code{sim_mean.A/B}.}
                                     +#'   \item{\code{ref_sids/kids}}{the sample/cluster IDs used as reference.}
                                     +#'   \item{\code{args}}{a list of the function call's input arguments.}}}}
                                     +#'
                                      #' @examples
                                      #' data(sce)
                                      #' library(SingleCellExperiment)
@@ -73,7 +94,7 @@
                                      #' ref <- prepSim(sce)
                                      #'
                                      #' # simulate data
                                     -#' (sim <- simData(ref, nc = 10,
                                     +#' (sim <- simData(ref, nc = 200,
                                      #'   p_dd = c(0.9, 0, 0.1, 0, 0, 0),
                                      #'   ng = 100, force = TRUE,
                                      #'   probs = list(NULL, NULL, c(1, 0))))
@@ -116,7 +137,7 @@
                                      #' # view information about shared 'type' genes
                                      #' table(rowData(sim)$class)
                                      #'
                                     -#' @author Helena L Crowell
                                     +#' @author Helena L Crowell & Anthony Sonrel
                                      #'
                                      #' @references
                                      #' Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
@@ -316,10 +337,10 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          sim_mean <- sim_mean %>%
                                              map(bind_cols) %>%
                                              bind_rows(.id = "cluster_id") %>%
                                     -        mutate_at("cluster_id", factor) %>%
                                              mutate(gene = rep(gs, nk))
                                          gi <- full_join(gi, sim_mean, by = c("gene", "cluster_id")) %>%
                                     -        rename("sim_mean.A" = "A", "sim_mean.B" = "B")
                                     +        rename("sim_mean.A" = "A", "sim_mean.B" = "B") %>%
                                     +        mutate_at("cluster_id", factor)
                                          # reorder
                                          o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
                                          gi <- gi[o, ]; rownames(gi) <- NULL
@@ -332,14 +353,16 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          gids <- cd$group_id[m]
                                          o <- order(gids)
                                          sids <- levels(cd$sample_id)[o]
                                     -    ei <- data.frame(sample_id = sids, group_id = gids[o])
                                     -    cd <- cd %>% mutate_at("sample_id", factor, levels = sids)
                                     +    cd <- cd %>%
                                     +        mutate_at("cluster_id", factor, levels = kids) %>%
                                     +        mutate_at("sample_id", factor, levels = sids)
                                          # gene metadata storing gene classes & specificities
                                          rd <- DataFrame(class = factor(class,
                                              levels = c("state", "shared", "type")))
                                          rd$specs <- as.list(specs)
                                          # simulation metadata including used reference samples/cluster,
                                          # list of input arguments, and simulated genes' metadata
                                     +    ei <- data.frame(sample_id = sids, group_id = gids[o])
                                          md <- list(
                                              experiment_info = ei,
                                              n_cells = table(cd$sample_id),

Browse code

reformat example code

HelenaLC authored on 25/03/2020 08:48:41
Showing 1 changed files

R/simData.R

History View file @ 3ee6a87

@@ -109,7 +109,8 @@
                                      #' plot(read.dendrogram(text = phylo_tree))
                                      #'
                                      #' # simulate clusters accordingly
                                     -#' sim <- simData(ref, phylo_tree = phylo_tree,
                                     +#' sim <- simData(ref,
                                     +#'   phylo_tree = phylo_tree,
                                      #'   phylo_pars = c(0.1, 3),
                                      #'   ng = 500, force = TRUE)
                                      #' # view information about shared 'type' genes

Browse code

code simplifications

HelenaLC authored on 20/03/2020 07:39:07
Showing 1 changed files

R/simData.R

History View file @ 7cfeb38

@@ -35,11 +35,9 @@
                                      #'   these nodes (this relation is controlled with \code{phylo_pars}). The distance
                                      #'   between two clusters is defined as the sum of the branches lengths
                                      #'   separating them.
                                     -#' @param phylo_pars vector of length 2, defining the parameters that control the
                                     -#'   number of type genes. It is passed to an adaptation of the
                                     -#'   exponential's PDF:
                                     -#'
                                     -#'   \code{N = Ngenes x gamma1 * e^(-gamma2 x dist)} ,
                                     +#' @param phylo_pars vector of length 2 providing the parameters that control
                                     +#'   the number of type genes. Passed to an exponential's PDF:
                                     +#'   \code{N = #genes x gamma1 * e^(-gamma2 x dist)},
                                      #'
                                      #'   \itemize{
                                      #'   \item  \code{gamma1} is the parameter that controls the percentage of shared
@@ -59,11 +57,10 @@
                                      #' @param force logical specifying whether to force
                                      #'   simulation despite \code{ng != nrow(x)}.
                                      #'
                                     -#' @details
                                     -#'  The simulation of type genes can be performed in 2 ways; (1) by defining
                                     -#'  \code{p_type} and thus simulating independant clusters, OR (2) by defining both
                                     -#'  \code{phylo_tree} and \code{phylo_pars}, which will simulate a hierarchical structure
                                     -#'  between the clusters. Only one of the options is allowed.
                                     +#' @details The simulation of type genes can be performed in 2 ways;
                                     +#'   (1) by defining \code{p_type} and thus simulating independent clusters, OR
                                     +#'   (2) by defining both \code{phylo_tree} and \code{phylo_pars}, which will
                                     +#'   simulate a hierarchical structure between the clusters.
                                      #'
                                      #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
                                      #'   containing multiple clusters & samples across 2 groups.
@@ -88,7 +85,7 @@
                                      #' table(gi$category)
                                      #'
                                      #' # unbalanced sample sizes
                                     -#' sim <- simData(ref, nc = 100,
                                     +#' sim <- simData(ref, nc = 100, ns = 2,
                                      #'   probs = list(NULL, c(0.25, 0.75), NULL),
                                      #'   ng = 10, force = TRUE)
                                      #' table(sim$sample_id)
@@ -142,7 +139,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
                                          p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
                                          p_type = 0, lfc = 2, rel_lfc = NULL,
                                     -    phylo_tree = NULL, phylo_pars = c(0, 3),
                                     +    phylo_tree = NULL, phylo_pars = c(ifelse(is.null(phylo_tree), 0, 0.1), 3),
                                          ng = nrow(x), force = FALSE) {
                                          # throughout this code...
@@ -152,9 +149,6 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          # c: DD category
                                          # 0: reference
                                     -    # default shared p if phylo tree given
                                     -    if (missing(phylo_pars)) phylo_pars[1] <- ifelse(is.null(phylo_tree), 0, 0.1)
+                                    -
                                          # store all input arguments to be returned in final output
                                          args <- c(as.list(environment()))

Browse code

phylo_tree distances now affects the downstream node (also applied to final leaves).

Thony authored on 25/02/2020 14:38:40
Showing 1 changed files

R/simData.R

History View file @ 5205a7a

@@ -35,23 +35,20 @@
                                      #'   these nodes (this relation is controlled with \code{phylo_pars}). The distance
                                      #'   between two clusters is defined as the sum of the branches lengths
                                      #'   separating them.
                                     -#' @param phylo_pars list of length 2, defining the parameters that control the
                                     -#'   number of shared/ specific type-genes; \itemize{
                                     -#'   \item The first element of the list is a numeric vector of length 2.
                                     -#'   It defines the number of shared genes as an adaptation of the
                                     +#' @param phylo_pars vector of length 2, defining the parameters that control the
                                     +#'   number of type genes. It is passed to an adaptation of the
                                      #'   exponential's PDF:
                                      #'
                                      #'   \code{N = Ngenes x gamma1 * e^(-gamma2 x dist)} ,
                                      #'
                                     -#'   where \code{gamma1} is the parameter that controls the percentage of shared genes
                                     -#'   between the nodes. By default 0.2, but it's advised to tune it depending
                                     -#'   on the input \code{prep_sce}.
                                     -#'   \code{gamma2} is the 'penalty' of increasing
                                     -#'   the distance between clusters (\code{dist}, defined by \code{phylo_tree}),
                                     -#'   applied on the number of shared genes. Default to -3.
                                     -#'   \item The second element can be a single numeric or a list of length nk.
                                     -#'   It is an equivalent of \code{p_type} for each leaf (i.e. cluster) that is applied
                                     -#'   after the determination of 'shared type genes'.
                                     +#'   \itemize{
                                     +#'   \item  \code{gamma1} is the parameter that controls the percentage of shared
                                     +#'   genes between the nodes. By default 0.1 if a tree is given, meaning that
                                     +#'   maximum 10% of the genes can be used as type genes (if gamma2 = 0).
                                     +#'   However it's advised to tune it depending on the input \code{prep_sce}.
                                     +#'   \code{gamma2} is the 'penalty' of increasing the distance between clusters
                                     +#'   (\code{dist}, defined by \code{phylo_tree}), applied on the number of
                                     +#'   shared genes. Default to 3.
                                      #'   }
                                      #'
                                      #' @param ng # of genes to simulate. Importantly, for the library sizes
@@ -116,7 +113,7 @@
                                      #'
                                      #' # simulate clusters accordingly
                                      #' sim <- simData(ref, phylo_tree = phylo_tree,
                                     -#'   phylo_pars = list(c(0.1, 3), 0.1),
                                     +#'   phylo_pars = c(0.1, 3),
                                      #'   ng = 500, force = TRUE)
                                      #' # view information about shared 'type' genes
                                      #' table(rowData(sim)$class)
@@ -145,7 +142,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
                                          p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
                                          p_type = 0, lfc = 2, rel_lfc = NULL,
                                     -    phylo_tree = NULL, phylo_pars = list(c(0, 3), 0),
                                     +    phylo_tree = NULL, phylo_pars = c(0, 3),
                                          ng = nrow(x), force = FALSE) {
                                          # throughout this code...
@@ -155,6 +152,9 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          # c: DD category
                                          # 0: reference
                                     +    # default shared p if phylo tree given
                                     +    if (missing(phylo_pars)) phylo_pars[1] <- ifelse(is.null(phylo_tree), 0, 0.1)
+                                    +
                                          # store all input arguments to be returned in final output
                                          args <- c(as.list(environment()))

Browse code

up phylo

HelenaLC authored on 15/02/2020 14:09:19
Showing 1 changed files

R/simData.R

History View file @ 9f75381

@@ -160,40 +160,12 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          # check validity of input arguments
                                          .check_sce(x, req_group = FALSE)
                                     -    .check_args_simData(as.list(environment()))
                                     -    if (!force && ng != nrow(x))
                                     -        stop("Number of simulated genes should match with reference,\n",
                                     -            "but 'ng != nrow(x)'; please specify 'force = TRUE' if\n",
                                     -            "simulation should be forced regardlessly (see '?simData').")
                                     -    if (!is.null(phylo_tree) && p_type != 0)
                                     -        stop("Only one of 'p_type' and 'phylo_tree' can be provided.\n",
                                     -             "Please see the 'Details' section of '?simData'.")
                                     -    if (!length(phylo_pars[[2]]) %in% c(1, nk))
                                     -        stop("The second element of 'phylo_pars' should be correspond\n",
                                     -             " to the number of clusters ('nk') or of length 1.")
                                     -    if (!is.null(phylo_tree) && phylo_pars[[1]][1] == 0)
                                     -        warning("'phylo_pars[[1]][1]' has been set to 0;\n",
                                     -                "'phylo_tree' argument will be ignored.")
                                     -    if (!is.null(phylo_tree) && all(phylo_pars[[2]] == 0))
                                     -        warning("'phylo_pars[[2]]' has been set to 0;\n",
                                     -                "type genes for individual clusters won't be simulated.")
                                     +    args_tmp <- .check_args_simData(as.list(environment()))
                                     +    nk <- args$nk <- args_tmp$nk
                                          # reference IDs
                                          nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
                                          ns0 <- length(sids0 <- set_names(levels(x$sample_id)))
+                                    -
                                     -    # assure number of simulated clusters
                                     -    # matches with specified phylogeny
                                     -    if (!is.null(phylo_tree)) {
                                     -        kids_phylo <- .get_clusters_from_phylo(phylo_tree)
                                     -        nk_phylo <- length(kids_phylo)
                                     -        ns_phylo <- as.numeric(gsub("[a-z]", "", kids_phylo))
                                     -        if (!all(sort(ns_phylo) == seq_len(nk_phylo)))
                                     -            stop("Some clusters appear to be missing from 'phylo_tree';\n",
                                     -                "please make sure all clusters up to ",
                                     -                dQuote(kids_phylo[which.max(ns_phylo)]), " are present.")
                                     -        if (nk_phylo != nk) args$nk <- nk <- nk_phylo
                                     -    }
                                          # simulation IDs
                                          nk <- length(kids <- set_names(paste0("cluster", seq_len(nk))))
@@ -244,7 +216,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          gs_idx <- .sample_gene_inds(gs, n_dd)
                                          # for ea. cluster, sample set of genes to simulate from
                                     -    gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs)
                                     +    gs_by_k <- setNames(sample(rownames(x), ng, ng > nrow(x)), gs)
                                          gs_by_k <- replicate(nk, gs_by_k)
                                          colnames(gs_by_k) <- kids

Browse code

fix hierarchical sim data

Thony authored on 05/02/2020 14:43:35
Showing 1 changed files

R/simData.R

History View file @ 78fe155

@@ -18,7 +18,7 @@
                                      #'   or not (reference samples are drawn at random).
                                      #' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells
                                      #'   to be shifted to a different expression state in one group (see details).
                                     -#' @param p_type numeric. Probaility of EE/EP gene being a type-gene.
                                     +#' @param p_type numeric. Probability of EE/EP gene being a type-gene.
                                      #'   If a gene is of class "type" in a given cluster, a unique mean
                                      #'   will be used for that gene in the respective cluster.
                                      #' @param lfc numeric value to use as mean logFC
@@ -28,19 +28,32 @@
                                      #'   \code{levels(x$cluster_id)} as names.
                                      #'   Defaults to factor of 1 for all clusters.
                                      #' @param phylo_tree newick tree text representing cluster relations
                                     -#'   and their relative distance. If a tree is given, the distance between
                                     -#'   the clusters will be translated in the number of shared genes
                                     -#'   (this relation is controlled with \code{phylo_pars}). The distance
                                     +#'   and their relative distance. An explanation of the syntax can be found
                                     +#'   \href{http://evolution.genetics.washington.edu/phylip/newicktree.html}{here}.
                                     +#'   The distance between the nodes, except for the original branch, will be
                                     +#'   translated in the number of shared genes between the clusters belonging to
                                     +#'   these nodes (this relation is controlled with \code{phylo_pars}). The distance
                                      #'   between two clusters is defined as the sum of the branches lengths
                                      #'   separating them.
                                     -#' @param phylo_pars numeric vector of length 2. Defines the number of shared
                                     -#'   genes as an adaptation of the exponential's PDF:
                                     -#'   N = Ngenes x gamma1 * e^(-gamma2 x dist) ,
                                     -#'   where gamma1 is the parameter that controls the percentage of shared genes.
                                     -#'   By default, it corresponds to \code{p_type} but it's advised to tune it
                                     -#'   depending on the input prep_sce. gamma2 is the 'penalty' of increasing
                                     -#'   the distance between clusters, applied on the number of shared genes.
                                     -#'   Default to -3.
                                     +#' @param phylo_pars list of length 2, defining the parameters that control the
                                     +#'   number of shared/ specific type-genes; \itemize{
                                     +#'   \item The first element of the list is a numeric vector of length 2.
                                     +#'   It defines the number of shared genes as an adaptation of the
                                     +#'   exponential's PDF:
                                     +#'
                                     +#'   \code{N = Ngenes x gamma1 * e^(-gamma2 x dist)} ,
                                     +#'
                                     +#'   where \code{gamma1} is the parameter that controls the percentage of shared genes
                                     +#'   between the nodes. By default 0.2, but it's advised to tune it depending
                                     +#'   on the input \code{prep_sce}.
                                     +#'   \code{gamma2} is the 'penalty' of increasing
                                     +#'   the distance between clusters (\code{dist}, defined by \code{phylo_tree}),
                                     +#'   applied on the number of shared genes. Default to -3.
                                     +#'   \item The second element can be a single numeric or a list of length nk.
                                     +#'   It is an equivalent of \code{p_type} for each leaf (i.e. cluster) that is applied
                                     +#'   after the determination of 'shared type genes'.
                                     +#'   }
                                     +#'
                                      #' @param ng # of genes to simulate. Importantly, for the library sizes
                                      #'   computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense,
                                      #'   the number of simulated genes should match with the number of genes
@@ -49,6 +62,12 @@
                                      #' @param force logical specifying whether to force
                                      #'   simulation despite \code{ng != nrow(x)}.
                                      #'
                                     +#' @details
                                     +#'  The simulation of type genes can be performed in 2 ways; (1) by defining
                                     +#'  \code{p_type} and thus simulating independant clusters, OR (2) by defining both
                                     +#'  \code{phylo_tree} and \code{phylo_pars}, which will simulate a hierarchical structure
                                     +#'  between the clusters. Only one of the options is allowed.
                                     +#'
                                      #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
                                      #'   containing multiple clusters & samples across 2 groups.
                                      #'
@@ -96,9 +115,11 @@
                                      #' plot(read.dendrogram(text = phylo_tree))
                                      #'
                                      #' # simulate clusters accordingly
                                     -#' sim <- simData(sce, phylo_tree = phylo_tree, ng = 500, force = TRUE)
                                     +#' sim <- simData(ref, phylo_tree = phylo_tree,
                                     +#'   phylo_pars = list(c(0.1, 3), 0.1),
                                     +#'   ng = 500, force = TRUE)
                                      #' # view information about shared 'type' genes
                                     -#' table(metadata(sim)$gene_info$shared_class)
                                     +#' table(rowData(sim)$class)
                                      #'
                                      #' @author Helena L Crowell
                                      #'
@@ -124,7 +145,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
                                          p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
                                          p_type = 0, lfc = 2, rel_lfc = NULL,
                                     -    phylo_tree = NULL, phylo_pars = c(0.2, 3),
                                     +    phylo_tree = NULL, phylo_pars = list(c(0, 3), 0),
                                          ng = nrow(x), force = FALSE) {
                                          # throughout this code...
@@ -144,6 +165,18 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                              stop("Number of simulated genes should match with reference,\n",
                                                  "but 'ng != nrow(x)'; please specify 'force = TRUE' if\n",
                                                  "simulation should be forced regardlessly (see '?simData').")
                                     +    if (!is.null(phylo_tree) && p_type != 0)
                                     +        stop("Only one of 'p_type' and 'phylo_tree' can be provided.\n",
                                     +             "Please see the 'Details' section of '?simData'.")
                                     +    if (!length(phylo_pars[[2]]) %in% c(1, nk))
                                     +        stop("The second element of 'phylo_pars' should be correspond\n",
                                     +             " to the number of clusters ('nk') or of length 1.")
                                     +    if (!is.null(phylo_tree) && phylo_pars[[1]][1] == 0)
                                     +        warning("'phylo_pars[[1]][1]' has been set to 0;\n",
                                     +                "'phylo_tree' argument will be ignored.")
                                     +    if (!is.null(phylo_tree) && all(phylo_pars[[2]] == 0))
                                     +        warning("'phylo_pars[[2]]' has been set to 0;\n",
                                     +                "type genes for individual clusters won't be simulated.")
                                          # reference IDs
                                          nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
@@ -221,28 +254,6 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                              gs_by_k <- res$gs_by_k
                                              class  <- res$class
                                              specs <- res$specs
                                     -        if (p_type != 0) {
                                     -            # update gene indices 'gs_idx' to avoid imputing
                                     -            # exclusive type genes in shared type genes
                                     -            gs_idx_tmp <- gs_idx
                                     -            for (c in c("ee", "ep"))
                                     -                for (k in kids) {
                                     -                    u <- gs_idx[[c, k]]
                                     -                    gs_idx_tmp[[c, k]] <- u[!u %in% res$used]
                                     -                }
                                     -            # inpute cluster-specific type genes
                                     -            res <- .impute_type_genes(x, gs_by_k, gs_idx_tmp, p_type)
                                     -            gs_by_k <- res$gs_by_k
                                     -            # update genes classes & specificities
                                     -            is_type <- res$class == "type"
                                     -            # spot checks; any type-gene should not be of class 'shared'
                                     -            # and cluster-specificities should be unassigned at this point
                                     -            stopifnot(
                                     -                all(class[is_type] == "state"),
                                     -                all(is.na(unlist(specs[is_type]))))
                                     -            class[is_type] <- "type"
                                     -            specs[is_type] <- res$specs[is_type]
                                     -        }
                                          # otherwise, simply impute type-genes w/o phylogeny
                                          } else if (p_type != 0) {
                                              res <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)

Browse code

update phylo sim

HelenaLC authored on 03/02/2020 10:36:01
Showing 1 changed files

R/simData.R

History View file @ 3ba161d

@@ -159,7 +159,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                                  stop("Some clusters appear to be missing from 'phylo_tree';\n",
                                                      "please make sure all clusters up to ",
                                                      dQuote(kids_phylo[which.max(ns_phylo)]), " are present.")
                                     -        if (nk_phylo != nk) nk <- nk_phylo
                                     +        if (nk_phylo != nk) args$nk <- nk <- nk_phylo
+                                         }
                                          # simulation IDs
@@ -212,7 +212,8 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          # for ea. cluster, sample set of genes to simulate from
                                          gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs)
                                     -    gs_by_k <- set_colnames(replicate(nk, gs_by_k), kids)
                                     +    gs_by_k <- replicate(nk, gs_by_k)
                                     +    colnames(gs_by_k) <- kids
                                          # when 'phylo_tree' is specified, induce hierarchical cluster structure
                                          if (!is.null(phylo_tree)) {
@@ -343,7 +344,7 @@ simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                              rename("sim_mean.A" = "A", "sim_mean.B" = "B")
                                          # reorder
                                          o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
                                     -    gi <- set_rownames(gi[o, ], NULL)
                                     +    gi <- gi[o, ]; rownames(gi) <- NULL
                                          # construct SCE ------------------------------------------------------------
                                          # cell metadata including group, sample, cluster IDs

Browse code

avoid dependency on magrittr&tibble solely for set_row/colnames&add_column

HelenaLC authored on 02/02/2020 08:37:26
Showing 1 changed files

R/simData.R

History View file @ b4e43ab

@@ -6,7 +6,7 @@
                                      #' across 2 experimental conditions from a real scRNA-seq data set.
                                      #'
                                      #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
                                     -#' @param ng,nc,ns,nk # of genes, cells, samples and clusters to simulate.
                                     +#' @param nc,ns,nk # of cells, samples and clusters to simulate.
                                      #' @param probs a list of length 3 containing probabilities of a cell belonging
                                      #'   to each cluster, sample, and group, respectively. List elements must be
                                      #'   NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.
@@ -27,13 +27,13 @@
                                      #'   Should be of length \code{nlevels(x$cluster_id)} with
                                      #'   \code{levels(x$cluster_id)} as names.
                                      #'   Defaults to factor of 1 for all clusters.
                                     -#' @param cells_phylo newick tree text representing cluster relations
                                     +#' @param phylo_tree newick tree text representing cluster relations
                                      #'   and their relative distance. If a tree is given, the distance between
                                      #'   the clusters will be translated into the number of shared genes
                                     -#'   (this relation is controlled with \code{params_dist}). The distance
                                     +#'   (this relation is controlled with \code{phylo_pars}). The distance
                                      #'   between two clusters is defined as the sum of the branch lengths
                                      #'   separating them.
                                     -#' @param params_dist numeric vector of length 2. Defines the number of shared
                                     +#' @param phylo_pars numeric vector of length 2. Defines the number of shared
                                      #'   genes as an adaptation of the exponential's PDF:
                                      #'   N = Ngenes x gamma1 * e^(-gamma2 x dist) ,
                                      #'   where gamma1 is the parameter that controls the percentage of shared genes.
@@ -41,6 +41,13 @@
                                      #'   depending on the input prep_sce. gamma2 is the 'penalty' of increasing
                                      #'   the distance between clusters, applied on the number of shared genes.
                                      #'   Defaults to 3 (the minus sign is already part of the formula).
                                     +#' @param ng # of genes to simulate. Importantly, for the library sizes
                                     +#'   computed by \code{\link{prepSim}} (= \code{exp(x$offset)}) to make sense,
                                     +#'   the number of simulated genes should match with the number of genes
                                     +#'   in the reference. To simulate a reduced number of genes, e.g. for
                                     +#'   testing and development purposes, please set \code{force = TRUE}.
                                     +#' @param force logical specifying whether to force
                                     +#'   simulation despite \code{ng != nrow(x)}.
                                      #'
                                      #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
                                      #'   containing multiple clusters & samples across 2 groups.
@@ -50,11 +57,13 @@
                                      #' library(SingleCellExperiment)
                                      #'
                                      #' # prep. SCE for simulation
                                     -#' sce <- prepSim(sce)
                                     +#' ref <- prepSim(sce)
                                      #'
                                      #' # simulate data
                                     -#' (sim <- simData(sce, ng = 100, nc = 10,
                                     -#'   p_dd = c(0.9, 0, 0.1, 0, 0, 0)))
                                     +#' (sim <- simData(ref, nc = 10,
                                     +#'   p_dd = c(0.9, 0, 0.1, 0, 0, 0),
                                     +#'   ng = 100, force = TRUE,
                                     +#'   probs = list(NULL, NULL, c(1, 0))))
                                      #'
                                      #' # simulation metadata
                                      #' head(gi <- metadata(sim)$gene_info)
@@ -63,30 +72,32 @@
                                      #' table(gi$category)
                                      #'
                                      #' # unbalanced sample sizes
                                     -#' sim <- simData(sce, ng = 10, nc = 100,
                                     -#'   probs = list(NULL, c(0.25, 0.75), NULL))
                                     +#' sim <- simData(ref, nc = 100,
                                     +#'   probs = list(NULL, c(0.25, 0.75), NULL),
                                     +#'   ng = 10, force = TRUE)
                                      #' table(sim$sample_id)
                                      #'
                                      #' # one group only
                                     -#' sim <- simData(sce, ng = 10, nc = 100,
                                     -#'   probs = list(NULL, NULL, c(1, 0)))
                                     +#' sim <- simData(ref, nc = 100,
                                     +#'   probs = list(NULL, NULL, c(1, 0)),
                                     +#'   ng = 10, force = TRUE)
                                      #' levels(sim$group_id)
                                      #'
                                     -#'
                                     -#' # Hierarchical cell-type relations:
                                     -#' # we first define a phylogram representing the relations between 3 clusters
                                     -#' cells_phylo <- "(('cluster1':0.1, 'cluster2':0.1):0.4,'cluster3':0.5);"
                                     -#' # to verify the syntax and the relation, we can plot it
                                     +#' # HIERARCHICAL CLUSTER STRUCTURE
                                     +#' # define phylogram specifying cluster relations
                                     +#' phylo_tree <- "(('cluster1':0.1,'cluster2':0.1):0.4,'cluster3':0.5);"
                                     +#' # verify syntax & visualize relations
                                      #' library(phylogram)
                                     -#' dend <- read.dendrogram(text = cells_phylo)
                                     -#' plot(dend)
                                     -#' # More complex relations are also possible
                                     -#' cells_phylo2 <- "(('cluster1':0.4, 'cluster2':0.4):0.4, ('cluster3':0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);"
                                     -#' dend2 <- read.dendrogram(text = cells_phylo2)
                                     -#' plot(dend2)
                                     -#' # simulate clusters based on these distances
                                     -#' sim <- simData(sce_, nk = 3, cells_phylo = cells_phylo)
                                     -#' # the information about the shared 'type' genes are kept in the metadata
                                     +#' plot(read.dendrogram(text = phylo_tree))
                                     +#'
                                     +#' # let's use a more complex phylogeny
                                     +#' phylo_tree <- "(('cluster1':0.4,'cluster2':0.4):0.4,('cluster3':
                                     +#'   0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);"
                                     +#' plot(read.dendrogram(text = phylo_tree))
                                     +#'
                                     +#' # simulate clusters accordingly
                                     +#' sim <- simData(sce, phylo_tree = phylo_tree, ng = 500, force = TRUE)
                                     +#' # view information about shared 'type' genes
                                      #' table(metadata(sim)$gene_info$shared_class)
                                      #'
                                      #' @author Helena L Crowell
@@ -102,19 +113,19 @@
                                      #' @importFrom data.table data.table
                                      #' @importFrom dplyr mutate_all mutate_at
                                      #' @importFrom edgeR DGEList estimateDisp glmFit
                                     -#' @importFrom magrittr set_colnames set_rownames
                                      #' @importFrom purrr modify_depth set_names
                                      #' @importFrom stats model.matrix rgamma setNames
                                      #' @importFrom SingleCellExperiment SingleCellExperiment
                                      #' @importFrom SummarizedExperiment colData
                                     -#' @importFrom S4Vectors split
                                     +#' @importFrom S4Vectors split unfactor
                                      #' @export
                                     -simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                     +simData <- function(x, nc = 2e3, ns = 3, nk = 3,
                                          probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
                                          p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
                                          p_type = 0, lfc = 2, rel_lfc = NULL,
                                     -    cells_phylo = NULL, params_dist = c(0.2, 3) ) {
                                     +    phylo_tree = NULL, phylo_pars = c(0.2, 3),
                                     +    ng = nrow(x), force = FALSE) {
                                          # throughout this code...
                                          # k: cluster ID
@@ -123,13 +134,33 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                          # c: DD category
                                          # 0: reference
                                     +    # store all input arguments to be returned in final output
                                     +    args <- c(as.list(environment()))
+                                    +
                                          # check validity of input arguments
                                          .check_sce(x, req_group = FALSE)
                                          .check_args_simData(as.list(environment()))
                                     +    if (!force && ng != nrow(x))
                                     +        stop("Number of simulated genes should match with reference,\n",
                                     +            "but 'ng != nrow(x)'; please specify 'force = TRUE' if\n",
                                     +            "simulation should be forced regardlessly (see '?simData').")
                                          # reference IDs
                                          nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
                                          ns0 <- length(sids0 <- set_names(levels(x$sample_id)))
+                                    +
                                     +    # assure number of simulated clusters
                                     +    # matches with specified phylogeny
                                     +    if (!is.null(phylo_tree)) {
                                     +        kids_phylo <- .get_clusters_from_phylo(phylo_tree)
                                     +        nk_phylo <- length(kids_phylo)
                                     +        ns_phylo <- as.numeric(gsub("[a-z]", "", kids_phylo))
                                     +        if (!all(sort(ns_phylo) == seq_len(nk_phylo)))
                                     +            stop("Some clusters appear to be missing from 'phylo_tree';\n",
                                     +                "please make sure all clusters up to ",
                                     +                dQuote(kids_phylo[which.max(ns_phylo)]), " are present.")
                                     +        if (nk_phylo != nk) nk <- nk_phylo
                                     +    }
                                          # simulation IDs
                                          nk <- length(kids <- set_names(paste0("cluster", seq_len(nk))))
@@ -183,56 +214,55 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                          gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs)
                                          gs_by_k <- set_colnames(replicate(nk, gs_by_k), kids)
                                     -    if (length(cells_phylo) > 0) {
                                     -        out <- .impute_shared_type_genes(x, gs_by_k, gs_idx, cells_phylo,
                                     -                                         params_dist)
                                     -        gs_by_k <- out[[1]]
                                     -        used_tg <- out[[2]]
                                     -        shared  <- out[[3]]
+                                    -
                                     +    # when 'phylo_tree' is specified, induce hierarchical cluster structure
                                     +    if (!is.null(phylo_tree)) {
                                     +        res <- .impute_shared_type_genes(x, gs_by_k, gs_idx, phylo_tree, phylo_pars)
                                     +        gs_by_k <- res$gs_by_k
                                     +        class  <- res$class
                                     +        specs <- res$specs
                                              if (p_type != 0) {
+                                    -
                                     -            ## update gs_idx to avoid having type genes in the shared type genes
                                     -            gs_idx_red <- gs_idx
                                     -            ee_ep_id <- seq(1, 6 * length(kids) - 5, 6)
                                     -            ee_ep_id <- c(ee_ep_id, ee_ep_id + 1)
                                     -            for (i in ee_ep_id) gs_idx_red[[i]] <- (gs_idx[[i]])[-which(gs_idx[[i]] %in% used_tg)]
                                     -            type_info <- gs_by_k
                                     -            gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx_red, p_type)
+                                    -
                                     -            ## keep gene type info
                                     -            type_info[type_info == gs_by_k] <- NA
                                     -            type_info[!is.na(type_info)] <- "type"
                                     -            type_info[is.na(type_info)] <- "state"
+                                    -
                                     -        } else {
                                     -            type_info <- matrix("state", ng, nk)
                                     +            # update gene indices 'gs_idx' to avoid imputing
                                     +            # exclusive type genes in shared type genes
                                     +            gs_idx_tmp <- gs_idx
                                     +            for (c in c("ee", "ep"))
                                     +                for (k in kids) {
                                     +                    u <- gs_idx[[c, k]]
                                     +                    gs_idx_tmp[[c, k]] <- u[!u %in% res$used]
                                     +                }
                                     +            # impute cluster-specific type genes
                                     +            res <- .impute_type_genes(x, gs_by_k, gs_idx_tmp, p_type)
                                     +            gs_by_k <- res$gs_by_k
                                     +            # update genes classes & specificities
                                     +            is_type <- res$class == "type"
                                     +            # spot checks; any type-gene should not be of class 'shared'
                                     +            # and cluster-specificities should be unassigned at this point
                                     +            stopifnot(
                                     +                all(class[is_type] == "state"),
                                     +                all(is.na(unlist(specs[is_type]))))
                                     +            class[is_type] <- "type"
                                     +            specs[is_type] <- res$specs[is_type]
+                                             }
                                     +    # otherwise, simply impute type-genes w/o phylogeny
                                     +    } else if (p_type != 0) {
                                     +        res <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)
                                     +        stopifnot(!any(res$class == "shared"))
                                     +        gs_by_k <- res$gs_by_k
                                     +        class <- res$class
                                     +        specs <- res$specs
                                          } else {
                                     -        shared <- matrix("state", ng, nk)
                                     +        class <- rep("state", ng)
                                     +        specs <- rep(NA, ng)
                                     +        names(class) <- names(specs) <- gs
+                                         }
+                                    -
                                     -    # impute type-genes
                                     -    if (p_type != 0 & length(cells_phylo) == 0)  {
                                     -        type_info <- gs_by_k
                                     -        gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)
+                                    -
                                     -        ## keep gene type info
                                     -        type_info[type_info == gs_by_k] <- NA
                                     -        type_info[!is.na(type_info)] <- "type"
                                     -        type_info[is.na(type_info)] <- "state"
                                     -    }
                                          # split by cluster & category
                                          gs_by_k <- split(gs_by_k, col(gs_by_k))
                                          gs_by_k <- setNames(map(gs_by_k, set_names, gs), kids)
+                                    -
                                          gs_by_kc <- lapply(kids, function(k)
                                              lapply(unfactor(cats), function(c)
                                                  gs_by_k[[k]][gs_idx[[c, k]]]))
                                          # sample logFCs
                                     -    lfc0 <- lfc
                                          lfc <- vapply(kids, function(k)
                                              lapply(unfactor(cats), function(c) {
                                                  n <- n_dd[c, k]
@@ -253,10 +283,13 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                          d <- rowData(x)$dispersion
                                          names(d) <- rownames(x)
                                     +    # initialize list of depth two to store
                                     +    # simulation means in each cluster & group
                                          sim_mean <- lapply(kids, function(k)
                                              lapply(gids, function(g)
                                                  setNames(numeric(ng), gs)))
                                     +    # run simulation -----------------------------------------------------------
                                          for (k in kids) {
                                              for (s in sids) {
                                                  # get reference samples, clusters & cells
@@ -265,7 +298,7 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                                  cs0 <- cs_by_ks[[k0]][s0]
                                                  # get output cell indices
                                     -            ci <- unlist(cs_idx[[k]][[s]])
                                     +            ci <- cs_idx[[k]][[s]]
                                                  for (c in cats[n_dd[, k] != 0]) {
                                                      # sample cells to simulate from
@@ -283,19 +316,13 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                                      lfc_kc <- lfc[[c, k]]
                                                      re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc, p_ep, p_dp, p_dm)
                                     -                y[gi, ci] <- re$cs
                                     +                y[gi, unlist(ci)] <- re$cs
                                                      for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse(
                                                          is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]]
+                                                 }
+                                             }
+                                         }
                                     -    sim_mean <- sim_mean %>%
                                     -        map(bind_cols) %>%
                                     -        bind_rows(.id = "cluster_id") %>%
                                     -        mutate_at("cluster_id", factor) %>%
                                     -        mutate(gene = rep(gs, nk))
+                                    -
                                          # construct gene metadata table storing ------------------------------------
                                          # gene | cluster_id | category | logFC, gene, disp, mean used for sim.
                                          gi <- data.frame(
@@ -307,30 +334,19 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                              sim_disp = d[unlist(gs_by_kc)]) %>%
                                              mutate_at("gene", as.character)
                                          # add true simulation means
                                     +    sim_mean <- sim_mean %>%
                                     +        map(bind_cols) %>%
                                     +        bind_rows(.id = "cluster_id") %>%
                                     +        mutate_at("cluster_id", factor) %>%
                                     +        mutate(gene = rep(gs, nk))
                                          gi <- full_join(gi, sim_mean, by = c("gene", "cluster_id")) %>%
                                              rename("sim_mean.A" = "A", "sim_mean.B" = "B")
                                          # reorder
                                          o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
                                          gi <- set_rownames(gi[o, ], NULL)
                                     -    # add gene infos
                                     -    gi$class <- c(t(type_info))
                                     -    gi$shared_class <- c(t(shared))
                                     -    # parameters
                                     -    params <- list(
                                     -        ng = ng,
                                     -        nc = nc,
                                     -        ns = ns,
                                     -        nk = nk,
                                     -        probs = probs,
                                     -        p_dd = p_dd,
                                     -        p_type = p_type,
                                     -        lfc = lfc0,
                                     -        rel_lfc = rel_lfc,
                                     -        cells_phylo = cells_phylo,
                                     -        params_dist = params_dist
                                     -    )
                                     -    # construct SCE
                                     +    # construct SCE ------------------------------------------------------------
                                     +    # cell metadata including group, sample, cluster IDs
                                          cd$group_id <- droplevels(cd$group_id)
                                          cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
                                          m <- match(levels(cd$sample_id), cd$sample_id)
@@ -339,16 +355,21 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                          sids <- levels(cd$sample_id)[o]
                                          ei <- data.frame(sample_id = sids, group_id = gids[o])
                                          cd <- cd %>% mutate_at("sample_id", factor, levels = sids)
+                                    -
                                     +    # gene metadata storing gene classes & specificities
                                     +    rd <- DataFrame(class = factor(class,
                                     +        levels = c("state", "shared", "type")))
                                     +    rd$specs <- as.list(specs)
                                     +    # simulation metadata including used reference samples/cluster,
                                     +    # list of input arguments, and simulated genes' metadata
                                          md <- list(
                                              experiment_info = ei,
                                              n_cells = table(cd$sample_id),
                                              gene_info = gi,
                                              ref_sids = ref_sids,
                                              ref_kids = ref_kids,
                                     -        parameters = params)
+                                    -
                                     +        args = args)
                                     +    # return SCE
                                          SingleCellExperiment(
                                              assays = list(counts = as.matrix(y)),
                                     -        colData = cd, metadata = md)
                                     +        colData = cd, rowData = rd, metadata = md)
+                                     }

Browse code

add hierarchical clusters simulation

Thony authored on 30/01/2020 20:46:21
Showing 1 changed files

R/simData.R

History View file @ 8b32e24

@@ -27,6 +27,20 @@
                                      #'   Should be of length \code{nlevels(x$cluster_id)} with
                                      #'   \code{levels(x$cluster_id)} as names.
                                      #'   Defaults to factor of 1 for all clusters.
                                     +#' @param cells_phylo newick tree text representing cluster relations
                                     +#'   and their relative distance. If a tree is given, the distance between
                                     +#'   the clusters will be translated into the number of shared genes
                                     +#'   (this relation is controlled with \code{params_dist}). The distance
                                     +#'   between two clusters is defined as the sum of the branch lengths
                                     +#'   separating them.
                                     +#' @param params_dist numeric vector of length 2. Defines the number of shared
                                     +#'   genes as an adaptation of the exponential's PDF:
                                     +#'   N = Ngenes x gamma1 * e^(-gamma2 x dist) ,
                                     +#'   where gamma1 is the parameter that controls the percentage of shared genes.
                                     +#'   By default, it corresponds to \code{p_type} but it's advised to tune it
                                     +#'   depending on the input prep_sce. gamma2 is the 'penalty' of increasing
                                     +#'   the distance between clusters, applied on the number of shared genes.
                                     +#'   Defaults to 3 (the minus sign is already part of the formula).
                                      #'
                                      #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
                                      #'   containing multiple clusters & samples across 2 groups.
@@ -57,7 +71,24 @@
                                      #' sim <- simData(sce, ng = 10, nc = 100,
                                      #'   probs = list(NULL, NULL, c(1, 0)))
                                      #' levels(sim$group_id)
                                     -#'
                                     +#'
                                     +#'
                                     +#' # Hierarchical cell-type relations:
                                     +#' # we first define a phylogram representing the relations between 3 clusters
                                     +#' cells_phylo <- "(('cluster1':0.1, 'cluster2':0.1):0.4,'cluster3':0.5);"
                                     +#' # to verify the syntax and the relation, we can plot it
                                     +#' library(phylogram)
                                     +#' dend <- read.dendrogram(text = cells_phylo)
                                     +#' plot(dend)
                                     +#' # More complex relations are also possible
                                     +#' cells_phylo2 <- "(('cluster1':0.4, 'cluster2':0.4):0.4, ('cluster3':0.5,('cluster4':0.2,'cluster5':0.2,'cluster6':0.2):0.4):0.4);"
                                     +#' dend2 <- read.dendrogram(text = cells_phylo2)
                                     +#' plot(dend2)
                                     +#' # simulate clusters based on these distances
                                     +#' sim <- simData(sce_, nk = 3, cells_phylo = cells_phylo)
                                     +#' # the information about the shared 'type' genes is kept in the metadata
                                     +#' table(metadata(sim)$gene_info$shared_class)
                                     +#'
                                      #' @author Helena L Crowell
                                      #'
                                      #' @references
@@ -82,7 +113,8 @@
                                      simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                          probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
                                          p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
                                     -    p_type = 0, lfc = 2, rel_lfc = NULL) {
                                     +    p_type = 0, lfc = 2, rel_lfc = NULL,
                                     +    cells_phylo = NULL, params_dist = c(0.2, 3) ) {
                                          # throughout this code...
                                          # k: cluster ID
@@ -151,9 +183,45 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                          gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs)
                                          gs_by_k <- set_colnames(replicate(nk, gs_by_k), kids)
                                     +    if (length(cells_phylo) > 0) {
                                     +        out <- .impute_shared_type_genes(x, gs_by_k, gs_idx, cells_phylo,
                                     +                                         params_dist)
                                     +        gs_by_k <- out[[1]]
                                     +        used_tg <- out[[2]]
                                     +        shared  <- out[[3]]
+                                    +
                                     +        if (p_type != 0) {
+                                    +
                                     +            ## update gs_idx to avoid having type genes in the shared type genes
                                     +            gs_idx_red <- gs_idx
                                     +            ee_ep_id <- seq(1, 6 * length(kids) - 5, 6)
                                     +            ee_ep_id <- c(ee_ep_id, ee_ep_id + 1)
                                     +            for (i in ee_ep_id) gs_idx_red[[i]] <- (gs_idx[[i]])[-which(gs_idx[[i]] %in% used_tg)]
                                     +            type_info <- gs_by_k
                                     +            gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx_red, p_type)
+                                    +
                                     +            ## keep gene type info
                                     +            type_info[type_info == gs_by_k] <- NA
                                     +            type_info[!is.na(type_info)] <- "type"
                                     +            type_info[is.na(type_info)] <- "state"
+                                    +
                                     +        } else {
                                     +            type_info <- matrix("state", ng, nk)
                                     +        }
                                     +    } else {
                                     +        shared <- matrix("state", ng, nk)
                                     +    }
+                                    +
                                          # impute type-genes
                                     -    if (p_type != 0)
                                     +    if (p_type != 0 & length(cells_phylo) == 0)  {
                                     +        type_info <- gs_by_k
                                              gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)
+                                    +
                                     +        ## keep gene type info
                                     +        type_info[type_info == gs_by_k] <- NA
                                     +        type_info[!is.na(type_info)] <- "type"
                                     +        type_info[is.na(type_info)] <- "state"
                                     +    }
                                          # split by cluster & category
                                          gs_by_k <- split(gs_by_k, col(gs_by_k))
@@ -164,6 +232,7 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                                  gs_by_k[[k]][gs_idx[[c, k]]]))
                                          # sample logFCs
                                     +    lfc0 <- lfc
                                          lfc <- vapply(kids, function(k)
                                              lapply(unfactor(cats), function(c) {
                                                  n <- n_dd[c, k]
@@ -243,6 +312,23 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                          # reorder
                                          o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
                                          gi <- set_rownames(gi[o, ], NULL)
                                     +    # add gene infos
                                     +    gi$class <- c(t(type_info))
                                     +    gi$shared_class <- c(t(shared))
                                     +    # parameters
                                     +    params <- list(
                                     +        ng = ng,
                                     +        nc = nc,
                                     +        ns = ns,
                                     +        nk = nk,
                                     +        probs = probs,
                                     +        p_dd = p_dd,
                                     +        p_type = p_type,
                                     +        lfc = lfc0,
                                     +        rel_lfc = rel_lfc,
                                     +        cells_phylo = cells_phylo,
                                     +        params_dist = params_dist
                                     +    )
                                          # construct SCE
                                          cd$group_id <- droplevels(cd$group_id)
@@ -259,7 +345,8 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                              n_cells = table(cd$sample_id),
                                              gene_info = gi,
                                              ref_sids = ref_sids,
                                     -        ref_kids = ref_kids)
                                     +        ref_kids = ref_kids,
                                     +        parameters = params)
                                          SingleCellExperiment(
                                              assays = list(counts = as.matrix(y)),

Browse code

dream fix

HelenaLC authored on 30/01/2020 13:33:29
Showing 1 changed files

R/simData.R

History View file @ d291f4c

@@ -13,6 +13,9 @@
                                      #' @param p_dd numeric vector of length 6.
                                      #'   Specifies the probability of a gene being
                                      #'   EE, EP, DE, DP, DM, or DB, respectively.
                                     +#' @param paired logical specifying whether a paired design should
                                     +#'   be simulated (both groups use the same set of reference samples)
                                     +#'   or not (reference samples are drawn at random).
                                      #' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells
                                      #'   to be shifted to a different expression state in one group (see details).
                                      #' @param p_type numeric. Probability of EE/EP gene being a type-gene.
@@ -77,7 +80,7 @@
                                      #' @export
                                      simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                     -    probs = NULL, p_dd = diag(6)[1, ],
                                     +    probs = NULL, p_dd = diag(6)[1, ], paired = FALSE,
                                          p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
                                          p_type = 0, lfc = 2, rel_lfc = NULL) {
@@ -103,15 +106,24 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                          # sample reference clusters & samples
                                          ref_kids <- setNames(sample(kids0, nk, nk > nk0), kids)
                                     -    ref_sids <- vapply(gids, function(g)
                                     -        setNames(sample(sids0, ns, ns > ns0),
                                     -            paste0("sample", seq_len(ns))),
                                     -        character(ns))
                                     +    if (paired) {
                                     +        # use same set of reference samples for both groups
                                     +        ref_sids <- sample(sids0, ns, ns > ns0)
                                     +        ref_sids <- replicate(length(gids), ref_sids)
                                     +    } else {
                                     +        # draw reference samples at random for each group
                                     +        ref_sids <- replicate(length(gids),
                                     +            sample(sids0, ns, ns > ns0))
                                     +    }
                                     +    dimnames(ref_sids) <- list(sids, gids)
                                          if (is.null(rel_lfc))
                                              rel_lfc <- rep(1, nk)
                                     -    if (is.null(names(rel_lfc)))
                                     +    if (is.null(names(rel_lfc))) {
                                              names(rel_lfc) <- kids
                                     +    } else {
                                     +        stopifnot(names(rel_lfc) %in% kids0)
                                     +    }
                                          # initialize count matrix
                                          gs <- paste0("gene", seq_len(ng))
@@ -245,7 +257,9 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                          md <- list(
                                              experiment_info = ei,
                                              n_cells = table(cd$sample_id),
                                     -        gene_info = gi)
                                     +        gene_info = gi,
                                     +        ref_sids = ref_sids,
                                     +        ref_kids = ref_kids)
                                          SingleCellExperiment(
                                              assays = list(counts = as.matrix(y)),

Browse code

bug fix when only 1 group is simulated

HelenaLC authored on 26/11/2019 10:36:46
Showing 1 changed files

R/simData.R

History View file @ d0696f3

@@ -36,8 +36,7 @@
                                      #' sce <- prepSim(sce)
                                      #'
                                      #' # simulate data
                                     -#' (sim <- simData(sce,
                                     -#'   n_genes = 100, n_cells = 10,
                                     +#' (sim <- simData(sce, ng = 100, nc = 10,
                                      #'   p_dd = c(0.9, 0, 0.1, 0, 0, 0)))
                                      #'
                                      #' # simulation metadata
@@ -47,14 +46,12 @@
                                      #' table(gi$category)
                                      #'
                                      #' # unbalanced sample sizes
                                     -#' sim <- simData(sce,
                                     -#'   n_genes = 10, n_cells = 100,
                                     +#' sim <- simData(sce, ng = 10, nc = 100,
                                      #'   probs = list(NULL, c(0.25, 0.75), NULL))
                                      #' table(sim$sample_id)
                                      #'
                                      #' # one group only
                                     -#' sim <- simData(sce,
                                     -#'   n_genes = 10, n_cells = 100,
                                     +#' sim <- simData(sce, ng = 10, nc = 100,
                                      #'   probs = list(NULL, NULL, c(1, 0)))
                                      #' levels(sim$group_id)
                                      #'
@@ -236,6 +233,7 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                          gi <- set_rownames(gi[o, ], NULL)
                                          # construct SCE
                                     +    cd$group_id <- droplevels(cd$group_id)
                                          cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
                                          m <- match(levels(cd$sample_id), cd$sample_id)
                                          gids <- cd$group_id[m]

Browse code

add parameter to vary percentage of cells to shift for EP, DP, and DM genes

HelenaLC authored on 19/11/2019 14:36:38
Showing 1 changed files

R/simData.R

History View file @ 6455b77

@@ -13,6 +13,8 @@
                                      #' @param p_dd numeric vector of length 6.
                                      #'   Specifies the probability of a gene being
                                      #'   EE, EP, DE, DP, DM, or DB, respectively.
                                     +#' @param p_ep,p_dp,p_dm numeric specifying the proportion of cells
                                     +#'   to be shifted to a different expression state in one group (see details).
                                      #' @param p_type numeric. Probability of EE/EP gene being a type-gene.
                                      #'   If a gene is of class "type" in a given cluster, a unique mean
                                      #'   will be used for that gene in the respective cluster.
@@ -78,8 +80,9 @@
                                      #' @export
                                      simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                     -    probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
                                     -    lfc = 2, rel_lfc = NULL) {
                                     +    probs = NULL, p_dd = diag(6)[1, ],
                                     +    p_ep = 0.5, p_dp = 0.3, p_dm = 0.5,
                                     +    p_type = 0, lfc = 2, rel_lfc = NULL) {
                                          # throughout this code...
                                          # k: cluster ID
@@ -201,7 +204,7 @@ simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                                      d_kc <- d[gs0]
                                                      lfc_kc <- lfc[[c, k]]
                                     -                re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc)
                                     +                re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc, p_ep, p_dp, p_dm)
                                                      y[gi, ci] <- re$cs
                                                      for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse(

Browse code

fix sample pairing

HelenaLC authored on 15/11/2019 11:58:09
Showing 1 changed files

R/simData.R

History View file @ 2ad2286

@@ -6,9 +6,7 @@
                                      #' across 2 experimental conditions from a real scRNA-seq data set.
                                      #'
                                      #' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
                                     -#' @param n_genes # of genes to simulate.
                                     -#' @param n_cells # of cells to simulate.
                                     -#'   Either a single numeric or a range to sample from.
                                     +#' @param ng,nc,ns,nk # of genes, cells, samples and clusters to simulate.
                                      #' @param probs a list of length 3 containing probabilities of a cell belonging
                                      #'   to each cluster, sample, and group, respectively. List elements must be
                                      #'   NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.
@@ -79,7 +77,7 @@
                                      #' @importFrom S4Vectors split
                                      #' @export
                                     -simData <- function(x, n_genes = 500, n_cells = 300,
                                     +simData <- function(x, ng = nrow(x), nc = 2e3, ns = 3, nk = 3,
                                          probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
                                          lfc = 2, rel_lfc = NULL) {
@@ -88,15 +86,27 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          # s: sample ID
                                          # g: group ID
                                          # c: DD category
                                     +    # 0: reference
                                          # check validity of input arguments
                                          .check_sce(x, req_group = FALSE)
                                          .check_args_simData(as.list(environment()))
                                     -    kids <- set_names(levels(x$cluster_id))
                                     -    sids <- set_names(levels(x$sample_id))
                                     +    # reference IDs
                                     +    nk0 <- length(kids0 <- set_names(levels(x$cluster_id)))
                                     +    ns0 <- length(sids0 <- set_names(levels(x$sample_id)))
+                                    +
                                     +    # simulation IDs
                                     +    nk <- length(kids <- set_names(paste0("cluster", seq_len(nk))))
                                     +    sids <- set_names(paste0("sample", seq_len(ns)))
                                          gids <- set_names(c("A", "B"))
                                     -    nk <- length(kids)
+                                    +
                                     +    # sample reference clusters & samples
                                     +    ref_kids <- setNames(sample(kids0, nk, nk > nk0), kids)
                                     +    ref_sids <- vapply(gids, function(g)
                                     +        setNames(sample(sids0, ns, ns > ns0),
                                     +            paste0("sample", seq_len(ns))),
                                     +        character(ns))
                                          if (is.null(rel_lfc))
                                              rel_lfc <- rep(1, nk)
@@ -104,15 +114,15 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                              names(rel_lfc) <- kids
                                          # initialize count matrix
                                     -    gs <- paste0("gene", seq_len(n_genes))
                                     -    cs <- paste0("cell", seq_len(n_cells))
                                     -    y <- matrix(0, n_genes, n_cells, dimnames = list(gs, cs))
                                     +    gs <- paste0("gene", seq_len(ng))
                                     +    cs <- paste0("cell", seq_len(nc))
                                     +    y <- matrix(0, ng, nc, dimnames = list(gs, cs))
                                          # sample cell metadata
                                          cd <- .sample_cell_md(
                                     -        n = n_cells, probs = probs,
                                     -        ids = list(kids, sids, gids)) %>%
                                     -        set_rownames(cs)
                                     +        n = nc, probs = probs,
                                     +        ids = list(kids, sids, gids))
                                     +    rownames(cd) <- cs
                                          cs_idx <- .split_cells(cd, by = colnames(cd))
                                          n_cs <- modify_depth(cs_idx, -1, length)
@@ -120,14 +130,14 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          cs_by_ks <- .split_cells(x)
                                          # sample nb. of genes to simulate per category & gene indices
                                     -    n_dd <- replicate(nk,
                                     -        table(sample(factor(cats, levels = cats), n_genes, TRUE, p_dd))) %>%
                                     -        set_colnames(kids)
                                     +    n_dd <- table(sample(cats, ng, TRUE, p_dd))
                                     +    n_dd <- replicate(nk, n_dd)
                                     +    colnames(n_dd) <- kids
                                          gs_idx <- .sample_gene_inds(gs, n_dd)
                                          # for ea. cluster, sample set of genes to simulate from
                                     -    gs_by_k <- setNames(sample(rownames(x), n_genes, TRUE), gs)
                                     -    gs_by_k <- replicate(nk, gs_by_k) %>% set_colnames(kids)
                                     +    gs_by_k <- setNames(sample(rownames(x), ng, TRUE), gs)
                                     +    gs_by_k <- set_colnames(replicate(nk, gs_by_k), kids)
                                          # impute type-genes
                                          if (p_type != 0)
@@ -135,74 +145,70 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          # split by cluster & category
                                          gs_by_k <- split(gs_by_k, col(gs_by_k))
                                     -    gs_by_k <- map(gs_by_k, set_names, gs)
                                     -    names(gs_by_k) <- kids
                                     +    gs_by_k <- setNames(map(gs_by_k, set_names, gs), kids)
                                          gs_by_kc <- lapply(kids, function(k)
                                     -        lapply(cats, function(c)
                                     -            gs_by_k[[k]][gs_idx[[c, k]]]) %>%
                                     -            set_names(cats))
                                     +        lapply(unfactor(cats), function(c)
                                     +            gs_by_k[[k]][gs_idx[[c, k]]]))
                                          # sample logFCs
                                          lfc <- vapply(kids, function(k)
                                     -        lapply(cats, function(c) {
                                     +        lapply(unfactor(cats), function(c) {
                                                  n <- n_dd[c, k]
                                                  if (c == "ee") return(rep(NA, n))
                                                  signs <- sample(c(-1, 1), n, TRUE)
                                                  lfcs <- rgamma(n, 4, 4/lfc) * signs
                                                  names(lfcs) <- gs_by_kc[[k]][[c]]
                                                  lfcs * rel_lfc[k]
                                     -        }), vector("list", length(cats))) %>%
                                     -        set_rownames(cats)
+                                    -
                                     +        }), vector("list", length(cats)))
+                                    +
                                          # compute NB parameters
                                     -    o <- exp(colData(x)$offset)
                                     -    m <- lapply(sids, function(s) {
                                     -        cn <- paste("beta", s, sep = ".")
                                     -        k <- grep(cn, names(rowData(x)))
                                     -        b <- exp(rowData(x)[[k]])
                                     -        m <- vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>%
                                     -            set_rownames(rownames(x)) %>%
                                     -            set_colnames(colnames(x))
                                     +    m <- lapply(sids0, function(s) {
                                     +        b <- paste0("beta.", s)
                                     +        b <- exp(rowData(x)[[b]])
                                     +        m <- outer(b, exp(x$offset), "*")
                                     +        dimnames(m) <- dimnames(x); m
                                          })
                                     -    d <- rowData(x)$dispersion %>%
                                     -        set_names(rownames(x))
                                     +    d <- rowData(x)$dispersion
                                     +    names(d) <- rownames(x)
                                          sim_mean <- lapply(kids, function(k)
                                     -        lapply(gids, function(g)
                                     -            setNames(numeric(n_genes), rownames(y))))
                                     +        lapply(gids, function(g)
                                     +            setNames(numeric(ng), gs)))
+                                    +
                                          for (k in kids) {
                                              for (s in sids) {
                                     +            # get reference samples, clusters & cells
                                     +            s0 <- ref_sids[s, ]
                                     +            k0 <- ref_kids[k]
                                     +            cs0 <- cs_by_ks[[k0]][s0]
+                                    +
                                     +            # get output cell indices
                                     +            ci <- unlist(cs_idx[[k]][[s]])
+                                    +
                                                  for (c in cats[n_dd[, k] != 0]) {
                                     -                gs_kc <- gs_by_kc[[k]][[c]]
                                     -                cs_ks <- cs_by_ks[[k]][[s]]
                                     +                # sample cells to simulate from
                                     +                cs_g1 <- sample(cs0[[1]], n_cs[[k]][[s]][[1]], TRUE)
                                     +                cs_g2 <- sample(cs0[[2]], n_cs[[k]][[s]][[2]], TRUE)
                                     -                g1 <- cs_idx[[k]][[s]]$A
                                     -                g2 <- cs_idx[[k]][[s]]$B
                                     +                # get reference genes & output gene indices
                                     +                gs0 <- gs_by_kc[[k]][[c]]
                                     +                gi <- gs_idx[[c, k]]
                                     -                ng1 <- length(g1)
                                     -                ng2 <- length(g2)
+                                    -
                                     -                cs_g1 <- sample(cs_ks, ng1, replace = TRUE)
                                     -                cs_g2 <- sample(cs_ks, ng2, replace = TRUE)
+                                    -
                                     -                m_g1 <- m[[s]][gs_kc, cs_g1, drop = FALSE]
                                     -                m_g2 <- m[[s]][gs_kc, cs_g2, drop = FALSE]
                                     -                d_kc <- d[gs_kc]
                                     +                # get NB parameters
                                     +                m_g1 <- m[[s0[[1]]]][gs0, cs_g1, drop = FALSE]
                                     +                m_g2 <- m[[s0[[2]]]][gs0, cs_g2, drop = FALSE]
                                     +                d_kc <- d[gs0]
                                                      lfc_kc <- lfc[[c, k]]
                                     -                gidx <- gs_idx[[c, k]]
                                     -                cidx <- c(g1, g2)
+                                    -
                                                      re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc)
                                     -                y[gidx, cidx] <- re$cs
                                     +                y[gi, ci] <- re$cs
                                     -                for (g in c("A", "B")) sim_mean[[k]][[g]][gidx] <-
                                     -                    ifelse(is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]]
                                     +                for (g in gids) sim_mean[[k]][[g]][gi] <- ifelse(
                                     +                    is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]]
+                                                 }
+                                             }
+                                         }
+                                    -
                                          sim_mean <- sim_mean %>%
                                              map(bind_cols) %>%
                                              bind_rows(.id = "cluster_id") %>%
@@ -224,7 +230,7 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                              rename("sim_mean.A" = "A", "sim_mean.B" = "B")
                                          # reorder
                                          o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
                                     -    gi <- gi[o, ] %>% set_rownames(NULL)
                                     +    gi <- set_rownames(gi[o, ], NULL)
                                          # construct SCE
                                          cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
@@ -242,6 +248,5 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          SingleCellExperiment(
                                              assays = list(counts = as.matrix(y)),
                                     -        colData = cd,
                                     -        metadata = md)
                                     +        colData = cd, metadata = md)
+                                     }

Browse code

rmv rounding of means

HelenaLC authored on 11/11/2019 11:13:02
Showing 1 changed files

R/simData.R

History View file @ 635a99f

@@ -161,10 +161,9 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                              cn <- paste("beta", s, sep = ".")
                                              k <- grep(cn, names(rowData(x)))
                                              b <- exp(rowData(x)[[k]])
                                     -        vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>%
                                     +        m <- vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>%
                                                  set_rownames(rownames(x)) %>%
                                     -            set_colnames(colnames(x)) %>%
                                     -            round
                                     +            set_colnames(colnames(x))
                                          })
                                          d <- rowData(x)$dispersion %>%
                                              set_names(rownames(x))

Browse code

lfc default 1 > 2

HelenaLC authored on 12/08/2019 15:23:53
Showing 1 changed files

R/simData.R

History View file @ a173415

@@ -81,7 +81,7 @@
                                      simData <- function(x, n_genes = 500, n_cells = 300,
                                          probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
                                     -    lfc = 1, rel_lfc = NULL) {
                                     +    lfc = 2, rel_lfc = NULL) {
                                          # throughout this code...
                                          # k: cluster ID

Browse code

Merge branch 'devel' of https://github.com/HelenaLC/muscat into devel

HelenaLC authored on 09/08/2019 13:42:08
Showing 0 changed files

Browse code

set default lfc to 1

HelenaLC authored on 09/08/2019 13:41:35
Showing 1 changed files

R/simData.R

History View file @ 7ad9f91

@@ -81,7 +81,7 @@
                                      simData <- function(x, n_genes = 500, n_cells = 300,
                                          probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
                                     -    lfc = 2, rel_lfc = NULL) {
                                     +    lfc = 1, rel_lfc = NULL) {
                                          # throughout this code...
                                          # k: cluster ID

Browse code

fix examples using new example SCE

HelenaLC authored on 06/08/2019 16:10:56
Showing 1 changed files

R/simData.R

History View file @ 4e007e1

@@ -49,7 +49,7 @@
                                      #' # unbalanced sample sizes
                                      #' sim <- simData(sce,
                                      #'   n_genes = 10, n_cells = 100,
                                     -#'   probs = list(NULL, c(0.1, 0.3, 0.6), NULL))
                                     +#'   probs = list(NULL, c(0.25, 0.75), NULL))
                                      #' table(sim$sample_id)
                                      #'
                                      #' # one group only

Browse code

load sce for examples

HelenaLC authored on 05/08/2019 13:20:31
Showing 1 changed files

R/simData.R

History View file @ 12b36e8

@@ -30,6 +30,7 @@
                                      #'
                                      #' @examples
                                      #' data(sce)
                                     +#' library(SingleCellExperiment)
                                      #'
                                      #' # prep. SCE for simulation
                                      #' sce <- prepSim(sce)

Browse code

fix imports

Helena Lucia Crowell authored on 05/08/2019 12:42:47
Showing 1 changed files

R/simData.R

History View file @ 02252d4

@@ -71,7 +71,7 @@
                                      #' @importFrom dplyr mutate_all mutate_at
                                      #' @importFrom edgeR DGEList estimateDisp glmFit
                                      #' @importFrom magrittr set_colnames set_rownames
                                     -#' @importFrom purrr modify_at set_names
                                     +#' @importFrom purrr modify_depth set_names
                                      #' @importFrom stats model.matrix rgamma setNames
                                      #' @importFrom SingleCellExperiment SingleCellExperiment
                                      #' @importFrom SummarizedExperiment colData

Browse code

add wrappers for validity checks of function arguments

Helena Lucia Crowell authored on 05/08/2019 11:11:16
Showing 1 changed files

R/simData.R

History View file @ 87fafb3

@@ -30,9 +30,32 @@
                                      #'
                                      #' @examples
                                      #' data(sce)
                                     -#' simData(sce,
                                     -#'     n_genes = 10, n_cells = 10,
                                     -#'     p_dd = diag(6)[1, ])
                                     +#'
                                     +#' # prep. SCE for simulation
                                     +#' sce <- prepSim(sce)
                                     +#'
                                     +#' # simulate data
                                     +#' (sim <- simData(sce,
                                     +#'   n_genes = 100, n_cells = 10,
                                     +#'   p_dd = c(0.9, 0, 0.1, 0, 0, 0)))
                                     +#'
                                     +#' # simulation metadata
                                     +#' head(gi <- metadata(sim)$gene_info)
                                     +#'
                                     +#' # should be ~10% DE
                                     +#' table(gi$category)
                                     +#'
                                     +#' # unbalanced sample sizes
                                     +#' sim <- simData(sce,
                                     +#'   n_genes = 10, n_cells = 100,
                                     +#'   probs = list(NULL, c(0.1, 0.3, 0.6), NULL))
                                     +#' table(sim$sample_id)
                                     +#'
                                     +#' # one group only
                                     +#' sim <- simData(sce,
                                     +#'   n_genes = 10, n_cells = 100,
                                     +#'   probs = list(NULL, NULL, c(1, 0)))
                                     +#' levels(sim$group_id)
                                      #'
                                      #' @author Helena L Crowell
                                      #'
@@ -53,7 +76,6 @@
                                      #' @importFrom SingleCellExperiment SingleCellExperiment
                                      #' @importFrom SummarizedExperiment colData
                                      #' @importFrom S4Vectors split
                                     -#' @importFrom tibble column_to_rownames
                                      #' @export
                                      simData <- function(x, n_genes = 500, n_cells = 300,
@@ -63,35 +85,22 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          # throughout this code...
                                          # k: cluster ID
                                          # s: sample ID
                                     -    # c: gene category
                                     +    # g: group ID
                                     +    # c: DD category
                                          # check validity of input arguments
                                          .check_sce(x, req_group = FALSE)
                                     -    stopifnot(is.numeric(n_genes), length(n_genes) == 1)
                                     -    stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
                                     -    stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
                                     -    stopifnot(is.numeric(p_type), length(p_type) == 1, p_type >= 0)
                                     -    stopifnot(is.numeric(lfc), is.numeric(lfc), lfc > 1)
+                                    -
                                     -    kids <- levels(x$cluster_id)
                                     -    sids <- levels(x$sample_id)
                                     -    gids <- c("A", "B")
                                     -    names(kids) <- kids
                                     -    names(sids) <- sids
                                     -    names(gids) <- gids
                                     +    .check_args_simData(as.list(environment()))
+                                    +
                                     +    kids <- set_names(levels(x$cluster_id))
                                     +    sids <- set_names(levels(x$sample_id))
                                     +    gids <- set_names(c("A", "B"))
                                          nk <- length(kids)
                                     -    if (is.null(rel_lfc)) {
                                     +    if (is.null(rel_lfc))
                                              rel_lfc <- rep(1, nk)
                                     +    if (is.null(names(rel_lfc)))
                                              names(rel_lfc) <- kids
                                     -    } else {
                                     -        stopifnot(is.numeric(rel_lfc), length(rel_lfc) == nk, rel_lfc >= 0)
                                     -        if (is.null(names(rel_lfc))) {
                                     -            names(rel_lfc) <- kids
                                     -        } else {
                                     -            stopifnot(setequal(names(rel_lfc), kids))
                                     -        }
                                     -    }
                                          # initialize count matrix
                                          gs <- paste0("gene", seq_len(n_genes))

Browse code

remove my email!!

HelenaLC authored on 02/08/2019 14:19:13
Showing 1 changed files

R/simData.R

History View file @ 1b02bcc

@@ -25,8 +25,6 @@
                                      #'   \code{levels(x$cluster_id)} as names.
                                      #'   Defaults to factor of 1 for all clusters.
                                      #'
                                     -#'
                                     -#'
                                      #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
                                      #'   containing multiple clusters & samples across 2 groups.
                                      #'
@@ -35,6 +33,16 @@
                                      #' simData(sce,
                                      #'     n_genes = 10, n_cells = 10,
                                      #'     p_dd = diag(6)[1, ])
                                     +#'
                                     +#' @author Helena L Crowell
                                     +#'
                                     +#' @references
                                     +#' Crowell, HL, Soneson, C, Germain, P-L, Calini, D,
                                     +#' Collin, L, Raposo, C, Malhotra, D & Robinson, MD:
                                     +#' On the discovery of population-specific state transitions from
                                     +#' multi-sample multi-condition single-cell RNA sequencing data.
                                     +#' \emph{bioRxiv} \strong{713412} (2018).
                                     +#' doi: \url{https://doi.org/10.1101/713412}
                                      #'
                                      #' @importFrom data.table data.table
                                      #' @importFrom dplyr mutate_all mutate_at
@@ -46,7 +54,6 @@
                                      #' @importFrom SummarizedExperiment colData
                                      #' @importFrom S4Vectors split
                                      #' @importFrom tibble column_to_rownames
                                     -#'
                                      #' @export
                                      simData <- function(x, n_genes = 500, n_cells = 300,

Browse code

merge plger branch mm-glmm methods; pass BiocCheck

HelenaLC authored on 03/07/2019 15:01:55
Showing 1 changed files

R/simData.R

History View file @ 1cd7b5c

@@ -117,8 +117,10 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                              gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)
                                         # split by cluster & category
                                     -    gs_by_k <- gs_by_k %>% split(col(.)) %>%
                                     -        set_names(kids) %>% map(set_names, gs)
                                     +    gs_by_k <- split(gs_by_k, col(gs_by_k))
                                     +    gs_by_k <- map(gs_by_k, set_names, gs)
                                     +    names(gs_by_k) <- kids
+                                    +
                                          gs_by_kc <- lapply(kids, function(k)
                                              lapply(cats, function(c)
                                                  gs_by_k[[k]][gs_idx[[c, k]]]) %>%

Browse code

Merge branch 'devel'

HelenaLC authored on 02/07/2019 08:22:56
Showing 0 changed files

Browse code

merge devel

HelenaLC authored on 02/07/2019 08:22:01
Showing 1 changed files

R/simData.R

History View file @ 98bc0a7

@@ -144,7 +144,8 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                              b <- exp(rowData(x)[[k]])
                                              vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>%
                                                  set_rownames(rownames(x)) %>%
                                     -            set_colnames(colnames(x))
                                     +            set_colnames(colnames(x)) %>%
                                     +            round
                                          })
                                          d <- rowData(x)$dispersion %>%
                                              set_names(rownames(x))

Browse code

bug fix for simulating 1 group only

HelenaLC authored on 17/06/2019 11:10:54
Showing 1 changed files

R/simData.R

History View file @ 1b31064

...	...	@@ -177,8 +177,9 @@ simData <- function(x, n_genes = 500, n_cells = 300,
177	177
178	178	re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc)
179	179	y[gidx, cidx] <- re$cs
180		- sim_mean[[k]]$A[gidx] <- re$ms$A
181		- sim_mean[[k]]$B[gidx] <- re$ms$B
	180	+
	181	+ for (g in c("A", "B")) sim_mean[[k]][[g]][gidx] <-
	182	+ ifelse(is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]]
182	183	}
183	184	}
184	185	}

Browse code

reformat code

HelenaLC authored on 15/05/2019 10:32:01
Showing 1 changed files

R/simData.R

History View file @ 5e2c7b6

@@ -94,7 +94,8 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          # sample cell metadata
                                          cd <- .sample_cell_md(
                                              n = n_cells, probs = probs,
                                     -        ids = list(kids, sids, gids)) %>% set_rownames(cs)
                                     +        ids = list(kids, sids, gids)) %>%
                                     +        set_rownames(cs)
                                          cs_idx <- .split_cells(cd, by = colnames(cd))
                                          n_cs <- modify_depth(cs_idx, -1, length)

Browse code

add cluster-specific mean logFC shift

HelenaLC authored on 01/04/2019 18:50:57
Showing 1 changed files

R/simData.R

History View file @ a6ea37f

@@ -15,11 +15,17 @@
                                      #' @param p_dd numeric vector of length 6.
                                      #'   Specifies the probability of a gene being
                                      #'   EE, EP, DE, DP, DM, or DB, respectively.
                                     -#' @param fc numeric value to use as mean logFC
                                     -#'   for DE, DP, DM, and DB type of genes.
                                      #' @param p_type numeric. Probability of EE/EP gene being a type-gene.
                                      #'   If a gene is of class "type" in a given cluster, a unique mean
                                      #'   will be used for that gene in the respective cluster.
                                     +#' @param lfc numeric value to use as mean logFC
                                     +#'   for DE, DP, DM, and DB type of genes.
                                     +#' @param rel_lfc numeric vector of relative logFCs for each cluster.
                                     +#'   Should be of length \code{nlevels(x$cluster_id)} with
                                     +#'   \code{levels(x$cluster_id)} as names.
                                     +#'   Defaults to factor of 1 for all clusters.
                                     +#'
                                     +#'
                                      #'
                                      #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
                                      #'   containing multiple clusters & samples across 2 groups.
@@ -45,7 +51,7 @@
                                      simData <- function(x, n_genes = 500, n_cells = 300,
                                          probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
                                     -    fc = 2, rel_fc = NULL) {
                                     +    lfc = 2, rel_lfc = NULL) {
                                          # throughout this code...
                                          # k: cluster ID
@@ -58,7 +64,7 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
                                          stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
                                          stopifnot(is.numeric(p_type), length(p_type) == 1, p_type >= 0)
                                     -    stopifnot(is.numeric(fc), is.numeric(fc), fc > 1)
                                     +    stopifnot(is.numeric(lfc), is.numeric(lfc), lfc > 1)
                                          kids <- levels(x$cluster_id)
                                          sids <- levels(x$sample_id)
@@ -67,10 +73,17 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          names(sids) <- sids
                                          names(gids) <- gids
                                          nk <- length(kids)
                                     -    if (is.null(rel_fc)) {
                                     -        rel_fc <- rep(1, nk)
+                                    +
                                     +    if (is.null(rel_lfc)) {
                                     +        rel_lfc <- rep(1, nk)
                                     +        names(rel_lfc) <- kids
                                          } else {
                                     -        stopifnot(is.numeric(rel_fc), length(rel_fc) == nk, rel_fc >= 0)
                                     +        stopifnot(is.numeric(rel_lfc), length(rel_lfc) == nk, rel_lfc >= 0)
                                     +        if (is.null(names(rel_lfc))) {
                                     +            names(rel_lfc) <- kids
                                     +        } else {
                                     +            stopifnot(setequal(names(rel_lfc), kids))
                                     +        }
+                                         }
                                          # initialize count matrix
@@ -116,9 +129,9 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                                  n <- n_dd[c, k]
                                                  if (c == "ee") return(rep(NA, n))
                                                  signs <- sample(c(-1, 1), n, TRUE)
                                     -            lfc <- rgamma(n, 4, 4/fc) * signs
                                     -            names(lfc) <- gs_by_kc[[k]][[c]]
                                     -            return(lfc)
                                     +            lfcs <- rgamma(n, 4, 4/lfc) * signs
                                     +            names(lfcs) <- gs_by_kc[[k]][[c]]
                                     +            lfcs * rel_lfc[k]
                                              }), vector("list", length(cats))) %>%
                                              set_rownames(cats)

Browse code

code cleaning, update simulation

HelenaLC authored on 28/03/2019 10:21:29
Showing 1 changed files

R/simData.R

History View file @ 38c038b

@@ -17,6 +17,9 @@
                                      #'   EE, EP, DE, DP, DM, or DB, respectively.
                                      #' @param fc numeric value to use as mean logFC
                                      #'   for DE, DP, DM, and DB type of genes.
                                     +#' @param p_type numeric. Probability of EE/EP gene being a type-gene.
                                     +#'   If a gene is of class "type" in a given cluster, a unique mean
                                     +#'   will be used for that gene in the respective cluster.
                                      #'
                                      #' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
                                      #'   containing multiple clusters & samples across 2 groups.
@@ -41,7 +44,8 @@
                                      #' @export
                                      simData <- function(x, n_genes = 500, n_cells = 300,
                                     -    probs = NULL, p_dd = diag(6)[1, ], fc = 2) {
                                     +    probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
                                     +    fc = 2, rel_fc = NULL) {
                                          # throughout this code...
                                          # k: cluster ID
@@ -53,6 +57,7 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          stopifnot(is.numeric(n_genes), length(n_genes) == 1)
                                          stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
                                          stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
                                     +    stopifnot(is.numeric(p_type), length(p_type) == 1, p_type >= 0)
                                          stopifnot(is.numeric(fc), is.numeric(fc), fc > 1)
                                          kids <- levels(x$cluster_id)
@@ -62,6 +67,11 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          names(sids) <- sids
                                          names(gids) <- gids
                                          nk <- length(kids)
                                     +    if (is.null(rel_fc)) {
                                     +        rel_fc <- rep(1, nk)
                                     +    } else {
                                     +        stopifnot(is.numeric(rel_fc), length(rel_fc) == nk, rel_fc >= 0)
                                     +    }
                                          # initialize count matrix
                                          gs <- paste0("gene", seq_len(n_genes))
@@ -86,10 +96,15 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          # for ea. cluster, sample set of genes to simulate from
                                          gs_by_k <- setNames(sample(rownames(x), n_genes, TRUE), gs)
                                     -    gs_by_k <- replicate(nk, gs_by_k, simplify = FALSE) %>% set_names(kids)
                                     -    #gs_by_k <- replicate(nk,
                                     -    #    setNames(sample(rownames(x), n_genes, TRUE), gs),
                                     -    #    simplify = FALSE) %>% set_names(kids)
                                     +    gs_by_k <- replicate(nk, gs_by_k) %>% set_colnames(kids)
+                                    +
                                     +    # impute type-genes
                                     +    if (p_type != 0)
                                     +        gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)
+                                    +
                                     +    # split by cluster & category
                                     +    gs_by_k <- gs_by_k %>% split(col(.)) %>%
                                     +        set_names(kids) %>% map(set_names, gs)
                                          gs_by_kc <- lapply(kids, function(k)
                                              lapply(cats, function(c)
                                                  gs_by_k[[k]][gs_idx[[c, k]]]) %>%
@@ -108,12 +123,17 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                              set_rownames(cats)
                                          # compute NB parameters
                                     -    b <- exp(rowData(x)$beta)
                                          o <- exp(colData(x)$offset)
                                     -    m <- vapply(o, function(l) b*l, numeric(nrow(x)))
                                     -    dimnames(m) <- dimnames(x)
                                     -    d <- rowData(x)$dispersion
                                     -    names(d) <- rownames(x)
                                     +    m <- lapply(sids, function(s) {
                                     +        cn <- paste("beta", s, sep = ".")
                                     +        k <- grep(cn, names(rowData(x)))
                                     +        b <- exp(rowData(x)[[k]])
                                     +        vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>%
                                     +            set_rownames(rownames(x)) %>%
                                     +            set_colnames(colnames(x))
                                     +    })
                                     +    d <- rowData(x)$dispersion %>%
                                     +        set_names(rownames(x))
                                          sim_mean <- lapply(kids, function(k)
                                              lapply(gids, function(g)
@@ -133,24 +153,30 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                                      cs_g1 <- sample(cs_ks, ng1, replace = TRUE)
                                                      cs_g2 <- sample(cs_ks, ng2, replace = TRUE)
                                     -                m_g1 <- m[gs_kc, cs_g1, drop = FALSE]
                                     -                m_g2 <- m[gs_kc, cs_g2, drop = FALSE]
                                     +                m_g1 <- m[[s]][gs_kc, cs_g1, drop = FALSE]
                                     +                m_g2 <- m[[s]][gs_kc, cs_g2, drop = FALSE]
                                                      d_kc <- d[gs_kc]
                                                      lfc_kc <- lfc[[c, k]]
                                                      gidx <- gs_idx[[c, k]]
                                                      cidx <- c(g1, g2)
                                     -                counts <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d = d_kc, lfc = lfc_kc)
                                     -                y[gidx, cidx] <- counts
                                     -                sim_mean[[k]]$A[gidx] <- rowMeans(m_g1) # ... * lfc ??
                                     -                sim_mean[[k]]$B[gidx] <- rowMeans(m_g2)
                                     +                re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc)
                                     +                y[gidx, cidx] <- re$cs
                                     +                sim_mean[[k]]$A[gidx] <- re$ms$A
                                     +                sim_mean[[k]]$B[gidx] <- re$ms$B
+                                                 }
+                                             }
+                                         }
                                     -    # construct gene metadata table storing
                                     -    # gene | cluster_id | category | logFC
                                     +    sim_mean <- sim_mean %>%
                                     +        map(bind_cols) %>%
                                     +        bind_rows(.id = "cluster_id") %>%
                                     +        mutate_at("cluster_id", factor) %>%
                                     +        mutate(gene = rep(gs, nk))
+                                    +
                                     +    # construct gene metadata table storing ------------------------------------
                                     +    # gene | cluster_id | category | logFC, gene, disp, mean used for sim.
                                          gi <- data.frame(
                                              gene = unlist(gs_idx),
                                              cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)),
@@ -159,15 +185,13 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                              sim_gene = unlist(gs_by_kc),
                                              sim_disp = d[unlist(gs_by_kc)]) %>%
                                              mutate_at("gene", as.character)
                                     +    # add true simulation means
                                     +    gi <- full_join(gi, sim_mean, by = c("gene", "cluster_id")) %>%
                                     +        rename("sim_mean.A" = "A", "sim_mean.B" = "B")
                                     +    # reorder
                                          o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
                                          gi <- gi[o, ] %>% set_rownames(NULL)
                                     -    a <- unlist(map_depth(sim_mean, 1, "A"))
                                     -    b <- unlist(map_depth(sim_mean, 1, "B"))
                                     -    o <- order(as.numeric(gsub(".*\\.[a-z]+", "", names(a))))
                                     -    gi$sim_mean.A <- a[o]
                                     -    gi$sim_mean.B <- b[o]
+                                    -
                                          # construct SCE
                                          cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
                                          m <- match(levels(cd$sample_id), cd$sample_id)
@@ -187,21 +211,3 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                              colData = cd,
                                              metadata = md)
+                                     }
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -

Browse code

add gs used for sim to output sce

HelenaLC authored on 12/03/2019 08:53:17
Showing 1 changed files

R/simData.R

History View file @ 23d823d

@@ -22,8 +22,8 @@
                                      #'   containing multiple clusters & samples across 2 groups.
                                      #'
                                      #' @examples
                                     -#' data(kang)
                                     -#' simData(kang,
                                     +#' data(sce)
                                     +#' simData(sce,
                                      #'     n_genes = 10, n_cells = 10,
                                      #'     p_dd = diag(6)[1, ])
                                      #'
@@ -49,14 +49,14 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          # c: gene category
                                          # check validity of input arguments
                                     -    stopifnot(is(x, "SingleCellExperiment"))
                                     +    .check_sce(x, req_group = FALSE)
                                          stopifnot(is.numeric(n_genes), length(n_genes) == 1)
                                          stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
                                          stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
                                          stopifnot(is.numeric(fc), is.numeric(fc), fc > 1)
                                     -    kids <- levels(colData(x)$cluster_id)
                                     -    sids <- levels(colData(x)$sample_id)
                                     +    kids <- levels(x$cluster_id)
                                     +    sids <- levels(x$sample_id)
                                          gids <- c("A", "B")
                                          names(kids) <- kids
                                          names(sids) <- sids
@@ -70,7 +70,7 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          # sample cell metadata
                                          cd <- .sample_cell_md(
                                     -        n = n_cells, probs = NULL,
                                     +        n = n_cells, probs = probs,
                                              ids = list(kids, sids, gids)) %>% set_rownames(cs)
                                          cs_idx <- .split_cells(cd, by = colnames(cd))
                                          n_cs <- modify_depth(cs_idx, -1, length)
@@ -115,6 +115,9 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                          d <- rowData(x)$dispersion
                                          names(d) <- rownames(x)
                                     +    sim_mean <- lapply(kids, function(k)
                                     +        lapply(gids, function(g)
                                     +            setNames(numeric(n_genes), rownames(y))))
                                          for (k in kids) {
                                              for (s in sids) {
                                                  for (c in cats[n_dd[, k] != 0]) {
@@ -135,8 +138,13 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                                      d_kc <- d[gs_kc]
                                                      lfc_kc <- lfc[[c, k]]
                                     +                gidx <- gs_idx[[c, k]]
                                     +                cidx <- c(g1, g2)
+                                    +
                                                      counts <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d = d_kc, lfc = lfc_kc)
                                     -                y[gs_idx[[c, k]], c(g1, g2)] <- counts
                                     +                y[gidx, cidx] <- counts
                                     +                sim_mean[[k]]$A[gidx] <- rowMeans(m_g1) # ... * lfc ??
                                     +                sim_mean[[k]]$B[gidx] <- rowMeans(m_g2)
+                                                 }
+                                             }
+                                         }
@@ -147,11 +155,19 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                              gene = unlist(gs_idx),
                                              cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)),
                                              category = rep.int(rep(cats, nk), c(n_dd)),
                                     -        logFC = unlist(lfc)) %>%
                                     +        logFC = unlist(lfc),
                                     +        sim_gene = unlist(gs_by_kc),
                                     +        sim_disp = d[unlist(gs_by_kc)]) %>%
                                              mutate_at("gene", as.character)
                                          o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
                                          gi <- gi[o, ] %>% set_rownames(NULL)
                                     +    a <- unlist(map_depth(sim_mean, 1, "A"))
                                     +    b <- unlist(map_depth(sim_mean, 1, "B"))
                                     +    o <- order(as.numeric(gsub(".*\\.[a-z]+", "", names(a))))
                                     +    gi$sim_mean.A <- a[o]
                                     +    gi$sim_mean.B <- b[o]
+                                    +
                                          # construct SCE
                                          cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
                                          m <- match(levels(cd$sample_id), cd$sample_id)

Browse code

add vignette

HelenaLC authored on 26/02/2019 11:13:34
Showing 1 changed files

R/simData.R

History View file @ f80a808

@@ -9,7 +9,9 @@
                                      #' @param n_genes # of genes to simulate.
                                      #' @param n_cells # of cells to simulate.
                                      #'   Either a single numeric or a range to sample from.
                                     -#' @param ns nb. of genes common to 1, 2, ..., all clusters.
                                     +#' @param probs a list of length 3 containing probabilities of a cell belonging
                                     +#'   to each cluster, sample, and group, respectively. List elements must be
                                     +#'   NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.
                                      #' @param p_dd numeric vector of length 6.
                                      #'   Specifies the probability of a gene being
                                      #'   EE, EP, DE, DP, DM, or DB, respectively.
@@ -23,7 +25,7 @@
                                      #' data(kang)
                                      #' simData(kang,
                                      #'     n_genes = 10, n_cells = 10,
                                     -#'     p_dd = c(1,0,0,0,0,0))
                                     +#'     p_dd = diag(6)[1, ])
                                      #'
                                      #' @importFrom data.table data.table
                                      #' @importFrom dplyr mutate_all mutate_at

Browse code

add limma-voom

HelenaLC authored on 18/02/2019 09:05:33
Showing 1 changed files

R/simData.R

History View file @ 0ed64e2

@@ -82,12 +82,12 @@ simData <- function(x, n_genes = 500, n_cells = 300,
                                              set_colnames(kids)
                                          gs_idx <- .sample_gene_inds(gs, n_dd)
                                     -    # for ea. cluster, sample unique set of genes to simulate from
                                     -    #tmp <- setNames(sample(rownames(x), n_genes, TRUE), gs)
                                     -    #gs_by_k <- replicate(nk, tmp, simplify = FALSE) %>% set_names(kids)
                                     -    gs_by_k <- replicate(nk,
                                     -        setNames(sample(rownames(x), n_genes, TRUE), gs),
                                     -        simplify = FALSE) %>% set_names(kids)
                                     +    # for ea. cluster, sample set of genes to simulate from
                                     +    gs_by_k <- setNames(sample(rownames(x), n_genes, TRUE), gs)
                                     +    gs_by_k <- replicate(nk, gs_by_k, simplify = FALSE) %>% set_names(kids)
                                     +    #gs_by_k <- replicate(nk,
                                     +    #    setNames(sample(rownames(x), n_genes, TRUE), gs),
                                     +    #    simplify = FALSE) %>% set_names(kids)
                                          gs_by_kc <- lapply(kids, function(k)
                                              lapply(cats, function(c)
                                                  gs_by_k[[k]][gs_idx[[c, k]]]) %>%

Browse code

update unit-test for runMAST()

HelenaLC authored on 11/02/2019 07:41:21
Showing 1 changed files

R/simData.R

History View file @ 7f0525d

@@ -35,11 +35,11 @@
                                      #' @importFrom SummarizedExperiment colData
                                      #' @importFrom S4Vectors split
                                      #' @importFrom tibble column_to_rownames
                                     -#' @importFrom zeallot %<-%
                                      #'
                                      #' @export
                                     -simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6)[1, ], fc = 2) {
                                     +simData <- function(x, n_genes = 500, n_cells = 300,
                                     +    probs = NULL, p_dd = diag(6)[1, ], fc = 2) {
                                          # throughout this code...
                                          # k: cluster ID
@@ -83,6 +83,8 @@ simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6
                                          gs_idx <- .sample_gene_inds(gs, n_dd)
                                          # for ea. cluster, sample unique set of genes to simulate from
                                     +    #tmp <- setNames(sample(rownames(x), n_genes, TRUE), gs)
                                     +    #gs_by_k <- replicate(nk, tmp, simplify = FALSE) %>% set_names(kids)
                                          gs_by_k <- replicate(nk,
                                              setNames(sample(rownames(x), n_genes, TRUE), gs),
                                              simplify = FALSE) %>% set_names(kids)

Browse code

all things SE

HelenaLC authored on 06/02/2019 11:32:43
Showing 1 changed files

R/simData.R

History View file @ ce2f7aa

@@ -28,7 +28,8 @@
                                      #' @importFrom data.table data.table
                                      #' @importFrom dplyr mutate_all mutate_at
                                      #' @importFrom edgeR DGEList estimateDisp glmFit
                                     -#' @importFrom purrr modify_at
                                     +#' @importFrom magrittr set_colnames set_rownames
                                     +#' @importFrom purrr modify_at set_names
                                      #' @importFrom stats model.matrix rgamma setNames
                                      #' @importFrom SingleCellExperiment SingleCellExperiment
                                      #' @importFrom SummarizedExperiment colData

Browse code

add devel branch

HelenaLC authored on 05/02/2019 05:36:12
Showing 1 changed files

R/simData.R

History View file @ 306fd36

@@ -142,8 +142,6 @@ simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6
                                              gene = unlist(gs_idx),
                                              cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)),
                                              category = rep.int(rep(cats, nk), c(n_dd)),
                                     -        # mean = ,
                                     -        # disp = ,
                                              logFC = unlist(lfc)) %>%
                                              mutate_at("gene", as.character)
                                          o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
@@ -161,7 +159,7 @@ simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6
                                          md <- list(
                                              experiment_info = ei,
                                              n_cells = table(cd$sample_id),
                                     -        gene_info = gi, sim_genes = gs_in)
                                     +        gene_info = gi)
                                          SingleCellExperiment(
                                              assays = list(counts = as.matrix(y)),

Browse code

add drop=FALSE

HelenaLC authored on 31/01/2019 16:45:38
Showing 1 changed files

R/simData.R

History View file @ 0303e5f

@@ -125,8 +125,8 @@ simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6
                                                      cs_g1 <- sample(cs_ks, ng1, replace = TRUE)
                                                      cs_g2 <- sample(cs_ks, ng2, replace = TRUE)
                                     -                m_g1 <- m[gs_kc, cs_g1]
                                     -                m_g2 <- m[gs_kc, cs_g2]
                                     +                m_g1 <- m[gs_kc, cs_g1, drop = FALSE]
                                     +                m_g2 <- m[gs_kc, cs_g2, drop = FALSE]
                                                      d_kc <- d[gs_kc]
                                                      lfc_kc <- lfc[[c, k]]

Browse code

add unit-tested helpers

HelenaLC authored on 31/01/2019 13:53:22
Showing 1 changed files

R/simData.R

History View file @ 1f72f4b

@@ -15,25 +15,35 @@
                                      #'   EE, EP, DE, DP, DM, or DB, respectively.
                                      #' @param fc numeric value to use as mean logFC
                                      #'   for DE, DP, DM, and DB type of genes.
                                     -#' @param seed random seed.
                                     +#'
                                     +#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
                                     +#'   containing multiple clusters & samples across 2 groups.
                                      #'
                                      #' @examples
                                      #' data(kang)
                                      #' simData(kang,
                                      #'     n_genes = 10, n_cells = 10,
                                     -#'     p_dd = c(1,0,0,0,0,0), seed = 1)
                                     +#'     p_dd = c(1,0,0,0,0,0))
                                      #'
                                      #' @importFrom data.table data.table
                                     +#' @importFrom dplyr mutate_all mutate_at
                                      #' @importFrom edgeR DGEList estimateDisp glmFit
                                     +#' @importFrom purrr modify_at
                                      #' @importFrom stats model.matrix rgamma setNames
                                      #' @importFrom SingleCellExperiment SingleCellExperiment
                                      #' @importFrom SummarizedExperiment colData
                                      #' @importFrom S4Vectors split
                                     +#' @importFrom tibble column_to_rownames
                                      #' @importFrom zeallot %<-%
                                      #'
                                      #' @export
                                     -simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                     +simData <- function(x, n_genes = 500, n_cells = 300, probs = NULL, p_dd = diag(6)[1, ], fc = 2) {
+                                    +
                                     +    # throughout this code...
                                     +    # k: cluster ID
                                     +    # s: sample ID
                                     +    # c: gene category
                                          # check validity of input arguments
                                          stopifnot(is(x, "SingleCellExperiment"))
@@ -41,157 +51,138 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                          stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
                                          stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
                                          stopifnot(is.numeric(fc), is.numeric(fc), fc > 1)
                                     -    stopifnot(is.numeric(seed), length(seed) == 1)
                                     -    cluster_ids <- levels(colData(x)$cluster_id)
                                     -    sample_ids <- levels(colData(x)$sample_id)
                                     -    n_clusters <- length(cluster_ids)
                                     -    n_samples <- length(sample_ids)
+                                    -
                                     -    # split cells by cluster-sample
                                     -    dt <- data.table(
                                     -        cell = colnames(x),
                                     -        cluster_id = colData(x)$cluster_id,
                                     -        sample_id = colData(x)$sample_id)
                                     -    dt_split <- split(dt,
                                     -        by = c("cluster_id", "sample_id"),
                                     -        keep.by = FALSE, flatten = FALSE)
                                     -    cells_by_cluster_sample <- sapply(dt_split, sapply, "[[", "cell")
+                                    -
                                     -    # sample nb. of cells to simulate per cluster-sample
                                     -    if (length(n_cells) == 1) {
                                     -        n_cells <- list(rep(n_cells, 2))
                                     -    } else {
                                     -        n_cells <- replicate(n_clusters * n_samples,
                                     -            list(sample(n_cells[1]:n_cells[2], 2)))
                                     -    }
                                     -    n_cells <- matrix(n_cells,
                                     -        nrow = n_samples, ncol = n_clusters,
                                     -        dimnames = list(sample_ids, cluster_ids))
                                     +    kids <- levels(colData(x)$cluster_id)
                                     +    sids <- levels(colData(x)$sample_id)
                                     +    gids <- c("A", "B")
                                     +    names(kids) <- kids
                                     +    names(sids) <- sids
                                     +    names(gids) <- gids
                                     +    nk <- length(kids)
                                          # initialize count matrix
                                     -    y <- matrix(0,
                                     -        nrow = n_genes,
                                     -        ncol = sum(unlist(n_cells)),
                                     -        dimnames = list(
                                     -            paste0("gene", seq_len(n_genes)),
                                     -            paste0("cell", seq_len(sum(unlist(n_cells))))))
+                                    -
                                     -    # sample nb. of genes to simulate per category
                                     -    ndd <- replicate(n_clusters, {
                                     -        ns <- sample(cats, n_genes, replace = TRUE, prob = p_dd)
                                     -        factor(ns, levels = cats)
                                     -    }, simplify = FALSE)
                                     -    ndd <- sapply(ndd, table)
                                     -    colnames(ndd) <- cluster_ids
+                                    -
                                     -    # sample gene indices
                                     -    is <- sapply(cluster_ids, function(c, gs = rownames(y))
                                     -        sapply(cats, function(cat) {
                                     -            n <- ndd[cat, c]
                                     -            x <- sample(gs, n)
                                     -            gs <<- setdiff(gs, x)
                                     -            return(x) }))
+                                    -
                                     -    # sample cell indices
                                     -    cs <- colnames(y)
                                     -    js <- sapply(cluster_ids, function(c)
                                     -        setNames(lapply(sample_ids, function(s)
                                     -            lapply(n_cells[[s, c]], function(n) {
                                     -                x <- sample(cs, n)
                                     -                cs <<- setdiff(cs, x)
                                     -                return(x) })), sample_ids))
+                                    -
                                     -    # sample genes to simulate from
                                     -    gs <- replicate(n_clusters, sample(rownames(x), n_genes, replace = TRUE))
                                     -    rownames(gs) <- rownames(y)
                                     -    colnames(gs) <- cluster_ids
+                                    -
                                     -    # sample fold-changes
                                     -    lfcs <- sapply(cluster_ids, function(k)
                                     -        sapply(cats, function(c) {
                                     -            n <- ndd[c, k]
                                     +    gs <- paste0("gene", seq_len(n_genes))
                                     +    cs <- paste0("cell", seq_len(n_cells))
                                     +    y <- matrix(0, n_genes, n_cells, dimnames = list(gs, cs))
+                                    +
                                     +    # sample cell metadata
                                     +    cd <- .sample_cell_md(
                                     +        n = n_cells, probs = NULL,
                                     +        ids = list(kids, sids, gids)) %>% set_rownames(cs)
                                     +    cs_idx <- .split_cells(cd, by = colnames(cd))
                                     +    n_cs <- modify_depth(cs_idx, -1, length)
+                                    +
                                     +    # split input cells by cluster-sample
                                     +    cs_by_ks <- .split_cells(x)
+                                    +
                                     +    # sample nb. of genes to simulate per category & gene indices
                                     +    n_dd <- replicate(nk,
                                     +        table(sample(factor(cats, levels = cats), n_genes, TRUE, p_dd))) %>%
                                     +        set_colnames(kids)
                                     +    gs_idx <- .sample_gene_inds(gs, n_dd)
+                                    +
                                     +    # for ea. cluster, sample unique set of genes to simulate from
                                     +    gs_by_k <- replicate(nk,
                                     +        setNames(sample(rownames(x), n_genes, TRUE), gs),
                                     +        simplify = FALSE) %>% set_names(kids)
                                     +    gs_by_kc <- lapply(kids, function(k)
                                     +        lapply(cats, function(c)
                                     +            gs_by_k[[k]][gs_idx[[c, k]]]) %>%
                                     +            set_names(cats))
+                                    +
                                     +    # sample logFCs
                                     +    lfc <- vapply(kids, function(k)
                                     +        lapply(cats, function(c) {
                                     +            n <- n_dd[c, k]
                                                  if (c == "ee") return(rep(NA, n))
                                     -            signs <- sample(c(-1, 1), size = n, replace = TRUE)
                                     -            lfcs <- rgamma(n, 4, 4 / fc) * signs
                                     -            names(lfcs) <- gs[is[[c, k]], k]
                                     -            return(lfcs)
                                     -    }))
+                                    -
                                     -    for (k in cluster_ids) {
                                     -        # get NB parameters
                                     -        m <- rowData(x)[gs[, k], ]$beta
                                     -        d <- rowData(x)[gs[, k], ]$dispersion
                                     -        names(m) <- names(d) <- gs[, k]
+                                    -
                                     -        for (s in sample_ids) {
                                     -            # cells to simulate from
                                     -            cs <- cells_by_cluster_sample[[s, k]]
+                                    -
                                     -            # compute mus
                                     -            o <- setNames(colData(x)[cs, ]$offset, cs)
                                     -            mu <- sapply(exp(o), "*", exp(m))
+                                    -
                                     -            # get cell indices & nb. of cells by group
                                     -            ng1 <- length(g1 <- js[[s, k]][[1]])
                                     -            ng2 <- length(g2 <- js[[s, k]][[2]])
+                                    -
                                     -            # simulate data
                                     -            for (c in cats)
                                     -                if (ndd[c, k] > 0) y[is[[c, k]], c(g1, g2)] <-
                                     -                simdd(c, gs[is[[c, k]], k], cs, ng1, ng2, mu, d, lfcs[[c, k]])
                                     +            signs <- sample(c(-1, 1), n, TRUE)
                                     +            lfc <- rgamma(n, 4, 4/fc) * signs
                                     +            names(lfc) <- gs_by_kc[[k]][[c]]
                                     +            return(lfc)
                                     +        }), vector("list", length(cats))) %>%
                                     +        set_rownames(cats)
+                                    +
                                     +    # compute NB parameters
                                     +    b <- exp(rowData(x)$beta)
                                     +    o <- exp(colData(x)$offset)
                                     +    m <- vapply(o, function(l) b*l, numeric(nrow(x)))
                                     +    dimnames(m) <- dimnames(x)
                                     +    d <- rowData(x)$dispersion
                                     +    names(d) <- rownames(x)
+                                    +
                                     +    for (k in kids) {
                                     +        for (s in sids) {
                                     +            for (c in cats[n_dd[, k] != 0]) {
                                     +                gs_kc <- gs_by_kc[[k]][[c]]
                                     +                cs_ks <- cs_by_ks[[k]][[s]]
+                                    +
                                     +                g1 <- cs_idx[[k]][[s]]$A
                                     +                g2 <- cs_idx[[k]][[s]]$B
+                                    +
                                     +                ng1 <- length(g1)
                                     +                ng2 <- length(g2)
+                                    +
                                     +                cs_g1 <- sample(cs_ks, ng1, replace = TRUE)
                                     +                cs_g2 <- sample(cs_ks, ng2, replace = TRUE)
+                                    +
                                     +                m_g1 <- m[gs_kc, cs_g1]
                                     +                m_g2 <- m[gs_kc, cs_g2]
                                     +                d_kc <- d[gs_kc]
                                     +                lfc_kc <- lfc[[c, k]]
+                                    +
                                     +                counts <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d = d_kc, lfc = lfc_kc)
                                     +                y[gs_idx[[c, k]], c(g1, g2)] <- counts
                                     +            }
+                                             }
+                                         }
                                     -    # construct SCE
                                     -    gi <- do.call(rbind, lapply(cluster_ids, function(k)
                                     -        do.call(rbind, lapply(cats, function(c) if (ndd[c, k] != 0)
                                     -            data.frame(
                                     -                gene = is[[c, k]], cluster_id = k,
                                     -                category = c, logFC = lfcs[[c, k]])))))
                                     -    gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ]
                                     -    gi$category <- factor(gi$category, levels = ddSingleCell:::cats)
                                     -    rownames(gi) <- NULL
                                     +    # construct gene metadata table storing
                                     +    # gene | cluster_id | category | logFC
                                     +    gi <- data.frame(
                                     +        gene = unlist(gs_idx),
                                     +        cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)),
                                     +        category = rep.int(rep(cats, nk), c(n_dd)),
                                     +        # mean = ,
                                     +        # disp = ,
                                     +        logFC = unlist(lfc)) %>%
                                     +        mutate_at("gene", as.character)
                                     +    o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
                                     +    gi <- gi[o, ] %>% set_rownames(NULL)
                                     -    col_data <- do.call(rbind, lapply(cluster_ids, function(c)
                                     -        do.call(rbind, lapply(sample_ids, function(s)
                                     -            data.frame(
                                     -                row.names = 1,
                                     -                unlist(js[[s, c]]),
                                     -                cluster_id = c, sample_id = s,
                                     -                group_id = rep.int(c("A", "B"), n_cells[[s, c]]))))))
                                     -    col_data <- col_data[colnames(y), ]
                                     -    col_data$sample_id <- factor(paste(col_data$group_id, col_data$sample_id, sep = "."))
                                     +    # construct SCE
                                     +    cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
                                     +    m <- match(levels(cd$sample_id), cd$sample_id)
                                     +    gids <- cd$group_id[m]
                                     +    o <- order(gids)
                                     +    sids <- levels(cd$sample_id)[o]
                                     +    ei <- data.frame(sample_id = sids, group_id = gids[o])
                                     +    cd <- cd %>% mutate_at("sample_id", factor, levels = sids)
                                     -    sample_id <- levels(col_data$sample_id)
                                     -    group_id <- gsub("(A|B)[.].*", "\\1", sample_id)
                                     -    ei <- data.frame(sample_id, group_id)
                                          md <- list(
                                              experiment_info = ei,
                                     -        n_cells = table(col_data$sample_id),
                                     -        gene_info = gi, sim_genes = gs)
                                     +        n_cells = table(cd$sample_id),
                                     +        gene_info = gi, sim_genes = gs_in)
                                          SingleCellExperiment(
                                     -        assays = list(counts = y),
                                     -        colData = col_data,
                                     +        assays = list(counts = as.matrix(y)),
                                     +        colData = cd,
                                              metadata = md)
+                                     }
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
+                                    -
                                     \ No newline at end of file
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +

Browse code

sapply>vapply

HelenaLC authored on 23/01/2019 16:20:14
Showing 1 changed files

R/simData.R

History View file @ e075de0

@@ -13,6 +13,8 @@
                                      #' @param p_dd numeric vector of length 6.
                                      #'   Specifies the probability of a gene being
                                      #'   EE, EP, DE, DP, DM, or DB, respectively.
                                     +#' @param fc numeric value to use as mean logFC
                                     +#'   for DE, DP, DM, and DB type of genes.
                                      #' @param seed random seed.
                                      #'
                                      #' @examples
@@ -21,10 +23,11 @@
                                      #'     n_genes = 10, n_cells = 10,
                                      #'     p_dd = c(1,0,0,0,0,0), seed = 1)
                                      #'
                                     -#' @import SingleCellExperiment
                                      #' @importFrom data.table data.table
                                      #' @importFrom edgeR DGEList estimateDisp glmFit
                                     -#' @importFrom stats model.matrix rnbinom setNames
                                     +#' @importFrom stats model.matrix rgamma setNames
                                     +#' @importFrom SingleCellExperiment SingleCellExperiment
                                     +#' @importFrom SummarizedExperiment colData
                                      #' @importFrom S4Vectors split
                                      #' @importFrom zeallot %<-%
                                      #'
@@ -33,7 +36,7 @@
                                      simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                          # check validity of input arguments
                                     -    stopifnot(class(x) == "SingleCellExperiment")
                                     +    stopifnot(is(x, "SingleCellExperiment"))
                                          stopifnot(is.numeric(n_genes), length(n_genes) == 1)
                                          stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
                                          stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
@@ -108,35 +111,35 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                          lfcs <- sapply(cluster_ids, function(k)
                                              sapply(cats, function(c) {
                                                  n <- ndd[c, k]
                                     -            if (c %in% c("ee", "ep")) return(rep(NA, n))
                                     +            if (c == "ee") return(rep(NA, n))
                                                  signs <- sample(c(-1, 1), size = n, replace = TRUE)
                                                  lfcs <- rgamma(n, 4, 4 / fc) * signs
                                                  names(lfcs) <- gs[is[[c, k]], k]
                                                  return(lfcs)
                                          }))
                                     -    for (c in cluster_ids) {
                                     +    for (k in cluster_ids) {
                                              # get NB parameters
                                     -        m <- rowData(x)[gs[, c], ]$beta
                                     -        d <- rowData(x)[gs[, c], ]$dispersion
                                     -        names(m) <- names(d) <- gs[, c]
                                     +        m <- rowData(x)[gs[, k], ]$beta
                                     +        d <- rowData(x)[gs[, k], ]$dispersion
                                     +        names(m) <- names(d) <- gs[, k]
                                              for (s in sample_ids) {
                                                  # cells to simulate from
                                     -            cs <- cells_by_cluster_sample[[s, c]]
                                     +            cs <- cells_by_cluster_sample[[s, k]]
                                                  # compute mus
                                                  o <- setNames(colData(x)[cs, ]$offset, cs)
                                                  mu <- sapply(exp(o), "*", exp(m))
                                                  # get cell indices & nb. of cells by group
                                     -            ng1 <- length(g1 <- js[[s, c]][[1]])
                                     -            ng2 <- length(g2 <- js[[s, c]][[2]])
                                     +            ng1 <- length(g1 <- js[[s, k]][[1]])
                                     +            ng2 <- length(g2 <- js[[s, k]][[2]])
                                                  # simulate data
                                     -            for (cat in cats)
                                     -                if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <-
                                     -                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, lfcs[[cat, c]])
                                     +            for (c in cats)
                                     +                if (ndd[c, k] > 0) y[is[[c, k]], c(g1, g2)] <-
                                     +                simdd(c, gs[is[[c, k]], k], cs, ng1, ng2, mu, d, lfcs[[c, k]])
+                                             }
+                                         }
@@ -156,13 +159,13 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                                      row.names = 1,
                                                      unlist(js[[s, c]]),
                                                      cluster_id = c, sample_id = s,
                                     -                group = rep.int(c("A", "B"), n_cells[[s, c]]))))))
                                     +                group_id = rep.int(c("A", "B"), n_cells[[s, c]]))))))
                                          col_data <- col_data[colnames(y), ]
                                     -    col_data$sample_id <- factor(paste(col_data$group, col_data$sample_id, sep = "."))
                                     +    col_data$sample_id <- factor(paste(col_data$group_id, col_data$sample_id, sep = "."))
                                          sample_id <- levels(col_data$sample_id)
                                     -    group <- gsub("(A|B)[.].*", "\\1", sample_id)
                                     -    ei <- data.frame(sample_id, group)
                                     +    group_id <- gsub("(A|B)[.].*", "\\1", sample_id)
                                     +    ei <- data.frame(sample_id, group_id)
                                          md <- list(
                                              experiment_info = ei,
                                              n_cells = table(col_data$sample_id),

Browse code

add logFC columns to metadata

HelenaLC authored on 15/01/2019 12:40:41
Showing 1 changed files

R/simData.R

History View file @ ac1c822

@@ -105,13 +105,14 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                          colnames(gs) <- cluster_ids
                                          # sample fold-changes
                                     -    fcs <- sapply(cluster_ids, function(k)
                                     +    lfcs <- sapply(cluster_ids, function(k)
                                              sapply(cats, function(c) {
                                                  n <- ndd[c, k]
                                     +            if (c %in% c("ee", "ep")) return(rep(NA, n))
                                                  signs <- sample(c(-1, 1), size = n, replace = TRUE)
                                     -            fcs <- 2 ^ ( rgamma(n, 4, 4 / fc) * signs )
                                     -            names(fcs) <- gs[is[[c, k]], k]
                                     -            return(fcs)
                                     +            lfcs <- rgamma(n, 4, 4 / fc) * signs
                                     +            names(lfcs) <- gs[is[[c, k]], k]
                                     +            return(lfcs)
                                          }))
                                          for (c in cluster_ids) {
@@ -135,14 +136,16 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                                  # simulate data
                                                  for (cat in cats)
                                                      if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <-
                                     -                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fcs[[cat, c]])
                                     +                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, lfcs[[cat, c]])
+                                             }
+                                         }
                                          # construct SCE
                                     -    gi <- do.call(rbind, lapply(cluster_ids, function(c)
                                     -        do.call(rbind, lapply(cats, function(cat) if (ndd[cat, c] != 0)
                                     -            data.frame(gene = is[[cat, c]], cluster_id = c, category = cat)))))
                                     +    gi <- do.call(rbind, lapply(cluster_ids, function(k)
                                     +        do.call(rbind, lapply(cats, function(c) if (ndd[c, k] != 0)
                                     +            data.frame(
                                     +                gene = is[[c, k]], cluster_id = k,
                                     +                category = c, logFC = lfcs[[c, k]])))))
                                          gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ]
                                          gi$category <- factor(gi$category, levels = ddSingleCell:::cats)
                                          rownames(gi) <- NULL

Browse code

fix usage of FCs

HelenaLC authored on 15/01/2019 12:30:50
Showing 1 changed files

R/simData.R

History View file @ 5e0f007

@@ -109,8 +109,9 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                              sapply(cats, function(c) {
                                                  n <- ndd[c, k]
                                                  signs <- sample(c(-1, 1), size = n, replace = TRUE)
                                     -            fcs <- 2 ^ ( rgamma(n, 4 , 4 / fc) * signs )
                                     -            setNames(fcs, gs[is[[c, k]], k])
                                     +            fcs <- 2 ^ ( rgamma(n, 4, 4 / fc) * signs )
                                     +            names(fcs) <- gs[is[[c, k]], k]
                                     +            return(fcs)
                                          }))
                                          for (c in cluster_ids) {

Browse code

major revision of simulation

HelenaLC authored on 14/01/2019 15:21:44
Showing 1 changed files

R/simData.R

History View file @ fc93152

@@ -32,11 +32,12 @@
                                      simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                     +    # check validity of input arguments
                                          stopifnot(class(x) == "SingleCellExperiment")
                                          stopifnot(is.numeric(n_genes), length(n_genes) == 1)
                                          stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
                                          stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
                                     -    stopifnot(is.numeric(fc), is.numeric(fc))
                                     +    stopifnot(is.numeric(fc), is.numeric(fc), fc > 1)
                                          stopifnot(is.numeric(seed), length(seed) == 1)
                                          cluster_ids <- levels(colData(x)$cluster_id)
@@ -47,11 +48,12 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                          # split cells by cluster-sample
                                          dt <- data.table(
                                              cell = colnames(x),
                                     -        data.frame(colData(x)))
                                     -    cells <- dt %>% split(
                                     +        cluster_id = colData(x)$cluster_id,
                                     +        sample_id = colData(x)$sample_id)
                                     +    dt_split <- split(dt,
                                              by = c("cluster_id", "sample_id"),
                                              keep.by = FALSE, flatten = FALSE)
                                     -    cells <- sapply(cells, sapply, "[[", "cell")
                                     +    cells_by_cluster_sample <- sapply(dt_split, sapply, "[[", "cell")
                                          # sample nb. of cells to simulate per cluster-sample
                                          if (length(n_cells) == 1) {
@@ -98,10 +100,19 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                                      return(x) })), sample_ids))
                                          # sample genes to simulate from
                                     -    gs <- replicate(n_clusters, sample(rownames(x), n_genes))
                                     +    gs <- replicate(n_clusters, sample(rownames(x), n_genes, replace = TRUE))
                                          rownames(gs) <- rownames(y)
                                          colnames(gs) <- cluster_ids
                                     +    # sample fold-changes
                                     +    fcs <- sapply(cluster_ids, function(k)
                                     +        sapply(cats, function(c) {
                                     +            n <- ndd[c, k]
                                     +            signs <- sample(c(-1, 1), size = n, replace = TRUE)
                                     +            fcs <- 2 ^ ( rgamma(n, 4 , 4 / fc) * signs )
                                     +            setNames(fcs, gs[is[[c, k]], k])
                                     +    }))
+                                    +
                                          for (c in cluster_ids) {
                                              # get NB parameters
                                              m <- rowData(x)[gs[, c], ]$beta
@@ -110,7 +121,7 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                              for (s in sample_ids) {
                                                  # cells to simulate from
                                     -            cs <- cells[[s, c]]
                                     +            cs <- cells_by_cluster_sample[[s, c]]
                                                  # compute mus
                                                  o <- setNames(colData(x)[cs, ]$offset, cs)
@@ -123,16 +134,16 @@ simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
                                                  # simulate data
                                                  for (cat in cats)
                                                      if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <-
                                     -                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fc)
                                     +                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fcs[[cat, c]])
+                                             }
+                                         }
                                          # construct SCE
                                          gi <- do.call(rbind, lapply(cluster_ids, function(c)
                                              do.call(rbind, lapply(cats, function(cat) if (ndd[cat, c] != 0)
                                     -            data.frame(genes = is[[cat, c]], cluster_id = c, category = cat)))))
                                     +            data.frame(gene = is[[cat, c]], cluster_id = c, category = cat)))))
                                          gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ]
                                     -    levels(gi$category) <- c("ee", "ep", "de", "dp", "dm", "db")
                                     +    gi$category <- factor(gi$category, levels = ddSingleCell:::cats)
                                          rownames(gi) <- NULL
                                          col_data <- do.call(rbind, lapply(cluster_ids, function(c)

Browse code

re-write of aggregateData()

HelenaLC authored on 08/01/2019 11:05:05
Showing 1 changed files

R/simData.R

History View file @ 81184c0

@@ -16,21 +16,28 @@
                                      #' @param seed random seed.
                                      #'
                                      #' @examples
                                     -#' data(kang_se, kang_fit)
                                     -#' simDD(kang_se, kang_fit,
                                     +#' data(kang)
                                     +#' simData(kang,
                                      #'     n_genes = 10, n_cells = 10,
                                      #'     p_dd = c(1,0,0,0,0,0), seed = 1)
                                      #'
                                      #' @import SingleCellExperiment
                                      #' @importFrom data.table data.table
                                      #' @importFrom edgeR DGEList estimateDisp glmFit
                                     -#' @importFrom stats model.matrix rnbinom
                                     +#' @importFrom stats model.matrix rnbinom setNames
                                      #' @importFrom S4Vectors split
                                      #' @importFrom zeallot %<-%
                                      #'
                                      #' @export
                                     -simData <- function(x, n_genes, n_cells, p_dd) {
                                     +simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
+                                    +
                                     +    stopifnot(class(x) == "SingleCellExperiment")
                                     +    stopifnot(is.numeric(n_genes), length(n_genes) == 1)
                                     +    stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
                                     +    stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
                                     +    stopifnot(is.numeric(fc), is.numeric(fc))
                                     +    stopifnot(is.numeric(seed), length(seed) == 1)
                                          cluster_ids <- levels(colData(x)$cluster_id)
                                          sample_ids <- levels(colData(x)$sample_id)
@@ -97,19 +104,16 @@ simData <- function(x, n_genes, n_cells, p_dd) {
                                          for (c in cluster_ids) {
                                              # get NB parameters
                                     -        m <- rowData(x)$mean[gs[, c]]
                                     -        d <- rowData(x)$dispersion[gs[, c]]
+                                    -
                                     -        # get gene indices & nb. of genes by category
                                     -        c(iee, iep, ide, idp, idm, idb) %<-% sapply(cats, function(i) is[i, c])
                                     -        c(nee, nep, nde, ndp, ndm, ndb) %<-% vapply(cats, function(i) ndd[i, c], numeric(1))
                                     +        m <- rowData(x)[gs[, c], ]$beta
                                     +        d <- rowData(x)[gs[, c], ]$dispersion
                                     +        names(m) <- names(d) <- gs[, c]
                                              for (s in sample_ids) {
                                                  # cells to simulate from
                                                  cs <- cells[[s, c]]
                                                  # compute mus
                                     -            o <- colData(x)$offset[cs]
                                     +            o <- setNames(colData(x)[cs, ]$offset, cs)
                                                  mu <- sapply(exp(o), "*", exp(m))
                                                  # get cell indices & nb. of cells by group
@@ -117,12 +121,9 @@ simData <- function(x, n_genes, n_cells, p_dd) {
                                                  ng2 <- length(g2 <- js[[s, c]][[2]])
                                                  # simulate data
                                     -            if (nee > 0) y[iee, c(g1, g2)] <- simdd("ee", gs[iee, c], cs, ng1, ng2, mu, d)
                                     -            if (nep > 0) y[iep, c(g1, g2)] <- simdd("ep", gs[iep, c], cs, ng1, ng2, mu, d)
                                     -            if (nde > 0) y[ide, c(g1, g2)] <- simdd("de", gs[ide, c], cs, ng1, ng2, mu, d)
                                     -            if (ndp > 0) y[idp, c(g1, g2)] <- simdd("dp", gs[idp, c], cs, ng1, ng2, mu, d)
                                     -            if (ndm > 0) y[idm, c(g1, g2)] <- simdd("dm", gs[idm, c], cs, ng1, ng2, mu, d)
                                     -            if (ndb > 0) y[idb, c(g1, g2)] <- simdd("db", gs[idb, c], cs, ng1, ng2, mu, d)
                                     +            for (cat in cats)
                                     +                if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <-
                                     +                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fc)
+                                             }
+                                         }

Browse code

add prepData() & aggregateData(), major revision of run_edgeR()

HelenaLC authored on 26/11/2018 20:59:35
Showing 1 changed files

R/simData.R

History View file @ cdc6130

                                     new file mode 100755
@@ -0,0 +1,178 @@
                                     +#' simData
                                     +#'
                                     +#' Simulation of complex scRNA-seq data
                                     +#'
                                     +#' \code{simData} simulates multiple clusters and samples
                                     +#' across 2 experimental conditions from a real scRNA-seq data set.
                                     +#'
                                     +#' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
                                     +#' @param n_genes number of genes to simulate.
                                     +#' @param n_cells number of cells to simulate.
                                     +#'   Either a single numeric or a range to sample from.
                                     +#' @param ns nb. of genes common to 1, 2, ..., all clusters.
                                     +#' @param p_dd numeric vector of length 6.
                                     +#'   Specifies the probability of a gene being
                                     +#'   EE, EP, DE, DP, DM, or DB, respectively.
                                     +#' @param seed random seed.
                                     +#'
                                     +#' @examples
                                     +#' data(kang_se, kang_fit)
                                     +#' simDD(kang_se, kang_fit,
                                     +#'     n_genes = 10, n_cells = 10,
                                     +#'     p_dd = c(1,0,0,0,0,0), seed = 1)
                                     +#'
                                     +#' @import SingleCellExperiment
                                     +#' @importFrom data.table data.table
                                     +#' @importFrom edgeR DGEList estimateDisp glmFit
                                     +#' @importFrom stats model.matrix rnbinom
                                     +#' @importFrom S4Vectors split
                                     +#' @importFrom zeallot %<-%
                                     +#'
                                     +#' @export
+                                    +
                                     +simData <- function(x, n_genes, n_cells, p_dd) {
+                                    +
                                     +    cluster_ids <- levels(colData(x)$cluster_id)
                                     +    sample_ids <- levels(colData(x)$sample_id)
                                     +    n_clusters <- length(cluster_ids)
                                     +    n_samples <- length(sample_ids)
+                                    +
                                     +    # split cells by cluster-sample
                                     +    dt <- data.table(
                                     +        cell = colnames(x),
                                     +        data.frame(colData(x)))
                                     +    cells <- dt %>% split(
                                     +        by = c("cluster_id", "sample_id"),
                                     +        keep.by = FALSE, flatten = FALSE)
                                     +    cells <- sapply(cells, sapply, "[[", "cell")
+                                    +
                                     +    # sample nb. of cells to simulate per cluster-sample
                                     +    if (length(n_cells) == 1) {
                                     +        n_cells <- list(rep(n_cells, 2))
                                     +    } else {
                                     +        n_cells <- replicate(n_clusters * n_samples,
                                     +            list(sample(n_cells[1]:n_cells[2], 2)))
                                     +    }
                                     +    n_cells <- matrix(n_cells,
                                     +        nrow = n_samples, ncol = n_clusters,
                                     +        dimnames = list(sample_ids, cluster_ids))
+                                    +
                                     +    # initialize count matrix
                                     +    y <- matrix(0,
                                     +        nrow = n_genes,
                                     +        ncol = sum(unlist(n_cells)),
                                     +        dimnames = list(
                                     +            paste0("gene", seq_len(n_genes)),
                                     +            paste0("cell", seq_len(sum(unlist(n_cells))))))
+                                    +
                                     +    # sample nb. of genes to simulate per category
                                     +    ndd <- replicate(n_clusters, {
                                     +        ns <- sample(cats, n_genes, replace = TRUE, prob = p_dd)
                                     +        factor(ns, levels = cats)
                                     +    }, simplify = FALSE)
                                     +    ndd <- sapply(ndd, table)
                                     +    colnames(ndd) <- cluster_ids
+                                    +
                                     +    # sample gene indices
                                     +    is <- sapply(cluster_ids, function(c, gs = rownames(y))
                                     +        sapply(cats, function(cat) {
                                     +            n <- ndd[cat, c]
                                     +            x <- sample(gs, n)
                                     +            gs <<- setdiff(gs, x)
                                     +            return(x) }))
+                                    +
                                     +    # sample cell indices
                                     +    cs <- colnames(y)
                                     +    js <- sapply(cluster_ids, function(c)
                                     +        setNames(lapply(sample_ids, function(s)
                                     +            lapply(n_cells[[s, c]], function(n) {
                                     +                x <- sample(cs, n)
                                     +                cs <<- setdiff(cs, x)
                                     +                return(x) })), sample_ids))
+                                    +
                                     +    # sample genes to simulate from
                                     +    gs <- replicate(n_clusters, sample(rownames(x), n_genes))
                                     +    rownames(gs) <- rownames(y)
                                     +    colnames(gs) <- cluster_ids
+                                    +
                                     +    for (c in cluster_ids) {
                                     +        # get NB parameters
                                     +        m <- rowData(x)$mean[gs[, c]]
                                     +        d <- rowData(x)$dispersion[gs[, c]]
+                                    +
                                     +        # get gene indices & nb. of genes by category
                                     +        c(iee, iep, ide, idp, idm, idb) %<-% sapply(cats, function(i) is[i, c])
                                     +        c(nee, nep, nde, ndp, ndm, ndb) %<-% vapply(cats, function(i) ndd[i, c], numeric(1))
+                                    +
                                     +        for (s in sample_ids) {
                                     +            # cells to simulate from
                                     +            cs <- cells[[s, c]]
+                                    +
                                     +            # compute mus
                                     +            o <- colData(x)$offset[cs]
                                     +            mu <- sapply(exp(o), "*", exp(m))
+                                    +
                                     +            # get cell indices & nb. of cells by group
                                     +            ng1 <- length(g1 <- js[[s, c]][[1]])
                                     +            ng2 <- length(g2 <- js[[s, c]][[2]])
+                                    +
                                     +            # simulate data
                                     +            if (nee > 0) y[iee, c(g1, g2)] <- simdd("ee", gs[iee, c], cs, ng1, ng2, mu, d)
                                     +            if (nep > 0) y[iep, c(g1, g2)] <- simdd("ep", gs[iep, c], cs, ng1, ng2, mu, d)
                                     +            if (nde > 0) y[ide, c(g1, g2)] <- simdd("de", gs[ide, c], cs, ng1, ng2, mu, d)
                                     +            if (ndp > 0) y[idp, c(g1, g2)] <- simdd("dp", gs[idp, c], cs, ng1, ng2, mu, d)
                                     +            if (ndm > 0) y[idm, c(g1, g2)] <- simdd("dm", gs[idm, c], cs, ng1, ng2, mu, d)
                                     +            if (ndb > 0) y[idb, c(g1, g2)] <- simdd("db", gs[idb, c], cs, ng1, ng2, mu, d)
                                     +        }
                                     +    }
+                                    +
                                     +    # construct SCE
                                     +    gi <- do.call(rbind, lapply(cluster_ids, function(c)
                                     +        do.call(rbind, lapply(cats, function(cat) if (ndd[cat, c] != 0)
                                     +            data.frame(genes = is[[cat, c]], cluster_id = c, category = cat)))))
                                     +    gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ]
                                     +    levels(gi$category) <- c("ee", "ep", "de", "dp", "dm", "db")
                                     +    rownames(gi) <- NULL
+                                    +
                                     +    col_data <- do.call(rbind, lapply(cluster_ids, function(c)
                                     +        do.call(rbind, lapply(sample_ids, function(s)
                                     +            data.frame(
                                     +                row.names = 1,
                                     +                unlist(js[[s, c]]),
                                     +                cluster_id = c, sample_id = s,
                                     +                group = rep.int(c("A", "B"), n_cells[[s, c]]))))))
                                     +    col_data <- col_data[colnames(y), ]
                                     +    col_data$sample_id <- factor(paste(col_data$group, col_data$sample_id, sep = "."))
+                                    +
                                     +    sample_id <- levels(col_data$sample_id)
                                     +    group <- gsub("(A|B)[.].*", "\\1", sample_id)
                                     +    ei <- data.frame(sample_id, group)
                                     +    md <- list(
                                     +        experiment_info = ei,
                                     +        n_cells = table(col_data$sample_id),
                                     +        gene_info = gi, sim_genes = gs)
+                                    +
                                     +    SingleCellExperiment(
                                     +        assays = list(counts = y),
                                     +        colData = col_data,
                                     +        metadata = md)
                                     +}
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
+                                    +
                                     \ No newline at end of file