Bioconductor Code: muscat

Raw Blame Patch Log History
#' simData
#' 
#' Simulation of complex scRNA-seq data 
#' 
#' \code{simData} simulates multiple clusters and samples 
#' across 2 experimental conditions from a real scRNA-seq data set.
#' 
#' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
#' @param n_genes # of genes to simulate. 
#' @param n_cells # of cells to simulate. 
#'   Either a single numeric or a range to sample from.
#' @param ns nb. of genes common to 1, 2, ..., all clusters.
#' @param p_dd numeric vector of length 6.
#'   Specifies the probability of a gene being
#'   EE, EP, DE, DP, DM, or DB, respectively.
#' @param seed random seed. 
#' 
#' @examples
#' data(kang)
#' simData(kang,
#'     n_genes = 10, n_cells = 10,
#'     p_dd = c(1,0,0,0,0,0), seed = 1)
#' 
#' @import SingleCellExperiment 
#' @importFrom data.table data.table
#' @importFrom edgeR DGEList estimateDisp glmFit
#' @importFrom stats model.matrix rnbinom setNames
#' @importFrom S4Vectors split
#' @importFrom zeallot %<-%
#' 
#' @export

simData <- function(x, n_genes, n_cells, p_dd, fc = 2, seed = 1) {
    
    stopifnot(class(x) == "SingleCellExperiment")
    stopifnot(is.numeric(n_genes), length(n_genes) == 1)
    stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
    stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
    stopifnot(is.numeric(fc), is.numeric(fc))
    stopifnot(is.numeric(seed), length(seed) == 1)
    
    cluster_ids <- levels(colData(x)$cluster_id)
    sample_ids <- levels(colData(x)$sample_id)
    n_clusters <- length(cluster_ids)
    n_samples <- length(sample_ids)
    
    # split cells by cluster-sample
    dt <- data.table(
        cell = colnames(x), 
        data.frame(colData(x)))
    cells <- dt %>% split(
        by = c("cluster_id", "sample_id"), 
        keep.by = FALSE, flatten = FALSE)
    cells <- sapply(cells, sapply, "[[", "cell")
    
    # sample nb. of cells to simulate per cluster-sample
    if (length(n_cells) == 1) {
        n_cells <- list(rep(n_cells, 2))
    } else {
        n_cells <- replicate(n_clusters * n_samples, 
            list(sample(n_cells[1]:n_cells[2], 2)))
    }
    n_cells <- matrix(n_cells, 
        nrow = n_samples, ncol = n_clusters, 
        dimnames = list(sample_ids, cluster_ids))
    
    # initialize count matrix
    y <- matrix(0, 
        nrow = n_genes, 
        ncol = sum(unlist(n_cells)),
        dimnames = list(
            paste0("gene", seq_len(n_genes)),
            paste0("cell", seq_len(sum(unlist(n_cells))))))
    
    # sample nb. of genes to simulate per category
    ndd <- replicate(n_clusters, {
        ns <- sample(cats, n_genes, replace = TRUE, prob = p_dd)
        factor(ns, levels = cats) 
    }, simplify = FALSE)
    ndd <- sapply(ndd, table)
    colnames(ndd) <- cluster_ids
    
    # sample gene indices
    is <- sapply(cluster_ids, function(c, gs = rownames(y))
        sapply(cats, function(cat) { 
            n <- ndd[cat, c]
            x <- sample(gs, n)
            gs <<- setdiff(gs, x)
            return(x) }))
    
    # sample cell indices
    cs <- colnames(y)
    js <- sapply(cluster_ids, function(c) 
        setNames(lapply(sample_ids, function(s)
            lapply(n_cells[[s, c]], function(n) {
                x <- sample(cs, n)
                cs <<- setdiff(cs, x)
                return(x) })), sample_ids))
    
    # sample genes to simulate from
    gs <- replicate(n_clusters, sample(rownames(x), n_genes))
    rownames(gs) <- rownames(y)
    colnames(gs) <- cluster_ids
    
    for (c in cluster_ids) {
        # get NB parameters
        m <- rowData(x)[gs[, c], ]$beta
        d <- rowData(x)[gs[, c], ]$dispersion
        names(m) <- names(d) <- gs[, c]
        
        for (s in sample_ids) {
            # cells to simulate from
            cs <- cells[[s, c]]
            
            # compute mus
            o <- setNames(colData(x)[cs, ]$offset, cs)
            mu <- sapply(exp(o), "*", exp(m))
            
            # get cell indices & nb. of cells by group
            ng1 <- length(g1 <- js[[s, c]][[1]])
            ng2 <- length(g2 <- js[[s, c]][[2]])
            
            # simulate data
            for (cat in cats)
                if (ndd[cat, c] > 0) y[is[[cat, c]], c(g1, g2)] <- 
                simdd(cat, gs[is[[cat, c]], c], cs, ng1, ng2, mu, d, fc)
        }
    }
    
    # construct SCE
    gi <- do.call(rbind, lapply(cluster_ids, function(c)
        do.call(rbind, lapply(cats, function(cat) if (ndd[cat, c] != 0)
            data.frame(genes = is[[cat, c]], cluster_id = c, category = cat)))))
    gi <- gi[order(as.numeric(gsub("[a-z]", "", gi$gene))), ]
    levels(gi$category) <- c("ee", "ep", "de", "dp", "dm", "db")
    rownames(gi) <- NULL
    
    col_data <- do.call(rbind, lapply(cluster_ids, function(c) 
        do.call(rbind, lapply(sample_ids, function(s) 
            data.frame(
                row.names = 1,
                unlist(js[[s, c]]), 
                cluster_id = c, sample_id = s, 
                group = rep.int(c("A", "B"), n_cells[[s, c]]))))))
    col_data <- col_data[colnames(y), ]
    col_data$sample_id <- factor(paste(col_data$group, col_data$sample_id, sep = "."))
    
    sample_id <- levels(col_data$sample_id)
    group <- gsub("(A|B)[.].*", "\\1", sample_id)
    ei <- data.frame(sample_id, group)
    md <- list(
        experiment_info = ei,
        n_cells = table(col_data$sample_id),
        gene_info = gi, sim_genes = gs)
    
    SingleCellExperiment(
        assays = list(counts = y),
        colData = col_data, 
        metadata = md)
}