Bioconductor Code: muscat

Raw Blame Patch Log History
#' simData
#' 
#' Simulation of complex scRNA-seq data 
#' 
#' \code{simData} simulates multiple clusters and samples 
#' across 2 experimental conditions from a real scRNA-seq data set.
#' 
#' @param x a \code{\link[SingleCellExperiment]{SingleCellExperiment}}.
#' @param n_genes # of genes to simulate. 
#' @param n_cells # of cells to simulate. 
#'   Either a single numeric or a range to sample from.
#' @param probs a list of length 3 containing probabilities of a cell belonging
#'   to each cluster, sample, and group, respectively. List elements must be 
#'   NULL (equal probabilities) or numeric values in [0, 1] that sum to 1.
#' @param p_dd numeric vector of length 6.
#'   Specifies the probability of a gene being
#'   EE, EP, DE, DP, DM, or DB, respectively.
#' @param p_type numeric. Probaility of EE/EP gene being a type-gene.
#'   If a gene is of class "type" in a given cluster, a unique mean 
#'   will be used for that gene in the respective cluster.
#' @param lfc numeric value to use as mean logFC
#'   for DE, DP, DM, and DB type of genes.
#' @param rel_lfc numeric vector of relative logFCs for each cluster. 
#'   Should be of length \code{nlevels(x$cluster_id)} with 
#'   \code{levels(x$cluster_id)} as names. 
#'   Defaults to factor of 1 for all clusters.
#'   
#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}}
#'   containing multiple clusters & samples across 2 groups.
#' 
#' @examples
#' data(sce)
#' simData(sce,
#'     n_genes = 10, n_cells = 10,
#'     p_dd = diag(6)[1, ])
#'     
#' @author Helena L Crowell
#' 
#' @references 
#' Crowell, HL, Soneson, C, Germain, P-L, Calini, D, 
#' Collin, L, Raposo, C, Malhotra, D & Robinson, MD: 
#' On the discovery of population-specific state transitions from 
#' multi-sample multi-condition single-cell RNA sequencing data. 
#' \emph{bioRxiv} \strong{713412} (2018). 
#' doi: \url{https://doi.org/10.1101/713412}
#' 
#' @importFrom data.table data.table
#' @importFrom dplyr mutate_all mutate_at
#' @importFrom edgeR DGEList estimateDisp glmFit
#' @importFrom magrittr set_colnames set_rownames
#' @importFrom purrr modify_at set_names
#' @importFrom stats model.matrix rgamma setNames
#' @importFrom SingleCellExperiment SingleCellExperiment
#' @importFrom SummarizedExperiment colData
#' @importFrom S4Vectors split
#' @importFrom tibble column_to_rownames
#' @export

simData <- function(x, n_genes = 500, n_cells = 300, 
    probs = NULL, p_dd = diag(6)[1, ], p_type = 0,
    lfc = 2, rel_lfc = NULL) {
    
    # throughout this code...
    # k: cluster ID
    # s: sample ID
    # c: gene category
    
    # check validity of input arguments
    .check_sce(x, req_group = FALSE)
    stopifnot(is.numeric(n_genes), length(n_genes) == 1)
    stopifnot(is.numeric(n_cells), length(n_cells) == 1 | length(n_cells) == 2)
    stopifnot(is.numeric(p_dd), length(p_dd) == 6, sum(p_dd) == 1)
    stopifnot(is.numeric(p_type), length(p_type) == 1, p_type >= 0)
    stopifnot(is.numeric(lfc), is.numeric(lfc), lfc > 1)
    
    kids <- levels(x$cluster_id)
    sids <- levels(x$sample_id)
    gids <- c("A", "B")
    names(kids) <- kids
    names(sids) <- sids
    names(gids) <- gids
    nk <- length(kids)
    
    if (is.null(rel_lfc)) {
        rel_lfc <- rep(1, nk)
        names(rel_lfc) <- kids
    } else {
        stopifnot(is.numeric(rel_lfc), length(rel_lfc) == nk, rel_lfc >= 0)
        if (is.null(names(rel_lfc))) {
            names(rel_lfc) <- kids
        } else {
            stopifnot(setequal(names(rel_lfc), kids))
        }
    }
    
    # initialize count matrix
    gs <- paste0("gene", seq_len(n_genes))
    cs <- paste0("cell", seq_len(n_cells))
    y <- matrix(0, n_genes, n_cells, dimnames = list(gs, cs))
    
    # sample cell metadata
    cd <- .sample_cell_md(
        n = n_cells, probs = probs,
        ids = list(kids, sids, gids)) %>% 
        set_rownames(cs)
    cs_idx <- .split_cells(cd, by = colnames(cd))
    n_cs <- modify_depth(cs_idx, -1, length)
    
    # split input cells by cluster-sample
    cs_by_ks <- .split_cells(x)
    
    # sample nb. of genes to simulate per category & gene indices
    n_dd <- replicate(nk, 
        table(sample(factor(cats, levels = cats), n_genes, TRUE, p_dd))) %>% 
        set_colnames(kids)
    gs_idx <- .sample_gene_inds(gs, n_dd)
    
    # for ea. cluster, sample set of genes to simulate from
    gs_by_k <- setNames(sample(rownames(x), n_genes, TRUE), gs)
    gs_by_k <- replicate(nk, gs_by_k) %>% set_colnames(kids)

    # impute type-genes
    if (p_type != 0)
        gs_by_k <- .impute_type_genes(x, gs_by_k, gs_idx, p_type)

    # split by cluster & categroy
    gs_by_k <- split(gs_by_k, col(gs_by_k))
    gs_by_k <- map(gs_by_k, set_names, gs)
    names(gs_by_k) <- kids
    
    gs_by_kc <- lapply(kids, function(k) 
        lapply(cats, function(c)
            gs_by_k[[k]][gs_idx[[c, k]]]) %>% 
            set_names(cats))
    
    # sample logFCs
    lfc <- vapply(kids, function(k) 
        lapply(cats, function(c) { 
            n <- n_dd[c, k]
            if (c == "ee") return(rep(NA, n))
            signs <- sample(c(-1, 1), n, TRUE)
            lfcs <- rgamma(n, 4, 4/lfc) * signs
            names(lfcs) <- gs_by_kc[[k]][[c]]
            lfcs * rel_lfc[k]
        }), vector("list", length(cats))) %>% 
        set_rownames(cats)
    
    # compute NB parameters
    o <- exp(colData(x)$offset)
    m <- lapply(sids, function(s) {
        cn <- paste("beta", s, sep = ".")
        k <- grep(cn, names(rowData(x)))
        b <- exp(rowData(x)[[k]])
        vapply(o, "*", b, FUN.VALUE = numeric(nrow(x))) %>% 
            set_rownames(rownames(x)) %>% 
            set_colnames(colnames(x)) %>% 
            round
    })
    d <- rowData(x)$dispersion %>% 
        set_names(rownames(x))
    
    sim_mean <- lapply(kids, function(k) 
        lapply(gids, function(g)
            setNames(numeric(n_genes), rownames(y))))
    for (k in kids) {
        for (s in sids) {
            for (c in cats[n_dd[, k] != 0]) {
                gs_kc <- gs_by_kc[[k]][[c]]
                cs_ks <- cs_by_ks[[k]][[s]]
                
                g1 <- cs_idx[[k]][[s]]$A
                g2 <- cs_idx[[k]][[s]]$B
                
                ng1 <- length(g1)
                ng2 <- length(g2) 
                
                cs_g1 <- sample(cs_ks, ng1, replace = TRUE)
                cs_g2 <- sample(cs_ks, ng2, replace = TRUE)
                
                m_g1 <- m[[s]][gs_kc, cs_g1, drop = FALSE]
                m_g2 <- m[[s]][gs_kc, cs_g2, drop = FALSE]
                d_kc <- d[gs_kc]
                lfc_kc <- lfc[[c, k]]
                
                gidx <- gs_idx[[c, k]]
                cidx <- c(g1, g2)
                
                re <- .sim(c, cs_g1, cs_g2, m_g1, m_g2, d_kc, lfc_kc)
                y[gidx, cidx] <- re$cs
                
                for (g in c("A", "B")) sim_mean[[k]][[g]][gidx] <- 
                    ifelse(is.null(re$ms[[g]]), NA, list(re$ms[[g]]))[[1]]
            }
        }
    }
    
    sim_mean <- sim_mean %>%
        map(bind_cols) %>% 
        bind_rows(.id = "cluster_id") %>% 
        mutate_at("cluster_id", factor) %>% 
        mutate(gene = rep(gs, nk))
    
    # construct gene metadata table storing ------------------------------------
    # gene | cluster_id | category | logFC, gene, disp, mean used for sim.
    gi <- data.frame(
        gene = unlist(gs_idx),
        cluster_id = rep.int(rep(kids, each = length(cats)), c(n_dd)),
        category = rep.int(rep(cats, nk), c(n_dd)),
        logFC = unlist(lfc),
        sim_gene = unlist(gs_by_kc),
        sim_disp = d[unlist(gs_by_kc)]) %>% 
        mutate_at("gene", as.character)
    # add true simulation means
    gi <- full_join(gi, sim_mean, by = c("gene", "cluster_id")) %>% 
        rename("sim_mean.A" = "A", "sim_mean.B" = "B")
    # reorder
    o <- order(as.numeric(gsub("[a-z]", "", gi$gene)))
    gi <- gi[o, ] %>% set_rownames(NULL)
    
    # construct SCE
    cd$sample_id <- factor(paste(cd$sample_id, cd$group_id, sep = "."))
    m <- match(levels(cd$sample_id), cd$sample_id)
    gids <- cd$group_id[m]
    o <- order(gids)
    sids <- levels(cd$sample_id)[o]
    ei <- data.frame(sample_id = sids, group_id = gids[o])
    cd <- cd %>% mutate_at("sample_id", factor, levels = sids)
    
    md <- list(
        experiment_info = ei,
        n_cells = table(cd$sample_id),
        gene_info = gi)
    
    SingleCellExperiment(
        assays = list(counts = as.matrix(y)),
        colData = cd, 
        metadata = md)
}