Bioconductor Code: MPAC

Raw Blame Patch Log History
#' @title  Cluster samples by pathway over-representation
#'
#' @param  ovrmat  A matrix of gene set over-representation adjusted p-values
#'                 with rows as gene sets and columns as samples. It is the
#'                 output from `ovrGMT()`.
#'
#' @param n_neighbors  Number of neighbors for clustering. A larger number is
#'                     recommended if the size of samples is large. Default: 10.
#'
#' @param n_random_runs  Number of random runs. Due to randomness introduced
#'                       to the Louvain algorithm in R igraph 1.3.0
#'                       (https://github.com/igraph/rigraph/issues/539), a large
#'                       number of runs are recommended to evaluate randomness
#'                       in the clustering results. Default: 200, which shall be
#'                       safe for sample size < 50. Please increase it
#'                       accordingly for a larger sample size.
#'
#' @inheritParams ppRnaInp
#'
#' @return  A data table with each row representing one clustering result, and
#'          the first column denotes the number of occurrences of a clustering
#'          result and the rest of columns indicating each sample's cluster
#'          index. Rows are ordered by the number of occurrences from high to
#'          low.
#'
#' @examples
#'
#' fovr = system.file('extdata/clSamp/ovrmat.rds', package='MPAC')
#' ovrmat = readRDS(fovr)
#'
#' clSamp(ovrmat)
#'
#' @export
#'
#' @import SingleCellExperiment
#' @importFrom scran modelGeneVar getTopHVGs denoisePCA clusterCells
#' @importFrom BiocSingular RandomParam
#' @importFrom bluster NNGraphParam
#' @importFrom SummarizedExperiment rowData<-
#' @importFrom S4Vectors DataFrame
#' @importFrom BiocParallel bplapply
#'
clSamp <- function(ovrmat, n_neighbors=10, n_random_runs=200, threads=1) {
    n_top_hvgs <- 100

    ovrmat <- ovrmat[, sort(colnames(ovrmat))]
    ovrmat[is.na(ovrmat)] <- 1.0
    sce <- SingleCellExperiment(
        list(logcounts = abs(log10(ovrmat))),
        colData = DataFrame(samp=colnames(ovrmat)),
        rowData = DataFrame(goname=rownames(ovrmat))
    )

    bp <- getBPPARAM(threads)
    dec <- modelGeneVar(sce, BPPARAM=bp)
    top_hvgs <- getTopHVGs(dec, n=n_top_hvgs)
    rowData(sce)$is_top_hvgs <- (rownames(sce) %in% top_hvgs)

    sce <- denoisePCA(sce, subset.row=top_hvgs, technical=dec,
        BSPARAM=RandomParam(), BPPARAM=bp)

    ngp <- NNGraphParam(k=n_neighbors, type='jaccard', cluster.fun='louvain')

    pats <- colnames(sce)

    cldt <- bplapply(seq_len(n_random_runs), function(irep) {
        clusterCells(sce, use.dimred='PCA', BLUSPARAM=ngp) |> as.integer() |>
        data.table(irep=irep, pat=pats, icl=_)
    }, BPPARAM=bp) |> rbindlist() |>
    dcast(irep ~ pat, value.var='icl')

    nreps <- NULL
    cldt[, list(nreps = .N), by=pats] |>
    _[, c('nreps', pats), with=FALSE] |> _[order(-nreps)]
}