a8c963e5 |
#' Prepare Term Data for Enrichment Analysis
|
6a2ba253 |
#'
|
a8c963e5 |
#' Process term data downloaded with the \code{fetch_*} functions, preparing it
#' for fast enrichment analysis using \code{functional_enrichment}.
|
6a2ba253 |
#'
|
b8ec1cdd |
#' @details
#'
|
a8c963e5 |
#' This function takes two tibbles containing functional term information
#' (\code{terms}) and feature mapping (\code{mapping}), and converts them into
#' an object required by \code{functional_enrichment} for efficient analysis.
#' Terms and mapping can be generated with the database access functions
#' included in this package, such as \code{fetch_reactome} or
#' \code{fetch_go_from_go}.
|
b8ec1cdd |
#'
|
572fe00c |
#' @param terms A tibble with at least two columns: \code{term_id} and
|
a8c963e5 |
#' \code{term_name}. This tibble contains information about functional term
#' names and descriptions.
#' @param mapping A tibble with at least two columns, containing the mapping
#' between functional terms and features. One column must be named
#' \code{term_id}, while the other column should have a name specified by the
#' \code{feature_name} argument. For example, if \code{mapping} contains
#' columns \code{term_id}, \code{accession_number}, and \code{gene_symbol},
#' setting \code{feature_name = "gene_symbol"} indicates that gene symbols
#' will be used for enrichment analysis.
#' @param all_features A vector with all feature IDs used as the background for
|
cb0ef045 |
#' enrichment. If not specified, all features found in \code{mapping} will be
#' used, resulting in a larger object size.
|
a8c963e5 |
#' @param feature_name The name of the column in the \code{mapping} tibble to be
#' used as the feature identifier. For example, if \code{mapping} contains
#' columns \code{term_id}, \code{accession_number}, and \code{gene_symbol},
#' setting \code{feature_name = "gene_symbol"} indicates that gene symbols
#' will be used for enrichment analysis.
|
572fe00c |
#'
|
a8c963e5 |
#' @return An object of class \code{fenr_terms} required by
|
f24bfdf4 |
#' \code{functional_enrichment}.
|
a8c963e5 |
#' @importFrom assertthat assert_that
|
6a2ba253 |
#' @export
|
0a5e9dec |
#' @examples
|
39d97805 |
#' \dontrun{
|
bd4eb86c |
#' data(exmpl_all)
|
565383d1 |
#' go <- fetch_go(species = "sgd")
|
17492fda |
#' go_terms <- prepare_for_enrichment(go$terms, go$mapping, exmpl_all,
#' feature_name = "gene_symbol")
|
39d97805 |
#' }
|
17492fda |
prepare_for_enrichment <- function(terms, mapping, all_features = NULL,
feature_name = "gene_id") {
|
7fb769ba |
# Binding variables from non-standard evaluation locally
feature_id <- term_id <- NULL
|
bee6f509 |
# Argument checks
|
a8c963e5 |
assert_that(is.data.frame(terms) || tibble::is_tibble(terms),
msg = "'terms' must be a data frame or tibble.")
|
bee6f509 |
|
a8c963e5 |
assert_that(is.data.frame(mapping) || tibble::is_tibble(mapping),
msg = "'mapping' must be a data frame or tibble.")
|
bee6f509 |
|
a8c963e5 |
assert_that(is.null(all_features) || is.vector(all_features),
msg = "'all_features' must be a vector or NULL.")
|
bee6f509 |
|
a8c963e5 |
assert_that(is.character(feature_name) && length(feature_name) == 1,
msg = "'feature_name' must be a single string.")
|
bee6f509 |
|
f24bfdf4 |
# Check terms
|
a8c963e5 |
assert_that(all(c("term_id", "term_name") %in% colnames(terms)),
msg = "Column names in 'terms' should be 'term_id' and 'term_name'.")
assert_that(anyDuplicated(terms$term_id) == 0,
msg = "Duplicated term_id detected in 'terms'.")
|
f24bfdf4 |
# Check mapping
|
a8c963e5 |
assert_that("term_id" %in% colnames(mapping),
msg = "'mapping' should contain a column named 'term_id'.")
|
f24bfdf4 |
# Check for feature name
|
a8c963e5 |
assert_that(feature_name %in% colnames(mapping),
|
17492fda |
msg = paste0(feature_name, " column not found in mapping table.
Check 'feature_name' argument."))
|
f24bfdf4 |
# Replace empty all_features with everything from mapping
map_features <- mapping[[feature_name]] |>
unique()
if (is.null(all_features)) {
all_features <- map_features
} else {
# Check if mapping is contained in all features
if (length(intersect(all_features, map_features)) == 0)
|
17492fda |
stop("No overlap between 'all_features' and features found in 'mapping'.
Did you provide correct 'all_features'?")
|
6a2ba253 |
}
# Check for missing term descriptions
|
b8ec1cdd |
mis_term <- setdiff(mapping$term_id, terms$term_id)
|
6a2ba253 |
if (length(mis_term) > 0) {
dummy <- tibble::tibble(
term_id = mis_term,
term_name = rep(NA_character_, length(mis_term))
)
|
b8ec1cdd |
terms <- dplyr::bind_rows(terms, dummy)
|
6a2ba253 |
}
|
75128fb4 |
# Hash to select term name
|
4b6d4e6b |
term2name <- new.env(hash = TRUE)
|
c68983f8 |
for (i in seq_len(nrow(terms))) {
|
4b6d4e6b |
r <- terms[i, ]
term2name[[r$term_id]] <- r$term_name
}
|
6a2ba253 |
# feature-term tibble
feature_term <- mapping |>
dplyr::rename(feature_id = !!feature_name) |>
|
75128fb4 |
dplyr::filter(feature_id %in% all_features) |>
|
e61cd340 |
dplyr::select(feature_id, term_id) |>
dplyr::distinct()
|
6a2ba253 |
|
75128fb4 |
# Feature to terms hash
f2t <- feature_term |>
|
6a2ba253 |
dplyr::group_by(feature_id) |>
|
69770f91 |
dplyr::summarise(terms = list(term_id)) |>
tibble::deframe()
|
4b6d4e6b |
feature2term <- new.env(hash = TRUE)
|
69770f91 |
for(feat in names(f2t))
|
4b6d4e6b |
feature2term[[feat]] <- f2t[[feat]]
|
75128fb4 |
# Term to feature hash
t2f <- feature_term |>
|
6a2ba253 |
dplyr::group_by(term_id) |>
|
69770f91 |
dplyr::summarise(features = list(feature_id)) |>
tibble::deframe()
|
4b6d4e6b |
term2feature <- new.env(hash = TRUE)
|
69770f91 |
for(term in names(t2f))
|
4b6d4e6b |
term2feature[[term]] <- t2f[[term]]
|
6a2ba253 |
list(
term2name = term2name,
term2feature = term2feature,
feature2term = feature2term
|
f24bfdf4 |
) |>
|
e11876f7 |
structure(class = "fenr_terms")
|
6a2ba253 |
}
|
a8c963e5 |
#' Fast Functional Enrichment
|
6a2ba253 |
#'
|
a8c963e5 |
#' Perform fast functional enrichment analysis based on the hypergeometric
#' distribution. Designed for use in interactive applications.
|
6a2ba253 |
#'
|
a8c963e5 |
#' @details This function carries out functional enrichment analysis on a
#' selection of features (e.g., differentially expressed genes) using the
#' hypergeometric probability distribution (Fisher's exact test). Features can
#' be genes, proteins, etc. The \code{term_data} object contains functional
#' term information and feature-term mapping.
|
6a2ba253 |
#'
|
a8c963e5 |
#' @param feat_all A character vector with all feature identifiers, serving as
#' the background for enrichment.
|
6a2ba253 |
#' @param feat_sel A character vector with feature identifiers in the selection.
|
a8c963e5 |
#' @param term_data An object of class \code{fenr_terms}, created by
|
572fe00c |
#' \code{prepare_for_enrichment}.
|
a8c963e5 |
#' @param feat2name An optional named list to convert feature IDs into feature
|
572fe00c |
#' names.
|
6a2ba253 |
#'
|
a8c963e5 |
#' @return A tibble with enrichment results, providing the following information
#' for each term:
#' \itemize{
#' \item{\code{N_with} - number of features with this term among all features}
#' \item{\code{n_with_sel} - number of features with this term in the selection}
#' \item{\code{n_expect} - expected number of features with this term in the selection,
#' under the null hypothesis that terms are mapped to features randomly}
#' \item{\code{enrichment} - ratio of n_with_sel / n_expect}
#' \item{\code{odds_ratio} - odds ratio for enrichment; is infinite when all
#' features with the given term are in the selection}
#' \item{\code{p_value} - p-value from a single hypergeometric test}
|
17492fda |
#' \item{\code{p_adjust} - p-value adjusted for multiple tests using the
#' Benjamini-Hochberg approach}
|
a8c963e5 |
#' }.
|
83513bb5 |
#'
|
2e02fc20 |
#' @importFrom assertthat assert_that
|
83513bb5 |
#' @importFrom methods is
#' @export
|
6a2ba253 |
#' @examples
|
39d97805 |
#' \dontrun{
|
bd4eb86c |
#' data(exmpl_all, exmpl_sel)
|
565383d1 |
#' go <- fetch_go(species = "sgd")
#' go_terms <- prepare_for_enrichment(go$terms, go$mapping, exmpl_all, feature_name = "gene_symbol")
#' enr <- functional_enrichment(exmpl_all, exmpl_sel, go_terms)
|
39d97805 |
#' }
|
83513bb5 |
functional_enrichment <- function(feat_all, feat_sel, term_data, feat2name = NULL) {
|
6a2ba253 |
|
7fb769ba |
# Binding variables from non-standard evaluation locally
N_with <- n_with_sel <- n_expect <- enrichment <- odds_ratio <- NULL
desc <- p_value <- p_adjust <- NULL
|
6f7160fa |
# Check for character vectors
assert_that(is.character(feat_all))
assert_that(is.character(feat_sel))
assert_that(length(feat_all) > 1)
assert_that(length(feat_sel) > 1)
|
3b3ff8b9 |
# Check term_data class
|
0a5e9dec |
assert_that(is(term_data, "fenr_terms"))
|
f24bfdf4 |
|
3b3ff8b9 |
# If no overlap between selection and all, return NULL
if(!any(feat_sel %in% feat_all))
return(NULL)
|
6a2ba253 |
# all terms present in the selection
our_terms <- feat_sel |>
|
4b6d4e6b |
purrr::map(~term_data$feature2term[[.x]]) |>
|
6a2ba253 |
unlist() |>
unique()
|
69770f91 |
|
6a2ba253 |
# number of features in selection
N_sel <- length(feat_sel)
# total number of features
N_tot <- length(feat_all)
res <- purrr::map_dfr(our_terms, function(term_id) {
# all features with the term
|
4b6d4e6b |
# term_data$term2feature is a hash environment
tfeats <- term_data$term2feature[[term_id]]
|
284c6d6a |
# necessary if term data contain features not present in feat_all
tfeats <- tfeats[tfeats %in% feat_all]
|
6a2ba253 |
# features from selection with the term
|
de62a26f |
# this is faster than intersect(tfeats, feat_sel)
tfeats_sel <- tfeats[tfeats %in% feat_sel]
|
6a2ba253 |
N_with <- length(tfeats)
N_without <- N_tot - N_with
# building contingency table
n_with_sel <- length(tfeats_sel)
n_without_sel <- N_sel - n_with_sel
n_with_nsel <- N_with - n_with_sel
n_without_nsel <- N_tot - (n_with_sel + n_without_sel + n_with_nsel)
# contingency table is
# | In selection | Not in selection
#-------------------------------------------------
# With term | n_with_sel | n_with_nsel
# Without term | n_without_sel | n_without_nsel
|
83513bb5 |
if (n_with_sel < 2) return(NULL)
|
6a2ba253 |
# Expected number of features in selection, if random
n_expect <- N_with * N_sel / N_tot
# Odds ratio
odds_ratio <- (n_with_sel / n_without_sel) / (n_with_nsel / n_without_nsel)
# Hypergeometric function much faster than fisher.test
p <- 1 - stats::phyper(n_with_sel - 1, N_with, N_without, N_sel)
|
6f7160fa |
# Convert feature IDs to feature names;
|
0c4cd9e4 |
if (!is.null(feat2name))
|
6f7160fa |
tfeats_sel <- feat2name[tfeats_sel] |> unname()
|
6a2ba253 |
|
4b6d4e6b |
term_name <- term_data$term2name[[term_id]]
|
6a2ba253 |
# returns NAs if no term found
if (is.null(term_name)) term_name <- NA_character_
# constructing a tibble in every iteration is more time expensive than `c`,
# even with overhead of converting types afterwards. Not elegant, but fast.
c(
term_id = term_id,
term_name = term_name,
N_with = N_with,
n_with_sel = n_with_sel,
n_expect = n_expect,
enrichment = n_with_sel / n_expect,
odds_ratio = odds_ratio,
ids = paste(tfeats_sel, collapse = ", "),
p_value = p
)
})
|
96690646 |
|
17492fda |
# Drawback - if all selections below minimum, res is tibble 0 x 0, need to
# catch it
|
83513bb5 |
if (nrow(res) == 0) {
res <- NULL
} else {
res <- res |>
|
6a2ba253 |
dplyr::mutate(
|
7fb769ba |
dplyr::across(c(N_with, n_with_sel), as.integer),
dplyr::across(c(n_expect, enrichment, odds_ratio, p_value), as.numeric),
p_adjust = stats::p.adjust(p_value, method = "BH"),
dplyr::across(c(enrichment, odds_ratio, p_value, p_adjust), ~signif(.x, 3)),
|
6a2ba253 |
n_expect = round(n_expect, 2)
) |>
|
83513bb5 |
dplyr::arrange(desc(odds_ratio))
|
6a2ba253 |
}
|
83513bb5 |
res
|
6a2ba253 |
}
|