Bioconductor Code: fenr

R/enrichment.R

a8c963e5	#' Prepare Term Data for Enrichment Analysis
6a2ba253	#'
a8c963e5	#' Process term data downloaded with the \code{fetch_*} functions, preparing it #' for fast enrichment analysis using \code{functional_enrichment}.
6a2ba253	#'
b8ec1cdd	#' @details #'
a8c963e5	#' This function takes two tibbles containing functional term information #' (\code{terms}) and feature mapping (\code{mapping}), and converts them into #' an object required by \code{functional_enrichment} for efficient analysis. #' Terms and mapping can be generated with the database access functions #' included in this package, such as \code{fetch_reactome} or #' \code{fetch_go_from_go}.
b8ec1cdd	#'
572fe00c	#' @param terms A tibble with at least two columns: \code{term_id} and
a8c963e5	#' \code{term_name}. This tibble contains information about functional term #' names and descriptions. #' @param mapping A tibble with at least two columns, containing the mapping #' between functional terms and features. One column must be named #' \code{term_id}, while the other column should have a name specified by the #' \code{feature_name} argument. For example, if \code{mapping} contains #' columns \code{term_id}, \code{accession_number}, and \code{gene_symbol}, #' setting \code{feature_name = "gene_symbol"} indicates that gene symbols #' will be used for enrichment analysis. #' @param all_features A vector with all feature IDs used as the background for
cb0ef045	#' enrichment. If not specified, all features found in \code{mapping} will be #' used, resulting in a larger object size.
a8c963e5	#' @param feature_name The name of the column in the \code{mapping} tibble to be #' used as the feature identifier. For example, if \code{mapping} contains #' columns \code{term_id}, \code{accession_number}, and \code{gene_symbol}, #' setting \code{feature_name = "gene_symbol"} indicates that gene symbols #' will be used for enrichment analysis.
572fe00c	#'
a8c963e5	#' @return An object of class \code{fenr_terms} required by
f24bfdf4	#' \code{functional_enrichment}.
a8c963e5	#' @importFrom assertthat assert_that
6a2ba253	#' @export
0a5e9dec	#' @examples
39d97805	#' \dontrun{
bd4eb86c	#' data(exmpl_all)
565383d1	#' go <- fetch_go(species = "sgd")
17492fda	#' go_terms <- prepare_for_enrichment(go$terms, go$mapping, exmpl_all, #' feature_name = "gene_symbol")
39d97805	#' }
17492fda	prepare_for_enrichment <- function(terms, mapping, all_features = NULL, feature_name = "gene_id") {
7fb769ba	# Binding variables from non-standard evaluation locally feature_id <- term_id <- NULL
bee6f509	# Argument checks
a8c963e5	assert_that(is.data.frame(terms) \|\| tibble::is_tibble(terms), msg = "'terms' must be a data frame or tibble.")
bee6f509
a8c963e5	assert_that(is.data.frame(mapping) \|\| tibble::is_tibble(mapping), msg = "'mapping' must be a data frame or tibble.")
bee6f509
a8c963e5	assert_that(is.null(all_features) \|\| is.vector(all_features), msg = "'all_features' must be a vector or NULL.")
bee6f509
a8c963e5	assert_that(is.character(feature_name) && length(feature_name) == 1, msg = "'feature_name' must be a single string.")
bee6f509
f24bfdf4	# Check terms
a8c963e5	assert_that(all(c("term_id", "term_name") %in% colnames(terms)), msg = "Column names in 'terms' should be 'term_id' and 'term_name'.") assert_that(anyDuplicated(terms$term_id) == 0, msg = "Duplicated term_id detected in 'terms'.")
f24bfdf4	# Check mapping
a8c963e5	assert_that("term_id" %in% colnames(mapping), msg = "'mapping' should contain a column named 'term_id'.")
f24bfdf4	# Check for feature name
a8c963e5	assert_that(feature_name %in% colnames(mapping),
17492fda	msg = paste0(feature_name, " column not found in mapping table. Check 'feature_name' argument."))
f24bfdf4	# Replace empty all_features with everything from mapping map_features <- mapping[[feature_name]] \|> unique() if (is.null(all_features)) { all_features <- map_features } else { # Check if mapping is contained in all features if (length(intersect(all_features, map_features)) == 0)
17492fda	stop("No overlap between 'all_features' and features found in 'mapping'. Did you provide correct 'all_features'?")
6a2ba253	} # Check for missing term descriptions
b8ec1cdd	mis_term <- setdiff(mapping$term_id, terms$term_id)
6a2ba253	if (length(mis_term) > 0) { dummy <- tibble::tibble( term_id = mis_term, term_name = rep(NA_character_, length(mis_term)) )
b8ec1cdd	terms <- dplyr::bind_rows(terms, dummy)
6a2ba253	}
75128fb4	# Hash to select term name
4b6d4e6b	term2name <- new.env(hash = TRUE)
c68983f8	for (i in seq_len(nrow(terms))) {
4b6d4e6b	r <- terms[i, ] term2name[[r$term_id]] <- r$term_name }
6a2ba253	# feature-term tibble feature_term <- mapping \|> dplyr::rename(feature_id = !!feature_name) \|>
75128fb4	dplyr::filter(feature_id %in% all_features) \|>
e61cd340	dplyr::select(feature_id, term_id) \|> dplyr::distinct()
6a2ba253
75128fb4	# Feature to terms hash f2t <- feature_term \|>
6a2ba253	dplyr::group_by(feature_id) \|>
69770f91	dplyr::summarise(terms = list(term_id)) \|> tibble::deframe()
4b6d4e6b	feature2term <- new.env(hash = TRUE)
69770f91	for(feat in names(f2t))
4b6d4e6b	feature2term[[feat]] <- f2t[[feat]]
75128fb4	# Term to feature hash t2f <- feature_term \|>
6a2ba253	dplyr::group_by(term_id) \|>
69770f91	dplyr::summarise(features = list(feature_id)) \|> tibble::deframe()
4b6d4e6b	term2feature <- new.env(hash = TRUE)
69770f91	for(term in names(t2f))
4b6d4e6b	term2feature[[term]] <- t2f[[term]]
6a2ba253	list( term2name = term2name, term2feature = term2feature, feature2term = feature2term
f24bfdf4	) \|>
e11876f7	structure(class = "fenr_terms")
6a2ba253	}
a8c963e5	#' Fast Functional Enrichment
6a2ba253	#'
a8c963e5	#' Perform fast functional enrichment analysis based on the hypergeometric #' distribution. Designed for use in interactive applications.
6a2ba253	#'
a8c963e5	#' @details This function carries out functional enrichment analysis on a #' selection of features (e.g., differentially expressed genes) using the #' hypergeometric probability distribution (Fisher's exact test). Features can #' be genes, proteins, etc. The \code{term_data} object contains functional #' term information and feature-term mapping.
6a2ba253	#'
a8c963e5	#' @param feat_all A character vector with all feature identifiers, serving as #' the background for enrichment.
6a2ba253	#' @param feat_sel A character vector with feature identifiers in the selection.
a8c963e5	#' @param term_data An object of class \code{fenr_terms}, created by
572fe00c	#' \code{prepare_for_enrichment}.
a8c963e5	#' @param feat2name An optional named list to convert feature IDs into feature
572fe00c	#' names.
6a2ba253	#'
a8c963e5	#' @return A tibble with enrichment results, providing the following information #' for each term: #' \itemize{ #' \item{\code{N_with} - number of features with this term among all features} #' \item{\code{n_with_sel} - number of features with this term in the selection} #' \item{\code{n_expect} - expected number of features with this term in the selection, #' under the null hypothesis that terms are mapped to features randomly} #' \item{\code{enrichment} - ratio of n_with_sel / n_expect} #' \item{\code{odds_ratio} - odds ratio for enrichment; is infinite when all #' features with the given term are in the selection} #' \item{\code{p_value} - p-value from a single hypergeometric test}
17492fda	#' \item{\code{p_adjust} - p-value adjusted for multiple tests using the #' Benjamini-Hochberg approach}
a8c963e5	#' }.
83513bb5	#'
2e02fc20	#' @importFrom assertthat assert_that
83513bb5	#' @importFrom methods is #' @export
6a2ba253	#' @examples
39d97805	#' \dontrun{
bd4eb86c	#' data(exmpl_all, exmpl_sel)
565383d1	#' go <- fetch_go(species = "sgd") #' go_terms <- prepare_for_enrichment(go$terms, go$mapping, exmpl_all, feature_name = "gene_symbol") #' enr <- functional_enrichment(exmpl_all, exmpl_sel, go_terms)
39d97805	#' }
83513bb5	functional_enrichment <- function(feat_all, feat_sel, term_data, feat2name = NULL) {
6a2ba253
7fb769ba	# Binding variables from non-standard evaluation locally N_with <- n_with_sel <- n_expect <- enrichment <- odds_ratio <- NULL desc <- p_value <- p_adjust <- NULL
6f7160fa	# Check for character vectors assert_that(is.character(feat_all)) assert_that(is.character(feat_sel)) assert_that(length(feat_all) > 1) assert_that(length(feat_sel) > 1)
3b3ff8b9	# Check term_data class
0a5e9dec	assert_that(is(term_data, "fenr_terms"))
f24bfdf4
3b3ff8b9	# If no overlap between selection and all, return NULL if(!any(feat_sel %in% feat_all)) return(NULL)
6a2ba253	# all terms present in the selection our_terms <- feat_sel \|>
4b6d4e6b	purrr::map(~term_data$feature2term[[.x]]) \|>
6a2ba253	unlist() \|> unique()
69770f91
6a2ba253	# number of features in selection N_sel <- length(feat_sel) # total number of features N_tot <- length(feat_all) res <- purrr::map_dfr(our_terms, function(term_id) { # all features with the term
4b6d4e6b	# term_data$term2feature is a hash environment tfeats <- term_data$term2feature[[term_id]]
284c6d6a	# necessary if term data contain features not present in feat_all tfeats <- tfeats[tfeats %in% feat_all]
6a2ba253	# features from selection with the term
de62a26f	# this is faster than intersect(tfeats, feat_sel) tfeats_sel <- tfeats[tfeats %in% feat_sel]
6a2ba253	N_with <- length(tfeats) N_without <- N_tot - N_with # building contingency table n_with_sel <- length(tfeats_sel) n_without_sel <- N_sel - n_with_sel n_with_nsel <- N_with - n_with_sel n_without_nsel <- N_tot - (n_with_sel + n_without_sel + n_with_nsel) # contingency table is # \| In selection \| Not in selection #------------------------------------------------- # With term \| n_with_sel \| n_with_nsel # Without term \| n_without_sel \| n_without_nsel
83513bb5	if (n_with_sel < 2) return(NULL)
6a2ba253	# Expected number of features in selection, if random n_expect <- N_with * N_sel / N_tot # Odds ratio odds_ratio <- (n_with_sel / n_without_sel) / (n_with_nsel / n_without_nsel) # Hypergeometric function much faster than fisher.test p <- 1 - stats::phyper(n_with_sel - 1, N_with, N_without, N_sel)
6f7160fa	# Convert feature IDs to feature names;
0c4cd9e4	if (!is.null(feat2name))
6f7160fa	tfeats_sel <- feat2name[tfeats_sel] \|> unname()
6a2ba253
4b6d4e6b	term_name <- term_data$term2name[[term_id]]
6a2ba253	# returns NAs if no term found if (is.null(term_name)) term_name <- NA_character_ # constructing a tibble in every iteration is more time expensive than `c`, # even with overhead of converting types afterwards. Not elegant, but fast. c( term_id = term_id, term_name = term_name, N_with = N_with, n_with_sel = n_with_sel, n_expect = n_expect, enrichment = n_with_sel / n_expect, odds_ratio = odds_ratio, ids = paste(tfeats_sel, collapse = ", "), p_value = p ) })
96690646
17492fda	# Drawback - if all selections below minimum, res is tibble 0 x 0, need to # catch it
83513bb5	if (nrow(res) == 0) { res <- NULL } else { res <- res \|>
6a2ba253	dplyr::mutate(
7fb769ba	dplyr::across(c(N_with, n_with_sel), as.integer), dplyr::across(c(n_expect, enrichment, odds_ratio, p_value), as.numeric), p_adjust = stats::p.adjust(p_value, method = "BH"), dplyr::across(c(enrichment, odds_ratio, p_value, p_adjust), ~signif(.x, 3)),
6a2ba253	n_expect = round(n_expect, 2) ) \|>
83513bb5	dplyr::arrange(desc(odds_ratio))
6a2ba253	}
83513bb5	res
6a2ba253	}