#' Prepare Term Data for Enrichment Analysis #' #' Process term data downloaded with the \code{fetch_*} functions, preparing it #' for fast enrichment analysis using \code{functional_enrichment}. #' #' @details #' #' This function takes two tibbles containing functional term information #' (\code{terms}) and feature mapping (\code{mapping}), and converts them into #' an object required by \code{functional_enrichment} for efficient analysis. #' Terms and mapping can be generated with the database access functions #' included in this package, such as \code{fetch_reactome} or #' \code{fetch_go_from_go}. #' #' @param terms A tibble with at least two columns: \code{term_id} and #' \code{term_name}. This tibble contains information about functional term #' names and descriptions. #' @param mapping A tibble with at least two columns, containing the mapping #' between functional terms and features. One column must be named #' \code{term_id}, while the other column should have a name specified by the #' \code{feature_name} argument. For example, if \code{mapping} contains #' columns \code{term_id}, \code{accession_number}, and \code{gene_symbol}, #' setting \code{feature_name = "gene_symbol"} indicates that gene symbols #' will be used for enrichment analysis. #' @param all_features A vector with all feature IDs used as the background for #' enrichment. If not specified, all features found in \code{mapping} will be #' used, resulting in a larger object size. #' @param feature_name The name of the column in the \code{mapping} tibble to be #' used as the feature identifier. For example, if \code{mapping} contains #' columns \code{term_id}, \code{accession_number}, and \code{gene_symbol}, #' setting \code{feature_name = "gene_symbol"} indicates that gene symbols #' will be used for enrichment analysis. #' #' @return An object of class \code{fenr_terms} required by #' \code{functional_enrichment}. #' @importFrom assertthat assert_that #' @export #' @examples #' \dontrun{ #' data(exmpl_all) #' go <- fetch_go(species = "sgd") #' go_terms <- prepare_for_enrichment(go$terms, go$mapping, exmpl_all, #' feature_name = "gene_symbol") #' } prepare_for_enrichment <- function(terms, mapping, all_features = NULL, feature_name = "gene_id") { # Binding variables from non-standard evaluation locally feature_id <- term_id <- NULL # Argument checks assert_that(is.data.frame(terms) || tibble::is_tibble(terms), msg = "'terms' must be a data frame or tibble.") assert_that(is.data.frame(mapping) || tibble::is_tibble(mapping), msg = "'mapping' must be a data frame or tibble.") assert_that(is.null(all_features) || is.vector(all_features), msg = "'all_features' must be a vector or NULL.") assert_that(is.character(feature_name) && length(feature_name) == 1, msg = "'feature_name' must be a single string.") # Check terms assert_that(all(c("term_id", "term_name") %in% colnames(terms)), msg = "Column names in 'terms' should be 'term_id' and 'term_name'.") assert_that(anyDuplicated(terms$term_id) == 0, msg = "Duplicated term_id detected in 'terms'.") # Check mapping assert_that("term_id" %in% colnames(mapping), msg = "'mapping' should contain a column named 'term_id'.") # Check for feature name assert_that(feature_name %in% colnames(mapping), msg = paste0(feature_name, " column not found in mapping table. Check 'feature_name' argument.")) # Replace empty all_features with everything from mapping map_features <- mapping[[feature_name]] |> unique() if (is.null(all_features)) { all_features <- map_features } else { # Check if mapping is contained in all features if (length(intersect(all_features, map_features)) == 0) stop("No overlap between 'all_features' and features found in 'mapping'. Did you provide correct 'all_features'?") } # Check for missing term descriptions mis_term <- setdiff(mapping$term_id, terms$term_id) if (length(mis_term) > 0) { dummy <- tibble::tibble( term_id = mis_term, term_name = rep(NA_character_, length(mis_term)) ) terms <- dplyr::bind_rows(terms, dummy) } # Hash to select term name term2name <- new.env(hash = TRUE) for (i in seq_len(nrow(terms))) { r <- terms[i, ] term2name[[r$term_id]] <- r$term_name } # feature-term tibble feature_term <- mapping |> dplyr::rename(feature_id = !!feature_name) |> dplyr::filter(feature_id %in% all_features) |> dplyr::select(feature_id, term_id) |> dplyr::distinct() # Feature to terms hash f2t <- feature_term |> dplyr::group_by(feature_id) |> dplyr::summarise(terms = list(term_id)) |> tibble::deframe() feature2term <- new.env(hash = TRUE) for(feat in names(f2t)) feature2term[[feat]] <- f2t[[feat]] # Term to feature hash t2f <- feature_term |> dplyr::group_by(term_id) |> dplyr::summarise(features = list(feature_id)) |> tibble::deframe() term2feature <- new.env(hash = TRUE) for(term in names(t2f)) term2feature[[term]] <- t2f[[term]] list( term2name = term2name, term2feature = term2feature, feature2term = feature2term ) |> structure(class = "fenr_terms") } #' Fast Functional Enrichment #' #' Perform fast functional enrichment analysis based on the hypergeometric #' distribution. Designed for use in interactive applications. #' #' @details This function carries out functional enrichment analysis on a #' selection of features (e.g., differentially expressed genes) using the #' hypergeometric probability distribution (Fisher's exact test). Features can #' be genes, proteins, etc. The \code{term_data} object contains functional #' term information and feature-term mapping. #' #' @param feat_all A character vector with all feature identifiers, serving as #' the background for enrichment. #' @param feat_sel A character vector with feature identifiers in the selection. #' @param term_data An object of class \code{fenr_terms}, created by #' \code{prepare_for_enrichment}. #' @param feat2name An optional named list to convert feature IDs into feature #' names. #' #' @return A tibble with enrichment results, providing the following information #' for each term: #' \itemize{ #' \item{\code{N_with} - number of features with this term among all features} #' \item{\code{n_with_sel} - number of features with this term in the selection} #' \item{\code{n_expect} - expected number of features with this term in the selection, #' under the null hypothesis that terms are mapped to features randomly} #' \item{\code{enrichment} - ratio of n_with_sel / n_expect} #' \item{\code{odds_ratio} - odds ratio for enrichment; is infinite when all #' features with the given term are in the selection} #' \item{\code{p_value} - p-value from a single hypergeometric test} #' \item{\code{p_adjust} - p-value adjusted for multiple tests using the #' Benjamini-Hochberg approach} #' }. #' #' @importFrom assertthat assert_that #' @importFrom methods is #' @export #' @examples #' \dontrun{ #' data(exmpl_all, exmpl_sel) #' go <- fetch_go(species = "sgd") #' go_terms <- prepare_for_enrichment(go$terms, go$mapping, exmpl_all, feature_name = "gene_symbol") #' enr <- functional_enrichment(exmpl_all, exmpl_sel, go_terms) #' } functional_enrichment <- function(feat_all, feat_sel, term_data, feat2name = NULL) { # Binding variables from non-standard evaluation locally N_with <- n_with_sel <- n_expect <- enrichment <- odds_ratio <- NULL desc <- p_value <- p_adjust <- NULL # Check for character vectors assert_that(is.character(feat_all)) assert_that(is.character(feat_sel)) assert_that(length(feat_all) > 1) assert_that(length(feat_sel) > 1) # Check term_data class assert_that(is(term_data, "fenr_terms")) # If no overlap between selection and all, return NULL if(!any(feat_sel %in% feat_all)) return(NULL) # all terms present in the selection our_terms <- feat_sel |> purrr::map(~term_data$feature2term[[.x]]) |> unlist() |> unique() # number of features in selection N_sel <- length(feat_sel) # total number of features N_tot <- length(feat_all) res <- purrr::map_dfr(our_terms, function(term_id) { # all features with the term # term_data$term2feature is a hash environment tfeats <- term_data$term2feature[[term_id]] # necessary if term data contain features not present in feat_all tfeats <- tfeats[tfeats %in% feat_all] # features from selection with the term # this is faster than intersect(tfeats, feat_sel) tfeats_sel <- tfeats[tfeats %in% feat_sel] N_with <- length(tfeats) N_without <- N_tot - N_with # building contingency table n_with_sel <- length(tfeats_sel) n_without_sel <- N_sel - n_with_sel n_with_nsel <- N_with - n_with_sel n_without_nsel <- N_tot - (n_with_sel + n_without_sel + n_with_nsel) # contingency table is # | In selection | Not in selection #------------------------------------------------- # With term | n_with_sel | n_with_nsel # Without term | n_without_sel | n_without_nsel if (n_with_sel < 2) return(NULL) # Expected number of features in selection, if random n_expect <- N_with * N_sel / N_tot # Odds ratio odds_ratio <- (n_with_sel / n_without_sel) / (n_with_nsel / n_without_nsel) # Hypergeometric function much faster than fisher.test p <- 1 - stats::phyper(n_with_sel - 1, N_with, N_without, N_sel) # Convert feature IDs to feature names; if (!is.null(feat2name)) tfeats_sel <- feat2name[tfeats_sel] |> unname() term_name <- term_data$term2name[[term_id]] # returns NAs if no term found if (is.null(term_name)) term_name <- NA_character_ # constructing a tibble in every iteration is more time expensive than `c`, # even with overhead of converting types afterwards. Not elegant, but fast. c( term_id = term_id, term_name = term_name, N_with = N_with, n_with_sel = n_with_sel, n_expect = n_expect, enrichment = n_with_sel / n_expect, odds_ratio = odds_ratio, ids = paste(tfeats_sel, collapse = ", "), p_value = p ) }) # Drawback - if all selections below minimum, res is tibble 0 x 0, need to # catch it if (nrow(res) == 0) { res <- NULL } else { res <- res |> dplyr::mutate( dplyr::across(c(N_with, n_with_sel), as.integer), dplyr::across(c(n_expect, enrichment, odds_ratio, p_value), as.numeric), p_adjust = stats::p.adjust(p_value, method = "BH"), dplyr::across(c(enrichment, odds_ratio, p_value, p_adjust), ~signif(.x, 3)), n_expect = round(n_expect, 2) ) |> dplyr::arrange(desc(odds_ratio)) } res }