R/enrichment.R
a8c963e5
 #' Prepare Term Data for Enrichment Analysis
6a2ba253
 #'
a8c963e5
 #' Process term data downloaded with the \code{fetch_*} functions, preparing it
 #' for fast enrichment analysis using \code{functional_enrichment}.
6a2ba253
 #'
b8ec1cdd
 #' @details
 #'
a8c963e5
 #' This function takes two tibbles containing functional term information
 #' (\code{terms}) and feature mapping (\code{mapping}), and converts them into
 #' an object required by \code{functional_enrichment} for efficient analysis.
 #' Terms and mapping can be generated with the database access functions
 #' included in this package, such as \code{fetch_reactome} or
 #' \code{fetch_go_from_go}.
b8ec1cdd
 #'
572fe00c
 #' @param terms A tibble with at least two columns: \code{term_id} and
a8c963e5
 #'   \code{term_name}. This tibble contains information about functional term
 #'   names and descriptions.
 #' @param mapping A tibble with at least two columns, containing the mapping
 #'   between functional terms and features. One column must be named
 #'   \code{term_id}, while the other column should have a name specified by the
 #'   \code{feature_name} argument. For example, if \code{mapping} contains
 #'   columns \code{term_id}, \code{accession_number}, and \code{gene_symbol},
 #'   setting \code{feature_name = "gene_symbol"} indicates that gene symbols
 #'   will be used for enrichment analysis.
 #' @param all_features A vector with all feature IDs used as the background for
cb0ef045
 #'   enrichment. If not specified, all features found in \code{mapping} will be
 #'   used, resulting in a larger object size.
a8c963e5
 #' @param feature_name The name of the column in the \code{mapping} tibble to be
 #'   used as the feature identifier. For example, if \code{mapping} contains
 #'   columns \code{term_id}, \code{accession_number}, and \code{gene_symbol},
 #'   setting \code{feature_name = "gene_symbol"} indicates that gene symbols
 #'   will be used for enrichment analysis.
572fe00c
 #'
a8c963e5
 #' @return An object of class \code{fenr_terms} required by
f24bfdf4
 #'   \code{functional_enrichment}.
a8c963e5
 #' @importFrom assertthat assert_that
6a2ba253
 #' @export
0a5e9dec
 #' @examples
39d97805
 #' \dontrun{
bd4eb86c
 #' data(exmpl_all)
565383d1
 #' go <- fetch_go(species = "sgd")
17492fda
 #' go_terms <- prepare_for_enrichment(go$terms, go$mapping, exmpl_all,
 #'                                    feature_name = "gene_symbol")
39d97805
 #' }
17492fda
 prepare_for_enrichment <- function(terms, mapping, all_features = NULL,
                                    feature_name = "gene_id") {
7fb769ba
   # Binding variables from non-standard evaluation locally
   feature_id <- term_id <- NULL
 
bee6f509
   # Argument checks
a8c963e5
   assert_that(is.data.frame(terms) || tibble::is_tibble(terms),
               msg = "'terms' must be a data frame or tibble.")
bee6f509
 
a8c963e5
   assert_that(is.data.frame(mapping) || tibble::is_tibble(mapping),
               msg = "'mapping' must be a data frame or tibble.")
bee6f509
 
a8c963e5
   assert_that(is.null(all_features) || is.vector(all_features),
               msg = "'all_features' must be a vector or NULL.")
bee6f509
 
a8c963e5
   assert_that(is.character(feature_name) && length(feature_name) == 1,
               msg = "'feature_name' must be a single string.")
bee6f509
 
f24bfdf4
   # Check terms
a8c963e5
   assert_that(all(c("term_id", "term_name") %in% colnames(terms)),
               msg = "Column names in 'terms' should be 'term_id' and 'term_name'.")
 
   assert_that(anyDuplicated(terms$term_id) == 0,
               msg = "Duplicated term_id detected in 'terms'.")
f24bfdf4
 
   # Check mapping
a8c963e5
   assert_that("term_id" %in% colnames(mapping),
               msg = "'mapping' should contain a column named 'term_id'.")
f24bfdf4
 
   # Check for feature name
a8c963e5
   assert_that(feature_name %in% colnames(mapping),
17492fda
               msg = paste0(feature_name, " column not found in mapping table.
                            Check 'feature_name' argument."))
f24bfdf4
 
   # Replace empty all_features with everything from mapping
   map_features <- mapping[[feature_name]] |>
     unique()
   if (is.null(all_features)) {
     all_features <- map_features
   } else {
     # Check if mapping is contained in all features
     if (length(intersect(all_features, map_features)) == 0)
17492fda
       stop("No overlap between 'all_features' and features found in 'mapping'.
            Did you provide correct 'all_features'?")
6a2ba253
   }
 
   # Check for missing term descriptions
b8ec1cdd
   mis_term <- setdiff(mapping$term_id, terms$term_id)
6a2ba253
   if (length(mis_term) > 0) {
     dummy <- tibble::tibble(
       term_id = mis_term,
       term_name = rep(NA_character_, length(mis_term))
     )
b8ec1cdd
     terms <- dplyr::bind_rows(terms, dummy)
6a2ba253
   }
 
75128fb4
   # Hash to select term name
4b6d4e6b
   term2name <- new.env(hash = TRUE)
c68983f8
   for (i in seq_len(nrow(terms))) {
4b6d4e6b
     r <- terms[i, ]
     term2name[[r$term_id]] <- r$term_name
   }
6a2ba253
 
   # feature-term tibble
   feature_term <- mapping |>
     dplyr::rename(feature_id = !!feature_name) |>
75128fb4
     dplyr::filter(feature_id %in% all_features) |>
e61cd340
     dplyr::select(feature_id, term_id) |>
     dplyr::distinct()
6a2ba253
 
75128fb4
   # Feature to terms hash
   f2t <- feature_term |>
6a2ba253
     dplyr::group_by(feature_id) |>
69770f91
     dplyr::summarise(terms = list(term_id)) |>
     tibble::deframe()
4b6d4e6b
   feature2term <- new.env(hash = TRUE)
69770f91
   for(feat in names(f2t))
4b6d4e6b
     feature2term[[feat]] <- f2t[[feat]]
75128fb4
 
   # Term to feature hash
   t2f <- feature_term |>
6a2ba253
     dplyr::group_by(term_id) |>
69770f91
     dplyr::summarise(features = list(feature_id)) |>
     tibble::deframe()
4b6d4e6b
   term2feature <- new.env(hash = TRUE)
69770f91
   for(term in names(t2f))
4b6d4e6b
     term2feature[[term]] <- t2f[[term]]
6a2ba253
 
   list(
     term2name = term2name,
     term2feature = term2feature,
     feature2term = feature2term
f24bfdf4
   ) |>
e11876f7
     structure(class = "fenr_terms")
6a2ba253
 }
 
 
a8c963e5
 
 #' Fast Functional Enrichment
6a2ba253
 #'
a8c963e5
 #' Perform fast functional enrichment analysis based on the hypergeometric
 #' distribution. Designed for use in interactive applications.
6a2ba253
 #'
a8c963e5
 #' @details This function carries out functional enrichment analysis on a
 #'   selection of features (e.g., differentially expressed genes) using the
 #'   hypergeometric probability distribution (Fisher's exact test). Features can
 #'   be genes, proteins, etc. The \code{term_data} object contains functional
 #'   term information and feature-term mapping.
6a2ba253
 #'
a8c963e5
 #' @param feat_all A character vector with all feature identifiers, serving as
 #'   the background for enrichment.
6a2ba253
 #' @param feat_sel A character vector with feature identifiers in the selection.
a8c963e5
 #' @param term_data An object of class \code{fenr_terms}, created by
572fe00c
 #'   \code{prepare_for_enrichment}.
a8c963e5
 #' @param feat2name An optional named list to convert feature IDs into feature
572fe00c
 #'   names.
6a2ba253
 #'
a8c963e5
 #' @return A tibble with enrichment results, providing the following information
 #'   for each term:
 #'   \itemize{
 #'     \item{\code{N_with} - number of features with this term among all features}
 #'     \item{\code{n_with_sel} - number of features with this term in the selection}
 #'     \item{\code{n_expect} - expected number of features with this term in the selection,
 #'       under the null hypothesis that terms are mapped to features randomly}
 #'     \item{\code{enrichment} - ratio of n_with_sel / n_expect}
 #'     \item{\code{odds_ratio} - odds ratio for enrichment; is infinite when all
 #'       features with the given term are in the selection}
 #'     \item{\code{p_value} - p-value from a single hypergeometric test}
17492fda
 #'     \item{\code{p_adjust} - p-value adjusted for multiple tests using the
 #'       Benjamini-Hochberg approach}
a8c963e5
 #'   }.
83513bb5
 #'
2e02fc20
 #' @importFrom assertthat assert_that
83513bb5
 #' @importFrom methods is
 #' @export
6a2ba253
 #' @examples
39d97805
 #' \dontrun{
bd4eb86c
 #' data(exmpl_all, exmpl_sel)
565383d1
 #' go <- fetch_go(species = "sgd")
 #' go_terms <- prepare_for_enrichment(go$terms, go$mapping, exmpl_all, feature_name = "gene_symbol")
 #' enr <- functional_enrichment(exmpl_all, exmpl_sel, go_terms)
39d97805
 #' }
83513bb5
 functional_enrichment <- function(feat_all, feat_sel, term_data, feat2name = NULL) {
6a2ba253
 
7fb769ba
   # Binding variables from non-standard evaluation locally
   N_with <- n_with_sel <- n_expect <- enrichment <- odds_ratio <- NULL
   desc <- p_value <- p_adjust <- NULL
 
6f7160fa
   # Check for character vectors
   assert_that(is.character(feat_all))
   assert_that(is.character(feat_sel))
   assert_that(length(feat_all) > 1)
   assert_that(length(feat_sel) > 1)
 
3b3ff8b9
   # Check term_data class
0a5e9dec
   assert_that(is(term_data, "fenr_terms"))
f24bfdf4
 
3b3ff8b9
   # If no overlap between selection and all, return NULL
   if(!any(feat_sel %in% feat_all))
     return(NULL)
 
6a2ba253
   # all terms present in the selection
   our_terms <- feat_sel |>
4b6d4e6b
     purrr::map(~term_data$feature2term[[.x]]) |>
6a2ba253
     unlist() |>
     unique()
69770f91
 
6a2ba253
   # number of features in selection
   N_sel <- length(feat_sel)
   # total number of features
   N_tot <- length(feat_all)
 
   res <- purrr::map_dfr(our_terms, function(term_id) {
     # all features with the term
4b6d4e6b
     # term_data$term2feature is a hash environment
     tfeats <- term_data$term2feature[[term_id]]
284c6d6a
     # necessary if term data contain features not present in feat_all
     tfeats <- tfeats[tfeats %in% feat_all]
6a2ba253
 
     # features from selection with the term
de62a26f
     # this is faster than intersect(tfeats, feat_sel)
     tfeats_sel <- tfeats[tfeats %in% feat_sel]
6a2ba253
 
     N_with <- length(tfeats)
     N_without <- N_tot - N_with
 
     # building contingency table
     n_with_sel <- length(tfeats_sel)
     n_without_sel <- N_sel - n_with_sel
     n_with_nsel <- N_with - n_with_sel
     n_without_nsel <- N_tot - (n_with_sel + n_without_sel + n_with_nsel)
 
     # contingency table is
     #               | In selection  | Not in selection
     #-------------------------------------------------
     #  With term    | n_with_sel    | n_with_nsel
     #  Without term | n_without_sel | n_without_nsel
 
83513bb5
     if (n_with_sel < 2) return(NULL)
6a2ba253
 
     # Expected number of features in selection, if random
     n_expect <- N_with * N_sel / N_tot
 
     # Odds ratio
     odds_ratio <- (n_with_sel / n_without_sel) / (n_with_nsel / n_without_nsel)
 
     # Hypergeometric function much faster than fisher.test
     p <- 1 - stats::phyper(n_with_sel - 1, N_with, N_without, N_sel)
 
6f7160fa
     # Convert feature IDs to feature names;
0c4cd9e4
     if (!is.null(feat2name))
6f7160fa
       tfeats_sel <- feat2name[tfeats_sel] |> unname()
6a2ba253
 
4b6d4e6b
     term_name <- term_data$term2name[[term_id]]
6a2ba253
     # returns NAs if no term found
     if (is.null(term_name)) term_name <- NA_character_
 
     # constructing a tibble in every iteration is more time expensive than `c`,
     # even with overhead of converting types afterwards. Not elegant, but fast.
     c(
       term_id = term_id,
       term_name = term_name,
       N_with = N_with,
       n_with_sel = n_with_sel,
       n_expect = n_expect,
       enrichment = n_with_sel / n_expect,
       odds_ratio = odds_ratio,
       ids = paste(tfeats_sel, collapse = ", "),
       p_value = p
     )
   })
96690646
 
17492fda
   # Drawback - if all selections below minimum, res is tibble 0 x 0, need to
   # catch it
83513bb5
   if (nrow(res) == 0) {
     res <- NULL
   } else {
     res <- res |>
6a2ba253
       dplyr::mutate(
7fb769ba
         dplyr::across(c(N_with, n_with_sel), as.integer),
         dplyr::across(c(n_expect, enrichment, odds_ratio, p_value), as.numeric),
         p_adjust = stats::p.adjust(p_value, method = "BH"),
         dplyr::across(c(enrichment, odds_ratio, p_value, p_adjust), ~signif(.x, 3)),
6a2ba253
         n_expect = round(n_expect, 2)
       ) |>
83513bb5
       dplyr::arrange(desc(odds_ratio))
6a2ba253
   }
83513bb5
   res
6a2ba253
 }