Bioconductor Code: fenr

Browse code

Speeding up enrichment by using Rfast::Hash

Marek Gierlinski authored on 11/10/2022 12:33:09
Showing 3 changed files

DESCRIPTION index cf0e1b7..530b8e9 100644
R/enrichment.R index 0270d88..e0ccc0d 100644
tests/testthat/test_prepare_for_enrichment.R index 60da034..6d870cd 100644

History View file @ 75128fb

@@ -32,6 +32,7 @@ Imports:
                          readr,
                          stringr,
                          tibble,
                     +    Rfast,
                          httr,
                          XML,
                          jsonlite,

R/enrichment.R

History View file @ 75128fb

@@ -68,26 +68,35 @@ prepare_for_enrichment <- function(terms, mapping, all_features = NULL, feature_
                          terms <- dplyr::bind_rows(terms, dummy)
                        }
                     -  # List to select term name
                     -  term2name <- terms$term_name |>
                     -    purrr::set_names(terms$term_id)
                     +  # Hash to select term name
                     +  term2name <- Rfast::Hash(
                     +    keys = terms$term_id,
                     +    values = terms$term_name
                     +  )
                        # feature-term tibble
                        feature_term <- mapping |>
                          dplyr::rename(feature_id = !!feature_name) |>
                     -    dplyr::filter(feature_id %in% all_features)
                     +    dplyr::filter(feature_id %in% all_features) |>
                     +    dplyr::select(feature_id, term_id)
                     -  # Feature to terms conversion list
                     -  feature2term <- feature_term |>
                     +  # Feature to terms hash
                     +  f2t <- feature_term |>
                          dplyr::group_by(feature_id) |>
                     -    dplyr::summarise(terms = list(term_id)) |>
                     -    tibble::deframe()
                     -
                     -  # Term to feature conversion list
                     -  term2feature <- feature_term |>
                     +    dplyr::summarise(terms = list(term_id))
                     +  feature2term <- Rfast::Hash(
                     +    keys = f2t$feature_id,
                     +    values = f2t$terms
                     +  )
                     +
                     +  # Term to feature hash
                     +  t2f <- feature_term |>
                          dplyr::group_by(term_id) |>
                     -    dplyr::summarise(features = list(feature_id)) |>
                     -    tibble::deframe()
                     +    dplyr::summarise(features = list(feature_id))
                     +  term2feature <- Rfast::Hash(
                     +    keys = t2f$term_id,
                     +    values = t2f$features
                     +  )
                        list(
                          term2name = term2name,
@@ -157,7 +166,7 @@ functional_enrichment <- function(feat_all, feat_sel, term_data, feat2name = NUL
                        # all terms present in the selection
                        our_terms <- feat_sel |>
                     -    purrr::map(\(x) term_data$feature2term[[x]]) |>
                     +    purrr::map(\(x) term_data$feature2term[x]) |>
                          unlist() |>
                          unique()
                        # number of features in selection
@@ -167,7 +176,9 @@ functional_enrichment <- function(feat_all, feat_sel, term_data, feat2name = NUL
                        res <- purrr::map_dfr(our_terms, function(term_id) {
                          # all features with the term
                     -    tfeats <- term_data$term2feature[[term_id]]
                     +    # [[1]] is needed because hash values are one-element lists
                     +    # term_data$term2feature is a Hash object
                     +    tfeats <- term_data$term2feature[term_id][[1]]
                          # features from selection with the term
                          # this is faster than intersect(tfeats, feat_sel)
@@ -201,7 +212,7 @@ functional_enrichment <- function(feat_all, feat_sel, term_data, feat2name = NUL
                          if (!is.null(feat2name)) tfeats_sel <- feat2name[tfeats_sel] |> unname()
                     -    term_name <- term_data$term2name[[term_id]]
                     +    term_name <- term_data$term2name[term_id]
                          # returns NAs if no term found
                          if (is.null(term_name)) term_name <- NA_character_

tests/testthat/test_prepare_for_enrichment.R

History View file @ 75128fb

@@ -15,10 +15,6 @@ terms <- tibble::tibble(
                        term_name = term_names
                      )
                     -# Prepare data for enrichment
                     -term2name <- term_names |>
                     -  purrr::set_names(term_ids)
                     -
                      # random selection of features for terms
                      set.seed(666)
                      mapping <- purrr::map2_dfr(term_ids, term_sizes, function(tid, n) {
@@ -28,42 +24,44 @@ mapping <- purrr::map2_dfr(term_ids, term_sizes, function(tid, n) {
                        )
                      })
                     -# Feature to terms conversion list
                     -feature2term <- mapping |>
                     -  dplyr::group_by(feature_id) |>
                     -  dplyr::summarise(terms = list(term_id)) |>
                     -  tibble::deframe()
                     -
                     -# Term to feature conversion list
                     -term2feature <- mapping |>
                     -  dplyr::group_by(term_id) |>
                     -  dplyr::summarise(features = list(feature_id)) |>
                     -  tibble::deframe()
                     -
                     -# final structure required by functional_enrichment
                     -term_data <- list(
                     -  term2name = term2name,
                     -  feature2term = feature2term,
                     -  term2feature = term2feature
                     -) |>
                     -  structure(class = "fenr_terms")
                     -
                     -test_that("Expected normal output", {
                     +test_that("Expected correct output", {
                        td <- prepare_for_enrichment(terms, mapping, feature_name = "feature_id")
                     -  # Order is not mandatory, so need to sort before comparison
                     -  expect_equal(sort(term_data$term2name), sort(td$term2name))
                     -
                     -  p1 <- purrr::map2(td$term2feature, term_data$term2feature, function(f1, f2) {
                     -    expect_equal(sort(f1), sort(f2))
                     -  })
                     -
                     -  p2 <- purrr::map2(td$feature2term, term_data$feature2term, function(f1, f2) {
                     -    expect_equal(sort(f1), sort(f2))
                     -  })
                     +  # Check term names
                     +  for(i in seq_along(terms$term_id)) {
                     +    r <- terms[i, ]
                     +    expect_equal(r$term_name, td$term2name[r$term_id])
                     +  }
                     +
                     +
                     +  # Check term-feature hash
                     +  term_ids <- mapping$term_id |> unique()
                     +  chk1 <- term_ids |>
                     +    purrr::map(function(trm) {
                     +      expected <- mapping |>
                     +        dplyr::filter(term_id == trm) |>
                     +        dplyr::pull(feature_id) |>
                     +        sort()
                     +      returned <- td$term2feature[trm][[1]] |>
                     +        sort()
                     +      expect_equal(expected, returned)
                     +    })
                     +
                     +  # Check feature-term hash
                     +  feature_ids <- mapping$feature_id |> unique()
                     +  chk2 <- feature_ids |>
                     +    purrr::map(function(feat) {
                     +      expected <- mapping |>
                     +        dplyr::filter(feature_id == feat) |>
                     +        dplyr::pull(term_id) |>
                     +        sort()
                     +      returned <- td$feature2term[feat][[1]] |>
                     +        sort()
                     +      expect_equal(expected, returned)
                     +    })
                      })