Browse code

Speeding up enrichment by using Rfast::Hash

Marek Gierlinski authored on 11/10/2022 12:33:09
Showing 3 changed files

... ...
@@ -32,6 +32,7 @@ Imports:
32 32
     readr,
33 33
     stringr,
34 34
     tibble,
35
+    Rfast,
35 36
     httr,
36 37
     XML,
37 38
     jsonlite,
... ...
@@ -68,26 +68,35 @@ prepare_for_enrichment <- function(terms, mapping, all_features = NULL, feature_
68 68
     terms <- dplyr::bind_rows(terms, dummy)
69 69
   }
70 70
 
71
-  # List to select term name
72
-  term2name <- terms$term_name |>
73
-    purrr::set_names(terms$term_id)
71
+  # Hash to select term name
72
+  term2name <- Rfast::Hash(
73
+    keys = terms$term_id,
74
+    values = terms$term_name
75
+  )
74 76
 
75 77
   # feature-term tibble
76 78
   feature_term <- mapping |>
77 79
     dplyr::rename(feature_id = !!feature_name) |>
78
-    dplyr::filter(feature_id %in% all_features)
80
+    dplyr::filter(feature_id %in% all_features) |>
81
+    dplyr::select(feature_id, term_id)
79 82
 
80
-  # Feature to terms conversion list
81
-  feature2term <- feature_term |>
83
+  # Feature to terms hash
84
+  f2t <- feature_term |>
82 85
     dplyr::group_by(feature_id) |>
83
-    dplyr::summarise(terms = list(term_id)) |>
84
-    tibble::deframe()
85
-
86
-  # Term to feature conversion list
87
-  term2feature <- feature_term |>
86
+    dplyr::summarise(terms = list(term_id))
87
+  feature2term <- Rfast::Hash(
88
+    keys = f2t$feature_id,
89
+    values = f2t$terms
90
+  )
91
+
92
+  # Term to feature hash
93
+  t2f <- feature_term |>
88 94
     dplyr::group_by(term_id) |>
89
-    dplyr::summarise(features = list(feature_id)) |>
90
-    tibble::deframe()
95
+    dplyr::summarise(features = list(feature_id))
96
+  term2feature <- Rfast::Hash(
97
+    keys = t2f$term_id,
98
+    values = t2f$features
99
+  )
91 100
 
92 101
   list(
93 102
     term2name = term2name,
... ...
@@ -157,7 +166,7 @@ functional_enrichment <- function(feat_all, feat_sel, term_data, feat2name = NUL
157 166
 
158 167
   # all terms present in the selection
159 168
   our_terms <- feat_sel |>
160
-    purrr::map(\(x) term_data$feature2term[[x]]) |>
169
+    purrr::map(\(x) term_data$feature2term[x]) |>
161 170
     unlist() |>
162 171
     unique()
163 172
   # number of features in selection
... ...
@@ -167,7 +176,9 @@ functional_enrichment <- function(feat_all, feat_sel, term_data, feat2name = NUL
167 176
 
168 177
   res <- purrr::map_dfr(our_terms, function(term_id) {
169 178
     # all features with the term
170
-    tfeats <- term_data$term2feature[[term_id]]
179
+    # [[1]] is needed because hash values are one-element lists
180
+    # term_data$term2feature is a Hash object
181
+    tfeats <- term_data$term2feature[term_id][[1]]
171 182
 
172 183
     # features from selection with the term
173 184
     # this is faster than intersect(tfeats, feat_sel)
... ...
@@ -201,7 +212,7 @@ functional_enrichment <- function(feat_all, feat_sel, term_data, feat2name = NUL
201 212
 
202 213
     if (!is.null(feat2name)) tfeats_sel <- feat2name[tfeats_sel] |> unname()
203 214
 
204
-    term_name <- term_data$term2name[[term_id]]
215
+    term_name <- term_data$term2name[term_id]
205 216
     # returns NAs if no term found
206 217
     if (is.null(term_name)) term_name <- NA_character_
207 218
 
... ...
@@ -15,10 +15,6 @@ terms <- tibble::tibble(
15 15
   term_name = term_names
16 16
 )
17 17
 
18
-# Prepare data for enrichment
19
-term2name <- term_names |>
20
-  purrr::set_names(term_ids)
21
-
22 18
 # random selection of features for terms
23 19
 set.seed(666)
24 20
 mapping <- purrr::map2_dfr(term_ids, term_sizes, function(tid, n) {
... ...
@@ -28,42 +24,44 @@ mapping <- purrr::map2_dfr(term_ids, term_sizes, function(tid, n) {
28 24
   )
29 25
 })
30 26
 
31
-# Feature to terms conversion list
32
-feature2term <- mapping |>
33
-  dplyr::group_by(feature_id) |>
34
-  dplyr::summarise(terms = list(term_id)) |>
35
-  tibble::deframe()
36
-
37
-# Term to feature conversion list
38
-term2feature <- mapping |>
39
-  dplyr::group_by(term_id) |>
40
-  dplyr::summarise(features = list(feature_id)) |>
41
-  tibble::deframe()
42
-
43
-# final structure required by functional_enrichment
44
-term_data <- list(
45
-  term2name = term2name,
46
-  feature2term = feature2term,
47
-  term2feature = term2feature
48
-) |>
49
-  structure(class = "fenr_terms")
50 27
 
51 28
 
52 29
 
53
-
54
-test_that("Expected normal output", {
30
+test_that("Expected correct output", {
55 31
   td <- prepare_for_enrichment(terms, mapping, feature_name = "feature_id")
56 32
 
57
-  # Order is not mandatory, so need to sort before comparison
58
-  expect_equal(sort(term_data$term2name), sort(td$term2name))
59
-
60
-  p1 <- purrr::map2(td$term2feature, term_data$term2feature, function(f1, f2) {
61
-    expect_equal(sort(f1), sort(f2))
62
-  })
63
-
64
-  p2 <- purrr::map2(td$feature2term, term_data$feature2term, function(f1, f2) {
65
-    expect_equal(sort(f1), sort(f2))
66
-  })
33
+  # Check term names
34
+  for(i in seq_along(terms$term_id)) {
35
+    r <- terms[i, ]
36
+    expect_equal(r$term_name, td$term2name[r$term_id])
37
+  }
38
+
39
+
40
+  # Check term-feature hash
41
+  term_ids <- mapping$term_id |> unique()
42
+  chk1 <- term_ids |>
43
+    purrr::map(function(trm) {
44
+      expected <- mapping |>
45
+        dplyr::filter(term_id == trm) |>
46
+        dplyr::pull(feature_id) |>
47
+        sort()
48
+      returned <- td$term2feature[trm][[1]] |>
49
+        sort()
50
+      expect_equal(expected, returned)
51
+    })
52
+
53
+  # Check feature-term hash
54
+  feature_ids <- mapping$feature_id |> unique()
55
+  chk2 <- feature_ids |>
56
+    purrr::map(function(feat) {
57
+      expected <- mapping |>
58
+        dplyr::filter(feature_id == feat) |>
59
+        dplyr::pull(term_id) |>
60
+        sort()
61
+      returned <- td$feature2term[feat][[1]] |>
62
+        sort()
63
+      expect_equal(expected, returned)
64
+    })
67 65
 })
68 66
 
69 67