Browse code

Use explicit string instead of indices when extracting variance component

Jaehyun Joo authored on 21/03/2022 19:19:48
Showing 1 changed files
... ...
@@ -169,7 +169,7 @@ removeICC <- function(x, qc_samples,
169 169
         icc <- tryCatch({
170 170
             fit <- nlme::lme(y ~ 1,  random = ~ 1 | f, data = d,
171 171
                              na.action = na.omit)
172
-            vv <- as.numeric(nlme::VarCorr(fit)[c(1, 2)])
172
+            vv <- as.numeric(nlme::VarCorr(fit)[, "Variance"])
173 173
             vv[1] / sum(vv)
174 174
         },
175 175
         error = function(e) NA_real_
... ...
@@ -235,7 +235,7 @@ removeBlankRatio <- function(x, blank_samples, qc_samples, cut = 2,
235 235
         stop("Please specify QC samples.")
236 236
     }
237 237
     if (missing(blank_samples)) {
238
-        stop("Please specify Blank samples.")
238
+        stop("Please specify blank samples.")
239 239
     }
240 240
     m_qc <- x[, qc_samples, drop = FALSE]
241 241
     m_blank <- x[, blank_samples, drop = FALSE]
Browse code

Fixed the issues in gitignore for *.R files

Jaehyun Joo authored on 18/03/2022 13:00:01
Showing 1 changed files
1 1
new file mode 100644
... ...
@@ -0,0 +1,264 @@
1
+##' Feature filtering based on proportions of missing values
2
+##'
3
+##' Removes Features based on proportions of missing values in the matrix where
4
+##' rows represent features and columns represent samples. Features can be
5
+##' removed based on missing values within a specific group or multiple groups.
6
+##' A feature will be retained, if there is at least one group with a proportion
7
+##' of non-missing values above a cut-off.
8
+##'
9
+##' @param x A matrix-like object.
10
+##' @param group A character vector for the information about each sample's
11
+##'     group.
12
+##' @param levels A string or character vector specifying one or more groups for
13
+##'     filter filtering based on missing values. If \code{NULL}, all group
14
+##'     levels in `group` will be used.
15
+##' @param cut A numeric value between 0 and 1 specifying a minimum proportion
16
+##'     of non-missing values to retain a feature.
17
+##' @return A matrix containing the filtered features.
18
+##'
19
+##' @seealso See [removeFeatures] that provides a
20
+##'   \linkS4class{SummarizedExperiment}-friendly wrapper for this function.
21
+##'
22
+##' @examples
23
+##'
24
+##' data(faahko_se)
25
+##' m <- assay(faahko_se, "raw")
26
+##' g <- colData(faahko_se)$sample_group
27
+##' table(g)
28
+##'
29
+##' ## Filter based on missing values in "KO" and "WT" groups
30
+##' removeMiss(m, group = g, cut = 0.9)
31
+##'
32
+##' ## Consider only "KO" group (can be useful for QC-based filtering)
33
+##' removeMiss(m, group = g, levels = "KO", cut = 0.9)
34
+##'
35
+##' @export
36
+removeMiss <- function(x, group, levels = NULL, cut = 0.7) {
37
+    idx_to_keep <- .removeMiss(x = x, group = group, levels = levels, cut = cut)
38
+    x[idx_to_keep, , drop = FALSE]
39
+}
40
+
41
+.removeMiss <- function(x, group, levels = NULL, cut = 0.7) {
42
+    if (is.null(levels)) {
43
+        levels <- unique(group)
44
+    }
45
+    if (!all(levels %in% group)) {
46
+        stop("All elements of `levels` should be a part of `group`")
47
+    }
48
+    idx_to_keep <- integer(0)
49
+    for (level in levels) {
50
+        m <- x[, level == group, drop = FALSE]
51
+        non_missing_frac <- rowSums(!is.na(m)) / ncol(m)
52
+        idx_to_keep <- union(idx_to_keep, which(non_missing_frac >= cut))
53
+    }
54
+    sort(idx_to_keep)
55
+}
56
+
57
+##' Feature Filtering based on RSD
58
+##'
59
+##' Removes Features with low reproducibility based on a relative standard
60
+##' deviation (also known as coefficient of variation) of QC samples using the
61
+##' data matrix where rows represent features and columns represent samples.
62
+##' Features with a RSD above a cut-off will be removed from the data.
63
+##'
64
+##' @param x A matrix-like object.
65
+##' @param qc_samples A vector of sample names or column indices specifying QC
66
+##'     samples for the calculation of RSD. Must be a subset of
67
+##'     \code{colnames(x)} if it is a character vector.
68
+##' @param cut A numeric value between specifying a RSD cut-off to retain a
69
+##'     feature.
70
+##' @return A matrix containing the filtered features.
71
+##'
72
+##' @seealso See [removeFeatures] that provides a
73
+##'   \linkS4class{SummarizedExperiment}-friendly wrapper for this function.
74
+##'
75
+##' @examples
76
+##'
77
+##' set.seed(1e7)
78
+##'
79
+##' m_bio <- matrix(rlnorm(800, sdlog = 1), ncol = 20)
80
+##' m_qc <- matrix(rlnorm(400, sdlog = 0.25), ncol = 10)
81
+##' m <- cbind(m_bio, m_qc)
82
+##' colnames(m) <- c(paste0("S", seq_len(20)), paste0("Q", seq_len(10)))
83
+##'
84
+##' removeRSD(m, qc_samples = paste0("Q", seq_len(10)))
85
+##'
86
+##' @export
87
+removeRSD <- function(x, qc_samples, cut = 0.3) {
88
+    idx_to_keep <- .removeRSD(x = x, qc_samples = qc_samples, cut = cut)
89
+    x[idx_to_keep, , drop = FALSE]
90
+}
91
+
92
+.removeRSD <- function(x, qc_samples, cut = 0.3) {
93
+    if (missing(qc_samples)) {
94
+        stop("Please specify QC samples.")
95
+    }
96
+    m <- x[, qc_samples, drop = FALSE]
97
+    rsd <- apply(m, 1, function(x) sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE))
98
+    which(rsd <= cut) # This drops rsd = NA
99
+}
100
+
101
+##' Feature Filtering based on ICC
102
+##'
103
+##' Removes Features based on a intraclass correlation coefficient (ICC) using
104
+##' the data matrix where rows represent features and columns represent samples.
105
+##' For each feature, ICC will be calculated using both biological and QC
106
+##' samples to identify how much of the total variation is explained by
107
+##' biological variability, as described in Schiffman, C et al (2019).
108
+##' Informative features are expected to have relatively high variability across
109
+##' the biological samples, compared to QC replicates. Features with an ICC
110
+##' below a cut-off will be removed.
111
+##'
112
+##' @param x A matrix-like object.
113
+##' @param qc_samples A vector of sample names or column indices specifying QC
114
+##'     samples for the calculation of ICC. Must be a subset of
115
+##'     \code{colnames(x)} if it is a character vector.
116
+##' @param bio_samples A vector of sample names or column indices specifying
117
+##'     biological samples for the calculation of ICC. Must be a subset of
118
+##'     \code{colnames(x)} if it is a character vector.
119
+##' @param cut A numeric value between 0 and 1 specifying a ICC cut-off to
120
+##'     retain a feature.
121
+##' @return A matrix containing the filtered features.
122
+##'
123
+##'
124
+##' @references
125
+##' Schiffman, C., Petrick, L., Perttula, K. et al. Filtering procedures for
126
+##' untargeted LC-MS metabolomics data. BMC Bioinformatics 20, 334 (2019).
127
+##' https://doi.org/10.1186/s12859-019-2871-9
128
+##'
129
+##' @seealso See [removeFeatures] that provides a
130
+##'   \linkS4class{SummarizedExperiment}-friendly wrapper for this function.
131
+##'
132
+##' @examples
133
+##'
134
+##' set.seed(1e7)
135
+##'
136
+##' m_bio_1 <- matrix(rlnorm(600, sdlog = 1), ncol = 20)
137
+##' m_bio_2 <- matrix(rlnorm(200, sdlog = 0.3), ncol = 20)
138
+##' m_bio <- rbind(m_bio_1, m_bio_2)
139
+##' m_qc <- matrix(rlnorm(400, sdlog = 0.25), ncol = 10)
140
+##' m <- cbind(m_bio, m_qc)
141
+##' colnames(m) <- c(paste0("S", seq_len(20)), paste0("Q", seq_len(10)))
142
+##'
143
+##' removeICC(m, qc_samples = paste0("Q", seq_len(10)),
144
+##'           bio_samples = paste0("S", seq_len(20)))
145
+##' @export
146
+removeICC <- function(x, qc_samples,
147
+                      bio_samples = setdiff(colnames(x), qc_samples),
148
+                      cut = 0.4) {
149
+    idx_to_keep <- .removeICC(x = x, qc_samples = qc_samples,
150
+                              bio_samples = bio_samples, cut = cut)
151
+    x[idx_to_keep, , drop = FALSE]
152
+}
153
+
154
+.removeICC <- function(x, qc_samples, bio_samples, cut = 0.4) {
155
+    .verify_package("nlme")
156
+    if (missing(qc_samples)) {
157
+        stop("Please specify QC samples.")
158
+    }
159
+    if (missing(bio_samples)) {
160
+        stop("Please specify biological samples.")
161
+    }
162
+    m <- x[, c(bio_samples, qc_samples)]
163
+    f <- factor(c(seq_len(length(bio_samples)), rep("Q", length(qc_samples))))
164
+    icc_vec <- numeric(0)
165
+    pb <- utils::txtProgressBar(1, nrow(x), style = 3)
166
+    message("Calculating ICC...")
167
+    for (i in seq_len(nrow(x))) {
168
+        d <- data.frame(y = m[i, ], f = f)
169
+        icc <- tryCatch({
170
+            fit <- nlme::lme(y ~ 1,  random = ~ 1 | f, data = d,
171
+                             na.action = na.omit)
172
+            vv <- as.numeric(nlme::VarCorr(fit)[c(1, 2)])
173
+            vv[1] / sum(vv)
174
+        },
175
+        error = function(e) NA_real_
176
+        )
177
+        icc_vec <- append(icc_vec, icc)
178
+        utils::setTxtProgressBar(pb, i)
179
+    }
180
+    close(pb)
181
+    which(icc_vec >= cut) # This drops icc = NA
182
+}
183
+
184
+##' Feature Filtering based on QC/blank ratio
185
+##'
186
+##' Removes Features with based on QC/blank ratios using the data matrix where
187
+##' rows represent features and columns represent samples. A feature will be
188
+##' retained if there are not enough blank samples to calculate an intensity
189
+##' ratio for a feature (or completely absent in blank samples). Use
190
+##' [removeMiss] to remove features based on a proportion of missing values.
191
+##' Features with a QC/blank ratio below a cut-off will be discarded.
192
+##'
193
+##' @param x A matrix-like object.
194
+##' @param blank_samples A vector of sample names or column indices specifying
195
+##'     blank samples for the calculation of ratio. Must be a subset of
196
+##'     \code{colnames(x)} if it is a character vector.
197
+##' @param qc_samples A vector of sample names or column indices specifying QC
198
+##'     samples for the calculation of ratio. Must be a subset of
199
+##'     \code{colnames(x)} if it is a character vector.
200
+##' @param cut A numeric value greater than 1 specifying a QC/blank ratio
201
+##'     cut-off to retain a feature.
202
+##' @param type A method to compute a QC/blank ratio. Either "median" or
203
+##'     "mean".
204
+##' @param blank_min_n An integer value specifying the minimum number of blank
205
+##'     samples to calculate a ratio.
206
+##' @return A matrix containing the filtered features.
207
+##'
208
+##' @seealso See [removeFeatures] that provides a
209
+##'   \linkS4class{SummarizedExperiment}-friendly wrapper for this function.
210
+##'
211
+##' @examples
212
+##' set.seed(1e7)
213
+##'
214
+##' m_blank <- matrix(rlnorm(200), ncol = 5)
215
+##' m_qc <- matrix(rlnorm(400, 1), ncol = 10)
216
+##' m <- cbind(m_blank, m_qc)
217
+##' colnames(m) <- c(paste0("B", seq_len(5)), paste0("Q", seq_len(10)))
218
+##'
219
+##' removeBlankRatio(m, blank_samples = paste0("B", seq_len(5)),
220
+##'                  qc_samples = paste0("Q", seq_len(10)))
221
+##'
222
+##' @export
223
+removeBlankRatio <- function(x, blank_samples, qc_samples, cut = 2,
224
+                             type = c("median", "mean"), blank_min_n = 3) {
225
+    idx_to_keep <- .removeBlankRatio(x = x, blank_samples = blank_samples,
226
+                                     qc_samples = qc_samples, cut = cut,
227
+                                     type = type, blank_min_n = blank_min_n)
228
+    x[idx_to_keep, , drop = FALSE]
229
+}
230
+
231
+.removeBlankRatio <- function(x, blank_samples, qc_samples, cut = 2,
232
+                              type = c("median", "mean"), blank_min_n = 3) {
233
+    type <- match.arg(type)
234
+    if (missing(qc_samples)) {
235
+        stop("Please specify QC samples.")
236
+    }
237
+    if (missing(blank_samples)) {
238
+        stop("Please specify Blank samples.")
239
+    }
240
+    m_qc <- x[, qc_samples, drop = FALSE]
241
+    m_blank <- x[, blank_samples, drop = FALSE]
242
+    if (type == "median") {
243
+        qc_summary <- apply(m_qc, 1, median, na.rm = TRUE)
244
+        blank_summary <- apply(m_blank, 1, median, na.rm = TRUE)
245
+    } else {
246
+        qc_summary <- rowMeans(m_qc, na.rm = TRUE)
247
+        mean_summary <- rowMeans(m_blank, na.rm = TRUE)
248
+    }
249
+    ## Get indices to drop according to QC/blank ratio
250
+    ## Trying to keep ratio = NA as we are uncertain about those features
251
+    idx_to_drop <- which(qc_summary / blank_summary < cut)
252
+    if (blank_min_n > 1) {
253
+        ## The number of non-missing blanks is smaller than the predefined
254
+        ## threshold: try not to remove those features as the calculated ratios
255
+        ## would be unreliable
256
+        blank_non_missing <- apply(m_blank, 1, function(x) sum(!is.na(x)))
257
+        idx_to_drop <- setdiff(
258
+            idx_to_drop, which(blank_non_missing < blank_min_n)
259
+        )
260
+    }
261
+    setdiff(seq_len(nrow(x)), idx_to_drop)
262
+}
263
+
264
+