... | ... |
@@ -169,7 +169,7 @@ removeICC <- function(x, qc_samples, |
169 | 169 |
icc <- tryCatch({ |
170 | 170 |
fit <- nlme::lme(y ~ 1, random = ~ 1 | f, data = d, |
171 | 171 |
na.action = na.omit) |
172 |
- vv <- as.numeric(nlme::VarCorr(fit)[c(1, 2)]) |
|
172 |
+ vv <- as.numeric(nlme::VarCorr(fit)[, "Variance"]) |
|
173 | 173 |
vv[1] / sum(vv) |
174 | 174 |
}, |
175 | 175 |
error = function(e) NA_real_ |
... | ... |
@@ -235,7 +235,7 @@ removeBlankRatio <- function(x, blank_samples, qc_samples, cut = 2, |
235 | 235 |
stop("Please specify QC samples.") |
236 | 236 |
} |
237 | 237 |
if (missing(blank_samples)) { |
238 |
- stop("Please specify Blank samples.") |
|
238 |
+ stop("Please specify blank samples.") |
|
239 | 239 |
} |
240 | 240 |
m_qc <- x[, qc_samples, drop = FALSE] |
241 | 241 |
m_blank <- x[, blank_samples, drop = FALSE] |
1 | 1 |
new file mode 100644 |
... | ... |
@@ -0,0 +1,264 @@ |
1 |
+##' Feature filtering based on proportions of missing values |
|
2 |
+##' |
|
3 |
+##' Removes Features based on proportions of missing values in the matrix where |
|
4 |
+##' rows represent features and columns represent samples. Features can be |
|
5 |
+##' removed based on missing values within a specific group or multiple groups. |
|
6 |
+##' A feature will be retained, if there is at least one group with a proportion |
|
7 |
+##' of non-missing values above a cut-off. |
|
8 |
+##' |
|
9 |
+##' @param x A matrix-like object. |
|
10 |
+##' @param group A character vector for the information about each sample's |
|
11 |
+##' group. |
|
12 |
+##' @param levels A string or character vector specifying one or more groups for |
|
13 |
+##' filter filtering based on missing values. If \code{NULL}, all group |
|
14 |
+##' levels in `group` will be used. |
|
15 |
+##' @param cut A numeric value between 0 and 1 specifying a minimum proportion |
|
16 |
+##' of non-missing values to retain a feature. |
|
17 |
+##' @return A matrix containing the filtered features. |
|
18 |
+##' |
|
19 |
+##' @seealso See [removeFeatures] that provides a |
|
20 |
+##' \linkS4class{SummarizedExperiment}-friendly wrapper for this function. |
|
21 |
+##' |
|
22 |
+##' @examples |
|
23 |
+##' |
|
24 |
+##' data(faahko_se) |
|
25 |
+##' m <- assay(faahko_se, "raw") |
|
26 |
+##' g <- colData(faahko_se)$sample_group |
|
27 |
+##' table(g) |
|
28 |
+##' |
|
29 |
+##' ## Filter based on missing values in "KO" and "WT" groups |
|
30 |
+##' removeMiss(m, group = g, cut = 0.9) |
|
31 |
+##' |
|
32 |
+##' ## Consider only "KO" group (can be useful for QC-based filtering) |
|
33 |
+##' removeMiss(m, group = g, levels = "KO", cut = 0.9) |
|
34 |
+##' |
|
35 |
+##' @export |
|
36 |
+removeMiss <- function(x, group, levels = NULL, cut = 0.7) { |
|
37 |
+ idx_to_keep <- .removeMiss(x = x, group = group, levels = levels, cut = cut) |
|
38 |
+ x[idx_to_keep, , drop = FALSE] |
|
39 |
+} |
|
40 |
+ |
|
41 |
+.removeMiss <- function(x, group, levels = NULL, cut = 0.7) { |
|
42 |
+ if (is.null(levels)) { |
|
43 |
+ levels <- unique(group) |
|
44 |
+ } |
|
45 |
+ if (!all(levels %in% group)) { |
|
46 |
+ stop("All elements of `levels` should be a part of `group`") |
|
47 |
+ } |
|
48 |
+ idx_to_keep <- integer(0) |
|
49 |
+ for (level in levels) { |
|
50 |
+ m <- x[, level == group, drop = FALSE] |
|
51 |
+ non_missing_frac <- rowSums(!is.na(m)) / ncol(m) |
|
52 |
+ idx_to_keep <- union(idx_to_keep, which(non_missing_frac >= cut)) |
|
53 |
+ } |
|
54 |
+ sort(idx_to_keep) |
|
55 |
+} |
|
56 |
+ |
|
57 |
+##' Feature Filtering based on RSD |
|
58 |
+##' |
|
59 |
+##' Removes Features with low reproducibility based on a relative standard |
|
60 |
+##' deviation (also known as coefficient of variation) of QC samples using the |
|
61 |
+##' data matrix where rows represent features and columns represent samples. |
|
62 |
+##' Features with a RSD above a cut-off will be removed from the data. |
|
63 |
+##' |
|
64 |
+##' @param x A matrix-like object. |
|
65 |
+##' @param qc_samples A vector of sample names or column indices specifying QC |
|
66 |
+##' samples for the calculation of RSD. Must be a subset of |
|
67 |
+##' \code{colnames(x)} if it is a character vector. |
|
68 |
+##' @param cut A numeric value between specifying a RSD cut-off to retain a |
|
69 |
+##' feature. |
|
70 |
+##' @return A matrix containing the filtered features. |
|
71 |
+##' |
|
72 |
+##' @seealso See [removeFeatures] that provides a |
|
73 |
+##' \linkS4class{SummarizedExperiment}-friendly wrapper for this function. |
|
74 |
+##' |
|
75 |
+##' @examples |
|
76 |
+##' |
|
77 |
+##' set.seed(1e7) |
|
78 |
+##' |
|
79 |
+##' m_bio <- matrix(rlnorm(800, sdlog = 1), ncol = 20) |
|
80 |
+##' m_qc <- matrix(rlnorm(400, sdlog = 0.25), ncol = 10) |
|
81 |
+##' m <- cbind(m_bio, m_qc) |
|
82 |
+##' colnames(m) <- c(paste0("S", seq_len(20)), paste0("Q", seq_len(10))) |
|
83 |
+##' |
|
84 |
+##' removeRSD(m, qc_samples = paste0("Q", seq_len(10))) |
|
85 |
+##' |
|
86 |
+##' @export |
|
87 |
+removeRSD <- function(x, qc_samples, cut = 0.3) { |
|
88 |
+ idx_to_keep <- .removeRSD(x = x, qc_samples = qc_samples, cut = cut) |
|
89 |
+ x[idx_to_keep, , drop = FALSE] |
|
90 |
+} |
|
91 |
+ |
|
92 |
+.removeRSD <- function(x, qc_samples, cut = 0.3) { |
|
93 |
+ if (missing(qc_samples)) { |
|
94 |
+ stop("Please specify QC samples.") |
|
95 |
+ } |
|
96 |
+ m <- x[, qc_samples, drop = FALSE] |
|
97 |
+ rsd <- apply(m, 1, function(x) sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)) |
|
98 |
+ which(rsd <= cut) # This drops rsd = NA |
|
99 |
+} |
|
100 |
+ |
|
101 |
+##' Feature Filtering based on ICC |
|
102 |
+##' |
|
103 |
+##' Removes Features based on a intraclass correlation coefficient (ICC) using |
|
104 |
+##' the data matrix where rows represent features and columns represent samples. |
|
105 |
+##' For each feature, ICC will be calculated using both biological and QC |
|
106 |
+##' samples to identify how much of the total variation is explained by |
|
107 |
+##' biological variability, as described in Schiffman, C et al (2019). |
|
108 |
+##' Informative features are expected to have relatively high variability across |
|
109 |
+##' the biological samples, compared to QC replicates. Features with an ICC |
|
110 |
+##' below a cut-off will be removed. |
|
111 |
+##' |
|
112 |
+##' @param x A matrix-like object. |
|
113 |
+##' @param qc_samples A vector of sample names or column indices specifying QC |
|
114 |
+##' samples for the calculation of ICC. Must be a subset of |
|
115 |
+##' \code{colnames(x)} if it is a character vector. |
|
116 |
+##' @param bio_samples A vector of sample names or column indices specifying |
|
117 |
+##' biological samples for the calculation of ICC. Must be a subset of |
|
118 |
+##' \code{colnames(x)} if it is a character vector. |
|
119 |
+##' @param cut A numeric value between 0 and 1 specifying a ICC cut-off to |
|
120 |
+##' retain a feature. |
|
121 |
+##' @return A matrix containing the filtered features. |
|
122 |
+##' |
|
123 |
+##' |
|
124 |
+##' @references |
|
125 |
+##' Schiffman, C., Petrick, L., Perttula, K. et al. Filtering procedures for |
|
126 |
+##' untargeted LC-MS metabolomics data. BMC Bioinformatics 20, 334 (2019). |
|
127 |
+##' https://doi.org/10.1186/s12859-019-2871-9 |
|
128 |
+##' |
|
129 |
+##' @seealso See [removeFeatures] that provides a |
|
130 |
+##' \linkS4class{SummarizedExperiment}-friendly wrapper for this function. |
|
131 |
+##' |
|
132 |
+##' @examples |
|
133 |
+##' |
|
134 |
+##' set.seed(1e7) |
|
135 |
+##' |
|
136 |
+##' m_bio_1 <- matrix(rlnorm(600, sdlog = 1), ncol = 20) |
|
137 |
+##' m_bio_2 <- matrix(rlnorm(200, sdlog = 0.3), ncol = 20) |
|
138 |
+##' m_bio <- rbind(m_bio_1, m_bio_2) |
|
139 |
+##' m_qc <- matrix(rlnorm(400, sdlog = 0.25), ncol = 10) |
|
140 |
+##' m <- cbind(m_bio, m_qc) |
|
141 |
+##' colnames(m) <- c(paste0("S", seq_len(20)), paste0("Q", seq_len(10))) |
|
142 |
+##' |
|
143 |
+##' removeICC(m, qc_samples = paste0("Q", seq_len(10)), |
|
144 |
+##' bio_samples = paste0("S", seq_len(20))) |
|
145 |
+##' @export |
|
146 |
+removeICC <- function(x, qc_samples, |
|
147 |
+ bio_samples = setdiff(colnames(x), qc_samples), |
|
148 |
+ cut = 0.4) { |
|
149 |
+ idx_to_keep <- .removeICC(x = x, qc_samples = qc_samples, |
|
150 |
+ bio_samples = bio_samples, cut = cut) |
|
151 |
+ x[idx_to_keep, , drop = FALSE] |
|
152 |
+} |
|
153 |
+ |
|
154 |
+.removeICC <- function(x, qc_samples, bio_samples, cut = 0.4) { |
|
155 |
+ .verify_package("nlme") |
|
156 |
+ if (missing(qc_samples)) { |
|
157 |
+ stop("Please specify QC samples.") |
|
158 |
+ } |
|
159 |
+ if (missing(bio_samples)) { |
|
160 |
+ stop("Please specify biological samples.") |
|
161 |
+ } |
|
162 |
+ m <- x[, c(bio_samples, qc_samples)] |
|
163 |
+ f <- factor(c(seq_len(length(bio_samples)), rep("Q", length(qc_samples)))) |
|
164 |
+ icc_vec <- numeric(0) |
|
165 |
+ pb <- utils::txtProgressBar(1, nrow(x), style = 3) |
|
166 |
+ message("Calculating ICC...") |
|
167 |
+ for (i in seq_len(nrow(x))) { |
|
168 |
+ d <- data.frame(y = m[i, ], f = f) |
|
169 |
+ icc <- tryCatch({ |
|
170 |
+ fit <- nlme::lme(y ~ 1, random = ~ 1 | f, data = d, |
|
171 |
+ na.action = na.omit) |
|
172 |
+ vv <- as.numeric(nlme::VarCorr(fit)[c(1, 2)]) |
|
173 |
+ vv[1] / sum(vv) |
|
174 |
+ }, |
|
175 |
+ error = function(e) NA_real_ |
|
176 |
+ ) |
|
177 |
+ icc_vec <- append(icc_vec, icc) |
|
178 |
+ utils::setTxtProgressBar(pb, i) |
|
179 |
+ } |
|
180 |
+ close(pb) |
|
181 |
+ which(icc_vec >= cut) # This drops icc = NA |
|
182 |
+} |
|
183 |
+ |
|
184 |
+##' Feature Filtering based on QC/blank ratio |
|
185 |
+##' |
|
186 |
+##' Removes Features with based on QC/blank ratios using the data matrix where |
|
187 |
+##' rows represent features and columns represent samples. A feature will be |
|
188 |
+##' retained if there are not enough blank samples to calculate an intensity |
|
189 |
+##' ratio for a feature (or completely absent in blank samples). Use |
|
190 |
+##' [removeMiss] to remove features based on a proportion of missing values. |
|
191 |
+##' Features with a QC/blank ratio below a cut-off will be discarded. |
|
192 |
+##' |
|
193 |
+##' @param x A matrix-like object. |
|
194 |
+##' @param blank_samples A vector of sample names or column indices specifying |
|
195 |
+##' blank samples for the calculation of ratio. Must be a subset of |
|
196 |
+##' \code{colnames(x)} if it is a character vector. |
|
197 |
+##' @param qc_samples A vector of sample names or column indices specifying QC |
|
198 |
+##' samples for the calculation of ratio. Must be a subset of |
|
199 |
+##' \code{colnames(x)} if it is a character vector. |
|
200 |
+##' @param cut A numeric value greater than 1 specifying a QC/blank ratio |
|
201 |
+##' cut-off to retain a feature. |
|
202 |
+##' @param type A method to compute a QC/blank ratio. Either "median" or |
|
203 |
+##' "mean". |
|
204 |
+##' @param blank_min_n An integer value specifying the minimum number of blank |
|
205 |
+##' samples to calculate a ratio. |
|
206 |
+##' @return A matrix containing the filtered features. |
|
207 |
+##' |
|
208 |
+##' @seealso See [removeFeatures] that provides a |
|
209 |
+##' \linkS4class{SummarizedExperiment}-friendly wrapper for this function. |
|
210 |
+##' |
|
211 |
+##' @examples |
|
212 |
+##' set.seed(1e7) |
|
213 |
+##' |
|
214 |
+##' m_blank <- matrix(rlnorm(200), ncol = 5) |
|
215 |
+##' m_qc <- matrix(rlnorm(400, 1), ncol = 10) |
|
216 |
+##' m <- cbind(m_blank, m_qc) |
|
217 |
+##' colnames(m) <- c(paste0("B", seq_len(5)), paste0("Q", seq_len(10))) |
|
218 |
+##' |
|
219 |
+##' removeBlankRatio(m, blank_samples = paste0("B", seq_len(5)), |
|
220 |
+##' qc_samples = paste0("Q", seq_len(10))) |
|
221 |
+##' |
|
222 |
+##' @export |
|
223 |
+removeBlankRatio <- function(x, blank_samples, qc_samples, cut = 2, |
|
224 |
+ type = c("median", "mean"), blank_min_n = 3) { |
|
225 |
+ idx_to_keep <- .removeBlankRatio(x = x, blank_samples = blank_samples, |
|
226 |
+ qc_samples = qc_samples, cut = cut, |
|
227 |
+ type = type, blank_min_n = blank_min_n) |
|
228 |
+ x[idx_to_keep, , drop = FALSE] |
|
229 |
+} |
|
230 |
+ |
|
231 |
+.removeBlankRatio <- function(x, blank_samples, qc_samples, cut = 2, |
|
232 |
+ type = c("median", "mean"), blank_min_n = 3) { |
|
233 |
+ type <- match.arg(type) |
|
234 |
+ if (missing(qc_samples)) { |
|
235 |
+ stop("Please specify QC samples.") |
|
236 |
+ } |
|
237 |
+ if (missing(blank_samples)) { |
|
238 |
+ stop("Please specify Blank samples.") |
|
239 |
+ } |
|
240 |
+ m_qc <- x[, qc_samples, drop = FALSE] |
|
241 |
+ m_blank <- x[, blank_samples, drop = FALSE] |
|
242 |
+ if (type == "median") { |
|
243 |
+ qc_summary <- apply(m_qc, 1, median, na.rm = TRUE) |
|
244 |
+ blank_summary <- apply(m_blank, 1, median, na.rm = TRUE) |
|
245 |
+ } else { |
|
246 |
+ qc_summary <- rowMeans(m_qc, na.rm = TRUE) |
|
247 |
+ mean_summary <- rowMeans(m_blank, na.rm = TRUE) |
|
248 |
+ } |
|
249 |
+ ## Get indices to drop according to QC/blank ratio |
|
250 |
+ ## Trying to keep ratio = NA as we are uncertain about those features |
|
251 |
+ idx_to_drop <- which(qc_summary / blank_summary < cut) |
|
252 |
+ if (blank_min_n > 1) { |
|
253 |
+ ## The number of non-missing blanks is smaller than the predefined |
|
254 |
+ ## threshold: try not to remove those features as the calculated ratios |
|
255 |
+ ## would be unreliable |
|
256 |
+ blank_non_missing <- apply(m_blank, 1, function(x) sum(!is.na(x))) |
|
257 |
+ idx_to_drop <- setdiff( |
|
258 |
+ idx_to_drop, which(blank_non_missing < blank_min_n) |
|
259 |
+ ) |
|
260 |
+ } |
|
261 |
+ setdiff(seq_len(nrow(x)), idx_to_drop) |
|
262 |
+} |
|
263 |
+ |
|
264 |
+ |