R/clusterFeatures.R
92fb2760
 ##' Feature clustering
 ##'
2111ffeb
 ##' Function to cluster LC-MS features according to their retention time and
92fb2760
 ##' intensity correlation across samples with a
 ##' \linkS4class{SummarizedExperiment}.
 ##'
 ##' For soft ionization methods (e.g., LC/ESI-MS) commonly used in metabolomics,
 ##' one or more ions could be generated from an individual compound upon
 ##' ionization. The redundancy of feature data needs to be addressed since we
 ##' typically interested in compounds rather than different ion species. This
 ##' function attempts to identify a group of features from the same compound
 ##' with the following steps:
 ##'
 ##' 1. Features are grouped by their retention times to identify co-eluting
 ##' compounds.
 ##'
 ##' 2. For each retention time-based group, features are further clustered by
 ##' patterns of the intensity correlations across samples to identify a subset
 ##' of features from the same compound.
 ##'
2111ffeb
 ##' The retention time-based grouping is performed using either a hierarchical
92fb2760
 ##' clustering via [hclust] or the methods available in the \pkg{MsFeatures}
 ##' package via [MsFeatures::groupClosest] and [MsFeatures::groupConsecutive].
2111ffeb
 ##' For the \code{rt_grouping} = "hclust", by default, complete-linkage
 ##' clustering is conducted using the Manhattan distance (i.e., difference in
 ##' retention times) where the distance between two clusters is defined as the
 ##' difference in retention times between the farthest pair of elements in the
 ##' two clusters. Group memberships are assigned by specifying the cut height
 ##' for the distance metric. Other linkage methods can be specified with
 ##' \code{hclust_linkage}. Please refer to \code{?hclust} for details. For the
 ##' "closest" and "consecutive", please refer to
92fb2760
 ##' \code{?MsFeatures::groupClosest} and \code{?MsFeatures::groupConsecutive}
2111ffeb
 ##' for the details of algorithms.
 ##'
 ##' For the correlation-based grouping, \code{cor_grouping} = "connected"
 ##' creates a undirected graph using feature correlations as an adjacency matrix
 ##' (i.e., correlations serve as edge weights). The edges whose weights are
 ##' below the cut-off specified by `cor_cut` will be removed from the graph,
 ##' separating features into several disconnected subgroups. Features in the
 ##' same subgroup will be assigned to the same feature cluster. For the
 ##' "louvain", the function further applies the Louvain algorithm to the graph
 ##' in order to identify densely connected features via
 ##' [igraph::cluster_louvain]. For the "SimilarityMatrix",
 ##' [MsFeatures::groupSimilarityMatrix] is used for feature grouping. Please
 ##' refer to \code{?MsFeatures::groupSimilarityMatrix} for the details of
 ##' algorithm.
92fb2760
 ##'
 ##' @param x A \linkS4class{SummarizedExperiment} object.
2111ffeb
 ##' @param i A string or integer value specifying which assay values to use.
da9a8dad
 ##' @param rtime_var A string specifying the name of variable containing a
92fb2760
 ##'   numeric vector of retention times in `rowData(x)`.
 ##' @param rt_cut A numeric value specifying a cut-off for the retention-time
 ##'   based feature grouping.
 ##' @param cor_cut A numeric value specifying a cut-off for the
 ##'   correlation-based feature grouping.
 ##' @param rt_grouping A string specifying which method to use for the
 ##'   retention-time based feature grouping.
 ##' @param cor_grouping A string specifying which method to use for the
 ##'   correlation-based feature grouping.
 ##' @param cor_use A string specifying which method to compute correlations in
 ##'   the presence of missing values. Refer to \code{?cor} for details.
 ##' @param cor_method A string specifying which correlation coefficient is to be
 ##'   computed. See \code{?cor} for details.
2111ffeb
 ##' @param log2 A logical specifying whether feature intensities need to be
92fb2760
 ##'   log2-transformed before calculating a correlation matrix.
2111ffeb
 ##' @param hclust_linkage A string specifying the linkage method to be used when
 ##'   \code{rt_grouping} is "hclust".
92fb2760
 ##' @return A \linkS4class{SummarizedExperiment} object with the grouping
 ##'   results added to columns "rtime_group" (initial grouping on retention
 ##'   times) and "feature_group" in its `rowData`.
 ##'
 ##' @references
 ##'
 ##' Johannes Rainer (2022). MsFeatures: Functionality for Mass Spectrometry
 ##' Features. R package version 1.3.0.
 ##' 'https://github.com/RforMassSpectrometry/MsFeatures
 ##'
 ##' Vincent D. Blondel, Jean-Loup Guillaume, Renaud Lambiotte, Etienne Lefebvre:
 ##' Fast unfolding of communities in large networks. J. Stat. Mech. (2008)
 ##' P10008
 ##'
 ##' Csardi G, Nepusz T: The igraph software package for complex network
 ##' research, InterJournal, Complex Systems 1695. 2006. https://igraph.org
 ##'
 ##' @seealso
 ##'
 ##' See [hclust], [cutree], [MsFeatures::groupClosest],
 ##' [MsFeatures::groupConsecutive], [MsFeatures::groupSimilarityMatrix], and
 ##' [igraph::cluster_louvain] for the underlying functions that do work.
2111ffeb
 ##'
 ##' See [plotRTgroup] to visualize the grouping result.
 ##'
 ##' @examples
193dc8b5
 ##'
 ##' data(faahko_se)
 ##'
2111ffeb
 ##' se <- clusterFeatures(faahko_se, i = "knn_vsn", rtime_var = "rtmed")
 ##' rowData(se)[, c("rtmed", "rtime_group", "feature_group")]
193dc8b5
 ##'
92fb2760
 ##' @export
2111ffeb
 clusterFeatures <- function(x, i, rtime_var = "rtime",
92fb2760
                             rt_cut = 10, cor_cut = 0.7,
                             rt_grouping = c("hclust", "closest", "consecutive"),
2111ffeb
                             cor_grouping = c("louvain", "SimilarityMatrix",
                                              "connected", "none"),
92fb2760
                             cor_use = c("everything", "all.obs",
                                         "complete.obs", "na.or.complete",
                                         "pairwise.complete.obs"),
                             cor_method = c("pearson", "kendall", "spearman"),
2111ffeb
                             log2 = FALSE,
                             hclust_linkage = "complete") {
     if (!is(x, "SummarizedExperiment")) {
         stop("`x` must be a SummarizedExperiment object.")
92fb2760
     }
2111ffeb
     if (!any(colnames(rowData(x)) == rtime_var)) {
         stop("Variable `", rtime_var, "` is not found in `rowData(x)`." )
92fb2760
     }
2111ffeb
     rt_grouping <- match.arg(rt_grouping)
     cor_grouping <- match.arg(cor_grouping)
     rts <- rowData(x)[, rtime_var]
     rt_clus <- .groupRT(rts, method = rt_grouping, cut = rt_cut,
                         hclust_linkage = hclust_linkage)
     rt_clus <- .format_padding(rt_clus)
     rtime_group <- factor(paste0("FG.", rt_clus),
                           levels = unique(paste0("FG.", rt_clus)))
     rowData(x)$rtime_group <- rtime_group
92fb2760
 
2111ffeb
     if (cor_grouping == "none") {
         x
     } else {
         m <- assay(x, i)
         if (log2) {
             m <- log2(m)
         }
         ml <- split.data.frame(m, rtime_group)
         res <- lapply(ml, function(x) {
             .groupCorr(x, use = cor_use, method = cor_method,
                        type = cor_grouping, cut = cor_cut)
         })
         cor_clus <- unsplit(res, f = rtime_group)
         cor_clus <- .format_padding(cor_clus)
         rowData(x)$feature_group <- paste(rtime_group, cor_clus, sep = ".")
         x
92fb2760
     }
 }
 
 
2111ffeb
 .groupRT <- function(x, method = c("hclust", "closest", "consecutive"),
                      cut = 10, hclust_linkage = "complete") {
     method <- match.arg(method)
     if (anyNA(x)) {
         stop("`x` must have no missing values.")
92fb2760
     }
2111ffeb
     switch(
         method,
         hclust = cutree(hclust(dist(x, "manhattan"), hclust_linkage), h = cut),
         closest = {
             .verify_package("MsFeatures")
             clus <- MsFeatures::groupClosest(x, maxDiff = cut)
             .relabel(clus)
         },
         consecutive = {
             .verify_package("MsFeatures")
             clus <- MsFeatures::groupConsecutive(x, maxDiff = cut)
             .relabel(clus)
         }
     )
92fb2760
 }
 
2111ffeb
 .groupCorr <- function(x, use = c("everything", "all.obs",
                                   "complete.obs", "na.or.complete",
                                   "pairwise.complete.obs"),
                        method = c("pearson", "kendall", "spearman"),
                        type = c("louvain", "connected", "SimilarityMatrix"),
                        cut = 0.7
                        ) {
     use <- match.arg(use)
     ## method <- match.arg(method) ## handled with cor function
     type <- match.arg(type)
92fb2760
 
2111ffeb
     ## Calculate a correlation matrix
     m <- cor(t(x), use = use, method = method)
     if (anyNA(m)) {
         stop("A correlation matrix contains NA. ",
              "Please check `x` and consider imputation.")
92fb2760
     }
2111ffeb
 
     switch(
         type,
         connected = {
             g <- igraph::graph_from_adjacency_matrix(
                              m, mode = "lower", weighted = TRUE, diag = FALSE
                          )
             g <- igraph::delete_edges(g, which(igraph::E(g)$weight < cut))
             igraph::components(g)$membership
         },
         louvain = {
             g <- igraph::graph_from_adjacency_matrix(
                              m, mode = "lower", weighted = TRUE, diag = FALSE
                          )
             g <- igraph::delete_edges(g, which(igraph::E(g)$weight < cut))
             res <- igraph::cluster_louvain(g)
             igraph::membership(res)
         },
         SimilarityMatrix = {
             res <- MsFeatures::groupSimilarityMatrix(m, threshold = cut)
             .relabel(res)
         }
     )
92fb2760
 }
 
 .relabel <- function(x) {
2111ffeb
     v <- rle(x)
     newc <- as.integer(factor(v$values, levels = unique(v$values)))
     rep(newc, times = v$lengths)
92fb2760
 }
 
 .format_padding <- function(x) {
2111ffeb
     padding_width <- nchar(max(x))
     x <- formatC(x, width = padding_width, format = "d", flag = 0)
92fb2760
 }