Bioconductor Code: vsclust

Raw Blame Patch Log History
#' 
#' Xie Beni Index of clustering object
#'
#' Calculate the Xie Beni index for validity of the cluster number in clustering 
#' results from running fuzzy c-means or vsclust
#' original publication:
#'
#' @references Xie X.L., Beni G. (1991). A validity measure for fuzzy 
#' clustering, IEEE Transactions on Pattern Analysis and Machine Intelligence, 
#' 13, 841-847.
#' @param clres Output from clustering. Either fclust object or list containing 
#' the objects for `membership` and cluster `centers`
#' @param m Fuzzifier value
#' @return Xie Beni index
#' @examples
#' # Generate some random data
#' data <- matrix(rnorm(seq_len(1000)), nrow=100)
#' # Run clustering
#' clres <- vsclust_algorithm(data, centers=5, m=1.5)
#' # Calculate Xie-Beni index from results
#' cvalidate.xiebeni(clres, 1.5)
#' @export
cvalidate.xiebeni <-
  function(clres, m) {
    xrows <-
      dim(clres$me)[1]
    minimum <-
      -1
    error <-
      clres$within
    ncenters <-
      dim(clres$centers)[1]
    for (i in seq_len(ncenters - 1)) {
      for (j in seq(i + 1,ncenters, 1)) {
        diff <- clres$ce[i,] - clres$ce[j,]
        diffdist <-
          t(diff) %*% t(t(diff))
        if (minimum == -1)
          minimum <-
          diffdist
        if (diffdist < minimum)
          minimum <-
          diffdist
      }
    }
    xiebeni <-
      error / (xrows * minimum)
    return(xiebeni)
  }


#' Plotting vsclust results
#'
#' This function visualizes the clustered quantitative profiles in multiple 
#' figure panels. The parameters allow specifying the main items like axes 
#' labels and color maps. The code is adopted from the MFuzz package.
#'
#' @param dat a numeric data matrix containing the values used in the clustering
#' @param cl clustering results from vsclust_algorithm or Bestcl object from 
#' clustComp function
#' @param mfrow vector of two numbers for the number of rows and colums, figure 
#' panels are distributed in the plot
#' @param colo color map to be used (can be missing)
#' @param minMem filter for showing only features with a higher membership 
#' values than this value
#' @param timeLabels alternative labels for different conditions
#' @param filename for writing into pdf. Will write on screen when using NA
#' @param xlab Label of x-axis
#' @param ylab Label of y-axis
#' @examples
#' #' # Generate some random data
#' data <- matrix(rnorm(seq_len(5000)), nrow=500)
#' # Run clustering
#' clres <- vsclust_algorithm(data, centers=2, m=1.5)
#' mfuzz.plot(data, clres,  mfrow=c(2,3), minMem=0.0)
#' @return Multiple panels showing expression profiles of clustered features 
#' passing the minMem threshold
#' @export
#' @references
#' Schwaemmle V, Jensen ON. VSClust: feature-based variance-sensitive clustering 
#' of omics data. Bioinformatics. 2018 Sep 1;34(17):2965-2972. doi: 
#' 10.1093/bioinformatics/bty224. PMID: 29635359.
#'
#' Schwaemmle V, Hagensen CE. A Tutorial for Variance-Sensitive Clustering and 
#' the Quantitative Analysis of Protein Complexes. Methods Mol Biol. 
#' 2021;2228:433-451. doi: 10.1007/978-1-0716-1024-4_30. PMID: 33950508.
#'
#' Schwaemmle V, Jensen ON. A simple and fast method to determine the parameters 
#' for fuzzy c-means cluster analysis. Bioinformatics. 
#' 2010 Nov 15;26(22):2841-8. doi: 10.1093/bioinformatics/btq534. Epub 2010 Sep 
#' 29. PMID: 20880957.

mfuzz.plot <-
  function (dat,
            cl,
            mfrow = c(1, 1),
            colo,
            minMem = 0,
            timeLabels,
            filename = NA,
            xlab = "Time",
            ylab = "Expression changes")
  {
    clusterindex <- cl[[3]]
    memship <- cl[[4]]
    memship[memship < minMem] <- -1
    colorindex <- integer(dim(dat)[[1]])
    if (missing(colo)) {
      colo <- c(
        "#FF8F00",
        "#FFA700",
        "#FFBF00",
        "#FFD700",
        "#FFEF00",
        "#F7FF00",
        "#DFFF00",
        "#C7FF00",
        "#AFFF00",
        "#97FF00",
        "#80FF00",
        "#68FF00",
        "#50FF00",
        "#38FF00",
        "#20FF00",
        "#08FF00",
        "#00FF10",
        "#00FF28",
        "#00FF40",
        "#00FF58",
        "#00FF70",
        "#00FF87",
        "#00FF9F",
        "#00FFB7",
        "#00FFCF",
        "#00FFE7",
        "#00FFFF",
        "#00E7FF",
        "#00CFFF",
        "#00B7FF",
        "#009FFF",
        "#0087FF",
        "#0070FF",
        "#0058FF",
        "#0040FF",
        "#0028FF",
        "#0010FF",
        "#0800FF",
        "#2000FF",
        "#3800FF",
        "#5000FF",
        "#6800FF",
        "#8000FF",
        "#9700FF",
        "#AF00FF",
        "#C700FF",
        "#DF00FF",
        "#F700FF",
        "#FF00EF",
        "#FF00D7",
        "#FF00BF",
        "#FF00A7",
        "#FF008F",
        "#FF0078",
        "#FF0060",
        "#FF0048",
        "#FF0030",
        "#FF0018"
      )
    }    else {
      if (colo == "fancy") {
        fancyBlue <- c(c(255:0), rep(0, length(c(255:0))),
                        rep(0, length(c(255:150))))
        fanceGreen <-
          c(c(0:255), c(255:0), rep(0, length(c(255:150))))
        fancyRed <- c(c(0:255), rep(255, length(c(255:0))),
                       c(255:150))
        colo <- rgb(blue = fancyBlue / 255,
                    green = fanceGreen / 255,
                    red = fancyRed / 255)
      }
    }
    colorseq <- seq(0, 1, length = length(colo))
    for (j in seq_len(max(clusterindex))) {
      if (sum(clusterindex == j) > 0) {
        tmp <- dat[clusterindex == j,]
        tmpmem <- memship[clusterindex == j, j]
        if (((j - 1) %% (mfrow[1] * mfrow[2])) == 0) {
          if (!is.na(filename)) {
            pdf(filename,
                height = 3 * mfrow[1],
                width = 3 * mfrow[2])
          }
          par(mfrow = mfrow, cex = 0.5)
          if (sum(clusterindex == j) == 0) {
            ymin <- -1
            ymax <- +1
          }
          else {
            ymin <- min(tmp, na.rm = TRUE)
            ymax <- max(tmp, na.rm = TRUE)
          }
          plot.default(
            x = NA,
            xlim = c(1, dim(dat)[[2]]),
            ylim = c(ymin, ymax),
            xlab = xlab,
            ylab = ylab,
            main = paste("Cluster", j),
            axes = FALSE
          )
          if (missing(timeLabels)) {
            axis(1, seq_len(dim(dat)[[2]]), c(seq_len(dim(dat)[[2]])))
            axis(2)
          }
          else {
            axis(1, seq_len(dim(dat)[[2]]), timeLabels)
            axis(2)
          }
        }
        else {
          if (sum(clusterindex == j) == 0) {
            ymin <- -1
            ymax <- +1
          }
          else {
            ymin <- min(tmp, na.rm = TRUE)
            ymax <- max(tmp, na.rm = TRUE)
          }
          plot.default(
            x = NA,
            xlim = c(1, dim(dat)[[2]]),
            ylim = c(ymin, ymax),
            xlab = xlab,
            ylab = ylab,
            main = paste("Cluster", j),
            axes = FALSE
          )
          if (missing(timeLabels)) {
            axis(1, seq_len(dim(dat)[[2]]), seq_len(dim(dat)[[2]]))
            axis(2)
          }
          else {
            axis(1, seq_len(dim(dat)[[2]]), timeLabels)
            axis(2)
          }
        }
        if (!(sum(clusterindex == j) == 0)) {
          for (jj in seq_len(length(colorseq) - 1)) {
            tmpcol <- (tmpmem >= colorseq[jj] & tmpmem <=
                         colorseq[jj + 1])
            
            if (sum(tmpcol, na.rm = TRUE) > 0) {
              tmpind <- which(tmpcol)
              for (k in seq_len(length(tmpind))) {
                lines(tmp[tmpind[k],], col = colo[jj])
              }
            }
          }
        }
      }
    }
    if (!is.na(filename))
      dev.off()
  }

#' Plotting results from estimating the cluster number
#'
#' This function visualizes the output from estimClustNumber, and there 
#' particularly the
#' two validity indices Minimum Centroid Distance and Xie Beni Index.
#'
#' @param ClustInd Matrix with values from validity indices
#' @return Multiple panels showing expression profiles of clustered features 
#' passing the minMem threshold
#' @examples
#' data("artificial_clusters")
#' dat <- averageCond(artificial_clusters, 5, 10)
#' dat <- scale(dat)
#' dat <- cbind(dat, 1)
#' ClustInd <- estimClustNum(dat, 6)
#' estimClust.plot(ClustInd)
#' @export
#' @references
#' Schwaemmle V, Jensen ON. VSClust: feature-based variance-sensitive clustering 
#' of omics data. Bioinformatics. 2018 Sep 1;34(17):2965-2972. doi: 
#' 10.1093/bioinformatics/bty224. PMID: 29635359.
#'
#' Schwaemmle V, Hagensen CE. A Tutorial for Variance-Sensitive Clustering and 
#' the Quantitative Analysis of Protein Complexes. Methods Mol Biol. 
#' '2021;2228:433-451. doi: 10.1007/978-1-0716-1024-4_30. PMID: 33950508.
#'
#' Schwaemmle V, Jensen ON. A simple and fast method to determine the parameters 
#' for fuzzy c-means cluster analysis. Bioinformatics. 2010 
#' Nov 15;26(22):2841-8. doi: 10.1093/bioinformatics/btq534. Epub 2010 Sep 29. 
#' PMID: 20880957.
estimClust.plot <- function(ClustInd) {
  par(mfrow = c(1, 3))
  maxClust <- nrow(ClustInd) + 2
  plot(
    seq(3,maxClust,1),
    ClustInd[seq_len(nrow(ClustInd)), "MinCentroidDist_VSClust"],
    col = 2 ,
    type = "b",
    main = "Min. centroid distance\n(Highest jump is best)",
    xlab = "Number of clusters",
    ylab = "Index",
    ylim = c(min(ClustInd[, grep("MinCentroidDist",
                                 colnames(ClustInd))], na.rm = TRUE),
             max(ClustInd[, grep("MinCentroidDist",
                                 colnames(ClustInd))], na.rm = TRUE))
  )
  lines(seq(3,maxClust,1), ClustInd[seq_len(nrow(ClustInd)), "MinCentroidDist_FCM"],
        col = 3, type = "b")
  dmindist <- optimalClustNum(ClustInd)
  points(dmindist,
         ClustInd[dmindist - 2, "MinCentroidDist_VSClust"],
         pch = 15,
         col = 1,
         cex = 2)
  legend(
    "topright",
    legend = c("VSClust", "Standard"),
    lty = c(1, 1),
    col = 2:3
  )
  grid(NULL, NA, lwd = 1, col = 1)
  plot(
    seq(3,maxClust,1),
    ClustInd[seq_len(nrow(ClustInd)), "XieBeni_VSClust"],
    col = 2,
    type = "b",
    main = "Xie-Beni index\n(Lowest is best)",
    xlab = "Number of clusters",
    ylab = "Index",
    ylim = c(min(ClustInd[, grep("XieBeni", colnames(ClustInd))], na.rm =
                   TRUE),
             max(ClustInd[, grep("XieBeni", colnames(ClustInd))], na.rm =
                   TRUE))
  )
  lines(seq(3,maxClust,1), ClustInd[seq_len(nrow(ClustInd)), "XieBeni_FCM"], type =
          "b", col = 3)
  dxiebeni <- optimalClustNum(ClustInd, index = "XieBeni")
  points(dxiebeni,
         ClustInd[dxiebeni - 2, "XieBeni_VSClust"],
         pch = 15,
         col = 1,
         cex = 2)
  legend(
    "topright",
    legend = c("VSClust", "Standard"),
    lty = c(1, 1),
    col = 2:3
  )
  grid(NULL, NA, lwd = 1, col = 1)
  plot(
    3:maxClust,
    ClustInd[seq_len(nrow(ClustInd)), "NumVSClust"],
    col = 2,
    type = "b",
    main = "Total number of assigned features",
    xlab = "Number of clusters",
    ylab = "Assigned features",
    ylim = c(min(ClustInd[, grep("Num", colnames(ClustInd))], na.rm =
                   TRUE),
             max(ClustInd[, grep("Num", colnames(ClustInd))], na.rm = TRUE))
  )
  lines(3:maxClust, ClustInd[seq_len(nrow(ClustInd)), "NumFCM"], type =
          "b", col = 3)
  legend(
    "topright",
    legend = c("VSClust", "Standard"),
    lty = c(1, 1),
    col = 2:3
  )
  # finally plot
  p <- recordPlot()
}