#' @section VSClust functions:
#' Functions for running VSClust analysis
#'
#' @docType package
#' @name vsclust
#' @useDynLib vsclust
NULL

#' Wrapper for statistical analysis
#'
#' Prepare data for running vsclust clustering.
#' This includes running the principal component analysis and its
#' visualization, statistical testing with LIMMA, as well as scaling and
#' filtering of missing values
#' @param dat matrix or data frame of numerical data. Columns are samples.
#' Replicates are grouped (i.e. A1, B1, C1, A2, B2, C2), where letters denote
#' conditions and numbers the replicates. In case of `isStat=FALSE`, you need a
#' last column for the standard deviations
#' @param NumReps Number of replicates in the data
#' @param NumCond Number of different experimental conditions. The total number
#' of columns needs to be NumReps*NumCond
#' @param isPaired Boolean for running paired or unpaired statistical tests
#' @param isStat Boolean for whether to run the statistical tests or whether
#' each column corresponds to a different experimental condition. In the latter
#' case, this function reads the feature standard deviations from the last
#' column of the data frame
#' @return list with the items `dat` (data matrix of features averaged over
#' replicates, with their standard deviations in the last column), `qvals` FDRs
#' from the statistical tests (each condition versus the first), `statFileOut`
#' all of the above for saving to a file
#' @examples
#' data <- matrix(rnorm(2000), nrow=200)
#' stats <- PrepareForVSClust(data, 5, 2, isStat=TRUE)
#'
#' @import stats
#' @importFrom matrixStats rowSds
#' @importFrom shiny validate
#' @export
#' @references
#' Schwaemmle V, Jensen ON. VSClust: feature-based variance-sensitive clustering
#' of omics data. Bioinformatics. 2018 Sep 1;34(17):2965-2972. doi:
#' 10.1093/bioinformatics/bty224. PMID: 29635359.
#'
#' Schwaemmle V, Hagensen CE. A Tutorial for Variance-Sensitive Clustering and
#' the Quantitative Analysis of Protein Complexes. Methods Mol Biol.
#' 2021;2228:433-451. doi: 10.1007/978-1-0716-1024-4_30. PMID: 33950508.
#'
#' Schwaemmle V, Jensen ON. A simple and fast method to determine the parameters
#' for fuzzy c-means cluster analysis. Bioinformatics. 2010
#' Nov 15;26(22):2841-8. doi: 10.1093/bioinformatics/btq534. Epub 2010 Sep 29.
#' PMID: 20880957.
PrepareForVSClust <- function(dat, NumReps, NumCond, isPaired = FALSE, isStat) {
  qvals <- statFileOut <- Sds <- NULL
  tdat <- NULL

  # convert to matrix
  dat <- as.matrix(dat)

  # Run statistical testing
  if (isStat) {
    if (ncol(dat) != NumReps * NumCond)
      stop("Number of data columns must correspond to product of conditions and replicates!")
    if (isPaired) {
      ttt <- SignAnalysisPaired(dat, NumCond, NumReps)
    } else {
      ttt <- SignAnalysis(dat, NumCond, NumReps)
    }
    Sds <- ttt$Sds
    qvals <- ttt$qvalues
    colnames(qvals) <- paste("qvalue ", LETTERS702[2:(NumCond)], "vsA", sep = "")
    tdat <- averageCond(dat, NumReps, NumCond)
  } else {
    Sds <- dat[, ncol(dat)]
    tdat <- dat[, seq_len(ncol(dat) - 1)]
    NumReps <- 1
    NumCond <- ncol(dat) - 1
    dat <- tdat
  }

  if (isStat) {
    statFileOut <- cbind(tdat, Sds, qvals)
  } else {
    statFileOut <- cbind(tdat, Sds)
  }

  pcaWithVar(dat, NumReps, NumCond, Sds / rowSds(tdat, na.rm = TRUE))

  ## Preparing output
  Out <- list(dat = cbind(tdat, Sds),
              qvals = qvals,
              statFileOut = statFileOut)
  Out
}
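# A minimal, commented usage sketch for PrepareForVSClust() with simulated data
# (kept as comments so nothing is executed on package load; the toy matrix and
# its column layout are illustrative assumptions, not package data). Columns
# follow the documented ordering A1, B1, A2, B2, A3, B3 for two conditions and
# three replicates:
#   set.seed(1)
#   toy <- matrix(rnorm(60), nrow = 10,
#                 dimnames = list(paste0("feature", 1:10),
#                                 paste0(rep(c("A", "B"), 3), rep(1:3, each = 2))))
#   prep <- PrepareForVSClust(toy, NumReps = 3, NumCond = 2, isStat = TRUE)
#   head(prep$dat)    # condition averages plus standard deviations (last column)
#   head(prep$qvals)  # limma-moderated FDRs, condition B versus A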
#' Wrapper for statistical analysis for SummarizedExperiment object
#'
#' Prepare data for running vsclust clustering.
#' This includes running the principal component analysis and its
#' visualization, statistical testing with LIMMA, as well as scaling and
#' filtering of missing values
#' @param se SummarizedExperiment object
#' @param assayname Name or index of the assay in the SummarizedExperiment
#' object
#' @param coldatname Column in colData for extracting replicates
#' @param isPaired Boolean for running paired or unpaired statistical tests
#' @param isStat Boolean for whether to run the statistical tests or whether
#' each column corresponds to a different experimental condition. In the latter
#' case, this function reads the feature standard deviations from the last
#' column of the data frame
#' @return list with the items `dat` (data matrix of features averaged over
#' replicates, with their standard deviations in the last column), `qvals` FDRs
#' from the statistical tests (each condition versus the first), `statFileOut`
#' all of the above for saving to a file, `NumReps` number of replicates and
#' `NumCond` number of different experimental conditions
#' @examples
#' data(miniACC, package="MultiAssayExperiment")
#'
#' stats <- PrepareSEForVSClust(miniACC, coldatname="COC", isStat=TRUE)
#'
#' @import stats
#' @importFrom MultiAssayExperiment assay assays sampleMap colData
#' @importFrom matrixStats rowSds
#' @importFrom shiny validate
#' @export
#' @references
#' Schwaemmle V, Jensen ON. VSClust: feature-based variance-sensitive clustering
#' of omics data. Bioinformatics. 2018 Sep 1;34(17):2965-2972. doi:
#' 10.1093/bioinformatics/bty224. PMID: 29635359.
#'
#' Schwaemmle V, Hagensen CE. A Tutorial for Variance-Sensitive Clustering and
#' the Quantitative Analysis of Protein Complexes. Methods Mol Biol.
#' 2021;2228:433-451. doi: 10.1007/978-1-0716-1024-4_30. PMID: 33950508.
#'
#' Schwaemmle V, Jensen ON. A simple and fast method to determine the parameters
#' for fuzzy c-means cluster analysis. Bioinformatics. 2010
#' Nov 15;26(22):2841-8. doi: 10.1093/bioinformatics/btq534. Epub 2010 Sep 29.
#' PMID: 20880957.
PrepareSEForVSClust <- function(se, assayname = 1, coldatname = NULL,
                                isPaired = FALSE, isStat) {
  qvals <- statFileOut <- Sds <- NULL
  tdat <- NULL

  if (!(class(se) %in% c("SummarizedExperiment", "QFeatures",
                         "MultiAssayExperiment"))) {
    stop("!! First argument must be a SummarizedExperiment, QFeatures or MultiAssayExperiment object")
  }

  # convert to matrix
  dat <- assay(se, assayname)

  # determine number of conditions and replicates from colData
  NumCond <- NumReps <- 0
  if (!is.null(coldatname)) {
    # change to name if assay is given by index
    if (is.numeric(assayname))
      assayname <- names(se)[assayname]
    sample_names <- sampleMap(se)
    sample_names <- sample_names[sample_names$assay == assayname, ]
    rownames(sample_names) <- sample_names$colname
    coldat <- colData(se)[sample_names[colnames(dat), "primary"], coldatname]
    names(coldat) <- colnames(dat)
    NumReps <- max(table(coldat))
    NumCond <- length(unique(coldat))
    message("-- The following categories will be used as experimental conditions:\n",
            paste(unique(coldat), collapse = "\n"))
    if (length(unique(coldat)) < 3)
      stop("!! We need a minimum of three different categories/conditions")
    message("-- Extracted NumReps: ", NumReps, " and NumCond: ", NumCond)
    dat <- balanceData(dat, coldat)
  } else {
    NumCond <- ncol(dat)
    NumReps <- 1
    message("-- No replicates given or no statistical testing, assuming that ",
            "each sample is a different type of sample. Variances will be set to 1")
    dat <- cbind(dat, 1)
  }

  # Run statistical testing
  if (isStat) {
    if (ncol(dat) != NumReps * NumCond)
      stop("!! Number of data columns must correspond to product of conditions and replicates!")
    if (isPaired) {
      ttt <- SignAnalysisPaired(dat, NumCond, NumReps)
    } else {
      ttt <- SignAnalysis(dat, NumCond, NumReps)
    }
    Sds <- ttt$Sds
    qvals <- ttt$qvalues
    colnames(qvals) <- paste("qvalue ", LETTERS702[2:(NumCond)], "vsA", sep = "")
    tdat <- averageCond(dat, NumReps, NumCond)
  } else {
    Sds <- dat[, ncol(dat)]
    tdat <- dat[, seq_len(ncol(dat) - 1)]
    NumReps <- 1
    NumCond <- ncol(dat) - 1
    dat <- tdat
  }

  if (isStat) {
    statFileOut <- cbind(tdat, Sds, qvals)
  } else {
    statFileOut <- cbind(tdat, Sds)
  }

  pcaWithVar(dat, NumReps, NumCond, Sds / rowSds(tdat, na.rm = TRUE))

  ## Preparing output
  Out <- list(dat = cbind(tdat, Sds),
              qvals = qvals,
              statFileOut = statFileOut,
              NumReps = NumReps,
              NumCond = NumCond)
  Out
}
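# Commented sketch of how the experimental design is derived from colData in
# the function above (toy annotation vector; the values are illustrative
# assumptions only):
#   coldat <- c("early", "early", "early", "late", "late", "ctrl", "ctrl")
#   NumCond <- length(unique(coldat))  # 3 conditions: early, late, ctrl
#   NumReps <- max(table(coldat))      # 3 replicates (size of the largest group);
#                                      # balanceData() then rearranges the data
#                                      # columns so that ncol(dat) == NumReps * NumCond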
#' Wrapper for estimation of cluster number
#'
#' This runs the clustering for different numbers of clusters and estimates the
#' most suitable numbers by applying the minimum centroid distance and the
#' Xie-Beni index. Multi-threading is used to shorten the computation times.
#' Given the hierarchical structure of many data sets, the resulting numbers
#' are suggestions. Inspection of the plotted indices helps to determine
#' alternative cluster numbers, indicated by a strong decay of the minimum
#' centroid distance and/or a low value of the Xie-Beni index.
#'
#' @param dat matrix of features averaged over replicates. The last column
#' contains their standard deviation
#' @param maxClust Maximal number of clusters. The minimum is 3
#' @param cores The number of threads to be used for parallelisation
#' @return matrix `ClustInd` with the validity indices (minimum centroid
#' distance and Xie-Beni index, each for VSClust and standard fuzzy c-means) as
#' well as the number of features with membership values > 0.5, for each tested
#' cluster number
#' @examples
#' data <- matrix(rnorm(1000), nrow=100)
#' estim_out <- estimClustNum(data, maxClust=10)
#' best_number <- max(estim_out[1])
#' @import limma
#' @import parallel
#' @import stats
#' @import graphics
#' @import grDevices
#' @importFrom shiny getDefaultReactiveDomain incProgress
#' @importFrom matrixStats rowMaxs
#' @export
estimClustNum <- function(dat, maxClust = 25, cores = 1) {
  ClustInd <- matrix(NA, nrow = maxClust - 2, ncol = 6)
  if (is.null(rownames(dat)))
    rownames(dat) <- seq_len(nrow(dat))
  tData <- dat[, seq_len(ncol(dat) - 1)]
  colnames(tData) <- NULL

  # define parallelization
  cl <- makeCluster(cores)
  clusterExport(cl = cl,
                varlist = c("vsclust_algorithm"),
                envir = environment())
  clusterEvalQ(cl = cl, library(vsclust))

  # Standardise
  sds <- dat[rownames(tData), ncol(dat)]
  # scale standard deviations by the ones in the actual data to cope for the
  # following standardization
  sds <- sds / (rowSds(as.matrix(tData), na.rm = TRUE))
  tData <- t(scale(t(tData)))

  multiOut <- lapply(seq(3, maxClust, 1), function(x) {
    if (!is.null(getDefaultReactiveDomain())) {
      incProgress(1, detail = paste("Running cluster number", x))
    } else {
      message("Running cluster number ", x)
    }
    clustout <- ClustComp(tData,
                          NClust = x,
                          Sds = sds,
                          NSs = 16,
                          cl = cl)
    c(clustout$indices,
      sum(rowMaxs(clustout$Bestcl$membership) > 0.5),
      sum(rowMaxs(clustout$Bestcl2$membership) > 0.5))
  })
  stopCluster(cl)

  for (NClust in seq(3, maxClust, 1))
    ClustInd[NClust - 2, ] <- multiOut[[NClust - 2]]
  rownames(ClustInd) <- paste0("num_clust_", seq(3, maxClust, 1))
  colnames(ClustInd) <- c("MinCentroidDist_VSClust",
                          "XieBeni_VSClust",
                          "MinCentroidDist_FCM",
                          "XieBeni_FCM",
                          "NumVSClust",
                          "NumFCM")

  # Output
  ClustInd
}
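# Commented sketch for inspecting the returned validity indices and picking a
# cluster number by hand (continues the toy example from the roxygen block
# above; nothing here is executed on package load):
#   dat <- matrix(rnorm(1000), nrow = 100)
#   ClustInd <- estimClustNum(dat, maxClust = 10, cores = 1)
#   nclust <- 3:10   # cluster numbers tested, matching rownames(ClustInd)
#   plot(nclust, ClustInd[, "MinCentroidDist_VSClust"], type = "b",
#        xlab = "Number of clusters", ylab = "Minimum centroid distance")
#   plot(nclust, ClustInd[, "XieBeni_VSClust"], type = "b",
#        xlab = "Number of clusters", ylab = "Xie-Beni index")
#   # look for a strong decay of the centroid distance and/or a low Xie-Beni index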
#' Wrapper for running cluster analysis
#'
#' This function runs the clustering and visualizes the results.
#'
#' @param dat matrix or data frame with feature values for different conditions
#' @param NClust Number of clusters for running the clustering
#' @param proteins vector with additional feature information (default is NULL)
#' to be added to the results
#' @param VSClust boolean. TRUE for running the variance-sensitive clustering.
#' Otherwise, the function will call standard fuzzy c-means clustering
#' @param cores Number of threads for the parallelization
#' @return list with the items `dat` (the standardized data), `Bestcl`
#' (clustering results, same as from vsclust_algorithm), `p` (plot object with
#' mfuzz plots), `outFileClust` (matrix with the complete information, suitable
#' for saving to a file), `ClustInd` (table with the number of members per
#' cluster; a feature counts as a member when its maximum membership value
#' exceeds 0.5)
#' @examples
#' data(iris)
#' data <- cbind(iris[,seq_len(4)],1)
#' clust_out <- runClustWrapper(data, NClust=3, cores=1)
#' clust_out$p
#' @import parallel
#' @import graphics
#' @importFrom grDevices recordPlot
#' @importFrom shiny getDefaultReactiveDomain incProgress
#' @importFrom matrixStats rowMaxs
#' @export
runClustWrapper <- function(dat, NClust, proteins = NULL, VSClust = TRUE, cores) {
  tData <- dat[, seq_len(ncol(dat) - 1)]
  sds <- dat[, ncol(dat)]

  # Standardize
  # scale standard deviations by the ones in the actual data to cope for the
  # following standardization
  sds <- sds / rowSds(as.matrix(tData), na.rm = TRUE)
  tData <- t(scale(t(tData)))
  if (is.null(rownames(tData))) {
    rownames(tData) <- seq_len(nrow(tData))
  }

  cl <- makeCluster(cores)
  clusterExport(cl = cl,
                varlist = c("vsclust_algorithm"),
                envir = environment())
  clusterEvalQ(cl = cl, library(vsclust))

  clustout <- ClustComp(tData,
                        NClust = NClust,
                        Sds = sds,
                        NSs = 16,
                        cl = cl)
  stopCluster(cl)

  if (VSClust) {
    Bestcl <- clustout$Bestcl
  } else {
    Bestcl <- clustout$Bestcl2
  }
  Bestcl <- SwitchOrder(Bestcl, NClust)

  # sorting for membership values (globally)
  Bestcl$cluster <- Bestcl$cluster[order(rowMaxs(Bestcl$membership, na.rm = TRUE))]
  Bestcl$membership <- Bestcl$membership[order(rowMaxs(Bestcl$membership, na.rm = TRUE)), ]
  tData <- tData[names(Bestcl$cluster), ]

  if (!is.null(getDefaultReactiveDomain()))
    incProgress(0.7, detail = paste("Plotting", NClust))

  # graphics.off() ## clean up device
  par(lwd = 0.25)
  oldmar <- par("mar")
  par(mar = c(2, 2, 3, 3), mgp = c(2, 1, 0))
  par(mar = par("mar") / max(1, NClust / 20))
  mfuzz.plot(tData,
             cl = Bestcl,
             mfrow = c(round(sqrt(NClust)), ceiling(sqrt(NClust))),
             minMem = 0.5,
             colo = "fancy")
  p <- recordPlot()
  # par(lwd=1,mar=oldmar)

  colnames(Bestcl$membership) <- paste("membership of cluster",
                                       colnames(Bestcl$membership))
  outFileClust <- tData
  if (!is.null(proteins)) {
    outFileClust <- cbind(outFileClust,
                          names = as.character(proteins[rownames(outFileClust)]))
  }

  rownames(Bestcl$centers) <- paste("Cluster", rownames(Bestcl$centers))
  ClustInd <- as.data.frame(table(Bestcl$cluster[rowMaxs(Bestcl$membership) > 0.5]))
  if (ncol(ClustInd) == 2)
    colnames(ClustInd) <- c("Cluster", "Members")
  else
    ClustInd <- cbind(seq_len(max(Bestcl$cluster)), rep(0, max(Bestcl$cluster)))

  ## Output
  Out <- list(dat = tData,
              Bestcl = Bestcl,
              p = p,
              outFileClust = outFileClust,
              ClustInd = ClustInd)
  return(Out)
}
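# Commented sketch for retrieving hard cluster assignments from the
# runClustWrapper() output (continues the iris example from the roxygen block
# above; kept as comments so nothing runs on package load):
#   data(iris)
#   dat <- cbind(iris[, seq_len(4)], 1)   # last column: unit standard deviations
#   clust_out <- runClustWrapper(dat, NClust = 3, cores = 1)
#   cl <- clust_out$Bestcl
#   members <- cl$cluster[matrixStats::rowMaxs(cl$membership) > 0.5]
#   table(members)                        # counts per cluster, as in clust_out$ClustInd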
#### The manual of the following function was removed to avoid calling
#### RDAVIDWEBSERVICE

# Wrapper for functional enrichment
#
# The functional analysis uses the library RDAVIDWebService and thus might
# become obsolete as that library is no longer supported.
# The user can select different ID types and different enrichment categories
# like GO terms and pathways.
# Allowed ID types:
# "AFFYMETRIX_3PRIME_IVT_ID", "AFFYMETRIX_EXON_GENE_ID", "AGILENT_CHIP_ID",
# "AGILENT_ID", "AGILENT_OLIGO_ID", "APHIDBASE_ID", "BEEBASE_ID",
# "BEETLEBASE_ID", "BGD_ID", "CGNC_ID", "CRYPTODB_ID", "DICTYBASE_ID",
# "ENSEMBL_GENE_ID", "ENSEMBL_TRANSCRIPT_ID", "ENTREZ_GENE_ID",
# "GENOMIC_GI_ACCESSION", "FLYBASE_GENE_ID", "GENBANK_ACCESSION",
# "GENPEPT_ACCESSION", "LOCUS_TAG", "ILLUMINA_ID", "MGI_ID", "MIRBASE_ID",
# "OFFICIAL_GENE_SYMBOL", "PFAM_ID", "PIR_ID", "PROTEIN_GI_ACCESSION",
# "MRNA_GI_ACCESSION", "REFSEQ_GENOMIC", "REFSEQ_MRNA", "REFSEQ_PROTEIN",
# "REFSEQ_RNA", "RGD_ID", "SGD_ID", "TAIR_ID", "UCSC_GENE_ID", "UNIGENE",
# "UNIPROT_ACCESSION", "UNIPROT_ID", "UNIREF100_ID", "WORMBASE_GENE_ID",
# "WORMPEP_ID", "ZFIN_ID"
# Allowed enrichment categories:
# "GOTERM_MF_ALL", "GOTERM_BP_ALL", "GOTERM_CC_ALL", "GOTERM_MF_FAT",
# "GOTERM_BP_FAT", "GOTERM_CC_FAT", "KEGG_PATHWAY", "PANTHER_PATHWAY",
# "REACTOME_PATHWAY", "BBID", "BIOCARTA", "DIP", "MINT", "INTACT",
# "BIOGRID_INTERACTION", "GAD_DISEASE", "GAD_DISEASE_CLASS", "OMIM_DISEASE",
# "INTERPRO", "PROSITE", "PFAM", "SMART", "PRODOM", "PIR_SUPERFAMILY"
#
# param cl clustering results (either directly from vsclust_algorithm or as
# `Bestcl` object from ClustComp or runClustWrapper)
# param protnames vector providing the corresponding gene/protein names of the
# features (set to NULL for directly using the feature names (default))
# param idtypes type of IDs for features given by genes/proteins (generic gene
# names do not work)
# param infosource Type of gene annotation (e.g. KEGG_PATHWAY)
# return plot object to be able to pass the figures to e.g. shiny
# @export
runFuncEnrich <- function(cl, protnames = NULL, idtypes, infosource) {
  Accs <- list()
  for (c in seq_len(max(cl$cluster))) {
    cname <- paste("Cluster", c, sep = "_")
    Accs[[cname]] <- names(which(cl$cluster == c &
                                   rowMaxs(cl$membership) > 0.5))
    Accs[[cname]] <- Accs[[cname]][Accs[[cname]] != ""]
    if (length(Accs[[cname]]) > 0) {
      if (!is.null(protnames)) {
        Accs[[cname]] <- as.character(protnames[Accs[[cname]]])
      }
      Accs[[cname]] <- sub("-[0-9]", "", Accs[[cname]])
    }
  }
  # TODO? add extraction of multiple accession numbers
  Accs <- lapply(Accs, function(x) unique(ifelse(is.na(x), "B3", x)))
  Accs <- Accs[lapply(Accs, length) > 0]
  x <- NULL
  try(x <- compareCluster(Accs,
                          fun = "enrichDAVID",
                          annotation = infosource,
                          idType = idtypes,
                          david.user = "[email protected]"))
  validate(need(!is.null(x), "No result. Wrong ID type?"))
  if (!is.null(getDefaultReactiveDomain()))
    incProgress(0.7, detail = "received")
  message("got data from DAVID\n")

  x@compareClusterResult <- cbind(x@compareClusterResult,
                                  log10padval = log10(x@compareClusterResult$p.adjust))
  y <- new("compareClusterResult",
           compareClusterResult = x@compareClusterResult)
  if (length(unique(y@compareClusterResult$ID)) > 20) {
    message("Reducing number of DAVID results\n")
    y@compareClusterResult <-
      y@compareClusterResult[order(y@compareClusterResult$p.adjust)[seq_len(20)], ]
    y@compareClusterResult$Cluster <-
      as.character(y@compareClusterResult$Cluster)
  }
  BHI <- calcBHI(Accs, x)
  return(list(fullFuncs = x,
              redFuncs = y,
              BHI = BHI))
}
#' Run VSClust as Shiny app
#'
#' You will get the full functionality of the VSClust workflow with multiple
#' visualizations and downloads
#'
#' @return The shiny app should open in a browser or in RStudio.
#' @examples
#' \donttest{
#' runVSClustApp()}
#' @export
#' @references
#' Schwaemmle V, Jensen ON. VSClust: feature-based variance-sensitive clustering
#' of omics data. Bioinformatics. 2018 Sep 1;34(17):2965-2972. doi:
#' 10.1093/bioinformatics/bty224. PMID: 29635359.
#'
#' Schwaemmle V, Hagensen CE. A Tutorial for Variance-Sensitive Clustering and
#' the Quantitative Analysis of Protein Complexes. Methods Mol Biol.
#' 2021;2228:433-451. doi: 10.1007/978-1-0716-1024-4_30. PMID: 33950508.
#'
#' Schwaemmle V, Jensen ON. A simple and fast method to determine the parameters
#' for fuzzy c-means cluster analysis. Bioinformatics. 2010
#' Nov 15;26(22):2841-8. doi: 10.1093/bioinformatics/btq534. Epub 2010 Sep 29.
#' PMID: 20880957.
runVSClustApp <- function() {
  shiny::runApp(system.file("shiny/", package = "vsclust"))
}
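# Commented end-to-end sketch of the VSClust workflow outside the Shiny app,
# chaining the wrappers defined above (simulated data; the choice of 5 clusters
# is arbitrary and only for illustration):
#   set.seed(1)
#   dat <- matrix(rnorm(2000), nrow = 200)
#   prep <- PrepareForVSClust(dat, NumReps = 5, NumCond = 2, isStat = TRUE)
#   ClustInd <- estimClustNum(prep$dat, maxClust = 10, cores = 1)
#   clust_out <- runClustWrapper(prep$dat, NClust = 5, cores = 1)
#   clust_out$p   # fuzzy cluster plots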