b634e98a |
#' @section VSClust functions:
#' Functions for running VSClust analysis
#'
#' @docType package
#' @name vsclust
#' @useDynLib vsclust
NULL
#> NULL
#' Wrapper for statistical analysis
#'
#' Prepare data for running vsclust clustering.
#' This includes visualization running the functions for the principal component
#' analysis and its visualization, statistical testing with LIMMA, as well as
#' scaling and filtering of missing values
#' @param dat matrix or data frame of numerical data. Columns are samples.
#' Replicates are grouped (i.e. A1, B1, C1, A2, B2, C2) when letters denote
|
a6b033e2 |
#' conditions and numbers the replicates. In case of `isStat=FALSE`, you need a
|
b634e98a |
#' last column for the standard deviations
#' @param NumReps Number replicates in the data
#' @param NumCond Number of different experimental conditions. The total number
#' of columns needs to be NumReps*NumCond
#' @param isPaired Boolean for running paired or unpaired statistical tests
#' @param isStat Boolean for whether to run statistical test or each column
#' corresponds to a different experimental conditions. Then this function reads
#' feature standard deviations from data frame from the last column
#' @return list with the items `dat` (data matrix of features averaged over
#' replicates and last column with their standard deviations), `qvals` FDRs from
#' the statistical tests (each conditions versus the first), `StatFileOut` all
#' of before for saving in file
#' @examples
#' data <- matrix(rnorm(2000), nrow=200)
#' stats <- PrepareForVSClust(data, 5, 2, isStat=TRUE)
#'
#' @import stats
#' @importFrom matrixStats rowSds
#' @importFrom shiny validate
#' @export
#' @references
#' Schwaemmle V, Jensen ON. VSClust: feature-based variance-sensitive clustering
#' of omics data. Bioinformatics. 2018 Sep 1;34(17):2965-2972. doi:
#' 10.1093/bioinformatics/bty224. PMID: 29635359.
#'
#' Schwaemmle V, Hagensen CE. A Tutorial for Variance-Sensitive Clustering and
#' the Quantitative Analysis of Protein Complexes. Methods Mol Biol.
#' 2021;2228:433-451. doi: 10.1007/978-1-0716-1024-4_30. PMID: 33950508.
#'
#' Schwaemmle V, Jensen ON. A simple and fast method to determine the parameters
#' for fuzzy c-means cluster analysis. Bioinformatics. 2010
#' Nov 15;26(22):2841-8. doi: 10.1093/bioinformatics/btq534. Epub 2010 Sep 29.
#' PMID: 20880957.
PrepareForVSClust <-
function(dat, NumReps, NumCond, isPaired = FALSE, isStat) {
qvals <- statFileOut <- Sds <- NULL
tdat <- NULL
# convert to matrix
dat <- as.matrix(dat)
# Run statistical testing
if (isStat) {
if (ncol(dat) != NumReps * NumCond)
stop("Number of data columns must correspond to product of conditions
and replicates!")
if (isPaired) {
ttt <- SignAnalysisPaired(dat, NumCond, NumReps)
} else {
ttt <- SignAnalysis(dat, NumCond, NumReps)
}
Sds <- ttt$Sds
qvals <- ttt$qvalues
colnames(qvals) <-
paste("qvalue ", LETTERS702[2:(NumCond)], "vsA", sep = "")
tdat <- averageCond(dat, NumReps, NumCond)
} else {
Sds <- dat[, ncol(dat)]
tdat <- dat[, seq_len(ncol(dat) - 1)]
NumReps <- 1
NumCond <- ncol(dat) - 1
dat <- tdat
}
if (isStat) {
statFileOut <- cbind(tdat, Sds, qvals)
} else {
statFileOut <- cbind(tdat, Sds)
}
pcaWithVar(dat, NumReps, NumCond, Sds / rowSds(tdat, na.rm = TRUE))
## Preparing output
Out <-
list(dat = cbind(tdat, Sds),
qvals = qvals,
statFileOut = statFileOut)
Out
}
#' Wrapper for statistical analysis for SummarizedExperiment object
#'
#' Prepare data for running vsclust clustering.
#' This includes visualization running the functions for the principal component
#' analysis and its visualization, statistical testing with LIMMA, as well as
#' scaling and filtering of missing values
|
35126831 |
#' @param se SummarizedExperiment object
#' @param assayname Sample in SummarizedExperiment object
#' @param coldatname Column in colData for extracting replicates
|
b634e98a |
#' @param isPaired Boolean for running paired or unpaired statistical tests
#' @param isStat Boolean for whether to run statistical test or each column
#' corresponds to a different experimental conditions. Then this function reads
#' feature standard deviations from data frame from the last column
#' @return list with the items `dat` (data matrix of features averaged over
#' replicates and last column with their standard deviations), `qvals` FDRs from
#' the statistical tests (each conditions versus the first), `StatFileOut` all
#' of before for saving in file, `NumReps` number of replicates and `NumCond`
#' number of different experimental conditions
#' @examples
|
35126831 |
#' data(miniACC, package="MultiAssayExperiment")
|
b634e98a |
#'
#' stats <- PrepareSEForVSClust(miniACC, coldatname="COC", isStat=TRUE)
#'
#' @import stats
|
a6b033e2 |
#' @importFrom MultiAssayExperiment assay assays sampleMap colData
|
b634e98a |
#' @importFrom matrixStats rowSds
#' @importFrom shiny validate
#' @export
#' @references
#' Schwaemmle V, Jensen ON. VSClust: feature-based variance-sensitive clustering
#' of omics data. Bioinformatics. 2018 Sep 1;34(17):2965-2972. doi:
#' 10.1093/bioinformatics/bty224. PMID: 29635359.
#'
#' Schwaemmle V, Hagensen CE. A Tutorial for Variance-Sensitive Clustering and
#' the Quantitative Analysis of Protein Complexes. Methods Mol Biol.
#' 2021;2228:433-451. doi: 10.1007/978-1-0716-1024-4_30. PMID: 33950508.
#'
#' Schwaemmle V, Jensen ON. A simple and fast method to determine the parameters
#' for fuzzy c-means cluster analysis. Bioinformatics. 2010
#' Nov 15;26(22):2841-8. doi: 10.1093/bioinformatics/btq534. Epub 2010 Sep 29.
#' PMID: 20880957.
PrepareSEForVSClust <-
function(se,
assayname = 1,
coldatname = NULL,
isPaired = FALSE,
isStat) {
qvals <- statFileOut <- Sds <- NULL
tdat <- NULL
if (!(class(se) %in% c(
"SummarizedExperiment",
"QFeatures",
"MultiAssayExperiment"
))) {
stop(
"!! First argument must be a SummarizedExperiment, QFeatures or MultiAssayExperiment objectd"
)
}
# convert to matrix
dat <- assay(se, assayname)
# determine number of conditions and replicates from colData
NumCond <- NumReps <- 0
if (!is.null(coldatname)) {
# change to name if assay is given by index
if (is.numeric(assayname))
assayname <- names(se)[assayname]
sample_names <- sampleMap(se)
sample_names <- sample_names[sample_names$assay == assayname,]
rownames(sample_names) <- sample_names$colname
coldat <- colData(se)[sample_names[colnames(dat), "primary"], coldatname]
names(coldat) <- colnames(dat)
NumReps <- max(table(coldat))
NumCond <- length(unique(coldat))
message("-- The following categories will be used as experimental
conditions:\n",paste(unique(coldat), collapse="\n"))
if (length(unique(coldat)) < 3)
stop("!! We need a minimum of three different categories/conditions")
message("-- Extracted NumReps: ", NumReps, " and NumCond: ", NumCond)
dat <- balanceData(dat, coldat)
} else{
NumCond <- ncol(dat)
NumReps <- 1
message("-- No replicates given or no statistical testing, assuming that each
sample is a different type of sample. Variances will be set to 1")
dat <- cbind(dat, 1)
}
# Run statistical testing
if (isStat) {
if (ncol(dat) != NumReps * NumCond)
stop("!! Number of data columns must correspond to product of conditions
and replicates!")
if (isPaired) {
ttt <- SignAnalysisPaired(dat, NumCond, NumReps)
} else {
ttt <- SignAnalysis(dat, NumCond, NumReps)
}
Sds <- ttt$Sds
qvals <- ttt$qvalues
colnames(qvals) <-
paste("qvalue ", LETTERS702[2:(NumCond)], "vsA", sep = "")
tdat <- averageCond(dat, NumReps, NumCond)
} else {
Sds <- dat[, ncol(dat)]
tdat <- dat[, seq_len(ncol(dat) - 1)]
NumReps <- 1
NumCond <- ncol(dat) - 1
dat <- tdat
}
if (isStat) {
statFileOut <- cbind(tdat, Sds, qvals)
} else {
statFileOut <- cbind(tdat, Sds)
}
pcaWithVar(dat, NumReps, NumCond, Sds / rowSds(tdat, na.rm = TRUE))
## Preparing output
Out <-
list(dat = cbind(tdat, Sds),
qvals = qvals,
statFileOut = statFileOut, NumReps=NumReps, NumCond=NumCond)
Out
}
#' Wrapper for estimation of cluster number
#'
#' This runs the clustering for different numbers of clusters, and estimates the
#' most suitable numbers from applying the minimum centroid distance and the Xie
#' Beni index. Multi-threading is used to shorten the computation times.
#' Given the hierarchical structure of many data sets, the resulting numbers are
#' suggestions. Inspection of the here plotted indices help to determine
#' alternative cluster numbers, given by a strong decay of the minimum centroid
#' distance and/or a low value of the Xie Beni index.
#'
#' @param dat matrix of features averaged over replicates. The last column
#' contains their standard deviation
#' @param maxClust Maximal number of cluster. The minimum is 3
#' @param cores The number of threads to be used for parallelisation
#' @return list with the items `ClustInd`: list of clustering objects for each
#' number of clusters, `p` plot object with plots for validity indices,
#' `numclust` optimal cluster number according to "minimum centroid distance"
#' @examples
#' data <- matrix(rnorm(1000), nrow=100)
#' estim_out <- estimClustNum(data, maxClust=10)
#' best_number <- max(estim_out[1])
#' @import limma
#' @import parallel
#' @import stats
#' @import graphics
#' @import grDevices
#' @importFrom shiny getDefaultReactiveDomain incProgress
#' @importFrom matrixStats rowMaxs
#' @export
estimClustNum <- function(dat,
maxClust = 25,
cores = 1) {
ClustInd <- matrix(NA, nrow = maxClust - 2, ncol = 6)
if (is.null(rownames(dat)))
rownames(dat) <- seq_len(nrow(dat))
tData <- dat[, seq_len(ncol(dat) - 1)]
colnames(tData) <- NULL
# define parallelization
cl <- makeCluster(cores)
clusterExport(
cl = cl,
varlist = c("vsclust_algorithm"),
envir = environment()
)
clusterEvalQ(cl = cl, library(vsclust))
# Standardise
sds <- dat[rownames(tData), ncol(dat)]
# scale standard deviations by the ones in the actual data to cope for the
# following standardization
sds <- sds / (rowSds(as.matrix(tData), na.rm = TRUE))
tData <- t(scale(t(tData)))
|
a6b033e2 |
multiOut <- lapply(seq(3,maxClust,1), function(x) {
|
b634e98a |
if (!is.null(getDefaultReactiveDomain())) {
incProgress(1, detail = paste("Running cluster number", x))
} else {
message("Running cluster number", x)
}
clustout <- ClustComp(
tData,
NClust = x,
Sds = sds,
NSs = 16,
cl = cl
)
c(clustout$indices, sum(rowMaxs(clustout$Bestcl$membership) > 0.5),
sum(rowMaxs(clustout$Bestcl2$membership) > 0.5))
})
stopCluster(cl)
|
a6b033e2 |
for (NClust in seq(3,maxClust,1))
|
b634e98a |
ClustInd[NClust - 2,] <- multiOut[[NClust - 2]]
|
a6b033e2 |
rownames(ClustInd) <- paste0("num_clust_", seq(3,maxClust,1))
|
b634e98a |
colnames(ClustInd) <-
c(
"MinCentroidDist_VSClust",
"XieBeni_VSClust",
"MinCentroidDist_FCM",
"XieBeni_FCM",
"NumVSClust",
"NumFCM"
)
# Output
ClustInd
}
#' Wrapper for running cluster analysis
#'
#' This function runs the clustering and visualizes the results.
#'
#' @param dat matrix or data frame with feature values for different conditions
#' @param NClust Number of cluster for running the clustering
#' @param proteins vector with additional feature information (default is NULL)
#' to be added to the results
#' @param VSClust boolean. TRUE for running the variance-sensitive clustering.
#' Otherwise, the function will call standard fuzzy c-means clustering
#' @param cores Number of threads for the parallelization
#' @return list with the items `dat`(the original data), `Bestcl` clustering
#' results (same as from vsclust_algorithm), `p` (plot object with mfuzz plots),
#' `outFileClust`(suitable matrix with complete information) , `ClustInd`
#' (information about being member of any cluster, feature needs on membership
#' values > 0.5)
#' @examples
#' data(iris)
#' data <- cbind(iris[,seq_len(4)],1)
#' clust_out <- runClustWrapper(data, NClust=3, cores=1)
#' clust_out$p
#' @import parallel
#' @import graphics
#' @importFrom grDevices recordPlot
#' @importFrom shiny getDefaultReactiveDomain incProgress
#' @importFrom matrixStats rowMaxs
#' @export
runClustWrapper <-
function(dat,
NClust,
proteins = NULL,
VSClust = TRUE,
cores) {
tData <- dat[, seq_len(ncol(dat) - 1)]
sds <- dat[, ncol(dat)]
#Standardize
# scale standard deviations by the ones in the actual data to cope for the
# following standardization
sds <- sds / rowSds(as.matrix(tData), na.rm = TRUE)
tData <- t(scale(t(tData)))
if (is.null(rownames(tData))) {
rownames(tData) <- seq_len(nrow(tData))
}
cl <- makeCluster(cores)
clusterExport(
cl = cl,
varlist = c("vsclust_algorithm"),
envir = environment()
)
clusterEvalQ(cl = cl, library(vsclust))
clustout <- ClustComp(
tData,
NClust = NClust,
Sds = sds,
NSs = 16,
cl = cl
)
stopCluster(cl)
if (VSClust) {
Bestcl <- clustout$Bestcl
} else {
Bestcl <- clustout$Bestcl2
}
Bestcl <- SwitchOrder(Bestcl, NClust)
# sorting for membership values (globally)
Bestcl$cluster <-
Bestcl$cluster[order(rowMaxs(Bestcl$membership, na.rm = TRUE))]
Bestcl$membership <-
Bestcl$membership[order(rowMaxs(Bestcl$membership, na.rm = TRUE)),]
tData <- tData[names(Bestcl$cluster),]
if (!is.null(getDefaultReactiveDomain()))
incProgress(0.7, detail = paste("Plotting", NClust))
# graphics.off() ## clean up device
par(lwd = 0.25)
oldmar <- par("mar")
par(mar = c(2, 2, 3, 3), mgp = c(2, 1, 0))
par(mar = par("mar") / max(1, NClust / 20))
mfuzz.plot(
tData,
cl = Bestcl,
mfrow = c(round(sqrt(NClust)), ceiling(sqrt(NClust))),
|
a6b033e2 |
minMem = 0.5,
|
b634e98a |
colo = "fancy"
)
p <- recordPlot()
# par(lwd=1,mar=oldmar)
colnames(Bestcl$membership) <-
paste("membership of cluster", colnames(Bestcl$membership))
outFileClust <- tData
if (!is.null(proteins)) {
outFileClust <-
cbind(outFileClust, names =
as.character(proteins[rownames(outFileClust)]))
}
rownames(Bestcl$centers) <-
paste("Cluster", rownames(Bestcl$centers))
ClustInd <-
as.data.frame(table(Bestcl$cluster[rowMaxs(Bestcl$membership) > 0.5]))
if (ncol(ClustInd) == 2)
colnames(ClustInd) <- c("Cluster", "Members")
else
ClustInd <-
cbind(seq_len(max(Bestcl$cluster)), rep(0, max(Bestcl$cluster)))
## Output
Out <-
list(
dat = tData,
Bestcl = Bestcl,
p = p,
outFileClust = outFileClust,
ClustInd = ClustInd
)
return(Out)
}
#### The manual of the following function was removed to avoid calling
#### RDAVIDWEBSERVICE
# Wrapper for functional enrichment
#
# The functional analysis uses the libarary RDAVIDWebService and thus might
# become obsolete as that library is not supported anymore
# The user can select different ID types and different enrichment categories
# like GO terms and pathways.
# Allowed ID types:
# "AFFYMETRIX_3PRIME_IVT_ID",
# "AFFYMETRIX_EXON_GENE_ID", "AGILENT_CHIP_ID",
# "AGILENT_ID", "AGILENT_OLIGO_ID", "APHIDBASE_ID", "BEEBASE_ID",
# "BEETLEBASE_ID", "BGD_ID", "CGNC_ID", "CRYPTODB_ID", "DICTYBASE_ID",
# "ENSEMBL_GENE_ID",
# "ENSEMBL_TRANSCRIPT_ID", "ENTREZ_GENE_ID", "GENOMIC_GI_ACCESSION",
# "FLYBASE_GENE_ID", "GENBANK_ACCESSION",
# "GENPEPT_ACCESSION", "LOCUS_TAG", "ILLUMINA_ID", "MGI_ID", "MIRBASE_ID",
# "OFFICIAL_GENE_SYMBOL", "PFAM_ID", "PIR_ID", "PROTEIN_GI_ACCESSION",
# "MRNA_GI_ACCESSION",
# "REFSEQ_GENOMIC", "REFSEQ_MRNA", "REFSEQ_PROTEIN", "REFSEQ_RNA",
# "RGD_ID", "SGD_ID", "TAIR_ID", "UCSC_GENE_ID", "UNIGENE",
# "UNIPROT_ACCESSION", "UNIPROT_ID", "UNIREF100_ID", "WORMBASE_GENE_ID",
# "WORMPEP_ID", "ZFIN_ID"
# Allowed enrichment categories:
# "GOTERM_MF_ALL","GOTERM_BP_ALL",
# "GOTERM_CC_ALL","GOTERM_MF_FAT","GOTERM_BP_FAT","GOTERM_CC_FAT"),
# "KEGG"="KEGG_PATHWAY","PANTHER_PATHWAY","REACTOME_PATHWAY","BBID","BIOCARTA",
# "DIP","MINT","INTACT","BIOGRID_INTERACTION","GAD_DISEASE","GAD_DISEASE_CLASS",
# "OMIM_DISEASE","INTERPRO","PROSITE","PFAM","SMART","PRODOM","PIR_SUPERFAMILY"
#
# param cl clustering results (either directly from vsclust_algorithm or as
# `Bestcl` object from ClustComp or runClustWrapper)
# param protnames vector providing the corresponding gene/protein names of the
# features (set to NULL for directly using the feature names (default))
# param idtypes type of IDs for features given by genes/proteins (generic gene
# names are not working)
# param infosource Type of gene annotation (e.g. KEGG_PATHWAY)
# return plot object to be able to pass the figures to e.g. shiny
|
388bba80 |
# @export
|
b634e98a |
runFuncEnrich <-
function(cl, protnames = NULL, idtypes, infosource) {
Accs <- list()
for (c in seq_len(max(cl$cluster))) {
cname <- paste("Cluster", c, sep = "_")
Accs[[cname]] <-
names(which(cl$cluster == c & rowMaxs(cl$membership) > 0.5))
Accs[[cname]] <- Accs[[cname]][Accs[[cname]] != ""]
if (length(Accs[[cname]]) > 0) {
if (!is.null(protnames)) {
Accs[[cname]] <- as.character(protnames[Accs[[cname]]])
}
Accs[[cname]] <- sub("-[0-9]", "", Accs[[cname]])
}
}
# TODO? add extraction of multiple accession numbers
Accs <- lapply(Accs, function(x)
unique(ifelse(is.na(x), "B3", x)))
Accs <- Accs[lapply(Accs, length) > 0]
x <- NULL
try(x <-
compareCluster(
Accs,
fun = "enrichDAVID",
annotation = infosource,
idType = idtypes,
david.user = "[email protected]"
))
validate(need(!is.null(x), "No result. Wrong ID type?"))
if (!is.null(getDefaultReactiveDomain()))
incProgress(0.7, detail = "received")
message("got data from DAVID\n")
x@compareClusterResult <- cbind(x@compareClusterResult,
log10padval =
log10(x@compareClusterResult$p.adjust))
y <-
new("compareClusterResult",
compareClusterResult = x@compareClusterResult)
if (length(unique(y@compareClusterResult$ID)) > 20) {
message("Reducing number of DAVID results\n")
y@compareClusterResult <- y@compareClusterResult[order(y@compareClusterResult$p.adjust)[seq_len(20)],]
y@compareClusterResult$Cluster <-
as.character(y@compareClusterResult$Cluster)
}
BHI <- calcBHI(Accs, x)
return(list(
fullFuncs = x,
redFuncs = y,
BHI = BHI
))
}
#' Run VSClust as Shiny app
#'
#' You will get the full functionality of the VSClust workflow with multiple
#' visualizations and downloads
#'
#' @return The shiny app should open in a browser or in RStudio.
#' @examples
#' \donttest{
#' runVSClustApp()}
#' @export
#' @references
#' Schwaemmle V, Jensen ON. VSClust: feature-based variance-sensitive clustering
#' of omics data. Bioinformatics. 2018 Sep 1;34(17):2965-2972. doi:
#' 10.1093/bioinformatics/bty224. PMID: 29635359.
#'
#' Schwaemmle V, Hagensen CE. A Tutorial for Variance-Sensitive Clustering and
#' the Quantitative Analysis of Protein Complexes. Methods Mol Biol.
#' 2021;2228:433-451. doi: 10.1007/978-1-0716-1024-4_30. PMID: 33950508.
#'
#' Schwaemmle V, Jensen ON. A simple and fast method to determine the parameters
#' for fuzzy c-means cluster analysis. Bioinformatics. 2010
#' Nov 15;26(22):2841-8. doi: 10.1093/bioinformatics/btq534. Epub 2010 Sep 29.
#' PMID: 20880957.
runVSClustApp <- function() {
shiny::runApp(system.file("shiny/", package = "vsclust"))
|
35126831 |
}
|