...
@@ -2,7 +2,7 @@ Package: evaluomeR
 Type: Package
 Title: Evaluation of Bioinformatics Metrics
 URL: https://github.com/neobernad/evaluomeR
-Version: 1.21.1
+Version: 1.21.2
 Authors@R: c(
     person("José Antonio", "Bernabé-Díaz", email = "[email protected]", role = c("aut", "cre")),
     person("Manuel", "Franco", email = "[email protected]", role = "aut"),
...
@@ -156,8 +156,19 @@ clusterbootWrapper <- function(data, B, bootmethod="boot",
 clusteringWrapper <- function(data, cbi, krange, seed, ...) {
   cbiHelperResult = helperGetCBI(cbi, krange, ...)

-  old.seed <- .Random.seed
-  on.exit( { .Random.seed <<- old.seed } )
+  if(exists(".Random.seed")){
+    # .Random.seed might not exist when launched as background job
+    # so only store and reset if it exists
+    old.seed <- .Random.seed
+  }
+
+  on.exit(
+    {
+      if(exists("old.seed")) {
+        .Random.seed <<- old.seed
+      }
+    }
+  )

   if (!is.null(seed)) set.seed(seed)

...
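For reference, a standalone sketch (not part of the patch) of the .Random.seed guard introduced above: the RNG state is saved and restored only when it already exists, since a fresh session or background job may not have one yet. The wrapper name and arguments are illustrative, not taken from evaluomeR.

# Preserve the caller's RNG state only when .Random.seed already exists.
reproducibleSample <- function(x, size, seed = 42) {
  if (exists(".Random.seed", envir = globalenv())) {
    old.seed <- get(".Random.seed", envir = globalenv())
    on.exit(assign(".Random.seed", old.seed, envir = globalenv()), add = TRUE)
  }
  set.seed(seed)
  sample(x, size)
}

reproducibleSample(1:10, 3)  # deterministic output; caller's RNG stream restored on exit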
@@ -702,7 +702,7 @@ standardizeStabilityData <- function(stabData, k.range=NULL) {
 #' @examples
 #' data("ontMetrics")
 #' annotated_clusters=annotateClustersByMetric(ontMetrics, k.range=c(2,3), bs=20, seed=100)
-#' View(annotated_clusters[['ANOnto']])
+#' annotated_clusters[['ANOnto']]
 annotateClustersByMetric <- function(df, k.range, bs, seed){
   if (is.null(seed)) {
     seed = pkg.env$seed
...
@@ -775,8 +775,8 @@ annotateClustersByMetric <- function(df, k.range, bs, seed){
 #'
 #' @examples
 #' data("ontMetrics")
-#' ranges = getMetricRangeByCluster(ontMetrics, k.range=c(2,3), bs=20, seed=100)
-#' View(ranges)
+#' #ranges = getMetricRangeByCluster(ontMetrics, k.range=c(2,3), bs=20, seed=100)
+
 getMetricRangeByCluster <- function(df, k.range, bs, seed) {
   if (is.null(seed)) {
     seed = pkg.env$seed
...
@@ -172,7 +172,8 @@ qualityRange <- function(data, k.range=c(3,5), cbi="kmeans", getImages=FALSE,
 #' @references
 #' \insertRef{kaufman2009finding}{evaluomeR}
 #'
-qualitySet <- function(data, k.set=c(2,4), cbi="kmeans", getImages=FALSE, seed=NULL, ...) {
+qualitySet <- function(data, k.set=c(2,4), cbi="kmeans", all_metrics=FALSE,
+                       getImages=FALSE, seed=NULL, ...) {

   k.set.length = length(k.set)
   if (k.set.length == 0) {
...
@@ -188,7 +189,8 @@ qualitySet <- function(data, k.set=c(2,4), cbi="kmeans", getImages=FALSE, seed=N
   data <- as.data.frame(assay(data))

   suppressWarnings(
-    runQualityIndicesSilhouette(data, bs = 1, seed=seed, cbi=cbi, k.set=k.set, ...))
+    runQualityIndicesSilhouette(data, bs = 1, seed=seed, cbi=cbi, all_metrics=all_metrics,
+                                k.set=k.set, ...))
   silhouetteData = suppressWarnings(
     runSilhouetteTableRange(data, k.set=k.set))

...
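A minimal call exercising the new all_metrics argument of qualitySet (its default is FALSE, as in the signature above). This is an illustrative sketch, not part of the patch; it mirrors the updated test script further down.

library(evaluomeR)
data("rnaMetrics")
# Per-metric behaviour with the argument left at its default value
qs <- qualitySet(data = rnaMetrics, k.set = c(2, 3, 4), cbi = "kmeans",
                 all_metrics = FALSE, getImages = FALSE, seed = 100)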
...
@@ -2,15 +2,15 @@ library(evaluomeR)

 data("rnaMetrics")

-dataFrame <- stability(data=rnaMetrics, k=4, bs=100, getImages = FALSE)
-dataFrame <- stabilityRange(data=rnaMetrics, k.range=c(2,4), bs=20, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, k=4, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stabilityRange(data=rnaMetrics, k.range=c(2,4), bs=20, all_metrics = FALSE, getImages = FALSE)
 assay(dataFrame)
 # Metric Mean_stability_k_2 Mean_stability_k_3 Mean_stability_k_4
 # [1,] "RIN" "0.825833333333333" "0.778412698412698" "0.69625"
 # [2,] "DegFact" "0.955595238095238" "0.977777777777778" "0.820833333333333"
-dataFrame <- stabilitySet(data=rnaMetrics, k.set=c(2,3,4), bs=20, getImages = FALSE)
+dataFrame <- stabilitySet(data=rnaMetrics, k.set=c(2,3,4), bs=20, all_metrics = FALSE, getImages = FALSE)

-dataFrame <- quality(data=rnaMetrics, cbi="kmeans", k=3, getImages = FALSE)
+dataFrame <- quality(data=rnaMetrics, cbi="kmeans", k=3, all_metrics = FALSE, getImages = FALSE)
 assay(dataFrame)
 # Metric Cluster_1_SilScore Cluster_2_SilScore Cluster_3_SilScore
 # [1,] "RIN" "0.420502645502646" "0.724044583696066" "0.68338517747747"
...
@@ -18,7 +18,7 @@ assay(dataFrame)
 # Avg_Silhouette_Width Cluster_1_Size Cluster_2_Size Cluster_3_Size
 # [1,] "0.627829396038413" "4" "4" "8"
 # [2,] "0.737191191352892" "8" "5" "3"
-dataFrame <- qualityRange(data=rnaMetrics, k.range=c(2,4), seed = 20, getImages = FALSE)
+dataFrame <- qualityRange(data=rnaMetrics, k.range=c(2,4), seed = 20, all_metrics = FALSE, getImages = FALSE)
 assay(getDataQualityRange(dataFrame, 2))
 # Metric Cluster_1_SilScore Cluster_2_SilScore Avg_Silhouette_Width Cluster_1_Size
 # 1 "RIN" "0.583166775069983" "0.619872562681118" "0.608402004052639" "5"
...
@@ -36,24 +36,25 @@ assay(getDataQualityRange(dataFrame, 4))
 # Cluster_4_Size
 # 1 "5"
 # 2 "3"
-dataFrame1 <- qualitySet(data=rnaMetrics, k.set=c(2,3,4), getImages = FALSE)
+dataFrame1 <- qualitySet(data=rnaMetrics, k.set=c(2,3,4), all_metrics = FALSE, getImages = FALSE)


 dataFrame <- metricsCorrelations(data=rnaMetrics, getImages = FALSE, margins = c(4,4,11,10))
 assay(dataFrame, 1)


-dataFrame <- stability(data=rnaMetrics, cbi="kmeans", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="clara", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="clara_pam", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="hclust", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="pamk", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="pamk_pam", k=2, bs=100, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="kmeans", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="clara", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="clara_pam", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="hclust", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="pamk", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="pamk_pam", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="rskc", k=2, bs=100, all_metrics = TRUE, L1 = 2, alpha=0, getImages = FALSE)

 # Supported CBIs:
 evaluomeRSupportedCBI()

-dataFrame <- qualityRange(data=rnaMetrics, k.range=c(2,10), getImages = FALSE)
+dataFrame <- qualityRange(data=rnaMetrics, k.range=c(2,10), all_metrics = FALSE, getImages = FALSE)
 dataFrame

 #dataFrame <- stabilityRange(data=rnaMetrics, k.range=c(2,8), bs=20, getImages = FALSE)
...
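The new "rskc" CBI above is called with L1 and alpha, which presumably reach RSKC::RSKC through the "..." argument. For context, a hedged sketch of the equivalent direct call, adapted from the use-case script removed at the end of this patch; the parameter values are illustrative only.

library(evaluomeR)
library(RSKC)

data("ontMetrics")
df <- as.data.frame(assay(ontMetrics))
df["Description"] <- NULL  # non-numeric column, not used for clustering

# ncl = number of clusters, alpha = trimming proportion, L1 = sparsity upper bound
rskc_out <- RSKC(df, ncl = 2, alpha = 0, L1 = 2, nstart = 200,
                 silent = TRUE, scaling = FALSE, correlation = FALSE)
rskc_out$weights  # per-metric weights; a larger weight suggests a more relevant metric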
@@ -8,7 +8,7 @@ cluster = plotMetricsCluster(ontMetrics, scale = TRUE)
 plotMetricsViolin(rnaMetrics)
 plotMetricsViolin(ontMetrics, 2)

-ntMetricsstabilityData <- stabilityRange(data=rnaMetrics, k.range=c(3,4), bs=20, getImages = FALSE, seed=100)
+stabilityData <- stabilityRange(data=rnaMetrics, k.range=c(3,4), bs=20, getImages = FALSE, seed=100)
 qualityData <- qualityRange(data=rnaMetrics, k.range=c(3,4), getImages = FALSE, seed=100)

 kOptTable <- getOptimalKValue(stabilityData, qualityData, k.range=c(3,4))
...
@@ -1,7 +1,7 @@
 library(evaluomeR)
 library(RSKC)
 library(sparcl)
-
+seed = 100
 dataFrame <- quality(data=ontMetrics, cbi="kmeans", k=3)
 assay(dataFrame)
 # Metric Cluster_1_SilScore Cluster_2_SilScore Cluster_3_SilScore Avg_Silhouette_Width Cluster_1_Size Cluster_2_Size Cluster_3_Size

deleted file mode 100755
@@ -1,123 +0,0 @@
-library(evaluomeR)
-library(RSKC)
-library(sparcl)
-
-
-# Dataframe for the use case is 'ontMetrics' provided by our evaluomeR package.
-
-data("ontMetrics")
-df = as.data.frame(assay(ontMetrics))
-df["Description"] = NULL # Description column not relevant atm.
-head(df, 5)
-data("ontMetrics")
-
-# RSKC
-# Robust and Sparse K-Means clustering [[1]](#1) requires to select mainly three parameters:
-# - **nlc**: Number of *K* cluster. It is to be determined by *evaluomeR* optimal *K* algorithm.
-# - **L<sub>1</sub>**: The tuning parameter for sparce clustering. It acts as the upper bound restraint for the vector of weights. 1 $<$ L<sub>1</sub> $\leq$ $\sqrt{num.variables}$.
-# - **$\alpha$**: The trimming portion [[4]](#4) used in the robust clustering.
-
-# Optimal K clusters value
-# Here, we make use of *evaluomeR* to figure out the optimal $k$ value. The algorithm on how the optimal is calculated is outlined in [[7]](#7). We consider the $k$ range [3,15] for the analysis of the optimal $k$, avoiding $k=2$ to prevent from having binary classifications.
-seed=100
-k.range=c(3,15)
-stabilityData <- stabilityRange(data=ontMetrics, k.range=k.range, bs=20, getImages = FALSE, seed=seed)
-qualityData <- qualityRange(data=ontMetrics, k.range=k.range, getImages = FALSE, seed=seed)
-optK <- getOptimalKValue(stabilityData, qualityData, k.range=k.range)
-
-# Optimal $k$ values individually per input metric are:
-optK[c('Metric','Global_optimal_k')]
-
-k_values = as.numeric(unlist(optK['Global_optimal_k']))
-global_k_value = floor(mean(k_values))
-print(paste0("Taking global optimal K value: ", global_k_value))
-
-plotMetricsClusterComparison(ontMetrics, k.vector1=global_k_value)
-
-# Figuring out the L1 upper boundry
-# In [[2]](#2) authors provide description of the algorithm to select the tunning parameter L<sub>1</sub> for
-# the sparse K-means, which consist of independent permutations from the same source data matrix and the gap
-# statistic [[5]](#5). This algorithm for tuning the L<sub>1</sub> parameter and others described in [[2]](#2)
-# are presented in 'sparcl' R package [[6]](#6).
-
-dataMatrix = as.matrix(df)
-dataMatrix = scale(dataMatrix, TRUE, TRUE)
-head(dataMatrix, 5)
-
-# Considering that for the dataset the global optimal $k$ is $k=4$, we can now compute the
-# permutations to figure out the boundry L<sub>1</sub> with the method 'KMeansSparseCluster.permute'
-# from 'sparcl' [[6]](#6).
-
-# Note: 1 $<$ L<sub>1</sub> $\leq$ $\sqrt{num.variables}$.
-
-wbounds = seq(2,sqrt(ncol(dataMatrix)), len=30)
-km.perm <- KMeansSparseCluster.permute(dataMatrix,K=global_k_value,wbounds=wbounds,nperms=5)
-print(km.perm)
-plot(km.perm)
-
-l1 = km.perm$bestw
-print(paste0("Best L1 upper bound is: ", l1))
-
-
-
-# Metrics relevancy
-rskc_out = RSKC(df["ANOnto"], global_k_value, 0.1, L1 = l1, nstart = 200,
-                silent=TRUE, scaling = FALSE, correlation = FALSE)
-cat(paste0("L1 value: ", l1,"\n"))
-cat(names(rskc_out$weights)[1], ": ", rskc_out$weights[1],"\n")
-cat(names(rskc_out$weights)[2], ": ", rskc_out$weights[2],"\n")
-cat(names(rskc_out$weights)[3], ": ", rskc_out$weights[3],"\n")
-cat("---\n")
-
-rskc_out
-
-# Trimmed cases:
-
-# oE: Indices of the cases trimmed in squared Euclidean distances.
-# oW: Indices of the cases trimmed in weighted squared Euclidean distances. If L1 =NULL,
-# then oW are the cases trimmed in the Euclidean distance, because all the features have the same weights, i.e., 1's.
-union_vector = c(rskc_out$oE,rskc_out$oW)
-union_vector_unique = unique(union_vector)
-union_vector_unique = sort(union_vector_unique)
-
-print(paste0("Trimmed cases from input dataframe: "))
-union_vector_unique
-
-options(scipen=10)
-
-columns = c('metric', 'weight')
-rskc_df = data.frame(matrix(ncol = length(columns), nrow = length(rskc_out$weights)))
-colnames(rskc_df) = columns
-rskc_df['metric'] = names(rskc_out$weights)
-rskc_df['weight'] = rskc_out$weights
-rskc_df
-
-# Relevancy table
-rskc_df_sorted = rskc_df[order(rskc_df$weight, decreasing = TRUE), ]
-rskc_df_sorted
-
-# References <a class="anchor" id="references"></a>
-
-#<a id="1">[1]</a>
-# Kondo, Y., Salibian-Barrera, M., & Zamar, R. (2016). RSKC: An R Package for a Robust and Sparse K-Means Clustering Algorithm. Journal of Statistical Software, 72(5), 1-26. https://doi.org/10.18637/jss.v072.i05
-
-#<a id="2">[2]</a>
-# Witten, D. M., & Tibshirani, R. (2010). A framework for feature selection in clustering. Journal of the American Statistical Association, 105(490), 713-726. https://doi.org/10.1198/jasa.2010.tm09415
-
-#<a id="3">[3]</a>
-# Tibshirani, R., & Walther, G. (2005). Cluster Validation by Prediction Strength. Journal of Computational and Graphical Statistics, 14(3), 511-528. https://doi.org/10.1198/106186005X59243
-
-#<a id="4">[4]</a>
-# Gordaliza, A. (1991). On the breakdown point of multivariate location estimators based on trimming procedures. Statistics & Probability Letters, 11(5), 387-394. https://doi.org/10.1016/0167-7152(91)90186-U
-
-#<a id="5">[5]</a>
-# Tibshirani, R., Walther, G., & Hastie, T. (2001). Estimating the number of clusters in a data set via the gap statistic. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 63(2), 411-423. https://doi.org/10.1111/1467-9868.00293
-
-#<a id="6">[6]</a>
-# Witten, D. M., & Tibshirani, R. (2010). sparcl: Perform Sparse Hierarchical Clustering and Sparse K-Means Clustering. R package. https://CRAN.R-project.org/package=sparcl
-
-#<a id="7">[7]</a>
-# Bernabé-Díaz, J. A., Franco, M., Vivo, J.-M., Quesada-Martínez, M., & Fernández-Breis, J. T. (2022). An automated process for supporting decisions in clustering-based data analysis. Computer Methods and Programs in Biomedicine, 219, 106765. https://doi.org/10.1016/j.cmpb.2022.106765
-
-
-