Bugfix

neobernad authored on 19/06/2024 15:33:25
Showing 9 changed files

... ...
@@ -2,7 +2,7 @@ Package: evaluomeR
 Type: Package
 Title: Evaluation of Bioinformatics Metrics
 URL: https://github.com/neobernad/evaluomeR
-Version: 1.21.1
+Version: 1.21.2
 Authors@R: c(
     person("José Antonio", "Bernabé-Díaz", email = "[email protected]", role = c("aut", "cre")),
     person("Manuel", "Franco", email = "[email protected]", role = "aut"),
... ...
@@ -156,8 +156,19 @@ clusterbootWrapper <- function(data, B, bootmethod="boot",
 clusteringWrapper <- function(data, cbi, krange, seed, ...) {
   cbiHelperResult = helperGetCBI(cbi, krange, ...)

-  old.seed <- .Random.seed
-  on.exit( { .Random.seed <<- old.seed } )
+  if(exists(".Random.seed")){
+    # .Random.seed might not exist when launched as background job
+    # so only store and reset if it exists
+    old.seed <- .Random.seed
+  }
+
+  on.exit(
+    {
+      if(exists("old.seed")) {
+        .Random.seed <<- old.seed
+      }
+    }
+  )

   if (!is.null(seed)) set.seed(seed)

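Note: the guard above is needed because a fresh R session (for example a background job) has no .Random.seed in the global environment until the RNG is first used, so reading it unconditionally fails. A minimal standalone sketch of the same save/restore pattern, with a hypothetical helper name (illustrative only, not a package function):

# Save the caller's RNG state only if it exists; restore it on exit.
with_temporary_seed <- function(seed, expr) {
  if (exists(".Random.seed", envir = .GlobalEnv)) {
    old.seed <- get(".Random.seed", envir = .GlobalEnv)
    on.exit(assign(".Random.seed", old.seed, envir = .GlobalEnv))
  }
  set.seed(seed)
  eval(expr)
}
# The seeded call below leaves the caller's RNG state (if any) untouched.
with_temporary_seed(100, quote(runif(3)))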
... ...
@@ -702,7 +702,7 @@ standardizeStabilityData <- function(stabData, k.range=NULL) {
 #' @examples
 #' data("ontMetrics")
 #' annotated_clusters=annotateClustersByMetric(ontMetrics, k.range=c(2,3), bs=20, seed=100)
-#' View(annotated_clusters[['ANOnto']])
+#' annotated_clusters[['ANOnto']]
 annotateClustersByMetric <- function(df, k.range, bs, seed){
   if (is.null(seed)) {
     seed = pkg.env$seed
... ...
@@ -775,8 +775,8 @@ annotateClustersByMetric <- function(df, k.range, bs, seed){
 #'
 #' @examples
 #' data("ontMetrics")
-#' ranges = getMetricRangeByCluster(ontMetrics, k.range=c(2,3), bs=20, seed=100)
-#' View(ranges)
+#' #ranges = getMetricRangeByCluster(ontMetrics, k.range=c(2,3), bs=20, seed=100)
+
 getMetricRangeByCluster <- function(df, k.range, bs, seed) {
   if (is.null(seed)) {
     seed = pkg.env$seed
... ...
@@ -172,7 +172,8 @@ qualityRange <- function(data, k.range=c(3,5), cbi="kmeans", getImages=FALSE,
 #' @references
 #' \insertRef{kaufman2009finding}{evaluomeR}
 #'
-qualitySet <- function(data, k.set=c(2,4), cbi="kmeans", getImages=FALSE, seed=NULL, ...) {
+qualitySet <- function(data, k.set=c(2,4), cbi="kmeans", all_metrics=FALSE,
+                       getImages=FALSE, seed=NULL, ...) {

   k.set.length = length(k.set)
   if (k.set.length == 0) {
... ...
@@ -188,7 +189,8 @@ qualitySet <- function(data, k.set=c(2,4), cbi="kmeans", getImages=FALSE, seed=N
   data <- as.data.frame(assay(data))

   suppressWarnings(
-    runQualityIndicesSilhouette(data, bs = 1, seed=seed, cbi=cbi, k.set=k.set, ...))
+    runQualityIndicesSilhouette(data, bs = 1, seed=seed, cbi=cbi, all_metrics=all_metrics,
+                                k.set=k.set, ...))
   silhouetteData =  suppressWarnings(
     runSilhouetteTableRange(data, k.set=k.set))

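A usage sketch for the extended signature, mirroring the call added to the test script below (all_metrics is the newly introduced argument; FALSE is its default):

library(evaluomeR)
data("rnaMetrics")
# Same call as in the updated tests, with the new all_metrics argument spelled out.
dataFrame1 <- qualitySet(data=rnaMetrics, k.set=c(2,3,4), cbi="kmeans",
                         all_metrics = FALSE, getImages = FALSE)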
... ...
@@ -2,15 +2,15 @@ library(evaluomeR)

 data("rnaMetrics")

-dataFrame <- stability(data=rnaMetrics, k=4, bs=100, getImages = FALSE)
-dataFrame <- stabilityRange(data=rnaMetrics, k.range=c(2,4), bs=20, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, k=4, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stabilityRange(data=rnaMetrics, k.range=c(2,4), bs=20, all_metrics = FALSE, getImages = FALSE)
 assay(dataFrame)
 # Metric    Mean_stability_k_2  Mean_stability_k_3  Mean_stability_k_4
 # [1,] "RIN"     "0.825833333333333" "0.778412698412698" "0.69625"
 # [2,] "DegFact" "0.955595238095238" "0.977777777777778" "0.820833333333333"
-dataFrame <- stabilitySet(data=rnaMetrics, k.set=c(2,3,4), bs=20, getImages = FALSE)
+dataFrame <- stabilitySet(data=rnaMetrics, k.set=c(2,3,4), bs=20, all_metrics = FALSE, getImages = FALSE)

-dataFrame <- quality(data=rnaMetrics, cbi="kmeans", k=3, getImages = FALSE)
+dataFrame <- quality(data=rnaMetrics, cbi="kmeans", k=3, all_metrics = FALSE, getImages = FALSE)
 assay(dataFrame)
 # Metric    Cluster_1_SilScore  Cluster_2_SilScore  Cluster_3_SilScore
 # [1,] "RIN"     "0.420502645502646" "0.724044583696066" "0.68338517747747"
... ...
@@ -18,7 +18,7 @@ assay(dataFrame)
 # Avg_Silhouette_Width Cluster_1_Size Cluster_2_Size Cluster_3_Size
 # [1,] "0.627829396038413"  "4"            "4"            "8"
 # [2,] "0.737191191352892"  "8"            "5"            "3"
-dataFrame <- qualityRange(data=rnaMetrics, k.range=c(2,4), seed = 20, getImages = FALSE)
+dataFrame <- qualityRange(data=rnaMetrics, k.range=c(2,4), seed = 20, all_metrics = FALSE, getImages = FALSE)
 assay(getDataQualityRange(dataFrame, 2))
 # Metric    Cluster_1_SilScore  Cluster_2_SilScore  Avg_Silhouette_Width Cluster_1_Size
 # 1 "RIN"     "0.583166775069983" "0.619872562681118" "0.608402004052639"  "5"
... ...
@@ -36,24 +36,25 @@ assay(getDataQualityRange(dataFrame, 4))
 # Cluster_4_Size
 # 1 "5"
 # 2 "3"
-dataFrame1 <- qualitySet(data=rnaMetrics, k.set=c(2,3,4), getImages = FALSE)
+dataFrame1 <- qualitySet(data=rnaMetrics, k.set=c(2,3,4), all_metrics = FALSE, getImages = FALSE)


 dataFrame <- metricsCorrelations(data=rnaMetrics, getImages = FALSE, margins = c(4,4,11,10))
 assay(dataFrame, 1)


-dataFrame <- stability(data=rnaMetrics, cbi="kmeans", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="clara", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="clara_pam", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="hclust", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="pamk", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="pamk_pam", k=2, bs=100, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="kmeans", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="clara", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="clara_pam", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="hclust", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="pamk", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="pamk_pam", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="rskc", k=2, bs=100, all_metrics = TRUE, L1 = 2, alpha=0, getImages = FALSE)

 # Supported CBIs:
 evaluomeRSupportedCBI()

-dataFrame <- qualityRange(data=rnaMetrics, k.range=c(2,10), getImages = FALSE)
+dataFrame <- qualityRange(data=rnaMetrics, k.range=c(2,10), all_metrics = FALSE, getImages = FALSE)
 dataFrame

 #dataFrame <- stabilityRange(data=rnaMetrics, k.range=c(2,8), bs=20, getImages = FALSE)
... ...
@@ -8,7 +8,7 @@ cluster = plotMetricsCluster(ontMetrics, scale = TRUE)
 plotMetricsViolin(rnaMetrics)
 plotMetricsViolin(ontMetrics, 2)

-ntMetricsstabilityData <- stabilityRange(data=rnaMetrics, k.range=c(3,4), bs=20, getImages = FALSE, seed=100)
+stabilityData <- stabilityRange(data=rnaMetrics, k.range=c(3,4), bs=20, getImages = FALSE, seed=100)
 qualityData <- qualityRange(data=rnaMetrics, k.range=c(3,4), getImages = FALSE, seed=100)

 kOptTable <- getOptimalKValue(stabilityData, qualityData, k.range=c(3,4))
... ...
@@ -33,3 +33,4 @@ test = qualityRange(data=ontMetrics, k.range=c(3,3),

 # Shows how clusters are partitioned according to the individuals
 individuals_per_cluster(test$k_3)
+
... ...
@@ -1,7 +1,7 @@
 library(evaluomeR)
 library(RSKC)
 library(sparcl)
-
+seed = 100
 dataFrame <- quality(data=ontMetrics, cbi="kmeans", k=3)
 assay(dataFrame)
 # Metric     Cluster_1_SilScore  Cluster_2_SilScore  Cluster_3_SilScore  Avg_Silhouette_Width Cluster_1_Size Cluster_2_Size Cluster_3_Size
8 8
deleted file mode 100755
... ...
@@ -1,123 +0,0 @@
-library(evaluomeR)
-library(RSKC)
-library(sparcl)
-
-
-# The dataframe for the use case is 'ontMetrics', provided by our evaluomeR package.
-
-data("ontMetrics")
-df = as.data.frame(assay(ontMetrics))
-df["Description"] = NULL # Description column not relevant atm.
-head(df, 5)
-data("ontMetrics")
-
-# RSKC
-# Robust and Sparse K-Means clustering [[1]](#1) mainly requires selecting three parameters:
-# - **ncl**: Number of clusters *K*, to be determined by the *evaluomeR* optimal *K* algorithm.
-#   - **L<sub>1</sub>**: The tuning parameter for sparse clustering. It acts as the upper bound constraint for the vector of weights. 1 $<$ L<sub>1</sub> $\leq$ $\sqrt{num.variables}$.
-#   - **$\alpha$**: The trimming proportion [[4]](#4) used in the robust clustering.
-
-# Optimal K clusters value
-# Here, we make use of *evaluomeR* to find the optimal $k$ value. The algorithm used to calculate the optimal $k$ is outlined in [[7]](#7). We consider the $k$ range [3,15] for the analysis of the optimal $k$, avoiding $k=2$ to prevent binary classifications.
-seed=100
-k.range=c(3,15)
-stabilityData <- stabilityRange(data=ontMetrics, k.range=k.range, bs=20, getImages = FALSE, seed=seed)
-qualityData <- qualityRange(data=ontMetrics, k.range=k.range, getImages = FALSE, seed=seed)
-optK <- getOptimalKValue(stabilityData, qualityData, k.range=k.range)
-
-# Optimal $k$ values individually per input metric are:
-optK[c('Metric','Global_optimal_k')]
-
-k_values = as.numeric(unlist(optK['Global_optimal_k']))
-global_k_value = floor(mean(k_values))
-print(paste0("Taking global optimal K value: ", global_k_value))
-
-plotMetricsClusterComparison(ontMetrics, k.vector1=global_k_value)
-
-# Figuring out the L1 upper bound
-# In [[2]](#2) the authors describe the algorithm to select the tuning parameter L<sub>1</sub> for
-# the sparse K-means, which consists of independent permutations of the same source data matrix and the gap
-# statistic [[5]](#5). This algorithm for tuning the L<sub>1</sub> parameter and others described in [[2]](#2)
-# are implemented in the 'sparcl' R package [[6]](#6).
-
-dataMatrix = as.matrix(df)
-dataMatrix = scale(dataMatrix, TRUE, TRUE)
-head(dataMatrix, 5)
-
-# Considering that for the dataset the global optimal $k$ is $k=4$, we can now compute the
-# permutations to figure out the bound L<sub>1</sub> with the method 'KMeansSparseCluster.permute'
-# from 'sparcl' [[6]](#6).
-
-# Note: 1 $<$ L<sub>1</sub> $\leq$ $\sqrt{num.variables}$.
-
-wbounds = seq(2,sqrt(ncol(dataMatrix)), len=30)
-km.perm <- KMeansSparseCluster.permute(dataMatrix,K=global_k_value,wbounds=wbounds,nperms=5)
-print(km.perm)
-plot(km.perm)
-
-l1 = km.perm$bestw
-print(paste0("Best L1 upper bound is: ", l1))
-
-
-
-# Metrics relevancy
-rskc_out = RSKC(df["ANOnto"], global_k_value, 0.1, L1 = l1, nstart = 200,
-                silent=TRUE, scaling = FALSE, correlation = FALSE)
-cat(paste0("L1 value: ", l1,"\n"))
-cat(names(rskc_out$weights)[1], ": ", rskc_out$weights[1],"\n")
-cat(names(rskc_out$weights)[2], ": ", rskc_out$weights[2],"\n")
-cat(names(rskc_out$weights)[3], ": ", rskc_out$weights[3],"\n")
-cat("---\n")
-
-rskc_out
-
-# Trimmed cases:
-
-# oE: Indices of the cases trimmed in squared Euclidean distances.
-# oW: Indices of the cases trimmed in weighted squared Euclidean distances. If L1 = NULL,
-# then oW are the cases trimmed in the Euclidean distance, because all the features have the same weights, i.e., 1's.
-union_vector = c(rskc_out$oE,rskc_out$oW)
-union_vector_unique = unique(union_vector)
-union_vector_unique = sort(union_vector_unique)
-
-print(paste0("Trimmed cases from input dataframe: "))
-union_vector_unique
-
-options(scipen=10)
-
-columns = c('metric', 'weight')
-rskc_df = data.frame(matrix(ncol = length(columns), nrow = length(rskc_out$weights)))
-colnames(rskc_df) = columns
-rskc_df['metric'] = names(rskc_out$weights)
-rskc_df['weight'] = rskc_out$weights
-rskc_df
-
-# Relevancy table
-rskc_df_sorted = rskc_df[order(rskc_df$weight, decreasing = TRUE), ]
-rskc_df_sorted
-
-# References <a class="anchor" id="references"></a>
-
-#<a id="1">[1]</a>
-#  Kondo, Y., Salibian-Barrera, M., & Zamar, R. (2016). RSKC: An R Package for a Robust and Sparse K-Means Clustering Algorithm. Journal of Statistical Software, 72(5), 1–26. https://doi.org/10.18637/jss.v072.i05
-
-#<a id="2">[2]</a>
-#  Witten, D. M., & Tibshirani, R. (2010). A framework for feature selection in clustering. Journal of the American Statistical Association, 105(490), 713–726. https://doi.org/10.1198/jasa.2010.tm09415
-
-#<a id="3">[3]</a>
-#  Tibshirani, R., & Walther, G. (2005). Cluster Validation by Prediction Strength. Journal of Computational and Graphical Statistics, 14(3), 511-528. https://doi.org/10.1198/106186005X59243
-
-#<a id="4">[4]</a>
-#  Gordaliza, A. (1991). On the breakdown point of multivariate location estimators based on trimming procedures. Statistics & Probability Letters, 11(5), 387-394. https://doi.org/10.1016/0167-7152(91)90186-U
-
-#<a id="5">[5]</a>
-#  Tibshirani, R., Walther, G., & Hastie, T. (2001). Estimating the number of clusters in a data set via the gap statistic. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 63(2), 411-423. https://doi.org/10.1111/1467-9868.00293
-
-#<a id="6">[6]</a>
-#  Witten, D. M., & Tibshirani, R. (2010). sparcl: Perform Sparse Hierarchical Clustering and Sparse K-Means Clustering. R package. https://CRAN.R-project.org/package=sparcl
-
-#<a id="7">[7]</a>
-#  Bernabé-Díaz, J. A., Franco, M., Vivo, J.-M., Quesada-Martínez, M., & Fernández-Breis, J. T. (2022). An automated process for supporting decisions in clustering-based data analysis. Computer Methods and Programs in Biomedicine, 219, 106765. https://doi.org/10.1016/j.cmpb.2022.106765
-
-
-
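The robust/sparse K-means workflow explored in this removed script is now exercised through the package's CBI interface, as in the updated test script above. A minimal sketch, reusing the parameter values from that test (L1 is the sparsity upper bound, 1 < L1 <= sqrt(number of metrics); alpha is the trimming proportion):

library(evaluomeR)
data("rnaMetrics")
# "rskc" CBI with extra arguments passed through to the underlying clustering,
# matching the call added to the test script in this commit.
dataFrame <- stability(data=rnaMetrics, cbi="rskc", k=2, bs=100,
                       all_metrics = TRUE, L1 = 2, alpha = 0, getImages = FALSE)
assay(dataFrame)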