...
@@ -2,7 +2,7 @@ Package: evaluomeR
 Type: Package
 Title: Evaluation of Bioinformatics Metrics
 URL: https://github.com/neobernad/evaluomeR
-Version: 1.21.1
+Version: 1.21.2
 Authors@R: c(
     person("José Antonio", "Bernabé-Díaz", email = "[email protected]", role = c("aut", "cre")),
     person("Manuel", "Franco", email = "[email protected]", role = "aut"),
...
@@ -156,8 +156,19 @@ clusterbootWrapper <- function(data, B, bootmethod="boot",
 clusteringWrapper <- function(data, cbi, krange, seed, ...) {
   cbiHelperResult = helperGetCBI(cbi, krange, ...)

-  old.seed <- .Random.seed
-  on.exit( { .Random.seed <<- old.seed } )
+  if(exists(".Random.seed")){
+    # .Random.seed might not exist when launched as background job
+    # so only store and reset if it exists
+    old.seed <- .Random.seed
+  }
+
+  on.exit(
+    {
+      if(exists("old.seed")) {
+        .Random.seed <<- old.seed
+      }
+    }
+  )

   if (!is.null(seed)) set.seed(seed)

...
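For reference, a standalone sketch (not part of the patch) of the .Random.seed guard introduced above: the RNG state is saved and restored only when it already exists, since a fresh session or background job may not have one yet. The wrapper name and arguments are illustrative, not taken from evaluomeR.

# Preserve the caller's RNG state only when .Random.seed already exists.
reproducibleSample <- function(x, size, seed = 42) {
  if (exists(".Random.seed", envir = globalenv())) {
    old.seed <- get(".Random.seed", envir = globalenv())
    on.exit(assign(".Random.seed", old.seed, envir = globalenv()), add = TRUE)
  }
  set.seed(seed)
  sample(x, size)
}

reproducibleSample(1:10, 3)  # deterministic output; caller's RNG stream restored on exit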
@@ -702,7 +702,7 @@ standardizeStabilityData <- function(stabData, k.range=NULL) {
 #' @examples
 #' data("ontMetrics")
 #' annotated_clusters=annotateClustersByMetric(ontMetrics, k.range=c(2,3), bs=20, seed=100)
-#' View(annotated_clusters[['ANOnto']])
+#' annotated_clusters[['ANOnto']]
 annotateClustersByMetric <- function(df, k.range, bs, seed){
   if (is.null(seed)) {
     seed = pkg.env$seed
...
@@ -775,8 +775,8 @@ annotateClustersByMetric <- function(df, k.range, bs, seed){
 #'
 #' @examples
 #' data("ontMetrics")
-#' ranges = getMetricRangeByCluster(ontMetrics, k.range=c(2,3), bs=20, seed=100)
-#' View(ranges)
+#' #ranges = getMetricRangeByCluster(ontMetrics, k.range=c(2,3), bs=20, seed=100)
+
 getMetricRangeByCluster <- function(df, k.range, bs, seed) {
   if (is.null(seed)) {
     seed = pkg.env$seed
...
@@ -172,7 +172,8 @@ qualityRange <- function(data, k.range=c(3,5), cbi="kmeans", getImages=FALSE,
 #' @references
 #' \insertRef{kaufman2009finding}{evaluomeR}
 #'
-qualitySet <- function(data, k.set=c(2,4), cbi="kmeans", getImages=FALSE, seed=NULL, ...) {
+qualitySet <- function(data, k.set=c(2,4), cbi="kmeans", all_metrics=FALSE,
+                       getImages=FALSE, seed=NULL, ...) {

   k.set.length = length(k.set)
   if (k.set.length == 0) {
...
@@ -188,7 +189,8 @@ qualitySet <- function(data, k.set=c(2,4), cbi="kmeans", getImages=FALSE, seed=N
   data <- as.data.frame(assay(data))

   suppressWarnings(
-    runQualityIndicesSilhouette(data, bs = 1, seed=seed, cbi=cbi, k.set=k.set, ...))
+    runQualityIndicesSilhouette(data, bs = 1, seed=seed, cbi=cbi, all_metrics=all_metrics,
+                                k.set=k.set, ...))
   silhouetteData = suppressWarnings(
     runSilhouetteTableRange(data, k.set=k.set))

...
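A minimal call exercising the new all_metrics argument of qualitySet (its default is FALSE, as in the signature above). This is an illustrative sketch, not part of the patch; it mirrors the updated test script further down.

library(evaluomeR)
data("rnaMetrics")
# Per-metric behaviour with the argument left at its default value
qs <- qualitySet(data = rnaMetrics, k.set = c(2, 3, 4), cbi = "kmeans",
                 all_metrics = FALSE, getImages = FALSE, seed = 100)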
...
@@ -2,15 +2,15 @@ library(evaluomeR)

 data("rnaMetrics")

-dataFrame <- stability(data=rnaMetrics, k=4, bs=100, getImages = FALSE)
-dataFrame <- stabilityRange(data=rnaMetrics, k.range=c(2,4), bs=20, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, k=4, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stabilityRange(data=rnaMetrics, k.range=c(2,4), bs=20, all_metrics = FALSE, getImages = FALSE)
 assay(dataFrame)
 # Metric Mean_stability_k_2 Mean_stability_k_3 Mean_stability_k_4
 # [1,] "RIN" "0.825833333333333" "0.778412698412698" "0.69625"
 # [2,] "DegFact" "0.955595238095238" "0.977777777777778" "0.820833333333333"
-dataFrame <- stabilitySet(data=rnaMetrics, k.set=c(2,3,4), bs=20, getImages = FALSE)
+dataFrame <- stabilitySet(data=rnaMetrics, k.set=c(2,3,4), bs=20, all_metrics = FALSE, getImages = FALSE)

-dataFrame <- quality(data=rnaMetrics, cbi="kmeans", k=3, getImages = FALSE)
+dataFrame <- quality(data=rnaMetrics, cbi="kmeans", k=3, all_metrics = FALSE, getImages = FALSE)
 assay(dataFrame)
 # Metric Cluster_1_SilScore Cluster_2_SilScore Cluster_3_SilScore
 # [1,] "RIN" "0.420502645502646" "0.724044583696066" "0.68338517747747"
...
@@ -18,7 +18,7 @@ assay(dataFrame)
 # Avg_Silhouette_Width Cluster_1_Size Cluster_2_Size Cluster_3_Size
 # [1,] "0.627829396038413" "4" "4" "8"
 # [2,] "0.737191191352892" "8" "5" "3"
-dataFrame <- qualityRange(data=rnaMetrics, k.range=c(2,4), seed = 20, getImages = FALSE)
+dataFrame <- qualityRange(data=rnaMetrics, k.range=c(2,4), seed = 20, all_metrics = FALSE, getImages = FALSE)
 assay(getDataQualityRange(dataFrame, 2))
 # Metric Cluster_1_SilScore Cluster_2_SilScore Avg_Silhouette_Width Cluster_1_Size
 # 1 "RIN" "0.583166775069983" "0.619872562681118" "0.608402004052639" "5"
...
@@ -36,24 +36,25 @@ assay(getDataQualityRange(dataFrame, 4))
 # Cluster_4_Size
 # 1 "5"
 # 2 "3"
-dataFrame1 <- qualitySet(data=rnaMetrics, k.set=c(2,3,4), getImages = FALSE)
+dataFrame1 <- qualitySet(data=rnaMetrics, k.set=c(2,3,4), all_metrics = FALSE, getImages = FALSE)


 dataFrame <- metricsCorrelations(data=rnaMetrics, getImages = FALSE, margins = c(4,4,11,10))
 assay(dataFrame, 1)


-dataFrame <- stability(data=rnaMetrics, cbi="kmeans", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="clara", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="clara_pam", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="hclust", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="pamk", k=2, bs=100, getImages = FALSE)
-dataFrame <- stability(data=rnaMetrics, cbi="pamk_pam", k=2, bs=100, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="kmeans", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="clara", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="clara_pam", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="hclust", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="pamk", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="pamk_pam", k=2, bs=100, all_metrics = FALSE, getImages = FALSE)
+dataFrame <- stability(data=rnaMetrics, cbi="rskc", k=2, bs=100, all_metrics = TRUE, L1 = 2, alpha=0, getImages = FALSE)

 # Supported CBIs:
 evaluomeRSupportedCBI()

-dataFrame <- qualityRange(data=rnaMetrics, k.range=c(2,10), getImages = FALSE)
+dataFrame <- qualityRange(data=rnaMetrics, k.range=c(2,10), all_metrics = FALSE, getImages = FALSE)
 dataFrame

 #dataFrame <- stabilityRange(data=rnaMetrics, k.range=c(2,8), bs=20, getImages = FALSE)
...
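The new "rskc" CBI above is called with L1 and alpha, which presumably reach RSKC::RSKC through the "..." argument. For context, a hedged sketch of the equivalent direct call, adapted from the use-case script removed at the end of this patch; the parameter values are illustrative only.

library(evaluomeR)
library(RSKC)

data("ontMetrics")
df <- as.data.frame(assay(ontMetrics))
df["Description"] <- NULL  # non-numeric column, not used for clustering

# ncl = number of clusters, alpha = trimming proportion, L1 = sparsity upper bound
rskc_out <- RSKC(df, ncl = 2, alpha = 0, L1 = 2, nstart = 200,
                 silent = TRUE, scaling = FALSE, correlation = FALSE)
rskc_out$weights  # per-metric weights; a larger weight suggests a more relevant metric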
@@ -8,7 +8,7 @@ cluster = plotMetricsCluster(ontMetrics, scale = TRUE)
 plotMetricsViolin(rnaMetrics)
 plotMetricsViolin(ontMetrics, 2)

-ntMetricsstabilityData <- stabilityRange(data=rnaMetrics, k.range=c(3,4), bs=20, getImages = FALSE, seed=100)
+stabilityData <- stabilityRange(data=rnaMetrics, k.range=c(3,4), bs=20, getImages = FALSE, seed=100)
 qualityData <- qualityRange(data=rnaMetrics, k.range=c(3,4), getImages = FALSE, seed=100)

 kOptTable <- getOptimalKValue(stabilityData, qualityData, k.range=c(3,4))
...
@@ -1,7 +1,7 @@
 library(evaluomeR)
 library(RSKC)
 library(sparcl)
-
+seed = 100
 dataFrame <- quality(data=ontMetrics, cbi="kmeans", k=3)
 assay(dataFrame)
 # Metric Cluster_1_SilScore Cluster_2_SilScore Cluster_3_SilScore Avg_Silhouette_Width Cluster_1_Size Cluster_2_Size Cluster_3_Size

deleted file mode 100755
@@ -1,123 +0,0 @@
-library(evaluomeR)
-library(RSKC)
-library(sparcl)
-
-
-# Dataframe for the use case is 'ontMetrics' provided by our evaluomeR package.
-
-data("ontMetrics")
-df = as.data.frame(assay(ontMetrics))
-df["Description"] = NULL # Description column not relevant atm.
-head(df, 5)
-data("ontMetrics")
-
-# RSKC
-# Robust and Sparse K-Means clustering [[1]](#1) requires to select mainly three parameters:
-# - **nlc**: Number of *K* cluster. It is to be determined by *evaluomeR* optimal *K* algorithm.
-# - **L<sub>1</sub>**: The tuning parameter for sparce clustering. It acts as the upper bound restraint for the vector of weights. 1 $<$ L<sub>1</sub> $\leq$ $\sqrt{num.variables}$.
-# - **$\alpha$**: The trimming portion [[4]](#4) used in the robust clustering.
-
-# Optimal K clusters value
-# Here, we make use of *evaluomeR* to figure out the optimal $k$ value. The algorithm on how the optimal is calculated is outlined in [[7]](#7). We consider the $k$ range [3,15] for the analysis of the optimal $k$, avoiding $k=2$ to prevent from having binary classifications.
-seed=100
-k.range=c(3,15)
-stabilityData <- stabilityRange(data=ontMetrics, k.range=k.range, bs=20, getImages = FALSE, seed=seed)
-qualityData <- qualityRange(data=ontMetrics, k.range=k.range, getImages = FALSE, seed=seed)
-optK <- getOptimalKValue(stabilityData, qualityData, k.range=k.range)
-
-# Optimal $k$ values individually per input metric are:
-optK[c('Metric','Global_optimal_k')]
-
-k_values = as.numeric(unlist(optK['Global_optimal_k']))
-global_k_value = floor(mean(k_values))
-print(paste0("Taking global optimal K value: ", global_k_value))
-
-plotMetricsClusterComparison(ontMetrics, k.vector1=global_k_value)
-
-# Figuring out the L1 upper boundry
-# In [[2]](#2) authors provide description of the algorithm to select the tunning parameter L<sub>1</sub> for
-# the sparse K-means, which consist of independent permutations from the same source data matrix and the gap
-# statistic [[5]](#5). This algorithm for tuning the L<sub>1</sub> parameter and others described in [[2]](#2)
-# are presented in 'sparcl' R package [[6]](#6).
-
-dataMatrix = as.matrix(df)
-dataMatrix = scale(dataMatrix, TRUE, TRUE)
-head(dataMatrix, 5)
-
-# Considering that for the dataset the global optimal $k$ is $k=4$, we can now compute the
-# permutations to figure out the boundry L<sub>1</sub> with the method 'KMeansSparseCluster.permute'
-# from 'sparcl' [[6]](#6).
-
-# Note: 1 $<$ L<sub>1</sub> $\leq$ $\sqrt{num.variables}$.
-
-wbounds = seq(2,sqrt(ncol(dataMatrix)), len=30)
-km.perm <- KMeansSparseCluster.permute(dataMatrix,K=global_k_value,wbounds=wbounds,nperms=5)
-print(km.perm)
-plot(km.perm)
-
-l1 = km.perm$bestw
-print(paste0("Best L1 upper bound is: ", l1))
-
-
-
-# Metrics relevancy
-rskc_out = RSKC(df["ANOnto"], global_k_value, 0.1, L1 = l1, nstart = 200,
-                silent=TRUE, scaling = FALSE, correlation = FALSE)
-cat(paste0("L1 value: ", l1,"\n"))
-cat(names(rskc_out$weights)[1], ": ", rskc_out$weights[1],"\n")
-cat(names(rskc_out$weights)[2], ": ", rskc_out$weights[2],"\n")
-cat(names(rskc_out$weights)[3], ": ", rskc_out$weights[3],"\n")
-cat("---\n")
-
-rskc_out
-
-# Trimmed cases:
-
-# oE: Indices of the cases trimmed in squared Euclidean distances.
-# oW: Indices of the cases trimmed in weighted squared Euclidean distances. If L1 =NULL,
-# then oW are the cases trimmed in the Euclidean distance, because all the features have the same weights, i.e., 1's.
-union_vector = c(rskc_out$oE,rskc_out$oW)
-union_vector_unique = unique(union_vector)
-union_vector_unique = sort(union_vector_unique)
-
-print(paste0("Trimmed cases from input dataframe: "))
-union_vector_unique
-
-options(scipen=10)
-
-columns = c('metric', 'weight')
-rskc_df = data.frame(matrix(ncol = length(columns), nrow = length(rskc_out$weights)))
-colnames(rskc_df) = columns
-rskc_df['metric'] = names(rskc_out$weights)
-rskc_df['weight'] = rskc_out$weights
-rskc_df
-
-# Relevancy table
-rskc_df_sorted = rskc_df[order(rskc_df$weight, decreasing = TRUE), ]
-rskc_df_sorted
-
-# References <a class="anchor" id="references"></a>
-
-#<a id="1">[1]</a>
-# Kondo, Y., Salibian-Barrera, M., & Zamar, R. (2016). RSKC: An R Package for a Robust and Sparse K-Means Clustering Algorithm. Journal of Statistical Software, 72(5), 1-26. https://doi.org/10.18637/jss.v072.i05
-
-#<a id="2">[2]</a>
-# Witten, D. M., & Tibshirani, R. (2010). A framework for feature selection in clustering. Journal of the American Statistical Association, 105(490), 713-726. https://doi.org/10.1198/jasa.2010.tm09415
-
-#<a id="3">[3]</a>
-# Tibshirani, R., & Walther, G. (2005). Cluster Validation by Prediction Strength. Journal of Computational and Graphical Statistics, 14(3), 511-528. https://doi.org/10.1198/106186005X59243
-
-#<a id="4">[4]</a>
-# Gordaliza, A. (1991). On the breakdown point of multivariate location estimators based on trimming procedures. Statistics & Probability Letters, 11(5), 387-394. https://doi.org/10.1016/0167-7152(91)90186-U
-
-#<a id="5">[5]</a>
-# Tibshirani, R., Walther, G., & Hastie, T. (2001). Estimating the number of clusters in a data set via the gap statistic. Journal of the Royal Statistical Society: Series B (Statistical Methodology), 63(2), 411-423. https://doi.org/10.1111/1467-9868.00293
-
-#<a id="6">[6]</a>
-# Witten, D. M., & Tibshirani, R. (2010). sparcl: Perform Sparse Hierarchical Clustering and Sparse K-Means Clustering. R package. https://CRAN.R-project.org/package=sparcl
-
-#<a id="7">[7]</a>
-# Bernabé-Díaz, J. A., Franco, M., Vivo, J.-M., Quesada-Martínez, M., & Fernández-Breis, J. T. (2022). An automated process for supporting decisions in clustering-based data analysis. Computer Methods and Programs in Biomedicine, 219, 106765. https://doi.org/10.1016/j.cmpb.2022.106765
-
-
-