#' Estimating number of clusters through internal exhaustive ensemble majority #' voting #' #' @param data A dataframe, where columns are features and rows are data points #' @param min.k Minimum number of clusters for which we calculate stabilities #' @param max.k Maximum number of clusters for which we calculate stabilities #' @param algorithm The clustering algorithm to use for the multiple clustering #' runs to be measured #' #' @return An object of class "clusterVoting" containing a matrix with metric #' scores for every k and internal index, cluster memberships for every k, a #' dataframe with the k votes for every index, k vote frequencies and the #' frequency barplot of the k votes #' #' @export #' #' @examples #' clusterVoting(toy_genes, 4,14,"sc") #' #' @importFrom diceR prepare_data #' @import ggplot2 clusterVoting <- function(data ,min.k ,max.k, algorithm) { data <- as.matrix(data) # data <- prepare_data(data) # Metrics' sizes k_array <- sprintf("k%s",min.k:max.k) k_length <- length(k_array) # Initializing metric scores table scores <- matrix(, nrow = 12, ncol = k_length) colnames(scores) <- k_array rownames(scores) <- c("calinski_harabasz","dunn","silhouette","davies_bouldin","tau", "gamma","g_plus", "c_index", "s_dbw","mcclain_rao","Connectivity", "Compactness") clusters <- matrix(, nrow = dim(data)[1], ncol = k_length) colnames(clusters) <- k_array rownames(clusters) <- rownames(data) counter <- 1 for(current_k in min.k:max.k) { if(algorithm == "sc") { cl <- kernlab::specc(data, centers=current_k, kernel = "rbfdot") cls <- [email protected] } else if(algorithm == "hc") { dist_mat <- stats::dist(data, method = "euclidean") cl <- stats::hclust(dist_mat, method = "average") cls <- stats::cutree(cl, k = current_k) } else if(algorithm == "km") { cl <- stats::kmeans(data, current_k, algorithm = "Hartigan-Wong") cls <- cl$cluster } # New code ch_score <- calinhara(data,cls,cn=2) # fpc: calinski_harabasz #dunn_score <- dunn(clusters = 2, Data = data, method = "euclidean") # clValid: dunn dunn_score <- generalised_dunn_index(data, cls, 1, 1) # clValid: dunn silhouette_score <- silhouette_index(data, cls) # genieclust: silhouette davies_bouldin <- -negated_davies_bouldin_index(data, cls) # genieclust: davies_bouldin tgg <- Index.sPlussMoins(cls, data) # native: tau, gamma, g_plus tau <- tgg$tau gamma <- tgg$gamma g_plus <- tgg$gplus c_index <- Indice.cindex(data,cls) # native: c_index s_dbw <- Index.SDbw(data, cls) mcclain_rao <- Index.15and28(cls,cls,data)$mcclain#mcclain_rao con <- clValid::connectivity(clusters = cls, Data = data) comp <- diceR::compactness(data, cls) criteria <- c(ch_score=ch_score,dunn_score=dunn_score,silhouette_score, davies_bouldin=davies_bouldin,tau=tau,gamma=gamma,g_plus=g_plus, c_index=c_index,s_dbw=s_dbw,mcclain_rao=mcclain_rao, connectivity=con, compactness=comp) criteria <- array(as.numeric(unlist(criteria))) scores[, counter] <- criteria clusters[, counter] <- cls counter <- counter + 1 } scores[is.na(scores)] <- 0 votes <- data.frame(result=integer(), metric=character(), stringsAsFactors=FALSE) b.counter <- 1 # Determine which k has the best score for(metric in seq_len(nrow(scores))) { if(metric %in% c(1,2,3,5,6,10,11)) { metric.res <- which(scores[metric,]==max(scores[metric,])) } else { metric.res <- which(scores[metric,]==min(scores[metric,])) } current.metric <- row.names(scores)[metric] for(res in metric.res) { votes[b.counter, ] <- c(res, current.metric) b.counter <- b.counter + 1 } } # Remove metric votes that voted for every k votes <- votes[votes$metric %in% names(which(table(votes$metric) != dim(scores)[2])), ] # converting into familiar kX form votes[, 1] <- paste0("k", (as.numeric(votes[, 1]) + 1)) ensemble.results <- as.data.frame(table(votes[,1])) colnames(ensemble.results) <- c("k", "Frequency") ensemble.results$Frequency <- as.numeric(ensemble.results$Frequency) ensemble.plot <- ggplot2::ggplot(ensemble.results, aes(k, Frequency, fill = k)) + geom_col() + scale_fill_brewer(palette="Dark2") clusterVoting <- function(internal.metric.scores = scores, cluster.memberships.k = clusters, metric.votes.k = votes, vote.frequencies.k = ensemble.results, vote.frequencies.plot = ensemble.plot){ cv <- list(internal.metric.scores = internal.metric.scores, cluster.memberships.k = cluster.memberships.k, metric.votes.k = metric.votes.k, vote.frequencies.k = vote.frequencies.k, vote.frequencies.plot = vote.frequencies.plot) ## Set the name for the class class(cv) <- "clusterVoting" return(cv) } cluster.voting <- clusterVoting() return(cluster.voting) } Indice.cindex <- function (d, cl) { d <- as.matrix(dist(d, method = 'manhattan')) d <- data.matrix(d) DU <- 0 r <- 0 v_max <- array(1, max(cl)) v_min <- array(1, max(cl)) for (i in 1:max(cl)) { n <- sum(cl == i) if (n > 1) { t <- d[cl == i, cl == i] DU = DU + sum(t)/2 v_max[i] = max(t) if (sum(t == 0) == n) v_min[i] <- min(t[t != 0]) else v_min[i] <- 0 r <- r + n * (n - 1)/2 } } Dmin = min(v_min) Dmax = max(v_max) if (Dmin == Dmax) result <- NA else result <- (DU - r * Dmin)/(Dmax * r - Dmin * r) result } Index.sPlussMoins <- function (cl1,md) { md <- as.matrix(dist(md, method = 'euclidean')) cn1 <- max(cl1) n1 <- length(cl1) dmat <- as.matrix(md) average.distance <- median.distance <- separation <- cluster.size <- within.dist1 <- between.dist1 <- numeric(0) separation.matrix <- matrix(0, ncol = cn1, nrow = cn1) di <- list() for (u in 1:cn1) { cluster.size[u] <- sum(cl1 == u) du <- as.dist(dmat[cl1 == u, cl1 == u]) within.dist1 <- c(within.dist1, du) average.distance[u] <- mean(du) median.distance[u] <- median(du) bv <- numeric(0) for (v in 1:cn1) { if (v != u) { suv <- dmat[cl1 == u, cl1 == v] bv <- c(bv, suv) if (u < v) { separation.matrix[u, v] <- separation.matrix[v,u] <- min(suv) between.dist1 <- c(between.dist1, suv) } } } } nwithin1 <- length(within.dist1) nbetween1 <- length(between.dist1) meanwithin1 <- mean(within.dist1) meanbetween1 <- mean(between.dist1) s.plus <- s.moins <- 0 #s.moins<-sum(rank(c(within.dist1,between.dist1),ties="first")[1:nwithin1]-rank(within.dist1,ties="first")) #s.plus <-sum(rank(c(-within.dist1,-between.dist1),ties="first")[1:nwithin1]-rank(-within.dist1,ties="first")) for (k in 1: nwithin1) { s.plus <- s.plus+(colSums(outer(between.dist1,within.dist1[k], ">"))) s.moins <- s.moins+(colSums(outer(between.dist1,within.dist1[k], "<"))) } Index.Gamma <- (s.plus-s.moins)/(s.plus+s.moins) Index.Gplus <- (2*s.moins)/(n1*(n1-1)) t.tau <- (nwithin1*nbetween1)-(s.plus+s.moins) Index.Tau <- (s.plus-s.moins)/(((n1*(n1-1)/2-t.tau)*(n1*(n1-1)/2))^(1/2)) results <- list(gamma=Index.Gamma, gplus=Index.Gplus, tau=Index.Tau) return(results) } Index.SDbw<-function(x, cl) { x <- as.matrix(x) Scatt<-Average.scattering(cl,x)$scatt Dens.bw<-density.bw(cl,x) SDbw<-Scatt+Dens.bw return(SDbw) } density.clusters<-function(cl, x) { x <- as.matrix(x) k <- max(cl) n <- length(cl) distance <- matrix(0, ncol = 1, nrow = n) density <- matrix(0, ncol = 1, nrow = k) centers.matrix<-centers(cl,x) stdev<-Average.scattering(cl,x)$stdev for(i in 1:n) { u=1 while(cl[i] != u ) u<-u+1 for (j in 1:ncol(x)) { distance[i]<- distance[i]+(x[i,j]-centers.matrix[u,j])^2 } distance[i]<-sqrt(distance[i]) if (distance[i] <= stdev) density[u]= density[u]+1 } dens<-list(distance=distance, density=density) return(dens) } density.bw<-function(cl, x) { x <- as.matrix(x) k <- max(cl) n <- length(cl) centers.matrix<-centers(cl,x) stdev<-Average.scattering(cl,x)$stdev density.bw<- matrix(0, ncol = k, nrow = k) u<-1 for(u in 1:k) { for(v in 1:k) { if(v!=u) { distance<- matrix(0, ncol = 1, nrow = n) moy<-(centers.matrix[u,]+centers.matrix[v,])/2 for(i in 1:n) { if((cl[i]==u)||(cl[i]==v)) { for (j in 1:ncol(x)) { distance[i]<- distance[i]+(x[i,j]-moy[j])^2 } distance[i]<- sqrt(distance[i]) if(distance[i]<= stdev) { density.bw[u,v]<-density.bw[u,v]+1 } } } } } } density.clust<-density.clusters(cl,x)$density S<-0 for(u in 1:k) for(v in 1:k) { if(max(density.clust[u], density.clust[v])!=0) S=S+ (density.bw[u,v]/max(density.clust[u], density.clust[v])) } density.bw<-S/(k*(k-1)) return(density.bw) } centers<-function(cl,x) { x <- as.matrix(x) n <- length(cl) k <- max(cl) centers <- matrix(nrow = k, ncol = ncol(x)) { for (i in 1:k) { for (j in 1:ncol(x)) { centers[i, j] <- mean(x[cl == i, j]) } } } return(centers) } Average.scattering <- function (cl, x) { x <- as.matrix(x) n <- length(cl) k <- max(cl) centers.matrix <- centers(cl,x) cluster.size <- numeric(0) variance.clusters <- matrix(0, ncol = ncol(x), nrow = k) var <- matrix(0, ncol = ncol(x), nrow = k) for (u in 1:k) cluster.size[u] <- sum(cl == u) for (u in 1:k) { for (j in 1:ncol(x)) { for(i in 1:n) { if(cl[i]==u) variance.clusters[u,j]<- variance.clusters[u,j]+(x[i, j]-centers.matrix[u,j])^2 } } } for (u in 1:k) { for (j in 1:ncol(x)) variance.clusters[u,j]= variance.clusters[u,j]/ cluster.size[u] } variance.matrix <- numeric(0) for(j in 1:ncol(x)) variance.matrix[j]=var(x[,j])*(n-1)/n Somme.variance.clusters<-0 for (u in 1:k) Somme.variance.clusters<-Somme.variance.clusters+sqrt((variance.clusters[u,]%*%(variance.clusters[u,]))) # Standard deviation stdev<-(1/k)*sqrt(Somme.variance.clusters) #Average scattering for clusters scat<- (1/k)* (Somme.variance.clusters /sqrt(variance.matrix %*% variance.matrix)) scat <- list(stdev=stdev, centers=centers.matrix, variance.intraclusters= variance.clusters, scatt=scat) return(scat) } Index.15and28 <- function (cl1,cl2,md) { md <- as.matrix(dist(md, method = 'euclidean')) cn1 <- max(cl1) n1 <- length(cl1) dmat <- as.matrix(md) average.distance <- median.distance <- separation <- cluster.size <- within.dist1 <- between.dist1 <- numeric(0) separation.matrix <- matrix(0, ncol = cn1, nrow = cn1) di <- list() for (u in 1:cn1) { cluster.size[u] <- sum(cl1 == u) du <- as.dist(dmat[cl1 == u, cl1 == u]) within.dist1 <- c(within.dist1, du) #average.distance[u] <- mean(du) #median.distance[u] <- median(du) #bv <- numeric(0) for (v in 1:cn1) { if (v != u) { suv <- dmat[cl1 == u, cl1 == v] #bv <- c(bv, suv) if (u < v) { separation.matrix[u, v] <- separation.matrix[v,u] <- min(suv) between.dist1 <- c(between.dist1, suv) } } } } cn2 <- max(cl2) n2 <- length(cl2) dmat <- as.matrix(md) average.distance <- median.distance <- separation <- cluster.size <- within.dist2 <- between.dist2 <- numeric(0) separation.matrix <- matrix(0, ncol = cn2, nrow = cn2) di <- list() for (w in 1:cn2) { cluster.size[w] <- sum(cl2 == w) dw <- as.dist(dmat[cl2 == w, cl2 == w]) within.dist2 <- c(within.dist2, dw) #average.distance[w] <- mean(dw) #median.distance[w] <- median(dw) bx <- numeric(0) for (x in 1:cn2) { if (x != w) { swx <- dmat[cl2 == w, cl2 == x] bx <- c(bx, swx) if (w < x) { separation.matrix[w, x] <- separation.matrix[x,w] <- min(swx) between.dist2 <- c(between.dist2, swx) } } } } nwithin1 <- length(within.dist1) nbetween1 <- length(between.dist1) meanwithin1 <- mean(within.dist1) meanbetween1 <- mean(between.dist1) meanwithin2 <- mean(within.dist2) meanbetween2 <- mean(between.dist2) Index.15 <- (meanbetween2-meanbetween1)/(meanwithin2-meanwithin1) Index.28 <- (meanwithin1/nwithin1)/(meanbetween1/nbetween1) results <- list(frey=Index.15,mcclain=Index.28) return(results) }