R/SGCP_clustering.R
90fec3eb
 
412d446b
 
 ################################################################################# silhouetteIndex
90fec3eb
 silhouette <- function(dis.y, clus.labels){
412d446b
 
90fec3eb
     # dis.y: a dist object
     #       as.dist(y)
     # clus.labels: a vector
     #             contains the clusterlabels for the point
412d446b
 
 
90fec3eb
     if(!is.matrix(dis.y)){ dis.y <- as.matrix(dis.y)}
412d446b
 
90fec3eb
     if(length(clus.labels) != nrow(dis.y)){
         stop("number of elements in clus.labels and number of rows in dis.y are not equal")}
412d446b
 
90fec3eb
     sil <- data.frame(matrix(ncol = 3, nrow = 0))
     colnames(sil) <- c("geneIndices", "silIndex", "clusterLabel")
412d446b
 
90fec3eb
     clusters <- unique(clus.labels)
412d446b
 
 
90fec3eb
     for (inclus in clusters) {
412d446b
 
90fec3eb
         indw <- which(clus.labels == inclus)
         if(length(indw) > 1){
412d446b
 
90fec3eb
             a <- dis.y[indw, indw, drop = FALSE]
412d446b
 
90fec3eb
             clus.size.in <- max(2, ncol(a))
             A <- rowSums(a)/(clus.size.in - 1)
             A <- as.matrix(A)
412d446b
 
90fec3eb
             B <-  matrix(Inf, nrow = length(indw), ncol = 1)
             for(outclus in clusters){
412d446b
 
90fec3eb
                 if(outclus != inclus){
                     indb <- which(clus.labels == outclus)
                     b <- dis.y[indw, indb, drop = FALSE]
                     clus.size.out <- max(1, ncol(b))
                     cur <- as.matrix(rowSums(b)/clus.size.out)
                     B <- pmin(B, cur)
                     B <- as.matrix(B)} # end of if(outclus != inclus)
412d446b
 
 
90fec3eb
             } # end of outclus for loop
             S <- (B - A)/pmax(A, B)
             tdf <- data.frame(geneIndices = indw, silIndex = S, clusterLabel = inclus)
412d446b
 
90fec3eb
         }else{
412d446b
 
90fec3eb
             tdf <- data.frame(geneIndices = indw, silIndex = 0, clusterLabel = inclus)
412d446b
 
90fec3eb
         } # end of index length
         sil <- rbind(sil, tdf)
412d446b
 
90fec3eb
     } # end of inclus for loop
412d446b
 
90fec3eb
     sil <- sil %>%  arrange(clusterLabel,-silIndex)
     sil$clusterLabel <- as.factor(sil$clusterLabel)
412d446b
 
90fec3eb
     return(sil)
412d446b
 
90fec3eb
 } # end of function
412d446b
 
 ################################################################################# divideNorm
90fec3eb
 divideNorm <- function(M, rowWise = TRUE){
412d446b
 
90fec3eb
     # Divide by Norm
     # Argument:
     #         M: a matrix
     #         rowWise: either TRUE or FALSE
     #               if TRUE it divides each row by its norm
     #               if FALSE it divides each column by its norm
     #
     # Value:
     #       M: the scaled matrix
412d446b
 
 
90fec3eb
     if(rowWise == TRUE){
         M <- t(M)
         M <- as.data.frame(lapply(as.data.frame(M), normalization))
         M <- as.matrix(M)
         M <- t(M)
412d446b
 
90fec3eb
     }else if(rowWise == FALSE){
         M <- as.data.frame(lapply(as.data.frame(M), normalization))
         M <- as.matrix(M)
412d446b
 
90fec3eb
     }else{ stop( " rowWise can be either TRUE or FALSE")}
412d446b
 
90fec3eb
     return(M)
412d446b
 
 }
 
 ################################################################################# dropNoise
90fec3eb
 findNoise <- function(evec, thresh = 20){
 
     # drops noise components using eigenvectors
     #
     # Arguments:
     #         evec: a matrix of eigenvectors, corresponds to columns
     #         thresh: an integer
     #                   minimum cluster size to be considered as noise
     #
     # Value:
     #         final_ind: index of noisy genes
 
     v2 <- evec[, 2]
     indp <- which(v2 >= 0)
     p <- length(indp)
     indn <- which(v2 < 0)
     n <- length(n)
 
     if(length(v2) < 2*thresh){
         temp <- paste0("not enough genes, total number of genes is ", length(v2))
         stop(temp)
412d446b
     }
 
90fec3eb
     if(p > thresh && n > thresh){
         message("network has at least to components and each is large")
         message("this package does not support more than one components")
         message("you need to run SGCP on each components by yourself")
 
     }else{
         if(p < thresh){
             final_ind <- indp
 
         }else if(n < thresh) {
             final_ind <- indn
         }
412d446b
 
90fec3eb
     }
 
     return(final_ind)
412d446b
 }
 
90fec3eb
 ################################################################################# clusterPlots
 clusterPlots <- function(plt, tit, xname, yname){
 
     plt <-  plt +
         theme_classic()  +
         theme(axis.text.x = element_text(angle= 45, hjust=1, size = 5, face = 'bold',lineheight = 0.9)) +
         theme(axis.text.y = element_text(size = 15, face = 'bold',lineheight = 0.9)) +
         theme(axis.title.y = element_text(size = 10, face = 'bold',lineheight = 0.9)) +
         theme(plot.title = element_text(size = 20, face = 'bold',lineheight = 0.9, hjust = 0.5)) +
         theme(legend.text = element_text(size = 7))+
         theme(legend.position = c(.9, .9)) +
         labs(x = xname, y = yname, title = tit)
     return(plt)
 
 }
412d446b
 
 ################################################################################# clusterNumber
90fec3eb
 clusterNumber <- function(egvals, maxNum = 102){
 
     # Finds the number of clusters using relative eigenvalues ratios
     #
     # Argument:
     #       egvals: a vector of eigenvalues, sorted non increasing
     #       maxNum: an integer, upper bound on the maegvalsimum number of cluster
     #
     # Value:
     #       numOpt: a list
     #             relativeGap: k for method relativeGap
     #             secondOrderGap: k for method secondOrderGap
     #             additiveGap: k for method additiveGap
     #             plt: plots for each methods
 
 
     if(!is.numeric(egvals)){ stop("eigenvalues egvals are not numeric") }
     if(is.unsorted(rev(egvals))){
         warning("egvals are not sorted", call. = FALSE)
         message("sorting the eigenva;ues...")
         egvals <- sort(egvals, decreasing = TRUE)}
 
     if(round(egvals[1], 7) == 1){egvals <- egvals[-1]}
 
     # method: eigen gap
4f49cf88
     
90fec3eb
     k <- seq_len(maxNum)
     k <- k[-1]
     pairDiff <- abs(diff(egvals[seq_len(maxNum)]))
 
     dfgap <- as.data.frame(cbind(k, pairDiff))
4f49cf88
     
90fec3eb
     optg <- dfgap[which.max(pairDiff),]$k
 
     # method: first and second order
     egvals <- 1 - egvals
     egvals <- egvals[seq_len(min(maxNum, length(egvals)))]
 
     df <- data.frame(eigenVal = egvals, firstOrder = NA, kfirst = NA,
                     secondOrder = NA, ksecond = NA)
 
     i <- 2
     while(i <= nrow(df)){
         df$firstOrder[i] <- df$eigenVal[i]/df$eigenVal[i-1]
         df$kfirst[i] <- i
         i <- i + 1}
 
     i <- 3
     while (i <= nrow(df)) {
         df$secondOrder[i] <- df$firstOrder[i-1] - df$firstOrder[i]
         df$ksecond[i] <- i- 1
         i <- i+1}
 
     df$secondOrder <- c(NA, diff(df$firstOrder))
4f49cf88
  
90fec3eb
 
     # method: first
     resf <- df$firstOrder
     resf <- resf[-1]
     optf <- which.max(resf) + 1
 
     # method: second
     ress <- df$secondOrder
     ress <- ress[-c(1,2)]
     opts <- which.max(ress) + 1
 
     numOpt <- list("relativeGap" = optf, "secondOrderGap" = opts, "additiveGap" = optg)
     temp <- paste0("number of clusters for relativeGap method is ")
     message(temp, optf)
     temp <- paste0("number of clusters for secondOrderGap method is ")
     message(temp, opts)
     temp <- paste0("number of clusters for additiveGap method is ")
     message(temp, optg)
 
 
     pltgap <- ggplot(data = dfgap, aes(x = k, y = pairDiff)) +
         geom_point(size = .5) +
         geom_line()
 
     pltgap <- clusterPlots(pltgap, tit = "Additive Gap Method",
                         xname = "cluster number", yname = "Additive Gap")
 
     pltfirst <- ggplot(data = df, aes(x = kfirst, y = firstOrder)) +
                     geom_point(size = .5) +
                     geom_line()
 
     pltfirst <- clusterPlots(pltfirst, tit = "Relative Gap Method",
                             xname = "cluster number", yname = "Relative Gap")
 
     pltsecond <- ggplot(data = df, aes(x = ksecond, y = secondOrder)) +
         geom_point(size = .5) +
         geom_line()
 
     pltsecond <- clusterPlots(pltsecond, tit = "Second-Order Gap Method",
                             xname = "cluster number", yname = "Second-Order Gap")
 
     plt <- list("additiveGap" = pltgap, "relativeGap" = pltfirst,
                 "secondOrderGap" = pltsecond)
     numOpt <- c(numOpt, "plots" = plt)
     return(numOpt)
412d446b
 }
 
 ################################################################################# conductance
90fec3eb
 conductance <- function(adja, clusLab){
 
     # calculates the conductance of a clustering
     #
     # Arguments:
     #           adja: a matrix,
     #               symmetric sqaure adjacency matrix with values in (0,1)
     #           clusLab: a vector of cluster label
     #
     # Values:
     #       a list: mean, median, clusterConductance
     #               mean: mean over all conduct value per cluster
     #               median: median over all conduct value per cluster
     #               conductance: conductance value per cluster
 
     conduct <- c()
     deg <- matrix(rowSums(adja), ncol = 1)
 
     for(clus in unique(clusLab)){
         indin <- which(clusLab == clus)
         indout <- which(clusLab != clus)
 
         cur <- 0
         M <- adja[indin, indout]
         cur <- cur + sum(M)
 
         conduct <- c(conduct, cur/sum(deg[indin]))
 
     }
     names(conduct) <- unique(clusLab)
 
     newList <- list("mean" = mean(conduct), "median" = median(conduct),
                     "clusterConductance" = conduct)
412d446b
 }
 
 ################################################################################# cvConductance
90fec3eb
 cvConductance <- function(adja, k, Y, X, func = min,
                         maxIter = maxIter, numStart = numStart){
 
     # perform cross validation
     #
     # Arguments:
     #         adja: matrix
     #             symmetric squared with values in (0,1) matrix
     #         k: a list containing
     #           relativeGap: scalar containing k relativeGap method
     #           secondOrderGap: scalar containing k for secondOrderGap method
     #           additiveGap: scalar containing k for additiveGap method
     #           plots: plots of the methods
     #         Y: matrix of eigenvectors
     #         X: vector of eigenvalues
     #         method: either mean, median, or mean
     #                 for comparing conductance indices
     #         maxIter: a number,
     #                 maximum number of iteration in kmeans
     #         numStart: a number
     #                   number of start for kmeans
     #
     # Values:
     #       a list of
     #       method: string of method names
     #       k: scalar for number of cluster
     #       Y: a rectangular matrix of the transformation
     #       X: a vector of eigenvalues correspond to Y
     #       cluster: output of kmeans
     #       clusterLabels: a vector of datapoints labels
 
     message("Conductance Validation...")
 
     # first order method
     krelativeGap <- k$relativeGap
     Yf <- Y[, seq_len(min(2*krelativeGap, ncol(Y)))]
     Xf <- X[seq_len(min(2*krelativeGap, length(X)))]
     Yf <- divideNorm(Yf, rowWise = TRUE)
     clusf <- kmeans(Yf, krelativeGap, iter.max = maxIter, nstart = numStart)
     conf <- conductance(adja = adja, clusLab = clusf$cluster)
 
     # second order method
     ksecondOrderGap <- k$secondOrderGap
     Ys <- Y[, seq_len(min(2*ksecondOrderGap, ncol(Y)))]
     Xs <- X[seq_len(min(2*ksecondOrderGap, length(X)))]
     Ys <- divideNorm(Ys, rowWise = TRUE)
     cluss <- kmeans(Ys, ksecondOrderGap, iter.max = maxIter, nstart = numStart)
     cons <- conductance(adja = adja, clusLab = cluss$cluster)
 
     # eigengap method
     kadditiveGap <- k$additiveGap
     Yg <- Y[, seq_len(min(2*kadditiveGap, ncol(Y)))]
     Xg <- X[seq_len(min(2*kadditiveGap, length(X)))]
     Yg <- divideNorm(Yg, rowWise = TRUE)
     clusg <- kmeans(Yg, kadditiveGap, iter.max = maxIter, nstart = numStart)
     cong <- conductance(adja = adja, clusLab = clusg$cluster)
 
     res <- c(func(conf$clusterConductance),
             func(cons$clusterConductance),
             func(cong$clusterConductance))
     names(res) <- c("relativeGap", "secondOrderGap", "additiveGap")
 
     resmethod <- names(which.min(res))
     temp <- paste0("selected method in cvConductance is ", resmethod )
     message(temp)
 
     methodCon <- list("relativeGap.conductance" = conf,
                     "secondOrderGap.conductance" = cons,
                     "additiveGap.conductance" = cong)
 
     if(resmethod == "relativeGap"){
         method <- "relativeGap"
         k <- krelativeGap
         Y <- Yf
         X <- Xf
         clus <- clusf
         clusterLabels <- clus$cluster
 
     }else if(resmethod == "secondOrderGap"){
         method <- "secondOrderGap"
         k <- ksecondOrderGap
         Y <- Ys
         X <- Xs
         clus <- cluss
         clusterLabels <- clus$cluster
 
 
     }else{
         method <- "additiveGap"
         k <- kadditiveGap
         Y <- Yg
         X <- Xg
         clus <- clusg
         clusterLabels <- clus$cluster }
 
     newList <- list("method" = method, "k" = k, "Y" = Y, "X" = X,
                     "clus" = clus, "clusterLabels" = clusterLabels,
                     "conductance" = methodCon)
 
     return(newList)
412d446b
 }
 
 ################################################################################# sigClusGO
90fec3eb
 sigClusGO <- function(adja, Y, X, k, annotation, geneID,
                     maxIter = 1e+8, numStart = 1000){
 
     # performs geneontology for a single cluster produced by
     #           either relativeGap, secondOrderGap, and additiveGap
     #
     # Arguments:
     #         adja: a matrix
     #               symmetric square matrix with values in (0,1)
     #         Y: the eigenvectors
     #         X: eigenvalues
     #         k: number of clusters
     #         annotation: genome wide annotation
     #         geneID: a vector of gene IDs
     #         maxIter: a number, indicates maximum number of iteration for kmeans
     #         numStart: a number, indicates the number of start for kmeans
     #
     # Value
     #     Y: normalized Y
     #     X: the eigenvalues
     #     k: number of clusters
     #     clus: clustering object returns by kmeans function
     #     conductance: the result returns by conductance function
     #     pvalues: the pvalues associated to the cluster GO
     #     df_clus: the cluster df for GO
     #
 
 
     Yt <- Y[, seq_len(min(2*k, ncol(Y)))]
     Xt <- X[seq_len(min(2*k, length(X)))]
     Yt <- divideNorm(Yt, rowWise = TRUE)
     clust <- kmeans(Yt, k, iter.max = maxIter, nstart = numStart)
     cont <- conductance(adja = adja, clusLab = clust$cluster)
 
     lab.t <- names(which.min(cont$clusterConductance))
     ind.t <- which(clust$cluster == lab.t)
     genes.t <- geneID[ind.t]
 
     nl.t <- GOenrichment(universe_geneIDs = geneID,
                         cluster_geneIDs = genes.t,
                         annotation_db = annotation,
                         lab = lab.t,
                         direction = c("over", "under"),
                         ontology = c("BP", "CC", "MF"),
                         hgCutoff = NULL,
                         condition = TRUE)
 
     newList <- list("Y" = Yt, "X" = Xt, "k" = k, "clus" = clust,
                     "conductance" = cont, "pvalues" = nl.t$GO_summary$Pvalue,
                     "df_clus" = nl.t$GO_summary)
     return(newList)
412d446b
 }
 
 ################################################################################# cvGO
90fec3eb
 cvGO <- function(adja, k, Y, X, annotation, geneID,
412d446b
                 maxIter = maxIter, numStart = numStart, func = sum){
 
90fec3eb
     # perform cross validation
     #
     # Arguments:
     #         adja: a matrix
     #               symmetric square matrix with values in (0,1)
     #         Y: the eigenvectors
     #         X: eigenvalues
     #         k: number of clusters
     #         annotation: genome wide annotation
     #         geneID: a vector of gene IDs
     #         maxIter: a number, indicates maximum number of iteration for kmeans
     #         numStart: a number, indicates the number of start for kmeans
     #         func: function to perform cv for the GOs
     #
     # Values:
     #       a list of containing
     #       method: string of method names
     #       k: scalar for number of cluster
     #       Y: a rectangular matrix of the transformation
     #       X: a vector of eigenvalues correspond to Y
     #       cluster: output of kmeans
     #       clusterLabels: a vector of datapoints labels
     #       conductance for the best selected method
412d446b
 
90fec3eb
     message("\n Gene Ontology Validation...")
412d446b
 
90fec3eb
     # first order method
     message("method relativeGap....")
     nl.f <- sigClusGO(adja, Y, X, k$relativeGap, annotation, geneID,
                     maxIter = maxIter, numStart = numStart)
     f <- -log10(nl.f$pvalues)
412d446b
 
90fec3eb
     if(nrow(nl.f$df_clus) >0 ){
         nl.f$df_clus$clusterNum <- "relativeGap"}
412d446b
 
90fec3eb
     df.f <- nl.f$df_clus
412d446b
 
90fec3eb
     # second order method
412d446b
 
90fec3eb
     message("\n method secondOrderGap...")
     if(k$secondOrderGap != k$relativeGap){
412d446b
 
90fec3eb
         nl.s <- sigClusGO(adja, Y, X, k$secondOrderGap, annotation, geneID,
                         maxIter = maxIter, numStart = numStart)
         s <- -log10(nl.s$pvalues)
412d446b
 
90fec3eb
         if(nrow(nl.s$df_clus) >0 ){
             nl.s$df_clus$clusterNum <- "secondOrderGap" }
412d446b
 
90fec3eb
         df.s <- nl.s$df_clus
412d446b
 
90fec3eb
     }else {
412d446b
 
90fec3eb
         message("GO enrichment is the same is relativeGap!")
         nl.s <- nl.f
         s <- nl.s$pvalues
412d446b
 
90fec3eb
         if(nrow(nl.s$df_clus) >0 ){
             nl.s$df_clus$clusterNum <- "secondOrderGap" }
412d446b
 
90fec3eb
         df.s <- nl.s$df_clus}
412d446b
 
 
90fec3eb
     message("\n method additiveGap....")
     if(k$additiveGap == k$relativeGap){
         message("GO enrichment is the same is relativeGap!")
412d446b
 
90fec3eb
         nl.g <- nl.f
         g <- -log10(nl.g$pvalues)
412d446b
 
90fec3eb
         if(nrow(nl.g$df_clus) >0 ){
             nl.g$df_clus$clusterNum <- "additiveGap"}
412d446b
 
90fec3eb
         df.g <- nl.g$df_clus
412d446b
 
90fec3eb
     }else if(k$additiveGap == k$secondOrderGap){
         message("GO enrichment is the same is secondOrderGap!")
412d446b
 
90fec3eb
         nl.g <- nl.s
         g <- -log10(nl.g$pvalues)
412d446b
 
90fec3eb
         if(nrow(nl.g$df_clus) >0 ){
             nl.g$df_clus$clusterNum <- "additiveGap"}
412d446b
 
90fec3eb
         df.g <- nl.g$df_clus
412d446b
 
90fec3eb
     }else{
         nl.g <- sigClusGO(adja, Y, X, k$additiveGap, annotation, geneID,
                         maxIter = maxIter, numStart = numStart)
         g <- -log10(nl.g$pvalues)
412d446b
 
90fec3eb
         if(nrow(nl.g$df_clus) >0 ){
             nl.g$df_clus$clusterNum <- "additiveGap"}
412d446b
 
90fec3eb
         df.g <- nl.g$df_clus}
412d446b
 
 
90fec3eb
     methodCon <- list("relativeGap.conductance" = nl.f$conductance,
                     "secondOrderGap.conductance" = nl.s$conductance,
                     "additiveGap.conductance" = nl.g$conductance)
412d446b
 
 
90fec3eb
     # taking cv function over p-values
     res <- c(func(f), func(s), func(g))
     names(res) <- c("relativeGap", "secondOrderGap", "additiveGap")
     resmethod <- names(which.max(res))
412d446b
 
 
90fec3eb
     if(resmethod == "relativeGap"){
         method <- "relativeGap"
         k <- nl.f$k
         Y <- nl.f$Y
         X <- nl.f$X
         clus <- nl.f$clus
         clusterLabels <- clus$cluster
         conduct <- nl.f$conductance
         df <- nl.f$df_clus
412d446b
 
90fec3eb
     }else if(resmethod == "secondOrderGap"){
         method <- "secondOrderGap"
         k <- nl.s$k
         Y <- nl.s$Y
         X <- nl.s$X
         clus <- nl.s$clus
         clusterLabels <- clus$cluster
         conduct <- nl.s$conductance
         df <- nl.s$df_clus
412d446b
 
90fec3eb
     }else{
         method <- "additiveGap"
         k <- nl.g$k
         Y <- nl.g$Y
         X <- nl.g$X
         clus <- nl.g$clus
         clusterLabels <- clus$cluster
         conduct <- nl.g$conductance
         df <- nl.g$df_clus
412d446b
 
90fec3eb
     }
412d446b
 
90fec3eb
     df <- rbind(df.f, df.s, df.g)
     newList <- list("method" = method, "k" = k, "Y" = Y, "X" = X,
                     "clus" = clus, "clusterLabels" = clusterLabels,
                     "conductance" = methodCon, "df" = df)
412d446b
 
90fec3eb
     return(newList)
 }
412d446b
 
90fec3eb
 ################################################################################# transformation
147671fb
 transformation <- function(adjaMat, geneID, eff.egs){
412d446b
 
147671fb
     # Calculates the data embedding, eigenvalues and eigenvectors
     #
     # Arguments:
     #         adjaMat: adjacency matrix, square symmetric with values in (0, 1)
     #                  and 0 diagonal
     #         geneID: a vector of geneID
     #                   there is one-to-one correspondence between geneID and rows
     #                   in the adjacency matrix
     #
     # Value: a list of following
     #         Y: eigenvectors
     #         X: eigenvalues
     #         geneID: a vector of geneIDs
     #         ind: a vector of dropping genes
412d446b
 
 
90fec3eb
     d <- as.matrix(as.vector(rowSums(adjaMat)))
 
     ind <- which(d == 0)
     if(length(ind) > 0){
         adjaMat <- adjaMat[-ind, ]
         adjaMat <- adjaMat[, -ind]
147671fb
         if(!is.null(geneID)){geneID <- geneID[-ind] }
     }
412d446b
 
90fec3eb
     # Normalized Laplacian
     message("calculating normalized Laplacian \n it may take time...")
147671fb
 
     D <- diag(rowSums(adjaMat))
     d <- as.matrix(as.vector(rowSums(adjaMat)))
90fec3eb
     diag(D) <- 1/sqrt(diag(D))
     L <- D %*% adjaMat %*% D
412d446b
 
fb737d07
     k <- min(200, nrow(adjaMat))
147671fb
 
90fec3eb
     # Calculate the eigenvalues and eigenvectors
     message("calculating eigenvalues/vectors \n it may take time...")
147671fb
     if(eff.egs){
         egvv <- eigs_sym(L, k = k, which = "LA")
     }else{
         egvv <- eigen(L)}
 
90fec3eb
     Y <- egvv$vectors
     X <- egvv$values
 
     if(!is.unsorted(X)){
         warning(" The eigenvalues are not sorted!", call. = FALSE)
         message("sorting the eigenvalues/vectors...")
         eg <- as.matrix(rbind(X, Y))
         eg <- as.data.frame(t(eg))
         colnames(eg)[1] <- "eigenvalues"
         eg <- eg[order(eg$eigenvalues, decreasing = TRUE), ]
         eg <- t(as.matrix(eg))
         X <- eg[1, ]
         temp <- seq_len(nrow(eg))
         temp <- temp[-1]
         Y <- eg[temp, ]}
 
     # preprocessing to check for noisy genes
     if(sum(round(X, 7) == 1) > 1 ){
         message("dropping noisy genes...")
         nois_ind <- findNoise(evec = Y, thresh = 100)
         adjaMat <- adjaMat[-nois_ind, ]
         adjaMat <- adjaMat[, -nois_ind]
         D <- D[-nois_ind, ]
         D <- D[, -nois_ind]
         ind <- c(ind, nois_ind)
 
         if(!is.null(geneID)){geneID <- geneID[-nois_ind]
         temp <- paste0("number of noisy genes are... ", length(nois_ind))
         message(temp)
         message("calculation from begining...")}
 
         # Calculate the diagonal degree matrix
         D <- diag(rowSums(adjaMat))
         d <- as.matrix(as.vector(rowSums(adjaMat)))
 
         # Normalized Laplacian
         message("calculating normalized Laplacian \n it may take time...")
         diag(D) <- 1/sqrt(diag(D))
         L <- D %*% adjaMat %*% D
 
fb737d07
         k <- min(200, nrow(adjaMat))
147671fb
 
90fec3eb
         # Calculate the eigenvalues and eigenvectors
         message("calculating eigenvalues/vectors \n it may take time...")
147671fb
         if(eff.egs){
             egvv <- eigs_sym(L, k = k, which = "LA")
         }else{
             egvv <- eigen(L)}
 
90fec3eb
         Y <- egvv$vectors
147671fb
         X <- egvv$values } # end of if preprocessing
90fec3eb
 
     #
     Y <- D %*% Y
     Y <- sweep(Y, 2, (t(d) %*% Y)/sum(d))
 
     #
     D <- diag(rowSums(adjaMat))
     scalar <- function(x, D){return(x/sqrt(as.vector(t(x) %*% D %*% x)))}
     Y <- apply(Y, 2, scalar, D)
 
     constant <- t(Y[,1]) %*% D %*% Y[, 1]
     if(round(constant, 7) != 1 ){
         warning("eigenvector 1 condition does not hold! ", call. = FALSE)
         temp <- paste0("the condition value is ",
fb737d07
                 as.double(t(Y[,1]) %*% D %*% Y[, 1] != 1))
90fec3eb
         message(temp) }
 
147671fb
     newList <- list("Y" = Y, "X" = X, "ind" = ind, "geneID" = geneID)
 
 }
 
 ################################################################################# clustering2
 clustering <- function(adjaMat, geneID , annotation_db ,
                         kopt = NULL, method = NULL,
                         func.GO = sum, func.conduct = min,
                         maxIter = 1e8, numStart = 1000, eff.egs = TRUE,
                         saveOrig = TRUE, n_egvec = 200, sil = FALSE){
 
     if(is.data.frame(adjaMat)){
         warning("adjaMat is a dataframe", call. = FALSE)
         message("converting adjancecy to matrix")
         df2mat(adjaMat)}
 
 
     if(!is.matrix(adjaMat)){ stop("input must be a matrix") }
     checkNumeric(adjaMat, " transformation input")
     checkSym(adjaMat, " transformation input")
 
     ng <- nrow(adjaMat)
 
     if(!is.null(geneID)){
         if(length(geneID) != ng){
             warning("length of geneID must be equal to number of rows (or columns) in adjaMat",
                     call. = FALSE)
             message("setting geneIDs to NULL")
             geneID <- NULL} }
 
     if(is.null(geneID)){
         geneID <- paste0(rep("gene", ng), seq_len(ng))}
 
     if(!is.null(annotation_db) & !is.character(annotation_db)){
         stop("annotation_db must be character or NULL", call. = FALSE)}
 
     if(!is.character(geneID)){
         warning("type of geneID is not character")
         message("makig geneIDs type to character")
         geneID <- as.character(geneID) }
 
     if(!is.null(kopt) && kopt != round(kopt)){
         warning("kopt must be either null or an integer", call. = FALSE)
         message("making kopt null")
         kopt <- NULL}
 
     if(length(setdiff(method, c("relativeGap",
                                 "secondOrderGap", "additiveGap"))) != 0){
         warning("method can be either relativeGap, secondOrderGag, or additiveGap",
                 call. = FALSE)
         message("making method to NULL")
         method <- NULL }
 
     if(!is.numeric(maxIter) || !is.numeric(numStart)){
         warning("maxIter and numStart must be numeric and integer", call. = FALSE)
         message("making maxIter and numStart to default")
         maxIter <- 1e+8
         numStart <- 1000}
 
     if(n_egvec != round(n_egvec)){
         warning("n_egvec must be an integer ", call. = FALSE)
         message("setting n_egvec to 200...")
fb737d07
         n_egvec <- 200}
147671fb
     if(n_egvec > 200){
         warning("maximum value for n_egvec is 200", call. = FALSE)
         message("setting n_egvec to 200...")
fb737d07
         n_egvec <- 200 }
147671fb
 
 
     ################### transformation
     nl <- transformation(adjaMat, geneID, eff.egs)
     X <-  nl$X
     Y <- nl$Y
     ind <- nl$ind
     geneID <- nl$geneID
 
90fec3eb
 
     # Drop first eigenvalue and eigenvectors
     X <- X[-1]
     Y <- Y[, -1]
 
     # saving original
 
     if(saveOrig == TRUE){
         temp <- paste0("n_egvec is ", n_egvec)
         message(temp)
         Yorig <- Y[, seq_len(min(n_egvec, ncol(Y)))]
         Xorig <- X[seq_len(min(n_egvec, length(X)))]
4f49cf88
         }
412d446b
 
90fec3eb
     plt <- list()
412d446b
 
90fec3eb
     #################################################################################
     if(is.null(kopt) && is.null(method) && !is.null(annotation_db)){
412d446b
 
90fec3eb
         k <- clusterNumber(egvals = X, maxNum = 102)
         plt <- c(plt, setNames(list(k$plots.relativeGap), "relativeGap"),
fb737d07
             setNames(list(k$plots.secondOrderGap), "secondOrderGap" ),
             setNames(list(k$plots.additiveGap), "additiveGap") )
90fec3eb
 
 
         cvopt <- cvGO(adja = adjaMat, k = k, Y = Y, X = X,
fb737d07
                 annotation = annotation_db, geneID = geneID,
                 maxIter = maxIter, numStart = numStart, func = func.GO)
147671fb
 
 
4f49cf88
         temp <- paste0("\n method ", cvopt$method, " is selected using GO validation and k is ", cvopt$k)
90fec3eb
         message(temp)
 
147671fb
         newList <- list("dropped.indices" = ind, "geneID" = geneID, "method" = cvopt$method,
                         "k" = cvopt$k, "Y" = cvopt$Y, "X" = cvopt$X, "cluster" = cvopt$clus,
                         "clusterLabels" = as.character(cvopt$clusterLabels),
                         "conductance" = cvopt$conductance, "cvGOdf" = cvopt$df, "cv" = "cvGO",
90fec3eb
                         "clusterNumberPlot" = plt)
 
     } else if(is.null(kopt) && is.null(method) && is.null(annotation_db)){
 
         k <- clusterNumber(egvals = X, maxNum = 102)
         plt <- c(plt, setNames(list(k$plots.relativeGap), "reltiveGap"),
fb737d07
             setNames(list(k$plots.secondOrderGap), "secondOrderGap" ),
             setNames(list(k$plots.additiveGap), "additiveGap") )
90fec3eb
 
         cvopt <- cvConductance(adja = adjaMat, k = k, Y = Y, X = X,
fb737d07
                         func = func.conduct,
                         maxIter = maxIter, numStart = numStart)
147671fb
 
4f49cf88
         temp <- paste0("\n method ", cvopt$method, " is selected using conductance index validation and k is ", cvopt$k)
90fec3eb
         message(temp)
 
147671fb
         newList <- list("dropped.indices" = ind, "geneID" = geneID, "method" = cvopt$method,
                         "k" = cvopt$k, "Y" = cvopt$Y, "X" = cvopt$X, "cluster" = cvopt$clus,
                         "clusterLabels" = as.character(cvopt$clusterLabels),
                         "conductance" = cvopt$conductance, "cv" = "cvConductance",
90fec3eb
                         "clusterNumberPlot" = plt)
 
     }else if(is.null(kopt) && !is.null(method)){
         k <- clusterNumber(egvals = X, maxNum = 102)
 
         if(method == "relativeGap"){
 
             # first order method
             krelativeGap <- k$relativeGap
             Yf <- Y[, seq_len(min(2*krelativeGap, ncol(Y)))]
             Xf <- X[seq_len(min(2*krelativeGap, length(X)))]
             Yf <- divideNorm(Yf, rowWise = TRUE)
 
             clusf <- kmeans(Yf, krelativeGap, iter.max = maxIter, nstart = numStart)
             conf <- conductance(adja = adjaMat, clusLab = clusf$cluster)
 
 
4f49cf88
             temp <- paste0("\n method ", "relativeGap", " is selected using user and k is ", k$relativeGap)
90fec3eb
             message(temp)
 
147671fb
             newList <- list("dropped.indices" = ind, "geneID" = geneID, "method" = "relativeGap",
                             "k" = k$relativeGap, "Y" = Yf, "X" = Xf, "cluster" = clusf,
                             "clusterLabels" = as.character(clusf$cluster),
                             "conductance" = conf, "cv" = "userMethod")
90fec3eb
 
         } else if(method == "secondOrderGap"){
 
             # second order method
             ksecondOrderGap <- k$secondOrderGap
             Ys <- Y[, seq_len(min(2*ksecondOrderGap, ncol(Y)))]
             Xs <- X[seq_len(min(2*ksecondOrderGap, length(X)))]
             Ys <- divideNorm(Ys, rowWise = TRUE)
 
             cluss <- kmeans(Ys, ksecondOrderGap, iter.max = maxIter, nstart = numStart)
             cons <- conductance(adja = adjaMat, clusLab = cluss$cluster)
 
4f49cf88
             temp <- paste0("\n method ", "secondOrderGap", " is selected using user and k is ", k$secondOrderGap)
90fec3eb
             message(temp)
 
147671fb
             newList <- list("dropped.indices" = ind, "geneID" = geneID, "method" = "secondOrderGap",
                             "k" = k$secondOrderGap, "Y" = Ys, "X" = Xs, "cluster" = cluss,
                             "clusterLabels" = as.character(cluss$cluster),
                             "conductance" = cons, "cv" = "userMethod")
90fec3eb
 
         } else if(method == "additiveGap"){
 
             # gap method
             kadditiveGap <- k$additiveGap
             Yg <- Y[, seq_len(min(2*kadditiveGap, ncol(Y)))]
             Xg <- X[seq_len(min(2*kadditiveGap, length(X)))]
             Yg <- divideNorm(Yg, rowWise = TRUE)
 
             clusg <- kmeans(Yg, kadditiveGap, iter.max = maxIter, nstart = numStart)
             cong <- conductance(adja = adjaMat, clusLab = clusg$cluster)
 
4f49cf88
             temp <- paste0("\n method ", "additiveGap", " is selected using user and k is ", k$additiveGap)
90fec3eb
             message(temp)
 
147671fb
             newList <- list("dropped.indices" = ind, "geneID" = geneID, "method" = "additiveGap",
                             "k" = k$additiveGap, "Y" = Yg, "X" = Xg, "cluster" = clusg,
                             "clusterLabels" = as.character(clusg$cluster),
                             "conductance" = cong, "cv" = "userMethod")
90fec3eb
 
         } else  {
             stop(" method for number of clusters can be either relativeGap,
                 secondOrderGap, or additiveGap")}
 
     }else if(!is.null(kopt)){
 
         # optimal by user
         Yopt <- Y[, seq_len(min(2*kopt, ncol(Y)))]
         Xopt <- X[seq_len(min(2*kopt, length(X)))]
         Yopt <- divideNorm(Yopt, rowWise = TRUE)
 
         clusopt <- kmeans(Yopt, kopt, iter.max = maxIter, nstart = numStart)
         conopt <- conductance(adja = adjaMat, clusLab = clusopt$cluster)
 
147671fb
         temp <- paste0("\n selected k is ", kopt, " by user")
90fec3eb
         message(temp)
 
147671fb
         newList <- list("dropped.indices" = ind, "geneID" = geneID, "method" = "userkopt",
                         "k" = kopt, "Y" = Yopt, "X" = Xopt, "cluster" = clusopt,
                         "clusterLabels" = as.character(clusopt$cluster),
                         "conductance" = conopt, "cv" = "userkopt")
90fec3eb
     }
 
 
 
     ################### silhouette
     if(sil){
 
         message("calculating the Silhouette index \n it may take time...")
 
147671fb
         sil <- silhouette(dis.y = dist(Y), clus.labels = newList$clusterLabels)
90fec3eb
         sil <- sil %>%  arrange(clusterLabel ,-silIndex)
         sil$clusterLabel <- as.factor(sil$clusterLabel)
         genes <- data.frame(geneID = geneID, geneIndices = seq(1, length(geneID)))
         sil <- inner_join(sil, genes, by = "geneIndices")
         sil <-  sil[ , !(names(sil) %in% "geneIndices")]
         newList <- c(newList, setNames(list(sil), "silhouette"))
 
     }
 
 
     if(saveOrig == TRUE){
 
         tlist <- list("Yorig" = Yorig, "Xorig" = Xorig, "n_egvec" = ncol(Yorig))
         newList <- c(newList, setNames(list(tlist), "original"))}
 
     message("network clustering is done...\n")
 
     return(newList)
412d446b
 
 }
 
 
90fec3eb