90fec3eb |
|
412d446b |
################################################################################# silhouetteIndex
|
90fec3eb |
silhouette <- function(dis.y, clus.labels){
|
412d446b |
|
90fec3eb |
# dis.y: a dist object
# as.dist(y)
# clus.labels: a vector
# contains the clusterlabels for the point
|
412d446b |
|
90fec3eb |
if(!is.matrix(dis.y)){ dis.y <- as.matrix(dis.y)}
|
412d446b |
|
90fec3eb |
if(length(clus.labels) != nrow(dis.y)){
stop("number of elements in clus.labels and number of rows in dis.y are not equal")}
|
412d446b |
|
90fec3eb |
sil <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(sil) <- c("geneIndices", "silIndex", "clusterLabel")
|
412d446b |
|
90fec3eb |
clusters <- unique(clus.labels)
|
412d446b |
|
90fec3eb |
for (inclus in clusters) {
|
412d446b |
|
90fec3eb |
indw <- which(clus.labels == inclus)
if(length(indw) > 1){
|
412d446b |
|
90fec3eb |
a <- dis.y[indw, indw, drop = FALSE]
|
412d446b |
|
90fec3eb |
clus.size.in <- max(2, ncol(a))
A <- rowSums(a)/(clus.size.in - 1)
A <- as.matrix(A)
|
412d446b |
|
90fec3eb |
B <- matrix(Inf, nrow = length(indw), ncol = 1)
for(outclus in clusters){
|
412d446b |
|
90fec3eb |
if(outclus != inclus){
indb <- which(clus.labels == outclus)
b <- dis.y[indw, indb, drop = FALSE]
clus.size.out <- max(1, ncol(b))
cur <- as.matrix(rowSums(b)/clus.size.out)
B <- pmin(B, cur)
B <- as.matrix(B)} # end of if(outclus != inclus)
|
412d446b |
|
90fec3eb |
} # end of outclus for loop
S <- (B - A)/pmax(A, B)
tdf <- data.frame(geneIndices = indw, silIndex = S, clusterLabel = inclus)
|
412d446b |
|
90fec3eb |
}else{
|
412d446b |
|
90fec3eb |
tdf <- data.frame(geneIndices = indw, silIndex = 0, clusterLabel = inclus)
|
412d446b |
|
90fec3eb |
} # end of index length
sil <- rbind(sil, tdf)
|
412d446b |
|
90fec3eb |
} # end of inclus for loop
|
412d446b |
|
90fec3eb |
sil <- sil %>% arrange(clusterLabel,-silIndex)
sil$clusterLabel <- as.factor(sil$clusterLabel)
|
412d446b |
|
90fec3eb |
return(sil)
|
412d446b |
|
90fec3eb |
} # end of function
|
412d446b |
################################################################################# divideNorm
|
90fec3eb |
divideNorm <- function(M, rowWise = TRUE){
|
412d446b |
|
90fec3eb |
# Divide by Norm
# Argument:
# M: a matrix
# rowWise: either TRUE or FALSE
# if TRUE it divides each row by its norm
# if FALSE it divides each column by its norm
#
# Value:
# M: the scaled matrix
|
412d446b |
|
90fec3eb |
if(rowWise == TRUE){
M <- t(M)
M <- as.data.frame(lapply(as.data.frame(M), normalization))
M <- as.matrix(M)
M <- t(M)
|
412d446b |
|
90fec3eb |
}else if(rowWise == FALSE){
M <- as.data.frame(lapply(as.data.frame(M), normalization))
M <- as.matrix(M)
|
412d446b |
|
90fec3eb |
}else{ stop( " rowWise can be either TRUE or FALSE")}
|
412d446b |
|
90fec3eb |
return(M)
|
412d446b |
}
################################################################################# dropNoise
|
90fec3eb |
findNoise <- function(evec, thresh = 20){
# drops noise components using eigenvectors
#
# Arguments:
# evec: a matrix of eigenvectors, corresponds to columns
# thresh: an integer
# minimum cluster size to be considered as noise
#
# Value:
# final_ind: index of noisy genes
v2 <- evec[, 2]
indp <- which(v2 >= 0)
p <- length(indp)
indn <- which(v2 < 0)
n <- length(n)
if(length(v2) < 2*thresh){
temp <- paste0("not enough genes, total number of genes is ", length(v2))
stop(temp)
|
412d446b |
}
|
90fec3eb |
if(p > thresh && n > thresh){
message("network has at least to components and each is large")
message("this package does not support more than one components")
message("you need to run SGCP on each components by yourself")
}else{
if(p < thresh){
final_ind <- indp
}else if(n < thresh) {
final_ind <- indn
}
|
412d446b |
|
90fec3eb |
}
return(final_ind)
|
412d446b |
}
|
90fec3eb |
################################################################################# clusterPlots
clusterPlots <- function(plt, tit, xname, yname){
plt <- plt +
theme_classic() +
theme(axis.text.x = element_text(angle= 45, hjust=1, size = 5, face = 'bold',lineheight = 0.9)) +
theme(axis.text.y = element_text(size = 15, face = 'bold',lineheight = 0.9)) +
theme(axis.title.y = element_text(size = 10, face = 'bold',lineheight = 0.9)) +
theme(plot.title = element_text(size = 20, face = 'bold',lineheight = 0.9, hjust = 0.5)) +
theme(legend.text = element_text(size = 7))+
theme(legend.position = c(.9, .9)) +
labs(x = xname, y = yname, title = tit)
return(plt)
}
|
412d446b |
################################################################################# clusterNumber
|
90fec3eb |
clusterNumber <- function(egvals, maxNum = 102){
# Finds the number of clusters using relative eigenvalues ratios
#
# Argument:
# egvals: a vector of eigenvalues, sorted non increasing
# maxNum: an integer, upper bound on the maegvalsimum number of cluster
#
# Value:
# numOpt: a list
# relativeGap: k for method relativeGap
# secondOrderGap: k for method secondOrderGap
# additiveGap: k for method additiveGap
# plt: plots for each methods
if(!is.numeric(egvals)){ stop("eigenvalues egvals are not numeric") }
if(is.unsorted(rev(egvals))){
warning("egvals are not sorted", call. = FALSE)
message("sorting the eigenva;ues...")
egvals <- sort(egvals, decreasing = TRUE)}
if(round(egvals[1], 7) == 1){egvals <- egvals[-1]}
# method: eigen gap
|
4f49cf88 |
|
90fec3eb |
k <- seq_len(maxNum)
k <- k[-1]
pairDiff <- abs(diff(egvals[seq_len(maxNum)]))
dfgap <- as.data.frame(cbind(k, pairDiff))
|
4f49cf88 |
|
90fec3eb |
optg <- dfgap[which.max(pairDiff),]$k
# method: first and second order
egvals <- 1 - egvals
egvals <- egvals[seq_len(min(maxNum, length(egvals)))]
df <- data.frame(eigenVal = egvals, firstOrder = NA, kfirst = NA,
secondOrder = NA, ksecond = NA)
i <- 2
while(i <= nrow(df)){
df$firstOrder[i] <- df$eigenVal[i]/df$eigenVal[i-1]
df$kfirst[i] <- i
i <- i + 1}
i <- 3
while (i <= nrow(df)) {
df$secondOrder[i] <- df$firstOrder[i-1] - df$firstOrder[i]
df$ksecond[i] <- i- 1
i <- i+1}
df$secondOrder <- c(NA, diff(df$firstOrder))
|
4f49cf88 |
|
90fec3eb |
# method: first
resf <- df$firstOrder
resf <- resf[-1]
optf <- which.max(resf) + 1
# method: second
ress <- df$secondOrder
ress <- ress[-c(1,2)]
opts <- which.max(ress) + 1
numOpt <- list("relativeGap" = optf, "secondOrderGap" = opts, "additiveGap" = optg)
temp <- paste0("number of clusters for relativeGap method is ")
message(temp, optf)
temp <- paste0("number of clusters for secondOrderGap method is ")
message(temp, opts)
temp <- paste0("number of clusters for additiveGap method is ")
message(temp, optg)
pltgap <- ggplot(data = dfgap, aes(x = k, y = pairDiff)) +
geom_point(size = .5) +
geom_line()
pltgap <- clusterPlots(pltgap, tit = "Additive Gap Method",
xname = "cluster number", yname = "Additive Gap")
pltfirst <- ggplot(data = df, aes(x = kfirst, y = firstOrder)) +
geom_point(size = .5) +
geom_line()
pltfirst <- clusterPlots(pltfirst, tit = "Relative Gap Method",
xname = "cluster number", yname = "Relative Gap")
pltsecond <- ggplot(data = df, aes(x = ksecond, y = secondOrder)) +
geom_point(size = .5) +
geom_line()
pltsecond <- clusterPlots(pltsecond, tit = "Second-Order Gap Method",
xname = "cluster number", yname = "Second-Order Gap")
plt <- list("additiveGap" = pltgap, "relativeGap" = pltfirst,
"secondOrderGap" = pltsecond)
numOpt <- c(numOpt, "plots" = plt)
return(numOpt)
|
412d446b |
}
################################################################################# conductance
|
90fec3eb |
conductance <- function(adja, clusLab){
# calculates the conductance of a clustering
#
# Arguments:
# adja: a matrix,
# symmetric sqaure adjacency matrix with values in (0,1)
# clusLab: a vector of cluster label
#
# Values:
# a list: mean, median, clusterConductance
# mean: mean over all conduct value per cluster
# median: median over all conduct value per cluster
# conductance: conductance value per cluster
conduct <- c()
deg <- matrix(rowSums(adja), ncol = 1)
for(clus in unique(clusLab)){
indin <- which(clusLab == clus)
indout <- which(clusLab != clus)
cur <- 0
M <- adja[indin, indout]
cur <- cur + sum(M)
conduct <- c(conduct, cur/sum(deg[indin]))
}
names(conduct) <- unique(clusLab)
newList <- list("mean" = mean(conduct), "median" = median(conduct),
"clusterConductance" = conduct)
|
412d446b |
}
################################################################################# cvConductance
|
90fec3eb |
cvConductance <- function(adja, k, Y, X, func = min,
maxIter = maxIter, numStart = numStart){
# perform cross validation
#
# Arguments:
# adja: matrix
# symmetric squared with values in (0,1) matrix
# k: a list containing
# relativeGap: scalar containing k relativeGap method
# secondOrderGap: scalar containing k for secondOrderGap method
# additiveGap: scalar containing k for additiveGap method
# plots: plots of the methods
# Y: matrix of eigenvectors
# X: vector of eigenvalues
# method: either mean, median, or mean
# for comparing conductance indices
# maxIter: a number,
# maximum number of iteration in kmeans
# numStart: a number
# number of start for kmeans
#
# Values:
# a list of
# method: string of method names
# k: scalar for number of cluster
# Y: a rectangular matrix of the transformation
# X: a vector of eigenvalues correspond to Y
# cluster: output of kmeans
# clusterLabels: a vector of datapoints labels
message("Conductance Validation...")
# first order method
krelativeGap <- k$relativeGap
Yf <- Y[, seq_len(min(2*krelativeGap, ncol(Y)))]
Xf <- X[seq_len(min(2*krelativeGap, length(X)))]
Yf <- divideNorm(Yf, rowWise = TRUE)
clusf <- kmeans(Yf, krelativeGap, iter.max = maxIter, nstart = numStart)
conf <- conductance(adja = adja, clusLab = clusf$cluster)
# second order method
ksecondOrderGap <- k$secondOrderGap
Ys <- Y[, seq_len(min(2*ksecondOrderGap, ncol(Y)))]
Xs <- X[seq_len(min(2*ksecondOrderGap, length(X)))]
Ys <- divideNorm(Ys, rowWise = TRUE)
cluss <- kmeans(Ys, ksecondOrderGap, iter.max = maxIter, nstart = numStart)
cons <- conductance(adja = adja, clusLab = cluss$cluster)
# eigengap method
kadditiveGap <- k$additiveGap
Yg <- Y[, seq_len(min(2*kadditiveGap, ncol(Y)))]
Xg <- X[seq_len(min(2*kadditiveGap, length(X)))]
Yg <- divideNorm(Yg, rowWise = TRUE)
clusg <- kmeans(Yg, kadditiveGap, iter.max = maxIter, nstart = numStart)
cong <- conductance(adja = adja, clusLab = clusg$cluster)
res <- c(func(conf$clusterConductance),
func(cons$clusterConductance),
func(cong$clusterConductance))
names(res) <- c("relativeGap", "secondOrderGap", "additiveGap")
resmethod <- names(which.min(res))
temp <- paste0("selected method in cvConductance is ", resmethod )
message(temp)
methodCon <- list("relativeGap.conductance" = conf,
"secondOrderGap.conductance" = cons,
"additiveGap.conductance" = cong)
if(resmethod == "relativeGap"){
method <- "relativeGap"
k <- krelativeGap
Y <- Yf
X <- Xf
clus <- clusf
clusterLabels <- clus$cluster
}else if(resmethod == "secondOrderGap"){
method <- "secondOrderGap"
k <- ksecondOrderGap
Y <- Ys
X <- Xs
clus <- cluss
clusterLabels <- clus$cluster
}else{
method <- "additiveGap"
k <- kadditiveGap
Y <- Yg
X <- Xg
clus <- clusg
clusterLabels <- clus$cluster }
newList <- list("method" = method, "k" = k, "Y" = Y, "X" = X,
"clus" = clus, "clusterLabels" = clusterLabels,
"conductance" = methodCon)
return(newList)
|
412d446b |
}
################################################################################# sigClusGO
|
90fec3eb |
sigClusGO <- function(adja, Y, X, k, annotation, geneID,
maxIter = 1e+8, numStart = 1000){
# performs geneontology for a single cluster produced by
# either relativeGap, secondOrderGap, and additiveGap
#
# Arguments:
# adja: a matrix
# symmetric square matrix with values in (0,1)
# Y: the eigenvectors
# X: eigenvalues
# k: number of clusters
# annotation: genome wide annotation
# geneID: a vector of gene IDs
# maxIter: a number, indicates maximum number of iteration for kmeans
# numStart: a number, indicates the number of start for kmeans
#
# Value
# Y: normalized Y
# X: the eigenvalues
# k: number of clusters
# clus: clustering object returns by kmeans function
# conductance: the result returns by conductance function
# pvalues: the pvalues associated to the cluster GO
# df_clus: the cluster df for GO
#
Yt <- Y[, seq_len(min(2*k, ncol(Y)))]
Xt <- X[seq_len(min(2*k, length(X)))]
Yt <- divideNorm(Yt, rowWise = TRUE)
clust <- kmeans(Yt, k, iter.max = maxIter, nstart = numStart)
cont <- conductance(adja = adja, clusLab = clust$cluster)
lab.t <- names(which.min(cont$clusterConductance))
ind.t <- which(clust$cluster == lab.t)
genes.t <- geneID[ind.t]
nl.t <- GOenrichment(universe_geneIDs = geneID,
cluster_geneIDs = genes.t,
annotation_db = annotation,
lab = lab.t,
direction = c("over", "under"),
ontology = c("BP", "CC", "MF"),
hgCutoff = NULL,
condition = TRUE)
newList <- list("Y" = Yt, "X" = Xt, "k" = k, "clus" = clust,
"conductance" = cont, "pvalues" = nl.t$GO_summary$Pvalue,
"df_clus" = nl.t$GO_summary)
return(newList)
|
412d446b |
}
################################################################################# cvGO
|
90fec3eb |
cvGO <- function(adja, k, Y, X, annotation, geneID,
|
412d446b |
maxIter = maxIter, numStart = numStart, func = sum){
|
90fec3eb |
# perform cross validation
#
# Arguments:
# adja: a matrix
# symmetric square matrix with values in (0,1)
# Y: the eigenvectors
# X: eigenvalues
# k: number of clusters
# annotation: genome wide annotation
# geneID: a vector of gene IDs
# maxIter: a number, indicates maximum number of iteration for kmeans
# numStart: a number, indicates the number of start for kmeans
# func: function to perform cv for the GOs
#
# Values:
# a list of containing
# method: string of method names
# k: scalar for number of cluster
# Y: a rectangular matrix of the transformation
# X: a vector of eigenvalues correspond to Y
# cluster: output of kmeans
# clusterLabels: a vector of datapoints labels
# conductance for the best selected method
|
412d446b |
|
90fec3eb |
message("\n Gene Ontology Validation...")
|
412d446b |
|
90fec3eb |
# first order method
message("method relativeGap....")
nl.f <- sigClusGO(adja, Y, X, k$relativeGap, annotation, geneID,
maxIter = maxIter, numStart = numStart)
f <- -log10(nl.f$pvalues)
|
412d446b |
|
90fec3eb |
if(nrow(nl.f$df_clus) >0 ){
nl.f$df_clus$clusterNum <- "relativeGap"}
|
412d446b |
|
90fec3eb |
df.f <- nl.f$df_clus
|
412d446b |
|
90fec3eb |
# second order method
|
412d446b |
|
90fec3eb |
message("\n method secondOrderGap...")
if(k$secondOrderGap != k$relativeGap){
|
412d446b |
|
90fec3eb |
nl.s <- sigClusGO(adja, Y, X, k$secondOrderGap, annotation, geneID,
maxIter = maxIter, numStart = numStart)
s <- -log10(nl.s$pvalues)
|
412d446b |
|
90fec3eb |
if(nrow(nl.s$df_clus) >0 ){
nl.s$df_clus$clusterNum <- "secondOrderGap" }
|
412d446b |
|
90fec3eb |
df.s <- nl.s$df_clus
|
412d446b |
|
90fec3eb |
}else {
|
412d446b |
|
90fec3eb |
message("GO enrichment is the same is relativeGap!")
nl.s <- nl.f
s <- nl.s$pvalues
|
412d446b |
|
90fec3eb |
if(nrow(nl.s$df_clus) >0 ){
nl.s$df_clus$clusterNum <- "secondOrderGap" }
|
412d446b |
|
90fec3eb |
df.s <- nl.s$df_clus}
|
412d446b |
|
90fec3eb |
message("\n method additiveGap....")
if(k$additiveGap == k$relativeGap){
message("GO enrichment is the same is relativeGap!")
|
412d446b |
|
90fec3eb |
nl.g <- nl.f
g <- -log10(nl.g$pvalues)
|
412d446b |
|
90fec3eb |
if(nrow(nl.g$df_clus) >0 ){
nl.g$df_clus$clusterNum <- "additiveGap"}
|
412d446b |
|
90fec3eb |
df.g <- nl.g$df_clus
|
412d446b |
|
90fec3eb |
}else if(k$additiveGap == k$secondOrderGap){
message("GO enrichment is the same is secondOrderGap!")
|
412d446b |
|
90fec3eb |
nl.g <- nl.s
g <- -log10(nl.g$pvalues)
|
412d446b |
|
90fec3eb |
if(nrow(nl.g$df_clus) >0 ){
nl.g$df_clus$clusterNum <- "additiveGap"}
|
412d446b |
|
90fec3eb |
df.g <- nl.g$df_clus
|
412d446b |
|
90fec3eb |
}else{
nl.g <- sigClusGO(adja, Y, X, k$additiveGap, annotation, geneID,
maxIter = maxIter, numStart = numStart)
g <- -log10(nl.g$pvalues)
|
412d446b |
|
90fec3eb |
if(nrow(nl.g$df_clus) >0 ){
nl.g$df_clus$clusterNum <- "additiveGap"}
|
412d446b |
|
90fec3eb |
df.g <- nl.g$df_clus}
|
412d446b |
|
90fec3eb |
methodCon <- list("relativeGap.conductance" = nl.f$conductance,
"secondOrderGap.conductance" = nl.s$conductance,
"additiveGap.conductance" = nl.g$conductance)
|
412d446b |
|
90fec3eb |
# taking cv function over p-values
res <- c(func(f), func(s), func(g))
names(res) <- c("relativeGap", "secondOrderGap", "additiveGap")
resmethod <- names(which.max(res))
|
412d446b |
|
90fec3eb |
if(resmethod == "relativeGap"){
method <- "relativeGap"
k <- nl.f$k
Y <- nl.f$Y
X <- nl.f$X
clus <- nl.f$clus
clusterLabels <- clus$cluster
conduct <- nl.f$conductance
df <- nl.f$df_clus
|
412d446b |
|
90fec3eb |
}else if(resmethod == "secondOrderGap"){
method <- "secondOrderGap"
k <- nl.s$k
Y <- nl.s$Y
X <- nl.s$X
clus <- nl.s$clus
clusterLabels <- clus$cluster
conduct <- nl.s$conductance
df <- nl.s$df_clus
|
412d446b |
|
90fec3eb |
}else{
method <- "additiveGap"
k <- nl.g$k
Y <- nl.g$Y
X <- nl.g$X
clus <- nl.g$clus
clusterLabels <- clus$cluster
conduct <- nl.g$conductance
df <- nl.g$df_clus
|
412d446b |
|
90fec3eb |
}
|
412d446b |
|
90fec3eb |
df <- rbind(df.f, df.s, df.g)
newList <- list("method" = method, "k" = k, "Y" = Y, "X" = X,
"clus" = clus, "clusterLabels" = clusterLabels,
"conductance" = methodCon, "df" = df)
|
412d446b |
|
90fec3eb |
return(newList)
}
|
412d446b |
|
90fec3eb |
################################################################################# transformation
|
147671fb |
transformation <- function(adjaMat, geneID, eff.egs){
|
412d446b |
|
147671fb |
# Calculates the data embedding, eigenvalues and eigenvectors
#
# Arguments:
# adjaMat: adjacency matrix, square symmetric with values in (0, 1)
# and 0 diagonal
# geneID: a vector of geneID
# there is one-to-one correspondence between geneID and rows
# in the adjacency matrix
#
# Value: a list of following
# Y: eigenvectors
# X: eigenvalues
# geneID: a vector of geneIDs
# ind: a vector of dropping genes
|
412d446b |
|
90fec3eb |
d <- as.matrix(as.vector(rowSums(adjaMat)))
ind <- which(d == 0)
if(length(ind) > 0){
adjaMat <- adjaMat[-ind, ]
adjaMat <- adjaMat[, -ind]
|
147671fb |
if(!is.null(geneID)){geneID <- geneID[-ind] }
}
|
412d446b |
|
90fec3eb |
# Normalized Laplacian
message("calculating normalized Laplacian \n it may take time...")
|
147671fb |
D <- diag(rowSums(adjaMat))
d <- as.matrix(as.vector(rowSums(adjaMat)))
|
90fec3eb |
diag(D) <- 1/sqrt(diag(D))
L <- D %*% adjaMat %*% D
|
412d446b |
|
fb737d07 |
k <- min(200, nrow(adjaMat))
|
147671fb |
|
90fec3eb |
# Calculate the eigenvalues and eigenvectors
message("calculating eigenvalues/vectors \n it may take time...")
|
147671fb |
if(eff.egs){
egvv <- eigs_sym(L, k = k, which = "LA")
}else{
egvv <- eigen(L)}
|
90fec3eb |
Y <- egvv$vectors
X <- egvv$values
if(!is.unsorted(X)){
warning(" The eigenvalues are not sorted!", call. = FALSE)
message("sorting the eigenvalues/vectors...")
eg <- as.matrix(rbind(X, Y))
eg <- as.data.frame(t(eg))
colnames(eg)[1] <- "eigenvalues"
eg <- eg[order(eg$eigenvalues, decreasing = TRUE), ]
eg <- t(as.matrix(eg))
X <- eg[1, ]
temp <- seq_len(nrow(eg))
temp <- temp[-1]
Y <- eg[temp, ]}
# preprocessing to check for noisy genes
if(sum(round(X, 7) == 1) > 1 ){
message("dropping noisy genes...")
nois_ind <- findNoise(evec = Y, thresh = 100)
adjaMat <- adjaMat[-nois_ind, ]
adjaMat <- adjaMat[, -nois_ind]
D <- D[-nois_ind, ]
D <- D[, -nois_ind]
ind <- c(ind, nois_ind)
if(!is.null(geneID)){geneID <- geneID[-nois_ind]
temp <- paste0("number of noisy genes are... ", length(nois_ind))
message(temp)
message("calculation from begining...")}
# Calculate the diagonal degree matrix
D <- diag(rowSums(adjaMat))
d <- as.matrix(as.vector(rowSums(adjaMat)))
# Normalized Laplacian
message("calculating normalized Laplacian \n it may take time...")
diag(D) <- 1/sqrt(diag(D))
L <- D %*% adjaMat %*% D
|
fb737d07 |
k <- min(200, nrow(adjaMat))
|
147671fb |
|
90fec3eb |
# Calculate the eigenvalues and eigenvectors
message("calculating eigenvalues/vectors \n it may take time...")
|
147671fb |
if(eff.egs){
egvv <- eigs_sym(L, k = k, which = "LA")
}else{
egvv <- eigen(L)}
|
90fec3eb |
Y <- egvv$vectors
|
147671fb |
X <- egvv$values } # end of if preprocessing
|
90fec3eb |
#
Y <- D %*% Y
Y <- sweep(Y, 2, (t(d) %*% Y)/sum(d))
#
D <- diag(rowSums(adjaMat))
scalar <- function(x, D){return(x/sqrt(as.vector(t(x) %*% D %*% x)))}
Y <- apply(Y, 2, scalar, D)
constant <- t(Y[,1]) %*% D %*% Y[, 1]
if(round(constant, 7) != 1 ){
warning("eigenvector 1 condition does not hold! ", call. = FALSE)
temp <- paste0("the condition value is ",
|
fb737d07 |
as.double(t(Y[,1]) %*% D %*% Y[, 1] != 1))
|
90fec3eb |
message(temp) }
|
147671fb |
newList <- list("Y" = Y, "X" = X, "ind" = ind, "geneID" = geneID)
}
################################################################################# clustering2
clustering <- function(adjaMat, geneID , annotation_db ,
kopt = NULL, method = NULL,
func.GO = sum, func.conduct = min,
maxIter = 1e8, numStart = 1000, eff.egs = TRUE,
saveOrig = TRUE, n_egvec = 200, sil = FALSE){
if(is.data.frame(adjaMat)){
warning("adjaMat is a dataframe", call. = FALSE)
message("converting adjancecy to matrix")
df2mat(adjaMat)}
if(!is.matrix(adjaMat)){ stop("input must be a matrix") }
checkNumeric(adjaMat, " transformation input")
checkSym(adjaMat, " transformation input")
ng <- nrow(adjaMat)
if(!is.null(geneID)){
if(length(geneID) != ng){
warning("length of geneID must be equal to number of rows (or columns) in adjaMat",
call. = FALSE)
message("setting geneIDs to NULL")
geneID <- NULL} }
if(is.null(geneID)){
geneID <- paste0(rep("gene", ng), seq_len(ng))}
if(!is.null(annotation_db) & !is.character(annotation_db)){
stop("annotation_db must be character or NULL", call. = FALSE)}
if(!is.character(geneID)){
warning("type of geneID is not character")
message("makig geneIDs type to character")
geneID <- as.character(geneID) }
if(!is.null(kopt) && kopt != round(kopt)){
warning("kopt must be either null or an integer", call. = FALSE)
message("making kopt null")
kopt <- NULL}
if(length(setdiff(method, c("relativeGap",
"secondOrderGap", "additiveGap"))) != 0){
warning("method can be either relativeGap, secondOrderGag, or additiveGap",
call. = FALSE)
message("making method to NULL")
method <- NULL }
if(!is.numeric(maxIter) || !is.numeric(numStart)){
warning("maxIter and numStart must be numeric and integer", call. = FALSE)
message("making maxIter and numStart to default")
maxIter <- 1e+8
numStart <- 1000}
if(n_egvec != round(n_egvec)){
warning("n_egvec must be an integer ", call. = FALSE)
message("setting n_egvec to 200...")
|
fb737d07 |
n_egvec <- 200}
|
147671fb |
if(n_egvec > 200){
warning("maximum value for n_egvec is 200", call. = FALSE)
message("setting n_egvec to 200...")
|
fb737d07 |
n_egvec <- 200 }
|
147671fb |
################### transformation
nl <- transformation(adjaMat, geneID, eff.egs)
X <- nl$X
Y <- nl$Y
ind <- nl$ind
geneID <- nl$geneID
|
90fec3eb |
# Drop first eigenvalue and eigenvectors
X <- X[-1]
Y <- Y[, -1]
# saving original
if(saveOrig == TRUE){
temp <- paste0("n_egvec is ", n_egvec)
message(temp)
Yorig <- Y[, seq_len(min(n_egvec, ncol(Y)))]
Xorig <- X[seq_len(min(n_egvec, length(X)))]
|
4f49cf88 |
}
|
412d446b |
|
90fec3eb |
plt <- list()
|
412d446b |
|
90fec3eb |
#################################################################################
if(is.null(kopt) && is.null(method) && !is.null(annotation_db)){
|
412d446b |
|
90fec3eb |
k <- clusterNumber(egvals = X, maxNum = 102)
plt <- c(plt, setNames(list(k$plots.relativeGap), "relativeGap"),
|
fb737d07 |
setNames(list(k$plots.secondOrderGap), "secondOrderGap" ),
setNames(list(k$plots.additiveGap), "additiveGap") )
|
90fec3eb |
cvopt <- cvGO(adja = adjaMat, k = k, Y = Y, X = X,
|
fb737d07 |
annotation = annotation_db, geneID = geneID,
maxIter = maxIter, numStart = numStart, func = func.GO)
|
147671fb |
|
4f49cf88 |
temp <- paste0("\n method ", cvopt$method, " is selected using GO validation and k is ", cvopt$k)
|
90fec3eb |
message(temp)
|
147671fb |
newList <- list("dropped.indices" = ind, "geneID" = geneID, "method" = cvopt$method,
"k" = cvopt$k, "Y" = cvopt$Y, "X" = cvopt$X, "cluster" = cvopt$clus,
"clusterLabels" = as.character(cvopt$clusterLabels),
"conductance" = cvopt$conductance, "cvGOdf" = cvopt$df, "cv" = "cvGO",
|
90fec3eb |
"clusterNumberPlot" = plt)
} else if(is.null(kopt) && is.null(method) && is.null(annotation_db)){
k <- clusterNumber(egvals = X, maxNum = 102)
plt <- c(plt, setNames(list(k$plots.relativeGap), "reltiveGap"),
|
fb737d07 |
setNames(list(k$plots.secondOrderGap), "secondOrderGap" ),
setNames(list(k$plots.additiveGap), "additiveGap") )
|
90fec3eb |
cvopt <- cvConductance(adja = adjaMat, k = k, Y = Y, X = X,
|
fb737d07 |
func = func.conduct,
maxIter = maxIter, numStart = numStart)
|
147671fb |
|
4f49cf88 |
temp <- paste0("\n method ", cvopt$method, " is selected using conductance index validation and k is ", cvopt$k)
|
90fec3eb |
message(temp)
|
147671fb |
newList <- list("dropped.indices" = ind, "geneID" = geneID, "method" = cvopt$method,
"k" = cvopt$k, "Y" = cvopt$Y, "X" = cvopt$X, "cluster" = cvopt$clus,
"clusterLabels" = as.character(cvopt$clusterLabels),
"conductance" = cvopt$conductance, "cv" = "cvConductance",
|
90fec3eb |
"clusterNumberPlot" = plt)
}else if(is.null(kopt) && !is.null(method)){
k <- clusterNumber(egvals = X, maxNum = 102)
if(method == "relativeGap"){
# first order method
krelativeGap <- k$relativeGap
Yf <- Y[, seq_len(min(2*krelativeGap, ncol(Y)))]
Xf <- X[seq_len(min(2*krelativeGap, length(X)))]
Yf <- divideNorm(Yf, rowWise = TRUE)
clusf <- kmeans(Yf, krelativeGap, iter.max = maxIter, nstart = numStart)
conf <- conductance(adja = adjaMat, clusLab = clusf$cluster)
|
4f49cf88 |
temp <- paste0("\n method ", "relativeGap", " is selected using user and k is ", k$relativeGap)
|
90fec3eb |
message(temp)
|
147671fb |
newList <- list("dropped.indices" = ind, "geneID" = geneID, "method" = "relativeGap",
"k" = k$relativeGap, "Y" = Yf, "X" = Xf, "cluster" = clusf,
"clusterLabels" = as.character(clusf$cluster),
"conductance" = conf, "cv" = "userMethod")
|
90fec3eb |
} else if(method == "secondOrderGap"){
# second order method
ksecondOrderGap <- k$secondOrderGap
Ys <- Y[, seq_len(min(2*ksecondOrderGap, ncol(Y)))]
Xs <- X[seq_len(min(2*ksecondOrderGap, length(X)))]
Ys <- divideNorm(Ys, rowWise = TRUE)
cluss <- kmeans(Ys, ksecondOrderGap, iter.max = maxIter, nstart = numStart)
cons <- conductance(adja = adjaMat, clusLab = cluss$cluster)
|
4f49cf88 |
temp <- paste0("\n method ", "secondOrderGap", " is selected using user and k is ", k$secondOrderGap)
|
90fec3eb |
message(temp)
|
147671fb |
newList <- list("dropped.indices" = ind, "geneID" = geneID, "method" = "secondOrderGap",
"k" = k$secondOrderGap, "Y" = Ys, "X" = Xs, "cluster" = cluss,
"clusterLabels" = as.character(cluss$cluster),
"conductance" = cons, "cv" = "userMethod")
|
90fec3eb |
} else if(method == "additiveGap"){
# gap method
kadditiveGap <- k$additiveGap
Yg <- Y[, seq_len(min(2*kadditiveGap, ncol(Y)))]
Xg <- X[seq_len(min(2*kadditiveGap, length(X)))]
Yg <- divideNorm(Yg, rowWise = TRUE)
clusg <- kmeans(Yg, kadditiveGap, iter.max = maxIter, nstart = numStart)
cong <- conductance(adja = adjaMat, clusLab = clusg$cluster)
|
4f49cf88 |
temp <- paste0("\n method ", "additiveGap", " is selected using user and k is ", k$additiveGap)
|
90fec3eb |
message(temp)
|
147671fb |
newList <- list("dropped.indices" = ind, "geneID" = geneID, "method" = "additiveGap",
"k" = k$additiveGap, "Y" = Yg, "X" = Xg, "cluster" = clusg,
"clusterLabels" = as.character(clusg$cluster),
"conductance" = cong, "cv" = "userMethod")
|
90fec3eb |
} else {
stop(" method for number of clusters can be either relativeGap,
secondOrderGap, or additiveGap")}
}else if(!is.null(kopt)){
# optimal by user
Yopt <- Y[, seq_len(min(2*kopt, ncol(Y)))]
Xopt <- X[seq_len(min(2*kopt, length(X)))]
Yopt <- divideNorm(Yopt, rowWise = TRUE)
clusopt <- kmeans(Yopt, kopt, iter.max = maxIter, nstart = numStart)
conopt <- conductance(adja = adjaMat, clusLab = clusopt$cluster)
|
147671fb |
temp <- paste0("\n selected k is ", kopt, " by user")
|
90fec3eb |
message(temp)
|
147671fb |
newList <- list("dropped.indices" = ind, "geneID" = geneID, "method" = "userkopt",
"k" = kopt, "Y" = Yopt, "X" = Xopt, "cluster" = clusopt,
"clusterLabels" = as.character(clusopt$cluster),
"conductance" = conopt, "cv" = "userkopt")
|
90fec3eb |
}
################### silhouette
if(sil){
message("calculating the Silhouette index \n it may take time...")
|
147671fb |
sil <- silhouette(dis.y = dist(Y), clus.labels = newList$clusterLabels)
|
90fec3eb |
sil <- sil %>% arrange(clusterLabel ,-silIndex)
sil$clusterLabel <- as.factor(sil$clusterLabel)
genes <- data.frame(geneID = geneID, geneIndices = seq(1, length(geneID)))
sil <- inner_join(sil, genes, by = "geneIndices")
sil <- sil[ , !(names(sil) %in% "geneIndices")]
newList <- c(newList, setNames(list(sil), "silhouette"))
}
if(saveOrig == TRUE){
tlist <- list("Yorig" = Yorig, "Xorig" = Xorig, "n_egvec" = ncol(Yorig))
newList <- c(newList, setNames(list(tlist), "original"))}
message("network clustering is done...\n")
return(newList)
|
412d446b |
}
|
90fec3eb |
|