#' ID sanitiser function #' #' Sanitise KEGG identifiers #' #' @param x Character vector, IDs to sanitise #' @param category Character, one of: #' \code{"pathway"}, \code{"module"}, \code{"enzyme"}, #' \code{"ncbi"}, \code{"reaction"}, \code{"compound"} #' #' @return Character vector, sanitised \code{x} #' #' @examples #' FELLA:::sanitise(c("path:hsa00010", "path:hsa00020"), "pathway", "hsa") #' #' @keywords internal sanitise <- function(x, category, organism) { old.attr <- attributes(x) if (category == "pathway") { ans <- gsub("(path:)(.+)(\\d{5})", paste0(organism, "\\3"), x) } if (category == "module") { ans <- gsub("(md:)(.*)(M\\d{5})", "\\3", x) } if (category == "enzyme") { ans <- ifelse( grepl(pattern = "-", x = x), NA, gsub("(ec:)(\\d+\\.\\d+\\.\\d+\\.\\d+)", "\\2", x)) } if (category == "ncbi") { ans <- gsub("(ncbi.+:)(.*\\d+)", "\\2", x) } if (category == "reaction") { ans <- gsub("(rn:)(R\\d{5})", "\\2", x) } if (category == "compound") { ans <- gsub("(cpd:)(C\\d{5})", "\\2", x) } attributes(ans) <- old.attr ans } #' Infer connections to EC #' #' Function \code{infere.con2ec} infers network connections to KEGG EC #' families by passing through genes. This assumes that the category being #' mapped to enzymes is above them. #' #' @param ids Character vector of identifiers to map. For example, #' all the KEGG pathways #' @param ent Character, entity that we are mapping #' (one of \code{"pathway"} and one of \code{"module"}) #' @param ent2gene Named character vector, names are the entity \code{ent} #' and values are genes #' @param gene2enyzme Named character vector, names are genes and #' values are EC enzyme families #' category Character, one of: #' #' @return Two-column data frame. Column \code{"from"} contains the #' KEGG enzyme families whereas \code{"to"} contains the entity \code{ent}. #' #' @examples #' ids <- "hsa00010" #' ent <- "pathway" #' ent2gene <- c("hsa00010" = "hsa:10", "hsa00010" = "hsa:120") #' gene2enzyme <- c("hsa:10" = "1.1.1.1", "hsa:120" = "1.2.3.4") #' FELLA:::infere.con2ec(ids, ent, ent2gene, gene2enzyme) #' #' @keywords internal infere.con2ec <- function(ids, ent, ent2gene, gene2enzyme) { ans <- plyr::ldply( ids, function(x) { aux <- ent2gene[names(ent2gene) == x] ans <- unique(gene2enzyme[names(gene2enzyme) %in% aux]) # names(ans) <- rep(x, length(ans)) data.frame(from = ans, to = rep(x, length(ans))) }) attr(ans, "from") <- "enzyme" attr(ans, "to") <- ent ans } #' Extract largest CC #' #' Function \code{largestcc} extracts the #' largest connected component of an igraph object #' #' @param graph Igraph object #' #' @return Connected igraph object #' #' @examples #' library(igraph) #' g <- barabasi.game(10) + graph.empty(10) #' FELLA:::largestcc(g) #' #' @keywords internal largestcc <- function(graph) { cl <- clusters(graph) x <- which.max(cl$csize) induced.subgraph(graph, which(cl$membership == x)) } #' @title Parse, build and load the KEGG knowledge model #' #' @description #' Function \code{buildGraphFromKEGGREST} makes use of the KEGG #' REST API (requires internet connection) #' to build and return the curated KEGG graph. #' #' @details #' In function \code{buildGraphFromKEGGREST}, #' The user specifies (i) an organism, and (ii) patterns matching #' pathways that should not be included as nodes. #' A graph object, as described in [Picart-Armada, 2017], #' is built from the comprehensive #' KEGG database [Kanehisa, 2017]. #' As described in the main vignette, accessible through #' \code{browseVignettes("FELLA")}, this graph has five levels that #' represent categories of KEGG nodes. #' From top to bottom: pathways, modules, enzymes, reactions and compounds. #' This knowledge representation is resemblant to the one formerly #' used by MetScape [Karnovsky, 2011], in which enzymes connect #' to genes instead of modules and pathways. #' The necessary KEGG annotations #' are retrieved through KEGGREST R package [Tenenbaum, 2013]. #' Connections between pathways/modules and enzymes are inferred through #' organism-specific genes, i.e. an edge is added if a gene #' connects both entries. #' However, in order to enrich metabolomics data, the user has to #' pass the graph object to \code{buildDataFromGraph} #' to obtain the \code{\link{FELLA.USER}} object. #' All the networks are handled with the igraph R package [Csardi, 2006]. #' #' #' @param organism Character, KEGG code for the organism of interest #' @param filter.path Character vector, pathways to filter. #' This is a pattern matched using regexp. #' E.g: \code{"01100"} to filter #' the overview metabolic pathway in any species #' #' @return \code{buildGraphFromKEGGREST} returns the #' curated KEGG graph (class \pkg{igraph}) #' #' @name data-funs #' @rdname data-funs #' #' @import igraph #' @import Matrix #' @import KEGGREST #' @import plyr #' @export buildGraphFromKEGGREST <- function( organism = "hsa", filter.path = NULL) { categories <- listCategories() # Data from KEGGREST # # List of id-name message("Building through KEGGREST...") info.org <- KEGGREST::keggInfo(organism) info.geneannot <- grep( "ncbi-[[:lower:]]+", capture.output(cat(info.org)), value = TRUE) info.geneannot <- gsub("[[:space:]]", "", info.geneannot) cat.geneannot <- head( intersect(c("ncbi-geneid", "ncbi-proteinid"), info.geneannot), 1 ) if (length(cat.geneannot) == 0) stop( "Organism ", organism, " does not appear to have either ", "ncbi-geneid or ncbi-proteinid. Please contact ", "FELLA's maintainer.") message( "Available gene annotations: ", paste(info.geneannot, collapse = ", "), ". Using ", cat.geneannot) list.list <- plyr::llply( stats::setNames(categories, categories), function(category) { # only pathways are organism-specific now # modules are filtered through genes if (category %in% c("pathway")) { ans <- KEGGREST::keggList( database = category, organism = organism) } else { ans <- KEGGREST::keggList(database = category) } names(ans) <- sanitise(names(ans), category, organism) ans }, .progress = "text" ) # Map identifiers to category map.category <- plyr::ldply( list.list, function(categ) data.frame(id = names(categ), stringsAsFactors = FALSE), .id = "category") map.category <- stats::setNames( as.character(map.category$category), map.category$id) # List of kegg links - essentially our edges list.link <- plyr::alply( expand.grid( categories, categories, KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE)[lower.tri( matrix(seq_len(25), nrow = 5)), ], 1, function(row) { original <- KEGGREST::keggLink(row[1], row[2]) df <- data.frame( from = sanitise(original, row[1], organism), to = sanitise(names(original), row[2], organism)) attr(df, "from") <- as.character(row[1]) attr(df, "to") <- as.character(row[2]) df }, .progress = "text" ) attributes(list.link) <- NULL # To mine mapping through enzymes m.path_gene <- KEGGREST::keggLink(organism, "pathway") %>% stats::setNames(., sanitise(names(.), "pathway", organism)) m.mod_gene <- KEGGREST::keggLink(organism, "module") %>% stats::setNames(., sanitise(names(.), "module", organism)) # Gene to enzyme m.gene_enzyme <- KEGGREST::keggLink("enzyme", organism) %>% sanitise(., "enzyme", organism) # Enzyme to gene, but giving the entrez id rather than KEGG's # Map kegg to entrez keggGene2entrez <- KEGGREST::keggConv(cat.geneannot, organism) %>% sanitise(., category = "ncbi") %>% split(., names(.)) # Map kegg enzymes to entrez m.enzyme_gene <- KEGGREST::keggLink(organism, "enzyme") %>% stats::setNames(., sanitise(names(.), "enzyme", organism)) %>% # sanitise(., "gene", organism) %>% split(., names(.), drop = TRUE) %>% plyr::llply( ., function(r) sort(as.character(unique(keggGene2entrez[r])))) # Inferred connections (through genes) con.infere <- list( infere.con2ec( names(list.list$pathway), "pathway", m.path_gene, m.gene_enzyme), infere.con2ec( names(list.list$module), "module", m.mod_gene, m.gene_enzyme) ) # Direct connections df.noinfere <- plyr::ldply( list.link, function(df.piece) { a.from <- attr(df.piece, "from") a.to <- attr(df.piece, "to") if (a.from == "enzyme" & (a.to %in% c("module", "pathway"))) return(NULL) return(df.piece) }, .id = NULL ) df.infere <- plyr::ldply( con.infere, function(df.piece) { a.from <- attr(df.piece, "from") a.to <- attr(df.piece, "to") return(df.piece) } ) matrix.adjacency <- as.matrix(rbind( df.noinfere, df.infere )) message("Done.") message("Building graph...") g.raw <- igraph::simplify( graph.edgelist(matrix.adjacency, directed = TRUE) ) V(g.raw)$com <- match(map.category[V(g.raw)$name], categories) # Nodes without a kegg name are either obsolete or inexistent g.raw <- delete.vertices( g.raw, which(is.na(V(g.raw)$com))) # Enzymes that cannot be inferred should be deleted # (not found in desired species!) g.raw <- delete.vertices( g.raw, which((V(g.raw)$com == 3) & !(V(g.raw)$name %in% df.infere$from))) # Same for the modules that do no belong to the species # Keep only those that have at least one gene associated # # Alternative: keggLink("genome", "compound") and pick only # those of the organism code # The downside is that it takes around 90s, but might be # a safer option # Checked in 19/10/2019 and both approaches are equivalent org.modules <- unique(names(m.mod_gene)) g.raw <- delete.vertices( g.raw, which((V(g.raw)$com == 2) & !(V(g.raw)$name %in% org.modules))) # Order by category and id g.raw <- permute.vertices( g.raw, order(order(V(g.raw)$com, V(g.raw)$name))) # Weighting the edges tmp <- get.edges(g.raw, E(g.raw)) E(g.raw)$weight <- abs(V(g.raw)$com[tmp[, 1]] - V(g.raw)$com[tmp[, 2]]) # Keep only reactions in a pathway # i.e. delete reactions that don't have any 3-weight edge g.raw <- (setdiff( which(V(g.raw)$com == 4), get.edges(g.raw, E(g.raw)[E(g.raw)$weight == 3])[, 1]) %>% delete.vertices(graph = g.raw, .)) # Keep only compounds that are reactants/products in these reactions # i.e. delete compounds that don't have any 1-weight edge g.raw <- (setdiff( which(V(g.raw)$com == 5), get.edges(g.raw, E(g.raw)[E(g.raw)$weight == 1])[, 1]) %>% delete.vertices(graph = g.raw, .)) # Other filtering (remove nodes?) if (!is.null(filter.path)) { names.path <- V(g.raw)[V(g.raw)$com == 1]$name filter.out <- lapply( filter.path, function(p) { which(grepl(p, names.path)) }) names.out <- names.path[unique(unlist(filter.out))] message(paste0("Filtering ", length(names.out), " pathways.")) g.raw <- delete.vertices(g.raw, names.out) } g.raw <- largestcc(g.raw) message("Done.") message("Pruning graph...") # CURATE GRAPH # We start with the graph curation edges.split <- split(seq_len(ecount(g.raw)), E(g.raw)$weight) message(paste0("Current weight: 1 out of 4...")) g.curated <- subgraph.edges( graph = g.raw, eids = edges.split[[1]], delete.vertices = FALSE) for (w in names(edges.split)[-1]) { current.w <- as.numeric(w) message(paste0("Current weight: ", w, " out of 4...")) dist.matrix <- distances(g.curated, mode = "out") list.edges <- edges.split[[w]] list.ends <- ends(g.raw, list.edges) new.edges <- dist.matrix[list.ends] > E(g.raw)$weight[list.edges] g.curated <- add.edges( graph = g.curated, edges = t(list.ends[new.edges, ]), attr = list(weight = E(g.raw)[list.edges[new.edges]]$weight)) } # Final edge weights have to be inverted E(g.curated)$weight <- 1/E(g.curated)$weight tmp <- list.list names(tmp) <- NULL tmp <- unlist(tmp) # Tried sorting according to number of characters # (take shortest names). This is weird, as some names # are not very known. I will leave the original order V(g.curated)$NAME <- strsplit(tmp[V(g.curated)$name], split = "; ") V(g.curated)$entrez <- m.enzyme_gene[V(g.curated)$name] comment(g.curated) <- info.org g.curated$organism <- organism message("Done.") keggdata.graph <- g.curated return(keggdata.graph) }