#' KEGG server base URL #' #' @return A string with URL. This can be changed by options(KEGG_BASE_URL = #' "different/url"). #' @noRd get_kegg_url <- function() { getOption("KEGG_BASE_URL", "https://rest.kegg.jp") } #' Find all species available from KEGG #' #' @param on_error A character string indicating the error handling strategy: #' either "stop" to halt execution, "warn" to issue a warning and return #' `NULL` or "ignore" to return `NULL` without warnings. Defaults to "stop". #' #' @return A tibble, in which column \code{designation} contains species #' designations used in function \code{fetch_kegg}. #' @export #' @examples #' spe <- fetch_kegg_species(on_error = "warn") fetch_kegg_species <- function(on_error = c("stop", "warn", "ignore")) { on_error <- match.arg(on_error) resp <- http_request(get_kegg_url(), "list/organism") if(resp$is_error) return(catch_error("KEGG", resp, on_error)) st <- httr2::resp_body_string(resp$response) readr::read_tsv(I(st), col_names = c("id", "designation", "species", "phylogeny"), show_col_types = FALSE) } #' Download pathway data from KEGG #' #' @param species A valid species designation used by KEGG. #' @param on_error A character string indicating the error handling strategy: #' either "stop" to halt execution, "warn" to issue a warning and return #' `NULL` or "ignore" to return `NULL` without warnings. Defaults to "stop". #' #' @return A tibble with columns \code{gene_id} and \code{term_id}. #' @noRd fetch_kegg_pathways <- function(species, on_error) { # Binding variables from non-standard evaluation locally term_id <- NULL resp <- http_request(get_kegg_url(), stringr::str_glue("list/pathway/{species}")) if(resp$is_error) return(catch_error("KEGG", resp, on_error)) st <- httr2::resp_body_string(resp$response) readr::read_tsv(I(st), col_names = c("term_id", "term_name"), show_col_types = FALSE) |> dplyr::mutate(term_id = stringr::str_remove(term_id, "path:")) } #' Parse flat file returned from KEGG GET API #' #' @param s A string containing flat file result #' #' @return A tibble with \code{gene_is}, \code{gene_symbol} and \code{term_id}. #' @noRd parse_kegg_genes <- function(s) { # Binding variables from non-standard evaluation locally data <- gene_symbol <- gene_id <- NULL entries <- stringr::str_split(s, "///") |> unlist() entries <- entries[-length(entries)] # there is /// at the end purrr::map(entries, function(entry) { d <- stringr::str_split(entry, "\n") |> unlist() n <- length(d) i <- 1 # find ENTRY key while(i <= n & !(stringr::str_detect(d[i], "^ENTRY"))) i <- i + 1 paths <- stringr::str_split(d[i], "\\s+") |> unlist() pathway <- paths[2] # find GENE key i <- 1 while(i <= n & !(stringr::str_detect(d[i], "^GENE"))) i <- i + 1 # no GENE key found if(i > n) return(tibble::tibble( gene_id = character(0), gene_symbol = character(0) )) # extract genes genes <- stringr::str_remove(d[i], "GENE\\s+") i <- i + 1 while(i <= n & stringr::str_detect(d[i], "^\\s+")) { genes <- c(genes, stringr::str_remove(d[i], "^\\s+")) i <- i + 1 } purrr::map(genes, function(gene) { # First element - gene ID, second - gene symbol, if contains semicolon v <- stringr::str_split_1(gene, "\\s+") tibble::tibble( gene_id = v[1], gene_symbol = ifelse( stringr::str_detect(v[2], ";"), stringr::str_remove(v[2], ";.*$"), v[1] ), term_id = pathway ) }) |> purrr::list_rbind() }) |> purrr::list_rbind() } #' Download term - gene mapping from KEGG #' #' @param pathways A character vector with KEGG pathways #' @param batch_size Number of pathways sent to KEGG database in one query. The #' maximum allowed is 10. #' @param on_error A character string indicating the error handling strategy: #' either "stop" to halt execution, "warn" to issue a warning and return #' `NULL` or "ignore" to return `NULL` without warnings. Defaults to "stop". #' @importFrom assertthat assert_that #' @return A tibble with columns \code{gene_id} and \code{term_id} #' @noRd fetch_kegg_mapping <- function(pathways, batch_size, on_error) { assert_that(is.character(pathways)) batches <- split(pathways, ceiling(seq_along(pathways) / batch_size)) raise_error <- FALSE pb <- progress::progress_bar$new(total = length(batches)) tb <- purrr::map(batches, function(batch) { dbentries <- paste(batch, collapse = "+") # this returns a flat file resp <- http_request(get_kegg_url(), stringr::str_glue("get/{dbentries}")) if(resp$is_error) { catch_error("KEGG", resp, on_error) raise_error <<- TRUE return(NULL) } pb$tick() st <- httr2::resp_body_string(resp$response) parse_kegg_genes(st) }) |> purrr::list_rbind() if(raise_error) { return(NULL) } else { return(tb) } } #' Get functional term data from KEGG #' #' Download information (pathway ID and name) and gene-pathway mapping (entrez #' gene ID, gene symbol and pathway ID) from KEGG. Gene symbols are extracted #' from gene descriptions. For some species (e.g. yeast), gene symbols are #' returned instead of entrez IDs and not in gene description. #' #' @param species KEGG species code, for example "hsa" for human. The #' full list of available KEGG species can be found by using \code{fetch_kegg_species}. #' @param batch_size Number of pathways sent to KEGG database in one query. The #' maximum allowed is 10. #' @param on_error A character string indicating the error handling strategy: #' either "stop" to halt execution, "warn" to issue a warning and return #' `NULL` or "ignore" to return `NULL` without warnings. Defaults to "stop". #' #' @return A list with \code{terms} and \code{mapping} tibbles. #' @importFrom assertthat assert_that is.count #' @export #' @examples #' kegg_data <- fetch_kegg("mge", on_error = "warn") fetch_kegg <- function(species, batch_size = 10, on_error = c("stop", "warn", "ignore")) { on_error <- match.arg(on_error) assert_that(!missing(species), msg = "Argument 'species' is missing.") assert_that(is.count(batch_size)) assert_that(batch_size <= 10, msg = "batch_size needs to be between 1 and 10") terms <- fetch_kegg_pathways(species, on_error) if(is.null(terms)) return(NULL) mapping <- fetch_kegg_mapping(terms$term_id, batch_size, on_error) list( terms = terms, mapping = mapping ) }