R/MsBackendMassbankSql-functions.R
a1380b07
 #' @rdname MsBackendMassbankSql
476b908c
 #'
a1380b07
 #' @export MsBackendMassbankSql
 MsBackendMassbankSql <- function() {
b99a42e6
     if (!.has_dbi_package())
a1380b07
         stop("'MsBackendMassbankSql' requires package 'DBI'. Please ",
476b908c
              "install with 'install.packages(\"DBI\")'")
a1380b07
     new("MsBackendMassbankSql")
476b908c
 }
 
b99a42e6
 .has_dbi_package <- function() {
     requireNamespace("DBI", quietly = TRUE)
 }
 
a1380b07
 #' @importFrom DBI dbListTables
 #'
 #' @noRd
476b908c
 .valid_dbcon <- function(x) {
     if (length(x)) {
         if (!inherits(x, "DBIConnection"))
             return("'dbcon' is expected to be a connection to a database")
         tables <- dbListTables(x)
ab3019c2
         if (!all(c("msms_spectrum", "msms_spectrum_peak") %in% tables))
476b908c
             return(paste0("Database lacks some required tables. Is 'dbcon' a",
                           " connection to a MassBank database?"))
     }
     NULL
 }
 
 #' Returns the spectra data, from the database and eventually filling with
 #' *core* spectra variables, if they are not available in the database.
 #'
 #' The data can be either:
 #' - in the database.
 #' - in the local data (if new variables were added with $name <-).
a7054766
 #' - core spectra variables - if they are not in the database they have to be
 #'   initialized with `NA` and the correct data type.
476b908c
 #'
 #' @return a `data.frame` - always, even if only with a single column.
 #'
cab71014
 #' @importFrom IRanges NumericList CharacterList
a1380b07
 #'
 #' @importFrom S4Vectors extractCOLS
 #'
7a7941cf
 #' @importFrom S4Vectors make_zero_col_DFrame
 #'
 #' @importFrom methods as callNextMethod getMethod
a7054766
 #'
476b908c
 #' @author Johannes Rainer
 #'
 #' @noRd
a7054766
 .spectra_data_massbank_sql <- function(x, columns = spectraVariables(x)) {
7a7941cf
     res <- getMethod("spectraData", "MsBackendCached")(x, columns = columns)
     if (is.null(res))
         res <- make_zero_col_DFrame(length(x))
     ## Define what needs to be still retrieved.
     db_cols <- intersect(columns, x@spectraVariables)
     db_cols <- db_cols[!db_cols %in% c("mz", "intensity", colnames(res))]
a7054766
     mz_cols <- intersect(columns, c("mz", "intensity"))
7a7941cf
 
476b908c
     if (length(db_cols)) {
7a7941cf
         res <- cbind(
e6462675
             res, as(.fetch_spectra_data_sql(x, columns = db_cols), "DataFrame"))
cab71014
         if (any(colnames(res) == "synonym"))
             res$synonym <- CharacterList(res$synonym, compress = FALSE)
476b908c
     }
a7054766
     ## Get m/z and intensity values
     if (length(mz_cols)) {
         pks <- .fetch_peaks_sql(x, columns = mz_cols)
ab3019c2
         f <- factor(pks$spectrum_id)
a7054766
         if (any(mz_cols == "mz")) {
7a7941cf
             mzs <- unname(split(pks$mz, f)[as.character(x@spectraIds)])
             res$mz <- NumericList(mzs, compress = FALSE)
a7054766
         }
         if (any(mz_cols == "intensity")) {
7a7941cf
             ints <- unname(
                 split(pks$intensity, f)[as.character(x@spectraIds)])
             res$intensity <- NumericList(ints, compress = FALSE)
a7054766
         }
476b908c
     }
a1380b07
     extractCOLS(res, columns)
476b908c
 }
a7054766
 
a1380b07
 #' @importFrom DBI dbSendQuery dbBind dbFetch dbClearResult
 #'
 #' @noRd
a7054766
 .fetch_peaks_sql <- function(x, columns = c("mz", "intensity")) {
     if (length(x@dbcon)) {
2e9829c4
         dbGetQuery(
             x@dbcon,
             paste0("select spectrum_id,", paste(columns, collapse = ","),
                    " from msms_spectrum_peak where spectrum_id in (",
                    paste0("'", unique(x@spectraIds), "'", collapse = ","),")"))
a7054766
     } else {
6573c5b9
         res <- data.frame(character(), lapply(columns, function(z) numeric()))
         colnames(res) <- c("spectrum_id", columns)
         res
a7054766
     }
 }
 
d59747ce
 ## #' This function ensures that m/z values are ALWAYS returned ordered! Note that
 ## #' this is slightly faster than including `order by mz` in the SQL query.
 ## #'
 ## #' Note however that splitting the data.frame later is slower if the full data
 ## #' frame is ordered by m/z.
 ## #'
 ## #' @noRd
 ## .fetch_peaks_sql_mz_order <- function(x, columns = c("mz", "intensity")) {
 ##     if (length(x@dbcon)) {
 ##         res <- dbGetQuery(
 ##             x@dbcon,
 ##             paste0("select spectrum_id,", paste(columns, collapse = ","),
 ##                    " from msms_spectrum_peak where spectrum_id in (",
 ##                    paste0("'", unique(x@spectraIds), "'", collapse = ","),")"))
 ##         res[order(res$mz), , drop = FALSE]
 ##     } else {
 ##         res <- data.frame(character(), lapply(columns, function(z) numeric()))
 ##         colnames(res) <- c("spectrum_id", columns)
 ##         res
 ##     }
 ## }
 
ab3019c2
 .columns_sql <- c(
     precursorIntensity = "precursor_intensity",
     precursorMz = "precursor_mz_text",
7f6c1c00
     msLevel = "ms_level",
b99a42e6
     compound_id = "msms_spectrum.compound_id",
     collisionEnergy = "collision_energy_text"
ab3019c2
 )
 
 .map_spectraVariables_to_sql <- function(x) {
     for (i in seq_along(.columns_sql))
         x <- sub(names(.columns_sql)[i], .columns_sql[i], x, fixed = TRUE)
     x
 }
 
cab71014
 .map_sql_to_spectraVariables <- function(x) {
     for (i in seq_along(.columns_sql))
         x <- sub(.columns_sql[i], names(.columns_sql[i]), x, fixed = TRUE)
     x
 }
 
7f6c1c00
 #' Simple helper that creates a join query depending on the provided columns.
 #'
 #' @param x `Spectra`.
 #'
 #' @param columns `character` with the column names.
 #'
 #' @noRd
 .join_query <- function(x, columns) {
cab71014
     res <- "msms_spectrum"
7f6c1c00
     if (any(columns %in% [email protected]$ms_compound))
cab71014
         res <- paste0(res, " join ms_compound on (msms_spectrum.compound_id",
                       "=ms_compound.compound_id)")
     res
ab3019c2
 }
 
a7054766
 .fetch_spectra_data_sql <- function(x, columns = c("spectrum_id")) {
cab71014
     orig_columns <- columns
     if (any(columns %in% c("compound_name", "synonym"))) {
         columns <- columns[!columns %in% c("compound_name", "synonym")]
         columns <- unique(c(columns, "compound_id"))
     }
a04eeaf6
     sql_columns <-
         unique(c("spectrum_id", .map_spectraVariables_to_sql(columns)))
     ## That turns out to be faster than dbBind if we use a field in the
     ## database that is unique (such as spectrum_id).
     res <- dbGetQuery(
ab3019c2
         x@dbcon,
7f6c1c00
         paste0("select ", paste(sql_columns, collapse = ","), " from ",
a04eeaf6
                .join_query(x, sql_columns), " where spectrum_id in (",
                paste0("'", unique(x@spectraIds), "'", collapse = ", ") ,")"))
     idx <- match(x@spectraIds, res$spectrum_id)
     res <- res[idx[!is.na(idx)], , drop = FALSE]
     rownames(res) <- NULL
ab3019c2
     if (any(columns == "msLevel")) {
         res$msLevel <- as.integer(sub("MS", "", res$ms_level))
         res$ms_level <- NULL
     }
a7054766
     if (any(columns == "polarity")) {
         pol <- rep(NA_integer_, nrow(res))
         pol[res$polarity == "POSITIVE"] <- 1L
         pol[res$polarity == "NEGATIVE"] <- 0L
         res$polarity <- pol
     }
5b7c5974
     if (any(columns == "publication"))
         res$dataOrigin <- res$publication
ab3019c2
     if (any(columns == "precursorIntensity")) {
         res$precursorIntensity <- as.numeric(res$precursor_intensity)
         res$precursor_intensity <- NULL
     }
e031fbc3
     ## So far we're not dealing with multiple precursor m/z here!
ab3019c2
     if (any(columns == "precursorMz")) {
         suppressWarnings(
             res$precursorMz <- as.numeric(res$precursor_mz_text))
         if (!any(columns == "precursor_mz_text"))
b99a42e6
             res$precursor_mz_text <- NULL
ab3019c2
     }
     if (any(columns == "collisionEnergy")) {
e031fbc3
         suppressWarnings(
df2e7b81
             res$collisionEnergy <- as.numeric(res$collision_energy_text))
ab3019c2
         res$collision_energy_text <- NULL
     }
cab71014
     ## manage synonym and compound_name. Need a second query for that.
     if (any(orig_columns %in% c("synonym", "compound_name"))) {
a04eeaf6
         cmps <- dbGetQuery(
             x@dbcon,
             paste0("select * from synonym where compound_id in (",
                    paste0("'", unique(res$compound_id), "'",
                           collapse = ","), ")"))
cab71014
         cmpl <- split(
             cmps$synonym,
             as.factor(cmps$compound_id))[as.character(res$compound_id)]
         if (any(orig_columns == "compound_name"))
             res$compound_name <- vapply(cmpl, function(z) z[1], character(1))
         if (any(orig_columns == "synonym"))
             res$synonym <- cmpl
     }
     res[, orig_columns, drop = FALSE]
a7054766
 }
ab3019c2
 
cab71014
 .compounds_sql <- function(x, id, columns = "*") {
ab3019c2
     id <- force(id)
cab71014
     qry <- dbSendQuery(
         x, paste0("select ", columns, " from ms_compound join synonym on (",
                   "ms_compound.compound_id=synonym.compound_id)",
                   " where ms_compound.compound_id = ?"))
ab3019c2
     qry <- dbBind(qry, list(id))
     res <- dbFetch(qry)
     dbClearResult(qry)
a5b7b525
     idx <- grep("^compound_id", colnames(res))
     if (length(idx) > 1)
         res <- res[, -idx[-1]]
     res
ab3019c2
 }