Bioconductor Code: MsBackendMassbank

R/MsBackendMassbankSql-functions.R

a1380b07	#' @rdname MsBackendMassbankSql
476b908c	#'
a1380b07	#' @export MsBackendMassbankSql MsBackendMassbankSql <- function() {
b99a42e6	if (!.has_dbi_package())
a1380b07	stop("'MsBackendMassbankSql' requires package 'DBI'. Please ",
476b908c	"install with 'install.packages(\"DBI\")'")
a1380b07	new("MsBackendMassbankSql")
476b908c	}
b99a42e6	.has_dbi_package <- function() { requireNamespace("DBI", quietly = TRUE) }
a1380b07	#' @importFrom DBI dbListTables #' #' @noRd
476b908c	.valid_dbcon <- function(x) { if (length(x)) { if (!inherits(x, "DBIConnection")) return("'dbcon' is expected to be a connection to a database") tables <- dbListTables(x)
ab3019c2	if (!all(c("msms_spectrum", "msms_spectrum_peak") %in% tables))
476b908c	return(paste0("Database lacks some required tables. Is 'dbcon' a", " connection to a MassBank database?")) } NULL } #' Returns the spectra data, from the database and eventually filling with #' core spectra variables, if they are not available in the database. #' #' The data can be either: #' - in the database. #' - in the local data (if new variables were added with $name <-).
a7054766	#' - core spectra variables - if they are not in the database they have to be #' initialized with `NA` and the correct data type.
476b908c	#' #' @return a `data.frame` - always, even if only with a single column. #'
cab71014	#' @importFrom IRanges NumericList CharacterList
a1380b07	#' #' @importFrom S4Vectors extractCOLS #'
7a7941cf	#' @importFrom S4Vectors make_zero_col_DFrame #' #' @importFrom methods as callNextMethod getMethod
a7054766	#'
476b908c	#' @author Johannes Rainer #' #' @noRd
a7054766	.spectra_data_massbank_sql <- function(x, columns = spectraVariables(x)) {
7a7941cf	res <- getMethod("spectraData", "MsBackendCached")(x, columns = columns) if (is.null(res)) res <- make_zero_col_DFrame(length(x)) ## Define what needs to be still retrieved. db_cols <- intersect(columns, x@spectraVariables) db_cols <- db_cols[!db_cols %in% c("mz", "intensity", colnames(res))]
a7054766	mz_cols <- intersect(columns, c("mz", "intensity"))
7a7941cf
476b908c	if (length(db_cols)) {
7a7941cf	res <- cbind(
e6462675	res, as(.fetch_spectra_data_sql(x, columns = db_cols), "DataFrame"))
cab71014	if (any(colnames(res) == "synonym")) res$synonym <- CharacterList(res$synonym, compress = FALSE)
476b908c	}
a7054766	## Get m/z and intensity values if (length(mz_cols)) { pks <- .fetch_peaks_sql(x, columns = mz_cols)
ab3019c2	f <- factor(pks$spectrum_id)
a7054766	if (any(mz_cols == "mz")) {
7a7941cf	mzs <- unname(split(pks$mz, f)[as.character(x@spectraIds)]) res$mz <- NumericList(mzs, compress = FALSE)
a7054766	} if (any(mz_cols == "intensity")) {
7a7941cf	ints <- unname( split(pks$intensity, f)[as.character(x@spectraIds)]) res$intensity <- NumericList(ints, compress = FALSE)
a7054766	}
476b908c	}
a1380b07	extractCOLS(res, columns)
476b908c	}
a7054766
a1380b07	#' @importFrom DBI dbSendQuery dbBind dbFetch dbClearResult #' #' @noRd
a7054766	.fetch_peaks_sql <- function(x, columns = c("mz", "intensity")) { if (length(x@dbcon)) {
2e9829c4	dbGetQuery( x@dbcon, paste0("select spectrum_id,", paste(columns, collapse = ","), " from msms_spectrum_peak where spectrum_id in (", paste0("'", unique(x@spectraIds), "'", collapse = ","),")"))
a7054766	} else {
6573c5b9	res <- data.frame(character(), lapply(columns, function(z) numeric())) colnames(res) <- c("spectrum_id", columns) res
a7054766	} }
d59747ce	## #' This function ensures that m/z values are ALWAYS returned ordered! Note that ## #' this is slightly faster than including `order by mz` in the SQL query. ## #' ## #' Note however that splitting the data.frame later is slower if the full data ## #' frame is ordered by m/z. ## #' ## #' @noRd ## .fetch_peaks_sql_mz_order <- function(x, columns = c("mz", "intensity")) { ## if (length(x@dbcon)) { ## res <- dbGetQuery( ## x@dbcon, ## paste0("select spectrum_id,", paste(columns, collapse = ","), ## " from msms_spectrum_peak where spectrum_id in (", ## paste0("'", unique(x@spectraIds), "'", collapse = ","),")")) ## res[order(res$mz), , drop = FALSE] ## } else { ## res <- data.frame(character(), lapply(columns, function(z) numeric())) ## colnames(res) <- c("spectrum_id", columns) ## res ## } ## }
ab3019c2	.columns_sql <- c( precursorIntensity = "precursor_intensity", precursorMz = "precursor_mz_text",
7f6c1c00	msLevel = "ms_level",
b99a42e6	compound_id = "msms_spectrum.compound_id", collisionEnergy = "collision_energy_text"
ab3019c2	) .map_spectraVariables_to_sql <- function(x) { for (i in seq_along(.columns_sql)) x <- sub(names(.columns_sql)[i], .columns_sql[i], x, fixed = TRUE) x }
cab71014	.map_sql_to_spectraVariables <- function(x) { for (i in seq_along(.columns_sql)) x <- sub(.columns_sql[i], names(.columns_sql[i]), x, fixed = TRUE) x }
7f6c1c00	#' Simple helper that creates a join query depending on the provided columns. #' #' @param x `Spectra`. #' #' @param columns `character` with the column names. #' #' @noRd .join_query <- function(x, columns) {
cab71014	res <- "msms_spectrum"
7f6c1c00	if (any(columns %in% [email protected]$ms_compound))
cab71014	res <- paste0(res, " join ms_compound on (msms_spectrum.compound_id", "=ms_compound.compound_id)") res
ab3019c2	}
a7054766	.fetch_spectra_data_sql <- function(x, columns = c("spectrum_id")) {
cab71014	orig_columns <- columns if (any(columns %in% c("compound_name", "synonym"))) { columns <- columns[!columns %in% c("compound_name", "synonym")] columns <- unique(c(columns, "compound_id")) }
a04eeaf6	sql_columns <- unique(c("spectrum_id", .map_spectraVariables_to_sql(columns))) ## That turns out to be faster than dbBind if we use a field in the ## database that is unique (such as spectrum_id). res <- dbGetQuery(
ab3019c2	x@dbcon,
7f6c1c00	paste0("select ", paste(sql_columns, collapse = ","), " from ",
a04eeaf6	.join_query(x, sql_columns), " where spectrum_id in (", paste0("'", unique(x@spectraIds), "'", collapse = ", ") ,")")) idx <- match(x@spectraIds, res$spectrum_id) res <- res[idx[!is.na(idx)], , drop = FALSE] rownames(res) <- NULL
ab3019c2	if (any(columns == "msLevel")) { res$msLevel <- as.integer(sub("MS", "", res$ms_level)) res$ms_level <- NULL }
a7054766	if (any(columns == "polarity")) { pol <- rep(NA_integer_, nrow(res)) pol[res$polarity == "POSITIVE"] <- 1L pol[res$polarity == "NEGATIVE"] <- 0L res$polarity <- pol }
5b7c5974	if (any(columns == "publication")) res$dataOrigin <- res$publication
ab3019c2	if (any(columns == "precursorIntensity")) { res$precursorIntensity <- as.numeric(res$precursor_intensity) res$precursor_intensity <- NULL }
e031fbc3	## So far we're not dealing with multiple precursor m/z here!
ab3019c2	if (any(columns == "precursorMz")) { suppressWarnings( res$precursorMz <- as.numeric(res$precursor_mz_text)) if (!any(columns == "precursor_mz_text"))
b99a42e6	res$precursor_mz_text <- NULL
ab3019c2	} if (any(columns == "collisionEnergy")) {
e031fbc3	suppressWarnings(
df2e7b81	res$collisionEnergy <- as.numeric(res$collision_energy_text))
ab3019c2	res$collision_energy_text <- NULL }
cab71014	## manage synonym and compound_name. Need a second query for that. if (any(orig_columns %in% c("synonym", "compound_name"))) {
a04eeaf6	cmps <- dbGetQuery( x@dbcon, paste0("select * from synonym where compound_id in (", paste0("'", unique(res$compound_id), "'", collapse = ","), ")"))
cab71014	cmpl <- split( cmps$synonym, as.factor(cmps$compound_id))[as.character(res$compound_id)] if (any(orig_columns == "compound_name")) res$compound_name <- vapply(cmpl, function(z) z[1], character(1)) if (any(orig_columns == "synonym")) res$synonym <- cmpl } res[, orig_columns, drop = FALSE]
a7054766	}
ab3019c2
cab71014	.compounds_sql <- function(x, id, columns = "*") {
ab3019c2	id <- force(id)
cab71014	qry <- dbSendQuery( x, paste0("select ", columns, " from ms_compound join synonym on (", "ms_compound.compound_id=synonym.compound_id)", " where ms_compound.compound_id = ?"))
ab3019c2	qry <- dbBind(qry, list(id)) res <- dbFetch(qry) dbClearResult(qry)
a5b7b525	idx <- grep("^compound_id", colnames(res)) if (length(idx) > 1) res <- res[, -idx[-1]] res
ab3019c2	}