R/MsBackendMassbank.R
13a450da
 #' @include hidden_aliases.R
 NULL
 
 #' @title MS data backend for MassBank files
 #'
 #' @aliases MsBackendMassbank-class
 #'
 #' @description
 #'
 #' The `MsBackendMassbank` class supports import of
6a3e408f
 #' MS/MS spectrum data from
 #' [Massbank](https://github.com/MassBank/MassBank-data)
13a450da
 #' files. After initial import, the full MS data is kept in
b99a42e6
 #' memory. `MsBackendMassbank` extends the
 #' [Spectra::MsBackendDataFrame()] backend
 #' directly and thus supports the [Spectra::applyProcessing()] function to make
6a3e408f
 #' data manipulations persistent.
13a450da
 #'
 #' New objects are created with the `MsBackendMassbank` function. The
 #' `backendInitialize` method has to be subsequently called to
6a3e408f
 #' initialize the object and import MS/MS data from (one or more) MassBank
 #' files. Optional parameter `nonStop` allows to specify whether the
 #' import returns with an error if one of the text files lacks required
13a450da
 #' data, such as `mz` and `intensity` values (default `nonStop =
 #' FALSE`), or whether only affected file(s) is(are) skipped and a
 #' warning is shown (`nonStop = TRUE`). Note that any other error
6a3e408f
 #' will abort import regardless of parameter `nonStop`.
13a450da
 #'
 #' @param object Instance of `MsBackendMassbank` class.
 #'
5b7c5974
 #' @param file for `export`: `character(1)` defining the output file.
 #'
6a3e408f
 #' @param files `character` with the (full) file name(s) of the MassBank file(s)
13a450da
 #'     from which MS/MS data should be imported.
 #'
5b7c5974
 #' @param format for `spectraVariableMapping`: `character(1)` defining the
 #'     format to be used. Currently only `format = "Massbank"` is supported.
 #'
 #' @param mapping for `export`: named `character` vector
 #'     allowing to specify how fields from the Massbank file should be renamed.
 #'     Names are supposed to be the spectra variable name and values of the
 #'     vector the field names in the Massbank file. See output of
797495cd
 #'     `spectraVariableMapping(MsBackendMassbank())` for the expected format.
5b7c5974
 #'
6a3e408f
 #' @param metaBlocks `data.frame` indicating which metadata shall
 #'     be imported. Default is [metaDataBlocks()].
b6ec6667
 #'
13a450da
 #' @param nonStop `logical(1)` whether import should be stopped if an
 #'     MassBank file does not contain all required fields. Defaults to
 #'     `nonStop = FALSE`.
 #'
 #' @param BPPARAM Parameter object defining the parallel processing
 #'     setup to import data in parallel. Defaults to `BPPARAM =
b99a42e6
 #'     bpparam()`. See [BiocParallel::bpparam()] for more information.
13a450da
 #'
b99a42e6
 #' @param x [Spectra::Spectra()] object that should be exported.
5b7c5974
 #'
13a450da
 #' @param ... Currently ignored.
 #'
 #' @author Michael Witting
 #'
 #' @importClassesFrom Spectra MsBackendDataFrame
 #'
 #' @exportClass MsBackendMassbank
 #'
 #' @name MsBackendMassbank
 #'
6a3e408f
 #' @return `backendInitialize` and `MsBackendMassbank` return an instance of
 #'     `MsBackendMassbank-class`.
 #'
13a450da
 #' @examples
 #'
6a3e408f
 #' ## Create an MsBackendMassbank backend and import data from a test file.
13a450da
 #' fls <- dir(system.file("extdata", package = "MsBackendMassbank"),
331d39bb
 #'     full.names = TRUE, pattern = "txt$")
13a450da
 #' be <- backendInitialize(MsBackendMassbank(), fls)
 #' be
 #'
 #' be$msLevel
 #' be$intensity
 #' be$mz
6a3e408f
 #'
 #' ## Initializing a backend reading additional metadata columns/information
 #' mb <- metaDataBlocks()
 #' mb
 #' mb[1, 2] <- TRUE
 #'
 #' be <- backendInitialize(MsBackendMassbank(), fls, metaBlocks = mb)
 #' spectraVariables(be)
 #' be$instrument
13a450da
 NULL
 
 ## S4 class definition of the MassBank backend. It extends
 ## `MsBackendDataFrame` without adding slots of its own; only the prototype
 ## defaults (empty spectra data, writable, version tag) are set here.
 setClass(
     "MsBackendMassbank",
     contains = "MsBackendDataFrame",
     prototype = prototype(
         spectraData = DataFrame(),
         readonly = FALSE,
         version = "0.1"
     )
 )
 
f56ab929
 #' @importMethodsFrom Spectra spectraData<- $<- $
 #'
 #' @importMethodsFrom ProtGenerics backendInitialize
13a450da
 #'
 #' @importFrom BiocParallel bpparam
 #'
6a3e408f
 #' @importFrom S4Vectors bindROWS
 #'
13a450da
 #' @importMethodsFrom BiocParallel bplapply
 #'
 #' @importFrom methods validObject
 #'
 #' @exportMethod backendInitialize
 #'
 #' @rdname MsBackendMassbank
 setMethod("backendInitialize", signature = "MsBackendMassbank",
           function(object, files, metaBlocks = metaDataBlocks(),
                    nonStop = FALSE, ..., BPPARAM = bpparam()) {
               ## Import the data of all `files` (one reader call per file,
               ## run through `bplapply` with `BPPARAM`), combine the
               ## per-file results and store them in `object`.
               ## `metaBlocks` and `nonStop` are handed to the per-file
               ## reader; `...` is not used. Returns the filled `object`
               ## after validating it.
               if (missing(files) || length(files) == 0L)
                   stop("Parameter 'files' is mandatory for ", class(object))
               if (!is.character(files))
                   stop("Parameter 'files' is expected to be a character vector",
                        " with the files names from where data should be",
                        " imported")
               ## Warnings of `normalizePath` are silenced because missing
               ## files are reported by the explicit check right below.
               suppressWarnings(files <- normalizePath(files))
               not_found <- !file.exists(files)
               if (any(not_found)) {
                   stop("file(s) ",
                        paste(files[not_found], collapse = ", "),
                        " not found")
               }
               ## Import data file by file.
               message("Start data import from ", length(files), " files ... ",
                       appendLF = FALSE)
               res <- bplapply(files, FUN = .read_massbank,
                               metaBlocks = metaBlocks,
                               nonStop = nonStop, BPPARAM = BPPARAM)
               message("done")
               ## A zero-length result for a file is treated as a failed
               ## import. Name the affected files so the user can inspect
               ## them instead of only learning that "some" import failed.
               failed <- lengths(res) == 0
               if (nonStop && any(failed))
                   warning("Import failed for some files: ",
                           paste(files[failed], collapse = ", "))
               ## Row-bind the per-file results and convert to a `DataFrame`;
               ## peak data is stored as uncompressed `NumericList` columns.
               message("Merging results ...", appendLF = FALSE)
               res <- as(do.call(rbind, res), "DataFrame")
               res$mz <- NumericList(res$mz, compress = FALSE)
               res$intensity <- NumericList(res$intensity, compress = FALSE)
               object@spectraData <- res
               message("done")
               object$dataStorage <- "<memory>"
               validObject(object)
               object
           })
 
 #' @rdname MsBackendMassbank
 #'
 #' @importFrom methods new
 #'
 #' @export MsBackendMassbank
 MsBackendMassbank <- function() {
   ## Constructor: returns a new `MsBackendMassbank` instance built from the
   ## class prototype. Data is imported afterwards with `backendInitialize`.
   new(Class = "MsBackendMassbank")
 }
5787d32a
 
797495cd
 #' @importMethodsFrom Spectra spectraVariableMapping
 #'
 #' @exportMethod spectraVariableMapping
5787d32a
 #'
 #' @rdname MsBackendMassbank
797495cd
 setMethod(
     "spectraVariableMapping", "MsBackendMassbank",
     function(object, format = c("Massbank")) {
         ## Returns a named character vector: names are the spectra variable
         ## names, values the corresponding field names in a MassBank file.
         ## `object` is only needed for method dispatch. The mapping is
         ## assembled from one vector per section, concatenated in order.
         format <- match.arg(format)

         ## minimal information
         minimal <- c(
             accession = "ACCESSION:",
             name = "CH$NAME:",
             smiles = "CH$SMILES:",
             exactmass = "CH$EXACT_MASS:",
             formula = "CH$FORMULA:",
             inchi = "CH$IUPAC:",
             cas = "CH$LINK: CAS",
             inchikey = "CH$LINK: INCHIKEY",
             collisionEnergy = "AC$MASS_SPECTROMETRY: COLLISION_ENERGY",
             precursorMz = "MS$FOCUSED_ION: PRECURSOR_M/Z",
             precursorIntensity = "MS$FOCUSED_ION: PRECURSOR_INT",
             adduct = "MS$FOCUSED_ION: PRECURSOR_TYPE",
             rtime = "AC$CHROMATOGRAPHY: RETENTION_TIME",
             polarity = "AC$MASS_SPECTROMETRY: ION_MODE",
             splash = "PK$SPLASH:",
             title = "RECORD_TITLE:"
         )

         ## instrument information
         instrument_info <- c(
             instrument = "AC$INSTRUMENT:",
             instrument_type = "AC$INSTRUMENT_TYPE:"
         )

         ## ms information
         ms_info <- c(
             ms_ms_type = "AC$MASS_SPECTROMETRY: MS_TYPE",
             ms_cap_voltage = "AC$MASS_SPECTROMETRY: CAPILLARY_VOLTAGE",
             ms_col_gas = "AC$MASS_SPECTROMETRY: COLLISION_GAS",
             ms_desolv_gas_flow = "AC$MASS_SPECTROMETRY: DESOLVATION_GAS_FLOW",
             ms_desolv_temp = "AC$MASS_SPECTROMETRY: DESOLVATION_TEMPERATURE",
             ms_ionization = "AC$MASS_SPECTROMETRY: IONIZATION",
             ms_ionization_energy = "AC$MASS_SPECTROMETRY: IONIZATION_ENERGY",
             ms_laser = "AC$MASS_SPECTROMETRY: LASER",
             ms_matrix = "AC$MASS_SPECTROMETRY: MATRIX",
             ms_mass_accuracy = "AC$MASS_SPECTROMETRY: MASS_ACCURACY",
             ms_mass_range = "AC$MASS_SPECTROMETRY: MASS_RANGE_MZ",
             ms_reagent_gas = "AC$MASS_SPECTROMETRY: REAGENT_GAS",
             ms_resolution = "AC$MASS_SPECTROMETRY: RESOLUTION",
             ms_scan_setting = "AC$MASS_SPECTROMETRY: SCANNING_SETTING",
             ms_source_temp = "AC$MASS_SPECTROMETRY: SOURCE_TEMPERATURE",
             ms_frag_mode = "AC$MASS_SPECTROMETRY: FRAGMENTATION_MODE",
             ms_kinetic_energy = "AC$MASS_SPECTROMETRY: KINETIC_ENERGY",
             ms_electron_current = "AC$MASS_SPECTROMETRY: ELECTRON_CURRENT",
             ms_reaction_time = "AC$MASS_SPECTROMETRY: REACTION_TIME"
         )

         ## ims information
         ims_info <- c(
             ims_instrument_type = "AC$ION_MOBILITY: INSTRUMENT_TYPE",
             ims_drift_gas = "AC$ION_MOBILITY: DRIFT_GAS",
             ims_drift_time = "AC$ION_MOBILITY: DRIFT_TIME",
             ims_ccs = "AC$ION_MOBILITY: CCS"
         )

         ## ms information part II
         focus_info <- c(
             focus_base_peak = "MS$FOCUSED_ION: BASE_PEAK",
             focus_derivative_form = "MS$FOCUSED_ION: DERIVATIVE_FORM",
             focus_derivative_mass = "MS$FOCUSED_ION: DERIVATIVE_MASS",
             focus_derivative_type = "MS$FOCUSED_ION: DERIVATIVE_TYPE",
             focus_ion_type = "MS$FOCUSED_ION: ION_TYPE"
         )

         ## data processing information
         processing_info <- c(
             data_processing_comment = "MS$DATA_PROCESSING: COMMENT",
             data_processing_deprofile = "MS$DATA_PROCESSING: DEPROFILE",
             data_processing_find = "MS$DATA_PROCESSING: FIND_PEAK",
             data_processing_reanalyze = "MS$DATA_PROCESSING: REANALYZE",
             data_processing_recalibrate = "MS$DATA_PROCESSING: RECALIBRATE",
             data_processing_whole = "MS$DATA_PROCESSING: WHOLE"
         )

         ## chromatography information
         chrom_info <- c(
             chrom_carrier_gas = "AC$CHROMATOGRAPHY: CARRIER_GAS",
             chrom_column = "AC$CHROMATOGRAPHY: COLUMN_NAME",
             chrom_column_temp = "AC$CHROMATOGRAPHY: COLUMN_TEMPERATURE",
             chrom_column_temp_gradient =
                 "AC$CHROMATOGRAPHY: COLUMN_TEMPERATURE_GRADIENT",
             chrom_flow_gradient = "AC$CHROMATOGRAPHY: FLOW_GRADIENT",
             chrom_flow_rate = "AC$CHROMATOGRAPHY: FLOW_RATE",
             chrom_inj_temp = "AC$CHROMATOGRAPHY: INJECTION_TEMPERATURE",
             chrom_inj_temp_gradient =
                 "AC$CHROMATOGRAPHY: INJECTION_TEMPERATURE_GRADIENT",
             chrom_rti_kovats = "AC$CHROMATOGRAPHY: KOVATS_RTI",
             chrom_rti_lee = "AC$CHROMATOGRAPHY: LEE_RTI",
             chrom_rti_naps = "AC$CHROMATOGRAPHY: NAPS_RTI",
             chrom_rti_uoa = "AC$CHROMATOGRAPHY: UOA_RTI",
             chrom_rti_uoa_pred = "AC$CHROMATOGRAPHY: UOA_PREDICTED_RTI",
             chrom_rt = "AC$CHROMATOGRAPHY: RETENTION_TIME",
             chrom_solvent = "AC$CHROMATOGRAPHY: SOLVENT",
             chrom_transfer_temp =
                 "AC$CHROMATOGRAPHY: TRANSFERLINE_TEMPERATURE"
         )

         ## chemical information
         chemical_info <- c(
             compound_class = "CH$COMPOUND_CLASS:",
             link_cayman = "CH$LINK: CAYMAN",
             link_chebi = "CH$LINK: CHEBI",
             link_chembl = "CH$LINK: CHEMBL",
             link_chempdb = "CH$LINK: CHEMPDB",
             link_chemspider = "CH$LINK: CHEMSPIDER",
             link_comptox = "CH$LINK: COMPTOX",
             link_hmdb = "CH$LINK: HMDB",
             link_kappaview = "CH$LINK: KAPPAVIEW",
             link_kegg = "CH$LINK: KEGG",
             link_knapsack = "CH$LINK: KNAPSACK",
             link_lipidbank = "CH$LINK: LIPIDBANK",
             link_lipidmaps = "CH$LINK: LIPIDMAPS",
             link_nikkaji = "CH$LINK: NIKKAJI",
             link_pubchem = "CH$LINK: PUBCHEM",
             link_zinc = "CH$LINK: ZINC"
         )

         ## sample information
         sample_info <- c(
             scientific_name = "SP$SCIENTIFIC_NAME:",
             lineage = "SP$LINEAGE:",
             link = "SP$LINK:",
             sample = "SP$SAMPLE:"
         )

         ## record information
         record_info <- c(
             deprecated = "DEPRECATED:",
             date = "DATE:",
             authors = "AUTHORS:",
             license = "LICENSE:",
             copyright = "COPYRIGHT:",
             publication = "PUBLICATION:",
             project = "PROJECT:",
             comment = "COMMENT:"
         )

         ## peak information
         peak_info <- c(pknum = "PK$NUM_PEAK:")

         ## Unnamed arguments to `c()` keep the element names unchanged.
         switch(format,
                Massbank = c(minimal, instrument_info, ms_info, ims_info,
                             focus_info, processing_info, chrom_info,
                             chemical_info, sample_info, record_info,
                             peak_info))
     })
5787d32a
 
 #' @importMethodsFrom Spectra export
 #'
 #' @exportMethod export
 #'
 #' @rdname MsBackendMassbank
6a3e408f
 setMethod(
     "export", "MsBackendMassbank",
     function(object, x, file = tempfile(),
              mapping = spectraVariableMapping(MsBackendMassbank()), ...) {
         ## Write the spectra in `x` to `file`, renaming variables according
         ## to `mapping`. `object` is not used in the body and `...` is
         ## ignored; the work is delegated to the internal export helper.
         .export_massbank(x = x, con = file, mapping = mapping)
     }
 )
a99bc7da
 
 
 ## #' tests...
 ## p <- "/home/jo/Projects/compounds/MassBank/text/MassBank-data-2024.11"
 ## fls <- dir(p, pattern = "txt$", recursive = TRUE, full.names = TRUE)
 
 ## library(Spectra)
 ## mb_spectra <- Spectra(fls,
 ##                       source = MsBackendMassbank(),
 ##                       backend = MsBackendDataFrame(),
 ##                       nonStop = TRUE,
 ##                       BPPARAM = SerialParam())