From 6ffc37ab47461087b78c709ba97f39bb31bc20e4 Mon Sep 17 00:00:00 2001
From: Michele Stravs <stravsmi@eawag.ch>
Date: Fri, 17 May 2024 10:32:05 +0200
Subject: [PATCH] Added eicWorkflow vignette

---
 inst/sample_metadata/compounds.csv   |   5 +
 inst/sample_metadata/infolist_pH.csv |   5 +
 inst/sample_metadata/settings.ini    | 213 +++++++++++++++++++++++++++
 vignettes/RMassBankEics.Rmd          | 183 +++++++++++++++++++++++
 4 files changed, 406 insertions(+)
 create mode 100644 inst/sample_metadata/compounds.csv
 create mode 100644 inst/sample_metadata/infolist_pH.csv
 create mode 100644 inst/sample_metadata/settings.ini
 create mode 100644 vignettes/RMassBankEics.Rmd

diff --git a/inst/sample_metadata/compounds.csv b/inst/sample_metadata/compounds.csv
new file mode 100644
index 0000000..3e6f9ee
--- /dev/null
+++ b/inst/sample_metadata/compounds.csv
@@ -0,0 +1,5 @@
+ID,RT,Name,SMILES,InChIKey
+11509,,1-3-7-Trimethyluric-acid,CN1C2=C(NC1=O)N(C(=O)N(C2=O)C)C,BYXCFUMGEBZDDI-UHFFFAOYSA-N
+11529,,Torasemide-metabolite-M1,CC(C)NC(=O)NS(=O)(=O)C1=C(C=CN=C1)NC2=CC=CC(=C2)CO,WCYVLAMJCQZUCR-UHFFFAOYSA-N
+11531,,Valeryl-4-hydroxyvalsartan,CC(C)C(C(=O)O)N(CC1=CC=C(C=C1)C2=CC=CC=C2C3=NNN=N3)C(=O)CCC(C)O,ICSQZMPILLPFKC-XLDIYJRPSA-N
+11535,,Zolpidem-carboxylic-acid,CC1=CN2C(=NC(=C2CC(=O)N(C)C)C3=CC=C(C=C3)C(=O)O)C=C1,FELZONDEFBLTSP-UHFFFAOYSA-N
diff --git a/inst/sample_metadata/infolist_pH.csv b/inst/sample_metadata/infolist_pH.csv
new file mode 100644
index 0000000..4e52a9c
--- /dev/null
+++ b/inst/sample_metadata/infolist_pH.csv
@@ -0,0 +1,5 @@
+id,dbcas,dbname,dataused,COMMENT_CONFIDENCE,COMMENT_ID,CH$NAME1,CH$NAME2,CH$NAME3,CH$NAME4,CH$NAME5,CH$COMPOUND_CLASS,CH$FORMULA,CH$EXACT_MASS,CH$SMILES,CH$IUPAC,CH$LINK_CAS,CH$LINK_CHEBI,CH$LINK_HMDB,CH$LINK_KEGG,CH$LINK_LIPIDMAPS,CH$LINK_PUBCHEM,CH$LINK_INCHIKEY,CH$LINK_CHEMSPIDER,CH$LINK_COMPTOX
+11509,,1-3-7-Trimethyluric-acid,smiles,standard compound,11509,1-3-7-Trimethyluric-acid,"1,3,7-Trimethyluric acid","1,3,7-trimethyl-9H-purine-2,6,8-trione",,,N/A; Environmental Standard,C8H10N4O3,210.07529018,CN1C2=C(NC1=O)N(C(=O)N(C2=O)C)C,"InChI=1S/C8H10N4O3/c1-10-4-5(9-7(10)14)11(2)8(15)12(3)6(4)13/h1-3H3,(H,9,14)",5415-44-1,691622,,C16361,,CID:79437,BYXCFUMGEBZDDI-UHFFFAOYSA-N,71754,
+11529,,Torasemide-metabolite-M1,smiles,standard compound,11529,Torasemide-metabolite-M1,Hydroxy Torsemide,1-[4-[3-(hydroxymethyl)anilino]pyridin-3-yl]sulfonyl-3-propan-2-ylurea,,,N/A; Environmental Standard,C16H20N4O4S,364.12052612,CC(C)NC(=O)NS(=O)(=O)C1=C(C=CN=C1)NC2=CC=CC(=C2)CO,"InChI=1S/C16H20N4O4S/c1-11(2)18-16(22)20-25(23,24)15-9-17-7-6-14(15)19-13-5-3-4-12(8-13)10-21/h3-9,11,21H,10H2,1-2H3,(H,17,19)(H2,18,20,22)",99300-68-2,155897,,,,CID:14475217,WCYVLAMJCQZUCR-UHFFFAOYSA-N,29790247,
+11531,,Valeryl-4-hydroxyvalsartan,smiles,standard compound,11531,Valeryl-4-hydroxyvalsartan,Valery 4-Hydroxy Valsartan,2-[4-hydroxypentanoyl-[[4-[2-(2H-tetrazol-5-yl)phenyl]phenyl]methyl]amino]-3-methylbutanoic acid,,,N/A; Environmental Standard,C24H29N5O4,451.221954408,CC(C)C(C(=O)O)N(CC1=CC=C(C=C1)C2=CC=CC=C2C3=NNN=N3)C(=O)CCC(C)O,"InChI=1S/C24H29N5O4/c1-15(2)22(24(32)33)29(21(31)13-8-16(3)30)14-17-9-11-18(12-10-17)19-6-4-5-7-20(19)23-25-27-28-26-23/h4-7,9-12,15-16,22,30H,8,13-14H2,1-3H3,(H,32,33)(H,25,26,27,28)",188259-69-0,,,,,CID:17974863,ICSQZMPILLPFKC-UHFFFAOYSA-N,16463911,
+11535,,Zolpidem-carboxylic-acid,smiles,standard compound,11535,Zolpidem-carboxylic-acid,Zolpidem Carboxylic Acid,"4-[3-[2-(dimethylamino)-2-oxoethyl]-6-methylimidazo[1,2-a]pyridin-2-yl]benzoic acid",,,N/A; Environmental Standard,C19H19N3O3,337.142641468,CC1=CN2C(=NC(=C2CC(=O)N(C)C)C3=CC=C(C=C3)C(=O)O)C=C1,"InChI=1S/C19H19N3O3/c1-12-4-9-16-20-18(13-5-7-14(8-6-13)19(24)25)15(22(16)11-12)10-17(23)21(2)3/h4-9,11H,10H2,1-3H3,(H,24,25)",109461-65-6,,,,,CID:11966044,FELZONDEFBLTSP-UHFFFAOYSA-N,10140042,
diff --git a/inst/sample_metadata/settings.ini b/inst/sample_metadata/settings.ini
new file mode 100644
index 0000000..ba9b1a8
--- /dev/null
+++ b/inst/sample_metadata/settings.ini
@@ -0,0 +1,213 @@
+# Sample configuration file for RMassBank.
+# Adapt this file to your needs.
+# NOTE: Do not indent with TAB characters! Use only spaces.
+# (If your editor converts TAB to a certain number of spaces, it's OK.)
+# Use a space after the colon.
+
+# Deprofile input data?
+# Leave empty if input data is already in "centroid" mode.
+# Use values deprofile.spline, deprofile.fwhm or deprofile.localMax to convert the input data with the
+# corresponding algorithm. See ?deprofile
+deprofile: 
+
+# Deviation (in minutes) allowed for the retention time
+rtMargin: 0.4
+# Systematic retention time shift
+rtShift: 0.0
+
+# Directory to OpenBabel. Required for creating molfiles for MassBank export.
+# If no OpenBabel directory is given, RMassBank will attempt to use the CACTUS webservice
+# for SDF generation. You really should install OpenBabel though; the CACTUS structures
+# have explicit hydrogen atoms...
+# Points to the directory where babel.exe (or the Linux "babel" equivalent) lies.
+babeldir: 
+# Example:
+# babeldir: '"C:\Program Files (x86)\OpenBabel-2.3.1"\'
+
+# Which MassBank record version to use; version 2 is advised.
+use_version: 2
+
+# Include reanalyzed peaks?
+use_rean_peaks: TRUE
+
+# annotate the spectra files with (putative) molecular formulas for fragments?
+add_annotation: TRUE
+
+# Annotations for the spectrum:
+annotations:
+    # Author etc. annotation
+    authors: C. Meyer [dtc], B. Beck [dtc,com], J. Hollender [dtc]
+    copyright: Copyright (C) Eawag 2023
+    publication: Meyer, C., Stravs, M., Hollender, J.. How Wastewater Reflects Human Metabolism - Suspect Screening of Pharmaceutical Metabolites in Wastewater Influent. Submitted.
+
+    license: CC BY-SA
+    instrument: Exploris 240 Orbitrap Thermo Scientific
+    instrument_type: LC-ESI-QFT
+    confidence_comment: standard compound
+    compound_class: N/A; Environmental Standard
+    internal_id_fieldname: UCHEM_ID
+    #
+    # HPLC annotations:
+    #
+    # example: lc_gradient: 90/10 at 0 min, 50/50 at 4 min, 5/95 at 17 min, 5/95 at 25 min, 90/10 at 25.1 min, 90/10 at 30 min
+    lc_gradient: 90/10 at 0 min, 50/50 at 4 min, 5/95 at 17 min, 5/95 at 25 min, 90/10 at 25.1 min, 90/10 at 30 min
+    # example: lc_flow: 200 uL/min
+    lc_flow: 200 uL/min
+    # example: lc_solvent_a: water with 0.1% formic acid
+    lc_solvent_a: water with 0.1% formic acid
+    lc_solvent_b: MeOH with 0.1% formic acid
+    # example: lc_column: XBridge C18 3.5um, 2.1x50mm, Waters
+    lc_column: XBridge C18 3.5um, 2.1x50mm, Waters
+    # Prefix for MassBank accession IDs
+    entry_prefix: EQ
+    contributor_prefix: Eawag
+    ms_type: MS2
+    ionization: ESI
+    ms_dataprocessing:
+        RECALIBRATE: loess on assigned fragments and MS1
+
+# Annotator:
+# by default, "annotator.default" is used.
+# If you want to build your custom annotator (check ?annotator.default and the source code),
+# select it here by using e.g.
+# annotator: annotator.myown
+# for a function annotator.myown(annotation)
+
+# List of data-dependent scans in their order (relative to the parent scan), for annotation of the MassBank records
+# For every data-dependent scan event, specify an element with:
+# mode: fragmentation mode, e.g. CID
+# ces: "short" format collision energy (for record title)
+# ce: "long" format collision energy (for annotation field)
+# res: FT resolution
+spectraList:
+ # First scan: HCD 15% NCE, resolution 17500 
+- mode: HCD
+  ces: 15%
+  ce: 15 % (nominal)
+  res: 17500
+ # Second scan, etc.
+- mode: HCD
+  ces: 30%
+  ce: 30 % (nominal)
+  res: 17500
+- mode: HCD
+  ces: 45%
+  ce: 45 % (nominal)
+  res: 17500
+- mode: HCD
+  ces: 60%
+  ce: 60 % (nominal)
+  res: 17500
+- mode: HCD
+  ces: 75%
+  ce: 75 % (nominal)
+  res: 17500
+- mode: HCD
+  ces: 90%
+  ce: 90 % (nominal)
+  res: 17500
+- mode: HCD
+  ces: 120%
+  ce: 120 % (nominal)
+  res: 17500
+- mode: HCD
+  ces: 150%
+  ce: 150 % (nominal)
+  res: 17500
+- mode: HCD
+  ces: 180%
+  ce: 180 % (nominal)
+  res: 17500
+
+
+# Shifts of the starting points for RMassBank accession numbers.
+# Change these if you measure different adducts 
+accessionNumberShifts:
+    pH: 0 # [M+H]+: Accession numbers 1-14
+    pM: 16 # [M]+: 17-30
+    pNa: 32 # [M+Na]+: 33-46
+    mH: 50 # [M-H]-: 51-64
+    mFA: 66 # [M+FA]-: 67-80
+
+# A list of known electronic noise peaks
+electronicNoise:
+
+# Exclusion width of electronic noise peaks (from unmatched peaks, prior to
+# reanalysis)
+electronicNoiseWidth: 0.3
+
+accessionBuilder: "MSBNK-{contributor_prefix}-{entry_prefix}{compound_id(6)}{scan_id(2)}"
+
+# recalibration settings:
+# recalibrate by: dppm or dmz
+recalibrateBy: dppm
+
+# recalibrate MS1:
+# separately (separate)
+# with common curve (common)
+# do not recalibrate (none)
+recalibrateMS1: common
+# Window width to look for MS1 peaks to recalibrate (in ppm)
+recalibrateMS1Window: 10
+
+# Custom recalibration function: You can overwrite the recal function by
+# making any function which takes rcdata$recalfield ~ rcdata$mzFound.
+# The settings define which recal function is used.
+# Note: if recalibrateMS1 is "common", the setting "recalibrator: MS1" is meaningless
+# because the MS1 points will be recalibrated together with the MS2 points with 
+# the MS2 recalibration function.
+recalibrator:
+    MS1: recalibrate.loess
+    MS2: recalibrate.loess
+
+# Define the multiplicity filtering level
+# Default is 2 (peak occurs at least twice)
+# Set this to 1 if you want to turn this option off.
+# Set this to anything > 2 if you want harder filtering
+multiplicityFilter: 2
+
+# Define the title format.
+# You can use all entries from MassBank records as tokens
+# plus the additional token RECORD_TITLE_CE, which is a shortened
+# version of the collision energy specifically for use in the title.
+# Every line is one entry and must have one token in curly brackets
+# e.g. {CH$NAME} or {AC$MASS_SPECTROMETRY: MS_TYPE} plus optionally
+# additional text in front or behind e.g.
+# R={AC$MASS_SPECTROMETRY: RESOLUTION}
+# If this is not specified, it defaults to a title of the format
+# "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+"
+# Note how everything must be in "" here because otherwise the : are getting mangled!
+titleFormat:
+- "{CH$NAME}"
+- "{AC$INSTRUMENT_TYPE}"
+- "{AC$MASS_SPECTROMETRY: MS_TYPE}"
+- "CE: {RECORD_TITLE_CE}"
+- "R={AC$MASS_SPECTROMETRY: RESOLUTION}"
+- "{MS$FOCUSED_ION: PRECURSOR_TYPE}"
+
+# Define filter settings.
+# For Orbitrap, settings of 15 ppm in low mass range, 10 ppm in high
+# mass range, m/z = 120 as mass range division and 5 ppm for recalibrated
+# data overall are recommended. 
+filterSettings:
+    ppmHighMass: 10
+    ppmLowMass: 15
+    massRangeDivision: 120
+    ppmFine: 5
+    prelimCut: 10000
+    prelimCutRatio: 0.05
+    fineCut: 0
+    fineCutRatio: 0
+    specOkLimit: 1000
+    dbeMinLimit: -0.5
+    satelliteMzLimit: 0.5
+    satelliteIntLimit: 0.05
+    
+ # Define raw MS retrieval settings.
+findMsMsRawSettings:
+    ppmFine: 10
+    mzCoarse: 0.5
+    # fillPrecursorScan is FALSE for "good" mzML files which have all the info needed.
+    # However, for example AB Sciex files will have missing precursor scan information,
+    # in which case fillPrecursorScan = TRUE is needed. Try it out.
+    fillPrecursorScan: FALSE
diff --git a/vignettes/RMassBankEics.Rmd b/vignettes/RMassBankEics.Rmd
new file mode 100644
index 0000000..c497d16
--- /dev/null
+++ b/vignettes/RMassBankEics.Rmd
@@ -0,0 +1,183 @@
+---
+title: "RMassBank: eicWorkflow"
+author: "Michael Stravs"
+output:
+  BiocStyle::html_document:
+    toc_float: true
+vignette: >
+  %\VignetteIndexEntry{RMassBank: eicWorkflow}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteKeywords{Mass Spectrometry, MS, Metabolomics, Bioinformatics}
+  %\VignetteEncoding{UTF-8}
+  %\VignetteDepends{RMassBank, RMassBankData, BiocStyle}
+  %\VignettePackage{RMassBank}
+---
+```{r echo=FALSE}
+options(width=74)
+```
+# Introduction
+
+The `eicWorkflow` is an enhancement to the RMassBank data processing workflow,
+which uses correlation of parent and fragment EICs as an additional quality criterion.
+It requires data where MS2 are acquired in a targeted way over a retention time window,
+so that MS2 data is acquired over the entire chromatographic compound peak of the 
+precursor and can be correlated with the MS1 EIC.
+
+It can be used after the standard `msmsWorkflow`, as shown in the example below.
+
+*RMassBank* is a two-part computational mass spectrometry workflow:
+
+*  In the first step, MSMS spectra of compounds are extracted from raw LC-MS data files, 
+      the MSMS spectra are recalibrated using assigned fragment formulas, and effectively 
+      denoised by using only annotated peaks (plus peaks which can be manually added.)
+*  In the second step, the processed, recalibrated, cleaned data is prepared for 
+      submission to a MassBank database. Compounds are first automatically annotated using 
+      information from the Chemical Translation Service (CTS). After manually checking and 
+      fixing the annotations, the information is compiled together with the spectral data
+      into MassBank records, which can then be uploaded to a MassBank database.
+
+This vignette describes basic usage of the `eicWorkflow` on top of the standard
+workflow. The package is flexible and allows for different advanced use cases.
+Examples of specialized applications of *RMassBank* are available at the
+*RMassBank* message board hosted by the Metabolomics-Forum:
+<http://www.metabolomics-forum.com/viewforum.php?f=29>.
+
+# Installation and loading
+
+The library is available from Bioconductor (<http://www.bioconductor.org>).
+In addition to the library itself, it is recommended to install the OpenBabel
+chemical toolkit, available from (http://www.openbabel.org) for various
+platforms (or via Linux package distribution systems).
+
+The library is loaded as follows
+
+```{r load-librarysource}
+library(RMassBank)
+``` 
+
+The data used in the following example is available from Zenodo
+(<https://zenodo.org/records/11104371>).
+
+```{r get-zenodo}
+# Download the example archive into a fresh temporary directory and unpack it.
+data_dir <- tempfile("dir")
+fs::dir_create(data_dir)
+archive_path <- fs::path(data_dir, "data.zip")
+# Raise the download timeout to at least 300 (never lower an existing setting).
+options(timeout = max(300, getOption("timeout")))
+download.file(
+  url = "https://zenodo.org/api/records/11104371/files-archive",
+  destfile = archive_path, mode = "wb"
+)
+zip::unzip(archive_path, exdir = data_dir)
+```
+
+# Input files and settings
+
+
+```{r compound.list}
+# Copy the bundled compound list and settings next to the raw data, then load.
+sample_file <- \(f) system.file("sample_metadata", f, package = "RMassBank")
+compounds_csv <- fs::path(data_dir, "compounds.csv")
+settings_ini <- fs::path(data_dir, "settings.ini")
+file.copy(sample_file("compounds.csv"), compounds_csv)
+file.copy(sample_file("settings.ini"), settings_ini)
+loadList(compounds_csv)
+loadRmbSettings(settings_ini)
+```
+
+## Standard workflow
+
+Here, we quickly run the normal RMassBank workflow as described in the main vignette:
+
+```{r msmsWorkflow}
+# Compound list, read here for reference; it is not used further in this chunk.
+cpds <- readr::read_csv(fs::path(data_dir, "compounds.csv"))
+
+# File names contain "_<compound id>.mzML"; pair every raw file with its ID.
+files <- fs::dir_ls(data_dir, glob = "*.mzML")
+match_files <- files |>
+  stringr::str_match(".*_(?<id>[0-9]+)\\.mzML") |>
+  magrittr::set_colnames(c("file", "id")) |>
+  tibble::as_tibble()
+# Show the compound IDs extracted from the file names.
+match_files$id
+
+w <- newMsmsWorkspace()
+w <- msmsRead(w, files = match_files$file, cpdids = match_files$id,
+              mode = "pH", readMethod = "mzR")
+w <- msmsWorkflow(w, mode = "pH", steps = 2:8)
+```
+
+## EIC extraction and correlation
+
+To get higher-quality spectra, we can extract parent and fragment EICs for all 
+compounds and find correlations.
+
+Step 1 extracts the EIC for the parents:
+
+```{r eicWorkflow-step1}
+w <- eicWorkflow(w, mode = "pH", steps = 1)
+
+# Plot the "eic" attribute attached to the second entry of `w@spectra`.
+parent_eic <- attr(w@spectra[[2]], "eic")
+ggplot2::ggplot(parent_eic) +
+  ggplot2::aes(x = rt, y = intensity) +
+  ggplot2::geom_line()
+```
+
+Step 2 extracts EICs for individual fragments. (For this, MSMS must be acquired in targeted mode.)
+
+```{r eicWorkflow-step2}
+w <- eicWorkflow(w, mode = "pH", steps = 2)
+
+# "eics" attribute of the 4th child spectrum of the 2nd `w@spectra` entry.
+eics <- attr(w@spectra[[2]]@children[[4]], "eics")
+# Stack the first 12 entries into one table and draw one panel per m/z.
+eics[1:12] |>
+  dplyr::bind_rows(.id = "fragment") |>
+  ggplot2::ggplot() +
+  ggplot2::aes(x = rt, y = intensity) +
+  ggplot2::geom_line() +
+  ggplot2::facet_wrap(ggplot2::vars(mz))
+```
+
+Steps 3 and 4 calculate parent-fragment correlations and use the formula annotation
+results (the original RMassBank approach) to estimate a threshold for correlation.
+
+Step 5 applies the filtering. Note that in contrast to the classical `msmsWorkflow`,
+
+
+```{r eicWorkflow-step345, fig.show='hide'}
+# Run steps 3-4, then inspect the "eicScoreFilter" attribute of the workspace.
+w <- eicWorkflow(w, mode = "pH", steps = 3:4)
+attr(w, "eicScoreFilter")
+# Row count of one child spectrum's data table before step 5 ...
+nrow(getData(w@spectra[[2]]@children[[4]]))
+
+# ... and after step 5.
+w <- eicWorkflow(w, mode = "pH", steps = 5)
+nrow(getData(w@spectra[[2]]@children[[4]]))
+```
+# Record export
+
+Records are exported in the usual manner.
+
+```{r}
+mb <- newMbWorkspace(w)
+mb <- mbWorkflow(mb)
+# Reset the infolists, load the one shipped with the package, and rerun.
+mb <- resetInfolists(mb)
+infolist_dir <- fs::path(data_dir, "infolists")
+fs::dir_create(infolist_dir)
+file.copy(system.file("sample_metadata/infolist_pH.csv", package = "RMassBank"),
+          infolist_dir)
+mb <- loadInfolists(mb, infolist_dir)
+mb <- mbWorkflow(mb)
+```
+
+# Session information
+
+```{r }
+sessionInfo()
+```