From 6ffc37ab47461087b78c709ba97f39bb31bc20e4 Mon Sep 17 00:00:00 2001 From: Michele Stravs Date: Fri, 17 May 2024 10:32:05 +0200 Subject: [PATCH] Added eicWorkflow vignette --- inst/sample_metadata/compounds.csv | 5 + inst/sample_metadata/infolist_pH.csv | 5 + inst/sample_metadata/settings.ini | 213 +++++++++++++++++++++++++++ vignettes/RMassBankEics.Rmd | 183 +++++++++++++++++++++++ 4 files changed, 406 insertions(+) create mode 100644 inst/sample_metadata/compounds.csv create mode 100644 inst/sample_metadata/infolist_pH.csv create mode 100644 inst/sample_metadata/settings.ini create mode 100644 vignettes/RMassBankEics.Rmd diff --git a/inst/sample_metadata/compounds.csv b/inst/sample_metadata/compounds.csv new file mode 100644 index 0000000..3e6f9ee --- /dev/null +++ b/inst/sample_metadata/compounds.csv @@ -0,0 +1,5 @@ +ID,RT,Name,SMILES,InChIKey +11509,,1-3-7-Trimethyluric-acid,CN1C2=C(NC1=O)N(C(=O)N(C2=O)C)C,BYXCFUMGEBZDDI-UHFFFAOYSA-N +11529,,Torasemide-metabolite-M1,CC(C)NC(=O)NS(=O)(=O)C1=C(C=CN=C1)NC2=CC=CC(=C2)CO,WCYVLAMJCQZUCR-UHFFFAOYSA-N +11531,,Valeryl-4-hydroxyvalsartan,CC(C)C(C(=O)O)N(CC1=CC=C(C=C1)C2=CC=CC=C2C3=NNN=N3)C(=O)CCC(C)O,ICSQZMPILLPFKC-XLDIYJRPSA-N +11535,,Zolpidem-carboxylic-acid,CC1=CN2C(=NC(=C2CC(=O)N(C)C)C3=CC=C(C=C3)C(=O)O)C=C1,FELZONDEFBLTSP-UHFFFAOYSA-N diff --git a/inst/sample_metadata/infolist_pH.csv b/inst/sample_metadata/infolist_pH.csv new file mode 100644 index 0000000..4e52a9c --- /dev/null +++ b/inst/sample_metadata/infolist_pH.csv @@ -0,0 +1,5 @@ +id,dbcas,dbname,dataused,COMMENT_CONFIDENCE,COMMENT_ID,CH$NAME1,CH$NAME2,CH$NAME3,CH$NAME4,CH$NAME5,CH$COMPOUND_CLASS,CH$FORMULA,CH$EXACT_MASS,CH$SMILES,CH$IUPAC,CH$LINK_CAS,CH$LINK_CHEBI,CH$LINK_HMDB,CH$LINK_KEGG,CH$LINK_LIPIDMAPS,CH$LINK_PUBCHEM,CH$LINK_INCHIKEY,CH$LINK_CHEMSPIDER,CH$LINK_COMPTOX +11509,,1-3-7-Trimethyluric-acid,smiles,standard compound,11509,1-3-7-Trimethyluric-acid,"1,3,7-Trimethyluric acid","1,3,7-trimethyl-9H-purine-2,6,8-trione",,,N/A; Environmental 
Standard,C8H10N4O3,210.07529018,CN1C2=C(NC1=O)N(C(=O)N(C2=O)C)C,"InChI=1S/C8H10N4O3/c1-10-4-5(9-7(10)14)11(2)8(15)12(3)6(4)13/h1-3H3,(H,9,14)",5415-44-1,691622,,C16361,,CID:79437,BYXCFUMGEBZDDI-UHFFFAOYSA-N,71754, +11529,,Torasemide-metabolite-M1,smiles,standard compound,11529,Torasemide-metabolite-M1,Hydroxy Torsemide,1-[4-[3-(hydroxymethyl)anilino]pyridin-3-yl]sulfonyl-3-propan-2-ylurea,,,N/A; Environmental Standard,C16H20N4O4S,364.12052612,CC(C)NC(=O)NS(=O)(=O)C1=C(C=CN=C1)NC2=CC=CC(=C2)CO,"InChI=1S/C16H20N4O4S/c1-11(2)18-16(22)20-25(23,24)15-9-17-7-6-14(15)19-13-5-3-4-12(8-13)10-21/h3-9,11,21H,10H2,1-2H3,(H,17,19)(H2,18,20,22)",99300-68-2,155897,,,,CID:14475217,WCYVLAMJCQZUCR-UHFFFAOYSA-N,29790247, +11531,,Valeryl-4-hydroxyvalsartan,smiles,standard compound,11531,Valeryl-4-hydroxyvalsartan,Valery 4-Hydroxy Valsartan,2-[4-hydroxypentanoyl-[[4-[2-(2H-tetrazol-5-yl)phenyl]phenyl]methyl]amino]-3-methylbutanoic acid,,,N/A; Environmental Standard,C24H29N5O4,451.221954408,CC(C)C(C(=O)O)N(CC1=CC=C(C=C1)C2=CC=CC=C2C3=NNN=N3)C(=O)CCC(C)O,"InChI=1S/C24H29N5O4/c1-15(2)22(24(32)33)29(21(31)13-8-16(3)30)14-17-9-11-18(12-10-17)19-6-4-5-7-20(19)23-25-27-28-26-23/h4-7,9-12,15-16,22,30H,8,13-14H2,1-3H3,(H,32,33)(H,25,26,27,28)",188259-69-0,,,,,CID:17974863,ICSQZMPILLPFKC-UHFFFAOYSA-N,16463911, +11535,,Zolpidem-carboxylic-acid,smiles,standard compound,11535,Zolpidem-carboxylic-acid,Zolpidem Carboxylic Acid,"4-[3-[2-(dimethylamino)-2-oxoethyl]-6-methylimidazo[1,2-a]pyridin-2-yl]benzoic acid",,,N/A; Environmental Standard,C19H19N3O3,337.142641468,CC1=CN2C(=NC(=C2CC(=O)N(C)C)C3=CC=C(C=C3)C(=O)O)C=C1,"InChI=1S/C19H19N3O3/c1-12-4-9-16-20-18(13-5-7-14(8-6-13)19(24)25)15(22(16)11-12)10-17(23)21(2)3/h4-9,11H,10H2,1-3H3,(H,24,25)",109461-65-6,,,,,CID:11966044,FELZONDEFBLTSP-UHFFFAOYSA-N,10140042, diff --git a/inst/sample_metadata/settings.ini b/inst/sample_metadata/settings.ini new file mode 100644 index 0000000..ba9b1a8 --- /dev/null +++ b/inst/sample_metadata/settings.ini @@ -0,0 +1,213 
@@ +# Sample configuration file for RMassBank. +# Adapt this file to your needs. +# NOTE: Do not indent with TAB characters! Use only spaces. +# (If your editor converts TAB to a certain number of spaces, it's OK.) +# Use a space after the colon. + +# Deprofile input data? +# Leave empty if input data is already in "centroid" mode. +# Use values deprofile.spline, deprofile.fwhm or deprofile.localMax to convert the input data with the +# corresponding algorithm. See ?deprofile +deprofile: + +# Deviation (in minutes) allowed for the retention time +rtMargin: 0.4 +# Systematic retention time shift +rtShift: 0.0 + +# Directory to OpenBabel. Required for creating molfiles for MassBank export. +# If no OpenBabel directory is given, RMassBank will attempt to use the CACTUS webservice +# for SDF generation. You really should install OpenBabel though; the CACTUS structures +# have explicit hydrogen atoms... +# Points to the directory where babel.exe (or the Linux "babel" equivalent) lies. +babeldir: +# Example: +# babeldir: '"C:\Program Files (x86)\OpenBabel-2.3.1"\' + +# Which MassBank record version to use; version 2 is advised. +use_version: 2 + +# Include reanalyzed peaks? +use_rean_peaks: TRUE + +# annotate the spectra files with (putative) molecular formulas for fragments? +add_annotation: TRUE + +# Annotations for the spectrum: +annotations: + # Author etc. annotation + authors: C. Meyer [dtc], B. Beck [dtc,com], J. Hollender [dtc] + copyright: Copyright (C) Eawag 2023 + publication: Meyer, C., Stravs, M., Hollender, J.. How Wastewater Reflects Human Metabolism - Suspect Screening of Pharmaceutical Metabolites in Wastewater Influent. Submitted. 
+ + license: CC BY-SA + instrument: Exploris 240 Orbitrap Thermo Scientific + instrument_type: LC-ESI-QFT + confidence_comment: standard compound + compound_class: N/A; Environmental Standard + internal_id_fieldname: UCHEM_ID + # + # HPLC annotations: + # + # example: lc_gradient: 90/10 at 0 min, 50/50 at 4 min, 5/95 at 17 min, 5/95 at 25 min, 90/10 at 25.1 min, 90/10 at 30 min + lc_gradient: 90/10 at 0 min, 50/50 at 4 min, 5/95 at 17 min, 5/95 at 25 min, 90/10 at 25.1 min, 90/10 at 30 min + # example: lc_flow: 200 uL/min + lc_flow: 200 uL/min + # example: lc_solvent_a: water with 0.1% formic acid + lc_solvent_a: water with 0.1% formic acid + lc_solvent_b: MeOH with 0.1% formic acid + # example: lc_column: XBridge C18 3.5um, 2.1x50mm, Waters + lc_column: XBridge C18 3.5um, 2.1x50mm, Waters + # Prefix for MassBank accession IDs + entry_prefix: EQ + contributor_prefix: Eawag + ms_type: MS2 + ionization: ESI + ms_dataprocessing: + RECALIBRATE: loess on assigned fragments and MS1 + +# Annotator: +# by default, "annotator.default" is used. +# If you want to build your custom annotator (check ?annotator.default and the source code), +# select it here by using e.g. +# annotator: annotator.myown +# for a function annotator.myown(annotation) + +# List of data-dependent scans in their order (relative to the parent scan), for annotation of the MassBank records +# For every data-dependent scan event, specify an element with: +# mode: fragmentation mode, e.g. CID +# ces: "short" format collision energy (for record title) +# ce: "long" format collision energy (for annotation field) +# res: FT resolution +spectraList: + # First scan: HCD 15% NCE, resolution 17500 +- mode: HCD + ces: 15% + ce: 15 % (nominal) + res: 17500 + # Second scan, etc. 
+- mode: HCD + ces: 30% + ce: 30 % (nominal) + res: 17500 +- mode: HCD + ces: 45% + ce: 45 % (nominal) + res: 17500 +- mode: HCD + ces: 60% + ce: 60 % (nominal) + res: 17500 +- mode: HCD + ces: 75% + ce: 75 % (nominal) + res: 17500 +- mode: HCD + ces: 90% + ce: 90 % (nominal) + res: 17500 +- mode: HCD + ces: 120% + ce: 120 % (nominal) + res: 17500 +- mode: HCD + ces: 150% + ce: 150 % (nominal) + res: 17500 +- mode: HCD + ces: 180% + ce: 180 % (nominal) + res: 17500 + + +# Shifts of the starting points for RMassBank accession numbers. +# Change these if you measure different adducts +accessionNumberShifts: + pH: 0 # [M+H]+: Accession numbers 1-14 + pM: 16 # [M]+: 17-30 + pNa: 32 # [M+Na]+: 33-46 + mH: 50 # [M-H]-: 51-64 + mFA: 66 # [M+FA]-: 67-80 + +# A list of known electronic noise peaks +electronicNoise: + +# Exclusion width of electronic noise peaks (from unmatched peaks, prior to +# reanalysis) +electronicNoiseWidth: 0.3 + +accessionBuilder: "MSBNK-{contributor_prefix}-{entry_prefix}{compound_id(6)}{scan_id(2)}" + +# recalibration settings: +# recalibrate by: dppm or dmz +recalibrateBy: dppm + +# recalibrate MS1: +# separately (separate) +# with common curve (common) +# do not recalibrate (none) +recalibrateMS1: common +# Window width to look for MS1 peaks to recalibrate (in ppm) +recalibrateMS1Window: 10 + +# Custom recalibration function: You can overwrite the recal function by +# making any function which takes rcdata$recalfield ~ rcdata$mzFound. +# The settings define which recal function is used. +# Note: if recalibrateMS1 is "common", the setting "recalibrator: MS1" is meaningless +# because the MS1 points will be recalibrated together with the MS2 points with +# the MS2 recalibration function. +recalibrator: + MS1: recalibrate.loess + MS2: recalibrate.loess + +# Define the multiplicity filtering level +# Default is 2 (peak occurs at least twice) +# Set this to 1 if you want to turn this option off. 
+# Set this to anything > 2 if you want harder filtering +multiplicityFilter: 2 + +# Define the title format. +# You can use all entries from MassBank records as tokens +# plus the additional token RECORD_TITLE_CE, which is a shortened +# version of the collision energy specifically for use in the title. +# Every line is one entry and must have one token in curly brackets +# e.g. {CH$NAME} or {AC$MASS_SPECTROMETRY: MS_TYPE} plus optionally +# additional text in front or behind e.g. +# R={AC$MASS_SPECTROMETRY: RESOLUTION} +# If this is not specified, it defaults to a title of the format +# "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+" +# Note how everything must be in "" here because otherwise the : are getting mangled! +titleFormat: +- "{CH$NAME}" +- "{AC$INSTRUMENT_TYPE}" +- "{AC$MASS_SPECTROMETRY: MS_TYPE}" +- "CE: {RECORD_TITLE_CE}" +- "R={AC$MASS_SPECTROMETRY: RESOLUTION}" +- "{MS$FOCUSED_ION: PRECURSOR_TYPE}" + +# Define filter settings. +# For Orbitrap, settings of 15 ppm in low mass range, 10 ppm in high +# mass range, m/z = 120 as mass range division and 5 ppm for recalibrated +# data overall are recommended. +filterSettings: + ppmHighMass: 10 + ppmLowMass: 15 + massRangeDivision: 120 + ppmFine: 5 + prelimCut: 10000 + prelimCutRatio: 0.05 + fineCut: 0 + fineCutRatio: 0 + specOkLimit: 1000 + dbeMinLimit: -0.5 + satelliteMzLimit: 0.5 + satelliteIntLimit: 0.05 + + # Define raw MS retrieval settings. +findMsMsRawSettings: + ppmFine: 10 + mzCoarse: 0.5 + # fillPrecursorScan is FALSE for "good" mzML files which have all the info needed. + # However, for example AB Sciex files will have missing precursor scan information, + # in which case fillPrecursorScan = TRUE is needed. Try it out. 
+ fillPrecursorScan: FALSE diff --git a/vignettes/RMassBankEics.Rmd b/vignettes/RMassBankEics.Rmd new file mode 100644 index 0000000..c497d16 --- /dev/null +++ b/vignettes/RMassBankEics.Rmd @@ -0,0 +1,183 @@ +--- +title: "RMassBank: eicWorkflow" +author: "Michael Stravs" +output: + BiocStyle::html_document: + toc_float: true +vignette: > + %\VignetteIndexEntry{RMassBank: eicWorkflow} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteKeywords{Mass Spectrometry, MS, Metabolomics, Bioinformatics} + %\VignetteEncoding{UTF-8} + %\VignetteDepends{RMassBank, RMassBankData, BiocStyle} + %\VignettePackage{RMassBank} +--- +```{r echo=FALSE} +options(width=74) +``` +# Introduction + +The `eicWorkflow` is an enhancement to the RMassBank data processing workflow, +which uses correlation of parent and fragment EICs as an additional quality criterion. +It requires data where MS2 spectra are acquired in a targeted way over a retention time window, +so that MS2 data is acquired over the entire chromatographic compound peak of the +precursor and can be correlated with the MS1 EIC. + +It can be used <!-- TODO(review): this sentence is incomplete; state what the workflow can be used for. --> + +*RMassBank* is a two-part computational mass spectrometry workflow: + +* In the first step, MSMS spectra of compounds are extracted from raw LC-MS data files, + the MSMS spectra are recalibrated using assigned fragment formulas, and effectively + denoised by using only annotated peaks (plus peaks which can be manually added). +* In the second step, the processed, recalibrated, cleaned data is prepared for + submission to a MassBank database. Compounds are first automatically annotated using + information from the Chemical Translation Service (CTS). After manually checking and + fixing the annotations, the information is compiled together with the spectral data + into MassBank records, which can then be uploaded to a MassBank database. + +This vignette describes basic usage with the standard workflow. The package is +flexible and allows for different advanced use cases. 
Examples of specialized +applications of *RMassBank* are available at the *RMassBank* +message board hosted by the Metabolomics-Forum: +(http://www.metabolomics-forum.com/viewforum.php?f=29). + +# Installation and loading + +The library is available from Bioconductor (http://www.bioconductor.org). +In addition to the library itself, it is recommended to install the OpenBabel +chemical toolkit, available from (http://www.openbabel.org) for various +platforms (or via Linux package distribution systems). + +The library is loaded as follows: + +```{r load-librarysource} +library(RMassBank) +``` + +The data used in the following example is available from Zenodo +(https://zenodo.org/records/11104371). + +```{r get-zenodo} + +data_dir <- tempfile("dir") +fs::dir_create(data_dir) +options(timeout = max(300, getOption("timeout"))) +download.file( + "https://zenodo.org/api/records/11104371/files-archive", + fs::path(data_dir, "data.zip"), + mode = "wb" +) +zip::unzip(fs::path(data_dir, "data.zip"), + exdir = data_dir) +``` + +# Input files and settings + + +```{r compound.list} +file.copy(system.file("sample_metadata/compounds.csv", + package="RMassBank"), fs::path(data_dir, "compounds.csv") +) +file.copy(system.file("sample_metadata/settings.ini", + package="RMassBank"), fs::path(data_dir, "settings.ini")) + +loadList(fs::path(data_dir, "compounds.csv")) +loadRmbSettings(fs::path(data_dir, "settings.ini")) +``` + +## Standard workflow + +Here, we quickly run the normal RMassBank workflow as described in the main vignette. The compound ID of each file is taken from the digits between the last underscore and the `.mzML` extension of its file name: + +```{r msmsWorkflow} + + +cpds <- readr::read_csv(fs::path(data_dir, "compounds.csv")) +files <- fs::dir_ls(data_dir, glob="*.mzML") +match_files <- files |> + stringr::str_match(".*_([0-9]+)\\.mzML") |> + magrittr::set_colnames(c("file", "id")) |> + tibble::as_tibble() +match_files$id + +w <- newMsmsWorkspace() +w <- msmsRead(w, files = match_files$file, cpdids = match_files$id, + mode = "pH", readMethod = "mzR") + +w <- msmsWorkflow(w, mode="pH", 
steps=2:8) + +``` + +## EIC extraction and correlation + +To get higher-quality spectra, we can extract parent and fragment EICs for all +compounds and find correlations. + +Step 1 extracts the EIC for the parents: + +```{r eicWorkflow-step1} + +w <- eicWorkflow(w, mode="pH", steps=1) + +attr(w@spectra[[2]], "eic") |> + ggplot2::ggplot() + + ggplot2::aes(x=rt, y=intensity) + + ggplot2::geom_line() +``` + +Step 2 extracts EICs for individual fragments. (For this, MSMS must be acquired in targeted mode.) + +```{r eicWorkflow-step2} + +w <- eicWorkflow(w, mode="pH", steps=2) + +eics <- attr(w@spectra[[2]]@children[[4]], "eics") +head(eics, 12) |> + dplyr::bind_rows(.id="fragment") |> + ggplot2::ggplot() + + ggplot2::aes(x=rt, y=intensity) + + ggplot2::geom_line() + + ggplot2::facet_wrap(ggplot2::vars(mz)) + +``` + +Steps 3 and 4 calculate parent-fragment correlations, and use the formula annotation results +(the original RMassBank approach) to estimate a threshold for correlation. + +Step 5 applies the filtering; the chunk below reports `nrow()` of one child spectrum's data before and after this step. Note that in contrast to the classical `msmsWorkflow`, <!-- TODO(review): this sentence is incomplete; state how the filtering differs. --> + + +```{r eicWorkflow-step345, fig.show='hide'} + +w <- eicWorkflow(w, mode="pH", steps=3:4) +attr(w, "eicScoreFilter") +getData(w@spectra[[2]]@children[[4]]) |> nrow() + +w <- eicWorkflow(w, mode="pH", steps=5) +getData(w@spectra[[2]]@children[[4]]) |> nrow() + + +``` +# Record export + +Records are exported in the usual manner. + +```{r} +mb <- newMbWorkspace(w) + +mb <- mbWorkflow(mb) + +mb <- resetInfolists(mb) +fs::dir_create(fs::path(data_dir, "infolists")) +file.copy(system.file("sample_metadata/infolist_pH.csv", + package="RMassBank"), fs::path(data_dir, "infolists")) +mb <- loadInfolists(mb, fs::path(data_dir, "infolists")) +mb <- mbWorkflow(mb) +``` + +# Session information + +```{r } +sessionInfo() +```