diff --git a/DESCRIPTION b/DESCRIPTION index adeaffc..4935599 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,63 +1,65 @@ -Package: RMassBank -Type: Package -Title: Workflow to process tandem MS files and build MassBank records -Version: 2.11.2 -Authors@R: c( - person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", - role=c("cre")), - person(given = "Michael A.", family = "Stravs", email = - "michael.stravs@eawag.ch", role=c("aut")), person(given = "Emma L.", - family = "Schymanski", email = "emma.schymanski@eawag.ch", role=c("aut")), - person(given = "Steffen", family = "Neumann", role = "aut", email = - "sneumann@ipb-halle.de"), person(given = "Erik", family = "Muller", role = - "aut", email = "erik.mueller@student.uni-halle.de"), person(given = - "Tobias", family = "Schulze", role = "ctb", email = - "tobias.schulze@ufz.de") ) -Author: Michael Stravs, Emma Schymanski, Steffen Neumann, Erik Mueller, with - contributions from Tobias Schulze -Maintainer: RMassBank at Eawag -Description: Workflow to process tandem MS files and build MassBank records. - Functions include automated extraction of tandem MS spectra, formula - assignment to tandem MS fragments, recalibration of tandem MS spectra with - assigned fragments, spectrum cleanup, automated retrieval of compound - information from Internet databases, and export to MassBank records. -License: Artistic-2.0 -SystemRequirements: OpenBabel -biocViews: Bioinformatics, MassSpectrometry, Metabolomics, Software -Depends: - Rcpp -Encoding: UTF-8 -Imports: - XML,RCurl,rjson,S4Vectors,digest, - rcdk(>= 3.4.7.1),yaml,mzR,methods,Biobase,MSnbase,httr -Suggests: - gplots,RMassBankData, - xcms (>= 1.37.1), - CAMERA, - RUnit, - enviPat -Collate: - 'alternateAnalyze.R' - 'createMassBank.R' - 'formulaCalculator.R' - 'getSplash.R' - 'leCsvAccess.R' - 'leMsMs.r' - 'leMsmsRaw.R' - 'msmsRawExtensions.r' - 'settings_example.R' - 'webAccess.R' - 'deprofile.R' - 'parseMassBank.R' - 'SpectrumClasses.R' - 'SpectrumMethods.R' - 'RmbWorkspace.R' - 'RmbWorkspaceUpdate.R' - 'SpectraSetMethods.R' - 'AggregateMethods.R' - 'validateMassBank.R' - 'tools.R' - 'msmsRead.R' - 'Isotopic_Annotation.R' - 'zzz.R' -RoxygenNote: 6.1.0 +Package: RMassBank +Type: Package +Title: Workflow to process tandem MS files and build MassBank records +Version: 2.15.1 +Authors@R: c( + person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", + role=c("cre")), + person(given = "Michael A.", family = "Stravs", email = + "michael.stravs@eawag.ch", role=c("aut")), person(given = "Emma L.", + family = "Schymanski", email = "emma.schymanski@eawag.ch", role=c("aut")), + person(given = "Steffen", family = "Neumann", role = "aut", email = + "sneumann@ipb-halle.de"), person(given = "Erik", family = "Muller", role = + "aut", email = "erik.mueller@student.uni-halle.de"), person(given = + "Tobias", family = "Schulze", role = "ctb", email = + "tobias.schulze@ufz.de"), person(given = + "Hendrik", family = "Treutler", role = "ctb", email = + "hendrik.treutler@gmail.com") ) +Author: Michael Stravs, Emma Schymanski, Steffen Neumann, Erik Mueller, with + contributions from Tobias Schulze +Maintainer: RMassBank at Eawag +Description: Workflow to process tandem MS files and build MassBank records. + Functions include automated extraction of tandem MS spectra, formula + assignment to tandem MS fragments, recalibration of tandem MS spectra with + assigned fragments, spectrum cleanup, automated retrieval of compound + information from Internet databases, and export to MassBank records. +License: Artistic-2.0 +SystemRequirements: OpenBabel +biocViews: ImmunoOncology, Bioinformatics, MassSpectrometry, Metabolomics, Software +Depends: + Rcpp +Encoding: UTF-8 +Imports: + XML,RCurl,rjson,S4Vectors,digest, + rcdk,yaml,mzR,methods,Biobase,MSnbase,httr, + enviPat +Suggests: + gplots,RMassBankData, + xcms (>= 1.37.1), + CAMERA, + RUnit +Collate: + 'alternateAnalyze.R' + 'createMassBank.R' + 'formulaCalculator.R' + 'getSplash.R' + 'leCsvAccess.R' + 'leMsMs.r' + 'leMsmsRaw.R' + 'msmsRawExtensions.r' + 'settings_example.R' + 'webAccess.R' + 'deprofile.R' + 'parseMassBank.R' + 'SpectrumClasses.R' + 'SpectrumMethods.R' + 'RmbWorkspace.R' + 'RmbWorkspaceUpdate.R' + 'SpectraSetMethods.R' + 'AggregateMethods.R' + 'validateMassBank.R' + 'tools.R' + 'msmsRead.R' + 'Isotopic_Annotation.R' + 'zzz.R' +RoxygenNote: 6.1.1 diff --git a/NAMESPACE b/NAMESPACE index a589a8b..99ec8ed 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,139 +1,142 @@ -# Generated by roxygen2: do not edit by hand - -export(CTS.externalIdSubset) -export(CTS.externalIdTypes) -export(RmbDefaultSettings) -export(RmbSettingsTemplate) -export(add.formula) -export(addMB) -export(addPeaks) -export(addPeaksManually) -export(addProperty) -export(aggregateSpectra) -export(analyzeMsMs) -export(analyzeMsMs.formula) -export(analyzeMsMs.intensity) -export(annotator.default) -export(archiveResults) -export(checkIsotopes) -export(checkSpectra) -export(cleanElnoise) -export(combineMultiplicities) -export(compileRecord) -export(createMolfile) -export(dbe) -export(deprofile) -export(deprofile.fwhm) -export(deprofile.localMax) -export(deprofile.scan) -export(deprofile.spline) -export(exportMassbank) -export(filterMultiplicity) -export(filterPeakSatellites) -export(filterPeaksMultiplicity) -export(findCAS) -export(findEIC) -export(findFormula) -export(findLevel) -export(findMass) -export(findMsMsHR) -export(findMsMsHR.direct) -export(findMsMsHR.mass) -export(findMsMsHR.ticms2) -export(findMsMsHRperxcms) -export(findMsMsHRperxcms.direct) -export(findMz) -export(findMz.formula) -export(findName) -export(findProgress) -export(findRt) -export(findSmiles) -export(flatten) -export(formulastring.to.list) -export(gatherCompound) -export(gatherData) -export(gatherDataBabel) -export(gatherDataUnknown) -export(gatherPubChem) -export(gatherSpectrum) -export(getCSID) -export(getCactus) -export(getCtsKey) -export(getCtsRecord) -export(getMolecule) -export(getPcId) -export(is.valid.formula) -export(list.to.formula) -export(loadInfolist) -export(loadInfolists) -export(loadList) -export(loadMsmsWorkspace) -export(loadRmbSettings) -export(loadRmbSettingsFromEnv) -export(makeMollist) -export(makePeaksCache) -export(makeRecalibration) -export(mbWorkflow) -export(msmsRead) -export(msmsRead.RAW) -export(msmsWorkflow) -export(multiply.formula) -export(newMbWorkspace) -export(newMsmsWorkspace) -export(order.formula) -export(parseMassBank) -export(peaksMatched) -export(peaksUnmatched) -export(plotMbWorkspaces) -export(plotRecalibration) -export(plotRecalibration.direct) -export(ppm) -export(problematicPeaks) -export(processProblematicPeaks) -export(progressBarHook) -export(readMbdata) -export(reanalyzeFailpeak) -export(reanalyzeFailpeaks) -export(recalibrate) -export(recalibrate.addMS1data) -export(recalibrate.identity) -export(recalibrate.linear) -export(recalibrate.loess) -export(recalibrate.mean) -export(recalibrateSingleSpec) -export(recalibrateSpectra) -export(resetInfolists) -export(resetList) -export(selectPeaks) -export(selectSpectra) -export(smiles2mass) -export(spectraCount) -export(to.limits.rcdk) -export(toMassbank) -export(toRMB) -export(updateSettings) -export(validate) -exportClasses(mbWorkspace) -exportClasses(msmsWorkspace) -exportMethods(addProperty) -exportMethods(checkSpectra) -exportMethods(getData) -exportMethods(peaksMatched) -exportMethods(peaksUnmatched) -exportMethods(selectPeaks) -exportMethods(selectSpectra) -exportMethods(setData) -exportMethods(show) -exportMethods(spectraCount) -import(Biobase) -import(MSnbase) -import(RCurl) -import(Rcpp) -import(S4Vectors) -import(XML) -import(digest) -import(methods) -import(mzR) -import(rcdk) -import(rjson) -import(yaml) +# Generated by roxygen2: do not edit by hand + +export(CTS.externalIdSubset) +export(CTS.externalIdTypes) +export(RmbDefaultSettings) +export(RmbSettingsTemplate) +export(add.formula) +export(addMB) +export(addPeaks) +export(addPeaksManually) +export(addProperty) +export(aggregateSpectra) +export(analyzeMsMs) +export(analyzeMsMs.formula) +export(analyzeMsMs.intensity) +export(annotator.default) +export(archiveResults) +export(checkIsotopes) +export(checkSpectra) +export(cleanElnoise) +export(combineMultiplicities) +export(compileRecord) +export(createMolfile) +export(dbe) +export(deprofile) +export(deprofile.fwhm) +export(deprofile.localMax) +export(deprofile.scan) +export(deprofile.spline) +export(exportMassbank) +export(filterMultiplicity) +export(filterPeakSatellites) +export(filterPeaksMultiplicity) +export(findCAS) +export(findEIC) +export(findFormula) +export(findLevel) +export(findMass) +export(findMsMsHR) +export(findMsMsHR.direct) +export(findMsMsHR.mass) +export(findMsMsHR.ticms2) +export(findMsMsHRperxcms) +export(findMsMsHRperxcms.direct) +export(findMsMsHRperMsp) +export(findMsMsHRperMsp.direct) +export(findMz) +export(findMz.formula) +export(findName) +export(findProgress) +export(findRt) +export(findSmiles) +export(flatten) +export(formulastring.to.list) +export(gatherCompound) +export(gatherData) +export(gatherDataBabel) +export(gatherDataUnknown) +export(gatherPubChem) +export(gatherSpectrum) +export(getCSID) +export(getCactus) +export(getCtsKey) +export(getCtsRecord) +export(getMolecule) +export(getPcId) +export(is.valid.formula) +export(list.to.formula) +export(loadInfolist) +export(loadInfolists) +export(loadList) +export(loadMsmsWorkspace) +export(loadRmbSettings) +export(loadRmbSettingsFromEnv) +export(makeMollist) +export(makePeaksCache) +export(makeRecalibration) +export(mbWorkflow) +export(msmsRead) +export(msmsRead.RAW) +export(msmsWorkflow) +export(multiply.formula) +export(newMbWorkspace) +export(newMsmsWorkspace) +export(order.formula) +export(parseMassBank) +export(peaksMatched) +export(peaksUnmatched) +export(plotMbWorkspaces) +export(plotRecalibration) +export(plotRecalibration.direct) +export(ppm) +export(problematicPeaks) +export(processProblematicPeaks) +export(progressBarHook) +export(readMbdata) +export(reanalyzeFailpeak) +export(reanalyzeFailpeaks) +export(recalibrate) +export(recalibrate.addMS1data) +export(recalibrate.identity) +export(recalibrate.linear) +export(recalibrate.loess) +export(recalibrate.mean) +export(recalibrateSingleSpec) +export(recalibrateSpectra) +export(resetInfolists) +export(resetList) +export(selectPeaks) +export(selectSpectra) +export(smiles2mass) +export(spectraCount) +export(to.limits.rcdk) +export(toMassbank) +export(toRMB) +export(updateSettings) +export(validate) +exportClasses(mbWorkspace) +exportClasses(msmsWorkspace) +exportMethods(addProperty) +exportMethods(checkSpectra) +exportMethods(getData) +exportMethods(peaksMatched) +exportMethods(peaksUnmatched) +exportMethods(selectPeaks) +exportMethods(selectSpectra) +exportMethods(setData) +exportMethods(show) +exportMethods(spectraCount) +import(Biobase) +import(MSnbase) +import(RCurl) +import(Rcpp) +import(S4Vectors) +import(XML) +import(digest) +import(methods) +import(mzR) +import(rcdk) +import(rjson) +import(yaml) +import(enviPat) diff --git a/R/Isotopic_Annotation.R b/R/Isotopic_Annotation.R index 760f208..4c76f33 100644 --- a/R/Isotopic_Annotation.R +++ b/R/Isotopic_Annotation.R @@ -53,32 +53,10 @@ checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 0, intensity_precis # Load filtersettings filterSettings = settings$filterSettings - # Assign formula additions according to code - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else if(mode == "pNH4") { - allowed_additions <- "NH4" - mode.charge <- 1 - } else{ - stop("mode = \"", mode, "\" not defined") - } - + # get the adduct additions + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) + allowed_additions <- adductProperties$addition + mode.charge <- adductProperties$charge # "default" isotopes (i.e. those with the highest abundance) defIsotopes <- c("107Ag", "27Al", "40Ar", "75As", "197Au", "11B", "138Ba", "9Be", "209Bi", diff --git a/R/RmbWorkspace.R b/R/RmbWorkspace.R index 72fe617..05b01d2 100755 --- a/R/RmbWorkspace.R +++ b/R/RmbWorkspace.R @@ -144,7 +144,9 @@ setClass("mbWorkspace", # output data: compiled = "list", compiled_ok = "list", + compiled_notOk = "list", mbfiles = "list", + mbfiles_notOk = "list", molfile = "list", ok = "integer", problems = "integer" diff --git a/R/SpectrumMethods.R b/R/SpectrumMethods.R index dc7b71e..e5144a0 100644 --- a/R/SpectrumMethods.R +++ b/R/SpectrumMethods.R @@ -21,7 +21,8 @@ setMethod("getData", c("RmbSpectrum2"), function(s) { peaks <- s@peaksCount cols <- c("mz", "intensity", "satellite", "low", "rawOK", "good", "mzCalc", "formula", "dbe", "formulaCount", "dppm", "dppmBest") - cols.isFilled <- unlist(lapply(cols, function(col) length(slot(s, col)) == peaks)) + slotLength <- unlist(lapply(cols, function(col) length(slot(s, col)))) + cols.isFilled <- slotLength == peaks cols.filled <- cols[cols.isFilled] data <- lapply(cols.filled, function(col) slot(s, col)) data$stringsAsFactors <- FALSE diff --git a/R/alternateAnalyze.R b/R/alternateAnalyze.R index 20626ba..42d2b6e 100644 --- a/R/alternateAnalyze.R +++ b/R/alternateAnalyze.R @@ -100,11 +100,9 @@ analyzeMsMs.formula.optimized <- function(msmsPeaks, mode="pH", detail=FALSE, ru cut <- filterSettings$prelimCut if(is.na(cut)) { - if(mode %in% c("pH", "pM", "pNa", "pNH4")) - cut <- 1e4 - else if(mode %in% c("mH", "mFA","mM")) - cut <- 0 - else stop(paste("The ionization mode", mode, "is unknown.")) + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) + if(adductProperties$charge > 0) cut <- 1e4 + if(adductProperties$charge < 0) cut <- 0 } cutRatio <- filterSettings$prelimCutRatio } else{ @@ -137,32 +135,10 @@ analyzeMsMs.formula.optimized <- function(msmsPeaks, mode="pH", detail=FALSE, ru dppm=0, x1=0,x2=0,x3=0) - # define the adduct additions - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else if(mode == "pNH4") { - allowed_additions <- "NH4" - mode.charge <- 1 - } else{ - stop("mode = \"", mode, "\" not defined") - } - + # get the adduct additions + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) + allowed_additions <- adductProperties$addition + mode.charge <- adductProperties$charge # the ppm range is two-sided here. # The range is slightly expanded because dppm calculation of diff --git a/R/createMassBank.R b/R/createMassBank.R index 7e324b8..f174272 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -75,26 +75,36 @@ loadInfolist <- function(mb, fileName) "COMMENT.ID" # Clear from padding spaces and NAs - mbdata_new <- as.data.frame(t(apply(mbdata_new, 1, function(r) + mbdata_new <- as.data.frame(x = t(apply(mbdata_new, 1, function(r) { # Substitute empty spaces by real NA values r[which(r == "")] <- NA # Trim spaces (in all non-NA fields) - r[which(!is.na(r))] <- sub("^ *([^ ]+) *$", "\\1", r[which(!is.na(r))]) + r[which(!is.na(r))] <- sub(pattern = "^ *([^ ]+) *$", replacement = "\\1", x = r[which(!is.na(r))]) return(r) - }))) + })), stringsAsFactors = FALSE) # use only the columns present in mbdata_archive, no other columns added in excel - mbdata_new <- mbdata_new[, colnames(mb@mbdata_archive)] + colNames <- colnames(mb@mbdata_archive) + commentColNames <- colnames(mbdata_new)[grepl(x = colnames(mbdata_new), pattern = "^COMMENT\\.(?!CONFIDENCE)(?!ID)", perl = TRUE)] + colNames <- c(colNames, commentColNames) + + mbdata_new <- mbdata_new[, colNames] # substitute the old entires with the ones from our files # then find the new (previously inexistent) entries, and rbind them to the table new_entries <- setdiff(mbdata_new$id, mb@mbdata_archive$id) old_entries <- intersect(mbdata_new$id, mb@mbdata_archive$id) + + for(colname in colnames(mb@mbdata_archive)) + mb@mbdata_archive[, colname] <- as.character(mb@mbdata_archive[, colname]) + for(entry in old_entries) mb@mbdata_archive[mb@mbdata_archive$id == entry,] <- mbdata_new[mbdata_new$id == entry,] - mb@mbdata_archive <- rbind(mb@mbdata_archive, - mbdata_new[mbdata_new$id==new_entries,]) + mb@mbdata_archive <- rbind(mb@mbdata_archive, mbdata_new[mbdata_new$id==new_entries,]) + + for(colname in colnames(mb@mbdata_archive)) + mb@mbdata_archive[, colname] <- as.factor(mb@mbdata_archive[, colname]) + return(mb) - } @@ -111,17 +121,21 @@ resetInfolists <- function(mb) CH.IUPAC = character(0), CH.LINK.CAS = character(0), CH.LINK.CHEBI = integer(0), CH.LINK.HMDB = character(0), CH.LINK.KEGG = character(0), CH.LINK.LIPIDMAPS = character(0), CH.LINK.PUBCHEM = character(0), CH.LINK.INCHIKEY = character(0), - CH.LINK.CHEMSPIDER = integer(0)), .Names = c("X", "id", "dbcas", + CH.LINK.CHEMSPIDER = integer(0), CH.LINK.COMPTOX = character(0)), .Names = c("X", "id", "dbcas", "dbname", "dataused", "COMMENT.CONFIDENCE", "COMMENT.ID", "CH.NAME1", "CH.NAME2", "CH.NAME3", "CH.COMPOUND_CLASS", "CH.FORMULA", "CH.EXACT_MASS", "CH.SMILES", "CH.IUPAC", "CH.LINK.CAS", "CH.LINK.CHEBI", - "CH.LINK.HMDB", "CH.LINK.KEGG", "CH.LINK.LIPIDMAPS", "CH.LINK.PUBCHEM", - "CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER"), row.names = integer(0), class = "data.frame") + "CH.LINK.HMDB", "CH.LINK.KEGG", "CH.LINK.LIPIDMAPS", "CH.LINK.PUBCHEM", + "CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER", "CH.LINK.COMPTOX"), row.names = integer(0), class = "data.frame") + if(getOption("RMassBank")$include_sp_tags) + { + mb@mbdata_archive["SP.SAMPLE"] <- character(0) + } return(mb) } -# The workflow function, i.e. (almost) the only thing you actually need to call. +# The workflow function, i.e. (almost) the only thing you actually need to call. # See below for explanation of steps. #' MassBank record creation workflow #' @@ -229,53 +243,99 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c if(4 %in% steps) { message("mbWorkflow: Step 4. Spectra compilation") - mb@compiled <- lapply( - selectSpectra(mb@spectra, "found", "object"), - function(r) { + mb@compiled <- lapply(X = selectSpectra(mb@spectra, "found", "object"), FUN = function(r) { message(paste("Compiling: ", r@name, sep="")) mbdata <- mb@mbdata_relisted[[which(mb@mbdata_archive$id == as.numeric(r@id))]] if(nrow(mb@additionalPeaks) > 0) - res <-compileRecord(r, mbdata, mb@aggregated, mb@additionalPeaks) + res <-compileRecord(spec = r, mbdata = mbdata, aggregated = mb@aggregated, additionalPeaks = mb@additionalPeaks) else - res <-compileRecord(r, mbdata, mb@aggregated, NULL, retrieval=findLevel(r@id,TRUE)) + res <-compileRecord(spec = r, mbdata = mbdata, aggregated = mb@aggregated, additionalPeaks = NULL, retrieval=findLevel(r@id,TRUE)) return(res) }) # check which compounds have useful spectra - mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) + ok <- unlist(lapply(X = selectSpectra(mb@spectra, "found", "object"), + FUN = function(spec){any(unlist(lapply(X = spec@children, FUN = function(child){child@ok})))})) + notEmpty <- unlist(lapply(X = mb@compiled, FUN = length)) > 0 + ok <- ok & notEmpty + mb@ok <- which(ok) + #mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) mb@problems <- which(is.na(mb@compiled)) - mb@compiled_ok <- mb@compiled[mb@ok] + mb@compiled_ok <- mb@compiled[mb@ok] + mb@compiled_notOk <- mb@compiled[!ok & notEmpty] } # Step 5: Convert the internal tree-like representation of the MassBank data into # flat-text string arrays (basically, into text-file style, but still in memory) if(5 %in% steps) { message("mbWorkflow: Step 5. Flattening records") - mb@mbfiles <- lapply(mb@compiled_ok, function(c) lapply(c, toMassbank)) + mb@mbfiles <- lapply(mb@compiled_ok, function(c) lapply(c, toMassbank)) + mb@mbfiles_notOk <- lapply(mb@compiled_notOk, function(c) lapply(c, toMassbank)) } # Step 6: For all OK records, generate a corresponding molfile with the structure # of the compound, based on the SMILES entry from the MassBank record. (This molfile # is still in memory only, not yet a physical file) if(6 %in% steps) { - message("mbWorkflow: Step 6. Generate molfiles") - mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(as.numeric(c[[1]][['COMMENT']][[getOption("RMassBank")$annotations$internal_id_fieldname]]))) + if(RMassBank.env$export.molfiles){ + message("mbWorkflow: Step 6. Generate molfiles") + mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(as.numeric(c[[1]][['COMMENT']][[getOption("RMassBank")$annotations$internal_id_fieldname]]))) + } else + warning("RMassBank is configured not to export molfiles (RMassBank.env$export.molfiles). Step 6 is therefore ignored.") } # Step 7: If necessary, generate the appropriate subdirectories, and actually write # the files to disk. if(7 %in% steps) { message("mbWorkflow: Step 7. Generate subdirs and export") - dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "moldata", sep='/'),recursive=TRUE) - dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "recdata", sep='/'),recursive=TRUE) - for(cnt in 1:length(mb@compiled_ok)) - exportMassbank(mb@compiled_ok[[cnt]], mb@mbfiles[[cnt]], mb@molfile[[cnt]]) + + ## create folder + filePath_recData_valid <- file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata") + filePath_recData_invalid <- file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata_invalid") + filePath_molData <- file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata") + + if(!file.exists(filePath_recData_valid)) if(!dir.create(filePath_recData_valid,recursive=TRUE)) stop(paste("Could not create folder", filePath_recData_valid)) + if(RMassBank.env$export.molfiles) + if(!file.exists(filePath_molData)) if(!dir.create(filePath_molData,recursive=TRUE)) stop(paste("Could not create folder", filePath_molData)) + if(RMassBank.env$export.invalid & length(mb@mbfiles_notOk) > 0) + if(!file.exists(filePath_recData_invalid)) if(!dir.create(filePath_recData_invalid,recursive=TRUE)) stop(paste("Could not create folder", filePath_recData_invalid)) + + if(length(mb@molfile) == 0) + mb@molfile <- as.list(rep(x = NA, times = length(mb@compiled_ok))) + + ## export valid spectra + for(cnt in seq_along(mb@compiled_ok)){ + exportMassbank_recdata( + accessions = unlist(lapply(X = mb@compiled_ok[[cnt]], FUN = "[", "ACCESSION")), + files = mb@mbfiles[[cnt]], + recDataFolder = filePath_recData_valid + ) + + if(findLevel(mb@compiled_ok[[cnt]][[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]],TRUE)=="standard" & RMassBank.env$export.molfiles) + exportMassbank_moldata( + cpdID = as.numeric(mb@compiled_ok[[cnt]][[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]]), + molfile = mb@molfile[[cnt]], + molDataFolder = filePath_molData + ) + } + + ## export invalid spectra + if(RMassBank.env$export.invalid) + for(cnt in seq_along(mb@compiled_notOk)) + exportMassbank_recdata( + accessions = unlist(lapply(X = mb@compiled_notOk[[cnt]], FUN = "[", "ACCESSION")), + files = mb@mbfiles_notOk[[cnt]], + recDataFolder = filePath_recData_invalid + ) } # Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank # to attribute substances to their corresponding structure molfiles. if(8 %in% steps) { - message("mbWorkflow: Step 8. Create list.tsv") - makeMollist(mb@compiled_ok) + if(RMassBank.env$export.molfiles){ + message("mbWorkflow: Step 8. Create list.tsv") + makeMollist(compiled = mb@compiled_ok) + } else + warning("RMassBank is configured not to export molfiles (RMassBank.env$export.molfiles). Step 8 is therefore ignored.") } return(mb) } @@ -527,6 +587,13 @@ gatherData <- function(id) csid <- getCactus(inchikey_split, 'chemspider_id') } + ##Get CompTox + comptox <- getCompTox(inchikey_split) + + if(is.null(comptox)){ + comptox <- NA + } + ##Use CTS to retrieve information CTSinfo <- getCtsRecord(inchikey_split) @@ -578,7 +645,7 @@ gatherData <- function(id) # COMMENT: EAWAG_UCHEM_ID 1234 # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" mbdata[["COMMENT"]] <- list() - if(findLevel(id) == "0"){ + if(findLevel(id) == "0"){ mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment } else{ level <- findLevel(id) @@ -615,8 +682,18 @@ gatherData <- function(id) if(level == c("5")){ mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" } - } - mbdata[["COMMENT"]][["ID"]] = id + } + + mbdata[["COMMENT"]][["ID"]] = id + + ## add generic COMMENT information + rowIdx <- which(.listEnvEnv$listEnv$compoundList$ID == id) + properties <- colnames(.listEnvEnv$listEnv$compoundList) + properties2 <- gsub(x = properties, pattern = "^COMMENT ", replacement = "") + theseProperties <- grepl(x = properties, pattern = "^COMMENT ") + theseProperties <- theseProperties & (!(unlist(.listEnvEnv$listEnv$compoundList[rowIdx, ]) == "NA" | is.na(unlist(.listEnvEnv$listEnv$compoundList[rowIdx, ])))) + mbdata[["COMMENT"]][properties2[theseProperties]] <- unlist(.listEnvEnv$listEnv$compoundList[rowIdx, theseProperties]) + # here compound info starts mbdata[['CH$NAME']] <- names # Currently we use a fixed value for Compound Class, since there is no useful @@ -711,7 +788,8 @@ gatherData <- function(id) } link[["INCHIKEY"]] <- inchikey_split - if(length(csid)>0) if(any(!is.na(csid))) link[["CHEMSPIDER"]] <- min(as.numeric(as.character(csid))) + link[["COMPTOX"]] <- comptox + if(length(csid)>0) if(any(!is.na(csid))) link[["CHEMSPIDER"]] <- min(as.numeric(as.character(csid[!is.na(csid)]))) mbdata[['CH$LINK']] <- link mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument @@ -1046,16 +1124,20 @@ flatten <- function(mbdata) { .checkMbSettings() + colNames <- names(unlist(mbdata[[1]])) + commentNames <- colNames[grepl(x = colNames, pattern = "^COMMENT\\.")] + colList <- c( "id", "dbcas", "dbname", "dataused", - "COMMENT.CONFIDENCE", + commentNames, + #"COMMENT.CONFIDENCE", # Note: The field name of the internal id field is replaced with the real name # at "compilation" time. Therefore, functions DOWNSTREAM from compileRecord() # must use the full name including the info from options("RMassBank"). - "COMMENT.ID", + #"COMMENT.ID", "CH$NAME1", "CH$NAME2", "CH$NAME3", @@ -1071,7 +1153,9 @@ flatten <- function(mbdata) "CH$LINK.LIPIDMAPS", "CH$LINK.PUBCHEM", "CH$LINK.INCHIKEY", - "CH$LINK.CHEMSPIDER") + "CH$LINK.CHEMSPIDER", + "CH$LINK.COMPTOX" + ) # make an empty data frame with the right length rows <- length(mbdata) cols <- length(colList) @@ -1114,13 +1198,18 @@ readMbdata <- function(row) mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - mbdata[['PUBLICATION']] <- getOption("RMassBank")$annotations$publication + if(getOption("RMassBank")$annotations$publication!="") { + mbdata[['PUBLICATION']] <- getOption("RMassBank")$annotations$publication + } + commentNames <- names(row)[grepl(x = names(row), pattern = "^COMMENT\\.")] + commentNames <- commentNames[!is.na(row[commentNames])] # Read all determined fields from the file # This is not very flexible, as you can see... colList <- c( - "COMMENT.CONFIDENCE", - "COMMENT.ID", + commentNames, + #"COMMENT.CONFIDENCE", + #"COMMENT.ID", "CH$NAME1", "CH$NAME2", "CH$NAME3", @@ -1136,13 +1225,14 @@ readMbdata <- function(row) "CH$LINK.LIPIDMAPS", "CH$LINK.PUBCHEM", "CH$LINK.INCHIKEY", - "CH$LINK.CHEMSPIDER") + "CH$LINK.CHEMSPIDER", + "CH$LINK.COMPTOX") mbdata[["COMMENT"]] = list() - mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT.CONFIDENCE"]] + #mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT.CONFIDENCE"]] # Again, our ID field. + #mbdata[["COMMENT"]][["ID"]] <- row[["COMMENT.ID"]] + mbdata[["COMMENT"]][gsub(x = commentNames, pattern = "^COMMENT\\.", replacement = "")] <- row[commentNames] - mbdata[["COMMENT"]][["ID"]]<- - row[["COMMENT.ID"]] names = c(row[["CH.NAME1"]], row[["CH.NAME2"]], row[["CH.NAME3"]]) names = names[which(!is.na(names))] @@ -1163,8 +1253,12 @@ readMbdata <- function(row) link[["PUBCHEM"]] = row[["CH.LINK.PUBCHEM"]] link[["INCHIKEY"]] = row[["CH.LINK.INCHIKEY"]] link[["CHEMSPIDER"]] = row[["CH.LINK.CHEMSPIDER"]] + link[["COMPTOX"]] = row[["CH.LINK.COMPTOX"]] link[which(is.na(link))] <- NULL mbdata[["CH$LINK"]] <- link + ## SP$SAMPLE + if(all(nchar(row[["SP.SAMPLE"]]) > 0, row[["SP.SAMPLE"]] != "NA", !is.na(row[["SP.SAMPLE"]]), na.rm = TRUE)) + mbdata[['SP$SAMPLE']] <- row[["SP.SAMPLE"]] # again, these constants are read from the options: mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type @@ -1232,15 +1326,10 @@ gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval=" imode <- spec@mode # define positive or negative, based on processing mode. - ion_modes <- list( - "pH" = "POSITIVE", - "pNa" = "POSITIVE", - "mH" = "NEGATIVE", - "mFA" = "NEGATIVE", - "pM" = "POSITIVE", - "mM" = "NEGATIVE", - "pNH4" = "POSITIVE") - mode <- ion_modes[[imode]] + adductProperties <- getAdductProperties(imode, spec@formula) + mode <- NULL + if(adductProperties$charge > 0) mode <- "POSITIVE" + if(adductProperties$charge < 0) mode <- "NEGATIVE" # for format 2.01 ac_ms <- list(); @@ -1248,6 +1337,16 @@ gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval=" ac_ms[['ION_MODE']] <- mode ac_ms[['IONIZATION']] <- getOption("RMassBank")$annotations$ionization + ## add generic AC$MASS_SPECTROMETRY information + properties <- names(getOption("RMassBank")$annotations) + presentProperties <- names(ac_ms)#c('MS_TYPE', 'IONIZATION', 'ION_MODE')#, 'FRAGMENTATION_MODE', 'COLLISION_ENERGY', 'RESOLUTION') + + theseProperties <- grepl(x = properties, pattern = "^AC\\$MASS_SPECTROMETRY_") + properties2 <- gsub(x = properties, pattern = "^AC\\$MASS_SPECTROMETRY_", replacement = "") + theseProperties <- theseProperties & !(properties2 %in% presentProperties) + theseProperties <- theseProperties & (unlist(getOption("RMassBank")$annotations) != "NA") + ac_ms[properties2[theseProperties]] <- unlist(getOption("RMassBank")$annotations[theseProperties]) + # This list could be made customizable. ac_lc <- list(); rt <- spec@parent@rt / 60 @@ -1257,12 +1356,21 @@ gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval=" ac_lc[['RETENTION_TIME']] <- sprintf("%.3f min", rt) ac_lc[['SOLVENT A']] <- getOption("RMassBank")$annotations$lc_solvent_a ac_lc[['SOLVENT B']] <- getOption("RMassBank")$annotations$lc_solvent_b - + + ## add generic AC$CHROMATOGRAPHY information + #properties <- names(getOption("RMassBank")$annotations) + theseProperties <- grepl(x = properties, pattern = "^AC\\$CHROMATOGRAPHY_") + properties2 <- gsub(x = properties, pattern = "^AC\\$CHROMATOGRAPHY_", replacement = "") + presentProperties <- names(ac_lc)#c('COLUMN_NAME', 'FLOW_GRADIENT', 'FLOW_RATE', 'RETENTION_TIME', 'SOLVENT A', 'SOLVENT B') + theseProperties <- theseProperties & !(properties2 %in% presentProperties) + theseProperties <- theseProperties & (unlist(getOption("RMassBank")$annotations) != "NA") + ac_lc[properties2[theseProperties]] <- unlist(getOption("RMassBank")$annotations[theseProperties]) + # Go through all child spectra, and fill our skeleton with scan data! # Pass them the AC_LC and AC_MS data, which are added at the right place # directly in there. allSpectra <- lapply(spec@children, function(m) - gatherSpectrum(spec, m, ac_ms, ac_lc, aggregated, additionalPeaks, retrieval=retrieval)) + gatherSpectrum(spec = spec, msmsdata = m, ac_ms = ac_ms, ac_lc = ac_lc, aggregated = aggregated, additionalPeaks = additionalPeaks, retrieval=retrieval)) allSpectra <- allSpectra[which(!is.na(allSpectra))] return(allSpectra) } @@ -1288,22 +1396,18 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP { # If the spectrum is not filled, return right now. All "NA" spectra will # not be treated further. - if(msmsdata@ok == FALSE) + if(msmsdata@ok == FALSE & !RMassBank.env$export.invalid) return(NA) # get data scan <- msmsdata@acquisitionNum id <- spec@id # Further fill the ac_ms datasets, and add the ms$focused_ion with spectrum-specific data: - precursor_types <- list( - "pH" = "[M+H]+", - "pNa" = "[M+Na]+", - "mH" = "[M-H]-", - "mFA" = "[M+HCOO-]-", - "pM" = "[M]+", - "mM" = "[M]-", - "pNH4" = "[M+NH4]+") + + adductProperties <- getAdductProperties(spec@mode, spec@formula) + adductString <- adductProperties$adductString + ac_ms[['FRAGMENTATION_MODE']] <- msmsdata@info$mode - #ac_ms['PRECURSOR_TYPE'] <- precursor_types[spec$mode] + #ac_ms['PRECURSOR_TYPE'] <- adductString ac_ms[['COLLISION_ENERGY']] <- msmsdata@info$ce ac_ms[['RESOLUTION']] <- msmsdata@info$res @@ -1313,7 +1417,9 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP ms_fi <- list() ms_fi[['BASE_PEAK']] <- round(mz(spec@parent)[which.max(intensity(spec@parent))],4) ms_fi[['PRECURSOR_M/Z']] <- round(precursorMz$mzCenter,4) - ms_fi[['PRECURSOR_TYPE']] <- precursor_types[spec@mode] + ms_fi[['PRECURSOR_TYPE']] <- adductString + if(all(!is.na(msmsdata@precursorIntensity), msmsdata@precursorIntensity != 0, msmsdata@precursorIntensity != 100, na.rm = TRUE)) + ms_fi[['PRECURSOR_INTENSITY']] <- msmsdata@precursorIntensity # Select all peaks which belong to this spectrum (correct cpdID and scan no.) # from peaksOK @@ -1398,15 +1504,10 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP # add + or - to fragment formulas - formula_tag <- list( - "pH" = "+", - "pNa" = "+", - "mH" = "-", - "mFA" = "-", - "pM" = "+", - "mM" = "-", - "pNH4" = "+") - type <- formula_tag[[spec@mode]] + adductProperties <- getAdductProperties(spec@mode, spec@formula) + type <- NULL + if(adductProperties$charge > 0) type <- "+" + if(adductProperties$charge < 0) type <- "-" annotator <- getOption("RMassBank")$annotator if(is.null(annotator)) @@ -1901,28 +2002,35 @@ toMassbank <- function (mbdata) #' } #' #' @export -exportMassbank <- function(compiled, files, molfile) +exportMassbank <- function(compiled, files, molfile){ + exportMassbank_recdata( + accessions = unlist(lapply(X = compiled, FUN = "[", "ACCESSION")), + files, + recDataFolder = file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata") + ) + exportMassbank_moldata( + cpdID = as.numeric(compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]]), + molfile, + molDataFolder = file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata") + ) +} + +exportMassbank_recdata <- function(accessions, files, recDataFolder) { - molnames <- c() - for(file in 1:length(compiled)) + for(fileIdx in 1:length(accessions)) { - # Read the accession no. from the corresponding "compiled" entry - filename <- compiled[[file]]["ACCESSION"] # use this accession no. as filename - filename <- paste(filename, ".txt", sep="") - write(files[[file]], - file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata",filename) - ) + filename <- paste(accessions[[fileIdx]], ".txt", sep="") + filePath <- file.path(recDataFolder,filename) + write(files[[fileIdx]], filePath) } +} +exportMassbank_moldata <- function(cpdID, molfile, molDataFolder) +{ # Use internal ID for naming the molfiles - if(findLevel(compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]],TRUE)=="standard"){ - molname <- sprintf("%04d", as.numeric( - compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) - molname <- paste(molname, ".mol", sep="") - write(molfile, - file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata",molname) - ) - } + molname <- sprintf("%04d", cpdID) + molname <- paste(molname, ".mol", sep="") + write(molfile,file.path(molDataFolder,molname)) } # Makes a list.tsv with molfile -> massbank ch$name attribution. diff --git a/R/formulaCalculator.R b/R/formulaCalculator.R index a552058..c045172 100755 --- a/R/formulaCalculator.R +++ b/R/formulaCalculator.R @@ -179,7 +179,9 @@ dbe <- function(formula) "I" = -0.5, "As" = 2.5, "Hg" = 0, - "Na" = -0.5 + "Li" = -0.5, + "Na" = -0.5, + "K" = -0.5 ) count <- 1 for(element in names(formula)) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index 7af0405..bb74462 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -33,14 +33,14 @@ assign("listEnv", NULL, envir=.listEnvEnv) #' @export loadList <- function(path, listEnv = NULL, check = TRUE) { - if(is.null(listEnv)) - listEnv <- .listEnvEnv - if(!file.exists(path)) - stop("The supplied file does not exist, please supply a correct path") - + if(is.null(listEnv)) + listEnv <- .listEnvEnv + if(!file.exists(path)) + stop("The supplied file does not exist, please supply a correct path") + # Try out if the file is comma- or semicolon-separated compoundList <- read.csv(path, stringsAsFactors=FALSE, check.names=FALSE) - n <- colnames(compoundList) + n <- colnames(compoundList) if(!("ID" %in% n)){ # If no ID column, it must be semicolon separated compoundList <- read.csv2(path, stringsAsFactors=FALSE, check.names=FALSE) n <- colnames(compoundList) @@ -81,7 +81,6 @@ loadList <- function(path, listEnv = NULL, check = TRUE) # If "level" is in the compound list we have to check several things: if(newList){ - # a) Are the levels part of the defined levels? # b) Are the values ok for every level? (i.e. all necessary values supplied for each line in the compound list?) @@ -141,7 +140,7 @@ loadList <- function(path, listEnv = NULL, check = TRUE) # If level is "3" or "3a", a valid smiles or formula must be supplied if(level %in% c("3","3a")){ - + if(!is.na(findSmiles(compoundList[i,"ID"]))){ tryCatch( findMz(compoundList[i,"ID"]), @@ -196,7 +195,7 @@ loadList <- function(path, listEnv = NULL, check = TRUE) if(length(d)>0){ stop("Some columns are missing in the compound list. Needs at least ID, Name, SMILES, RT, CAS.") } - + ### ###Test if all the IDs work ### @@ -208,7 +207,10 @@ loadList <- function(path, listEnv = NULL, check = TRUE) tryCatch( findMz(x), error = function(e){ - currEnvir$wrongID <- c(currEnvir$wrongID, x) + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### Error finding SMILES for ID '", x, "': ", e, sep = "")) + + currEnvir$wrongID <- c(currEnvir$wrongID, x) } ) }) @@ -324,6 +326,168 @@ getMolecule <- function(smiles) return(mol) } +knownAdducts <- function(){ + return(getAdductInformation("")$mode) +} +getMonoisotopicMass <- function(formula){ + if(!exists("isotopes")) data("isotopes", package = "enviPat") + + if(formula == "") return(0) + + if(grepl(x = formula, pattern = "-")){ + starts <- gregexpr(text = formula, pattern = "[A-Z]")[[1]] + subFormulas <- sapply(X = seq_along(starts), FUN = function(startIdx){ + ifelse( + test = startIdx < length(starts), + yes = substr(x = formula, start = starts[[startIdx]], stop = starts[[startIdx + 1]] - 1), + no = substr(x = formula, start = starts[[startIdx]], stop = nchar(formula)) + ) + }) + + monoisotopicMass <- sum(sapply(X = subFormulas, FUN = function(subFormula){ + ifelse( + test = grepl(x = subFormula, pattern = "-"), + yes = -enviPat::isopattern(isotopes = isotopes, chemforms = gsub(x = subFormula, pattern = "-", replacement = ""), threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]], + no = enviPat::isopattern(isotopes = isotopes, chemforms = subFormula, threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]] + ) + })) + } else { + monoisotopicMass <- enviPat::isopattern(isotopes = isotopes, chemforms = formula, threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]] + } + return(monoisotopicMass) +} +getAdductInformation <- function(formula){ + adductDf <- as.data.frame(rbind( + + ## positive: M+X + c(mode = "pH", addition = "H", charge = 1, adductString = "[M+H]+"), + c(mode = "pLi", addition = "Li", charge = 1, adductString = "[M+Li]+"), + c(mode = "pNa", addition = "Na", charge = 1, adductString = "[M+Na]+"), + c(mode = "pNa_mO3S_mH", addition = "Na1O-3S-1H-1", charge = 1, adductString = "[M-O3S-H+Na]+"), + c(mode = "pK", addition = "K", charge = 1, adductString = "[M+K]+"), + c(mode = "pM", addition = "", charge = 1, adductString = "[M]+"), + c(mode = "pM_mC7H11NO9S2", addition = "C-7H-11NO-9S-2", charge = 1, adductString = "[M-C7H11NO9S2]+"), + c(mode = "pM_mC7H12NO9S2", addition = "C-7H-12NO-9S-2", charge = 1, adductString = "[M-C7H12NO9S2]+"), + c(mode = "pNH4", addition = "NH4", charge = 1, adductString = "[M+NH4]+"), + c(mode = "p2Na_mH", addition = "Na2H-1", charge = 1, adductString = "[M+2Na-H]+"), + c(mode = "pACN_pH", addition = "C2H4N1", charge = 1, adductString = "[M+ACN+H]+"), + c(mode = "pACN_pNa", addition = "C2H3N1Na1", charge = 1, adductString = "[M+ACN+Na]+"), + c(mode = "pH_mC7H6O", addition = "C-7H-5O-1", charge = 1, adductString = "[M-C7H6O+H]+"), + c(mode = "pH_mC18H30O14", addition = "C-18H-29O-14", charge = 1, adductString = "[M-C18H30O14+H]+"), + c(mode = "pH_mC6H10O5", addition = "C-6H-9O-5", charge = 1, adductString = "[M-C6H10O5+H]+"), + c(mode = "pH_mC12H20O9", addition = "C-12H-19O-9", charge = 1, adductString = "[M-C12H20O9+H]+"), + c(mode = "pH_mC9H8O4_mH2O", addition = "C-9H-9O-5", charge = 1, adductString = "[M-C9H8O4-H2O+H]+"), + c(mode = "pH_mC6H10O5_mH2O", addition = "C-6H-11O-6", charge = 1, adductString = "[M-C6H10O5-H2O+H]+"), + c(mode = "pH_mC5H8NO4", addition = "C-5H-7N-1O-4", charge = 1, adductString = "[M-C5H8NO4+H]+"), + c(mode = "pH_mO3S", addition = "O-3S-1H1", charge = 1, adductString = "[M-O3S+H]+"), + c(mode = "pH_mC6H10O8S", addition = "C-6H-9O-8S-1", charge = 1, adductString = "[M-C6H10O8S+H]+"), + c(mode = "pH_mC5H10N2O", addition = "C-5H-9N-2O-1", charge = 1, adductString = "[M-C5H10N2O+H]+"), + c(mode = "pH_mHO3P", addition = "O-3P-1", charge = 1, adductString = "[M-HO3P+H]+"), + c(mode = "pH_mC4H7", addition = "C-4H-6", charge = 1, adductString = "[M-C4H7+H]+"), + c(mode = "pH_mC6H10O4", addition = "C-6H-9O-4", charge = 1, adductString = "[M-C6H10O4+H]+"), + c(mode = "pH_mC5H8O3", addition = "C-5H-7O-3", charge = 1, adductString = "[M-C5H8O3+H]+"), + c(mode = "pH_mCO", addition = "H-1C-1O-1", charge = 1, adductString = "[M-CO+H]+"), + c(mode = "pH_mO3", addition = "H-1O-3", charge = 1, adductString = "[M-O3+H]+"), + c(mode = "pH_mC3H6", addition = "C-3H-5", charge = 1, adductString = "[M-C3H6+H]+"), + c(mode = "pH_mC4H3O5", addition = "C-4H-2O-5", charge = 1, adductString = "[M-C4H3O5+H]+"), + c(mode = "pH_mC6H11O6", addition = "C-6H-10O-6", charge = 1, adductString = "[M-C6H11O6+H]+"), + c(mode = "pH_mH2O", addition = "H-1O-1", charge = 2, adductString = "[M-H2O+H]+"), + c(mode = "pNa_mH2O", addition = "H-2O-1Na1", charge = 2, adductString = "[M-H2O+Na]+"), + c(mode = "p2H", addition = "H2", charge = 2, adductString = "[M+2H]2+"), + c(mode = "pACN_p2H", addition = "C2H5N1", charge = 2, adductString = "[M+ACN+2H]2+"), + ## positive: 2M+X + c(mode = "pM_pH", addition = add.formula(formula, "H1"), charge = 1, adductString = "[2M+H]+"), + c(mode = "pM_pK", addition = add.formula(formula, "K1"), charge = 1, adductString = "[2M+K]+"), + c(mode = "pM_pNa", addition = add.formula(formula, "Na1"), charge = 1, adductString = "[2M+Na]+"), + c(mode = "pM_pNH4", addition = add.formula(formula, "N1H4"), charge = 1, adductString = "[2M+NH4]+"), + c(mode = "pM_pACN_pH", addition = add.formula(formula, "C2H4N1"), charge = 1, adductString = "[2M+ACN+H]+"), + ## positive: strange positive adducts + c(mode = "pCOONa", addition = "C1O2Na1", charge = 1, adductString = "[M+COONa]+"), + c(mode = "p3H_c1", addition = "H3", charge = 1, adductString = "[M+3H]+"), + c(mode = "pH2O_c1", addition = "H2O1", charge = 1, adductString = "[M+H2O]+"), + c(mode = "pH_m2H2O", addition = "H-3O-2", charge = 1, adductString = "[M-2H2O+H]+"), + c(mode = "pH_pH2O", addition = "H3O1", charge = 1, adductString = "[M+H2O+H]+"), + c(mode = "pH_mNH3", addition = "N-1H-2", charge = 1, adductString = "[M-NH3+H]+"), + c(mode = "p2H_c1", addition = "H2", charge = 1, adductString = "[M+2H]+"), + c(mode = "p_mNH3_c1", addition = "N-1H-3", charge = 1, adductString = "[M-NH2-H]+"), + c(mode = "p_mNH2_pH_c1", addition = "N-1H-1", charge = 1, adductString = "[M-NH2+H]+"), + c(mode = "pM_p2Na_m3H_c1", addition = add.formula(formula, "Na2H-3"), charge = 1, adductString = "[2M+2Na-3H]+"), + c(mode = "pM_pNa_m2H_c1", addition = add.formula(formula, "Na1H-2"), charge = 1, adductString = "[2M+Na-2H]+"), + c(mode = "pM_pNa_mH_c1", addition = add.formula(formula, "Na1H-1"), charge = 1, adductString = "[2M+Na-H]+"), + c(mode = "pM_p2Na_m2H_c1", addition = add.formula(formula, "Na2H-2"), charge = 1, adductString = "[2M+2Na-2H]+"), + c(mode = "pM_pH_m2H2O_c1", addition = add.formula(formula, "H-3O-2"), charge = 1, adductString = "[2M-2H2O+H]+"), + c(mode = "pM_pH_mH2O", addition = add.formula(formula, "H-1O-1"), charge = 1, adductString = "[2M-H2O+H]+"), + c(mode = "pM_pNa_mH2O", addition = add.formula(formula, "H-2O-1Na1"), charge = 1, adductString = "[2M-H2O+Na]+"), + c(mode = "pM_m2H_c1", addition = add.formula(formula, "H-2"), charge = 1, adductString = "[2M-2H]+"), + c(mode = "pM_mH_c2", addition = add.formula(formula, "H-1"), charge = 1, adductString = "[2M-2H+H]+"), + c(mode = "pM_pLi", addition = add.formula(formula, "Li1"), charge = 1, adductString = "[2M+Li]+"), + c(mode = "pM_pH_m2O", addition = add.formula(formula, "O-2H1"), charge = 1, adductString = "[2M-2O+H]+"), + c(mode = "pM_pNa_m2O", addition = add.formula(formula, "O-2Na1"), charge = 1, adductString = "[2M-2O+Na]+"), + c(mode = "pM_pH_m3O", addition = add.formula(formula, "O-3H1"), charge = 1, adductString = "[2M-3O+H]+"), + c(mode = "pM_pNa_m3O", addition = add.formula(formula, "O-3Na1"), charge = 1, adductString = "[2M-3O+Na]+"), + c(mode = "pM_mH_c1", addition = add.formula(formula, "H-1"), charge = 1, adductString = "[2M-H]+"), + c(mode = "pM_mH_pH", addition = formula, charge = 1, adductString = "[2M-H+H]+"), + c(mode = "pH_c2", addition = "H1", charge = 2, adductString = "[M+H]2+"), + + + ## negative: M-X + c(mode = "mH", addition = "H-1", charge = -1, adductString = "[M-H]-"), + c(mode = "mCl", addition = "Cl-1", charge = -1, adductString = "[M+Cl]-"), + c(mode = "mFA", addition = "C1O2H", charge = -1, adductString = "[M+HCOOH-H]-"), + c(mode = "mH_pTFA", addition = "C2F3O2", charge = -1, adductString = "[M+CF3CO2H-H]-"), + + c(mode = "mH_mC6H10O5", addition = "C-6H-11O-5", charge = -1, adductString = "[M-C6H10O5-H]-"), + + c(mode = "mFA_pH", addition = "C1O2H2", charge = -1, adductString = "[M+HCOOH]-"), + c(mode = "mH_mH2O", addition = "H-3O-1", charge = -1, adductString = "[M-H2O-H]-"), + c(mode = "mCO2", addition = "C-1O-2", charge = -1, adductString = "[M-CO2]-"), + c(mode = "mH_mCH3", addition = "C-1H-4", charge = -1, adductString = "[M-CH3-H]-"), + c(mode = "mH_mCO2", addition = "C-1H-1O-2", charge = -1, adductString = "[M-CO2-H]-"), + c(mode = "mCH3", addition = "C-1H-3", charge = -1, adductString = "[M-CH3]-"), + c(mode = "m2H_pNa", addition = "H-2Na1", charge = -1, adductString = "[M+Na-2H]-"), + c(mode = "mM", addition = "", charge = -1, adductString = "[M]-"), + c(mode = "m2H", addition = "H-2", charge = -1, adductString = "[M-2H]-"), ## in case of positively charged compounds + c(mode = "m2H_c2", addition = "H-2", charge = -2, adductString = "[M-2H]2-"), + ## negative: 2M-X + c(mode = "mH_pM", addition = add.formula(formula, "H-1"), charge = -1, adductString = "[2M-H]-"), + c(mode = "mFA_pM", addition = add.formula(formula, "C1O2H"), charge = -1, adductString = "[2M+HCOOH-H]-"), + c(mode = "mH_pM_mH2O", addition = add.formula(formula, "H-3O-1"), charge = -1, adductString = "[2M-H2O-H]-"), + c(mode = "m2H_pM_pNa", addition = add.formula(formula, "H-2Na1"), charge = -1, adductString = "[2M+Na-2H]-"), + ## negative: strange adducts + c(mode = "mpM", addition = formula, charge = -1, adductString = "[2M]-"), + c(mode = "m2H_pHCOOH_pNa", addition = "Na1C1O2", charge = -1, adductString = "[M+HCOOH+Na-2H]-"), + c(mode = "mH_p2H", addition = "H2", charge = -1, adductString = "[M+3H-H]-"), + c(mode = "mH_pH", addition = "H1", charge = -1, adductString = "[M+2H-H]-"), + c(mode = "mH_pH2O", addition = "H1O1", charge = -1, adductString = "[M+H2O-H]-"), + c(mode = "m4H_pM_p3Na", addition = add.formula(formula, "Na3H-4"), charge = -1, adductString = "[2M+3Na-4H]-"), + c(mode = "m2H_mNH3_pNa", addition = add.formula(formula, "Na1N-1H-5"), charge = -1, adductString = "[2M-NH3+Na-2H]-"), + c(mode = "m3H_pM_p2Na", addition = add.formula(formula, "Na2H-3"), charge = -1, adductString = "[2M+2Na-3H]-"), + c(mode = "m3H_pM", addition = add.formula(formula, "H-3"), charge = -1, adductString = "[2M-3H]-"), + c(mode = "mH_p2M", addition = add.formula(formula, add.formula(formula, "H-1")), charge = -1, adductString = "[3M-H]-"), + + ## ??? + c(mode = "", addition = "", charge = 0, adductString = "[M]") + ), stringsAsFactors = F) + adductDf$charge <- as.integer(adductDf$charge) + + if(any(any(duplicated(adductDf$mode)), any(duplicated(adductDf$adductString)))) stop("Invalid adduct table") + + return(adductDf) +} +getAdductProperties <- function(mode, formula){ + if(grepl(x = mode, pattern = "pM") & is.null(formula)) + stop("Cannot calculate pM adduct without formula") + else if(is.null(formula)) formula <- "" + + adductDf <- getAdductInformation(formula) + + if(!(mode %in% adductDf$mode)) + stop("mode = \"", mode, "\" not defined") + + mzopt <- as.list(adductDf[adductDf$mode==mode,]) + return(mzopt) +} + #' Find the exact mass +/- a given margin for a given formula or its ions and adducts. #' #' @param formula The molecular formula in text or list format (see \code{\link{formulastring.to.list}} @@ -338,26 +502,9 @@ getMolecule <- function(smiles) #' @export findMz.formula <- function(formula, mode="pH", ppm=10, deltaMz=0) { - if (!any(mode %in% c("pH", "pNa", "pM", "pNH4", "mH", "mFA", - "mM", ""))) + if (!any(mode %in% knownAdducts())) stop(paste("The ionization mode", mode, "is unknown.")) - mzopt <- list(addition = "", charge = 0) - if (mode == "pH") - mzopt <- list(addition = "H", charge = 1) - if (mode == "pNa") - mzopt <- list(addition = "Na", charge = 1) - if (mode == "pM") - mzopt <- list(addition = "", charge = 1) - if (mode == "pNH4") - mzopt <- list(addition = "NH4", charge = -1) - if (mode == "mH") - mzopt <- list(addition = "H-1", charge = -1) - if (mode == "mFA") - mzopt <- list(addition = "C1O2", charge = -1) - if (mode == "mM") - mzopt <- list(addition = "", charge = -1) - if (mode == "") - mzopt <- list(addition = "", charge = 0) + mzopt <- getAdductProperties(mode, formula) formula <- add.formula(formula, mzopt$addition) # Since in special cases we want to use this with negative and zero number of atoms, we account for this case # by splitting up the formula into positive and negative atom counts (this eliminates the zeroes.) @@ -487,8 +634,8 @@ findSmiles <- function(cpdID) { stop("Compound list must be loaded first.") if(!exists("compoundList", where=.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") - if(.listEnvEnv$listEnv$compoundList[match(cpdID, .listEnvEnv$listEnv$compoundList$ID),"SMILES"] == "") - return(NA) + if(.listEnvEnv$listEnv$compoundList[match(cpdID, .listEnvEnv$listEnv$compoundList$ID),"SMILES"] == "") + return(NA) return(.listEnvEnv$listEnv$compoundList[match(cpdID, .listEnvEnv$listEnv$compoundList$ID),"SMILES"]) } @@ -585,30 +732,10 @@ findMass <- function(cpdID_or_smiles, retrieval="standard", mode = "pH") { # Must calculate mass manually if no formula is given if(retrieval == "unknown"){ - if(mode == "pH") { - mass <- 1.00784 - mode.charge <- 1 - } else if(mode == "pNa") { - mass <- 22.989769 - mode.charge <- 1 - } else if(mode == "pM") { - mass <- 0 - mode.charge <- 1 - } else if(mode == "mM") { - mass <- 0 - mode.charge <- -1 - } else if(mode == "mH") { - mass <- -1.00784 - mode.charge <- -1 - } else if(mode == "mFA") { - mass <- 59.0440 - mode.charge <- -1 - } else if(mode == "pNH4") { - mass <- 18.03846 - mode.charge <- 1 - } else{ - stop("mode = \"", mode, "\" not defined") - } + adductProperties <- getAdductProperties(mode, rcdk::get.formula(findFormula(cpdID_or_smiles))) + allowed_additions <- adductProperties$addition + mode.charge <- adductProperties$charge + mass <- getMonoisotopicMass(allowed_additions) return(findMz(cpdID_or_smiles, mode=mode, retrieval=retrieval)$mzCenter - mass + mode.charge * .emass) } diff --git a/R/leMsMs.r b/R/leMsMs.r index 81c1a57..ecb6d72 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -79,7 +79,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec progressbar = "progressBarHook", MSe = FALSE) { .checkMbSettings() - if(!any(mode %in% c("pH","pNa","pNH4","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) + if(!any(mode %in% knownAdducts())) stop(paste("The ionization mode", mode, "is unknown.")) if(!is.na(archivename)) w@archivename <- archivename @@ -151,7 +151,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec # } else { # analyzeMethod <- "formula" # } - s <- analyzeMsMs(spec, mode=mode, detail=TRUE, run="preliminary", + s <- analyzeMsMs(msmsPeaks = spec, mode=mode, detail=TRUE, run="preliminary", filterSettings = settings$filterSettings, spectraList = settings$spectraList, method = analyzeMethod) # Progress: @@ -168,7 +168,13 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec if(3 %in% steps) { message("msmsWorkflow: Step 3. Aggregate all spectra") - w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) + w@aggregated <- aggregateSpectra(spec = w@spectra, addIncomplete=TRUE) + + if(RMassBank.env$verbose.output){ + numberOfPeaksThere <- sum(unlist(lapply(X = w@spectra, FUN = function(spec){ sum(unlist(lapply(X = spec@children, FUN = function(child){ child@peaksCount }))) }))) + if(nrow(w@aggregated) < numberOfPeaksThere) + cat(paste("### Warning ### The aggregation of spectra lead to the removal of ", (numberOfPeaksThere-nrow(w@aggregated)), " / ", numberOfPeaksThere, " peaks\n", sep = "")) + } } if(allUnknown){ @@ -223,7 +229,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec } else { analyzeMethod <- "formula" } - s <- analyzeMsMs(spec, mode=mode, detail=TRUE, run="recalibrated", + s <- analyzeMsMs(msmsPeaks = spec, mode=mode, detail=TRUE, run="recalibrated", filterSettings = settings$filterSettings, spectraList = settings$spectraList, method = analyzeMethod) # Progress: @@ -242,23 +248,44 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec if(6 %in% steps) { message("msmsWorkflow: Step 6. Aggregate recalibrated results") - w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) + w@aggregated <- aggregateSpectra(spec = w@spectra, addIncomplete=TRUE) + + if(RMassBank.env$verbose.output){ + numberOfPeaksThere <- sum(unlist(lapply(X = w@spectra, FUN = function(spec){ sum(unlist(lapply(X = spec@children, FUN = function(child){ child@peaksCount }))) }))) + if(nrow(w@aggregated) < numberOfPeaksThere) + cat(paste("### Warning ### The aggregation of spectra lead to the removal of ", (numberOfPeaksThere-nrow(w@aggregated)), " / ", numberOfPeaksThere, " peaks\n", sep = "")) + } + if(!is.na(archivename)) archiveResults(w, paste(archivename, ".RData", sep=''), settings) - w@aggregated <- cleanElnoise(w@aggregated, - settings$electronicNoise, settings$electronicNoiseWidth) + w@aggregated <- cleanElnoise(peaks = w@aggregated, noise = settings$electronicNoise, width = settings$electronicNoiseWidth) + + if(RMassBank.env$verbose.output) + if(sum(w@aggregated$noise) > 0) + cat(paste("### Warning ### ", sum(w@aggregated$noise), " / ", nrow(w@aggregated), " peaks have been identified as electronic noise\n", sep = "")) } # Step 7: reanalyze failpeaks for (mono)oxidation and N2 adduct peaks if(7 %in% steps) { message("msmsWorkflow: Step 7. Reanalyze fail peaks for N2 + O") w@aggregated <- reanalyzeFailpeaks( - w@aggregated, custom_additions="N2O", mode=mode, + aggregated = w@aggregated, custom_additions="N2O", mode=mode, filterSettings=settings$filterSettings, progressbar=progressbar) if(!is.na(archivename)) - archiveResults(w, paste(archivename, "_RA.RData", sep=''), settings) + archiveResults(w, paste(archivename, "_RA.RData", sep=''), settings) + + if(RMassBank.env$verbose.output){ + isNoFormula <- is.na(w@aggregated$formula) & is.na(w@aggregated$reanalyzed.formula) + noFormulaCount <- sum(isNoFormula) + numberOfPeaksThere <- sum(unlist(lapply(X = w@spectra, FUN = function(spec){ sum(unlist(lapply(X = spec@children, FUN = function(child){ child@peaksCount }))) }))) + if(noFormulaCount > 0){ + cat(paste("### Warning ### ", noFormulaCount, " / ", numberOfPeaksThere, " peaks have no molecular formula:\n", sep = "")) + print(w@aggregated[isNoFormula, c("mzFound","intensity","cpdID")]) + } + } } + # Step 8: heuristic filtering based on peak multiplicity; # creation of failpeak list if(8 %in% steps) @@ -266,12 +293,30 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec message("msmsWorkflow: Step 8. Peak multiplicity filtering") if (is.null(settings$multiplicityFilter)) { message("msmsWorkflow: Step 8. Peak multiplicity filtering skipped because multiplicityFilter parameter is not set.") + w@aggregated <- addProperty(w@aggregated, "formulaMultiplicity", "integer", 1) + w@aggregated <- addProperty(w@aggregated, "filterOK", "logical", FALSE) + w@aggregated$filterOK <- !((is.na(w@aggregated$formulaCount) | w@aggregated$formulaCount==0) & (is.na(w@aggregated$reanalyzed.formulaCount) | w@aggregated$reanalyzed.formulaCount==0)) + w@aggregated <- addProperty(w@aggregated, "problematicPeak", "logical", FALSE) } else { # apply heuristic filter - w@aggregated <- filterMultiplicity( - w, archivename, mode, multiplicityFilter = settings$multiplicityFilter) + w@aggregated <- filterMultiplicity(w = w, archivename = archivename, mode = mode, multiplicityFilter = settings$multiplicityFilter) + + if(RMassBank.env$verbose.output){ + peakDfs <- split(x = w@aggregated, f = list("mzFound"=w@aggregated$mzFound, "cpdID"=w@aggregated$cpdID)) + numberOfPeaks <- length(peakDfs) + multiplicityNotOkCount <- numberOfPeaks - sum(unlist(lapply(X = peakDfs, FUN = function(x){any(x$filterOK)}))) + if(multiplicityNotOkCount > 0) + cat(paste("### Warning ### ", multiplicityNotOkCount, " / ", numberOfPeaks, " peaks do not fulfill the multiplicity criterion\n", sep = "")) + } + w@aggregated <- processProblematicPeaks(w, mode, archivename) - + + if(RMassBank.env$verbose.output){ + problematicPeakCount <- sum(w@aggregated$problematicPeak) + if(problematicPeakCount > 0) + cat(paste("### Warning ### ", problematicPeakCount, " / ", nrow(w@aggregated), " peaks are problematic\n", sep = "")) + } + if(!is.na(archivename)) archiveResults(w, paste(archivename, "_RF.RData", sep=''), settings) } @@ -418,6 +463,8 @@ analyzeMsMs <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", ## ); .checkMbSettings() + if(msmsPeaks@found == FALSE) + return(msmsPeaks) # Check whether the spectra can be fitted to the spectra list correctly! if(length(msmsPeaks@children) != length(spectraList)) @@ -430,18 +477,13 @@ analyzeMsMs <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", } - if(msmsPeaks@found == FALSE) - return(msmsPeaks) - if(method=="formula") { - r <- (analyzeMsMs.formula(msmsPeaks, mode, detail, run, filterSettings - )) + r <- analyzeMsMs.formula(msmsPeaks, mode, detail, run, filterSettings) } else if(method == "intensity") { - r <- (analyzeMsMs.intensity(msmsPeaks, mode, detail, run, filterSettings - )) + r <- analyzeMsMs.intensity(msmsPeaks, mode, detail, run, filterSettings) } # Add the spectrum labels to the spectra here. @@ -475,22 +517,17 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi { cut <- 0 cut_ratio <- 0 - if(run=="preliminary") - { + if(run=="preliminary"){ filterMode <- "coarse" - cut <- filterSettings$prelimCut + cut <- filterSettings$prelimCut if(is.na(cut)) { - if(mode %in% c("pH", "pM", "pNa", "pNH4")) - cut <- 1e4 - else if(mode %in% c("mH", "mFA","mM")) - cut <- 0 - else stop(paste("The ionization mode", mode, "is unknown.")) + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) + if(adductProperties$charge > 0) cut <- 1e4 + if(adductProperties$charge < 0) cut <- 0 } - cutRatio <- filterSettings$prelimCutRatio - } - else - { + cutRatio <- filterSettings$prelimCutRatio + } else { filterMode <- "fine" cut <- filterSettings$fineCut cut_ratio <- filterSettings$fineCutRatio @@ -507,12 +544,13 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi # filtering out low-intensity (<1e4) and shoulder peaks (deltam/z < 0.5, intensity # < 5%) and subsequently matching the peaks to formulas using Rcdk, discarding peaks # with insufficient match accuracy or no match. - analyzeTandemShot <- function(child) + analyzeTandemShot <- function(child, childIdx = 0) { - shot <- getData(child) - shot$row <- which(!is.na(shot$mz)) - - + + shot <- getData(child) + shot$row <- which(!is.na(shot$mz)) + + # Filter out low intensity peaks: child@low <- (shot$intensity < cut) | (shot$intensity < max(shot$intensity)*cut_ratio) shot <- shot[!child@low,,drop=FALSE] @@ -520,238 +558,263 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi # Is there still anything left? if(length(which(!child@low))==0) - { - child@ok <- FALSE - return(child) - } - + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' contains only low-intensity peaks\n", sep = "")) + + return(child) + } + # Filter out satellite peaks: shot <- filterPeakSatellites(shot, filterSettings) - child@satellite <- rep(TRUE, child@peaksCount) - child@satellite[which(child@low == TRUE)] <- NA - child@satellite[shot$row] <- FALSE - + child@satellite <- rep(TRUE, child@peaksCount) + child@satellite[which(child@low == TRUE)] <- NA + child@satellite[shot$row] <- FALSE + # Is there still anything left? if(nrow(shot)==0) - { - child@ok <- FALSE - return(child) - } - + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' contains no peaks after satellite filtering\n", sep = "")) + + return(child) + } + if(max(shot$intensity) < as.numeric(filterSettings$specOkLimit)) - { - child@ok <- FALSE - return(child) - } - + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is discarded due to parameter 'specOkLimit'\n", sep = "")) + + return(child) + } + # Crop to 4 digits (necessary because of the recalibrated values) - # this was done for the MOLGEN MSMS type analysis, is not necessary anymore now (23.1.15 MST) + # this was done for the MOLGEN MSMS type analysis, is not necessary anymore now (23.1.15 MST) # shot[,mzColname] <- round(shot[,mzColname], 5) - - # here follows the Rcdk analysis - #------------------------------------ - parentPeaks <- data.frame(mzFound=msmsPeaks@mz, - formula=msmsPeaks@formula, - dppm=0, - x1=0,x2=0,x3=0) - - # define the adduct additions - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else if(mode == "pNH4") { - allowed_additions <- "NH4" - mode.charge <- 1 - } else{ - stop("mode = \"", mode, "\" not defined") - } - - - # the ppm range is two-sided here. - # The range is slightly expanded because dppm calculation of - # generate.formula starts from empirical mass, but dppm cal- - # culation of the evaluation starts from theoretical mass. - # So we don't miss the points on 'the border'. - - if(run=="preliminary") - ppmlimit <- 2 * max(filterSettings$ppmLowMass, filterSettings$ppmHighMass) - else - ppmlimit <- 2.25 * filterSettings$ppmFine - - parent_formula <- add.formula(msmsPeaks@formula, allowed_additions) - dbe_parent <- dbe(parent_formula) - # check whether the formula is valid, i.e. has no negative or zero element numbers. - #print(parent_formula) - if(!is.valid.formula(parent_formula)) - { - child@ok <- FALSE - return(child) - } - - limits <- to.limits.rcdk(parent_formula) - - peakmatrix <- lapply( - split(shot,shot$row) - , function(shot.row) { - # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae - # finally back-correct calculated masses for the charge - mass <- shot.row[["mz"]] - mass.calc <- mass + mode.charge * .emass - peakformula <- suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), - limits, charge=0)) - + + # here follows the Rcdk analysis + #------------------------------------ + parentPeaks <- data.frame(mzFound=msmsPeaks@mz, + formula=msmsPeaks@formula, + dppm=0, + x1=0,x2=0,x3=0) + + # get the adduct additions + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) + allowed_additions <- adductProperties$addition + mode.charge <- adductProperties$charge + + # the ppm range is two-sided here. + # The range is slightly expanded because dppm calculation of + # generate.formula starts from empirical mass, but dppm cal- + # culation of the evaluation starts from theoretical mass. + # So we don't miss the points on 'the border'. + + if(run=="preliminary") + ppmlimit <- 2 * max(filterSettings$ppmLowMass, filterSettings$ppmHighMass) + else + ppmlimit <- 2.25 * filterSettings$ppmFine + + parent_formula <- add.formula(msmsPeaks@formula, allowed_additions) + dbe_parent <- dbe(parent_formula) + # check whether the formula is valid, i.e. has no negative or zero element numbers. + #print(parent_formula) + if(!is.valid.formula(parent_formula)) + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The precursor ion formula of spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is invalid\n", sep = "")) + + return(child) + } + + limits <- to.limits.rcdk(parent_formula) + + peakmatrix <- lapply(split(shot,shot$row), function(shot.row) { + # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae + # finally back-correct calculated masses for the charge + mass <- shot.row[["mz"]] + mass.calc <- mass + mode.charge * .emass + peakformula <- tryCatch( + suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), + limits, charge=0)), + error = function(e) list()) + # generate.formula(mass, + # ppm(mass, ppmlimit, p=TRUE), + # limits, charge=1), + #error= function(e) list()) + if(length(peakformula)==0) - return(t(c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, - formula=NA, mzCalc=NA))) - else - { - return(t(sapply(peakformula, function(f) - { - mzCalc <- f@mass - mode.charge * .emass - c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, - formula=f@string, - mzCalc=mzCalc) - }))) - } - - }) - - childPeaks <- as.data.frame(do.call(rbind, peakmatrix)) - - # Reformat the deformatted output correctly (why doesn't R have a better way to do this, e.g. avoid deformatting?) - - childPeaks$row <- as.numeric(as.character(childPeaks$row)) - childPeaks$intensity <- as.numeric(as.character(childPeaks$intensity)) - childPeaks$mz <- as.numeric(as.character(childPeaks$mz)) - childPeaks$formula <- as.character(childPeaks$formula) - childPeaks$mzCalc <- as.numeric(as.character(childPeaks$mzCalc)) - childPeaks$dppm <- (childPeaks$mz / childPeaks$mzCalc - 1) * 1e6 - childPeaks$dbe <- unlist(lapply(childPeaks$formula, dbe)) - - # childPeaks now contains all the good and unmatched peaks - # but not the ones which were cut as satellites or below threshold. - - ## child@mzFound <- rep(NA, child@peaksCount) - ## child@mzFound[childPeaks$row] <- as.numeric(as.character(childPeaks$mzFound)) - ## - ## child@formula <- rep(NA, child@peaksCount) - ## child@formula[childPeaks$row] <- as.character(childPeaks$formula) - ## - ## child@mzCalc <- rep(NA, child@peaksCount) - ## child@mzCalc[childPeaks$row] <- as.numeric(as.character(childPeaks$mzCalc)) - ## - ## child@dppm<- rep(NA, child@peaksCount) - ## child@dppm[childPeaks$row] <- (childPeaks$mzFound / childPeaks$mzCalc - 1) * 1e6 - # delete the NA data out again, because MolgenMsMs doesn't have them - # here and they will be re-added later - # (this is just left like this for "historical" reasons) - #childPeaks <- childPeaks[!is.na(childPeaks$formula),] - # check if a peak was recognized (here for the first time, - # otherwise the next command would fail) - - if(nrow(childPeaks)==0) - { - child@ok <- FALSE - return(child) - } - - # now apply the rule-based filters to get rid of total junk: - # dbe >= -0.5, dbe excess over mother cpd < 3 - # dbe() has been adapted to return NA for NA input - #iff_rcdk_pM_eln$maxvalence <- unlist(lapply(diff_rcdk_pM_eln$formula.rcdk, maxvalence)) - temp.child.ok <- (childPeaks$dbe >= filterSettings$dbeMinLimit) - # & dbe < dbe_parent + 3) - # check if a peak was recognized - if(length(which(temp.child.ok)) == 0) - { - child@ok <- FALSE - return(child) - } + return(t(c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, + formula=NA, mzCalc=NA))) + else + { + return(t(sapply(peakformula, function(f) + { + mzCalc <- f@mass - mode.charge * .emass + c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, + formula=f@string, + mzCalc=mzCalc) + }))) + } + }) + + childPeaks <- as.data.frame(do.call(rbind, peakmatrix)) + + presentElements <- unique(unlist(lapply(X = lapply(X = childPeaks$formula, FUN = formulastring.to.list), FUN = names))) + atomDBEs <- sapply(X = presentElements, FUN = dbe) + unknownElements <- names(atomDBEs)[sapply(X = atomDBEs, FUN = function(atomDBE){length(atomDBE)==0})] + if(length(unknownElements) > 0) stop(paste("Element(s)", paste(unknownElements), "cannot be assigned a DBE")) + + # Reformat the deformatted output correctly (why doesn't R have a better way to do this, e.g. avoid deformatting?) + + childPeaks$row <- as.numeric(as.character(childPeaks$row)) + childPeaks$intensity <- as.numeric(as.character(childPeaks$intensity)) + childPeaks$mz <- as.numeric(as.character(childPeaks$mz)) + childPeaks$formula <- as.character(childPeaks$formula) + childPeaks$mzCalc <- as.numeric(as.character(childPeaks$mzCalc)) + childPeaks$dppm <- (childPeaks$mz / childPeaks$mzCalc - 1) * 1e6 + childPeaks$dbe <- unlist(lapply(childPeaks$formula, dbe)) + + # childPeaks now contains all the good and unmatched peaks + # but not the ones which were cut as satellites or below threshold. + + ## child@mzFound <- rep(NA, child@peaksCount) + ## child@mzFound[childPeaks$row] <- as.numeric(as.character(childPeaks$mzFound)) + ## + ## child@formula <- rep(NA, child@peaksCount) + ## child@formula[childPeaks$row] <- as.character(childPeaks$formula) + ## + ## child@mzCalc <- rep(NA, child@peaksCount) + ## child@mzCalc[childPeaks$row] <- as.numeric(as.character(childPeaks$mzCalc)) + ## + ## child@dppm<- rep(NA, child@peaksCount) + ## child@dppm[childPeaks$row] <- (childPeaks$mzFound / childPeaks$mzCalc - 1) * 1e6 + # delete the NA data out again, because MolgenMsMs doesn't have them + # here and they will be re-added later + # (this is just left like this for "historical" reasons) + #childPeaks <- childPeaks[!is.na(childPeaks$formula),] + # check if a peak was recognized (here for the first time, + # otherwise the next command would fail) + + if(nrow(childPeaks)==0) + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is empty\n", sep = "")) + + return(child) + } + + if(all(is.na(childPeaks$formula))) + { + child@ok <- FALSE + child@good <- rep(FALSE, length(childPeaks$formula)) + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' comprises no peaks which could be assiged to a molecular formula\n", sep = "")) + + return(child) + } + + # now apply the rule-based filters to get rid of total junk: + # dbe >= -0.5, dbe excess over mother cpd < 3 + # dbe() has been adapted to return NA for NA input + #iff_rcdk_pM_eln$maxvalence <- unlist(lapply(diff_rcdk_pM_eln$formula.rcdk, maxvalence)) + temp.child.ok <- (childPeaks$dbe >= filterSettings$dbeMinLimit) + # & dbe < dbe_parent + 3) + # check if a peak was recognized + if(length(which(temp.child.ok)) == 0) + { + child@ok <- FALSE + child@good <- rep(FALSE, length(temp.child.ok)) + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' comprises no peaks which fulfil the dbeMinLimit criterion\n", sep = "")) + + return(child) + } #browser() - # find the best ppm value - bestPpm <- aggregate(as.data.frame(childPeaks[!is.na(childPeaks$dppm),"dppm"]), - list(childPeaks[!is.na(childPeaks$dppm),"row"]), - function(dppm) dppm[[which.min(abs(dppm))]]) + # find the best ppm value + bestPpm <- aggregate( + x = as.data.frame(childPeaks[!is.na(childPeaks$dppm),"dppm"]), + by = list(childPeaks[!is.na(childPeaks$dppm),"row"]), + FUN = function(dppm) dppm[[which.min(abs(dppm))]] + ) colnames(bestPpm) <- c("row", "dppmBest") childPeaks <- merge(childPeaks, bestPpm, by="row", all.x=TRUE) - - # Deactivated the following lines because we never actually want to look at the "old" formula count. - # To be verified (cf Refiltering, failpeak list and comparable things) - - ## # count formulas found per mass - ## countFormulasTab <- xtabs( ~formula + mz, data=childPeaks) - ## countFormulas <- colSums(countFormulasTab) - ## childPeaks$formulaCount <- countFormulas[as.character(childPeaks$row)] - + + # Deactivated the following lines because we never actually want to look at the "old" formula count. + # To be verified (cf Refiltering, failpeak list and comparable things) + + ## # count formulas found per mass + ## countFormulasTab <- xtabs( ~formula + mz, data=childPeaks) + ## countFormulas <- colSums(countFormulasTab) + ## childPeaks$formulaCount <- countFormulas[as.character(childPeaks$row)] + # filter results childPeaksFilt <- filterLowaccResults(childPeaks, filterMode, filterSettings) childPeaksGood <- childPeaksFilt[["TRUE"]] childPeaksBad <- childPeaksFilt[["FALSE"]] - if(is.null(childPeaksGood)){ - childPeaksGood <- childPeaks[c(),,drop=FALSE] - childPeaksGood$good <- logical(0) + if(is.null(childPeaksGood)){ + childPeaksGood <- childPeaks[c(),,drop=FALSE] + childPeaksGood$good <- logical(0) } - if(is.null(childPeaksBad)) - childPeaksBad <- childPeaks[c(),,drop=FALSE] - childPeaksUnassigned <- childPeaks[is.na(childPeaks$dppm),,drop=FALSE] - childPeaksUnassigned$good <- rep(FALSE, nrow(childPeaksUnassigned)) - # count formulas within new limits + if(is.null(childPeaksBad)) + childPeaksBad <- childPeaks[c(),,drop=FALSE] + childPeaksUnassigned <- childPeaks[is.na(childPeaks$dppm),,drop=FALSE] + childPeaksUnassigned$good <- rep(FALSE, nrow(childPeaksUnassigned)) + + # count formulas within new limits # (the results of the "old" count stay in childPeaksInt and are returned # in $childPeaks) - countFormulasTab <- xtabs( ~formula + mz, data=childPeaksGood) - countFormulas <- colSums(countFormulasTab) - childPeaksGood$formulaCount <- countFormulas[as.character(childPeaksGood$mz)] - - childPeaksUnassigned$formulaCount <- rep(NA, nrow(childPeaksUnassigned)) - childPeaksBad$formulaCount <- rep(NA, nrow(childPeaksBad)) - childPeaksBad$good <- rep(FALSE, nrow(childPeaksBad)) - - # Now: childPeaksGood (containing the new, recounted peaks with good = TRUE), and childPeaksBad (containing the - # peaks with good=FALSE, i.e. outside filter criteria, with the old formula count even though it is worthless) - # are bound together. - childPeaksBad <- childPeaksBad[,colnames(childPeaksGood),drop=FALSE] - childPeaksUnassigned <- childPeaksUnassigned[,colnames(childPeaksGood),drop=FALSE] - childPeaks <- rbind(childPeaksGood, childPeaksBad, childPeaksUnassigned) - - # Now let's cross fingers. Add a good=NA column to the unmatched peaks and reorder the columns - # to match order in childPeaks. After that, setData to the child slot. - - childPeaksOmitted <- getData(child) - childPeaksOmitted <- childPeaksOmitted[child@low | child@satellite,,drop=FALSE] - childPeaksOmitted$rawOK <- rep(FALSE, nrow(childPeaksOmitted)) - childPeaksOmitted$good <- rep(FALSE, nrow(childPeaksOmitted)) - childPeaksOmitted$dppm <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$formula <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$mzCalc <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$dbe <- rep(NA, nrow(childPeaksOmitted)) + countFormulasTab <- xtabs( ~formula + mz, data=childPeaksGood) + countFormulas <- colSums(countFormulasTab) + childPeaksGood$formulaCount <- countFormulas[as.character(childPeaksGood$mz)] + + childPeaksUnassigned$formulaCount <- rep(NA, nrow(childPeaksUnassigned)) + childPeaksBad$formulaCount <- rep(NA, nrow(childPeaksBad)) + childPeaksBad$good <- rep(FALSE, nrow(childPeaksBad)) + + # Now: childPeaksGood (containing the new, recounted peaks with good = TRUE), and childPeaksBad (containing the + # peaks with good=FALSE, i.e. outside filter criteria, with the old formula count even though it is worthless) + # are bound together. + childPeaksBad <- childPeaksBad[,colnames(childPeaksGood),drop=FALSE] + childPeaksUnassigned <- childPeaksUnassigned[,colnames(childPeaksGood),drop=FALSE] + childPeaks <- rbind(childPeaksGood, childPeaksBad, childPeaksUnassigned) + + # Now let's cross fingers. Add a good=NA column to the unmatched peaks and reorder the columns + # to match order in childPeaks. After that, setData to the child slot. + + childPeaksOmitted <- getData(child) + childPeaksOmitted <- childPeaksOmitted[child@low | child@satellite,,drop=FALSE] + childPeaksOmitted$rawOK <- rep(FALSE, nrow(childPeaksOmitted)) + childPeaksOmitted$good <- rep(FALSE, nrow(childPeaksOmitted)) + childPeaksOmitted$dppm <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$formula <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$mzCalc <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$dbe <- rep(NA, nrow(childPeaksOmitted)) childPeaksOmitted$dppmBest <- rep(NA, nrow(childPeaksOmitted)) childPeaksOmitted$formulaCount <- rep(0, nrow(childPeaksOmitted)) - childPeaks$satellite <- rep(FALSE, nrow(childPeaks)) - childPeaks$low <- rep(FALSE, nrow(childPeaks)) - childPeaks$rawOK <- rep(TRUE, nrow(childPeaks)) - - childPeaks <- childPeaks[,colnames(childPeaksOmitted), drop=FALSE] - - childPeaksTotal <- rbind(childPeaks, childPeaksOmitted) - child <- setData(child, childPeaksTotal) - child@ok <- TRUE - - return(child) + childPeaks$satellite <- rep(FALSE, nrow(childPeaks)) + childPeaks$low <- rep(FALSE, nrow(childPeaks)) + childPeaks$rawOK <- rep(TRUE, nrow(childPeaks)) + + childPeaks <- childPeaks[,colnames(childPeaksOmitted), drop=FALSE] + + childPeaksTotal <- rbind(childPeaks, childPeaksOmitted) + child <- setData(child, childPeaksTotal) + child@ok <- TRUE + + return(child) } # I believe these lines were fixed to remove a warning but in the refactored workflow "mzranges" doesn't exist anymore. @@ -766,11 +829,24 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi ## ## mzmin <- min(mzranges[,1], na.rm=TRUE) ## mzmax <- max(mzranges[,2], na.rm=TRUE) - children <- lapply(msmsPeaks@children, analyzeTandemShot) + children <- lapply(seq_along(msmsPeaks@children), + function(i) analyzeTandemShot(msmsPeaks@children[[i]], + childIdx = i)) + ## correct fields in case of invalid data + children <- lapply(children, function(child){ + if(child@ok) return(child) + child@rawOK <- rep(x = TRUE, times = child@peaksCount) + child@good <- rep(x = FALSE, times = child@peaksCount) + child@mzCalc <- as.numeric( rep(x = NA, times = child@peaksCount)) + child@formula <- as.character(rep(x = NA, times = child@peaksCount)) + child@dbe <- as.numeric( rep(x = NA, times = child@peaksCount)) + child@formulaCount <- as.integer( rep(x = NA, times = child@peaksCount)) + child@dppm <- as.numeric( rep(x = NA, times = child@peaksCount)) + child@dppmBest <- as.numeric( rep(x = NA, times = child@peaksCount)) + return(child) + }) - - ## shots <- mapply(function(shot, scan, info) ## { ## shot$scan <- scan @@ -796,11 +872,9 @@ analyzeMsMs.intensity <- function(msmsPeaks, mode="pH", detail=FALSE, run="preli cut <- filterSettings$prelimCut if(is.na(cut)) { - if(mode %in% c("pH", "pM", "pNa", "pNH4")) - cut <- 1e4 - else if(mode %in% c("mH", "mFA", "mM")) - cut <- 0 - else stop(paste("The ionization mode", mode, "is unknown.")) + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) + if(adductProperties$charge > 0) cut <- 1e4 + if(adductProperties$charge < 0) cut <- 0 } cutRatio <- filterSettings$prelimCutRatio } @@ -1046,12 +1120,22 @@ aggregateSpectra <- function(spec, addIncomplete=FALSE) return(table.c) }) table.cpd <- do.call(rbind, tables.c) + + ## complete missing columns if necessary + ## mz intensity good scan cpdID parentScan + ## mz intensity good mzCalc formula dbe formulaCount dppm dppmBest scan cpdID parentScan + columnNames <- c( "mzCalc", "formula", "dbe", "formulaCount", "dppm", "dppmBest") + if(all(!(columnNames %in% colnames(table.cpd)))) + for(columnName in columnNames) + table.cpd[, columnName] <- as.numeric(rep(x = NA, times = nrow(table.cpd))) + table.cpd$cpdID <- rep(s@id, nrow(table.cpd)) table.cpd$parentScan <- rep(s@parent@acquisitionNum, nrow(table.cpd)) return(table.cpd) }) #return(compoundTables) aggTable <- do.call(rbind, compoundTables) + if(is.null(aggTable)) aggTable <- data.frame("mz"=numeric(), "intensity"=numeric(), "good"=logical(), "mzCalc"=numeric(), "formula"=character(), "dbe"=numeric(), "formulaCount"=integer(), "dppm"=numeric(), "dppmBest"=numeric(), "scan"=integer(), "cpdID"=integer(), "parentScan"=integer(), stringsAsFactors=FALSE) colnames(aggTable)[1] <- "mzFound" aggTable <- addProperty(aggTable, "dppmRc", "numeric") @@ -1091,22 +1175,21 @@ aggregateSpectra <- function(spec, addIncomplete=FALSE) cleanElnoise <- function(peaks, noise=getOption("RMassBank")$electronicNoise, width = getOption("RMassBank")$electronicNoiseWidth) { - peaks <- addProperty(peaks, "noise", "logical", FALSE) - - # I don't think this makes sense if using one big table... - ## # use only best peaks - ## p_best <- peaks[is.na(peaks$dppmBest) | (peaks$dppm == peaks$dppmBest),] - - # remove known electronic noise - p_eln <- peaks - for(noisePeak in noise) - { + + # I don't think this makes sense if using one big table... + ## # use only best peaks + ## p_best <- peaks[is.na(peaks$dppmBest) | (peaks$dppm == peaks$dppmBest),] + + # remove known electronic noise + p_eln <- peaks + for(noisePeak in noise) + { noiseMatches <- which(!((p_eln$mzFound > noisePeak + width) | (p_eln$mzFound < noisePeak - width))) if(length(noiseMatches) > 0) p_eln[noiseMatches, "noise"] <- TRUE - } - return(p_eln) + } + return(p_eln) } #' Identify intense peaks (in a list of unmatched peaks) @@ -1202,7 +1285,7 @@ processProblematicPeaks <- function(w, mode, archivename = NA) # Add the info to specs specs <- addProperty(specs, "problematicPeak", "logical", FALSE) - specs[match(fp$index, specs$index),"problematicPeak"] <- TRUE + if(nrow(specs) > 0) specs[match(fp$index, specs$index),"problematicPeak"] <- TRUE # Select the columns for output into the failpeaks file fp <- fp[,c("OK", "name", "cpdID", "scan", "mzFound", "formula", @@ -1297,7 +1380,7 @@ makeRecalibration <- function(w, mode, stop("No spectra present to generate recalibration curve.") rcdata <- peaksMatched(w) - rcdata <- rcdata[rcdata$formulaCount == 1, ,drop=FALSE] + rcdata <- rcdata[!is.na(rcdata$formulaCount) & rcdata$formulaCount == 1, ,drop=FALSE] rcdata <- rcdata[,c("mzFound", "dppm", "mzCalc")] @@ -1336,9 +1419,9 @@ makeRecalibration <- function(w, mode, # plot the model par(mfrow=c(2,2)) if(nrow(rcdata)>0) - plotRecalibration.direct(rcdata, rc, rc.ms1, "MS2", - range(rcdata$mzFound), - recalibrateBy) + plotRecalibration.direct(rcdata = rcdata, rc = rc, rc.ms1 = rc.ms1, title = "MS2", + mzrange = range(rcdata$mzFound), + recalibrateBy = recalibrateBy) if(nrow(ms1data)>0) plotRecalibration.direct(ms1data, rc, rc.ms1, "MS1", range(ms1data$mzFound), @@ -1716,32 +1799,6 @@ reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, # here follows the Rcdk analysis #------------------------------------ - # define the adduct additions - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else if(mode == "pNH4") { - allowed_additions <- "NH4" - mode.charge <- 1 - } else { - stop("mode = \"", mode, "\" not defined") - } - # the ppm range is two-sided here. # The range is slightly expanded because dppm calculation of # generate.formula starts from empirical mass, but dppm cal- @@ -1750,6 +1807,11 @@ reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, db_formula <- findFormula(cpdID, retrieval=findLevel(cpdID,TRUE)) + # get the adduct additions + adductProperties <- getAdductProperties(mode, db_formula) + allowed_additions <- adductProperties$addition + mode.charge <- adductProperties$charge + ppmlimit <- 2.25 * filterSettings$ppmFine parent_formula <- add.formula(db_formula, allowed_additions) parent_formula <- add.formula(parent_formula, custom_additions) @@ -1758,8 +1820,10 @@ reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, #print(parent_formula) limits <- to.limits.rcdk(parent_formula) - peakformula <- suppressWarnings(generate.formula(mass, ppm(mass, ppmlimit, p=TRUE), - limits, charge=mode.charge)) + peakformula <- tryCatch( + suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), + limits, charge=0)), + error = function(e) list()) # was a formula found? If not, return empty result if(length(peakformula)==0) return(as.data.frame( @@ -1865,10 +1929,10 @@ filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) peaks <- cbind(peaks, data.frame(formulaMultiplicity=numeric())) if(recalcBest){ if(formulacol == "formula"){ - warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") + cat(paste("### Warning ### filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.")) } if(formulacol == "reanalyzed.formula"){ - warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") + warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point.") } peaks$fM_factor <- as.factor(peaks$formulaMultiplicity) return(peaks) @@ -1877,8 +1941,7 @@ filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) else { # calculate duplicity info - multInfo <- aggregate(as.data.frame(peaks$scan), - list(peaks$cpdID, peaks[,formulacol]), FUN=length) + multInfo <- aggregate(as.data.frame(peaks$scan), list(peaks$cpdID, peaks[,formulacol]), FUN=length) # just for comparison: # nform <- unique(paste(pks$cpdID,pks$formula)) @@ -2020,22 +2083,16 @@ filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) filterMultiplicity <- function(w, archivename=NA, mode="pH", recalcBest = TRUE, multiplicityFilter = getOption("RMassBank")$multiplicityFilter) { - # Read multiplicity filter setting - # For backwards compatibility: If the option is not set, define as 2 - # (which was the behaviour before we introduced the option) - if(is.null(multiplicityFilter)) - multiplicityFilter <- 2 + # Read multiplicity filter setting + # For backwards compatibility: If the option is not set, define as 2 + # (which was the behaviour before we introduced the option) + if(is.null(multiplicityFilter)) + multiplicityFilter <- 2 - specs <- w@aggregated - - peaksFiltered <- filterPeaksMultiplicity(peaksMatched(specs), - "formula", recalcBest) - - - peaksFilteredReanalysis <- - filterPeaksMultiplicity(specs[!is.na(specs$matchedReanalysis) & specs$matchedReanalysis,,drop=FALSE], "reanalyzed.formula", FALSE) - - + specs <- w@aggregated + + peaksFiltered <- filterPeaksMultiplicity(peaksMatched(specs), "formula", recalcBest) + peaksFilteredReanalysis <- filterPeaksMultiplicity(specs[!is.na(specs$matchedReanalysis) & specs$matchedReanalysis,,drop=FALSE], "reanalyzed.formula", FALSE) specs <- addProperty(specs, "formulaMultiplicity", "numeric", 0) @@ -2063,7 +2120,7 @@ filterMultiplicity <- function(w, archivename=NA, mode="pH", recalcBest = TRUE, # Kick the M+H+ satellites out of peaksReanOK: peaksReanOK$mzCenter <- as.numeric( - unlist(lapply(peaksReanOK$cpdID, function(id) findMz(id, retrieval=findLevel(id,TRUE))$mzCenter)) ) + unlist(lapply(peaksReanOK$cpdID, function(id) findMz(id, mode=mode, retrieval=findLevel(id,TRUE))$mzCenter)) ) peaksReanBad <- peaksReanOK[ !((peaksReanOK$mzFound < peaksReanOK$mzCenter - 1) | (peaksReanOK$mzFound > peaksReanOK$mzCenter + 1)),] diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index 2f761c6..0f0c43c 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -198,14 +198,14 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, headerData[which(headerData$msLevel == 1),"precursorScanNum"] <- 0 } # bugfix 201803: PRM scans that were performed before the first full scan (found in some files) - headerData <- headerData[ - !((headerData$msLevel == 2) & (headerData$precursorScanNum == 0)),,drop=FALSE - ] + headerData <- headerData[!((headerData$msLevel == 2) & + (is.na(headerData$precursorScanNum))),, + drop = FALSE] # Find MS2 spectra with precursors which are in the allowed - # scan filter (coarse limit) range - findValidPrecursors <- headerData[ - (headerData$precursorMZ > mz - limit.coarse) & - (headerData$precursorMZ < mz + limit.coarse),] + # scan filter (coarse limit) range; which to get rid of NAs + findValidPrecursors <- headerData[which( + headerData$precursorMZ > (mz - limit.coarse) & + headerData$precursorMZ < (mz + limit.coarse)), ] # Find the precursors for the found spectra validPrecursors <- unique(findValidPrecursors$precursorScanNum) # check whether the precursors are real: must be within fine limits! @@ -245,9 +245,11 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, spectra <- lapply(eic$scan, function(masterScan) { masterHeader <- headerData[headerData$acquisitionNum == masterScan,] - childHeaders <- headerData[(headerData$precursorScanNum == masterScan) - & (headerData$precursorMZ > mz - limit.coarse) - & (headerData$precursorMZ < mz + limit.coarse) ,] + childHeaders <- headerData[ + which(headerData$precursorScanNum == masterScan + & headerData$precursorMZ > (mz - limit.coarse) + & headerData$precursorMZ < (mz + limit.coarse)) , , + drop = FALSE] # Fix 9.10.17: headers now include non-numeric columns, leading to errors in data conversion. # Remove non-numeric columns @@ -512,7 +514,11 @@ findMsMsHRperxcms.direct <- function(fileName, cpdID, mode="pH", findPeaksArgs = pspIndex <- which(sapply(anmsms@pspectra, function(x) {candidates[[i]][closestCandidate] %in% x})) } else{ # Else choose the candidate with the closest RT - pspIndex <- which.min(abs(getRT(anmsms) - RT)) + if(RMassBank.env$strictMsMsSpectraSelection){ + pspIndex <- NULL + } else { + pspIndex <- which.min(abs(getRT(anmsms) - RT)) + } } # 2nd best: Spectrum closest to MS1 @@ -546,6 +552,290 @@ findMsMsHRperxcms.direct <- function(fileName, cpdID, mode="pH", findPeaksArgs = return(metaspec) } +################################################################################ +## new +findMsMsHRperMsp <- function(fileName, cpdIDs, mode="pH"){ + # Find mz + #mzLimits <- findMz(cpdIDs, mode) + #mz <- mzLimits$mzCenter + + # If there are more files than cpdIDs + if(length(fileName) > 1){ + fspectra <- list() + + for(i in 1:length(fileName)){ + fspectra[[i]] <- findMsMsHRperMsp.direct(fileName[i], cpdIDs, mode=mode) + } + + spectra <- toRMB(unlist(unlist(fspectra, FALSE),FALSE), cpdIDs, mode) + + } else if(length(cpdIDs) > 1){ # If there are more cpdIDs than files + + spectra <- findMsMsHRperMsp.direct(fileName = fileName, cpdIDs = cpdIDs, mode=mode) + + P <- lapply(1:length(spectra), function(i){ + sp <- toRMB(msmsXCMSspecs = spectra[[i]], cpdID = cpdIDs[i], mode = mode) + sp@id <- as.character(as.integer(cpdIDs[i])) + sp@name <- findName(cpdIDs[i]) + sp@formula <- findFormula(cpdIDs[i]) + sp@mode <- mode + + if(length(sp@children) == 1){ + sp@children[[1]]@rawOK <- rep(x = TRUE, times = sp@children[[1]]@peaksCount) + sp@children[[1]]@good <- rep(x = TRUE, times = sp@children[[1]]@peaksCount) + #sp@children[[1]]@good <- TRUE + } + + return(sp) + }) + return(P) + + } else { # There is a file for every cpdID + spectra <- toRMB(msmsXCMSspecs = unlist(findMsMsHRperMsp.direct(fileName = fileName, cpdIDs = cpdIDs, mode=mode),FALSE), cpdID = cpdIDs, mode = mode) + } + + sp <- spectra + + #sp@mz <- mzLimits + sp@id <- as.character(as.integer(cpdIDs)) + sp@name <- findName(cpdIDs) + sp@formula <- findFormula(cpdIDs) + sp@mode <- mode + + return(sp) +} + +#' @describeIn findMsMsHRperMsp A submethod of findMsMsHrperxcms that retrieves basic spectrum data +#' @export +findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { + + #requireNamespace("CAMERA",quietly=TRUE) + #requireNamespace("xcms",quietly=TRUE) + + ## + ## MSMS + ## + + # Read file + suppressWarnings(xrmsms <- read.msp(file = fileName)) + xrmsms <- xrmsms[unlist(lapply(X = xrmsms, FUN = function(spectrum){nrow(spectrum$pspectrum)})) > 0] + + ## If file is not MSe, split by collision energy + #if(MSe == FALSE){ + # # Also, fake MS1 from the MSn data + # suppressWarnings(xrs <- split(xcms::msn2xcmsRaw(xrmsms), f = xrmsms@msnCollisionEnergy)) + #} else{ + # # Else, MSn data will already be in MS1 + # xrs <- list() + # xrs[[1]] <- xrmsms + #} + #xrs <- xrmsms + + mzabs <- 0.1 + + # Definitions + whichmissing <- vector() + metaspec <- list() + + mzs <- unlist(lapply(X = xrmsms, FUN = function(x){ x$PRECURSORMZ })) + rts <- unlist(lapply(X = xrmsms, FUN = function(x){ if(x$RETENTIONTIME == "NA") return(NA) else return(x$RETENTIONTIME) })) + precursorTable <- data.frame(stringsAsFactors = FALSE, + mz = as.numeric(mzs), + rt = as.numeric(rts) + ) + precursorTable[, "rt"] <- precursorTable[, "rt"] * 60 + + ## + ## Retrieval over all supplied cpdIDs + ## + + for(idIdx in seq_along(cpdIDs)){ + + # Find all relevant information for the current cpdID + spectrum <- NULL + RT <- findRt(cpdIDs[[idIdx]])$RT * 60 + parentMass <- findMz(cpdIDs[[idIdx]], mode=mode)$mzCenter + + # Is the information in the compound list? + if(is.na(parentMass)){ + stop(paste("There was no matching entry to the supplied cpdID", cpdIDs[[idIdx]] ,"\n Please check the cpdIDs and the compoundlist.")) + } + + # Go over every collision energy of the MS2 + #for(i in seq_along(xrs)){ + + + + if (nrow(precursorTable) == 0) { + ## no peaks there + #spectrum <- matrix(0,2,7) + next + } else{ + ## at least one peak there + + # Get the peaklist + #pl <- xrs[[i]]$pspectrum + #pl <- data.frame("mz" = pl[, "mz"], "rt" = xrs[[i]]$RETENTIONTIME, stringsAsFactors = F) + + maximumParentMass <- parentMass + mzabs + minimumParentMass <- parentMass - mzabs + maximumRT <- RT * 1.1 + minimumRT <- RT * 0.9 + + mzMatch <- + precursorTable[,"mz", drop=FALSE] < maximumParentMass & + precursorTable[,"mz", drop=FALSE] > minimumParentMass + rtMatch <- + precursorTable[,"rt", drop=FALSE] < maximumRT & + precursorTable[,"rt", drop=FALSE] > minimumRT + + mzMatch[is.na(mzMatch)] <- TRUE ## RT not given + if(is.na(RT)) + rtMatch <- TRUE + + # Find precursor peak within limits + candidates <- which( mzMatch & rtMatch ) + + # Annotate and group by FWHM (full width at half maximum) + #capture.output(anmsms <- CAMERA::xsAnnotate(xsmsms[[i]])) + #capture.output(anmsms <- CAMERA::groupFWHM(anmsms)) + + # If a candidate fulfills the condition, choose the closest and retrieve the index of those pesudospectra + if(length(candidates) > 0){ + if(is.na(RT)){ + pspIndex <- candidates[[1]] + + if(RMassBank.env$verbose.output) + cat(paste("\n### Info ### Compound ", cpdIDs[[idIdx]], ": RT is not given. ", length(candidates), " candidates in range. Taking the first hit: mz[", minimumParentMass, ", ", maximumParentMass , "] vs mz ", precursorTable[pspIndex,"mz"], ".\n", sep = "")) + } else { + closestCandidate <- which.min(abs(RT - precursorTable[candidates, "rt"])) + pspIndex <- candidates[[closestCandidate]] + if(RMassBank.env$verbose.output) + cat(paste("\n### Info ### Compound ", cpdIDs[[idIdx]], ": ", length(candidates), " candidates in range. Taking the closest hit regarding RT (", RT, "): mz[", minimumParentMass, ", ", maximumParentMass , "] x rt[", minimumRT, ", ", maximumRT, "] vs (mz ", precursorTable[pspIndex,"mz"], ", rt ", precursorTable[pspIndex,"rt"], ")\n", sep = "")) + } + } else{ + # Else choose the candidate with the closest RT + if(RMassBank.env$strictMsMsSpectraSelection){ + pspIndex <- NULL + cat(paste("\n### Warning ### Compound ", cpdIDs[[idIdx]], ": No candidates in range.\n", sep = "")) + } else { + pspIndex <- which.min(abs(RT - precursorTable[, "rt"])) + cat(paste("\n### Warning ### Compound ", cpdIDs[[idIdx]], ": No candidates in range. Taking the closest hit regarding RT (", RT, "): mz[", minimumParentMass, ", ", maximumParentMass , "] x rt[", minimumRT, ", ", maximumRT, "] vs (mz ", precursorTable[pspIndex,"mz"], ", rt ", precursorTable[pspIndex,"rt"], ")\n", sep = "")) + } + } + + # 2nd best: Spectrum closest to MS1 + # pspIndex <- which.min( abs(getRT(anmsms) - actualRT)) + + ## If the plot parameter was supplied, plot it + #if((plots == TRUE) && (length(pspIndex) > 0)){ + # CAMERA::plotPsSpectrum(anmsms, pspIndex, log=TRUE, mzrange=c(0, findMz(cpdIDs)[[3]]), maxlabel=10) + #} + + # If there is a number of indexes, retrieve the pseudospectra + if(length(pspIndex) != 0){ + spectrum <- xrmsms[[pspIndex]] + } else { + # Else note the spectrum as missing + whichmissing <- c(whichmissing,idIdx) + #spectrum <- matrix(0,2,7) + } + } + #} + + # If XCMSspectra were found but there are some missing for some collision energies, fill these XCMSspectra + #if((length(XCMSspectra) != 0) && length(whichmissing)){ + # for(i in whichmissing){ + # XCMSspectra[[idIdx]] <- matrix(0,2,7) + # } + #} + + if(is.null(spectrum)){ + metaspec[[idIdx]] <- list(matrix(0,1,7)) + } else { + mz <- as.numeric(spectrum$pspectrum[, "mz"]) + rt <- as.numeric(ifelse(test = spectrum$RETENTIONTIME=="NA", yes = NA, no = spectrum$RETENTIONTIME)) + metaspec[[idIdx]] <- list(data.frame( + stringsAsFactors = F, + "mz" = mz, + "mzmin" = mz, + "mzmax" = mz, + "rt" = rt, + "rtmin" = rt, + "rtmax" = rt, + "into" = as.numeric(spectrum$pspectrum[, "intensity"]), + "into_parent" = as.numeric(spectrum$INTENSITY) + )) + } + } + + return(metaspec) +} + +## adapted from the Bioconductor package 'metaMS' (method 'read.msp') +read.msp <- function(file){ + get.text.value <- function(x, field, do.err = TRUE) { + if(trimws(x) == field) return("") + woppa <- strsplit(x, field) + woppa.lengths <- sapply(woppa, length) + if (all(woppa.lengths == 2)) { + sapply(woppa, function(y) gsub("^ +", "", y[2])) + } + else { + if (do.err) { + stop(paste("Invalid field", field, "in", x[woppa.lengths != 2])) + } + else { + NULL + } + } + } + read.compound <- function(strs) { + fields.idx <- grep(":", strs) + fields <- sapply(strsplit(strs[fields.idx], ":"), "[[", 1) + pk.idx <- which(fields == "Num Peaks") + if (length(pk.idx) == 0) + stop("No spectrum found") + cmpnd <- lapply(fields.idx[-pk.idx], function(x) get.text.value(strs[x], paste(fields[x], ":", sep = ""))) + names(cmpnd) <- fields[-pk.idx] + if(!("INTENSITY" %in% names(cmpnd))) cmpnd$"INTENSITY" <- 100 + + cmpnd$PRECURSORMZ <- gsub(x = cmpnd$PRECURSORMZ, pattern = ",", replacement = ".") + cmpnd$RETENTIONTIME <- gsub(x = cmpnd$RETENTIONTIME, pattern = ",", replacement = ".") + cmpnd$INTENSITY <- gsub(x = cmpnd$INTENSITY, pattern = ",", replacement = ".") + + ## minutes to seconds + #cmpnd$RETENTIONTIME <- as.numeric(cmpnd$RETENTIONTIME) * 60 + + nlines <- length(strs) + npeaks <- as.numeric(get.text.value(strs[pk.idx], "Num Peaks:")) + peaks.idx <- (pk.idx + 1):nlines + pks <- gsub("^ +", "", unlist(strsplit(strs[peaks.idx], ";"))) + pks <- pks[pks != ""] + if (length(pks) != npeaks) + stop(paste("Not the right number of peaks in compound '", cmpnd$Name, "' (", npeaks, " vs ", length(pks), ") in file '", file, "'", sep = "")) + pklst <- strsplit(x = pks, split = "\t| ") + pklst <- lapply(pklst, function(x) x[x != ""]) + cmz <- as.numeric(gsub(x = sapply(pklst, "[[", 1), pattern = ",", replacement = ".")) + cintens <- as.numeric(gsub(x = sapply(pklst, "[[", 2), pattern = ",", replacement = ".")) + finaltab <- matrix(c(cmz, cintens), ncol = 2) + if (any(table(cmz) > 1)) { + warning("Duplicate mass in compound ", cmpnd$Name, " (CAS ", cmpnd$CAS, ")... summing up intensities") + finaltab <- aggregate(finaltab[, 2], by = list(finaltab[, 1]), FUN = sum) + } + colnames(finaltab) <- c("mz", "intensity") + c(cmpnd, list(pspectrum = finaltab)) + } + huhn <- readLines(con = file) + starts <- which(regexpr("(Name:)|(NAME:) ", huhn) == 1) + ends <- c(starts[-1] - 1, length(huhn)) + lapply(1:length(starts), function(i){ + read.compound(huhn[starts[[i]]:ends[[i]]]) + }) +} +## new +################################################################################ + # Finds the EIC for a mass trace with a window of x ppm. # (For ppm = 10, this is +5 / -5 ppm from the non-recalibrated mz.) #' Extract EICs @@ -695,10 +985,10 @@ toRMB <- function(msmsXCMSspecs = NA, cpdID = NA, mode="pH", MS1spec = NA){ mockenv$mockAcqnum <- mockenv$mockAcqnum + 1 ## Find peak table - pks <- matrix(nrow = length(spec[,1]), ncol = 2) + pks <- matrix(nrow = nrow(spec), ncol = 2) colnames(pks) <- c("mz","int") - pks[,1] <- spec[,1] - pks[,2] <- spec[,7] + pks[,1] <- spec[,"mz"] + pks[,2] <- spec[,"into"] ## Deprofiling not necessary for XCMS @@ -707,24 +997,24 @@ toRMB <- function(msmsXCMSspecs = NA, cpdID = NA, mode="pH", MS1spec = NA){ mz = pks[,"mz"], intensity = pks[,"int"], precScanNum = as.integer(1), - precursorMz = findMz(cpdID)$mzCenter, - precursorIntensity = 0, + precursorMz = findMz(cpdID, mode=mode)$mzCenter, + precursorIntensity = ifelse(test = "into_parent" %in% colnames(spec), yes = spec[,"into_parent"], no = 0), precursorCharge = as.integer(1), collisionEnergy = 0, tic = 0, peaksCount = nrow(spec), - rt = median(spec[,4]), + rt = median(spec[,"rt"]), acquisitionNum = as.integer(mockenv$mockAcqnum), centroided = TRUE )) }) msmsSpecs <- as(do.call(c, msmsSpecs), "SimpleList") - + ##Build the new objects masterSpec <- new("Spectrum1", mz = findMz(cpdID,mode=mode)$mzCenter, - intensity = 100, + intensity = ifelse(test = msmsSpecs[[1]]@precursorIntensity != 0, yes = msmsSpecs[[1]]@precursorIntensity, no = 100), polarity = as.integer(0), peaksCount = as.integer(1), rt = msmsSpecs[[1]]@rt, @@ -813,7 +1103,7 @@ addPeaksManually <- function(w, cpdID = NA, handSpec, mode = "pH"){ mz = handSpec[,"mz"], intensity = handSpec[,"int"], precScanNum = as.integer(1), - precursorMz = findMz(cpdID)$mzCenter, + precursorMz = findMz(cpdID,mode=mode)$mzCenter, precursorIntensity = 0, precursorCharge = as.integer(1), collisionEnergy = 0, @@ -859,3 +1149,4 @@ addMB <- function(w, cpdID, fileName, mode){ w <- addPeaksManually(w, cpdID, peaklist[[1]], mode) return(w) } + diff --git a/R/msmsRawExtensions.r b/R/msmsRawExtensions.r index 5cdc1c7..dafc6fd 100644 --- a/R/msmsRawExtensions.r +++ b/R/msmsRawExtensions.r @@ -62,8 +62,8 @@ findMsMsHR.ticms2 <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA # Find MS2 spectra with precursors which are in the allowed # scan filter (coarse limit) range findValidPrecursors <- headerData[ - (headerData$precursorMZ > mz - limit.coarse) & - (headerData$precursorMZ < mz + limit.coarse),] + which(headerData$precursorMZ > (mz - limit.coarse) & + headerData$precursorMZ < (mz + limit.coarse)),] # Find the precursors for the found spectra @@ -94,10 +94,12 @@ findMsMsHR.ticms2 <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA ret$parentHeader[1,4:20] <- 0 ret$parentHeader[1,6] <- NA - childHeaders <- headerData[(headerData$acquisitionNum == masterScan) - & (headerData$precursorMZ > mz - limit.coarse) - & (headerData$precursorMZ < mz + limit.coarse) ,] - + childHeaders <- headerData[ + which(headerData$acquisitionNum == masterScan + & headerData$precursorMZ > (mz - limit.coarse) + & headerData$precursorMZ < (mz + limit.coarse)), , + drop = FALSE] + childScans <- childHeaders$acquisitionNum ret$parentScan <- min(childScans)-1 ret$parentHeader[1,1:3] <- min(childScans)-1 diff --git a/R/msmsRead.R b/R/msmsRead.R index 408d8dc..68be5d0 100644 --- a/R/msmsRead.R +++ b/R/msmsRead.R @@ -1,370 +1,420 @@ -#' -#' Extracts and processes spectra from a specified file list, according to -#' loaded options and given parameters. -#' -#' The filenames of the raw LC-MS runs are read from the array \code{files} -#' in the global enviroment. -#' See the vignette \code{vignette("RMassBank")} for further details about the -#' workflow. -#' -#' @param w A \code{msmsWorkspace} to work with. -#' @param filetable The path to a .csv-file that contains the columns "Files" and "ID" supplying -#' the relationships between files and compound IDs. Either this or the parameter "files" need -#' to be specified. -#' @param files A vector or list containing the filenames of the files that are to be read as spectra. -#' For the IDs to be inferred from the filenames alone, there need to be exactly 2 underscores. -#' @param cpdids A vector or list containing the compound IDs of the files that are to be read as spectra. -#' The ordering of this and \code{files} implicitly assigns each ID to the corresponding file. -#' If this is supplied, then the IDs implicitly named in the filenames are ignored. -#' @param readMethod Several methods are available to get peak lists from the files. -#' Currently supported are "mzR", "xcms", "MassBank" and "peaklist". -#' The first two read MS/MS raw data, and differ in the strategy -#' used to extract peaks. MassBank will read existing records, -#' so that e.g. a recalibration can be performed, and "peaklist" -#' just requires a CSV with two columns and the column header "mz", "int". -#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). -#' @param confirmMode Defaults to false (use most intense precursor). Value 1 uses -#' the 2nd-most intense precursor for a chosen ion (and its data-dependent scans) -#' , etc. -#' @param useRtLimit Whether to enforce the given retention time window. -#' @param Args A list of arguments that will be handed to the xcms-method findPeaks via do.call -#' @param settings Options to be used for processing. Defaults to the options loaded via -#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. -#' @param progressbar The progress bar callback to use. Only needed for specialized applications. -#' Cf. the documentation of \code{\link{progressBarHook}} for usage. -#' @param MSe A boolean value that determines whether the spectra were recorded using MSe or not -#' @param plots A boolean value that determines whether the pseudospectra in XCMS should be plotted -#' @return The \code{msmsWorkspace} with msms-spectra read. -#' @seealso \code{\link{msmsWorkspace-class}}, \code{\link{msmsWorkflow}} -#' @author Michael Stravs, Eawag -#' @author Erik Mueller, UFZ -#' @export -msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, - readMethod, mode, confirmMode = FALSE, useRtLimit = TRUE, - Args = NULL, settings = getOption("RMassBank"), - progressbar = "progressBarHook", MSe = FALSE, plots = FALSE){ - .checkMbSettings() - ##Read the files and cpdids according to the definition - ##All cases are silently accepted, as long as they can be handled according to one definition - if(!any(mode %in% c("pH","pNa","pM","pNH4","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) - - if(is.null(filetable)){ - ##If no filetable is supplied, filenames must be named explicitly - if(is.null(files)) - stop("Please supply the files") - - ##Assign the filenames to the workspace - w@files <- unlist(files) - - ##If no filetable is supplied, cpdids must be delivered explicitly or implicitly within the filenames - if(is.null(cpdids)){ - splitfn <- strsplit(files,"_") - splitsfn <- sapply(splitfn, function(x) x[length(x)-1]) - if(suppressWarnings(any(is.na(as.numeric(splitsfn)[1])))) - stop("Please supply the cpdids corresponding to the files in the filetable or the filenames") - cpdids <- splitsfn - } - } else{ - ##If a filetable is supplied read it - tab <- read.csv(filetable, stringsAsFactors = FALSE) - w@files <- tab[,"Files"] - cpdids <- tab[,"ID"] - } - - ##If there's more cpdids than filenames or the other way around, then abort - if(length(w@files) != length(cpdids)){ - stop("There are a different number of cpdids than files") - } - - if(!(readMethod %in% c("mzR","peaklist","xcms","minimal"))){ - stop("The supplied method does not exist") - } - - if(!all(file.exists(w@files))){ - stop("The supplied files ", paste(w@files[!file.exists(w@files)]), " don't exist") - } - - # na.ids <- which(is.na(sapply(cpdids, findSmiles))) - - # if(length(na.ids)){ - # stop("The supplied compound ids ", paste(cpdids[na.ids], collapse=" "), " don't have a corresponding smiles entry. Maybe they are missing from the compound list") - # } - - ##This should work - if(readMethod == "minimal"){ - ##Edit options - opt <- getOption("RMassBank") - opt$recalibrator$MS1 <- "recalibrate.identity" - opt$recalibrator$MS2 <- "recalibrate.identity" - opt$add_annotation==FALSE - options(RMassBank=opt) - ##Edit analyzemethod - analyzeMethod <- "intensity" - } - - if(readMethod == "mzR"){ - ##Progressbar - nLen <- length(w@files) - nProg <- 0 - pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) - - count <- 1 - envir <- environment() - w@spectra <- as(lapply(w@files, function(fileName) { - - # Find compound ID - cpdID <- cpdids[count] - retrieval <- findLevel(cpdID,TRUE) - # Set counter up - envir$count <- envir$count + 1 - - # Retrieve spectrum data - spec <- findMsMsHR(fileName = fileName, - cpdID = cpdID, mode = mode, confirmMode = confirmMode, useRtLimit = useRtLimit, - ppmFine = settings$findMsMsRawSettings$ppmFine, - mzCoarse = settings$findMsMsRawSettings$mzCoarse, - fillPrecursorScan = settings$findMsMsRawSettings$fillPrecursorScan, - rtMargin = settings$rtMargin, - deprofile = settings$deprofile, retrieval=retrieval) - gc() - - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(spec) - } ), "SimpleList") - names(w@spectra) <- basename(as.character(w@files)) - return(w) - } - - ##xcms-readmethod - if(readMethod == "xcms"){ - - ##Load libraries - requireNamespace("xcms",quietly=TRUE) - requireNamespace("CAMERA",quietly=TRUE) - - ##Find unique files and cpdIDs - ufiles <- unique(w@files) - uIDs <- unique(cpdids) - nLen <- length(ufiles) - - ##Progressbar - nProg <- 0 - pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) - i <- 1 - - ##Routine for the case of multiple cpdIDs per file - if(length(uIDs) > length(ufiles)){ - w@spectra <- as(unlist(lapply(ufiles, function(currentFile){ - fileIDs <- cpdids[which(w@files == currentFile)] - spec <- findMsMsHRperxcms(currentFile, fileIDs, mode=mode, findPeaksArgs=Args, plots, MSe = MSe) - gc() - - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(spec) - }),FALSE),"SimpleList") - return(w) - } - - ##Routine for the other cases - w@spectra <- as(lapply(uIDs, function(ID){ - # Find files corresponding to the compoundID - currentFile <- w@files[which(cpdids == ID)] - - # Retrieve spectrum data - spec <- findMsMsHRperxcms(currentFile, ID, mode=mode, findPeaksArgs=Args, plots, MSe = MSe) - gc() - - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(spec) - }),"SimpleList") - ##If there are more files than unique cpdIDs, only remember the first file for every cpdID - w@files <- w@files[sapply(uIDs, function(ID){ - return(which(cpdids == ID)[1]) - })] - return(w) - } - - ##Peaklist-readmethod - if((readMethod == "peaklist") || (readMethod=="minimal")){ - w <- createSpecsFromPeaklists(w, cpdids, filenames=w@files, mode=mode) - uIDs <- unique(cpdids) - files <- list() - - for(i in 1:length(uIDs)){ - indices <- sapply(cpdids,function(a){return(uIDs[i] %in% a)}) - files[[i]] <- w@files[indices] - } - - w@files <- sapply(files,function(file){return(file[1])}) - message("Peaks read") - return(w) - } - - -} - -#' -#' Extracts and processes spectra from a list of xcms-Objects -#' -#' The filenames of the raw LC-MS runs are read from the array \code{files} -#' in the global enviroment. -#' See the vignette \code{vignette("RMassBank")} for further details about the -#' workflow. -#' -#' @param w A \code{msmsWorkspace} to work with. -#' @param xRAW A list of xcmsRaw objects whose peaks should be detected and added to the workspace. -#' The relevant data must be in the MS1 data of the xcmsRaw object. You can coerce the -#' msn-data in a usable object with the \code{msn2xcmsRaw} function of xcms. -#' @param cpdids A vector or list containing the compound IDs of the files that are to be read as spectra. -#' The ordering of this and \code{files} implicitly assigns each ID to the corresponding file. -#' If this is supplied, then the IDs implicitly named in the filenames are ignored. -#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). -#' @param findPeaksArgs A list of arguments that will be handed to the xcms-method findPeaks via do.call -#' @param settings Options to be used for processing. Defaults to the options loaded via -#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. -#' @param progressbar The progress bar callback to use. Only needed for specialized applications. -#' Cf. the documentation of \code{\link{progressBarHook}} for usage. -#' @param plots A boolean value that determines whether the pseudospectra in XCMS should be plotted -#' @return The \code{msmsWorkspace} with msms-spectra read. -#' @seealso \code{\link{msmsWorkspace-class}}, \code{\link{msmsWorkflow}} -#' @author Michael Stravs, Eawag -#' @author Erik Mueller, UFZ -#' @export -msmsRead.RAW <- function(w, xRAW = NULL, cpdids = NULL, mode, findPeaksArgs = NULL, - settings = getOption("RMassBank"), progressbar = "progressBarHook", plots = FALSE){ - - requireNamespace("xcms", quietly=TRUE) - - ##xRAW will be coerced into a list of length 1 if it is an xcmsRaw-object - if(class(xRAW) == "xcmsRaw"){ - xRAW <- list(xRAW) - } - - ##Error messages - if((class(xRAW) != "list") || any(sapply(xRAW, function(x) class(x) != "xcmsRaw"))){ - stop("No list of xcmsRaw-objects supplied") - } - - if(is.null(cpdids)){ - stop("No cpdids supplied") - } - - #msnExist <- which(sapply(xRAW,function(x) length(x@msnPrecursorScan) != 0)) - #print(length(msnExist)) - #print(length(xRAW)) - - #if(length(msnExist) != length(xRAW)){ - # stop(paste("No msn data in list elements", setdiff(1:length(xRAW),msnExist))) - #} - - requireNamespace("CAMERA",quietly=TRUE) - - parentMass <- findMz(cpdids[1], mode=mode)$mzCenter - if(is.na(parentMass)){ - stop(paste("There was no matching entry to the supplied cpdID", cpdids[1] ,"\n Please check the cpdIDs and the compoundlist.")) - } - - RT <- findRt(cpdids[1])$RT * 60 - mzabs <- 0.1 - - getRT <- function(xa) { - rt <- sapply(xa@pspectra, function(x) {median(peaks(xa@xcmsSet)[x, "rt"])}) - } - - suppressWarnings(setReplicate <- xcms::xcmsSet(files=xRAW[[1]]@filepath, method="MS1")) - xsmsms <- as.list(replicate(length(xRAW),setReplicate)) - candidates <- list() - anmsms <- list() - psp <- list() - spectra <- list() - whichmissing <- vector() - metaspec <- list() - for(i in 1:length(xRAW)){ - devnull <- suppressWarnings(capture.output(xcms::peaks(xsmsms[[i]]) <- do.call(xcms::findPeaks,c(findPeaksArgs, object = xRAW[[i]])))) - - if (nrow(xcms::peaks(xsmsms[[i]])) == 0) { ##If there are no peaks - spectra[[i]] <- matrix(0,2,7) - next - } else{ - ## Get pspec - pl <- xcms::peaks(xsmsms[[i]])[,c("mz", "rt"), drop=FALSE] - - ## Best: find precursor peak - candidates[[i]] <- which( pl[,"mz", drop=FALSE] < parentMass + mzabs & pl[,"mz", drop=FALSE] > parentMass - mzabs - & pl[,"rt", drop=FALSE] < RT * 1.1 & pl[,"rt", drop=FALSE] > RT * 0.9 ) - devnull <- capture.output(anmsms[[i]] <- CAMERA::xsAnnotate(xsmsms[[i]])) - devnull <- capture.output(anmsms[[i]] <- CAMERA::groupFWHM(anmsms[[i]])) - - if(length(candidates[[i]]) > 0){ - closestCandidate <- which.min (abs( RT - pl[candidates[[i]], "rt", drop=FALSE])) - psp[[i]] <- which(sapply(anmsms[[i]]@pspectra, function(x) {candidates[[i]][closestCandidate] %in% x})) - } else{ - psp[[i]] <- which.min( abs(getRT(anmsms[[i]]) - RT) ) - } - ## Now find the pspec for compound - - ## 2nd best: Spectrum closest to MS1 - ##psp <- which.min( abs(getRT(anmsms) - actualRT)) - - ## 3rd Best: find pspec closest to RT from spreadsheet - ##psp <- which.min( abs(getRT(anmsms) - RT) ) - if((plots == TRUE) && (length(psp[[i]]) > 0)){ - CAMERA::plotPsSpectrum(anmsms[[i]], psp[[i]], log=TRUE, mzrange=c(0, findMz(cpdids[1])[[3]]), maxlabel=10) - } - if(length(psp[[i]]) != 0){ - spectra[[i]] <- CAMERA::getpspectra(anmsms[[i]], psp[[i]]) - } else { - whichmissing <- c(whichmissing,i) - } - } - } - if(length(spectra) != 0){ - for(i in whichmissing){ - spectra[[i]] <- matrix(0,2,7) - } - } - - sp <- toRMB(spectra,cpdids,"mH") - sp@id <- as.character(as.integer(cpdids)) - sp@name <- findName(cpdids) - sp@formula <- findFormula(cpdids) - sp@mode <- mode - - if(length(w@spectra) != 0){ - IDindex <- sapply(w@spectra,function(s) s@id == cpdids) - if(length(IDindex)){ - spectraNum <- length(w@spectra[[which(IDindex)]]@children) - w@spectra[[which(IDindex)]]@children[[spectraNum+1]] <- sp@children[[1]] - } else { - w@spectra[[length(w@spectra)+1]] <- sp - } - } else{ - w@spectra[[1]] <- sp - } - - if(all(w@files != xRAW[[1]]@filepath)){ - w@files <- c(w@files,xRAW[[1]]@filepath) - } else{ - for(i in 2:(length(w@files)+1)){ - currentFPath <- paste0(xRAW[[1]]@filepath,"_",i) - if(all(w@files != currentFPath)){ - w@files <- c(w@files,currentFPath) - break - } - } - } - - return(w) -} - +#' +#' Extracts and processes spectra from a specified file list, according to +#' loaded options and given parameters. +#' +#' The filenames of the raw LC-MS runs are read from the array \code{files} +#' in the global enviroment. +#' See the vignette \code{vignette("RMassBank")} for further details about the +#' workflow. +#' +#' @param w A \code{msmsWorkspace} to work with. +#' @param filetable The path to a .csv-file that contains the columns "Files" and "ID" supplying +#' the relationships between files and compound IDs. Either this or the parameter "files" need +#' to be specified. +#' @param files A vector or list containing the filenames of the files that are to be read as spectra. +#' For the IDs to be inferred from the filenames alone, there need to be exactly 2 underscores. +#' @param cpdids A vector or list containing the compound IDs of the files that are to be read as spectra. +#' The ordering of this and \code{files} implicitly assigns each ID to the corresponding file. +#' If this is supplied, then the IDs implicitly named in the filenames are ignored. +#' @param readMethod Several methods are available to get peak lists from the files. +#' Currently supported are "mzR", "xcms", "MassBank" and "peaklist". +#' The first two read MS/MS raw data, and differ in the strategy +#' used to extract peaks. MassBank will read existing records, +#' so that e.g. a recalibration can be performed, and "peaklist" +#' just requires a CSV with two columns and the column header "mz", "int". +#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' @param confirmMode Defaults to false (use most intense precursor). Value 1 uses +#' the 2nd-most intense precursor for a chosen ion (and its data-dependent scans) +#' , etc. +#' @param useRtLimit Whether to enforce the given retention time window. +#' @param Args A list of arguments that will be handed to the xcms-method findPeaks via do.call +#' @param settings Options to be used for processing. Defaults to the options loaded via +#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. +#' @param progressbar The progress bar callback to use. Only needed for specialized applications. +#' Cf. the documentation of \code{\link{progressBarHook}} for usage. +#' @param MSe A boolean value that determines whether the spectra were recorded using MSe or not +#' @param plots A boolean value that determines whether the pseudospectra in XCMS should be plotted +#' @return The \code{msmsWorkspace} with msms-spectra read. +#' @seealso \code{\link{msmsWorkspace-class}}, \code{\link{msmsWorkflow}} +#' @author Michael Stravs, Eawag +#' @author Erik Mueller, UFZ +#' @export +msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, + readMethod, mode, confirmMode = FALSE, useRtLimit = TRUE, + Args = NULL, settings = getOption("RMassBank"), + progressbar = "progressBarHook", MSe = FALSE, plots = FALSE){ + .checkMbSettings() + ##Read the files and cpdids according to the definition + ##All cases are silently accepted, as long as they can be handled according to one definition + if(!any(mode %in% knownAdducts())) stop(paste("The ionization mode", mode, "is unknown.")) + + if(is.null(filetable)){ + ##If no filetable is supplied, filenames must be named explicitly + if(is.null(files)) + stop("Please supply the files") + + ##Assign the filenames to the workspace + w@files <- unlist(files) + + ##If no filetable is supplied, cpdids must be delivered explicitly or implicitly within the filenames + if(is.null(cpdids)){ + splitfn <- strsplit(files,"_") + splitsfn <- sapply(splitfn, function(x) x[length(x)-1]) + if(suppressWarnings(any(is.na(as.numeric(splitsfn)[1])))) + stop("Please supply the cpdids corresponding to the files in the filetable or the filenames") + cpdids <- splitsfn + } + } else{ + ##If a filetable is supplied read it + tab <- read.csv(filetable, stringsAsFactors = FALSE) + w@files <- tab[,"Files"] + cpdids <- tab[,"ID"] + } + + ##If there's more cpdids than filenames or the other way around, then abort + if(length(w@files) != length(cpdids)){ + stop("There are a different number of cpdids than files") + } + + if(!(readMethod %in% c("mzR","peaklist","xcms","minimal","msp"))){ + stop("The supplied method does not exist") + } + + if(!all(file.exists(w@files))){ + stop("The supplied files ", paste(w@files[!file.exists(w@files)]), " don't exist") + } + + # na.ids <- which(is.na(sapply(cpdids, findSmiles))) + + # if(length(na.ids)){ + # stop("The supplied compound ids ", paste(cpdids[na.ids], collapse=" "), " don't have a corresponding smiles entry. Maybe they are missing from the compound list") + # } + + ##This should work + if(readMethod == "minimal"){ + ##Edit options + opt <- getOption("RMassBank") + opt$recalibrator$MS1 <- "recalibrate.identity" + opt$recalibrator$MS2 <- "recalibrate.identity" + opt$add_annotation==FALSE + options(RMassBank=opt) + ##Edit analyzemethod + analyzeMethod <- "intensity" + } + + if(readMethod == "mzR"){ + ##Progressbar + nLen <- length(w@files) + nProg <- 0 + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + + count <- 1 + envir <- environment() + w@spectra <- as(lapply(w@files, function(fileName) { + + # Find compound ID + cpdID <- cpdids[count] + retrieval <- findLevel(cpdID,TRUE) + # Set counter up + envir$count <- envir$count + 1 + + # Retrieve spectrum data + spec <- findMsMsHR(fileName = fileName, + cpdID = cpdID, mode = mode, confirmMode = confirmMode, useRtLimit = useRtLimit, + ppmFine = settings$findMsMsRawSettings$ppmFine, + mzCoarse = settings$findMsMsRawSettings$mzCoarse, + fillPrecursorScan = settings$findMsMsRawSettings$fillPrecursorScan, + rtMargin = settings$rtMargin, + deprofile = settings$deprofile, retrieval=retrieval) + gc() + + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(spec) + } ), "SimpleList") + names(w@spectra) <- basename(as.character(w@files)) + } + + ##xcms-readmethod + if(readMethod == "xcms"){ + + ##Load libraries + requireNamespace("xcms",quietly=TRUE) + requireNamespace("CAMERA",quietly=TRUE) + + ##Find unique files and cpdIDs + ufiles <- unique(w@files) + uIDs <- unique(cpdids) + nLen <- length(ufiles) + + ##Progressbar + nProg <- 0 + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + i <- 1 + + ##Routine for the case of multiple cpdIDs per file + if(length(uIDs) > length(ufiles)){ + w@spectra <- as(unlist(lapply(ufiles, function(currentFile){ + fileIDs <- cpdids[which(w@files == currentFile)] + spec <- findMsMsHRperxcms(currentFile, fileIDs, mode=mode, findPeaksArgs=Args, plots, MSe = MSe) + gc() + + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(spec) + }),FALSE),"SimpleList") + } else { + ##Routine for the other cases + w@spectra <- as(lapply(uIDs, function(ID){ + # Find files corresponding to the compoundID + currentFile <- w@files[which(cpdids == ID)] + + # Retrieve spectrum data + spec <- findMsMsHRperxcms(currentFile, ID, mode=mode, findPeaksArgs=Args, plots, MSe = MSe) + gc() + + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(spec) + }),"SimpleList") + ##If there are more files than unique cpdIDs, only remember the first file for every cpdID + w@files <- w@files[sapply(uIDs, function(ID){ + return(which(cpdids == ID)[1]) + })] + } + } + + ##Peaklist-readmethod + if((readMethod == "peaklist") || (readMethod=="minimal")){ + w <- createSpecsFromPeaklists(w, cpdids, filenames=w@files, mode=mode) + uIDs <- unique(cpdids) + files <- list() + + for(i in 1:length(uIDs)){ + indices <- sapply(cpdids,function(a){return(uIDs[i] %in% a)}) + files[[i]] <- w@files[indices] + } + + w@files <- sapply(files,function(file){return(file[1])}) + message("Peaks read") + } + + ##MSP-readmethod + if(readMethod == "msp"){ + ##Find unique files and cpdIDs + ufiles <- unique(w@files) + uIDs <- unique(cpdids) + nLen <- length(ufiles) + + ##Progressbar + nProg <- 0 + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + i <- 1 + + ##Routine for the case of multiple cpdIDs per file + if(length(uIDs) > length(ufiles)){ + w@spectra <- as(unlist(lapply(ufiles, function(currentFile){ + fileIDs <- cpdids[which(w@files == currentFile)] + spec <- findMsMsHRperMsp(fileName = currentFile, cpdIDs = fileIDs, mode=mode) + gc() + + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(spec) + }),FALSE),"SimpleList") + } else { + ##Routine for the other cases + w@spectra <- as(lapply(uIDs, function(ID){ + # Find files corresponding to the compoundID + currentFile <- w@files[which(cpdids == ID)] + + # Retrieve spectrum data + spec <- findMsMsHRperMsp(fileName = currentFile, cpdIDs = ID, mode=mode) + gc() + + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(spec) + }),"SimpleList") + ##If there are more files than unique cpdIDs, only remember the first file for every cpdID + w@files <- w@files[sapply(uIDs, function(ID){ + return(which(cpdids == ID)[1]) + })] + } + } + + ## verbose output + if(RMassBank.env$verbose.output) + for(parentIdx in seq_along(w@spectra)) + if(!w@spectra[[parentIdx]]@found) + cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[parentIdx]]@id, "'\n", sep = "")) + + return(w) +} + +#' +#' Extracts and processes spectra from a list of xcms-Objects +#' +#' The filenames of the raw LC-MS runs are read from the array \code{files} +#' in the global enviroment. +#' See the vignette \code{vignette("RMassBank")} for further details about the +#' workflow. +#' +#' @param w A \code{msmsWorkspace} to work with. +#' @param xRAW A list of xcmsRaw objects whose peaks should be detected and added to the workspace. +#' The relevant data must be in the MS1 data of the xcmsRaw object. You can coerce the +#' msn-data in a usable object with the \code{msn2xcmsRaw} function of xcms. +#' @param cpdids A vector or list containing the compound IDs of the files that are to be read as spectra. +#' The ordering of this and \code{files} implicitly assigns each ID to the corresponding file. +#' If this is supplied, then the IDs implicitly named in the filenames are ignored. +#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' @param findPeaksArgs A list of arguments that will be handed to the xcms-method findPeaks via do.call +#' @param settings Options to be used for processing. Defaults to the options loaded via +#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. +#' @param progressbar The progress bar callback to use. Only needed for specialized applications. +#' Cf. the documentation of \code{\link{progressBarHook}} for usage. +#' @param plots A boolean value that determines whether the pseudospectra in XCMS should be plotted +#' @return The \code{msmsWorkspace} with msms-spectra read. +#' @seealso \code{\link{msmsWorkspace-class}}, \code{\link{msmsWorkflow}} +#' @author Michael Stravs, Eawag +#' @author Erik Mueller, UFZ +#' @export +msmsRead.RAW <- function(w, xRAW = NULL, cpdids = NULL, mode, findPeaksArgs = NULL, + settings = getOption("RMassBank"), progressbar = "progressBarHook", plots = FALSE){ + + requireNamespace("xcms", quietly=TRUE) + + ##xRAW will be coerced into a list of length 1 if it is an xcmsRaw-object + if(class(xRAW) == "xcmsRaw"){ + xRAW <- list(xRAW) + } + + ##Error messages + if((class(xRAW) != "list") || any(sapply(xRAW, function(x) class(x) != "xcmsRaw"))){ + stop("No list of xcmsRaw-objects supplied") + } + + if(is.null(cpdids)){ + stop("No cpdids supplied") + } + + #msnExist <- which(sapply(xRAW,function(x) length(x@msnPrecursorScan) != 0)) + #print(length(msnExist)) + #print(length(xRAW)) + + #if(length(msnExist) != length(xRAW)){ + # stop(paste("No msn data in list elements", setdiff(1:length(xRAW),msnExist))) + #} + + requireNamespace("CAMERA",quietly=TRUE) + + parentMass <- findMz(cpdids[1], mode=mode)$mzCenter + if(is.na(parentMass)){ + stop(paste("There was no matching entry to the supplied cpdID", cpdids[1] ,"\n Please check the cpdIDs and the compoundlist.")) + } + + RT <- findRt(cpdids[1])$RT * 60 + mzabs <- 0.1 + + getRT <- function(xa) { + rt <- sapply(xa@pspectra, function(x) {median(peaks(xa@xcmsSet)[x, "rt"])}) + } + + suppressWarnings(setReplicate <- xcms::xcmsSet(files=xRAW[[1]]@filepath, method="MS1")) + xsmsms <- as.list(replicate(length(xRAW),setReplicate)) + candidates <- list() + anmsms <- list() + psp <- list() + spectra <- list() + whichmissing <- vector() + metaspec <- list() + for(i in 1:length(xRAW)){ + devnull <- suppressWarnings(capture.output(xcms::peaks(xsmsms[[i]]) <- do.call(xcms::findPeaks,c(findPeaksArgs, object = xRAW[[i]])))) + + if (nrow(xcms::peaks(xsmsms[[i]])) == 0) { ##If there are no peaks + spectra[[i]] <- matrix(0,2,7) + next + } else{ + ## Get pspec + pl <- xcms::peaks(xsmsms[[i]])[,c("mz", "rt"), drop=FALSE] + + ## Best: find precursor peak + candidates[[i]] <- which( pl[,"mz", drop=FALSE] < parentMass + mzabs & pl[,"mz", drop=FALSE] > parentMass - mzabs + & pl[,"rt", drop=FALSE] < RT * 1.1 & pl[,"rt", drop=FALSE] > RT * 0.9 ) + devnull <- capture.output(anmsms[[i]] <- CAMERA::xsAnnotate(xsmsms[[i]])) + devnull <- capture.output(anmsms[[i]] <- CAMERA::groupFWHM(anmsms[[i]])) + + if(length(candidates[[i]]) > 0){ + closestCandidate <- which.min (abs( RT - pl[candidates[[i]], "rt", drop=FALSE])) + psp[[i]] <- which(sapply(anmsms[[i]]@pspectra, function(x) {candidates[[i]][closestCandidate] %in% x})) + } else{ + psp[[i]] <- which.min( abs(getRT(anmsms[[i]]) - RT) ) + } + ## Now find the pspec for compound + + ## 2nd best: Spectrum closest to MS1 + ##psp <- which.min( abs(getRT(anmsms) - actualRT)) + + ## 3rd Best: find pspec closest to RT from spreadsheet + ##psp <- which.min( abs(getRT(anmsms) - RT) ) + if((plots == TRUE) && (length(psp[[i]]) > 0)){ + CAMERA::plotPsSpectrum(anmsms[[i]], psp[[i]], log=TRUE, mzrange=c(0, findMz(cpdids[1])[[3]]), maxlabel=10) + } + if(length(psp[[i]]) != 0){ + spectra[[i]] <- CAMERA::getpspectra(anmsms[[i]], psp[[i]]) + } else { + whichmissing <- c(whichmissing,i) + } + } + } + if(length(spectra) != 0){ + for(i in whichmissing){ + spectra[[i]] <- matrix(0,2,7) + } + } + + sp <- toRMB(spectra,cpdids,"mH") + sp@id <- as.character(as.integer(cpdids)) + sp@name <- findName(cpdids) + sp@formula <- findFormula(cpdids) + sp@mode <- mode + + if(length(w@spectra) != 0){ + IDindex <- sapply(w@spectra,function(s) s@id == cpdids) + if(length(IDindex)){ + spectraNum <- length(w@spectra[[which(IDindex)]]@children) + w@spectra[[which(IDindex)]]@children[[spectraNum+1]] <- sp@children[[1]] + } else { + w@spectra[[length(w@spectra)+1]] <- sp + } + } else{ + w@spectra[[1]] <- sp + } + + if(all(w@files != xRAW[[1]]@filepath)){ + w@files <- c(w@files,xRAW[[1]]@filepath) + } else{ + for(i in 2:(length(w@files)+1)){ + currentFPath <- paste0(xRAW[[1]]@filepath,"_",i) + if(all(w@files != currentFPath)){ + w@files <- c(w@files,currentFPath) + break + } + } + } + + return(w) +} + diff --git a/R/settings_example.R b/R/settings_example.R index b8fb266..b3ada46 100755 --- a/R/settings_example.R +++ b/R/settings_example.R @@ -215,6 +215,7 @@ NULL "RECALIBRATE" = "loess on assigned fragments and MS1" ) ), + include_sp_tags = FALSE, # List of data-dependent scans in their order (relative to the parent scan) # list(mode, ces, ce, res): # mode: fragmentation mode diff --git a/R/webAccess.R b/R/webAccess.R index c2e3fdc..723ece7 100755 --- a/R/webAccess.R +++ b/R/webAccess.R @@ -2,8 +2,41 @@ NULL ## library(XML) ## library(RCurl) +## library(jsonlite) +retrieveDataWithRetry <- function(url, timeout, maximumNumberOfRetries = 5, retryDelayInSeconds = 3){ + #data <- getURL(URLencode(url), timeout=5) + + data <- NULL + queryIsSuccessful <- FALSE + numberOfRetries <- 0 + while(!queryIsSuccessful & numberOfRetries < maximumNumberOfRetries){ + data <- tryCatch( + expr = { + data <- getURL(url = url, timeout = timeout) + queryIsSuccessful <- TRUE + data + }, + warning=function(w){ + numberOfRetries <<- numberOfRetries + 1 + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### Web query failed (", numberOfRetries, " / ", maximumNumberOfRetries, ") for url '", url, "' because of warning '", w, "'\n", sep = "")) + if(numberOfRetries < maximumNumberOfRetries) + Sys.sleep(time = retryDelayInSeconds) + }, + error=function(e){ + numberOfRetries <<- numberOfRetries + 1 + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### Web query failed (", numberOfRetries, " / ", maximumNumberOfRetries, ") for url '", url, "' because of error '", e, "'\n", sep = "")) + if(numberOfRetries < maximumNumberOfRetries) + Sys.sleep(time = retryDelayInSeconds) + } + ) + } + + return(data) +} #' Retrieve information from Cactus #' @@ -352,6 +385,45 @@ getPcCHEBI <- function(query, from = "inchikey") } } +#' Retrieves DTXSID (if it exists) from EPA Comptox Dashboard +#' +#' @usage getCompTox(query) +#' @param query The InChIKey of the compound. +#' @return Returns the DTXSID. +#' +#' +#' @examples +#' +#' \dontrun{ +#' # getCompTox("MKXZASYAUGDDCJ-NJAFHUGGSA-N") +#' } +#' +#' @author Adelene Lai +#' @export + +getCompTox <- function(query) +{ + baseURL <- "https://actorws.epa.gov/actorws/chemIdentifier/v01/resolve.json?identifier=" + url <- paste0(baseURL,query) + errorvar <- 0 + currEnvir <- environment() + tryCatch( + data <- getURL(URLencode(url), timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 #TRUE? + } + ) + + if(errorvar){ #if TRUE? + warning("EPA web service is currently offline") + return(NA) + } + + r <- fromJSON(data) #returns list + return(r$DataRow$dtxsid) + + } + #' Retrieve the Chemspider ID for a given compound #' #' Given an InChIKey, this function queries the chemspider web API to retrieve @@ -378,16 +450,22 @@ getCSID <- function(query) baseURL <- "http://www.chemspider.com/InChI.asmx/InChIKeyToCSID?inchi_key=" url <- paste0(baseURL, query) - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url), timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ + #errorvar <- 0 + #currEnvir <- environment() + # + #tryCatch( + # data <- getURL(URLencode(url), timeout=5), + # error=function(e){ + # currEnvir$errorvar <- 1 + #}) + # + #if(errorvar){ + # warning("Chemspider is currently offline") + # return(NA) + #} + + data <- retrieveDataWithRetry(url = URLencode(url), timeout = 5) + if(is.null(data)){ warning("Chemspider is currently offline") return(NA) } @@ -544,3 +622,4 @@ getPcSDF <- function(query, from = "smiles"){ data <- c(strsplit(substring(data,1,molEnd),"\n")[[1]],"$$$$") return(data) } + diff --git a/R/zzz.R b/R/zzz.R index 0fc2413..7fe2be5 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,9 +1,15 @@ -.onLoad <- function(libname, pkgname) { - RMassBank.env <<- new.env() - RMassBank.env$ReadAnnotation <- FALSE - RMassBank.env$testnumber <- 1 - mb <- list() - attach(RMassBank.env) -} - +.onLoad <- function(libname, pkgname) { + RMassBank.env <<- new.env() + RMassBank.env$ReadAnnotation <- FALSE + RMassBank.env$testnumber <- 1 + ## new variables + RMassBank.env$verbose.output <- FALSE + RMassBank.env$export.invalid <- FALSE + RMassBank.env$export.molfiles <- TRUE + RMassBank.env$strictMsMsSpectraSelection <- FALSE + + mb <- list() + attach(RMassBank.env) +} + utils::globalVariables(c("cpdID", "isotopes","mzCalc")) \ No newline at end of file diff --git a/inst/NEWS b/inst/NEWS index 9952729..1a5c7bd 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,7 @@ +Changes in version 2.11.2 + +- Avoid writing out empty PUBLICATIONS + Changes in version 2.11.1 - Fix error in workflow steps 2 and 7 with rcdk >= 3.4.9 diff --git a/inst/RMB_options.ini b/inst/RMB_options.ini index 10574b1..3718cd9 100755 --- a/inst/RMB_options.ini +++ b/inst/RMB_options.ini @@ -64,6 +64,8 @@ annotations: ms_dataprocessing: RECALIBRATE: loess on assigned fragments and MS1 +include_sp_tags: FALSE + # Annotator: # by default, "annotator.default" is used. # If you want to build your custom annotator (check ?annotator.default and the source code), diff --git a/vignettes/RMassBank.Rnw b/vignettes/RMassBank.Rnw index 2df68b6..cd20fea 100755 --- a/vignettes/RMassBank.Rnw +++ b/vignettes/RMassBank.Rnw @@ -1,5 +1,5 @@ % \VignetteIndexEntry{RMassBank walkthrough} -% \VignettePackage{rcdk} +% \VignettePackage{RMassBank} % \VignetteKeywords{} %% To generate the Latex code %library(RMassBank)