From 7eaad6e2e25c761f98bfb5bc9cd5befade38cdff Mon Sep 17 00:00:00 2001 From: Treutler Date: Wed, 29 Nov 2017 12:58:46 +0100 Subject: [PATCH 01/71] Added option 'msp' to parameter 'readMethod' in function 'msmsRead' and implemented the corresponding functionality in file 'leMsmsRaw.R'. --- R/createMassBank.R | 4086 +++++++++++++++++++------------------- R/leMsMs.r | 4630 ++++++++++++++++++++++---------------------- R/leMsmsRaw.R | 1940 +++++++++++-------- R/msmsRead.R | 789 ++++---- R/zzz.R | 17 +- 5 files changed, 5881 insertions(+), 5581 deletions(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index cc95afd..8c32c8d 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -1,2043 +1,2043 @@ -# Script for writing MassBank files - -#testtest change -#' Load MassBank compound information lists -#' -#' Loads MassBank compound information lists (i.e. the lists which were created -#' in the first two steps of the MassBank \code{\link{mbWorkflow}} and -#' subsequently edited by hand.). -#' -#' \code{resetInfolists} clears the information lists, i.e. it creates a new -#' empty list in \code{mbdata_archive}. \code{loadInfolist} loads a single CSV -#' file, whereas \code{loadInfolists} loads a whole directory. -#' -#' @aliases loadInfolists loadInfolist resetInfolists -#' @usage loadInfolists(mb, path) -#' -#' loadInfolist(mb, fileName) -#' -#' resetInfolists(mb) -#' @param path Directory in which the namelists reside. All CSV files in this -#' directory will be loaded. -#' @param fileName A single namelist to be loaded. -#' @param mb The \code{mbWorkspace} to load/reset the lists in. -#' @return The new workspace with loaded/reset lists. -#' @author Michael Stravs -#' @examples -#' -#' # -#' \dontrun{mb <- resetInfolists(mb) -#' mb <- loadInfolist(mb, "my_csv_infolist.csv")} -#' -#' @export -loadInfolists <- function(mb, path) -{ - archivefiles <- list.files(path, ".csv", full.names=TRUE) - for(afile in archivefiles) - mb <- loadInfolist(mb, afile) - return(mb) -} - -# Load an "infolist". This loads a CSV file which should contain the entries -# edited and controlled by hand. All compound infos from fileName are added into the -# global mbdata_archive. Entries with a cpdID which was already present, are substituted -# by new entries from the fileName file. -#' @export -loadInfolist <- function(mb, fileName) -{ - # Prime a new infolist if it doesn't exist - if(ncol(mb@mbdata_archive) == 0) - mb <- resetInfolists(mb) - mbdata_new <- read.csv(fileName, sep=",", stringsAsFactors=FALSE) - # Legacy check for loading the Uchem format files. - # Even if dbname_* are not used downstream of here, it's still good to keep them - # for debugging reasons. - n <- colnames(mbdata_new) - cols <- c("id","dbcas","dataused") - - # Check if comma-separated or semicolon-separated - d <- setdiff(cols, n) - if(length(d)>0){ - mbdata_new <- read.csv2(fileName, stringsAsFactors=FALSE) - n <- colnames(mbdata_new) - d2 <- setdiff(cols, n) - if(length(d2) > 0){ - stop("Some columns are missing in the infolist.") - } - } - if("dbname_d" %in% colnames(mbdata_new)) - { - colnames(mbdata_new)[[which(colnames(mbdata_new)=="dbname_d")]] <- "dbname" - # dbname_e will be dropped because of the select= in the subset below. - } - if("COMMENT.EAWAG_UCHEM_ID" %in% colnames(mbdata_new)) - colnames(mbdata_new)[[which(colnames(mbdata_new)== "COMMENT.EAWAG_UCHEM_ID")]] <- - "COMMENT.ID" - - # Clear from padding spaces and NAs - mbdata_new <- as.data.frame(t(apply(mbdata_new, 1, function(r) - { - # Substitute empty spaces by real NA values - r[which(r == "")] <- NA - # Trim spaces (in all non-NA fields) - r[which(!is.na(r))] <- sub("^ *([^ ]+) *$", "\\1", r[which(!is.na(r))]) - return(r) - }))) - # use only the columns present in mbdata_archive, no other columns added in excel - mbdata_new <- mbdata_new[, colnames(mb@mbdata_archive)] - # substitute the old entires with the ones from our files - # then find the new (previously inexistent) entries, and rbind them to the table - new_entries <- setdiff(mbdata_new$id, mb@mbdata_archive$id) - old_entries <- intersect(mbdata_new$id, mb@mbdata_archive$id) - for(entry in old_entries) - mb@mbdata_archive[mb@mbdata_archive$id == entry,] <- mbdata_new[mbdata_new$id == entry,] - mb@mbdata_archive <- rbind(mb@mbdata_archive, - mbdata_new[mbdata_new$id==new_entries,]) - return(mb) - -} - - -# Resets the mbdata_archive to an empty version. -#' @export -resetInfolists <- function(mb) -{ - mb@mbdata_archive <- - structure(list(X = integer(0), id = integer(0), dbcas = character(0), - dbname = character(0), dataused = character(0), COMMENT.CONFIDENCE = character(0), - COMMENT.ID = integer(0), CH.NAME1 = character(0), - CH.NAME2 = character(0), CH.NAME3 = character(0), CH.COMPOUND_CLASS = character(0), - CH.FORMULA = character(0), CH.EXACT_MASS = numeric(0), CH.SMILES = character(0), - CH.IUPAC = character(0), CH.LINK.CAS = character(0), CH.LINK.CHEBI = integer(0), - CH.LINK.HMDB = character(0), CH.LINK.KEGG = character(0), CH.LINK.LIPIDMAPS = character(0), - CH.LINK.PUBCHEM = character(0), CH.LINK.INCHIKEY = character(0), - CH.LINK.CHEMSPIDER = integer(0)), .Names = c("X", "id", "dbcas", - "dbname", "dataused", "COMMENT.CONFIDENCE", "COMMENT.ID", - "CH.NAME1", "CH.NAME2", "CH.NAME3", "CH.COMPOUND_CLASS", "CH.FORMULA", - "CH.EXACT_MASS", "CH.SMILES", "CH.IUPAC", "CH.LINK.CAS", "CH.LINK.CHEBI", - "CH.LINK.HMDB", "CH.LINK.KEGG", "CH.LINK.LIPIDMAPS", "CH.LINK.PUBCHEM", - "CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER"), row.names = integer(0), class = "data.frame") - return(mb) - -} - -# The workflow function, i.e. (almost) the only thing you actually need to call. -# See below for explanation of steps. -#' MassBank record creation workflow -#' -#' Uses data generated by \code{\link{msmsWorkflow}} to create MassBank records. -#' -#' See the vignette \code{vignette("RMassBank")} for detailed informations about the usage. -#' -#' Steps: -#' -#' Step 1: Find which compounds don't have annotation information yet. For these -#' compounds, pull information from several databases (using gatherData). -#' -#' Step 2: If new compounds were found, then export the infolist.csv and stop the workflow. -#' Otherwise, continue. -#' -#' Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. -#' -#' Step 4: Compile the spectra. Using the skeletons from the archive data, create -#' MassBank records per compound and fill them with peak data for each spectrum. -#' Also, assign accession numbers based on scan mode and relative scan no. -#' -#' Step 5: Convert the internal tree-like representation of the MassBank data into -#' flat-text string arrays (basically, into text-file style, but still in memory) -#' -#' Step 6: For all OK records, generate a corresponding molfile with the structure -#' of the compound, based on the SMILES entry from the MassBank record. (This molfile -#' is still in memory only, not yet a physical file) -#' -#' Step 7: If necessary, generate the appropriate subdirectories, and actually write -#' the files to disk. -#' -#' Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank -#' to attribute substances to their corresponding structure molfiles. -#' -#' @param steps Which steps in the workflow to perform. -#' @param infolist_path A path where to store newly downloaded compound informations, -#' which should then be manually inspected. -#' @param mb The \code{mbWorkspace} to work in. -#' @param gatherData A variable denoting whether to retrieve information using several online databases \code{gatherData= "online"} -#' or to use the local babel installation \code{gatherData= "babel"}. Note that babel is used either way, if a directory is given -#' in the settings. This setting will be ignored if retrieval is set to "standard" -#' @return The processed \code{mbWorkspace}. -#' @seealso \code{\link{mbWorkspace-class}} -#' @author Michael A. Stravs, Eawag -#' @examples \dontrun{ -#' mb <- newMbWorkspace(w) # w being a msmsWorkspace -#' mb <- loadInfolists(mb, "D:/myInfolistPath") -#' mb <- mbWorkflow(mb, steps=c(1:3), "newinfos.csv") -#' -#' } -#' @export -mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.csv", gatherData = "online") -{ - # Step 1: Find which compounds don't have annotation information yet. For these - # compounds, pull information from CTS (using gatherData). - if(1 %in% steps) - { - mbdata_ids <- lapply(selectSpectra(mb@spectra, "found", "object"), function(spec) spec@id) - message("mbWorkflow: Step 1. Gather info from several databases") - # Which IDs are not in mbdata_archive yet? - new_ids <- setdiff(as.numeric(unlist(mbdata_ids)), mb@mbdata_archive$id) - mb@mbdata <- lapply(new_ids, function(id) - { - if(findLevel(id,TRUE) == "standard"){ - if(gatherData == "online"){ - - d <- gatherData(id) - } - if(gatherData == "babel"){ - # message("mbWorkflow: Step 1. Gather info using babel") - d <- gatherDataBabel(id) - } - } else{ - # message("mbWorkflow: Step 1. Gather no info - Unknown structure") - d <- gatherDataUnknown(id, mb@spectra[[1]]@mode, retrieval=findLevel(id,TRUE)) - } - message(paste(id, ": ", d$dataused, sep='')) - return(d) - }) - } - # Step 2: If new compounds were found, then export the infolist.csv and stop the workflow. - # Otherwise, continue! - if(2 %in% steps) - { - message("mbWorkflow: Step 2. Export infolist (if required)") - if(length(mb@mbdata)>0) - { - mbdata_mat <- flatten(mb@mbdata) - write.csv(as.data.frame(mbdata_mat),infolist_path, na="") - message(paste("The file", infolist_path, "was generated with new compound information. Please check and edit the table, and add it to your infolist folder.")) - return(mb) - } - else - message("No new data added.") - } - # Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. - if(3 %in% steps) - { - message("mbWorkflow: Step 3. Data reformatting") - mb@mbdata_relisted <- apply(mb@mbdata_archive, 1, readMbdata) - } - # Step 4: Compile the spectra! Using the skeletons from the archive data, create - # MassBank records per compound and fill them with peak data for each spectrum. - # Also, assign accession numbers based on scan mode and relative scan no. - if(4 %in% steps) - { - message("mbWorkflow: Step 4. Spectra compilation") - mb@compiled <- lapply( - selectSpectra(mb@spectra, "found", "object"), - function(r) { - message(paste("Compiling: ", r@name, sep="")) - mbdata <- mb@mbdata_relisted[[which(mb@mbdata_archive$id == as.numeric(r@id))]] - if(nrow(mb@additionalPeaks) > 0) - res <-compileRecord(r, mbdata, mb@aggregated, mb@additionalPeaks) - else - res <-compileRecord(r, mbdata, mb@aggregated, NULL, retrieval=findLevel(r@id,TRUE)) - return(res) - }) - # check which compounds have useful spectra - mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) - mb@problems <- which(is.na(mb@compiled)) - mb@compiled_ok <- mb@compiled[mb@ok] - } - # Step 5: Convert the internal tree-like representation of the MassBank data into - # flat-text string arrays (basically, into text-file style, but still in memory) - if(5 %in% steps) - { - message("mbWorkflow: Step 5. Flattening records") - mb@mbfiles <- lapply(mb@compiled_ok, function(c) lapply(c, toMassbank)) - } - # Step 6: For all OK records, generate a corresponding molfile with the structure - # of the compound, based on the SMILES entry from the MassBank record. (This molfile - # is still in memory only, not yet a physical file) - if(6 %in% steps) - { - message("mbWorkflow: Step 6. Generate molfiles") - mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(as.numeric(c[[1]][['COMMENT']][[getOption("RMassBank")$annotations$internal_id_fieldname]]))) - } - # Step 7: If necessary, generate the appropriate subdirectories, and actually write - # the files to disk. - if(7 %in% steps) - { - message("mbWorkflow: Step 7. Generate subdirs and export") - dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "moldata", sep='/'),recursive=TRUE) - dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "recdata", sep='/'),recursive=TRUE) - for(cnt in 1:length(mb@compiled_ok)) - exportMassbank(mb@compiled_ok[[cnt]], mb@mbfiles[[cnt]], mb@molfile[[cnt]]) - } - # Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank - # to attribute substances to their corresponding structure molfiles. - if(8 %in% steps) - { - message("mbWorkflow: Step 8. Create list.tsv") - makeMollist(mb@compiled_ok) - } - return(mb) -} - - -# Calls openbabel and converts the SMILES code string (or retrieves the SMILES code from -# the ID, and then calls openbabel) to create a molfile in text format. -# If fileName is given, the file is directly stored. Otherwise, it is returned as a -# character array. -#' Create MOL file for a chemical structure -#' -#' Creates a MOL file (in memory or on disk) for a compound specified by the -#' compound ID or by a SMILES code. -#' -#' The function invokes OpenBabel (and therefore needs a correctly set -#' OpenBabel path in the RMassBank settings), using the SMILES code retrieved -#' with \code{findSmiles} or using the SMILES code directly. The current -#' implementation of the workflow uses the latter version, reading the SMILES -#' code directly from the MassBank record itself. -#' -#' @usage createMolfile(id_or_smiles, fileName = FALSE) -#' @param id_or_smiles The compound ID or a SMILES code. -#' @param fileName If the filename is set, the file is written directly to disk -#' using the specified filename. Otherwise, it is returned as a text array. -#' @return A character array containing the MOL/SDF format file, ready to be -#' written to disk. -#' @author Michael Stravs -#' @seealso \code{\link{findSmiles}} -#' @references OpenBabel: \url{http://openbabel.org} -#' @examples -#' -#' # Benzene: -#' \dontrun{ -#' createMolfile("C1=CC=CC=C1") -#' } -#' -#' @export -createMolfile <- function(id_or_smiles, fileName = FALSE) -{ - .checkMbSettings() - babeldir <- getOption("RMassBank")$babeldir - - if(!is.numeric(id_or_smiles)){ - smiles <- id_or_smiles - } else{ - if(findLevel(id_or_smiles,TRUE) != "standard"){ - return(c(" ","$$$$")) - } - smiles <- findSmiles(id_or_smiles) - } - # if no babeldir was set, get the result from cactus. - if(is.na(babeldir)) - { - res <- getCactus(smiles, "sdf") - - if(any(is.na(res))){ - res <- getPcSDF(smiles) - } - if(any(is.na(res))){ - stop("Pubchem and Cactus both seem to be down.") - } - if(is.character(fileName)) - writeLines(res, fileName) - } - # otherwise use the better-tested OpenBabel toolkit. - else - { - if(!is.character(fileName)) - cmd <- paste(babeldir, "babel -ismi -osdf -d -b --gen2D", sep='') - else - cmd <- paste(babeldir, "babel -ismi -osdf ", fileName , " -d -b --gen2D", sep='') - res <- system(cmd, intern=TRUE, input=smiles, ignore.stderr=TRUE) - # If we wrote to a file, read it back as return value. - if(is.character(fileName)) - res <- readLines(fileName) - } - return(c(" ","$$$$")) - return(res) -} - - - -# Retrieve annotation data for a compound, from the internet service Pubchem -#' Retrieve supplemental annotation data from Pubchem -#' -#' Retrieves annotation data for a compound from the internet service Pubchem -#' based on the inchikey generated by babel or Cactus -#' -#' The data retrieved is the Pubchem CID, a synonym from the Pubchem database, -#' the IUPAC name (using the preferred if available) and a Chebi link -#' -#' @usage gatherPubChem(key) -#' @param key An Inchi-Key -#' @return Returns a list with 4 slots: -#' \code{PcID} The Pubchem CID -#' \code{Synonym} An arbitrary synonym for the compound name -#' \code{IUPAC} A IUPAC-name (preferred if available) -#' \code{Chebi} The identification number of the chebi database -#' @author Erik Mueller -#' @seealso \code{\link{mbWorkflow}} -#' @references Pubchem REST: -#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} -#' Chebi: -#' \url{http://www.ebi.ac.uk/chebi} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherPubChem("QEIXBXXKTUNWDK-UHFFFAOYSA-N")} -#' -#' @export -gatherPubChem <- function(key){ - - PubChemData <- list() - - ##Trycatches are there because pubchem has connection issues 1 in 50 times. - ##Write NA into the respective fields if something goes wrong with the conenction or the data. - - ##Retrieve Pubchem CID - tryCatch( - PubChemData$PcID <- getPcId(key), - error=function(e){ - PubChemData$PcID <<- NA - }) - - ##Retrieve a synonym to the name - tryCatch( - PubChemData$Synonym <- getPcSynonym(key), - error=function(e){ - PubChemData$Synonym <<- NA - }) - - ##Retrieve the IUPAC-name - tryCatch( - PubChemData$IUPAC <- getPcIUPAC(key), - error=function(e){ - PubChemData$IUPAC <<- NA - }) - - ##Retrieve the Chebi-ID - tryCatch( - PubChemData$Chebi <- getPcCHEBI(key), - error=function(e){ - PubChemData$Chebi <<- NA - }) - - return(PubChemData) -} - -# Retrieve annotation data for a compound, from the internet services Cactvs, Pubchem, Chemspider and CTS. -#' Retrieve annotation data -#' -#' Retrieves annotation data for a compound from the internet services CTS, Pubchem, Chemspider and -#' Cactvs, based on the SMILES code and name of the compounds stored in the -#' compound list. -#' -#' Composes the "upper part" of a MassBank record filled with chemical data -#' about the compound: name, exact mass, structure, CAS no., links to PubChem, -#' KEGG, ChemSpider. The instrument type is also written into this block (even -#' if not strictly part of the chemical information). Additionally, index -#' fields are added at the start of the record, which will be removed later: -#' \code{id, dbcas, dbname} from the compound list, \code{dataused} to indicate -#' the used identifier for CTS search (\code{smiles} or \code{dbname}). -#' -#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are -#' inserted empty and will be filled later on. -#' -#' @usage gatherData(id) -#' @aliases gatherData -#' @param id The compound ID. -#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., -#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... -#' @author Michael Stravs -#' @seealso \code{\link{mbWorkflow}} -#' @references Chemical Translation Service: -#' \url{http://uranus.fiehnlab.ucdavis.edu:8080/cts/homePage} -#' cactus Chemical Identifier Resolver: -#' \url{http://cactus.nci.nih.gov/chemical/structure} -#' MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' Pubchem REST: -#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} -#' Chemspider InChI conversion: -#' \url{https://www.chemspider.com/InChI.asmx} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherData(131)} -#' -#' @export -gatherData <- function(id) -{ - ##Preamble: Is a babeldir supplied? - ##If yes, use it - - .checkMbSettings() - usebabel=TRUE - babeldir <- getOption("RMassBank")$babeldir - - if(is.na(babeldir)){ - usebabel=FALSE - } - - - ##Get all useful information from the local "database" (from the CSV sheet) - - smiles <- findSmiles(id) - mass <- findMass(smiles) - dbcas <- findCAS(id) - dbname <- findName(id) - if(is.na(dbname)) dbname <- "" - if(is.na(dbcas)) dbcas <- "" - iupacName <- dbname - synonym <- dbname - formula <- findFormula(id) - - ##Convert SMILES to InChI key via Cactvs or babel. CTS doesn't "interpret" the SMILES per se, - ##it just matches identical known SMILES, so we need to convert to a "searchable" and - ##standardized format beforehand. Other databases are able to interpret the smiles. - - if(usebabel){ - cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey') - inchikey_split <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE) - } else{ - inchikey <- getCactus(smiles, 'stdinchikey') - if(!is.na(inchikey)){ - ##Split the "InChiKey=" part off the key - inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]] - } else{ - inchikey_split <- getPcInchiKey(smiles) - } - } - - ##Use Pubchem to retrieve information - PcInfo <- gatherPubChem(inchikey_split) - - if(!is.null(PcInfo$Synonym) & !is.na(PcInfo$Synonym)){ - synonym <- PcInfo$Synonym - } - - if(!is.null(PcInfo$IUPAC) & !is.na(PcInfo$IUPAC)){ - iupacName <- PcInfo$IUPAC - } - - ##Get Chemspider-ID - csid <- getCSID(inchikey_split) - - if(is.na(csid)){ - ##Get ChemSpider ID from Cactus if the Chemspider page is down - csid <- getCactus(inchikey_split, 'chemspider_id') - } - - ##Use CTS to retrieve information - CTSinfo <- getCtsRecord(inchikey_split) - - if((CTSinfo[1] == "Sorry, we couldn't find any matching results") || is.null(CTSinfo[1])) - { - CTSinfo <- NA - } - - ##List the names - if(iupacName == ""){ - warning(paste0("Compound ID ",id,": no IUPAC name could be identified.")) - } - - if(toupper(dbname) == toupper(synonym)){ - synonym <- dbname - } - - if(toupper(dbname) == toupper(iupacName)){ - iupacName <- dbname - } - - if(toupper(synonym) == toupper(iupacName)){ - synonym <- iupacName - } - - names <- as.list(unique(c(dbname, synonym, iupacName))) - - ##If no name is found, it must be supplied in one way or another - if(all(sapply(names, function(x) x == ""))){ - stop("RMassBank wasn't able to extract a usable name for this compound from any database. Please supply a name manually.") - } - - # Start to fill the MassBank record. - # The top 4 entries will not go into the final record; they are used to identify - # the record and also to facilitate manual editing of the exported record table. - mbdata <- list() - mbdata[['id']] <- id - mbdata[['dbcas']] <- dbcas - mbdata[['dbname']] <- dbname - mbdata[['dataused']] <- "smiles" - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors - mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license - mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - # Confidence annotation and internal ID annotation. - # The ID of the compound will be written like: - # COMMENT: EAWAG_UCHEM_ID 1234 - # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" - mbdata[["COMMENT"]] <- list() - if(findLevel(id) == "0"){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment - } else{ - level <- findLevel(id) - if(level %in% c("1","1a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" - } - if(level == c("2")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" - } - if(level == c("2a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" - } - if(level == c("2b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" - } - if(level == c("3")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" - } - if(level == c("3a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" - } - if(level == c("3b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" - } - if(level == c("3c")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" - } - if(level == c("3d")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" - } - if(level == c("4")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" - } - if(level == c("5")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" - } - } - mbdata[["COMMENT"]][["ID"]] = id - # here compound info starts - mbdata[['CH$NAME']] <- names - # Currently we use a fixed value for Compound Class, since there is no useful - # convention of what should go there and what shouldn't, and the field is not used - # in search queries. - mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class - mbdata[['CH$FORMULA']] <- formula - mbdata[['CH$EXACT_MASS']] <- mass - mbdata[['CH$SMILES']] <- smiles - - if(usebabel){ - cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi') - mbdata[['CH$IUPAC']] <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE) - } else{ - mbdata[['CH$IUPAC']] <- getCactus(smiles, "stdinchi") - } - - - - # Add all CH$LINK fields present in the compound datasets - link <- list() - # CAS - if(!is.na(CTSinfo[1])){ - if("CAS" %in% CTS.externalIdTypes(CTSinfo)) - { - # Prefer database CAS if it is also listed in the CTS results. - # otherwise take the shortest one. - cas <- CTS.externalIdSubset(CTSinfo,"CAS") - if(dbcas %in% cas) - link[["CAS"]] <- dbcas - else - link[["CAS"]] <- cas[[which.min(nchar(cas))]] - } else{ - if(dbcas != ""){ - link[["CAS"]] <- dbcas - } - } - } else{ - if(dbcas != ""){ - link[["CAS"]] <- dbcas - } - } - - - # CHEBI - if(is.na(PcInfo$Chebi[1])){ - if(!is.na(CTSinfo[1])){ - if("ChEBI" %in% CTS.externalIdTypes(CTSinfo)) - { - # Cut off front "CHEBI:" if present - chebi <- CTS.externalIdSubset(CTSinfo,"ChEBI") - chebi <- chebi[[which.min(nchar(chebi))]] - chebi <- strsplit(chebi,":")[[1]] - link[["CHEBI"]] <- chebi[[length(chebi)]] - } - } - } else{ - chebi <- PcInfo$Chebi - chebi <- chebi[[which.min(nchar(chebi))]] - chebi <- strsplit(chebi,":")[[1]] - link[["CHEBI"]] <- chebi[[length(chebi)]] - } - # HMDB - if(!is.na(CTSinfo[1])){ - if("Human Metabolome Database" %in% CTS.externalIdTypes(CTSinfo)) - link[["HMDB"]] <- CTS.externalIdSubset(CTSinfo,"HMDB")[[1]] - # KEGG - if("KEGG" %in% CTS.externalIdTypes(CTSinfo)) - link[["KEGG"]] <- CTS.externalIdSubset(CTSinfo,"KEGG")[[1]] - # LipidMAPS - if("LipidMAPS" %in% CTS.externalIdTypes(CTSinfo)) - link[["LIPIDMAPS"]] <- CTS.externalIdSubset(CTSinfo,"LipidMAPS")[[1]] - } - # PubChem CID - if(is.na(PcInfo$PcID[1])){ - if(!is.na(CTSinfo[1])){ - if("PubChem CID" %in% CTS.externalIdTypes(CTSinfo)) - { - pc <- CTS.externalIdSubset(CTSinfo,"PubChem CID") - link[["PUBCHEM"]] <- paste0(min(pc)) - } - } - } else{ - link[["PUBCHEM"]] <- PcInfo$PcID[1] - } - - - if(!is.null(link[["PUBCHEM"]])){ - if(substr(link[["PUBCHEM"]],1,4) != "CID:"){ - link[["PUBCHEM"]] <- paste0("CID:", link[["PUBCHEM"]]) - } - } - - link[["INCHIKEY"]] <- inchikey_split - if(length(csid)>0) if(any(!is.na(csid))) link[["CHEMSPIDER"]] <- min(as.numeric(as.character(csid))) - mbdata[['CH$LINK']] <- link - - mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument - mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type - - return(mbdata) -} - -# Retrieve annotation data for a compound, using only babel -#' Retrieve annotation data -#' -#' Retrieves annotation data for a compound by using babel, -#' based on the SMILES code and name of the compounds stored in the -#' compound list. -#' -#' Composes the "upper part" of a MassBank record filled with chemical data -#' about the compound: name, exact mass, structure, CAS no.. -#' The instrument type is also written into this block (even -#' if not strictly part of the chemical information). Additionally, index -#' fields are added at the start of the record, which will be removed later: -#' \code{id, dbcas, dbname} from the compound list. -#' -#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are -#' inserted empty and will be filled later on. -#' -#' This function is an alternative to gatherData, in case CTS is down or if information -#' on one or more of the compounds in the compound list are sparse -#' -#' @usage gatherDataBabel(id) -#' @param id The compound ID. -#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., -#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... -#' @author Michael Stravs, Erik Mueller -#' @seealso \code{\link{mbWorkflow}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherDataBabel(131)} -#' -#' @export -gatherDataBabel <- function(id){ - .checkMbSettings() - babeldir <- getOption("RMassBank")$babeldir - smiles <- findSmiles(id) - - - # if no babeldir was set, throw an error that says that either CTS or babel have to be used - if(is.na(babeldir)) - { - stop("No babeldir supplied; It is currently not possible to convert the information without either babel or CTS") - } else { - ###Babel conversion - cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey') - inchikey <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE) - cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi') - inchi <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE) - - ##Read from Compoundlist - smiles <- findSmiles(id) - mass <- findMass(smiles) - dbcas <- findCAS(id) - dbname <- findName(id) - if(is.na(dbname)) dbname <- "" - if(is.na(dbcas)) dbcas <- "" - formula <- findFormula(id) - - ##Create - mbdata <- list() - mbdata[['id']] <- id - mbdata[['dbcas']] <- dbcas - mbdata[['dbname']] <- dbname - mbdata[['dataused']] <- "smiles" - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors - mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license - mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - # Confidence annotation and internal ID annotation. - # The ID of the compound will be written like: - # COMMENT: EAWAG_UCHEM_ID 1234 - # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" - mbdata[["COMMENT"]] <- list() - if(findLevel(id) == "0"){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment - } else{ - level <- findLevel(id) - if(level %in% c("1","1a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" - } - if(level == c("2")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" - } - if(level == c("2a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" - } - if(level == c("2b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" - } - if(level == c("3")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" - } - if(level == c("3a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" - } - if(level == c("3b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" - } - if(level == c("3c")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" - } - if(level == c("3d")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" - } - if(level == c("4")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" - } - if(level == c("5")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" - } - } - mbdata[["COMMENT"]][["ID"]] <- id - - # here compound info starts - mbdata[['CH$NAME']] <- as.list(dbname) - - # Currently we use a fixed value for Compound Class, since there is no useful - # convention of what should go there and what shouldn't, and the field is not used - # in search queries. - mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class - mbdata[['CH$FORMULA']] <- formula - mbdata[['CH$EXACT_MASS']] <- mass - mbdata[['CH$SMILES']] <- smiles - mbdata[['CH$IUPAC']] <- inchi - - link <- list() - if(dbcas != "") - link[["CAS"]] <- dbcas - link[["INCHIKEY"]] <- inchikey - mbdata[['CH$LINK']] <- link - mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument - mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type - } - return(mbdata) -} - -# Retrieve annotation data for a compound, using only babel -#' Retrieve annotation data -#' -#' Retrieves annotation data for an unknown compound by using basic information present -#' -#' Composes the "upper part" of a MassBank record filled with chemical data -#' about the compound: name, exact mass, structure, CAS no.. -#' The instrument type is also written into this block (even -#' if not strictly part of the chemical information). Additionally, index -#' fields are added at the start of the record, which will be removed later: -#' \code{id, dbcas, dbname} from the compound list. -#' -#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are -#' inserted empty and will be filled later on. -#' -#' This function is used to generate the data in case a substance is unknown, -#' i.e. not enough information is present to derive anything about formulas or links -#' -#' @usage gatherDataUnknown(id, mode, retrieval) -#' @param id The compound ID. -#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). -#' @param retrieval A value that determines whether the files should be handled either as "standard", -#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" -#' if the only know thing is the m/z -#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., -#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... -#' @author Michael Stravs, Erik Mueller -#' @seealso \code{\link{mbWorkflow}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherDataUnknown(131,"pH")} -#' -#' @export -gatherDataUnknown <- function(id, mode, retrieval){ - .checkMbSettings() - - ##Read from Compoundlist - smiles <- "" - if(retrieval == "unknown"){ - mass <- findMass(id, "unknown", mode) - formula <- "" - } - if(retrieval == "tentative"){ - mass <- findMass(id, "tentative", mode) - formula <- findFormula(id, "tentative") - } - dbcas <- NA - dbname <- findName(id) - if(is.na(dbname)) dbname <- paste("Unknown ID:",id) - if(is.na(dbcas)) dbcas <- "" - - - - ##Create - mbdata <- list() - mbdata[['id']] <- id - mbdata[['dbcas']] <- dbcas - mbdata[['dbname']] <- dbname - mbdata[['dataused']] <- "none" - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors - mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license - mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - # Confidence annotation and internal ID annotation. - # The ID of the compound will be written like: - # COMMENT: EAWAG_UCHEM_ID 1234 - # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" - mbdata[["COMMENT"]] <- list() - if(findLevel(id) == "0"){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment - } else{ - level <- findLevel(id) - if(level %in% c("1","1a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" - } - if(level == c("2")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" - } - if(level == c("2a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" - } - if(level == c("2b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" - } - if(level == c("3")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" - } - if(level == c("3a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" - } - if(level == c("3b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" - } - if(level == c("3c")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" - } - if(level == c("3d")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" - } - if(level == c("4")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" - } - if(level == c("5")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" - } - } - mbdata[["COMMENT"]][["ID"]] <- id - - # here compound info starts - mbdata[['CH$NAME']] <- as.list(dbname) - - # Currently we use a fixed value for Compound Class, since there is no useful - # convention of what should go there and what shouldn't, and the field is not used - # in search queries. - mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class - mbdata[['CH$FORMULA']] <- formula - mbdata[['CH$EXACT_MASS']] <- mass - mbdata[['CH$SMILES']] <- "" - mbdata[['CH$IUPAC']] <- "" - - link <- list() - mbdata[['CH$LINK']] <- link - mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument - mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type - - return(mbdata) -} - -# Flatten the internal tree-like representation of MassBank data to a flat table. -# Note that this limits us, in that the fields should be constant over all records! -# Therefore, e.g. the fixed number of 3 names which may be filled. -# If anybody has a cooler solution, I'll be happy to hear from you :) -# -# Note: the records from gatherData have additional information which is discarded, like -# author, copyright etc. They will be re-filled automatically when reading the file. -#' Flatten, or re-read, MassBank header blocks -#' -#' \code{flatten} converts a list of MassBank compound information sets (as -#' retrieved by \code{\link{gatherData}}) to a flat table, to be exported into -#' an \link[=loadInfolist]{infolist}. \code{readMbdata} reads a single record -#' from an infolist flat table back into a MassBank (half-)entry. -#' -#' Neither the flattening system itself nor the implementation are particularly -#' fantastic, but since hand-checking of records is a necessary evil, there is -#' currently no alternative (short of coding a complete GUI for this and -#' working directly on the records.) -#' -#' @aliases flatten readMbdata -#' @usage flatten(mbdata) -#' -#' readMbdata(row) -#' @param mbdata A list of MassBank compound information sets as returned from -#' \code{\link{gatherData}}. -#' @param row One row of MassBank compound information retrieved from an -#' infolist. -#' @return \code{flatten} returns a matrix (not a data frame) to be written to -#' CSV. -#' -#' \code{readMbdata} returns a list of type \code{list(id= \var{compoundID}, -#' ..., 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. -#' @author Michael Stravs -#' @seealso \code{\link{gatherData}},\code{\link{loadInfolist}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples \dontrun{ -#' # Collect some data to flatten -#' ids <- c(40,50,60,70) -#' data <- lapply(ids, gatherData) -#' # Flatten the data trees to a table -#' flat.table <- flatten(data) -#' # reimport the table into a tree -#' data.reimported <- apply(flat.table, 1, readMbdata) -#' } -#' -#' @export -#' -flatten <- function(mbdata) -{ - .checkMbSettings() - - colList <- c( - "id", - "dbcas", - "dbname", - "dataused", - "COMMENT.CONFIDENCE", - # Note: The field name of the internal id field is replaced with the real name - # at "compilation" time. Therefore, functions DOWNSTREAM from compileRecord() - # must use the full name including the info from options("RMassBank"). - "COMMENT.ID", - "CH$NAME1", - "CH$NAME2", - "CH$NAME3", - "CH$COMPOUND_CLASS", - "CH$FORMULA", - "CH$EXACT_MASS", - "CH$SMILES", - "CH$IUPAC", - "CH$LINK.CAS", - "CH$LINK.CHEBI", - "CH$LINK.HMDB", - "CH$LINK.KEGG", - "CH$LINK.LIPIDMAPS", - "CH$LINK.PUBCHEM", - "CH$LINK.INCHIKEY", - "CH$LINK.CHEMSPIDER") - # make an empty data frame with the right length - rows <- length(mbdata) - cols <- length(colList) - mbframe <- matrix(data=NA, nrow=rows, ncol=cols) - colnames(mbframe) <- colList - #browser() - for(row in 1:rows) - { - # fill in all the data into the dataframe: all columns which - # a) exist in the target dataframe and b) exist in the (unlisted) MB record - # are written into the dataframe. - data <- unlist(mbdata[[row]]) - # bugfix for the case of only one name - if(!("CH$NAME1" %in% names(data))) - data[["CH$NAME1"]] <- data[["CH$NAME"]] - datacols <- intersect(colList, names(data)) - mbframe[row,datacols] <- data[datacols] - } - return(mbframe) - -} - -# Read data from a flat-table MassBank record row and feed it into a -# MassBank tree-like record. Also, prime the ACCESSION and RECORD_TITLE fields in the -# correct position in the record. -#' @export -readMbdata <- function(row) -{ - .checkMbSettings() - - # Listify the table row. Lists are just cooler to work with :) - row <- as.list(row) - - mbdata <- list() - # Accession and title are added empty for now, to have them in the right place. - # Constants are read from the options or generated. - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors - mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license - mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - mbdata[['PUBLICATION']] <- getOption("RMassBank")$annotations$publication - - # Read all determined fields from the file - # This is not very flexible, as you can see... - colList <- c( - "COMMENT.CONFIDENCE", - "COMMENT.ID", - "CH$NAME1", - "CH$NAME2", - "CH$NAME3", - "CH$COMPOUND_CLASS", - "CH$FORMULA", - "CH$EXACT_MASS", - "CH$SMILES", - "CH$IUPAC", - "CH$LINK.CAS", - "CH$LINK.CHEBI", - "CH$LINK.HMDB", - "CH$LINK.KEGG", - "CH$LINK.LIPIDMAPS", - "CH$LINK.PUBCHEM", - "CH$LINK.INCHIKEY", - "CH$LINK.CHEMSPIDER") - mbdata[["COMMENT"]] = list() - mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT.CONFIDENCE"]] - # Again, our ID field. - - mbdata[["COMMENT"]][["ID"]]<- - row[["COMMENT.ID"]] - names = c(row[["CH.NAME1"]], row[["CH.NAME2"]], row[["CH.NAME3"]]) - names = names[which(!is.na(names))] - - names <- gsub("'", "`", names) - mbdata[["CH$NAME"]] = names - mbdata[["CH$COMPOUND_CLASS"]] = row[["CH.COMPOUND_CLASS"]] - mbdata[["CH$FORMULA"]] = row[["CH.FORMULA"]] - mbdata[["CH$EXACT_MASS"]] = row[["CH.EXACT_MASS"]] - mbdata[["CH$SMILES"]] = row[["CH.SMILES"]] - mbdata[["CH$IUPAC"]] = row[["CH.IUPAC"]] - # Add all links and then eliminate the NA values from the tree. - link = list() - link[["CAS"]] = row[["CH.LINK.CAS"]] - link[["CHEBI"]] = row[["CH.LINK.CHEBI"]] - link[["HMDB"]] = row[["CH.LINK.HMDB"]] - link[["KEGG"]] = row[["CH.LINK.KEGG"]] - link[["LIPIDMAPS"]] = row[["CH.LINK.LIPIDMAPS"]] - link[["PUBCHEM"]] = row[["CH.LINK.PUBCHEM"]] - link[["INCHIKEY"]] = row[["CH.LINK.INCHIKEY"]] - link[["CHEMSPIDER"]] = row[["CH.LINK.CHEMSPIDER"]] - link[which(is.na(link))] <- NULL - mbdata[["CH$LINK"]] <- link - # again, these constants are read from the options: - mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument - mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type - - return(mbdata) - -} - -# For each compound, this function creates the "lower part" of the MassBank record, i.e. -# everything that comes after AC$INSTRUMENT_TYPE. -#' Compose data block of MassBank record -#' -#' \code{gatherCompound} composes the data blocks (the "lower half") of all -#' MassBank records for a compound, using the annotation data in the RMassBank -#' options, spectrum info data from the \code{analyzedSpec}-type record and the -#' peaks from the reanalyzed, multiplicity-filtered peak table. It calls -#' \code{gatherSpectrum} for each child spectrum. -#' -#' The returned data blocks are in format \code{list( "AC\$MASS_SPECTROMETRY" = -#' list('FRAGMENTATION_MODE' = 'CID', ...), ...)} etc. -#' -#' @aliases gatherCompound gatherSpectrum -#' @usage gatherCompound(spec, aggregated, additionalPeaks = NULL, retrieval="standard") -#' -#' gatherSpectrum(spec, msmsdata, ac_ms, ac_lc, aggregated, -#' additionalPeaks = NULL, retrieval="standard") -#' @param spec A \code{RmbSpectraSet} object, representing a compound with multiple spectra. -#' @param aggregated An aggregate peak table where the peaks are extracted from. -#' @param msmsdata A \code{RmbSpectrum2} object from the \code{spec} spectra set, representing a single spectrum to give a record. -#' @param ac_ms,ac_lc Information for the AC\$MASS_SPECTROMETRY and -#' AC\$CHROMATOGRAPHY fields in the MassBank record, created by -#' \code{gatherCompound} and then fed into \code{gatherSpectrum}. -#' @param additionalPeaks If present, a table with additional peaks to add into the spectra. -#' As loaded with \code{\link{addPeaks}}. -#' @param retrieval A value that determines whether the files should be handled either as "standard", -#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" -#' if the only know thing is the m/z -#' @return \code{gatherCompound} returns a list of tree-like MassBank data -#' blocks. \code{gatherSpectrum} returns one single MassBank data block or -#' \code{NA} if no useful peak is in the spectrum. -#' @note Note that the global table \code{additionalPeaks} is also used as an -#' additional source of peaks. -#' @author Michael Stravs -#' @seealso \code{\link{mbWorkflow}}, \code{\link{compileRecord}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples \dontrun{ -#' myspectrum <- w@@spectra[[1]] -#' massbankdata <- gatherCompound(myspectrum, w@@aggregated) -#' # Note: ac_lc and ac_ms are data blocks usually generated in gatherCompound and -#' # passed on from there. The call below gives a relatively useless result :) -#' ac_lc_dummy <- list() -#' ac_ms_dummy <- list() -#' justOneSpectrum <- gatherSpectrum(myspectrum, myspectrum@@child[[2]], -#' ac_ms_dummy, ac_lc_dummy, w@@aggregated) -#' } -#' -#' -#' @export -gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval="standard") -{ - # compound ID - id <- spec@id - # processing mode - imode <- spec@mode - - # define positive or negative, based on processing mode. - ion_modes <- list( - "pH" = "POSITIVE", - "pNa" = "POSITIVE", - "mH" = "NEGATIVE", - "mFA" = "NEGATIVE", - "pM" = "POSITIVE", - "mM" = "NEGATIVE", - "pNH4" = "POSITIVE") - mode <- ion_modes[[imode]] - - # for format 2.01 - ac_ms <- list(); - ac_ms[['MS_TYPE']] <- getOption("RMassBank")$annotations$ms_type - ac_ms[['IONIZATION']] <- getOption("RMassBank")$annotations$ionization - ac_ms[['ION_MODE']] <- mode - - # This list could be made customizable. - ac_lc <- list(); - rt <- spec@parent@rt / 60 - ac_lc[['COLUMN_NAME']] <- getOption("RMassBank")$annotations$lc_column - ac_lc[['FLOW_GRADIENT']] <- getOption("RMassBank")$annotations$lc_gradient - ac_lc[['FLOW_RATE']] <- getOption("RMassBank")$annotations$lc_flow - ac_lc[['RETENTION_TIME']] <- sprintf("%.3f min", rt) - ac_lc[['SOLVENT A']] <- getOption("RMassBank")$annotations$lc_solvent_a - ac_lc[['SOLVENT B']] <- getOption("RMassBank")$annotations$lc_solvent_b - - # Go through all child spectra, and fill our skeleton with scan data! - # Pass them the AC_LC and AC_MS data, which are added at the right place - # directly in there. - allSpectra <- lapply(spec@children, function(m) - gatherSpectrum(spec, m, ac_ms, ac_lc, aggregated, additionalPeaks, retrieval=retrieval)) - allSpectra <- allSpectra[which(!is.na(allSpectra))] - return(allSpectra) -} - - - -# Process one single MSMS child scan. -# spec: an object of "analyzedSpectrum" type (i.e. contains -# 14x (or other number) msmsdata, info, mzrange, -# compound ID, parent MS1, cpd id...) -# msmsdata: the msmsdata sub-object from the spec which is the child scan we want to process. -# Contains childFilt, childBad, scan #, etc. Note that the peaks are actually not -# taken from here! They were taken from msmsdata initially, but after introduction -# of the refiltration and multiplicity filtering, this was changed. Now only the -# scan information is actually taken from msmsdata. -# ac_ms, ac_lc: pre-filled info for the MassBank dataset (see above) -# refiltered: the refilteredRcSpecs dataset which contains our good peaks :) -# Contains peaksOK, peaksReanOK, peaksFiltered, peaksFilteredReanalysis, -# peaksProblematic. Currently we use peaksOK and peaksReanOK to create the files. -# (Also, the global additionalPeaks table is used.) -#' @export -gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalPeaks = NULL, retrieval = "standard") -{ - # If the spectrum is not filled, return right now. All "NA" spectra will - # not be treated further. - if(msmsdata@ok == FALSE) - return(NA) - # get data - scan <- msmsdata@acquisitionNum - id <- spec@id - # Further fill the ac_ms datasets, and add the ms$focused_ion with spectrum-specific data: - precursor_types <- list( - "pH" = "[M+H]+", - "pNa" = "[M+Na]+", - "mH" = "[M-H]-", - "mFA" = "[M+HCOO-]-", - "pM" = "[M]+", - "mM" = "[M]-", - "pNH4" = "[M+NH4]+") - ac_ms[['FRAGMENTATION_MODE']] <- msmsdata@info$mode - #ac_ms['PRECURSOR_TYPE'] <- precursor_types[spec$mode] - ac_ms[['COLLISION_ENERGY']] <- msmsdata@info$ce - ac_ms[['RESOLUTION']] <- msmsdata@info$res - - # Calculate exact precursor mass with Rcdk, and find the base peak from the parent - # spectrum. (Yes, that's what belongs here, I think.) - precursorMz <- findMz(spec@id, spec@mode, retrieval=retrieval) - ms_fi <- list() - ms_fi[['BASE_PEAK']] <- round(mz(spec@parent)[which.max(intensity(spec@parent))],4) - ms_fi[['PRECURSOR_M/Z']] <- round(precursorMz$mzCenter,4) - ms_fi[['PRECURSOR_TYPE']] <- precursor_types[spec@mode] - - # Select all peaks which belong to this spectrum (correct cpdID and scan no.) - # from peaksOK - # Note: Here and below it would be easy to customize the source of the peaks. - # Originally the peaks came from msmsdata$childFilt, and the subset - # was used where dppm == dppmBest (because childFilt still contains multiple formulas) - # per peak. - peaks <- aggregated[aggregated$filterOK,,drop=FALSE] - peaks <- peaks[(peaks$cpdID == id) & (peaks$scan == msmsdata@acquisitionNum),,drop=FALSE] - - # No peaks? Aha, bye - if(nrow(peaks) == 0) - return(NA) - - # If we don't include the reanalyzed peaks: - if(!getOption("RMassBank")$use_rean_peaks) - peaks <- peaks[is.na(peaks$matchedReanalysis),,drop=FALSE] - # but if we include them: - else - { - # for info, the following data will be used in the default annotator: - # annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")] - # and in the peaklist itself: - # c("mzSpec", "int", "intrel") - peaks[!is.na(peaks$matchedReanalysis),"formula"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.formula"] - peaks[!is.na(peaks$matchedReanalysis),"mzCalc"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.mzCalc"] - peaks[!is.na(peaks$matchedReanalysis),"dppm"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.dppm"] - peaks[!is.na(peaks$matchedReanalysis),"dbe"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.dbe"] - peaks[!is.na(peaks$matchedReanalysis),"formulaCount"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.formulaCount"] - } - - # Calculate relative intensity and make a formatted m/z to use in the output - # (mzSpec, for "spectrum") - peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999) - peaks$mzSpec <- round(peaks$mzFound, 4) - # reorder peaks after addition of the reanalyzed ones - peaks <- peaks[order(peaks$mzSpec),] - - # Also format the other values, which are used in the annotation - peaks$dppm <- round(peaks$dppm, 2) - peaks$mzCalc <- round(peaks$mzCalc, 4) - peaks$intensity <- round(peaks$intensity, 1) - # copy the peak table to the annotation table. (The peak table will then be extended - # with peaks from the global "additional_peaks" table, which can be used to add peaks - # to the spectra by hand. - annotation <- peaks - # Keep only peaks with relative intensity >= 1 o/oo, since the MassBank record - # makes no sense otherwise. Also, keep only the columns needed in the output. - peaks <- peaks[ peaks$intrel >= 1, c("mzSpec", "intensity", "intrel")] - - # Here add the additional peaks if there are any for this compound! - # They are added without any annotation. - if(!is.null(additionalPeaks)) - { - # select the peaks from the corresponding spectrum which were marked with "OK=1" in the table. - spec_add_peaks <- additionalPeaks[ - (additionalPeaks$OK == 1) & - (additionalPeaks$cpdID == spec@id) & - (additionalPeaks$scan == msmsdata@acquisitionNum), - c("mzFound", "intensity")] - # If there are peaks to add: - if(nrow(spec_add_peaks)>0) - { - # add the column for rel. int. - spec_add_peaks$intrel <- 0 - # format m/z value - spec_add_peaks$mzSpec <- round(spec_add_peaks$mzFound, 4) - # bind tables together - peaks <- rbind(peaks, spec_add_peaks[,c("mzSpec", "intensity", "intrel")]) - # recalculate rel.int. and reorder list - peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999) - # Again, select the correct columns, and drop values with rel.int. <1 o/oo - # NOTE: If the highest additional peak is > than the previous highest peak, - # this can lead to the situation that a peak is in "annotation" but not in "peaks"! - # See below. - peaks <- peaks[ peaks$intrel >= 1, c("mzSpec", "intensity", "intrel")] - # Reorder again. - peaks <- peaks[order(peaks$mzSpec),] - } - } - - - - # add + or - to fragment formulas - formula_tag <- list( - "pH" = "+", - "pNa" = "+", - "mH" = "-", - "mFA" = "-", - "pM" = "+", - "mM" = "-", - "pNH4" = "+") - type <- formula_tag[[spec@mode]] - - annotator <- getOption("RMassBank")$annotator - if(is.null(annotator)) - annotator <- "annotator.default" - - - - # Here, the relative intensity is recalculated using the newly added additional - # peaks from the peak list. Therefore, we throw superfluous peaks out again. - # NOTE: It is a valid question whether or not we should kick peaks out at this stage. - # The alternative would be to leave the survivors at 1 o/oo, but keep them in the spectrum. - annotation$intrel <- floor(annotation$intensity / max(peaks$intensity) * 999) - annotation <- annotation[annotation$intrel >= 1,] - - annotation <- do.call(annotator, list(annotation= annotation, type=type)) - - - # Name the columns correctly. - colnames(peaks) <- c("m/z", "int.", "rel.int.") - peaknum <- nrow(peaks) - - # Create the "lower part" of the record. - mbdata <- list() - # Add the AC$MS, AC$LC info. - if(getOption("RMassBank")$use_version == 2) - { - mbdata[["AC$MASS_SPECTROMETRY"]] <- ac_ms - mbdata[["AC$CHROMATOGRAPHY"]] <- ac_lc - } - else - { - # Fix for MassBank data format 1, where ION_MODE must be renamed to MODE - mbdata[["AC$ANALYTICAL_CONDITION"]] <- c(ac_ms, ac_lc) - names(mbdata[["AC$ANALYTICAL_CONDITION"]])[[3]] <- "MODE" - } - # Add the MS$FOCUSED_ION info. - mbdata[["MS$FOCUSED_ION"]] <- ms_fi - - ## The SPLASH is a hash value calculated across all peaks - ## http://splash.fiehnlab.ucdavis.edu/ - ## Has to be temporarily added as "PK$SPLASH" in the "lower" part - ## of the record, but will later be moved "up" when merging parts in compileRecord() - - # the data processing tag :) - # Change by Tobias: - # I suggest to add here the current version number of the clone due to better distinction between different makes of MB records - # Could be automatised from DESCRIPTION file? - if(getOption("RMassBank")$use_rean_peaks) - processingComment <- list("REANALYZE" = "Peaks with additional N2/O included") - else - processingComment <- list() - mbdata[["MS$DATA_PROCESSING"]] <- c( - getOption("RMassBank")$annotations$ms_dataprocessing, - processingComment, - list("WHOLE" = paste("RMassBank", packageVersion("RMassBank"))) - ) - - mbdata[["PK$SPLASH"]] <- list(SPLASH = getSplash(peaks[,c("m/z", "int.")])) - - # Annotation: - if(getOption("RMassBank")$add_annotation && (findLevel(id,TRUE)!="unknown")) - mbdata[["PK$ANNOTATION"]] <- annotation - - # Peak table - mbdata[["PK$NUM_PEAK"]] <- peaknum - mbdata[["PK$PEAK"]] <- peaks - # These two entries will be thrown out later, but they are necessary to build the - # record title and the accession number. - mbdata[["RECORD_TITLE_CE"]] <- msmsdata@info$ces #formatted collision energy - # Mode of relative scan calculation: by default it is calculated relative to the - # parent scan. If a corresponding option is set, it will be calculated from the first - # present child scan in the list. - relativeScan <- "fromParent" - if(!is.null(getOption("RMassBank")$recomputeRelativeScan)) - if(getOption("RMassBank")$recomputeRelativeScan == "fromFirstChild") - relativeScan <- "fromFirstChild" - if(relativeScan == "fromParent") - mbdata[["SUBSCAN"]] <- msmsdata@acquisitionNum - spec@parent@acquisitionNum #relative scan - else if(relativeScan == "fromFirstChild"){ - firstChild <- min(unlist(lapply(spec@children,function(d) d@acquisitionNum))) - mbdata[["SUBSCAN"]] <- msmsdata@acquisitionNum - firstChild + 1 - } - return(mbdata) -} - - -# This compiles a MassBank record from the analyzedRcSpecs format (using the peaks from -# refilteredRcSpecs) together with the compound annotation data. -# Correspondingly: -# spec: contains the analyzedRcSpec-format spectrum collection to be compiled -# (i.e. a block of length(spectraList) child spectra) -# mbdata: contains the corresponding MassBank "header" (the upper part of the record) -# until INSTRUMENT TYPE. -# refiltered: the refilteredRcSpecs which contain our nice peaks. -#' Compile MassBank records -#' -#' Takes a spectra block for a compound, as returned from -#' \code{\link{analyzeMsMs}}, and an aggregated cleaned peak table, together -#' with a MassBank information block, as stored in the infolists and loaded via -#' \code{\link{loadInfolist}}/\code{\link{readMbdata}} and processes them to a -#' MassBank record -#' -#' \code{compileRecord} calls \code{\link{gatherCompound}} to create blocks of -#' spectrum data, and finally fills in the record title and accession number, -#' renames the "internal ID" comment field and removes dummy fields. -#' -#' @usage compileRecord(spec, mbdata, aggregated, additionalPeaks = NULL, retrieval="standard") -#' @param spec A \code{RmbSpectraSet} for a compound, after analysis (\code{\link{analyzeMsMs}}). -#' Note that \bold{peaks are not read from this -#' object anymore}: Peaks come from the \code{aggregated} dataframe (and from -#' the global \code{additionalPeaks} dataframe; cf. \code{\link{addPeaks}} for -#' usage information.) -#' @param mbdata The information data block for the record header, as stored in -#' \code{mbdata_relisted} after loading an infolist. -#' @param aggregated An aggregated peak data table containing information about refiltered spectra etc. -#' @param additionalPeaks If present, a table with additional peaks to add into the spectra. -#' As loaded with \code{\link{addPeaks}}. -#' @param retrieval A value that determines whether the files should be handled either as "standard", -#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" -#' if the only know thing is the m/z -#' @return Returns a MassBank record in list format: e.g. -#' \code{list("ACCESSION" = "XX123456", "RECORD_TITLE" = "Cubane", ..., -#' "CH\$LINK" = list( "CAS" = "12-345-6", "CHEMSPIDER" = 1111, ...))} -#' @author Michael Stravs -#' @seealso \code{\link{mbWorkflow}}, \code{\link{addPeaks}}, -#' \code{\link{gatherCompound}}, \code{\link{toMassbank}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' -#' # -#' \dontrun{myspec <- w@@spectra[[2]]} -#' # after having loaded an infolist: -#' \dontrun{mbdata <- mbdata_relisted[[which(mbdata_archive\$id == as.numeric(myspec\$id))]]} -#' \dontrun{compiled <- compileRecord(myspec, mbdata, w@@aggregated)} -#' -#' @export -compileRecord <- function(spec, mbdata, aggregated, additionalPeaks = NULL, retrieval="standard") -{ - # gather the individual spectra data - mblist <- gatherCompound(spec, aggregated, additionalPeaks, retrieval=retrieval) - # this returns a n-member list of "lower parts" of spectra (one for each subscan). - # (n being the number of child scans per parent scan.) - # Now we put the two parts together. - # (lapply on all n subscans, returns a list.) - mblist_c <- lapply(mblist, function(l) - { - # This is the step which sticks together the upper and the lower part of the - # record (the upper being compound-specific and the lower being scan-specific.) - # Note that the accession number and record title (in the upper part) must of course - # be filled in with scan-specific info. - mbrecord <- c(mbdata, l) - - # Here is the right place to fix the name of the INTERNAL ID field. - names(mbrecord[["COMMENT"]])[[which(names(mbrecord[["COMMENT"]]) == "ID")]] <- - getOption("RMassBank")$annotations$internal_id_fieldname - # get mode parameter (for accession number generation) depending on version - # of record definition - # Change by Tobias: - # I suggest to include fragmentation mode here for information - if(getOption("RMassBank")$use_version == 2) - mode <- mbrecord[["AC$MASS_SPECTROMETRY"]][["ION_MODE"]] - else - mode <- mbrecord[["AC$ANALYTICAL_CONDITION"]][["MODE"]] - # Generate the title and then delete the temprary RECORD_TITLE_CE field used before - mbrecord[["RECORD_TITLE"]] <- .parseTitleString(mbrecord) - mbrecord[["RECORD_TITLE_CE"]] <- NULL - # Calculate the accession number from the options. - shift <- getOption("RMassBank")$accessionNumberShifts[[spec@mode]] - mbrecord[["ACCESSION"]] <- sprintf("%s%04d%02d", getOption("RMassBank")$annotations$entry_prefix, as.numeric(spec@id), as.numeric(mbrecord[["SUBSCAN"]])+shift) - # Clear the "SUBSCAN" field. - mbrecord[["SUBSCAN"]] <- NULL - # return the record. - return(mbrecord) - }) -} - - - -#' Generate peak annotation from peaklist -#' -#' Generates the PK$ANNOTATION entry from the peaklist obtained. This function is -#' overridable by using the "annotator" option in the settings file. -#' -#' @param annotation A peak list to be annotated. Contains columns: -#' \code{"cpdID","formula","mzFound" ,"scan","mzCalc","dppm", -#' "dbe","mz","int","formulaCount","parentScan","fM_factor","dppmBest", -#' "formulaMultiplicity","intrel","mzSpec"} -#' -#' @param type The ion type to be added to annotated formulas ("+" or "-" usually) -#' -#' @return The annotated peak table. Table \code{colnames()} will be used for the -#' titles (preferrably don't use spaces in the column titles; however no format is -#' strictly enforced by the MassBank data format. -#' -#' @examples -#' \dontrun{ -#' annotation <- annotator.default(annotation) -#' } -#' @author Michele Stravs, Eawag -#' @export -annotator.default <- function(annotation, type) -{ - - annotation$formula <- paste(annotation$formula, type, sep='') - # Select the right columns and name them correctly for output. - annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")] - colnames(annotation) <- c("m/z", "tentative_formula", "formula_count", "mass", "error(ppm)") - return(annotation) -} - -#' Parse record title -#' -#' Parses a title for a single MassBank record using the title format -#' specified in the option titleFormat. Internally used, not exported. -#' -#' If the option is not set, a standard title format is used (for record definition -#' version 1 or 2). -#' -#' @usage .parseTitleString(mbrecord) -#' @param mbrecord A MassBank record in list format, as returned from -#' \code{\link{gatherSpectrum}}. -#' @return A string with the title. -#' @author Michael Stravs, Eawag -#' @seealso \code{\link{compileRecord}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' \dontrun{ -#' # used in compileRecord() -#' title <- .parseTitleString(mbrecord) -#' } -#' -#' -#' -.parseTitleString <- function(mbrecord) -{ - - varlist <- getOption("RMassBank")$titleFormat - - # Set the standard title format. - if(is.null(varlist)) - { - if(getOption("RMassBank")$use_version == 2) - { - varlist <- c( - "{CH$NAME}", - "{AC$INSTRUMENT_TYPE}", - "{AC$MASS_SPECTROMETRY: MS_TYPE}", - "CE: {RECORD_TITLE_CE}", - "R={AC$MASS_SPECTROMETRY: RESOLUTION}", - "{MS$FOCUSED_ION: PRECURSOR_TYPE}" - ) - } - else - { - varlist <- c( - "{CH$NAME}", - "{AC$INSTRUMENT_TYPE}", - "{AC$ANALYTICAL_CONDITION: MS_TYPE}", - "CE: {RECORD_TITLE_CE}", - "R={AC$ANALYTICAL_CONDITION: RESOLUTION}", - "{MS$FOCUSED_ION: PRECURSOR_TYPE}" - ) - } - } - - - # Extract a {XXX} argument from each title section. - # check that every title has one and only one match - args <- regexec("\\{(.*)\\}", varlist) - arglist <- regmatches(varlist, args) - if(any(unlist(lapply(arglist, length)) != 2)) - stop("Title format is incorrectly specified: a section with not exactly 1 parameters") - - parsedVars <- lapply(varlist, function(var) - { - # Extract the specified parameter inside the {}. - # I.e. from a string like "R={BLA: BLUB}" return "BLA: BLUB" - args <- regexec("\\{(.*)\\}", var) - arg <- regmatches(var, args)[[1]][[2]] - # Split the parameter by colon if necessary - splitVar <- strsplit(arg, ": ")[[1]] - # Read the parameter value from the record - if(length(splitVar) == 2) - replaceVar <- mbrecord[[splitVar[[1]]]][[splitVar[[2]]]] - else if(length(splitVar) == 1) - replaceVar <- mbrecord[[splitVar]] - else - stop(paste( - "Title format is incorrectly specified:", var) - ) - # Fix problems: NULL returns - if(is.null(replaceVar)) - replaceVar <- "" - # Fix problems: Names will have >= 1 match. Take the first - if(length(replaceVar) > 1) - replaceVar <- replaceVar[[1]] - - # Fix problems: Unknowns might have no name - if(!length(replaceVar)){ - replaceVar <- "" - } - - # Substitute the parameter value into the string - parsedVar <- sub("\\{(.*)\\}", replaceVar, var) - return(parsedVar) - }) - title <- paste(parsedVars, collapse="; ") - return(title) -} - - -# This converts the tree-like list (as obtained e.g. from compileRecord()) -# into a plain text array, which can then be dumped to a file suitable for -# MassBank upload. -#' Write MassBank record into character array -#' -#' Writes a MassBank record in list format to a text array. -#' -#' The function is a general conversion tool for the MassBank format; i.e. the -#' field names are not fixed. \code{mbdata} must be a named list, and the -#' entries can be as follows: \itemize{ -#' \item A single text line: -#' -#' \code{'CH\$EXACT_MASS' = '329.1023'} -#' -#' is written as -#' -#' \code{CH\$EXACT_MASS: 329.1023} -#' \item A character array: -#' -#' \code{'CH\$NAME' = c('2-Aminobenzimidazole', '1H-Benzimidazol-2-amine')} -#' -#' is written as -#' -#' \code{CH\$NAME: 2-Aminobenzimidazole} -#' -#' \code{CH\$NAME: 1H-Benzimidazol-2-amine} -#' -#' \item A named list of strings: -#' -#' \code{'CH\$LINK' = list('CHEBI' = "27822", "KEGG" = "C10901")} -#' -#' is written as -#' -#' \code{CH\$LINK: CHEBI 27822} -#' -#' \code{CH\$LINK: KEGG C10901} -#' -#' \item A data frame (e.g. the peak table) is written as specified in -#' the MassBank record format (Section 2.6.3): the column names are used as -#' headers for the first line, all data rows are printed space-separated. -#' } -#' -#' @usage toMassbank(mbdata) -#' @param mbdata A MassBank record in list format. -#' @return The result is a text array, which is ready to be written to the disk -#' as a file. -#' @note The function iterates over the list item names. \bold{This means that -#' duplicate entries in \code{mbdata} are (partially) discarded!} The correct -#' way to add them is by making a character array (as specified above): Instead -#' of \code{'CH\$NAME' = 'bla', 'CH\$NAME' = 'blub'} specify \code{'CH\$NAME' = -#' c('bla','blub')}. -#' @author Michael Stravs -#' @seealso \code{\link{compileRecord}}, \code{\link{mbWorkflow}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' \dontrun{ -#' # Read just the compound info skeleton from the Internet for some compound ID -#' id <- 35 -#' mbdata <- gatherData(id) -#' #' # Export the mbdata blocks to line arrays -#' # (there is no spectrum information, just the compound info...) -#' mbtext <- toMassbank(mbdata) -#' } -#' -#' @export -toMassbank <- function (mbdata) -{ - # mbf is an array of lines and count is the line counter. - # Very old-school, but it works. :) - mbf <- character(0) - count <- 1 - lapply(names(mbdata), function(entry) - { - # If entry is a char line, add it to the file. - # If it is a named sublist, add each subentry with name - # If it is an unnamed sublist, add each subentry without name - # if it is a dataframe, write in PEAKS mode - - # Note: this is were I liked "lapply" a little too much. "for" would - # be more idiomatic, and wouldn't need the <<- assignments. - - # Data frame: table mode. A header line and one space-separated line for - # each data frame row. - if(is.data.frame(mbdata[[entry]])) - { - mbf[[count]] <<- paste(entry,": " , - paste(colnames(mbdata[[entry]]), collapse=" "), - sep='') - count <<- count+1 - for(row in 1:nrow(mbdata[[entry]])) - { - mbf[[count]] <<- paste(" ", - paste(mbdata[[entry]][row,],collapse=" "), - sep="") - count <<- count+1 - } - #browser() - } - # List with named items: Write every entry like CH$LINK: CAS 12-345-678 - else if(is.list(mbdata[[entry]]) & !is.null(names(mbdata[[entry]]))) - { - - lapply(names(mbdata[[entry]]), function(subentry) - { - if(subentry != "SPLASH"){ - mbf[[count]] <<- paste(entry,": ",subentry, " ", mbdata[[entry]][[subentry]], sep='') - } else { - mbf[[count]] <<- paste(entry,": ", mbdata[[entry]][[subentry]], sep='') - } - #print(mbf) - count <<- count + 1 - }) - } - # Array (or list) of unnamed items: Write every entry like CH$NAME: Paracetamol - # (iterative entry without subindices) - else if (length(mbdata[[entry]]) > 1 & is.null(names(mbdata[[entry]]))) - { - lapply(mbdata[[entry]], function(subentry) - { - mbf[[count]] <<- paste(entry,": ",subentry, sep='') - #print(mbf) - count <<- count + 1 - }) - } - # Length is 1: just write the entry like PK$NUM_PEAKS: 131 - else - { - mbf[[count]] <<- paste(entry,": ",mbdata[[entry]], sep='') - count <<- count + 1 - } - } - ) # End of lapply block (per child spectrum) - # Add mandatory EOF marker - mbf[[count]] <- "//" - return(mbf) -} - -# Exports compiled and massbanked spectra, with their associated molfiles, to physical files. -# "compiled" is still used here, because we need an accessible accession number. -# In the plain text arrays, the accession number is already "hidden". -# compiled: is ONE "compiled" entry, i.e. ONE compound with e.g. 14 spectra. -# files: is a return value from lapply(toMassbank), i.e. contains 14 plain-text arrays -# (for a 14-spectra method) -# molfile: a molfile from createMolfile -#' Export internally stored MassBank data to files -#' -#' Exports MassBank recfile data arrays and corresponding molfiles to physical -#' files on hard disk, for one compound. -#' -#' The data from \code{compiled} is still used here, because it contains the -#' "visible" accession number. In the plain-text format contained in -#' \code{files}, the accession number is not "accessible" anymore since it's in -#' the file. -#' -#' @usage exportMassbank(compiled, files, molfile) -#' @param compiled Is ONE "compiled" entry, i.e. ONE compound with e.g. 14 -#' spectra, as returned from \code{\link{compileRecord}}. -#' @param files A n-membered array (usually a return value from -#' \code{lapply(\link{toMassbank})}), i.e. contains n plain-text arrays with -#' MassBank records. -#' @param molfile A molfile from \code{\link{createMolfile}} -#' @return No return value. -#' @note An improvement would be to write the accession numbers into -#' \code{names(compiled)} and later into \code{names(files)} so \code{compiled} -#' wouldn't be needed here anymore. (The compound ID would have to go into -#' \code{names(molfile)}, since it is also retrieved from \code{compiled}.) -#' @author Michael Stravs -#' @seealso \code{\link{createMolfile}}, \code{\link{compileRecord}}, -#' \code{\link{toMassbank}}, \code{\link{mbWorkflow}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' \dontrun{ -#' compiled <- compileRecord(record, mbdata, refilteredRcSpecs) -#' mbfiles <- toMassbank(compiled) -#' molfile <- createMolfile(compiled[[1]][["CH$SMILES"]]) -#' exportMassbank(compiled, mbfiles, molfile) -#' } -#' -#' @export -exportMassbank <- function(compiled, files, molfile) -{ - molnames <- c() - for(file in 1:length(compiled)) - { - # Read the accession no. from the corresponding "compiled" entry - filename <- compiled[[file]]["ACCESSION"] - # use this accession no. as filename - filename <- paste(filename, ".txt", sep="") - write(files[[file]], - file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata",filename) - ) - } - # Use internal ID for naming the molfiles - if(findLevel(compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]],TRUE)=="standard"){ - molname <- sprintf("%04d", as.numeric( - compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) - molname <- paste(molname, ".mol", sep="") - write(molfile, - file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata",molname) - ) - } -} - -# Makes a list.tsv with molfile -> massbank ch$name attribution. - -#' Write list.tsv file -#' -#' Makes a list.tsv file in the "moldata" folder. -#' -#' Generates the list.tsv file which is needed by MassBank to connect records with -#' their respective molfiles. The first compound name is linked to a mol-file with -#' the compound ID (e.g. 2334.mol for ID 2334). -#' -#' @param compiled A list of compiled spectra (in tree-format, as returned by \code{compileRecord}). -#' @return No return value. -#' @author Michael A. Stravs, Eawag -#' @examples \dontrun{ -#' compiled <- compileRecord(record, mbdata, refilteredRcSpecs) -#' # a list.tsv for only one record: -#' clist <- list(compiled) -#' makeMollist(clist) -#' } -#' @export -makeMollist <- function(compiled) -{ - # For every "compiled" entry (here, compiled is not one "compiled" entry but the total - # list of all compiled spectra), extract the uppermost CH$NAME and the ID (from the - # first spectrum.) Make the ID into 0000 format. - - tsvlist <- t(sapply(compiled, function(entry) - { - name <- entry[[1]][["CH$NAME"]][[1]] - id <- sprintf("%04d", as.numeric(entry[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) - molfilename <- paste(id,".mol",sep='') - return(c(name,molfilename)) - })) - - IDs <- sapply(compiled, function(entry) return( sprintf("%04d", as.numeric(entry[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])))) - level <- sapply(IDs, findLevel, compact=TRUE) - validentries <- which(level == "standard") - # Write the file with the - write.table(tsvlist[validentries,], - paste(getOption("RMassBank")$annotations$entry_prefix,"/moldata/list.tsv", sep=''), - quote = FALSE, - sep="\t", - row.names=FALSE, - col.names=FALSE - ) -} - - -# Load a dataframe or file into additional_peaks (or add additional points in there.) -# The columns cpdID, scan, mzFound, int, OK are mandatory. OK=1 means that the peaks -# will be added into the spectrum. mzFound and int will be taken for the table. -# No annotation will be written. -# Add peaks to the spectra by hand - -#' Add additional peaks to spectra -#' -#' Loads a table with additional peaks to add to the MassBank spectra. Required -#' columns are \code{cpdID, scan, int, mzFound, OK}. -#' -#' All peaks with OK=1 will be included in the spectra. -#' -#' @usage addPeaks(mb, filename_or_dataframe) -#' @param mb The \code{mbWorkspace} to load the peaks into. -#' @param filename_or_dataframe Filename of the csv file, or name of the R -#' dataframe containing the peaklist. -#' @return The \code{mbWorkspace} with loaded additional peaks. -#' @author Michael Stravs -#' @seealso \code{\link{mbWorkflow}} -#' @examples -#' -#' \dontrun{addPeaks("myrun_additionalPeaks.csv")} -#' -#' @export -addPeaks <- function(mb, filename_or_dataframe) -{ - - errorvar <- 0 - currEnvir <- environment() - d <- 1 - - if(is.data.frame(filename_or_dataframe)) - df <- filename_or_dataframe - else - tryCatch( - df <- read.csv(filename_or_dataframe), - error=function(e){ - currEnvir$errorvar <- 1 - }) - # I change your heuristic fix to another heuristic fix, because I will have to test for a column name change... - - if(!errorvar){ - - if(ncol(df) < 2) - df <- read.csv(filename_or_dataframe, sep=";") - # here: the column int was renamed to intensity, and we need to be able to read old files. sorry. - if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df))) - df$intensity <- df$int - - cols <- c("cpdID", "scan", "mzFound", "intensity", "OK") - n <- colnames(df) - # Check if comma-separated or semicolon-separated - d <- setdiff(cols, n) - if(length(d)>0){ - stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, intensity, OK.") - } - } - - culled_df <- df[,c("cpdID", "scan", "mzFound", "intensity", "OK")] - - - if(nrow(mb@additionalPeaks) == 0) - mb@additionalPeaks <- culled_df - else - mb@additionalPeaks <- rbind(mb@additionalPeaks, culled_df) - return(mb) -} +# Script for writing MassBank files + +#testtest change +#' Load MassBank compound information lists +#' +#' Loads MassBank compound information lists (i.e. the lists which were created +#' in the first two steps of the MassBank \code{\link{mbWorkflow}} and +#' subsequently edited by hand.). +#' +#' \code{resetInfolists} clears the information lists, i.e. it creates a new +#' empty list in \code{mbdata_archive}. \code{loadInfolist} loads a single CSV +#' file, whereas \code{loadInfolists} loads a whole directory. +#' +#' @aliases loadInfolists loadInfolist resetInfolists +#' @usage loadInfolists(mb, path) +#' +#' loadInfolist(mb, fileName) +#' +#' resetInfolists(mb) +#' @param path Directory in which the namelists reside. All CSV files in this +#' directory will be loaded. +#' @param fileName A single namelist to be loaded. +#' @param mb The \code{mbWorkspace} to load/reset the lists in. +#' @return The new workspace with loaded/reset lists. +#' @author Michael Stravs +#' @examples +#' +#' # +#' \dontrun{mb <- resetInfolists(mb) +#' mb <- loadInfolist(mb, "my_csv_infolist.csv")} +#' +#' @export +loadInfolists <- function(mb, path) +{ + archivefiles <- list.files(path, ".csv", full.names=TRUE) + for(afile in archivefiles) + mb <- loadInfolist(mb, afile) + return(mb) +} + +# Load an "infolist". This loads a CSV file which should contain the entries +# edited and controlled by hand. All compound infos from fileName are added into the +# global mbdata_archive. Entries with a cpdID which was already present, are substituted +# by new entries from the fileName file. +#' @export +loadInfolist <- function(mb, fileName) +{ + # Prime a new infolist if it doesn't exist + if(ncol(mb@mbdata_archive) == 0) + mb <- resetInfolists(mb) + mbdata_new <- read.csv(fileName, sep=",", stringsAsFactors=FALSE) + # Legacy check for loading the Uchem format files. + # Even if dbname_* are not used downstream of here, it's still good to keep them + # for debugging reasons. + n <- colnames(mbdata_new) + cols <- c("id","dbcas","dataused") + + # Check if comma-separated or semicolon-separated + d <- setdiff(cols, n) + if(length(d)>0){ + mbdata_new <- read.csv2(fileName, stringsAsFactors=FALSE) + n <- colnames(mbdata_new) + d2 <- setdiff(cols, n) + if(length(d2) > 0){ + stop("Some columns are missing in the infolist.") + } + } + if("dbname_d" %in% colnames(mbdata_new)) + { + colnames(mbdata_new)[[which(colnames(mbdata_new)=="dbname_d")]] <- "dbname" + # dbname_e will be dropped because of the select= in the subset below. + } + if("COMMENT.EAWAG_UCHEM_ID" %in% colnames(mbdata_new)) + colnames(mbdata_new)[[which(colnames(mbdata_new)== "COMMENT.EAWAG_UCHEM_ID")]] <- + "COMMENT.ID" + + # Clear from padding spaces and NAs + mbdata_new <- as.data.frame(t(apply(mbdata_new, 1, function(r) + { + # Substitute empty spaces by real NA values + r[which(r == "")] <- NA + # Trim spaces (in all non-NA fields) + r[which(!is.na(r))] <- sub("^ *([^ ]+) *$", "\\1", r[which(!is.na(r))]) + return(r) + }))) + # use only the columns present in mbdata_archive, no other columns added in excel + mbdata_new <- mbdata_new[, colnames(mb@mbdata_archive)] + # substitute the old entires with the ones from our files + # then find the new (previously inexistent) entries, and rbind them to the table + new_entries <- setdiff(mbdata_new$id, mb@mbdata_archive$id) + old_entries <- intersect(mbdata_new$id, mb@mbdata_archive$id) + for(entry in old_entries) + mb@mbdata_archive[mb@mbdata_archive$id == entry,] <- mbdata_new[mbdata_new$id == entry,] + mb@mbdata_archive <- rbind(mb@mbdata_archive, + mbdata_new[mbdata_new$id==new_entries,]) + return(mb) + +} + + +# Resets the mbdata_archive to an empty version. +#' @export +resetInfolists <- function(mb) +{ + mb@mbdata_archive <- + structure(list(X = integer(0), id = integer(0), dbcas = character(0), + dbname = character(0), dataused = character(0), COMMENT.CONFIDENCE = character(0), + COMMENT.ID = integer(0), CH.NAME1 = character(0), + CH.NAME2 = character(0), CH.NAME3 = character(0), CH.COMPOUND_CLASS = character(0), + CH.FORMULA = character(0), CH.EXACT_MASS = numeric(0), CH.SMILES = character(0), + CH.IUPAC = character(0), CH.LINK.CAS = character(0), CH.LINK.CHEBI = integer(0), + CH.LINK.HMDB = character(0), CH.LINK.KEGG = character(0), CH.LINK.LIPIDMAPS = character(0), + CH.LINK.PUBCHEM = character(0), CH.LINK.INCHIKEY = character(0), + CH.LINK.CHEMSPIDER = integer(0)), .Names = c("X", "id", "dbcas", + "dbname", "dataused", "COMMENT.CONFIDENCE", "COMMENT.ID", + "CH.NAME1", "CH.NAME2", "CH.NAME3", "CH.COMPOUND_CLASS", "CH.FORMULA", + "CH.EXACT_MASS", "CH.SMILES", "CH.IUPAC", "CH.LINK.CAS", "CH.LINK.CHEBI", + "CH.LINK.HMDB", "CH.LINK.KEGG", "CH.LINK.LIPIDMAPS", "CH.LINK.PUBCHEM", + "CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER"), row.names = integer(0), class = "data.frame") + return(mb) + +} + +# The workflow function, i.e. (almost) the only thing you actually need to call. +# See below for explanation of steps. +#' MassBank record creation workflow +#' +#' Uses data generated by \code{\link{msmsWorkflow}} to create MassBank records. +#' +#' See the vignette \code{vignette("RMassBank")} for detailed informations about the usage. +#' +#' Steps: +#' +#' Step 1: Find which compounds don't have annotation information yet. For these +#' compounds, pull information from several databases (using gatherData). +#' +#' Step 2: If new compounds were found, then export the infolist.csv and stop the workflow. +#' Otherwise, continue. +#' +#' Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. +#' +#' Step 4: Compile the spectra. Using the skeletons from the archive data, create +#' MassBank records per compound and fill them with peak data for each spectrum. +#' Also, assign accession numbers based on scan mode and relative scan no. +#' +#' Step 5: Convert the internal tree-like representation of the MassBank data into +#' flat-text string arrays (basically, into text-file style, but still in memory) +#' +#' Step 6: For all OK records, generate a corresponding molfile with the structure +#' of the compound, based on the SMILES entry from the MassBank record. (This molfile +#' is still in memory only, not yet a physical file) +#' +#' Step 7: If necessary, generate the appropriate subdirectories, and actually write +#' the files to disk. +#' +#' Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank +#' to attribute substances to their corresponding structure molfiles. +#' +#' @param steps Which steps in the workflow to perform. +#' @param infolist_path A path where to store newly downloaded compound informations, +#' which should then be manually inspected. +#' @param mb The \code{mbWorkspace} to work in. +#' @param gatherData A variable denoting whether to retrieve information using several online databases \code{gatherData= "online"} +#' or to use the local babel installation \code{gatherData= "babel"}. Note that babel is used either way, if a directory is given +#' in the settings. This setting will be ignored if retrieval is set to "standard" +#' @return The processed \code{mbWorkspace}. +#' @seealso \code{\link{mbWorkspace-class}} +#' @author Michael A. Stravs, Eawag +#' @examples \dontrun{ +#' mb <- newMbWorkspace(w) # w being a msmsWorkspace +#' mb <- loadInfolists(mb, "D:/myInfolistPath") +#' mb <- mbWorkflow(mb, steps=c(1:3), "newinfos.csv") +#' +#' } +#' @export +mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.csv", gatherData = "online") +{ + # Step 1: Find which compounds don't have annotation information yet. For these + # compounds, pull information from CTS (using gatherData). + if(1 %in% steps) + { + mbdata_ids <- lapply(selectSpectra(mb@spectra, "found", "object"), function(spec) spec@id) + message("mbWorkflow: Step 1. Gather info from several databases") + # Which IDs are not in mbdata_archive yet? + new_ids <- setdiff(as.numeric(unlist(mbdata_ids)), mb@mbdata_archive$id) + mb@mbdata <- lapply(new_ids, function(id) + { + if(findLevel(id,TRUE) == "standard"){ + if(gatherData == "online"){ + + d <- gatherData(id) + } + if(gatherData == "babel"){ + # message("mbWorkflow: Step 1. Gather info using babel") + d <- gatherDataBabel(id) + } + } else{ + # message("mbWorkflow: Step 1. Gather no info - Unknown structure") + d <- gatherDataUnknown(id, mb@spectra[[1]]@mode, retrieval=findLevel(id,TRUE)) + } + message(paste(id, ": ", d$dataused, sep='')) + return(d) + }) + } + # Step 2: If new compounds were found, then export the infolist.csv and stop the workflow. + # Otherwise, continue! + if(2 %in% steps) + { + message("mbWorkflow: Step 2. Export infolist (if required)") + if(length(mb@mbdata)>0) + { + mbdata_mat <- flatten(mb@mbdata) + write.csv(as.data.frame(mbdata_mat),infolist_path, na="") + message(paste("The file", infolist_path, "was generated with new compound information. Please check and edit the table, and add it to your infolist folder.")) + return(mb) + } + else + message("No new data added.") + } + # Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. + if(3 %in% steps) + { + message("mbWorkflow: Step 3. Data reformatting") + mb@mbdata_relisted <- apply(mb@mbdata_archive, 1, readMbdata) + } + # Step 4: Compile the spectra! Using the skeletons from the archive data, create + # MassBank records per compound and fill them with peak data for each spectrum. + # Also, assign accession numbers based on scan mode and relative scan no. + if(4 %in% steps) + { + message("mbWorkflow: Step 4. Spectra compilation") + mb@compiled <- lapply( + selectSpectra(mb@spectra, "found", "object"), + function(r) { + message(paste("Compiling: ", r@name, sep="")) + mbdata <- mb@mbdata_relisted[[which(mb@mbdata_archive$id == as.numeric(r@id))]] + if(nrow(mb@additionalPeaks) > 0) + res <-compileRecord(r, mbdata, mb@aggregated, mb@additionalPeaks) + else + res <-compileRecord(spec = r, mbdata = mbdata, aggregated = mb@aggregated, additionalPeaks = NULL, retrieval=findLevel(r@id,TRUE)) + return(res) + }) + # check which compounds have useful spectra + mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) + mb@problems <- which(is.na(mb@compiled)) + mb@compiled_ok <- mb@compiled[mb@ok] + } + # Step 5: Convert the internal tree-like representation of the MassBank data into + # flat-text string arrays (basically, into text-file style, but still in memory) + if(5 %in% steps) + { + message("mbWorkflow: Step 5. Flattening records") + mb@mbfiles <- lapply(mb@compiled_ok, function(c) lapply(c, toMassbank)) + } + # Step 6: For all OK records, generate a corresponding molfile with the structure + # of the compound, based on the SMILES entry from the MassBank record. (This molfile + # is still in memory only, not yet a physical file) + if(6 %in% steps) + { + message("mbWorkflow: Step 6. Generate molfiles") + mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(as.numeric(c[[1]][['COMMENT']][[getOption("RMassBank")$annotations$internal_id_fieldname]]))) + } + # Step 7: If necessary, generate the appropriate subdirectories, and actually write + # the files to disk. + if(7 %in% steps) + { + message("mbWorkflow: Step 7. Generate subdirs and export") + dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "moldata", sep='/'),recursive=TRUE) + dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "recdata", sep='/'),recursive=TRUE) + for(cnt in 1:length(mb@compiled_ok)) + exportMassbank(mb@compiled_ok[[cnt]], mb@mbfiles[[cnt]], mb@molfile[[cnt]]) + } + # Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank + # to attribute substances to their corresponding structure molfiles. + if(8 %in% steps) + { + message("mbWorkflow: Step 8. Create list.tsv") + makeMollist(mb@compiled_ok) + } + return(mb) +} + + +# Calls openbabel and converts the SMILES code string (or retrieves the SMILES code from +# the ID, and then calls openbabel) to create a molfile in text format. +# If fileName is given, the file is directly stored. Otherwise, it is returned as a +# character array. +#' Create MOL file for a chemical structure +#' +#' Creates a MOL file (in memory or on disk) for a compound specified by the +#' compound ID or by a SMILES code. +#' +#' The function invokes OpenBabel (and therefore needs a correctly set +#' OpenBabel path in the RMassBank settings), using the SMILES code retrieved +#' with \code{findSmiles} or using the SMILES code directly. The current +#' implementation of the workflow uses the latter version, reading the SMILES +#' code directly from the MassBank record itself. +#' +#' @usage createMolfile(id_or_smiles, fileName = FALSE) +#' @param id_or_smiles The compound ID or a SMILES code. +#' @param fileName If the filename is set, the file is written directly to disk +#' using the specified filename. Otherwise, it is returned as a text array. +#' @return A character array containing the MOL/SDF format file, ready to be +#' written to disk. +#' @author Michael Stravs +#' @seealso \code{\link{findSmiles}} +#' @references OpenBabel: \url{http://openbabel.org} +#' @examples +#' +#' # Benzene: +#' \dontrun{ +#' createMolfile("C1=CC=CC=C1") +#' } +#' +#' @export +createMolfile <- function(id_or_smiles, fileName = FALSE) +{ + .checkMbSettings() + babeldir <- getOption("RMassBank")$babeldir + + if(!is.numeric(id_or_smiles)){ + smiles <- id_or_smiles + } else{ + if(findLevel(id_or_smiles,TRUE) != "standard"){ + return(c(" ","$$$$")) + } + smiles <- findSmiles(id_or_smiles) + } + # if no babeldir was set, get the result from cactus. + if(is.na(babeldir)) + { + res <- getCactus(smiles, "sdf") + + if(any(is.na(res))){ + res <- getPcSDF(smiles) + } + if(any(is.na(res))){ + stop("Pubchem and Cactus both seem to be down.") + } + if(is.character(fileName)) + writeLines(res, fileName) + } + # otherwise use the better-tested OpenBabel toolkit. + else + { + if(!is.character(fileName)) + cmd <- paste(babeldir, "babel -ismi -osdf -d -b --gen2D", sep='') + else + cmd <- paste(babeldir, "babel -ismi -osdf ", fileName , " -d -b --gen2D", sep='') + res <- system(cmd, intern=TRUE, input=smiles, ignore.stderr=TRUE) + # If we wrote to a file, read it back as return value. + if(is.character(fileName)) + res <- readLines(fileName) + } + return(c(" ","$$$$")) + return(res) +} + + + +# Retrieve annotation data for a compound, from the internet service Pubchem +#' Retrieve supplemental annotation data from Pubchem +#' +#' Retrieves annotation data for a compound from the internet service Pubchem +#' based on the inchikey generated by babel or Cactus +#' +#' The data retrieved is the Pubchem CID, a synonym from the Pubchem database, +#' the IUPAC name (using the preferred if available) and a Chebi link +#' +#' @usage gatherPubChem(key) +#' @param key An Inchi-Key +#' @return Returns a list with 4 slots: +#' \code{PcID} The Pubchem CID +#' \code{Synonym} An arbitrary synonym for the compound name +#' \code{IUPAC} A IUPAC-name (preferred if available) +#' \code{Chebi} The identification number of the chebi database +#' @author Erik Mueller +#' @seealso \code{\link{mbWorkflow}} +#' @references Pubchem REST: +#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} +#' Chebi: +#' \url{http://www.ebi.ac.uk/chebi} +#' @examples +#' +#' # Gather data for compound ID 131 +#' \dontrun{gatherPubChem("QEIXBXXKTUNWDK-UHFFFAOYSA-N")} +#' +#' @export +gatherPubChem <- function(key){ + + PubChemData <- list() + + ##Trycatches are there because pubchem has connection issues 1 in 50 times. + ##Write NA into the respective fields if something goes wrong with the conenction or the data. + + ##Retrieve Pubchem CID + tryCatch( + PubChemData$PcID <- getPcId(key), + error=function(e){ + PubChemData$PcID <<- NA + }) + + ##Retrieve a synonym to the name + tryCatch( + PubChemData$Synonym <- getPcSynonym(key), + error=function(e){ + PubChemData$Synonym <<- NA + }) + + ##Retrieve the IUPAC-name + tryCatch( + PubChemData$IUPAC <- getPcIUPAC(key), + error=function(e){ + PubChemData$IUPAC <<- NA + }) + + ##Retrieve the Chebi-ID + tryCatch( + PubChemData$Chebi <- getPcCHEBI(key), + error=function(e){ + PubChemData$Chebi <<- NA + }) + + return(PubChemData) +} + +# Retrieve annotation data for a compound, from the internet services Cactvs, Pubchem, Chemspider and CTS. +#' Retrieve annotation data +#' +#' Retrieves annotation data for a compound from the internet services CTS, Pubchem, Chemspider and +#' Cactvs, based on the SMILES code and name of the compounds stored in the +#' compound list. +#' +#' Composes the "upper part" of a MassBank record filled with chemical data +#' about the compound: name, exact mass, structure, CAS no., links to PubChem, +#' KEGG, ChemSpider. The instrument type is also written into this block (even +#' if not strictly part of the chemical information). Additionally, index +#' fields are added at the start of the record, which will be removed later: +#' \code{id, dbcas, dbname} from the compound list, \code{dataused} to indicate +#' the used identifier for CTS search (\code{smiles} or \code{dbname}). +#' +#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are +#' inserted empty and will be filled later on. +#' +#' @usage gatherData(id) +#' @aliases gatherData +#' @param id The compound ID. +#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., +#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... +#' @author Michael Stravs +#' @seealso \code{\link{mbWorkflow}} +#' @references Chemical Translation Service: +#' \url{http://uranus.fiehnlab.ucdavis.edu:8080/cts/homePage} +#' cactus Chemical Identifier Resolver: +#' \url{http://cactus.nci.nih.gov/chemical/structure} +#' MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' Pubchem REST: +#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} +#' Chemspider InChI conversion: +#' \url{https://www.chemspider.com/InChI.asmx} +#' @examples +#' +#' # Gather data for compound ID 131 +#' \dontrun{gatherData(131)} +#' +#' @export +gatherData <- function(id) +{ + ##Preamble: Is a babeldir supplied? + ##If yes, use it + + .checkMbSettings() + usebabel=TRUE + babeldir <- getOption("RMassBank")$babeldir + + if(is.na(babeldir)){ + usebabel=FALSE + } + + + ##Get all useful information from the local "database" (from the CSV sheet) + + smiles <- findSmiles(id) + mass <- findMass(smiles) + dbcas <- findCAS(id) + dbname <- findName(id) + if(is.na(dbname)) dbname <- "" + if(is.na(dbcas)) dbcas <- "" + iupacName <- dbname + synonym <- dbname + formula <- findFormula(id) + + ##Convert SMILES to InChI key via Cactvs or babel. CTS doesn't "interpret" the SMILES per se, + ##it just matches identical known SMILES, so we need to convert to a "searchable" and + ##standardized format beforehand. Other databases are able to interpret the smiles. + + if(usebabel){ + cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey') + inchikey_split <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE) + } else{ + inchikey <- getCactus(smiles, 'stdinchikey') + if(!is.na(inchikey)){ + ##Split the "InChiKey=" part off the key + inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]] + } else{ + inchikey_split <- getPcInchiKey(smiles) + } + } + + ##Use Pubchem to retrieve information + PcInfo <- gatherPubChem(inchikey_split) + + if(!is.null(PcInfo$Synonym) & !is.na(PcInfo$Synonym)){ + synonym <- PcInfo$Synonym + } + + if(!is.null(PcInfo$IUPAC) & !is.na(PcInfo$IUPAC)){ + iupacName <- PcInfo$IUPAC + } + + ##Get Chemspider-ID + csid <- getCSID(inchikey_split) + + if(is.na(csid)){ + ##Get ChemSpider ID from Cactus if the Chemspider page is down + csid <- getCactus(inchikey_split, 'chemspider_id') + } + + ##Use CTS to retrieve information + CTSinfo <- getCtsRecord(inchikey_split) + + if((CTSinfo[1] == "Sorry, we couldn't find any matching results") || is.null(CTSinfo[1])) + { + CTSinfo <- NA + } + + ##List the names + if(iupacName == ""){ + warning(paste0("Compound ID ",id,": no IUPAC name could be identified.")) + } + + if(toupper(dbname) == toupper(synonym)){ + synonym <- dbname + } + + if(toupper(dbname) == toupper(iupacName)){ + iupacName <- dbname + } + + if(toupper(synonym) == toupper(iupacName)){ + synonym <- iupacName + } + + names <- as.list(unique(c(dbname, synonym, iupacName))) + + ##If no name is found, it must be supplied in one way or another + if(all(sapply(names, function(x) x == ""))){ + stop("RMassBank wasn't able to extract a usable name for this compound from any database. Please supply a name manually.") + } + + # Start to fill the MassBank record. + # The top 4 entries will not go into the final record; they are used to identify + # the record and also to facilitate manual editing of the exported record table. + mbdata <- list() + mbdata[['id']] <- id + mbdata[['dbcas']] <- dbcas + mbdata[['dbname']] <- dbname + mbdata[['dataused']] <- "smiles" + mbdata[['ACCESSION']] <- "" + mbdata[['RECORD_TITLE']] <- "" + mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") + mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors + mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license + mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright + # Confidence annotation and internal ID annotation. + # The ID of the compound will be written like: + # COMMENT: EAWAG_UCHEM_ID 1234 + # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" + mbdata[["COMMENT"]] <- list() + if(findLevel(id) == "0"){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment + } else{ + level <- findLevel(id) + if(level %in% c("1","1a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" + } + if(level == c("2")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" + } + if(level == c("2a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" + } + if(level == c("2b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" + } + if(level == c("3")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" + } + if(level == c("3a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" + } + if(level == c("3b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" + } + if(level == c("3c")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" + } + if(level == c("3d")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" + } + if(level == c("4")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" + } + if(level == c("5")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" + } + } + mbdata[["COMMENT"]][["ID"]] = id + # here compound info starts + mbdata[['CH$NAME']] <- names + # Currently we use a fixed value for Compound Class, since there is no useful + # convention of what should go there and what shouldn't, and the field is not used + # in search queries. + mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class + mbdata[['CH$FORMULA']] <- formula + mbdata[['CH$EXACT_MASS']] <- mass + mbdata[['CH$SMILES']] <- smiles + + if(usebabel){ + cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi') + mbdata[['CH$IUPAC']] <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE) + } else{ + mbdata[['CH$IUPAC']] <- getCactus(smiles, "stdinchi") + } + + + + # Add all CH$LINK fields present in the compound datasets + link <- list() + # CAS + if(!is.na(CTSinfo[1])){ + if("CAS" %in% CTS.externalIdTypes(CTSinfo)) + { + # Prefer database CAS if it is also listed in the CTS results. + # otherwise take the shortest one. + cas <- CTS.externalIdSubset(CTSinfo,"CAS") + if(dbcas %in% cas) + link[["CAS"]] <- dbcas + else + link[["CAS"]] <- cas[[which.min(nchar(cas))]] + } else{ + if(dbcas != ""){ + link[["CAS"]] <- dbcas + } + } + } else{ + if(dbcas != ""){ + link[["CAS"]] <- dbcas + } + } + + + # CHEBI + if(is.na(PcInfo$Chebi[1])){ + if(!is.na(CTSinfo[1])){ + if("ChEBI" %in% CTS.externalIdTypes(CTSinfo)) + { + # Cut off front "CHEBI:" if present + chebi <- CTS.externalIdSubset(CTSinfo,"ChEBI") + chebi <- chebi[[which.min(nchar(chebi))]] + chebi <- strsplit(chebi,":")[[1]] + link[["CHEBI"]] <- chebi[[length(chebi)]] + } + } + } else{ + chebi <- PcInfo$Chebi + chebi <- chebi[[which.min(nchar(chebi))]] + chebi <- strsplit(chebi,":")[[1]] + link[["CHEBI"]] <- chebi[[length(chebi)]] + } + # HMDB + if(!is.na(CTSinfo[1])){ + if("Human Metabolome Database" %in% CTS.externalIdTypes(CTSinfo)) + link[["HMDB"]] <- CTS.externalIdSubset(CTSinfo,"HMDB")[[1]] + # KEGG + if("KEGG" %in% CTS.externalIdTypes(CTSinfo)) + link[["KEGG"]] <- CTS.externalIdSubset(CTSinfo,"KEGG")[[1]] + # LipidMAPS + if("LipidMAPS" %in% CTS.externalIdTypes(CTSinfo)) + link[["LIPIDMAPS"]] <- CTS.externalIdSubset(CTSinfo,"LipidMAPS")[[1]] + } + # PubChem CID + if(is.na(PcInfo$PcID[1])){ + if(!is.na(CTSinfo[1])){ + if("PubChem CID" %in% CTS.externalIdTypes(CTSinfo)) + { + pc <- CTS.externalIdSubset(CTSinfo,"PubChem CID") + link[["PUBCHEM"]] <- paste0(min(pc)) + } + } + } else{ + link[["PUBCHEM"]] <- PcInfo$PcID[1] + } + + + if(!is.null(link[["PUBCHEM"]])){ + if(substr(link[["PUBCHEM"]],1,4) != "CID:"){ + link[["PUBCHEM"]] <- paste0("CID:", link[["PUBCHEM"]]) + } + } + + link[["INCHIKEY"]] <- inchikey_split + if(length(csid)>0) if(any(!is.na(csid))) link[["CHEMSPIDER"]] <- min(as.numeric(as.character(csid))) + mbdata[['CH$LINK']] <- link + + mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument + mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type + + return(mbdata) +} + +# Retrieve annotation data for a compound, using only babel +#' Retrieve annotation data +#' +#' Retrieves annotation data for a compound by using babel, +#' based on the SMILES code and name of the compounds stored in the +#' compound list. +#' +#' Composes the "upper part" of a MassBank record filled with chemical data +#' about the compound: name, exact mass, structure, CAS no.. +#' The instrument type is also written into this block (even +#' if not strictly part of the chemical information). Additionally, index +#' fields are added at the start of the record, which will be removed later: +#' \code{id, dbcas, dbname} from the compound list. +#' +#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are +#' inserted empty and will be filled later on. +#' +#' This function is an alternative to gatherData, in case CTS is down or if information +#' on one or more of the compounds in the compound list are sparse +#' +#' @usage gatherDataBabel(id) +#' @param id The compound ID. +#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., +#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... +#' @author Michael Stravs, Erik Mueller +#' @seealso \code{\link{mbWorkflow}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' +#' # Gather data for compound ID 131 +#' \dontrun{gatherDataBabel(131)} +#' +#' @export +gatherDataBabel <- function(id){ + .checkMbSettings() + babeldir <- getOption("RMassBank")$babeldir + smiles <- findSmiles(id) + + + # if no babeldir was set, throw an error that says that either CTS or babel have to be used + if(is.na(babeldir)) + { + stop("No babeldir supplied; It is currently not possible to convert the information without either babel or CTS") + } else { + ###Babel conversion + cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey') + inchikey <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE) + cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi') + inchi <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE) + + ##Read from Compoundlist + smiles <- findSmiles(id) + mass <- findMass(smiles) + dbcas <- findCAS(id) + dbname <- findName(id) + if(is.na(dbname)) dbname <- "" + if(is.na(dbcas)) dbcas <- "" + formula <- findFormula(id) + + ##Create + mbdata <- list() + mbdata[['id']] <- id + mbdata[['dbcas']] <- dbcas + mbdata[['dbname']] <- dbname + mbdata[['dataused']] <- "smiles" + mbdata[['ACCESSION']] <- "" + mbdata[['RECORD_TITLE']] <- "" + mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") + mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors + mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license + mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright + # Confidence annotation and internal ID annotation. + # The ID of the compound will be written like: + # COMMENT: EAWAG_UCHEM_ID 1234 + # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" + mbdata[["COMMENT"]] <- list() + if(findLevel(id) == "0"){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment + } else{ + level <- findLevel(id) + if(level %in% c("1","1a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" + } + if(level == c("2")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" + } + if(level == c("2a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" + } + if(level == c("2b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" + } + if(level == c("3")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" + } + if(level == c("3a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" + } + if(level == c("3b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" + } + if(level == c("3c")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" + } + if(level == c("3d")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" + } + if(level == c("4")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" + } + if(level == c("5")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" + } + } + mbdata[["COMMENT"]][["ID"]] <- id + + # here compound info starts + mbdata[['CH$NAME']] <- as.list(dbname) + + # Currently we use a fixed value for Compound Class, since there is no useful + # convention of what should go there and what shouldn't, and the field is not used + # in search queries. + mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class + mbdata[['CH$FORMULA']] <- formula + mbdata[['CH$EXACT_MASS']] <- mass + mbdata[['CH$SMILES']] <- smiles + mbdata[['CH$IUPAC']] <- inchi + + link <- list() + if(dbcas != "") + link[["CAS"]] <- dbcas + link[["INCHIKEY"]] <- inchikey + mbdata[['CH$LINK']] <- link + mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument + mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type + } + return(mbdata) +} + +# Retrieve annotation data for a compound, using only babel +#' Retrieve annotation data +#' +#' Retrieves annotation data for an unknown compound by using basic information present +#' +#' Composes the "upper part" of a MassBank record filled with chemical data +#' about the compound: name, exact mass, structure, CAS no.. +#' The instrument type is also written into this block (even +#' if not strictly part of the chemical information). Additionally, index +#' fields are added at the start of the record, which will be removed later: +#' \code{id, dbcas, dbname} from the compound list. +#' +#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are +#' inserted empty and will be filled later on. +#' +#' This function is used to generate the data in case a substance is unknown, +#' i.e. not enough information is present to derive anything about formulas or links +#' +#' @usage gatherDataUnknown(id, mode, retrieval) +#' @param id The compound ID. +#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., +#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... +#' @author Michael Stravs, Erik Mueller +#' @seealso \code{\link{mbWorkflow}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' +#' # Gather data for compound ID 131 +#' \dontrun{gatherDataUnknown(131,"pH")} +#' +#' @export +gatherDataUnknown <- function(id, mode, retrieval){ + .checkMbSettings() + + ##Read from Compoundlist + smiles <- "" + if(retrieval == "unknown"){ + mass <- findMass(id, "unknown", mode) + formula <- "" + } + if(retrieval == "tentative"){ + mass <- findMass(id, "tentative", mode) + formula <- findFormula(id, "tentative") + } + dbcas <- NA + dbname <- findName(id) + if(is.na(dbname)) dbname <- paste("Unknown ID:",id) + if(is.na(dbcas)) dbcas <- "" + + + + ##Create + mbdata <- list() + mbdata[['id']] <- id + mbdata[['dbcas']] <- dbcas + mbdata[['dbname']] <- dbname + mbdata[['dataused']] <- "none" + mbdata[['ACCESSION']] <- "" + mbdata[['RECORD_TITLE']] <- "" + mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") + mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors + mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license + mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright + # Confidence annotation and internal ID annotation. + # The ID of the compound will be written like: + # COMMENT: EAWAG_UCHEM_ID 1234 + # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" + mbdata[["COMMENT"]] <- list() + if(findLevel(id) == "0"){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment + } else{ + level <- findLevel(id) + if(level %in% c("1","1a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" + } + if(level == c("2")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" + } + if(level == c("2a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" + } + if(level == c("2b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" + } + if(level == c("3")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" + } + if(level == c("3a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" + } + if(level == c("3b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" + } + if(level == c("3c")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" + } + if(level == c("3d")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" + } + if(level == c("4")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" + } + if(level == c("5")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" + } + } + mbdata[["COMMENT"]][["ID"]] <- id + + # here compound info starts + mbdata[['CH$NAME']] <- as.list(dbname) + + # Currently we use a fixed value for Compound Class, since there is no useful + # convention of what should go there and what shouldn't, and the field is not used + # in search queries. + mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class + mbdata[['CH$FORMULA']] <- formula + mbdata[['CH$EXACT_MASS']] <- mass + mbdata[['CH$SMILES']] <- "" + mbdata[['CH$IUPAC']] <- "" + + link <- list() + mbdata[['CH$LINK']] <- link + mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument + mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type + + return(mbdata) +} + +# Flatten the internal tree-like representation of MassBank data to a flat table. +# Note that this limits us, in that the fields should be constant over all records! +# Therefore, e.g. the fixed number of 3 names which may be filled. +# If anybody has a cooler solution, I'll be happy to hear from you :) +# +# Note: the records from gatherData have additional information which is discarded, like +# author, copyright etc. They will be re-filled automatically when reading the file. +#' Flatten, or re-read, MassBank header blocks +#' +#' \code{flatten} converts a list of MassBank compound information sets (as +#' retrieved by \code{\link{gatherData}}) to a flat table, to be exported into +#' an \link[=loadInfolist]{infolist}. \code{readMbdata} reads a single record +#' from an infolist flat table back into a MassBank (half-)entry. +#' +#' Neither the flattening system itself nor the implementation are particularly +#' fantastic, but since hand-checking of records is a necessary evil, there is +#' currently no alternative (short of coding a complete GUI for this and +#' working directly on the records.) +#' +#' @aliases flatten readMbdata +#' @usage flatten(mbdata) +#' +#' readMbdata(row) +#' @param mbdata A list of MassBank compound information sets as returned from +#' \code{\link{gatherData}}. +#' @param row One row of MassBank compound information retrieved from an +#' infolist. +#' @return \code{flatten} returns a matrix (not a data frame) to be written to +#' CSV. +#' +#' \code{readMbdata} returns a list of type \code{list(id= \var{compoundID}, +#' ..., 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. +#' @author Michael Stravs +#' @seealso \code{\link{gatherData}},\code{\link{loadInfolist}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples \dontrun{ +#' # Collect some data to flatten +#' ids <- c(40,50,60,70) +#' data <- lapply(ids, gatherData) +#' # Flatten the data trees to a table +#' flat.table <- flatten(data) +#' # reimport the table into a tree +#' data.reimported <- apply(flat.table, 1, readMbdata) +#' } +#' +#' @export +#' +flatten <- function(mbdata) +{ + .checkMbSettings() + + colList <- c( + "id", + "dbcas", + "dbname", + "dataused", + "COMMENT.CONFIDENCE", + # Note: The field name of the internal id field is replaced with the real name + # at "compilation" time. Therefore, functions DOWNSTREAM from compileRecord() + # must use the full name including the info from options("RMassBank"). + "COMMENT.ID", + "CH$NAME1", + "CH$NAME2", + "CH$NAME3", + "CH$COMPOUND_CLASS", + "CH$FORMULA", + "CH$EXACT_MASS", + "CH$SMILES", + "CH$IUPAC", + "CH$LINK.CAS", + "CH$LINK.CHEBI", + "CH$LINK.HMDB", + "CH$LINK.KEGG", + "CH$LINK.LIPIDMAPS", + "CH$LINK.PUBCHEM", + "CH$LINK.INCHIKEY", + "CH$LINK.CHEMSPIDER") + # make an empty data frame with the right length + rows <- length(mbdata) + cols <- length(colList) + mbframe <- matrix(data=NA, nrow=rows, ncol=cols) + colnames(mbframe) <- colList + #browser() + for(row in 1:rows) + { + # fill in all the data into the dataframe: all columns which + # a) exist in the target dataframe and b) exist in the (unlisted) MB record + # are written into the dataframe. + data <- unlist(mbdata[[row]]) + # bugfix for the case of only one name + if(!("CH$NAME1" %in% names(data))) + data[["CH$NAME1"]] <- data[["CH$NAME"]] + datacols <- intersect(colList, names(data)) + mbframe[row,datacols] <- data[datacols] + } + return(mbframe) + +} + +# Read data from a flat-table MassBank record row and feed it into a +# MassBank tree-like record. Also, prime the ACCESSION and RECORD_TITLE fields in the +# correct position in the record. +#' @export +readMbdata <- function(row) +{ + .checkMbSettings() + + # Listify the table row. Lists are just cooler to work with :) + row <- as.list(row) + + mbdata <- list() + # Accession and title are added empty for now, to have them in the right place. + # Constants are read from the options or generated. + mbdata[['ACCESSION']] <- "" + mbdata[['RECORD_TITLE']] <- "" + mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") + mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors + mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license + mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright + mbdata[['PUBLICATION']] <- getOption("RMassBank")$annotations$publication + + # Read all determined fields from the file + # This is not very flexible, as you can see... + colList <- c( + "COMMENT.CONFIDENCE", + "COMMENT.ID", + "CH$NAME1", + "CH$NAME2", + "CH$NAME3", + "CH$COMPOUND_CLASS", + "CH$FORMULA", + "CH$EXACT_MASS", + "CH$SMILES", + "CH$IUPAC", + "CH$LINK.CAS", + "CH$LINK.CHEBI", + "CH$LINK.HMDB", + "CH$LINK.KEGG", + "CH$LINK.LIPIDMAPS", + "CH$LINK.PUBCHEM", + "CH$LINK.INCHIKEY", + "CH$LINK.CHEMSPIDER") + mbdata[["COMMENT"]] = list() + mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT.CONFIDENCE"]] + # Again, our ID field. + + mbdata[["COMMENT"]][["ID"]]<- + row[["COMMENT.ID"]] + names = c(row[["CH.NAME1"]], row[["CH.NAME2"]], row[["CH.NAME3"]]) + names = names[which(!is.na(names))] + + names <- gsub("'", "`", names) + mbdata[["CH$NAME"]] = names + mbdata[["CH$COMPOUND_CLASS"]] = row[["CH.COMPOUND_CLASS"]] + mbdata[["CH$FORMULA"]] = row[["CH.FORMULA"]] + mbdata[["CH$EXACT_MASS"]] = row[["CH.EXACT_MASS"]] + mbdata[["CH$SMILES"]] = row[["CH.SMILES"]] + mbdata[["CH$IUPAC"]] = row[["CH.IUPAC"]] + # Add all links and then eliminate the NA values from the tree. + link = list() + link[["CAS"]] = row[["CH.LINK.CAS"]] + link[["CHEBI"]] = row[["CH.LINK.CHEBI"]] + link[["HMDB"]] = row[["CH.LINK.HMDB"]] + link[["KEGG"]] = row[["CH.LINK.KEGG"]] + link[["LIPIDMAPS"]] = row[["CH.LINK.LIPIDMAPS"]] + link[["PUBCHEM"]] = row[["CH.LINK.PUBCHEM"]] + link[["INCHIKEY"]] = row[["CH.LINK.INCHIKEY"]] + link[["CHEMSPIDER"]] = row[["CH.LINK.CHEMSPIDER"]] + link[which(is.na(link))] <- NULL + mbdata[["CH$LINK"]] <- link + # again, these constants are read from the options: + mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument + mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type + + return(mbdata) + +} + +# For each compound, this function creates the "lower part" of the MassBank record, i.e. +# everything that comes after AC$INSTRUMENT_TYPE. +#' Compose data block of MassBank record +#' +#' \code{gatherCompound} composes the data blocks (the "lower half") of all +#' MassBank records for a compound, using the annotation data in the RMassBank +#' options, spectrum info data from the \code{analyzedSpec}-type record and the +#' peaks from the reanalyzed, multiplicity-filtered peak table. It calls +#' \code{gatherSpectrum} for each child spectrum. +#' +#' The returned data blocks are in format \code{list( "AC\$MASS_SPECTROMETRY" = +#' list('FRAGMENTATION_MODE' = 'CID', ...), ...)} etc. +#' +#' @aliases gatherCompound gatherSpectrum +#' @usage gatherCompound(spec, aggregated, additionalPeaks = NULL, retrieval="standard") +#' +#' gatherSpectrum(spec, msmsdata, ac_ms, ac_lc, aggregated, +#' additionalPeaks = NULL, retrieval="standard") +#' @param spec A \code{RmbSpectraSet} object, representing a compound with multiple spectra. +#' @param aggregated An aggregate peak table where the peaks are extracted from. +#' @param msmsdata A \code{RmbSpectrum2} object from the \code{spec} spectra set, representing a single spectrum to give a record. +#' @param ac_ms,ac_lc Information for the AC\$MASS_SPECTROMETRY and +#' AC\$CHROMATOGRAPHY fields in the MassBank record, created by +#' \code{gatherCompound} and then fed into \code{gatherSpectrum}. +#' @param additionalPeaks If present, a table with additional peaks to add into the spectra. +#' As loaded with \code{\link{addPeaks}}. +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @return \code{gatherCompound} returns a list of tree-like MassBank data +#' blocks. \code{gatherSpectrum} returns one single MassBank data block or +#' \code{NA} if no useful peak is in the spectrum. +#' @note Note that the global table \code{additionalPeaks} is also used as an +#' additional source of peaks. +#' @author Michael Stravs +#' @seealso \code{\link{mbWorkflow}}, \code{\link{compileRecord}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples \dontrun{ +#' myspectrum <- w@@spectra[[1]] +#' massbankdata <- gatherCompound(myspectrum, w@@aggregated) +#' # Note: ac_lc and ac_ms are data blocks usually generated in gatherCompound and +#' # passed on from there. The call below gives a relatively useless result :) +#' ac_lc_dummy <- list() +#' ac_ms_dummy <- list() +#' justOneSpectrum <- gatherSpectrum(myspectrum, myspectrum@@child[[2]], +#' ac_ms_dummy, ac_lc_dummy, w@@aggregated) +#' } +#' +#' +#' @export +gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval="standard") +{ + # compound ID + id <- spec@id + # processing mode + imode <- spec@mode + + # define positive or negative, based on processing mode. + ion_modes <- list( + "pH" = "POSITIVE", + "pNa" = "POSITIVE", + "mH" = "NEGATIVE", + "mFA" = "NEGATIVE", + "pM" = "POSITIVE", + "mM" = "NEGATIVE", + "pNH4" = "POSITIVE") + mode <- ion_modes[[imode]] + + # for format 2.01 + ac_ms <- list(); + ac_ms[['MS_TYPE']] <- getOption("RMassBank")$annotations$ms_type + ac_ms[['IONIZATION']] <- getOption("RMassBank")$annotations$ionization + ac_ms[['ION_MODE']] <- mode + + # This list could be made customizable. + ac_lc <- list(); + rt <- spec@parent@rt / 60 + ac_lc[['COLUMN_NAME']] <- getOption("RMassBank")$annotations$lc_column + ac_lc[['FLOW_GRADIENT']] <- getOption("RMassBank")$annotations$lc_gradient + ac_lc[['FLOW_RATE']] <- getOption("RMassBank")$annotations$lc_flow + ac_lc[['RETENTION_TIME']] <- sprintf("%.3f min", rt) + ac_lc[['SOLVENT A']] <- getOption("RMassBank")$annotations$lc_solvent_a + ac_lc[['SOLVENT B']] <- getOption("RMassBank")$annotations$lc_solvent_b + + # Go through all child spectra, and fill our skeleton with scan data! + # Pass them the AC_LC and AC_MS data, which are added at the right place + # directly in there. + allSpectra <- lapply(spec@children, function(m) + gatherSpectrum(spec = spec, msmsdata = m, ac_ms = ac_ms, ac_lc = ac_lc, aggregated = aggregated, additionalPeaks = additionalPeaks, retrieval=retrieval)) + allSpectra <- allSpectra[which(!is.na(allSpectra))] + return(allSpectra) +} + + + +# Process one single MSMS child scan. +# spec: an object of "analyzedSpectrum" type (i.e. contains +# 14x (or other number) msmsdata, info, mzrange, +# compound ID, parent MS1, cpd id...) +# msmsdata: the msmsdata sub-object from the spec which is the child scan we want to process. +# Contains childFilt, childBad, scan #, etc. Note that the peaks are actually not +# taken from here! They were taken from msmsdata initially, but after introduction +# of the refiltration and multiplicity filtering, this was changed. Now only the +# scan information is actually taken from msmsdata. +# ac_ms, ac_lc: pre-filled info for the MassBank dataset (see above) +# refiltered: the refilteredRcSpecs dataset which contains our good peaks :) +# Contains peaksOK, peaksReanOK, peaksFiltered, peaksFilteredReanalysis, +# peaksProblematic. Currently we use peaksOK and peaksReanOK to create the files. +# (Also, the global additionalPeaks table is used.) +#' @export +gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalPeaks = NULL, retrieval = "standard") +{ + # If the spectrum is not filled, return right now. All "NA" spectra will + # not be treated further. + if(msmsdata@ok == FALSE) + return(NA) + # get data + scan <- msmsdata@acquisitionNum + id <- spec@id + # Further fill the ac_ms datasets, and add the ms$focused_ion with spectrum-specific data: + precursor_types <- list( + "pH" = "[M+H]+", + "pNa" = "[M+Na]+", + "mH" = "[M-H]-", + "mFA" = "[M+HCOO-]-", + "pM" = "[M]+", + "mM" = "[M]-", + "pNH4" = "[M+NH4]+") + ac_ms[['FRAGMENTATION_MODE']] <- msmsdata@info$mode + #ac_ms['PRECURSOR_TYPE'] <- precursor_types[spec$mode] + ac_ms[['COLLISION_ENERGY']] <- msmsdata@info$ce + ac_ms[['RESOLUTION']] <- msmsdata@info$res + + # Calculate exact precursor mass with Rcdk, and find the base peak from the parent + # spectrum. (Yes, that's what belongs here, I think.) + precursorMz <- findMz(spec@id, spec@mode, retrieval=retrieval) + ms_fi <- list() + ms_fi[['BASE_PEAK']] <- round(mz(spec@parent)[which.max(intensity(spec@parent))],4) + ms_fi[['PRECURSOR_M/Z']] <- round(precursorMz$mzCenter,4) + ms_fi[['PRECURSOR_TYPE']] <- precursor_types[spec@mode] + + # Select all peaks which belong to this spectrum (correct cpdID and scan no.) + # from peaksOK + # Note: Here and below it would be easy to customize the source of the peaks. + # Originally the peaks came from msmsdata$childFilt, and the subset + # was used where dppm == dppmBest (because childFilt still contains multiple formulas) + # per peak. + peaks <- aggregated[aggregated$filterOK,,drop=FALSE] + peaks <- peaks[(peaks$cpdID == id) & (peaks$scan == msmsdata@acquisitionNum),,drop=FALSE] + + # No peaks? Aha, bye + if(nrow(peaks) == 0) + return(NA) + + # If we don't include the reanalyzed peaks: + if(!getOption("RMassBank")$use_rean_peaks) + peaks <- peaks[is.na(peaks$matchedReanalysis),,drop=FALSE] + # but if we include them: + else + { + # for info, the following data will be used in the default annotator: + # annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")] + # and in the peaklist itself: + # c("mzSpec", "int", "intrel") + peaks[!is.na(peaks$matchedReanalysis),"formula"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.formula"] + peaks[!is.na(peaks$matchedReanalysis),"mzCalc"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.mzCalc"] + peaks[!is.na(peaks$matchedReanalysis),"dppm"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.dppm"] + peaks[!is.na(peaks$matchedReanalysis),"dbe"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.dbe"] + peaks[!is.na(peaks$matchedReanalysis),"formulaCount"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.formulaCount"] + } + + # Calculate relative intensity and make a formatted m/z to use in the output + # (mzSpec, for "spectrum") + peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999) + peaks$mzSpec <- round(peaks$mzFound, 4) + # reorder peaks after addition of the reanalyzed ones + peaks <- peaks[order(peaks$mzSpec),] + + # Also format the other values, which are used in the annotation + peaks$dppm <- round(peaks$dppm, 2) + peaks$mzCalc <- round(peaks$mzCalc, 4) + peaks$intensity <- round(peaks$intensity, 1) + # copy the peak table to the annotation table. (The peak table will then be extended + # with peaks from the global "additional_peaks" table, which can be used to add peaks + # to the spectra by hand. + annotation <- peaks + # Keep only peaks with relative intensity >= 1 o/oo, since the MassBank record + # makes no sense otherwise. Also, keep only the columns needed in the output. + peaks <- peaks[ peaks$intrel >= 1, c("mzSpec", "intensity", "intrel")] + + # Here add the additional peaks if there are any for this compound! + # They are added without any annotation. + if(!is.null(additionalPeaks)) + { + # select the peaks from the corresponding spectrum which were marked with "OK=1" in the table. + spec_add_peaks <- additionalPeaks[ + (additionalPeaks$OK == 1) & + (additionalPeaks$cpdID == spec@id) & + (additionalPeaks$scan == msmsdata@acquisitionNum), + c("mzFound", "intensity")] + # If there are peaks to add: + if(nrow(spec_add_peaks)>0) + { + # add the column for rel. int. + spec_add_peaks$intrel <- 0 + # format m/z value + spec_add_peaks$mzSpec <- round(spec_add_peaks$mzFound, 4) + # bind tables together + peaks <- rbind(peaks, spec_add_peaks[,c("mzSpec", "intensity", "intrel")]) + # recalculate rel.int. and reorder list + peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999) + # Again, select the correct columns, and drop values with rel.int. <1 o/oo + # NOTE: If the highest additional peak is > than the previous highest peak, + # this can lead to the situation that a peak is in "annotation" but not in "peaks"! + # See below. + peaks <- peaks[ peaks$intrel >= 1, c("mzSpec", "intensity", "intrel")] + # Reorder again. + peaks <- peaks[order(peaks$mzSpec),] + } + } + + + + # add + or - to fragment formulas + formula_tag <- list( + "pH" = "+", + "pNa" = "+", + "mH" = "-", + "mFA" = "-", + "pM" = "+", + "mM" = "-", + "pNH4" = "+") + type <- formula_tag[[spec@mode]] + + annotator <- getOption("RMassBank")$annotator + if(is.null(annotator)) + annotator <- "annotator.default" + + + + # Here, the relative intensity is recalculated using the newly added additional + # peaks from the peak list. Therefore, we throw superfluous peaks out again. + # NOTE: It is a valid question whether or not we should kick peaks out at this stage. + # The alternative would be to leave the survivors at 1 o/oo, but keep them in the spectrum. + annotation$intrel <- floor(annotation$intensity / max(peaks$intensity) * 999) + annotation <- annotation[annotation$intrel >= 1,] + + annotation <- do.call(annotator, list(annotation= annotation, type=type)) + + + # Name the columns correctly. + colnames(peaks) <- c("m/z", "int.", "rel.int.") + peaknum <- nrow(peaks) + + # Create the "lower part" of the record. + mbdata <- list() + # Add the AC$MS, AC$LC info. + if(getOption("RMassBank")$use_version == 2) + { + mbdata[["AC$MASS_SPECTROMETRY"]] <- ac_ms + mbdata[["AC$CHROMATOGRAPHY"]] <- ac_lc + } + else + { + # Fix for MassBank data format 1, where ION_MODE must be renamed to MODE + mbdata[["AC$ANALYTICAL_CONDITION"]] <- c(ac_ms, ac_lc) + names(mbdata[["AC$ANALYTICAL_CONDITION"]])[[3]] <- "MODE" + } + # Add the MS$FOCUSED_ION info. + mbdata[["MS$FOCUSED_ION"]] <- ms_fi + + ## The SPLASH is a hash value calculated across all peaks + ## http://splash.fiehnlab.ucdavis.edu/ + ## Has to be temporarily added as "PK$SPLASH" in the "lower" part + ## of the record, but will later be moved "up" when merging parts in compileRecord() + + # the data processing tag :) + # Change by Tobias: + # I suggest to add here the current version number of the clone due to better distinction between different makes of MB records + # Could be automatised from DESCRIPTION file? + if(getOption("RMassBank")$use_rean_peaks) + processingComment <- list("REANALYZE" = "Peaks with additional N2/O included") + else + processingComment <- list() + mbdata[["MS$DATA_PROCESSING"]] <- c( + getOption("RMassBank")$annotations$ms_dataprocessing, + processingComment, + list("WHOLE" = paste("RMassBank", packageVersion("RMassBank"))) + ) + + mbdata[["PK$SPLASH"]] <- list(SPLASH = getSplash(peaks[,c("m/z", "int.")])) + + # Annotation: + if(getOption("RMassBank")$add_annotation && (findLevel(id,TRUE)!="unknown")) + mbdata[["PK$ANNOTATION"]] <- annotation + + # Peak table + mbdata[["PK$NUM_PEAK"]] <- peaknum + mbdata[["PK$PEAK"]] <- peaks + # These two entries will be thrown out later, but they are necessary to build the + # record title and the accession number. + mbdata[["RECORD_TITLE_CE"]] <- msmsdata@info$ces #formatted collision energy + # Mode of relative scan calculation: by default it is calculated relative to the + # parent scan. If a corresponding option is set, it will be calculated from the first + # present child scan in the list. + relativeScan <- "fromParent" + if(!is.null(getOption("RMassBank")$recomputeRelativeScan)) + if(getOption("RMassBank")$recomputeRelativeScan == "fromFirstChild") + relativeScan <- "fromFirstChild" + if(relativeScan == "fromParent") + mbdata[["SUBSCAN"]] <- msmsdata@acquisitionNum - spec@parent@acquisitionNum #relative scan + else if(relativeScan == "fromFirstChild"){ + firstChild <- min(unlist(lapply(spec@children,function(d) d@acquisitionNum))) + mbdata[["SUBSCAN"]] <- msmsdata@acquisitionNum - firstChild + 1 + } + return(mbdata) +} + + +# This compiles a MassBank record from the analyzedRcSpecs format (using the peaks from +# refilteredRcSpecs) together with the compound annotation data. +# Correspondingly: +# spec: contains the analyzedRcSpec-format spectrum collection to be compiled +# (i.e. a block of length(spectraList) child spectra) +# mbdata: contains the corresponding MassBank "header" (the upper part of the record) +# until INSTRUMENT TYPE. +# refiltered: the refilteredRcSpecs which contain our nice peaks. +#' Compile MassBank records +#' +#' Takes a spectra block for a compound, as returned from +#' \code{\link{analyzeMsMs}}, and an aggregated cleaned peak table, together +#' with a MassBank information block, as stored in the infolists and loaded via +#' \code{\link{loadInfolist}}/\code{\link{readMbdata}} and processes them to a +#' MassBank record +#' +#' \code{compileRecord} calls \code{\link{gatherCompound}} to create blocks of +#' spectrum data, and finally fills in the record title and accession number, +#' renames the "internal ID" comment field and removes dummy fields. +#' +#' @usage compileRecord(spec, mbdata, aggregated, additionalPeaks = NULL, retrieval="standard") +#' @param spec A \code{RmbSpectraSet} for a compound, after analysis (\code{\link{analyzeMsMs}}). +#' Note that \bold{peaks are not read from this +#' object anymore}: Peaks come from the \code{aggregated} dataframe (and from +#' the global \code{additionalPeaks} dataframe; cf. \code{\link{addPeaks}} for +#' usage information.) +#' @param mbdata The information data block for the record header, as stored in +#' \code{mbdata_relisted} after loading an infolist. +#' @param aggregated An aggregated peak data table containing information about refiltered spectra etc. +#' @param additionalPeaks If present, a table with additional peaks to add into the spectra. +#' As loaded with \code{\link{addPeaks}}. +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @return Returns a MassBank record in list format: e.g. +#' \code{list("ACCESSION" = "XX123456", "RECORD_TITLE" = "Cubane", ..., +#' "CH\$LINK" = list( "CAS" = "12-345-6", "CHEMSPIDER" = 1111, ...))} +#' @author Michael Stravs +#' @seealso \code{\link{mbWorkflow}}, \code{\link{addPeaks}}, +#' \code{\link{gatherCompound}}, \code{\link{toMassbank}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' +#' # +#' \dontrun{myspec <- w@@spectra[[2]]} +#' # after having loaded an infolist: +#' \dontrun{mbdata <- mbdata_relisted[[which(mbdata_archive\$id == as.numeric(myspec\$id))]]} +#' \dontrun{compiled <- compileRecord(myspec, mbdata, w@@aggregated)} +#' +#' @export +compileRecord <- function(spec, mbdata, aggregated, additionalPeaks = NULL, retrieval="standard") +{ + # gather the individual spectra data + mblist <- gatherCompound(spec, aggregated, additionalPeaks, retrieval=retrieval) + # this returns a n-member list of "lower parts" of spectra (one for each subscan). + # (n being the number of child scans per parent scan.) + # Now we put the two parts together. + # (lapply on all n subscans, returns a list.) + mblist_c <- lapply(mblist, function(l) + { + # This is the step which sticks together the upper and the lower part of the + # record (the upper being compound-specific and the lower being scan-specific.) + # Note that the accession number and record title (in the upper part) must of course + # be filled in with scan-specific info. + mbrecord <- c(mbdata, l) + + # Here is the right place to fix the name of the INTERNAL ID field. + names(mbrecord[["COMMENT"]])[[which(names(mbrecord[["COMMENT"]]) == "ID")]] <- + getOption("RMassBank")$annotations$internal_id_fieldname + # get mode parameter (for accession number generation) depending on version + # of record definition + # Change by Tobias: + # I suggest to include fragmentation mode here for information + if(getOption("RMassBank")$use_version == 2) + mode <- mbrecord[["AC$MASS_SPECTROMETRY"]][["ION_MODE"]] + else + mode <- mbrecord[["AC$ANALYTICAL_CONDITION"]][["MODE"]] + # Generate the title and then delete the temprary RECORD_TITLE_CE field used before + mbrecord[["RECORD_TITLE"]] <- .parseTitleString(mbrecord) + mbrecord[["RECORD_TITLE_CE"]] <- NULL + # Calculate the accession number from the options. + shift <- getOption("RMassBank")$accessionNumberShifts[[spec@mode]] + mbrecord[["ACCESSION"]] <- sprintf("%s%04d%02d", getOption("RMassBank")$annotations$entry_prefix, as.numeric(spec@id), as.numeric(mbrecord[["SUBSCAN"]])+shift) + # Clear the "SUBSCAN" field. + mbrecord[["SUBSCAN"]] <- NULL + # return the record. + return(mbrecord) + }) +} + + + +#' Generate peak annotation from peaklist +#' +#' Generates the PK$ANNOTATION entry from the peaklist obtained. This function is +#' overridable by using the "annotator" option in the settings file. +#' +#' @param annotation A peak list to be annotated. Contains columns: +#' \code{"cpdID","formula","mzFound" ,"scan","mzCalc","dppm", +#' "dbe","mz","int","formulaCount","parentScan","fM_factor","dppmBest", +#' "formulaMultiplicity","intrel","mzSpec"} +#' +#' @param type The ion type to be added to annotated formulas ("+" or "-" usually) +#' +#' @return The annotated peak table. Table \code{colnames()} will be used for the +#' titles (preferrably don't use spaces in the column titles; however no format is +#' strictly enforced by the MassBank data format. +#' +#' @examples +#' \dontrun{ +#' annotation <- annotator.default(annotation) +#' } +#' @author Michele Stravs, Eawag +#' @export +annotator.default <- function(annotation, type) +{ + + annotation$formula <- paste(annotation$formula, type, sep='') + # Select the right columns and name them correctly for output. + annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")] + colnames(annotation) <- c("m/z", "tentative_formula", "formula_count", "mass", "error(ppm)") + return(annotation) +} + +#' Parse record title +#' +#' Parses a title for a single MassBank record using the title format +#' specified in the option titleFormat. Internally used, not exported. +#' +#' If the option is not set, a standard title format is used (for record definition +#' version 1 or 2). +#' +#' @usage .parseTitleString(mbrecord) +#' @param mbrecord A MassBank record in list format, as returned from +#' \code{\link{gatherSpectrum}}. +#' @return A string with the title. +#' @author Michael Stravs, Eawag +#' @seealso \code{\link{compileRecord}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' \dontrun{ +#' # used in compileRecord() +#' title <- .parseTitleString(mbrecord) +#' } +#' +#' +#' +.parseTitleString <- function(mbrecord) +{ + + varlist <- getOption("RMassBank")$titleFormat + + # Set the standard title format. + if(is.null(varlist)) + { + if(getOption("RMassBank")$use_version == 2) + { + varlist <- c( + "{CH$NAME}", + "{AC$INSTRUMENT_TYPE}", + "{AC$MASS_SPECTROMETRY: MS_TYPE}", + "CE: {RECORD_TITLE_CE}", + "R={AC$MASS_SPECTROMETRY: RESOLUTION}", + "{MS$FOCUSED_ION: PRECURSOR_TYPE}" + ) + } + else + { + varlist <- c( + "{CH$NAME}", + "{AC$INSTRUMENT_TYPE}", + "{AC$ANALYTICAL_CONDITION: MS_TYPE}", + "CE: {RECORD_TITLE_CE}", + "R={AC$ANALYTICAL_CONDITION: RESOLUTION}", + "{MS$FOCUSED_ION: PRECURSOR_TYPE}" + ) + } + } + + + # Extract a {XXX} argument from each title section. + # check that every title has one and only one match + args <- regexec("\\{(.*)\\}", varlist) + arglist <- regmatches(varlist, args) + if(any(unlist(lapply(arglist, length)) != 2)) + stop("Title format is incorrectly specified: a section with not exactly 1 parameters") + + parsedVars <- lapply(varlist, function(var) + { + # Extract the specified parameter inside the {}. + # I.e. from a string like "R={BLA: BLUB}" return "BLA: BLUB" + args <- regexec("\\{(.*)\\}", var) + arg <- regmatches(var, args)[[1]][[2]] + # Split the parameter by colon if necessary + splitVar <- strsplit(arg, ": ")[[1]] + # Read the parameter value from the record + if(length(splitVar) == 2) + replaceVar <- mbrecord[[splitVar[[1]]]][[splitVar[[2]]]] + else if(length(splitVar) == 1) + replaceVar <- mbrecord[[splitVar]] + else + stop(paste( + "Title format is incorrectly specified:", var) + ) + # Fix problems: NULL returns + if(is.null(replaceVar)) + replaceVar <- "" + # Fix problems: Names will have >= 1 match. Take the first + if(length(replaceVar) > 1) + replaceVar <- replaceVar[[1]] + + # Fix problems: Unknowns might have no name + if(!length(replaceVar)){ + replaceVar <- "" + } + + # Substitute the parameter value into the string + parsedVar <- sub("\\{(.*)\\}", replaceVar, var) + return(parsedVar) + }) + title <- paste(parsedVars, collapse="; ") + return(title) +} + + +# This converts the tree-like list (as obtained e.g. from compileRecord()) +# into a plain text array, which can then be dumped to a file suitable for +# MassBank upload. +#' Write MassBank record into character array +#' +#' Writes a MassBank record in list format to a text array. +#' +#' The function is a general conversion tool for the MassBank format; i.e. the +#' field names are not fixed. \code{mbdata} must be a named list, and the +#' entries can be as follows: \itemize{ +#' \item A single text line: +#' +#' \code{'CH\$EXACT_MASS' = '329.1023'} +#' +#' is written as +#' +#' \code{CH\$EXACT_MASS: 329.1023} +#' \item A character array: +#' +#' \code{'CH\$NAME' = c('2-Aminobenzimidazole', '1H-Benzimidazol-2-amine')} +#' +#' is written as +#' +#' \code{CH\$NAME: 2-Aminobenzimidazole} +#' +#' \code{CH\$NAME: 1H-Benzimidazol-2-amine} +#' +#' \item A named list of strings: +#' +#' \code{'CH\$LINK' = list('CHEBI' = "27822", "KEGG" = "C10901")} +#' +#' is written as +#' +#' \code{CH\$LINK: CHEBI 27822} +#' +#' \code{CH\$LINK: KEGG C10901} +#' +#' \item A data frame (e.g. the peak table) is written as specified in +#' the MassBank record format (Section 2.6.3): the column names are used as +#' headers for the first line, all data rows are printed space-separated. +#' } +#' +#' @usage toMassbank(mbdata) +#' @param mbdata A MassBank record in list format. +#' @return The result is a text array, which is ready to be written to the disk +#' as a file. +#' @note The function iterates over the list item names. \bold{This means that +#' duplicate entries in \code{mbdata} are (partially) discarded!} The correct +#' way to add them is by making a character array (as specified above): Instead +#' of \code{'CH\$NAME' = 'bla', 'CH\$NAME' = 'blub'} specify \code{'CH\$NAME' = +#' c('bla','blub')}. +#' @author Michael Stravs +#' @seealso \code{\link{compileRecord}}, \code{\link{mbWorkflow}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' \dontrun{ +#' # Read just the compound info skeleton from the Internet for some compound ID +#' id <- 35 +#' mbdata <- gatherData(id) +#' #' # Export the mbdata blocks to line arrays +#' # (there is no spectrum information, just the compound info...) +#' mbtext <- toMassbank(mbdata) +#' } +#' +#' @export +toMassbank <- function (mbdata) +{ + # mbf is an array of lines and count is the line counter. + # Very old-school, but it works. :) + mbf <- character(0) + count <- 1 + lapply(names(mbdata), function(entry) + { + # If entry is a char line, add it to the file. + # If it is a named sublist, add each subentry with name + # If it is an unnamed sublist, add each subentry without name + # if it is a dataframe, write in PEAKS mode + + # Note: this is were I liked "lapply" a little too much. "for" would + # be more idiomatic, and wouldn't need the <<- assignments. + + # Data frame: table mode. A header line and one space-separated line for + # each data frame row. + if(is.data.frame(mbdata[[entry]])) + { + mbf[[count]] <<- paste(entry,": " , + paste(colnames(mbdata[[entry]]), collapse=" "), + sep='') + count <<- count+1 + for(row in 1:nrow(mbdata[[entry]])) + { + mbf[[count]] <<- paste(" ", + paste(mbdata[[entry]][row,],collapse=" "), + sep="") + count <<- count+1 + } + #browser() + } + # List with named items: Write every entry like CH$LINK: CAS 12-345-678 + else if(is.list(mbdata[[entry]]) & !is.null(names(mbdata[[entry]]))) + { + + lapply(names(mbdata[[entry]]), function(subentry) + { + if(subentry != "SPLASH"){ + mbf[[count]] <<- paste(entry,": ",subentry, " ", mbdata[[entry]][[subentry]], sep='') + } else { + mbf[[count]] <<- paste(entry,": ", mbdata[[entry]][[subentry]], sep='') + } + #print(mbf) + count <<- count + 1 + }) + } + # Array (or list) of unnamed items: Write every entry like CH$NAME: Paracetamol + # (iterative entry without subindices) + else if (length(mbdata[[entry]]) > 1 & is.null(names(mbdata[[entry]]))) + { + lapply(mbdata[[entry]], function(subentry) + { + mbf[[count]] <<- paste(entry,": ",subentry, sep='') + #print(mbf) + count <<- count + 1 + }) + } + # Length is 1: just write the entry like PK$NUM_PEAKS: 131 + else + { + mbf[[count]] <<- paste(entry,": ",mbdata[[entry]], sep='') + count <<- count + 1 + } + } + ) # End of lapply block (per child spectrum) + # Add mandatory EOF marker + mbf[[count]] <- "//" + return(mbf) +} + +# Exports compiled and massbanked spectra, with their associated molfiles, to physical files. +# "compiled" is still used here, because we need an accessible accession number. +# In the plain text arrays, the accession number is already "hidden". +# compiled: is ONE "compiled" entry, i.e. ONE compound with e.g. 14 spectra. +# files: is a return value from lapply(toMassbank), i.e. contains 14 plain-text arrays +# (for a 14-spectra method) +# molfile: a molfile from createMolfile +#' Export internally stored MassBank data to files +#' +#' Exports MassBank recfile data arrays and corresponding molfiles to physical +#' files on hard disk, for one compound. +#' +#' The data from \code{compiled} is still used here, because it contains the +#' "visible" accession number. In the plain-text format contained in +#' \code{files}, the accession number is not "accessible" anymore since it's in +#' the file. +#' +#' @usage exportMassbank(compiled, files, molfile) +#' @param compiled Is ONE "compiled" entry, i.e. ONE compound with e.g. 14 +#' spectra, as returned from \code{\link{compileRecord}}. +#' @param files A n-membered array (usually a return value from +#' \code{lapply(\link{toMassbank})}), i.e. contains n plain-text arrays with +#' MassBank records. +#' @param molfile A molfile from \code{\link{createMolfile}} +#' @return No return value. +#' @note An improvement would be to write the accession numbers into +#' \code{names(compiled)} and later into \code{names(files)} so \code{compiled} +#' wouldn't be needed here anymore. (The compound ID would have to go into +#' \code{names(molfile)}, since it is also retrieved from \code{compiled}.) +#' @author Michael Stravs +#' @seealso \code{\link{createMolfile}}, \code{\link{compileRecord}}, +#' \code{\link{toMassbank}}, \code{\link{mbWorkflow}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' \dontrun{ +#' compiled <- compileRecord(record, mbdata, refilteredRcSpecs) +#' mbfiles <- toMassbank(compiled) +#' molfile <- createMolfile(compiled[[1]][["CH$SMILES"]]) +#' exportMassbank(compiled, mbfiles, molfile) +#' } +#' +#' @export +exportMassbank <- function(compiled, files, molfile) +{ + molnames <- c() + for(file in 1:length(compiled)) + { + # Read the accession no. from the corresponding "compiled" entry + filename <- compiled[[file]]["ACCESSION"] + # use this accession no. as filename + filename <- paste(filename, ".txt", sep="") + write(files[[file]], + file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata",filename) + ) + } + # Use internal ID for naming the molfiles + if(findLevel(compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]],TRUE)=="standard"){ + molname <- sprintf("%04d", as.numeric( + compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) + molname <- paste(molname, ".mol", sep="") + write(molfile, + file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata",molname) + ) + } +} + +# Makes a list.tsv with molfile -> massbank ch$name attribution. + +#' Write list.tsv file +#' +#' Makes a list.tsv file in the "moldata" folder. +#' +#' Generates the list.tsv file which is needed by MassBank to connect records with +#' their respective molfiles. The first compound name is linked to a mol-file with +#' the compound ID (e.g. 2334.mol for ID 2334). +#' +#' @param compiled A list of compiled spectra (in tree-format, as returned by \code{compileRecord}). +#' @return No return value. +#' @author Michael A. Stravs, Eawag +#' @examples \dontrun{ +#' compiled <- compileRecord(record, mbdata, refilteredRcSpecs) +#' # a list.tsv for only one record: +#' clist <- list(compiled) +#' makeMollist(clist) +#' } +#' @export +makeMollist <- function(compiled) +{ + # For every "compiled" entry (here, compiled is not one "compiled" entry but the total + # list of all compiled spectra), extract the uppermost CH$NAME and the ID (from the + # first spectrum.) Make the ID into 0000 format. + + tsvlist <- t(sapply(compiled, function(entry) + { + name <- entry[[1]][["CH$NAME"]][[1]] + id <- sprintf("%04d", as.numeric(entry[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) + molfilename <- paste(id,".mol",sep='') + return(c(name,molfilename)) + })) + + IDs <- sapply(compiled, function(entry) return( sprintf("%04d", as.numeric(entry[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])))) + level <- sapply(IDs, findLevel, compact=TRUE) + validentries <- which(level == "standard") + # Write the file with the + write.table(tsvlist[validentries,], + paste(getOption("RMassBank")$annotations$entry_prefix,"/moldata/list.tsv", sep=''), + quote = FALSE, + sep="\t", + row.names=FALSE, + col.names=FALSE + ) +} + + +# Load a dataframe or file into additional_peaks (or add additional points in there.) +# The columns cpdID, scan, mzFound, int, OK are mandatory. OK=1 means that the peaks +# will be added into the spectrum. mzFound and int will be taken for the table. +# No annotation will be written. +# Add peaks to the spectra by hand + +#' Add additional peaks to spectra +#' +#' Loads a table with additional peaks to add to the MassBank spectra. Required +#' columns are \code{cpdID, scan, int, mzFound, OK}. +#' +#' All peaks with OK=1 will be included in the spectra. +#' +#' @usage addPeaks(mb, filename_or_dataframe) +#' @param mb The \code{mbWorkspace} to load the peaks into. +#' @param filename_or_dataframe Filename of the csv file, or name of the R +#' dataframe containing the peaklist. +#' @return The \code{mbWorkspace} with loaded additional peaks. +#' @author Michael Stravs +#' @seealso \code{\link{mbWorkflow}} +#' @examples +#' +#' \dontrun{addPeaks("myrun_additionalPeaks.csv")} +#' +#' @export +addPeaks <- function(mb, filename_or_dataframe) +{ + + errorvar <- 0 + currEnvir <- environment() + d <- 1 + + if(is.data.frame(filename_or_dataframe)) + df <- filename_or_dataframe + else + tryCatch( + df <- read.csv(filename_or_dataframe), + error=function(e){ + currEnvir$errorvar <- 1 + }) + # I change your heuristic fix to another heuristic fix, because I will have to test for a column name change... + + if(!errorvar){ + + if(ncol(df) < 2) + df <- read.csv(filename_or_dataframe, sep=";") + # here: the column int was renamed to intensity, and we need to be able to read old files. sorry. + if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df))) + df$intensity <- df$int + + cols <- c("cpdID", "scan", "mzFound", "intensity", "OK") + n <- colnames(df) + # Check if comma-separated or semicolon-separated + d <- setdiff(cols, n) + if(length(d)>0){ + stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, intensity, OK.") + } + } + + culled_df <- df[,c("cpdID", "scan", "mzFound", "intensity", "OK")] + + + if(nrow(mb@additionalPeaks) == 0) + mb@additionalPeaks <- culled_df + else + mb@additionalPeaks <- rbind(mb@additionalPeaks, culled_df) + return(mb) +} diff --git a/R/leMsMs.r b/R/leMsMs.r index 3590b7d..935ae57 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -1,2315 +1,2315 @@ - -#library(xcms) - -#' Backup \code{msmsWorkflow} results -#' -#' Writes the results from different \code{msmsWorkflow} steps to a file. -#' -#' @aliases archiveResults -#' @usage archiveResults(w, fileName, settings = getOption("RMassBank")) -#' @param w The \code{msmsWorkspace} to be saved. -#' @param fileName The filename to store the results under. -#' @param settings The settings to be stored into the msmsWorkspace image. -#' @examples -#' -#' # This doesn't really make a lot of sense, -#' # it stores an empty workspace. -#' RmbDefaultSettings() -#' w <- newMsmsWorkspace() -#' archiveResults(w, "narcotics.RData") -#' -#' @export -archiveResults <- function(w, fileName, settings = getOption("RMassBank")) -{ - # save the settings into the settings slot - w@settings <- settings - # save - save(w, file=fileName) - -} - - -#' RMassBank mass spectrometry pipeline -#' -#' Extracts and processes spectra from a specified file list, according to -#' loaded options and given parameters. -#' -#' The filenames of the raw LC-MS runs are read from the array \code{files} -#' in the global enviroment. -#' See the vignette \code{vignette("RMassBank")} for further details about the -#' workflow. -#' -#' @param w A \code{msmsWorkspace} to work with. -#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA", "pNH4"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-, [M+NH4]+). -#' @param steps Which steps of the workflow to process. See the vignette -#' \code{vignette("RMassBank")} for details. -#' @param confirmMode Defaults to false (use most intense precursor). Value 1 uses -#' the 2nd-most intense precursor for a chosen ion (and its data-dependent scans) -#' , etc. -#' @param newRecalibration Whether to generate a new recalibration curve (\code{TRUE}, default) or -#' to reuse the currently stored curve (\code{FALSE}, useful e.g. for adduct-processing runs.) -#' @param useRtLimit Whether to enforce the given retention time window. -#' @param archivename The prefix under which to store the analyzed result files. -#' @param readMethod Several methods are available to get peak lists from the files. -#' Currently supported are "mzR", "xcms", "MassBank" and "peaklist". -#' The first two read MS/MS raw data, and differ in the strategy -#' used to extract peaks. MassBank will read existing records, -#' so that e.g. a recalibration can be performed, and "peaklist" -#' just requires a CSV with two columns and the column header "mz", "int". -#' @param findPeaksArgs A list of arguments that will be handed to the xcms-method findPeaks via do.call -#' @param plots A parameter that determines whether the spectra should be plotted or not (This parameter is only used for the xcms-method) -#' @param precursorscan.cf Whether to fill precursor scans. To be used with files which for -#' some reasons do not contain precursor scan IDs in the mzML, e.g. AB Sciex converted -#' files. -#' @param settings Options to be used for processing. Defaults to the options loaded via -#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. -#' @param analyzeMethod The "method" parameter to pass to \code{\link{analyzeMsMs}}. -#' @param progressbar The progress bar callback to use. Only needed for specialized applications. -#' Cf. the documentation of \code{\link{progressBarHook}} for usage. -#' @param MSe A boolean value that determines whether the spectra were recorded using MSe or not -#' @return The processed \code{msmsWorkspace}. -#' @seealso \code{\link{msmsWorkspace-class}} -#' @author Michael Stravs, Eawag -#' @export -msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRecalibration = TRUE, - useRtLimit = TRUE, archivename=NA, readMethod = "mzR", findPeaksArgs = NULL, plots = FALSE, - precursorscan.cf = FALSE, - settings = getOption("RMassBank"), analyzeMethod = "formula", - progressbar = "progressBarHook", MSe = FALSE) -{ - .checkMbSettings() - if(!any(mode %in% c("pH","pNa","pNH4","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) - - if(!is.na(archivename)) - w@archivename <- archivename - - # Make a progress bar: - nProg <- 0 - nLen <- length(w@files) - - allUnknown <- FALSE - - # If all compounds are unknown some specific conditions apply - if(all(.listEnvEnv$listEnv$compoundList$Level == "5")){ - allUnknown <- TRUE - message("All compounds are unknown, the workflow will be adjusted accordingly") - } - - if(readMethod == "minimal"){ - ##Edit options - opt <- getOption("RMassBank") - opt$recalibrator$MS1 <- "recalibrate.identity" - opt$recalibrator$MS2 <- "recalibrate.identity" - opt$add_annotation <- FALSE - opt$multiplicityFilter <- 1 - options(RMassBank=opt) - settings <- getOption("RMassBank") - ##Edit analyzemethod - analyzeMethod <- "intensity" - } - - # clean rerun functionality: - # if any step after 3 has been run, rerunning steps 4 or below needs moving back to the parent workspace. - # However, the recalibration must be preserved, because: - # if someone runs - # w <- msmsWorkflow(w, steps=c(1:4)), - # then substitutes the recalibration - # w@rc <- myrecal - # then runs step 4 again - # w <- msmsWorkflow(w, steps=c(4), newRecalibration=FALSE) - # the rc and rc.ms1 must be preserved and not taken from the parent workspace - if(!all(steps > 4) & !is.null(w@parent)) - { - rc <- w@rc - rc.ms1 <- w@rc.ms1 - w <- w@parent - w@rc <- rc - w@rc.ms1 <- rc.ms1 - } - - # Step 1: acquire all MSMS spectra from files - if(1 %in% steps) - { - message("msmsWorkflow: Step 1. Acquire all MSMS spectra from files") - w <- msmsRead(w = w, files = w@files, readMethod=readMethod, mode=mode, confirmMode = confirmMode, useRtLimit = useRtLimit, - Args = findPeaksArgs, settings = settings, progressbar = progressbar, MSe = MSe) - } - # Step 2: first run analysis before recalibration - if(2 %in% steps) - { - nProg <- 0 - message("msmsWorkflow: Step 2. First analysis pre recalibration") - if(allUnknown){ - analyzeMethod <- "intensity" - } - pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) - w@spectra <- as(lapply(w@spectra, function(spec) { - #print(spec$id) - # if(findLevel(spec@id,TRUE) == "unknown"){ - # analyzeMethod <- "intensity" - # } else { - # analyzeMethod <- "formula" - # } - s <- analyzeMsMs(spec, mode=mode, detail=TRUE, run="preliminary", - filterSettings = settings$filterSettings, - spectraList = settings$spectraList, method = analyzeMethod) - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(s) - }), "SimpleList") - ## for(f in w@files) - ## w@spectra[[basename(as.character(f))]]@name <- basename(as.character(f)) - suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) - } - # Step 3: aggregate all spectra - if(3 %in% steps) - { - message("msmsWorkflow: Step 3. Aggregate all spectra") - w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) - } - - if(allUnknown){ - w@aggregated$noise <- FALSE - w@aggregated$noise <- FALSE - w@aggregated$reanalyzed.formula <- NA - w@aggregated$reanalyzed.mzCalc <- NA - w@aggregated$reanalyzed.dppm <- NA - w@aggregated$reanalyzed.formulaCount <- NA - w@aggregated$reanalyzed.dbe <- NA - w@aggregated$matchedReanalysis <- NA - w@aggregated$filterOK <- TRUE - w@aggregated$problematicPeak <- FALSE - w@aggregated$formulaMultiplicity <- unlist(sapply(table(w@aggregated$cpdID),function(x) rep(x,x))) - return(w) - } - - - # Step 4: recalibrate all m/z values in raw spectra - if(4 %in% steps) - { - message("msmsWorkflow: Step 4. Recalibrate m/z values in raw spectra") - if(newRecalibration) - { - # note: makeRecalibration takes w as argument now, because it needs to get the MS1 spectra from @spectra - recal <- makeRecalibration(w, mode, - recalibrateBy = settings$recalibrateBy, - recalibrateMS1 = settings$recalibrateMS1, - recalibrator = settings$recalibrator, - recalibrateMS1Window = settings$recalibrateMS1Window) - w@rc <- recal$rc - w@rc.ms1 <- recal$rc.ms1 - } - w@parent <- w - w@aggregated <- data.frame() - spectra <- recalibrateSpectra(mode, w@spectra, w = w, - recalibrateBy = settings$recalibrateBy, - recalibrateMS1 = settings$recalibrateMS1) - w@spectra <- spectra - } - # Step 5: re-analysis on recalibrated spectra - if(5 %in% steps) - { - nProg <- 0 - message("msmsWorkflow: Step 5. Reanalyze recalibrated spectra") - pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) - - w@spectra <- as(lapply(w@spectra, function(spec) { - #print(spec$id) - if(findLevel(spec@id,TRUE) == "unknown"){ - analyzeMethod <- "intensity" - } else { - analyzeMethod <- "formula" - } - s <- analyzeMsMs(spec, mode=mode, detail=TRUE, run="recalibrated", - filterSettings = settings$filterSettings, - spectraList = settings$spectraList, method = analyzeMethod) - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(s) - }), "SimpleList") - ## for(f in w@files) - ## w@spectra[[basename(as.character(f))]]@name <- basename(as.character(f)) - suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) - - do.call(progressbar, list(object=pb, close=TRUE)) - } - # Step 6: aggregate recalibrated results - if(6 %in% steps) - { - message("msmsWorkflow: Step 6. Aggregate recalibrated results") - w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) - if(!is.na(archivename)) - archiveResults(w, paste(archivename, ".RData", sep=''), settings) - w@aggregated <- cleanElnoise(w@aggregated, - settings$electronicNoise, settings$electronicNoiseWidth) - } - # Step 7: reanalyze failpeaks for (mono)oxidation and N2 adduct peaks - if(7 %in% steps) - { - message("msmsWorkflow: Step 7. Reanalyze fail peaks for N2 + O") - w@aggregated <- reanalyzeFailpeaks( - w@aggregated, custom_additions="N2O", mode=mode, - filterSettings=settings$filterSettings, - progressbar=progressbar) - if(!is.na(archivename)) - archiveResults(w, paste(archivename, "_RA.RData", sep=''), settings) - } - # Step 8: heuristic filtering based on peak multiplicity; - # creation of failpeak list - if(8 %in% steps) - { - message("msmsWorkflow: Step 8. Peak multiplicity filtering") - if (is.null(settings$multiplicityFilter)) { - message("msmsWorkflow: Step 8. Peak multiplicity filtering skipped because multiplicityFilter parameter is not set.") - } else { - # apply heuristic filter - w@aggregated <- filterMultiplicity( - w, archivename, mode, settings$multiplicityFilter) - w@aggregated <- processProblematicPeaks(w, mode, archivename) - - if(!is.na(archivename)) - archiveResults(w, paste(archivename, "_RF.RData", sep=''), settings) - } - } - message("msmsWorkflow: Done.") - return(w) -} - -#' Analyze MSMS spectra -#' -#' Analyzes MSMS spectra of a compound by fitting formulas to each subpeak. -#' -#' The analysis function uses Rcdk. Note -#' that in this step, \emph{satellite peaks} are removed by a simple heuristic -#' rule (refer to the documentation of \code{\link{filterPeakSatellites}} for details.) -#' -## # @usage analyzeMsMs(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", -## # filterSettings = getOption("RMassBank")$filterSettings, -## # spectraList = getOption("RMassBank")$spectraList, method="formula") -## # -## # analyzeMsMs.formula(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", -## # filterSettings = getOption("RMassBank")$filterSettings, -## # spectraList = getOption("RMassBank")$spectraList) -## # -## # analyzeMsMs.intensity(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", -## # filterSettings = getOption("RMassBank")$filterSettings, -## # spectraList = getOption("RMassBank")$spectraList) -#' -#' @param msmsPeaks A \code{RmbSpectraSet} object. -#' Corresponds to a parent spectrum and children MSMS spectra of one compound (plus some metadata). -#' The objects are typically generated with \code{\link{findMsMsHR}}, and populate the \code{@@spectrum} slot -#' in a \code{msmsWorkspace} (refer to the corresponding -#' documentation for the precise format specifications). -#' @param mode Specifies the processing mode, i.e. which molecule species the -#' spectra contain. \code{\var{pH}} (positive H) specifies [M+H]+, -#' \code{\var{pNa}} specifies [M+Na]+, \code{\var{pM}} specifies [M]+, -#' \code{\var{mH}} and \code{\var{mNa}} specify [M-H]- and [M-Na]-, -#' respectively. (I apologize for the naming of \code{\var{pH}} which has -#' absolutely nothing to do with chemical \emph{pH} values.) -#' @param detail Whether detailed return information should be provided -#' (defaults to \code{FALSE}). See below. -#' @param run \code{"preliminary"} or \code{"recalibrated"}. In the -#' \code{preliminary} run, mass tolerance is set to 10 ppm (above m/z 120) and -#' 15 ppm (below m/z 120), the default intensity cutoff is $10^4$ for positive -#' mode (no default cutoff in negative mode), and the column \code{"mz"} from -#' the spectra is used as data source. In the \code{recalibrated} run, the -#' mass tolerance is set to 5 ppm over the whole mass range, the default cutoff -#' is 0 and the column \code{"mzRecal"} is used as source for the m/z values. -#' Defaults to \code{"preliminary"}. -#' @param filterSettings -#' Settings for the filter parameters, by default loaded from the RMassBank settings -#' set with e.g. \code{\link{loadRmbSettings}}. Must contain: -#' \itemize{ -#' \item \code{ppmHighMass}, allowed ppm deviation before recalibration -#' for high mass range -#' \item \code{ppmLowMass}, allowed ppm deviation before recalibration -#' for low mass range -#' \item \code{massRangeDivision}, division point between high and low mass -#' range (before recalibration) -#' \item \code{ppmFine}, allowed ppm deviation overall after recalibration -#' \item \code{prelimCut}, intensity cutoff for peaks in preliminary run -#' \item \code{prelimCutRatio}, relative intensity cutoff for peaks in -#' preliminary run, e.g. 0.01 = 1% -#' \item \code{fineCut}, intensity cutoff for peaks in second run -#' \item \code{fineCutRatio}, relative intensity cutoff for peaks in -#' second run -#' \item \code{specOkLimit}, minimum intensity of base peak for spectrum -#' to be accepted for processing -#' \item \code{dbeMinLimit}, minimum double bond equivalent for accepted -#' molecular subformula. -#' \item \code{satelliteMzLimit}, for satellite peak filtering -#' (\code{\link{filterPeakSatellites}}: mass window to use for satellite -#' removal -#' \item \code{satelliteIntLimit}, the relative intensity below which to -#' discard "satellites". (refer to \code{\link{filterPeakSatellites}}). -#' } -#' @param spectraList The list of MS/MS spectra present in each data block. As also -#' defined in the settings file. -#' @param method Selects which function to actually use for data evaluation. The default -#' "formula" runs a full analysis via formula assignment to fragment peaks. The -#' alternative setting "intensity" calls a "mock" implementation which circumvents -#' formula assignment and filters peaks purely based on intensity cutoffs and the -#' satellite filtering. (In this case, the ppm and dbe related settings in filterSettings -#' are ignored.) -#' @return The processed \code{RmbSpectraSet} object. -#' Added (or filled, respectively, since the slots are present before) data include -#' \item{list("complete")}{whether all spectra have useful value} -#' \item{list("empty")}{whether there are no useful spectra} -#' \item{list("children")}{ -#' The processed \code{RmbSpectrum2} objects (in a \code{RmbSpectrum2List}). -#' \itemize{ -#' \item \code{ok} if the spectrum was successfully processed with at least one resulting peak -#' \item \code{mz}, \code{intensity}: note that mz/int pairs can be duplicated when multiple matches -#' are found for one mz value, therefore the two slots are not necessarily unchanged from before -#' \item \code{rawOK} (logical) whether the m/z peak passes satellite/low removal -#' \item \code{low}, \code{satellite} if \code{TRUE}, the peak failed cutoff (\code{low}) or was removed as \code{satellite} -#' \item \code{formula}, \code{mzCalc}, \code{dppm}, \code{dbe} Formula, calculated mass, ppm deviation and dbe assigned to a peak -#' \item \code{formulaCount}, \code{dppmBest} Number of formulae matched for this m/z value and ppm deviation of the best match -#' \item \code{info} Spectrum identifying information (collision energy, resolution, collision mode) from -#' the \code{spectraList} -#' \item All other entries are retained from the original \code{RmbSpectrum2}. -#' } -#' } -#' @aliases analyzeMsMs analyzeMsMs.formula analyzeMsMs.intensity -#' @author Michael Stravs -#' @seealso \code{\link{msmsWorkflow}}, \code{\link{filterLowaccResults}}, -#' \code{\link{filterPeakSatellites}}, \code{\link{reanalyzeFailpeaks}} -#' @examples -#' -#' \dontrun{analyzed <- analyzeMsMs(spec, "pH", TRUE)} -#' -#' @export -analyzeMsMs <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", - filterSettings = getOption("RMassBank")$filterSettings, - spectraList = getOption("RMassBank")$spectraList, method="formula") -{ - ## .RmbSpectraSet <- setClass("RmbSpectraSet", - ## representation = representation( - ## parent = "Spectrum1", - ## children = "RmbSpectrum2List", - ## # These are done as slots and not as S4 functions, because they are set during the workflow - ## # in "checking" steps. It's easier. - ## found = "logical", - ## complete = "logical", - ## empty = "logical", - ## formula = "character", - ## id = "integer", - ## mz = "numeric", - ## name = "character", - ## annotations = "list" - ## ), - ## prototype = prototype( - ## parent = new("Spectrum1"), - ## children = new("RmbSpectrum2List"), - ## found = FALSE, - ## complete = NA, - ## empty = NA, - ## formula = character(), - ## id = integer(), - ## mz = numeric(), - ## name = character(), - ## annotations = list() - ## ) - ## ); - .checkMbSettings() - - - # Check whether the spectra can be fitted to the spectra list correctly! - if(length(msmsPeaks@children) != length(spectraList)) - { - warning(paste0( - "The spectra count of the substance ", msmsPeaks@id, " (", length(msmsPeaks@children), " spectra) doesn't match the provided spectra list (", length(spectraList), " spectra)." - )) - msmsPeaks@found <- FALSE - return(msmsPeaks) - - } - - if(msmsPeaks@found == FALSE) - return(msmsPeaks) - - if(method=="formula") - { - r <- (analyzeMsMs.formula(msmsPeaks, mode, detail, run, filterSettings - )) - } - else if(method == "intensity") - { - r <- (analyzeMsMs.intensity(msmsPeaks, mode, detail, run, filterSettings - )) - } - - # Add the spectrum labels to the spectra here. - # If there is any better place to do this, please tell me. I hate it. - # However, the info should be added in msmsWorkflow not in mbWorkflow, because two msmsWorkspaces with different spectraLists can be - # merged together with all the combine / pack stuff. - children <- mapply(function(spec, info) - { - spec@info <- info - spec - }, r@children, spectraList, SIMPLIFY=FALSE) - r@children <- as(children, "SimpleList") - - - #nspectra <- length(spectraList) - ok <- unlist(lapply(r@children, function(c) c@ok)) - r@complete <- FALSE - r@empty <- FALSE - if(all(ok)) - r@complete <- TRUE - if(all(!ok)) - r@empty <- TRUE - return(r) -} - - -#' @describeIn analyzeMsMs Analyze the peaks using formula annotation -#' @export -analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", - filterSettings = getOption("RMassBank")$filterSettings) -{ - cut <- 0 - cut_ratio <- 0 - if(run=="preliminary") - { - filterMode <- "coarse" - cut <- filterSettings$prelimCut - if(is.na(cut)) - { - if(mode %in% c("pH", "pM", "pNa", "pNH4")) - cut <- 1e4 - else if(mode %in% c("mH", "mFA","mM")) - cut <- 0 - else stop(paste("The ionization mode", mode, "is unknown.")) - } - cutRatio <- filterSettings$prelimCutRatio - } - else - { - filterMode <- "fine" - cut <- filterSettings$fineCut - cut_ratio <- filterSettings$fineCutRatio - if(is.na(cut)) cut <- 0 - } - - # find whole spectrum of parent peak, so we have reasonable data to feed into - # MolgenMsMs - parentSpectrum <- msmsPeaks@parent - - - # On each spectrum the following function analyzeTandemShot will be applied. - # It takes the raw peaks matrix as argument (mz, int) and processes the spectrum by - # filtering out low-intensity (<1e4) and shoulder peaks (deltam/z < 0.5, intensity - # < 5%) and subsequently matching the peaks to formulas using Rcdk, discarding peaks - # with insufficient match accuracy or no match. - analyzeTandemShot <- function(child) - { - shot <- getData(child) - shot$row <- which(!is.na(shot$mz)) - - - # Filter out low intensity peaks: - child@low <- (shot$intensity < cut) | (shot$intensity < max(shot$intensity)*cut_ratio) - shot <- shot[!child@low,,drop=FALSE] - shot_full <- shot - - # Is there still anything left? - if(length(which(!child@low))==0) - { - child@ok <- FALSE - return(child) - } - - # Filter out satellite peaks: - shot <- filterPeakSatellites(shot, filterSettings) - child@satellite <- rep(TRUE, child@peaksCount) - child@satellite[which(child@low == TRUE)] <- NA - child@satellite[shot$row] <- FALSE - - # Is there still anything left? - if(nrow(shot)==0) - { - child@ok <- FALSE - return(child) - } - - if(max(shot$intensity) < as.numeric(filterSettings$specOkLimit)) - { - child@ok <- FALSE - return(child) - } - - # Crop to 4 digits (necessary because of the recalibrated values) - # this was done for the MOLGEN MSMS type analysis, is not necessary anymore now (23.1.15 MST) - # shot[,mzColname] <- round(shot[,mzColname], 5) - - # here follows the Rcdk analysis - #------------------------------------ - parentPeaks <- data.frame(mzFound=msmsPeaks@mz, - formula=msmsPeaks@formula, - dppm=0, - x1=0,x2=0,x3=0) - - # define the adduct additions - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else if(mode == "pNH4") { - allowed_additions <- "NH4" - mode.charge <- 1 - } else{ - stop("mode = \"", mode, "\" not defined") - } - - - # the ppm range is two-sided here. - # The range is slightly expanded because dppm calculation of - # generate.formula starts from empirical mass, but dppm cal- - # culation of the evaluation starts from theoretical mass. - # So we don't miss the points on 'the border'. - - if(run=="preliminary") - ppmlimit <- 2 * max(filterSettings$ppmLowMass, filterSettings$ppmHighMass) - else - ppmlimit <- 2.25 * filterSettings$ppmFine - - parent_formula <- add.formula(msmsPeaks@formula, allowed_additions) - dbe_parent <- dbe(parent_formula) - # check whether the formula is valid, i.e. has no negative or zero element numbers. - #print(parent_formula) - if(!is.valid.formula(parent_formula)) - { - child@ok <- FALSE - return(child) - } - - limits <- to.limits.rcdk(parent_formula) - - peakmatrix <- lapply( - split(shot,shot$row) - , function(shot.row) { - # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae - # finally back-correct calculated masses for the charge - mass <- shot.row[["mz"]] - mass.calc <- mass + mode.charge * .emass - peakformula <- tryCatch(suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), - limits, charge=0)), error=function(e) NA) - #peakformula <- tryCatch( - # generate.formula(mass, - # ppm(mass, ppmlimit, p=TRUE), - # limits, charge=1), - #error= function(e) list()) - - if(!is.list(peakformula)) - return(t(c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, - formula=NA, mzCalc=NA))) - else - { - return(t(sapply(peakformula, function(f) - { - mzCalc <- f@mass - mode.charge * .emass - c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, - formula=f@string, - mzCalc=mzCalc) - }))) - } - - }) - - childPeaks <- as.data.frame(do.call(rbind, peakmatrix)) - - # Reformat the deformatted output correctly (why doesn't R have a better way to do this, e.g. avoid deformatting?) - - childPeaks$row <- as.numeric(as.character(childPeaks$row)) - childPeaks$intensity <- as.numeric(as.character(childPeaks$intensity)) - childPeaks$mz <- as.numeric(as.character(childPeaks$mz)) - childPeaks$formula <- as.character(childPeaks$formula) - childPeaks$mzCalc <- as.numeric(as.character(childPeaks$mzCalc)) - childPeaks$dppm <- (childPeaks$mz / childPeaks$mzCalc - 1) * 1e6 - childPeaks$dbe <- unlist(lapply(childPeaks$formula, dbe)) - - # childPeaks now contains all the good and unmatched peaks - # but not the ones which were cut as satellites or below threshold. - - ## child@mzFound <- rep(NA, child@peaksCount) - ## child@mzFound[childPeaks$row] <- as.numeric(as.character(childPeaks$mzFound)) - ## - ## child@formula <- rep(NA, child@peaksCount) - ## child@formula[childPeaks$row] <- as.character(childPeaks$formula) - ## - ## child@mzCalc <- rep(NA, child@peaksCount) - ## child@mzCalc[childPeaks$row] <- as.numeric(as.character(childPeaks$mzCalc)) - ## - ## child@dppm<- rep(NA, child@peaksCount) - ## child@dppm[childPeaks$row] <- (childPeaks$mzFound / childPeaks$mzCalc - 1) * 1e6 - # delete the NA data out again, because MolgenMsMs doesn't have them - # here and they will be re-added later - # (this is just left like this for "historical" reasons) - #childPeaks <- childPeaks[!is.na(childPeaks$formula),] - # check if a peak was recognized (here for the first time, - # otherwise the next command would fail) - - if(nrow(childPeaks)==0) - { - child@ok <- FALSE - return(child) - } - - # now apply the rule-based filters to get rid of total junk: - # dbe >= -0.5, dbe excess over mother cpd < 3 - # dbe() has been adapted to return NA for NA input - #iff_rcdk_pM_eln$maxvalence <- unlist(lapply(diff_rcdk_pM_eln$formula.rcdk, maxvalence)) - temp.child.ok <- (childPeaks$dbe >= filterSettings$dbeMinLimit) - # & dbe < dbe_parent + 3) - # check if a peak was recognized - if(length(which(temp.child.ok)) == 0) - { - child@ok <- FALSE - return(child) - } - #browser() - # find the best ppm value - bestPpm <- aggregate(as.data.frame(childPeaks[!is.na(childPeaks$dppm),"dppm"]), - list(childPeaks[!is.na(childPeaks$dppm),"row"]), - function(dppm) dppm[[which.min(abs(dppm))]]) - colnames(bestPpm) <- c("row", "dppmBest") - childPeaks <- merge(childPeaks, bestPpm, by="row", all.x=TRUE) - - # Deactivated the following lines because we never actually want to look at the "old" formula count. - # To be verified (cf Refiltering, failpeak list and comparable things) - - ## # count formulas found per mass - ## countFormulasTab <- xtabs( ~formula + mz, data=childPeaks) - ## countFormulas <- colSums(countFormulasTab) - ## childPeaks$formulaCount <- countFormulas[as.character(childPeaks$row)] - - # filter results - childPeaksFilt <- filterLowaccResults(childPeaks, filterMode, filterSettings) - childPeaksGood <- childPeaksFilt[["TRUE"]] - childPeaksBad <- childPeaksFilt[["FALSE"]] - if(is.null(childPeaksGood)){ - childPeaksGood <- childPeaks[c(),,drop=FALSE] - childPeaksGood$good <- logical(0) - } - if(is.null(childPeaksBad)) - childPeaksBad <- childPeaks[c(),,drop=FALSE] - childPeaksUnassigned <- childPeaks[is.na(childPeaks$dppm),,drop=FALSE] - childPeaksUnassigned$good <- rep(FALSE, nrow(childPeaksUnassigned)) - # count formulas within new limits - # (the results of the "old" count stay in childPeaksInt and are returned - # in $childPeaks) - countFormulasTab <- xtabs( ~formula + mz, data=childPeaksGood) - countFormulas <- colSums(countFormulasTab) - childPeaksGood$formulaCount <- countFormulas[as.character(childPeaksGood$mz)] - - childPeaksUnassigned$formulaCount <- rep(NA, nrow(childPeaksUnassigned)) - childPeaksBad$formulaCount <- rep(NA, nrow(childPeaksBad)) - childPeaksBad$good <- rep(FALSE, nrow(childPeaksBad)) - - # Now: childPeaksGood (containing the new, recounted peaks with good = TRUE), and childPeaksBad (containing the - # peaks with good=FALSE, i.e. outside filter criteria, with the old formula count even though it is worthless) - # are bound together. - childPeaksBad <- childPeaksBad[,colnames(childPeaksGood),drop=FALSE] - childPeaksUnassigned <- childPeaksUnassigned[,colnames(childPeaksGood),drop=FALSE] - childPeaks <- rbind(childPeaksGood, childPeaksBad, childPeaksUnassigned) - - # Now let's cross fingers. Add a good=NA column to the unmatched peaks and reorder the columns - # to match order in childPeaks. After that, setData to the child slot. - - childPeaksOmitted <- getData(child) - childPeaksOmitted <- childPeaksOmitted[child@low | child@satellite,,drop=FALSE] - childPeaksOmitted$rawOK <- rep(FALSE, nrow(childPeaksOmitted)) - childPeaksOmitted$good <- rep(FALSE, nrow(childPeaksOmitted)) - childPeaksOmitted$dppm <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$formula <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$mzCalc <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$dbe <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$dppmBest <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$formulaCount <- rep(0, nrow(childPeaksOmitted)) - childPeaks$satellite <- rep(FALSE, nrow(childPeaks)) - childPeaks$low <- rep(FALSE, nrow(childPeaks)) - childPeaks$rawOK <- rep(TRUE, nrow(childPeaks)) - - childPeaks <- childPeaks[,colnames(childPeaksOmitted), drop=FALSE] - - childPeaksTotal <- rbind(childPeaks, childPeaksOmitted) - child <- setData(child, childPeaksTotal) - child@ok <- TRUE - - return(child) - } - - # I believe these lines were fixed to remove a warning but in the refactored workflow "mzranges" doesn't exist anymore. - # Leave here for now - ## mzranges <- t(sapply(shots, function(p) { - ## if(!is.null(p$childRaw)){ - ## return(range(p$childRaw[,mzColname])) - ## } else { - ## return(c(NA,NA)) - ## } - ## })) - ## - ## mzmin <- min(mzranges[,1], na.rm=TRUE) - ## mzmax <- max(mzranges[,2], na.rm=TRUE) - children <- lapply(msmsPeaks@children, analyzeTandemShot) - - - - -## shots <- mapply(function(shot, scan, info) - ## { - ## shot$scan <- scan - ## shot$info <- info - ## shot$header <- msmsPeaks$childHeaders[as.character(scan),] - ## return(shot) - ## }, shots, msmsPeaks$childScans, spectraList, SIMPLIFY=FALSE) - msmsPeaks@children <- as(children, "SimpleList") - return(msmsPeaks) -} - - -#' @describeIn analyzeMsMs Analyze the peaks going only by intensity values -#' @export -analyzeMsMs.intensity <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", - filterSettings = getOption("RMassBank")$filterSettings) -{ - cut <- 0 - cut_ratio <- 0 - if(run=="preliminary") - { - filterMode <- "coarse" - cut <- filterSettings$prelimCut - if(is.na(cut)) - { - if(mode %in% c("pH", "pM", "pNa", "pNH4")) - cut <- 1e4 - else if(mode %in% c("mH", "mFA", "mM")) - cut <- 0 - else stop(paste("The ionization mode", mode, "is unknown.")) - } - cutRatio <- filterSettings$prelimCutRatio - } - else - { - filterMode <- "fine" - cut <- filterSettings$fineCut - cut_ratio <- filterSettings$fineCutRatio - if(is.na(cut)) cut <- 0 - } - - # find whole spectrum of parent peak, so we have reasonable data to feed into - - - # On each spectrum the following function analyzeTandemShot will be applied. - # It takes the raw peaks matrix as argument (mz, int) and processes the spectrum by - # filtering out low-intensity (<1e4) and shoulder peaks (deltam/z < 0.5, intensity - # < 5%) and subsequently matching the peaks to formulas using Rcdk, discarding peaks - # with insufficient match accuracy or no match. - analyzeTandemShot <- function(child) - { - shot <- getData(child) - shot$row <- which(!is.na(shot$mz)) - - # Filter out low intensity peaks: - child@low <- (shot$intensity < cut) | (shot$intensity < max(shot$intensity)*cut_ratio) - shot_full <- shot - shot <- shot[!child@low,,drop=FALSE] - - - # Is there still anything left? - if(length(which(!child@low))==0) - { - child@ok <- FALSE - return(child) - } - - # Filter out satellite peaks: - shot <- filterPeakSatellites(shot, filterSettings) - child@satellite <- rep(TRUE, child@peaksCount) - child@satellite[which(child@low == TRUE)] <- NA - child@satellite[shot$row] <- FALSE - - # Is there still anything left? - if(nrow(shot)==0) - { - child@ok <- FALSE - return(child) - } - - if(max(shot$intensity) < as.numeric(filterSettings$specOkLimit)) - { - child@ok <- FALSE - return(child) - } - - - # here follows the fake analysis - #------------------------------------ - parentPeaks <- data.frame(mzFound=msmsPeaks@mz, - formula=msmsPeaks@formula, - dppm=0, - x1=0,x2=0,x3=0) - - childPeaks <- addProperty(shot_full, "rawOK", "logical", FALSE) - childPeaks[!(child@low | child@satellite),"rawOK"] <- TRUE - - childPeaks <- addProperty(childPeaks, "good", "logical", FALSE) - childPeaks[childPeaks$rawOK,"good"] <- TRUE - - childPeaks <- addProperty(childPeaks, "mzCalc", "numeric") - childPeaks[childPeaks$rawOK,"mzCalc"] <- childPeaks[childPeaks$rawOK,"mz"] - - childPeaks <- addProperty(childPeaks, "formula", "character") - childPeaks[childPeaks$rawOK,"formula"] <- "" - - childPeaks <- addProperty(childPeaks, "dbe", "numeric") - childPeaks[childPeaks$rawOK,"dbe"] <- 0 - - childPeaks <- addProperty(childPeaks, "formulaCount", "integer") - childPeaks[childPeaks$rawOK,"formulaCount"] <- 1 - - childPeaks <- addProperty(childPeaks, "dppm", "numeric") - childPeaks[childPeaks$rawOK,"dppm"] <- 0 - - childPeaks <- addProperty(childPeaks, "dppmBest", "numeric") - childPeaks[childPeaks$rawOK,"dppmBest"] <- 0 - - child <- setData(child, childPeaks) - child@ok <- TRUE - return(child) - } - children <- lapply(msmsPeaks@children, analyzeTandemShot) - msmsPeaks@children <- as(children, "SimpleList") - #browser() - - return(msmsPeaks) - - # Omit all the stuff below for now, I don't believe it is needed. One thing is that spectraList info will have to be added somewhere else. - ## shots <- mapply(function(shot, scan, info) - ## { - ## shot$scan <- scan - ## shot$info <- info - ## shot$header <- msmsPeaks$childHeaders[as.character(scan),] - ## return(shot) - ## }, shots, msmsPeaks$childScans, spectraList, SIMPLIFY=FALSE) - ## - ## mzranges <- t(sapply(shots, function(p) {return(range(p$childRaw[,mzColname]))})) - ## mzmin <- min(mzranges[,1], na.rm=TRUE) - ## mzmax <- max(mzranges[,2], na.rm=TRUE) - ## - ## return(list( - ## msmsdata=shots, - ## mzrange=c(mzmin, mzmax), - ## id=msmsPeaks$id, - ## mode=mode, - ## parentHeader = msmsPeaks$parentHeader, - ## parentMs = msmsPeaks$parentPeak, - ## formula = msmsPeaks$formula, - ## foundOK = TRUE)) -} - - -#' Filter peaks with low accuracy -#' -#' Filters a peak table (with annotated formulas) for accuracy. Low-accuracy -#' peaks are removed. -#' -#' In the \code{coarse} mode, mass tolerance is set to 10 ppm (above m/z 120) -#' and 15 ppm (below m/z 120). This is useful for formula assignment before -#' recalibration, where a wide window is desirable to accomodate the high mass -#' deviations at low m/z values, so we get a nice recalibration curve. -#' -#' In the \code{fine} run, the mass tolerance is set to 5 ppm over the whole -#' mass range. This should be applied after recalibration. -#' -#' @usage filterLowaccResults(peaks, mode="fine", filterSettings = getOption("RMassBank")$filterSettings) -#' @param peaks A data frame with at least the columns \code{mzFound} and -#' \code{dppm}. -#' @param mode \code{coarse} or \code{fine}, see below. -#' @param filterSettings Settings for filtering. For details, see documentation of -#' \code{\link{analyzeMsMs}} -#' @return A \code{list(TRUE = goodPeakDataframe, FALSE = badPeakDataframe)} is -#' returned: A data frame with all peaks which are "good" is in -#' \code{return[["TRUE"]]}. -#' @author Michael Stravs -#' @seealso \code{\link{analyzeMsMs}}, \code{\link{filterPeakSatellites}} -#' @examples -#' -#' # from analyzeMsMs: -#' \dontrun{childPeaksFilt <- filterLowaccResults(childPeaksInt, filterMode)} -#' -#' -filterLowaccResults <- function(peaks, mode="fine", filterSettings = getOption("RMassBank")$filterSettings) -{ - # Check if filter settings are properly set, otherwise use defaults - if(is.null(filterSettings)) - { - filterSettings <- list( - ppmHighMass = 10, - ppmLowMass = 15, - massRangeDivision = 120, - ppmFine = 5) - } - - peaks$good = NA - peaks[!is.na(peaks$dppm), "good"] <- TRUE - - # coarse mode: to use for determinating the recalibration function - if(mode=="coarse") - { - if(nrow(peaks[which(abs(peaks$dppm) > filterSettings$ppmHighMass),])>0) - peaks[which(abs(peaks$dppm) > filterSettings$ppmHighMass), "good"] <- FALSE - if(nrow(peaks[which(peaks$mz > filterSettings$massRangeDivision & abs(peaks$dppm) > filterSettings$ppmLowMass),])>0) - peaks[which(peaks$mz > filterSettings$massRangeDivision & abs(peaks$dppm) > filterSettings$ppmLowMass), "good"] <- FALSE - } - # fine mode: for use after recalibration - else - { - if(nrow(peaks[which(abs(peaks$dppm) > filterSettings$ppmFine),]) > 0) - peaks[which(abs(peaks$dppm) > filterSettings$ppmFine), "good"] <- FALSE - } - return(split(peaks, peaks$good)) -} - -#' Aggregate analyzed spectra -#' -#' Groups an array of analyzed spectra and creates aggregated peak tables -#' -#' \code{\var{addIncomplete}} is relevant for recalibration. For recalibration, -#' we want to use only high-confidence peaks, therefore we set -#' \code{\var{addIncomplete}} to \code{FALSE}. When we want to generate a peak -#' list for actually generating MassBank records, we want to include all peaks -#' into the peak tables. -#' -#' @usage aggregateSpectra(spec, addIncomplete=FALSE) -#' @param spec The \code{RmbSpectraSetList} of spectra sets (\code{RmbSpectraSet} objects) to aggregate -#' @param addIncomplete Whether or not the peaks from incomplete files (files -#' for which less than the maximal number of spectra are present) -#' @return -#' A summary \code{data.frame} with all peaks (possibly multiple rows for one m/z value from a spectrum, see below) with columns: -#' \item{mzFound, intensity}{Mass and intensity of the peak} -#' \item{good}{if the peak passes filter criteria} -#' \item{mzCalc, formula, dbe, dppm}{calculated mass, formula, dbe and ppm deviation of the assigned formula} -#' \item{formulaCount, dppmBest}{Number of matched formulae for this m/z value, and ppm deviation of the best match} -#' \item{scan, cpdID, parentScan}{Scan number of the child and parent spectrum in the raw file, also the compound ID to which the peak belongs} -#' \item{dppmRc}{ppm deviation recalculated from the aggregation function} -#' \item{index}{Aggregate-table peak index, so the table can be subsetted, edited and results reinserted back into this table easily} -#' Further columns are later added by workflow steps 6 (electronic noise culler), 7 and 8. -#' -#' @author Michael Stravs -#' @seealso \code{\link{msmsWorkflow}}, \code{\link{analyzeMsMs}} -#' @examples -#' -#' ## As used in the workflow: -#' \dontrun{% -#' w@@spectra <- lapply(w@@spectra, function(spec) -#' analyzeMsMs(spec, mode="pH", detail=TRUE, run="recalibrated", cut=0, cut_ratio=0 ) ) -#' w@@aggregate <- aggregateSpectra(w@@spectra) -#' } -#' -#' @export -aggregateSpectra <- function(spec, addIncomplete=FALSE) -{ - - if(addIncomplete) - aggSpectra <- selectSpectra(spec, "found", "object") - else - aggSpectra <- selectSpectra(spec, "complete", "object") - - compoundTables <- lapply(aggSpectra, function(s) - { - tables.c <- lapply(s@children, function(c) - { - table.c <- getData(c) - table.c <- table.c[table.c$rawOK,,drop=FALSE] - # remove superfluous columns, since only rawOK peaks are selected anyway - table.c$rawOK <- NULL - table.c$low <- NULL - table.c$satellite <- NULL - # add scan no - table.c$scan <- rep(c@acquisitionNum, nrow(table.c)) - return(table.c) - }) - table.cpd <- do.call(rbind, tables.c) - table.cpd$cpdID <- rep(s@id, nrow(table.cpd)) - table.cpd$parentScan <- rep(s@parent@acquisitionNum, nrow(table.cpd)) - return(table.cpd) - }) - #return(compoundTables) - aggTable <- do.call(rbind, compoundTables) - colnames(aggTable)[1] <- "mzFound" - - aggTable <- addProperty(aggTable, "dppmRc", "numeric") - aggTable <- addProperty(aggTable, "index", "integer") - if(nrow(aggTable) > 0) - aggTable$index <- 1:nrow(aggTable) - - aggTable[aggTable$good, "dppmRc"] <- (aggTable[aggTable$good, "mzFound"]/aggTable[aggTable$good, "mzCalc"] - 1)*1e6 - - - return(aggTable) -} - -#' Remove electronic noise -#' -#' Removes known electronic noise peaks from a peak table -#' -#' @usage cleanElnoise(peaks, noise=getOption("RMassBank")$electronicNoise, -#' width = getOption("RMassBank")$electronicNoiseWidth) -#' @param peaks An aggregated peak frame as described in \code{\link{aggregateSpectra}}. Columns -#' \code{mzFound}, \code{dppm} and \code{dppmBest} are needed. -#' @param noise A numeric vector of known m/z of electronic noise peaks from the instrument -#' Defaults to the entries in the RMassBank settings. -#' @param width The window for the noise peak in m/z units. Defaults to the entries in -#' the RMassBank settings. -#' @return Extends the aggregate data frame by column \code{noise} (logical), which is \code{TRUE} if the peak is marked as noise. -#' -#' @author Michael Stravs -#' @seealso \code{\link{msmsWorkflow}} -#' @examples -#' # As used in the workflow: -#' \dontrun{ -#' w@@aggregated <- -#' cleanElnoise(w@@aggregated) -#' } -#' @export -cleanElnoise <- function(peaks, noise=getOption("RMassBank")$electronicNoise, - width = getOption("RMassBank")$electronicNoiseWidth) -{ - - peaks <- addProperty(peaks, "noise", "logical", FALSE) - - # I don't think this makes sense if using one big table... - ## # use only best peaks - ## p_best <- peaks[is.na(peaks$dppmBest) | (peaks$dppm == peaks$dppmBest),] - - # remove known electronic noise - p_eln <- peaks - for(noisePeak in noise) - { - noiseMatches <- which(!((p_eln$mzFound > noisePeak + width) | (p_eln$mzFound < noisePeak - width))) - if(length(noiseMatches) > 0) - p_eln[noiseMatches, "noise"] <- TRUE - } - return(p_eln) -} - -#' Identify intense peaks (in a list of unmatched peaks) -#' -#' Finds a list of peaks in spectra with a high relative intensity (>10% and -#' 1e4, or >1% and 1e5) to write a list of peaks which must be manually -#' checked. Peaks orbiting around the parent peak mass (calculated from the -#' compound ID), which are very likely co-isolated substances, are ignored. -#' -#' -#' @usage problematicPeaks(peaks_unmatched, peaks_matched, mode = "pH") -#' @param peaks_unmatched Table of unmatched peaks, with at least \code{cpdID, -#' scan, mzFound, int}. -#' @param peaks_matched Table of matched peaks (used for base peak reference), -#' with at least \code{cpdID, scan, int}. -#' @param mode Processing mode (\code{"pH", "pNa"} etc.) -#' @return A filtered table with the potentially problematic peaks, including -#' the precursor mass and MSMS base peak intensity (\code{aMax}) for reference. -#' @author Michael Stravs -#' @seealso \code{\link{msmsWorkflow}} -#' @examples \dontrun{ -#' # As used in the workflow: -#' fp <- problematicPeaks(specs[!specs$filterOK & !specs$noise & -#' ((specs$dppm == specs$dppmBest) | (is.na(specs$dppmBest))) -#' ,,drop=FALSE], peaksMatched(w), mode) -#' } -#' @export -problematicPeaks <- function(peaks_unmatched, peaks_matched, mode="pH") -{ - # find spectrum maximum for each peak, and merge into table - if(nrow(peaks_matched) == 0){ - assIntMax <- data.frame(list(integer(0),integer(0),integer(0))) - } else{ - assIntMax <- as.data.frame(aggregate(as.data.frame(peaks_matched$intensity), - by=list(peaks_matched$cpdID, peaks_matched$scan), max)) - } - colnames(assIntMax) <- c("cpdID", "scan", "aMax") - peaks_unmatched <- merge(peaks_unmatched, assIntMax) - # which of these peaks are intense? - p_control <- peaks_unmatched[ - ( (peaks_unmatched$intensity > 1e5) & - (peaks_unmatched$intensity > 0.01*peaks_unmatched$aMax)) - | ( (peaks_unmatched$intensity > 1e4) & - (peaks_unmatched$intensity > 0.1* peaks_unmatched$aMax)) ,] - # find parent m/z to exclude co-isolated peaks - #p_control$mzCenter <- numeric(nrow(p_control)) - p_control$mzCenter <- as.numeric( - unlist(lapply(p_control$cpdID, function(id) findMz(id, mode, retrieval=findLevel(id,TRUE))$mzCenter)) ) - p_control_noMH <- p_control[ - (p_control$mzFound < p_control$mzCenter - 1) | - (p_control$mzFound > p_control$mzCenter + 1),] - return(p_control_noMH) -} - - -#' Generate list of problematic peaks -#' -#' Generates a list of intense unmatched peaks for further review (the "failpeak list") and exports it if the archive name is given. -#' -#' @param w \code{msmsWorkspace} to analyze. -#' @param mode Processing mode (pH etc) -#' @param archivename Base name of the archive to write to (for "abc" the exported failpeaks list will be "abc_Failpeaks.csv"). -#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" -#' if the only know thing is the m/z -#' @return Returns the aggregate data.frame with added column "\code{problematic}" (logical) which marks peaks which match the problematic criteria -#' -#' @author stravsmi -#' @export -processProblematicPeaks <- function(w, mode, archivename = NA) -{ - - specs <- w@aggregated - fp <- problematicPeaks(specs[!specs$filterOK & !specs$noise & - ((specs$dppm == specs$dppmBest) | (is.na(specs$dppmBest))) - ,,drop=FALSE], peaksMatched(w), mode) - fp$OK <- rep('', nrow(fp)) - fp$name <- rownames(fp) - - fp <- fp[with(fp, - order(cpdID, mzCalc, scan)), - ] - - # Select the correct precursor scans. This serves to filter the list - # for the cases where multiple workspaces were combined after step 7 - # with combineMultiplicities. - # Note that this has drawbacks. Leaving the "duplicates" in would make it more easy - # to identify legitimate unformulaed peaks. We might experiment by marking them up - # somehow. - precursors <- unlist(lapply(selectSpectra(w, "found", "object"), function(s) s@parent@acquisitionNum)) - fp <- fp[ - fp$parentScan %in% precursors - ,] - - # Add the info to specs - specs <- addProperty(specs, "problematicPeak", "logical", FALSE) - specs[match(fp$index, specs$index),"problematicPeak"] <- TRUE - - # Select the columns for output into the failpeaks file - fp <- fp[,c("OK", "name", "cpdID", "scan", "mzFound", "formula", - "reanalyzed.formula", "mzCalc", "reanalyzed.mzCalc", "dppm", "reanalyzed.dppm", "dbe", "reanalyzed.dbe", "intensity", - "formulaCount", "reanalyzed.formulaCount", "parentScan", "aMax", "mzCenter")] - if(!is.na(archivename)) - write.csv(fp, file= - paste(archivename,"_Failpeaks.csv", sep=''), row.names=FALSE) - - return(specs) -} - -#' Recalibrate MS/MS spectra -#' -#' Recalibrates MS/MS spectra by building a recalibration curve of the -#' assigned putative fragments of all spectra in \code{aggregatedSpecs} -#' (measured mass vs. mass of putative associated fragment) and additionally -#' the parent ion peaks. -#' -#' Note that the actually used recalibration functions are governed by the -#' general MassBank settings (see \code{\link{recalibrate}}). -#' -#' If a set of acquired LC-MS runs contains spectra for two different ion types -#' (e.g. [M+H]+ and [M+Na]+) which should both be processed by RMassBank, it is -#' necessary to do this in two separate runs. Since it is likely that one ion type -#' will be the vast majority of spectra (e.g. most in [M+H]+ mode), and only few -#' spectra will be present for other specific adducts (e.g. only few [M+Na]+ spectra), -#' it is possible that too few spectra are present to build a good recalibration curve -#' using only e.g. the [M+Na]+ ions. Therefore we recommend, for one set of LC/MS runs, -#' to build the recalibration curve for one ion type -#' (\code{msmsWorkflow(mode="pH", steps=c(1:8), newRecalibration=TRUE)}) -#' and reuse the same curve for processing different ion types -#' (\code{msmsWorkflow(mode="pNa", steps=c(1:8), newRecalibration=FALSE)}). -#' This also ensures a consistent recalibration across all spectra of the same batch. -#' -#' @usage makeRecalibration(w, mode, -#' recalibrateBy = getOption("RMassBank")$recalibrateBy, -#' recalibrateMS1 = getOption("RMassBank")$recalibrateMS1, -#' recalibrator = getOption("RMassBank")$recalibrator, -#' recalibrateMS1Window = getOption("RMassBank")$recalibrateMS1Window -#' ) -#' -#' recalibrateSpectra(mode, rawspec = NULL, rc = NULL, rc.ms1=NULL, w = NULL, -#' recalibrateBy = getOption("RMassBank")$recalibrateBy, -#' recalibrateMS1 = getOption("RMassBank")$recalibrateMS1) -#' -#' recalibrateSingleSpec(spectrum, rc, -#' recalibrateBy = getOption("RMassBank")$recalibrateBy) -#' @aliases makeRecalibration recalibrateSpectra recalibrateSingleSpec -#' @param w For \code{makeRecalibration}: to perform the recalibration with. For \code{recalibrateSpectra}: -#' the \code{msmsWorkspace} which contains the recalibration curves (alternatively to specifying \code{rc, rc.ms1}). -#' @param spectrum For \code{recalibrateSingleSpec}: -#' a \code{MSnbase} \code{Spectrum}-derived object, commonly a \code{RmbSpectrum2} for MS2 or \code{Spectrum1} for MS1. -#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-). -#' @param rawspec For \code{recalibrateSpectra}:an \code{RmbSpectraSetList} of \code{RmbSpectraSet} objects -#' , as the \code{w@@spectra} slot from \code{msmsWorkspace} or any object returned by \code{\link{findMsMsHR}}. -#' If empty, no spectra are recalibrated, but the recalibration curve is -#' returned. -#' @param rc,rc.ms1 The recalibration curves to be used in the recalibration. -#' @param recalibrateBy Whether recalibration should be done by ppm ("ppm") or by m/z ("mz"). -#' @param recalibrateMS1 Whether MS1 spectra should be recalibrated separately ("separate"), -#' together with MS2 ("common") or not at all ("none"). Usually taken from settings. -#' @param recalibrator The recalibrator functions to be used. -#' Refer to \code{\link{recalibrate}} for details. Usually taken from settings. -#' @param recalibrateMS1Window Window width to look for MS1 peaks to recalibrate (in ppm). -#' @return \code{makeRecalibration}: a \code{list(rc, rc.ms1)} with recalibration curves -#' for the MS2 and MS1 spectra. -#' -#' \code{recalibrateSpectra}: if \code{rawspec} is not \code{NULL}, returns the recalibrated -#' spectra as \code{RmbSpectraSetList}. All spectra have their mass recalibrated and evaluation data deleted. -#' -#' \code{recalibrateSingleSpec}: the recalibrated \code{Spectrum} (same object, recalibrated masses, -#' evaluation data like assigned formulae etc. deleted). -#' -#' @examples \dontrun{ -#' rcCurve <- recalibrateSpectra(w, "pH") -#' w@@spectra <- recalibrateSpectra(mode="pH", rawspec=w@@spectra, w=myWorkspace) -#' w@@spectra <- recalibrateSpectra(mode="pH", rawspec=w@@spectra, rcCurve$rc, rcCurve$rc.ms1) -#' } -#' -#' @author Michael Stravs, Eawag -#' @export -makeRecalibration <- function(w, mode, - recalibrateBy = getOption("RMassBank")$recalibrateBy, - recalibrateMS1 = getOption("RMassBank")$recalibrateMS1, - recalibrator = getOption("RMassBank")$recalibrator, - recalibrateMS1Window = getOption("RMassBank")$recalibrateMS1Window - ) -{ - if(is.null(w@spectra)) - stop("No spectra present to generate recalibration curve.") - - rcdata <- peaksMatched(w) - rcdata <- rcdata[rcdata$formulaCount == 1, ,drop=FALSE] - - rcdata <- rcdata[,c("mzFound", "dppm", "mzCalc")] - - if(nrow(rcdata) == 0) - stop("No peaks matched to generate recalibration curve.") - - ms1data <- recalibrate.addMS1data(w@spectra, mode, recalibrateMS1Window) - ms1data <- ms1data[,c("mzFound", "dppm", "mzCalc")] - - if (recalibrateMS1 != "none") { - ## Add m/z values from MS1 to calibration datapoints - rcdata <- rbind(rcdata, ms1data) - } - - rcdata$dmz <- rcdata$mzFound - rcdata$mzCalc - ms1data$dmz <- ms1data$mzFound - ms1data$mzCalc - - if(recalibrateBy == "dppm") - { - rcdata$recalfield <- rcdata$dppm - ms1data$recalfield <- ms1data$dppm - } - else - { - rcdata$recalfield <- rcdata$dmz - ms1data$recalfield <- ms1data$dmz - } - - # generate recalibration model - rc <- do.call(recalibrator$MS2, list(rcdata)) - if(recalibrateMS1 == "separate") - rc.ms1 <- do.call(recalibrator$MS1, list(ms1data)) - else - rc.ms1 <- rc - - # plot the model - par(mfrow=c(2,2)) - if(nrow(rcdata)>0) - plotRecalibration.direct(rcdata, rc, rc.ms1, "MS2", - range(rcdata$mzFound), - recalibrateBy) - if(nrow(ms1data)>0) - plotRecalibration.direct(ms1data, rc, rc.ms1, "MS1", - range(ms1data$mzFound), - recalibrateBy) - # Return the computed recalibration curves - return(list(rc=rc, rc.ms1=rc.ms1)) -} - - - -#' Plot the recalibration graph. -#' -#' @aliases plotRecalibration plotRecalibration.direct -#' @usage plotRecalibration(w, recalibrateBy = getOption("RMassBank")$recalibrateBy) -#' -#' plotRecalibration.direct(rcdata, rc, rc.ms1, title, mzrange, -#' recalibrateBy = getOption("RMassBank")$recalibrateBy) -#' -#' @param w The workspace to plot the calibration graph from -#' @param rcdata A data frame with columns \code{recalfield} and \code{mzFound}. -#' @param rc Predictor for MS2 data -#' @param rc.ms1 Predictor for MS1 data -#' @param title Prefix for the graph titles -#' @param mzrange m/z value range for the graph -#' @param recalibrateBy Whether recalibration was done by ppm ("ppm") or by m/z ("mz"). -#' Important only for graph labeling here. -#' -#' @author Michele Stravs, Eawag -#' @export -plotRecalibration <- function(w, recalibrateBy = getOption("RMassBank")$recalibrateBy) -{ - spec <- w@aggregated - if(!is.null(w@parent)) - spec <- w@parent@aggregated - - rcdata <- data.frame(mzFound = w@rc$x, recalfield = w@rc$y) - ms1data <- data.frame(mzFound = w@rc.ms1$x, recalfield = w@rc.ms1$y) - - - - par(mfrow=c(2,2)) - if(nrow(rcdata)>0) - plotRecalibration.direct(rcdata, w@rc, w@rc.ms1, "MS2", - range(spec$mzFound[which(spec$good)]),recalibrateBy) - if(nrow(ms1data)>0) - plotRecalibration.direct(ms1data, w@rc, w@rc.ms1, "MS1", - range(ms1data$mzFound),recalibrateBy) - -} - -#' @export -plotRecalibration.direct <- function(rcdata, rc, rc.ms1, title, mzrange, - recalibrateBy = getOption("RMassBank")$recalibrateBy - ) -{ - if(recalibrateBy == "dppm") - ylab.plot <- expression(paste(delta, "ppm")) - else - ylab.plot <- expression(paste(delta, "m/z")) - - plot(recalfield ~ mzFound, data=rcdata, - xlab = "m/z", ylab = ylab.plot, main=paste(title, "scatterplot")) - RcModelMz <- seq(mzrange[[1]], mzrange[[2]], by=0.2) - RcModelRecal <- predict(rc, newdata= data.frame(mzFound =RcModelMz)) - RcModelRecalMs1 <- predict(rc.ms1, newdata= data.frame(mzFound =RcModelMz)) - lines(RcModelMz, RcModelRecal, col="blue") - lines(RcModelMz, RcModelRecalMs1, col="yellow") - if((length(unique(rcdata$mzFound))>1) & - (length(unique(rcdata$recalfield))>1)) - { - if(requireNamespace("gplots",quietly=TRUE)) - { - - gplots::hist2d(rcdata$mzFound, rcdata$recalfield, - col=c("white", heat.colors(12)), xlab="m/z", - ylab = ylab.plot, main=paste(title, "density")) - lines(RcModelMz, RcModelRecal, col="blue") - lines(RcModelMz, RcModelRecalMs1, col="yellow") - } - else - { - message("Package gplots not installed. The recalibration density plot will not be displayed.") - message("To install gplots: install.packages('gplots')") - } - } -} - - -#' @export -recalibrateSpectra <- function(mode, rawspec = NULL, rc = NULL, rc.ms1=NULL, w = NULL, - recalibrateBy = getOption("RMassBank")$recalibrateBy, - recalibrateMS1 = getOption("RMassBank")$recalibrateMS1) -{ - # Load the recal curves from the workspace if one is specified. - if(!is.null(w)) - { - rc <- w@rc - rc.ms1 <- w@rc.ms1 - } - if(is.null(rc) || is.null(rc.ms1)) - stop("Please specify the recalibration curves either via workspace (w) or via parameters rc, rc.ms1.") - - # Do the recalibration - if(!is.null(rawspec)) - { - # go through all raw spectra and recalculate m/z values - recalibratedSpecs <- lapply(rawspec, function(s) - { - if(s@found) - { - # recalculate tandem spectrum peaks - recalSpectra <- lapply(s@children, function(p) - { - recalibrateSingleSpec(p, rc, recalibrateBy) - }) - s@children <- as(recalSpectra, "SimpleList") - # recalculate MS1 spectrum if required - if(recalibrateMS1 != "none") - { - s@parent <- recalibrateSingleSpec(s@parent, rc.ms1, recalibrateBy) - } - } - s@empty <- NA - s@complete <- NA - return(s) - } ) - return(as(recalibratedSpecs, "SimpleList")) - } - else # no rawspec passed - return(list()) -} - -#' @export -recalibrateSingleSpec <- function(spectrum, rc, - recalibrateBy = getOption("RMassBank")$recalibrateBy) -{ - spectrum.df <- as.data.frame(spectrum) - spectrum.df <- spectrum.df[!duplicated(spectrum.df$mz),,drop=FALSE] - spectrum.df <- spectrum.df[order(spectrum.df$mz),,drop=FALSE] - - mzVals <- spectrum.df - if(nrow(mzVals) > 0) - { - # Fix the column names so our - # prediction functions choose the right - # rows. - colnames(mzVals) <- c("mzFound", "int") - drecal <- predict(rc, newdata=mzVals) - if(recalibrateBy == "dppm") - mzRecal <- mzVals$mzFound / (1 + drecal/1e6) - else - mzRecal <- mzVals$mzFound - drecal - # And rename them back so our "mz" column is - # called "mz" again - } - spectrum.df$mz <- mzRecal - - - # now comes the part that I don't like too much; this could be improved by using as.data.frame instead of getData and correspondingly - # also not use setData. For now I leave it like this. - # The problem is that I am not sure whether the default behaviour of as.RmbSpectrum2 should be clean=TRUE or FALSE, - # and vice versa, I am not sure if as.data.frame should return only mz/int or the whole table. - - if(is(spectrum, "RmbSpectrum2")) - { - # this removes all evaluated data that were added in step 2 except for @ok I think - colnames(spectrum.df) <- c("mz", "intensity") - spectrum <- setData(spectrum, spectrum.df, clean=TRUE) - # It also avoids making a new object when we don't know what class it should be - } - else - { - # for Spectrum1 or all others that we don't know - spectrum@mz <- spectrum.df$mz - spectrum@intensity <- spectrum.df$i - } - - return(spectrum) -} - - - - - -#' Filter satellite peaks -#' -#' Filters satellite peaks in FT spectra which arise from FT artifacts and from -#' conversion to stick mode. A very simple rule is used which holds mostly true -#' for MSMS spectra (and shouldn't be applied to MS1 spectra which contain -#' isotope structures...) -#' -#' The function cuts off all peaks within 0.5 m/z from every peak, in -#' decreasing intensity order, which are below 5% of the referring peak's -#' intensity. E.g. for peaks m/z=100, int=100; m/z=100.2, int=2, m/z=100.3, -#' int=6, m/z 150, int=10: The most intense peak (m/z=100) is selected, all -#' neighborhood peaks below 5% are removed (in this case, only the m/z=100.2 -#' peak) and the next less intense peak is selected. Here this is the m/z=150 -#' peak. All low-intensity neighborhood peaks are removed (nothing). The next -#' less intense peak is selected (m/z=100.3) and again neighborhood peaks are -#' cut away (nothing to cut here. Note that the m/z = 100.2 peak was alredy -#' removed.) -#' -#' @usage filterPeakSatellites(peaks, filterSettings = getOption("RMassBank")$filterSettings) -#' @param peaks A peak dataframe with at least the columns \code{mz, int}. Note -#' that \code{mz} is used even for the recalibrated spectra, i.e. the -#' desatellited spectrum is identical for both the unrecalibrated and the -#' recalibrated spectra. -#' @param filterSettings The settings used for filtering. Refer to \code{\link{analyzeMsMs}} -#' documentation for filter settings. -#' @return Returns the peak table with satellite peaks removed. -#' @note This is a very crude rule, but works remarkably well for our spectra. -#' @author Michael Stravs -#' @seealso \code{\link{analyzeMsMs}}, \code{\link{filterLowaccResults}} -#' @examples -#' -#' # From the workflow: -#' \dontrun{ -#' # Filter out satellite peaks: -#' shot <- filterPeakSatellites(shot) -#' shot_satellite_n <- setdiff(row.names(shot_full), row.names(shot)) -#' shot_satellite <- shot_full[shot_satellite_n,] -#' # shot_satellite contains the peaks which were eliminated as satellites. -#' } -#' -#' @export -filterPeakSatellites <- function(peaks, filterSettings = getOption("RMassBank")$filterSettings) -{ - cutoff_int_limit <- filterSettings$satelliteIntLimit - cutoff_mz_limit <- filterSettings$satelliteMzLimit - # Order by intensity (descending) - peaks_o <- peaks[order(peaks$intensity, decreasing=TRUE),,drop=FALSE] - n <- 1 - # As long as there are peaks left AND the last peak is small enough (relative - # to selected), move to the next peak - while(n < nrow(peaks_o)) - { - if(peaks_o[nrow(peaks_o),"intensity"] >= cutoff_int_limit *peaks_o[n,"intensity"]) - break - # remove all peaks within cutoff_mz_limit (std. m/z = 0.5) which have intensity - # of less than 5% relative to their "parent" peak - # - peaks_l <- peaks_o[ (peaks_o$mz > peaks_o[n,"mz"] - cutoff_mz_limit) - & (peaks_o$mz < peaks_o[n,"mz"] + cutoff_mz_limit) - & (peaks_o$intensity < cutoff_int_limit * peaks_o[n,"intensity"]),,drop=FALSE] - peaks_o <- peaks_o[ !((peaks_o$mz > peaks_o[n,"mz"] - cutoff_mz_limit) - & (peaks_o$mz < peaks_o[n,"mz"] + cutoff_mz_limit) - & (peaks_o$intensity < cutoff_int_limit * peaks_o[n,"intensity"]) - ),,drop=FALSE] - n <- n+1 - } - return(peaks_o[order(peaks_o$mz),,drop=FALSE]) -} - - -#' Reanalyze unmatched peaks -#' -#' Reanalysis of peaks with no matching molecular formula by allowing -#' additional elements (e.g. "N2O"). -#' -#' \code{reanalyzeFailpeaks} examines the \code{unmatchedPeaksC} table in -#' \code{specs} and sends every peak through \code{reanalyzeFailpeak}. -#' -#' @aliases reanalyzeFailpeaks reanalyzeFailpeak -#' @usage reanalyzeFailpeaks(aggregated, custom_additions, mode, filterSettings = -#' getOption("RMassBank")$filterSettings, progressbar = "progressBarHook") -#' reanalyzeFailpeak(custom_additions, mass, cpdID, counter, pb = NULL, mode, -#' filterSettings = getOption("RMassBank")$filterSettings) -#' @param aggregated A peake aggregate table (\code{w@@aggregate}) (after processing electronic noise removal!) -#' @param custom_additions The allowed additions, e.g. "N2O". -#' @param mode Processing mode (\code{"pH", "pNa", "mH"} etc.) -#' @param mass (Usually recalibrated) m/z value of the peak. -#' @param cpdID Compound ID of this spectrum. -#' @param counter Current peak index (used exclusively for the progress -#' indicator) -#' @param pb A progressbar object to display progress on, as passed by -#' \code{reanalyzeFailpeaks} to \code{reanalyzeFailpeak}. No progress -#' is displayed if NULL. -#' @param progressbar The progress bar callback to use. Only needed for specialized -#' applications. Cf. the documentation of \code{\link{progressBarHook}} for usage. -#' @param filterSettings Settings for filtering data. Refer to\code{\link{analyzeMsMs}} for settings. -#' @return The aggregate data frame extended by the columns: -#' #' \item{reanalyzed.???}{If reanalysis (step 7) has already been processed: matching values from the reanalyzed peaks} -#' \item{matchedReanalysis}{Whether reanalysis has matched (\code{TRUE}), not matched(\code{FALSE}) or has not been conducted for the peak(\code{NA}).} -#' -#' It would be good to merge the analysis functions of \code{analyzeMsMs} with -#' the one used here, to simplify code changes. -#' @author Michael Stravs -#' @seealso \code{\link{analyzeMsMs}}, \code{\link{msmsWorkflow}} -#' @examples -#' -#' ## As used in the workflow: -#' \dontrun{ -#' reanalyzedRcSpecs <- reanalyzeFailpeaks(w@@aggregated, custom_additions="N2O", mode="pH") -#' # A single peak: -#' reanalyzeFailpeak("N2O", 105.0447, 1234, 1, 1, "pH") -#' } -#' -#' @export -reanalyzeFailpeaks <- function(aggregated, custom_additions, mode, filterSettings = - getOption("RMassBank")$filterSettings, progressbar = "progressBarHook") -{ - - fp <- peaksUnmatched(aggregated, cleaned=TRUE) - fp <- fp[is.na(fp$dppm) | (fp$dppm == fp$dppmBest),] - #fp <- pu[!pu$noise,,drop=FALSE] - - custom_additions_l <- as.list(rep(x=custom_additions, times=nrow(fp))) - mode_l <- as.list(rep(x=mode, times=nrow(fp))) - nLen <- nrow(fp) - - pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=max(nLen,1))) - temp <- data.frame() - if(nLen == 0) - { - message("reanalyzeFailpeaks: No peaks to reanalyze.") - temp <- data.frame( - "reanalyzed.formula" = character(), - "reanalyzed.mzCalc" = numeric(), - "reanalyzed.dppm" = numeric(), - "reanalyzed.formulaCount" = numeric(), - "reanalyzed.dbe" = numeric()) - } - else - { - counter <- as.list(1:nrow(fp)) - # this is the reanalysis step: run reanalyze.failpeak (with the relevant parameters) - # on each failpeak. - temp <- mapply(reanalyzeFailpeak, custom_additions_l, fp$mzFound, fp$cpdID, counter, - MoreArgs=list(mode=mode, pb=list(hook=progressbar, bar=pb), filterSettings=filterSettings)) - # reformat the result and attach it to specs - temp <- as.data.frame(t(temp)) - temp <- temp[,c("reanalyzed.formula", "reanalyzed.mzCalc", "reanalyzed.dppm", - "reanalyzed.formulaCount", "reanalyzed.dbe")] - } - - # Add columns to the aggregated table (they are then filled in with the obtained values for reanalyzed peaks and left - # empty otherwise - aggregated <- addProperty(aggregated, "reanalyzed.formula", "character") - aggregated <- addProperty(aggregated, "reanalyzed.mzCalc", "numeric") - aggregated <- addProperty(aggregated, "reanalyzed.dppm", "numeric") - aggregated <- addProperty(aggregated, "reanalyzed.formulaCount", "numeric") - aggregated <- addProperty(aggregated, "reanalyzed.dbe", "numeric") - aggregated <- addProperty(aggregated, "matchedReanalysis", "logical", NA) - - - peaksReanalyzed <- cbind(fp, temp) - - # Since some columns are in "list" type, they disturb later on. - # therefore, fix them and make them normal vectors. - listcols <- unlist(lapply(colnames(peaksReanalyzed), function(col) - is.list(peaksReanalyzed[,col]))) - for(col in colnames(peaksReanalyzed)[which(listcols==TRUE)]) - peaksReanalyzed[,col] <- - unlist(peaksReanalyzed[,col]) - - peaksReanalyzed$matchedReanalysis <- !is.na(peaksReanalyzed$reanalyzed.dppm) - - # Substitute in the reanalyzed peaks into the aggregated table - aggregated[match(peaksReanalyzed$index, aggregated$index),] <- peaksReanalyzed - - do.call(progressbar, list(object=pb, close=TRUE)) - return(aggregated) -} - - -#' @export -reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, mode, - filterSettings = getOption("RMassBank")$filterSettings) -{ - # the counter to show the progress - if(!is.null(pb)) - { - do.call(pb$hook, list(object=pb$bar, value=counter)) - } - # here follows the Rcdk analysis - #------------------------------------ - - # define the adduct additions - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else { - stop("mode = \"", mode, "\" not defined") - } - - # the ppm range is two-sided here. - # The range is slightly expanded because dppm calculation of - # generate.formula starts from empirical mass, but dppm cal- - # culation of the evaluation starts from theoretical mass. - # So we don't miss the points on 'the border'. - - db_formula <- findFormula(cpdID, retrieval=findLevel(cpdID,TRUE)) - - ppmlimit <- 2.25 * filterSettings$ppmFine - parent_formula <- add.formula(db_formula, allowed_additions) - parent_formula <- add.formula(parent_formula, custom_additions) - dbe_parent <- dbe(parent_formula) - # check whether the formula is valid, i.e. has no negative or zero element numbers. - #print(parent_formula) - limits <- to.limits.rcdk(parent_formula) - - peakformula <- tryCatch(suppressWarnings(generate.formula(mass, ppm(mass, ppmlimit, p=TRUE), - limits, charge=mode.charge)), error=function(e) NA) - # was a formula found? If not, return empty result - if(!is.list(peakformula)) - return(as.data.frame( - t(c(mzFound=as.numeric(as.character(mass)), - reanalyzed.formula=NA, reanalyzed.mzCalc=NA, reanalyzed.dppm=NA, - reanalyzed.formulaCount=0, - reanalyzed.dbe=NA)))) - else # if is.list(peakformula) - # formula found? then return the one with lowest dppm - { - # calculate dppm for all formulas - peakformula <- sapply(peakformula, function(f) - { - l <- list(mzFound=as.numeric(as.character(mass)), - reanalyzed.formula=as.character(f@string), - reanalyzed.mzCalc=as.numeric(as.character(f@mass)) - ) - - return(unlist(l)) - }) - - # filter out bad dbe stuff - peakformula <- as.data.frame(t(peakformula)) - # for some reason completely oblivious to me, the columns in peakformula - # are still factors, even though i de-factored them by hand. - # Therefore, convert them again... - peakformula$mzFound <- as.numeric(as.character(peakformula$mzFound)) - peakformula$reanalyzed.formula <- as.character(peakformula$reanalyzed.formula) - peakformula$reanalyzed.mzCalc <- as.numeric(as.character(peakformula$reanalyzed.mzCalc)) - - peakformula$reanalyzed.dppm <- (peakformula$mzFound / peakformula$reanalyzed.mzCalc - 1) * 1e6 - peakformula$reanalyzed.formulaCount=nrow(peakformula) - - # filter out bad dbe and high ppm stuff - peakformula$reanalyzed.dbe <- unlist(lapply(peakformula$reanalyzed.formula, dbe)) - peakformula <- peakformula[(peakformula$reanalyzed.dbe >= filterSettings$dbeMinLimit) - & (abs(peakformula$reanalyzed.dppm) < filterSettings$ppmFine),] - # is there still something left? - if(nrow(peakformula) == 0) - return(as.data.frame( - t(c(mzFound=as.numeric(as.character(mass)), - reanalyzed.formula=NA, reanalyzed.mzCalc=NA, reanalyzed.dppm=NA, - reanalyzed.formulaCount=0, reanalyzed.dbe = NA)))) - else - { - #update formula count to the remaining formulas - peakformula$reanalyzed.formulaCount=nrow(peakformula) - return(peakformula[which.min(abs(peakformula$reanalyzed.dppm)),]) - } - - } # endif is.list(peakformula) - - - - } - -#' Multiplicity filtering: Removes peaks which occur only once in a n-spectra set. -#' -#' For every compound, every peak (with annotated formula) is compared -#' across all spectra. Peaks whose formula occurs only once for all collision energies -#' / spectra types, are discarded. This eliminates "stochastic formula hits" of pure -#' electronic noise peaks efficiently from the spectra. Note that in the author's -#' experimental setup two spectra were recorded at every collision energy, -#' and therefore every peak-formula should appear -#' at least twice if it is real, even if it is by chance a fragment which appears -#' on only one collision energy setting. The function was not tested in a different -#' setup. Therefore, use with a bit of caution. -#' @usage filterPeaksMultiplicity(peaks, formulacol, recalcBest = TRUE) -#' @param peaks An aggregate peak data.frame containing all peaks to be analyzed; with at least -#' the columns \code{cpdID, scan, mzFound} and one column for the formula -#' specified with the \code{formulacol} parameter. -#' @param formulacol Which column the assigned formula is stored in. (Needed to separately process \code{"formula"} and -#' \code{"reanalyzed.formula"} multiplicites.) -#' @param recalcBest Whether the best formula for each peak should be re-determined. -#' This is necessary for results from the ordinary \code{\link{analyzeMsMs}} -#' analysis which allows multiple potential formulas per peak - the old best match -#' could potentially have been dropped because of multiplicity filtering. For results -#' from \code{\link{reanalyzeFailpeak}} this is not necessary, since only one potential -#' formula is assigned in this case. -#' @return The peak table is returned, enriched with columns: -#' \itemize{ -#' \item{\code{formulaMultiplicity}}{The # of occurrences of this formula -#' in the spectra of its compounds.} -#' } -#' @examples \dontrun{ -#' peaksFiltered <- filterPeaksMultiplicity(peaksMatched(w), -#' "formula", TRUE) -#' peaksOK <- subset(peaksFiltered, formulaMultiplicity > 1) -#' } -#' @author Michael Stravs, EAWAG -#' @export -filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) -{ - # create dummy for the case that we have no rows - multInfo <- data.frame(cpdID = character(), - formulacol = character(), - formulaMultiplicity = numeric()) - # rename (because "formulacol" is not the actually correct name) - colnames(multInfo) <- c("cpdID", formulacol, "formulaMultiplicity") - - if(!is.data.frame(peaks) || (nrow(peaks) == 0) ) - { - peaks <- cbind(peaks, data.frame(formulaMultiplicity=numeric())) - if(recalcBest){ - if(formulacol == "formula"){ - warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") - } - if(formulacol == "reanalyzed.formula"){ - warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") - } - peaks$fM_factor <- as.factor(peaks$formulaMultiplicity) - return(peaks) - } - } - else - { - # calculate duplicity info - multInfo <- aggregate(as.data.frame(peaks$scan), - list(peaks$cpdID, peaks[,formulacol]), FUN=length) - # just for comparison: - # nform <- unique(paste(pks$cpdID,pks$formula)) - - # merge the duplicity info into the peak table - colnames(multInfo) <- c("cpdID", formulacol, "formulaMultiplicity") - peaks <- merge(peaks, multInfo) - } - - # separate log intensity data by duplicity (needs duplicity as a factor) - # and boxplot - peaks$fM_factor <- as.factor(peaks$formulaMultiplicity) - - # nostalgy: dppmBest first, to compare :) - # now we prioritize the most frequent formula instead, and only then apply the - # dppmBest rule - #pks2 <- subset(pks, dppm==dppmBest) - - # split peak intensity by multiplicity - peakMultiplicitySets <- split(log(peaks$int,10), peaks$fM_factor) - #boxplot(peakMultiplicitySets) - # nice plot :) - #if(length(peakMultiplicitySets) > 0) - # q <- quantile(peakMultiplicitySets[[1]], c(0,.25,.5,.75,.95,1)) - pk_data <- lapply(peakMultiplicitySets, length) - - # now by formula, not by peak: - multInfo$fM_factor <- as.factor(multInfo$formulaMultiplicity) - # the formulas are split into bins with their multiplicity - # (14 bins for our 14-spectra method) - formulaMultiplicitySets <- split(multInfo[,formulacol], multInfo$fM_factor) - formulaMultiplicityHist <- lapply(formulaMultiplicitySets, length) - - # if we use recalcBest, then we recalculate which peak in the - # list was best. We do this for the peaks matched in the first analysis. - # The peaks from the reanalysis are single anyway and don't get this additional - # treatment. - - if(recalcBest == FALSE) - return(peaks) - - # prioritize duplicate peaks - # get unique peaks with their maximum-multiplicity formula attached - best_mult <- aggregate(as.data.frame(peaks$formulaMultiplicity), - list(peaks$cpdID, peaks$scan, peaks$mzFound), - max) - colnames(best_mult) <- c("cpdID", "scan", "mzFound", "bestMultiplicity") - peaks <- merge(peaks, best_mult) - peaks <- peaks[peaks$formulaMultiplicity==peaks$bestMultiplicity,] - - # now we also have to recalculate dppmBest since the "old best" may have been - # dropped. - peaks$dppmBest <- NULL - bestPpm <- aggregate(as.data.frame(peaks$dppm), - list(peaks$cpdID, peaks$scan, peaks$mzFound), - function(dppm) dppm[[which.min(abs(dppm))]]) - colnames(bestPpm) <- c("cpdID", "scan", "mzFound", "dppmBest") - peaks <- merge(peaks, bestPpm) - pks_best <- peaks[peaks$dppm==peaks$dppmBest,] - - # And, iteratively, the multiplicity also must be recalculated, because we dropped - # some peaks and the multiplicites of some of the formulas will have decreased. - - pks_best$formulaMultiplicity <- NULL - pks_best$bestMultiplicity <- NULL - multInfo_best <- aggregate(as.data.frame(pks_best$scan), - list(pks_best$cpdID, pks_best[,formulacol]), - FUN=length) - colnames(multInfo_best) <- c("cpdID", formulacol, "formulaMultiplicity") - pks_best <- merge(pks_best, multInfo_best) - pks_best$fM_factor <- as.factor(pks_best$formulaMultiplicity) - multInfo_best$fM_factor <- as.factor(multInfo_best$formulaMultiplicity) - - formulaMultplicitySets_best <- split(multInfo_best[,formulacol], multInfo_best$fM_factor) - formulaMultplicityHist_best <- lapply(formulaMultplicitySets_best, length) - - peakMultiplicitySets_best <- split(log(pks_best$int,10), pks_best$fM_factor) - #boxplot(peakMultiplicitySets_best) - #q <- quantile(peakMultiplicitySets_best[[1]], c(0,.25,.5,.75,.95,1)) - #peakMultiplicityHist_best <- lapply(peakMultiplicitySets_best, length) - #q - pks_best$fM_factor <- NULL - # this returns the "best" peaks (first by formula multiplicity, then by dppm) - # before actually cutting the bad ones off. - - - return(pks_best) -} - - -#' filterMultiplicity -#' -#' Multiplicity filtering: Removes peaks which occur only once in a n-spectra -#' set. -#' -#' This function executes multiplicity filtering for a set of spectra using the -#' workhorse function \code{\link{filterPeaksMultiplicity}} (see details there) -#' and retrieves problematic filtered peaks (peaks which are of high intensity -#' but were discarded, because either no formula was assigned or it was not -#' present at least 2x), using the workhorse function -#' \code{\link{problematicPeaks}}. The results are returned in a format ready -#' for further processing with \code{\link{mbWorkflow}}. -#' -#' @usage filterMultiplicity(w, archivename=NA, mode="pH", recalcBest = TRUE, -#' multiplicityFilter = getOption("RMassBank")$multiplicityFilter) -#' @param w Workspace containing the data to be processed (aggregate table and \code{RmbSpectraSet} objects) -#' @param archivename The archive name, used for generation of -#' archivename_Failpeaks.csv -#' @param mode Mode of ion analysis -#' @param recalcBest Boolean, whether to recalculate the formula multiplicity -#' after the first multiplicity filtering step. Sometimes, setting this -#' to FALSE can be a solution if you have many compounds with e.g. fluorine -#' atoms, which often have multiple assigned formulas per peak and might occasionally -#' lose peaks because of that. -#' @param multiplicityFilter Threshold for the multiplicity filter. If set to 1, -#' no filtering will apply (minimum 1 occurrence of peak). 2 equals minimum -#' 2 occurrences etc. -#' @return A list object with values: -#' \item{peaksOK}{ Peaks with >1-fold formula multiplicity from the -#' "normal" peak analysis. } -#' \item{peaksReanOK}{ Peaks with >1-fold formula multiplicity from -#' peak reanalysis. } -#' \item{peaksFiltered}{ All peaks with annotated formula multiplicity from -#' first analysis. } -#' \item{peaksFilteredReanalysis}{ All peaks with annotated -#' formula multiplicity from peak reanalysis. } -#' \item{peaksProblematic}{ Peaks with high intensity which do not match -#' inclusion criteria -> possible false negatives. The list will be -#' exported into archivename_failpeaks.csv. -#' } -#' @author Michael Stravs -#' @seealso -#' \code{\link{filterPeaksMultiplicity}},\code{\link{problematicPeaks}} -#' @examples -#' \dontrun{ -#' refilteredRcSpecs <- filterMultiplicity( -#' w, "myarchive", "pH") -#' } -#' @export -filterMultiplicity <- function(w, archivename=NA, mode="pH", recalcBest = TRUE, - multiplicityFilter = getOption("RMassBank")$multiplicityFilter) -{ - # Read multiplicity filter setting - # For backwards compatibility: If the option is not set, define as 2 - # (which was the behaviour before we introduced the option) - if(is.null(multiplicityFilter)) - multiplicityFilter <- 2 - - specs <- w@aggregated - - peaksFiltered <- filterPeaksMultiplicity(peaksMatched(specs), - "formula", recalcBest) - - - peaksFilteredReanalysis <- - filterPeaksMultiplicity(specs[!is.na(specs$matchedReanalysis) & specs$matchedReanalysis,,drop=FALSE], "reanalyzed.formula", FALSE) - - - - specs <- addProperty(specs, "formulaMultiplicity", "numeric", 0) - - # Reorder the columns of the filtered peaks such that they match the columns - # of the original aggregated table; such that the columns can be substituted in. - - peaksFiltered <- peaksFiltered[,colnames(specs)] - peaksFilteredReanalysis <- peaksFilteredReanalysis[,colnames(specs)] - - # substitute into the parent dataframe - specs[match(peaksFiltered$index,specs$index),] <- peaksFiltered - specs[match(peaksFilteredReanalysis$index,specs$index),] <- peaksFilteredReanalysis - - - specs <- addProperty(specs, "filterOK", "logical", FALSE) - - OKindex <- which(specs$formulaMultiplicity > (multiplicityFilter - 1)) - - if(length(OKindex)){ - specs[OKindex,"filterOK"] <- TRUE - } - - peaksReanOK <- specs[ - specs$filterOK & !is.na(specs$matchedReanalysis) & specs$matchedReanalysis,,drop=FALSE] - - # Kick the M+H+ satellites out of peaksReanOK: - peaksReanOK$mzCenter <- as.numeric( - unlist(lapply(peaksReanOK$cpdID, function(id) findMz(id, retrieval=findLevel(id,TRUE))$mzCenter)) ) - peaksReanBad <- peaksReanOK[ - !((peaksReanOK$mzFound < peaksReanOK$mzCenter - 1) | - (peaksReanOK$mzFound > peaksReanOK$mzCenter + 1)),] - notOKindex <- match(peaksReanBad$index, specs$index) - if(length(notOKindex)){ - specs[notOKindex,"filterOK"] <- FALSE - } - - - return(specs) -} - -#' Return MS1 peaks to be used for recalibration -#' -#' Returns the precursor peaks for all MS1 spectra in the \code{spec} dataset -#' with annotated formula to be used in recalibration. -#' -#' For all spectra in \code{spec$specFound}, the precursor ion is extracted from -#' the MS1 precursor spectrum. All found ions are returned in a data frame with a -#' format matching \code{spec$peaksMatched} and therefore suitable for \code{rbind}ing -#' to the \code{spec$peaksMatched} table. However, only minimal information needed for -#' recalibration is returned. -#' -#' @usage recalibrate.addMS1data(spec,mode="pH", recalibrateMS1Window = -#' getOption("RMassBank")$recalibrateMS1Window) -#' @param spec A \code{msmsWorkspace} or \code{RmbSpectraSetList} containing spectra for which MS1 "peaks" should be "constructed". -#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). -#' @param recalibrateMS1Window Window width to look for MS1 peaks to recalibrate (in ppm). -#' @return A dataframe with columns \code{mzFound, formula, mzCalc, dppm, dbe, int, -#' dppmBest, formulaCount, good, cpdID, scan, parentScan, dppmRc}. However, -#' columns \code{dbe, int, formulaCount, good, scan, parentScan} do not contain -#' real information and are provided only as fillers. -#' @examples \dontrun{ -#' # More or less as used in recalibrateSpectra: -#' rcdata <- peaksMatched(w) -#' rcdata <- rcdata[rcdata$formulaCount == 1, ,drop=FALSE] -#' ms1data <- recalibrate.addMS1data(w, "pH", 15) -#' rcdata <- rbind(rcdata, ms1data) -#' # ... continue constructing recalibration curve with rcdata -#' } -#' @author Michael Stravs, EAWAG -#' @export -recalibrate.addMS1data <- function(spec,mode="pH", recalibrateMS1Window = - getOption("RMassBank")$recalibrateMS1Window) -{ - ## which_OK <- lapply(validPrecursors, function(pscan) - ## { - ## pplist <- as.data.frame( - ## mzR::peaks(msRaw, which(headerData$acquisitionNum == pscan))) - ## colnames(pplist) <- c("mz","int") - ## pplist <- subset(pplist, mz >= mzLimits$mzMin & mz <= mzLimits$mzMax) - ## if(nrow(pplist) > 0) - ## return(TRUE) - ## return(FALSE) - ## }) - - specFound <- selectSpectra(spec, "found", "object") - - ms1peaks <- lapply(specFound, function(cpd){ - mzL <- findMz.formula(cpd@formula,mode,recalibrateMS1Window,0) - mzCalc <- mzL$mzCenter - ms1 <- mz(cpd@parent) - - mzFound <- ms1[which.min(abs(ms1 - mzL$mzCenter))] - if(!length(mzFound)){ - return(c( - mzFound = NA, - mzCalc = mzCalc, - dppm = NA - )) - } else { - dppmRc <- (mzFound/mzCalc - 1)*1e6 - return(c( - mzFound = mzFound, - mzCalc = mzCalc, - dppm = dppmRc, - id=cpd@id - )) - } - }) - ms1peaks <- as.data.frame(do.call(rbind, ms1peaks), stringsAsFactors=FALSE) - # convert numbers to numeric - tonum <- c("mzFound", "dppm", "mzCalc") - ms1peaks[,tonum] <- as.numeric(unlist(ms1peaks[,tonum])) - # throw out NA stuff - ms1peaks <- ms1peaks[!is.na(ms1peaks$mzFound),] - return(ms1peaks) -} - - -# Custom recalibration function: You can overwrite the recal function by -# making any function which takes rcdata$recalfield ~ rcdata$mzFound. -# The settings define which recal function is used -# getOption("RMassBank")$recalibrator = list( -# MS1 = "recalibrate.loess", -# MS2 = "recalibrate.loess") - -#' Predefined recalibration functions. -#' -#' Predefined fits to use for recalibration: Loess fit and GAM fit. -#' -#' \code{recalibrate.loess()} provides a Loess fit (\code{recalibrate.loess}) -#' to a given recalibration parameter. -#' If MS and MS/MS data should be fit together, recalibrate.loess -#' provides good default settings for Orbitrap instruments. -#' -#' \code{recalibrate.identity()} returns a non-recalibration, i.e. a predictor -#' which predicts 0 for all input values. This can be used if the user wants to -#' skip recalibration in the RMassBank workflow. -#' -#' #' \code{recalibrate.mean()} and \code{recalibrate.linear()} are simple recalibrations -#' which return a constant shift or a linear recalibration. They will be only useful -#' in particular cases. -#' -#' \code{recalibrate()} itself is only a dummy function and does not do anything. -#' -#' Alternatively other functions can be defined. Which functions are used for recalibration -#' is specified by the RMassBank options file. (Note: if \code{recalibrateMS1: common}, the -#' \code{recalibrator: MS1} value is irrelevant, since for a common curve generated with -#' the function specified in \code{recalibrator: MS2} will be used.) -#' -#' @aliases recalibrate.loess recalibrate recalibrate.identity recalibrate.mean recalibrate.linear -#' @usage recalibrate.loess(rcdata) -#' -#' recalibrate.identity(rcdata) -#' -#' recalibrate.mean(rcdata) -#' -#' recalibrate.linear(rcdata) -#' -#' @param rcdata A data frame with at least the columns \code{recalfield} and -#' \code{mzFound}. \code{recalfield} will usually contain delta(ppm) or -#' delta(mz) values and is the target parameter for the recalibration. -#' @return Returns a model for recalibration to be used with \code{predict} and the like. -#' @examples \dontrun{ -#' rcdata <- subset(spec$peaksMatched, formulaCount==1) -#' ms1data <- recalibrate.addMS1data(spec, mode, 15) -#' rcdata <- rbind(rcdata, ms1data) -#' rcdata$recalfield <- rcdata$dppm -#' rcCurve <- recalibrate.loess(rcdata) -#' # define a spectrum and recalibrate it -#' s <- matrix(c(100,150,200,88.8887,95.0005,222.2223), ncol=2) -#' colnames(s) <- c("mz", "int") -#' recalS <- recalibrateSingleSpec(s, rcCurve) -#' -#' Alternative: define an custom recalibrator function with different parameters -#' recalibrate.MyOwnLoess <- function(rcdata) -#' { -#' return(loess(recalfield ~ mzFound, data=rcdata, family=c("symmetric"), -#' degree = 2, span=0.4)) -#' } -#' # This can then be specified in the RMassBank settings file: -#' # recalibrateMS1: common -#' # recalibrator: -#' # MS1: recalibrate.loess -#' # MS2: recalibrate.MyOwnLoess") -#' # [...] -#' } -#' @author Michael Stravs, EAWAG -#' @export -recalibrate <- function() -{ - return(NA) -} - -#' @export -recalibrate.loess <- function(rcdata) -{ - span <- 0.25 - # ex XCMS (permission by Steffen): heuristically decide on loess vs linear - mingroups <- nrow(rcdata[!is.na(rcdata$mzFound),]) - if(mingroups < 4) - { - warning("recalibrate.loess: Not enough data points, omitting recalibration") - return(recalibrate.identity(rcdata)) - } else if (mingroups*span < 4) { - span <- 4/mingroups - warning("recalibrate.loess: Span too small, resetting to ", round(span, 2)) - } - return(loess(recalfield ~ mzFound, data=rcdata, family=c("symmetric"), - degree = 1, span=0.25, surface="direct" )) -} - -#' @export -recalibrate.identity <- function(rcdata) -{ - return(lm(recalfield ~ 0, data=rcdata)) -} - -#' @export -recalibrate.mean <- function(rcdata) -{ - return(lm(recalfield ~ 1, data=rcdata)) -} - -#' @export -recalibrate.linear <- function(rcdata) -{ - return(lm(recalfield ~ mzFound, data=rcdata)) -} - -#' Standard progress bar hook. -#' -#' This function provides a standard implementation for the progress bar in RMassBank. -#' -#' RMassBank calls the progress bar function in the following three ways: -#' \code{pb <- progressBarHook(object=NULL, value=0, min=0, max=LEN)} -#' to create a new progress bar. -#' \code{pb <- progressBarHook(object=pb, value= VAL)} -#' to set the progress bar to a new value (between the set \code{min} and \code{max}) -#' \code{progressBarHook(object=pb, close=TRUE)} -#' to close the progress bar. (The actual calls are performed with \code{do.call}, -#' e.g. -#' \code{progressbar <- "progressBarHook" -#' pb <- do.call(progressbar, list(object=pb, value= nProg)) -#' }. See the source code for details.) -#' -#' To substitute the standard progress bar for an alternative implementation (e.g. for -#' use in a GUI), the developer can write his own function which behaves in the same way -#' as \code{progressBarHook}, i.e. takes the same parameters and can be called in the -#' same way. -#' -#' @param object An identifier representing an instance of a progress bar. -#' @param value The new value to assign to the progress indicator -#' @param min The minimal value of the progress indicator -#' @param max The maximal value of the progress indicator -#' @param close If \code{TRUE}, the progress bar is closed. -#' @return Returns a progress bar instance identifier (i.e. an identifier -#' which can be used as \code{object} in subsequent calls.) -#' -#' @author Michele Stravs, Eawag -#' @export -progressBarHook <- function(object = NULL, value = 0, min = 0, max = 100, close = FALSE) -{ - if(is.null(object)) - { - object <- txtProgressBar(min, max, value, style=3, file=stderr()) - } - if(close) - close(object) - else - { - setTxtProgressBar(object, value) - return(object) - } -} + +#library(xcms) + +#' Backup \code{msmsWorkflow} results +#' +#' Writes the results from different \code{msmsWorkflow} steps to a file. +#' +#' @aliases archiveResults +#' @usage archiveResults(w, fileName, settings = getOption("RMassBank")) +#' @param w The \code{msmsWorkspace} to be saved. +#' @param fileName The filename to store the results under. +#' @param settings The settings to be stored into the msmsWorkspace image. +#' @examples +#' +#' # This doesn't really make a lot of sense, +#' # it stores an empty workspace. +#' RmbDefaultSettings() +#' w <- newMsmsWorkspace() +#' archiveResults(w, "narcotics.RData") +#' +#' @export +archiveResults <- function(w, fileName, settings = getOption("RMassBank")) +{ + # save the settings into the settings slot + w@settings <- settings + # save + save(w, file=fileName) + +} + + +#' RMassBank mass spectrometry pipeline +#' +#' Extracts and processes spectra from a specified file list, according to +#' loaded options and given parameters. +#' +#' The filenames of the raw LC-MS runs are read from the array \code{files} +#' in the global enviroment. +#' See the vignette \code{vignette("RMassBank")} for further details about the +#' workflow. +#' +#' @param w A \code{msmsWorkspace} to work with. +#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA", "pNH4"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-, [M+NH4]+). +#' @param steps Which steps of the workflow to process. See the vignette +#' \code{vignette("RMassBank")} for details. +#' @param confirmMode Defaults to false (use most intense precursor). Value 1 uses +#' the 2nd-most intense precursor for a chosen ion (and its data-dependent scans) +#' , etc. +#' @param newRecalibration Whether to generate a new recalibration curve (\code{TRUE}, default) or +#' to reuse the currently stored curve (\code{FALSE}, useful e.g. for adduct-processing runs.) +#' @param useRtLimit Whether to enforce the given retention time window. +#' @param archivename The prefix under which to store the analyzed result files. +#' @param readMethod Several methods are available to get peak lists from the files. +#' Currently supported are "mzR", "xcms", "MassBank" and "peaklist". +#' The first two read MS/MS raw data, and differ in the strategy +#' used to extract peaks. MassBank will read existing records, +#' so that e.g. a recalibration can be performed, and "peaklist" +#' just requires a CSV with two columns and the column header "mz", "int". +#' @param findPeaksArgs A list of arguments that will be handed to the xcms-method findPeaks via do.call +#' @param plots A parameter that determines whether the spectra should be plotted or not (This parameter is only used for the xcms-method) +#' @param precursorscan.cf Whether to fill precursor scans. To be used with files which for +#' some reasons do not contain precursor scan IDs in the mzML, e.g. AB Sciex converted +#' files. +#' @param settings Options to be used for processing. Defaults to the options loaded via +#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. +#' @param analyzeMethod The "method" parameter to pass to \code{\link{analyzeMsMs}}. +#' @param progressbar The progress bar callback to use. Only needed for specialized applications. +#' Cf. the documentation of \code{\link{progressBarHook}} for usage. +#' @param MSe A boolean value that determines whether the spectra were recorded using MSe or not +#' @return The processed \code{msmsWorkspace}. +#' @seealso \code{\link{msmsWorkspace-class}} +#' @author Michael Stravs, Eawag +#' @export +msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRecalibration = TRUE, + useRtLimit = TRUE, archivename=NA, readMethod = "mzR", findPeaksArgs = NULL, plots = FALSE, + precursorscan.cf = FALSE, + settings = getOption("RMassBank"), analyzeMethod = "formula", + progressbar = "progressBarHook", MSe = FALSE) +{ + .checkMbSettings() + if(!any(mode %in% c("pH","pNa","pNH4","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) + + if(!is.na(archivename)) + w@archivename <- archivename + + # Make a progress bar: + nProg <- 0 + nLen <- length(w@files) + + allUnknown <- FALSE + + # If all compounds are unknown some specific conditions apply + if(all(.listEnvEnv$listEnv$compoundList$Level == "5")){ + allUnknown <- TRUE + message("All compounds are unknown, the workflow will be adjusted accordingly") + } + + if(readMethod == "minimal"){ + ##Edit options + opt <- getOption("RMassBank") + opt$recalibrator$MS1 <- "recalibrate.identity" + opt$recalibrator$MS2 <- "recalibrate.identity" + opt$add_annotation <- FALSE + opt$multiplicityFilter <- 1 + options(RMassBank=opt) + settings <- getOption("RMassBank") + ##Edit analyzemethod + analyzeMethod <- "intensity" + } + + # clean rerun functionality: + # if any step after 3 has been run, rerunning steps 4 or below needs moving back to the parent workspace. + # However, the recalibration must be preserved, because: + # if someone runs + # w <- msmsWorkflow(w, steps=c(1:4)), + # then substitutes the recalibration + # w@rc <- myrecal + # then runs step 4 again + # w <- msmsWorkflow(w, steps=c(4), newRecalibration=FALSE) + # the rc and rc.ms1 must be preserved and not taken from the parent workspace + if(!all(steps > 4) & !is.null(w@parent)) + { + rc <- w@rc + rc.ms1 <- w@rc.ms1 + w <- w@parent + w@rc <- rc + w@rc.ms1 <- rc.ms1 + } + + # Step 1: acquire all MSMS spectra from files + if(1 %in% steps) + { + message("msmsWorkflow: Step 1. Acquire all MSMS spectra from files") + w <- msmsRead(w = w, files = w@files, readMethod=readMethod, mode=mode, confirmMode = confirmMode, useRtLimit = useRtLimit, + Args = findPeaksArgs, settings = settings, progressbar = progressbar, MSe = MSe) + } + # Step 2: first run analysis before recalibration + if(2 %in% steps) + { + nProg <- 0 + message("msmsWorkflow: Step 2. First analysis pre recalibration") + if(allUnknown){ + analyzeMethod <- "intensity" + } + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + w@spectra <- as(lapply(w@spectra, function(spec) { + #print(spec$id) + # if(findLevel(spec@id,TRUE) == "unknown"){ + # analyzeMethod <- "intensity" + # } else { + # analyzeMethod <- "formula" + # } + s <- analyzeMsMs(msmsPeaks = spec, mode=mode, detail=TRUE, run="preliminary", + filterSettings = settings$filterSettings, + spectraList = settings$spectraList, method = analyzeMethod) + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(s) + }), "SimpleList") + ## for(f in w@files) + ## w@spectra[[basename(as.character(f))]]@name <- basename(as.character(f)) + suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) + } + # Step 3: aggregate all spectra + if(3 %in% steps) + { + message("msmsWorkflow: Step 3. Aggregate all spectra") + w@aggregated <- aggregateSpectra(spec = w@spectra, addIncomplete=TRUE) + } + + if(allUnknown){ + w@aggregated$noise <- FALSE + w@aggregated$noise <- FALSE + w@aggregated$reanalyzed.formula <- NA + w@aggregated$reanalyzed.mzCalc <- NA + w@aggregated$reanalyzed.dppm <- NA + w@aggregated$reanalyzed.formulaCount <- NA + w@aggregated$reanalyzed.dbe <- NA + w@aggregated$matchedReanalysis <- NA + w@aggregated$filterOK <- TRUE + w@aggregated$problematicPeak <- FALSE + w@aggregated$formulaMultiplicity <- unlist(sapply(table(w@aggregated$cpdID),function(x) rep(x,x))) + return(w) + } + + + # Step 4: recalibrate all m/z values in raw spectra + if(4 %in% steps) + { + message("msmsWorkflow: Step 4. Recalibrate m/z values in raw spectra") + if(newRecalibration) + { + # note: makeRecalibration takes w as argument now, because it needs to get the MS1 spectra from @spectra + recal <- makeRecalibration(w, mode, + recalibrateBy = settings$recalibrateBy, + recalibrateMS1 = settings$recalibrateMS1, + recalibrator = settings$recalibrator, + recalibrateMS1Window = settings$recalibrateMS1Window) + w@rc <- recal$rc + w@rc.ms1 <- recal$rc.ms1 + } + w@parent <- w + w@aggregated <- data.frame() + spectra <- recalibrateSpectra(mode, w@spectra, w = w, + recalibrateBy = settings$recalibrateBy, + recalibrateMS1 = settings$recalibrateMS1) + w@spectra <- spectra + } + # Step 5: re-analysis on recalibrated spectra + if(5 %in% steps) + { + nProg <- 0 + message("msmsWorkflow: Step 5. Reanalyze recalibrated spectra") + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + + w@spectra <- as(lapply(w@spectra, function(spec) { + #print(spec$id) + if(findLevel(spec@id,TRUE) == "unknown"){ + analyzeMethod <- "intensity" + } else { + analyzeMethod <- "formula" + } + s <- analyzeMsMs(msmsPeaks = spec, mode=mode, detail=TRUE, run="recalibrated", + filterSettings = settings$filterSettings, + spectraList = settings$spectraList, method = analyzeMethod) + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(s) + }), "SimpleList") + ## for(f in w@files) + ## w@spectra[[basename(as.character(f))]]@name <- basename(as.character(f)) + suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) + + do.call(progressbar, list(object=pb, close=TRUE)) + } + # Step 6: aggregate recalibrated results + if(6 %in% steps) + { + message("msmsWorkflow: Step 6. Aggregate recalibrated results") + w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) + if(!is.na(archivename)) + archiveResults(w, paste(archivename, ".RData", sep=''), settings) + w@aggregated <- cleanElnoise(w@aggregated, + settings$electronicNoise, settings$electronicNoiseWidth) + } + # Step 7: reanalyze failpeaks for (mono)oxidation and N2 adduct peaks + if(7 %in% steps) + { + message("msmsWorkflow: Step 7. Reanalyze fail peaks for N2 + O") + w@aggregated <- reanalyzeFailpeaks( + w@aggregated, custom_additions="N2O", mode=mode, + filterSettings=settings$filterSettings, + progressbar=progressbar) + if(!is.na(archivename)) + archiveResults(w, paste(archivename, "_RA.RData", sep=''), settings) + } + # Step 8: heuristic filtering based on peak multiplicity; + # creation of failpeak list + if(8 %in% steps) + { + message("msmsWorkflow: Step 8. Peak multiplicity filtering") + if (is.null(settings$multiplicityFilter)) { + message("msmsWorkflow: Step 8. Peak multiplicity filtering skipped because multiplicityFilter parameter is not set.") + } else { + # apply heuristic filter + w@aggregated <- filterMultiplicity( + w, archivename, mode, settings$multiplicityFilter) + w@aggregated <- processProblematicPeaks(w, mode, archivename) + + if(!is.na(archivename)) + archiveResults(w, paste(archivename, "_RF.RData", sep=''), settings) + } + } + message("msmsWorkflow: Done.") + return(w) +} + +#' Analyze MSMS spectra +#' +#' Analyzes MSMS spectra of a compound by fitting formulas to each subpeak. +#' +#' The analysis function uses Rcdk. Note +#' that in this step, \emph{satellite peaks} are removed by a simple heuristic +#' rule (refer to the documentation of \code{\link{filterPeakSatellites}} for details.) +#' +## # @usage analyzeMsMs(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", +## # filterSettings = getOption("RMassBank")$filterSettings, +## # spectraList = getOption("RMassBank")$spectraList, method="formula") +## # +## # analyzeMsMs.formula(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", +## # filterSettings = getOption("RMassBank")$filterSettings, +## # spectraList = getOption("RMassBank")$spectraList) +## # +## # analyzeMsMs.intensity(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", +## # filterSettings = getOption("RMassBank")$filterSettings, +## # spectraList = getOption("RMassBank")$spectraList) +#' +#' @param msmsPeaks A \code{RmbSpectraSet} object. +#' Corresponds to a parent spectrum and children MSMS spectra of one compound (plus some metadata). +#' The objects are typically generated with \code{\link{findMsMsHR}}, and populate the \code{@@spectrum} slot +#' in a \code{msmsWorkspace} (refer to the corresponding +#' documentation for the precise format specifications). +#' @param mode Specifies the processing mode, i.e. which molecule species the +#' spectra contain. \code{\var{pH}} (positive H) specifies [M+H]+, +#' \code{\var{pNa}} specifies [M+Na]+, \code{\var{pM}} specifies [M]+, +#' \code{\var{mH}} and \code{\var{mNa}} specify [M-H]- and [M-Na]-, +#' respectively. (I apologize for the naming of \code{\var{pH}} which has +#' absolutely nothing to do with chemical \emph{pH} values.) +#' @param detail Whether detailed return information should be provided +#' (defaults to \code{FALSE}). See below. +#' @param run \code{"preliminary"} or \code{"recalibrated"}. In the +#' \code{preliminary} run, mass tolerance is set to 10 ppm (above m/z 120) and +#' 15 ppm (below m/z 120), the default intensity cutoff is $10^4$ for positive +#' mode (no default cutoff in negative mode), and the column \code{"mz"} from +#' the spectra is used as data source. In the \code{recalibrated} run, the +#' mass tolerance is set to 5 ppm over the whole mass range, the default cutoff +#' is 0 and the column \code{"mzRecal"} is used as source for the m/z values. +#' Defaults to \code{"preliminary"}. +#' @param filterSettings +#' Settings for the filter parameters, by default loaded from the RMassBank settings +#' set with e.g. \code{\link{loadRmbSettings}}. Must contain: +#' \itemize{ +#' \item \code{ppmHighMass}, allowed ppm deviation before recalibration +#' for high mass range +#' \item \code{ppmLowMass}, allowed ppm deviation before recalibration +#' for low mass range +#' \item \code{massRangeDivision}, division point between high and low mass +#' range (before recalibration) +#' \item \code{ppmFine}, allowed ppm deviation overall after recalibration +#' \item \code{prelimCut}, intensity cutoff for peaks in preliminary run +#' \item \code{prelimCutRatio}, relative intensity cutoff for peaks in +#' preliminary run, e.g. 0.01 = 1% +#' \item \code{fineCut}, intensity cutoff for peaks in second run +#' \item \code{fineCutRatio}, relative intensity cutoff for peaks in +#' second run +#' \item \code{specOkLimit}, minimum intensity of base peak for spectrum +#' to be accepted for processing +#' \item \code{dbeMinLimit}, minimum double bond equivalent for accepted +#' molecular subformula. +#' \item \code{satelliteMzLimit}, for satellite peak filtering +#' (\code{\link{filterPeakSatellites}}: mass window to use for satellite +#' removal +#' \item \code{satelliteIntLimit}, the relative intensity below which to +#' discard "satellites". (refer to \code{\link{filterPeakSatellites}}). +#' } +#' @param spectraList The list of MS/MS spectra present in each data block. As also +#' defined in the settings file. +#' @param method Selects which function to actually use for data evaluation. The default +#' "formula" runs a full analysis via formula assignment to fragment peaks. The +#' alternative setting "intensity" calls a "mock" implementation which circumvents +#' formula assignment and filters peaks purely based on intensity cutoffs and the +#' satellite filtering. (In this case, the ppm and dbe related settings in filterSettings +#' are ignored.) +#' @return The processed \code{RmbSpectraSet} object. +#' Added (or filled, respectively, since the slots are present before) data include +#' \item{list("complete")}{whether all spectra have useful value} +#' \item{list("empty")}{whether there are no useful spectra} +#' \item{list("children")}{ +#' The processed \code{RmbSpectrum2} objects (in a \code{RmbSpectrum2List}). +#' \itemize{ +#' \item \code{ok} if the spectrum was successfully processed with at least one resulting peak +#' \item \code{mz}, \code{intensity}: note that mz/int pairs can be duplicated when multiple matches +#' are found for one mz value, therefore the two slots are not necessarily unchanged from before +#' \item \code{rawOK} (logical) whether the m/z peak passes satellite/low removal +#' \item \code{low}, \code{satellite} if \code{TRUE}, the peak failed cutoff (\code{low}) or was removed as \code{satellite} +#' \item \code{formula}, \code{mzCalc}, \code{dppm}, \code{dbe} Formula, calculated mass, ppm deviation and dbe assigned to a peak +#' \item \code{formulaCount}, \code{dppmBest} Number of formulae matched for this m/z value and ppm deviation of the best match +#' \item \code{info} Spectrum identifying information (collision energy, resolution, collision mode) from +#' the \code{spectraList} +#' \item All other entries are retained from the original \code{RmbSpectrum2}. +#' } +#' } +#' @aliases analyzeMsMs analyzeMsMs.formula analyzeMsMs.intensity +#' @author Michael Stravs +#' @seealso \code{\link{msmsWorkflow}}, \code{\link{filterLowaccResults}}, +#' \code{\link{filterPeakSatellites}}, \code{\link{reanalyzeFailpeaks}} +#' @examples +#' +#' \dontrun{analyzed <- analyzeMsMs(spec, "pH", TRUE)} +#' +#' @export +analyzeMsMs <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", + filterSettings = getOption("RMassBank")$filterSettings, + spectraList = getOption("RMassBank")$spectraList, method="formula") +{ + ## .RmbSpectraSet <- setClass("RmbSpectraSet", + ## representation = representation( + ## parent = "Spectrum1", + ## children = "RmbSpectrum2List", + ## # These are done as slots and not as S4 functions, because they are set during the workflow + ## # in "checking" steps. It's easier. + ## found = "logical", + ## complete = "logical", + ## empty = "logical", + ## formula = "character", + ## id = "integer", + ## mz = "numeric", + ## name = "character", + ## annotations = "list" + ## ), + ## prototype = prototype( + ## parent = new("Spectrum1"), + ## children = new("RmbSpectrum2List"), + ## found = FALSE, + ## complete = NA, + ## empty = NA, + ## formula = character(), + ## id = integer(), + ## mz = numeric(), + ## name = character(), + ## annotations = list() + ## ) + ## ); + .checkMbSettings() + + + # Check whether the spectra can be fitted to the spectra list correctly! + if(length(msmsPeaks@children) != length(spectraList)) + { + warning(paste0( + "The spectra count of the substance ", msmsPeaks@id, " (", length(msmsPeaks@children), " spectra) doesn't match the provided spectra list (", length(spectraList), " spectra)." + )) + msmsPeaks@found <- FALSE + return(msmsPeaks) + + } + + if(msmsPeaks@found == FALSE) + return(msmsPeaks) + + if(method=="formula") + { + r <- (analyzeMsMs.formula(msmsPeaks, mode, detail, run, filterSettings + )) + } + else if(method == "intensity") + { + r <- (analyzeMsMs.intensity(msmsPeaks, mode, detail, run, filterSettings + )) + } + + # Add the spectrum labels to the spectra here. + # If there is any better place to do this, please tell me. I hate it. + # However, the info should be added in msmsWorkflow not in mbWorkflow, because two msmsWorkspaces with different spectraLists can be + # merged together with all the combine / pack stuff. + children <- mapply(function(spec, info) + { + spec@info <- info + spec + }, r@children, spectraList, SIMPLIFY=FALSE) + r@children <- as(children, "SimpleList") + + + #nspectra <- length(spectraList) + ok <- unlist(lapply(r@children, function(c) c@ok)) + r@complete <- FALSE + r@empty <- FALSE + if(all(ok)) + r@complete <- TRUE + if(all(!ok)) + r@empty <- TRUE + return(r) +} + + +#' @describeIn analyzeMsMs Analyze the peaks using formula annotation +#' @export +analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", + filterSettings = getOption("RMassBank")$filterSettings) +{ + cut <- 0 + cut_ratio <- 0 + if(run=="preliminary") + { + filterMode <- "coarse" + cut <- filterSettings$prelimCut + if(is.na(cut)) + { + if(mode %in% c("pH", "pM", "pNa", "pNH4")) + cut <- 1e4 + else if(mode %in% c("mH", "mFA","mM")) + cut <- 0 + else stop(paste("The ionization mode", mode, "is unknown.")) + } + cutRatio <- filterSettings$prelimCutRatio + } + else + { + filterMode <- "fine" + cut <- filterSettings$fineCut + cut_ratio <- filterSettings$fineCutRatio + if(is.na(cut)) cut <- 0 + } + + # find whole spectrum of parent peak, so we have reasonable data to feed into + # MolgenMsMs + parentSpectrum <- msmsPeaks@parent + + + # On each spectrum the following function analyzeTandemShot will be applied. + # It takes the raw peaks matrix as argument (mz, int) and processes the spectrum by + # filtering out low-intensity (<1e4) and shoulder peaks (deltam/z < 0.5, intensity + # < 5%) and subsequently matching the peaks to formulas using Rcdk, discarding peaks + # with insufficient match accuracy or no match. + analyzeTandemShot <- function(child) + { + shot <- getData(child) + shot$row <- which(!is.na(shot$mz)) + + + # Filter out low intensity peaks: + child@low <- (shot$intensity < cut) | (shot$intensity < max(shot$intensity)*cut_ratio) + shot <- shot[!child@low,,drop=FALSE] + shot_full <- shot + + # Is there still anything left? + if(length(which(!child@low))==0) + { + child@ok <- FALSE + return(child) + } + + # Filter out satellite peaks: + shot <- filterPeakSatellites(shot, filterSettings) + child@satellite <- rep(TRUE, child@peaksCount) + child@satellite[which(child@low == TRUE)] <- NA + child@satellite[shot$row] <- FALSE + + # Is there still anything left? + if(nrow(shot)==0) + { + child@ok <- FALSE + return(child) + } + + if(max(shot$intensity) < as.numeric(filterSettings$specOkLimit)) + { + child@ok <- FALSE + return(child) + } + + # Crop to 4 digits (necessary because of the recalibrated values) + # this was done for the MOLGEN MSMS type analysis, is not necessary anymore now (23.1.15 MST) + # shot[,mzColname] <- round(shot[,mzColname], 5) + + # here follows the Rcdk analysis + #------------------------------------ + parentPeaks <- data.frame(mzFound=msmsPeaks@mz, + formula=msmsPeaks@formula, + dppm=0, + x1=0,x2=0,x3=0) + + # define the adduct additions + if(mode == "pH") { + allowed_additions <- "H" + mode.charge <- 1 + } else if(mode == "pNa") { + allowed_additions <- "Na" + mode.charge <- 1 + } else if(mode == "pM") { + allowed_additions <- "" + mode.charge <- 1 + } else if(mode == "mM") { + allowed_additions <- "" + mode.charge <- -1 + } else if(mode == "mH") { + allowed_additions <- "H-1" + mode.charge <- -1 + } else if(mode == "mFA") { + allowed_additions <- "C2H3O2" + mode.charge <- -1 + } else if(mode == "pNH4") { + allowed_additions <- "NH4" + mode.charge <- 1 + } else{ + stop("mode = \"", mode, "\" not defined") + } + + + # the ppm range is two-sided here. + # The range is slightly expanded because dppm calculation of + # generate.formula starts from empirical mass, but dppm cal- + # culation of the evaluation starts from theoretical mass. + # So we don't miss the points on 'the border'. + + if(run=="preliminary") + ppmlimit <- 2 * max(filterSettings$ppmLowMass, filterSettings$ppmHighMass) + else + ppmlimit <- 2.25 * filterSettings$ppmFine + + parent_formula <- add.formula(msmsPeaks@formula, allowed_additions) + dbe_parent <- dbe(parent_formula) + # check whether the formula is valid, i.e. has no negative or zero element numbers. + #print(parent_formula) + if(!is.valid.formula(parent_formula)) + { + child@ok <- FALSE + return(child) + } + + limits <- to.limits.rcdk(parent_formula) + + peakmatrix <- lapply( + split(shot,shot$row) + , function(shot.row) { + # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae + # finally back-correct calculated masses for the charge + mass <- shot.row[["mz"]] + mass.calc <- mass + mode.charge * .emass + peakformula <- tryCatch(suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), + limits, charge=0)), error=function(e) NA) + #peakformula <- tryCatch( + # generate.formula(mass, + # ppm(mass, ppmlimit, p=TRUE), + # limits, charge=1), + #error= function(e) list()) + + if(!is.list(peakformula)) + return(t(c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, + formula=NA, mzCalc=NA))) + else + { + return(t(sapply(peakformula, function(f) + { + mzCalc <- f@mass - mode.charge * .emass + c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, + formula=f@string, + mzCalc=mzCalc) + }))) + } + + }) + + childPeaks <- as.data.frame(do.call(rbind, peakmatrix)) + + # Reformat the deformatted output correctly (why doesn't R have a better way to do this, e.g. avoid deformatting?) + + childPeaks$row <- as.numeric(as.character(childPeaks$row)) + childPeaks$intensity <- as.numeric(as.character(childPeaks$intensity)) + childPeaks$mz <- as.numeric(as.character(childPeaks$mz)) + childPeaks$formula <- as.character(childPeaks$formula) + childPeaks$mzCalc <- as.numeric(as.character(childPeaks$mzCalc)) + childPeaks$dppm <- (childPeaks$mz / childPeaks$mzCalc - 1) * 1e6 + childPeaks$dbe <- unlist(lapply(childPeaks$formula, dbe)) + + # childPeaks now contains all the good and unmatched peaks + # but not the ones which were cut as satellites or below threshold. + + ## child@mzFound <- rep(NA, child@peaksCount) + ## child@mzFound[childPeaks$row] <- as.numeric(as.character(childPeaks$mzFound)) + ## + ## child@formula <- rep(NA, child@peaksCount) + ## child@formula[childPeaks$row] <- as.character(childPeaks$formula) + ## + ## child@mzCalc <- rep(NA, child@peaksCount) + ## child@mzCalc[childPeaks$row] <- as.numeric(as.character(childPeaks$mzCalc)) + ## + ## child@dppm<- rep(NA, child@peaksCount) + ## child@dppm[childPeaks$row] <- (childPeaks$mzFound / childPeaks$mzCalc - 1) * 1e6 + # delete the NA data out again, because MolgenMsMs doesn't have them + # here and they will be re-added later + # (this is just left like this for "historical" reasons) + #childPeaks <- childPeaks[!is.na(childPeaks$formula),] + # check if a peak was recognized (here for the first time, + # otherwise the next command would fail) + + if(nrow(childPeaks)==0) + { + child@ok <- FALSE + return(child) + } + + # now apply the rule-based filters to get rid of total junk: + # dbe >= -0.5, dbe excess over mother cpd < 3 + # dbe() has been adapted to return NA for NA input + #iff_rcdk_pM_eln$maxvalence <- unlist(lapply(diff_rcdk_pM_eln$formula.rcdk, maxvalence)) + temp.child.ok <- (childPeaks$dbe >= filterSettings$dbeMinLimit) + # & dbe < dbe_parent + 3) + # check if a peak was recognized + if(length(which(temp.child.ok)) == 0) + { + child@ok <- FALSE + return(child) + } + #browser() + # find the best ppm value + bestPpm <- aggregate(as.data.frame(childPeaks[!is.na(childPeaks$dppm),"dppm"]), + list(childPeaks[!is.na(childPeaks$dppm),"row"]), + function(dppm) dppm[[which.min(abs(dppm))]]) + colnames(bestPpm) <- c("row", "dppmBest") + childPeaks <- merge(childPeaks, bestPpm, by="row", all.x=TRUE) + + # Deactivated the following lines because we never actually want to look at the "old" formula count. + # To be verified (cf Refiltering, failpeak list and comparable things) + + ## # count formulas found per mass + ## countFormulasTab <- xtabs( ~formula + mz, data=childPeaks) + ## countFormulas <- colSums(countFormulasTab) + ## childPeaks$formulaCount <- countFormulas[as.character(childPeaks$row)] + + # filter results + childPeaksFilt <- filterLowaccResults(childPeaks, filterMode, filterSettings) + childPeaksGood <- childPeaksFilt[["TRUE"]] + childPeaksBad <- childPeaksFilt[["FALSE"]] + if(is.null(childPeaksGood)){ + childPeaksGood <- childPeaks[c(),,drop=FALSE] + childPeaksGood$good <- logical(0) + } + if(is.null(childPeaksBad)) + childPeaksBad <- childPeaks[c(),,drop=FALSE] + childPeaksUnassigned <- childPeaks[is.na(childPeaks$dppm),,drop=FALSE] + childPeaksUnassigned$good <- rep(FALSE, nrow(childPeaksUnassigned)) + # count formulas within new limits + # (the results of the "old" count stay in childPeaksInt and are returned + # in $childPeaks) + countFormulasTab <- xtabs( ~formula + mz, data=childPeaksGood) + countFormulas <- colSums(countFormulasTab) + childPeaksGood$formulaCount <- countFormulas[as.character(childPeaksGood$mz)] + + childPeaksUnassigned$formulaCount <- rep(NA, nrow(childPeaksUnassigned)) + childPeaksBad$formulaCount <- rep(NA, nrow(childPeaksBad)) + childPeaksBad$good <- rep(FALSE, nrow(childPeaksBad)) + + # Now: childPeaksGood (containing the new, recounted peaks with good = TRUE), and childPeaksBad (containing the + # peaks with good=FALSE, i.e. outside filter criteria, with the old formula count even though it is worthless) + # are bound together. + childPeaksBad <- childPeaksBad[,colnames(childPeaksGood),drop=FALSE] + childPeaksUnassigned <- childPeaksUnassigned[,colnames(childPeaksGood),drop=FALSE] + childPeaks <- rbind(childPeaksGood, childPeaksBad, childPeaksUnassigned) + + # Now let's cross fingers. Add a good=NA column to the unmatched peaks and reorder the columns + # to match order in childPeaks. After that, setData to the child slot. + + childPeaksOmitted <- getData(child) + childPeaksOmitted <- childPeaksOmitted[child@low | child@satellite,,drop=FALSE] + childPeaksOmitted$rawOK <- rep(FALSE, nrow(childPeaksOmitted)) + childPeaksOmitted$good <- rep(FALSE, nrow(childPeaksOmitted)) + childPeaksOmitted$dppm <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$formula <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$mzCalc <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$dbe <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$dppmBest <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$formulaCount <- rep(0, nrow(childPeaksOmitted)) + childPeaks$satellite <- rep(FALSE, nrow(childPeaks)) + childPeaks$low <- rep(FALSE, nrow(childPeaks)) + childPeaks$rawOK <- rep(TRUE, nrow(childPeaks)) + + childPeaks <- childPeaks[,colnames(childPeaksOmitted), drop=FALSE] + + childPeaksTotal <- rbind(childPeaks, childPeaksOmitted) + child <- setData(child, childPeaksTotal) + child@ok <- TRUE + + return(child) + } + + # I believe these lines were fixed to remove a warning but in the refactored workflow "mzranges" doesn't exist anymore. + # Leave here for now + ## mzranges <- t(sapply(shots, function(p) { + ## if(!is.null(p$childRaw)){ + ## return(range(p$childRaw[,mzColname])) + ## } else { + ## return(c(NA,NA)) + ## } + ## })) + ## + ## mzmin <- min(mzranges[,1], na.rm=TRUE) + ## mzmax <- max(mzranges[,2], na.rm=TRUE) + children <- lapply(msmsPeaks@children, analyzeTandemShot) + + + + +## shots <- mapply(function(shot, scan, info) + ## { + ## shot$scan <- scan + ## shot$info <- info + ## shot$header <- msmsPeaks$childHeaders[as.character(scan),] + ## return(shot) + ## }, shots, msmsPeaks$childScans, spectraList, SIMPLIFY=FALSE) + msmsPeaks@children <- as(children, "SimpleList") + return(msmsPeaks) +} + + +#' @describeIn analyzeMsMs Analyze the peaks going only by intensity values +#' @export +analyzeMsMs.intensity <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", + filterSettings = getOption("RMassBank")$filterSettings) +{ + cut <- 0 + cut_ratio <- 0 + if(run=="preliminary") + { + filterMode <- "coarse" + cut <- filterSettings$prelimCut + if(is.na(cut)) + { + if(mode %in% c("pH", "pM", "pNa", "pNH4")) + cut <- 1e4 + else if(mode %in% c("mH", "mFA", "mM")) + cut <- 0 + else stop(paste("The ionization mode", mode, "is unknown.")) + } + cutRatio <- filterSettings$prelimCutRatio + } + else + { + filterMode <- "fine" + cut <- filterSettings$fineCut + cut_ratio <- filterSettings$fineCutRatio + if(is.na(cut)) cut <- 0 + } + + # find whole spectrum of parent peak, so we have reasonable data to feed into + + + # On each spectrum the following function analyzeTandemShot will be applied. + # It takes the raw peaks matrix as argument (mz, int) and processes the spectrum by + # filtering out low-intensity (<1e4) and shoulder peaks (deltam/z < 0.5, intensity + # < 5%) and subsequently matching the peaks to formulas using Rcdk, discarding peaks + # with insufficient match accuracy or no match. + analyzeTandemShot <- function(child) + { + shot <- getData(child) + shot$row <- which(!is.na(shot$mz)) + + # Filter out low intensity peaks: + child@low <- (shot$intensity < cut) | (shot$intensity < max(shot$intensity)*cut_ratio) + shot_full <- shot + shot <- shot[!child@low,,drop=FALSE] + + + # Is there still anything left? + if(length(which(!child@low))==0) + { + child@ok <- FALSE + return(child) + } + + # Filter out satellite peaks: + shot <- filterPeakSatellites(shot, filterSettings) + child@satellite <- rep(TRUE, child@peaksCount) + child@satellite[which(child@low == TRUE)] <- NA + child@satellite[shot$row] <- FALSE + + # Is there still anything left? + if(nrow(shot)==0) + { + child@ok <- FALSE + return(child) + } + + if(max(shot$intensity) < as.numeric(filterSettings$specOkLimit)) + { + child@ok <- FALSE + return(child) + } + + + # here follows the fake analysis + #------------------------------------ + parentPeaks <- data.frame(mzFound=msmsPeaks@mz, + formula=msmsPeaks@formula, + dppm=0, + x1=0,x2=0,x3=0) + + childPeaks <- addProperty(shot_full, "rawOK", "logical", FALSE) + childPeaks[!(child@low | child@satellite),"rawOK"] <- TRUE + + childPeaks <- addProperty(childPeaks, "good", "logical", FALSE) + childPeaks[childPeaks$rawOK,"good"] <- TRUE + + childPeaks <- addProperty(childPeaks, "mzCalc", "numeric") + childPeaks[childPeaks$rawOK,"mzCalc"] <- childPeaks[childPeaks$rawOK,"mz"] + + childPeaks <- addProperty(childPeaks, "formula", "character") + childPeaks[childPeaks$rawOK,"formula"] <- "" + + childPeaks <- addProperty(childPeaks, "dbe", "numeric") + childPeaks[childPeaks$rawOK,"dbe"] <- 0 + + childPeaks <- addProperty(childPeaks, "formulaCount", "integer") + childPeaks[childPeaks$rawOK,"formulaCount"] <- 1 + + childPeaks <- addProperty(childPeaks, "dppm", "numeric") + childPeaks[childPeaks$rawOK,"dppm"] <- 0 + + childPeaks <- addProperty(childPeaks, "dppmBest", "numeric") + childPeaks[childPeaks$rawOK,"dppmBest"] <- 0 + + child <- setData(child, childPeaks) + child@ok <- TRUE + return(child) + } + children <- lapply(msmsPeaks@children, analyzeTandemShot) + msmsPeaks@children <- as(children, "SimpleList") + #browser() + + return(msmsPeaks) + + # Omit all the stuff below for now, I don't believe it is needed. One thing is that spectraList info will have to be added somewhere else. + ## shots <- mapply(function(shot, scan, info) + ## { + ## shot$scan <- scan + ## shot$info <- info + ## shot$header <- msmsPeaks$childHeaders[as.character(scan),] + ## return(shot) + ## }, shots, msmsPeaks$childScans, spectraList, SIMPLIFY=FALSE) + ## + ## mzranges <- t(sapply(shots, function(p) {return(range(p$childRaw[,mzColname]))})) + ## mzmin <- min(mzranges[,1], na.rm=TRUE) + ## mzmax <- max(mzranges[,2], na.rm=TRUE) + ## + ## return(list( + ## msmsdata=shots, + ## mzrange=c(mzmin, mzmax), + ## id=msmsPeaks$id, + ## mode=mode, + ## parentHeader = msmsPeaks$parentHeader, + ## parentMs = msmsPeaks$parentPeak, + ## formula = msmsPeaks$formula, + ## foundOK = TRUE)) +} + + +#' Filter peaks with low accuracy +#' +#' Filters a peak table (with annotated formulas) for accuracy. Low-accuracy +#' peaks are removed. +#' +#' In the \code{coarse} mode, mass tolerance is set to 10 ppm (above m/z 120) +#' and 15 ppm (below m/z 120). This is useful for formula assignment before +#' recalibration, where a wide window is desirable to accomodate the high mass +#' deviations at low m/z values, so we get a nice recalibration curve. +#' +#' In the \code{fine} run, the mass tolerance is set to 5 ppm over the whole +#' mass range. This should be applied after recalibration. +#' +#' @usage filterLowaccResults(peaks, mode="fine", filterSettings = getOption("RMassBank")$filterSettings) +#' @param peaks A data frame with at least the columns \code{mzFound} and +#' \code{dppm}. +#' @param mode \code{coarse} or \code{fine}, see below. +#' @param filterSettings Settings for filtering. For details, see documentation of +#' \code{\link{analyzeMsMs}} +#' @return A \code{list(TRUE = goodPeakDataframe, FALSE = badPeakDataframe)} is +#' returned: A data frame with all peaks which are "good" is in +#' \code{return[["TRUE"]]}. +#' @author Michael Stravs +#' @seealso \code{\link{analyzeMsMs}}, \code{\link{filterPeakSatellites}} +#' @examples +#' +#' # from analyzeMsMs: +#' \dontrun{childPeaksFilt <- filterLowaccResults(childPeaksInt, filterMode)} +#' +#' +filterLowaccResults <- function(peaks, mode="fine", filterSettings = getOption("RMassBank")$filterSettings) +{ + # Check if filter settings are properly set, otherwise use defaults + if(is.null(filterSettings)) + { + filterSettings <- list( + ppmHighMass = 10, + ppmLowMass = 15, + massRangeDivision = 120, + ppmFine = 5) + } + + peaks$good = NA + peaks[!is.na(peaks$dppm), "good"] <- TRUE + + # coarse mode: to use for determinating the recalibration function + if(mode=="coarse") + { + if(nrow(peaks[which(abs(peaks$dppm) > filterSettings$ppmHighMass),])>0) + peaks[which(abs(peaks$dppm) > filterSettings$ppmHighMass), "good"] <- FALSE + if(nrow(peaks[which(peaks$mz > filterSettings$massRangeDivision & abs(peaks$dppm) > filterSettings$ppmLowMass),])>0) + peaks[which(peaks$mz > filterSettings$massRangeDivision & abs(peaks$dppm) > filterSettings$ppmLowMass), "good"] <- FALSE + } + # fine mode: for use after recalibration + else + { + if(nrow(peaks[which(abs(peaks$dppm) > filterSettings$ppmFine),]) > 0) + peaks[which(abs(peaks$dppm) > filterSettings$ppmFine), "good"] <- FALSE + } + return(split(peaks, peaks$good)) +} + +#' Aggregate analyzed spectra +#' +#' Groups an array of analyzed spectra and creates aggregated peak tables +#' +#' \code{\var{addIncomplete}} is relevant for recalibration. For recalibration, +#' we want to use only high-confidence peaks, therefore we set +#' \code{\var{addIncomplete}} to \code{FALSE}. When we want to generate a peak +#' list for actually generating MassBank records, we want to include all peaks +#' into the peak tables. +#' +#' @usage aggregateSpectra(spec, addIncomplete=FALSE) +#' @param spec The \code{RmbSpectraSetList} of spectra sets (\code{RmbSpectraSet} objects) to aggregate +#' @param addIncomplete Whether or not the peaks from incomplete files (files +#' for which less than the maximal number of spectra are present) +#' @return +#' A summary \code{data.frame} with all peaks (possibly multiple rows for one m/z value from a spectrum, see below) with columns: +#' \item{mzFound, intensity}{Mass and intensity of the peak} +#' \item{good}{if the peak passes filter criteria} +#' \item{mzCalc, formula, dbe, dppm}{calculated mass, formula, dbe and ppm deviation of the assigned formula} +#' \item{formulaCount, dppmBest}{Number of matched formulae for this m/z value, and ppm deviation of the best match} +#' \item{scan, cpdID, parentScan}{Scan number of the child and parent spectrum in the raw file, also the compound ID to which the peak belongs} +#' \item{dppmRc}{ppm deviation recalculated from the aggregation function} +#' \item{index}{Aggregate-table peak index, so the table can be subsetted, edited and results reinserted back into this table easily} +#' Further columns are later added by workflow steps 6 (electronic noise culler), 7 and 8. +#' +#' @author Michael Stravs +#' @seealso \code{\link{msmsWorkflow}}, \code{\link{analyzeMsMs}} +#' @examples +#' +#' ## As used in the workflow: +#' \dontrun{% +#' w@@spectra <- lapply(w@@spectra, function(spec) +#' analyzeMsMs(spec, mode="pH", detail=TRUE, run="recalibrated", cut=0, cut_ratio=0 ) ) +#' w@@aggregate <- aggregateSpectra(w@@spectra) +#' } +#' +#' @export +aggregateSpectra <- function(spec, addIncomplete=FALSE) +{ + + if(addIncomplete) + aggSpectra <- selectSpectra(spec, "found", "object") + else + aggSpectra <- selectSpectra(spec, "complete", "object") + + compoundTables <- lapply(aggSpectra, function(s) + { + tables.c <- lapply(s@children, function(c) + { + table.c <- getData(c) + table.c <- table.c[table.c$rawOK,,drop=FALSE] + # remove superfluous columns, since only rawOK peaks are selected anyway + table.c$rawOK <- NULL + table.c$low <- NULL + table.c$satellite <- NULL + # add scan no + table.c$scan <- rep(c@acquisitionNum, nrow(table.c)) + return(table.c) + }) + table.cpd <- do.call(rbind, tables.c) + table.cpd$cpdID <- rep(s@id, nrow(table.cpd)) + table.cpd$parentScan <- rep(s@parent@acquisitionNum, nrow(table.cpd)) + return(table.cpd) + }) + #return(compoundTables) + aggTable <- do.call(rbind, compoundTables) + colnames(aggTable)[1] <- "mzFound" + + aggTable <- addProperty(aggTable, "dppmRc", "numeric") + aggTable <- addProperty(aggTable, "index", "integer") + if(nrow(aggTable) > 0) + aggTable$index <- 1:nrow(aggTable) + + aggTable[aggTable$good, "dppmRc"] <- (aggTable[aggTable$good, "mzFound"]/aggTable[aggTable$good, "mzCalc"] - 1)*1e6 + + + return(aggTable) +} + +#' Remove electronic noise +#' +#' Removes known electronic noise peaks from a peak table +#' +#' @usage cleanElnoise(peaks, noise=getOption("RMassBank")$electronicNoise, +#' width = getOption("RMassBank")$electronicNoiseWidth) +#' @param peaks An aggregated peak frame as described in \code{\link{aggregateSpectra}}. Columns +#' \code{mzFound}, \code{dppm} and \code{dppmBest} are needed. +#' @param noise A numeric vector of known m/z of electronic noise peaks from the instrument +#' Defaults to the entries in the RMassBank settings. +#' @param width The window for the noise peak in m/z units. Defaults to the entries in +#' the RMassBank settings. +#' @return Extends the aggregate data frame by column \code{noise} (logical), which is \code{TRUE} if the peak is marked as noise. +#' +#' @author Michael Stravs +#' @seealso \code{\link{msmsWorkflow}} +#' @examples +#' # As used in the workflow: +#' \dontrun{ +#' w@@aggregated <- +#' cleanElnoise(w@@aggregated) +#' } +#' @export +cleanElnoise <- function(peaks, noise=getOption("RMassBank")$electronicNoise, + width = getOption("RMassBank")$electronicNoiseWidth) +{ + + peaks <- addProperty(peaks, "noise", "logical", FALSE) + + # I don't think this makes sense if using one big table... + ## # use only best peaks + ## p_best <- peaks[is.na(peaks$dppmBest) | (peaks$dppm == peaks$dppmBest),] + + # remove known electronic noise + p_eln <- peaks + for(noisePeak in noise) + { + noiseMatches <- which(!((p_eln$mzFound > noisePeak + width) | (p_eln$mzFound < noisePeak - width))) + if(length(noiseMatches) > 0) + p_eln[noiseMatches, "noise"] <- TRUE + } + return(p_eln) +} + +#' Identify intense peaks (in a list of unmatched peaks) +#' +#' Finds a list of peaks in spectra with a high relative intensity (>10% and +#' 1e4, or >1% and 1e5) to write a list of peaks which must be manually +#' checked. Peaks orbiting around the parent peak mass (calculated from the +#' compound ID), which are very likely co-isolated substances, are ignored. +#' +#' +#' @usage problematicPeaks(peaks_unmatched, peaks_matched, mode = "pH") +#' @param peaks_unmatched Table of unmatched peaks, with at least \code{cpdID, +#' scan, mzFound, int}. +#' @param peaks_matched Table of matched peaks (used for base peak reference), +#' with at least \code{cpdID, scan, int}. +#' @param mode Processing mode (\code{"pH", "pNa"} etc.) +#' @return A filtered table with the potentially problematic peaks, including +#' the precursor mass and MSMS base peak intensity (\code{aMax}) for reference. +#' @author Michael Stravs +#' @seealso \code{\link{msmsWorkflow}} +#' @examples \dontrun{ +#' # As used in the workflow: +#' fp <- problematicPeaks(specs[!specs$filterOK & !specs$noise & +#' ((specs$dppm == specs$dppmBest) | (is.na(specs$dppmBest))) +#' ,,drop=FALSE], peaksMatched(w), mode) +#' } +#' @export +problematicPeaks <- function(peaks_unmatched, peaks_matched, mode="pH") +{ + # find spectrum maximum for each peak, and merge into table + if(nrow(peaks_matched) == 0){ + assIntMax <- data.frame(list(integer(0),integer(0),integer(0))) + } else{ + assIntMax <- as.data.frame(aggregate(as.data.frame(peaks_matched$intensity), + by=list(peaks_matched$cpdID, peaks_matched$scan), max)) + } + colnames(assIntMax) <- c("cpdID", "scan", "aMax") + peaks_unmatched <- merge(peaks_unmatched, assIntMax) + # which of these peaks are intense? + p_control <- peaks_unmatched[ + ( (peaks_unmatched$intensity > 1e5) & + (peaks_unmatched$intensity > 0.01*peaks_unmatched$aMax)) + | ( (peaks_unmatched$intensity > 1e4) & + (peaks_unmatched$intensity > 0.1* peaks_unmatched$aMax)) ,] + # find parent m/z to exclude co-isolated peaks + #p_control$mzCenter <- numeric(nrow(p_control)) + p_control$mzCenter <- as.numeric( + unlist(lapply(p_control$cpdID, function(id) findMz(id, mode, retrieval=findLevel(id,TRUE))$mzCenter)) ) + p_control_noMH <- p_control[ + (p_control$mzFound < p_control$mzCenter - 1) | + (p_control$mzFound > p_control$mzCenter + 1),] + return(p_control_noMH) +} + + +#' Generate list of problematic peaks +#' +#' Generates a list of intense unmatched peaks for further review (the "failpeak list") and exports it if the archive name is given. +#' +#' @param w \code{msmsWorkspace} to analyze. +#' @param mode Processing mode (pH etc) +#' @param archivename Base name of the archive to write to (for "abc" the exported failpeaks list will be "abc_Failpeaks.csv"). +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @return Returns the aggregate data.frame with added column "\code{problematic}" (logical) which marks peaks which match the problematic criteria +#' +#' @author stravsmi +#' @export +processProblematicPeaks <- function(w, mode, archivename = NA) +{ + + specs <- w@aggregated + fp <- problematicPeaks(specs[!specs$filterOK & !specs$noise & + ((specs$dppm == specs$dppmBest) | (is.na(specs$dppmBest))) + ,,drop=FALSE], peaksMatched(w), mode) + fp$OK <- rep('', nrow(fp)) + fp$name <- rownames(fp) + + fp <- fp[with(fp, + order(cpdID, mzCalc, scan)), + ] + + # Select the correct precursor scans. This serves to filter the list + # for the cases where multiple workspaces were combined after step 7 + # with combineMultiplicities. + # Note that this has drawbacks. Leaving the "duplicates" in would make it more easy + # to identify legitimate unformulaed peaks. We might experiment by marking them up + # somehow. + precursors <- unlist(lapply(selectSpectra(w, "found", "object"), function(s) s@parent@acquisitionNum)) + fp <- fp[ + fp$parentScan %in% precursors + ,] + + # Add the info to specs + specs <- addProperty(specs, "problematicPeak", "logical", FALSE) + specs[match(fp$index, specs$index),"problematicPeak"] <- TRUE + + # Select the columns for output into the failpeaks file + fp <- fp[,c("OK", "name", "cpdID", "scan", "mzFound", "formula", + "reanalyzed.formula", "mzCalc", "reanalyzed.mzCalc", "dppm", "reanalyzed.dppm", "dbe", "reanalyzed.dbe", "intensity", + "formulaCount", "reanalyzed.formulaCount", "parentScan", "aMax", "mzCenter")] + if(!is.na(archivename)) + write.csv(fp, file= + paste(archivename,"_Failpeaks.csv", sep=''), row.names=FALSE) + + return(specs) +} + +#' Recalibrate MS/MS spectra +#' +#' Recalibrates MS/MS spectra by building a recalibration curve of the +#' assigned putative fragments of all spectra in \code{aggregatedSpecs} +#' (measured mass vs. mass of putative associated fragment) and additionally +#' the parent ion peaks. +#' +#' Note that the actually used recalibration functions are governed by the +#' general MassBank settings (see \code{\link{recalibrate}}). +#' +#' If a set of acquired LC-MS runs contains spectra for two different ion types +#' (e.g. [M+H]+ and [M+Na]+) which should both be processed by RMassBank, it is +#' necessary to do this in two separate runs. Since it is likely that one ion type +#' will be the vast majority of spectra (e.g. most in [M+H]+ mode), and only few +#' spectra will be present for other specific adducts (e.g. only few [M+Na]+ spectra), +#' it is possible that too few spectra are present to build a good recalibration curve +#' using only e.g. the [M+Na]+ ions. Therefore we recommend, for one set of LC/MS runs, +#' to build the recalibration curve for one ion type +#' (\code{msmsWorkflow(mode="pH", steps=c(1:8), newRecalibration=TRUE)}) +#' and reuse the same curve for processing different ion types +#' (\code{msmsWorkflow(mode="pNa", steps=c(1:8), newRecalibration=FALSE)}). +#' This also ensures a consistent recalibration across all spectra of the same batch. +#' +#' @usage makeRecalibration(w, mode, +#' recalibrateBy = getOption("RMassBank")$recalibrateBy, +#' recalibrateMS1 = getOption("RMassBank")$recalibrateMS1, +#' recalibrator = getOption("RMassBank")$recalibrator, +#' recalibrateMS1Window = getOption("RMassBank")$recalibrateMS1Window +#' ) +#' +#' recalibrateSpectra(mode, rawspec = NULL, rc = NULL, rc.ms1=NULL, w = NULL, +#' recalibrateBy = getOption("RMassBank")$recalibrateBy, +#' recalibrateMS1 = getOption("RMassBank")$recalibrateMS1) +#' +#' recalibrateSingleSpec(spectrum, rc, +#' recalibrateBy = getOption("RMassBank")$recalibrateBy) +#' @aliases makeRecalibration recalibrateSpectra recalibrateSingleSpec +#' @param w For \code{makeRecalibration}: to perform the recalibration with. For \code{recalibrateSpectra}: +#' the \code{msmsWorkspace} which contains the recalibration curves (alternatively to specifying \code{rc, rc.ms1}). +#' @param spectrum For \code{recalibrateSingleSpec}: +#' a \code{MSnbase} \code{Spectrum}-derived object, commonly a \code{RmbSpectrum2} for MS2 or \code{Spectrum1} for MS1. +#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-). +#' @param rawspec For \code{recalibrateSpectra}:an \code{RmbSpectraSetList} of \code{RmbSpectraSet} objects +#' , as the \code{w@@spectra} slot from \code{msmsWorkspace} or any object returned by \code{\link{findMsMsHR}}. +#' If empty, no spectra are recalibrated, but the recalibration curve is +#' returned. +#' @param rc,rc.ms1 The recalibration curves to be used in the recalibration. +#' @param recalibrateBy Whether recalibration should be done by ppm ("ppm") or by m/z ("mz"). +#' @param recalibrateMS1 Whether MS1 spectra should be recalibrated separately ("separate"), +#' together with MS2 ("common") or not at all ("none"). Usually taken from settings. +#' @param recalibrator The recalibrator functions to be used. +#' Refer to \code{\link{recalibrate}} for details. Usually taken from settings. +#' @param recalibrateMS1Window Window width to look for MS1 peaks to recalibrate (in ppm). +#' @return \code{makeRecalibration}: a \code{list(rc, rc.ms1)} with recalibration curves +#' for the MS2 and MS1 spectra. +#' +#' \code{recalibrateSpectra}: if \code{rawspec} is not \code{NULL}, returns the recalibrated +#' spectra as \code{RmbSpectraSetList}. All spectra have their mass recalibrated and evaluation data deleted. +#' +#' \code{recalibrateSingleSpec}: the recalibrated \code{Spectrum} (same object, recalibrated masses, +#' evaluation data like assigned formulae etc. deleted). +#' +#' @examples \dontrun{ +#' rcCurve <- recalibrateSpectra(w, "pH") +#' w@@spectra <- recalibrateSpectra(mode="pH", rawspec=w@@spectra, w=myWorkspace) +#' w@@spectra <- recalibrateSpectra(mode="pH", rawspec=w@@spectra, rcCurve$rc, rcCurve$rc.ms1) +#' } +#' +#' @author Michael Stravs, Eawag +#' @export +makeRecalibration <- function(w, mode, + recalibrateBy = getOption("RMassBank")$recalibrateBy, + recalibrateMS1 = getOption("RMassBank")$recalibrateMS1, + recalibrator = getOption("RMassBank")$recalibrator, + recalibrateMS1Window = getOption("RMassBank")$recalibrateMS1Window + ) +{ + if(is.null(w@spectra)) + stop("No spectra present to generate recalibration curve.") + + rcdata <- peaksMatched(w) + rcdata <- rcdata[rcdata$formulaCount == 1, ,drop=FALSE] + + rcdata <- rcdata[,c("mzFound", "dppm", "mzCalc")] + + if(nrow(rcdata) == 0) + stop("No peaks matched to generate recalibration curve.") + + ms1data <- recalibrate.addMS1data(w@spectra, mode, recalibrateMS1Window) + ms1data <- ms1data[,c("mzFound", "dppm", "mzCalc")] + + if (recalibrateMS1 != "none") { + ## Add m/z values from MS1 to calibration datapoints + rcdata <- rbind(rcdata, ms1data) + } + + rcdata$dmz <- rcdata$mzFound - rcdata$mzCalc + ms1data$dmz <- ms1data$mzFound - ms1data$mzCalc + + if(recalibrateBy == "dppm") + { + rcdata$recalfield <- rcdata$dppm + ms1data$recalfield <- ms1data$dppm + } + else + { + rcdata$recalfield <- rcdata$dmz + ms1data$recalfield <- ms1data$dmz + } + + # generate recalibration model + rc <- do.call(recalibrator$MS2, list(rcdata)) + if(recalibrateMS1 == "separate") + rc.ms1 <- do.call(recalibrator$MS1, list(ms1data)) + else + rc.ms1 <- rc + + # plot the model + par(mfrow=c(2,2)) + if(nrow(rcdata)>0) + plotRecalibration.direct(rcdata, rc, rc.ms1, "MS2", + range(rcdata$mzFound), + recalibrateBy) + if(nrow(ms1data)>0) + plotRecalibration.direct(ms1data, rc, rc.ms1, "MS1", + range(ms1data$mzFound), + recalibrateBy) + # Return the computed recalibration curves + return(list(rc=rc, rc.ms1=rc.ms1)) +} + + + +#' Plot the recalibration graph. +#' +#' @aliases plotRecalibration plotRecalibration.direct +#' @usage plotRecalibration(w, recalibrateBy = getOption("RMassBank")$recalibrateBy) +#' +#' plotRecalibration.direct(rcdata, rc, rc.ms1, title, mzrange, +#' recalibrateBy = getOption("RMassBank")$recalibrateBy) +#' +#' @param w The workspace to plot the calibration graph from +#' @param rcdata A data frame with columns \code{recalfield} and \code{mzFound}. +#' @param rc Predictor for MS2 data +#' @param rc.ms1 Predictor for MS1 data +#' @param title Prefix for the graph titles +#' @param mzrange m/z value range for the graph +#' @param recalibrateBy Whether recalibration was done by ppm ("ppm") or by m/z ("mz"). +#' Important only for graph labeling here. +#' +#' @author Michele Stravs, Eawag +#' @export +plotRecalibration <- function(w, recalibrateBy = getOption("RMassBank")$recalibrateBy) +{ + spec <- w@aggregated + if(!is.null(w@parent)) + spec <- w@parent@aggregated + + rcdata <- data.frame(mzFound = w@rc$x, recalfield = w@rc$y) + ms1data <- data.frame(mzFound = w@rc.ms1$x, recalfield = w@rc.ms1$y) + + + + par(mfrow=c(2,2)) + if(nrow(rcdata)>0) + plotRecalibration.direct(rcdata, w@rc, w@rc.ms1, "MS2", + range(spec$mzFound[which(spec$good)]),recalibrateBy) + if(nrow(ms1data)>0) + plotRecalibration.direct(ms1data, w@rc, w@rc.ms1, "MS1", + range(ms1data$mzFound),recalibrateBy) + +} + +#' @export +plotRecalibration.direct <- function(rcdata, rc, rc.ms1, title, mzrange, + recalibrateBy = getOption("RMassBank")$recalibrateBy + ) +{ + if(recalibrateBy == "dppm") + ylab.plot <- expression(paste(delta, "ppm")) + else + ylab.plot <- expression(paste(delta, "m/z")) + + plot(recalfield ~ mzFound, data=rcdata, + xlab = "m/z", ylab = ylab.plot, main=paste(title, "scatterplot")) + RcModelMz <- seq(mzrange[[1]], mzrange[[2]], by=0.2) + RcModelRecal <- predict(rc, newdata= data.frame(mzFound =RcModelMz)) + RcModelRecalMs1 <- predict(rc.ms1, newdata= data.frame(mzFound =RcModelMz)) + lines(RcModelMz, RcModelRecal, col="blue") + lines(RcModelMz, RcModelRecalMs1, col="yellow") + if((length(unique(rcdata$mzFound))>1) & + (length(unique(rcdata$recalfield))>1)) + { + if(requireNamespace("gplots",quietly=TRUE)) + { + + gplots::hist2d(rcdata$mzFound, rcdata$recalfield, + col=c("white", heat.colors(12)), xlab="m/z", + ylab = ylab.plot, main=paste(title, "density")) + lines(RcModelMz, RcModelRecal, col="blue") + lines(RcModelMz, RcModelRecalMs1, col="yellow") + } + else + { + message("Package gplots not installed. The recalibration density plot will not be displayed.") + message("To install gplots: install.packages('gplots')") + } + } +} + + +#' @export +recalibrateSpectra <- function(mode, rawspec = NULL, rc = NULL, rc.ms1=NULL, w = NULL, + recalibrateBy = getOption("RMassBank")$recalibrateBy, + recalibrateMS1 = getOption("RMassBank")$recalibrateMS1) +{ + # Load the recal curves from the workspace if one is specified. + if(!is.null(w)) + { + rc <- w@rc + rc.ms1 <- w@rc.ms1 + } + if(is.null(rc) || is.null(rc.ms1)) + stop("Please specify the recalibration curves either via workspace (w) or via parameters rc, rc.ms1.") + + # Do the recalibration + if(!is.null(rawspec)) + { + # go through all raw spectra and recalculate m/z values + recalibratedSpecs <- lapply(rawspec, function(s) + { + if(s@found) + { + # recalculate tandem spectrum peaks + recalSpectra <- lapply(s@children, function(p) + { + recalibrateSingleSpec(p, rc, recalibrateBy) + }) + s@children <- as(recalSpectra, "SimpleList") + # recalculate MS1 spectrum if required + if(recalibrateMS1 != "none") + { + s@parent <- recalibrateSingleSpec(s@parent, rc.ms1, recalibrateBy) + } + } + s@empty <- NA + s@complete <- NA + return(s) + } ) + return(as(recalibratedSpecs, "SimpleList")) + } + else # no rawspec passed + return(list()) +} + +#' @export +recalibrateSingleSpec <- function(spectrum, rc, + recalibrateBy = getOption("RMassBank")$recalibrateBy) +{ + spectrum.df <- as.data.frame(spectrum) + spectrum.df <- spectrum.df[!duplicated(spectrum.df$mz),,drop=FALSE] + spectrum.df <- spectrum.df[order(spectrum.df$mz),,drop=FALSE] + + mzVals <- spectrum.df + if(nrow(mzVals) > 0) + { + # Fix the column names so our + # prediction functions choose the right + # rows. + colnames(mzVals) <- c("mzFound", "int") + drecal <- predict(rc, newdata=mzVals) + if(recalibrateBy == "dppm") + mzRecal <- mzVals$mzFound / (1 + drecal/1e6) + else + mzRecal <- mzVals$mzFound - drecal + # And rename them back so our "mz" column is + # called "mz" again + } + spectrum.df$mz <- mzRecal + + + # now comes the part that I don't like too much; this could be improved by using as.data.frame instead of getData and correspondingly + # also not use setData. For now I leave it like this. + # The problem is that I am not sure whether the default behaviour of as.RmbSpectrum2 should be clean=TRUE or FALSE, + # and vice versa, I am not sure if as.data.frame should return only mz/int or the whole table. + + if(is(spectrum, "RmbSpectrum2")) + { + # this removes all evaluated data that were added in step 2 except for @ok I think + colnames(spectrum.df) <- c("mz", "intensity") + spectrum <- setData(spectrum, spectrum.df, clean=TRUE) + # It also avoids making a new object when we don't know what class it should be + } + else + { + # for Spectrum1 or all others that we don't know + spectrum@mz <- spectrum.df$mz + spectrum@intensity <- spectrum.df$i + } + + return(spectrum) +} + + + + + +#' Filter satellite peaks +#' +#' Filters satellite peaks in FT spectra which arise from FT artifacts and from +#' conversion to stick mode. A very simple rule is used which holds mostly true +#' for MSMS spectra (and shouldn't be applied to MS1 spectra which contain +#' isotope structures...) +#' +#' The function cuts off all peaks within 0.5 m/z from every peak, in +#' decreasing intensity order, which are below 5% of the referring peak's +#' intensity. E.g. for peaks m/z=100, int=100; m/z=100.2, int=2, m/z=100.3, +#' int=6, m/z 150, int=10: The most intense peak (m/z=100) is selected, all +#' neighborhood peaks below 5% are removed (in this case, only the m/z=100.2 +#' peak) and the next less intense peak is selected. Here this is the m/z=150 +#' peak. All low-intensity neighborhood peaks are removed (nothing). The next +#' less intense peak is selected (m/z=100.3) and again neighborhood peaks are +#' cut away (nothing to cut here. Note that the m/z = 100.2 peak was alredy +#' removed.) +#' +#' @usage filterPeakSatellites(peaks, filterSettings = getOption("RMassBank")$filterSettings) +#' @param peaks A peak dataframe with at least the columns \code{mz, int}. Note +#' that \code{mz} is used even for the recalibrated spectra, i.e. the +#' desatellited spectrum is identical for both the unrecalibrated and the +#' recalibrated spectra. +#' @param filterSettings The settings used for filtering. Refer to \code{\link{analyzeMsMs}} +#' documentation for filter settings. +#' @return Returns the peak table with satellite peaks removed. +#' @note This is a very crude rule, but works remarkably well for our spectra. +#' @author Michael Stravs +#' @seealso \code{\link{analyzeMsMs}}, \code{\link{filterLowaccResults}} +#' @examples +#' +#' # From the workflow: +#' \dontrun{ +#' # Filter out satellite peaks: +#' shot <- filterPeakSatellites(shot) +#' shot_satellite_n <- setdiff(row.names(shot_full), row.names(shot)) +#' shot_satellite <- shot_full[shot_satellite_n,] +#' # shot_satellite contains the peaks which were eliminated as satellites. +#' } +#' +#' @export +filterPeakSatellites <- function(peaks, filterSettings = getOption("RMassBank")$filterSettings) +{ + cutoff_int_limit <- filterSettings$satelliteIntLimit + cutoff_mz_limit <- filterSettings$satelliteMzLimit + # Order by intensity (descending) + peaks_o <- peaks[order(peaks$intensity, decreasing=TRUE),,drop=FALSE] + n <- 1 + # As long as there are peaks left AND the last peak is small enough (relative + # to selected), move to the next peak + while(n < nrow(peaks_o)) + { + if(peaks_o[nrow(peaks_o),"intensity"] >= cutoff_int_limit *peaks_o[n,"intensity"]) + break + # remove all peaks within cutoff_mz_limit (std. m/z = 0.5) which have intensity + # of less than 5% relative to their "parent" peak + # + peaks_l <- peaks_o[ (peaks_o$mz > peaks_o[n,"mz"] - cutoff_mz_limit) + & (peaks_o$mz < peaks_o[n,"mz"] + cutoff_mz_limit) + & (peaks_o$intensity < cutoff_int_limit * peaks_o[n,"intensity"]),,drop=FALSE] + peaks_o <- peaks_o[ !((peaks_o$mz > peaks_o[n,"mz"] - cutoff_mz_limit) + & (peaks_o$mz < peaks_o[n,"mz"] + cutoff_mz_limit) + & (peaks_o$intensity < cutoff_int_limit * peaks_o[n,"intensity"]) + ),,drop=FALSE] + n <- n+1 + } + return(peaks_o[order(peaks_o$mz),,drop=FALSE]) +} + + +#' Reanalyze unmatched peaks +#' +#' Reanalysis of peaks with no matching molecular formula by allowing +#' additional elements (e.g. "N2O"). +#' +#' \code{reanalyzeFailpeaks} examines the \code{unmatchedPeaksC} table in +#' \code{specs} and sends every peak through \code{reanalyzeFailpeak}. +#' +#' @aliases reanalyzeFailpeaks reanalyzeFailpeak +#' @usage reanalyzeFailpeaks(aggregated, custom_additions, mode, filterSettings = +#' getOption("RMassBank")$filterSettings, progressbar = "progressBarHook") +#' reanalyzeFailpeak(custom_additions, mass, cpdID, counter, pb = NULL, mode, +#' filterSettings = getOption("RMassBank")$filterSettings) +#' @param aggregated A peake aggregate table (\code{w@@aggregate}) (after processing electronic noise removal!) +#' @param custom_additions The allowed additions, e.g. "N2O". +#' @param mode Processing mode (\code{"pH", "pNa", "mH"} etc.) +#' @param mass (Usually recalibrated) m/z value of the peak. +#' @param cpdID Compound ID of this spectrum. +#' @param counter Current peak index (used exclusively for the progress +#' indicator) +#' @param pb A progressbar object to display progress on, as passed by +#' \code{reanalyzeFailpeaks} to \code{reanalyzeFailpeak}. No progress +#' is displayed if NULL. +#' @param progressbar The progress bar callback to use. Only needed for specialized +#' applications. Cf. the documentation of \code{\link{progressBarHook}} for usage. +#' @param filterSettings Settings for filtering data. Refer to\code{\link{analyzeMsMs}} for settings. +#' @return The aggregate data frame extended by the columns: +#' #' \item{reanalyzed.???}{If reanalysis (step 7) has already been processed: matching values from the reanalyzed peaks} +#' \item{matchedReanalysis}{Whether reanalysis has matched (\code{TRUE}), not matched(\code{FALSE}) or has not been conducted for the peak(\code{NA}).} +#' +#' It would be good to merge the analysis functions of \code{analyzeMsMs} with +#' the one used here, to simplify code changes. +#' @author Michael Stravs +#' @seealso \code{\link{analyzeMsMs}}, \code{\link{msmsWorkflow}} +#' @examples +#' +#' ## As used in the workflow: +#' \dontrun{ +#' reanalyzedRcSpecs <- reanalyzeFailpeaks(w@@aggregated, custom_additions="N2O", mode="pH") +#' # A single peak: +#' reanalyzeFailpeak("N2O", 105.0447, 1234, 1, 1, "pH") +#' } +#' +#' @export +reanalyzeFailpeaks <- function(aggregated, custom_additions, mode, filterSettings = + getOption("RMassBank")$filterSettings, progressbar = "progressBarHook") +{ + + fp <- peaksUnmatched(aggregated, cleaned=TRUE) + fp <- fp[is.na(fp$dppm) | (fp$dppm == fp$dppmBest),] + #fp <- pu[!pu$noise,,drop=FALSE] + + custom_additions_l <- as.list(rep(x=custom_additions, times=nrow(fp))) + mode_l <- as.list(rep(x=mode, times=nrow(fp))) + nLen <- nrow(fp) + + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=max(nLen,1))) + temp <- data.frame() + if(nLen == 0) + { + message("reanalyzeFailpeaks: No peaks to reanalyze.") + temp <- data.frame( + "reanalyzed.formula" = character(), + "reanalyzed.mzCalc" = numeric(), + "reanalyzed.dppm" = numeric(), + "reanalyzed.formulaCount" = numeric(), + "reanalyzed.dbe" = numeric()) + } + else + { + counter <- as.list(1:nrow(fp)) + # this is the reanalysis step: run reanalyze.failpeak (with the relevant parameters) + # on each failpeak. + temp <- mapply(reanalyzeFailpeak, custom_additions_l, fp$mzFound, fp$cpdID, counter, + MoreArgs=list(mode=mode, pb=list(hook=progressbar, bar=pb), filterSettings=filterSettings)) + # reformat the result and attach it to specs + temp <- as.data.frame(t(temp)) + temp <- temp[,c("reanalyzed.formula", "reanalyzed.mzCalc", "reanalyzed.dppm", + "reanalyzed.formulaCount", "reanalyzed.dbe")] + } + + # Add columns to the aggregated table (they are then filled in with the obtained values for reanalyzed peaks and left + # empty otherwise + aggregated <- addProperty(aggregated, "reanalyzed.formula", "character") + aggregated <- addProperty(aggregated, "reanalyzed.mzCalc", "numeric") + aggregated <- addProperty(aggregated, "reanalyzed.dppm", "numeric") + aggregated <- addProperty(aggregated, "reanalyzed.formulaCount", "numeric") + aggregated <- addProperty(aggregated, "reanalyzed.dbe", "numeric") + aggregated <- addProperty(aggregated, "matchedReanalysis", "logical", NA) + + + peaksReanalyzed <- cbind(fp, temp) + + # Since some columns are in "list" type, they disturb later on. + # therefore, fix them and make them normal vectors. + listcols <- unlist(lapply(colnames(peaksReanalyzed), function(col) + is.list(peaksReanalyzed[,col]))) + for(col in colnames(peaksReanalyzed)[which(listcols==TRUE)]) + peaksReanalyzed[,col] <- + unlist(peaksReanalyzed[,col]) + + peaksReanalyzed$matchedReanalysis <- !is.na(peaksReanalyzed$reanalyzed.dppm) + + # Substitute in the reanalyzed peaks into the aggregated table + aggregated[match(peaksReanalyzed$index, aggregated$index),] <- peaksReanalyzed + + do.call(progressbar, list(object=pb, close=TRUE)) + return(aggregated) +} + + +#' @export +reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, mode, + filterSettings = getOption("RMassBank")$filterSettings) +{ + # the counter to show the progress + if(!is.null(pb)) + { + do.call(pb$hook, list(object=pb$bar, value=counter)) + } + # here follows the Rcdk analysis + #------------------------------------ + + # define the adduct additions + if(mode == "pH") { + allowed_additions <- "H" + mode.charge <- 1 + } else if(mode == "pNa") { + allowed_additions <- "Na" + mode.charge <- 1 + } else if(mode == "pM") { + allowed_additions <- "" + mode.charge <- 1 + } else if(mode == "mM") { + allowed_additions <- "" + mode.charge <- -1 + } else if(mode == "mH") { + allowed_additions <- "H-1" + mode.charge <- -1 + } else if(mode == "mFA") { + allowed_additions <- "C2H3O2" + mode.charge <- -1 + } else { + stop("mode = \"", mode, "\" not defined") + } + + # the ppm range is two-sided here. + # The range is slightly expanded because dppm calculation of + # generate.formula starts from empirical mass, but dppm cal- + # culation of the evaluation starts from theoretical mass. + # So we don't miss the points on 'the border'. + + db_formula <- findFormula(cpdID, retrieval=findLevel(cpdID,TRUE)) + + ppmlimit <- 2.25 * filterSettings$ppmFine + parent_formula <- add.formula(db_formula, allowed_additions) + parent_formula <- add.formula(parent_formula, custom_additions) + dbe_parent <- dbe(parent_formula) + # check whether the formula is valid, i.e. has no negative or zero element numbers. + #print(parent_formula) + limits <- to.limits.rcdk(parent_formula) + + peakformula <- tryCatch(suppressWarnings(generate.formula(mass, ppm(mass, ppmlimit, p=TRUE), + limits, charge=mode.charge)), error=function(e) NA) + # was a formula found? If not, return empty result + if(!is.list(peakformula)) + return(as.data.frame( + t(c(mzFound=as.numeric(as.character(mass)), + reanalyzed.formula=NA, reanalyzed.mzCalc=NA, reanalyzed.dppm=NA, + reanalyzed.formulaCount=0, + reanalyzed.dbe=NA)))) + else # if is.list(peakformula) + # formula found? then return the one with lowest dppm + { + # calculate dppm for all formulas + peakformula <- sapply(peakformula, function(f) + { + l <- list(mzFound=as.numeric(as.character(mass)), + reanalyzed.formula=as.character(f@string), + reanalyzed.mzCalc=as.numeric(as.character(f@mass)) + ) + + return(unlist(l)) + }) + + # filter out bad dbe stuff + peakformula <- as.data.frame(t(peakformula)) + # for some reason completely oblivious to me, the columns in peakformula + # are still factors, even though i de-factored them by hand. + # Therefore, convert them again... + peakformula$mzFound <- as.numeric(as.character(peakformula$mzFound)) + peakformula$reanalyzed.formula <- as.character(peakformula$reanalyzed.formula) + peakformula$reanalyzed.mzCalc <- as.numeric(as.character(peakformula$reanalyzed.mzCalc)) + + peakformula$reanalyzed.dppm <- (peakformula$mzFound / peakformula$reanalyzed.mzCalc - 1) * 1e6 + peakformula$reanalyzed.formulaCount=nrow(peakformula) + + # filter out bad dbe and high ppm stuff + peakformula$reanalyzed.dbe <- unlist(lapply(peakformula$reanalyzed.formula, dbe)) + peakformula <- peakformula[(peakformula$reanalyzed.dbe >= filterSettings$dbeMinLimit) + & (abs(peakformula$reanalyzed.dppm) < filterSettings$ppmFine),] + # is there still something left? + if(nrow(peakformula) == 0) + return(as.data.frame( + t(c(mzFound=as.numeric(as.character(mass)), + reanalyzed.formula=NA, reanalyzed.mzCalc=NA, reanalyzed.dppm=NA, + reanalyzed.formulaCount=0, reanalyzed.dbe = NA)))) + else + { + #update formula count to the remaining formulas + peakformula$reanalyzed.formulaCount=nrow(peakformula) + return(peakformula[which.min(abs(peakformula$reanalyzed.dppm)),]) + } + + } # endif is.list(peakformula) + + + + } + +#' Multiplicity filtering: Removes peaks which occur only once in a n-spectra set. +#' +#' For every compound, every peak (with annotated formula) is compared +#' across all spectra. Peaks whose formula occurs only once for all collision energies +#' / spectra types, are discarded. This eliminates "stochastic formula hits" of pure +#' electronic noise peaks efficiently from the spectra. Note that in the author's +#' experimental setup two spectra were recorded at every collision energy, +#' and therefore every peak-formula should appear +#' at least twice if it is real, even if it is by chance a fragment which appears +#' on only one collision energy setting. The function was not tested in a different +#' setup. Therefore, use with a bit of caution. +#' @usage filterPeaksMultiplicity(peaks, formulacol, recalcBest = TRUE) +#' @param peaks An aggregate peak data.frame containing all peaks to be analyzed; with at least +#' the columns \code{cpdID, scan, mzFound} and one column for the formula +#' specified with the \code{formulacol} parameter. +#' @param formulacol Which column the assigned formula is stored in. (Needed to separately process \code{"formula"} and +#' \code{"reanalyzed.formula"} multiplicites.) +#' @param recalcBest Whether the best formula for each peak should be re-determined. +#' This is necessary for results from the ordinary \code{\link{analyzeMsMs}} +#' analysis which allows multiple potential formulas per peak - the old best match +#' could potentially have been dropped because of multiplicity filtering. For results +#' from \code{\link{reanalyzeFailpeak}} this is not necessary, since only one potential +#' formula is assigned in this case. +#' @return The peak table is returned, enriched with columns: +#' \itemize{ +#' \item{\code{formulaMultiplicity}}{The # of occurrences of this formula +#' in the spectra of its compounds.} +#' } +#' @examples \dontrun{ +#' peaksFiltered <- filterPeaksMultiplicity(peaksMatched(w), +#' "formula", TRUE) +#' peaksOK <- subset(peaksFiltered, formulaMultiplicity > 1) +#' } +#' @author Michael Stravs, EAWAG +#' @export +filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) +{ + # create dummy for the case that we have no rows + multInfo <- data.frame(cpdID = character(), + formulacol = character(), + formulaMultiplicity = numeric()) + # rename (because "formulacol" is not the actually correct name) + colnames(multInfo) <- c("cpdID", formulacol, "formulaMultiplicity") + + if(!is.data.frame(peaks) || (nrow(peaks) == 0) ) + { + peaks <- cbind(peaks, data.frame(formulaMultiplicity=numeric())) + if(recalcBest){ + if(formulacol == "formula"){ + warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") + } + if(formulacol == "reanalyzed.formula"){ + warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") + } + peaks$fM_factor <- as.factor(peaks$formulaMultiplicity) + return(peaks) + } + } + else + { + # calculate duplicity info + multInfo <- aggregate(as.data.frame(peaks$scan), + list(peaks$cpdID, peaks[,formulacol]), FUN=length) + # just for comparison: + # nform <- unique(paste(pks$cpdID,pks$formula)) + + # merge the duplicity info into the peak table + colnames(multInfo) <- c("cpdID", formulacol, "formulaMultiplicity") + peaks <- merge(peaks, multInfo) + } + + # separate log intensity data by duplicity (needs duplicity as a factor) + # and boxplot + peaks$fM_factor <- as.factor(peaks$formulaMultiplicity) + + # nostalgy: dppmBest first, to compare :) + # now we prioritize the most frequent formula instead, and only then apply the + # dppmBest rule + #pks2 <- subset(pks, dppm==dppmBest) + + # split peak intensity by multiplicity + peakMultiplicitySets <- split(log(peaks$int,10), peaks$fM_factor) + #boxplot(peakMultiplicitySets) + # nice plot :) + #if(length(peakMultiplicitySets) > 0) + # q <- quantile(peakMultiplicitySets[[1]], c(0,.25,.5,.75,.95,1)) + pk_data <- lapply(peakMultiplicitySets, length) + + # now by formula, not by peak: + multInfo$fM_factor <- as.factor(multInfo$formulaMultiplicity) + # the formulas are split into bins with their multiplicity + # (14 bins for our 14-spectra method) + formulaMultiplicitySets <- split(multInfo[,formulacol], multInfo$fM_factor) + formulaMultiplicityHist <- lapply(formulaMultiplicitySets, length) + + # if we use recalcBest, then we recalculate which peak in the + # list was best. We do this for the peaks matched in the first analysis. + # The peaks from the reanalysis are single anyway and don't get this additional + # treatment. + + if(recalcBest == FALSE) + return(peaks) + + # prioritize duplicate peaks + # get unique peaks with their maximum-multiplicity formula attached + best_mult <- aggregate(as.data.frame(peaks$formulaMultiplicity), + list(peaks$cpdID, peaks$scan, peaks$mzFound), + max) + colnames(best_mult) <- c("cpdID", "scan", "mzFound", "bestMultiplicity") + peaks <- merge(peaks, best_mult) + peaks <- peaks[peaks$formulaMultiplicity==peaks$bestMultiplicity,] + + # now we also have to recalculate dppmBest since the "old best" may have been + # dropped. + peaks$dppmBest <- NULL + bestPpm <- aggregate(as.data.frame(peaks$dppm), + list(peaks$cpdID, peaks$scan, peaks$mzFound), + function(dppm) dppm[[which.min(abs(dppm))]]) + colnames(bestPpm) <- c("cpdID", "scan", "mzFound", "dppmBest") + peaks <- merge(peaks, bestPpm) + pks_best <- peaks[peaks$dppm==peaks$dppmBest,] + + # And, iteratively, the multiplicity also must be recalculated, because we dropped + # some peaks and the multiplicites of some of the formulas will have decreased. + + pks_best$formulaMultiplicity <- NULL + pks_best$bestMultiplicity <- NULL + multInfo_best <- aggregate(as.data.frame(pks_best$scan), + list(pks_best$cpdID, pks_best[,formulacol]), + FUN=length) + colnames(multInfo_best) <- c("cpdID", formulacol, "formulaMultiplicity") + pks_best <- merge(pks_best, multInfo_best) + pks_best$fM_factor <- as.factor(pks_best$formulaMultiplicity) + multInfo_best$fM_factor <- as.factor(multInfo_best$formulaMultiplicity) + + formulaMultplicitySets_best <- split(multInfo_best[,formulacol], multInfo_best$fM_factor) + formulaMultplicityHist_best <- lapply(formulaMultplicitySets_best, length) + + peakMultiplicitySets_best <- split(log(pks_best$int,10), pks_best$fM_factor) + #boxplot(peakMultiplicitySets_best) + #q <- quantile(peakMultiplicitySets_best[[1]], c(0,.25,.5,.75,.95,1)) + #peakMultiplicityHist_best <- lapply(peakMultiplicitySets_best, length) + #q + pks_best$fM_factor <- NULL + # this returns the "best" peaks (first by formula multiplicity, then by dppm) + # before actually cutting the bad ones off. + + + return(pks_best) +} + + +#' filterMultiplicity +#' +#' Multiplicity filtering: Removes peaks which occur only once in a n-spectra +#' set. +#' +#' This function executes multiplicity filtering for a set of spectra using the +#' workhorse function \code{\link{filterPeaksMultiplicity}} (see details there) +#' and retrieves problematic filtered peaks (peaks which are of high intensity +#' but were discarded, because either no formula was assigned or it was not +#' present at least 2x), using the workhorse function +#' \code{\link{problematicPeaks}}. The results are returned in a format ready +#' for further processing with \code{\link{mbWorkflow}}. +#' +#' @usage filterMultiplicity(w, archivename=NA, mode="pH", recalcBest = TRUE, +#' multiplicityFilter = getOption("RMassBank")$multiplicityFilter) +#' @param w Workspace containing the data to be processed (aggregate table and \code{RmbSpectraSet} objects) +#' @param archivename The archive name, used for generation of +#' archivename_Failpeaks.csv +#' @param mode Mode of ion analysis +#' @param recalcBest Boolean, whether to recalculate the formula multiplicity +#' after the first multiplicity filtering step. Sometimes, setting this +#' to FALSE can be a solution if you have many compounds with e.g. fluorine +#' atoms, which often have multiple assigned formulas per peak and might occasionally +#' lose peaks because of that. +#' @param multiplicityFilter Threshold for the multiplicity filter. If set to 1, +#' no filtering will apply (minimum 1 occurrence of peak). 2 equals minimum +#' 2 occurrences etc. +#' @return A list object with values: +#' \item{peaksOK}{ Peaks with >1-fold formula multiplicity from the +#' "normal" peak analysis. } +#' \item{peaksReanOK}{ Peaks with >1-fold formula multiplicity from +#' peak reanalysis. } +#' \item{peaksFiltered}{ All peaks with annotated formula multiplicity from +#' first analysis. } +#' \item{peaksFilteredReanalysis}{ All peaks with annotated +#' formula multiplicity from peak reanalysis. } +#' \item{peaksProblematic}{ Peaks with high intensity which do not match +#' inclusion criteria -> possible false negatives. The list will be +#' exported into archivename_failpeaks.csv. +#' } +#' @author Michael Stravs +#' @seealso +#' \code{\link{filterPeaksMultiplicity}},\code{\link{problematicPeaks}} +#' @examples +#' \dontrun{ +#' refilteredRcSpecs <- filterMultiplicity( +#' w, "myarchive", "pH") +#' } +#' @export +filterMultiplicity <- function(w, archivename=NA, mode="pH", recalcBest = TRUE, + multiplicityFilter = getOption("RMassBank")$multiplicityFilter) +{ + # Read multiplicity filter setting + # For backwards compatibility: If the option is not set, define as 2 + # (which was the behaviour before we introduced the option) + if(is.null(multiplicityFilter)) + multiplicityFilter <- 2 + + specs <- w@aggregated + + peaksFiltered <- filterPeaksMultiplicity(peaksMatched(specs), + "formula", recalcBest) + + + peaksFilteredReanalysis <- + filterPeaksMultiplicity(specs[!is.na(specs$matchedReanalysis) & specs$matchedReanalysis,,drop=FALSE], "reanalyzed.formula", FALSE) + + + + specs <- addProperty(specs, "formulaMultiplicity", "numeric", 0) + + # Reorder the columns of the filtered peaks such that they match the columns + # of the original aggregated table; such that the columns can be substituted in. + + peaksFiltered <- peaksFiltered[,colnames(specs)] + peaksFilteredReanalysis <- peaksFilteredReanalysis[,colnames(specs)] + + # substitute into the parent dataframe + specs[match(peaksFiltered$index,specs$index),] <- peaksFiltered + specs[match(peaksFilteredReanalysis$index,specs$index),] <- peaksFilteredReanalysis + + + specs <- addProperty(specs, "filterOK", "logical", FALSE) + + OKindex <- which(specs$formulaMultiplicity > (multiplicityFilter - 1)) + + if(length(OKindex)){ + specs[OKindex,"filterOK"] <- TRUE + } + + peaksReanOK <- specs[ + specs$filterOK & !is.na(specs$matchedReanalysis) & specs$matchedReanalysis,,drop=FALSE] + + # Kick the M+H+ satellites out of peaksReanOK: + peaksReanOK$mzCenter <- as.numeric( + unlist(lapply(peaksReanOK$cpdID, function(id) findMz(id, retrieval=findLevel(id,TRUE))$mzCenter)) ) + peaksReanBad <- peaksReanOK[ + !((peaksReanOK$mzFound < peaksReanOK$mzCenter - 1) | + (peaksReanOK$mzFound > peaksReanOK$mzCenter + 1)),] + notOKindex <- match(peaksReanBad$index, specs$index) + if(length(notOKindex)){ + specs[notOKindex,"filterOK"] <- FALSE + } + + + return(specs) +} + +#' Return MS1 peaks to be used for recalibration +#' +#' Returns the precursor peaks for all MS1 spectra in the \code{spec} dataset +#' with annotated formula to be used in recalibration. +#' +#' For all spectra in \code{spec$specFound}, the precursor ion is extracted from +#' the MS1 precursor spectrum. All found ions are returned in a data frame with a +#' format matching \code{spec$peaksMatched} and therefore suitable for \code{rbind}ing +#' to the \code{spec$peaksMatched} table. However, only minimal information needed for +#' recalibration is returned. +#' +#' @usage recalibrate.addMS1data(spec,mode="pH", recalibrateMS1Window = +#' getOption("RMassBank")$recalibrateMS1Window) +#' @param spec A \code{msmsWorkspace} or \code{RmbSpectraSetList} containing spectra for which MS1 "peaks" should be "constructed". +#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' @param recalibrateMS1Window Window width to look for MS1 peaks to recalibrate (in ppm). +#' @return A dataframe with columns \code{mzFound, formula, mzCalc, dppm, dbe, int, +#' dppmBest, formulaCount, good, cpdID, scan, parentScan, dppmRc}. However, +#' columns \code{dbe, int, formulaCount, good, scan, parentScan} do not contain +#' real information and are provided only as fillers. +#' @examples \dontrun{ +#' # More or less as used in recalibrateSpectra: +#' rcdata <- peaksMatched(w) +#' rcdata <- rcdata[rcdata$formulaCount == 1, ,drop=FALSE] +#' ms1data <- recalibrate.addMS1data(w, "pH", 15) +#' rcdata <- rbind(rcdata, ms1data) +#' # ... continue constructing recalibration curve with rcdata +#' } +#' @author Michael Stravs, EAWAG +#' @export +recalibrate.addMS1data <- function(spec,mode="pH", recalibrateMS1Window = + getOption("RMassBank")$recalibrateMS1Window) +{ + ## which_OK <- lapply(validPrecursors, function(pscan) + ## { + ## pplist <- as.data.frame( + ## mzR::peaks(msRaw, which(headerData$acquisitionNum == pscan))) + ## colnames(pplist) <- c("mz","int") + ## pplist <- subset(pplist, mz >= mzLimits$mzMin & mz <= mzLimits$mzMax) + ## if(nrow(pplist) > 0) + ## return(TRUE) + ## return(FALSE) + ## }) + + specFound <- selectSpectra(spec, "found", "object") + + ms1peaks <- lapply(specFound, function(cpd){ + mzL <- findMz.formula(cpd@formula,mode,recalibrateMS1Window,0) + mzCalc <- mzL$mzCenter + ms1 <- mz(cpd@parent) + + mzFound <- ms1[which.min(abs(ms1 - mzL$mzCenter))] + if(!length(mzFound)){ + return(c( + mzFound = NA, + mzCalc = mzCalc, + dppm = NA + )) + } else { + dppmRc <- (mzFound/mzCalc - 1)*1e6 + return(c( + mzFound = mzFound, + mzCalc = mzCalc, + dppm = dppmRc, + id=cpd@id + )) + } + }) + ms1peaks <- as.data.frame(do.call(rbind, ms1peaks), stringsAsFactors=FALSE) + # convert numbers to numeric + tonum <- c("mzFound", "dppm", "mzCalc") + ms1peaks[,tonum] <- as.numeric(unlist(ms1peaks[,tonum])) + # throw out NA stuff + ms1peaks <- ms1peaks[!is.na(ms1peaks$mzFound),] + return(ms1peaks) +} + + +# Custom recalibration function: You can overwrite the recal function by +# making any function which takes rcdata$recalfield ~ rcdata$mzFound. +# The settings define which recal function is used +# getOption("RMassBank")$recalibrator = list( +# MS1 = "recalibrate.loess", +# MS2 = "recalibrate.loess") + +#' Predefined recalibration functions. +#' +#' Predefined fits to use for recalibration: Loess fit and GAM fit. +#' +#' \code{recalibrate.loess()} provides a Loess fit (\code{recalibrate.loess}) +#' to a given recalibration parameter. +#' If MS and MS/MS data should be fit together, recalibrate.loess +#' provides good default settings for Orbitrap instruments. +#' +#' \code{recalibrate.identity()} returns a non-recalibration, i.e. a predictor +#' which predicts 0 for all input values. This can be used if the user wants to +#' skip recalibration in the RMassBank workflow. +#' +#' #' \code{recalibrate.mean()} and \code{recalibrate.linear()} are simple recalibrations +#' which return a constant shift or a linear recalibration. They will be only useful +#' in particular cases. +#' +#' \code{recalibrate()} itself is only a dummy function and does not do anything. +#' +#' Alternatively other functions can be defined. Which functions are used for recalibration +#' is specified by the RMassBank options file. (Note: if \code{recalibrateMS1: common}, the +#' \code{recalibrator: MS1} value is irrelevant, since for a common curve generated with +#' the function specified in \code{recalibrator: MS2} will be used.) +#' +#' @aliases recalibrate.loess recalibrate recalibrate.identity recalibrate.mean recalibrate.linear +#' @usage recalibrate.loess(rcdata) +#' +#' recalibrate.identity(rcdata) +#' +#' recalibrate.mean(rcdata) +#' +#' recalibrate.linear(rcdata) +#' +#' @param rcdata A data frame with at least the columns \code{recalfield} and +#' \code{mzFound}. \code{recalfield} will usually contain delta(ppm) or +#' delta(mz) values and is the target parameter for the recalibration. +#' @return Returns a model for recalibration to be used with \code{predict} and the like. +#' @examples \dontrun{ +#' rcdata <- subset(spec$peaksMatched, formulaCount==1) +#' ms1data <- recalibrate.addMS1data(spec, mode, 15) +#' rcdata <- rbind(rcdata, ms1data) +#' rcdata$recalfield <- rcdata$dppm +#' rcCurve <- recalibrate.loess(rcdata) +#' # define a spectrum and recalibrate it +#' s <- matrix(c(100,150,200,88.8887,95.0005,222.2223), ncol=2) +#' colnames(s) <- c("mz", "int") +#' recalS <- recalibrateSingleSpec(s, rcCurve) +#' +#' Alternative: define an custom recalibrator function with different parameters +#' recalibrate.MyOwnLoess <- function(rcdata) +#' { +#' return(loess(recalfield ~ mzFound, data=rcdata, family=c("symmetric"), +#' degree = 2, span=0.4)) +#' } +#' # This can then be specified in the RMassBank settings file: +#' # recalibrateMS1: common +#' # recalibrator: +#' # MS1: recalibrate.loess +#' # MS2: recalibrate.MyOwnLoess") +#' # [...] +#' } +#' @author Michael Stravs, EAWAG +#' @export +recalibrate <- function() +{ + return(NA) +} + +#' @export +recalibrate.loess <- function(rcdata) +{ + span <- 0.25 + # ex XCMS (permission by Steffen): heuristically decide on loess vs linear + mingroups <- nrow(rcdata[!is.na(rcdata$mzFound),]) + if(mingroups < 4) + { + warning("recalibrate.loess: Not enough data points, omitting recalibration") + return(recalibrate.identity(rcdata)) + } else if (mingroups*span < 4) { + span <- 4/mingroups + warning("recalibrate.loess: Span too small, resetting to ", round(span, 2)) + } + return(loess(recalfield ~ mzFound, data=rcdata, family=c("symmetric"), + degree = 1, span=0.25, surface="direct" )) +} + +#' @export +recalibrate.identity <- function(rcdata) +{ + return(lm(recalfield ~ 0, data=rcdata)) +} + +#' @export +recalibrate.mean <- function(rcdata) +{ + return(lm(recalfield ~ 1, data=rcdata)) +} + +#' @export +recalibrate.linear <- function(rcdata) +{ + return(lm(recalfield ~ mzFound, data=rcdata)) +} + +#' Standard progress bar hook. +#' +#' This function provides a standard implementation for the progress bar in RMassBank. +#' +#' RMassBank calls the progress bar function in the following three ways: +#' \code{pb <- progressBarHook(object=NULL, value=0, min=0, max=LEN)} +#' to create a new progress bar. +#' \code{pb <- progressBarHook(object=pb, value= VAL)} +#' to set the progress bar to a new value (between the set \code{min} and \code{max}) +#' \code{progressBarHook(object=pb, close=TRUE)} +#' to close the progress bar. (The actual calls are performed with \code{do.call}, +#' e.g. +#' \code{progressbar <- "progressBarHook" +#' pb <- do.call(progressbar, list(object=pb, value= nProg)) +#' }. See the source code for details.) +#' +#' To substitute the standard progress bar for an alternative implementation (e.g. for +#' use in a GUI), the developer can write his own function which behaves in the same way +#' as \code{progressBarHook}, i.e. takes the same parameters and can be called in the +#' same way. +#' +#' @param object An identifier representing an instance of a progress bar. +#' @param value The new value to assign to the progress indicator +#' @param min The minimal value of the progress indicator +#' @param max The maximal value of the progress indicator +#' @param close If \code{TRUE}, the progress bar is closed. +#' @return Returns a progress bar instance identifier (i.e. an identifier +#' which can be used as \code{object} in subsequent calls.) +#' +#' @author Michele Stravs, Eawag +#' @export +progressBarHook <- function(object = NULL, value = 0, min = 0, max = 100, close = FALSE) +{ + if(is.null(object)) + { + object <- txtProgressBar(min, max, value, style=3, file=stderr()) + } + if(close) + close(object) + else + { + setTxtProgressBar(object, value) + return(object) + } +} diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index f69458c..c347d5b 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -1,848 +1,1098 @@ -## For generating the NAMESPACE -#' @import mzR - -#' @import Rcpp -## Was not in manually written NAMESPACE ? -#' @import RCurl -#' @import XML -#' @import methods -#' @import mzR -#' @import rcdk -#' @import rjson -#' @import yaml -#' @import digest -NULL # This is required so that roxygen knows where the first manpage starts - - -# # importClassesFrom mzR ## Causes error -# # importMethodsFrom mzR - -#' Extract MS/MS spectra for specified precursor -#' -#' Extracts MS/MS spectra from LC-MS raw data for a specified precursor, specified -#' either via the RMassBank compound list (see \code{\link{loadList}}) or via a mass. -#' -#' Different versions of the function get the data from different sources. Note that -#' findMsMsHR and findMsMsHR.direct differ mainly in that findMsMsHR opens a file -#' whereas findMsMs.direct uses an open file handle - both are intended to be used -#' in a full process which involves compound lists etc. In contrast, findMsMsHR.mass -#' is a low-level function which uses the mass directly for lookup and is intended for -#' use as a standalone function in unrelated applications. -#' -#' @note \code{findMsMs.direct} is deactivated -#' -## # @usage findMsMsHR(fileName, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE, -## # ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine, -## # mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse, -## # fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, -## # rtMargin = getOption("RMassBank")$rtMargin, -## # deprofile = getOption("RMassBank")$deprofile, -## # headerCache = NULL, -## # peaksCache = NULL) -## # -## # findMsMsHR.mass(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA, -## # headerCache = NULL, fillPrecursorScan = FALSE, -## # deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL, cpdID = NA) -#' -#' -#' @aliases findMsMsHR.mass findMsMsHR -#' @param fileName The file to open and search the MS2 spectrum in. -#' @param msRaw The opened raw file (mzR file handle) to search the MS2 spectrum in. Specify either this -#' or \code{fileName}. -#' @param cpdID The compound ID in the compound list (see \code{\link{loadList}}) -#' to use for formula lookup. Note: In \\code{findMsMsHR.mass}, this is entirely optional and -#' used only in case a warning must be displayed; compound lookup is done via mass only. -#' @param mz The mass to use for spectrum search. -#' @param ppmFine The limit in ppm to use for fine limit (see below) calculation. -#' @param mzCoarse The coarse limit to use for locating potential MS2 scans: -#' this tolerance is used when finding scans with a suitable precursor -#' ion value. -#' @param limit.fine The fine limit to use for locating MS2 scans: this tolerance -#' is used when locating an appropriate analyte peak in the MS1 precursor -#' spectrum. -#' @param limit.coarse Parameter in \code{findMsMsHR.mass} corresponding to \code{mzCoarse}. -#' (The parameters are distinct to clearly conceptually distinguish findMsMsHR.mass -#' (a standalone useful function) from the cpdID based functions (workflow functions).) -#' @param mode The processing mode (determines which ion/adduct is searched): -#' \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). -#' @param confirmMode Whether to use the highest-intensity precursor (=0), second- -#' highest (=1), third-highest (=2)... -#' @param useRtLimit Whether to respect retention time limits from the compound list. -#' @param rtLimits \code{c(min, max)}: Minimum and maximum retention time to use -#' when locating the MS2 scans. -#' @param headerCache If present, the complete \code{mzR::header(msRaw)}. Passing -#' this value is useful if spectra for multiple compounds should be -#' extracted from the same mzML file, since it avoids getting the data -#' freshly from \code{msRaw} for every compound. -#' @param peaksCache If present, the complete output of \code{mzR::peaks(msRaw)}. This speeds up the lookup -#' if multiple compounds should be searched in the same file. -#' @param maxCount The maximal number of spectra groups to return. One spectra group -#' consists of all data-dependent scans from the same precursor whose precursor -#' mass matches the specified search mass. -#' @param fillPrecursorScan If \code{TRUE}, the precursor scan will be filled from MS1 data. -#' To be used for data where the precursor scan is not stored in the raw data. -#' @param rtMargin The retention time tolerance to use. -#' @param deprofile Whether deprofiling should take place, and what method should be -#' used (cf. \code{\link{deprofile}}) -#' @param retrieval A value that determines whether the files should be handled either as "standard", -#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" -#' if the only know thing is the m/z -#' @return An \code{RmbSpectraSet} (for \code{findMsMsHR}). Contains parent MS1 spectrum (\code{@@parent}), a block of dependent MS2 spectra ((\code{@@children}) -#' and some metadata (\code{id},\code{mz},\code{name},\code{mode} in which the spectrum was acquired. -#' -#' For \code{findMsMsHR.mass}: a list of \code{RmbSpectraSet}s as defined above, sorted -#' by decreasing precursor intensity. -#' -#' @examples \dontrun{ -#' loadList("mycompoundlist.csv") -#' # if Atrazine has compound ID 1: -#' msms_atrazine <- findMsMsHR(fileName = "Atrazine_0001_pos.mzML", cpdID = 1, mode = "pH") -#' # Or alternatively: -#' msRaw <- openMSfile("Atrazine_0001_pos.mzML") -#' msms_atrazine <- findMsMsHR(msRaw=msRaw, cpdID = 1, mode = "pH") -#' # Or directly by mass (this will return a list of spectra sets): -#' mz <- findMz(1)$mzCenter -#' msms_atrazine_all <- findMsMsHR.mass(msRaw, mz, 1, ppm(msRaw, 10, p=TRUE)) -#' msms_atrazine <- msms_atrazine_all[[1]] -#' } -#' @author Michael A. Stravs, Eawag -#' @seealso findEIC -#' @export -findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE, - ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine, - mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse, - fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, - rtMargin = getOption("RMassBank")$rtMargin, - deprofile = getOption("RMassBank")$deprofile, - headerCache = NULL, - peaksCache = NULL, - retrieval="standard") -{ - - # access data directly for finding the MS/MS data. This is done using - # mzR. - if(!is.null(fileName) & !is.null(msRaw)) - stop("Both MS raw data and MS filename given. Only one can be handled at the same time.") - if(!is.null(fileName)) - msRaw <- openMSfile(fileName) - - mzLimits <- findMz(cpdID, mode, retrieval=retrieval) - mz <- mzLimits$mzCenter - limit.fine <- ppm(mz, ppmFine, p=TRUE) - if(!useRtLimit) - rtLimits <- NA - else - { - dbRt <- findRt(cpdID) - rtLimits <- c(dbRt$RT - rtMargin, dbRt$RT + rtMargin) * 60 - } - spectra <- findMsMsHR.mass(msRaw, mz, mzCoarse, limit.fine, rtLimits, confirmMode + 1,headerCache - ,fillPrecursorScan, deprofile, peaksCache, cpdID) - # check whether a) spectrum was found and b) enough spectra were found - if(length(spectra) < (confirmMode + 1)) - sp <- new("RmbSpectraSet", found=FALSE) - else - sp <- spectra[[confirmMode + 1]] - - #sp@mz <- mzLimits - sp@id <- as.character(as.integer(cpdID)) - sp@name <- findName(cpdID) - ENV <- environment() - if(retrieval == "unknown"){ - sp@formula <- "" - } else{ - sp@formula <- findFormula(cpdID, retrieval=retrieval) - } - sp@mode <- mode - - # If we had to open the file, we have to close it again - if(!is.null(fileName)) - mzR::close(msRaw) - - return(sp) -} - -#' @describeIn findMsMsHR A submethod of find MsMsHR that retrieves basic spectrum data -#' @export -findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA, +## For generating the NAMESPACE +#' @import mzR + +#' @import Rcpp +## Was not in manually written NAMESPACE ? +#' @import RCurl +#' @import XML +#' @import methods +#' @import mzR +#' @import rcdk +#' @import rjson +#' @import yaml +#' @import digest +NULL # This is required so that roxygen knows where the first manpage starts + + +# # importClassesFrom mzR ## Causes error +# # importMethodsFrom mzR + +#' Extract MS/MS spectra for specified precursor +#' +#' Extracts MS/MS spectra from LC-MS raw data for a specified precursor, specified +#' either via the RMassBank compound list (see \code{\link{loadList}}) or via a mass. +#' +#' Different versions of the function get the data from different sources. Note that +#' findMsMsHR and findMsMsHR.direct differ mainly in that findMsMsHR opens a file +#' whereas findMsMs.direct uses an open file handle - both are intended to be used +#' in a full process which involves compound lists etc. In contrast, findMsMsHR.mass +#' is a low-level function which uses the mass directly for lookup and is intended for +#' use as a standalone function in unrelated applications. +#' +#' @note \code{findMsMs.direct} is deactivated +#' +## # @usage findMsMsHR(fileName, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE, +## # ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine, +## # mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse, +## # fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, +## # rtMargin = getOption("RMassBank")$rtMargin, +## # deprofile = getOption("RMassBank")$deprofile, +## # headerCache = NULL, +## # peaksCache = NULL) +## # +## # findMsMsHR.mass(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA, +## # headerCache = NULL, fillPrecursorScan = FALSE, +## # deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL, cpdID = NA) +#' +#' +#' @aliases findMsMsHR.mass findMsMsHR +#' @param fileName The file to open and search the MS2 spectrum in. +#' @param msRaw The opened raw file (mzR file handle) to search the MS2 spectrum in. Specify either this +#' or \code{fileName}. +#' @param cpdID The compound ID in the compound list (see \code{\link{loadList}}) +#' to use for formula lookup. Note: In \\code{findMsMsHR.mass}, this is entirely optional and +#' used only in case a warning must be displayed; compound lookup is done via mass only. +#' @param mz The mass to use for spectrum search. +#' @param ppmFine The limit in ppm to use for fine limit (see below) calculation. +#' @param mzCoarse The coarse limit to use for locating potential MS2 scans: +#' this tolerance is used when finding scans with a suitable precursor +#' ion value. +#' @param limit.fine The fine limit to use for locating MS2 scans: this tolerance +#' is used when locating an appropriate analyte peak in the MS1 precursor +#' spectrum. +#' @param limit.coarse Parameter in \code{findMsMsHR.mass} corresponding to \code{mzCoarse}. +#' (The parameters are distinct to clearly conceptually distinguish findMsMsHR.mass +#' (a standalone useful function) from the cpdID based functions (workflow functions).) +#' @param mode The processing mode (determines which ion/adduct is searched): +#' \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' @param confirmMode Whether to use the highest-intensity precursor (=0), second- +#' highest (=1), third-highest (=2)... +#' @param useRtLimit Whether to respect retention time limits from the compound list. +#' @param rtLimits \code{c(min, max)}: Minimum and maximum retention time to use +#' when locating the MS2 scans. +#' @param headerCache If present, the complete \code{mzR::header(msRaw)}. Passing +#' this value is useful if spectra for multiple compounds should be +#' extracted from the same mzML file, since it avoids getting the data +#' freshly from \code{msRaw} for every compound. +#' @param peaksCache If present, the complete output of \code{mzR::peaks(msRaw)}. This speeds up the lookup +#' if multiple compounds should be searched in the same file. +#' @param maxCount The maximal number of spectra groups to return. One spectra group +#' consists of all data-dependent scans from the same precursor whose precursor +#' mass matches the specified search mass. +#' @param fillPrecursorScan If \code{TRUE}, the precursor scan will be filled from MS1 data. +#' To be used for data where the precursor scan is not stored in the raw data. +#' @param rtMargin The retention time tolerance to use. +#' @param deprofile Whether deprofiling should take place, and what method should be +#' used (cf. \code{\link{deprofile}}) +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @return An \code{RmbSpectraSet} (for \code{findMsMsHR}). Contains parent MS1 spectrum (\code{@@parent}), a block of dependent MS2 spectra ((\code{@@children}) +#' and some metadata (\code{id},\code{mz},\code{name},\code{mode} in which the spectrum was acquired. +#' +#' For \code{findMsMsHR.mass}: a list of \code{RmbSpectraSet}s as defined above, sorted +#' by decreasing precursor intensity. +#' +#' @examples \dontrun{ +#' loadList("mycompoundlist.csv") +#' # if Atrazine has compound ID 1: +#' msms_atrazine <- findMsMsHR(fileName = "Atrazine_0001_pos.mzML", cpdID = 1, mode = "pH") +#' # Or alternatively: +#' msRaw <- openMSfile("Atrazine_0001_pos.mzML") +#' msms_atrazine <- findMsMsHR(msRaw=msRaw, cpdID = 1, mode = "pH") +#' # Or directly by mass (this will return a list of spectra sets): +#' mz <- findMz(1)$mzCenter +#' msms_atrazine_all <- findMsMsHR.mass(msRaw, mz, 1, ppm(msRaw, 10, p=TRUE)) +#' msms_atrazine <- msms_atrazine_all[[1]] +#' } +#' @author Michael A. Stravs, Eawag +#' @seealso findEIC +#' @export +findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE, + ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine, + mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse, + fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, + rtMargin = getOption("RMassBank")$rtMargin, + deprofile = getOption("RMassBank")$deprofile, + headerCache = NULL, + peaksCache = NULL, + retrieval="standard") +{ + + # access data directly for finding the MS/MS data. This is done using + # mzR. + if(!is.null(fileName) & !is.null(msRaw)) + stop("Both MS raw data and MS filename given. Only one can be handled at the same time.") + if(!is.null(fileName)) + msRaw <- openMSfile(fileName) + + mzLimits <- findMz(cpdID, mode, retrieval=retrieval) + mz <- mzLimits$mzCenter + limit.fine <- ppm(mz, ppmFine, p=TRUE) + if(!useRtLimit) + rtLimits <- NA + else + { + dbRt <- findRt(cpdID) + rtLimits <- c(dbRt$RT - rtMargin, dbRt$RT + rtMargin) * 60 + } + spectra <- findMsMsHR.mass(msRaw, mz, mzCoarse, limit.fine, rtLimits, confirmMode + 1,headerCache + ,fillPrecursorScan, deprofile, peaksCache, cpdID) + # check whether a) spectrum was found and b) enough spectra were found + if(length(spectra) < (confirmMode + 1)) + sp <- new("RmbSpectraSet", found=FALSE) + else + sp <- spectra[[confirmMode + 1]] + + #sp@mz <- mzLimits + sp@id <- as.character(as.integer(cpdID)) + sp@name <- findName(cpdID) + ENV <- environment() + if(retrieval == "unknown"){ + sp@formula <- "" + } else{ + sp@formula <- findFormula(cpdID, retrieval=retrieval) + } + sp@mode <- mode + + # If we had to open the file, we have to close it again + if(!is.null(fileName)) + mzR::close(msRaw) + + return(sp) +} + +#' @describeIn findMsMsHR A submethod of find MsMsHR that retrieves basic spectrum data +#' @export +findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA, headerCache = NULL, fillPrecursorScan = FALSE, deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL, cpdID = NA) -{ - eic <- findEIC(msRaw, mz, limit.fine, rtLimits, headerCache=headerCache, - peaksCache=peaksCache) - # if(!is.na(rtLimits)) - # { - # eic <- subset(eic, rt >= rtLimits[[1]] & rt <= rtLimits[[2]]) - # } - if(!is.null(headerCache)) - headerData <- headerCache - else - headerData <- as.data.frame(header(msRaw)) - - - ###If no precursor scan number, fill the number - if(length(unique(headerData$precursorScanNum)) == 1){ - fillPrecursorScan <- TRUE - } - - if(fillPrecursorScan == TRUE) - { - # reset the precursor scan number. first set to NA, then - # carry forward the precursor scan number from the last parent scan - headerData$precursorScanNum <- NA - headerData[which(headerData$msLevel == 1),"precursorScanNum"] <- - headerData[which(headerData$msLevel == 1),"acquisitionNum"] - headerData[,"precursorScanNum"] <- .locf(headerData[,"precursorScanNum"]) - # Clear the actual MS1 precursor scan number again - headerData[which(headerData$msLevel == 1),"precursorScanNum"] <- 0 - } - - # Find MS2 spectra with precursors which are in the allowed - # scan filter (coarse limit) range - findValidPrecursors <- headerData[ - (headerData$precursorMZ > mz - limit.coarse) & - (headerData$precursorMZ < mz + limit.coarse),] - # Find the precursors for the found spectra - validPrecursors <- unique(findValidPrecursors$precursorScanNum) - # check whether the precursors are real: must be within fine limits! - # previously even "bad" precursors were taken. e.g. 1-benzylpiperazine - which_OK <- lapply(validPrecursors, function(pscan) - { - pplist <- as.data.frame( - mzR::peaks(msRaw, which(headerData$acquisitionNum == pscan))) - colnames(pplist) <- c("mz","int") - pplist <- pplist[(pplist$mz >= mz -limit.fine) - & (pplist$mz <= mz + limit.fine),] - if(nrow(pplist) > 0) - return(TRUE) - return(FALSE) - }) - validPrecursors <- validPrecursors[which(which_OK==TRUE)] - if(length(validPrecursors) == 0){ - if(!is.na(cpdID)) - warning(paste0("No precursor was detected for compound, ", cpdID, " with m/z ", mz, ". Please check the mass and retention time window.")) - else - warning(paste0("No precursor was detected for m/z ", mz, ". Please check the mass and retention time window.")) - } - # Crop the "EIC" to the valid precursor scans - eic <- eic[eic$scan %in% validPrecursors,] - # Order by intensity, descending - eic <- eic[order(eic$intensity, decreasing=TRUE),] - if(nrow(eic) == 0) - return(list( - new("RmbSpectraSet", - found=FALSE))) - if(!is.na(maxCount)) - { - spectraCount <- min(maxCount, nrow(eic)) - eic <- eic[1:spectraCount,] - } - # Construct all spectra groups in decreasing intensity order - spectra <- lapply(eic$scan, function(masterScan) - { - masterHeader <- headerData[headerData$acquisitionNum == masterScan,] - childHeaders <- headerData[(headerData$precursorScanNum == masterScan) - & (headerData$precursorMZ > mz - limit.coarse) - & (headerData$precursorMZ < mz + limit.coarse) ,] - childScans <- childHeaders$seqNum - - msPeaks <- mzR::peaks(msRaw, masterHeader$seqNum) - # if deprofile option is set: run deprofiling - deprofile.setting <- deprofile - if(!is.na(deprofile.setting)) - msPeaks <- deprofile.scan( - msPeaks, method = deprofile.setting, noise = NA, colnames = FALSE - ) - colnames(msPeaks) <- c("mz","int") - - msmsSpecs <- apply(childHeaders, 1, function(line) - { - pks <- mzR::peaks(msRaw, line["seqNum"]) - - if(!is.na(deprofile.setting)) - { - pks <- deprofile.scan( - pks, method = deprofile.setting, noise = NA, colnames = FALSE - ) - } - - new("RmbSpectrum2", - mz = pks[,1], - intensity = pks[,2], - precScanNum = as.integer(line["precursorScanNum"]), - precursorMz = line["precursorMZ"], - precursorIntensity = line["precursorIntensity"], - precursorCharge = as.integer(line["precursorCharge"]), - collisionEnergy = line["collisionEnergy"], - tic = line["totIonCurrent"], - peaksCount = line["peaksCount"], - rt = line["retentionTime"], - acquisitionNum = as.integer(line["seqNum"]), - centroided = TRUE - ) - }) - msmsSpecs <- as(do.call(c, msmsSpecs), "SimpleList") - - - - # build the new objects - masterSpec <- new("Spectrum1", - mz = msPeaks[,"mz"], - intensity = msPeaks[,"int"], - polarity = as.integer(masterHeader$polarity), - peaksCount = as.integer(masterHeader$peaksCount), - rt = masterHeader$retentionTime, - acquisitionNum = as.integer(masterHeader$seqNum), - tic = masterHeader$totIonCurrent, - centroided = TRUE - ) - - spectraSet <- new("RmbSpectraSet", - parent = masterSpec, - children = msmsSpecs, - found = TRUE, - #complete = NA, - #empty = NA, - #formula = character(), - mz = mz - #name = character(), - #annotations = list() - ) - return(spectraSet) - }) - names(spectra) <- eic$acquisitionNum - return(spectra) -} - - -#' Discontinued: find MS/MS spectrum from open raw file -#' -#' This interface has been discontinued. \code{\link{findMsMsHR}} now supports the same parameters (use named -#' parameters). -#' -#' @param msRaw x -#' @param cpdID x -#' @param mode x -#' @param confirmMode x -#' @param useRtLimit x -#' @param ppmFine x -#' @param mzCoarse x -#' @param fillPrecursorScan x -#' @param rtMargin x -#' @param deprofile x -#' @param headerCache x -#' @return an error -#' -#' @author stravsmi -#' @export -findMsMsHR.direct <- function(msRaw, cpdID, mode = "pH", confirmMode = 0, useRtLimit = TRUE, - ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine, - mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse, - fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, - rtMargin = getOption("RMassBank")$rtMargin, - deprofile = getOption("RMassBank")$deprofile, - headerCache = NULL) -{ - stop("Support for this interface has been discontinued. Use findMsMsHR with the same parameters instead (use named parameter msRaw)") -} - -#' Read in mz-files using XCMS -#' -#' Picks peaks from mz-files and returns the pseudospectra that CAMERA creates with the help of XCMS -#' -#' @aliases findMsMsHRperxcms.direct findMsMsHRperxcms -#' @param fileName The path to the mz-file that should be read -#' @param cpdID The compoundID(s) of the compound that has been used for the file -#' @param mode The ionization mode that has been used for the spectrum represented by the peaklist -#' @param findPeaksArgs A list of arguments that will be handed to the xcms-method findPeaks via do.call -#' @param plots A parameter that determines whether the spectra should be plotted or not -#' @param MSe A boolean value that determines whether the spectra were recorded using MSe or not -#' @return The spectra generated from XCMS -#' @seealso \code{\link{msmsWorkflow}} \code{\link{toRMB}} -#' @author Erik Mueller -#' @examples \dontrun{ -#' fileList <- list.files(system.file("XCMSinput", package = "RMassBank"), "Glucolesquerellin", full.names=TRUE)[3] -#' loadList(system.file("XCMSinput/compoundList.csv",package="RMassBank")) -#' psp <- findMsMsHRperxcms(fileList,2184) -#' } -#' @export -findMsMsHRperxcms <- function(fileName, cpdID, mode="pH", findPeaksArgs = NULL, plots = FALSE, MSe = FALSE){ - - # Find mz - mzLimits <- findMz(cpdID, mode) - mz <- mzLimits$mzCenter - - - # If there are more files than cpdIDs - if(length(fileName) > 1){ - fspectra <- list() - - for(i in 1:length(fileName)){ - fspectra[[i]] <- findMsMsHRperxcms.direct(fileName[i], cpdID, mode=mode, findPeaksArgs = findPeaksArgs, plots = plots, MSe = MSe) - } - - spectra <- toRMB(unlist(unlist(fspectra, FALSE),FALSE), cpdID, mode) - - } else if(length(cpdID) > 1){ # If there are more cpdIDs than files - - spectra <- findMsMsHRperxcms.direct(fileName, cpdID, mode=mode, findPeaksArgs = findPeaksArgs, plots = plots, MSe = MSe) - - P <- lapply(1:length(spectra), function(i){ - sp <- toRMB(spectra[[i]], cpdID[i], mode) - sp@id <- as.character(as.integer(cpdID[i])) - sp@name <- findName(cpdID[i]) - sp@formula <- findFormula(cpdID[i]) - sp@mode <- mode - return(sp) - }) - return(P) - - } else { # There is a file for every cpdID - spectra <- toRMB(unlist(findMsMsHRperxcms.direct(fileName, cpdID, mode=mode, findPeaksArgs = NULL, plots = FALSE, MSe = FALSE),FALSE)) - } - - sp <- spectra - - #sp@mz <- mzLimits - sp@id <- as.character(as.integer(cpdID)) - sp@name <- findName(cpdID) - sp@formula <- findFormula(cpdID) - sp@mode <- mode - - return(sp) -} - -#' @describeIn findMsMsHRperxcms A submethod of findMsMsHrperxcms that retrieves basic spectrum data -#' @export -findMsMsHRperxcms.direct <- function(fileName, cpdID, mode="pH", findPeaksArgs = NULL, plots = FALSE, MSe = FALSE) { - - requireNamespace("CAMERA",quietly=TRUE) - requireNamespace("xcms",quietly=TRUE) - - ## - ## getRT function - ## - - getRT <- function(xa) { - rt <- sapply(xa@pspectra, function(x) {median(xcms::peaks(xa@xcmsSet)[x, "rt"])}) - } - - ## - ## MSMS - ## - - # Read file - suppressWarnings(xrmsms <- xcms::xcmsRaw(fileName, includeMSn=TRUE)) - - - # If file is not MSe, split by collision energy - if(MSe == FALSE){ - # Also, fake MS1 from the MSn data - suppressWarnings(xrs <- split(xcms::msn2xcmsRaw(xrmsms), f = xrmsms@msnCollisionEnergy)) - } else{ - # Else, MSn data will already be in MS1 - xrs <- list() - xrs[[1]] <- xrmsms - } - - # Fake a simplistic xcmsSet - suppressWarnings(setReplicate <- xcms::xcmsSet(files=fileName, method="MS1")) - xsmsms <- as.list(replicate(length(xrs),setReplicate)) - - mzabs <- 0.1 - - # Definitions - whichmissing <- vector() - metaspec <- list() - - ## - ## Retrieval over all supplied cpdIDs - ## - - for(ID in 1:length(cpdID)){ - - # Find all relevant information for the current cpdID - XCMSspectra <- list() - RT <- findRt(cpdID[ID])$RT * 60 - parentMass <- findMz(cpdID[ID], mode=mode)$mzCenter - - # Is the information in the compound list? - if(is.na(parentMass)){ - stop(paste("There was no matching entry to the supplied cpdID", cpdID[ID] ,"\n Please check the cpdIDs and the compoundlist.")) - } - - # Go over every collision energy of the MS2 - for(i in 1:length(xrs)){ - - suppressWarnings(capture.output(xcms::peaks(xsmsms[[i]]) <- do.call(xcms::findPeaks,c(findPeaksArgs, object = xrs[[i]])))) - - if (nrow(xcms::peaks(xsmsms[[i]])) == 0) { - XCMSspectra[[i]] <- matrix(0,2,7) - next - } else{ - - # Get the peaklist - pl <- xcms::peaks(xsmsms[[i]])[,c("mz", "rt"), drop=FALSE] - - # Find precursor peak within limits - candidates <- which( pl[,"mz", drop=FALSE] < parentMass + mzabs & pl[,"mz", drop=FALSE] > parentMass - mzabs - & pl[,"rt", drop=FALSE] < RT * 1.1 & pl[,"rt", drop=FALSE] > RT * 0.9 ) - - # Annotate and group by FWHM (full width at half maximum) - capture.output(anmsms <- CAMERA::xsAnnotate(xsmsms[[i]])) - capture.output(anmsms <- CAMERA::groupFWHM(anmsms)) - - # If a candidate fulfills the condition, choose the closest and retrieve the index of those pesudospectra - if(length(candidates) > 0){ - closestCandidate <- which.min(abs(RT - pl[candidates, "rt", drop=FALSE])) - pspIndex <- which(sapply(anmsms@pspectra, function(x) {candidates[[i]][closestCandidate] %in% x})) - } else{ - # Else choose the candidate with the closest RT - pspIndex <- which.min(abs(getRT(anmsms) - RT)) - } - - # 2nd best: Spectrum closest to MS1 - # pspIndex <- which.min( abs(getRT(anmsms) - actualRT)) - - # If the plot parameter was supplied, plot it - if((plots == TRUE) && (length(pspIndex) > 0)){ - CAMERA::plotPsSpectrum(anmsms, pspIndex, log=TRUE, mzrange=c(0, findMz(cpdID)[[3]]), maxlabel=10) - } - - # If there is a number of indexes, retrieve the pseudospectra - if(length(pspIndex) != 0){ - XCMSspectra[[i]] <- CAMERA::getpspectra(anmsms, pspIndex) - } else { - # Else note the spectrum as missing - whichmissing <- c(whichmissing,i) - } - } - } - - # If XCMSspectra were found but there are some missing for some collision energies, fill these XCMSspectra - if((length(XCMSspectra) != 0) && length(whichmissing)){ - for(i in whichmissing){ - XCMSspectra[[i]] <- matrix(0,2,7) - } - } - - metaspec[[ID]] <- XCMSspectra - } - - return(metaspec) -} - -# Finds the EIC for a mass trace with a window of x ppm. -# (For ppm = 10, this is +5 / -5 ppm from the non-recalibrated mz.) -#' Extract EICs -#' -#' Extract EICs from raw data for a determined mass window. -#' -#' @param msRaw The mzR file handle -#' @param mz The mass or mass range to extract the EIC for: either a single mass -#' (with the range specified by \code{limit} below) or a mass range -#' in the form of \code{c(min, max)}. -#' @param limit If a single mass was given for \code{mz}: the mass window to extract. -#' A limit of 0.001 means that the EIC will be returned for \code{[mz - 0.001, mz + 0.001]}. -#' @param rtLimit If given, the retention time limits in form \code{c(rtmin, rtmax)} in seconds. -#' @param headerCache If present, the complete \code{mzR::header(msRaw)}. Passing -#' this value is useful if spectra for multiple compounds should be -#' extracted from the same mzML file, since it avoids getting the data -#' freshly from \code{msRaw} for every compound. -#' @param peaksCache If present, the complete output of \code{mzR::peaks(msRaw)}. This speeds up the lookup -#' if multiple compounds should be searched in the same file. -#' @param floatingRecalibration -#' A fitting function that \code{predict()}s a mass shift based on the retention time. Can be used -#' if a lockmass calibration is known (however you have to build the calibration yourself.) -#' @return A \code{[rt, intensity, scan]} matrix (\code{scan} being the scan number.) -#' @author Michael A. Stravs, Eawag -#' @seealso findMsMsHR -#' @export -findEIC <- function(msRaw, mz, limit = NULL, rtLimit = NA, headerCache = NULL, floatingRecalibration = NULL, - peaksCache = NULL) -{ - # calculate mz upper and lower limits for "integration" - if(all(c("mzMin", "mzMax") %in% names(mz))) - mzlimits <- c(mz$mzMin, mz$mzMax) - else - mzlimits <- c(mz - limit, mz + limit) - # Find peaklists for all MS1 scans - if(!is.null(headerCache)) - headerData <- as.data.frame(headerCache) - else - headerData <- as.data.frame(header(msRaw)) - # Add row numbering because I'm not sure if seqNum or acquisitionNum correspond to anything really - if(nrow(headerData) > 0) - headerData$rowNum <- 1:nrow(headerData) - else - headerData$rowNum <- integer(0) - - # If RT limit is already given, retrieve only candidates in the first place, - # since this makes everything much faster. - if(all(!is.na(rtLimit))) - headerMS1 <- headerData[ - (headerData$msLevel == 1) & (headerData$retentionTime >= rtLimit[[1]]) - & (headerData$retentionTime <= rtLimit[[2]]) - ,] - else - headerMS1 <- headerData[headerData$msLevel == 1,] - if(is.null(peaksCache)) - pks <- mzR::peaks(msRaw, headerMS1$seqNum) - else - pks <- peaksCache[headerMS1$rowNum] - - # Sum intensities in the given mass window for each scan - if(is.null(floatingRecalibration)) - { - headerMS1$mzMin <- mzlimits[[1]] - headerMS1$mzMax <- mzlimits[[2]] - } - else - { - headerMS1$mzMin <- mzlimits[[1]] + predict(floatingRecalibration, headerMS1$retentionTime) - headerMS1$mzMax <- mzlimits[[2]] + predict(floatingRecalibration, headerMS1$retentionTime) - } - intensity <- unlist(lapply(1:nrow(headerMS1), function(row){ +{ + eic <- findEIC(msRaw, mz, limit.fine, rtLimits, headerCache=headerCache, + peaksCache=peaksCache) + # if(!is.na(rtLimits)) + # { + # eic <- subset(eic, rt >= rtLimits[[1]] & rt <= rtLimits[[2]]) + # } + if(!is.null(headerCache)) + headerData <- headerCache + else + headerData <- as.data.frame(header(msRaw)) + + + ###If no precursor scan number, fill the number + if(length(unique(headerData$precursorScanNum)) == 1){ + fillPrecursorScan <- TRUE + } + + if(fillPrecursorScan == TRUE) + { + # reset the precursor scan number. first set to NA, then + # carry forward the precursor scan number from the last parent scan + headerData$precursorScanNum <- NA + headerData[which(headerData$msLevel == 1),"precursorScanNum"] <- + headerData[which(headerData$msLevel == 1),"acquisitionNum"] + headerData[,"precursorScanNum"] <- .locf(headerData[,"precursorScanNum"]) + # Clear the actual MS1 precursor scan number again + headerData[which(headerData$msLevel == 1),"precursorScanNum"] <- 0 + } + + # Find MS2 spectra with precursors which are in the allowed + # scan filter (coarse limit) range + findValidPrecursors <- headerData[ + (headerData$precursorMZ > mz - limit.coarse) & + (headerData$precursorMZ < mz + limit.coarse),] + # Find the precursors for the found spectra + validPrecursors <- unique(findValidPrecursors$precursorScanNum) + # check whether the precursors are real: must be within fine limits! + # previously even "bad" precursors were taken. e.g. 1-benzylpiperazine + which_OK <- lapply(validPrecursors, function(pscan) + { + pplist <- as.data.frame( + mzR::peaks(msRaw, which(headerData$acquisitionNum == pscan))) + colnames(pplist) <- c("mz","int") + pplist <- pplist[(pplist$mz >= mz -limit.fine) + & (pplist$mz <= mz + limit.fine),] + if(nrow(pplist) > 0) + return(TRUE) + return(FALSE) + }) + validPrecursors <- validPrecursors[which(which_OK==TRUE)] + if(length(validPrecursors) == 0){ + if(!is.na(cpdID)) + warning(paste0("No precursor was detected for compound, ", cpdID, " with m/z ", mz, ". Please check the mass and retention time window.")) + else + warning(paste0("No precursor was detected for m/z ", mz, ". Please check the mass and retention time window.")) + } + # Crop the "EIC" to the valid precursor scans + eic <- eic[eic$scan %in% validPrecursors,] + # Order by intensity, descending + eic <- eic[order(eic$intensity, decreasing=TRUE),] + if(nrow(eic) == 0) + return(list( + new("RmbSpectraSet", + found=FALSE))) + if(!is.na(maxCount)) + { + spectraCount <- min(maxCount, nrow(eic)) + eic <- eic[1:spectraCount,] + } + # Construct all spectra groups in decreasing intensity order + spectra <- lapply(eic$scan, function(masterScan) + { + masterHeader <- headerData[headerData$acquisitionNum == masterScan,] + childHeaders <- headerData[(headerData$precursorScanNum == masterScan) + & (headerData$precursorMZ > mz - limit.coarse) + & (headerData$precursorMZ < mz + limit.coarse) ,] + childScans <- childHeaders$seqNum + + msPeaks <- mzR::peaks(msRaw, masterHeader$seqNum) + # if deprofile option is set: run deprofiling + deprofile.setting <- deprofile + if(!is.na(deprofile.setting)) + msPeaks <- deprofile.scan( + msPeaks, method = deprofile.setting, noise = NA, colnames = FALSE + ) + colnames(msPeaks) <- c("mz","int") + + msmsSpecs <- apply(childHeaders, 1, function(line) + { + pks <- mzR::peaks(msRaw, line["seqNum"]) + + if(!is.na(deprofile.setting)) + { + pks <- deprofile.scan( + pks, method = deprofile.setting, noise = NA, colnames = FALSE + ) + } + + new("RmbSpectrum2", + mz = pks[,1], + intensity = pks[,2], + precScanNum = as.integer(line["precursorScanNum"]), + precursorMz = line["precursorMZ"], + precursorIntensity = line["precursorIntensity"], + precursorCharge = as.integer(line["precursorCharge"]), + collisionEnergy = line["collisionEnergy"], + tic = line["totIonCurrent"], + peaksCount = line["peaksCount"], + rt = line["retentionTime"], + acquisitionNum = as.integer(line["seqNum"]), + centroided = TRUE + ) + }) + msmsSpecs <- as(do.call(c, msmsSpecs), "SimpleList") + + + + # build the new objects + masterSpec <- new("Spectrum1", + mz = msPeaks[,"mz"], + intensity = msPeaks[,"int"], + polarity = as.integer(masterHeader$polarity), + peaksCount = as.integer(masterHeader$peaksCount), + rt = masterHeader$retentionTime, + acquisitionNum = as.integer(masterHeader$seqNum), + tic = masterHeader$totIonCurrent, + centroided = TRUE + ) + + spectraSet <- new("RmbSpectraSet", + parent = masterSpec, + children = msmsSpecs, + found = TRUE, + #complete = NA, + #empty = NA, + #formula = character(), + mz = mz + #name = character(), + #annotations = list() + ) + return(spectraSet) + }) + names(spectra) <- eic$acquisitionNum + return(spectra) +} + + +#' Discontinued: find MS/MS spectrum from open raw file +#' +#' This interface has been discontinued. \code{\link{findMsMsHR}} now supports the same parameters (use named +#' parameters). +#' +#' @param msRaw x +#' @param cpdID x +#' @param mode x +#' @param confirmMode x +#' @param useRtLimit x +#' @param ppmFine x +#' @param mzCoarse x +#' @param fillPrecursorScan x +#' @param rtMargin x +#' @param deprofile x +#' @param headerCache x +#' @return an error +#' +#' @author stravsmi +#' @export +findMsMsHR.direct <- function(msRaw, cpdID, mode = "pH", confirmMode = 0, useRtLimit = TRUE, + ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine, + mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse, + fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, + rtMargin = getOption("RMassBank")$rtMargin, + deprofile = getOption("RMassBank")$deprofile, + headerCache = NULL) +{ + stop("Support for this interface has been discontinued. Use findMsMsHR with the same parameters instead (use named parameter msRaw)") +} + +#' Read in mz-files using XCMS +#' +#' Picks peaks from mz-files and returns the pseudospectra that CAMERA creates with the help of XCMS +#' +#' @aliases findMsMsHRperxcms.direct findMsMsHRperxcms +#' @param fileName The path to the mz-file that should be read +#' @param cpdID The compoundID(s) of the compound that has been used for the file +#' @param mode The ionization mode that has been used for the spectrum represented by the peaklist +#' @param findPeaksArgs A list of arguments that will be handed to the xcms-method findPeaks via do.call +#' @param plots A parameter that determines whether the spectra should be plotted or not +#' @param MSe A boolean value that determines whether the spectra were recorded using MSe or not +#' @return The spectra generated from XCMS +#' @seealso \code{\link{msmsWorkflow}} \code{\link{toRMB}} +#' @author Erik Mueller +#' @examples \dontrun{ +#' fileList <- list.files(system.file("XCMSinput", package = "RMassBank"), "Glucolesquerellin", full.names=TRUE)[3] +#' loadList(system.file("XCMSinput/compoundList.csv",package="RMassBank")) +#' psp <- findMsMsHRperxcms(fileList,2184) +#' } +#' @export +findMsMsHRperxcms <- function(fileName, cpdID, mode="pH", findPeaksArgs = NULL, plots = FALSE, MSe = FALSE){ + + # Find mz + mzLimits <- findMz(cpdID, mode) + mz <- mzLimits$mzCenter + + + # If there are more files than cpdIDs + if(length(fileName) > 1){ + fspectra <- list() + + for(i in 1:length(fileName)){ + fspectra[[i]] <- findMsMsHRperxcms.direct(fileName[i], cpdID, mode=mode, findPeaksArgs = findPeaksArgs, plots = plots, MSe = MSe) + } + + spectra <- toRMB(unlist(unlist(fspectra, FALSE),FALSE), cpdID, mode) + + } else if(length(cpdID) > 1){ # If there are more cpdIDs than files + + spectra <- findMsMsHRperxcms.direct(fileName, cpdID, mode=mode, findPeaksArgs = findPeaksArgs, plots = plots, MSe = MSe) + + P <- lapply(1:length(spectra), function(i){ + sp <- toRMB(spectra[[i]], cpdID[i], mode) + sp@id <- as.character(as.integer(cpdID[i])) + sp@name <- findName(cpdID[i]) + sp@formula <- findFormula(cpdID[i]) + sp@mode <- mode + return(sp) + }) + return(P) + + } else { # There is a file for every cpdID + spectra <- toRMB(unlist(findMsMsHRperxcms.direct(fileName, cpdID, mode=mode, findPeaksArgs = NULL, plots = FALSE, MSe = FALSE),FALSE)) + } + + sp <- spectra + + #sp@mz <- mzLimits + sp@id <- as.character(as.integer(cpdID)) + sp@name <- findName(cpdID) + sp@formula <- findFormula(cpdID) + sp@mode <- mode + + return(sp) +} + +#' @describeIn findMsMsHRperxcms A submethod of findMsMsHrperxcms that retrieves basic spectrum data +#' @export +findMsMsHRperxcms.direct <- function(fileName, cpdID, mode="pH", findPeaksArgs = NULL, plots = FALSE, MSe = FALSE) { + + requireNamespace("CAMERA",quietly=TRUE) + requireNamespace("xcms",quietly=TRUE) + + ## + ## getRT function + ## + + getRT <- function(xa) { + rt <- sapply(xa@pspectra, function(x) {median(xcms::peaks(xa@xcmsSet)[x, "rt"])}) + } + + ## + ## MSMS + ## + + # Read file + suppressWarnings(xrmsms <- xcms::xcmsRaw(fileName, includeMSn=TRUE)) + + + # If file is not MSe, split by collision energy + if(MSe == FALSE){ + # Also, fake MS1 from the MSn data + suppressWarnings(xrs <- split(xcms::msn2xcmsRaw(xrmsms), f = xrmsms@msnCollisionEnergy)) + } else{ + # Else, MSn data will already be in MS1 + xrs <- list() + xrs[[1]] <- xrmsms + } + + # Fake a simplistic xcmsSet + suppressWarnings(setReplicate <- xcms::xcmsSet(files=fileName, method="MS1")) + xsmsms <- as.list(replicate(length(xrs),setReplicate)) + + mzabs <- 0.1 + + # Definitions + whichmissing <- vector() + metaspec <- list() + + ## + ## Retrieval over all supplied cpdIDs + ## + + for(ID in 1:length(cpdID)){ + + # Find all relevant information for the current cpdID + XCMSspectra <- list() + RT <- findRt(cpdID[ID])$RT * 60 + parentMass <- findMz(cpdID[ID], mode=mode)$mzCenter + + # Is the information in the compound list? + if(is.na(parentMass)){ + stop(paste("There was no matching entry to the supplied cpdID", cpdID[ID] ,"\n Please check the cpdIDs and the compoundlist.")) + } + + # Go over every collision energy of the MS2 + for(i in 1:length(xrs)){ + + suppressWarnings(capture.output(xcms::peaks(xsmsms[[i]]) <- do.call(xcms::findPeaks,c(findPeaksArgs, object = xrs[[i]])))) + + if (nrow(xcms::peaks(xsmsms[[i]])) == 0) { + XCMSspectra[[i]] <- matrix(0,2,7) + next + } else{ + + # Get the peaklist + pl <- xcms::peaks(xsmsms[[i]])[,c("mz", "rt"), drop=FALSE] + + # Find precursor peak within limits + candidates <- which( pl[,"mz", drop=FALSE] < parentMass + mzabs & pl[,"mz", drop=FALSE] > parentMass - mzabs + & pl[,"rt", drop=FALSE] < RT * 1.1 & pl[,"rt", drop=FALSE] > RT * 0.9 ) + + # Annotate and group by FWHM (full width at half maximum) + capture.output(anmsms <- CAMERA::xsAnnotate(xsmsms[[i]])) + capture.output(anmsms <- CAMERA::groupFWHM(anmsms)) + + # If a candidate fulfills the condition, choose the closest and retrieve the index of those pesudospectra + if(length(candidates) > 0){ + closestCandidate <- which.min(abs(RT - pl[candidates, "rt", drop=FALSE])) + pspIndex <- which(sapply(anmsms@pspectra, function(x) {candidates[[i]][closestCandidate] %in% x})) + } else{ + # Else choose the candidate with the closest RT + pspIndex <- which.min(abs(getRT(anmsms) - RT)) + } + + # 2nd best: Spectrum closest to MS1 + # pspIndex <- which.min( abs(getRT(anmsms) - actualRT)) + + # If the plot parameter was supplied, plot it + if((plots == TRUE) && (length(pspIndex) > 0)){ + CAMERA::plotPsSpectrum(anmsms, pspIndex, log=TRUE, mzrange=c(0, findMz(cpdID)[[3]]), maxlabel=10) + } + + # If there is a number of indexes, retrieve the pseudospectra + if(length(pspIndex) != 0){ + XCMSspectra[[i]] <- CAMERA::getpspectra(anmsms, pspIndex) + } else { + # Else note the spectrum as missing + whichmissing <- c(whichmissing,i) + } + } + } + + # If XCMSspectra were found but there are some missing for some collision energies, fill these XCMSspectra + if((length(XCMSspectra) != 0) && length(whichmissing)){ + for(i in whichmissing){ + XCMSspectra[[i]] <- matrix(0,2,7) + } + } + + metaspec[[ID]] <- XCMSspectra + } + + return(metaspec) +} + +################################################################################ +## new +findMsMsHRperMsp <- function(fileName, cpdIDs, mode="pH"){ + # Find mz + #mzLimits <- findMz(cpdIDs, mode) + #mz <- mzLimits$mzCenter + + # If there are more files than cpdIDs + if(length(fileName) > 1){ + fspectra <- list() + + for(i in 1:length(fileName)){ + fspectra[[i]] <- findMsMsHRperMsp.direct(fileName[i], cpdIDs, mode=mode) + } + + spectra <- toRMB(unlist(unlist(fspectra, FALSE),FALSE), cpdIDs, mode) + + } else if(length(cpdIDs) > 1){ # If there are more cpdIDs than files + + spectra <- findMsMsHRperMsp.direct(fileName = fileName, cpdIDs = cpdIDs, mode=mode) + + P <- lapply(1:length(spectra), function(i){ + sp <- toRMB(msmsXCMSspecs = spectra[[i]], cpdID = cpdIDs[i], mode = mode) + sp@id <- as.character(as.integer(cpdIDs[i])) + sp@name <- findName(cpdIDs[i]) + sp@formula <- findFormula(cpdIDs[i]) + sp@mode <- mode + + if(length(sp@children) == 1){ + sp@children[[1]]@rawOK <- rep(x = TRUE, times = sp@children[[1]]@peaksCount) + sp@children[[1]]@good <- rep(x = TRUE, times = sp@children[[1]]@peaksCount) + #sp@children[[1]]@good <- TRUE + } + + return(sp) + }) + return(P) + + } else { # There is a file for every cpdID + spectra <- toRMB(unlist(findMsMsHRperMsp.direct(fileName, cpdIDs, mode=mode),FALSE)) + } + + sp <- spectra + + #sp@mz <- mzLimits + sp@id <- as.character(as.integer(cpdIDs)) + sp@name <- findName(cpdIDs) + sp@formula <- findFormula(cpdIDs) + sp@mode <- mode + + return(sp) +} + +#' @describeIn findMsMsHRperMsp A submethod of findMsMsHrperxcms that retrieves basic spectrum data +#' @export +findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { + + #requireNamespace("CAMERA",quietly=TRUE) + #requireNamespace("xcms",quietly=TRUE) + + ## + ## MSMS + ## + + # Read file + suppressWarnings(xrmsms <- read.msp(file = fileName)) + + ## If file is not MSe, split by collision energy + #if(MSe == FALSE){ + # # Also, fake MS1 from the MSn data + # suppressWarnings(xrs <- split(xcms::msn2xcmsRaw(xrmsms), f = xrmsms@msnCollisionEnergy)) + #} else{ + # # Else, MSn data will already be in MS1 + # xrs <- list() + # xrs[[1]] <- xrmsms + #} + #xrs <- xrmsms + + mzabs <- 0.1 + + # Definitions + whichmissing <- vector() + metaspec <- list() + + precursorTable <- data.frame(stringsAsFactors = FALSE, + mz = as.numeric(unlist(lapply(X = xrmsms, FUN = function(x){ x$PRECURSORMZ }))), + rt = as.numeric(unlist(lapply(X = xrmsms, FUN = function(x){ x$RETENTIONTIME }))) + ) + + ## + ## Retrieval over all supplied cpdIDs + ## + + for(idIdx in seq_along(cpdIDs)){ + + # Find all relevant information for the current cpdID + spectrum <- NULL + RT <- findRt(cpdIDs[[idIdx]])$RT * 60 + parentMass <- findMz(cpdIDs[[idIdx]], mode=mode)$mzCenter + + # Is the information in the compound list? + if(is.na(parentMass)){ + stop(paste("There was no matching entry to the supplied cpdID", cpdIDs[[idIdx]] ,"\n Please check the cpdIDs and the compoundlist.")) + } + + # Go over every collision energy of the MS2 + #for(i in seq_along(xrs)){ + + + + if (nrow(precursorTable) == 0) { + ## no peaks there + #spectrum <- matrix(0,2,7) + next + } else{ + ## at least one peak there + + # Get the peaklist + #pl <- xrs[[i]]$pspectrum + #pl <- data.frame("mz" = pl[, "mz"], "rt" = xrs[[i]]$RETENTIONTIME, stringsAsFactors = F) + + mzMatch <- + precursorTable[,"mz", drop=FALSE] < parentMass + mzabs & + precursorTable[,"mz", drop=FALSE] > parentMass - mzabs + rtMatch <- + precursorTable[,"rt", drop=FALSE] < RT * 1.1 & + precursorTable[,"rt", drop=FALSE] > RT * 0.9 + + if(is.na(RT)) + rtMatch <- TRUE + + # Find precursor peak within limits + candidates <- which( mzMatch & rtMatch ) + + # Annotate and group by FWHM (full width at half maximum) + #capture.output(anmsms <- CAMERA::xsAnnotate(xsmsms[[i]])) + #capture.output(anmsms <- CAMERA::groupFWHM(anmsms)) + + # If a candidate fulfills the condition, choose the closest and retrieve the index of those pesudospectra + if(length(candidates) > 0){ + if(is.na(RT)){ + pspIndex <- candidates[[1]] + } else { + closestCandidate <- which.min(abs(RT - precursorTable[candidates, "rt", drop=FALSE])) + pspIndex <- candidates[[closestCandidate]] + } + } else{ + # Else choose the candidate with the closest RT + pspIndex <- which.min(abs(precursorTable[,"rt"] - RT)) + } + + # 2nd best: Spectrum closest to MS1 + # pspIndex <- which.min( abs(getRT(anmsms) - actualRT)) + + ## If the plot parameter was supplied, plot it + #if((plots == TRUE) && (length(pspIndex) > 0)){ + # CAMERA::plotPsSpectrum(anmsms, pspIndex, log=TRUE, mzrange=c(0, findMz(cpdIDs)[[3]]), maxlabel=10) + #} + + # If there is a number of indexes, retrieve the pseudospectra + if(length(pspIndex) != 0){ + spectrum <- xrmsms[[pspIndex]] + } else { + # Else note the spectrum as missing + whichmissing <- c(whichmissing,idIdx) + #spectrum <- matrix(0,2,7) + } + } + #} + + # If XCMSspectra were found but there are some missing for some collision energies, fill these XCMSspectra + #if((length(XCMSspectra) != 0) && length(whichmissing)){ + # for(i in whichmissing){ + # XCMSspectra[[idIdx]] <- matrix(0,2,7) + # } + #} + + if(is.null(spectrum)){ + metaspec[[idIdx]] <- list(matrix(0,1,7)) + } else { + metaspec[[idIdx]] <- list(data.frame( + stringsAsFactors = F, + "mz" = as.numeric(spectrum$pspectrum[, "mz"]), + "mzmin" = as.numeric(spectrum$pspectrum[, "mz"]), + "mzmax" = as.numeric(spectrum$pspectrum[, "mz"]), + "rt" = as.numeric(spectrum$RETENTIONTIME), + "rtmin" = as.numeric(spectrum$RETENTIONTIME), + "rtmax" = as.numeric(spectrum$RETENTIONTIME), + "into" = as.numeric(spectrum$pspectrum[, "intensity"]) + )) + } + } + + return(metaspec) +} + +## adapted from the Bioconductor package 'metaMS' (method 'read.msp') +read.msp <- function(file){ + get.text.value <- function(x, field, do.err = TRUE) { + woppa <- strsplit(x, field) + woppa.lengths <- sapply(woppa, length) + if (all(woppa.lengths == 2)) { + sapply(woppa, function(y) gsub("^ +", "", y[2])) + } + else { + if (do.err) { + stop(paste("Invalid field", field, "in", x[woppa.lengths != 2])) + } + else { + NULL + } + } + } + read.compound <- function(strs) { + fields.idx <- grep(":", strs) + fields <- sapply(strsplit(strs[fields.idx], ":"), "[[", 1) + pk.idx <- which(fields == "Num Peaks") + if (length(pk.idx) == 0) + stop("No spectrum found") + cmpnd <- lapply(fields.idx[-pk.idx], function(x) get.text.value(strs[x], paste(fields[x], ":", sep = ""))) + names(cmpnd) <- fields[-pk.idx] + nlines <- length(strs) + npeaks <- as.numeric(get.text.value(strs[pk.idx], "Num Peaks:")) + peaks.idx <- (pk.idx + 1):nlines + pks <- gsub("^ +", "", unlist(strsplit(strs[peaks.idx], ";"))) + pks <- pks[pks != ""] + if (length(pks) != npeaks) + stop("Not the right number of peaks in compound", cmpnd$Name) + pklst <- strsplit(x = pks, split = "\t| ") + pklst <- lapply(pklst, function(x) x[x != ""]) + cmz <- as.numeric(sapply(pklst, "[[", 1)) + cintens <- as.numeric(sapply(pklst, "[[", 2)) + finaltab <- matrix(c(cmz, cintens), ncol = 2) + if (any(table(cmz) > 1)) { + warning("Duplicate mass in compound ", cmpnd$Name, " (CAS ", cmpnd$CAS, ")... summing up intensities") + finaltab <- aggregate(finaltab[, 2], by = list(finaltab[, 1]), FUN = sum) + } + colnames(finaltab) <- c("mz", "intensity") + c(cmpnd, list(pspectrum = finaltab)) + } + huhn <- readLines(con = file) + starts <- which(regexpr("(Name:)|(NAME:) ", huhn) == 1) + ends <- c(starts[-1] - 1, length(huhn)) + lapply(1:length(starts), function(i){ + read.compound(huhn[starts[[i]]:ends[[i]]]) + }) +} +## new +################################################################################ + +# Finds the EIC for a mass trace with a window of x ppm. +# (For ppm = 10, this is +5 / -5 ppm from the non-recalibrated mz.) +#' Extract EICs +#' +#' Extract EICs from raw data for a determined mass window. +#' +#' @param msRaw The mzR file handle +#' @param mz The mass or mass range to extract the EIC for: either a single mass +#' (with the range specified by \code{limit} below) or a mass range +#' in the form of \code{c(min, max)}. +#' @param limit If a single mass was given for \code{mz}: the mass window to extract. +#' A limit of 0.001 means that the EIC will be returned for \code{[mz - 0.001, mz + 0.001]}. +#' @param rtLimit If given, the retention time limits in form \code{c(rtmin, rtmax)} in seconds. +#' @param headerCache If present, the complete \code{mzR::header(msRaw)}. Passing +#' this value is useful if spectra for multiple compounds should be +#' extracted from the same mzML file, since it avoids getting the data +#' freshly from \code{msRaw} for every compound. +#' @param peaksCache If present, the complete output of \code{mzR::peaks(msRaw)}. This speeds up the lookup +#' if multiple compounds should be searched in the same file. +#' @param floatingRecalibration +#' A fitting function that \code{predict()}s a mass shift based on the retention time. Can be used +#' if a lockmass calibration is known (however you have to build the calibration yourself.) +#' @return A \code{[rt, intensity, scan]} matrix (\code{scan} being the scan number.) +#' @author Michael A. Stravs, Eawag +#' @seealso findMsMsHR +#' @export +findEIC <- function(msRaw, mz, limit = NULL, rtLimit = NA, headerCache = NULL, floatingRecalibration = NULL, + peaksCache = NULL) +{ + # calculate mz upper and lower limits for "integration" + if(all(c("mzMin", "mzMax") %in% names(mz))) + mzlimits <- c(mz$mzMin, mz$mzMax) + else + mzlimits <- c(mz - limit, mz + limit) + # Find peaklists for all MS1 scans + if(!is.null(headerCache)) + headerData <- as.data.frame(headerCache) + else + headerData <- as.data.frame(header(msRaw)) + # Add row numbering because I'm not sure if seqNum or acquisitionNum correspond to anything really + if(nrow(headerData) > 0) + headerData$rowNum <- 1:nrow(headerData) + else + headerData$rowNum <- integer(0) + + # If RT limit is already given, retrieve only candidates in the first place, + # since this makes everything much faster. + if(all(!is.na(rtLimit))) + headerMS1 <- headerData[ + (headerData$msLevel == 1) & (headerData$retentionTime >= rtLimit[[1]]) + & (headerData$retentionTime <= rtLimit[[2]]) + ,] + else + headerMS1 <- headerData[headerData$msLevel == 1,] + if(is.null(peaksCache)) + pks <- mzR::peaks(msRaw, headerMS1$seqNum) + else + pks <- peaksCache[headerMS1$rowNum] + + # Sum intensities in the given mass window for each scan + if(is.null(floatingRecalibration)) + { + headerMS1$mzMin <- mzlimits[[1]] + headerMS1$mzMax <- mzlimits[[2]] + } + else + { + headerMS1$mzMin <- mzlimits[[1]] + predict(floatingRecalibration, headerMS1$retentionTime) + headerMS1$mzMax <- mzlimits[[2]] + predict(floatingRecalibration, headerMS1$retentionTime) + } + intensity <- unlist(lapply(1:nrow(headerMS1), function(row){ peaktable <- pks[[row]] - sum(peaktable[ - which((peaktable[,1] >= headerMS1[row,"mzMin"]) & (peaktable[,1] <= headerMS1[row,"mzMax"])),2 - ]) - - })) - return(data.frame(rt = headerMS1$retentionTime, intensity=intensity, scan=headerMS1$acquisitionNum)) -} - - -#' Generate peaks cache -#' -#' Generates a peak cache table for use with \code{\link{findMsMsHR}} functions. -#' -#' @param msRaw the input raw datafile (opened) -#' @param headerCache the cached header, or subset thereof for which peaks should be extracted. Peak extraction goes -#' by \code{seqNum}. -#' @return A list of dataframes as from \code{mzR::peaks}. -#' -#' @author stravsmi -#' @export -makePeaksCache <- function(msRaw, headerCache) -{ - mzR::peaks(msRaw, headerCache$seqNum) -} - -#' Conversion of XCMS-pseudospectra into RMassBank-spectra -#' -#' Converts a pseudospectrum extracted from XCMS using CAMERA into the msmsWorkspace(at)spectrum-format that RMassBank uses -#' -#' @usage toRMB(msmsXCMSspecs, cpdID, mode, MS1spec) -#' @param msmsXCMSspecs The compoundID of the compound that has been used for the peaklist -#' @param cpdID The compound ID of the substance of the given spectrum -#' @param mode The ionization mode that has been used for the spectrum -#' @param MS1spec The MS1-spectrum from XCMS, which can be optionally supplied -#' @return One list element of the (at)specs-entry from an msmsWorkspace -#' @seealso \code{\link{msmsWorkspace-class}} -#' @author Erik Mueller -#' @examples \dontrun{ -#' XCMSpspectra <- findmsmsHRperxcms.direct("Glucolesquerellin_2184_1.mzdata", 2184) -#' wspecs <- toRMB(XCMSpspectra) -#' } -#' @export -toRMB <- function(msmsXCMSspecs = NA, cpdID = NA, mode="pH", MS1spec = NA){ - - - ##Basic parameters - mz <- findMz(cpdID,mode=mode)$mzCenter - id <- cpdID - formula <- findFormula(cpdID) - - if(length(msmsXCMSspecs) == 0){ - return(new("RmbSpectraSet",found=FALSE)) - } - - foundOK <- !any(sapply(msmsXCMSspecs, function(x) all(x == 0))) - - - if(!foundOK){ - return(new("RmbSpectraSet",found=FALSE)) - } - - if(suppressWarnings(is.na(msmsXCMSspecs)[1])){ - stop("You need a readable spectrum!") - } - - if(is.na(cpdID)){ - stop("Please supply the compoundID!") - } - - mockAcqnum <- 1 - mockenv <- environment() - - msmsSpecs <- lapply(msmsXCMSspecs, function(spec){ - ## Mock acquisition num - mockenv$mockAcqnum <- mockenv$mockAcqnum + 1 - - ## Find peak table - pks <- matrix(nrow = length(spec[,1]), ncol = 2) - colnames(pks) <- c("mz","int") - pks[,1] <- spec[,1] - pks[,2] <- spec[,7] - - ## Deprofiling not necessary for XCMS - - ## New spectrum object - return(new("RmbSpectrum2", - mz = pks[,"mz"], - intensity = pks[,"int"], - precScanNum = as.integer(1), - precursorMz = findMz(cpdID)$mzCenter, - precursorIntensity = 0, - precursorCharge = as.integer(1), - collisionEnergy = 0, - tic = 0, - peaksCount = nrow(spec), - rt = median(spec[,4]), - acquisitionNum = as.integer(mockenv$mockAcqnum), - centroided = TRUE - )) - }) - - msmsSpecs <- as(do.call(c, msmsSpecs), "SimpleList") - - ##Build the new objects - masterSpec <- new("Spectrum1", - mz = findMz(cpdID,mode=mode)$mzCenter, - intensity = 100, - polarity = as.integer(0), - peaksCount = as.integer(1), - rt = msmsSpecs[[1]]@rt, - acquisitionNum = as.integer(1), - tic = 0, - centroided = TRUE - ) - - spectraSet <- new("RmbSpectraSet", - parent = masterSpec, - children = msmsSpecs, - found = TRUE, - #complete = NA, - #empty = NA, - #formula = character(), - mz = mz - #name = character(), - #annotations = list() - ) - - return(spectraSet) -} - -#' Addition of manual peaklists -#' -#' Adds a manual peaklist in matrix-format -#' -#' @usage addPeaksManually(w, cpdID, handSpec, mode) -#' @param w The msmsWorkspace that the peaklist should be added to. -#' @param cpdID The compoundID of the compound that has been used for the peaklist -#' @param handSpec A peaklist with 2 columns, one with "mz", one with "int" -#' @param mode The ionization mode that has been used for the spectrum represented by the peaklist -#' @return The \code{msmsWorkspace} with the additional peaklist added to the right spectrum -#' @seealso \code{\link{msmsWorkflow}} -#' @author Erik Mueller -#' @examples \dontrun{ -#' handSpec <- cbind(mz=c(274.986685367956, 259.012401087427, 95.9493025990907, 96.9573002472772), -#' int=c(357,761, 2821, 3446)) -#' addPeaksManually(w, cpdID, handSpec) -#' } -#' @export -addPeaksManually <- function(w, cpdID = NA, handSpec, mode = "pH"){ - - - if(is.na(cpdID)){ - stop("Please supply the compoundID!") - } - - # For the case that the cpdID turns up for the first time - # a new spectrumset needs to be created - if(!(cpdID %in% sapply(w@spectra,function(s) s@id))){ - - # Create fake MS1 spectrum - masterSpec <- new("Spectrum1", - mz = findMz(cpdID,mode=mode)$mzCenter, - intensity = 100, - polarity = as.integer(0), - peaksCount = as.integer(1), - rt = findRt(cpdID)$RT, - acquisitionNum = as.integer(1), - tic = 0, - centroided = TRUE - ) - - # Create fake spectrumset - spectraSet <- new("RmbSpectraSet", - parent = masterSpec, - found = TRUE, - #complete = NA, - #empty = NA, - id = as.character(as.integer(cpdID)), - formula = findFormula(cpdID), - mz = findMz(cpdID,mode=mode)$mzCenter, - name = findName(cpdID), - mode = mode - #annotations = list() - ) - - w@spectra[[length(w@spectra) + 1]] <- spectraSet - } - - specIndex <- which(cpdID == sapply(w@spectra, function(s) s@id)) - - # New spectrum object - w@spectra[[specIndex]]@children[[length(w@spectra[[specIndex]]@children) + 1]] <- new("RmbSpectrum2", - mz = handSpec[,"mz"], - intensity = handSpec[,"int"], - precScanNum = as.integer(1), - precursorMz = findMz(cpdID)$mzCenter, - precursorIntensity = 0, - precursorCharge = as.integer(1), - collisionEnergy = 0, - tic = 0, - peaksCount = nrow(handSpec), - rt = findRt(cpdID)$RT, - acquisitionNum = as.integer(length(w@spectra[[specIndex]]@children) + 2), - centroided = TRUE) - return(w) -} - - -createSpecsFromPeaklists <- function(w, cpdIDs, filenames, mode="pH"){ - for(j in 1:length(filenames)){ - w <- addPeaksManually(w,cpdIDs[j],as.matrix(read.csv(filenames[j]), header=TRUE),mode) - } - - return(w) -} - - -#' MassBank-record Addition -#' -#' Adds the peaklist of a MassBank-Record to the specs of an msmsWorkspace -#' -#' @aliases addMB -#' @usage addMB(w, cpdID, fileName, mode) -#' @param w The msmsWorkspace that the peaklist should be added to. -#' @param cpdID The compoundID of the compound that has been used for the record -#' @param fileName The path to the record -#' @param mode The ionization mode that has been used to create the record -#' @return The \code{msmsWorkspace} with the additional peaklist from the record -#' @seealso \code{\link{addPeaksManually}} -#' @author Erik Mueller -#' @examples \dontrun{ -#' addMB("filepath_to_records/RC00001.txt") -#' } -#' @export -addMB <- function(w, cpdID, fileName, mode){ - mb <- parseMassBank(fileName) - peaklist <- list() - peaklist[[1]] <- mb@compiled_ok[[1]][["PK$PEAK"]][,1:2] - w <- addPeaksManually(w, cpdID, peaklist[[1]], mode) - return(w) -} + sum(peaktable[ + which((peaktable[,1] >= headerMS1[row,"mzMin"]) & (peaktable[,1] <= headerMS1[row,"mzMax"])),2 + ]) + + })) + return(data.frame(rt = headerMS1$retentionTime, intensity=intensity, scan=headerMS1$acquisitionNum)) +} + + +#' Generate peaks cache +#' +#' Generates a peak cache table for use with \code{\link{findMsMsHR}} functions. +#' +#' @param msRaw the input raw datafile (opened) +#' @param headerCache the cached header, or subset thereof for which peaks should be extracted. Peak extraction goes +#' by \code{seqNum}. +#' @return A list of dataframes as from \code{mzR::peaks}. +#' +#' @author stravsmi +#' @export +makePeaksCache <- function(msRaw, headerCache) +{ + mzR::peaks(msRaw, headerCache$seqNum) +} + +#' Conversion of XCMS-pseudospectra into RMassBank-spectra +#' +#' Converts a pseudospectrum extracted from XCMS using CAMERA into the msmsWorkspace(at)spectrum-format that RMassBank uses +#' +#' @usage toRMB(msmsXCMSspecs, cpdID, mode, MS1spec) +#' @param msmsXCMSspecs The compoundID of the compound that has been used for the peaklist +#' @param cpdID The compound ID of the substance of the given spectrum +#' @param mode The ionization mode that has been used for the spectrum +#' @param MS1spec The MS1-spectrum from XCMS, which can be optionally supplied +#' @return One list element of the (at)specs-entry from an msmsWorkspace +#' @seealso \code{\link{msmsWorkspace-class}} +#' @author Erik Mueller +#' @examples \dontrun{ +#' XCMSpspectra <- findmsmsHRperxcms.direct("Glucolesquerellin_2184_1.mzdata", 2184) +#' wspecs <- toRMB(XCMSpspectra) +#' } +#' @export +toRMB <- function(msmsXCMSspecs = NA, cpdID = NA, mode="pH", MS1spec = NA){ + + + ##Basic parameters + mz <- findMz(cpdID,mode=mode)$mzCenter + id <- cpdID + formula <- findFormula(cpdID) + + if(length(msmsXCMSspecs) == 0){ + return(new("RmbSpectraSet",found=FALSE)) + } + + foundOK <- !any(sapply(msmsXCMSspecs, function(x) all(x == 0))) + + + if(!foundOK){ + return(new("RmbSpectraSet",found=FALSE)) + } + + if(suppressWarnings(is.na(msmsXCMSspecs)[1])){ + stop("You need a readable spectrum!") + } + + if(is.na(cpdID)){ + stop("Please supply the compoundID!") + } + + mockAcqnum <- 1 + mockenv <- environment() + + msmsSpecs <- lapply(msmsXCMSspecs, function(spec){ + ## Mock acquisition num + mockenv$mockAcqnum <- mockenv$mockAcqnum + 1 + + ## Find peak table + pks <- matrix(nrow = length(spec[,1]), ncol = 2) + colnames(pks) <- c("mz","int") + pks[,1] <- spec[,1] + pks[,2] <- spec[,7] + + ## Deprofiling not necessary for XCMS + + ## New spectrum object + return(new("RmbSpectrum2", + mz = pks[,"mz"], + intensity = pks[,"int"], + precScanNum = as.integer(1), + precursorMz = findMz(cpdID)$mzCenter, + precursorIntensity = 0, + precursorCharge = as.integer(1), + collisionEnergy = 0, + tic = 0, + peaksCount = nrow(spec), + rt = median(spec[,4]), + acquisitionNum = as.integer(mockenv$mockAcqnum), + centroided = TRUE + )) + }) + + msmsSpecs <- as(do.call(c, msmsSpecs), "SimpleList") + + ##Build the new objects + masterSpec <- new("Spectrum1", + mz = findMz(cpdID,mode=mode)$mzCenter, + intensity = 100, + polarity = as.integer(0), + peaksCount = as.integer(1), + rt = msmsSpecs[[1]]@rt, + acquisitionNum = as.integer(1), + tic = 0, + centroided = TRUE + ) + + spectraSet <- new("RmbSpectraSet", + parent = masterSpec, + children = msmsSpecs, + found = TRUE, + #complete = NA, + #empty = NA, + #formula = character(), + mz = mz + #name = character(), + #annotations = list() + ) + + return(spectraSet) +} + +#' Addition of manual peaklists +#' +#' Adds a manual peaklist in matrix-format +#' +#' @usage addPeaksManually(w, cpdID, handSpec, mode) +#' @param w The msmsWorkspace that the peaklist should be added to. +#' @param cpdID The compoundID of the compound that has been used for the peaklist +#' @param handSpec A peaklist with 2 columns, one with "mz", one with "int" +#' @param mode The ionization mode that has been used for the spectrum represented by the peaklist +#' @return The \code{msmsWorkspace} with the additional peaklist added to the right spectrum +#' @seealso \code{\link{msmsWorkflow}} +#' @author Erik Mueller +#' @examples \dontrun{ +#' handSpec <- cbind(mz=c(274.986685367956, 259.012401087427, 95.9493025990907, 96.9573002472772), +#' int=c(357,761, 2821, 3446)) +#' addPeaksManually(w, cpdID, handSpec) +#' } +#' @export +addPeaksManually <- function(w, cpdID = NA, handSpec, mode = "pH"){ + + + if(is.na(cpdID)){ + stop("Please supply the compoundID!") + } + + # For the case that the cpdID turns up for the first time + # a new spectrumset needs to be created + if(!(cpdID %in% sapply(w@spectra,function(s) s@id))){ + + # Create fake MS1 spectrum + masterSpec <- new("Spectrum1", + mz = findMz(cpdID,mode=mode)$mzCenter, + intensity = 100, + polarity = as.integer(0), + peaksCount = as.integer(1), + rt = findRt(cpdID)$RT, + acquisitionNum = as.integer(1), + tic = 0, + centroided = TRUE + ) + + # Create fake spectrumset + spectraSet <- new("RmbSpectraSet", + parent = masterSpec, + found = TRUE, + #complete = NA, + #empty = NA, + id = as.character(as.integer(cpdID)), + formula = findFormula(cpdID), + mz = findMz(cpdID,mode=mode)$mzCenter, + name = findName(cpdID), + mode = mode + #annotations = list() + ) + + w@spectra[[length(w@spectra) + 1]] <- spectraSet + } + + specIndex <- which(cpdID == sapply(w@spectra, function(s) s@id)) + + # New spectrum object + w@spectra[[specIndex]]@children[[length(w@spectra[[specIndex]]@children) + 1]] <- new("RmbSpectrum2", + mz = handSpec[,"mz"], + intensity = handSpec[,"int"], + precScanNum = as.integer(1), + precursorMz = findMz(cpdID)$mzCenter, + precursorIntensity = 0, + precursorCharge = as.integer(1), + collisionEnergy = 0, + tic = 0, + peaksCount = nrow(handSpec), + rt = findRt(cpdID)$RT, + acquisitionNum = as.integer(length(w@spectra[[specIndex]]@children) + 2), + centroided = TRUE) + return(w) +} + + +createSpecsFromPeaklists <- function(w, cpdIDs, filenames, mode="pH"){ + for(j in 1:length(filenames)){ + w <- addPeaksManually(w,cpdIDs[j],as.matrix(read.csv(filenames[j]), header=TRUE),mode) + } + + return(w) +} + + +#' MassBank-record Addition +#' +#' Adds the peaklist of a MassBank-Record to the specs of an msmsWorkspace +#' +#' @aliases addMB +#' @usage addMB(w, cpdID, fileName, mode) +#' @param w The msmsWorkspace that the peaklist should be added to. +#' @param cpdID The compoundID of the compound that has been used for the record +#' @param fileName The path to the record +#' @param mode The ionization mode that has been used to create the record +#' @return The \code{msmsWorkspace} with the additional peaklist from the record +#' @seealso \code{\link{addPeaksManually}} +#' @author Erik Mueller +#' @examples \dontrun{ +#' addMB("filepath_to_records/RC00001.txt") +#' } +#' @export +addMB <- function(w, cpdID, fileName, mode){ + mb <- parseMassBank(fileName) + peaklist <- list() + peaklist[[1]] <- mb@compiled_ok[[1]][["PK$PEAK"]][,1:2] + w <- addPeaksManually(w, cpdID, peaklist[[1]], mode) + return(w) +} diff --git a/R/msmsRead.R b/R/msmsRead.R index 408d8dc..6ceceba 100644 --- a/R/msmsRead.R +++ b/R/msmsRead.R @@ -1,370 +1,419 @@ -#' -#' Extracts and processes spectra from a specified file list, according to -#' loaded options and given parameters. -#' -#' The filenames of the raw LC-MS runs are read from the array \code{files} -#' in the global enviroment. -#' See the vignette \code{vignette("RMassBank")} for further details about the -#' workflow. -#' -#' @param w A \code{msmsWorkspace} to work with. -#' @param filetable The path to a .csv-file that contains the columns "Files" and "ID" supplying -#' the relationships between files and compound IDs. Either this or the parameter "files" need -#' to be specified. -#' @param files A vector or list containing the filenames of the files that are to be read as spectra. -#' For the IDs to be inferred from the filenames alone, there need to be exactly 2 underscores. -#' @param cpdids A vector or list containing the compound IDs of the files that are to be read as spectra. -#' The ordering of this and \code{files} implicitly assigns each ID to the corresponding file. -#' If this is supplied, then the IDs implicitly named in the filenames are ignored. -#' @param readMethod Several methods are available to get peak lists from the files. -#' Currently supported are "mzR", "xcms", "MassBank" and "peaklist". -#' The first two read MS/MS raw data, and differ in the strategy -#' used to extract peaks. MassBank will read existing records, -#' so that e.g. a recalibration can be performed, and "peaklist" -#' just requires a CSV with two columns and the column header "mz", "int". -#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). -#' @param confirmMode Defaults to false (use most intense precursor). Value 1 uses -#' the 2nd-most intense precursor for a chosen ion (and its data-dependent scans) -#' , etc. -#' @param useRtLimit Whether to enforce the given retention time window. -#' @param Args A list of arguments that will be handed to the xcms-method findPeaks via do.call -#' @param settings Options to be used for processing. Defaults to the options loaded via -#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. -#' @param progressbar The progress bar callback to use. Only needed for specialized applications. -#' Cf. the documentation of \code{\link{progressBarHook}} for usage. -#' @param MSe A boolean value that determines whether the spectra were recorded using MSe or not -#' @param plots A boolean value that determines whether the pseudospectra in XCMS should be plotted -#' @return The \code{msmsWorkspace} with msms-spectra read. -#' @seealso \code{\link{msmsWorkspace-class}}, \code{\link{msmsWorkflow}} -#' @author Michael Stravs, Eawag -#' @author Erik Mueller, UFZ -#' @export -msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, - readMethod, mode, confirmMode = FALSE, useRtLimit = TRUE, - Args = NULL, settings = getOption("RMassBank"), - progressbar = "progressBarHook", MSe = FALSE, plots = FALSE){ - .checkMbSettings() - ##Read the files and cpdids according to the definition - ##All cases are silently accepted, as long as they can be handled according to one definition - if(!any(mode %in% c("pH","pNa","pM","pNH4","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) - - if(is.null(filetable)){ - ##If no filetable is supplied, filenames must be named explicitly - if(is.null(files)) - stop("Please supply the files") - - ##Assign the filenames to the workspace - w@files <- unlist(files) - - ##If no filetable is supplied, cpdids must be delivered explicitly or implicitly within the filenames - if(is.null(cpdids)){ - splitfn <- strsplit(files,"_") - splitsfn <- sapply(splitfn, function(x) x[length(x)-1]) - if(suppressWarnings(any(is.na(as.numeric(splitsfn)[1])))) - stop("Please supply the cpdids corresponding to the files in the filetable or the filenames") - cpdids <- splitsfn - } - } else{ - ##If a filetable is supplied read it - tab <- read.csv(filetable, stringsAsFactors = FALSE) - w@files <- tab[,"Files"] - cpdids <- tab[,"ID"] - } - - ##If there's more cpdids than filenames or the other way around, then abort - if(length(w@files) != length(cpdids)){ - stop("There are a different number of cpdids than files") - } - - if(!(readMethod %in% c("mzR","peaklist","xcms","minimal"))){ - stop("The supplied method does not exist") - } - - if(!all(file.exists(w@files))){ - stop("The supplied files ", paste(w@files[!file.exists(w@files)]), " don't exist") - } - - # na.ids <- which(is.na(sapply(cpdids, findSmiles))) - - # if(length(na.ids)){ - # stop("The supplied compound ids ", paste(cpdids[na.ids], collapse=" "), " don't have a corresponding smiles entry. Maybe they are missing from the compound list") - # } - - ##This should work - if(readMethod == "minimal"){ - ##Edit options - opt <- getOption("RMassBank") - opt$recalibrator$MS1 <- "recalibrate.identity" - opt$recalibrator$MS2 <- "recalibrate.identity" - opt$add_annotation==FALSE - options(RMassBank=opt) - ##Edit analyzemethod - analyzeMethod <- "intensity" - } - - if(readMethod == "mzR"){ - ##Progressbar - nLen <- length(w@files) - nProg <- 0 - pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) - - count <- 1 - envir <- environment() - w@spectra <- as(lapply(w@files, function(fileName) { - - # Find compound ID - cpdID <- cpdids[count] - retrieval <- findLevel(cpdID,TRUE) - # Set counter up - envir$count <- envir$count + 1 - - # Retrieve spectrum data - spec <- findMsMsHR(fileName = fileName, - cpdID = cpdID, mode = mode, confirmMode = confirmMode, useRtLimit = useRtLimit, - ppmFine = settings$findMsMsRawSettings$ppmFine, - mzCoarse = settings$findMsMsRawSettings$mzCoarse, - fillPrecursorScan = settings$findMsMsRawSettings$fillPrecursorScan, - rtMargin = settings$rtMargin, - deprofile = settings$deprofile, retrieval=retrieval) - gc() - - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(spec) - } ), "SimpleList") - names(w@spectra) <- basename(as.character(w@files)) - return(w) - } - - ##xcms-readmethod - if(readMethod == "xcms"){ - - ##Load libraries - requireNamespace("xcms",quietly=TRUE) - requireNamespace("CAMERA",quietly=TRUE) - - ##Find unique files and cpdIDs - ufiles <- unique(w@files) - uIDs <- unique(cpdids) - nLen <- length(ufiles) - - ##Progressbar - nProg <- 0 - pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) - i <- 1 - - ##Routine for the case of multiple cpdIDs per file - if(length(uIDs) > length(ufiles)){ - w@spectra <- as(unlist(lapply(ufiles, function(currentFile){ - fileIDs <- cpdids[which(w@files == currentFile)] - spec <- findMsMsHRperxcms(currentFile, fileIDs, mode=mode, findPeaksArgs=Args, plots, MSe = MSe) - gc() - - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(spec) - }),FALSE),"SimpleList") - return(w) - } - - ##Routine for the other cases - w@spectra <- as(lapply(uIDs, function(ID){ - # Find files corresponding to the compoundID - currentFile <- w@files[which(cpdids == ID)] - - # Retrieve spectrum data - spec <- findMsMsHRperxcms(currentFile, ID, mode=mode, findPeaksArgs=Args, plots, MSe = MSe) - gc() - - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(spec) - }),"SimpleList") - ##If there are more files than unique cpdIDs, only remember the first file for every cpdID - w@files <- w@files[sapply(uIDs, function(ID){ - return(which(cpdids == ID)[1]) - })] - return(w) - } - - ##Peaklist-readmethod - if((readMethod == "peaklist") || (readMethod=="minimal")){ - w <- createSpecsFromPeaklists(w, cpdids, filenames=w@files, mode=mode) - uIDs <- unique(cpdids) - files <- list() - - for(i in 1:length(uIDs)){ - indices <- sapply(cpdids,function(a){return(uIDs[i] %in% a)}) - files[[i]] <- w@files[indices] - } - - w@files <- sapply(files,function(file){return(file[1])}) - message("Peaks read") - return(w) - } - - -} - -#' -#' Extracts and processes spectra from a list of xcms-Objects -#' -#' The filenames of the raw LC-MS runs are read from the array \code{files} -#' in the global enviroment. -#' See the vignette \code{vignette("RMassBank")} for further details about the -#' workflow. -#' -#' @param w A \code{msmsWorkspace} to work with. -#' @param xRAW A list of xcmsRaw objects whose peaks should be detected and added to the workspace. -#' The relevant data must be in the MS1 data of the xcmsRaw object. You can coerce the -#' msn-data in a usable object with the \code{msn2xcmsRaw} function of xcms. -#' @param cpdids A vector or list containing the compound IDs of the files that are to be read as spectra. -#' The ordering of this and \code{files} implicitly assigns each ID to the corresponding file. -#' If this is supplied, then the IDs implicitly named in the filenames are ignored. -#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). -#' @param findPeaksArgs A list of arguments that will be handed to the xcms-method findPeaks via do.call -#' @param settings Options to be used for processing. Defaults to the options loaded via -#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. -#' @param progressbar The progress bar callback to use. Only needed for specialized applications. -#' Cf. the documentation of \code{\link{progressBarHook}} for usage. -#' @param plots A boolean value that determines whether the pseudospectra in XCMS should be plotted -#' @return The \code{msmsWorkspace} with msms-spectra read. -#' @seealso \code{\link{msmsWorkspace-class}}, \code{\link{msmsWorkflow}} -#' @author Michael Stravs, Eawag -#' @author Erik Mueller, UFZ -#' @export -msmsRead.RAW <- function(w, xRAW = NULL, cpdids = NULL, mode, findPeaksArgs = NULL, - settings = getOption("RMassBank"), progressbar = "progressBarHook", plots = FALSE){ - - requireNamespace("xcms", quietly=TRUE) - - ##xRAW will be coerced into a list of length 1 if it is an xcmsRaw-object - if(class(xRAW) == "xcmsRaw"){ - xRAW <- list(xRAW) - } - - ##Error messages - if((class(xRAW) != "list") || any(sapply(xRAW, function(x) class(x) != "xcmsRaw"))){ - stop("No list of xcmsRaw-objects supplied") - } - - if(is.null(cpdids)){ - stop("No cpdids supplied") - } - - #msnExist <- which(sapply(xRAW,function(x) length(x@msnPrecursorScan) != 0)) - #print(length(msnExist)) - #print(length(xRAW)) - - #if(length(msnExist) != length(xRAW)){ - # stop(paste("No msn data in list elements", setdiff(1:length(xRAW),msnExist))) - #} - - requireNamespace("CAMERA",quietly=TRUE) - - parentMass <- findMz(cpdids[1], mode=mode)$mzCenter - if(is.na(parentMass)){ - stop(paste("There was no matching entry to the supplied cpdID", cpdids[1] ,"\n Please check the cpdIDs and the compoundlist.")) - } - - RT <- findRt(cpdids[1])$RT * 60 - mzabs <- 0.1 - - getRT <- function(xa) { - rt <- sapply(xa@pspectra, function(x) {median(peaks(xa@xcmsSet)[x, "rt"])}) - } - - suppressWarnings(setReplicate <- xcms::xcmsSet(files=xRAW[[1]]@filepath, method="MS1")) - xsmsms <- as.list(replicate(length(xRAW),setReplicate)) - candidates <- list() - anmsms <- list() - psp <- list() - spectra <- list() - whichmissing <- vector() - metaspec <- list() - for(i in 1:length(xRAW)){ - devnull <- suppressWarnings(capture.output(xcms::peaks(xsmsms[[i]]) <- do.call(xcms::findPeaks,c(findPeaksArgs, object = xRAW[[i]])))) - - if (nrow(xcms::peaks(xsmsms[[i]])) == 0) { ##If there are no peaks - spectra[[i]] <- matrix(0,2,7) - next - } else{ - ## Get pspec - pl <- xcms::peaks(xsmsms[[i]])[,c("mz", "rt"), drop=FALSE] - - ## Best: find precursor peak - candidates[[i]] <- which( pl[,"mz", drop=FALSE] < parentMass + mzabs & pl[,"mz", drop=FALSE] > parentMass - mzabs - & pl[,"rt", drop=FALSE] < RT * 1.1 & pl[,"rt", drop=FALSE] > RT * 0.9 ) - devnull <- capture.output(anmsms[[i]] <- CAMERA::xsAnnotate(xsmsms[[i]])) - devnull <- capture.output(anmsms[[i]] <- CAMERA::groupFWHM(anmsms[[i]])) - - if(length(candidates[[i]]) > 0){ - closestCandidate <- which.min (abs( RT - pl[candidates[[i]], "rt", drop=FALSE])) - psp[[i]] <- which(sapply(anmsms[[i]]@pspectra, function(x) {candidates[[i]][closestCandidate] %in% x})) - } else{ - psp[[i]] <- which.min( abs(getRT(anmsms[[i]]) - RT) ) - } - ## Now find the pspec for compound - - ## 2nd best: Spectrum closest to MS1 - ##psp <- which.min( abs(getRT(anmsms) - actualRT)) - - ## 3rd Best: find pspec closest to RT from spreadsheet - ##psp <- which.min( abs(getRT(anmsms) - RT) ) - if((plots == TRUE) && (length(psp[[i]]) > 0)){ - CAMERA::plotPsSpectrum(anmsms[[i]], psp[[i]], log=TRUE, mzrange=c(0, findMz(cpdids[1])[[3]]), maxlabel=10) - } - if(length(psp[[i]]) != 0){ - spectra[[i]] <- CAMERA::getpspectra(anmsms[[i]], psp[[i]]) - } else { - whichmissing <- c(whichmissing,i) - } - } - } - if(length(spectra) != 0){ - for(i in whichmissing){ - spectra[[i]] <- matrix(0,2,7) - } - } - - sp <- toRMB(spectra,cpdids,"mH") - sp@id <- as.character(as.integer(cpdids)) - sp@name <- findName(cpdids) - sp@formula <- findFormula(cpdids) - sp@mode <- mode - - if(length(w@spectra) != 0){ - IDindex <- sapply(w@spectra,function(s) s@id == cpdids) - if(length(IDindex)){ - spectraNum <- length(w@spectra[[which(IDindex)]]@children) - w@spectra[[which(IDindex)]]@children[[spectraNum+1]] <- sp@children[[1]] - } else { - w@spectra[[length(w@spectra)+1]] <- sp - } - } else{ - w@spectra[[1]] <- sp - } - - if(all(w@files != xRAW[[1]]@filepath)){ - w@files <- c(w@files,xRAW[[1]]@filepath) - } else{ - for(i in 2:(length(w@files)+1)){ - currentFPath <- paste0(xRAW[[1]]@filepath,"_",i) - if(all(w@files != currentFPath)){ - w@files <- c(w@files,currentFPath) - break - } - } - } - - return(w) -} - +#' +#' Extracts and processes spectra from a specified file list, according to +#' loaded options and given parameters. +#' +#' The filenames of the raw LC-MS runs are read from the array \code{files} +#' in the global enviroment. +#' See the vignette \code{vignette("RMassBank")} for further details about the +#' workflow. +#' +#' @param w A \code{msmsWorkspace} to work with. +#' @param filetable The path to a .csv-file that contains the columns "Files" and "ID" supplying +#' the relationships between files and compound IDs. Either this or the parameter "files" need +#' to be specified. +#' @param files A vector or list containing the filenames of the files that are to be read as spectra. +#' For the IDs to be inferred from the filenames alone, there need to be exactly 2 underscores. +#' @param cpdids A vector or list containing the compound IDs of the files that are to be read as spectra. +#' The ordering of this and \code{files} implicitly assigns each ID to the corresponding file. +#' If this is supplied, then the IDs implicitly named in the filenames are ignored. +#' @param readMethod Several methods are available to get peak lists from the files. +#' Currently supported are "mzR", "xcms", "MassBank" and "peaklist". +#' The first two read MS/MS raw data, and differ in the strategy +#' used to extract peaks. MassBank will read existing records, +#' so that e.g. a recalibration can be performed, and "peaklist" +#' just requires a CSV with two columns and the column header "mz", "int". +#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' @param confirmMode Defaults to false (use most intense precursor). Value 1 uses +#' the 2nd-most intense precursor for a chosen ion (and its data-dependent scans) +#' , etc. +#' @param useRtLimit Whether to enforce the given retention time window. +#' @param Args A list of arguments that will be handed to the xcms-method findPeaks via do.call +#' @param settings Options to be used for processing. Defaults to the options loaded via +#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. +#' @param progressbar The progress bar callback to use. Only needed for specialized applications. +#' Cf. the documentation of \code{\link{progressBarHook}} for usage. +#' @param MSe A boolean value that determines whether the spectra were recorded using MSe or not +#' @param plots A boolean value that determines whether the pseudospectra in XCMS should be plotted +#' @return The \code{msmsWorkspace} with msms-spectra read. +#' @seealso \code{\link{msmsWorkspace-class}}, \code{\link{msmsWorkflow}} +#' @author Michael Stravs, Eawag +#' @author Erik Mueller, UFZ +#' @export +msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, + readMethod, mode, confirmMode = FALSE, useRtLimit = TRUE, + Args = NULL, settings = getOption("RMassBank"), + progressbar = "progressBarHook", MSe = FALSE, plots = FALSE){ + .checkMbSettings() + ##Read the files and cpdids according to the definition + ##All cases are silently accepted, as long as they can be handled according to one definition + if(!any(mode %in% c("pH","pNa","pM","pNH4","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) + + if(is.null(filetable)){ + ##If no filetable is supplied, filenames must be named explicitly + if(is.null(files)) + stop("Please supply the files") + + ##Assign the filenames to the workspace + w@files <- unlist(files) + + ##If no filetable is supplied, cpdids must be delivered explicitly or implicitly within the filenames + if(is.null(cpdids)){ + splitfn <- strsplit(files,"_") + splitsfn <- sapply(splitfn, function(x) x[length(x)-1]) + if(suppressWarnings(any(is.na(as.numeric(splitsfn)[1])))) + stop("Please supply the cpdids corresponding to the files in the filetable or the filenames") + cpdids <- splitsfn + } + } else{ + ##If a filetable is supplied read it + tab <- read.csv(filetable, stringsAsFactors = FALSE) + w@files <- tab[,"Files"] + cpdids <- tab[,"ID"] + } + + ##If there's more cpdids than filenames or the other way around, then abort + if(length(w@files) != length(cpdids)){ + stop("There are a different number of cpdids than files") + } + + if(!(readMethod %in% c("mzR","peaklist","xcms","minimal","msp"))){ + stop("The supplied method does not exist") + } + + if(!all(file.exists(w@files))){ + stop("The supplied files ", paste(w@files[!file.exists(w@files)]), " don't exist") + } + + # na.ids <- which(is.na(sapply(cpdids, findSmiles))) + + # if(length(na.ids)){ + # stop("The supplied compound ids ", paste(cpdids[na.ids], collapse=" "), " don't have a corresponding smiles entry. Maybe they are missing from the compound list") + # } + + ##This should work + if(readMethod == "minimal"){ + ##Edit options + opt <- getOption("RMassBank") + opt$recalibrator$MS1 <- "recalibrate.identity" + opt$recalibrator$MS2 <- "recalibrate.identity" + opt$add_annotation==FALSE + options(RMassBank=opt) + ##Edit analyzemethod + analyzeMethod <- "intensity" + } + + if(readMethod == "mzR"){ + ##Progressbar + nLen <- length(w@files) + nProg <- 0 + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + + count <- 1 + envir <- environment() + w@spectra <- as(lapply(w@files, function(fileName) { + + # Find compound ID + cpdID <- cpdids[count] + retrieval <- findLevel(cpdID,TRUE) + # Set counter up + envir$count <- envir$count + 1 + + # Retrieve spectrum data + spec <- findMsMsHR(fileName = fileName, + cpdID = cpdID, mode = mode, confirmMode = confirmMode, useRtLimit = useRtLimit, + ppmFine = settings$findMsMsRawSettings$ppmFine, + mzCoarse = settings$findMsMsRawSettings$mzCoarse, + fillPrecursorScan = settings$findMsMsRawSettings$fillPrecursorScan, + rtMargin = settings$rtMargin, + deprofile = settings$deprofile, retrieval=retrieval) + gc() + + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(spec) + } ), "SimpleList") + names(w@spectra) <- basename(as.character(w@files)) + return(w) + } + + ##xcms-readmethod + if(readMethod == "xcms"){ + + ##Load libraries + requireNamespace("xcms",quietly=TRUE) + requireNamespace("CAMERA",quietly=TRUE) + + ##Find unique files and cpdIDs + ufiles <- unique(w@files) + uIDs <- unique(cpdids) + nLen <- length(ufiles) + + ##Progressbar + nProg <- 0 + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + i <- 1 + + ##Routine for the case of multiple cpdIDs per file + if(length(uIDs) > length(ufiles)){ + w@spectra <- as(unlist(lapply(ufiles, function(currentFile){ + fileIDs <- cpdids[which(w@files == currentFile)] + spec <- findMsMsHRperxcms(currentFile, fileIDs, mode=mode, findPeaksArgs=Args, plots, MSe = MSe) + gc() + + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(spec) + }),FALSE),"SimpleList") + return(w) + } + + ##Routine for the other cases + w@spectra <- as(lapply(uIDs, function(ID){ + # Find files corresponding to the compoundID + currentFile <- w@files[which(cpdids == ID)] + + # Retrieve spectrum data + spec <- findMsMsHRperxcms(currentFile, ID, mode=mode, findPeaksArgs=Args, plots, MSe = MSe) + gc() + + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(spec) + }),"SimpleList") + ##If there are more files than unique cpdIDs, only remember the first file for every cpdID + w@files <- w@files[sapply(uIDs, function(ID){ + return(which(cpdids == ID)[1]) + })] + return(w) + } + + ##Peaklist-readmethod + if((readMethod == "peaklist") || (readMethod=="minimal")){ + w <- createSpecsFromPeaklists(w, cpdids, filenames=w@files, mode=mode) + uIDs <- unique(cpdids) + files <- list() + + for(i in 1:length(uIDs)){ + indices <- sapply(cpdids,function(a){return(uIDs[i] %in% a)}) + files[[i]] <- w@files[indices] + } + + w@files <- sapply(files,function(file){return(file[1])}) + message("Peaks read") + return(w) + } + ##MSP-readmethod + if(readMethod == "msp"){ + ##Find unique files and cpdIDs + ufiles <- unique(w@files) + uIDs <- unique(cpdids) + nLen <- length(ufiles) + + ##Progressbar + nProg <- 0 + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + i <- 1 + + ##Routine for the case of multiple cpdIDs per file + if(length(uIDs) > length(ufiles)){ + w@spectra <- as(unlist(lapply(ufiles, function(currentFile){ + fileIDs <- cpdids[which(w@files == currentFile)] + spec <- findMsMsHRperMsp(fileName = currentFile, cpdIDs = fileIDs, mode=mode) + gc() + + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(spec) + }),FALSE),"SimpleList") + #w@spectra <- lapply(FUN = w@s) + return(w) + } + + ##Routine for the other cases + w@spectra <- as(lapply(uIDs, function(ID){ + # Find files corresponding to the compoundID + currentFile <- w@files[which(cpdids == ID)] + + # Retrieve spectrum data + spec <- findMsMsHRperMsp(currentFile, ID, mode=mode) + gc() + + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(spec) + }),"SimpleList") + ##If there are more files than unique cpdIDs, only remember the first file for every cpdID + w@files <- w@files[sapply(uIDs, function(ID){ + return(which(cpdids == ID)[1]) + })] + return(w) + } + +} + +#' +#' Extracts and processes spectra from a list of xcms-Objects +#' +#' The filenames of the raw LC-MS runs are read from the array \code{files} +#' in the global enviroment. +#' See the vignette \code{vignette("RMassBank")} for further details about the +#' workflow. +#' +#' @param w A \code{msmsWorkspace} to work with. +#' @param xRAW A list of xcmsRaw objects whose peaks should be detected and added to the workspace. +#' The relevant data must be in the MS1 data of the xcmsRaw object. You can coerce the +#' msn-data in a usable object with the \code{msn2xcmsRaw} function of xcms. +#' @param cpdids A vector or list containing the compound IDs of the files that are to be read as spectra. +#' The ordering of this and \code{files} implicitly assigns each ID to the corresponding file. +#' If this is supplied, then the IDs implicitly named in the filenames are ignored. +#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' @param findPeaksArgs A list of arguments that will be handed to the xcms-method findPeaks via do.call +#' @param settings Options to be used for processing. Defaults to the options loaded via +#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. +#' @param progressbar The progress bar callback to use. Only needed for specialized applications. +#' Cf. the documentation of \code{\link{progressBarHook}} for usage. +#' @param plots A boolean value that determines whether the pseudospectra in XCMS should be plotted +#' @return The \code{msmsWorkspace} with msms-spectra read. +#' @seealso \code{\link{msmsWorkspace-class}}, \code{\link{msmsWorkflow}} +#' @author Michael Stravs, Eawag +#' @author Erik Mueller, UFZ +#' @export +msmsRead.RAW <- function(w, xRAW = NULL, cpdids = NULL, mode, findPeaksArgs = NULL, + settings = getOption("RMassBank"), progressbar = "progressBarHook", plots = FALSE){ + + requireNamespace("xcms", quietly=TRUE) + + ##xRAW will be coerced into a list of length 1 if it is an xcmsRaw-object + if(class(xRAW) == "xcmsRaw"){ + xRAW <- list(xRAW) + } + + ##Error messages + if((class(xRAW) != "list") || any(sapply(xRAW, function(x) class(x) != "xcmsRaw"))){ + stop("No list of xcmsRaw-objects supplied") + } + + if(is.null(cpdids)){ + stop("No cpdids supplied") + } + + #msnExist <- which(sapply(xRAW,function(x) length(x@msnPrecursorScan) != 0)) + #print(length(msnExist)) + #print(length(xRAW)) + + #if(length(msnExist) != length(xRAW)){ + # stop(paste("No msn data in list elements", setdiff(1:length(xRAW),msnExist))) + #} + + requireNamespace("CAMERA",quietly=TRUE) + + parentMass <- findMz(cpdids[1], mode=mode)$mzCenter + if(is.na(parentMass)){ + stop(paste("There was no matching entry to the supplied cpdID", cpdids[1] ,"\n Please check the cpdIDs and the compoundlist.")) + } + + RT <- findRt(cpdids[1])$RT * 60 + mzabs <- 0.1 + + getRT <- function(xa) { + rt <- sapply(xa@pspectra, function(x) {median(peaks(xa@xcmsSet)[x, "rt"])}) + } + + suppressWarnings(setReplicate <- xcms::xcmsSet(files=xRAW[[1]]@filepath, method="MS1")) + xsmsms <- as.list(replicate(length(xRAW),setReplicate)) + candidates <- list() + anmsms <- list() + psp <- list() + spectra <- list() + whichmissing <- vector() + metaspec <- list() + for(i in 1:length(xRAW)){ + devnull <- suppressWarnings(capture.output(xcms::peaks(xsmsms[[i]]) <- do.call(xcms::findPeaks,c(findPeaksArgs, object = xRAW[[i]])))) + + if (nrow(xcms::peaks(xsmsms[[i]])) == 0) { ##If there are no peaks + spectra[[i]] <- matrix(0,2,7) + next + } else{ + ## Get pspec + pl <- xcms::peaks(xsmsms[[i]])[,c("mz", "rt"), drop=FALSE] + + ## Best: find precursor peak + candidates[[i]] <- which( pl[,"mz", drop=FALSE] < parentMass + mzabs & pl[,"mz", drop=FALSE] > parentMass - mzabs + & pl[,"rt", drop=FALSE] < RT * 1.1 & pl[,"rt", drop=FALSE] > RT * 0.9 ) + devnull <- capture.output(anmsms[[i]] <- CAMERA::xsAnnotate(xsmsms[[i]])) + devnull <- capture.output(anmsms[[i]] <- CAMERA::groupFWHM(anmsms[[i]])) + + if(length(candidates[[i]]) > 0){ + closestCandidate <- which.min (abs( RT - pl[candidates[[i]], "rt", drop=FALSE])) + psp[[i]] <- which(sapply(anmsms[[i]]@pspectra, function(x) {candidates[[i]][closestCandidate] %in% x})) + } else{ + psp[[i]] <- which.min( abs(getRT(anmsms[[i]]) - RT) ) + } + ## Now find the pspec for compound + + ## 2nd best: Spectrum closest to MS1 + ##psp <- which.min( abs(getRT(anmsms) - actualRT)) + + ## 3rd Best: find pspec closest to RT from spreadsheet + ##psp <- which.min( abs(getRT(anmsms) - RT) ) + if((plots == TRUE) && (length(psp[[i]]) > 0)){ + CAMERA::plotPsSpectrum(anmsms[[i]], psp[[i]], log=TRUE, mzrange=c(0, findMz(cpdids[1])[[3]]), maxlabel=10) + } + if(length(psp[[i]]) != 0){ + spectra[[i]] <- CAMERA::getpspectra(anmsms[[i]], psp[[i]]) + } else { + whichmissing <- c(whichmissing,i) + } + } + } + if(length(spectra) != 0){ + for(i in whichmissing){ + spectra[[i]] <- matrix(0,2,7) + } + } + + sp <- toRMB(spectra,cpdids,"mH") + sp@id <- as.character(as.integer(cpdids)) + sp@name <- findName(cpdids) + sp@formula <- findFormula(cpdids) + sp@mode <- mode + + if(length(w@spectra) != 0){ + IDindex <- sapply(w@spectra,function(s) s@id == cpdids) + if(length(IDindex)){ + spectraNum <- length(w@spectra[[which(IDindex)]]@children) + w@spectra[[which(IDindex)]]@children[[spectraNum+1]] <- sp@children[[1]] + } else { + w@spectra[[length(w@spectra)+1]] <- sp + } + } else{ + w@spectra[[1]] <- sp + } + + if(all(w@files != xRAW[[1]]@filepath)){ + w@files <- c(w@files,xRAW[[1]]@filepath) + } else{ + for(i in 2:(length(w@files)+1)){ + currentFPath <- paste0(xRAW[[1]]@filepath,"_",i) + if(all(w@files != currentFPath)){ + w@files <- c(w@files,currentFPath) + break + } + } + } + + return(w) +} + diff --git a/R/zzz.R b/R/zzz.R index 0fc2413..b6b8755 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,9 +1,10 @@ -.onLoad <- function(libname, pkgname) { - RMassBank.env <<- new.env() - RMassBank.env$ReadAnnotation <- FALSE - RMassBank.env$testnumber <- 1 - mb <- list() - attach(RMassBank.env) -} - +.onLoad <- function(libname, pkgname) { + RMassBank.env <<- new.env() + RMassBank.env$ReadAnnotation <- FALSE + RMassBank.env$testnumber <- 1 + + mb <- list() + attach(RMassBank.env) +} + utils::globalVariables(c("cpdID", "isotopes","mzCalc")) \ No newline at end of file From 08d02be8f0876dbc264761cf338f0a35c78db634 Mon Sep 17 00:00:00 2001 From: Treutler Date: Wed, 29 Nov 2017 13:29:27 +0100 Subject: [PATCH 02/71] fixed the pubchem-url for curl queries: http --> https --- R/webAccess.R | 1092 ++++++++++++++++++++++++------------------------- 1 file changed, 546 insertions(+), 546 deletions(-) diff --git a/R/webAccess.R b/R/webAccess.R index 204b345..e55ebc0 100755 --- a/R/webAccess.R +++ b/R/webAccess.R @@ -1,546 +1,546 @@ -#' @import XML RCurl rjson -NULL -## library(XML) -## library(RCurl) - - - -#' Retrieve information from Cactus -#' -#' Retrieves information from the Cactus Chemical Identifier Resolver -#' (PubChem). -#' -#' It is not necessary to specify in which format the \code{identifier} is. -#' Somehow, cactus does this automatically. -#' -#' @usage getCactus(identifier, representation) -#' @param identifier Any identifier interpreted by the resolver, e.g. an InChI -#' key or a SMILES code. -#' @param representation The desired representation, as required from the -#' resolver. e.g. \code{stdinchikey}, \code{chemspider_id}, \code{formula}... -#' Refer to the webpage for details. -#' @return The result of the query, in plain text. Can be NA, or one or -#' multiple lines (character array) of results. -#' @note Note that the InChI key is retrieved with a prefix (\code{InChIkey=}), -#' which must be removed for most database searches in other databases (e.g. -#' CTS). -#' @author Michael Stravs -#' @seealso \code{\link{getCtsRecord}}, \code{\link{getPcId}} -#' @references cactus Chemical Identifier Resolver: -#' \url{http://cactus.nci.nih.gov/chemical/structure} -#' @examples -#' -#' # Benzene: -#' getCactus("C1=CC=CC=C1", "cas") -#' getCactus("C1=CC=CC=C1", "stdinchikey") -#' getCactus("C1=CC=CC=C1", "chemspider_id") -#' -#' @export -getCactus <- function(identifier, representation) -{ - - ret <- tryCatch( - getURLContent(paste( - "https://cactus.nci.nih.gov/chemical/structure/", - URLencode(identifier), "/", representation, sep='')), - error = function(e) NA) - if(is.na(ret)) - return(NA) - if(ret=="

Page not found (404)

\n") - return(NA) - return(unlist(strsplit(ret, "\n"))) -} - -#' Search Pubchem CID -#' -#' Retrieves PubChem CIDs for a search term. -#' -#' Only the first result is returned currently. \bold{The function should be -#' regarded as experimental and has not thoroughly been tested.} -#' -#' @usage getPcId(query, from = "inchikey") -#' @param query ID to be converted -#' @param from Type of input ID -#' @return The PubChem CID (in string type). -#' @author Michael Stravs, Erik Mueller -#' @seealso \code{\link{getCtsRecord}}, \code{\link{getCactus}} -#' @references PubChem search: \url{http://pubchem.ncbi.nlm.nih.gov/} -#' -#' Pubchem REST: -#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} -#' @examples -#' getPcId("MKXZASYAUGDDCJ-NJAFHUGGSA-N") -#' -#' @export -getPcId <- function(query, from = "inchikey") -{ - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "description", "json", sep="/") - - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url),timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - # This happens if the InChI key is not found: - r <- fromJSON(data) - - if(!is.null(r$Fault)) - return(NA) - - titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) - - titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] - - PcID <- r$InformationList$Information[[titleEntry]]$CID - - if(is.null(PcID)){ - return(NA) - } else{ - return(PcID) - } -} - -# The following function is unfinished. -# getPcRecord <- function(pcid) -# { -# baseUrl <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" -# term <- paste(baseUrl, "esummary.fcgi?db=pccompound&id=", URLencode(as.character(pcid)), -# -# sep='') -# ret <- getURL(term) -# xml <- xmlParseDoc(ret,asText=TRUE) -# browser() -# } - - -# Note: some of the CHEBI codes returned are erroneous. (When the entry in -# CTS starts with "CHEBI:" instead of just the number, the XML output) -# Also, there is no ChemSpider ID in the XML output, unfortunately. - - - - - - - - -#' Retrieve information from CTS -#' -#' Retrieves a complete CTS record from the InChI key. -#' -#' @usage getCtsRecord(key) -#' -#' @param key The InChI key. -#' @return Returns a list with all information from CTS: \code{inchikey, -#' inchicode, formula, exactmass} contain single values. \code{synonyms} contains -#' an unordered list of scored synonyms (\code{type, name, score}, where \code{type} -#' indicates either a normal name or a specific IUPAC name, see below). -#' \code{externalIds} contains an unordered list of identifiers of the compound in -#' various databases (\code{name, value}, where \code{name} is the database name and -#' \code{value} the identifier in that database.) -#' -#' @note Currently, the CTS results are still incomplete; the name scores are all 0, -#' formula and exact mass return zero. -#' @references Chemical Translation Service: -#' \url{http://cts.fiehnlab.ucdavis.edu} -#' -#' @examples -#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' # show all synonym "types" -#' types <- unique(unlist(lapply(data$synonyms, function(i) i$type))) -#' \dontrun{print(types)} -#' -#' @author Michele Stravs, Eawag -#' @export -getCtsRecord <- function(key) -{ - baseURL <- "http://cts.fiehnlab.ucdavis.edu/service/compound/" - - errorvar <- 0 - currEnvir <- environment() - - ##tryCatch a CTS timeout - ## - tryCatch( - { - data <- getURL(paste0(baseURL,key), timeout=7) - }, - error=function(e){ - currEnvir$errorvar <- 1 - } - ) - - if(errorvar){ - warning("CTS seems to be currently unavailable or incapable of interpreting your request") - return(NULL) - } - - r <- fromJSON(data) - if(length(r) == 1) - if(r == "You entered an invalid InChIKey") - return(list()) - return(r) -} - -#' Convert a single ID to another using CTS. -#' -#' @usage getCtsKey(query, from = "Chemical Name", to = "InChIKey") -#' @param query ID to be converted -#' @param from Type of input ID -#' @param to Desired output ID -#' @return An unordered array with the resulting converted key(s). -#' -#' @examples -#' k <- getCtsKey("benzene", "Chemical Name", "InChIKey") -#' @author Michele Stravs, Eawag -#' @export -getCtsKey <- function(query, from = "Chemical Name", to = "InChIKey") -{ - baseURL <- "http://cts.fiehnlab.ucdavis.edu/service/convert" - url <- paste(baseURL, from, to, query, sep='/') - errorvar <- 0 - currEnvir <- environment() - - ##tryCatch a CTS timeout - ## - tryCatch( - { - data <- getURL(URLencode(url), timeout=7) - }, - error=function(e){ - currEnvir$errorvar <- 1 - } - ) - - if(errorvar){ - warning("CTS seems to be currently unavailable or incapable of interpreting your request") - return(NULL) - } - - - r <- fromJSON(data) - if(length(r) == 0) - return(NULL) - else - { - # read out the results in simplest form: - results <- unlist(lapply(r, function(row) row$result)) - return(results) - } -} - -#' Select a subset of external IDs from a CTS record. -#' -#' @usage CTS.externalIdSubset(data, database) -#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. -#' @param database The database for which keys should be returned. -#' @return Returns an array of all external identifiers stored in the record for the -#' given database. -#' -#' @examples -#' -#' \dontrun{ -#' # Return all CAS registry numbers stored for benzene. -#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' cas <- CTS.externalIdSubset(data, "CAS") -#' } -#' -#' @author Michele Stravs, Eawag -#' @export -CTS.externalIdSubset <- function(data, database) -{ - select <- which(unlist(lapply(data$externalIds, function(id) - { - id[["name"]] == database - }))) - keyEntries <- data$externalIds[select] - keys <- unlist(lapply(keyEntries, function(e) e[["value"]])) -} - -#' Find all available databases for a CTS record -#' -#' @usage CTS.externalIdTypes(data) -#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. -#' @return Returns an array of all database names for which there are external -#' identifiers stored in the record. -#' -#' @examples -#' -#' \dontrun{ -#' # Return all databases for which the benzene entry has -#' # links in the CTS record. -#' -#' data <- getCTS("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' databases <- CTS.externalIdTypes(data) -#' } -#' -#' @author Michele Stravs, Eawag -#' @export -CTS.externalIdTypes <- function(data) -{ - unique(unlist(lapply(data$externalIds, function(id) - { - id[["name"]] - }))) -} - -.pubChemOnline <- function(){ - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, "inchikey", "QEIXBXXKTUNWDK-UHFFFAOYSA-N", "description", "json", sep="/") - - errorvar <- 0 - currEnvir <- environment() - tryCatch( - ret <- getURL(URLencode(url), timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - warning("Pubchem is currently offline") - return(FALSE) - } else{ - return(TRUE) - } -} - - - -getPcCHEBI <- function(query, from = "inchikey") -{ - # Get the JSON-Data from Pubchem - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "synonyms", "json", sep="/") - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url),timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - r <- fromJSON(data) - - # This happens if the InChI key is not found: - if(!is.null(r$Fault)) - return(NA) - - # Find the entries which contain Chebi-links - synonymEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Synonym)))) - synonymList <- r$InformationList$Information[[synonymEntry]]$Synonym - matchChebi <- which(grepl("CHEBI:", synonymList, fixed=TRUE)) - - # It doesn't matter if the db is down or if chebi isn't found, so return NA also - if(length(matchChebi) == 0){ - return (NA) - } else { - return (sapply(matchChebi, function(x) synonymList[[x]])) - } -} - -#' Retrieve the Chemspider ID for a given compound -#' -#' Given an InChIKey, this function queries the chemspider web API to retrieve -#' the Chemspider ID of he compound with that InChIkey. -#' -#' @usage getCSID(query) -#' -#' @param query The InChIKey of the compound -#' @return Returns the chemspide -#' -#' @examples -#' -#' \dontrun{ -#' # Return all CAS registry numbers stored for benzene. -#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' cas <- CTS.externalIdSubset(data, "CAS") -#' } -#' -#' @author Michele Stravs, Eawag -#' @author Erik Mueller, UFZ -#' @export -getCSID <- function(query) -{ - baseURL <- "http://www.chemspider.com/InChI.asmx/InChIKeyToCSID?inchi_key=" - url <- paste0(baseURL, query) - - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url), timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - warning("Chemspider is currently offline") - return(NA) - } - - xml <- xmlParseDoc(data,asText=TRUE) - # the returned XML document contains only the root node called "string" which contains the correct CSID - idNodes <- getNodeSet(xml, "/") - id <- xmlValue(idNodes[[1]]) - return(id) -} - -##This function returns a sensible name for the compound -getPcSynonym <- function (query, from = "inchikey") -{ - # Get the JSON-Data from Pubchem - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "description", "json", sep="/") - - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url),timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - r <- fromJSON(data) - - # This happens if the InChI key is not found: - if(!is.null(r$Fault)) - return(NA) - - # Find the synonym - - titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) - - titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] - - title <- r$InformationList$Information[[titleEntry]]$Title - - if(is.null(title)){ - return(NA) - } else{ - return(title) - } -} - - -##A function to retrieve a IUPAC Name from Pubchem -getPcIUPAC <- function (query, from = "inchikey") -{ - # Get the JSON-Data from Pubchem - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "record", "json", sep="/") - - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url),timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - r <- fromJSON(data) - - # This happens if the InChI key is not found: - if(!is.null(r$Fault)) - return(NA) - - # Find the IUPAC-Names - if(!is.null(r$PC_Compounds[[1]]$props)){ - IUPACIndex <- which(unlist(lapply(r$PC_Compounds[[1]]$props, function(i) (i$urn$label == "IUPAC Name")))) - if(length(IUPACIndex) > 0){ - # Retrieve all IUPAC-Names - IUPACEntries <- lapply(IUPACIndex, function(x) r$PC_Compounds[[1]]$props[[x]]) - if(!is.null(IUPACEntries)){ - # Is there a preferred IUPAC-Name? If yes, retrieve that - PrefIUPAC <- which(unlist(lapply(IUPACEntries, function(x) x$urn$name == "Preferred"))) - } else{return(NA)} - } else{return(NA)} - } else{return(NA)} - - - if(length(PrefIUPAC) == 1){ - return(IUPACEntries[[PrefIUPAC]]$value$sval) - } else{ - # Else it doesn't matter which - return(IUPACEntries[[1]]$value$sval) - } -} - -getPcInchiKey <- function(query, from = "smiles"){ - # Get the JSON-Data from Pubchem - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "record", "json", sep="/") - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url),timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - r <- fromJSON(data) - - # This happens if the InChI key is not found: - if(!is.null(r$Fault)) - return(NA) - - # Find the entries which contain Chebi-links - if(!is.null(r$PC_Compounds[[1]]$props)){ - INKEYindex <- which(sapply(r$PC_Compounds[[1]]$props, function(x) x$urn$label) == "InChIKey") - if(length(INKEYindex) > 0){ - return(r$PC_Compounds[[1]]$props[[INKEYindex]]$value$sval) - } else{return(NA)} - } else{return(NA)} - - -} - -getPcSDF <- function(query, from = "smiles"){ - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "sdf", sep="/") - - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url),timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - molEnd <- regexpr(data,pattern="M END",fixed=TRUE)+5 - data <- c(strsplit(substring(data,1,molEnd),"\n")[[1]],"$$$$") - return(data) -} +#' @import XML RCurl rjson +NULL +## library(XML) +## library(RCurl) + + + +#' Retrieve information from Cactus +#' +#' Retrieves information from the Cactus Chemical Identifier Resolver +#' (PubChem). +#' +#' It is not necessary to specify in which format the \code{identifier} is. +#' Somehow, cactus does this automatically. +#' +#' @usage getCactus(identifier, representation) +#' @param identifier Any identifier interpreted by the resolver, e.g. an InChI +#' key or a SMILES code. +#' @param representation The desired representation, as required from the +#' resolver. e.g. \code{stdinchikey}, \code{chemspider_id}, \code{formula}... +#' Refer to the webpage for details. +#' @return The result of the query, in plain text. Can be NA, or one or +#' multiple lines (character array) of results. +#' @note Note that the InChI key is retrieved with a prefix (\code{InChIkey=}), +#' which must be removed for most database searches in other databases (e.g. +#' CTS). +#' @author Michael Stravs +#' @seealso \code{\link{getCtsRecord}}, \code{\link{getPcId}} +#' @references cactus Chemical Identifier Resolver: +#' \url{http://cactus.nci.nih.gov/chemical/structure} +#' @examples +#' +#' # Benzene: +#' getCactus("C1=CC=CC=C1", "cas") +#' getCactus("C1=CC=CC=C1", "stdinchikey") +#' getCactus("C1=CC=CC=C1", "chemspider_id") +#' +#' @export +getCactus <- function(identifier, representation) +{ + + ret <- tryCatch( + getURLContent(paste( + "https://cactus.nci.nih.gov/chemical/structure/", + URLencode(identifier), "/", representation, sep='')), + error = function(e) NA) + if(is.na(ret)) + return(NA) + if(ret=="

Page not found (404)

\n") + return(NA) + return(unlist(strsplit(ret, "\n"))) +} + +#' Search Pubchem CID +#' +#' Retrieves PubChem CIDs for a search term. +#' +#' Only the first result is returned currently. \bold{The function should be +#' regarded as experimental and has not thoroughly been tested.} +#' +#' @usage getPcId(query, from = "inchikey") +#' @param query ID to be converted +#' @param from Type of input ID +#' @return The PubChem CID (in string type). +#' @author Michael Stravs, Erik Mueller +#' @seealso \code{\link{getCtsRecord}}, \code{\link{getCactus}} +#' @references PubChem search: \url{http://pubchem.ncbi.nlm.nih.gov/} +#' +#' Pubchem REST: +#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} +#' @examples +#' getPcId("MKXZASYAUGDDCJ-NJAFHUGGSA-N") +#' +#' @export +getPcId <- function(query, from = "inchikey") +{ + baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "description", "json", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + # This happens if the InChI key is not found: + r <- fromJSON(data) + + if(!is.null(r$Fault)) + return(NA) + + titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) + + titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] + + PcID <- r$InformationList$Information[[titleEntry]]$CID + + if(is.null(PcID)){ + return(NA) + } else{ + return(PcID) + } +} + +# The following function is unfinished. +# getPcRecord <- function(pcid) +# { +# baseUrl <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" +# term <- paste(baseUrl, "esummary.fcgi?db=pccompound&id=", URLencode(as.character(pcid)), +# +# sep='') +# ret <- getURL(term) +# xml <- xmlParseDoc(ret,asText=TRUE) +# browser() +# } + + +# Note: some of the CHEBI codes returned are erroneous. (When the entry in +# CTS starts with "CHEBI:" instead of just the number, the XML output) +# Also, there is no ChemSpider ID in the XML output, unfortunately. + + + + + + + + +#' Retrieve information from CTS +#' +#' Retrieves a complete CTS record from the InChI key. +#' +#' @usage getCtsRecord(key) +#' +#' @param key The InChI key. +#' @return Returns a list with all information from CTS: \code{inchikey, +#' inchicode, formula, exactmass} contain single values. \code{synonyms} contains +#' an unordered list of scored synonyms (\code{type, name, score}, where \code{type} +#' indicates either a normal name or a specific IUPAC name, see below). +#' \code{externalIds} contains an unordered list of identifiers of the compound in +#' various databases (\code{name, value}, where \code{name} is the database name and +#' \code{value} the identifier in that database.) +#' +#' @note Currently, the CTS results are still incomplete; the name scores are all 0, +#' formula and exact mass return zero. +#' @references Chemical Translation Service: +#' \url{http://cts.fiehnlab.ucdavis.edu} +#' +#' @examples +#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") +#' # show all synonym "types" +#' types <- unique(unlist(lapply(data$synonyms, function(i) i$type))) +#' \dontrun{print(types)} +#' +#' @author Michele Stravs, Eawag +#' @export +getCtsRecord <- function(key) +{ + baseURL <- "http://cts.fiehnlab.ucdavis.edu/service/compound/" + + errorvar <- 0 + currEnvir <- environment() + + ##tryCatch a CTS timeout + ## + tryCatch( + { + data <- getURL(paste0(baseURL,key), timeout=7) + }, + error=function(e){ + currEnvir$errorvar <- 1 + } + ) + + if(errorvar){ + warning("CTS seems to be currently unavailable or incapable of interpreting your request") + return(NULL) + } + + r <- fromJSON(data) + if(length(r) == 1) + if(r == "You entered an invalid InChIKey") + return(list()) + return(r) +} + +#' Convert a single ID to another using CTS. +#' +#' @usage getCtsKey(query, from = "Chemical Name", to = "InChIKey") +#' @param query ID to be converted +#' @param from Type of input ID +#' @param to Desired output ID +#' @return An unordered array with the resulting converted key(s). +#' +#' @examples +#' k <- getCtsKey("benzene", "Chemical Name", "InChIKey") +#' @author Michele Stravs, Eawag +#' @export +getCtsKey <- function(query, from = "Chemical Name", to = "InChIKey") +{ + baseURL <- "http://cts.fiehnlab.ucdavis.edu/service/convert" + url <- paste(baseURL, from, to, query, sep='/') + errorvar <- 0 + currEnvir <- environment() + + ##tryCatch a CTS timeout + ## + tryCatch( + { + data <- getURL(URLencode(url), timeout=7) + }, + error=function(e){ + currEnvir$errorvar <- 1 + } + ) + + if(errorvar){ + warning("CTS seems to be currently unavailable or incapable of interpreting your request") + return(NULL) + } + + + r <- fromJSON(data) + if(length(r) == 0) + return(NULL) + else + { + # read out the results in simplest form: + results <- unlist(lapply(r, function(row) row$result)) + return(results) + } +} + +#' Select a subset of external IDs from a CTS record. +#' +#' @usage CTS.externalIdSubset(data, database) +#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. +#' @param database The database for which keys should be returned. +#' @return Returns an array of all external identifiers stored in the record for the +#' given database. +#' +#' @examples +#' +#' \dontrun{ +#' # Return all CAS registry numbers stored for benzene. +#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") +#' cas <- CTS.externalIdSubset(data, "CAS") +#' } +#' +#' @author Michele Stravs, Eawag +#' @export +CTS.externalIdSubset <- function(data, database) +{ + select <- which(unlist(lapply(data$externalIds, function(id) + { + id[["name"]] == database + }))) + keyEntries <- data$externalIds[select] + keys <- unlist(lapply(keyEntries, function(e) e[["value"]])) +} + +#' Find all available databases for a CTS record +#' +#' @usage CTS.externalIdTypes(data) +#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. +#' @return Returns an array of all database names for which there are external +#' identifiers stored in the record. +#' +#' @examples +#' +#' \dontrun{ +#' # Return all databases for which the benzene entry has +#' # links in the CTS record. +#' +#' data <- getCTS("UHOVQNZJYSORNB-UHFFFAOYSA-N") +#' databases <- CTS.externalIdTypes(data) +#' } +#' +#' @author Michele Stravs, Eawag +#' @export +CTS.externalIdTypes <- function(data) +{ + unique(unlist(lapply(data$externalIds, function(id) + { + id[["name"]] + }))) +} + +.pubChemOnline <- function(){ + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, "inchikey", "QEIXBXXKTUNWDK-UHFFFAOYSA-N", "description", "json", sep="/") + + errorvar <- 0 + currEnvir <- environment() + tryCatch( + ret <- getURL(URLencode(url), timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + warning("Pubchem is currently offline") + return(FALSE) + } else{ + return(TRUE) + } +} + + + +getPcCHEBI <- function(query, from = "inchikey") +{ + # Get the JSON-Data from Pubchem + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "synonyms", "json", sep="/") + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the entries which contain Chebi-links + synonymEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Synonym)))) + synonymList <- r$InformationList$Information[[synonymEntry]]$Synonym + matchChebi <- which(grepl("CHEBI:", synonymList, fixed=TRUE)) + + # It doesn't matter if the db is down or if chebi isn't found, so return NA also + if(length(matchChebi) == 0){ + return (NA) + } else { + return (sapply(matchChebi, function(x) synonymList[[x]])) + } +} + +#' Retrieve the Chemspider ID for a given compound +#' +#' Given an InChIKey, this function queries the chemspider web API to retrieve +#' the Chemspider ID of he compound with that InChIkey. +#' +#' @usage getCSID(query) +#' +#' @param query The InChIKey of the compound +#' @return Returns the chemspide +#' +#' @examples +#' +#' \dontrun{ +#' # Return all CAS registry numbers stored for benzene. +#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") +#' cas <- CTS.externalIdSubset(data, "CAS") +#' } +#' +#' @author Michele Stravs, Eawag +#' @author Erik Mueller, UFZ +#' @export +getCSID <- function(query) +{ + baseURL <- "http://www.chemspider.com/InChI.asmx/InChIKeyToCSID?inchi_key=" + url <- paste0(baseURL, query) + + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url), timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + warning("Chemspider is currently offline") + return(NA) + } + + xml <- xmlParseDoc(data,asText=TRUE) + # the returned XML document contains only the root node called "string" which contains the correct CSID + idNodes <- getNodeSet(xml, "/") + id <- xmlValue(idNodes[[1]]) + return(id) +} + +##This function returns a sensible name for the compound +getPcSynonym <- function (query, from = "inchikey") +{ + # Get the JSON-Data from Pubchem + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "description", "json", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the synonym + + titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) + + titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] + + title <- r$InformationList$Information[[titleEntry]]$Title + + if(is.null(title)){ + return(NA) + } else{ + return(title) + } +} + + +##A function to retrieve a IUPAC Name from Pubchem +getPcIUPAC <- function (query, from = "inchikey") +{ + # Get the JSON-Data from Pubchem + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "record", "json", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the IUPAC-Names + if(!is.null(r$PC_Compounds[[1]]$props)){ + IUPACIndex <- which(unlist(lapply(r$PC_Compounds[[1]]$props, function(i) (i$urn$label == "IUPAC Name")))) + if(length(IUPACIndex) > 0){ + # Retrieve all IUPAC-Names + IUPACEntries <- lapply(IUPACIndex, function(x) r$PC_Compounds[[1]]$props[[x]]) + if(!is.null(IUPACEntries)){ + # Is there a preferred IUPAC-Name? If yes, retrieve that + PrefIUPAC <- which(unlist(lapply(IUPACEntries, function(x) x$urn$name == "Preferred"))) + } else{return(NA)} + } else{return(NA)} + } else{return(NA)} + + + if(length(PrefIUPAC) == 1){ + return(IUPACEntries[[PrefIUPAC]]$value$sval) + } else{ + # Else it doesn't matter which + return(IUPACEntries[[1]]$value$sval) + } +} + +getPcInchiKey <- function(query, from = "smiles"){ + # Get the JSON-Data from Pubchem + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "record", "json", sep="/") + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the entries which contain Chebi-links + if(!is.null(r$PC_Compounds[[1]]$props)){ + INKEYindex <- which(sapply(r$PC_Compounds[[1]]$props, function(x) x$urn$label) == "InChIKey") + if(length(INKEYindex) > 0){ + return(r$PC_Compounds[[1]]$props[[INKEYindex]]$value$sval) + } else{return(NA)} + } else{return(NA)} + + +} + +getPcSDF <- function(query, from = "smiles"){ + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "sdf", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + molEnd <- regexpr(data,pattern="M END",fixed=TRUE)+5 + data <- c(strsplit(substring(data,1,molEnd),"\n")[[1]],"$$$$") + return(data) +} From 219e19e7f7e41838ced7db7c3a9170c2ae221227 Mon Sep 17 00:00:00 2001 From: Treutler Date: Thu, 30 Nov 2017 14:56:52 +0100 Subject: [PATCH 03/71] Added the msp-functions to NAMESPACE --- NAMESPACE | 280 +++++++++++++++++++++++++++--------------------------- 1 file changed, 141 insertions(+), 139 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index a589a8b..5cdd5f1 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,139 +1,141 @@ -# Generated by roxygen2: do not edit by hand - -export(CTS.externalIdSubset) -export(CTS.externalIdTypes) -export(RmbDefaultSettings) -export(RmbSettingsTemplate) -export(add.formula) -export(addMB) -export(addPeaks) -export(addPeaksManually) -export(addProperty) -export(aggregateSpectra) -export(analyzeMsMs) -export(analyzeMsMs.formula) -export(analyzeMsMs.intensity) -export(annotator.default) -export(archiveResults) -export(checkIsotopes) -export(checkSpectra) -export(cleanElnoise) -export(combineMultiplicities) -export(compileRecord) -export(createMolfile) -export(dbe) -export(deprofile) -export(deprofile.fwhm) -export(deprofile.localMax) -export(deprofile.scan) -export(deprofile.spline) -export(exportMassbank) -export(filterMultiplicity) -export(filterPeakSatellites) -export(filterPeaksMultiplicity) -export(findCAS) -export(findEIC) -export(findFormula) -export(findLevel) -export(findMass) -export(findMsMsHR) -export(findMsMsHR.direct) -export(findMsMsHR.mass) -export(findMsMsHR.ticms2) -export(findMsMsHRperxcms) -export(findMsMsHRperxcms.direct) -export(findMz) -export(findMz.formula) -export(findName) -export(findProgress) -export(findRt) -export(findSmiles) -export(flatten) -export(formulastring.to.list) -export(gatherCompound) -export(gatherData) -export(gatherDataBabel) -export(gatherDataUnknown) -export(gatherPubChem) -export(gatherSpectrum) -export(getCSID) -export(getCactus) -export(getCtsKey) -export(getCtsRecord) -export(getMolecule) -export(getPcId) -export(is.valid.formula) -export(list.to.formula) -export(loadInfolist) -export(loadInfolists) -export(loadList) -export(loadMsmsWorkspace) -export(loadRmbSettings) -export(loadRmbSettingsFromEnv) -export(makeMollist) -export(makePeaksCache) -export(makeRecalibration) -export(mbWorkflow) -export(msmsRead) -export(msmsRead.RAW) -export(msmsWorkflow) -export(multiply.formula) -export(newMbWorkspace) -export(newMsmsWorkspace) -export(order.formula) -export(parseMassBank) -export(peaksMatched) -export(peaksUnmatched) -export(plotMbWorkspaces) -export(plotRecalibration) -export(plotRecalibration.direct) -export(ppm) -export(problematicPeaks) -export(processProblematicPeaks) -export(progressBarHook) -export(readMbdata) -export(reanalyzeFailpeak) -export(reanalyzeFailpeaks) -export(recalibrate) -export(recalibrate.addMS1data) -export(recalibrate.identity) -export(recalibrate.linear) -export(recalibrate.loess) -export(recalibrate.mean) -export(recalibrateSingleSpec) -export(recalibrateSpectra) -export(resetInfolists) -export(resetList) -export(selectPeaks) -export(selectSpectra) -export(smiles2mass) -export(spectraCount) -export(to.limits.rcdk) -export(toMassbank) -export(toRMB) -export(updateSettings) -export(validate) -exportClasses(mbWorkspace) -exportClasses(msmsWorkspace) -exportMethods(addProperty) -exportMethods(checkSpectra) -exportMethods(getData) -exportMethods(peaksMatched) -exportMethods(peaksUnmatched) -exportMethods(selectPeaks) -exportMethods(selectSpectra) -exportMethods(setData) -exportMethods(show) -exportMethods(spectraCount) -import(Biobase) -import(MSnbase) -import(RCurl) -import(Rcpp) -import(S4Vectors) -import(XML) -import(digest) -import(methods) -import(mzR) -import(rcdk) -import(rjson) -import(yaml) +# Generated by roxygen2: do not edit by hand + +export(CTS.externalIdSubset) +export(CTS.externalIdTypes) +export(RmbDefaultSettings) +export(RmbSettingsTemplate) +export(add.formula) +export(addMB) +export(addPeaks) +export(addPeaksManually) +export(addProperty) +export(aggregateSpectra) +export(analyzeMsMs) +export(analyzeMsMs.formula) +export(analyzeMsMs.intensity) +export(annotator.default) +export(archiveResults) +export(checkIsotopes) +export(checkSpectra) +export(cleanElnoise) +export(combineMultiplicities) +export(compileRecord) +export(createMolfile) +export(dbe) +export(deprofile) +export(deprofile.fwhm) +export(deprofile.localMax) +export(deprofile.scan) +export(deprofile.spline) +export(exportMassbank) +export(filterMultiplicity) +export(filterPeakSatellites) +export(filterPeaksMultiplicity) +export(findCAS) +export(findEIC) +export(findFormula) +export(findLevel) +export(findMass) +export(findMsMsHR) +export(findMsMsHR.direct) +export(findMsMsHR.mass) +export(findMsMsHR.ticms2) +export(findMsMsHRperxcms) +export(findMsMsHRperxcms.direct) +export(findMsMsHRperMsp) +export(findMsMsHRperMsp.direct) +export(findMz) +export(findMz.formula) +export(findName) +export(findProgress) +export(findRt) +export(findSmiles) +export(flatten) +export(formulastring.to.list) +export(gatherCompound) +export(gatherData) +export(gatherDataBabel) +export(gatherDataUnknown) +export(gatherPubChem) +export(gatherSpectrum) +export(getCSID) +export(getCactus) +export(getCtsKey) +export(getCtsRecord) +export(getMolecule) +export(getPcId) +export(is.valid.formula) +export(list.to.formula) +export(loadInfolist) +export(loadInfolists) +export(loadList) +export(loadMsmsWorkspace) +export(loadRmbSettings) +export(loadRmbSettingsFromEnv) +export(makeMollist) +export(makePeaksCache) +export(makeRecalibration) +export(mbWorkflow) +export(msmsRead) +export(msmsRead.RAW) +export(msmsWorkflow) +export(multiply.formula) +export(newMbWorkspace) +export(newMsmsWorkspace) +export(order.formula) +export(parseMassBank) +export(peaksMatched) +export(peaksUnmatched) +export(plotMbWorkspaces) +export(plotRecalibration) +export(plotRecalibration.direct) +export(ppm) +export(problematicPeaks) +export(processProblematicPeaks) +export(progressBarHook) +export(readMbdata) +export(reanalyzeFailpeak) +export(reanalyzeFailpeaks) +export(recalibrate) +export(recalibrate.addMS1data) +export(recalibrate.identity) +export(recalibrate.linear) +export(recalibrate.loess) +export(recalibrate.mean) +export(recalibrateSingleSpec) +export(recalibrateSpectra) +export(resetInfolists) +export(resetList) +export(selectPeaks) +export(selectSpectra) +export(smiles2mass) +export(spectraCount) +export(to.limits.rcdk) +export(toMassbank) +export(toRMB) +export(updateSettings) +export(validate) +exportClasses(mbWorkspace) +exportClasses(msmsWorkspace) +exportMethods(addProperty) +exportMethods(checkSpectra) +exportMethods(getData) +exportMethods(peaksMatched) +exportMethods(peaksUnmatched) +exportMethods(selectPeaks) +exportMethods(selectSpectra) +exportMethods(setData) +exportMethods(show) +exportMethods(spectraCount) +import(Biobase) +import(MSnbase) +import(RCurl) +import(Rcpp) +import(S4Vectors) +import(XML) +import(digest) +import(methods) +import(mzR) +import(rcdk) +import(rjson) +import(yaml) From 52ed7116472e502babb435fc2d3059c90e79206d Mon Sep 17 00:00:00 2001 From: Treutler Date: Thu, 30 Nov 2017 14:57:43 +0100 Subject: [PATCH 04/71] Added optional parameter RMassBank.env$verbose.output which prints information about absent/filtered peaks during the msmsWorkflow --- R/leMsMs.r | 131 ++++++++++++++++++++++++++++++++++----------------- R/msmsRead.R | 58 ++++++++++++++++++----- R/zzz.R | 1 + 3 files changed, 137 insertions(+), 53 deletions(-) diff --git a/R/leMsMs.r b/R/leMsMs.r index 935ae57..3e38f8a 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -169,6 +169,12 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec { message("msmsWorkflow: Step 3. Aggregate all spectra") w@aggregated <- aggregateSpectra(spec = w@spectra, addIncomplete=TRUE) + + if(RMassBank.env$verbose.output){ + numberOfPeaksThere <- sum(unlist(lapply(X = w@spectra, FUN = function(spec){ sum(unlist(lapply(X = spec@children, FUN = function(child){ child@peaksCount }))) }))) + if(nrow(w@aggregated) < numberOfPeaksThere) + cat(paste("### Warning ### The aggregation of spectra lead to the removal of ", (numberOfPeaksThere-nrow(w@aggregated)), " / ", numberOfPeaksThere, " peaks\n", sep = "")) + } } if(allUnknown){ @@ -243,21 +249,37 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec { message("msmsWorkflow: Step 6. Aggregate recalibrated results") w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) + + if(RMassBank.env$verbose.output){ + numberOfPeaksThere <- sum(unlist(lapply(X = w@spectra, FUN = function(spec){ sum(unlist(lapply(X = spec@children, FUN = function(child){ child@peaksCount }))) }))) + if(nrow(w@aggregated) < numberOfPeaksThere) + cat(paste("### Warning ### The aggregation of spectra lead to the removal of ", (numberOfPeaksThere-nrow(w@aggregated)), " / ", numberOfPeaksThere, " peaks\n", sep = "")) + } + if(!is.na(archivename)) archiveResults(w, paste(archivename, ".RData", sep=''), settings) - w@aggregated <- cleanElnoise(w@aggregated, - settings$electronicNoise, settings$electronicNoiseWidth) + w@aggregated <- cleanElnoise(peaks = w@aggregated, noise = settings$electronicNoise, width = settings$electronicNoiseWidth) + + if(RMassBank.env$verbose.output) + if(sum(w@aggregated$noise) > 0) + cat(paste("### Warning ### ", sum(w@aggregated$noise), " / ", nrow(w@aggregated), " peaks have been identified as electronic noise\n", sep = "")) } # Step 7: reanalyze failpeaks for (mono)oxidation and N2 adduct peaks if(7 %in% steps) { message("msmsWorkflow: Step 7. Reanalyze fail peaks for N2 + O") w@aggregated <- reanalyzeFailpeaks( - w@aggregated, custom_additions="N2O", mode=mode, + aggregated = w@aggregated, custom_additions="N2O", mode=mode, filterSettings=settings$filterSettings, progressbar=progressbar) if(!is.na(archivename)) archiveResults(w, paste(archivename, "_RA.RData", sep=''), settings) + + if(RMassBank.env$verbose.output){ + noFormulaCount <- sum(is.na(w@aggregated$formula) & is.na(w@aggregated$reanalyzed.formula)) + if(noFormulaCount > 0) + cat(paste("### Warning ### ", noFormulaCount, " / ", nrow(w@aggregated), " peaks have no molecular formula\n", sep = "")) + } } # Step 8: heuristic filtering based on peak multiplicity; # creation of failpeak list @@ -268,10 +290,22 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec message("msmsWorkflow: Step 8. Peak multiplicity filtering skipped because multiplicityFilter parameter is not set.") } else { # apply heuristic filter - w@aggregated <- filterMultiplicity( - w, archivename, mode, settings$multiplicityFilter) + w@aggregated <- filterMultiplicity(w, archivename, mode, settings$multiplicityFilter) + + if(RMassBank.env$verbose.output){ + multiplicityNotOkCount <- sum(!w@aggregated$filterOK) + if(multiplicityNotOkCount > 0) + cat(paste("### Warning ### ", multiplicityNotOkCount, " / ", nrow(w@aggregated), " peaks do not fulfill the multiplicity criterion\n", sep = "")) + } + w@aggregated <- processProblematicPeaks(w, mode, archivename) - + + if(RMassBank.env$verbose.output){ + problematicPeakCount <- sum(w@aggregated$problematicPeak) + if(problematicPeakCount > 0) + cat(paste("### Warning ### ", problematicPeakCount, " / ", nrow(w@aggregated), " peaks are problematic\n", sep = "")) + } + if(!is.na(archivename)) archiveResults(w, paste(archivename, "_RF.RData", sep=''), settings) } @@ -509,6 +543,9 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi # with insufficient match accuracy or no match. analyzeTandemShot <- function(child) { + childIdx <- which(sapply(X = seq_along(w@spectra[[3]]@children), FUN = function(i){ + all(child@mz == w@spectra[[3]]@children[[i]]@mz) & all(child@rt == w@spectra[[3]]@children[[i]]@rt) & all(child@intensity == w@spectra[[3]]@children[[i]]@intensity) } + )) shot <- getData(child) shot$row <- which(!is.na(shot$mz)) @@ -522,6 +559,10 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi if(length(which(!child@low))==0) { child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' contains only low-intensity peaks\n", sep = "")) + return(child) } @@ -535,12 +576,20 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi if(nrow(shot)==0) { child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' contains no peaks after satellite filtering\n", sep = "")) + return(child) } if(max(shot$intensity) < as.numeric(filterSettings$specOkLimit)) { child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is discarded due to parameter 'specOkLimit'\n", sep = "")) + return(child) } @@ -578,9 +627,8 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi allowed_additions <- "NH4" mode.charge <- 1 } else{ - stop("mode = \"", mode, "\" not defined") - } - + stop("mode = \"", mode, "\" not defined") + } # the ppm range is two-sided here. # The range is slightly expanded because dppm calculation of @@ -600,6 +648,10 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi if(!is.valid.formula(parent_formula)) { child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### The precursor ion formula of spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is invalid\n", sep = "")) + return(child) } @@ -612,8 +664,8 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi # finally back-correct calculated masses for the charge mass <- shot.row[["mz"]] mass.calc <- mass + mode.charge * .emass - peakformula <- tryCatch(suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), - limits, charge=0)), error=function(e) NA) + peakformula <- tryCatch(suppressWarnings(generate.formula(mass = mass.calc, window = ppm(mass.calc, ppmlimit, p=TRUE), + elements = limits, charge=0)), error=function(e) NA) #peakformula <- tryCatch( # generate.formula(mass, # ppm(mass, ppmlimit, p=TRUE), @@ -633,8 +685,7 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi mzCalc=mzCalc) }))) } - - }) + }) childPeaks <- as.data.frame(do.call(rbind, peakmatrix)) @@ -672,6 +723,10 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi if(nrow(childPeaks)==0) { child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is empty\n", sep = "")) + return(child) } @@ -1096,22 +1151,21 @@ aggregateSpectra <- function(spec, addIncomplete=FALSE) cleanElnoise <- function(peaks, noise=getOption("RMassBank")$electronicNoise, width = getOption("RMassBank")$electronicNoiseWidth) { - peaks <- addProperty(peaks, "noise", "logical", FALSE) - - # I don't think this makes sense if using one big table... - ## # use only best peaks - ## p_best <- peaks[is.na(peaks$dppmBest) | (peaks$dppm == peaks$dppmBest),] - - # remove known electronic noise - p_eln <- peaks - for(noisePeak in noise) - { + + # I don't think this makes sense if using one big table... + ## # use only best peaks + ## p_best <- peaks[is.na(peaks$dppmBest) | (peaks$dppm == peaks$dppmBest),] + + # remove known electronic noise + p_eln <- peaks + for(noisePeak in noise) + { noiseMatches <- which(!((p_eln$mzFound > noisePeak + width) | (p_eln$mzFound < noisePeak - width))) if(length(noiseMatches) > 0) p_eln[noiseMatches, "noise"] <- TRUE - } - return(p_eln) + } + return(p_eln) } #' Identify intense peaks (in a list of unmatched peaks) @@ -1879,8 +1933,7 @@ filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) else { # calculate duplicity info - multInfo <- aggregate(as.data.frame(peaks$scan), - list(peaks$cpdID, peaks[,formulacol]), FUN=length) + multInfo <- aggregate(as.data.frame(peaks$scan), list(peaks$cpdID, peaks[,formulacol]), FUN=length) # just for comparison: # nform <- unique(paste(pks$cpdID,pks$formula)) @@ -2022,22 +2075,16 @@ filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) filterMultiplicity <- function(w, archivename=NA, mode="pH", recalcBest = TRUE, multiplicityFilter = getOption("RMassBank")$multiplicityFilter) { - # Read multiplicity filter setting - # For backwards compatibility: If the option is not set, define as 2 - # (which was the behaviour before we introduced the option) - if(is.null(multiplicityFilter)) - multiplicityFilter <- 2 + # Read multiplicity filter setting + # For backwards compatibility: If the option is not set, define as 2 + # (which was the behaviour before we introduced the option) + if(is.null(multiplicityFilter)) + multiplicityFilter <- 2 - specs <- w@aggregated - - peaksFiltered <- filterPeaksMultiplicity(peaksMatched(specs), - "formula", recalcBest) - - - peaksFilteredReanalysis <- - filterPeaksMultiplicity(specs[!is.na(specs$matchedReanalysis) & specs$matchedReanalysis,,drop=FALSE], "reanalyzed.formula", FALSE) - - + specs <- w@aggregated + + peaksFiltered <- filterPeaksMultiplicity(peaksMatched(specs), "formula", recalcBest) + peaksFilteredReanalysis <- filterPeaksMultiplicity(specs[!is.na(specs$matchedReanalysis) & specs$matchedReanalysis,,drop=FALSE], "reanalyzed.formula", FALSE) specs <- addProperty(specs, "formulaMultiplicity", "numeric", 0) diff --git a/R/msmsRead.R b/R/msmsRead.R index 6ceceba..8f684a0 100644 --- a/R/msmsRead.R +++ b/R/msmsRead.R @@ -92,16 +92,16 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, # } ##This should work - if(readMethod == "minimal"){ - ##Edit options - opt <- getOption("RMassBank") - opt$recalibrator$MS1 <- "recalibrate.identity" - opt$recalibrator$MS2 <- "recalibrate.identity" - opt$add_annotation==FALSE - options(RMassBank=opt) - ##Edit analyzemethod - analyzeMethod <- "intensity" - } + if(readMethod == "minimal"){ + ##Edit options + opt <- getOption("RMassBank") + opt$recalibrator$MS1 <- "recalibrate.identity" + opt$recalibrator$MS2 <- "recalibrate.identity" + opt$add_annotation==FALSE + options(RMassBank=opt) + ##Edit analyzemethod + analyzeMethod <- "intensity" + } if(readMethod == "mzR"){ ##Progressbar @@ -136,6 +136,12 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, return(spec) } ), "SimpleList") names(w@spectra) <- basename(as.character(w@files)) + + if(RMassBank.env$verbose.output) + for(specIdx in seq_along(w@spectra)) + if(!w@spectra[[specIdx]]@found) + cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) + return(w) } @@ -169,6 +175,12 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, return(spec) }),FALSE),"SimpleList") + + if(RMassBank.env$verbose.output) + for(specIdx in seq_along(w@spectra)) + if(!w@spectra[[specIdx]]@found) + cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) + return(w) } @@ -191,6 +203,12 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, w@files <- w@files[sapply(uIDs, function(ID){ return(which(cpdids == ID)[1]) })] + + if(RMassBank.env$verbose.output) + for(specIdx in seq_along(w@spectra)) + if(!w@spectra[[specIdx]]@found) + cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) + return(w) } @@ -207,8 +225,15 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, w@files <- sapply(files,function(file){return(file[1])}) message("Peaks read") + + if(RMassBank.env$verbose.output) + for(specIdx in seq_along(w@spectra)) + if(!w@spectra[[specIdx]]@found) + cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) + return(w) } + ##MSP-readmethod if(readMethod == "msp"){ ##Find unique files and cpdIDs @@ -234,7 +259,12 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, return(spec) }),FALSE),"SimpleList") - #w@spectra <- lapply(FUN = w@s) + + if(RMassBank.env$verbose.output) + for(specIdx in seq_along(w@spectra)) + if(!w@spectra[[specIdx]]@found) + cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) + return(w) } @@ -257,6 +287,12 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, w@files <- w@files[sapply(uIDs, function(ID){ return(which(cpdids == ID)[1]) })] + + if(RMassBank.env$verbose.output) + for(specIdx in seq_along(w@spectra)) + if(!w@spectra[[specIdx]]@found) + cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) + return(w) } diff --git a/R/zzz.R b/R/zzz.R index b6b8755..3a53d8c 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -2,6 +2,7 @@ RMassBank.env <<- new.env() RMassBank.env$ReadAnnotation <- FALSE RMassBank.env$testnumber <- 1 + RMassBank.env$verbose.output <- TRUE mb <- list() attach(RMassBank.env) From 785c29a0bfbc04e2bc358fdb005719ddb3aa5a40 Mon Sep 17 00:00:00 2001 From: Treutler Date: Wed, 10 Jan 2018 10:29:13 +0100 Subject: [PATCH 05/71] minor changes --- R/createMassBank.R | 2 +- R/leMsMs.r | 19 +++++++++++-------- R/msmsRead.R | 2 +- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index 8c32c8d..1e09961 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -267,7 +267,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c message("mbWorkflow: Step 7. Generate subdirs and export") dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "moldata", sep='/'),recursive=TRUE) dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "recdata", sep='/'),recursive=TRUE) - for(cnt in 1:length(mb@compiled_ok)) + for(cnt in seq_along(mb@compiled_ok)) exportMassbank(mb@compiled_ok[[cnt]], mb@mbfiles[[cnt]], mb@molfile[[cnt]]) } # Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank diff --git a/R/leMsMs.r b/R/leMsMs.r index 3e38f8a..42e9f2b 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -273,12 +273,15 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec filterSettings=settings$filterSettings, progressbar=progressbar) if(!is.na(archivename)) - archiveResults(w, paste(archivename, "_RA.RData", sep=''), settings) + archiveResults(w, paste(archivename, "_RA.RData", sep=''), settings) if(RMassBank.env$verbose.output){ - noFormulaCount <- sum(is.na(w@aggregated$formula) & is.na(w@aggregated$reanalyzed.formula)) - if(noFormulaCount > 0) - cat(paste("### Warning ### ", noFormulaCount, " / ", nrow(w@aggregated), " peaks have no molecular formula\n", sep = "")) + isNoFormula <- is.na(w@aggregated$formula) & is.na(w@aggregated$reanalyzed.formula) + noFormulaCount <- sum(isNoFormula) + if(noFormulaCount > 0){ + cat(paste("### Warning ### ", noFormulaCount, " / ", nrow(unique(x = w@aggregated[, c("mzFound", "intensity", "formulaCount", "dppmBest")])), " peaks have no molecular formula:\n", sep = "")) + print(w@aggregated[isNoFormula, c("mzFound","intensity","cpdID")]) + } } } # Step 8: heuristic filtering based on peak multiplicity; @@ -290,12 +293,12 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec message("msmsWorkflow: Step 8. Peak multiplicity filtering skipped because multiplicityFilter parameter is not set.") } else { # apply heuristic filter - w@aggregated <- filterMultiplicity(w, archivename, mode, settings$multiplicityFilter) + w@aggregated <- filterMultiplicity(w = w, archivename = archivename, mode = mode, multiplicityFilter = settings$multiplicityFilter) if(RMassBank.env$verbose.output){ multiplicityNotOkCount <- sum(!w@aggregated$filterOK) if(multiplicityNotOkCount > 0) - cat(paste("### Warning ### ", multiplicityNotOkCount, " / ", nrow(w@aggregated), " peaks do not fulfill the multiplicity criterion\n", sep = "")) + cat(paste("### Warning ### ", multiplicityNotOkCount, " / ", nrow(w@aggregated[, c("mzFound", "intensity", "formulaCount", "dppmBest")]), " peaks do not fulfill the multiplicity criterion\n", sep = "")) } w@aggregated <- processProblematicPeaks(w, mode, archivename) @@ -543,8 +546,8 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi # with insufficient match accuracy or no match. analyzeTandemShot <- function(child) { - childIdx <- which(sapply(X = seq_along(w@spectra[[3]]@children), FUN = function(i){ - all(child@mz == w@spectra[[3]]@children[[i]]@mz) & all(child@rt == w@spectra[[3]]@children[[i]]@rt) & all(child@intensity == w@spectra[[3]]@children[[i]]@intensity) } + childIdx <- which(sapply(X = seq_along(msmsPeaks@children), FUN = function(i){ + all(child@mz == msmsPeaks@children[[i]]@mz) & all(child@rt == msmsPeaks@children[[i]]@rt) & all(child@intensity == msmsPeaks@children[[i]]@intensity) } )) shot <- getData(child) shot$row <- which(!is.na(shot$mz)) diff --git a/R/msmsRead.R b/R/msmsRead.R index 8f684a0..0b79035 100644 --- a/R/msmsRead.R +++ b/R/msmsRead.R @@ -140,7 +140,7 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, if(RMassBank.env$verbose.output) for(specIdx in seq_along(w@spectra)) if(!w@spectra[[specIdx]]@found) - cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) + cat(paste("### Warning ### No precursor ion detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) return(w) } From dab3e18f2b88049b5c893276a09cd5c126adaf95 Mon Sep 17 00:00:00 2001 From: Treutler Date: Wed, 10 Jan 2018 14:01:55 +0100 Subject: [PATCH 06/71] minor change --- R/createMassBank.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index 1e09961..a00bd0e 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -352,7 +352,7 @@ createMolfile <- function(id_or_smiles, fileName = FALSE) if(is.character(fileName)) res <- readLines(fileName) } - return(c(" ","$$$$")) + #return(c(" ","$$$$")) return(res) } From 7f36ba82d5315ef541167e8753913b117d2b1933 Mon Sep 17 00:00:00 2001 From: Treutler Date: Wed, 10 Jan 2018 14:38:11 +0100 Subject: [PATCH 07/71] fixed line endings --- R/createMassBank.R | 3 ++- R/leMsMs.r | 1 + R/leMsmsRaw.R | 1 + R/webAccess.R | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index a00bd0e..9675e76 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -615,7 +615,8 @@ gatherData <- function(id) if(level == c("5")){ mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" } - } + } + mbdata[["COMMENT"]][["ID"]] = id # here compound info starts mbdata[['CH$NAME']] <- names diff --git a/R/leMsMs.r b/R/leMsMs.r index 42e9f2b..0fc02da 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -284,6 +284,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec } } } + # Step 8: heuristic filtering based on peak multiplicity; # creation of failpeak list if(8 %in% steps) diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index c347d5b..096be69 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -1096,3 +1096,4 @@ addMB <- function(w, cpdID, fileName, mode){ w <- addPeaksManually(w, cpdID, peaklist[[1]], mode) return(w) } + diff --git a/R/webAccess.R b/R/webAccess.R index e55ebc0..91d67b7 100755 --- a/R/webAccess.R +++ b/R/webAccess.R @@ -544,3 +544,4 @@ getPcSDF <- function(query, from = "smiles"){ data <- c(strsplit(substring(data,1,molEnd),"\n")[[1]],"$$$$") return(data) } + From f9ef3ef92f59adece8a3be7c517bede36b9fcb9f Mon Sep 17 00:00:00 2001 From: Treutler Date: Wed, 10 Jan 2018 14:49:23 +0100 Subject: [PATCH 08/71] fixed line endings #2 --- R/createMassBank.R | 4088 +++++++++++++++++++------------------- R/leMsMs.r | 4732 ++++++++++++++++++++++---------------------- R/leMsmsRaw.R | 2198 ++++++++++---------- R/webAccess.R | 1094 +++++----- 4 files changed, 6056 insertions(+), 6056 deletions(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index 9675e76..872bcb7 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -1,2044 +1,2044 @@ -# Script for writing MassBank files - -#testtest change -#' Load MassBank compound information lists -#' -#' Loads MassBank compound information lists (i.e. the lists which were created -#' in the first two steps of the MassBank \code{\link{mbWorkflow}} and -#' subsequently edited by hand.). -#' -#' \code{resetInfolists} clears the information lists, i.e. it creates a new -#' empty list in \code{mbdata_archive}. \code{loadInfolist} loads a single CSV -#' file, whereas \code{loadInfolists} loads a whole directory. -#' -#' @aliases loadInfolists loadInfolist resetInfolists -#' @usage loadInfolists(mb, path) -#' -#' loadInfolist(mb, fileName) -#' -#' resetInfolists(mb) -#' @param path Directory in which the namelists reside. All CSV files in this -#' directory will be loaded. -#' @param fileName A single namelist to be loaded. -#' @param mb The \code{mbWorkspace} to load/reset the lists in. -#' @return The new workspace with loaded/reset lists. -#' @author Michael Stravs -#' @examples -#' -#' # -#' \dontrun{mb <- resetInfolists(mb) -#' mb <- loadInfolist(mb, "my_csv_infolist.csv")} -#' -#' @export -loadInfolists <- function(mb, path) -{ - archivefiles <- list.files(path, ".csv", full.names=TRUE) - for(afile in archivefiles) - mb <- loadInfolist(mb, afile) - return(mb) -} - -# Load an "infolist". This loads a CSV file which should contain the entries -# edited and controlled by hand. All compound infos from fileName are added into the -# global mbdata_archive. Entries with a cpdID which was already present, are substituted -# by new entries from the fileName file. -#' @export -loadInfolist <- function(mb, fileName) -{ - # Prime a new infolist if it doesn't exist - if(ncol(mb@mbdata_archive) == 0) - mb <- resetInfolists(mb) - mbdata_new <- read.csv(fileName, sep=",", stringsAsFactors=FALSE) - # Legacy check for loading the Uchem format files. - # Even if dbname_* are not used downstream of here, it's still good to keep them - # for debugging reasons. - n <- colnames(mbdata_new) - cols <- c("id","dbcas","dataused") - - # Check if comma-separated or semicolon-separated - d <- setdiff(cols, n) - if(length(d)>0){ - mbdata_new <- read.csv2(fileName, stringsAsFactors=FALSE) - n <- colnames(mbdata_new) - d2 <- setdiff(cols, n) - if(length(d2) > 0){ - stop("Some columns are missing in the infolist.") - } - } - if("dbname_d" %in% colnames(mbdata_new)) - { - colnames(mbdata_new)[[which(colnames(mbdata_new)=="dbname_d")]] <- "dbname" - # dbname_e will be dropped because of the select= in the subset below. - } - if("COMMENT.EAWAG_UCHEM_ID" %in% colnames(mbdata_new)) - colnames(mbdata_new)[[which(colnames(mbdata_new)== "COMMENT.EAWAG_UCHEM_ID")]] <- - "COMMENT.ID" - - # Clear from padding spaces and NAs - mbdata_new <- as.data.frame(t(apply(mbdata_new, 1, function(r) - { - # Substitute empty spaces by real NA values - r[which(r == "")] <- NA - # Trim spaces (in all non-NA fields) - r[which(!is.na(r))] <- sub("^ *([^ ]+) *$", "\\1", r[which(!is.na(r))]) - return(r) - }))) - # use only the columns present in mbdata_archive, no other columns added in excel - mbdata_new <- mbdata_new[, colnames(mb@mbdata_archive)] - # substitute the old entires with the ones from our files - # then find the new (previously inexistent) entries, and rbind them to the table - new_entries <- setdiff(mbdata_new$id, mb@mbdata_archive$id) - old_entries <- intersect(mbdata_new$id, mb@mbdata_archive$id) - for(entry in old_entries) - mb@mbdata_archive[mb@mbdata_archive$id == entry,] <- mbdata_new[mbdata_new$id == entry,] - mb@mbdata_archive <- rbind(mb@mbdata_archive, - mbdata_new[mbdata_new$id==new_entries,]) - return(mb) - -} - - -# Resets the mbdata_archive to an empty version. -#' @export -resetInfolists <- function(mb) -{ - mb@mbdata_archive <- - structure(list(X = integer(0), id = integer(0), dbcas = character(0), - dbname = character(0), dataused = character(0), COMMENT.CONFIDENCE = character(0), - COMMENT.ID = integer(0), CH.NAME1 = character(0), - CH.NAME2 = character(0), CH.NAME3 = character(0), CH.COMPOUND_CLASS = character(0), - CH.FORMULA = character(0), CH.EXACT_MASS = numeric(0), CH.SMILES = character(0), - CH.IUPAC = character(0), CH.LINK.CAS = character(0), CH.LINK.CHEBI = integer(0), - CH.LINK.HMDB = character(0), CH.LINK.KEGG = character(0), CH.LINK.LIPIDMAPS = character(0), - CH.LINK.PUBCHEM = character(0), CH.LINK.INCHIKEY = character(0), - CH.LINK.CHEMSPIDER = integer(0)), .Names = c("X", "id", "dbcas", - "dbname", "dataused", "COMMENT.CONFIDENCE", "COMMENT.ID", - "CH.NAME1", "CH.NAME2", "CH.NAME3", "CH.COMPOUND_CLASS", "CH.FORMULA", - "CH.EXACT_MASS", "CH.SMILES", "CH.IUPAC", "CH.LINK.CAS", "CH.LINK.CHEBI", - "CH.LINK.HMDB", "CH.LINK.KEGG", "CH.LINK.LIPIDMAPS", "CH.LINK.PUBCHEM", - "CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER"), row.names = integer(0), class = "data.frame") - return(mb) - -} - -# The workflow function, i.e. (almost) the only thing you actually need to call. -# See below for explanation of steps. -#' MassBank record creation workflow -#' -#' Uses data generated by \code{\link{msmsWorkflow}} to create MassBank records. -#' -#' See the vignette \code{vignette("RMassBank")} for detailed informations about the usage. -#' -#' Steps: -#' -#' Step 1: Find which compounds don't have annotation information yet. For these -#' compounds, pull information from several databases (using gatherData). -#' -#' Step 2: If new compounds were found, then export the infolist.csv and stop the workflow. -#' Otherwise, continue. -#' -#' Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. -#' -#' Step 4: Compile the spectra. Using the skeletons from the archive data, create -#' MassBank records per compound and fill them with peak data for each spectrum. -#' Also, assign accession numbers based on scan mode and relative scan no. -#' -#' Step 5: Convert the internal tree-like representation of the MassBank data into -#' flat-text string arrays (basically, into text-file style, but still in memory) -#' -#' Step 6: For all OK records, generate a corresponding molfile with the structure -#' of the compound, based on the SMILES entry from the MassBank record. (This molfile -#' is still in memory only, not yet a physical file) -#' -#' Step 7: If necessary, generate the appropriate subdirectories, and actually write -#' the files to disk. -#' -#' Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank -#' to attribute substances to their corresponding structure molfiles. -#' -#' @param steps Which steps in the workflow to perform. -#' @param infolist_path A path where to store newly downloaded compound informations, -#' which should then be manually inspected. -#' @param mb The \code{mbWorkspace} to work in. -#' @param gatherData A variable denoting whether to retrieve information using several online databases \code{gatherData= "online"} -#' or to use the local babel installation \code{gatherData= "babel"}. Note that babel is used either way, if a directory is given -#' in the settings. This setting will be ignored if retrieval is set to "standard" -#' @return The processed \code{mbWorkspace}. -#' @seealso \code{\link{mbWorkspace-class}} -#' @author Michael A. Stravs, Eawag -#' @examples \dontrun{ -#' mb <- newMbWorkspace(w) # w being a msmsWorkspace -#' mb <- loadInfolists(mb, "D:/myInfolistPath") -#' mb <- mbWorkflow(mb, steps=c(1:3), "newinfos.csv") -#' -#' } -#' @export -mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.csv", gatherData = "online") -{ - # Step 1: Find which compounds don't have annotation information yet. For these - # compounds, pull information from CTS (using gatherData). - if(1 %in% steps) - { - mbdata_ids <- lapply(selectSpectra(mb@spectra, "found", "object"), function(spec) spec@id) - message("mbWorkflow: Step 1. Gather info from several databases") - # Which IDs are not in mbdata_archive yet? - new_ids <- setdiff(as.numeric(unlist(mbdata_ids)), mb@mbdata_archive$id) - mb@mbdata <- lapply(new_ids, function(id) - { - if(findLevel(id,TRUE) == "standard"){ - if(gatherData == "online"){ - - d <- gatherData(id) - } - if(gatherData == "babel"){ - # message("mbWorkflow: Step 1. Gather info using babel") - d <- gatherDataBabel(id) - } - } else{ - # message("mbWorkflow: Step 1. Gather no info - Unknown structure") - d <- gatherDataUnknown(id, mb@spectra[[1]]@mode, retrieval=findLevel(id,TRUE)) - } - message(paste(id, ": ", d$dataused, sep='')) - return(d) - }) - } - # Step 2: If new compounds were found, then export the infolist.csv and stop the workflow. - # Otherwise, continue! - if(2 %in% steps) - { - message("mbWorkflow: Step 2. Export infolist (if required)") - if(length(mb@mbdata)>0) - { - mbdata_mat <- flatten(mb@mbdata) - write.csv(as.data.frame(mbdata_mat),infolist_path, na="") - message(paste("The file", infolist_path, "was generated with new compound information. Please check and edit the table, and add it to your infolist folder.")) - return(mb) - } - else - message("No new data added.") - } - # Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. - if(3 %in% steps) - { - message("mbWorkflow: Step 3. Data reformatting") - mb@mbdata_relisted <- apply(mb@mbdata_archive, 1, readMbdata) - } - # Step 4: Compile the spectra! Using the skeletons from the archive data, create - # MassBank records per compound and fill them with peak data for each spectrum. - # Also, assign accession numbers based on scan mode and relative scan no. - if(4 %in% steps) - { - message("mbWorkflow: Step 4. Spectra compilation") - mb@compiled <- lapply( - selectSpectra(mb@spectra, "found", "object"), - function(r) { - message(paste("Compiling: ", r@name, sep="")) - mbdata <- mb@mbdata_relisted[[which(mb@mbdata_archive$id == as.numeric(r@id))]] - if(nrow(mb@additionalPeaks) > 0) - res <-compileRecord(r, mbdata, mb@aggregated, mb@additionalPeaks) - else - res <-compileRecord(spec = r, mbdata = mbdata, aggregated = mb@aggregated, additionalPeaks = NULL, retrieval=findLevel(r@id,TRUE)) - return(res) - }) - # check which compounds have useful spectra - mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) - mb@problems <- which(is.na(mb@compiled)) - mb@compiled_ok <- mb@compiled[mb@ok] - } - # Step 5: Convert the internal tree-like representation of the MassBank data into - # flat-text string arrays (basically, into text-file style, but still in memory) - if(5 %in% steps) - { - message("mbWorkflow: Step 5. Flattening records") - mb@mbfiles <- lapply(mb@compiled_ok, function(c) lapply(c, toMassbank)) - } - # Step 6: For all OK records, generate a corresponding molfile with the structure - # of the compound, based on the SMILES entry from the MassBank record. (This molfile - # is still in memory only, not yet a physical file) - if(6 %in% steps) - { - message("mbWorkflow: Step 6. Generate molfiles") - mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(as.numeric(c[[1]][['COMMENT']][[getOption("RMassBank")$annotations$internal_id_fieldname]]))) - } - # Step 7: If necessary, generate the appropriate subdirectories, and actually write - # the files to disk. - if(7 %in% steps) - { - message("mbWorkflow: Step 7. Generate subdirs and export") - dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "moldata", sep='/'),recursive=TRUE) - dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "recdata", sep='/'),recursive=TRUE) - for(cnt in seq_along(mb@compiled_ok)) - exportMassbank(mb@compiled_ok[[cnt]], mb@mbfiles[[cnt]], mb@molfile[[cnt]]) - } - # Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank - # to attribute substances to their corresponding structure molfiles. - if(8 %in% steps) - { - message("mbWorkflow: Step 8. Create list.tsv") - makeMollist(mb@compiled_ok) - } - return(mb) -} - - -# Calls openbabel and converts the SMILES code string (or retrieves the SMILES code from -# the ID, and then calls openbabel) to create a molfile in text format. -# If fileName is given, the file is directly stored. Otherwise, it is returned as a -# character array. -#' Create MOL file for a chemical structure -#' -#' Creates a MOL file (in memory or on disk) for a compound specified by the -#' compound ID or by a SMILES code. -#' -#' The function invokes OpenBabel (and therefore needs a correctly set -#' OpenBabel path in the RMassBank settings), using the SMILES code retrieved -#' with \code{findSmiles} or using the SMILES code directly. The current -#' implementation of the workflow uses the latter version, reading the SMILES -#' code directly from the MassBank record itself. -#' -#' @usage createMolfile(id_or_smiles, fileName = FALSE) -#' @param id_or_smiles The compound ID or a SMILES code. -#' @param fileName If the filename is set, the file is written directly to disk -#' using the specified filename. Otherwise, it is returned as a text array. -#' @return A character array containing the MOL/SDF format file, ready to be -#' written to disk. -#' @author Michael Stravs -#' @seealso \code{\link{findSmiles}} -#' @references OpenBabel: \url{http://openbabel.org} -#' @examples -#' -#' # Benzene: -#' \dontrun{ -#' createMolfile("C1=CC=CC=C1") -#' } -#' -#' @export -createMolfile <- function(id_or_smiles, fileName = FALSE) -{ - .checkMbSettings() - babeldir <- getOption("RMassBank")$babeldir - - if(!is.numeric(id_or_smiles)){ - smiles <- id_or_smiles - } else{ - if(findLevel(id_or_smiles,TRUE) != "standard"){ - return(c(" ","$$$$")) - } - smiles <- findSmiles(id_or_smiles) - } - # if no babeldir was set, get the result from cactus. - if(is.na(babeldir)) - { - res <- getCactus(smiles, "sdf") - - if(any(is.na(res))){ - res <- getPcSDF(smiles) - } - if(any(is.na(res))){ - stop("Pubchem and Cactus both seem to be down.") - } - if(is.character(fileName)) - writeLines(res, fileName) - } - # otherwise use the better-tested OpenBabel toolkit. - else - { - if(!is.character(fileName)) - cmd <- paste(babeldir, "babel -ismi -osdf -d -b --gen2D", sep='') - else - cmd <- paste(babeldir, "babel -ismi -osdf ", fileName , " -d -b --gen2D", sep='') - res <- system(cmd, intern=TRUE, input=smiles, ignore.stderr=TRUE) - # If we wrote to a file, read it back as return value. - if(is.character(fileName)) - res <- readLines(fileName) - } - #return(c(" ","$$$$")) - return(res) -} - - - -# Retrieve annotation data for a compound, from the internet service Pubchem -#' Retrieve supplemental annotation data from Pubchem -#' -#' Retrieves annotation data for a compound from the internet service Pubchem -#' based on the inchikey generated by babel or Cactus -#' -#' The data retrieved is the Pubchem CID, a synonym from the Pubchem database, -#' the IUPAC name (using the preferred if available) and a Chebi link -#' -#' @usage gatherPubChem(key) -#' @param key An Inchi-Key -#' @return Returns a list with 4 slots: -#' \code{PcID} The Pubchem CID -#' \code{Synonym} An arbitrary synonym for the compound name -#' \code{IUPAC} A IUPAC-name (preferred if available) -#' \code{Chebi} The identification number of the chebi database -#' @author Erik Mueller -#' @seealso \code{\link{mbWorkflow}} -#' @references Pubchem REST: -#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} -#' Chebi: -#' \url{http://www.ebi.ac.uk/chebi} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherPubChem("QEIXBXXKTUNWDK-UHFFFAOYSA-N")} -#' -#' @export -gatherPubChem <- function(key){ - - PubChemData <- list() - - ##Trycatches are there because pubchem has connection issues 1 in 50 times. - ##Write NA into the respective fields if something goes wrong with the conenction or the data. - - ##Retrieve Pubchem CID - tryCatch( - PubChemData$PcID <- getPcId(key), - error=function(e){ - PubChemData$PcID <<- NA - }) - - ##Retrieve a synonym to the name - tryCatch( - PubChemData$Synonym <- getPcSynonym(key), - error=function(e){ - PubChemData$Synonym <<- NA - }) - - ##Retrieve the IUPAC-name - tryCatch( - PubChemData$IUPAC <- getPcIUPAC(key), - error=function(e){ - PubChemData$IUPAC <<- NA - }) - - ##Retrieve the Chebi-ID - tryCatch( - PubChemData$Chebi <- getPcCHEBI(key), - error=function(e){ - PubChemData$Chebi <<- NA - }) - - return(PubChemData) -} - -# Retrieve annotation data for a compound, from the internet services Cactvs, Pubchem, Chemspider and CTS. -#' Retrieve annotation data -#' -#' Retrieves annotation data for a compound from the internet services CTS, Pubchem, Chemspider and -#' Cactvs, based on the SMILES code and name of the compounds stored in the -#' compound list. -#' -#' Composes the "upper part" of a MassBank record filled with chemical data -#' about the compound: name, exact mass, structure, CAS no., links to PubChem, -#' KEGG, ChemSpider. The instrument type is also written into this block (even -#' if not strictly part of the chemical information). Additionally, index -#' fields are added at the start of the record, which will be removed later: -#' \code{id, dbcas, dbname} from the compound list, \code{dataused} to indicate -#' the used identifier for CTS search (\code{smiles} or \code{dbname}). -#' -#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are -#' inserted empty and will be filled later on. -#' -#' @usage gatherData(id) -#' @aliases gatherData -#' @param id The compound ID. -#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., -#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... -#' @author Michael Stravs -#' @seealso \code{\link{mbWorkflow}} -#' @references Chemical Translation Service: -#' \url{http://uranus.fiehnlab.ucdavis.edu:8080/cts/homePage} -#' cactus Chemical Identifier Resolver: -#' \url{http://cactus.nci.nih.gov/chemical/structure} -#' MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' Pubchem REST: -#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} -#' Chemspider InChI conversion: -#' \url{https://www.chemspider.com/InChI.asmx} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherData(131)} -#' -#' @export -gatherData <- function(id) -{ - ##Preamble: Is a babeldir supplied? - ##If yes, use it - - .checkMbSettings() - usebabel=TRUE - babeldir <- getOption("RMassBank")$babeldir - - if(is.na(babeldir)){ - usebabel=FALSE - } - - - ##Get all useful information from the local "database" (from the CSV sheet) - - smiles <- findSmiles(id) - mass <- findMass(smiles) - dbcas <- findCAS(id) - dbname <- findName(id) - if(is.na(dbname)) dbname <- "" - if(is.na(dbcas)) dbcas <- "" - iupacName <- dbname - synonym <- dbname - formula <- findFormula(id) - - ##Convert SMILES to InChI key via Cactvs or babel. CTS doesn't "interpret" the SMILES per se, - ##it just matches identical known SMILES, so we need to convert to a "searchable" and - ##standardized format beforehand. Other databases are able to interpret the smiles. - - if(usebabel){ - cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey') - inchikey_split <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE) - } else{ - inchikey <- getCactus(smiles, 'stdinchikey') - if(!is.na(inchikey)){ - ##Split the "InChiKey=" part off the key - inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]] - } else{ - inchikey_split <- getPcInchiKey(smiles) - } - } - - ##Use Pubchem to retrieve information - PcInfo <- gatherPubChem(inchikey_split) - - if(!is.null(PcInfo$Synonym) & !is.na(PcInfo$Synonym)){ - synonym <- PcInfo$Synonym - } - - if(!is.null(PcInfo$IUPAC) & !is.na(PcInfo$IUPAC)){ - iupacName <- PcInfo$IUPAC - } - - ##Get Chemspider-ID - csid <- getCSID(inchikey_split) - - if(is.na(csid)){ - ##Get ChemSpider ID from Cactus if the Chemspider page is down - csid <- getCactus(inchikey_split, 'chemspider_id') - } - - ##Use CTS to retrieve information - CTSinfo <- getCtsRecord(inchikey_split) - - if((CTSinfo[1] == "Sorry, we couldn't find any matching results") || is.null(CTSinfo[1])) - { - CTSinfo <- NA - } - - ##List the names - if(iupacName == ""){ - warning(paste0("Compound ID ",id,": no IUPAC name could be identified.")) - } - - if(toupper(dbname) == toupper(synonym)){ - synonym <- dbname - } - - if(toupper(dbname) == toupper(iupacName)){ - iupacName <- dbname - } - - if(toupper(synonym) == toupper(iupacName)){ - synonym <- iupacName - } - - names <- as.list(unique(c(dbname, synonym, iupacName))) - - ##If no name is found, it must be supplied in one way or another - if(all(sapply(names, function(x) x == ""))){ - stop("RMassBank wasn't able to extract a usable name for this compound from any database. Please supply a name manually.") - } - - # Start to fill the MassBank record. - # The top 4 entries will not go into the final record; they are used to identify - # the record and also to facilitate manual editing of the exported record table. - mbdata <- list() - mbdata[['id']] <- id - mbdata[['dbcas']] <- dbcas - mbdata[['dbname']] <- dbname - mbdata[['dataused']] <- "smiles" - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors - mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license - mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - # Confidence annotation and internal ID annotation. - # The ID of the compound will be written like: - # COMMENT: EAWAG_UCHEM_ID 1234 - # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" - mbdata[["COMMENT"]] <- list() - if(findLevel(id) == "0"){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment - } else{ - level <- findLevel(id) - if(level %in% c("1","1a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" - } - if(level == c("2")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" - } - if(level == c("2a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" - } - if(level == c("2b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" - } - if(level == c("3")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" - } - if(level == c("3a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" - } - if(level == c("3b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" - } - if(level == c("3c")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" - } - if(level == c("3d")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" - } - if(level == c("4")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" - } - if(level == c("5")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" - } - } - - mbdata[["COMMENT"]][["ID"]] = id - # here compound info starts - mbdata[['CH$NAME']] <- names - # Currently we use a fixed value for Compound Class, since there is no useful - # convention of what should go there and what shouldn't, and the field is not used - # in search queries. - mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class - mbdata[['CH$FORMULA']] <- formula - mbdata[['CH$EXACT_MASS']] <- mass - mbdata[['CH$SMILES']] <- smiles - - if(usebabel){ - cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi') - mbdata[['CH$IUPAC']] <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE) - } else{ - mbdata[['CH$IUPAC']] <- getCactus(smiles, "stdinchi") - } - - - - # Add all CH$LINK fields present in the compound datasets - link <- list() - # CAS - if(!is.na(CTSinfo[1])){ - if("CAS" %in% CTS.externalIdTypes(CTSinfo)) - { - # Prefer database CAS if it is also listed in the CTS results. - # otherwise take the shortest one. - cas <- CTS.externalIdSubset(CTSinfo,"CAS") - if(dbcas %in% cas) - link[["CAS"]] <- dbcas - else - link[["CAS"]] <- cas[[which.min(nchar(cas))]] - } else{ - if(dbcas != ""){ - link[["CAS"]] <- dbcas - } - } - } else{ - if(dbcas != ""){ - link[["CAS"]] <- dbcas - } - } - - - # CHEBI - if(is.na(PcInfo$Chebi[1])){ - if(!is.na(CTSinfo[1])){ - if("ChEBI" %in% CTS.externalIdTypes(CTSinfo)) - { - # Cut off front "CHEBI:" if present - chebi <- CTS.externalIdSubset(CTSinfo,"ChEBI") - chebi <- chebi[[which.min(nchar(chebi))]] - chebi <- strsplit(chebi,":")[[1]] - link[["CHEBI"]] <- chebi[[length(chebi)]] - } - } - } else{ - chebi <- PcInfo$Chebi - chebi <- chebi[[which.min(nchar(chebi))]] - chebi <- strsplit(chebi,":")[[1]] - link[["CHEBI"]] <- chebi[[length(chebi)]] - } - # HMDB - if(!is.na(CTSinfo[1])){ - if("Human Metabolome Database" %in% CTS.externalIdTypes(CTSinfo)) - link[["HMDB"]] <- CTS.externalIdSubset(CTSinfo,"HMDB")[[1]] - # KEGG - if("KEGG" %in% CTS.externalIdTypes(CTSinfo)) - link[["KEGG"]] <- CTS.externalIdSubset(CTSinfo,"KEGG")[[1]] - # LipidMAPS - if("LipidMAPS" %in% CTS.externalIdTypes(CTSinfo)) - link[["LIPIDMAPS"]] <- CTS.externalIdSubset(CTSinfo,"LipidMAPS")[[1]] - } - # PubChem CID - if(is.na(PcInfo$PcID[1])){ - if(!is.na(CTSinfo[1])){ - if("PubChem CID" %in% CTS.externalIdTypes(CTSinfo)) - { - pc <- CTS.externalIdSubset(CTSinfo,"PubChem CID") - link[["PUBCHEM"]] <- paste0(min(pc)) - } - } - } else{ - link[["PUBCHEM"]] <- PcInfo$PcID[1] - } - - - if(!is.null(link[["PUBCHEM"]])){ - if(substr(link[["PUBCHEM"]],1,4) != "CID:"){ - link[["PUBCHEM"]] <- paste0("CID:", link[["PUBCHEM"]]) - } - } - - link[["INCHIKEY"]] <- inchikey_split - if(length(csid)>0) if(any(!is.na(csid))) link[["CHEMSPIDER"]] <- min(as.numeric(as.character(csid))) - mbdata[['CH$LINK']] <- link - - mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument - mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type - - return(mbdata) -} - -# Retrieve annotation data for a compound, using only babel -#' Retrieve annotation data -#' -#' Retrieves annotation data for a compound by using babel, -#' based on the SMILES code and name of the compounds stored in the -#' compound list. -#' -#' Composes the "upper part" of a MassBank record filled with chemical data -#' about the compound: name, exact mass, structure, CAS no.. -#' The instrument type is also written into this block (even -#' if not strictly part of the chemical information). Additionally, index -#' fields are added at the start of the record, which will be removed later: -#' \code{id, dbcas, dbname} from the compound list. -#' -#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are -#' inserted empty and will be filled later on. -#' -#' This function is an alternative to gatherData, in case CTS is down or if information -#' on one or more of the compounds in the compound list are sparse -#' -#' @usage gatherDataBabel(id) -#' @param id The compound ID. -#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., -#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... -#' @author Michael Stravs, Erik Mueller -#' @seealso \code{\link{mbWorkflow}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherDataBabel(131)} -#' -#' @export -gatherDataBabel <- function(id){ - .checkMbSettings() - babeldir <- getOption("RMassBank")$babeldir - smiles <- findSmiles(id) - - - # if no babeldir was set, throw an error that says that either CTS or babel have to be used - if(is.na(babeldir)) - { - stop("No babeldir supplied; It is currently not possible to convert the information without either babel or CTS") - } else { - ###Babel conversion - cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey') - inchikey <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE) - cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi') - inchi <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE) - - ##Read from Compoundlist - smiles <- findSmiles(id) - mass <- findMass(smiles) - dbcas <- findCAS(id) - dbname <- findName(id) - if(is.na(dbname)) dbname <- "" - if(is.na(dbcas)) dbcas <- "" - formula <- findFormula(id) - - ##Create - mbdata <- list() - mbdata[['id']] <- id - mbdata[['dbcas']] <- dbcas - mbdata[['dbname']] <- dbname - mbdata[['dataused']] <- "smiles" - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors - mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license - mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - # Confidence annotation and internal ID annotation. - # The ID of the compound will be written like: - # COMMENT: EAWAG_UCHEM_ID 1234 - # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" - mbdata[["COMMENT"]] <- list() - if(findLevel(id) == "0"){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment - } else{ - level <- findLevel(id) - if(level %in% c("1","1a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" - } - if(level == c("2")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" - } - if(level == c("2a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" - } - if(level == c("2b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" - } - if(level == c("3")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" - } - if(level == c("3a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" - } - if(level == c("3b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" - } - if(level == c("3c")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" - } - if(level == c("3d")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" - } - if(level == c("4")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" - } - if(level == c("5")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" - } - } - mbdata[["COMMENT"]][["ID"]] <- id - - # here compound info starts - mbdata[['CH$NAME']] <- as.list(dbname) - - # Currently we use a fixed value for Compound Class, since there is no useful - # convention of what should go there and what shouldn't, and the field is not used - # in search queries. - mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class - mbdata[['CH$FORMULA']] <- formula - mbdata[['CH$EXACT_MASS']] <- mass - mbdata[['CH$SMILES']] <- smiles - mbdata[['CH$IUPAC']] <- inchi - - link <- list() - if(dbcas != "") - link[["CAS"]] <- dbcas - link[["INCHIKEY"]] <- inchikey - mbdata[['CH$LINK']] <- link - mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument - mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type - } - return(mbdata) -} - -# Retrieve annotation data for a compound, using only babel -#' Retrieve annotation data -#' -#' Retrieves annotation data for an unknown compound by using basic information present -#' -#' Composes the "upper part" of a MassBank record filled with chemical data -#' about the compound: name, exact mass, structure, CAS no.. -#' The instrument type is also written into this block (even -#' if not strictly part of the chemical information). Additionally, index -#' fields are added at the start of the record, which will be removed later: -#' \code{id, dbcas, dbname} from the compound list. -#' -#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are -#' inserted empty and will be filled later on. -#' -#' This function is used to generate the data in case a substance is unknown, -#' i.e. not enough information is present to derive anything about formulas or links -#' -#' @usage gatherDataUnknown(id, mode, retrieval) -#' @param id The compound ID. -#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). -#' @param retrieval A value that determines whether the files should be handled either as "standard", -#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" -#' if the only know thing is the m/z -#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., -#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... -#' @author Michael Stravs, Erik Mueller -#' @seealso \code{\link{mbWorkflow}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherDataUnknown(131,"pH")} -#' -#' @export -gatherDataUnknown <- function(id, mode, retrieval){ - .checkMbSettings() - - ##Read from Compoundlist - smiles <- "" - if(retrieval == "unknown"){ - mass <- findMass(id, "unknown", mode) - formula <- "" - } - if(retrieval == "tentative"){ - mass <- findMass(id, "tentative", mode) - formula <- findFormula(id, "tentative") - } - dbcas <- NA - dbname <- findName(id) - if(is.na(dbname)) dbname <- paste("Unknown ID:",id) - if(is.na(dbcas)) dbcas <- "" - - - - ##Create - mbdata <- list() - mbdata[['id']] <- id - mbdata[['dbcas']] <- dbcas - mbdata[['dbname']] <- dbname - mbdata[['dataused']] <- "none" - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors - mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license - mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - # Confidence annotation and internal ID annotation. - # The ID of the compound will be written like: - # COMMENT: EAWAG_UCHEM_ID 1234 - # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" - mbdata[["COMMENT"]] <- list() - if(findLevel(id) == "0"){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment - } else{ - level <- findLevel(id) - if(level %in% c("1","1a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" - } - if(level == c("2")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" - } - if(level == c("2a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" - } - if(level == c("2b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" - } - if(level == c("3")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" - } - if(level == c("3a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" - } - if(level == c("3b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" - } - if(level == c("3c")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" - } - if(level == c("3d")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" - } - if(level == c("4")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" - } - if(level == c("5")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" - } - } - mbdata[["COMMENT"]][["ID"]] <- id - - # here compound info starts - mbdata[['CH$NAME']] <- as.list(dbname) - - # Currently we use a fixed value for Compound Class, since there is no useful - # convention of what should go there and what shouldn't, and the field is not used - # in search queries. - mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class - mbdata[['CH$FORMULA']] <- formula - mbdata[['CH$EXACT_MASS']] <- mass - mbdata[['CH$SMILES']] <- "" - mbdata[['CH$IUPAC']] <- "" - - link <- list() - mbdata[['CH$LINK']] <- link - mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument - mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type - - return(mbdata) -} - -# Flatten the internal tree-like representation of MassBank data to a flat table. -# Note that this limits us, in that the fields should be constant over all records! -# Therefore, e.g. the fixed number of 3 names which may be filled. -# If anybody has a cooler solution, I'll be happy to hear from you :) -# -# Note: the records from gatherData have additional information which is discarded, like -# author, copyright etc. They will be re-filled automatically when reading the file. -#' Flatten, or re-read, MassBank header blocks -#' -#' \code{flatten} converts a list of MassBank compound information sets (as -#' retrieved by \code{\link{gatherData}}) to a flat table, to be exported into -#' an \link[=loadInfolist]{infolist}. \code{readMbdata} reads a single record -#' from an infolist flat table back into a MassBank (half-)entry. -#' -#' Neither the flattening system itself nor the implementation are particularly -#' fantastic, but since hand-checking of records is a necessary evil, there is -#' currently no alternative (short of coding a complete GUI for this and -#' working directly on the records.) -#' -#' @aliases flatten readMbdata -#' @usage flatten(mbdata) -#' -#' readMbdata(row) -#' @param mbdata A list of MassBank compound information sets as returned from -#' \code{\link{gatherData}}. -#' @param row One row of MassBank compound information retrieved from an -#' infolist. -#' @return \code{flatten} returns a matrix (not a data frame) to be written to -#' CSV. -#' -#' \code{readMbdata} returns a list of type \code{list(id= \var{compoundID}, -#' ..., 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. -#' @author Michael Stravs -#' @seealso \code{\link{gatherData}},\code{\link{loadInfolist}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples \dontrun{ -#' # Collect some data to flatten -#' ids <- c(40,50,60,70) -#' data <- lapply(ids, gatherData) -#' # Flatten the data trees to a table -#' flat.table <- flatten(data) -#' # reimport the table into a tree -#' data.reimported <- apply(flat.table, 1, readMbdata) -#' } -#' -#' @export -#' -flatten <- function(mbdata) -{ - .checkMbSettings() - - colList <- c( - "id", - "dbcas", - "dbname", - "dataused", - "COMMENT.CONFIDENCE", - # Note: The field name of the internal id field is replaced with the real name - # at "compilation" time. Therefore, functions DOWNSTREAM from compileRecord() - # must use the full name including the info from options("RMassBank"). - "COMMENT.ID", - "CH$NAME1", - "CH$NAME2", - "CH$NAME3", - "CH$COMPOUND_CLASS", - "CH$FORMULA", - "CH$EXACT_MASS", - "CH$SMILES", - "CH$IUPAC", - "CH$LINK.CAS", - "CH$LINK.CHEBI", - "CH$LINK.HMDB", - "CH$LINK.KEGG", - "CH$LINK.LIPIDMAPS", - "CH$LINK.PUBCHEM", - "CH$LINK.INCHIKEY", - "CH$LINK.CHEMSPIDER") - # make an empty data frame with the right length - rows <- length(mbdata) - cols <- length(colList) - mbframe <- matrix(data=NA, nrow=rows, ncol=cols) - colnames(mbframe) <- colList - #browser() - for(row in 1:rows) - { - # fill in all the data into the dataframe: all columns which - # a) exist in the target dataframe and b) exist in the (unlisted) MB record - # are written into the dataframe. - data <- unlist(mbdata[[row]]) - # bugfix for the case of only one name - if(!("CH$NAME1" %in% names(data))) - data[["CH$NAME1"]] <- data[["CH$NAME"]] - datacols <- intersect(colList, names(data)) - mbframe[row,datacols] <- data[datacols] - } - return(mbframe) - -} - -# Read data from a flat-table MassBank record row and feed it into a -# MassBank tree-like record. Also, prime the ACCESSION and RECORD_TITLE fields in the -# correct position in the record. -#' @export -readMbdata <- function(row) -{ - .checkMbSettings() - - # Listify the table row. Lists are just cooler to work with :) - row <- as.list(row) - - mbdata <- list() - # Accession and title are added empty for now, to have them in the right place. - # Constants are read from the options or generated. - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors - mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license - mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - mbdata[['PUBLICATION']] <- getOption("RMassBank")$annotations$publication - - # Read all determined fields from the file - # This is not very flexible, as you can see... - colList <- c( - "COMMENT.CONFIDENCE", - "COMMENT.ID", - "CH$NAME1", - "CH$NAME2", - "CH$NAME3", - "CH$COMPOUND_CLASS", - "CH$FORMULA", - "CH$EXACT_MASS", - "CH$SMILES", - "CH$IUPAC", - "CH$LINK.CAS", - "CH$LINK.CHEBI", - "CH$LINK.HMDB", - "CH$LINK.KEGG", - "CH$LINK.LIPIDMAPS", - "CH$LINK.PUBCHEM", - "CH$LINK.INCHIKEY", - "CH$LINK.CHEMSPIDER") - mbdata[["COMMENT"]] = list() - mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT.CONFIDENCE"]] - # Again, our ID field. - - mbdata[["COMMENT"]][["ID"]]<- - row[["COMMENT.ID"]] - names = c(row[["CH.NAME1"]], row[["CH.NAME2"]], row[["CH.NAME3"]]) - names = names[which(!is.na(names))] - - names <- gsub("'", "`", names) - mbdata[["CH$NAME"]] = names - mbdata[["CH$COMPOUND_CLASS"]] = row[["CH.COMPOUND_CLASS"]] - mbdata[["CH$FORMULA"]] = row[["CH.FORMULA"]] - mbdata[["CH$EXACT_MASS"]] = row[["CH.EXACT_MASS"]] - mbdata[["CH$SMILES"]] = row[["CH.SMILES"]] - mbdata[["CH$IUPAC"]] = row[["CH.IUPAC"]] - # Add all links and then eliminate the NA values from the tree. - link = list() - link[["CAS"]] = row[["CH.LINK.CAS"]] - link[["CHEBI"]] = row[["CH.LINK.CHEBI"]] - link[["HMDB"]] = row[["CH.LINK.HMDB"]] - link[["KEGG"]] = row[["CH.LINK.KEGG"]] - link[["LIPIDMAPS"]] = row[["CH.LINK.LIPIDMAPS"]] - link[["PUBCHEM"]] = row[["CH.LINK.PUBCHEM"]] - link[["INCHIKEY"]] = row[["CH.LINK.INCHIKEY"]] - link[["CHEMSPIDER"]] = row[["CH.LINK.CHEMSPIDER"]] - link[which(is.na(link))] <- NULL - mbdata[["CH$LINK"]] <- link - # again, these constants are read from the options: - mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument - mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type - - return(mbdata) - -} - -# For each compound, this function creates the "lower part" of the MassBank record, i.e. -# everything that comes after AC$INSTRUMENT_TYPE. -#' Compose data block of MassBank record -#' -#' \code{gatherCompound} composes the data blocks (the "lower half") of all -#' MassBank records for a compound, using the annotation data in the RMassBank -#' options, spectrum info data from the \code{analyzedSpec}-type record and the -#' peaks from the reanalyzed, multiplicity-filtered peak table. It calls -#' \code{gatherSpectrum} for each child spectrum. -#' -#' The returned data blocks are in format \code{list( "AC\$MASS_SPECTROMETRY" = -#' list('FRAGMENTATION_MODE' = 'CID', ...), ...)} etc. -#' -#' @aliases gatherCompound gatherSpectrum -#' @usage gatherCompound(spec, aggregated, additionalPeaks = NULL, retrieval="standard") -#' -#' gatherSpectrum(spec, msmsdata, ac_ms, ac_lc, aggregated, -#' additionalPeaks = NULL, retrieval="standard") -#' @param spec A \code{RmbSpectraSet} object, representing a compound with multiple spectra. -#' @param aggregated An aggregate peak table where the peaks are extracted from. -#' @param msmsdata A \code{RmbSpectrum2} object from the \code{spec} spectra set, representing a single spectrum to give a record. -#' @param ac_ms,ac_lc Information for the AC\$MASS_SPECTROMETRY and -#' AC\$CHROMATOGRAPHY fields in the MassBank record, created by -#' \code{gatherCompound} and then fed into \code{gatherSpectrum}. -#' @param additionalPeaks If present, a table with additional peaks to add into the spectra. -#' As loaded with \code{\link{addPeaks}}. -#' @param retrieval A value that determines whether the files should be handled either as "standard", -#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" -#' if the only know thing is the m/z -#' @return \code{gatherCompound} returns a list of tree-like MassBank data -#' blocks. \code{gatherSpectrum} returns one single MassBank data block or -#' \code{NA} if no useful peak is in the spectrum. -#' @note Note that the global table \code{additionalPeaks} is also used as an -#' additional source of peaks. -#' @author Michael Stravs -#' @seealso \code{\link{mbWorkflow}}, \code{\link{compileRecord}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples \dontrun{ -#' myspectrum <- w@@spectra[[1]] -#' massbankdata <- gatherCompound(myspectrum, w@@aggregated) -#' # Note: ac_lc and ac_ms are data blocks usually generated in gatherCompound and -#' # passed on from there. The call below gives a relatively useless result :) -#' ac_lc_dummy <- list() -#' ac_ms_dummy <- list() -#' justOneSpectrum <- gatherSpectrum(myspectrum, myspectrum@@child[[2]], -#' ac_ms_dummy, ac_lc_dummy, w@@aggregated) -#' } -#' -#' -#' @export -gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval="standard") -{ - # compound ID - id <- spec@id - # processing mode - imode <- spec@mode - - # define positive or negative, based on processing mode. - ion_modes <- list( - "pH" = "POSITIVE", - "pNa" = "POSITIVE", - "mH" = "NEGATIVE", - "mFA" = "NEGATIVE", - "pM" = "POSITIVE", - "mM" = "NEGATIVE", - "pNH4" = "POSITIVE") - mode <- ion_modes[[imode]] - - # for format 2.01 - ac_ms <- list(); - ac_ms[['MS_TYPE']] <- getOption("RMassBank")$annotations$ms_type - ac_ms[['IONIZATION']] <- getOption("RMassBank")$annotations$ionization - ac_ms[['ION_MODE']] <- mode - - # This list could be made customizable. - ac_lc <- list(); - rt <- spec@parent@rt / 60 - ac_lc[['COLUMN_NAME']] <- getOption("RMassBank")$annotations$lc_column - ac_lc[['FLOW_GRADIENT']] <- getOption("RMassBank")$annotations$lc_gradient - ac_lc[['FLOW_RATE']] <- getOption("RMassBank")$annotations$lc_flow - ac_lc[['RETENTION_TIME']] <- sprintf("%.3f min", rt) - ac_lc[['SOLVENT A']] <- getOption("RMassBank")$annotations$lc_solvent_a - ac_lc[['SOLVENT B']] <- getOption("RMassBank")$annotations$lc_solvent_b - - # Go through all child spectra, and fill our skeleton with scan data! - # Pass them the AC_LC and AC_MS data, which are added at the right place - # directly in there. - allSpectra <- lapply(spec@children, function(m) - gatherSpectrum(spec = spec, msmsdata = m, ac_ms = ac_ms, ac_lc = ac_lc, aggregated = aggregated, additionalPeaks = additionalPeaks, retrieval=retrieval)) - allSpectra <- allSpectra[which(!is.na(allSpectra))] - return(allSpectra) -} - - - -# Process one single MSMS child scan. -# spec: an object of "analyzedSpectrum" type (i.e. contains -# 14x (or other number) msmsdata, info, mzrange, -# compound ID, parent MS1, cpd id...) -# msmsdata: the msmsdata sub-object from the spec which is the child scan we want to process. -# Contains childFilt, childBad, scan #, etc. Note that the peaks are actually not -# taken from here! They were taken from msmsdata initially, but after introduction -# of the refiltration and multiplicity filtering, this was changed. Now only the -# scan information is actually taken from msmsdata. -# ac_ms, ac_lc: pre-filled info for the MassBank dataset (see above) -# refiltered: the refilteredRcSpecs dataset which contains our good peaks :) -# Contains peaksOK, peaksReanOK, peaksFiltered, peaksFilteredReanalysis, -# peaksProblematic. Currently we use peaksOK and peaksReanOK to create the files. -# (Also, the global additionalPeaks table is used.) -#' @export -gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalPeaks = NULL, retrieval = "standard") -{ - # If the spectrum is not filled, return right now. All "NA" spectra will - # not be treated further. - if(msmsdata@ok == FALSE) - return(NA) - # get data - scan <- msmsdata@acquisitionNum - id <- spec@id - # Further fill the ac_ms datasets, and add the ms$focused_ion with spectrum-specific data: - precursor_types <- list( - "pH" = "[M+H]+", - "pNa" = "[M+Na]+", - "mH" = "[M-H]-", - "mFA" = "[M+HCOO-]-", - "pM" = "[M]+", - "mM" = "[M]-", - "pNH4" = "[M+NH4]+") - ac_ms[['FRAGMENTATION_MODE']] <- msmsdata@info$mode - #ac_ms['PRECURSOR_TYPE'] <- precursor_types[spec$mode] - ac_ms[['COLLISION_ENERGY']] <- msmsdata@info$ce - ac_ms[['RESOLUTION']] <- msmsdata@info$res - - # Calculate exact precursor mass with Rcdk, and find the base peak from the parent - # spectrum. (Yes, that's what belongs here, I think.) - precursorMz <- findMz(spec@id, spec@mode, retrieval=retrieval) - ms_fi <- list() - ms_fi[['BASE_PEAK']] <- round(mz(spec@parent)[which.max(intensity(spec@parent))],4) - ms_fi[['PRECURSOR_M/Z']] <- round(precursorMz$mzCenter,4) - ms_fi[['PRECURSOR_TYPE']] <- precursor_types[spec@mode] - - # Select all peaks which belong to this spectrum (correct cpdID and scan no.) - # from peaksOK - # Note: Here and below it would be easy to customize the source of the peaks. - # Originally the peaks came from msmsdata$childFilt, and the subset - # was used where dppm == dppmBest (because childFilt still contains multiple formulas) - # per peak. - peaks <- aggregated[aggregated$filterOK,,drop=FALSE] - peaks <- peaks[(peaks$cpdID == id) & (peaks$scan == msmsdata@acquisitionNum),,drop=FALSE] - - # No peaks? Aha, bye - if(nrow(peaks) == 0) - return(NA) - - # If we don't include the reanalyzed peaks: - if(!getOption("RMassBank")$use_rean_peaks) - peaks <- peaks[is.na(peaks$matchedReanalysis),,drop=FALSE] - # but if we include them: - else - { - # for info, the following data will be used in the default annotator: - # annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")] - # and in the peaklist itself: - # c("mzSpec", "int", "intrel") - peaks[!is.na(peaks$matchedReanalysis),"formula"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.formula"] - peaks[!is.na(peaks$matchedReanalysis),"mzCalc"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.mzCalc"] - peaks[!is.na(peaks$matchedReanalysis),"dppm"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.dppm"] - peaks[!is.na(peaks$matchedReanalysis),"dbe"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.dbe"] - peaks[!is.na(peaks$matchedReanalysis),"formulaCount"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.formulaCount"] - } - - # Calculate relative intensity and make a formatted m/z to use in the output - # (mzSpec, for "spectrum") - peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999) - peaks$mzSpec <- round(peaks$mzFound, 4) - # reorder peaks after addition of the reanalyzed ones - peaks <- peaks[order(peaks$mzSpec),] - - # Also format the other values, which are used in the annotation - peaks$dppm <- round(peaks$dppm, 2) - peaks$mzCalc <- round(peaks$mzCalc, 4) - peaks$intensity <- round(peaks$intensity, 1) - # copy the peak table to the annotation table. (The peak table will then be extended - # with peaks from the global "additional_peaks" table, which can be used to add peaks - # to the spectra by hand. - annotation <- peaks - # Keep only peaks with relative intensity >= 1 o/oo, since the MassBank record - # makes no sense otherwise. Also, keep only the columns needed in the output. - peaks <- peaks[ peaks$intrel >= 1, c("mzSpec", "intensity", "intrel")] - - # Here add the additional peaks if there are any for this compound! - # They are added without any annotation. - if(!is.null(additionalPeaks)) - { - # select the peaks from the corresponding spectrum which were marked with "OK=1" in the table. - spec_add_peaks <- additionalPeaks[ - (additionalPeaks$OK == 1) & - (additionalPeaks$cpdID == spec@id) & - (additionalPeaks$scan == msmsdata@acquisitionNum), - c("mzFound", "intensity")] - # If there are peaks to add: - if(nrow(spec_add_peaks)>0) - { - # add the column for rel. int. - spec_add_peaks$intrel <- 0 - # format m/z value - spec_add_peaks$mzSpec <- round(spec_add_peaks$mzFound, 4) - # bind tables together - peaks <- rbind(peaks, spec_add_peaks[,c("mzSpec", "intensity", "intrel")]) - # recalculate rel.int. and reorder list - peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999) - # Again, select the correct columns, and drop values with rel.int. <1 o/oo - # NOTE: If the highest additional peak is > than the previous highest peak, - # this can lead to the situation that a peak is in "annotation" but not in "peaks"! - # See below. - peaks <- peaks[ peaks$intrel >= 1, c("mzSpec", "intensity", "intrel")] - # Reorder again. - peaks <- peaks[order(peaks$mzSpec),] - } - } - - - - # add + or - to fragment formulas - formula_tag <- list( - "pH" = "+", - "pNa" = "+", - "mH" = "-", - "mFA" = "-", - "pM" = "+", - "mM" = "-", - "pNH4" = "+") - type <- formula_tag[[spec@mode]] - - annotator <- getOption("RMassBank")$annotator - if(is.null(annotator)) - annotator <- "annotator.default" - - - - # Here, the relative intensity is recalculated using the newly added additional - # peaks from the peak list. Therefore, we throw superfluous peaks out again. - # NOTE: It is a valid question whether or not we should kick peaks out at this stage. - # The alternative would be to leave the survivors at 1 o/oo, but keep them in the spectrum. - annotation$intrel <- floor(annotation$intensity / max(peaks$intensity) * 999) - annotation <- annotation[annotation$intrel >= 1,] - - annotation <- do.call(annotator, list(annotation= annotation, type=type)) - - - # Name the columns correctly. - colnames(peaks) <- c("m/z", "int.", "rel.int.") - peaknum <- nrow(peaks) - - # Create the "lower part" of the record. - mbdata <- list() - # Add the AC$MS, AC$LC info. - if(getOption("RMassBank")$use_version == 2) - { - mbdata[["AC$MASS_SPECTROMETRY"]] <- ac_ms - mbdata[["AC$CHROMATOGRAPHY"]] <- ac_lc - } - else - { - # Fix for MassBank data format 1, where ION_MODE must be renamed to MODE - mbdata[["AC$ANALYTICAL_CONDITION"]] <- c(ac_ms, ac_lc) - names(mbdata[["AC$ANALYTICAL_CONDITION"]])[[3]] <- "MODE" - } - # Add the MS$FOCUSED_ION info. - mbdata[["MS$FOCUSED_ION"]] <- ms_fi - - ## The SPLASH is a hash value calculated across all peaks - ## http://splash.fiehnlab.ucdavis.edu/ - ## Has to be temporarily added as "PK$SPLASH" in the "lower" part - ## of the record, but will later be moved "up" when merging parts in compileRecord() - - # the data processing tag :) - # Change by Tobias: - # I suggest to add here the current version number of the clone due to better distinction between different makes of MB records - # Could be automatised from DESCRIPTION file? - if(getOption("RMassBank")$use_rean_peaks) - processingComment <- list("REANALYZE" = "Peaks with additional N2/O included") - else - processingComment <- list() - mbdata[["MS$DATA_PROCESSING"]] <- c( - getOption("RMassBank")$annotations$ms_dataprocessing, - processingComment, - list("WHOLE" = paste("RMassBank", packageVersion("RMassBank"))) - ) - - mbdata[["PK$SPLASH"]] <- list(SPLASH = getSplash(peaks[,c("m/z", "int.")])) - - # Annotation: - if(getOption("RMassBank")$add_annotation && (findLevel(id,TRUE)!="unknown")) - mbdata[["PK$ANNOTATION"]] <- annotation - - # Peak table - mbdata[["PK$NUM_PEAK"]] <- peaknum - mbdata[["PK$PEAK"]] <- peaks - # These two entries will be thrown out later, but they are necessary to build the - # record title and the accession number. - mbdata[["RECORD_TITLE_CE"]] <- msmsdata@info$ces #formatted collision energy - # Mode of relative scan calculation: by default it is calculated relative to the - # parent scan. If a corresponding option is set, it will be calculated from the first - # present child scan in the list. - relativeScan <- "fromParent" - if(!is.null(getOption("RMassBank")$recomputeRelativeScan)) - if(getOption("RMassBank")$recomputeRelativeScan == "fromFirstChild") - relativeScan <- "fromFirstChild" - if(relativeScan == "fromParent") - mbdata[["SUBSCAN"]] <- msmsdata@acquisitionNum - spec@parent@acquisitionNum #relative scan - else if(relativeScan == "fromFirstChild"){ - firstChild <- min(unlist(lapply(spec@children,function(d) d@acquisitionNum))) - mbdata[["SUBSCAN"]] <- msmsdata@acquisitionNum - firstChild + 1 - } - return(mbdata) -} - - -# This compiles a MassBank record from the analyzedRcSpecs format (using the peaks from -# refilteredRcSpecs) together with the compound annotation data. -# Correspondingly: -# spec: contains the analyzedRcSpec-format spectrum collection to be compiled -# (i.e. a block of length(spectraList) child spectra) -# mbdata: contains the corresponding MassBank "header" (the upper part of the record) -# until INSTRUMENT TYPE. -# refiltered: the refilteredRcSpecs which contain our nice peaks. -#' Compile MassBank records -#' -#' Takes a spectra block for a compound, as returned from -#' \code{\link{analyzeMsMs}}, and an aggregated cleaned peak table, together -#' with a MassBank information block, as stored in the infolists and loaded via -#' \code{\link{loadInfolist}}/\code{\link{readMbdata}} and processes them to a -#' MassBank record -#' -#' \code{compileRecord} calls \code{\link{gatherCompound}} to create blocks of -#' spectrum data, and finally fills in the record title and accession number, -#' renames the "internal ID" comment field and removes dummy fields. -#' -#' @usage compileRecord(spec, mbdata, aggregated, additionalPeaks = NULL, retrieval="standard") -#' @param spec A \code{RmbSpectraSet} for a compound, after analysis (\code{\link{analyzeMsMs}}). -#' Note that \bold{peaks are not read from this -#' object anymore}: Peaks come from the \code{aggregated} dataframe (and from -#' the global \code{additionalPeaks} dataframe; cf. \code{\link{addPeaks}} for -#' usage information.) -#' @param mbdata The information data block for the record header, as stored in -#' \code{mbdata_relisted} after loading an infolist. -#' @param aggregated An aggregated peak data table containing information about refiltered spectra etc. -#' @param additionalPeaks If present, a table with additional peaks to add into the spectra. -#' As loaded with \code{\link{addPeaks}}. -#' @param retrieval A value that determines whether the files should be handled either as "standard", -#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" -#' if the only know thing is the m/z -#' @return Returns a MassBank record in list format: e.g. -#' \code{list("ACCESSION" = "XX123456", "RECORD_TITLE" = "Cubane", ..., -#' "CH\$LINK" = list( "CAS" = "12-345-6", "CHEMSPIDER" = 1111, ...))} -#' @author Michael Stravs -#' @seealso \code{\link{mbWorkflow}}, \code{\link{addPeaks}}, -#' \code{\link{gatherCompound}}, \code{\link{toMassbank}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' -#' # -#' \dontrun{myspec <- w@@spectra[[2]]} -#' # after having loaded an infolist: -#' \dontrun{mbdata <- mbdata_relisted[[which(mbdata_archive\$id == as.numeric(myspec\$id))]]} -#' \dontrun{compiled <- compileRecord(myspec, mbdata, w@@aggregated)} -#' -#' @export -compileRecord <- function(spec, mbdata, aggregated, additionalPeaks = NULL, retrieval="standard") -{ - # gather the individual spectra data - mblist <- gatherCompound(spec, aggregated, additionalPeaks, retrieval=retrieval) - # this returns a n-member list of "lower parts" of spectra (one for each subscan). - # (n being the number of child scans per parent scan.) - # Now we put the two parts together. - # (lapply on all n subscans, returns a list.) - mblist_c <- lapply(mblist, function(l) - { - # This is the step which sticks together the upper and the lower part of the - # record (the upper being compound-specific and the lower being scan-specific.) - # Note that the accession number and record title (in the upper part) must of course - # be filled in with scan-specific info. - mbrecord <- c(mbdata, l) - - # Here is the right place to fix the name of the INTERNAL ID field. - names(mbrecord[["COMMENT"]])[[which(names(mbrecord[["COMMENT"]]) == "ID")]] <- - getOption("RMassBank")$annotations$internal_id_fieldname - # get mode parameter (for accession number generation) depending on version - # of record definition - # Change by Tobias: - # I suggest to include fragmentation mode here for information - if(getOption("RMassBank")$use_version == 2) - mode <- mbrecord[["AC$MASS_SPECTROMETRY"]][["ION_MODE"]] - else - mode <- mbrecord[["AC$ANALYTICAL_CONDITION"]][["MODE"]] - # Generate the title and then delete the temprary RECORD_TITLE_CE field used before - mbrecord[["RECORD_TITLE"]] <- .parseTitleString(mbrecord) - mbrecord[["RECORD_TITLE_CE"]] <- NULL - # Calculate the accession number from the options. - shift <- getOption("RMassBank")$accessionNumberShifts[[spec@mode]] - mbrecord[["ACCESSION"]] <- sprintf("%s%04d%02d", getOption("RMassBank")$annotations$entry_prefix, as.numeric(spec@id), as.numeric(mbrecord[["SUBSCAN"]])+shift) - # Clear the "SUBSCAN" field. - mbrecord[["SUBSCAN"]] <- NULL - # return the record. - return(mbrecord) - }) -} - - - -#' Generate peak annotation from peaklist -#' -#' Generates the PK$ANNOTATION entry from the peaklist obtained. This function is -#' overridable by using the "annotator" option in the settings file. -#' -#' @param annotation A peak list to be annotated. Contains columns: -#' \code{"cpdID","formula","mzFound" ,"scan","mzCalc","dppm", -#' "dbe","mz","int","formulaCount","parentScan","fM_factor","dppmBest", -#' "formulaMultiplicity","intrel","mzSpec"} -#' -#' @param type The ion type to be added to annotated formulas ("+" or "-" usually) -#' -#' @return The annotated peak table. Table \code{colnames()} will be used for the -#' titles (preferrably don't use spaces in the column titles; however no format is -#' strictly enforced by the MassBank data format. -#' -#' @examples -#' \dontrun{ -#' annotation <- annotator.default(annotation) -#' } -#' @author Michele Stravs, Eawag -#' @export -annotator.default <- function(annotation, type) -{ - - annotation$formula <- paste(annotation$formula, type, sep='') - # Select the right columns and name them correctly for output. - annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")] - colnames(annotation) <- c("m/z", "tentative_formula", "formula_count", "mass", "error(ppm)") - return(annotation) -} - -#' Parse record title -#' -#' Parses a title for a single MassBank record using the title format -#' specified in the option titleFormat. Internally used, not exported. -#' -#' If the option is not set, a standard title format is used (for record definition -#' version 1 or 2). -#' -#' @usage .parseTitleString(mbrecord) -#' @param mbrecord A MassBank record in list format, as returned from -#' \code{\link{gatherSpectrum}}. -#' @return A string with the title. -#' @author Michael Stravs, Eawag -#' @seealso \code{\link{compileRecord}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' \dontrun{ -#' # used in compileRecord() -#' title <- .parseTitleString(mbrecord) -#' } -#' -#' -#' -.parseTitleString <- function(mbrecord) -{ - - varlist <- getOption("RMassBank")$titleFormat - - # Set the standard title format. - if(is.null(varlist)) - { - if(getOption("RMassBank")$use_version == 2) - { - varlist <- c( - "{CH$NAME}", - "{AC$INSTRUMENT_TYPE}", - "{AC$MASS_SPECTROMETRY: MS_TYPE}", - "CE: {RECORD_TITLE_CE}", - "R={AC$MASS_SPECTROMETRY: RESOLUTION}", - "{MS$FOCUSED_ION: PRECURSOR_TYPE}" - ) - } - else - { - varlist <- c( - "{CH$NAME}", - "{AC$INSTRUMENT_TYPE}", - "{AC$ANALYTICAL_CONDITION: MS_TYPE}", - "CE: {RECORD_TITLE_CE}", - "R={AC$ANALYTICAL_CONDITION: RESOLUTION}", - "{MS$FOCUSED_ION: PRECURSOR_TYPE}" - ) - } - } - - - # Extract a {XXX} argument from each title section. - # check that every title has one and only one match - args <- regexec("\\{(.*)\\}", varlist) - arglist <- regmatches(varlist, args) - if(any(unlist(lapply(arglist, length)) != 2)) - stop("Title format is incorrectly specified: a section with not exactly 1 parameters") - - parsedVars <- lapply(varlist, function(var) - { - # Extract the specified parameter inside the {}. - # I.e. from a string like "R={BLA: BLUB}" return "BLA: BLUB" - args <- regexec("\\{(.*)\\}", var) - arg <- regmatches(var, args)[[1]][[2]] - # Split the parameter by colon if necessary - splitVar <- strsplit(arg, ": ")[[1]] - # Read the parameter value from the record - if(length(splitVar) == 2) - replaceVar <- mbrecord[[splitVar[[1]]]][[splitVar[[2]]]] - else if(length(splitVar) == 1) - replaceVar <- mbrecord[[splitVar]] - else - stop(paste( - "Title format is incorrectly specified:", var) - ) - # Fix problems: NULL returns - if(is.null(replaceVar)) - replaceVar <- "" - # Fix problems: Names will have >= 1 match. Take the first - if(length(replaceVar) > 1) - replaceVar <- replaceVar[[1]] - - # Fix problems: Unknowns might have no name - if(!length(replaceVar)){ - replaceVar <- "" - } - - # Substitute the parameter value into the string - parsedVar <- sub("\\{(.*)\\}", replaceVar, var) - return(parsedVar) - }) - title <- paste(parsedVars, collapse="; ") - return(title) -} - - -# This converts the tree-like list (as obtained e.g. from compileRecord()) -# into a plain text array, which can then be dumped to a file suitable for -# MassBank upload. -#' Write MassBank record into character array -#' -#' Writes a MassBank record in list format to a text array. -#' -#' The function is a general conversion tool for the MassBank format; i.e. the -#' field names are not fixed. \code{mbdata} must be a named list, and the -#' entries can be as follows: \itemize{ -#' \item A single text line: -#' -#' \code{'CH\$EXACT_MASS' = '329.1023'} -#' -#' is written as -#' -#' \code{CH\$EXACT_MASS: 329.1023} -#' \item A character array: -#' -#' \code{'CH\$NAME' = c('2-Aminobenzimidazole', '1H-Benzimidazol-2-amine')} -#' -#' is written as -#' -#' \code{CH\$NAME: 2-Aminobenzimidazole} -#' -#' \code{CH\$NAME: 1H-Benzimidazol-2-amine} -#' -#' \item A named list of strings: -#' -#' \code{'CH\$LINK' = list('CHEBI' = "27822", "KEGG" = "C10901")} -#' -#' is written as -#' -#' \code{CH\$LINK: CHEBI 27822} -#' -#' \code{CH\$LINK: KEGG C10901} -#' -#' \item A data frame (e.g. the peak table) is written as specified in -#' the MassBank record format (Section 2.6.3): the column names are used as -#' headers for the first line, all data rows are printed space-separated. -#' } -#' -#' @usage toMassbank(mbdata) -#' @param mbdata A MassBank record in list format. -#' @return The result is a text array, which is ready to be written to the disk -#' as a file. -#' @note The function iterates over the list item names. \bold{This means that -#' duplicate entries in \code{mbdata} are (partially) discarded!} The correct -#' way to add them is by making a character array (as specified above): Instead -#' of \code{'CH\$NAME' = 'bla', 'CH\$NAME' = 'blub'} specify \code{'CH\$NAME' = -#' c('bla','blub')}. -#' @author Michael Stravs -#' @seealso \code{\link{compileRecord}}, \code{\link{mbWorkflow}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' \dontrun{ -#' # Read just the compound info skeleton from the Internet for some compound ID -#' id <- 35 -#' mbdata <- gatherData(id) -#' #' # Export the mbdata blocks to line arrays -#' # (there is no spectrum information, just the compound info...) -#' mbtext <- toMassbank(mbdata) -#' } -#' -#' @export -toMassbank <- function (mbdata) -{ - # mbf is an array of lines and count is the line counter. - # Very old-school, but it works. :) - mbf <- character(0) - count <- 1 - lapply(names(mbdata), function(entry) - { - # If entry is a char line, add it to the file. - # If it is a named sublist, add each subentry with name - # If it is an unnamed sublist, add each subentry without name - # if it is a dataframe, write in PEAKS mode - - # Note: this is were I liked "lapply" a little too much. "for" would - # be more idiomatic, and wouldn't need the <<- assignments. - - # Data frame: table mode. A header line and one space-separated line for - # each data frame row. - if(is.data.frame(mbdata[[entry]])) - { - mbf[[count]] <<- paste(entry,": " , - paste(colnames(mbdata[[entry]]), collapse=" "), - sep='') - count <<- count+1 - for(row in 1:nrow(mbdata[[entry]])) - { - mbf[[count]] <<- paste(" ", - paste(mbdata[[entry]][row,],collapse=" "), - sep="") - count <<- count+1 - } - #browser() - } - # List with named items: Write every entry like CH$LINK: CAS 12-345-678 - else if(is.list(mbdata[[entry]]) & !is.null(names(mbdata[[entry]]))) - { - - lapply(names(mbdata[[entry]]), function(subentry) - { - if(subentry != "SPLASH"){ - mbf[[count]] <<- paste(entry,": ",subentry, " ", mbdata[[entry]][[subentry]], sep='') - } else { - mbf[[count]] <<- paste(entry,": ", mbdata[[entry]][[subentry]], sep='') - } - #print(mbf) - count <<- count + 1 - }) - } - # Array (or list) of unnamed items: Write every entry like CH$NAME: Paracetamol - # (iterative entry without subindices) - else if (length(mbdata[[entry]]) > 1 & is.null(names(mbdata[[entry]]))) - { - lapply(mbdata[[entry]], function(subentry) - { - mbf[[count]] <<- paste(entry,": ",subentry, sep='') - #print(mbf) - count <<- count + 1 - }) - } - # Length is 1: just write the entry like PK$NUM_PEAKS: 131 - else - { - mbf[[count]] <<- paste(entry,": ",mbdata[[entry]], sep='') - count <<- count + 1 - } - } - ) # End of lapply block (per child spectrum) - # Add mandatory EOF marker - mbf[[count]] <- "//" - return(mbf) -} - -# Exports compiled and massbanked spectra, with their associated molfiles, to physical files. -# "compiled" is still used here, because we need an accessible accession number. -# In the plain text arrays, the accession number is already "hidden". -# compiled: is ONE "compiled" entry, i.e. ONE compound with e.g. 14 spectra. -# files: is a return value from lapply(toMassbank), i.e. contains 14 plain-text arrays -# (for a 14-spectra method) -# molfile: a molfile from createMolfile -#' Export internally stored MassBank data to files -#' -#' Exports MassBank recfile data arrays and corresponding molfiles to physical -#' files on hard disk, for one compound. -#' -#' The data from \code{compiled} is still used here, because it contains the -#' "visible" accession number. In the plain-text format contained in -#' \code{files}, the accession number is not "accessible" anymore since it's in -#' the file. -#' -#' @usage exportMassbank(compiled, files, molfile) -#' @param compiled Is ONE "compiled" entry, i.e. ONE compound with e.g. 14 -#' spectra, as returned from \code{\link{compileRecord}}. -#' @param files A n-membered array (usually a return value from -#' \code{lapply(\link{toMassbank})}), i.e. contains n plain-text arrays with -#' MassBank records. -#' @param molfile A molfile from \code{\link{createMolfile}} -#' @return No return value. -#' @note An improvement would be to write the accession numbers into -#' \code{names(compiled)} and later into \code{names(files)} so \code{compiled} -#' wouldn't be needed here anymore. (The compound ID would have to go into -#' \code{names(molfile)}, since it is also retrieved from \code{compiled}.) -#' @author Michael Stravs -#' @seealso \code{\link{createMolfile}}, \code{\link{compileRecord}}, -#' \code{\link{toMassbank}}, \code{\link{mbWorkflow}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' \dontrun{ -#' compiled <- compileRecord(record, mbdata, refilteredRcSpecs) -#' mbfiles <- toMassbank(compiled) -#' molfile <- createMolfile(compiled[[1]][["CH$SMILES"]]) -#' exportMassbank(compiled, mbfiles, molfile) -#' } -#' -#' @export -exportMassbank <- function(compiled, files, molfile) -{ - molnames <- c() - for(file in 1:length(compiled)) - { - # Read the accession no. from the corresponding "compiled" entry - filename <- compiled[[file]]["ACCESSION"] - # use this accession no. as filename - filename <- paste(filename, ".txt", sep="") - write(files[[file]], - file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata",filename) - ) - } - # Use internal ID for naming the molfiles - if(findLevel(compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]],TRUE)=="standard"){ - molname <- sprintf("%04d", as.numeric( - compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) - molname <- paste(molname, ".mol", sep="") - write(molfile, - file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata",molname) - ) - } -} - -# Makes a list.tsv with molfile -> massbank ch$name attribution. - -#' Write list.tsv file -#' -#' Makes a list.tsv file in the "moldata" folder. -#' -#' Generates the list.tsv file which is needed by MassBank to connect records with -#' their respective molfiles. The first compound name is linked to a mol-file with -#' the compound ID (e.g. 2334.mol for ID 2334). -#' -#' @param compiled A list of compiled spectra (in tree-format, as returned by \code{compileRecord}). -#' @return No return value. -#' @author Michael A. Stravs, Eawag -#' @examples \dontrun{ -#' compiled <- compileRecord(record, mbdata, refilteredRcSpecs) -#' # a list.tsv for only one record: -#' clist <- list(compiled) -#' makeMollist(clist) -#' } -#' @export -makeMollist <- function(compiled) -{ - # For every "compiled" entry (here, compiled is not one "compiled" entry but the total - # list of all compiled spectra), extract the uppermost CH$NAME and the ID (from the - # first spectrum.) Make the ID into 0000 format. - - tsvlist <- t(sapply(compiled, function(entry) - { - name <- entry[[1]][["CH$NAME"]][[1]] - id <- sprintf("%04d", as.numeric(entry[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) - molfilename <- paste(id,".mol",sep='') - return(c(name,molfilename)) - })) - - IDs <- sapply(compiled, function(entry) return( sprintf("%04d", as.numeric(entry[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])))) - level <- sapply(IDs, findLevel, compact=TRUE) - validentries <- which(level == "standard") - # Write the file with the - write.table(tsvlist[validentries,], - paste(getOption("RMassBank")$annotations$entry_prefix,"/moldata/list.tsv", sep=''), - quote = FALSE, - sep="\t", - row.names=FALSE, - col.names=FALSE - ) -} - - -# Load a dataframe or file into additional_peaks (or add additional points in there.) -# The columns cpdID, scan, mzFound, int, OK are mandatory. OK=1 means that the peaks -# will be added into the spectrum. mzFound and int will be taken for the table. -# No annotation will be written. -# Add peaks to the spectra by hand - -#' Add additional peaks to spectra -#' -#' Loads a table with additional peaks to add to the MassBank spectra. Required -#' columns are \code{cpdID, scan, int, mzFound, OK}. -#' -#' All peaks with OK=1 will be included in the spectra. -#' -#' @usage addPeaks(mb, filename_or_dataframe) -#' @param mb The \code{mbWorkspace} to load the peaks into. -#' @param filename_or_dataframe Filename of the csv file, or name of the R -#' dataframe containing the peaklist. -#' @return The \code{mbWorkspace} with loaded additional peaks. -#' @author Michael Stravs -#' @seealso \code{\link{mbWorkflow}} -#' @examples -#' -#' \dontrun{addPeaks("myrun_additionalPeaks.csv")} -#' -#' @export -addPeaks <- function(mb, filename_or_dataframe) -{ - - errorvar <- 0 - currEnvir <- environment() - d <- 1 - - if(is.data.frame(filename_or_dataframe)) - df <- filename_or_dataframe - else - tryCatch( - df <- read.csv(filename_or_dataframe), - error=function(e){ - currEnvir$errorvar <- 1 - }) - # I change your heuristic fix to another heuristic fix, because I will have to test for a column name change... - - if(!errorvar){ - - if(ncol(df) < 2) - df <- read.csv(filename_or_dataframe, sep=";") - # here: the column int was renamed to intensity, and we need to be able to read old files. sorry. - if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df))) - df$intensity <- df$int - - cols <- c("cpdID", "scan", "mzFound", "intensity", "OK") - n <- colnames(df) - # Check if comma-separated or semicolon-separated - d <- setdiff(cols, n) - if(length(d)>0){ - stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, intensity, OK.") - } - } - - culled_df <- df[,c("cpdID", "scan", "mzFound", "intensity", "OK")] - - - if(nrow(mb@additionalPeaks) == 0) - mb@additionalPeaks <- culled_df - else - mb@additionalPeaks <- rbind(mb@additionalPeaks, culled_df) - return(mb) -} +# Script for writing MassBank files + +#testtest change +#' Load MassBank compound information lists +#' +#' Loads MassBank compound information lists (i.e. the lists which were created +#' in the first two steps of the MassBank \code{\link{mbWorkflow}} and +#' subsequently edited by hand.). +#' +#' \code{resetInfolists} clears the information lists, i.e. it creates a new +#' empty list in \code{mbdata_archive}. \code{loadInfolist} loads a single CSV +#' file, whereas \code{loadInfolists} loads a whole directory. +#' +#' @aliases loadInfolists loadInfolist resetInfolists +#' @usage loadInfolists(mb, path) +#' +#' loadInfolist(mb, fileName) +#' +#' resetInfolists(mb) +#' @param path Directory in which the namelists reside. All CSV files in this +#' directory will be loaded. +#' @param fileName A single namelist to be loaded. +#' @param mb The \code{mbWorkspace} to load/reset the lists in. +#' @return The new workspace with loaded/reset lists. +#' @author Michael Stravs +#' @examples +#' +#' # +#' \dontrun{mb <- resetInfolists(mb) +#' mb <- loadInfolist(mb, "my_csv_infolist.csv")} +#' +#' @export +loadInfolists <- function(mb, path) +{ + archivefiles <- list.files(path, ".csv", full.names=TRUE) + for(afile in archivefiles) + mb <- loadInfolist(mb, afile) + return(mb) +} + +# Load an "infolist". This loads a CSV file which should contain the entries +# edited and controlled by hand. All compound infos from fileName are added into the +# global mbdata_archive. Entries with a cpdID which was already present, are substituted +# by new entries from the fileName file. +#' @export +loadInfolist <- function(mb, fileName) +{ + # Prime a new infolist if it doesn't exist + if(ncol(mb@mbdata_archive) == 0) + mb <- resetInfolists(mb) + mbdata_new <- read.csv(fileName, sep=",", stringsAsFactors=FALSE) + # Legacy check for loading the Uchem format files. + # Even if dbname_* are not used downstream of here, it's still good to keep them + # for debugging reasons. + n <- colnames(mbdata_new) + cols <- c("id","dbcas","dataused") + + # Check if comma-separated or semicolon-separated + d <- setdiff(cols, n) + if(length(d)>0){ + mbdata_new <- read.csv2(fileName, stringsAsFactors=FALSE) + n <- colnames(mbdata_new) + d2 <- setdiff(cols, n) + if(length(d2) > 0){ + stop("Some columns are missing in the infolist.") + } + } + if("dbname_d" %in% colnames(mbdata_new)) + { + colnames(mbdata_new)[[which(colnames(mbdata_new)=="dbname_d")]] <- "dbname" + # dbname_e will be dropped because of the select= in the subset below. + } + if("COMMENT.EAWAG_UCHEM_ID" %in% colnames(mbdata_new)) + colnames(mbdata_new)[[which(colnames(mbdata_new)== "COMMENT.EAWAG_UCHEM_ID")]] <- + "COMMENT.ID" + + # Clear from padding spaces and NAs + mbdata_new <- as.data.frame(t(apply(mbdata_new, 1, function(r) + { + # Substitute empty spaces by real NA values + r[which(r == "")] <- NA + # Trim spaces (in all non-NA fields) + r[which(!is.na(r))] <- sub("^ *([^ ]+) *$", "\\1", r[which(!is.na(r))]) + return(r) + }))) + # use only the columns present in mbdata_archive, no other columns added in excel + mbdata_new <- mbdata_new[, colnames(mb@mbdata_archive)] + # substitute the old entires with the ones from our files + # then find the new (previously inexistent) entries, and rbind them to the table + new_entries <- setdiff(mbdata_new$id, mb@mbdata_archive$id) + old_entries <- intersect(mbdata_new$id, mb@mbdata_archive$id) + for(entry in old_entries) + mb@mbdata_archive[mb@mbdata_archive$id == entry,] <- mbdata_new[mbdata_new$id == entry,] + mb@mbdata_archive <- rbind(mb@mbdata_archive, + mbdata_new[mbdata_new$id==new_entries,]) + return(mb) + +} + + +# Resets the mbdata_archive to an empty version. +#' @export +resetInfolists <- function(mb) +{ + mb@mbdata_archive <- + structure(list(X = integer(0), id = integer(0), dbcas = character(0), + dbname = character(0), dataused = character(0), COMMENT.CONFIDENCE = character(0), + COMMENT.ID = integer(0), CH.NAME1 = character(0), + CH.NAME2 = character(0), CH.NAME3 = character(0), CH.COMPOUND_CLASS = character(0), + CH.FORMULA = character(0), CH.EXACT_MASS = numeric(0), CH.SMILES = character(0), + CH.IUPAC = character(0), CH.LINK.CAS = character(0), CH.LINK.CHEBI = integer(0), + CH.LINK.HMDB = character(0), CH.LINK.KEGG = character(0), CH.LINK.LIPIDMAPS = character(0), + CH.LINK.PUBCHEM = character(0), CH.LINK.INCHIKEY = character(0), + CH.LINK.CHEMSPIDER = integer(0)), .Names = c("X", "id", "dbcas", + "dbname", "dataused", "COMMENT.CONFIDENCE", "COMMENT.ID", + "CH.NAME1", "CH.NAME2", "CH.NAME3", "CH.COMPOUND_CLASS", "CH.FORMULA", + "CH.EXACT_MASS", "CH.SMILES", "CH.IUPAC", "CH.LINK.CAS", "CH.LINK.CHEBI", + "CH.LINK.HMDB", "CH.LINK.KEGG", "CH.LINK.LIPIDMAPS", "CH.LINK.PUBCHEM", + "CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER"), row.names = integer(0), class = "data.frame") + return(mb) + +} + +# The workflow function, i.e. (almost) the only thing you actually need to call. +# See below for explanation of steps. +#' MassBank record creation workflow +#' +#' Uses data generated by \code{\link{msmsWorkflow}} to create MassBank records. +#' +#' See the vignette \code{vignette("RMassBank")} for detailed informations about the usage. +#' +#' Steps: +#' +#' Step 1: Find which compounds don't have annotation information yet. For these +#' compounds, pull information from several databases (using gatherData). +#' +#' Step 2: If new compounds were found, then export the infolist.csv and stop the workflow. +#' Otherwise, continue. +#' +#' Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. +#' +#' Step 4: Compile the spectra. Using the skeletons from the archive data, create +#' MassBank records per compound and fill them with peak data for each spectrum. +#' Also, assign accession numbers based on scan mode and relative scan no. +#' +#' Step 5: Convert the internal tree-like representation of the MassBank data into +#' flat-text string arrays (basically, into text-file style, but still in memory) +#' +#' Step 6: For all OK records, generate a corresponding molfile with the structure +#' of the compound, based on the SMILES entry from the MassBank record. (This molfile +#' is still in memory only, not yet a physical file) +#' +#' Step 7: If necessary, generate the appropriate subdirectories, and actually write +#' the files to disk. +#' +#' Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank +#' to attribute substances to their corresponding structure molfiles. +#' +#' @param steps Which steps in the workflow to perform. +#' @param infolist_path A path where to store newly downloaded compound informations, +#' which should then be manually inspected. +#' @param mb The \code{mbWorkspace} to work in. +#' @param gatherData A variable denoting whether to retrieve information using several online databases \code{gatherData= "online"} +#' or to use the local babel installation \code{gatherData= "babel"}. Note that babel is used either way, if a directory is given +#' in the settings. This setting will be ignored if retrieval is set to "standard" +#' @return The processed \code{mbWorkspace}. +#' @seealso \code{\link{mbWorkspace-class}} +#' @author Michael A. Stravs, Eawag +#' @examples \dontrun{ +#' mb <- newMbWorkspace(w) # w being a msmsWorkspace +#' mb <- loadInfolists(mb, "D:/myInfolistPath") +#' mb <- mbWorkflow(mb, steps=c(1:3), "newinfos.csv") +#' +#' } +#' @export +mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.csv", gatherData = "online") +{ + # Step 1: Find which compounds don't have annotation information yet. For these + # compounds, pull information from CTS (using gatherData). + if(1 %in% steps) + { + mbdata_ids <- lapply(selectSpectra(mb@spectra, "found", "object"), function(spec) spec@id) + message("mbWorkflow: Step 1. Gather info from several databases") + # Which IDs are not in mbdata_archive yet? + new_ids <- setdiff(as.numeric(unlist(mbdata_ids)), mb@mbdata_archive$id) + mb@mbdata <- lapply(new_ids, function(id) + { + if(findLevel(id,TRUE) == "standard"){ + if(gatherData == "online"){ + + d <- gatherData(id) + } + if(gatherData == "babel"){ + # message("mbWorkflow: Step 1. Gather info using babel") + d <- gatherDataBabel(id) + } + } else{ + # message("mbWorkflow: Step 1. Gather no info - Unknown structure") + d <- gatherDataUnknown(id, mb@spectra[[1]]@mode, retrieval=findLevel(id,TRUE)) + } + message(paste(id, ": ", d$dataused, sep='')) + return(d) + }) + } + # Step 2: If new compounds were found, then export the infolist.csv and stop the workflow. + # Otherwise, continue! + if(2 %in% steps) + { + message("mbWorkflow: Step 2. Export infolist (if required)") + if(length(mb@mbdata)>0) + { + mbdata_mat <- flatten(mb@mbdata) + write.csv(as.data.frame(mbdata_mat),infolist_path, na="") + message(paste("The file", infolist_path, "was generated with new compound information. Please check and edit the table, and add it to your infolist folder.")) + return(mb) + } + else + message("No new data added.") + } + # Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. + if(3 %in% steps) + { + message("mbWorkflow: Step 3. Data reformatting") + mb@mbdata_relisted <- apply(mb@mbdata_archive, 1, readMbdata) + } + # Step 4: Compile the spectra! Using the skeletons from the archive data, create + # MassBank records per compound and fill them with peak data for each spectrum. + # Also, assign accession numbers based on scan mode and relative scan no. + if(4 %in% steps) + { + message("mbWorkflow: Step 4. Spectra compilation") + mb@compiled <- lapply( + selectSpectra(mb@spectra, "found", "object"), + function(r) { + message(paste("Compiling: ", r@name, sep="")) + mbdata <- mb@mbdata_relisted[[which(mb@mbdata_archive$id == as.numeric(r@id))]] + if(nrow(mb@additionalPeaks) > 0) + res <-compileRecord(r, mbdata, mb@aggregated, mb@additionalPeaks) + else + res <-compileRecord(spec = r, mbdata = mbdata, aggregated = mb@aggregated, additionalPeaks = NULL, retrieval=findLevel(r@id,TRUE)) + return(res) + }) + # check which compounds have useful spectra + mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) + mb@problems <- which(is.na(mb@compiled)) + mb@compiled_ok <- mb@compiled[mb@ok] + } + # Step 5: Convert the internal tree-like representation of the MassBank data into + # flat-text string arrays (basically, into text-file style, but still in memory) + if(5 %in% steps) + { + message("mbWorkflow: Step 5. Flattening records") + mb@mbfiles <- lapply(mb@compiled_ok, function(c) lapply(c, toMassbank)) + } + # Step 6: For all OK records, generate a corresponding molfile with the structure + # of the compound, based on the SMILES entry from the MassBank record. (This molfile + # is still in memory only, not yet a physical file) + if(6 %in% steps) + { + message("mbWorkflow: Step 6. Generate molfiles") + mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(as.numeric(c[[1]][['COMMENT']][[getOption("RMassBank")$annotations$internal_id_fieldname]]))) + } + # Step 7: If necessary, generate the appropriate subdirectories, and actually write + # the files to disk. + if(7 %in% steps) + { + message("mbWorkflow: Step 7. Generate subdirs and export") + dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "moldata", sep='/'),recursive=TRUE) + dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "recdata", sep='/'),recursive=TRUE) + for(cnt in seq_along(mb@compiled_ok)) + exportMassbank(mb@compiled_ok[[cnt]], mb@mbfiles[[cnt]], mb@molfile[[cnt]]) + } + # Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank + # to attribute substances to their corresponding structure molfiles. + if(8 %in% steps) + { + message("mbWorkflow: Step 8. Create list.tsv") + makeMollist(mb@compiled_ok) + } + return(mb) +} + + +# Calls openbabel and converts the SMILES code string (or retrieves the SMILES code from +# the ID, and then calls openbabel) to create a molfile in text format. +# If fileName is given, the file is directly stored. Otherwise, it is returned as a +# character array. +#' Create MOL file for a chemical structure +#' +#' Creates a MOL file (in memory or on disk) for a compound specified by the +#' compound ID or by a SMILES code. +#' +#' The function invokes OpenBabel (and therefore needs a correctly set +#' OpenBabel path in the RMassBank settings), using the SMILES code retrieved +#' with \code{findSmiles} or using the SMILES code directly. The current +#' implementation of the workflow uses the latter version, reading the SMILES +#' code directly from the MassBank record itself. +#' +#' @usage createMolfile(id_or_smiles, fileName = FALSE) +#' @param id_or_smiles The compound ID or a SMILES code. +#' @param fileName If the filename is set, the file is written directly to disk +#' using the specified filename. Otherwise, it is returned as a text array. +#' @return A character array containing the MOL/SDF format file, ready to be +#' written to disk. +#' @author Michael Stravs +#' @seealso \code{\link{findSmiles}} +#' @references OpenBabel: \url{http://openbabel.org} +#' @examples +#' +#' # Benzene: +#' \dontrun{ +#' createMolfile("C1=CC=CC=C1") +#' } +#' +#' @export +createMolfile <- function(id_or_smiles, fileName = FALSE) +{ + .checkMbSettings() + babeldir <- getOption("RMassBank")$babeldir + + if(!is.numeric(id_or_smiles)){ + smiles <- id_or_smiles + } else{ + if(findLevel(id_or_smiles,TRUE) != "standard"){ + return(c(" ","$$$$")) + } + smiles <- findSmiles(id_or_smiles) + } + # if no babeldir was set, get the result from cactus. + if(is.na(babeldir)) + { + res <- getCactus(smiles, "sdf") + + if(any(is.na(res))){ + res <- getPcSDF(smiles) + } + if(any(is.na(res))){ + stop("Pubchem and Cactus both seem to be down.") + } + if(is.character(fileName)) + writeLines(res, fileName) + } + # otherwise use the better-tested OpenBabel toolkit. + else + { + if(!is.character(fileName)) + cmd <- paste(babeldir, "babel -ismi -osdf -d -b --gen2D", sep='') + else + cmd <- paste(babeldir, "babel -ismi -osdf ", fileName , " -d -b --gen2D", sep='') + res <- system(cmd, intern=TRUE, input=smiles, ignore.stderr=TRUE) + # If we wrote to a file, read it back as return value. + if(is.character(fileName)) + res <- readLines(fileName) + } + #return(c(" ","$$$$")) + return(res) +} + + + +# Retrieve annotation data for a compound, from the internet service Pubchem +#' Retrieve supplemental annotation data from Pubchem +#' +#' Retrieves annotation data for a compound from the internet service Pubchem +#' based on the inchikey generated by babel or Cactus +#' +#' The data retrieved is the Pubchem CID, a synonym from the Pubchem database, +#' the IUPAC name (using the preferred if available) and a Chebi link +#' +#' @usage gatherPubChem(key) +#' @param key An Inchi-Key +#' @return Returns a list with 4 slots: +#' \code{PcID} The Pubchem CID +#' \code{Synonym} An arbitrary synonym for the compound name +#' \code{IUPAC} A IUPAC-name (preferred if available) +#' \code{Chebi} The identification number of the chebi database +#' @author Erik Mueller +#' @seealso \code{\link{mbWorkflow}} +#' @references Pubchem REST: +#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} +#' Chebi: +#' \url{http://www.ebi.ac.uk/chebi} +#' @examples +#' +#' # Gather data for compound ID 131 +#' \dontrun{gatherPubChem("QEIXBXXKTUNWDK-UHFFFAOYSA-N")} +#' +#' @export +gatherPubChem <- function(key){ + + PubChemData <- list() + + ##Trycatches are there because pubchem has connection issues 1 in 50 times. + ##Write NA into the respective fields if something goes wrong with the conenction or the data. + + ##Retrieve Pubchem CID + tryCatch( + PubChemData$PcID <- getPcId(key), + error=function(e){ + PubChemData$PcID <<- NA + }) + + ##Retrieve a synonym to the name + tryCatch( + PubChemData$Synonym <- getPcSynonym(key), + error=function(e){ + PubChemData$Synonym <<- NA + }) + + ##Retrieve the IUPAC-name + tryCatch( + PubChemData$IUPAC <- getPcIUPAC(key), + error=function(e){ + PubChemData$IUPAC <<- NA + }) + + ##Retrieve the Chebi-ID + tryCatch( + PubChemData$Chebi <- getPcCHEBI(key), + error=function(e){ + PubChemData$Chebi <<- NA + }) + + return(PubChemData) +} + +# Retrieve annotation data for a compound, from the internet services Cactvs, Pubchem, Chemspider and CTS. +#' Retrieve annotation data +#' +#' Retrieves annotation data for a compound from the internet services CTS, Pubchem, Chemspider and +#' Cactvs, based on the SMILES code and name of the compounds stored in the +#' compound list. +#' +#' Composes the "upper part" of a MassBank record filled with chemical data +#' about the compound: name, exact mass, structure, CAS no., links to PubChem, +#' KEGG, ChemSpider. The instrument type is also written into this block (even +#' if not strictly part of the chemical information). Additionally, index +#' fields are added at the start of the record, which will be removed later: +#' \code{id, dbcas, dbname} from the compound list, \code{dataused} to indicate +#' the used identifier for CTS search (\code{smiles} or \code{dbname}). +#' +#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are +#' inserted empty and will be filled later on. +#' +#' @usage gatherData(id) +#' @aliases gatherData +#' @param id The compound ID. +#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., +#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... +#' @author Michael Stravs +#' @seealso \code{\link{mbWorkflow}} +#' @references Chemical Translation Service: +#' \url{http://uranus.fiehnlab.ucdavis.edu:8080/cts/homePage} +#' cactus Chemical Identifier Resolver: +#' \url{http://cactus.nci.nih.gov/chemical/structure} +#' MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' Pubchem REST: +#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} +#' Chemspider InChI conversion: +#' \url{https://www.chemspider.com/InChI.asmx} +#' @examples +#' +#' # Gather data for compound ID 131 +#' \dontrun{gatherData(131)} +#' +#' @export +gatherData <- function(id) +{ + ##Preamble: Is a babeldir supplied? + ##If yes, use it + + .checkMbSettings() + usebabel=TRUE + babeldir <- getOption("RMassBank")$babeldir + + if(is.na(babeldir)){ + usebabel=FALSE + } + + + ##Get all useful information from the local "database" (from the CSV sheet) + + smiles <- findSmiles(id) + mass <- findMass(smiles) + dbcas <- findCAS(id) + dbname <- findName(id) + if(is.na(dbname)) dbname <- "" + if(is.na(dbcas)) dbcas <- "" + iupacName <- dbname + synonym <- dbname + formula <- findFormula(id) + + ##Convert SMILES to InChI key via Cactvs or babel. CTS doesn't "interpret" the SMILES per se, + ##it just matches identical known SMILES, so we need to convert to a "searchable" and + ##standardized format beforehand. Other databases are able to interpret the smiles. + + if(usebabel){ + cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey') + inchikey_split <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE) + } else{ + inchikey <- getCactus(smiles, 'stdinchikey') + if(!is.na(inchikey)){ + ##Split the "InChiKey=" part off the key + inchikey_split <- strsplit(inchikey, "=", fixed=TRUE)[[1]][[2]] + } else{ + inchikey_split <- getPcInchiKey(smiles) + } + } + + ##Use Pubchem to retrieve information + PcInfo <- gatherPubChem(inchikey_split) + + if(!is.null(PcInfo$Synonym) & !is.na(PcInfo$Synonym)){ + synonym <- PcInfo$Synonym + } + + if(!is.null(PcInfo$IUPAC) & !is.na(PcInfo$IUPAC)){ + iupacName <- PcInfo$IUPAC + } + + ##Get Chemspider-ID + csid <- getCSID(inchikey_split) + + if(is.na(csid)){ + ##Get ChemSpider ID from Cactus if the Chemspider page is down + csid <- getCactus(inchikey_split, 'chemspider_id') + } + + ##Use CTS to retrieve information + CTSinfo <- getCtsRecord(inchikey_split) + + if((CTSinfo[1] == "Sorry, we couldn't find any matching results") || is.null(CTSinfo[1])) + { + CTSinfo <- NA + } + + ##List the names + if(iupacName == ""){ + warning(paste0("Compound ID ",id,": no IUPAC name could be identified.")) + } + + if(toupper(dbname) == toupper(synonym)){ + synonym <- dbname + } + + if(toupper(dbname) == toupper(iupacName)){ + iupacName <- dbname + } + + if(toupper(synonym) == toupper(iupacName)){ + synonym <- iupacName + } + + names <- as.list(unique(c(dbname, synonym, iupacName))) + + ##If no name is found, it must be supplied in one way or another + if(all(sapply(names, function(x) x == ""))){ + stop("RMassBank wasn't able to extract a usable name for this compound from any database. Please supply a name manually.") + } + + # Start to fill the MassBank record. + # The top 4 entries will not go into the final record; they are used to identify + # the record and also to facilitate manual editing of the exported record table. + mbdata <- list() + mbdata[['id']] <- id + mbdata[['dbcas']] <- dbcas + mbdata[['dbname']] <- dbname + mbdata[['dataused']] <- "smiles" + mbdata[['ACCESSION']] <- "" + mbdata[['RECORD_TITLE']] <- "" + mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") + mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors + mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license + mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright + # Confidence annotation and internal ID annotation. + # The ID of the compound will be written like: + # COMMENT: EAWAG_UCHEM_ID 1234 + # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" + mbdata[["COMMENT"]] <- list() + if(findLevel(id) == "0"){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment + } else{ + level <- findLevel(id) + if(level %in% c("1","1a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" + } + if(level == c("2")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" + } + if(level == c("2a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" + } + if(level == c("2b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" + } + if(level == c("3")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" + } + if(level == c("3a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" + } + if(level == c("3b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" + } + if(level == c("3c")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" + } + if(level == c("3d")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" + } + if(level == c("4")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" + } + if(level == c("5")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" + } + } + + mbdata[["COMMENT"]][["ID"]] = id + # here compound info starts + mbdata[['CH$NAME']] <- names + # Currently we use a fixed value for Compound Class, since there is no useful + # convention of what should go there and what shouldn't, and the field is not used + # in search queries. + mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class + mbdata[['CH$FORMULA']] <- formula + mbdata[['CH$EXACT_MASS']] <- mass + mbdata[['CH$SMILES']] <- smiles + + if(usebabel){ + cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi') + mbdata[['CH$IUPAC']] <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE) + } else{ + mbdata[['CH$IUPAC']] <- getCactus(smiles, "stdinchi") + } + + + + # Add all CH$LINK fields present in the compound datasets + link <- list() + # CAS + if(!is.na(CTSinfo[1])){ + if("CAS" %in% CTS.externalIdTypes(CTSinfo)) + { + # Prefer database CAS if it is also listed in the CTS results. + # otherwise take the shortest one. + cas <- CTS.externalIdSubset(CTSinfo,"CAS") + if(dbcas %in% cas) + link[["CAS"]] <- dbcas + else + link[["CAS"]] <- cas[[which.min(nchar(cas))]] + } else{ + if(dbcas != ""){ + link[["CAS"]] <- dbcas + } + } + } else{ + if(dbcas != ""){ + link[["CAS"]] <- dbcas + } + } + + + # CHEBI + if(is.na(PcInfo$Chebi[1])){ + if(!is.na(CTSinfo[1])){ + if("ChEBI" %in% CTS.externalIdTypes(CTSinfo)) + { + # Cut off front "CHEBI:" if present + chebi <- CTS.externalIdSubset(CTSinfo,"ChEBI") + chebi <- chebi[[which.min(nchar(chebi))]] + chebi <- strsplit(chebi,":")[[1]] + link[["CHEBI"]] <- chebi[[length(chebi)]] + } + } + } else{ + chebi <- PcInfo$Chebi + chebi <- chebi[[which.min(nchar(chebi))]] + chebi <- strsplit(chebi,":")[[1]] + link[["CHEBI"]] <- chebi[[length(chebi)]] + } + # HMDB + if(!is.na(CTSinfo[1])){ + if("Human Metabolome Database" %in% CTS.externalIdTypes(CTSinfo)) + link[["HMDB"]] <- CTS.externalIdSubset(CTSinfo,"HMDB")[[1]] + # KEGG + if("KEGG" %in% CTS.externalIdTypes(CTSinfo)) + link[["KEGG"]] <- CTS.externalIdSubset(CTSinfo,"KEGG")[[1]] + # LipidMAPS + if("LipidMAPS" %in% CTS.externalIdTypes(CTSinfo)) + link[["LIPIDMAPS"]] <- CTS.externalIdSubset(CTSinfo,"LipidMAPS")[[1]] + } + # PubChem CID + if(is.na(PcInfo$PcID[1])){ + if(!is.na(CTSinfo[1])){ + if("PubChem CID" %in% CTS.externalIdTypes(CTSinfo)) + { + pc <- CTS.externalIdSubset(CTSinfo,"PubChem CID") + link[["PUBCHEM"]] <- paste0(min(pc)) + } + } + } else{ + link[["PUBCHEM"]] <- PcInfo$PcID[1] + } + + + if(!is.null(link[["PUBCHEM"]])){ + if(substr(link[["PUBCHEM"]],1,4) != "CID:"){ + link[["PUBCHEM"]] <- paste0("CID:", link[["PUBCHEM"]]) + } + } + + link[["INCHIKEY"]] <- inchikey_split + if(length(csid)>0) if(any(!is.na(csid))) link[["CHEMSPIDER"]] <- min(as.numeric(as.character(csid))) + mbdata[['CH$LINK']] <- link + + mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument + mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type + + return(mbdata) +} + +# Retrieve annotation data for a compound, using only babel +#' Retrieve annotation data +#' +#' Retrieves annotation data for a compound by using babel, +#' based on the SMILES code and name of the compounds stored in the +#' compound list. +#' +#' Composes the "upper part" of a MassBank record filled with chemical data +#' about the compound: name, exact mass, structure, CAS no.. +#' The instrument type is also written into this block (even +#' if not strictly part of the chemical information). Additionally, index +#' fields are added at the start of the record, which will be removed later: +#' \code{id, dbcas, dbname} from the compound list. +#' +#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are +#' inserted empty and will be filled later on. +#' +#' This function is an alternative to gatherData, in case CTS is down or if information +#' on one or more of the compounds in the compound list are sparse +#' +#' @usage gatherDataBabel(id) +#' @param id The compound ID. +#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., +#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... +#' @author Michael Stravs, Erik Mueller +#' @seealso \code{\link{mbWorkflow}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' +#' # Gather data for compound ID 131 +#' \dontrun{gatherDataBabel(131)} +#' +#' @export +gatherDataBabel <- function(id){ + .checkMbSettings() + babeldir <- getOption("RMassBank")$babeldir + smiles <- findSmiles(id) + + + # if no babeldir was set, throw an error that says that either CTS or babel have to be used + if(is.na(babeldir)) + { + stop("No babeldir supplied; It is currently not possible to convert the information without either babel or CTS") + } else { + ###Babel conversion + cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey') + inchikey <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE) + cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi') + inchi <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE) + + ##Read from Compoundlist + smiles <- findSmiles(id) + mass <- findMass(smiles) + dbcas <- findCAS(id) + dbname <- findName(id) + if(is.na(dbname)) dbname <- "" + if(is.na(dbcas)) dbcas <- "" + formula <- findFormula(id) + + ##Create + mbdata <- list() + mbdata[['id']] <- id + mbdata[['dbcas']] <- dbcas + mbdata[['dbname']] <- dbname + mbdata[['dataused']] <- "smiles" + mbdata[['ACCESSION']] <- "" + mbdata[['RECORD_TITLE']] <- "" + mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") + mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors + mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license + mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright + # Confidence annotation and internal ID annotation. + # The ID of the compound will be written like: + # COMMENT: EAWAG_UCHEM_ID 1234 + # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" + mbdata[["COMMENT"]] <- list() + if(findLevel(id) == "0"){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment + } else{ + level <- findLevel(id) + if(level %in% c("1","1a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" + } + if(level == c("2")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" + } + if(level == c("2a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" + } + if(level == c("2b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" + } + if(level == c("3")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" + } + if(level == c("3a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" + } + if(level == c("3b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" + } + if(level == c("3c")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" + } + if(level == c("3d")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" + } + if(level == c("4")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" + } + if(level == c("5")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" + } + } + mbdata[["COMMENT"]][["ID"]] <- id + + # here compound info starts + mbdata[['CH$NAME']] <- as.list(dbname) + + # Currently we use a fixed value for Compound Class, since there is no useful + # convention of what should go there and what shouldn't, and the field is not used + # in search queries. + mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class + mbdata[['CH$FORMULA']] <- formula + mbdata[['CH$EXACT_MASS']] <- mass + mbdata[['CH$SMILES']] <- smiles + mbdata[['CH$IUPAC']] <- inchi + + link <- list() + if(dbcas != "") + link[["CAS"]] <- dbcas + link[["INCHIKEY"]] <- inchikey + mbdata[['CH$LINK']] <- link + mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument + mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type + } + return(mbdata) +} + +# Retrieve annotation data for a compound, using only babel +#' Retrieve annotation data +#' +#' Retrieves annotation data for an unknown compound by using basic information present +#' +#' Composes the "upper part" of a MassBank record filled with chemical data +#' about the compound: name, exact mass, structure, CAS no.. +#' The instrument type is also written into this block (even +#' if not strictly part of the chemical information). Additionally, index +#' fields are added at the start of the record, which will be removed later: +#' \code{id, dbcas, dbname} from the compound list. +#' +#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are +#' inserted empty and will be filled later on. +#' +#' This function is used to generate the data in case a substance is unknown, +#' i.e. not enough information is present to derive anything about formulas or links +#' +#' @usage gatherDataUnknown(id, mode, retrieval) +#' @param id The compound ID. +#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., +#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... +#' @author Michael Stravs, Erik Mueller +#' @seealso \code{\link{mbWorkflow}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' +#' # Gather data for compound ID 131 +#' \dontrun{gatherDataUnknown(131,"pH")} +#' +#' @export +gatherDataUnknown <- function(id, mode, retrieval){ + .checkMbSettings() + + ##Read from Compoundlist + smiles <- "" + if(retrieval == "unknown"){ + mass <- findMass(id, "unknown", mode) + formula <- "" + } + if(retrieval == "tentative"){ + mass <- findMass(id, "tentative", mode) + formula <- findFormula(id, "tentative") + } + dbcas <- NA + dbname <- findName(id) + if(is.na(dbname)) dbname <- paste("Unknown ID:",id) + if(is.na(dbcas)) dbcas <- "" + + + + ##Create + mbdata <- list() + mbdata[['id']] <- id + mbdata[['dbcas']] <- dbcas + mbdata[['dbname']] <- dbname + mbdata[['dataused']] <- "none" + mbdata[['ACCESSION']] <- "" + mbdata[['RECORD_TITLE']] <- "" + mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") + mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors + mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license + mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright + # Confidence annotation and internal ID annotation. + # The ID of the compound will be written like: + # COMMENT: EAWAG_UCHEM_ID 1234 + # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" + mbdata[["COMMENT"]] <- list() + if(findLevel(id) == "0"){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment + } else{ + level <- findLevel(id) + if(level %in% c("1","1a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" + } + if(level == c("2")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" + } + if(level == c("2a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" + } + if(level == c("2b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" + } + if(level == c("3")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" + } + if(level == c("3a")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" + } + if(level == c("3b")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" + } + if(level == c("3c")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" + } + if(level == c("3d")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" + } + if(level == c("4")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" + } + if(level == c("5")){ + mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" + } + } + mbdata[["COMMENT"]][["ID"]] <- id + + # here compound info starts + mbdata[['CH$NAME']] <- as.list(dbname) + + # Currently we use a fixed value for Compound Class, since there is no useful + # convention of what should go there and what shouldn't, and the field is not used + # in search queries. + mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class + mbdata[['CH$FORMULA']] <- formula + mbdata[['CH$EXACT_MASS']] <- mass + mbdata[['CH$SMILES']] <- "" + mbdata[['CH$IUPAC']] <- "" + + link <- list() + mbdata[['CH$LINK']] <- link + mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument + mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type + + return(mbdata) +} + +# Flatten the internal tree-like representation of MassBank data to a flat table. +# Note that this limits us, in that the fields should be constant over all records! +# Therefore, e.g. the fixed number of 3 names which may be filled. +# If anybody has a cooler solution, I'll be happy to hear from you :) +# +# Note: the records from gatherData have additional information which is discarded, like +# author, copyright etc. They will be re-filled automatically when reading the file. +#' Flatten, or re-read, MassBank header blocks +#' +#' \code{flatten} converts a list of MassBank compound information sets (as +#' retrieved by \code{\link{gatherData}}) to a flat table, to be exported into +#' an \link[=loadInfolist]{infolist}. \code{readMbdata} reads a single record +#' from an infolist flat table back into a MassBank (half-)entry. +#' +#' Neither the flattening system itself nor the implementation are particularly +#' fantastic, but since hand-checking of records is a necessary evil, there is +#' currently no alternative (short of coding a complete GUI for this and +#' working directly on the records.) +#' +#' @aliases flatten readMbdata +#' @usage flatten(mbdata) +#' +#' readMbdata(row) +#' @param mbdata A list of MassBank compound information sets as returned from +#' \code{\link{gatherData}}. +#' @param row One row of MassBank compound information retrieved from an +#' infolist. +#' @return \code{flatten} returns a matrix (not a data frame) to be written to +#' CSV. +#' +#' \code{readMbdata} returns a list of type \code{list(id= \var{compoundID}, +#' ..., 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. +#' @author Michael Stravs +#' @seealso \code{\link{gatherData}},\code{\link{loadInfolist}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples \dontrun{ +#' # Collect some data to flatten +#' ids <- c(40,50,60,70) +#' data <- lapply(ids, gatherData) +#' # Flatten the data trees to a table +#' flat.table <- flatten(data) +#' # reimport the table into a tree +#' data.reimported <- apply(flat.table, 1, readMbdata) +#' } +#' +#' @export +#' +flatten <- function(mbdata) +{ + .checkMbSettings() + + colList <- c( + "id", + "dbcas", + "dbname", + "dataused", + "COMMENT.CONFIDENCE", + # Note: The field name of the internal id field is replaced with the real name + # at "compilation" time. Therefore, functions DOWNSTREAM from compileRecord() + # must use the full name including the info from options("RMassBank"). + "COMMENT.ID", + "CH$NAME1", + "CH$NAME2", + "CH$NAME3", + "CH$COMPOUND_CLASS", + "CH$FORMULA", + "CH$EXACT_MASS", + "CH$SMILES", + "CH$IUPAC", + "CH$LINK.CAS", + "CH$LINK.CHEBI", + "CH$LINK.HMDB", + "CH$LINK.KEGG", + "CH$LINK.LIPIDMAPS", + "CH$LINK.PUBCHEM", + "CH$LINK.INCHIKEY", + "CH$LINK.CHEMSPIDER") + # make an empty data frame with the right length + rows <- length(mbdata) + cols <- length(colList) + mbframe <- matrix(data=NA, nrow=rows, ncol=cols) + colnames(mbframe) <- colList + #browser() + for(row in 1:rows) + { + # fill in all the data into the dataframe: all columns which + # a) exist in the target dataframe and b) exist in the (unlisted) MB record + # are written into the dataframe. + data <- unlist(mbdata[[row]]) + # bugfix for the case of only one name + if(!("CH$NAME1" %in% names(data))) + data[["CH$NAME1"]] <- data[["CH$NAME"]] + datacols <- intersect(colList, names(data)) + mbframe[row,datacols] <- data[datacols] + } + return(mbframe) + +} + +# Read data from a flat-table MassBank record row and feed it into a +# MassBank tree-like record. Also, prime the ACCESSION and RECORD_TITLE fields in the +# correct position in the record. +#' @export +readMbdata <- function(row) +{ + .checkMbSettings() + + # Listify the table row. Lists are just cooler to work with :) + row <- as.list(row) + + mbdata <- list() + # Accession and title are added empty for now, to have them in the right place. + # Constants are read from the options or generated. + mbdata[['ACCESSION']] <- "" + mbdata[['RECORD_TITLE']] <- "" + mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") + mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors + mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license + mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright + mbdata[['PUBLICATION']] <- getOption("RMassBank")$annotations$publication + + # Read all determined fields from the file + # This is not very flexible, as you can see... + colList <- c( + "COMMENT.CONFIDENCE", + "COMMENT.ID", + "CH$NAME1", + "CH$NAME2", + "CH$NAME3", + "CH$COMPOUND_CLASS", + "CH$FORMULA", + "CH$EXACT_MASS", + "CH$SMILES", + "CH$IUPAC", + "CH$LINK.CAS", + "CH$LINK.CHEBI", + "CH$LINK.HMDB", + "CH$LINK.KEGG", + "CH$LINK.LIPIDMAPS", + "CH$LINK.PUBCHEM", + "CH$LINK.INCHIKEY", + "CH$LINK.CHEMSPIDER") + mbdata[["COMMENT"]] = list() + mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT.CONFIDENCE"]] + # Again, our ID field. + + mbdata[["COMMENT"]][["ID"]]<- + row[["COMMENT.ID"]] + names = c(row[["CH.NAME1"]], row[["CH.NAME2"]], row[["CH.NAME3"]]) + names = names[which(!is.na(names))] + + names <- gsub("'", "`", names) + mbdata[["CH$NAME"]] = names + mbdata[["CH$COMPOUND_CLASS"]] = row[["CH.COMPOUND_CLASS"]] + mbdata[["CH$FORMULA"]] = row[["CH.FORMULA"]] + mbdata[["CH$EXACT_MASS"]] = row[["CH.EXACT_MASS"]] + mbdata[["CH$SMILES"]] = row[["CH.SMILES"]] + mbdata[["CH$IUPAC"]] = row[["CH.IUPAC"]] + # Add all links and then eliminate the NA values from the tree. + link = list() + link[["CAS"]] = row[["CH.LINK.CAS"]] + link[["CHEBI"]] = row[["CH.LINK.CHEBI"]] + link[["HMDB"]] = row[["CH.LINK.HMDB"]] + link[["KEGG"]] = row[["CH.LINK.KEGG"]] + link[["LIPIDMAPS"]] = row[["CH.LINK.LIPIDMAPS"]] + link[["PUBCHEM"]] = row[["CH.LINK.PUBCHEM"]] + link[["INCHIKEY"]] = row[["CH.LINK.INCHIKEY"]] + link[["CHEMSPIDER"]] = row[["CH.LINK.CHEMSPIDER"]] + link[which(is.na(link))] <- NULL + mbdata[["CH$LINK"]] <- link + # again, these constants are read from the options: + mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument + mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type + + return(mbdata) + +} + +# For each compound, this function creates the "lower part" of the MassBank record, i.e. +# everything that comes after AC$INSTRUMENT_TYPE. +#' Compose data block of MassBank record +#' +#' \code{gatherCompound} composes the data blocks (the "lower half") of all +#' MassBank records for a compound, using the annotation data in the RMassBank +#' options, spectrum info data from the \code{analyzedSpec}-type record and the +#' peaks from the reanalyzed, multiplicity-filtered peak table. It calls +#' \code{gatherSpectrum} for each child spectrum. +#' +#' The returned data blocks are in format \code{list( "AC\$MASS_SPECTROMETRY" = +#' list('FRAGMENTATION_MODE' = 'CID', ...), ...)} etc. +#' +#' @aliases gatherCompound gatherSpectrum +#' @usage gatherCompound(spec, aggregated, additionalPeaks = NULL, retrieval="standard") +#' +#' gatherSpectrum(spec, msmsdata, ac_ms, ac_lc, aggregated, +#' additionalPeaks = NULL, retrieval="standard") +#' @param spec A \code{RmbSpectraSet} object, representing a compound with multiple spectra. +#' @param aggregated An aggregate peak table where the peaks are extracted from. +#' @param msmsdata A \code{RmbSpectrum2} object from the \code{spec} spectra set, representing a single spectrum to give a record. +#' @param ac_ms,ac_lc Information for the AC\$MASS_SPECTROMETRY and +#' AC\$CHROMATOGRAPHY fields in the MassBank record, created by +#' \code{gatherCompound} and then fed into \code{gatherSpectrum}. +#' @param additionalPeaks If present, a table with additional peaks to add into the spectra. +#' As loaded with \code{\link{addPeaks}}. +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @return \code{gatherCompound} returns a list of tree-like MassBank data +#' blocks. \code{gatherSpectrum} returns one single MassBank data block or +#' \code{NA} if no useful peak is in the spectrum. +#' @note Note that the global table \code{additionalPeaks} is also used as an +#' additional source of peaks. +#' @author Michael Stravs +#' @seealso \code{\link{mbWorkflow}}, \code{\link{compileRecord}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples \dontrun{ +#' myspectrum <- w@@spectra[[1]] +#' massbankdata <- gatherCompound(myspectrum, w@@aggregated) +#' # Note: ac_lc and ac_ms are data blocks usually generated in gatherCompound and +#' # passed on from there. The call below gives a relatively useless result :) +#' ac_lc_dummy <- list() +#' ac_ms_dummy <- list() +#' justOneSpectrum <- gatherSpectrum(myspectrum, myspectrum@@child[[2]], +#' ac_ms_dummy, ac_lc_dummy, w@@aggregated) +#' } +#' +#' +#' @export +gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval="standard") +{ + # compound ID + id <- spec@id + # processing mode + imode <- spec@mode + + # define positive or negative, based on processing mode. + ion_modes <- list( + "pH" = "POSITIVE", + "pNa" = "POSITIVE", + "mH" = "NEGATIVE", + "mFA" = "NEGATIVE", + "pM" = "POSITIVE", + "mM" = "NEGATIVE", + "pNH4" = "POSITIVE") + mode <- ion_modes[[imode]] + + # for format 2.01 + ac_ms <- list(); + ac_ms[['MS_TYPE']] <- getOption("RMassBank")$annotations$ms_type + ac_ms[['IONIZATION']] <- getOption("RMassBank")$annotations$ionization + ac_ms[['ION_MODE']] <- mode + + # This list could be made customizable. + ac_lc <- list(); + rt <- spec@parent@rt / 60 + ac_lc[['COLUMN_NAME']] <- getOption("RMassBank")$annotations$lc_column + ac_lc[['FLOW_GRADIENT']] <- getOption("RMassBank")$annotations$lc_gradient + ac_lc[['FLOW_RATE']] <- getOption("RMassBank")$annotations$lc_flow + ac_lc[['RETENTION_TIME']] <- sprintf("%.3f min", rt) + ac_lc[['SOLVENT A']] <- getOption("RMassBank")$annotations$lc_solvent_a + ac_lc[['SOLVENT B']] <- getOption("RMassBank")$annotations$lc_solvent_b + + # Go through all child spectra, and fill our skeleton with scan data! + # Pass them the AC_LC and AC_MS data, which are added at the right place + # directly in there. + allSpectra <- lapply(spec@children, function(m) + gatherSpectrum(spec = spec, msmsdata = m, ac_ms = ac_ms, ac_lc = ac_lc, aggregated = aggregated, additionalPeaks = additionalPeaks, retrieval=retrieval)) + allSpectra <- allSpectra[which(!is.na(allSpectra))] + return(allSpectra) +} + + + +# Process one single MSMS child scan. +# spec: an object of "analyzedSpectrum" type (i.e. contains +# 14x (or other number) msmsdata, info, mzrange, +# compound ID, parent MS1, cpd id...) +# msmsdata: the msmsdata sub-object from the spec which is the child scan we want to process. +# Contains childFilt, childBad, scan #, etc. Note that the peaks are actually not +# taken from here! They were taken from msmsdata initially, but after introduction +# of the refiltration and multiplicity filtering, this was changed. Now only the +# scan information is actually taken from msmsdata. +# ac_ms, ac_lc: pre-filled info for the MassBank dataset (see above) +# refiltered: the refilteredRcSpecs dataset which contains our good peaks :) +# Contains peaksOK, peaksReanOK, peaksFiltered, peaksFilteredReanalysis, +# peaksProblematic. Currently we use peaksOK and peaksReanOK to create the files. +# (Also, the global additionalPeaks table is used.) +#' @export +gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalPeaks = NULL, retrieval = "standard") +{ + # If the spectrum is not filled, return right now. All "NA" spectra will + # not be treated further. + if(msmsdata@ok == FALSE) + return(NA) + # get data + scan <- msmsdata@acquisitionNum + id <- spec@id + # Further fill the ac_ms datasets, and add the ms$focused_ion with spectrum-specific data: + precursor_types <- list( + "pH" = "[M+H]+", + "pNa" = "[M+Na]+", + "mH" = "[M-H]-", + "mFA" = "[M+HCOO-]-", + "pM" = "[M]+", + "mM" = "[M]-", + "pNH4" = "[M+NH4]+") + ac_ms[['FRAGMENTATION_MODE']] <- msmsdata@info$mode + #ac_ms['PRECURSOR_TYPE'] <- precursor_types[spec$mode] + ac_ms[['COLLISION_ENERGY']] <- msmsdata@info$ce + ac_ms[['RESOLUTION']] <- msmsdata@info$res + + # Calculate exact precursor mass with Rcdk, and find the base peak from the parent + # spectrum. (Yes, that's what belongs here, I think.) + precursorMz <- findMz(spec@id, spec@mode, retrieval=retrieval) + ms_fi <- list() + ms_fi[['BASE_PEAK']] <- round(mz(spec@parent)[which.max(intensity(spec@parent))],4) + ms_fi[['PRECURSOR_M/Z']] <- round(precursorMz$mzCenter,4) + ms_fi[['PRECURSOR_TYPE']] <- precursor_types[spec@mode] + + # Select all peaks which belong to this spectrum (correct cpdID and scan no.) + # from peaksOK + # Note: Here and below it would be easy to customize the source of the peaks. + # Originally the peaks came from msmsdata$childFilt, and the subset + # was used where dppm == dppmBest (because childFilt still contains multiple formulas) + # per peak. + peaks <- aggregated[aggregated$filterOK,,drop=FALSE] + peaks <- peaks[(peaks$cpdID == id) & (peaks$scan == msmsdata@acquisitionNum),,drop=FALSE] + + # No peaks? Aha, bye + if(nrow(peaks) == 0) + return(NA) + + # If we don't include the reanalyzed peaks: + if(!getOption("RMassBank")$use_rean_peaks) + peaks <- peaks[is.na(peaks$matchedReanalysis),,drop=FALSE] + # but if we include them: + else + { + # for info, the following data will be used in the default annotator: + # annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")] + # and in the peaklist itself: + # c("mzSpec", "int", "intrel") + peaks[!is.na(peaks$matchedReanalysis),"formula"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.formula"] + peaks[!is.na(peaks$matchedReanalysis),"mzCalc"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.mzCalc"] + peaks[!is.na(peaks$matchedReanalysis),"dppm"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.dppm"] + peaks[!is.na(peaks$matchedReanalysis),"dbe"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.dbe"] + peaks[!is.na(peaks$matchedReanalysis),"formulaCount"] <- peaks[!is.na(peaks$matchedReanalysis),"reanalyzed.formulaCount"] + } + + # Calculate relative intensity and make a formatted m/z to use in the output + # (mzSpec, for "spectrum") + peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999) + peaks$mzSpec <- round(peaks$mzFound, 4) + # reorder peaks after addition of the reanalyzed ones + peaks <- peaks[order(peaks$mzSpec),] + + # Also format the other values, which are used in the annotation + peaks$dppm <- round(peaks$dppm, 2) + peaks$mzCalc <- round(peaks$mzCalc, 4) + peaks$intensity <- round(peaks$intensity, 1) + # copy the peak table to the annotation table. (The peak table will then be extended + # with peaks from the global "additional_peaks" table, which can be used to add peaks + # to the spectra by hand. + annotation <- peaks + # Keep only peaks with relative intensity >= 1 o/oo, since the MassBank record + # makes no sense otherwise. Also, keep only the columns needed in the output. + peaks <- peaks[ peaks$intrel >= 1, c("mzSpec", "intensity", "intrel")] + + # Here add the additional peaks if there are any for this compound! + # They are added without any annotation. + if(!is.null(additionalPeaks)) + { + # select the peaks from the corresponding spectrum which were marked with "OK=1" in the table. + spec_add_peaks <- additionalPeaks[ + (additionalPeaks$OK == 1) & + (additionalPeaks$cpdID == spec@id) & + (additionalPeaks$scan == msmsdata@acquisitionNum), + c("mzFound", "intensity")] + # If there are peaks to add: + if(nrow(spec_add_peaks)>0) + { + # add the column for rel. int. + spec_add_peaks$intrel <- 0 + # format m/z value + spec_add_peaks$mzSpec <- round(spec_add_peaks$mzFound, 4) + # bind tables together + peaks <- rbind(peaks, spec_add_peaks[,c("mzSpec", "intensity", "intrel")]) + # recalculate rel.int. and reorder list + peaks$intrel <- floor(peaks$intensity / max(peaks$intensity) * 999) + # Again, select the correct columns, and drop values with rel.int. <1 o/oo + # NOTE: If the highest additional peak is > than the previous highest peak, + # this can lead to the situation that a peak is in "annotation" but not in "peaks"! + # See below. + peaks <- peaks[ peaks$intrel >= 1, c("mzSpec", "intensity", "intrel")] + # Reorder again. + peaks <- peaks[order(peaks$mzSpec),] + } + } + + + + # add + or - to fragment formulas + formula_tag <- list( + "pH" = "+", + "pNa" = "+", + "mH" = "-", + "mFA" = "-", + "pM" = "+", + "mM" = "-", + "pNH4" = "+") + type <- formula_tag[[spec@mode]] + + annotator <- getOption("RMassBank")$annotator + if(is.null(annotator)) + annotator <- "annotator.default" + + + + # Here, the relative intensity is recalculated using the newly added additional + # peaks from the peak list. Therefore, we throw superfluous peaks out again. + # NOTE: It is a valid question whether or not we should kick peaks out at this stage. + # The alternative would be to leave the survivors at 1 o/oo, but keep them in the spectrum. + annotation$intrel <- floor(annotation$intensity / max(peaks$intensity) * 999) + annotation <- annotation[annotation$intrel >= 1,] + + annotation <- do.call(annotator, list(annotation= annotation, type=type)) + + + # Name the columns correctly. + colnames(peaks) <- c("m/z", "int.", "rel.int.") + peaknum <- nrow(peaks) + + # Create the "lower part" of the record. + mbdata <- list() + # Add the AC$MS, AC$LC info. + if(getOption("RMassBank")$use_version == 2) + { + mbdata[["AC$MASS_SPECTROMETRY"]] <- ac_ms + mbdata[["AC$CHROMATOGRAPHY"]] <- ac_lc + } + else + { + # Fix for MassBank data format 1, where ION_MODE must be renamed to MODE + mbdata[["AC$ANALYTICAL_CONDITION"]] <- c(ac_ms, ac_lc) + names(mbdata[["AC$ANALYTICAL_CONDITION"]])[[3]] <- "MODE" + } + # Add the MS$FOCUSED_ION info. + mbdata[["MS$FOCUSED_ION"]] <- ms_fi + + ## The SPLASH is a hash value calculated across all peaks + ## http://splash.fiehnlab.ucdavis.edu/ + ## Has to be temporarily added as "PK$SPLASH" in the "lower" part + ## of the record, but will later be moved "up" when merging parts in compileRecord() + + # the data processing tag :) + # Change by Tobias: + # I suggest to add here the current version number of the clone due to better distinction between different makes of MB records + # Could be automatised from DESCRIPTION file? + if(getOption("RMassBank")$use_rean_peaks) + processingComment <- list("REANALYZE" = "Peaks with additional N2/O included") + else + processingComment <- list() + mbdata[["MS$DATA_PROCESSING"]] <- c( + getOption("RMassBank")$annotations$ms_dataprocessing, + processingComment, + list("WHOLE" = paste("RMassBank", packageVersion("RMassBank"))) + ) + + mbdata[["PK$SPLASH"]] <- list(SPLASH = getSplash(peaks[,c("m/z", "int.")])) + + # Annotation: + if(getOption("RMassBank")$add_annotation && (findLevel(id,TRUE)!="unknown")) + mbdata[["PK$ANNOTATION"]] <- annotation + + # Peak table + mbdata[["PK$NUM_PEAK"]] <- peaknum + mbdata[["PK$PEAK"]] <- peaks + # These two entries will be thrown out later, but they are necessary to build the + # record title and the accession number. + mbdata[["RECORD_TITLE_CE"]] <- msmsdata@info$ces #formatted collision energy + # Mode of relative scan calculation: by default it is calculated relative to the + # parent scan. If a corresponding option is set, it will be calculated from the first + # present child scan in the list. + relativeScan <- "fromParent" + if(!is.null(getOption("RMassBank")$recomputeRelativeScan)) + if(getOption("RMassBank")$recomputeRelativeScan == "fromFirstChild") + relativeScan <- "fromFirstChild" + if(relativeScan == "fromParent") + mbdata[["SUBSCAN"]] <- msmsdata@acquisitionNum - spec@parent@acquisitionNum #relative scan + else if(relativeScan == "fromFirstChild"){ + firstChild <- min(unlist(lapply(spec@children,function(d) d@acquisitionNum))) + mbdata[["SUBSCAN"]] <- msmsdata@acquisitionNum - firstChild + 1 + } + return(mbdata) +} + + +# This compiles a MassBank record from the analyzedRcSpecs format (using the peaks from +# refilteredRcSpecs) together with the compound annotation data. +# Correspondingly: +# spec: contains the analyzedRcSpec-format spectrum collection to be compiled +# (i.e. a block of length(spectraList) child spectra) +# mbdata: contains the corresponding MassBank "header" (the upper part of the record) +# until INSTRUMENT TYPE. +# refiltered: the refilteredRcSpecs which contain our nice peaks. +#' Compile MassBank records +#' +#' Takes a spectra block for a compound, as returned from +#' \code{\link{analyzeMsMs}}, and an aggregated cleaned peak table, together +#' with a MassBank information block, as stored in the infolists and loaded via +#' \code{\link{loadInfolist}}/\code{\link{readMbdata}} and processes them to a +#' MassBank record +#' +#' \code{compileRecord} calls \code{\link{gatherCompound}} to create blocks of +#' spectrum data, and finally fills in the record title and accession number, +#' renames the "internal ID" comment field and removes dummy fields. +#' +#' @usage compileRecord(spec, mbdata, aggregated, additionalPeaks = NULL, retrieval="standard") +#' @param spec A \code{RmbSpectraSet} for a compound, after analysis (\code{\link{analyzeMsMs}}). +#' Note that \bold{peaks are not read from this +#' object anymore}: Peaks come from the \code{aggregated} dataframe (and from +#' the global \code{additionalPeaks} dataframe; cf. \code{\link{addPeaks}} for +#' usage information.) +#' @param mbdata The information data block for the record header, as stored in +#' \code{mbdata_relisted} after loading an infolist. +#' @param aggregated An aggregated peak data table containing information about refiltered spectra etc. +#' @param additionalPeaks If present, a table with additional peaks to add into the spectra. +#' As loaded with \code{\link{addPeaks}}. +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @return Returns a MassBank record in list format: e.g. +#' \code{list("ACCESSION" = "XX123456", "RECORD_TITLE" = "Cubane", ..., +#' "CH\$LINK" = list( "CAS" = "12-345-6", "CHEMSPIDER" = 1111, ...))} +#' @author Michael Stravs +#' @seealso \code{\link{mbWorkflow}}, \code{\link{addPeaks}}, +#' \code{\link{gatherCompound}}, \code{\link{toMassbank}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' +#' # +#' \dontrun{myspec <- w@@spectra[[2]]} +#' # after having loaded an infolist: +#' \dontrun{mbdata <- mbdata_relisted[[which(mbdata_archive\$id == as.numeric(myspec\$id))]]} +#' \dontrun{compiled <- compileRecord(myspec, mbdata, w@@aggregated)} +#' +#' @export +compileRecord <- function(spec, mbdata, aggregated, additionalPeaks = NULL, retrieval="standard") +{ + # gather the individual spectra data + mblist <- gatherCompound(spec, aggregated, additionalPeaks, retrieval=retrieval) + # this returns a n-member list of "lower parts" of spectra (one for each subscan). + # (n being the number of child scans per parent scan.) + # Now we put the two parts together. + # (lapply on all n subscans, returns a list.) + mblist_c <- lapply(mblist, function(l) + { + # This is the step which sticks together the upper and the lower part of the + # record (the upper being compound-specific and the lower being scan-specific.) + # Note that the accession number and record title (in the upper part) must of course + # be filled in with scan-specific info. + mbrecord <- c(mbdata, l) + + # Here is the right place to fix the name of the INTERNAL ID field. + names(mbrecord[["COMMENT"]])[[which(names(mbrecord[["COMMENT"]]) == "ID")]] <- + getOption("RMassBank")$annotations$internal_id_fieldname + # get mode parameter (for accession number generation) depending on version + # of record definition + # Change by Tobias: + # I suggest to include fragmentation mode here for information + if(getOption("RMassBank")$use_version == 2) + mode <- mbrecord[["AC$MASS_SPECTROMETRY"]][["ION_MODE"]] + else + mode <- mbrecord[["AC$ANALYTICAL_CONDITION"]][["MODE"]] + # Generate the title and then delete the temprary RECORD_TITLE_CE field used before + mbrecord[["RECORD_TITLE"]] <- .parseTitleString(mbrecord) + mbrecord[["RECORD_TITLE_CE"]] <- NULL + # Calculate the accession number from the options. + shift <- getOption("RMassBank")$accessionNumberShifts[[spec@mode]] + mbrecord[["ACCESSION"]] <- sprintf("%s%04d%02d", getOption("RMassBank")$annotations$entry_prefix, as.numeric(spec@id), as.numeric(mbrecord[["SUBSCAN"]])+shift) + # Clear the "SUBSCAN" field. + mbrecord[["SUBSCAN"]] <- NULL + # return the record. + return(mbrecord) + }) +} + + + +#' Generate peak annotation from peaklist +#' +#' Generates the PK$ANNOTATION entry from the peaklist obtained. This function is +#' overridable by using the "annotator" option in the settings file. +#' +#' @param annotation A peak list to be annotated. Contains columns: +#' \code{"cpdID","formula","mzFound" ,"scan","mzCalc","dppm", +#' "dbe","mz","int","formulaCount","parentScan","fM_factor","dppmBest", +#' "formulaMultiplicity","intrel","mzSpec"} +#' +#' @param type The ion type to be added to annotated formulas ("+" or "-" usually) +#' +#' @return The annotated peak table. Table \code{colnames()} will be used for the +#' titles (preferrably don't use spaces in the column titles; however no format is +#' strictly enforced by the MassBank data format. +#' +#' @examples +#' \dontrun{ +#' annotation <- annotator.default(annotation) +#' } +#' @author Michele Stravs, Eawag +#' @export +annotator.default <- function(annotation, type) +{ + + annotation$formula <- paste(annotation$formula, type, sep='') + # Select the right columns and name them correctly for output. + annotation <- annotation[,c("mzSpec","formula", "formulaCount", "mzCalc", "dppm")] + colnames(annotation) <- c("m/z", "tentative_formula", "formula_count", "mass", "error(ppm)") + return(annotation) +} + +#' Parse record title +#' +#' Parses a title for a single MassBank record using the title format +#' specified in the option titleFormat. Internally used, not exported. +#' +#' If the option is not set, a standard title format is used (for record definition +#' version 1 or 2). +#' +#' @usage .parseTitleString(mbrecord) +#' @param mbrecord A MassBank record in list format, as returned from +#' \code{\link{gatherSpectrum}}. +#' @return A string with the title. +#' @author Michael Stravs, Eawag +#' @seealso \code{\link{compileRecord}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' \dontrun{ +#' # used in compileRecord() +#' title <- .parseTitleString(mbrecord) +#' } +#' +#' +#' +.parseTitleString <- function(mbrecord) +{ + + varlist <- getOption("RMassBank")$titleFormat + + # Set the standard title format. + if(is.null(varlist)) + { + if(getOption("RMassBank")$use_version == 2) + { + varlist <- c( + "{CH$NAME}", + "{AC$INSTRUMENT_TYPE}", + "{AC$MASS_SPECTROMETRY: MS_TYPE}", + "CE: {RECORD_TITLE_CE}", + "R={AC$MASS_SPECTROMETRY: RESOLUTION}", + "{MS$FOCUSED_ION: PRECURSOR_TYPE}" + ) + } + else + { + varlist <- c( + "{CH$NAME}", + "{AC$INSTRUMENT_TYPE}", + "{AC$ANALYTICAL_CONDITION: MS_TYPE}", + "CE: {RECORD_TITLE_CE}", + "R={AC$ANALYTICAL_CONDITION: RESOLUTION}", + "{MS$FOCUSED_ION: PRECURSOR_TYPE}" + ) + } + } + + + # Extract a {XXX} argument from each title section. + # check that every title has one and only one match + args <- regexec("\\{(.*)\\}", varlist) + arglist <- regmatches(varlist, args) + if(any(unlist(lapply(arglist, length)) != 2)) + stop("Title format is incorrectly specified: a section with not exactly 1 parameters") + + parsedVars <- lapply(varlist, function(var) + { + # Extract the specified parameter inside the {}. + # I.e. from a string like "R={BLA: BLUB}" return "BLA: BLUB" + args <- regexec("\\{(.*)\\}", var) + arg <- regmatches(var, args)[[1]][[2]] + # Split the parameter by colon if necessary + splitVar <- strsplit(arg, ": ")[[1]] + # Read the parameter value from the record + if(length(splitVar) == 2) + replaceVar <- mbrecord[[splitVar[[1]]]][[splitVar[[2]]]] + else if(length(splitVar) == 1) + replaceVar <- mbrecord[[splitVar]] + else + stop(paste( + "Title format is incorrectly specified:", var) + ) + # Fix problems: NULL returns + if(is.null(replaceVar)) + replaceVar <- "" + # Fix problems: Names will have >= 1 match. Take the first + if(length(replaceVar) > 1) + replaceVar <- replaceVar[[1]] + + # Fix problems: Unknowns might have no name + if(!length(replaceVar)){ + replaceVar <- "" + } + + # Substitute the parameter value into the string + parsedVar <- sub("\\{(.*)\\}", replaceVar, var) + return(parsedVar) + }) + title <- paste(parsedVars, collapse="; ") + return(title) +} + + +# This converts the tree-like list (as obtained e.g. from compileRecord()) +# into a plain text array, which can then be dumped to a file suitable for +# MassBank upload. +#' Write MassBank record into character array +#' +#' Writes a MassBank record in list format to a text array. +#' +#' The function is a general conversion tool for the MassBank format; i.e. the +#' field names are not fixed. \code{mbdata} must be a named list, and the +#' entries can be as follows: \itemize{ +#' \item A single text line: +#' +#' \code{'CH\$EXACT_MASS' = '329.1023'} +#' +#' is written as +#' +#' \code{CH\$EXACT_MASS: 329.1023} +#' \item A character array: +#' +#' \code{'CH\$NAME' = c('2-Aminobenzimidazole', '1H-Benzimidazol-2-amine')} +#' +#' is written as +#' +#' \code{CH\$NAME: 2-Aminobenzimidazole} +#' +#' \code{CH\$NAME: 1H-Benzimidazol-2-amine} +#' +#' \item A named list of strings: +#' +#' \code{'CH\$LINK' = list('CHEBI' = "27822", "KEGG" = "C10901")} +#' +#' is written as +#' +#' \code{CH\$LINK: CHEBI 27822} +#' +#' \code{CH\$LINK: KEGG C10901} +#' +#' \item A data frame (e.g. the peak table) is written as specified in +#' the MassBank record format (Section 2.6.3): the column names are used as +#' headers for the first line, all data rows are printed space-separated. +#' } +#' +#' @usage toMassbank(mbdata) +#' @param mbdata A MassBank record in list format. +#' @return The result is a text array, which is ready to be written to the disk +#' as a file. +#' @note The function iterates over the list item names. \bold{This means that +#' duplicate entries in \code{mbdata} are (partially) discarded!} The correct +#' way to add them is by making a character array (as specified above): Instead +#' of \code{'CH\$NAME' = 'bla', 'CH\$NAME' = 'blub'} specify \code{'CH\$NAME' = +#' c('bla','blub')}. +#' @author Michael Stravs +#' @seealso \code{\link{compileRecord}}, \code{\link{mbWorkflow}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' \dontrun{ +#' # Read just the compound info skeleton from the Internet for some compound ID +#' id <- 35 +#' mbdata <- gatherData(id) +#' #' # Export the mbdata blocks to line arrays +#' # (there is no spectrum information, just the compound info...) +#' mbtext <- toMassbank(mbdata) +#' } +#' +#' @export +toMassbank <- function (mbdata) +{ + # mbf is an array of lines and count is the line counter. + # Very old-school, but it works. :) + mbf <- character(0) + count <- 1 + lapply(names(mbdata), function(entry) + { + # If entry is a char line, add it to the file. + # If it is a named sublist, add each subentry with name + # If it is an unnamed sublist, add each subentry without name + # if it is a dataframe, write in PEAKS mode + + # Note: this is were I liked "lapply" a little too much. "for" would + # be more idiomatic, and wouldn't need the <<- assignments. + + # Data frame: table mode. A header line and one space-separated line for + # each data frame row. + if(is.data.frame(mbdata[[entry]])) + { + mbf[[count]] <<- paste(entry,": " , + paste(colnames(mbdata[[entry]]), collapse=" "), + sep='') + count <<- count+1 + for(row in 1:nrow(mbdata[[entry]])) + { + mbf[[count]] <<- paste(" ", + paste(mbdata[[entry]][row,],collapse=" "), + sep="") + count <<- count+1 + } + #browser() + } + # List with named items: Write every entry like CH$LINK: CAS 12-345-678 + else if(is.list(mbdata[[entry]]) & !is.null(names(mbdata[[entry]]))) + { + + lapply(names(mbdata[[entry]]), function(subentry) + { + if(subentry != "SPLASH"){ + mbf[[count]] <<- paste(entry,": ",subentry, " ", mbdata[[entry]][[subentry]], sep='') + } else { + mbf[[count]] <<- paste(entry,": ", mbdata[[entry]][[subentry]], sep='') + } + #print(mbf) + count <<- count + 1 + }) + } + # Array (or list) of unnamed items: Write every entry like CH$NAME: Paracetamol + # (iterative entry without subindices) + else if (length(mbdata[[entry]]) > 1 & is.null(names(mbdata[[entry]]))) + { + lapply(mbdata[[entry]], function(subentry) + { + mbf[[count]] <<- paste(entry,": ",subentry, sep='') + #print(mbf) + count <<- count + 1 + }) + } + # Length is 1: just write the entry like PK$NUM_PEAKS: 131 + else + { + mbf[[count]] <<- paste(entry,": ",mbdata[[entry]], sep='') + count <<- count + 1 + } + } + ) # End of lapply block (per child spectrum) + # Add mandatory EOF marker + mbf[[count]] <- "//" + return(mbf) +} + +# Exports compiled and massbanked spectra, with their associated molfiles, to physical files. +# "compiled" is still used here, because we need an accessible accession number. +# In the plain text arrays, the accession number is already "hidden". +# compiled: is ONE "compiled" entry, i.e. ONE compound with e.g. 14 spectra. +# files: is a return value from lapply(toMassbank), i.e. contains 14 plain-text arrays +# (for a 14-spectra method) +# molfile: a molfile from createMolfile +#' Export internally stored MassBank data to files +#' +#' Exports MassBank recfile data arrays and corresponding molfiles to physical +#' files on hard disk, for one compound. +#' +#' The data from \code{compiled} is still used here, because it contains the +#' "visible" accession number. In the plain-text format contained in +#' \code{files}, the accession number is not "accessible" anymore since it's in +#' the file. +#' +#' @usage exportMassbank(compiled, files, molfile) +#' @param compiled Is ONE "compiled" entry, i.e. ONE compound with e.g. 14 +#' spectra, as returned from \code{\link{compileRecord}}. +#' @param files A n-membered array (usually a return value from +#' \code{lapply(\link{toMassbank})}), i.e. contains n plain-text arrays with +#' MassBank records. +#' @param molfile A molfile from \code{\link{createMolfile}} +#' @return No return value. +#' @note An improvement would be to write the accession numbers into +#' \code{names(compiled)} and later into \code{names(files)} so \code{compiled} +#' wouldn't be needed here anymore. (The compound ID would have to go into +#' \code{names(molfile)}, since it is also retrieved from \code{compiled}.) +#' @author Michael Stravs +#' @seealso \code{\link{createMolfile}}, \code{\link{compileRecord}}, +#' \code{\link{toMassbank}}, \code{\link{mbWorkflow}} +#' @references MassBank record format: +#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} +#' @examples +#' \dontrun{ +#' compiled <- compileRecord(record, mbdata, refilteredRcSpecs) +#' mbfiles <- toMassbank(compiled) +#' molfile <- createMolfile(compiled[[1]][["CH$SMILES"]]) +#' exportMassbank(compiled, mbfiles, molfile) +#' } +#' +#' @export +exportMassbank <- function(compiled, files, molfile) +{ + molnames <- c() + for(file in 1:length(compiled)) + { + # Read the accession no. from the corresponding "compiled" entry + filename <- compiled[[file]]["ACCESSION"] + # use this accession no. as filename + filename <- paste(filename, ".txt", sep="") + write(files[[file]], + file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata",filename) + ) + } + # Use internal ID for naming the molfiles + if(findLevel(compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]],TRUE)=="standard"){ + molname <- sprintf("%04d", as.numeric( + compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) + molname <- paste(molname, ".mol", sep="") + write(molfile, + file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata",molname) + ) + } +} + +# Makes a list.tsv with molfile -> massbank ch$name attribution. + +#' Write list.tsv file +#' +#' Makes a list.tsv file in the "moldata" folder. +#' +#' Generates the list.tsv file which is needed by MassBank to connect records with +#' their respective molfiles. The first compound name is linked to a mol-file with +#' the compound ID (e.g. 2334.mol for ID 2334). +#' +#' @param compiled A list of compiled spectra (in tree-format, as returned by \code{compileRecord}). +#' @return No return value. +#' @author Michael A. Stravs, Eawag +#' @examples \dontrun{ +#' compiled <- compileRecord(record, mbdata, refilteredRcSpecs) +#' # a list.tsv for only one record: +#' clist <- list(compiled) +#' makeMollist(clist) +#' } +#' @export +makeMollist <- function(compiled) +{ + # For every "compiled" entry (here, compiled is not one "compiled" entry but the total + # list of all compiled spectra), extract the uppermost CH$NAME and the ID (from the + # first spectrum.) Make the ID into 0000 format. + + tsvlist <- t(sapply(compiled, function(entry) + { + name <- entry[[1]][["CH$NAME"]][[1]] + id <- sprintf("%04d", as.numeric(entry[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) + molfilename <- paste(id,".mol",sep='') + return(c(name,molfilename)) + })) + + IDs <- sapply(compiled, function(entry) return( sprintf("%04d", as.numeric(entry[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])))) + level <- sapply(IDs, findLevel, compact=TRUE) + validentries <- which(level == "standard") + # Write the file with the + write.table(tsvlist[validentries,], + paste(getOption("RMassBank")$annotations$entry_prefix,"/moldata/list.tsv", sep=''), + quote = FALSE, + sep="\t", + row.names=FALSE, + col.names=FALSE + ) +} + + +# Load a dataframe or file into additional_peaks (or add additional points in there.) +# The columns cpdID, scan, mzFound, int, OK are mandatory. OK=1 means that the peaks +# will be added into the spectrum. mzFound and int will be taken for the table. +# No annotation will be written. +# Add peaks to the spectra by hand + +#' Add additional peaks to spectra +#' +#' Loads a table with additional peaks to add to the MassBank spectra. Required +#' columns are \code{cpdID, scan, int, mzFound, OK}. +#' +#' All peaks with OK=1 will be included in the spectra. +#' +#' @usage addPeaks(mb, filename_or_dataframe) +#' @param mb The \code{mbWorkspace} to load the peaks into. +#' @param filename_or_dataframe Filename of the csv file, or name of the R +#' dataframe containing the peaklist. +#' @return The \code{mbWorkspace} with loaded additional peaks. +#' @author Michael Stravs +#' @seealso \code{\link{mbWorkflow}} +#' @examples +#' +#' \dontrun{addPeaks("myrun_additionalPeaks.csv")} +#' +#' @export +addPeaks <- function(mb, filename_or_dataframe) +{ + + errorvar <- 0 + currEnvir <- environment() + d <- 1 + + if(is.data.frame(filename_or_dataframe)) + df <- filename_or_dataframe + else + tryCatch( + df <- read.csv(filename_or_dataframe), + error=function(e){ + currEnvir$errorvar <- 1 + }) + # I change your heuristic fix to another heuristic fix, because I will have to test for a column name change... + + if(!errorvar){ + + if(ncol(df) < 2) + df <- read.csv(filename_or_dataframe, sep=";") + # here: the column int was renamed to intensity, and we need to be able to read old files. sorry. + if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df))) + df$intensity <- df$int + + cols <- c("cpdID", "scan", "mzFound", "intensity", "OK") + n <- colnames(df) + # Check if comma-separated or semicolon-separated + d <- setdiff(cols, n) + if(length(d)>0){ + stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, intensity, OK.") + } + } + + culled_df <- df[,c("cpdID", "scan", "mzFound", "intensity", "OK")] + + + if(nrow(mb@additionalPeaks) == 0) + mb@additionalPeaks <- culled_df + else + mb@additionalPeaks <- rbind(mb@additionalPeaks, culled_df) + return(mb) +} diff --git a/R/leMsMs.r b/R/leMsMs.r index 0fc02da..da3f612 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -1,2366 +1,2366 @@ - -#library(xcms) - -#' Backup \code{msmsWorkflow} results -#' -#' Writes the results from different \code{msmsWorkflow} steps to a file. -#' -#' @aliases archiveResults -#' @usage archiveResults(w, fileName, settings = getOption("RMassBank")) -#' @param w The \code{msmsWorkspace} to be saved. -#' @param fileName The filename to store the results under. -#' @param settings The settings to be stored into the msmsWorkspace image. -#' @examples -#' -#' # This doesn't really make a lot of sense, -#' # it stores an empty workspace. -#' RmbDefaultSettings() -#' w <- newMsmsWorkspace() -#' archiveResults(w, "narcotics.RData") -#' -#' @export -archiveResults <- function(w, fileName, settings = getOption("RMassBank")) -{ - # save the settings into the settings slot - w@settings <- settings - # save - save(w, file=fileName) - -} - - -#' RMassBank mass spectrometry pipeline -#' -#' Extracts and processes spectra from a specified file list, according to -#' loaded options and given parameters. -#' -#' The filenames of the raw LC-MS runs are read from the array \code{files} -#' in the global enviroment. -#' See the vignette \code{vignette("RMassBank")} for further details about the -#' workflow. -#' -#' @param w A \code{msmsWorkspace} to work with. -#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA", "pNH4"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-, [M+NH4]+). -#' @param steps Which steps of the workflow to process. See the vignette -#' \code{vignette("RMassBank")} for details. -#' @param confirmMode Defaults to false (use most intense precursor). Value 1 uses -#' the 2nd-most intense precursor for a chosen ion (and its data-dependent scans) -#' , etc. -#' @param newRecalibration Whether to generate a new recalibration curve (\code{TRUE}, default) or -#' to reuse the currently stored curve (\code{FALSE}, useful e.g. for adduct-processing runs.) -#' @param useRtLimit Whether to enforce the given retention time window. -#' @param archivename The prefix under which to store the analyzed result files. -#' @param readMethod Several methods are available to get peak lists from the files. -#' Currently supported are "mzR", "xcms", "MassBank" and "peaklist". -#' The first two read MS/MS raw data, and differ in the strategy -#' used to extract peaks. MassBank will read existing records, -#' so that e.g. a recalibration can be performed, and "peaklist" -#' just requires a CSV with two columns and the column header "mz", "int". -#' @param findPeaksArgs A list of arguments that will be handed to the xcms-method findPeaks via do.call -#' @param plots A parameter that determines whether the spectra should be plotted or not (This parameter is only used for the xcms-method) -#' @param precursorscan.cf Whether to fill precursor scans. To be used with files which for -#' some reasons do not contain precursor scan IDs in the mzML, e.g. AB Sciex converted -#' files. -#' @param settings Options to be used for processing. Defaults to the options loaded via -#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. -#' @param analyzeMethod The "method" parameter to pass to \code{\link{analyzeMsMs}}. -#' @param progressbar The progress bar callback to use. Only needed for specialized applications. -#' Cf. the documentation of \code{\link{progressBarHook}} for usage. -#' @param MSe A boolean value that determines whether the spectra were recorded using MSe or not -#' @return The processed \code{msmsWorkspace}. -#' @seealso \code{\link{msmsWorkspace-class}} -#' @author Michael Stravs, Eawag -#' @export -msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRecalibration = TRUE, - useRtLimit = TRUE, archivename=NA, readMethod = "mzR", findPeaksArgs = NULL, plots = FALSE, - precursorscan.cf = FALSE, - settings = getOption("RMassBank"), analyzeMethod = "formula", - progressbar = "progressBarHook", MSe = FALSE) -{ - .checkMbSettings() - if(!any(mode %in% c("pH","pNa","pNH4","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) - - if(!is.na(archivename)) - w@archivename <- archivename - - # Make a progress bar: - nProg <- 0 - nLen <- length(w@files) - - allUnknown <- FALSE - - # If all compounds are unknown some specific conditions apply - if(all(.listEnvEnv$listEnv$compoundList$Level == "5")){ - allUnknown <- TRUE - message("All compounds are unknown, the workflow will be adjusted accordingly") - } - - if(readMethod == "minimal"){ - ##Edit options - opt <- getOption("RMassBank") - opt$recalibrator$MS1 <- "recalibrate.identity" - opt$recalibrator$MS2 <- "recalibrate.identity" - opt$add_annotation <- FALSE - opt$multiplicityFilter <- 1 - options(RMassBank=opt) - settings <- getOption("RMassBank") - ##Edit analyzemethod - analyzeMethod <- "intensity" - } - - # clean rerun functionality: - # if any step after 3 has been run, rerunning steps 4 or below needs moving back to the parent workspace. - # However, the recalibration must be preserved, because: - # if someone runs - # w <- msmsWorkflow(w, steps=c(1:4)), - # then substitutes the recalibration - # w@rc <- myrecal - # then runs step 4 again - # w <- msmsWorkflow(w, steps=c(4), newRecalibration=FALSE) - # the rc and rc.ms1 must be preserved and not taken from the parent workspace - if(!all(steps > 4) & !is.null(w@parent)) - { - rc <- w@rc - rc.ms1 <- w@rc.ms1 - w <- w@parent - w@rc <- rc - w@rc.ms1 <- rc.ms1 - } - - # Step 1: acquire all MSMS spectra from files - if(1 %in% steps) - { - message("msmsWorkflow: Step 1. Acquire all MSMS spectra from files") - w <- msmsRead(w = w, files = w@files, readMethod=readMethod, mode=mode, confirmMode = confirmMode, useRtLimit = useRtLimit, - Args = findPeaksArgs, settings = settings, progressbar = progressbar, MSe = MSe) - } - # Step 2: first run analysis before recalibration - if(2 %in% steps) - { - nProg <- 0 - message("msmsWorkflow: Step 2. First analysis pre recalibration") - if(allUnknown){ - analyzeMethod <- "intensity" - } - pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) - w@spectra <- as(lapply(w@spectra, function(spec) { - #print(spec$id) - # if(findLevel(spec@id,TRUE) == "unknown"){ - # analyzeMethod <- "intensity" - # } else { - # analyzeMethod <- "formula" - # } - s <- analyzeMsMs(msmsPeaks = spec, mode=mode, detail=TRUE, run="preliminary", - filterSettings = settings$filterSettings, - spectraList = settings$spectraList, method = analyzeMethod) - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(s) - }), "SimpleList") - ## for(f in w@files) - ## w@spectra[[basename(as.character(f))]]@name <- basename(as.character(f)) - suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) - } - # Step 3: aggregate all spectra - if(3 %in% steps) - { - message("msmsWorkflow: Step 3. Aggregate all spectra") - w@aggregated <- aggregateSpectra(spec = w@spectra, addIncomplete=TRUE) - - if(RMassBank.env$verbose.output){ - numberOfPeaksThere <- sum(unlist(lapply(X = w@spectra, FUN = function(spec){ sum(unlist(lapply(X = spec@children, FUN = function(child){ child@peaksCount }))) }))) - if(nrow(w@aggregated) < numberOfPeaksThere) - cat(paste("### Warning ### The aggregation of spectra lead to the removal of ", (numberOfPeaksThere-nrow(w@aggregated)), " / ", numberOfPeaksThere, " peaks\n", sep = "")) - } - } - - if(allUnknown){ - w@aggregated$noise <- FALSE - w@aggregated$noise <- FALSE - w@aggregated$reanalyzed.formula <- NA - w@aggregated$reanalyzed.mzCalc <- NA - w@aggregated$reanalyzed.dppm <- NA - w@aggregated$reanalyzed.formulaCount <- NA - w@aggregated$reanalyzed.dbe <- NA - w@aggregated$matchedReanalysis <- NA - w@aggregated$filterOK <- TRUE - w@aggregated$problematicPeak <- FALSE - w@aggregated$formulaMultiplicity <- unlist(sapply(table(w@aggregated$cpdID),function(x) rep(x,x))) - return(w) - } - - - # Step 4: recalibrate all m/z values in raw spectra - if(4 %in% steps) - { - message("msmsWorkflow: Step 4. Recalibrate m/z values in raw spectra") - if(newRecalibration) - { - # note: makeRecalibration takes w as argument now, because it needs to get the MS1 spectra from @spectra - recal <- makeRecalibration(w, mode, - recalibrateBy = settings$recalibrateBy, - recalibrateMS1 = settings$recalibrateMS1, - recalibrator = settings$recalibrator, - recalibrateMS1Window = settings$recalibrateMS1Window) - w@rc <- recal$rc - w@rc.ms1 <- recal$rc.ms1 - } - w@parent <- w - w@aggregated <- data.frame() - spectra <- recalibrateSpectra(mode, w@spectra, w = w, - recalibrateBy = settings$recalibrateBy, - recalibrateMS1 = settings$recalibrateMS1) - w@spectra <- spectra - } - # Step 5: re-analysis on recalibrated spectra - if(5 %in% steps) - { - nProg <- 0 - message("msmsWorkflow: Step 5. Reanalyze recalibrated spectra") - pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) - - w@spectra <- as(lapply(w@spectra, function(spec) { - #print(spec$id) - if(findLevel(spec@id,TRUE) == "unknown"){ - analyzeMethod <- "intensity" - } else { - analyzeMethod <- "formula" - } - s <- analyzeMsMs(msmsPeaks = spec, mode=mode, detail=TRUE, run="recalibrated", - filterSettings = settings$filterSettings, - spectraList = settings$spectraList, method = analyzeMethod) - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(s) - }), "SimpleList") - ## for(f in w@files) - ## w@spectra[[basename(as.character(f))]]@name <- basename(as.character(f)) - suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) - - do.call(progressbar, list(object=pb, close=TRUE)) - } - # Step 6: aggregate recalibrated results - if(6 %in% steps) - { - message("msmsWorkflow: Step 6. Aggregate recalibrated results") - w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) - - if(RMassBank.env$verbose.output){ - numberOfPeaksThere <- sum(unlist(lapply(X = w@spectra, FUN = function(spec){ sum(unlist(lapply(X = spec@children, FUN = function(child){ child@peaksCount }))) }))) - if(nrow(w@aggregated) < numberOfPeaksThere) - cat(paste("### Warning ### The aggregation of spectra lead to the removal of ", (numberOfPeaksThere-nrow(w@aggregated)), " / ", numberOfPeaksThere, " peaks\n", sep = "")) - } - - if(!is.na(archivename)) - archiveResults(w, paste(archivename, ".RData", sep=''), settings) - w@aggregated <- cleanElnoise(peaks = w@aggregated, noise = settings$electronicNoise, width = settings$electronicNoiseWidth) - - if(RMassBank.env$verbose.output) - if(sum(w@aggregated$noise) > 0) - cat(paste("### Warning ### ", sum(w@aggregated$noise), " / ", nrow(w@aggregated), " peaks have been identified as electronic noise\n", sep = "")) - } - # Step 7: reanalyze failpeaks for (mono)oxidation and N2 adduct peaks - if(7 %in% steps) - { - message("msmsWorkflow: Step 7. Reanalyze fail peaks for N2 + O") - w@aggregated <- reanalyzeFailpeaks( - aggregated = w@aggregated, custom_additions="N2O", mode=mode, - filterSettings=settings$filterSettings, - progressbar=progressbar) - if(!is.na(archivename)) - archiveResults(w, paste(archivename, "_RA.RData", sep=''), settings) - - if(RMassBank.env$verbose.output){ - isNoFormula <- is.na(w@aggregated$formula) & is.na(w@aggregated$reanalyzed.formula) - noFormulaCount <- sum(isNoFormula) - if(noFormulaCount > 0){ - cat(paste("### Warning ### ", noFormulaCount, " / ", nrow(unique(x = w@aggregated[, c("mzFound", "intensity", "formulaCount", "dppmBest")])), " peaks have no molecular formula:\n", sep = "")) - print(w@aggregated[isNoFormula, c("mzFound","intensity","cpdID")]) - } - } - } - - # Step 8: heuristic filtering based on peak multiplicity; - # creation of failpeak list - if(8 %in% steps) - { - message("msmsWorkflow: Step 8. Peak multiplicity filtering") - if (is.null(settings$multiplicityFilter)) { - message("msmsWorkflow: Step 8. Peak multiplicity filtering skipped because multiplicityFilter parameter is not set.") - } else { - # apply heuristic filter - w@aggregated <- filterMultiplicity(w = w, archivename = archivename, mode = mode, multiplicityFilter = settings$multiplicityFilter) - - if(RMassBank.env$verbose.output){ - multiplicityNotOkCount <- sum(!w@aggregated$filterOK) - if(multiplicityNotOkCount > 0) - cat(paste("### Warning ### ", multiplicityNotOkCount, " / ", nrow(w@aggregated[, c("mzFound", "intensity", "formulaCount", "dppmBest")]), " peaks do not fulfill the multiplicity criterion\n", sep = "")) - } - - w@aggregated <- processProblematicPeaks(w, mode, archivename) - - if(RMassBank.env$verbose.output){ - problematicPeakCount <- sum(w@aggregated$problematicPeak) - if(problematicPeakCount > 0) - cat(paste("### Warning ### ", problematicPeakCount, " / ", nrow(w@aggregated), " peaks are problematic\n", sep = "")) - } - - if(!is.na(archivename)) - archiveResults(w, paste(archivename, "_RF.RData", sep=''), settings) - } - } - message("msmsWorkflow: Done.") - return(w) -} - -#' Analyze MSMS spectra -#' -#' Analyzes MSMS spectra of a compound by fitting formulas to each subpeak. -#' -#' The analysis function uses Rcdk. Note -#' that in this step, \emph{satellite peaks} are removed by a simple heuristic -#' rule (refer to the documentation of \code{\link{filterPeakSatellites}} for details.) -#' -## # @usage analyzeMsMs(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", -## # filterSettings = getOption("RMassBank")$filterSettings, -## # spectraList = getOption("RMassBank")$spectraList, method="formula") -## # -## # analyzeMsMs.formula(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", -## # filterSettings = getOption("RMassBank")$filterSettings, -## # spectraList = getOption("RMassBank")$spectraList) -## # -## # analyzeMsMs.intensity(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", -## # filterSettings = getOption("RMassBank")$filterSettings, -## # spectraList = getOption("RMassBank")$spectraList) -#' -#' @param msmsPeaks A \code{RmbSpectraSet} object. -#' Corresponds to a parent spectrum and children MSMS spectra of one compound (plus some metadata). -#' The objects are typically generated with \code{\link{findMsMsHR}}, and populate the \code{@@spectrum} slot -#' in a \code{msmsWorkspace} (refer to the corresponding -#' documentation for the precise format specifications). -#' @param mode Specifies the processing mode, i.e. which molecule species the -#' spectra contain. \code{\var{pH}} (positive H) specifies [M+H]+, -#' \code{\var{pNa}} specifies [M+Na]+, \code{\var{pM}} specifies [M]+, -#' \code{\var{mH}} and \code{\var{mNa}} specify [M-H]- and [M-Na]-, -#' respectively. (I apologize for the naming of \code{\var{pH}} which has -#' absolutely nothing to do with chemical \emph{pH} values.) -#' @param detail Whether detailed return information should be provided -#' (defaults to \code{FALSE}). See below. -#' @param run \code{"preliminary"} or \code{"recalibrated"}. In the -#' \code{preliminary} run, mass tolerance is set to 10 ppm (above m/z 120) and -#' 15 ppm (below m/z 120), the default intensity cutoff is $10^4$ for positive -#' mode (no default cutoff in negative mode), and the column \code{"mz"} from -#' the spectra is used as data source. In the \code{recalibrated} run, the -#' mass tolerance is set to 5 ppm over the whole mass range, the default cutoff -#' is 0 and the column \code{"mzRecal"} is used as source for the m/z values. -#' Defaults to \code{"preliminary"}. -#' @param filterSettings -#' Settings for the filter parameters, by default loaded from the RMassBank settings -#' set with e.g. \code{\link{loadRmbSettings}}. Must contain: -#' \itemize{ -#' \item \code{ppmHighMass}, allowed ppm deviation before recalibration -#' for high mass range -#' \item \code{ppmLowMass}, allowed ppm deviation before recalibration -#' for low mass range -#' \item \code{massRangeDivision}, division point between high and low mass -#' range (before recalibration) -#' \item \code{ppmFine}, allowed ppm deviation overall after recalibration -#' \item \code{prelimCut}, intensity cutoff for peaks in preliminary run -#' \item \code{prelimCutRatio}, relative intensity cutoff for peaks in -#' preliminary run, e.g. 0.01 = 1% -#' \item \code{fineCut}, intensity cutoff for peaks in second run -#' \item \code{fineCutRatio}, relative intensity cutoff for peaks in -#' second run -#' \item \code{specOkLimit}, minimum intensity of base peak for spectrum -#' to be accepted for processing -#' \item \code{dbeMinLimit}, minimum double bond equivalent for accepted -#' molecular subformula. -#' \item \code{satelliteMzLimit}, for satellite peak filtering -#' (\code{\link{filterPeakSatellites}}: mass window to use for satellite -#' removal -#' \item \code{satelliteIntLimit}, the relative intensity below which to -#' discard "satellites". (refer to \code{\link{filterPeakSatellites}}). -#' } -#' @param spectraList The list of MS/MS spectra present in each data block. As also -#' defined in the settings file. -#' @param method Selects which function to actually use for data evaluation. The default -#' "formula" runs a full analysis via formula assignment to fragment peaks. The -#' alternative setting "intensity" calls a "mock" implementation which circumvents -#' formula assignment and filters peaks purely based on intensity cutoffs and the -#' satellite filtering. (In this case, the ppm and dbe related settings in filterSettings -#' are ignored.) -#' @return The processed \code{RmbSpectraSet} object. -#' Added (or filled, respectively, since the slots are present before) data include -#' \item{list("complete")}{whether all spectra have useful value} -#' \item{list("empty")}{whether there are no useful spectra} -#' \item{list("children")}{ -#' The processed \code{RmbSpectrum2} objects (in a \code{RmbSpectrum2List}). -#' \itemize{ -#' \item \code{ok} if the spectrum was successfully processed with at least one resulting peak -#' \item \code{mz}, \code{intensity}: note that mz/int pairs can be duplicated when multiple matches -#' are found for one mz value, therefore the two slots are not necessarily unchanged from before -#' \item \code{rawOK} (logical) whether the m/z peak passes satellite/low removal -#' \item \code{low}, \code{satellite} if \code{TRUE}, the peak failed cutoff (\code{low}) or was removed as \code{satellite} -#' \item \code{formula}, \code{mzCalc}, \code{dppm}, \code{dbe} Formula, calculated mass, ppm deviation and dbe assigned to a peak -#' \item \code{formulaCount}, \code{dppmBest} Number of formulae matched for this m/z value and ppm deviation of the best match -#' \item \code{info} Spectrum identifying information (collision energy, resolution, collision mode) from -#' the \code{spectraList} -#' \item All other entries are retained from the original \code{RmbSpectrum2}. -#' } -#' } -#' @aliases analyzeMsMs analyzeMsMs.formula analyzeMsMs.intensity -#' @author Michael Stravs -#' @seealso \code{\link{msmsWorkflow}}, \code{\link{filterLowaccResults}}, -#' \code{\link{filterPeakSatellites}}, \code{\link{reanalyzeFailpeaks}} -#' @examples -#' -#' \dontrun{analyzed <- analyzeMsMs(spec, "pH", TRUE)} -#' -#' @export -analyzeMsMs <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", - filterSettings = getOption("RMassBank")$filterSettings, - spectraList = getOption("RMassBank")$spectraList, method="formula") -{ - ## .RmbSpectraSet <- setClass("RmbSpectraSet", - ## representation = representation( - ## parent = "Spectrum1", - ## children = "RmbSpectrum2List", - ## # These are done as slots and not as S4 functions, because they are set during the workflow - ## # in "checking" steps. It's easier. - ## found = "logical", - ## complete = "logical", - ## empty = "logical", - ## formula = "character", - ## id = "integer", - ## mz = "numeric", - ## name = "character", - ## annotations = "list" - ## ), - ## prototype = prototype( - ## parent = new("Spectrum1"), - ## children = new("RmbSpectrum2List"), - ## found = FALSE, - ## complete = NA, - ## empty = NA, - ## formula = character(), - ## id = integer(), - ## mz = numeric(), - ## name = character(), - ## annotations = list() - ## ) - ## ); - .checkMbSettings() - - - # Check whether the spectra can be fitted to the spectra list correctly! - if(length(msmsPeaks@children) != length(spectraList)) - { - warning(paste0( - "The spectra count of the substance ", msmsPeaks@id, " (", length(msmsPeaks@children), " spectra) doesn't match the provided spectra list (", length(spectraList), " spectra)." - )) - msmsPeaks@found <- FALSE - return(msmsPeaks) - - } - - if(msmsPeaks@found == FALSE) - return(msmsPeaks) - - if(method=="formula") - { - r <- (analyzeMsMs.formula(msmsPeaks, mode, detail, run, filterSettings - )) - } - else if(method == "intensity") - { - r <- (analyzeMsMs.intensity(msmsPeaks, mode, detail, run, filterSettings - )) - } - - # Add the spectrum labels to the spectra here. - # If there is any better place to do this, please tell me. I hate it. - # However, the info should be added in msmsWorkflow not in mbWorkflow, because two msmsWorkspaces with different spectraLists can be - # merged together with all the combine / pack stuff. - children <- mapply(function(spec, info) - { - spec@info <- info - spec - }, r@children, spectraList, SIMPLIFY=FALSE) - r@children <- as(children, "SimpleList") - - - #nspectra <- length(spectraList) - ok <- unlist(lapply(r@children, function(c) c@ok)) - r@complete <- FALSE - r@empty <- FALSE - if(all(ok)) - r@complete <- TRUE - if(all(!ok)) - r@empty <- TRUE - return(r) -} - - -#' @describeIn analyzeMsMs Analyze the peaks using formula annotation -#' @export -analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", - filterSettings = getOption("RMassBank")$filterSettings) -{ - cut <- 0 - cut_ratio <- 0 - if(run=="preliminary") - { - filterMode <- "coarse" - cut <- filterSettings$prelimCut - if(is.na(cut)) - { - if(mode %in% c("pH", "pM", "pNa", "pNH4")) - cut <- 1e4 - else if(mode %in% c("mH", "mFA","mM")) - cut <- 0 - else stop(paste("The ionization mode", mode, "is unknown.")) - } - cutRatio <- filterSettings$prelimCutRatio - } - else - { - filterMode <- "fine" - cut <- filterSettings$fineCut - cut_ratio <- filterSettings$fineCutRatio - if(is.na(cut)) cut <- 0 - } - - # find whole spectrum of parent peak, so we have reasonable data to feed into - # MolgenMsMs - parentSpectrum <- msmsPeaks@parent - - - # On each spectrum the following function analyzeTandemShot will be applied. - # It takes the raw peaks matrix as argument (mz, int) and processes the spectrum by - # filtering out low-intensity (<1e4) and shoulder peaks (deltam/z < 0.5, intensity - # < 5%) and subsequently matching the peaks to formulas using Rcdk, discarding peaks - # with insufficient match accuracy or no match. - analyzeTandemShot <- function(child) - { - childIdx <- which(sapply(X = seq_along(msmsPeaks@children), FUN = function(i){ - all(child@mz == msmsPeaks@children[[i]]@mz) & all(child@rt == msmsPeaks@children[[i]]@rt) & all(child@intensity == msmsPeaks@children[[i]]@intensity) } - )) - shot <- getData(child) - shot$row <- which(!is.na(shot$mz)) - - - # Filter out low intensity peaks: - child@low <- (shot$intensity < cut) | (shot$intensity < max(shot$intensity)*cut_ratio) - shot <- shot[!child@low,,drop=FALSE] - shot_full <- shot - - # Is there still anything left? - if(length(which(!child@low))==0) - { - child@ok <- FALSE - - if(RMassBank.env$verbose.output) - cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' contains only low-intensity peaks\n", sep = "")) - - return(child) - } - - # Filter out satellite peaks: - shot <- filterPeakSatellites(shot, filterSettings) - child@satellite <- rep(TRUE, child@peaksCount) - child@satellite[which(child@low == TRUE)] <- NA - child@satellite[shot$row] <- FALSE - - # Is there still anything left? - if(nrow(shot)==0) - { - child@ok <- FALSE - - if(RMassBank.env$verbose.output) - cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' contains no peaks after satellite filtering\n", sep = "")) - - return(child) - } - - if(max(shot$intensity) < as.numeric(filterSettings$specOkLimit)) - { - child@ok <- FALSE - - if(RMassBank.env$verbose.output) - cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is discarded due to parameter 'specOkLimit'\n", sep = "")) - - return(child) - } - - # Crop to 4 digits (necessary because of the recalibrated values) - # this was done for the MOLGEN MSMS type analysis, is not necessary anymore now (23.1.15 MST) - # shot[,mzColname] <- round(shot[,mzColname], 5) - - # here follows the Rcdk analysis - #------------------------------------ - parentPeaks <- data.frame(mzFound=msmsPeaks@mz, - formula=msmsPeaks@formula, - dppm=0, - x1=0,x2=0,x3=0) - - # define the adduct additions - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else if(mode == "pNH4") { - allowed_additions <- "NH4" - mode.charge <- 1 - } else{ - stop("mode = \"", mode, "\" not defined") - } - - # the ppm range is two-sided here. - # The range is slightly expanded because dppm calculation of - # generate.formula starts from empirical mass, but dppm cal- - # culation of the evaluation starts from theoretical mass. - # So we don't miss the points on 'the border'. - - if(run=="preliminary") - ppmlimit <- 2 * max(filterSettings$ppmLowMass, filterSettings$ppmHighMass) - else - ppmlimit <- 2.25 * filterSettings$ppmFine - - parent_formula <- add.formula(msmsPeaks@formula, allowed_additions) - dbe_parent <- dbe(parent_formula) - # check whether the formula is valid, i.e. has no negative or zero element numbers. - #print(parent_formula) - if(!is.valid.formula(parent_formula)) - { - child@ok <- FALSE - - if(RMassBank.env$verbose.output) - cat(paste("### Warning ### The precursor ion formula of spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is invalid\n", sep = "")) - - return(child) - } - - limits <- to.limits.rcdk(parent_formula) - - peakmatrix <- lapply( - split(shot,shot$row) - , function(shot.row) { - # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae - # finally back-correct calculated masses for the charge - mass <- shot.row[["mz"]] - mass.calc <- mass + mode.charge * .emass - peakformula <- tryCatch(suppressWarnings(generate.formula(mass = mass.calc, window = ppm(mass.calc, ppmlimit, p=TRUE), - elements = limits, charge=0)), error=function(e) NA) - #peakformula <- tryCatch( - # generate.formula(mass, - # ppm(mass, ppmlimit, p=TRUE), - # limits, charge=1), - #error= function(e) list()) - - if(!is.list(peakformula)) - return(t(c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, - formula=NA, mzCalc=NA))) - else - { - return(t(sapply(peakformula, function(f) - { - mzCalc <- f@mass - mode.charge * .emass - c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, - formula=f@string, - mzCalc=mzCalc) - }))) - } - }) - - childPeaks <- as.data.frame(do.call(rbind, peakmatrix)) - - # Reformat the deformatted output correctly (why doesn't R have a better way to do this, e.g. avoid deformatting?) - - childPeaks$row <- as.numeric(as.character(childPeaks$row)) - childPeaks$intensity <- as.numeric(as.character(childPeaks$intensity)) - childPeaks$mz <- as.numeric(as.character(childPeaks$mz)) - childPeaks$formula <- as.character(childPeaks$formula) - childPeaks$mzCalc <- as.numeric(as.character(childPeaks$mzCalc)) - childPeaks$dppm <- (childPeaks$mz / childPeaks$mzCalc - 1) * 1e6 - childPeaks$dbe <- unlist(lapply(childPeaks$formula, dbe)) - - # childPeaks now contains all the good and unmatched peaks - # but not the ones which were cut as satellites or below threshold. - - ## child@mzFound <- rep(NA, child@peaksCount) - ## child@mzFound[childPeaks$row] <- as.numeric(as.character(childPeaks$mzFound)) - ## - ## child@formula <- rep(NA, child@peaksCount) - ## child@formula[childPeaks$row] <- as.character(childPeaks$formula) - ## - ## child@mzCalc <- rep(NA, child@peaksCount) - ## child@mzCalc[childPeaks$row] <- as.numeric(as.character(childPeaks$mzCalc)) - ## - ## child@dppm<- rep(NA, child@peaksCount) - ## child@dppm[childPeaks$row] <- (childPeaks$mzFound / childPeaks$mzCalc - 1) * 1e6 - # delete the NA data out again, because MolgenMsMs doesn't have them - # here and they will be re-added later - # (this is just left like this for "historical" reasons) - #childPeaks <- childPeaks[!is.na(childPeaks$formula),] - # check if a peak was recognized (here for the first time, - # otherwise the next command would fail) - - if(nrow(childPeaks)==0) - { - child@ok <- FALSE - - if(RMassBank.env$verbose.output) - cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is empty\n", sep = "")) - - return(child) - } - - # now apply the rule-based filters to get rid of total junk: - # dbe >= -0.5, dbe excess over mother cpd < 3 - # dbe() has been adapted to return NA for NA input - #iff_rcdk_pM_eln$maxvalence <- unlist(lapply(diff_rcdk_pM_eln$formula.rcdk, maxvalence)) - temp.child.ok <- (childPeaks$dbe >= filterSettings$dbeMinLimit) - # & dbe < dbe_parent + 3) - # check if a peak was recognized - if(length(which(temp.child.ok)) == 0) - { - child@ok <- FALSE - return(child) - } - #browser() - # find the best ppm value - bestPpm <- aggregate(as.data.frame(childPeaks[!is.na(childPeaks$dppm),"dppm"]), - list(childPeaks[!is.na(childPeaks$dppm),"row"]), - function(dppm) dppm[[which.min(abs(dppm))]]) - colnames(bestPpm) <- c("row", "dppmBest") - childPeaks <- merge(childPeaks, bestPpm, by="row", all.x=TRUE) - - # Deactivated the following lines because we never actually want to look at the "old" formula count. - # To be verified (cf Refiltering, failpeak list and comparable things) - - ## # count formulas found per mass - ## countFormulasTab <- xtabs( ~formula + mz, data=childPeaks) - ## countFormulas <- colSums(countFormulasTab) - ## childPeaks$formulaCount <- countFormulas[as.character(childPeaks$row)] - - # filter results - childPeaksFilt <- filterLowaccResults(childPeaks, filterMode, filterSettings) - childPeaksGood <- childPeaksFilt[["TRUE"]] - childPeaksBad <- childPeaksFilt[["FALSE"]] - if(is.null(childPeaksGood)){ - childPeaksGood <- childPeaks[c(),,drop=FALSE] - childPeaksGood$good <- logical(0) - } - if(is.null(childPeaksBad)) - childPeaksBad <- childPeaks[c(),,drop=FALSE] - childPeaksUnassigned <- childPeaks[is.na(childPeaks$dppm),,drop=FALSE] - childPeaksUnassigned$good <- rep(FALSE, nrow(childPeaksUnassigned)) - # count formulas within new limits - # (the results of the "old" count stay in childPeaksInt and are returned - # in $childPeaks) - countFormulasTab <- xtabs( ~formula + mz, data=childPeaksGood) - countFormulas <- colSums(countFormulasTab) - childPeaksGood$formulaCount <- countFormulas[as.character(childPeaksGood$mz)] - - childPeaksUnassigned$formulaCount <- rep(NA, nrow(childPeaksUnassigned)) - childPeaksBad$formulaCount <- rep(NA, nrow(childPeaksBad)) - childPeaksBad$good <- rep(FALSE, nrow(childPeaksBad)) - - # Now: childPeaksGood (containing the new, recounted peaks with good = TRUE), and childPeaksBad (containing the - # peaks with good=FALSE, i.e. outside filter criteria, with the old formula count even though it is worthless) - # are bound together. - childPeaksBad <- childPeaksBad[,colnames(childPeaksGood),drop=FALSE] - childPeaksUnassigned <- childPeaksUnassigned[,colnames(childPeaksGood),drop=FALSE] - childPeaks <- rbind(childPeaksGood, childPeaksBad, childPeaksUnassigned) - - # Now let's cross fingers. Add a good=NA column to the unmatched peaks and reorder the columns - # to match order in childPeaks. After that, setData to the child slot. - - childPeaksOmitted <- getData(child) - childPeaksOmitted <- childPeaksOmitted[child@low | child@satellite,,drop=FALSE] - childPeaksOmitted$rawOK <- rep(FALSE, nrow(childPeaksOmitted)) - childPeaksOmitted$good <- rep(FALSE, nrow(childPeaksOmitted)) - childPeaksOmitted$dppm <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$formula <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$mzCalc <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$dbe <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$dppmBest <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$formulaCount <- rep(0, nrow(childPeaksOmitted)) - childPeaks$satellite <- rep(FALSE, nrow(childPeaks)) - childPeaks$low <- rep(FALSE, nrow(childPeaks)) - childPeaks$rawOK <- rep(TRUE, nrow(childPeaks)) - - childPeaks <- childPeaks[,colnames(childPeaksOmitted), drop=FALSE] - - childPeaksTotal <- rbind(childPeaks, childPeaksOmitted) - child <- setData(child, childPeaksTotal) - child@ok <- TRUE - - return(child) - } - - # I believe these lines were fixed to remove a warning but in the refactored workflow "mzranges" doesn't exist anymore. - # Leave here for now - ## mzranges <- t(sapply(shots, function(p) { - ## if(!is.null(p$childRaw)){ - ## return(range(p$childRaw[,mzColname])) - ## } else { - ## return(c(NA,NA)) - ## } - ## })) - ## - ## mzmin <- min(mzranges[,1], na.rm=TRUE) - ## mzmax <- max(mzranges[,2], na.rm=TRUE) - children <- lapply(msmsPeaks@children, analyzeTandemShot) - - - - -## shots <- mapply(function(shot, scan, info) - ## { - ## shot$scan <- scan - ## shot$info <- info - ## shot$header <- msmsPeaks$childHeaders[as.character(scan),] - ## return(shot) - ## }, shots, msmsPeaks$childScans, spectraList, SIMPLIFY=FALSE) - msmsPeaks@children <- as(children, "SimpleList") - return(msmsPeaks) -} - - -#' @describeIn analyzeMsMs Analyze the peaks going only by intensity values -#' @export -analyzeMsMs.intensity <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", - filterSettings = getOption("RMassBank")$filterSettings) -{ - cut <- 0 - cut_ratio <- 0 - if(run=="preliminary") - { - filterMode <- "coarse" - cut <- filterSettings$prelimCut - if(is.na(cut)) - { - if(mode %in% c("pH", "pM", "pNa", "pNH4")) - cut <- 1e4 - else if(mode %in% c("mH", "mFA", "mM")) - cut <- 0 - else stop(paste("The ionization mode", mode, "is unknown.")) - } - cutRatio <- filterSettings$prelimCutRatio - } - else - { - filterMode <- "fine" - cut <- filterSettings$fineCut - cut_ratio <- filterSettings$fineCutRatio - if(is.na(cut)) cut <- 0 - } - - # find whole spectrum of parent peak, so we have reasonable data to feed into - - - # On each spectrum the following function analyzeTandemShot will be applied. - # It takes the raw peaks matrix as argument (mz, int) and processes the spectrum by - # filtering out low-intensity (<1e4) and shoulder peaks (deltam/z < 0.5, intensity - # < 5%) and subsequently matching the peaks to formulas using Rcdk, discarding peaks - # with insufficient match accuracy or no match. - analyzeTandemShot <- function(child) - { - shot <- getData(child) - shot$row <- which(!is.na(shot$mz)) - - # Filter out low intensity peaks: - child@low <- (shot$intensity < cut) | (shot$intensity < max(shot$intensity)*cut_ratio) - shot_full <- shot - shot <- shot[!child@low,,drop=FALSE] - - - # Is there still anything left? - if(length(which(!child@low))==0) - { - child@ok <- FALSE - return(child) - } - - # Filter out satellite peaks: - shot <- filterPeakSatellites(shot, filterSettings) - child@satellite <- rep(TRUE, child@peaksCount) - child@satellite[which(child@low == TRUE)] <- NA - child@satellite[shot$row] <- FALSE - - # Is there still anything left? - if(nrow(shot)==0) - { - child@ok <- FALSE - return(child) - } - - if(max(shot$intensity) < as.numeric(filterSettings$specOkLimit)) - { - child@ok <- FALSE - return(child) - } - - - # here follows the fake analysis - #------------------------------------ - parentPeaks <- data.frame(mzFound=msmsPeaks@mz, - formula=msmsPeaks@formula, - dppm=0, - x1=0,x2=0,x3=0) - - childPeaks <- addProperty(shot_full, "rawOK", "logical", FALSE) - childPeaks[!(child@low | child@satellite),"rawOK"] <- TRUE - - childPeaks <- addProperty(childPeaks, "good", "logical", FALSE) - childPeaks[childPeaks$rawOK,"good"] <- TRUE - - childPeaks <- addProperty(childPeaks, "mzCalc", "numeric") - childPeaks[childPeaks$rawOK,"mzCalc"] <- childPeaks[childPeaks$rawOK,"mz"] - - childPeaks <- addProperty(childPeaks, "formula", "character") - childPeaks[childPeaks$rawOK,"formula"] <- "" - - childPeaks <- addProperty(childPeaks, "dbe", "numeric") - childPeaks[childPeaks$rawOK,"dbe"] <- 0 - - childPeaks <- addProperty(childPeaks, "formulaCount", "integer") - childPeaks[childPeaks$rawOK,"formulaCount"] <- 1 - - childPeaks <- addProperty(childPeaks, "dppm", "numeric") - childPeaks[childPeaks$rawOK,"dppm"] <- 0 - - childPeaks <- addProperty(childPeaks, "dppmBest", "numeric") - childPeaks[childPeaks$rawOK,"dppmBest"] <- 0 - - child <- setData(child, childPeaks) - child@ok <- TRUE - return(child) - } - children <- lapply(msmsPeaks@children, analyzeTandemShot) - msmsPeaks@children <- as(children, "SimpleList") - #browser() - - return(msmsPeaks) - - # Omit all the stuff below for now, I don't believe it is needed. One thing is that spectraList info will have to be added somewhere else. - ## shots <- mapply(function(shot, scan, info) - ## { - ## shot$scan <- scan - ## shot$info <- info - ## shot$header <- msmsPeaks$childHeaders[as.character(scan),] - ## return(shot) - ## }, shots, msmsPeaks$childScans, spectraList, SIMPLIFY=FALSE) - ## - ## mzranges <- t(sapply(shots, function(p) {return(range(p$childRaw[,mzColname]))})) - ## mzmin <- min(mzranges[,1], na.rm=TRUE) - ## mzmax <- max(mzranges[,2], na.rm=TRUE) - ## - ## return(list( - ## msmsdata=shots, - ## mzrange=c(mzmin, mzmax), - ## id=msmsPeaks$id, - ## mode=mode, - ## parentHeader = msmsPeaks$parentHeader, - ## parentMs = msmsPeaks$parentPeak, - ## formula = msmsPeaks$formula, - ## foundOK = TRUE)) -} - - -#' Filter peaks with low accuracy -#' -#' Filters a peak table (with annotated formulas) for accuracy. Low-accuracy -#' peaks are removed. -#' -#' In the \code{coarse} mode, mass tolerance is set to 10 ppm (above m/z 120) -#' and 15 ppm (below m/z 120). This is useful for formula assignment before -#' recalibration, where a wide window is desirable to accomodate the high mass -#' deviations at low m/z values, so we get a nice recalibration curve. -#' -#' In the \code{fine} run, the mass tolerance is set to 5 ppm over the whole -#' mass range. This should be applied after recalibration. -#' -#' @usage filterLowaccResults(peaks, mode="fine", filterSettings = getOption("RMassBank")$filterSettings) -#' @param peaks A data frame with at least the columns \code{mzFound} and -#' \code{dppm}. -#' @param mode \code{coarse} or \code{fine}, see below. -#' @param filterSettings Settings for filtering. For details, see documentation of -#' \code{\link{analyzeMsMs}} -#' @return A \code{list(TRUE = goodPeakDataframe, FALSE = badPeakDataframe)} is -#' returned: A data frame with all peaks which are "good" is in -#' \code{return[["TRUE"]]}. -#' @author Michael Stravs -#' @seealso \code{\link{analyzeMsMs}}, \code{\link{filterPeakSatellites}} -#' @examples -#' -#' # from analyzeMsMs: -#' \dontrun{childPeaksFilt <- filterLowaccResults(childPeaksInt, filterMode)} -#' -#' -filterLowaccResults <- function(peaks, mode="fine", filterSettings = getOption("RMassBank")$filterSettings) -{ - # Check if filter settings are properly set, otherwise use defaults - if(is.null(filterSettings)) - { - filterSettings <- list( - ppmHighMass = 10, - ppmLowMass = 15, - massRangeDivision = 120, - ppmFine = 5) - } - - peaks$good = NA - peaks[!is.na(peaks$dppm), "good"] <- TRUE - - # coarse mode: to use for determinating the recalibration function - if(mode=="coarse") - { - if(nrow(peaks[which(abs(peaks$dppm) > filterSettings$ppmHighMass),])>0) - peaks[which(abs(peaks$dppm) > filterSettings$ppmHighMass), "good"] <- FALSE - if(nrow(peaks[which(peaks$mz > filterSettings$massRangeDivision & abs(peaks$dppm) > filterSettings$ppmLowMass),])>0) - peaks[which(peaks$mz > filterSettings$massRangeDivision & abs(peaks$dppm) > filterSettings$ppmLowMass), "good"] <- FALSE - } - # fine mode: for use after recalibration - else - { - if(nrow(peaks[which(abs(peaks$dppm) > filterSettings$ppmFine),]) > 0) - peaks[which(abs(peaks$dppm) > filterSettings$ppmFine), "good"] <- FALSE - } - return(split(peaks, peaks$good)) -} - -#' Aggregate analyzed spectra -#' -#' Groups an array of analyzed spectra and creates aggregated peak tables -#' -#' \code{\var{addIncomplete}} is relevant for recalibration. For recalibration, -#' we want to use only high-confidence peaks, therefore we set -#' \code{\var{addIncomplete}} to \code{FALSE}. When we want to generate a peak -#' list for actually generating MassBank records, we want to include all peaks -#' into the peak tables. -#' -#' @usage aggregateSpectra(spec, addIncomplete=FALSE) -#' @param spec The \code{RmbSpectraSetList} of spectra sets (\code{RmbSpectraSet} objects) to aggregate -#' @param addIncomplete Whether or not the peaks from incomplete files (files -#' for which less than the maximal number of spectra are present) -#' @return -#' A summary \code{data.frame} with all peaks (possibly multiple rows for one m/z value from a spectrum, see below) with columns: -#' \item{mzFound, intensity}{Mass and intensity of the peak} -#' \item{good}{if the peak passes filter criteria} -#' \item{mzCalc, formula, dbe, dppm}{calculated mass, formula, dbe and ppm deviation of the assigned formula} -#' \item{formulaCount, dppmBest}{Number of matched formulae for this m/z value, and ppm deviation of the best match} -#' \item{scan, cpdID, parentScan}{Scan number of the child and parent spectrum in the raw file, also the compound ID to which the peak belongs} -#' \item{dppmRc}{ppm deviation recalculated from the aggregation function} -#' \item{index}{Aggregate-table peak index, so the table can be subsetted, edited and results reinserted back into this table easily} -#' Further columns are later added by workflow steps 6 (electronic noise culler), 7 and 8. -#' -#' @author Michael Stravs -#' @seealso \code{\link{msmsWorkflow}}, \code{\link{analyzeMsMs}} -#' @examples -#' -#' ## As used in the workflow: -#' \dontrun{% -#' w@@spectra <- lapply(w@@spectra, function(spec) -#' analyzeMsMs(spec, mode="pH", detail=TRUE, run="recalibrated", cut=0, cut_ratio=0 ) ) -#' w@@aggregate <- aggregateSpectra(w@@spectra) -#' } -#' -#' @export -aggregateSpectra <- function(spec, addIncomplete=FALSE) -{ - - if(addIncomplete) - aggSpectra <- selectSpectra(spec, "found", "object") - else - aggSpectra <- selectSpectra(spec, "complete", "object") - - compoundTables <- lapply(aggSpectra, function(s) - { - tables.c <- lapply(s@children, function(c) - { - table.c <- getData(c) - table.c <- table.c[table.c$rawOK,,drop=FALSE] - # remove superfluous columns, since only rawOK peaks are selected anyway - table.c$rawOK <- NULL - table.c$low <- NULL - table.c$satellite <- NULL - # add scan no - table.c$scan <- rep(c@acquisitionNum, nrow(table.c)) - return(table.c) - }) - table.cpd <- do.call(rbind, tables.c) - table.cpd$cpdID <- rep(s@id, nrow(table.cpd)) - table.cpd$parentScan <- rep(s@parent@acquisitionNum, nrow(table.cpd)) - return(table.cpd) - }) - #return(compoundTables) - aggTable <- do.call(rbind, compoundTables) - colnames(aggTable)[1] <- "mzFound" - - aggTable <- addProperty(aggTable, "dppmRc", "numeric") - aggTable <- addProperty(aggTable, "index", "integer") - if(nrow(aggTable) > 0) - aggTable$index <- 1:nrow(aggTable) - - aggTable[aggTable$good, "dppmRc"] <- (aggTable[aggTable$good, "mzFound"]/aggTable[aggTable$good, "mzCalc"] - 1)*1e6 - - - return(aggTable) -} - -#' Remove electronic noise -#' -#' Removes known electronic noise peaks from a peak table -#' -#' @usage cleanElnoise(peaks, noise=getOption("RMassBank")$electronicNoise, -#' width = getOption("RMassBank")$electronicNoiseWidth) -#' @param peaks An aggregated peak frame as described in \code{\link{aggregateSpectra}}. Columns -#' \code{mzFound}, \code{dppm} and \code{dppmBest} are needed. -#' @param noise A numeric vector of known m/z of electronic noise peaks from the instrument -#' Defaults to the entries in the RMassBank settings. -#' @param width The window for the noise peak in m/z units. Defaults to the entries in -#' the RMassBank settings. -#' @return Extends the aggregate data frame by column \code{noise} (logical), which is \code{TRUE} if the peak is marked as noise. -#' -#' @author Michael Stravs -#' @seealso \code{\link{msmsWorkflow}} -#' @examples -#' # As used in the workflow: -#' \dontrun{ -#' w@@aggregated <- -#' cleanElnoise(w@@aggregated) -#' } -#' @export -cleanElnoise <- function(peaks, noise=getOption("RMassBank")$electronicNoise, - width = getOption("RMassBank")$electronicNoiseWidth) -{ - peaks <- addProperty(peaks, "noise", "logical", FALSE) - - # I don't think this makes sense if using one big table... - ## # use only best peaks - ## p_best <- peaks[is.na(peaks$dppmBest) | (peaks$dppm == peaks$dppmBest),] - - # remove known electronic noise - p_eln <- peaks - for(noisePeak in noise) - { - noiseMatches <- which(!((p_eln$mzFound > noisePeak + width) | (p_eln$mzFound < noisePeak - width))) - if(length(noiseMatches) > 0) - p_eln[noiseMatches, "noise"] <- TRUE - } - return(p_eln) -} - -#' Identify intense peaks (in a list of unmatched peaks) -#' -#' Finds a list of peaks in spectra with a high relative intensity (>10% and -#' 1e4, or >1% and 1e5) to write a list of peaks which must be manually -#' checked. Peaks orbiting around the parent peak mass (calculated from the -#' compound ID), which are very likely co-isolated substances, are ignored. -#' -#' -#' @usage problematicPeaks(peaks_unmatched, peaks_matched, mode = "pH") -#' @param peaks_unmatched Table of unmatched peaks, with at least \code{cpdID, -#' scan, mzFound, int}. -#' @param peaks_matched Table of matched peaks (used for base peak reference), -#' with at least \code{cpdID, scan, int}. -#' @param mode Processing mode (\code{"pH", "pNa"} etc.) -#' @return A filtered table with the potentially problematic peaks, including -#' the precursor mass and MSMS base peak intensity (\code{aMax}) for reference. -#' @author Michael Stravs -#' @seealso \code{\link{msmsWorkflow}} -#' @examples \dontrun{ -#' # As used in the workflow: -#' fp <- problematicPeaks(specs[!specs$filterOK & !specs$noise & -#' ((specs$dppm == specs$dppmBest) | (is.na(specs$dppmBest))) -#' ,,drop=FALSE], peaksMatched(w), mode) -#' } -#' @export -problematicPeaks <- function(peaks_unmatched, peaks_matched, mode="pH") -{ - # find spectrum maximum for each peak, and merge into table - if(nrow(peaks_matched) == 0){ - assIntMax <- data.frame(list(integer(0),integer(0),integer(0))) - } else{ - assIntMax <- as.data.frame(aggregate(as.data.frame(peaks_matched$intensity), - by=list(peaks_matched$cpdID, peaks_matched$scan), max)) - } - colnames(assIntMax) <- c("cpdID", "scan", "aMax") - peaks_unmatched <- merge(peaks_unmatched, assIntMax) - # which of these peaks are intense? - p_control <- peaks_unmatched[ - ( (peaks_unmatched$intensity > 1e5) & - (peaks_unmatched$intensity > 0.01*peaks_unmatched$aMax)) - | ( (peaks_unmatched$intensity > 1e4) & - (peaks_unmatched$intensity > 0.1* peaks_unmatched$aMax)) ,] - # find parent m/z to exclude co-isolated peaks - #p_control$mzCenter <- numeric(nrow(p_control)) - p_control$mzCenter <- as.numeric( - unlist(lapply(p_control$cpdID, function(id) findMz(id, mode, retrieval=findLevel(id,TRUE))$mzCenter)) ) - p_control_noMH <- p_control[ - (p_control$mzFound < p_control$mzCenter - 1) | - (p_control$mzFound > p_control$mzCenter + 1),] - return(p_control_noMH) -} - - -#' Generate list of problematic peaks -#' -#' Generates a list of intense unmatched peaks for further review (the "failpeak list") and exports it if the archive name is given. -#' -#' @param w \code{msmsWorkspace} to analyze. -#' @param mode Processing mode (pH etc) -#' @param archivename Base name of the archive to write to (for "abc" the exported failpeaks list will be "abc_Failpeaks.csv"). -#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" -#' if the only know thing is the m/z -#' @return Returns the aggregate data.frame with added column "\code{problematic}" (logical) which marks peaks which match the problematic criteria -#' -#' @author stravsmi -#' @export -processProblematicPeaks <- function(w, mode, archivename = NA) -{ - - specs <- w@aggregated - fp <- problematicPeaks(specs[!specs$filterOK & !specs$noise & - ((specs$dppm == specs$dppmBest) | (is.na(specs$dppmBest))) - ,,drop=FALSE], peaksMatched(w), mode) - fp$OK <- rep('', nrow(fp)) - fp$name <- rownames(fp) - - fp <- fp[with(fp, - order(cpdID, mzCalc, scan)), - ] - - # Select the correct precursor scans. This serves to filter the list - # for the cases where multiple workspaces were combined after step 7 - # with combineMultiplicities. - # Note that this has drawbacks. Leaving the "duplicates" in would make it more easy - # to identify legitimate unformulaed peaks. We might experiment by marking them up - # somehow. - precursors <- unlist(lapply(selectSpectra(w, "found", "object"), function(s) s@parent@acquisitionNum)) - fp <- fp[ - fp$parentScan %in% precursors - ,] - - # Add the info to specs - specs <- addProperty(specs, "problematicPeak", "logical", FALSE) - specs[match(fp$index, specs$index),"problematicPeak"] <- TRUE - - # Select the columns for output into the failpeaks file - fp <- fp[,c("OK", "name", "cpdID", "scan", "mzFound", "formula", - "reanalyzed.formula", "mzCalc", "reanalyzed.mzCalc", "dppm", "reanalyzed.dppm", "dbe", "reanalyzed.dbe", "intensity", - "formulaCount", "reanalyzed.formulaCount", "parentScan", "aMax", "mzCenter")] - if(!is.na(archivename)) - write.csv(fp, file= - paste(archivename,"_Failpeaks.csv", sep=''), row.names=FALSE) - - return(specs) -} - -#' Recalibrate MS/MS spectra -#' -#' Recalibrates MS/MS spectra by building a recalibration curve of the -#' assigned putative fragments of all spectra in \code{aggregatedSpecs} -#' (measured mass vs. mass of putative associated fragment) and additionally -#' the parent ion peaks. -#' -#' Note that the actually used recalibration functions are governed by the -#' general MassBank settings (see \code{\link{recalibrate}}). -#' -#' If a set of acquired LC-MS runs contains spectra for two different ion types -#' (e.g. [M+H]+ and [M+Na]+) which should both be processed by RMassBank, it is -#' necessary to do this in two separate runs. Since it is likely that one ion type -#' will be the vast majority of spectra (e.g. most in [M+H]+ mode), and only few -#' spectra will be present for other specific adducts (e.g. only few [M+Na]+ spectra), -#' it is possible that too few spectra are present to build a good recalibration curve -#' using only e.g. the [M+Na]+ ions. Therefore we recommend, for one set of LC/MS runs, -#' to build the recalibration curve for one ion type -#' (\code{msmsWorkflow(mode="pH", steps=c(1:8), newRecalibration=TRUE)}) -#' and reuse the same curve for processing different ion types -#' (\code{msmsWorkflow(mode="pNa", steps=c(1:8), newRecalibration=FALSE)}). -#' This also ensures a consistent recalibration across all spectra of the same batch. -#' -#' @usage makeRecalibration(w, mode, -#' recalibrateBy = getOption("RMassBank")$recalibrateBy, -#' recalibrateMS1 = getOption("RMassBank")$recalibrateMS1, -#' recalibrator = getOption("RMassBank")$recalibrator, -#' recalibrateMS1Window = getOption("RMassBank")$recalibrateMS1Window -#' ) -#' -#' recalibrateSpectra(mode, rawspec = NULL, rc = NULL, rc.ms1=NULL, w = NULL, -#' recalibrateBy = getOption("RMassBank")$recalibrateBy, -#' recalibrateMS1 = getOption("RMassBank")$recalibrateMS1) -#' -#' recalibrateSingleSpec(spectrum, rc, -#' recalibrateBy = getOption("RMassBank")$recalibrateBy) -#' @aliases makeRecalibration recalibrateSpectra recalibrateSingleSpec -#' @param w For \code{makeRecalibration}: to perform the recalibration with. For \code{recalibrateSpectra}: -#' the \code{msmsWorkspace} which contains the recalibration curves (alternatively to specifying \code{rc, rc.ms1}). -#' @param spectrum For \code{recalibrateSingleSpec}: -#' a \code{MSnbase} \code{Spectrum}-derived object, commonly a \code{RmbSpectrum2} for MS2 or \code{Spectrum1} for MS1. -#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-). -#' @param rawspec For \code{recalibrateSpectra}:an \code{RmbSpectraSetList} of \code{RmbSpectraSet} objects -#' , as the \code{w@@spectra} slot from \code{msmsWorkspace} or any object returned by \code{\link{findMsMsHR}}. -#' If empty, no spectra are recalibrated, but the recalibration curve is -#' returned. -#' @param rc,rc.ms1 The recalibration curves to be used in the recalibration. -#' @param recalibrateBy Whether recalibration should be done by ppm ("ppm") or by m/z ("mz"). -#' @param recalibrateMS1 Whether MS1 spectra should be recalibrated separately ("separate"), -#' together with MS2 ("common") or not at all ("none"). Usually taken from settings. -#' @param recalibrator The recalibrator functions to be used. -#' Refer to \code{\link{recalibrate}} for details. Usually taken from settings. -#' @param recalibrateMS1Window Window width to look for MS1 peaks to recalibrate (in ppm). -#' @return \code{makeRecalibration}: a \code{list(rc, rc.ms1)} with recalibration curves -#' for the MS2 and MS1 spectra. -#' -#' \code{recalibrateSpectra}: if \code{rawspec} is not \code{NULL}, returns the recalibrated -#' spectra as \code{RmbSpectraSetList}. All spectra have their mass recalibrated and evaluation data deleted. -#' -#' \code{recalibrateSingleSpec}: the recalibrated \code{Spectrum} (same object, recalibrated masses, -#' evaluation data like assigned formulae etc. deleted). -#' -#' @examples \dontrun{ -#' rcCurve <- recalibrateSpectra(w, "pH") -#' w@@spectra <- recalibrateSpectra(mode="pH", rawspec=w@@spectra, w=myWorkspace) -#' w@@spectra <- recalibrateSpectra(mode="pH", rawspec=w@@spectra, rcCurve$rc, rcCurve$rc.ms1) -#' } -#' -#' @author Michael Stravs, Eawag -#' @export -makeRecalibration <- function(w, mode, - recalibrateBy = getOption("RMassBank")$recalibrateBy, - recalibrateMS1 = getOption("RMassBank")$recalibrateMS1, - recalibrator = getOption("RMassBank")$recalibrator, - recalibrateMS1Window = getOption("RMassBank")$recalibrateMS1Window - ) -{ - if(is.null(w@spectra)) - stop("No spectra present to generate recalibration curve.") - - rcdata <- peaksMatched(w) - rcdata <- rcdata[rcdata$formulaCount == 1, ,drop=FALSE] - - rcdata <- rcdata[,c("mzFound", "dppm", "mzCalc")] - - if(nrow(rcdata) == 0) - stop("No peaks matched to generate recalibration curve.") - - ms1data <- recalibrate.addMS1data(w@spectra, mode, recalibrateMS1Window) - ms1data <- ms1data[,c("mzFound", "dppm", "mzCalc")] - - if (recalibrateMS1 != "none") { - ## Add m/z values from MS1 to calibration datapoints - rcdata <- rbind(rcdata, ms1data) - } - - rcdata$dmz <- rcdata$mzFound - rcdata$mzCalc - ms1data$dmz <- ms1data$mzFound - ms1data$mzCalc - - if(recalibrateBy == "dppm") - { - rcdata$recalfield <- rcdata$dppm - ms1data$recalfield <- ms1data$dppm - } - else - { - rcdata$recalfield <- rcdata$dmz - ms1data$recalfield <- ms1data$dmz - } - - # generate recalibration model - rc <- do.call(recalibrator$MS2, list(rcdata)) - if(recalibrateMS1 == "separate") - rc.ms1 <- do.call(recalibrator$MS1, list(ms1data)) - else - rc.ms1 <- rc - - # plot the model - par(mfrow=c(2,2)) - if(nrow(rcdata)>0) - plotRecalibration.direct(rcdata, rc, rc.ms1, "MS2", - range(rcdata$mzFound), - recalibrateBy) - if(nrow(ms1data)>0) - plotRecalibration.direct(ms1data, rc, rc.ms1, "MS1", - range(ms1data$mzFound), - recalibrateBy) - # Return the computed recalibration curves - return(list(rc=rc, rc.ms1=rc.ms1)) -} - - - -#' Plot the recalibration graph. -#' -#' @aliases plotRecalibration plotRecalibration.direct -#' @usage plotRecalibration(w, recalibrateBy = getOption("RMassBank")$recalibrateBy) -#' -#' plotRecalibration.direct(rcdata, rc, rc.ms1, title, mzrange, -#' recalibrateBy = getOption("RMassBank")$recalibrateBy) -#' -#' @param w The workspace to plot the calibration graph from -#' @param rcdata A data frame with columns \code{recalfield} and \code{mzFound}. -#' @param rc Predictor for MS2 data -#' @param rc.ms1 Predictor for MS1 data -#' @param title Prefix for the graph titles -#' @param mzrange m/z value range for the graph -#' @param recalibrateBy Whether recalibration was done by ppm ("ppm") or by m/z ("mz"). -#' Important only for graph labeling here. -#' -#' @author Michele Stravs, Eawag -#' @export -plotRecalibration <- function(w, recalibrateBy = getOption("RMassBank")$recalibrateBy) -{ - spec <- w@aggregated - if(!is.null(w@parent)) - spec <- w@parent@aggregated - - rcdata <- data.frame(mzFound = w@rc$x, recalfield = w@rc$y) - ms1data <- data.frame(mzFound = w@rc.ms1$x, recalfield = w@rc.ms1$y) - - - - par(mfrow=c(2,2)) - if(nrow(rcdata)>0) - plotRecalibration.direct(rcdata, w@rc, w@rc.ms1, "MS2", - range(spec$mzFound[which(spec$good)]),recalibrateBy) - if(nrow(ms1data)>0) - plotRecalibration.direct(ms1data, w@rc, w@rc.ms1, "MS1", - range(ms1data$mzFound),recalibrateBy) - -} - -#' @export -plotRecalibration.direct <- function(rcdata, rc, rc.ms1, title, mzrange, - recalibrateBy = getOption("RMassBank")$recalibrateBy - ) -{ - if(recalibrateBy == "dppm") - ylab.plot <- expression(paste(delta, "ppm")) - else - ylab.plot <- expression(paste(delta, "m/z")) - - plot(recalfield ~ mzFound, data=rcdata, - xlab = "m/z", ylab = ylab.plot, main=paste(title, "scatterplot")) - RcModelMz <- seq(mzrange[[1]], mzrange[[2]], by=0.2) - RcModelRecal <- predict(rc, newdata= data.frame(mzFound =RcModelMz)) - RcModelRecalMs1 <- predict(rc.ms1, newdata= data.frame(mzFound =RcModelMz)) - lines(RcModelMz, RcModelRecal, col="blue") - lines(RcModelMz, RcModelRecalMs1, col="yellow") - if((length(unique(rcdata$mzFound))>1) & - (length(unique(rcdata$recalfield))>1)) - { - if(requireNamespace("gplots",quietly=TRUE)) - { - - gplots::hist2d(rcdata$mzFound, rcdata$recalfield, - col=c("white", heat.colors(12)), xlab="m/z", - ylab = ylab.plot, main=paste(title, "density")) - lines(RcModelMz, RcModelRecal, col="blue") - lines(RcModelMz, RcModelRecalMs1, col="yellow") - } - else - { - message("Package gplots not installed. The recalibration density plot will not be displayed.") - message("To install gplots: install.packages('gplots')") - } - } -} - - -#' @export -recalibrateSpectra <- function(mode, rawspec = NULL, rc = NULL, rc.ms1=NULL, w = NULL, - recalibrateBy = getOption("RMassBank")$recalibrateBy, - recalibrateMS1 = getOption("RMassBank")$recalibrateMS1) -{ - # Load the recal curves from the workspace if one is specified. - if(!is.null(w)) - { - rc <- w@rc - rc.ms1 <- w@rc.ms1 - } - if(is.null(rc) || is.null(rc.ms1)) - stop("Please specify the recalibration curves either via workspace (w) or via parameters rc, rc.ms1.") - - # Do the recalibration - if(!is.null(rawspec)) - { - # go through all raw spectra and recalculate m/z values - recalibratedSpecs <- lapply(rawspec, function(s) - { - if(s@found) - { - # recalculate tandem spectrum peaks - recalSpectra <- lapply(s@children, function(p) - { - recalibrateSingleSpec(p, rc, recalibrateBy) - }) - s@children <- as(recalSpectra, "SimpleList") - # recalculate MS1 spectrum if required - if(recalibrateMS1 != "none") - { - s@parent <- recalibrateSingleSpec(s@parent, rc.ms1, recalibrateBy) - } - } - s@empty <- NA - s@complete <- NA - return(s) - } ) - return(as(recalibratedSpecs, "SimpleList")) - } - else # no rawspec passed - return(list()) -} - -#' @export -recalibrateSingleSpec <- function(spectrum, rc, - recalibrateBy = getOption("RMassBank")$recalibrateBy) -{ - spectrum.df <- as.data.frame(spectrum) - spectrum.df <- spectrum.df[!duplicated(spectrum.df$mz),,drop=FALSE] - spectrum.df <- spectrum.df[order(spectrum.df$mz),,drop=FALSE] - - mzVals <- spectrum.df - if(nrow(mzVals) > 0) - { - # Fix the column names so our - # prediction functions choose the right - # rows. - colnames(mzVals) <- c("mzFound", "int") - drecal <- predict(rc, newdata=mzVals) - if(recalibrateBy == "dppm") - mzRecal <- mzVals$mzFound / (1 + drecal/1e6) - else - mzRecal <- mzVals$mzFound - drecal - # And rename them back so our "mz" column is - # called "mz" again - } - spectrum.df$mz <- mzRecal - - - # now comes the part that I don't like too much; this could be improved by using as.data.frame instead of getData and correspondingly - # also not use setData. For now I leave it like this. - # The problem is that I am not sure whether the default behaviour of as.RmbSpectrum2 should be clean=TRUE or FALSE, - # and vice versa, I am not sure if as.data.frame should return only mz/int or the whole table. - - if(is(spectrum, "RmbSpectrum2")) - { - # this removes all evaluated data that were added in step 2 except for @ok I think - colnames(spectrum.df) <- c("mz", "intensity") - spectrum <- setData(spectrum, spectrum.df, clean=TRUE) - # It also avoids making a new object when we don't know what class it should be - } - else - { - # for Spectrum1 or all others that we don't know - spectrum@mz <- spectrum.df$mz - spectrum@intensity <- spectrum.df$i - } - - return(spectrum) -} - - - - - -#' Filter satellite peaks -#' -#' Filters satellite peaks in FT spectra which arise from FT artifacts and from -#' conversion to stick mode. A very simple rule is used which holds mostly true -#' for MSMS spectra (and shouldn't be applied to MS1 spectra which contain -#' isotope structures...) -#' -#' The function cuts off all peaks within 0.5 m/z from every peak, in -#' decreasing intensity order, which are below 5% of the referring peak's -#' intensity. E.g. for peaks m/z=100, int=100; m/z=100.2, int=2, m/z=100.3, -#' int=6, m/z 150, int=10: The most intense peak (m/z=100) is selected, all -#' neighborhood peaks below 5% are removed (in this case, only the m/z=100.2 -#' peak) and the next less intense peak is selected. Here this is the m/z=150 -#' peak. All low-intensity neighborhood peaks are removed (nothing). The next -#' less intense peak is selected (m/z=100.3) and again neighborhood peaks are -#' cut away (nothing to cut here. Note that the m/z = 100.2 peak was alredy -#' removed.) -#' -#' @usage filterPeakSatellites(peaks, filterSettings = getOption("RMassBank")$filterSettings) -#' @param peaks A peak dataframe with at least the columns \code{mz, int}. Note -#' that \code{mz} is used even for the recalibrated spectra, i.e. the -#' desatellited spectrum is identical for both the unrecalibrated and the -#' recalibrated spectra. -#' @param filterSettings The settings used for filtering. Refer to \code{\link{analyzeMsMs}} -#' documentation for filter settings. -#' @return Returns the peak table with satellite peaks removed. -#' @note This is a very crude rule, but works remarkably well for our spectra. -#' @author Michael Stravs -#' @seealso \code{\link{analyzeMsMs}}, \code{\link{filterLowaccResults}} -#' @examples -#' -#' # From the workflow: -#' \dontrun{ -#' # Filter out satellite peaks: -#' shot <- filterPeakSatellites(shot) -#' shot_satellite_n <- setdiff(row.names(shot_full), row.names(shot)) -#' shot_satellite <- shot_full[shot_satellite_n,] -#' # shot_satellite contains the peaks which were eliminated as satellites. -#' } -#' -#' @export -filterPeakSatellites <- function(peaks, filterSettings = getOption("RMassBank")$filterSettings) -{ - cutoff_int_limit <- filterSettings$satelliteIntLimit - cutoff_mz_limit <- filterSettings$satelliteMzLimit - # Order by intensity (descending) - peaks_o <- peaks[order(peaks$intensity, decreasing=TRUE),,drop=FALSE] - n <- 1 - # As long as there are peaks left AND the last peak is small enough (relative - # to selected), move to the next peak - while(n < nrow(peaks_o)) - { - if(peaks_o[nrow(peaks_o),"intensity"] >= cutoff_int_limit *peaks_o[n,"intensity"]) - break - # remove all peaks within cutoff_mz_limit (std. m/z = 0.5) which have intensity - # of less than 5% relative to their "parent" peak - # - peaks_l <- peaks_o[ (peaks_o$mz > peaks_o[n,"mz"] - cutoff_mz_limit) - & (peaks_o$mz < peaks_o[n,"mz"] + cutoff_mz_limit) - & (peaks_o$intensity < cutoff_int_limit * peaks_o[n,"intensity"]),,drop=FALSE] - peaks_o <- peaks_o[ !((peaks_o$mz > peaks_o[n,"mz"] - cutoff_mz_limit) - & (peaks_o$mz < peaks_o[n,"mz"] + cutoff_mz_limit) - & (peaks_o$intensity < cutoff_int_limit * peaks_o[n,"intensity"]) - ),,drop=FALSE] - n <- n+1 - } - return(peaks_o[order(peaks_o$mz),,drop=FALSE]) -} - - -#' Reanalyze unmatched peaks -#' -#' Reanalysis of peaks with no matching molecular formula by allowing -#' additional elements (e.g. "N2O"). -#' -#' \code{reanalyzeFailpeaks} examines the \code{unmatchedPeaksC} table in -#' \code{specs} and sends every peak through \code{reanalyzeFailpeak}. -#' -#' @aliases reanalyzeFailpeaks reanalyzeFailpeak -#' @usage reanalyzeFailpeaks(aggregated, custom_additions, mode, filterSettings = -#' getOption("RMassBank")$filterSettings, progressbar = "progressBarHook") -#' reanalyzeFailpeak(custom_additions, mass, cpdID, counter, pb = NULL, mode, -#' filterSettings = getOption("RMassBank")$filterSettings) -#' @param aggregated A peake aggregate table (\code{w@@aggregate}) (after processing electronic noise removal!) -#' @param custom_additions The allowed additions, e.g. "N2O". -#' @param mode Processing mode (\code{"pH", "pNa", "mH"} etc.) -#' @param mass (Usually recalibrated) m/z value of the peak. -#' @param cpdID Compound ID of this spectrum. -#' @param counter Current peak index (used exclusively for the progress -#' indicator) -#' @param pb A progressbar object to display progress on, as passed by -#' \code{reanalyzeFailpeaks} to \code{reanalyzeFailpeak}. No progress -#' is displayed if NULL. -#' @param progressbar The progress bar callback to use. Only needed for specialized -#' applications. Cf. the documentation of \code{\link{progressBarHook}} for usage. -#' @param filterSettings Settings for filtering data. Refer to\code{\link{analyzeMsMs}} for settings. -#' @return The aggregate data frame extended by the columns: -#' #' \item{reanalyzed.???}{If reanalysis (step 7) has already been processed: matching values from the reanalyzed peaks} -#' \item{matchedReanalysis}{Whether reanalysis has matched (\code{TRUE}), not matched(\code{FALSE}) or has not been conducted for the peak(\code{NA}).} -#' -#' It would be good to merge the analysis functions of \code{analyzeMsMs} with -#' the one used here, to simplify code changes. -#' @author Michael Stravs -#' @seealso \code{\link{analyzeMsMs}}, \code{\link{msmsWorkflow}} -#' @examples -#' -#' ## As used in the workflow: -#' \dontrun{ -#' reanalyzedRcSpecs <- reanalyzeFailpeaks(w@@aggregated, custom_additions="N2O", mode="pH") -#' # A single peak: -#' reanalyzeFailpeak("N2O", 105.0447, 1234, 1, 1, "pH") -#' } -#' -#' @export -reanalyzeFailpeaks <- function(aggregated, custom_additions, mode, filterSettings = - getOption("RMassBank")$filterSettings, progressbar = "progressBarHook") -{ - - fp <- peaksUnmatched(aggregated, cleaned=TRUE) - fp <- fp[is.na(fp$dppm) | (fp$dppm == fp$dppmBest),] - #fp <- pu[!pu$noise,,drop=FALSE] - - custom_additions_l <- as.list(rep(x=custom_additions, times=nrow(fp))) - mode_l <- as.list(rep(x=mode, times=nrow(fp))) - nLen <- nrow(fp) - - pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=max(nLen,1))) - temp <- data.frame() - if(nLen == 0) - { - message("reanalyzeFailpeaks: No peaks to reanalyze.") - temp <- data.frame( - "reanalyzed.formula" = character(), - "reanalyzed.mzCalc" = numeric(), - "reanalyzed.dppm" = numeric(), - "reanalyzed.formulaCount" = numeric(), - "reanalyzed.dbe" = numeric()) - } - else - { - counter <- as.list(1:nrow(fp)) - # this is the reanalysis step: run reanalyze.failpeak (with the relevant parameters) - # on each failpeak. - temp <- mapply(reanalyzeFailpeak, custom_additions_l, fp$mzFound, fp$cpdID, counter, - MoreArgs=list(mode=mode, pb=list(hook=progressbar, bar=pb), filterSettings=filterSettings)) - # reformat the result and attach it to specs - temp <- as.data.frame(t(temp)) - temp <- temp[,c("reanalyzed.formula", "reanalyzed.mzCalc", "reanalyzed.dppm", - "reanalyzed.formulaCount", "reanalyzed.dbe")] - } - - # Add columns to the aggregated table (they are then filled in with the obtained values for reanalyzed peaks and left - # empty otherwise - aggregated <- addProperty(aggregated, "reanalyzed.formula", "character") - aggregated <- addProperty(aggregated, "reanalyzed.mzCalc", "numeric") - aggregated <- addProperty(aggregated, "reanalyzed.dppm", "numeric") - aggregated <- addProperty(aggregated, "reanalyzed.formulaCount", "numeric") - aggregated <- addProperty(aggregated, "reanalyzed.dbe", "numeric") - aggregated <- addProperty(aggregated, "matchedReanalysis", "logical", NA) - - - peaksReanalyzed <- cbind(fp, temp) - - # Since some columns are in "list" type, they disturb later on. - # therefore, fix them and make them normal vectors. - listcols <- unlist(lapply(colnames(peaksReanalyzed), function(col) - is.list(peaksReanalyzed[,col]))) - for(col in colnames(peaksReanalyzed)[which(listcols==TRUE)]) - peaksReanalyzed[,col] <- - unlist(peaksReanalyzed[,col]) - - peaksReanalyzed$matchedReanalysis <- !is.na(peaksReanalyzed$reanalyzed.dppm) - - # Substitute in the reanalyzed peaks into the aggregated table - aggregated[match(peaksReanalyzed$index, aggregated$index),] <- peaksReanalyzed - - do.call(progressbar, list(object=pb, close=TRUE)) - return(aggregated) -} - - -#' @export -reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, mode, - filterSettings = getOption("RMassBank")$filterSettings) -{ - # the counter to show the progress - if(!is.null(pb)) - { - do.call(pb$hook, list(object=pb$bar, value=counter)) - } - # here follows the Rcdk analysis - #------------------------------------ - - # define the adduct additions - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else { - stop("mode = \"", mode, "\" not defined") - } - - # the ppm range is two-sided here. - # The range is slightly expanded because dppm calculation of - # generate.formula starts from empirical mass, but dppm cal- - # culation of the evaluation starts from theoretical mass. - # So we don't miss the points on 'the border'. - - db_formula <- findFormula(cpdID, retrieval=findLevel(cpdID,TRUE)) - - ppmlimit <- 2.25 * filterSettings$ppmFine - parent_formula <- add.formula(db_formula, allowed_additions) - parent_formula <- add.formula(parent_formula, custom_additions) - dbe_parent <- dbe(parent_formula) - # check whether the formula is valid, i.e. has no negative or zero element numbers. - #print(parent_formula) - limits <- to.limits.rcdk(parent_formula) - - peakformula <- tryCatch(suppressWarnings(generate.formula(mass, ppm(mass, ppmlimit, p=TRUE), - limits, charge=mode.charge)), error=function(e) NA) - # was a formula found? If not, return empty result - if(!is.list(peakformula)) - return(as.data.frame( - t(c(mzFound=as.numeric(as.character(mass)), - reanalyzed.formula=NA, reanalyzed.mzCalc=NA, reanalyzed.dppm=NA, - reanalyzed.formulaCount=0, - reanalyzed.dbe=NA)))) - else # if is.list(peakformula) - # formula found? then return the one with lowest dppm - { - # calculate dppm for all formulas - peakformula <- sapply(peakformula, function(f) - { - l <- list(mzFound=as.numeric(as.character(mass)), - reanalyzed.formula=as.character(f@string), - reanalyzed.mzCalc=as.numeric(as.character(f@mass)) - ) - - return(unlist(l)) - }) - - # filter out bad dbe stuff - peakformula <- as.data.frame(t(peakformula)) - # for some reason completely oblivious to me, the columns in peakformula - # are still factors, even though i de-factored them by hand. - # Therefore, convert them again... - peakformula$mzFound <- as.numeric(as.character(peakformula$mzFound)) - peakformula$reanalyzed.formula <- as.character(peakformula$reanalyzed.formula) - peakformula$reanalyzed.mzCalc <- as.numeric(as.character(peakformula$reanalyzed.mzCalc)) - - peakformula$reanalyzed.dppm <- (peakformula$mzFound / peakformula$reanalyzed.mzCalc - 1) * 1e6 - peakformula$reanalyzed.formulaCount=nrow(peakformula) - - # filter out bad dbe and high ppm stuff - peakformula$reanalyzed.dbe <- unlist(lapply(peakformula$reanalyzed.formula, dbe)) - peakformula <- peakformula[(peakformula$reanalyzed.dbe >= filterSettings$dbeMinLimit) - & (abs(peakformula$reanalyzed.dppm) < filterSettings$ppmFine),] - # is there still something left? - if(nrow(peakformula) == 0) - return(as.data.frame( - t(c(mzFound=as.numeric(as.character(mass)), - reanalyzed.formula=NA, reanalyzed.mzCalc=NA, reanalyzed.dppm=NA, - reanalyzed.formulaCount=0, reanalyzed.dbe = NA)))) - else - { - #update formula count to the remaining formulas - peakformula$reanalyzed.formulaCount=nrow(peakformula) - return(peakformula[which.min(abs(peakformula$reanalyzed.dppm)),]) - } - - } # endif is.list(peakformula) - - - - } - -#' Multiplicity filtering: Removes peaks which occur only once in a n-spectra set. -#' -#' For every compound, every peak (with annotated formula) is compared -#' across all spectra. Peaks whose formula occurs only once for all collision energies -#' / spectra types, are discarded. This eliminates "stochastic formula hits" of pure -#' electronic noise peaks efficiently from the spectra. Note that in the author's -#' experimental setup two spectra were recorded at every collision energy, -#' and therefore every peak-formula should appear -#' at least twice if it is real, even if it is by chance a fragment which appears -#' on only one collision energy setting. The function was not tested in a different -#' setup. Therefore, use with a bit of caution. -#' @usage filterPeaksMultiplicity(peaks, formulacol, recalcBest = TRUE) -#' @param peaks An aggregate peak data.frame containing all peaks to be analyzed; with at least -#' the columns \code{cpdID, scan, mzFound} and one column for the formula -#' specified with the \code{formulacol} parameter. -#' @param formulacol Which column the assigned formula is stored in. (Needed to separately process \code{"formula"} and -#' \code{"reanalyzed.formula"} multiplicites.) -#' @param recalcBest Whether the best formula for each peak should be re-determined. -#' This is necessary for results from the ordinary \code{\link{analyzeMsMs}} -#' analysis which allows multiple potential formulas per peak - the old best match -#' could potentially have been dropped because of multiplicity filtering. For results -#' from \code{\link{reanalyzeFailpeak}} this is not necessary, since only one potential -#' formula is assigned in this case. -#' @return The peak table is returned, enriched with columns: -#' \itemize{ -#' \item{\code{formulaMultiplicity}}{The # of occurrences of this formula -#' in the spectra of its compounds.} -#' } -#' @examples \dontrun{ -#' peaksFiltered <- filterPeaksMultiplicity(peaksMatched(w), -#' "formula", TRUE) -#' peaksOK <- subset(peaksFiltered, formulaMultiplicity > 1) -#' } -#' @author Michael Stravs, EAWAG -#' @export -filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) -{ - # create dummy for the case that we have no rows - multInfo <- data.frame(cpdID = character(), - formulacol = character(), - formulaMultiplicity = numeric()) - # rename (because "formulacol" is not the actually correct name) - colnames(multInfo) <- c("cpdID", formulacol, "formulaMultiplicity") - - if(!is.data.frame(peaks) || (nrow(peaks) == 0) ) - { - peaks <- cbind(peaks, data.frame(formulaMultiplicity=numeric())) - if(recalcBest){ - if(formulacol == "formula"){ - warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") - } - if(formulacol == "reanalyzed.formula"){ - warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") - } - peaks$fM_factor <- as.factor(peaks$formulaMultiplicity) - return(peaks) - } - } - else - { - # calculate duplicity info - multInfo <- aggregate(as.data.frame(peaks$scan), list(peaks$cpdID, peaks[,formulacol]), FUN=length) - # just for comparison: - # nform <- unique(paste(pks$cpdID,pks$formula)) - - # merge the duplicity info into the peak table - colnames(multInfo) <- c("cpdID", formulacol, "formulaMultiplicity") - peaks <- merge(peaks, multInfo) - } - - # separate log intensity data by duplicity (needs duplicity as a factor) - # and boxplot - peaks$fM_factor <- as.factor(peaks$formulaMultiplicity) - - # nostalgy: dppmBest first, to compare :) - # now we prioritize the most frequent formula instead, and only then apply the - # dppmBest rule - #pks2 <- subset(pks, dppm==dppmBest) - - # split peak intensity by multiplicity - peakMultiplicitySets <- split(log(peaks$int,10), peaks$fM_factor) - #boxplot(peakMultiplicitySets) - # nice plot :) - #if(length(peakMultiplicitySets) > 0) - # q <- quantile(peakMultiplicitySets[[1]], c(0,.25,.5,.75,.95,1)) - pk_data <- lapply(peakMultiplicitySets, length) - - # now by formula, not by peak: - multInfo$fM_factor <- as.factor(multInfo$formulaMultiplicity) - # the formulas are split into bins with their multiplicity - # (14 bins for our 14-spectra method) - formulaMultiplicitySets <- split(multInfo[,formulacol], multInfo$fM_factor) - formulaMultiplicityHist <- lapply(formulaMultiplicitySets, length) - - # if we use recalcBest, then we recalculate which peak in the - # list was best. We do this for the peaks matched in the first analysis. - # The peaks from the reanalysis are single anyway and don't get this additional - # treatment. - - if(recalcBest == FALSE) - return(peaks) - - # prioritize duplicate peaks - # get unique peaks with their maximum-multiplicity formula attached - best_mult <- aggregate(as.data.frame(peaks$formulaMultiplicity), - list(peaks$cpdID, peaks$scan, peaks$mzFound), - max) - colnames(best_mult) <- c("cpdID", "scan", "mzFound", "bestMultiplicity") - peaks <- merge(peaks, best_mult) - peaks <- peaks[peaks$formulaMultiplicity==peaks$bestMultiplicity,] - - # now we also have to recalculate dppmBest since the "old best" may have been - # dropped. - peaks$dppmBest <- NULL - bestPpm <- aggregate(as.data.frame(peaks$dppm), - list(peaks$cpdID, peaks$scan, peaks$mzFound), - function(dppm) dppm[[which.min(abs(dppm))]]) - colnames(bestPpm) <- c("cpdID", "scan", "mzFound", "dppmBest") - peaks <- merge(peaks, bestPpm) - pks_best <- peaks[peaks$dppm==peaks$dppmBest,] - - # And, iteratively, the multiplicity also must be recalculated, because we dropped - # some peaks and the multiplicites of some of the formulas will have decreased. - - pks_best$formulaMultiplicity <- NULL - pks_best$bestMultiplicity <- NULL - multInfo_best <- aggregate(as.data.frame(pks_best$scan), - list(pks_best$cpdID, pks_best[,formulacol]), - FUN=length) - colnames(multInfo_best) <- c("cpdID", formulacol, "formulaMultiplicity") - pks_best <- merge(pks_best, multInfo_best) - pks_best$fM_factor <- as.factor(pks_best$formulaMultiplicity) - multInfo_best$fM_factor <- as.factor(multInfo_best$formulaMultiplicity) - - formulaMultplicitySets_best <- split(multInfo_best[,formulacol], multInfo_best$fM_factor) - formulaMultplicityHist_best <- lapply(formulaMultplicitySets_best, length) - - peakMultiplicitySets_best <- split(log(pks_best$int,10), pks_best$fM_factor) - #boxplot(peakMultiplicitySets_best) - #q <- quantile(peakMultiplicitySets_best[[1]], c(0,.25,.5,.75,.95,1)) - #peakMultiplicityHist_best <- lapply(peakMultiplicitySets_best, length) - #q - pks_best$fM_factor <- NULL - # this returns the "best" peaks (first by formula multiplicity, then by dppm) - # before actually cutting the bad ones off. - - - return(pks_best) -} - - -#' filterMultiplicity -#' -#' Multiplicity filtering: Removes peaks which occur only once in a n-spectra -#' set. -#' -#' This function executes multiplicity filtering for a set of spectra using the -#' workhorse function \code{\link{filterPeaksMultiplicity}} (see details there) -#' and retrieves problematic filtered peaks (peaks which are of high intensity -#' but were discarded, because either no formula was assigned or it was not -#' present at least 2x), using the workhorse function -#' \code{\link{problematicPeaks}}. The results are returned in a format ready -#' for further processing with \code{\link{mbWorkflow}}. -#' -#' @usage filterMultiplicity(w, archivename=NA, mode="pH", recalcBest = TRUE, -#' multiplicityFilter = getOption("RMassBank")$multiplicityFilter) -#' @param w Workspace containing the data to be processed (aggregate table and \code{RmbSpectraSet} objects) -#' @param archivename The archive name, used for generation of -#' archivename_Failpeaks.csv -#' @param mode Mode of ion analysis -#' @param recalcBest Boolean, whether to recalculate the formula multiplicity -#' after the first multiplicity filtering step. Sometimes, setting this -#' to FALSE can be a solution if you have many compounds with e.g. fluorine -#' atoms, which often have multiple assigned formulas per peak and might occasionally -#' lose peaks because of that. -#' @param multiplicityFilter Threshold for the multiplicity filter. If set to 1, -#' no filtering will apply (minimum 1 occurrence of peak). 2 equals minimum -#' 2 occurrences etc. -#' @return A list object with values: -#' \item{peaksOK}{ Peaks with >1-fold formula multiplicity from the -#' "normal" peak analysis. } -#' \item{peaksReanOK}{ Peaks with >1-fold formula multiplicity from -#' peak reanalysis. } -#' \item{peaksFiltered}{ All peaks with annotated formula multiplicity from -#' first analysis. } -#' \item{peaksFilteredReanalysis}{ All peaks with annotated -#' formula multiplicity from peak reanalysis. } -#' \item{peaksProblematic}{ Peaks with high intensity which do not match -#' inclusion criteria -> possible false negatives. The list will be -#' exported into archivename_failpeaks.csv. -#' } -#' @author Michael Stravs -#' @seealso -#' \code{\link{filterPeaksMultiplicity}},\code{\link{problematicPeaks}} -#' @examples -#' \dontrun{ -#' refilteredRcSpecs <- filterMultiplicity( -#' w, "myarchive", "pH") -#' } -#' @export -filterMultiplicity <- function(w, archivename=NA, mode="pH", recalcBest = TRUE, - multiplicityFilter = getOption("RMassBank")$multiplicityFilter) -{ - # Read multiplicity filter setting - # For backwards compatibility: If the option is not set, define as 2 - # (which was the behaviour before we introduced the option) - if(is.null(multiplicityFilter)) - multiplicityFilter <- 2 - - specs <- w@aggregated - - peaksFiltered <- filterPeaksMultiplicity(peaksMatched(specs), "formula", recalcBest) - peaksFilteredReanalysis <- filterPeaksMultiplicity(specs[!is.na(specs$matchedReanalysis) & specs$matchedReanalysis,,drop=FALSE], "reanalyzed.formula", FALSE) - - specs <- addProperty(specs, "formulaMultiplicity", "numeric", 0) - - # Reorder the columns of the filtered peaks such that they match the columns - # of the original aggregated table; such that the columns can be substituted in. - - peaksFiltered <- peaksFiltered[,colnames(specs)] - peaksFilteredReanalysis <- peaksFilteredReanalysis[,colnames(specs)] - - # substitute into the parent dataframe - specs[match(peaksFiltered$index,specs$index),] <- peaksFiltered - specs[match(peaksFilteredReanalysis$index,specs$index),] <- peaksFilteredReanalysis - - - specs <- addProperty(specs, "filterOK", "logical", FALSE) - - OKindex <- which(specs$formulaMultiplicity > (multiplicityFilter - 1)) - - if(length(OKindex)){ - specs[OKindex,"filterOK"] <- TRUE - } - - peaksReanOK <- specs[ - specs$filterOK & !is.na(specs$matchedReanalysis) & specs$matchedReanalysis,,drop=FALSE] - - # Kick the M+H+ satellites out of peaksReanOK: - peaksReanOK$mzCenter <- as.numeric( - unlist(lapply(peaksReanOK$cpdID, function(id) findMz(id, retrieval=findLevel(id,TRUE))$mzCenter)) ) - peaksReanBad <- peaksReanOK[ - !((peaksReanOK$mzFound < peaksReanOK$mzCenter - 1) | - (peaksReanOK$mzFound > peaksReanOK$mzCenter + 1)),] - notOKindex <- match(peaksReanBad$index, specs$index) - if(length(notOKindex)){ - specs[notOKindex,"filterOK"] <- FALSE - } - - - return(specs) -} - -#' Return MS1 peaks to be used for recalibration -#' -#' Returns the precursor peaks for all MS1 spectra in the \code{spec} dataset -#' with annotated formula to be used in recalibration. -#' -#' For all spectra in \code{spec$specFound}, the precursor ion is extracted from -#' the MS1 precursor spectrum. All found ions are returned in a data frame with a -#' format matching \code{spec$peaksMatched} and therefore suitable for \code{rbind}ing -#' to the \code{spec$peaksMatched} table. However, only minimal information needed for -#' recalibration is returned. -#' -#' @usage recalibrate.addMS1data(spec,mode="pH", recalibrateMS1Window = -#' getOption("RMassBank")$recalibrateMS1Window) -#' @param spec A \code{msmsWorkspace} or \code{RmbSpectraSetList} containing spectra for which MS1 "peaks" should be "constructed". -#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). -#' @param recalibrateMS1Window Window width to look for MS1 peaks to recalibrate (in ppm). -#' @return A dataframe with columns \code{mzFound, formula, mzCalc, dppm, dbe, int, -#' dppmBest, formulaCount, good, cpdID, scan, parentScan, dppmRc}. However, -#' columns \code{dbe, int, formulaCount, good, scan, parentScan} do not contain -#' real information and are provided only as fillers. -#' @examples \dontrun{ -#' # More or less as used in recalibrateSpectra: -#' rcdata <- peaksMatched(w) -#' rcdata <- rcdata[rcdata$formulaCount == 1, ,drop=FALSE] -#' ms1data <- recalibrate.addMS1data(w, "pH", 15) -#' rcdata <- rbind(rcdata, ms1data) -#' # ... continue constructing recalibration curve with rcdata -#' } -#' @author Michael Stravs, EAWAG -#' @export -recalibrate.addMS1data <- function(spec,mode="pH", recalibrateMS1Window = - getOption("RMassBank")$recalibrateMS1Window) -{ - ## which_OK <- lapply(validPrecursors, function(pscan) - ## { - ## pplist <- as.data.frame( - ## mzR::peaks(msRaw, which(headerData$acquisitionNum == pscan))) - ## colnames(pplist) <- c("mz","int") - ## pplist <- subset(pplist, mz >= mzLimits$mzMin & mz <= mzLimits$mzMax) - ## if(nrow(pplist) > 0) - ## return(TRUE) - ## return(FALSE) - ## }) - - specFound <- selectSpectra(spec, "found", "object") - - ms1peaks <- lapply(specFound, function(cpd){ - mzL <- findMz.formula(cpd@formula,mode,recalibrateMS1Window,0) - mzCalc <- mzL$mzCenter - ms1 <- mz(cpd@parent) - - mzFound <- ms1[which.min(abs(ms1 - mzL$mzCenter))] - if(!length(mzFound)){ - return(c( - mzFound = NA, - mzCalc = mzCalc, - dppm = NA - )) - } else { - dppmRc <- (mzFound/mzCalc - 1)*1e6 - return(c( - mzFound = mzFound, - mzCalc = mzCalc, - dppm = dppmRc, - id=cpd@id - )) - } - }) - ms1peaks <- as.data.frame(do.call(rbind, ms1peaks), stringsAsFactors=FALSE) - # convert numbers to numeric - tonum <- c("mzFound", "dppm", "mzCalc") - ms1peaks[,tonum] <- as.numeric(unlist(ms1peaks[,tonum])) - # throw out NA stuff - ms1peaks <- ms1peaks[!is.na(ms1peaks$mzFound),] - return(ms1peaks) -} - - -# Custom recalibration function: You can overwrite the recal function by -# making any function which takes rcdata$recalfield ~ rcdata$mzFound. -# The settings define which recal function is used -# getOption("RMassBank")$recalibrator = list( -# MS1 = "recalibrate.loess", -# MS2 = "recalibrate.loess") - -#' Predefined recalibration functions. -#' -#' Predefined fits to use for recalibration: Loess fit and GAM fit. -#' -#' \code{recalibrate.loess()} provides a Loess fit (\code{recalibrate.loess}) -#' to a given recalibration parameter. -#' If MS and MS/MS data should be fit together, recalibrate.loess -#' provides good default settings for Orbitrap instruments. -#' -#' \code{recalibrate.identity()} returns a non-recalibration, i.e. a predictor -#' which predicts 0 for all input values. This can be used if the user wants to -#' skip recalibration in the RMassBank workflow. -#' -#' #' \code{recalibrate.mean()} and \code{recalibrate.linear()} are simple recalibrations -#' which return a constant shift or a linear recalibration. They will be only useful -#' in particular cases. -#' -#' \code{recalibrate()} itself is only a dummy function and does not do anything. -#' -#' Alternatively other functions can be defined. Which functions are used for recalibration -#' is specified by the RMassBank options file. (Note: if \code{recalibrateMS1: common}, the -#' \code{recalibrator: MS1} value is irrelevant, since for a common curve generated with -#' the function specified in \code{recalibrator: MS2} will be used.) -#' -#' @aliases recalibrate.loess recalibrate recalibrate.identity recalibrate.mean recalibrate.linear -#' @usage recalibrate.loess(rcdata) -#' -#' recalibrate.identity(rcdata) -#' -#' recalibrate.mean(rcdata) -#' -#' recalibrate.linear(rcdata) -#' -#' @param rcdata A data frame with at least the columns \code{recalfield} and -#' \code{mzFound}. \code{recalfield} will usually contain delta(ppm) or -#' delta(mz) values and is the target parameter for the recalibration. -#' @return Returns a model for recalibration to be used with \code{predict} and the like. -#' @examples \dontrun{ -#' rcdata <- subset(spec$peaksMatched, formulaCount==1) -#' ms1data <- recalibrate.addMS1data(spec, mode, 15) -#' rcdata <- rbind(rcdata, ms1data) -#' rcdata$recalfield <- rcdata$dppm -#' rcCurve <- recalibrate.loess(rcdata) -#' # define a spectrum and recalibrate it -#' s <- matrix(c(100,150,200,88.8887,95.0005,222.2223), ncol=2) -#' colnames(s) <- c("mz", "int") -#' recalS <- recalibrateSingleSpec(s, rcCurve) -#' -#' Alternative: define an custom recalibrator function with different parameters -#' recalibrate.MyOwnLoess <- function(rcdata) -#' { -#' return(loess(recalfield ~ mzFound, data=rcdata, family=c("symmetric"), -#' degree = 2, span=0.4)) -#' } -#' # This can then be specified in the RMassBank settings file: -#' # recalibrateMS1: common -#' # recalibrator: -#' # MS1: recalibrate.loess -#' # MS2: recalibrate.MyOwnLoess") -#' # [...] -#' } -#' @author Michael Stravs, EAWAG -#' @export -recalibrate <- function() -{ - return(NA) -} - -#' @export -recalibrate.loess <- function(rcdata) -{ - span <- 0.25 - # ex XCMS (permission by Steffen): heuristically decide on loess vs linear - mingroups <- nrow(rcdata[!is.na(rcdata$mzFound),]) - if(mingroups < 4) - { - warning("recalibrate.loess: Not enough data points, omitting recalibration") - return(recalibrate.identity(rcdata)) - } else if (mingroups*span < 4) { - span <- 4/mingroups - warning("recalibrate.loess: Span too small, resetting to ", round(span, 2)) - } - return(loess(recalfield ~ mzFound, data=rcdata, family=c("symmetric"), - degree = 1, span=0.25, surface="direct" )) -} - -#' @export -recalibrate.identity <- function(rcdata) -{ - return(lm(recalfield ~ 0, data=rcdata)) -} - -#' @export -recalibrate.mean <- function(rcdata) -{ - return(lm(recalfield ~ 1, data=rcdata)) -} - -#' @export -recalibrate.linear <- function(rcdata) -{ - return(lm(recalfield ~ mzFound, data=rcdata)) -} - -#' Standard progress bar hook. -#' -#' This function provides a standard implementation for the progress bar in RMassBank. -#' -#' RMassBank calls the progress bar function in the following three ways: -#' \code{pb <- progressBarHook(object=NULL, value=0, min=0, max=LEN)} -#' to create a new progress bar. -#' \code{pb <- progressBarHook(object=pb, value= VAL)} -#' to set the progress bar to a new value (between the set \code{min} and \code{max}) -#' \code{progressBarHook(object=pb, close=TRUE)} -#' to close the progress bar. (The actual calls are performed with \code{do.call}, -#' e.g. -#' \code{progressbar <- "progressBarHook" -#' pb <- do.call(progressbar, list(object=pb, value= nProg)) -#' }. See the source code for details.) -#' -#' To substitute the standard progress bar for an alternative implementation (e.g. for -#' use in a GUI), the developer can write his own function which behaves in the same way -#' as \code{progressBarHook}, i.e. takes the same parameters and can be called in the -#' same way. -#' -#' @param object An identifier representing an instance of a progress bar. -#' @param value The new value to assign to the progress indicator -#' @param min The minimal value of the progress indicator -#' @param max The maximal value of the progress indicator -#' @param close If \code{TRUE}, the progress bar is closed. -#' @return Returns a progress bar instance identifier (i.e. an identifier -#' which can be used as \code{object} in subsequent calls.) -#' -#' @author Michele Stravs, Eawag -#' @export -progressBarHook <- function(object = NULL, value = 0, min = 0, max = 100, close = FALSE) -{ - if(is.null(object)) - { - object <- txtProgressBar(min, max, value, style=3, file=stderr()) - } - if(close) - close(object) - else - { - setTxtProgressBar(object, value) - return(object) - } -} + +#library(xcms) + +#' Backup \code{msmsWorkflow} results +#' +#' Writes the results from different \code{msmsWorkflow} steps to a file. +#' +#' @aliases archiveResults +#' @usage archiveResults(w, fileName, settings = getOption("RMassBank")) +#' @param w The \code{msmsWorkspace} to be saved. +#' @param fileName The filename to store the results under. +#' @param settings The settings to be stored into the msmsWorkspace image. +#' @examples +#' +#' # This doesn't really make a lot of sense, +#' # it stores an empty workspace. +#' RmbDefaultSettings() +#' w <- newMsmsWorkspace() +#' archiveResults(w, "narcotics.RData") +#' +#' @export +archiveResults <- function(w, fileName, settings = getOption("RMassBank")) +{ + # save the settings into the settings slot + w@settings <- settings + # save + save(w, file=fileName) + +} + + +#' RMassBank mass spectrometry pipeline +#' +#' Extracts and processes spectra from a specified file list, according to +#' loaded options and given parameters. +#' +#' The filenames of the raw LC-MS runs are read from the array \code{files} +#' in the global enviroment. +#' See the vignette \code{vignette("RMassBank")} for further details about the +#' workflow. +#' +#' @param w A \code{msmsWorkspace} to work with. +#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA", "pNH4"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-, [M+NH4]+). +#' @param steps Which steps of the workflow to process. See the vignette +#' \code{vignette("RMassBank")} for details. +#' @param confirmMode Defaults to false (use most intense precursor). Value 1 uses +#' the 2nd-most intense precursor for a chosen ion (and its data-dependent scans) +#' , etc. +#' @param newRecalibration Whether to generate a new recalibration curve (\code{TRUE}, default) or +#' to reuse the currently stored curve (\code{FALSE}, useful e.g. for adduct-processing runs.) +#' @param useRtLimit Whether to enforce the given retention time window. +#' @param archivename The prefix under which to store the analyzed result files. +#' @param readMethod Several methods are available to get peak lists from the files. +#' Currently supported are "mzR", "xcms", "MassBank" and "peaklist". +#' The first two read MS/MS raw data, and differ in the strategy +#' used to extract peaks. MassBank will read existing records, +#' so that e.g. a recalibration can be performed, and "peaklist" +#' just requires a CSV with two columns and the column header "mz", "int". +#' @param findPeaksArgs A list of arguments that will be handed to the xcms-method findPeaks via do.call +#' @param plots A parameter that determines whether the spectra should be plotted or not (This parameter is only used for the xcms-method) +#' @param precursorscan.cf Whether to fill precursor scans. To be used with files which for +#' some reasons do not contain precursor scan IDs in the mzML, e.g. AB Sciex converted +#' files. +#' @param settings Options to be used for processing. Defaults to the options loaded via +#' \code{\link{loadRmbSettings}} et al. Refer to there for specific settings. +#' @param analyzeMethod The "method" parameter to pass to \code{\link{analyzeMsMs}}. +#' @param progressbar The progress bar callback to use. Only needed for specialized applications. +#' Cf. the documentation of \code{\link{progressBarHook}} for usage. +#' @param MSe A boolean value that determines whether the spectra were recorded using MSe or not +#' @return The processed \code{msmsWorkspace}. +#' @seealso \code{\link{msmsWorkspace-class}} +#' @author Michael Stravs, Eawag +#' @export +msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRecalibration = TRUE, + useRtLimit = TRUE, archivename=NA, readMethod = "mzR", findPeaksArgs = NULL, plots = FALSE, + precursorscan.cf = FALSE, + settings = getOption("RMassBank"), analyzeMethod = "formula", + progressbar = "progressBarHook", MSe = FALSE) +{ + .checkMbSettings() + if(!any(mode %in% c("pH","pNa","pNH4","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) + + if(!is.na(archivename)) + w@archivename <- archivename + + # Make a progress bar: + nProg <- 0 + nLen <- length(w@files) + + allUnknown <- FALSE + + # If all compounds are unknown some specific conditions apply + if(all(.listEnvEnv$listEnv$compoundList$Level == "5")){ + allUnknown <- TRUE + message("All compounds are unknown, the workflow will be adjusted accordingly") + } + + if(readMethod == "minimal"){ + ##Edit options + opt <- getOption("RMassBank") + opt$recalibrator$MS1 <- "recalibrate.identity" + opt$recalibrator$MS2 <- "recalibrate.identity" + opt$add_annotation <- FALSE + opt$multiplicityFilter <- 1 + options(RMassBank=opt) + settings <- getOption("RMassBank") + ##Edit analyzemethod + analyzeMethod <- "intensity" + } + + # clean rerun functionality: + # if any step after 3 has been run, rerunning steps 4 or below needs moving back to the parent workspace. + # However, the recalibration must be preserved, because: + # if someone runs + # w <- msmsWorkflow(w, steps=c(1:4)), + # then substitutes the recalibration + # w@rc <- myrecal + # then runs step 4 again + # w <- msmsWorkflow(w, steps=c(4), newRecalibration=FALSE) + # the rc and rc.ms1 must be preserved and not taken from the parent workspace + if(!all(steps > 4) & !is.null(w@parent)) + { + rc <- w@rc + rc.ms1 <- w@rc.ms1 + w <- w@parent + w@rc <- rc + w@rc.ms1 <- rc.ms1 + } + + # Step 1: acquire all MSMS spectra from files + if(1 %in% steps) + { + message("msmsWorkflow: Step 1. Acquire all MSMS spectra from files") + w <- msmsRead(w = w, files = w@files, readMethod=readMethod, mode=mode, confirmMode = confirmMode, useRtLimit = useRtLimit, + Args = findPeaksArgs, settings = settings, progressbar = progressbar, MSe = MSe) + } + # Step 2: first run analysis before recalibration + if(2 %in% steps) + { + nProg <- 0 + message("msmsWorkflow: Step 2. First analysis pre recalibration") + if(allUnknown){ + analyzeMethod <- "intensity" + } + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + w@spectra <- as(lapply(w@spectra, function(spec) { + #print(spec$id) + # if(findLevel(spec@id,TRUE) == "unknown"){ + # analyzeMethod <- "intensity" + # } else { + # analyzeMethod <- "formula" + # } + s <- analyzeMsMs(msmsPeaks = spec, mode=mode, detail=TRUE, run="preliminary", + filterSettings = settings$filterSettings, + spectraList = settings$spectraList, method = analyzeMethod) + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(s) + }), "SimpleList") + ## for(f in w@files) + ## w@spectra[[basename(as.character(f))]]@name <- basename(as.character(f)) + suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) + } + # Step 3: aggregate all spectra + if(3 %in% steps) + { + message("msmsWorkflow: Step 3. Aggregate all spectra") + w@aggregated <- aggregateSpectra(spec = w@spectra, addIncomplete=TRUE) + + if(RMassBank.env$verbose.output){ + numberOfPeaksThere <- sum(unlist(lapply(X = w@spectra, FUN = function(spec){ sum(unlist(lapply(X = spec@children, FUN = function(child){ child@peaksCount }))) }))) + if(nrow(w@aggregated) < numberOfPeaksThere) + cat(paste("### Warning ### The aggregation of spectra lead to the removal of ", (numberOfPeaksThere-nrow(w@aggregated)), " / ", numberOfPeaksThere, " peaks\n", sep = "")) + } + } + + if(allUnknown){ + w@aggregated$noise <- FALSE + w@aggregated$noise <- FALSE + w@aggregated$reanalyzed.formula <- NA + w@aggregated$reanalyzed.mzCalc <- NA + w@aggregated$reanalyzed.dppm <- NA + w@aggregated$reanalyzed.formulaCount <- NA + w@aggregated$reanalyzed.dbe <- NA + w@aggregated$matchedReanalysis <- NA + w@aggregated$filterOK <- TRUE + w@aggregated$problematicPeak <- FALSE + w@aggregated$formulaMultiplicity <- unlist(sapply(table(w@aggregated$cpdID),function(x) rep(x,x))) + return(w) + } + + + # Step 4: recalibrate all m/z values in raw spectra + if(4 %in% steps) + { + message("msmsWorkflow: Step 4. Recalibrate m/z values in raw spectra") + if(newRecalibration) + { + # note: makeRecalibration takes w as argument now, because it needs to get the MS1 spectra from @spectra + recal <- makeRecalibration(w, mode, + recalibrateBy = settings$recalibrateBy, + recalibrateMS1 = settings$recalibrateMS1, + recalibrator = settings$recalibrator, + recalibrateMS1Window = settings$recalibrateMS1Window) + w@rc <- recal$rc + w@rc.ms1 <- recal$rc.ms1 + } + w@parent <- w + w@aggregated <- data.frame() + spectra <- recalibrateSpectra(mode, w@spectra, w = w, + recalibrateBy = settings$recalibrateBy, + recalibrateMS1 = settings$recalibrateMS1) + w@spectra <- spectra + } + # Step 5: re-analysis on recalibrated spectra + if(5 %in% steps) + { + nProg <- 0 + message("msmsWorkflow: Step 5. Reanalyze recalibrated spectra") + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=nLen)) + + w@spectra <- as(lapply(w@spectra, function(spec) { + #print(spec$id) + if(findLevel(spec@id,TRUE) == "unknown"){ + analyzeMethod <- "intensity" + } else { + analyzeMethod <- "formula" + } + s <- analyzeMsMs(msmsPeaks = spec, mode=mode, detail=TRUE, run="recalibrated", + filterSettings = settings$filterSettings, + spectraList = settings$spectraList, method = analyzeMethod) + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(s) + }), "SimpleList") + ## for(f in w@files) + ## w@spectra[[basename(as.character(f))]]@name <- basename(as.character(f)) + suppressWarnings(do.call(progressbar, list(object=pb, close=TRUE))) + + do.call(progressbar, list(object=pb, close=TRUE)) + } + # Step 6: aggregate recalibrated results + if(6 %in% steps) + { + message("msmsWorkflow: Step 6. Aggregate recalibrated results") + w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) + + if(RMassBank.env$verbose.output){ + numberOfPeaksThere <- sum(unlist(lapply(X = w@spectra, FUN = function(spec){ sum(unlist(lapply(X = spec@children, FUN = function(child){ child@peaksCount }))) }))) + if(nrow(w@aggregated) < numberOfPeaksThere) + cat(paste("### Warning ### The aggregation of spectra lead to the removal of ", (numberOfPeaksThere-nrow(w@aggregated)), " / ", numberOfPeaksThere, " peaks\n", sep = "")) + } + + if(!is.na(archivename)) + archiveResults(w, paste(archivename, ".RData", sep=''), settings) + w@aggregated <- cleanElnoise(peaks = w@aggregated, noise = settings$electronicNoise, width = settings$electronicNoiseWidth) + + if(RMassBank.env$verbose.output) + if(sum(w@aggregated$noise) > 0) + cat(paste("### Warning ### ", sum(w@aggregated$noise), " / ", nrow(w@aggregated), " peaks have been identified as electronic noise\n", sep = "")) + } + # Step 7: reanalyze failpeaks for (mono)oxidation and N2 adduct peaks + if(7 %in% steps) + { + message("msmsWorkflow: Step 7. Reanalyze fail peaks for N2 + O") + w@aggregated <- reanalyzeFailpeaks( + aggregated = w@aggregated, custom_additions="N2O", mode=mode, + filterSettings=settings$filterSettings, + progressbar=progressbar) + if(!is.na(archivename)) + archiveResults(w, paste(archivename, "_RA.RData", sep=''), settings) + + if(RMassBank.env$verbose.output){ + isNoFormula <- is.na(w@aggregated$formula) & is.na(w@aggregated$reanalyzed.formula) + noFormulaCount <- sum(isNoFormula) + if(noFormulaCount > 0){ + cat(paste("### Warning ### ", noFormulaCount, " / ", nrow(unique(x = w@aggregated[, c("mzFound", "intensity", "formulaCount", "dppmBest")])), " peaks have no molecular formula:\n", sep = "")) + print(w@aggregated[isNoFormula, c("mzFound","intensity","cpdID")]) + } + } + } + + # Step 8: heuristic filtering based on peak multiplicity; + # creation of failpeak list + if(8 %in% steps) + { + message("msmsWorkflow: Step 8. Peak multiplicity filtering") + if (is.null(settings$multiplicityFilter)) { + message("msmsWorkflow: Step 8. Peak multiplicity filtering skipped because multiplicityFilter parameter is not set.") + } else { + # apply heuristic filter + w@aggregated <- filterMultiplicity(w = w, archivename = archivename, mode = mode, multiplicityFilter = settings$multiplicityFilter) + + if(RMassBank.env$verbose.output){ + multiplicityNotOkCount <- sum(!w@aggregated$filterOK) + if(multiplicityNotOkCount > 0) + cat(paste("### Warning ### ", multiplicityNotOkCount, " / ", nrow(w@aggregated[, c("mzFound", "intensity", "formulaCount", "dppmBest")]), " peaks do not fulfill the multiplicity criterion\n", sep = "")) + } + + w@aggregated <- processProblematicPeaks(w, mode, archivename) + + if(RMassBank.env$verbose.output){ + problematicPeakCount <- sum(w@aggregated$problematicPeak) + if(problematicPeakCount > 0) + cat(paste("### Warning ### ", problematicPeakCount, " / ", nrow(w@aggregated), " peaks are problematic\n", sep = "")) + } + + if(!is.na(archivename)) + archiveResults(w, paste(archivename, "_RF.RData", sep=''), settings) + } + } + message("msmsWorkflow: Done.") + return(w) +} + +#' Analyze MSMS spectra +#' +#' Analyzes MSMS spectra of a compound by fitting formulas to each subpeak. +#' +#' The analysis function uses Rcdk. Note +#' that in this step, \emph{satellite peaks} are removed by a simple heuristic +#' rule (refer to the documentation of \code{\link{filterPeakSatellites}} for details.) +#' +## # @usage analyzeMsMs(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", +## # filterSettings = getOption("RMassBank")$filterSettings, +## # spectraList = getOption("RMassBank")$spectraList, method="formula") +## # +## # analyzeMsMs.formula(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", +## # filterSettings = getOption("RMassBank")$filterSettings, +## # spectraList = getOption("RMassBank")$spectraList) +## # +## # analyzeMsMs.intensity(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", +## # filterSettings = getOption("RMassBank")$filterSettings, +## # spectraList = getOption("RMassBank")$spectraList) +#' +#' @param msmsPeaks A \code{RmbSpectraSet} object. +#' Corresponds to a parent spectrum and children MSMS spectra of one compound (plus some metadata). +#' The objects are typically generated with \code{\link{findMsMsHR}}, and populate the \code{@@spectrum} slot +#' in a \code{msmsWorkspace} (refer to the corresponding +#' documentation for the precise format specifications). +#' @param mode Specifies the processing mode, i.e. which molecule species the +#' spectra contain. \code{\var{pH}} (positive H) specifies [M+H]+, +#' \code{\var{pNa}} specifies [M+Na]+, \code{\var{pM}} specifies [M]+, +#' \code{\var{mH}} and \code{\var{mNa}} specify [M-H]- and [M-Na]-, +#' respectively. (I apologize for the naming of \code{\var{pH}} which has +#' absolutely nothing to do with chemical \emph{pH} values.) +#' @param detail Whether detailed return information should be provided +#' (defaults to \code{FALSE}). See below. +#' @param run \code{"preliminary"} or \code{"recalibrated"}. In the +#' \code{preliminary} run, mass tolerance is set to 10 ppm (above m/z 120) and +#' 15 ppm (below m/z 120), the default intensity cutoff is $10^4$ for positive +#' mode (no default cutoff in negative mode), and the column \code{"mz"} from +#' the spectra is used as data source. In the \code{recalibrated} run, the +#' mass tolerance is set to 5 ppm over the whole mass range, the default cutoff +#' is 0 and the column \code{"mzRecal"} is used as source for the m/z values. +#' Defaults to \code{"preliminary"}. +#' @param filterSettings +#' Settings for the filter parameters, by default loaded from the RMassBank settings +#' set with e.g. \code{\link{loadRmbSettings}}. Must contain: +#' \itemize{ +#' \item \code{ppmHighMass}, allowed ppm deviation before recalibration +#' for high mass range +#' \item \code{ppmLowMass}, allowed ppm deviation before recalibration +#' for low mass range +#' \item \code{massRangeDivision}, division point between high and low mass +#' range (before recalibration) +#' \item \code{ppmFine}, allowed ppm deviation overall after recalibration +#' \item \code{prelimCut}, intensity cutoff for peaks in preliminary run +#' \item \code{prelimCutRatio}, relative intensity cutoff for peaks in +#' preliminary run, e.g. 0.01 = 1% +#' \item \code{fineCut}, intensity cutoff for peaks in second run +#' \item \code{fineCutRatio}, relative intensity cutoff for peaks in +#' second run +#' \item \code{specOkLimit}, minimum intensity of base peak for spectrum +#' to be accepted for processing +#' \item \code{dbeMinLimit}, minimum double bond equivalent for accepted +#' molecular subformula. +#' \item \code{satelliteMzLimit}, for satellite peak filtering +#' (\code{\link{filterPeakSatellites}}: mass window to use for satellite +#' removal +#' \item \code{satelliteIntLimit}, the relative intensity below which to +#' discard "satellites". (refer to \code{\link{filterPeakSatellites}}). +#' } +#' @param spectraList The list of MS/MS spectra present in each data block. As also +#' defined in the settings file. +#' @param method Selects which function to actually use for data evaluation. The default +#' "formula" runs a full analysis via formula assignment to fragment peaks. The +#' alternative setting "intensity" calls a "mock" implementation which circumvents +#' formula assignment and filters peaks purely based on intensity cutoffs and the +#' satellite filtering. (In this case, the ppm and dbe related settings in filterSettings +#' are ignored.) +#' @return The processed \code{RmbSpectraSet} object. +#' Added (or filled, respectively, since the slots are present before) data include +#' \item{list("complete")}{whether all spectra have useful value} +#' \item{list("empty")}{whether there are no useful spectra} +#' \item{list("children")}{ +#' The processed \code{RmbSpectrum2} objects (in a \code{RmbSpectrum2List}). +#' \itemize{ +#' \item \code{ok} if the spectrum was successfully processed with at least one resulting peak +#' \item \code{mz}, \code{intensity}: note that mz/int pairs can be duplicated when multiple matches +#' are found for one mz value, therefore the two slots are not necessarily unchanged from before +#' \item \code{rawOK} (logical) whether the m/z peak passes satellite/low removal +#' \item \code{low}, \code{satellite} if \code{TRUE}, the peak failed cutoff (\code{low}) or was removed as \code{satellite} +#' \item \code{formula}, \code{mzCalc}, \code{dppm}, \code{dbe} Formula, calculated mass, ppm deviation and dbe assigned to a peak +#' \item \code{formulaCount}, \code{dppmBest} Number of formulae matched for this m/z value and ppm deviation of the best match +#' \item \code{info} Spectrum identifying information (collision energy, resolution, collision mode) from +#' the \code{spectraList} +#' \item All other entries are retained from the original \code{RmbSpectrum2}. +#' } +#' } +#' @aliases analyzeMsMs analyzeMsMs.formula analyzeMsMs.intensity +#' @author Michael Stravs +#' @seealso \code{\link{msmsWorkflow}}, \code{\link{filterLowaccResults}}, +#' \code{\link{filterPeakSatellites}}, \code{\link{reanalyzeFailpeaks}} +#' @examples +#' +#' \dontrun{analyzed <- analyzeMsMs(spec, "pH", TRUE)} +#' +#' @export +analyzeMsMs <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", + filterSettings = getOption("RMassBank")$filterSettings, + spectraList = getOption("RMassBank")$spectraList, method="formula") +{ + ## .RmbSpectraSet <- setClass("RmbSpectraSet", + ## representation = representation( + ## parent = "Spectrum1", + ## children = "RmbSpectrum2List", + ## # These are done as slots and not as S4 functions, because they are set during the workflow + ## # in "checking" steps. It's easier. + ## found = "logical", + ## complete = "logical", + ## empty = "logical", + ## formula = "character", + ## id = "integer", + ## mz = "numeric", + ## name = "character", + ## annotations = "list" + ## ), + ## prototype = prototype( + ## parent = new("Spectrum1"), + ## children = new("RmbSpectrum2List"), + ## found = FALSE, + ## complete = NA, + ## empty = NA, + ## formula = character(), + ## id = integer(), + ## mz = numeric(), + ## name = character(), + ## annotations = list() + ## ) + ## ); + .checkMbSettings() + + + # Check whether the spectra can be fitted to the spectra list correctly! + if(length(msmsPeaks@children) != length(spectraList)) + { + warning(paste0( + "The spectra count of the substance ", msmsPeaks@id, " (", length(msmsPeaks@children), " spectra) doesn't match the provided spectra list (", length(spectraList), " spectra)." + )) + msmsPeaks@found <- FALSE + return(msmsPeaks) + + } + + if(msmsPeaks@found == FALSE) + return(msmsPeaks) + + if(method=="formula") + { + r <- (analyzeMsMs.formula(msmsPeaks, mode, detail, run, filterSettings + )) + } + else if(method == "intensity") + { + r <- (analyzeMsMs.intensity(msmsPeaks, mode, detail, run, filterSettings + )) + } + + # Add the spectrum labels to the spectra here. + # If there is any better place to do this, please tell me. I hate it. + # However, the info should be added in msmsWorkflow not in mbWorkflow, because two msmsWorkspaces with different spectraLists can be + # merged together with all the combine / pack stuff. + children <- mapply(function(spec, info) + { + spec@info <- info + spec + }, r@children, spectraList, SIMPLIFY=FALSE) + r@children <- as(children, "SimpleList") + + + #nspectra <- length(spectraList) + ok <- unlist(lapply(r@children, function(c) c@ok)) + r@complete <- FALSE + r@empty <- FALSE + if(all(ok)) + r@complete <- TRUE + if(all(!ok)) + r@empty <- TRUE + return(r) +} + + +#' @describeIn analyzeMsMs Analyze the peaks using formula annotation +#' @export +analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", + filterSettings = getOption("RMassBank")$filterSettings) +{ + cut <- 0 + cut_ratio <- 0 + if(run=="preliminary") + { + filterMode <- "coarse" + cut <- filterSettings$prelimCut + if(is.na(cut)) + { + if(mode %in% c("pH", "pM", "pNa", "pNH4")) + cut <- 1e4 + else if(mode %in% c("mH", "mFA","mM")) + cut <- 0 + else stop(paste("The ionization mode", mode, "is unknown.")) + } + cutRatio <- filterSettings$prelimCutRatio + } + else + { + filterMode <- "fine" + cut <- filterSettings$fineCut + cut_ratio <- filterSettings$fineCutRatio + if(is.na(cut)) cut <- 0 + } + + # find whole spectrum of parent peak, so we have reasonable data to feed into + # MolgenMsMs + parentSpectrum <- msmsPeaks@parent + + + # On each spectrum the following function analyzeTandemShot will be applied. + # It takes the raw peaks matrix as argument (mz, int) and processes the spectrum by + # filtering out low-intensity (<1e4) and shoulder peaks (deltam/z < 0.5, intensity + # < 5%) and subsequently matching the peaks to formulas using Rcdk, discarding peaks + # with insufficient match accuracy or no match. + analyzeTandemShot <- function(child) + { + childIdx <- which(sapply(X = seq_along(msmsPeaks@children), FUN = function(i){ + all(child@mz == msmsPeaks@children[[i]]@mz) & all(child@rt == msmsPeaks@children[[i]]@rt) & all(child@intensity == msmsPeaks@children[[i]]@intensity) } + )) + shot <- getData(child) + shot$row <- which(!is.na(shot$mz)) + + + # Filter out low intensity peaks: + child@low <- (shot$intensity < cut) | (shot$intensity < max(shot$intensity)*cut_ratio) + shot <- shot[!child@low,,drop=FALSE] + shot_full <- shot + + # Is there still anything left? + if(length(which(!child@low))==0) + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' contains only low-intensity peaks\n", sep = "")) + + return(child) + } + + # Filter out satellite peaks: + shot <- filterPeakSatellites(shot, filterSettings) + child@satellite <- rep(TRUE, child@peaksCount) + child@satellite[which(child@low == TRUE)] <- NA + child@satellite[shot$row] <- FALSE + + # Is there still anything left? + if(nrow(shot)==0) + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' contains no peaks after satellite filtering\n", sep = "")) + + return(child) + } + + if(max(shot$intensity) < as.numeric(filterSettings$specOkLimit)) + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is discarded due to parameter 'specOkLimit'\n", sep = "")) + + return(child) + } + + # Crop to 4 digits (necessary because of the recalibrated values) + # this was done for the MOLGEN MSMS type analysis, is not necessary anymore now (23.1.15 MST) + # shot[,mzColname] <- round(shot[,mzColname], 5) + + # here follows the Rcdk analysis + #------------------------------------ + parentPeaks <- data.frame(mzFound=msmsPeaks@mz, + formula=msmsPeaks@formula, + dppm=0, + x1=0,x2=0,x3=0) + + # define the adduct additions + if(mode == "pH") { + allowed_additions <- "H" + mode.charge <- 1 + } else if(mode == "pNa") { + allowed_additions <- "Na" + mode.charge <- 1 + } else if(mode == "pM") { + allowed_additions <- "" + mode.charge <- 1 + } else if(mode == "mM") { + allowed_additions <- "" + mode.charge <- -1 + } else if(mode == "mH") { + allowed_additions <- "H-1" + mode.charge <- -1 + } else if(mode == "mFA") { + allowed_additions <- "C2H3O2" + mode.charge <- -1 + } else if(mode == "pNH4") { + allowed_additions <- "NH4" + mode.charge <- 1 + } else{ + stop("mode = \"", mode, "\" not defined") + } + + # the ppm range is two-sided here. + # The range is slightly expanded because dppm calculation of + # generate.formula starts from empirical mass, but dppm cal- + # culation of the evaluation starts from theoretical mass. + # So we don't miss the points on 'the border'. + + if(run=="preliminary") + ppmlimit <- 2 * max(filterSettings$ppmLowMass, filterSettings$ppmHighMass) + else + ppmlimit <- 2.25 * filterSettings$ppmFine + + parent_formula <- add.formula(msmsPeaks@formula, allowed_additions) + dbe_parent <- dbe(parent_formula) + # check whether the formula is valid, i.e. has no negative or zero element numbers. + #print(parent_formula) + if(!is.valid.formula(parent_formula)) + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### The precursor ion formula of spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is invalid\n", sep = "")) + + return(child) + } + + limits <- to.limits.rcdk(parent_formula) + + peakmatrix <- lapply( + split(shot,shot$row) + , function(shot.row) { + # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae + # finally back-correct calculated masses for the charge + mass <- shot.row[["mz"]] + mass.calc <- mass + mode.charge * .emass + peakformula <- tryCatch(suppressWarnings(generate.formula(mass = mass.calc, window = ppm(mass.calc, ppmlimit, p=TRUE), + elements = limits, charge=0)), error=function(e) NA) + #peakformula <- tryCatch( + # generate.formula(mass, + # ppm(mass, ppmlimit, p=TRUE), + # limits, charge=1), + #error= function(e) list()) + + if(!is.list(peakformula)) + return(t(c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, + formula=NA, mzCalc=NA))) + else + { + return(t(sapply(peakformula, function(f) + { + mzCalc <- f@mass - mode.charge * .emass + c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, + formula=f@string, + mzCalc=mzCalc) + }))) + } + }) + + childPeaks <- as.data.frame(do.call(rbind, peakmatrix)) + + # Reformat the deformatted output correctly (why doesn't R have a better way to do this, e.g. avoid deformatting?) + + childPeaks$row <- as.numeric(as.character(childPeaks$row)) + childPeaks$intensity <- as.numeric(as.character(childPeaks$intensity)) + childPeaks$mz <- as.numeric(as.character(childPeaks$mz)) + childPeaks$formula <- as.character(childPeaks$formula) + childPeaks$mzCalc <- as.numeric(as.character(childPeaks$mzCalc)) + childPeaks$dppm <- (childPeaks$mz / childPeaks$mzCalc - 1) * 1e6 + childPeaks$dbe <- unlist(lapply(childPeaks$formula, dbe)) + + # childPeaks now contains all the good and unmatched peaks + # but not the ones which were cut as satellites or below threshold. + + ## child@mzFound <- rep(NA, child@peaksCount) + ## child@mzFound[childPeaks$row] <- as.numeric(as.character(childPeaks$mzFound)) + ## + ## child@formula <- rep(NA, child@peaksCount) + ## child@formula[childPeaks$row] <- as.character(childPeaks$formula) + ## + ## child@mzCalc <- rep(NA, child@peaksCount) + ## child@mzCalc[childPeaks$row] <- as.numeric(as.character(childPeaks$mzCalc)) + ## + ## child@dppm<- rep(NA, child@peaksCount) + ## child@dppm[childPeaks$row] <- (childPeaks$mzFound / childPeaks$mzCalc - 1) * 1e6 + # delete the NA data out again, because MolgenMsMs doesn't have them + # here and they will be re-added later + # (this is just left like this for "historical" reasons) + #childPeaks <- childPeaks[!is.na(childPeaks$formula),] + # check if a peak was recognized (here for the first time, + # otherwise the next command would fail) + + if(nrow(childPeaks)==0) + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is empty\n", sep = "")) + + return(child) + } + + # now apply the rule-based filters to get rid of total junk: + # dbe >= -0.5, dbe excess over mother cpd < 3 + # dbe() has been adapted to return NA for NA input + #iff_rcdk_pM_eln$maxvalence <- unlist(lapply(diff_rcdk_pM_eln$formula.rcdk, maxvalence)) + temp.child.ok <- (childPeaks$dbe >= filterSettings$dbeMinLimit) + # & dbe < dbe_parent + 3) + # check if a peak was recognized + if(length(which(temp.child.ok)) == 0) + { + child@ok <- FALSE + return(child) + } + #browser() + # find the best ppm value + bestPpm <- aggregate(as.data.frame(childPeaks[!is.na(childPeaks$dppm),"dppm"]), + list(childPeaks[!is.na(childPeaks$dppm),"row"]), + function(dppm) dppm[[which.min(abs(dppm))]]) + colnames(bestPpm) <- c("row", "dppmBest") + childPeaks <- merge(childPeaks, bestPpm, by="row", all.x=TRUE) + + # Deactivated the following lines because we never actually want to look at the "old" formula count. + # To be verified (cf Refiltering, failpeak list and comparable things) + + ## # count formulas found per mass + ## countFormulasTab <- xtabs( ~formula + mz, data=childPeaks) + ## countFormulas <- colSums(countFormulasTab) + ## childPeaks$formulaCount <- countFormulas[as.character(childPeaks$row)] + + # filter results + childPeaksFilt <- filterLowaccResults(childPeaks, filterMode, filterSettings) + childPeaksGood <- childPeaksFilt[["TRUE"]] + childPeaksBad <- childPeaksFilt[["FALSE"]] + if(is.null(childPeaksGood)){ + childPeaksGood <- childPeaks[c(),,drop=FALSE] + childPeaksGood$good <- logical(0) + } + if(is.null(childPeaksBad)) + childPeaksBad <- childPeaks[c(),,drop=FALSE] + childPeaksUnassigned <- childPeaks[is.na(childPeaks$dppm),,drop=FALSE] + childPeaksUnassigned$good <- rep(FALSE, nrow(childPeaksUnassigned)) + # count formulas within new limits + # (the results of the "old" count stay in childPeaksInt and are returned + # in $childPeaks) + countFormulasTab <- xtabs( ~formula + mz, data=childPeaksGood) + countFormulas <- colSums(countFormulasTab) + childPeaksGood$formulaCount <- countFormulas[as.character(childPeaksGood$mz)] + + childPeaksUnassigned$formulaCount <- rep(NA, nrow(childPeaksUnassigned)) + childPeaksBad$formulaCount <- rep(NA, nrow(childPeaksBad)) + childPeaksBad$good <- rep(FALSE, nrow(childPeaksBad)) + + # Now: childPeaksGood (containing the new, recounted peaks with good = TRUE), and childPeaksBad (containing the + # peaks with good=FALSE, i.e. outside filter criteria, with the old formula count even though it is worthless) + # are bound together. + childPeaksBad <- childPeaksBad[,colnames(childPeaksGood),drop=FALSE] + childPeaksUnassigned <- childPeaksUnassigned[,colnames(childPeaksGood),drop=FALSE] + childPeaks <- rbind(childPeaksGood, childPeaksBad, childPeaksUnassigned) + + # Now let's cross fingers. Add a good=NA column to the unmatched peaks and reorder the columns + # to match order in childPeaks. After that, setData to the child slot. + + childPeaksOmitted <- getData(child) + childPeaksOmitted <- childPeaksOmitted[child@low | child@satellite,,drop=FALSE] + childPeaksOmitted$rawOK <- rep(FALSE, nrow(childPeaksOmitted)) + childPeaksOmitted$good <- rep(FALSE, nrow(childPeaksOmitted)) + childPeaksOmitted$dppm <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$formula <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$mzCalc <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$dbe <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$dppmBest <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$formulaCount <- rep(0, nrow(childPeaksOmitted)) + childPeaks$satellite <- rep(FALSE, nrow(childPeaks)) + childPeaks$low <- rep(FALSE, nrow(childPeaks)) + childPeaks$rawOK <- rep(TRUE, nrow(childPeaks)) + + childPeaks <- childPeaks[,colnames(childPeaksOmitted), drop=FALSE] + + childPeaksTotal <- rbind(childPeaks, childPeaksOmitted) + child <- setData(child, childPeaksTotal) + child@ok <- TRUE + + return(child) + } + + # I believe these lines were fixed to remove a warning but in the refactored workflow "mzranges" doesn't exist anymore. + # Leave here for now + ## mzranges <- t(sapply(shots, function(p) { + ## if(!is.null(p$childRaw)){ + ## return(range(p$childRaw[,mzColname])) + ## } else { + ## return(c(NA,NA)) + ## } + ## })) + ## + ## mzmin <- min(mzranges[,1], na.rm=TRUE) + ## mzmax <- max(mzranges[,2], na.rm=TRUE) + children <- lapply(msmsPeaks@children, analyzeTandemShot) + + + + +## shots <- mapply(function(shot, scan, info) + ## { + ## shot$scan <- scan + ## shot$info <- info + ## shot$header <- msmsPeaks$childHeaders[as.character(scan),] + ## return(shot) + ## }, shots, msmsPeaks$childScans, spectraList, SIMPLIFY=FALSE) + msmsPeaks@children <- as(children, "SimpleList") + return(msmsPeaks) +} + + +#' @describeIn analyzeMsMs Analyze the peaks going only by intensity values +#' @export +analyzeMsMs.intensity <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", + filterSettings = getOption("RMassBank")$filterSettings) +{ + cut <- 0 + cut_ratio <- 0 + if(run=="preliminary") + { + filterMode <- "coarse" + cut <- filterSettings$prelimCut + if(is.na(cut)) + { + if(mode %in% c("pH", "pM", "pNa", "pNH4")) + cut <- 1e4 + else if(mode %in% c("mH", "mFA", "mM")) + cut <- 0 + else stop(paste("The ionization mode", mode, "is unknown.")) + } + cutRatio <- filterSettings$prelimCutRatio + } + else + { + filterMode <- "fine" + cut <- filterSettings$fineCut + cut_ratio <- filterSettings$fineCutRatio + if(is.na(cut)) cut <- 0 + } + + # find whole spectrum of parent peak, so we have reasonable data to feed into + + + # On each spectrum the following function analyzeTandemShot will be applied. + # It takes the raw peaks matrix as argument (mz, int) and processes the spectrum by + # filtering out low-intensity (<1e4) and shoulder peaks (deltam/z < 0.5, intensity + # < 5%) and subsequently matching the peaks to formulas using Rcdk, discarding peaks + # with insufficient match accuracy or no match. + analyzeTandemShot <- function(child) + { + shot <- getData(child) + shot$row <- which(!is.na(shot$mz)) + + # Filter out low intensity peaks: + child@low <- (shot$intensity < cut) | (shot$intensity < max(shot$intensity)*cut_ratio) + shot_full <- shot + shot <- shot[!child@low,,drop=FALSE] + + + # Is there still anything left? + if(length(which(!child@low))==0) + { + child@ok <- FALSE + return(child) + } + + # Filter out satellite peaks: + shot <- filterPeakSatellites(shot, filterSettings) + child@satellite <- rep(TRUE, child@peaksCount) + child@satellite[which(child@low == TRUE)] <- NA + child@satellite[shot$row] <- FALSE + + # Is there still anything left? + if(nrow(shot)==0) + { + child@ok <- FALSE + return(child) + } + + if(max(shot$intensity) < as.numeric(filterSettings$specOkLimit)) + { + child@ok <- FALSE + return(child) + } + + + # here follows the fake analysis + #------------------------------------ + parentPeaks <- data.frame(mzFound=msmsPeaks@mz, + formula=msmsPeaks@formula, + dppm=0, + x1=0,x2=0,x3=0) + + childPeaks <- addProperty(shot_full, "rawOK", "logical", FALSE) + childPeaks[!(child@low | child@satellite),"rawOK"] <- TRUE + + childPeaks <- addProperty(childPeaks, "good", "logical", FALSE) + childPeaks[childPeaks$rawOK,"good"] <- TRUE + + childPeaks <- addProperty(childPeaks, "mzCalc", "numeric") + childPeaks[childPeaks$rawOK,"mzCalc"] <- childPeaks[childPeaks$rawOK,"mz"] + + childPeaks <- addProperty(childPeaks, "formula", "character") + childPeaks[childPeaks$rawOK,"formula"] <- "" + + childPeaks <- addProperty(childPeaks, "dbe", "numeric") + childPeaks[childPeaks$rawOK,"dbe"] <- 0 + + childPeaks <- addProperty(childPeaks, "formulaCount", "integer") + childPeaks[childPeaks$rawOK,"formulaCount"] <- 1 + + childPeaks <- addProperty(childPeaks, "dppm", "numeric") + childPeaks[childPeaks$rawOK,"dppm"] <- 0 + + childPeaks <- addProperty(childPeaks, "dppmBest", "numeric") + childPeaks[childPeaks$rawOK,"dppmBest"] <- 0 + + child <- setData(child, childPeaks) + child@ok <- TRUE + return(child) + } + children <- lapply(msmsPeaks@children, analyzeTandemShot) + msmsPeaks@children <- as(children, "SimpleList") + #browser() + + return(msmsPeaks) + + # Omit all the stuff below for now, I don't believe it is needed. One thing is that spectraList info will have to be added somewhere else. + ## shots <- mapply(function(shot, scan, info) + ## { + ## shot$scan <- scan + ## shot$info <- info + ## shot$header <- msmsPeaks$childHeaders[as.character(scan),] + ## return(shot) + ## }, shots, msmsPeaks$childScans, spectraList, SIMPLIFY=FALSE) + ## + ## mzranges <- t(sapply(shots, function(p) {return(range(p$childRaw[,mzColname]))})) + ## mzmin <- min(mzranges[,1], na.rm=TRUE) + ## mzmax <- max(mzranges[,2], na.rm=TRUE) + ## + ## return(list( + ## msmsdata=shots, + ## mzrange=c(mzmin, mzmax), + ## id=msmsPeaks$id, + ## mode=mode, + ## parentHeader = msmsPeaks$parentHeader, + ## parentMs = msmsPeaks$parentPeak, + ## formula = msmsPeaks$formula, + ## foundOK = TRUE)) +} + + +#' Filter peaks with low accuracy +#' +#' Filters a peak table (with annotated formulas) for accuracy. Low-accuracy +#' peaks are removed. +#' +#' In the \code{coarse} mode, mass tolerance is set to 10 ppm (above m/z 120) +#' and 15 ppm (below m/z 120). This is useful for formula assignment before +#' recalibration, where a wide window is desirable to accomodate the high mass +#' deviations at low m/z values, so we get a nice recalibration curve. +#' +#' In the \code{fine} run, the mass tolerance is set to 5 ppm over the whole +#' mass range. This should be applied after recalibration. +#' +#' @usage filterLowaccResults(peaks, mode="fine", filterSettings = getOption("RMassBank")$filterSettings) +#' @param peaks A data frame with at least the columns \code{mzFound} and +#' \code{dppm}. +#' @param mode \code{coarse} or \code{fine}, see below. +#' @param filterSettings Settings for filtering. For details, see documentation of +#' \code{\link{analyzeMsMs}} +#' @return A \code{list(TRUE = goodPeakDataframe, FALSE = badPeakDataframe)} is +#' returned: A data frame with all peaks which are "good" is in +#' \code{return[["TRUE"]]}. +#' @author Michael Stravs +#' @seealso \code{\link{analyzeMsMs}}, \code{\link{filterPeakSatellites}} +#' @examples +#' +#' # from analyzeMsMs: +#' \dontrun{childPeaksFilt <- filterLowaccResults(childPeaksInt, filterMode)} +#' +#' +filterLowaccResults <- function(peaks, mode="fine", filterSettings = getOption("RMassBank")$filterSettings) +{ + # Check if filter settings are properly set, otherwise use defaults + if(is.null(filterSettings)) + { + filterSettings <- list( + ppmHighMass = 10, + ppmLowMass = 15, + massRangeDivision = 120, + ppmFine = 5) + } + + peaks$good = NA + peaks[!is.na(peaks$dppm), "good"] <- TRUE + + # coarse mode: to use for determinating the recalibration function + if(mode=="coarse") + { + if(nrow(peaks[which(abs(peaks$dppm) > filterSettings$ppmHighMass),])>0) + peaks[which(abs(peaks$dppm) > filterSettings$ppmHighMass), "good"] <- FALSE + if(nrow(peaks[which(peaks$mz > filterSettings$massRangeDivision & abs(peaks$dppm) > filterSettings$ppmLowMass),])>0) + peaks[which(peaks$mz > filterSettings$massRangeDivision & abs(peaks$dppm) > filterSettings$ppmLowMass), "good"] <- FALSE + } + # fine mode: for use after recalibration + else + { + if(nrow(peaks[which(abs(peaks$dppm) > filterSettings$ppmFine),]) > 0) + peaks[which(abs(peaks$dppm) > filterSettings$ppmFine), "good"] <- FALSE + } + return(split(peaks, peaks$good)) +} + +#' Aggregate analyzed spectra +#' +#' Groups an array of analyzed spectra and creates aggregated peak tables +#' +#' \code{\var{addIncomplete}} is relevant for recalibration. For recalibration, +#' we want to use only high-confidence peaks, therefore we set +#' \code{\var{addIncomplete}} to \code{FALSE}. When we want to generate a peak +#' list for actually generating MassBank records, we want to include all peaks +#' into the peak tables. +#' +#' @usage aggregateSpectra(spec, addIncomplete=FALSE) +#' @param spec The \code{RmbSpectraSetList} of spectra sets (\code{RmbSpectraSet} objects) to aggregate +#' @param addIncomplete Whether or not the peaks from incomplete files (files +#' for which less than the maximal number of spectra are present) +#' @return +#' A summary \code{data.frame} with all peaks (possibly multiple rows for one m/z value from a spectrum, see below) with columns: +#' \item{mzFound, intensity}{Mass and intensity of the peak} +#' \item{good}{if the peak passes filter criteria} +#' \item{mzCalc, formula, dbe, dppm}{calculated mass, formula, dbe and ppm deviation of the assigned formula} +#' \item{formulaCount, dppmBest}{Number of matched formulae for this m/z value, and ppm deviation of the best match} +#' \item{scan, cpdID, parentScan}{Scan number of the child and parent spectrum in the raw file, also the compound ID to which the peak belongs} +#' \item{dppmRc}{ppm deviation recalculated from the aggregation function} +#' \item{index}{Aggregate-table peak index, so the table can be subsetted, edited and results reinserted back into this table easily} +#' Further columns are later added by workflow steps 6 (electronic noise culler), 7 and 8. +#' +#' @author Michael Stravs +#' @seealso \code{\link{msmsWorkflow}}, \code{\link{analyzeMsMs}} +#' @examples +#' +#' ## As used in the workflow: +#' \dontrun{% +#' w@@spectra <- lapply(w@@spectra, function(spec) +#' analyzeMsMs(spec, mode="pH", detail=TRUE, run="recalibrated", cut=0, cut_ratio=0 ) ) +#' w@@aggregate <- aggregateSpectra(w@@spectra) +#' } +#' +#' @export +aggregateSpectra <- function(spec, addIncomplete=FALSE) +{ + + if(addIncomplete) + aggSpectra <- selectSpectra(spec, "found", "object") + else + aggSpectra <- selectSpectra(spec, "complete", "object") + + compoundTables <- lapply(aggSpectra, function(s) + { + tables.c <- lapply(s@children, function(c) + { + table.c <- getData(c) + table.c <- table.c[table.c$rawOK,,drop=FALSE] + # remove superfluous columns, since only rawOK peaks are selected anyway + table.c$rawOK <- NULL + table.c$low <- NULL + table.c$satellite <- NULL + # add scan no + table.c$scan <- rep(c@acquisitionNum, nrow(table.c)) + return(table.c) + }) + table.cpd <- do.call(rbind, tables.c) + table.cpd$cpdID <- rep(s@id, nrow(table.cpd)) + table.cpd$parentScan <- rep(s@parent@acquisitionNum, nrow(table.cpd)) + return(table.cpd) + }) + #return(compoundTables) + aggTable <- do.call(rbind, compoundTables) + colnames(aggTable)[1] <- "mzFound" + + aggTable <- addProperty(aggTable, "dppmRc", "numeric") + aggTable <- addProperty(aggTable, "index", "integer") + if(nrow(aggTable) > 0) + aggTable$index <- 1:nrow(aggTable) + + aggTable[aggTable$good, "dppmRc"] <- (aggTable[aggTable$good, "mzFound"]/aggTable[aggTable$good, "mzCalc"] - 1)*1e6 + + + return(aggTable) +} + +#' Remove electronic noise +#' +#' Removes known electronic noise peaks from a peak table +#' +#' @usage cleanElnoise(peaks, noise=getOption("RMassBank")$electronicNoise, +#' width = getOption("RMassBank")$electronicNoiseWidth) +#' @param peaks An aggregated peak frame as described in \code{\link{aggregateSpectra}}. Columns +#' \code{mzFound}, \code{dppm} and \code{dppmBest} are needed. +#' @param noise A numeric vector of known m/z of electronic noise peaks from the instrument +#' Defaults to the entries in the RMassBank settings. +#' @param width The window for the noise peak in m/z units. Defaults to the entries in +#' the RMassBank settings. +#' @return Extends the aggregate data frame by column \code{noise} (logical), which is \code{TRUE} if the peak is marked as noise. +#' +#' @author Michael Stravs +#' @seealso \code{\link{msmsWorkflow}} +#' @examples +#' # As used in the workflow: +#' \dontrun{ +#' w@@aggregated <- +#' cleanElnoise(w@@aggregated) +#' } +#' @export +cleanElnoise <- function(peaks, noise=getOption("RMassBank")$electronicNoise, + width = getOption("RMassBank")$electronicNoiseWidth) +{ + peaks <- addProperty(peaks, "noise", "logical", FALSE) + + # I don't think this makes sense if using one big table... + ## # use only best peaks + ## p_best <- peaks[is.na(peaks$dppmBest) | (peaks$dppm == peaks$dppmBest),] + + # remove known electronic noise + p_eln <- peaks + for(noisePeak in noise) + { + noiseMatches <- which(!((p_eln$mzFound > noisePeak + width) | (p_eln$mzFound < noisePeak - width))) + if(length(noiseMatches) > 0) + p_eln[noiseMatches, "noise"] <- TRUE + } + return(p_eln) +} + +#' Identify intense peaks (in a list of unmatched peaks) +#' +#' Finds a list of peaks in spectra with a high relative intensity (>10% and +#' 1e4, or >1% and 1e5) to write a list of peaks which must be manually +#' checked. Peaks orbiting around the parent peak mass (calculated from the +#' compound ID), which are very likely co-isolated substances, are ignored. +#' +#' +#' @usage problematicPeaks(peaks_unmatched, peaks_matched, mode = "pH") +#' @param peaks_unmatched Table of unmatched peaks, with at least \code{cpdID, +#' scan, mzFound, int}. +#' @param peaks_matched Table of matched peaks (used for base peak reference), +#' with at least \code{cpdID, scan, int}. +#' @param mode Processing mode (\code{"pH", "pNa"} etc.) +#' @return A filtered table with the potentially problematic peaks, including +#' the precursor mass and MSMS base peak intensity (\code{aMax}) for reference. +#' @author Michael Stravs +#' @seealso \code{\link{msmsWorkflow}} +#' @examples \dontrun{ +#' # As used in the workflow: +#' fp <- problematicPeaks(specs[!specs$filterOK & !specs$noise & +#' ((specs$dppm == specs$dppmBest) | (is.na(specs$dppmBest))) +#' ,,drop=FALSE], peaksMatched(w), mode) +#' } +#' @export +problematicPeaks <- function(peaks_unmatched, peaks_matched, mode="pH") +{ + # find spectrum maximum for each peak, and merge into table + if(nrow(peaks_matched) == 0){ + assIntMax <- data.frame(list(integer(0),integer(0),integer(0))) + } else{ + assIntMax <- as.data.frame(aggregate(as.data.frame(peaks_matched$intensity), + by=list(peaks_matched$cpdID, peaks_matched$scan), max)) + } + colnames(assIntMax) <- c("cpdID", "scan", "aMax") + peaks_unmatched <- merge(peaks_unmatched, assIntMax) + # which of these peaks are intense? + p_control <- peaks_unmatched[ + ( (peaks_unmatched$intensity > 1e5) & + (peaks_unmatched$intensity > 0.01*peaks_unmatched$aMax)) + | ( (peaks_unmatched$intensity > 1e4) & + (peaks_unmatched$intensity > 0.1* peaks_unmatched$aMax)) ,] + # find parent m/z to exclude co-isolated peaks + #p_control$mzCenter <- numeric(nrow(p_control)) + p_control$mzCenter <- as.numeric( + unlist(lapply(p_control$cpdID, function(id) findMz(id, mode, retrieval=findLevel(id,TRUE))$mzCenter)) ) + p_control_noMH <- p_control[ + (p_control$mzFound < p_control$mzCenter - 1) | + (p_control$mzFound > p_control$mzCenter + 1),] + return(p_control_noMH) +} + + +#' Generate list of problematic peaks +#' +#' Generates a list of intense unmatched peaks for further review (the "failpeak list") and exports it if the archive name is given. +#' +#' @param w \code{msmsWorkspace} to analyze. +#' @param mode Processing mode (pH etc) +#' @param archivename Base name of the archive to write to (for "abc" the exported failpeaks list will be "abc_Failpeaks.csv"). +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @return Returns the aggregate data.frame with added column "\code{problematic}" (logical) which marks peaks which match the problematic criteria +#' +#' @author stravsmi +#' @export +processProblematicPeaks <- function(w, mode, archivename = NA) +{ + + specs <- w@aggregated + fp <- problematicPeaks(specs[!specs$filterOK & !specs$noise & + ((specs$dppm == specs$dppmBest) | (is.na(specs$dppmBest))) + ,,drop=FALSE], peaksMatched(w), mode) + fp$OK <- rep('', nrow(fp)) + fp$name <- rownames(fp) + + fp <- fp[with(fp, + order(cpdID, mzCalc, scan)), + ] + + # Select the correct precursor scans. This serves to filter the list + # for the cases where multiple workspaces were combined after step 7 + # with combineMultiplicities. + # Note that this has drawbacks. Leaving the "duplicates" in would make it more easy + # to identify legitimate unformulaed peaks. We might experiment by marking them up + # somehow. + precursors <- unlist(lapply(selectSpectra(w, "found", "object"), function(s) s@parent@acquisitionNum)) + fp <- fp[ + fp$parentScan %in% precursors + ,] + + # Add the info to specs + specs <- addProperty(specs, "problematicPeak", "logical", FALSE) + specs[match(fp$index, specs$index),"problematicPeak"] <- TRUE + + # Select the columns for output into the failpeaks file + fp <- fp[,c("OK", "name", "cpdID", "scan", "mzFound", "formula", + "reanalyzed.formula", "mzCalc", "reanalyzed.mzCalc", "dppm", "reanalyzed.dppm", "dbe", "reanalyzed.dbe", "intensity", + "formulaCount", "reanalyzed.formulaCount", "parentScan", "aMax", "mzCenter")] + if(!is.na(archivename)) + write.csv(fp, file= + paste(archivename,"_Failpeaks.csv", sep=''), row.names=FALSE) + + return(specs) +} + +#' Recalibrate MS/MS spectra +#' +#' Recalibrates MS/MS spectra by building a recalibration curve of the +#' assigned putative fragments of all spectra in \code{aggregatedSpecs} +#' (measured mass vs. mass of putative associated fragment) and additionally +#' the parent ion peaks. +#' +#' Note that the actually used recalibration functions are governed by the +#' general MassBank settings (see \code{\link{recalibrate}}). +#' +#' If a set of acquired LC-MS runs contains spectra for two different ion types +#' (e.g. [M+H]+ and [M+Na]+) which should both be processed by RMassBank, it is +#' necessary to do this in two separate runs. Since it is likely that one ion type +#' will be the vast majority of spectra (e.g. most in [M+H]+ mode), and only few +#' spectra will be present for other specific adducts (e.g. only few [M+Na]+ spectra), +#' it is possible that too few spectra are present to build a good recalibration curve +#' using only e.g. the [M+Na]+ ions. Therefore we recommend, for one set of LC/MS runs, +#' to build the recalibration curve for one ion type +#' (\code{msmsWorkflow(mode="pH", steps=c(1:8), newRecalibration=TRUE)}) +#' and reuse the same curve for processing different ion types +#' (\code{msmsWorkflow(mode="pNa", steps=c(1:8), newRecalibration=FALSE)}). +#' This also ensures a consistent recalibration across all spectra of the same batch. +#' +#' @usage makeRecalibration(w, mode, +#' recalibrateBy = getOption("RMassBank")$recalibrateBy, +#' recalibrateMS1 = getOption("RMassBank")$recalibrateMS1, +#' recalibrator = getOption("RMassBank")$recalibrator, +#' recalibrateMS1Window = getOption("RMassBank")$recalibrateMS1Window +#' ) +#' +#' recalibrateSpectra(mode, rawspec = NULL, rc = NULL, rc.ms1=NULL, w = NULL, +#' recalibrateBy = getOption("RMassBank")$recalibrateBy, +#' recalibrateMS1 = getOption("RMassBank")$recalibrateMS1) +#' +#' recalibrateSingleSpec(spectrum, rc, +#' recalibrateBy = getOption("RMassBank")$recalibrateBy) +#' @aliases makeRecalibration recalibrateSpectra recalibrateSingleSpec +#' @param w For \code{makeRecalibration}: to perform the recalibration with. For \code{recalibrateSpectra}: +#' the \code{msmsWorkspace} which contains the recalibration curves (alternatively to specifying \code{rc, rc.ms1}). +#' @param spectrum For \code{recalibrateSingleSpec}: +#' a \code{MSnbase} \code{Spectrum}-derived object, commonly a \code{RmbSpectrum2} for MS2 or \code{Spectrum1} for MS1. +#' @param mode \code{"pH", "pNa", "pM", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M-H]-, [M]-, [M+FA]-). +#' @param rawspec For \code{recalibrateSpectra}:an \code{RmbSpectraSetList} of \code{RmbSpectraSet} objects +#' , as the \code{w@@spectra} slot from \code{msmsWorkspace} or any object returned by \code{\link{findMsMsHR}}. +#' If empty, no spectra are recalibrated, but the recalibration curve is +#' returned. +#' @param rc,rc.ms1 The recalibration curves to be used in the recalibration. +#' @param recalibrateBy Whether recalibration should be done by ppm ("ppm") or by m/z ("mz"). +#' @param recalibrateMS1 Whether MS1 spectra should be recalibrated separately ("separate"), +#' together with MS2 ("common") or not at all ("none"). Usually taken from settings. +#' @param recalibrator The recalibrator functions to be used. +#' Refer to \code{\link{recalibrate}} for details. Usually taken from settings. +#' @param recalibrateMS1Window Window width to look for MS1 peaks to recalibrate (in ppm). +#' @return \code{makeRecalibration}: a \code{list(rc, rc.ms1)} with recalibration curves +#' for the MS2 and MS1 spectra. +#' +#' \code{recalibrateSpectra}: if \code{rawspec} is not \code{NULL}, returns the recalibrated +#' spectra as \code{RmbSpectraSetList}. All spectra have their mass recalibrated and evaluation data deleted. +#' +#' \code{recalibrateSingleSpec}: the recalibrated \code{Spectrum} (same object, recalibrated masses, +#' evaluation data like assigned formulae etc. deleted). +#' +#' @examples \dontrun{ +#' rcCurve <- recalibrateSpectra(w, "pH") +#' w@@spectra <- recalibrateSpectra(mode="pH", rawspec=w@@spectra, w=myWorkspace) +#' w@@spectra <- recalibrateSpectra(mode="pH", rawspec=w@@spectra, rcCurve$rc, rcCurve$rc.ms1) +#' } +#' +#' @author Michael Stravs, Eawag +#' @export +makeRecalibration <- function(w, mode, + recalibrateBy = getOption("RMassBank")$recalibrateBy, + recalibrateMS1 = getOption("RMassBank")$recalibrateMS1, + recalibrator = getOption("RMassBank")$recalibrator, + recalibrateMS1Window = getOption("RMassBank")$recalibrateMS1Window + ) +{ + if(is.null(w@spectra)) + stop("No spectra present to generate recalibration curve.") + + rcdata <- peaksMatched(w) + rcdata <- rcdata[rcdata$formulaCount == 1, ,drop=FALSE] + + rcdata <- rcdata[,c("mzFound", "dppm", "mzCalc")] + + if(nrow(rcdata) == 0) + stop("No peaks matched to generate recalibration curve.") + + ms1data <- recalibrate.addMS1data(w@spectra, mode, recalibrateMS1Window) + ms1data <- ms1data[,c("mzFound", "dppm", "mzCalc")] + + if (recalibrateMS1 != "none") { + ## Add m/z values from MS1 to calibration datapoints + rcdata <- rbind(rcdata, ms1data) + } + + rcdata$dmz <- rcdata$mzFound - rcdata$mzCalc + ms1data$dmz <- ms1data$mzFound - ms1data$mzCalc + + if(recalibrateBy == "dppm") + { + rcdata$recalfield <- rcdata$dppm + ms1data$recalfield <- ms1data$dppm + } + else + { + rcdata$recalfield <- rcdata$dmz + ms1data$recalfield <- ms1data$dmz + } + + # generate recalibration model + rc <- do.call(recalibrator$MS2, list(rcdata)) + if(recalibrateMS1 == "separate") + rc.ms1 <- do.call(recalibrator$MS1, list(ms1data)) + else + rc.ms1 <- rc + + # plot the model + par(mfrow=c(2,2)) + if(nrow(rcdata)>0) + plotRecalibration.direct(rcdata, rc, rc.ms1, "MS2", + range(rcdata$mzFound), + recalibrateBy) + if(nrow(ms1data)>0) + plotRecalibration.direct(ms1data, rc, rc.ms1, "MS1", + range(ms1data$mzFound), + recalibrateBy) + # Return the computed recalibration curves + return(list(rc=rc, rc.ms1=rc.ms1)) +} + + + +#' Plot the recalibration graph. +#' +#' @aliases plotRecalibration plotRecalibration.direct +#' @usage plotRecalibration(w, recalibrateBy = getOption("RMassBank")$recalibrateBy) +#' +#' plotRecalibration.direct(rcdata, rc, rc.ms1, title, mzrange, +#' recalibrateBy = getOption("RMassBank")$recalibrateBy) +#' +#' @param w The workspace to plot the calibration graph from +#' @param rcdata A data frame with columns \code{recalfield} and \code{mzFound}. +#' @param rc Predictor for MS2 data +#' @param rc.ms1 Predictor for MS1 data +#' @param title Prefix for the graph titles +#' @param mzrange m/z value range for the graph +#' @param recalibrateBy Whether recalibration was done by ppm ("ppm") or by m/z ("mz"). +#' Important only for graph labeling here. +#' +#' @author Michele Stravs, Eawag +#' @export +plotRecalibration <- function(w, recalibrateBy = getOption("RMassBank")$recalibrateBy) +{ + spec <- w@aggregated + if(!is.null(w@parent)) + spec <- w@parent@aggregated + + rcdata <- data.frame(mzFound = w@rc$x, recalfield = w@rc$y) + ms1data <- data.frame(mzFound = w@rc.ms1$x, recalfield = w@rc.ms1$y) + + + + par(mfrow=c(2,2)) + if(nrow(rcdata)>0) + plotRecalibration.direct(rcdata, w@rc, w@rc.ms1, "MS2", + range(spec$mzFound[which(spec$good)]),recalibrateBy) + if(nrow(ms1data)>0) + plotRecalibration.direct(ms1data, w@rc, w@rc.ms1, "MS1", + range(ms1data$mzFound),recalibrateBy) + +} + +#' @export +plotRecalibration.direct <- function(rcdata, rc, rc.ms1, title, mzrange, + recalibrateBy = getOption("RMassBank")$recalibrateBy + ) +{ + if(recalibrateBy == "dppm") + ylab.plot <- expression(paste(delta, "ppm")) + else + ylab.plot <- expression(paste(delta, "m/z")) + + plot(recalfield ~ mzFound, data=rcdata, + xlab = "m/z", ylab = ylab.plot, main=paste(title, "scatterplot")) + RcModelMz <- seq(mzrange[[1]], mzrange[[2]], by=0.2) + RcModelRecal <- predict(rc, newdata= data.frame(mzFound =RcModelMz)) + RcModelRecalMs1 <- predict(rc.ms1, newdata= data.frame(mzFound =RcModelMz)) + lines(RcModelMz, RcModelRecal, col="blue") + lines(RcModelMz, RcModelRecalMs1, col="yellow") + if((length(unique(rcdata$mzFound))>1) & + (length(unique(rcdata$recalfield))>1)) + { + if(requireNamespace("gplots",quietly=TRUE)) + { + + gplots::hist2d(rcdata$mzFound, rcdata$recalfield, + col=c("white", heat.colors(12)), xlab="m/z", + ylab = ylab.plot, main=paste(title, "density")) + lines(RcModelMz, RcModelRecal, col="blue") + lines(RcModelMz, RcModelRecalMs1, col="yellow") + } + else + { + message("Package gplots not installed. The recalibration density plot will not be displayed.") + message("To install gplots: install.packages('gplots')") + } + } +} + + +#' @export +recalibrateSpectra <- function(mode, rawspec = NULL, rc = NULL, rc.ms1=NULL, w = NULL, + recalibrateBy = getOption("RMassBank")$recalibrateBy, + recalibrateMS1 = getOption("RMassBank")$recalibrateMS1) +{ + # Load the recal curves from the workspace if one is specified. + if(!is.null(w)) + { + rc <- w@rc + rc.ms1 <- w@rc.ms1 + } + if(is.null(rc) || is.null(rc.ms1)) + stop("Please specify the recalibration curves either via workspace (w) or via parameters rc, rc.ms1.") + + # Do the recalibration + if(!is.null(rawspec)) + { + # go through all raw spectra and recalculate m/z values + recalibratedSpecs <- lapply(rawspec, function(s) + { + if(s@found) + { + # recalculate tandem spectrum peaks + recalSpectra <- lapply(s@children, function(p) + { + recalibrateSingleSpec(p, rc, recalibrateBy) + }) + s@children <- as(recalSpectra, "SimpleList") + # recalculate MS1 spectrum if required + if(recalibrateMS1 != "none") + { + s@parent <- recalibrateSingleSpec(s@parent, rc.ms1, recalibrateBy) + } + } + s@empty <- NA + s@complete <- NA + return(s) + } ) + return(as(recalibratedSpecs, "SimpleList")) + } + else # no rawspec passed + return(list()) +} + +#' @export +recalibrateSingleSpec <- function(spectrum, rc, + recalibrateBy = getOption("RMassBank")$recalibrateBy) +{ + spectrum.df <- as.data.frame(spectrum) + spectrum.df <- spectrum.df[!duplicated(spectrum.df$mz),,drop=FALSE] + spectrum.df <- spectrum.df[order(spectrum.df$mz),,drop=FALSE] + + mzVals <- spectrum.df + if(nrow(mzVals) > 0) + { + # Fix the column names so our + # prediction functions choose the right + # rows. + colnames(mzVals) <- c("mzFound", "int") + drecal <- predict(rc, newdata=mzVals) + if(recalibrateBy == "dppm") + mzRecal <- mzVals$mzFound / (1 + drecal/1e6) + else + mzRecal <- mzVals$mzFound - drecal + # And rename them back so our "mz" column is + # called "mz" again + } + spectrum.df$mz <- mzRecal + + + # now comes the part that I don't like too much; this could be improved by using as.data.frame instead of getData and correspondingly + # also not use setData. For now I leave it like this. + # The problem is that I am not sure whether the default behaviour of as.RmbSpectrum2 should be clean=TRUE or FALSE, + # and vice versa, I am not sure if as.data.frame should return only mz/int or the whole table. + + if(is(spectrum, "RmbSpectrum2")) + { + # this removes all evaluated data that were added in step 2 except for @ok I think + colnames(spectrum.df) <- c("mz", "intensity") + spectrum <- setData(spectrum, spectrum.df, clean=TRUE) + # It also avoids making a new object when we don't know what class it should be + } + else + { + # for Spectrum1 or all others that we don't know + spectrum@mz <- spectrum.df$mz + spectrum@intensity <- spectrum.df$i + } + + return(spectrum) +} + + + + + +#' Filter satellite peaks +#' +#' Filters satellite peaks in FT spectra which arise from FT artifacts and from +#' conversion to stick mode. A very simple rule is used which holds mostly true +#' for MSMS spectra (and shouldn't be applied to MS1 spectra which contain +#' isotope structures...) +#' +#' The function cuts off all peaks within 0.5 m/z from every peak, in +#' decreasing intensity order, which are below 5% of the referring peak's +#' intensity. E.g. for peaks m/z=100, int=100; m/z=100.2, int=2, m/z=100.3, +#' int=6, m/z 150, int=10: The most intense peak (m/z=100) is selected, all +#' neighborhood peaks below 5% are removed (in this case, only the m/z=100.2 +#' peak) and the next less intense peak is selected. Here this is the m/z=150 +#' peak. All low-intensity neighborhood peaks are removed (nothing). The next +#' less intense peak is selected (m/z=100.3) and again neighborhood peaks are +#' cut away (nothing to cut here. Note that the m/z = 100.2 peak was alredy +#' removed.) +#' +#' @usage filterPeakSatellites(peaks, filterSettings = getOption("RMassBank")$filterSettings) +#' @param peaks A peak dataframe with at least the columns \code{mz, int}. Note +#' that \code{mz} is used even for the recalibrated spectra, i.e. the +#' desatellited spectrum is identical for both the unrecalibrated and the +#' recalibrated spectra. +#' @param filterSettings The settings used for filtering. Refer to \code{\link{analyzeMsMs}} +#' documentation for filter settings. +#' @return Returns the peak table with satellite peaks removed. +#' @note This is a very crude rule, but works remarkably well for our spectra. +#' @author Michael Stravs +#' @seealso \code{\link{analyzeMsMs}}, \code{\link{filterLowaccResults}} +#' @examples +#' +#' # From the workflow: +#' \dontrun{ +#' # Filter out satellite peaks: +#' shot <- filterPeakSatellites(shot) +#' shot_satellite_n <- setdiff(row.names(shot_full), row.names(shot)) +#' shot_satellite <- shot_full[shot_satellite_n,] +#' # shot_satellite contains the peaks which were eliminated as satellites. +#' } +#' +#' @export +filterPeakSatellites <- function(peaks, filterSettings = getOption("RMassBank")$filterSettings) +{ + cutoff_int_limit <- filterSettings$satelliteIntLimit + cutoff_mz_limit <- filterSettings$satelliteMzLimit + # Order by intensity (descending) + peaks_o <- peaks[order(peaks$intensity, decreasing=TRUE),,drop=FALSE] + n <- 1 + # As long as there are peaks left AND the last peak is small enough (relative + # to selected), move to the next peak + while(n < nrow(peaks_o)) + { + if(peaks_o[nrow(peaks_o),"intensity"] >= cutoff_int_limit *peaks_o[n,"intensity"]) + break + # remove all peaks within cutoff_mz_limit (std. m/z = 0.5) which have intensity + # of less than 5% relative to their "parent" peak + # + peaks_l <- peaks_o[ (peaks_o$mz > peaks_o[n,"mz"] - cutoff_mz_limit) + & (peaks_o$mz < peaks_o[n,"mz"] + cutoff_mz_limit) + & (peaks_o$intensity < cutoff_int_limit * peaks_o[n,"intensity"]),,drop=FALSE] + peaks_o <- peaks_o[ !((peaks_o$mz > peaks_o[n,"mz"] - cutoff_mz_limit) + & (peaks_o$mz < peaks_o[n,"mz"] + cutoff_mz_limit) + & (peaks_o$intensity < cutoff_int_limit * peaks_o[n,"intensity"]) + ),,drop=FALSE] + n <- n+1 + } + return(peaks_o[order(peaks_o$mz),,drop=FALSE]) +} + + +#' Reanalyze unmatched peaks +#' +#' Reanalysis of peaks with no matching molecular formula by allowing +#' additional elements (e.g. "N2O"). +#' +#' \code{reanalyzeFailpeaks} examines the \code{unmatchedPeaksC} table in +#' \code{specs} and sends every peak through \code{reanalyzeFailpeak}. +#' +#' @aliases reanalyzeFailpeaks reanalyzeFailpeak +#' @usage reanalyzeFailpeaks(aggregated, custom_additions, mode, filterSettings = +#' getOption("RMassBank")$filterSettings, progressbar = "progressBarHook") +#' reanalyzeFailpeak(custom_additions, mass, cpdID, counter, pb = NULL, mode, +#' filterSettings = getOption("RMassBank")$filterSettings) +#' @param aggregated A peake aggregate table (\code{w@@aggregate}) (after processing electronic noise removal!) +#' @param custom_additions The allowed additions, e.g. "N2O". +#' @param mode Processing mode (\code{"pH", "pNa", "mH"} etc.) +#' @param mass (Usually recalibrated) m/z value of the peak. +#' @param cpdID Compound ID of this spectrum. +#' @param counter Current peak index (used exclusively for the progress +#' indicator) +#' @param pb A progressbar object to display progress on, as passed by +#' \code{reanalyzeFailpeaks} to \code{reanalyzeFailpeak}. No progress +#' is displayed if NULL. +#' @param progressbar The progress bar callback to use. Only needed for specialized +#' applications. Cf. the documentation of \code{\link{progressBarHook}} for usage. +#' @param filterSettings Settings for filtering data. Refer to\code{\link{analyzeMsMs}} for settings. +#' @return The aggregate data frame extended by the columns: +#' #' \item{reanalyzed.???}{If reanalysis (step 7) has already been processed: matching values from the reanalyzed peaks} +#' \item{matchedReanalysis}{Whether reanalysis has matched (\code{TRUE}), not matched(\code{FALSE}) or has not been conducted for the peak(\code{NA}).} +#' +#' It would be good to merge the analysis functions of \code{analyzeMsMs} with +#' the one used here, to simplify code changes. +#' @author Michael Stravs +#' @seealso \code{\link{analyzeMsMs}}, \code{\link{msmsWorkflow}} +#' @examples +#' +#' ## As used in the workflow: +#' \dontrun{ +#' reanalyzedRcSpecs <- reanalyzeFailpeaks(w@@aggregated, custom_additions="N2O", mode="pH") +#' # A single peak: +#' reanalyzeFailpeak("N2O", 105.0447, 1234, 1, 1, "pH") +#' } +#' +#' @export +reanalyzeFailpeaks <- function(aggregated, custom_additions, mode, filterSettings = + getOption("RMassBank")$filterSettings, progressbar = "progressBarHook") +{ + + fp <- peaksUnmatched(aggregated, cleaned=TRUE) + fp <- fp[is.na(fp$dppm) | (fp$dppm == fp$dppmBest),] + #fp <- pu[!pu$noise,,drop=FALSE] + + custom_additions_l <- as.list(rep(x=custom_additions, times=nrow(fp))) + mode_l <- as.list(rep(x=mode, times=nrow(fp))) + nLen <- nrow(fp) + + pb <- do.call(progressbar, list(object=NULL, value=0, min=0, max=max(nLen,1))) + temp <- data.frame() + if(nLen == 0) + { + message("reanalyzeFailpeaks: No peaks to reanalyze.") + temp <- data.frame( + "reanalyzed.formula" = character(), + "reanalyzed.mzCalc" = numeric(), + "reanalyzed.dppm" = numeric(), + "reanalyzed.formulaCount" = numeric(), + "reanalyzed.dbe" = numeric()) + } + else + { + counter <- as.list(1:nrow(fp)) + # this is the reanalysis step: run reanalyze.failpeak (with the relevant parameters) + # on each failpeak. + temp <- mapply(reanalyzeFailpeak, custom_additions_l, fp$mzFound, fp$cpdID, counter, + MoreArgs=list(mode=mode, pb=list(hook=progressbar, bar=pb), filterSettings=filterSettings)) + # reformat the result and attach it to specs + temp <- as.data.frame(t(temp)) + temp <- temp[,c("reanalyzed.formula", "reanalyzed.mzCalc", "reanalyzed.dppm", + "reanalyzed.formulaCount", "reanalyzed.dbe")] + } + + # Add columns to the aggregated table (they are then filled in with the obtained values for reanalyzed peaks and left + # empty otherwise + aggregated <- addProperty(aggregated, "reanalyzed.formula", "character") + aggregated <- addProperty(aggregated, "reanalyzed.mzCalc", "numeric") + aggregated <- addProperty(aggregated, "reanalyzed.dppm", "numeric") + aggregated <- addProperty(aggregated, "reanalyzed.formulaCount", "numeric") + aggregated <- addProperty(aggregated, "reanalyzed.dbe", "numeric") + aggregated <- addProperty(aggregated, "matchedReanalysis", "logical", NA) + + + peaksReanalyzed <- cbind(fp, temp) + + # Since some columns are in "list" type, they disturb later on. + # therefore, fix them and make them normal vectors. + listcols <- unlist(lapply(colnames(peaksReanalyzed), function(col) + is.list(peaksReanalyzed[,col]))) + for(col in colnames(peaksReanalyzed)[which(listcols==TRUE)]) + peaksReanalyzed[,col] <- + unlist(peaksReanalyzed[,col]) + + peaksReanalyzed$matchedReanalysis <- !is.na(peaksReanalyzed$reanalyzed.dppm) + + # Substitute in the reanalyzed peaks into the aggregated table + aggregated[match(peaksReanalyzed$index, aggregated$index),] <- peaksReanalyzed + + do.call(progressbar, list(object=pb, close=TRUE)) + return(aggregated) +} + + +#' @export +reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, mode, + filterSettings = getOption("RMassBank")$filterSettings) +{ + # the counter to show the progress + if(!is.null(pb)) + { + do.call(pb$hook, list(object=pb$bar, value=counter)) + } + # here follows the Rcdk analysis + #------------------------------------ + + # define the adduct additions + if(mode == "pH") { + allowed_additions <- "H" + mode.charge <- 1 + } else if(mode == "pNa") { + allowed_additions <- "Na" + mode.charge <- 1 + } else if(mode == "pM") { + allowed_additions <- "" + mode.charge <- 1 + } else if(mode == "mM") { + allowed_additions <- "" + mode.charge <- -1 + } else if(mode == "mH") { + allowed_additions <- "H-1" + mode.charge <- -1 + } else if(mode == "mFA") { + allowed_additions <- "C2H3O2" + mode.charge <- -1 + } else { + stop("mode = \"", mode, "\" not defined") + } + + # the ppm range is two-sided here. + # The range is slightly expanded because dppm calculation of + # generate.formula starts from empirical mass, but dppm cal- + # culation of the evaluation starts from theoretical mass. + # So we don't miss the points on 'the border'. + + db_formula <- findFormula(cpdID, retrieval=findLevel(cpdID,TRUE)) + + ppmlimit <- 2.25 * filterSettings$ppmFine + parent_formula <- add.formula(db_formula, allowed_additions) + parent_formula <- add.formula(parent_formula, custom_additions) + dbe_parent <- dbe(parent_formula) + # check whether the formula is valid, i.e. has no negative or zero element numbers. + #print(parent_formula) + limits <- to.limits.rcdk(parent_formula) + + peakformula <- tryCatch(suppressWarnings(generate.formula(mass, ppm(mass, ppmlimit, p=TRUE), + limits, charge=mode.charge)), error=function(e) NA) + # was a formula found? If not, return empty result + if(!is.list(peakformula)) + return(as.data.frame( + t(c(mzFound=as.numeric(as.character(mass)), + reanalyzed.formula=NA, reanalyzed.mzCalc=NA, reanalyzed.dppm=NA, + reanalyzed.formulaCount=0, + reanalyzed.dbe=NA)))) + else # if is.list(peakformula) + # formula found? then return the one with lowest dppm + { + # calculate dppm for all formulas + peakformula <- sapply(peakformula, function(f) + { + l <- list(mzFound=as.numeric(as.character(mass)), + reanalyzed.formula=as.character(f@string), + reanalyzed.mzCalc=as.numeric(as.character(f@mass)) + ) + + return(unlist(l)) + }) + + # filter out bad dbe stuff + peakformula <- as.data.frame(t(peakformula)) + # for some reason completely oblivious to me, the columns in peakformula + # are still factors, even though i de-factored them by hand. + # Therefore, convert them again... + peakformula$mzFound <- as.numeric(as.character(peakformula$mzFound)) + peakformula$reanalyzed.formula <- as.character(peakformula$reanalyzed.formula) + peakformula$reanalyzed.mzCalc <- as.numeric(as.character(peakformula$reanalyzed.mzCalc)) + + peakformula$reanalyzed.dppm <- (peakformula$mzFound / peakformula$reanalyzed.mzCalc - 1) * 1e6 + peakformula$reanalyzed.formulaCount=nrow(peakformula) + + # filter out bad dbe and high ppm stuff + peakformula$reanalyzed.dbe <- unlist(lapply(peakformula$reanalyzed.formula, dbe)) + peakformula <- peakformula[(peakformula$reanalyzed.dbe >= filterSettings$dbeMinLimit) + & (abs(peakformula$reanalyzed.dppm) < filterSettings$ppmFine),] + # is there still something left? + if(nrow(peakformula) == 0) + return(as.data.frame( + t(c(mzFound=as.numeric(as.character(mass)), + reanalyzed.formula=NA, reanalyzed.mzCalc=NA, reanalyzed.dppm=NA, + reanalyzed.formulaCount=0, reanalyzed.dbe = NA)))) + else + { + #update formula count to the remaining formulas + peakformula$reanalyzed.formulaCount=nrow(peakformula) + return(peakformula[which.min(abs(peakformula$reanalyzed.dppm)),]) + } + + } # endif is.list(peakformula) + + + + } + +#' Multiplicity filtering: Removes peaks which occur only once in a n-spectra set. +#' +#' For every compound, every peak (with annotated formula) is compared +#' across all spectra. Peaks whose formula occurs only once for all collision energies +#' / spectra types, are discarded. This eliminates "stochastic formula hits" of pure +#' electronic noise peaks efficiently from the spectra. Note that in the author's +#' experimental setup two spectra were recorded at every collision energy, +#' and therefore every peak-formula should appear +#' at least twice if it is real, even if it is by chance a fragment which appears +#' on only one collision energy setting. The function was not tested in a different +#' setup. Therefore, use with a bit of caution. +#' @usage filterPeaksMultiplicity(peaks, formulacol, recalcBest = TRUE) +#' @param peaks An aggregate peak data.frame containing all peaks to be analyzed; with at least +#' the columns \code{cpdID, scan, mzFound} and one column for the formula +#' specified with the \code{formulacol} parameter. +#' @param formulacol Which column the assigned formula is stored in. (Needed to separately process \code{"formula"} and +#' \code{"reanalyzed.formula"} multiplicites.) +#' @param recalcBest Whether the best formula for each peak should be re-determined. +#' This is necessary for results from the ordinary \code{\link{analyzeMsMs}} +#' analysis which allows multiple potential formulas per peak - the old best match +#' could potentially have been dropped because of multiplicity filtering. For results +#' from \code{\link{reanalyzeFailpeak}} this is not necessary, since only one potential +#' formula is assigned in this case. +#' @return The peak table is returned, enriched with columns: +#' \itemize{ +#' \item{\code{formulaMultiplicity}}{The # of occurrences of this formula +#' in the spectra of its compounds.} +#' } +#' @examples \dontrun{ +#' peaksFiltered <- filterPeaksMultiplicity(peaksMatched(w), +#' "formula", TRUE) +#' peaksOK <- subset(peaksFiltered, formulaMultiplicity > 1) +#' } +#' @author Michael Stravs, EAWAG +#' @export +filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) +{ + # create dummy for the case that we have no rows + multInfo <- data.frame(cpdID = character(), + formulacol = character(), + formulaMultiplicity = numeric()) + # rename (because "formulacol" is not the actually correct name) + colnames(multInfo) <- c("cpdID", formulacol, "formulaMultiplicity") + + if(!is.data.frame(peaks) || (nrow(peaks) == 0) ) + { + peaks <- cbind(peaks, data.frame(formulaMultiplicity=numeric())) + if(recalcBest){ + if(formulacol == "formula"){ + warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") + } + if(formulacol == "reanalyzed.formula"){ + warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") + } + peaks$fM_factor <- as.factor(peaks$formulaMultiplicity) + return(peaks) + } + } + else + { + # calculate duplicity info + multInfo <- aggregate(as.data.frame(peaks$scan), list(peaks$cpdID, peaks[,formulacol]), FUN=length) + # just for comparison: + # nform <- unique(paste(pks$cpdID,pks$formula)) + + # merge the duplicity info into the peak table + colnames(multInfo) <- c("cpdID", formulacol, "formulaMultiplicity") + peaks <- merge(peaks, multInfo) + } + + # separate log intensity data by duplicity (needs duplicity as a factor) + # and boxplot + peaks$fM_factor <- as.factor(peaks$formulaMultiplicity) + + # nostalgy: dppmBest first, to compare :) + # now we prioritize the most frequent formula instead, and only then apply the + # dppmBest rule + #pks2 <- subset(pks, dppm==dppmBest) + + # split peak intensity by multiplicity + peakMultiplicitySets <- split(log(peaks$int,10), peaks$fM_factor) + #boxplot(peakMultiplicitySets) + # nice plot :) + #if(length(peakMultiplicitySets) > 0) + # q <- quantile(peakMultiplicitySets[[1]], c(0,.25,.5,.75,.95,1)) + pk_data <- lapply(peakMultiplicitySets, length) + + # now by formula, not by peak: + multInfo$fM_factor <- as.factor(multInfo$formulaMultiplicity) + # the formulas are split into bins with their multiplicity + # (14 bins for our 14-spectra method) + formulaMultiplicitySets <- split(multInfo[,formulacol], multInfo$fM_factor) + formulaMultiplicityHist <- lapply(formulaMultiplicitySets, length) + + # if we use recalcBest, then we recalculate which peak in the + # list was best. We do this for the peaks matched in the first analysis. + # The peaks from the reanalysis are single anyway and don't get this additional + # treatment. + + if(recalcBest == FALSE) + return(peaks) + + # prioritize duplicate peaks + # get unique peaks with their maximum-multiplicity formula attached + best_mult <- aggregate(as.data.frame(peaks$formulaMultiplicity), + list(peaks$cpdID, peaks$scan, peaks$mzFound), + max) + colnames(best_mult) <- c("cpdID", "scan", "mzFound", "bestMultiplicity") + peaks <- merge(peaks, best_mult) + peaks <- peaks[peaks$formulaMultiplicity==peaks$bestMultiplicity,] + + # now we also have to recalculate dppmBest since the "old best" may have been + # dropped. + peaks$dppmBest <- NULL + bestPpm <- aggregate(as.data.frame(peaks$dppm), + list(peaks$cpdID, peaks$scan, peaks$mzFound), + function(dppm) dppm[[which.min(abs(dppm))]]) + colnames(bestPpm) <- c("cpdID", "scan", "mzFound", "dppmBest") + peaks <- merge(peaks, bestPpm) + pks_best <- peaks[peaks$dppm==peaks$dppmBest,] + + # And, iteratively, the multiplicity also must be recalculated, because we dropped + # some peaks and the multiplicites of some of the formulas will have decreased. + + pks_best$formulaMultiplicity <- NULL + pks_best$bestMultiplicity <- NULL + multInfo_best <- aggregate(as.data.frame(pks_best$scan), + list(pks_best$cpdID, pks_best[,formulacol]), + FUN=length) + colnames(multInfo_best) <- c("cpdID", formulacol, "formulaMultiplicity") + pks_best <- merge(pks_best, multInfo_best) + pks_best$fM_factor <- as.factor(pks_best$formulaMultiplicity) + multInfo_best$fM_factor <- as.factor(multInfo_best$formulaMultiplicity) + + formulaMultplicitySets_best <- split(multInfo_best[,formulacol], multInfo_best$fM_factor) + formulaMultplicityHist_best <- lapply(formulaMultplicitySets_best, length) + + peakMultiplicitySets_best <- split(log(pks_best$int,10), pks_best$fM_factor) + #boxplot(peakMultiplicitySets_best) + #q <- quantile(peakMultiplicitySets_best[[1]], c(0,.25,.5,.75,.95,1)) + #peakMultiplicityHist_best <- lapply(peakMultiplicitySets_best, length) + #q + pks_best$fM_factor <- NULL + # this returns the "best" peaks (first by formula multiplicity, then by dppm) + # before actually cutting the bad ones off. + + + return(pks_best) +} + + +#' filterMultiplicity +#' +#' Multiplicity filtering: Removes peaks which occur only once in a n-spectra +#' set. +#' +#' This function executes multiplicity filtering for a set of spectra using the +#' workhorse function \code{\link{filterPeaksMultiplicity}} (see details there) +#' and retrieves problematic filtered peaks (peaks which are of high intensity +#' but were discarded, because either no formula was assigned or it was not +#' present at least 2x), using the workhorse function +#' \code{\link{problematicPeaks}}. The results are returned in a format ready +#' for further processing with \code{\link{mbWorkflow}}. +#' +#' @usage filterMultiplicity(w, archivename=NA, mode="pH", recalcBest = TRUE, +#' multiplicityFilter = getOption("RMassBank")$multiplicityFilter) +#' @param w Workspace containing the data to be processed (aggregate table and \code{RmbSpectraSet} objects) +#' @param archivename The archive name, used for generation of +#' archivename_Failpeaks.csv +#' @param mode Mode of ion analysis +#' @param recalcBest Boolean, whether to recalculate the formula multiplicity +#' after the first multiplicity filtering step. Sometimes, setting this +#' to FALSE can be a solution if you have many compounds with e.g. fluorine +#' atoms, which often have multiple assigned formulas per peak and might occasionally +#' lose peaks because of that. +#' @param multiplicityFilter Threshold for the multiplicity filter. If set to 1, +#' no filtering will apply (minimum 1 occurrence of peak). 2 equals minimum +#' 2 occurrences etc. +#' @return A list object with values: +#' \item{peaksOK}{ Peaks with >1-fold formula multiplicity from the +#' "normal" peak analysis. } +#' \item{peaksReanOK}{ Peaks with >1-fold formula multiplicity from +#' peak reanalysis. } +#' \item{peaksFiltered}{ All peaks with annotated formula multiplicity from +#' first analysis. } +#' \item{peaksFilteredReanalysis}{ All peaks with annotated +#' formula multiplicity from peak reanalysis. } +#' \item{peaksProblematic}{ Peaks with high intensity which do not match +#' inclusion criteria -> possible false negatives. The list will be +#' exported into archivename_failpeaks.csv. +#' } +#' @author Michael Stravs +#' @seealso +#' \code{\link{filterPeaksMultiplicity}},\code{\link{problematicPeaks}} +#' @examples +#' \dontrun{ +#' refilteredRcSpecs <- filterMultiplicity( +#' w, "myarchive", "pH") +#' } +#' @export +filterMultiplicity <- function(w, archivename=NA, mode="pH", recalcBest = TRUE, + multiplicityFilter = getOption("RMassBank")$multiplicityFilter) +{ + # Read multiplicity filter setting + # For backwards compatibility: If the option is not set, define as 2 + # (which was the behaviour before we introduced the option) + if(is.null(multiplicityFilter)) + multiplicityFilter <- 2 + + specs <- w@aggregated + + peaksFiltered <- filterPeaksMultiplicity(peaksMatched(specs), "formula", recalcBest) + peaksFilteredReanalysis <- filterPeaksMultiplicity(specs[!is.na(specs$matchedReanalysis) & specs$matchedReanalysis,,drop=FALSE], "reanalyzed.formula", FALSE) + + specs <- addProperty(specs, "formulaMultiplicity", "numeric", 0) + + # Reorder the columns of the filtered peaks such that they match the columns + # of the original aggregated table; such that the columns can be substituted in. + + peaksFiltered <- peaksFiltered[,colnames(specs)] + peaksFilteredReanalysis <- peaksFilteredReanalysis[,colnames(specs)] + + # substitute into the parent dataframe + specs[match(peaksFiltered$index,specs$index),] <- peaksFiltered + specs[match(peaksFilteredReanalysis$index,specs$index),] <- peaksFilteredReanalysis + + + specs <- addProperty(specs, "filterOK", "logical", FALSE) + + OKindex <- which(specs$formulaMultiplicity > (multiplicityFilter - 1)) + + if(length(OKindex)){ + specs[OKindex,"filterOK"] <- TRUE + } + + peaksReanOK <- specs[ + specs$filterOK & !is.na(specs$matchedReanalysis) & specs$matchedReanalysis,,drop=FALSE] + + # Kick the M+H+ satellites out of peaksReanOK: + peaksReanOK$mzCenter <- as.numeric( + unlist(lapply(peaksReanOK$cpdID, function(id) findMz(id, retrieval=findLevel(id,TRUE))$mzCenter)) ) + peaksReanBad <- peaksReanOK[ + !((peaksReanOK$mzFound < peaksReanOK$mzCenter - 1) | + (peaksReanOK$mzFound > peaksReanOK$mzCenter + 1)),] + notOKindex <- match(peaksReanBad$index, specs$index) + if(length(notOKindex)){ + specs[notOKindex,"filterOK"] <- FALSE + } + + + return(specs) +} + +#' Return MS1 peaks to be used for recalibration +#' +#' Returns the precursor peaks for all MS1 spectra in the \code{spec} dataset +#' with annotated formula to be used in recalibration. +#' +#' For all spectra in \code{spec$specFound}, the precursor ion is extracted from +#' the MS1 precursor spectrum. All found ions are returned in a data frame with a +#' format matching \code{spec$peaksMatched} and therefore suitable for \code{rbind}ing +#' to the \code{spec$peaksMatched} table. However, only minimal information needed for +#' recalibration is returned. +#' +#' @usage recalibrate.addMS1data(spec,mode="pH", recalibrateMS1Window = +#' getOption("RMassBank")$recalibrateMS1Window) +#' @param spec A \code{msmsWorkspace} or \code{RmbSpectraSetList} containing spectra for which MS1 "peaks" should be "constructed". +#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' @param recalibrateMS1Window Window width to look for MS1 peaks to recalibrate (in ppm). +#' @return A dataframe with columns \code{mzFound, formula, mzCalc, dppm, dbe, int, +#' dppmBest, formulaCount, good, cpdID, scan, parentScan, dppmRc}. However, +#' columns \code{dbe, int, formulaCount, good, scan, parentScan} do not contain +#' real information and are provided only as fillers. +#' @examples \dontrun{ +#' # More or less as used in recalibrateSpectra: +#' rcdata <- peaksMatched(w) +#' rcdata <- rcdata[rcdata$formulaCount == 1, ,drop=FALSE] +#' ms1data <- recalibrate.addMS1data(w, "pH", 15) +#' rcdata <- rbind(rcdata, ms1data) +#' # ... continue constructing recalibration curve with rcdata +#' } +#' @author Michael Stravs, EAWAG +#' @export +recalibrate.addMS1data <- function(spec,mode="pH", recalibrateMS1Window = + getOption("RMassBank")$recalibrateMS1Window) +{ + ## which_OK <- lapply(validPrecursors, function(pscan) + ## { + ## pplist <- as.data.frame( + ## mzR::peaks(msRaw, which(headerData$acquisitionNum == pscan))) + ## colnames(pplist) <- c("mz","int") + ## pplist <- subset(pplist, mz >= mzLimits$mzMin & mz <= mzLimits$mzMax) + ## if(nrow(pplist) > 0) + ## return(TRUE) + ## return(FALSE) + ## }) + + specFound <- selectSpectra(spec, "found", "object") + + ms1peaks <- lapply(specFound, function(cpd){ + mzL <- findMz.formula(cpd@formula,mode,recalibrateMS1Window,0) + mzCalc <- mzL$mzCenter + ms1 <- mz(cpd@parent) + + mzFound <- ms1[which.min(abs(ms1 - mzL$mzCenter))] + if(!length(mzFound)){ + return(c( + mzFound = NA, + mzCalc = mzCalc, + dppm = NA + )) + } else { + dppmRc <- (mzFound/mzCalc - 1)*1e6 + return(c( + mzFound = mzFound, + mzCalc = mzCalc, + dppm = dppmRc, + id=cpd@id + )) + } + }) + ms1peaks <- as.data.frame(do.call(rbind, ms1peaks), stringsAsFactors=FALSE) + # convert numbers to numeric + tonum <- c("mzFound", "dppm", "mzCalc") + ms1peaks[,tonum] <- as.numeric(unlist(ms1peaks[,tonum])) + # throw out NA stuff + ms1peaks <- ms1peaks[!is.na(ms1peaks$mzFound),] + return(ms1peaks) +} + + +# Custom recalibration function: You can overwrite the recal function by +# making any function which takes rcdata$recalfield ~ rcdata$mzFound. +# The settings define which recal function is used +# getOption("RMassBank")$recalibrator = list( +# MS1 = "recalibrate.loess", +# MS2 = "recalibrate.loess") + +#' Predefined recalibration functions. +#' +#' Predefined fits to use for recalibration: Loess fit and GAM fit. +#' +#' \code{recalibrate.loess()} provides a Loess fit (\code{recalibrate.loess}) +#' to a given recalibration parameter. +#' If MS and MS/MS data should be fit together, recalibrate.loess +#' provides good default settings for Orbitrap instruments. +#' +#' \code{recalibrate.identity()} returns a non-recalibration, i.e. a predictor +#' which predicts 0 for all input values. This can be used if the user wants to +#' skip recalibration in the RMassBank workflow. +#' +#' #' \code{recalibrate.mean()} and \code{recalibrate.linear()} are simple recalibrations +#' which return a constant shift or a linear recalibration. They will be only useful +#' in particular cases. +#' +#' \code{recalibrate()} itself is only a dummy function and does not do anything. +#' +#' Alternatively other functions can be defined. Which functions are used for recalibration +#' is specified by the RMassBank options file. (Note: if \code{recalibrateMS1: common}, the +#' \code{recalibrator: MS1} value is irrelevant, since for a common curve generated with +#' the function specified in \code{recalibrator: MS2} will be used.) +#' +#' @aliases recalibrate.loess recalibrate recalibrate.identity recalibrate.mean recalibrate.linear +#' @usage recalibrate.loess(rcdata) +#' +#' recalibrate.identity(rcdata) +#' +#' recalibrate.mean(rcdata) +#' +#' recalibrate.linear(rcdata) +#' +#' @param rcdata A data frame with at least the columns \code{recalfield} and +#' \code{mzFound}. \code{recalfield} will usually contain delta(ppm) or +#' delta(mz) values and is the target parameter for the recalibration. +#' @return Returns a model for recalibration to be used with \code{predict} and the like. +#' @examples \dontrun{ +#' rcdata <- subset(spec$peaksMatched, formulaCount==1) +#' ms1data <- recalibrate.addMS1data(spec, mode, 15) +#' rcdata <- rbind(rcdata, ms1data) +#' rcdata$recalfield <- rcdata$dppm +#' rcCurve <- recalibrate.loess(rcdata) +#' # define a spectrum and recalibrate it +#' s <- matrix(c(100,150,200,88.8887,95.0005,222.2223), ncol=2) +#' colnames(s) <- c("mz", "int") +#' recalS <- recalibrateSingleSpec(s, rcCurve) +#' +#' Alternative: define an custom recalibrator function with different parameters +#' recalibrate.MyOwnLoess <- function(rcdata) +#' { +#' return(loess(recalfield ~ mzFound, data=rcdata, family=c("symmetric"), +#' degree = 2, span=0.4)) +#' } +#' # This can then be specified in the RMassBank settings file: +#' # recalibrateMS1: common +#' # recalibrator: +#' # MS1: recalibrate.loess +#' # MS2: recalibrate.MyOwnLoess") +#' # [...] +#' } +#' @author Michael Stravs, EAWAG +#' @export +recalibrate <- function() +{ + return(NA) +} + +#' @export +recalibrate.loess <- function(rcdata) +{ + span <- 0.25 + # ex XCMS (permission by Steffen): heuristically decide on loess vs linear + mingroups <- nrow(rcdata[!is.na(rcdata$mzFound),]) + if(mingroups < 4) + { + warning("recalibrate.loess: Not enough data points, omitting recalibration") + return(recalibrate.identity(rcdata)) + } else if (mingroups*span < 4) { + span <- 4/mingroups + warning("recalibrate.loess: Span too small, resetting to ", round(span, 2)) + } + return(loess(recalfield ~ mzFound, data=rcdata, family=c("symmetric"), + degree = 1, span=0.25, surface="direct" )) +} + +#' @export +recalibrate.identity <- function(rcdata) +{ + return(lm(recalfield ~ 0, data=rcdata)) +} + +#' @export +recalibrate.mean <- function(rcdata) +{ + return(lm(recalfield ~ 1, data=rcdata)) +} + +#' @export +recalibrate.linear <- function(rcdata) +{ + return(lm(recalfield ~ mzFound, data=rcdata)) +} + +#' Standard progress bar hook. +#' +#' This function provides a standard implementation for the progress bar in RMassBank. +#' +#' RMassBank calls the progress bar function in the following three ways: +#' \code{pb <- progressBarHook(object=NULL, value=0, min=0, max=LEN)} +#' to create a new progress bar. +#' \code{pb <- progressBarHook(object=pb, value= VAL)} +#' to set the progress bar to a new value (between the set \code{min} and \code{max}) +#' \code{progressBarHook(object=pb, close=TRUE)} +#' to close the progress bar. (The actual calls are performed with \code{do.call}, +#' e.g. +#' \code{progressbar <- "progressBarHook" +#' pb <- do.call(progressbar, list(object=pb, value= nProg)) +#' }. See the source code for details.) +#' +#' To substitute the standard progress bar for an alternative implementation (e.g. for +#' use in a GUI), the developer can write his own function which behaves in the same way +#' as \code{progressBarHook}, i.e. takes the same parameters and can be called in the +#' same way. +#' +#' @param object An identifier representing an instance of a progress bar. +#' @param value The new value to assign to the progress indicator +#' @param min The minimal value of the progress indicator +#' @param max The maximal value of the progress indicator +#' @param close If \code{TRUE}, the progress bar is closed. +#' @return Returns a progress bar instance identifier (i.e. an identifier +#' which can be used as \code{object} in subsequent calls.) +#' +#' @author Michele Stravs, Eawag +#' @export +progressBarHook <- function(object = NULL, value = 0, min = 0, max = 100, close = FALSE) +{ + if(is.null(object)) + { + object <- txtProgressBar(min, max, value, style=3, file=stderr()) + } + if(close) + close(object) + else + { + setTxtProgressBar(object, value) + return(object) + } +} diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index 096be69..1f8f5f3 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -1,1099 +1,1099 @@ -## For generating the NAMESPACE -#' @import mzR - -#' @import Rcpp -## Was not in manually written NAMESPACE ? -#' @import RCurl -#' @import XML -#' @import methods -#' @import mzR -#' @import rcdk -#' @import rjson -#' @import yaml -#' @import digest -NULL # This is required so that roxygen knows where the first manpage starts - - -# # importClassesFrom mzR ## Causes error -# # importMethodsFrom mzR - -#' Extract MS/MS spectra for specified precursor -#' -#' Extracts MS/MS spectra from LC-MS raw data for a specified precursor, specified -#' either via the RMassBank compound list (see \code{\link{loadList}}) or via a mass. -#' -#' Different versions of the function get the data from different sources. Note that -#' findMsMsHR and findMsMsHR.direct differ mainly in that findMsMsHR opens a file -#' whereas findMsMs.direct uses an open file handle - both are intended to be used -#' in a full process which involves compound lists etc. In contrast, findMsMsHR.mass -#' is a low-level function which uses the mass directly for lookup and is intended for -#' use as a standalone function in unrelated applications. -#' -#' @note \code{findMsMs.direct} is deactivated -#' -## # @usage findMsMsHR(fileName, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE, -## # ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine, -## # mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse, -## # fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, -## # rtMargin = getOption("RMassBank")$rtMargin, -## # deprofile = getOption("RMassBank")$deprofile, -## # headerCache = NULL, -## # peaksCache = NULL) -## # -## # findMsMsHR.mass(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA, -## # headerCache = NULL, fillPrecursorScan = FALSE, -## # deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL, cpdID = NA) -#' -#' -#' @aliases findMsMsHR.mass findMsMsHR -#' @param fileName The file to open and search the MS2 spectrum in. -#' @param msRaw The opened raw file (mzR file handle) to search the MS2 spectrum in. Specify either this -#' or \code{fileName}. -#' @param cpdID The compound ID in the compound list (see \code{\link{loadList}}) -#' to use for formula lookup. Note: In \\code{findMsMsHR.mass}, this is entirely optional and -#' used only in case a warning must be displayed; compound lookup is done via mass only. -#' @param mz The mass to use for spectrum search. -#' @param ppmFine The limit in ppm to use for fine limit (see below) calculation. -#' @param mzCoarse The coarse limit to use for locating potential MS2 scans: -#' this tolerance is used when finding scans with a suitable precursor -#' ion value. -#' @param limit.fine The fine limit to use for locating MS2 scans: this tolerance -#' is used when locating an appropriate analyte peak in the MS1 precursor -#' spectrum. -#' @param limit.coarse Parameter in \code{findMsMsHR.mass} corresponding to \code{mzCoarse}. -#' (The parameters are distinct to clearly conceptually distinguish findMsMsHR.mass -#' (a standalone useful function) from the cpdID based functions (workflow functions).) -#' @param mode The processing mode (determines which ion/adduct is searched): -#' \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). -#' @param confirmMode Whether to use the highest-intensity precursor (=0), second- -#' highest (=1), third-highest (=2)... -#' @param useRtLimit Whether to respect retention time limits from the compound list. -#' @param rtLimits \code{c(min, max)}: Minimum and maximum retention time to use -#' when locating the MS2 scans. -#' @param headerCache If present, the complete \code{mzR::header(msRaw)}. Passing -#' this value is useful if spectra for multiple compounds should be -#' extracted from the same mzML file, since it avoids getting the data -#' freshly from \code{msRaw} for every compound. -#' @param peaksCache If present, the complete output of \code{mzR::peaks(msRaw)}. This speeds up the lookup -#' if multiple compounds should be searched in the same file. -#' @param maxCount The maximal number of spectra groups to return. One spectra group -#' consists of all data-dependent scans from the same precursor whose precursor -#' mass matches the specified search mass. -#' @param fillPrecursorScan If \code{TRUE}, the precursor scan will be filled from MS1 data. -#' To be used for data where the precursor scan is not stored in the raw data. -#' @param rtMargin The retention time tolerance to use. -#' @param deprofile Whether deprofiling should take place, and what method should be -#' used (cf. \code{\link{deprofile}}) -#' @param retrieval A value that determines whether the files should be handled either as "standard", -#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" -#' if the only know thing is the m/z -#' @return An \code{RmbSpectraSet} (for \code{findMsMsHR}). Contains parent MS1 spectrum (\code{@@parent}), a block of dependent MS2 spectra ((\code{@@children}) -#' and some metadata (\code{id},\code{mz},\code{name},\code{mode} in which the spectrum was acquired. -#' -#' For \code{findMsMsHR.mass}: a list of \code{RmbSpectraSet}s as defined above, sorted -#' by decreasing precursor intensity. -#' -#' @examples \dontrun{ -#' loadList("mycompoundlist.csv") -#' # if Atrazine has compound ID 1: -#' msms_atrazine <- findMsMsHR(fileName = "Atrazine_0001_pos.mzML", cpdID = 1, mode = "pH") -#' # Or alternatively: -#' msRaw <- openMSfile("Atrazine_0001_pos.mzML") -#' msms_atrazine <- findMsMsHR(msRaw=msRaw, cpdID = 1, mode = "pH") -#' # Or directly by mass (this will return a list of spectra sets): -#' mz <- findMz(1)$mzCenter -#' msms_atrazine_all <- findMsMsHR.mass(msRaw, mz, 1, ppm(msRaw, 10, p=TRUE)) -#' msms_atrazine <- msms_atrazine_all[[1]] -#' } -#' @author Michael A. Stravs, Eawag -#' @seealso findEIC -#' @export -findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE, - ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine, - mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse, - fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, - rtMargin = getOption("RMassBank")$rtMargin, - deprofile = getOption("RMassBank")$deprofile, - headerCache = NULL, - peaksCache = NULL, - retrieval="standard") -{ - - # access data directly for finding the MS/MS data. This is done using - # mzR. - if(!is.null(fileName) & !is.null(msRaw)) - stop("Both MS raw data and MS filename given. Only one can be handled at the same time.") - if(!is.null(fileName)) - msRaw <- openMSfile(fileName) - - mzLimits <- findMz(cpdID, mode, retrieval=retrieval) - mz <- mzLimits$mzCenter - limit.fine <- ppm(mz, ppmFine, p=TRUE) - if(!useRtLimit) - rtLimits <- NA - else - { - dbRt <- findRt(cpdID) - rtLimits <- c(dbRt$RT - rtMargin, dbRt$RT + rtMargin) * 60 - } - spectra <- findMsMsHR.mass(msRaw, mz, mzCoarse, limit.fine, rtLimits, confirmMode + 1,headerCache - ,fillPrecursorScan, deprofile, peaksCache, cpdID) - # check whether a) spectrum was found and b) enough spectra were found - if(length(spectra) < (confirmMode + 1)) - sp <- new("RmbSpectraSet", found=FALSE) - else - sp <- spectra[[confirmMode + 1]] - - #sp@mz <- mzLimits - sp@id <- as.character(as.integer(cpdID)) - sp@name <- findName(cpdID) - ENV <- environment() - if(retrieval == "unknown"){ - sp@formula <- "" - } else{ - sp@formula <- findFormula(cpdID, retrieval=retrieval) - } - sp@mode <- mode - - # If we had to open the file, we have to close it again - if(!is.null(fileName)) - mzR::close(msRaw) - - return(sp) -} - -#' @describeIn findMsMsHR A submethod of find MsMsHR that retrieves basic spectrum data -#' @export -findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA, - headerCache = NULL, fillPrecursorScan = FALSE, - deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL, cpdID = NA) -{ - eic <- findEIC(msRaw, mz, limit.fine, rtLimits, headerCache=headerCache, - peaksCache=peaksCache) - # if(!is.na(rtLimits)) - # { - # eic <- subset(eic, rt >= rtLimits[[1]] & rt <= rtLimits[[2]]) - # } - if(!is.null(headerCache)) - headerData <- headerCache - else - headerData <- as.data.frame(header(msRaw)) - - - ###If no precursor scan number, fill the number - if(length(unique(headerData$precursorScanNum)) == 1){ - fillPrecursorScan <- TRUE - } - - if(fillPrecursorScan == TRUE) - { - # reset the precursor scan number. first set to NA, then - # carry forward the precursor scan number from the last parent scan - headerData$precursorScanNum <- NA - headerData[which(headerData$msLevel == 1),"precursorScanNum"] <- - headerData[which(headerData$msLevel == 1),"acquisitionNum"] - headerData[,"precursorScanNum"] <- .locf(headerData[,"precursorScanNum"]) - # Clear the actual MS1 precursor scan number again - headerData[which(headerData$msLevel == 1),"precursorScanNum"] <- 0 - } - - # Find MS2 spectra with precursors which are in the allowed - # scan filter (coarse limit) range - findValidPrecursors <- headerData[ - (headerData$precursorMZ > mz - limit.coarse) & - (headerData$precursorMZ < mz + limit.coarse),] - # Find the precursors for the found spectra - validPrecursors <- unique(findValidPrecursors$precursorScanNum) - # check whether the precursors are real: must be within fine limits! - # previously even "bad" precursors were taken. e.g. 1-benzylpiperazine - which_OK <- lapply(validPrecursors, function(pscan) - { - pplist <- as.data.frame( - mzR::peaks(msRaw, which(headerData$acquisitionNum == pscan))) - colnames(pplist) <- c("mz","int") - pplist <- pplist[(pplist$mz >= mz -limit.fine) - & (pplist$mz <= mz + limit.fine),] - if(nrow(pplist) > 0) - return(TRUE) - return(FALSE) - }) - validPrecursors <- validPrecursors[which(which_OK==TRUE)] - if(length(validPrecursors) == 0){ - if(!is.na(cpdID)) - warning(paste0("No precursor was detected for compound, ", cpdID, " with m/z ", mz, ". Please check the mass and retention time window.")) - else - warning(paste0("No precursor was detected for m/z ", mz, ". Please check the mass and retention time window.")) - } - # Crop the "EIC" to the valid precursor scans - eic <- eic[eic$scan %in% validPrecursors,] - # Order by intensity, descending - eic <- eic[order(eic$intensity, decreasing=TRUE),] - if(nrow(eic) == 0) - return(list( - new("RmbSpectraSet", - found=FALSE))) - if(!is.na(maxCount)) - { - spectraCount <- min(maxCount, nrow(eic)) - eic <- eic[1:spectraCount,] - } - # Construct all spectra groups in decreasing intensity order - spectra <- lapply(eic$scan, function(masterScan) - { - masterHeader <- headerData[headerData$acquisitionNum == masterScan,] - childHeaders <- headerData[(headerData$precursorScanNum == masterScan) - & (headerData$precursorMZ > mz - limit.coarse) - & (headerData$precursorMZ < mz + limit.coarse) ,] - childScans <- childHeaders$seqNum - - msPeaks <- mzR::peaks(msRaw, masterHeader$seqNum) - # if deprofile option is set: run deprofiling - deprofile.setting <- deprofile - if(!is.na(deprofile.setting)) - msPeaks <- deprofile.scan( - msPeaks, method = deprofile.setting, noise = NA, colnames = FALSE - ) - colnames(msPeaks) <- c("mz","int") - - msmsSpecs <- apply(childHeaders, 1, function(line) - { - pks <- mzR::peaks(msRaw, line["seqNum"]) - - if(!is.na(deprofile.setting)) - { - pks <- deprofile.scan( - pks, method = deprofile.setting, noise = NA, colnames = FALSE - ) - } - - new("RmbSpectrum2", - mz = pks[,1], - intensity = pks[,2], - precScanNum = as.integer(line["precursorScanNum"]), - precursorMz = line["precursorMZ"], - precursorIntensity = line["precursorIntensity"], - precursorCharge = as.integer(line["precursorCharge"]), - collisionEnergy = line["collisionEnergy"], - tic = line["totIonCurrent"], - peaksCount = line["peaksCount"], - rt = line["retentionTime"], - acquisitionNum = as.integer(line["seqNum"]), - centroided = TRUE - ) - }) - msmsSpecs <- as(do.call(c, msmsSpecs), "SimpleList") - - - - # build the new objects - masterSpec <- new("Spectrum1", - mz = msPeaks[,"mz"], - intensity = msPeaks[,"int"], - polarity = as.integer(masterHeader$polarity), - peaksCount = as.integer(masterHeader$peaksCount), - rt = masterHeader$retentionTime, - acquisitionNum = as.integer(masterHeader$seqNum), - tic = masterHeader$totIonCurrent, - centroided = TRUE - ) - - spectraSet <- new("RmbSpectraSet", - parent = masterSpec, - children = msmsSpecs, - found = TRUE, - #complete = NA, - #empty = NA, - #formula = character(), - mz = mz - #name = character(), - #annotations = list() - ) - return(spectraSet) - }) - names(spectra) <- eic$acquisitionNum - return(spectra) -} - - -#' Discontinued: find MS/MS spectrum from open raw file -#' -#' This interface has been discontinued. \code{\link{findMsMsHR}} now supports the same parameters (use named -#' parameters). -#' -#' @param msRaw x -#' @param cpdID x -#' @param mode x -#' @param confirmMode x -#' @param useRtLimit x -#' @param ppmFine x -#' @param mzCoarse x -#' @param fillPrecursorScan x -#' @param rtMargin x -#' @param deprofile x -#' @param headerCache x -#' @return an error -#' -#' @author stravsmi -#' @export -findMsMsHR.direct <- function(msRaw, cpdID, mode = "pH", confirmMode = 0, useRtLimit = TRUE, - ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine, - mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse, - fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, - rtMargin = getOption("RMassBank")$rtMargin, - deprofile = getOption("RMassBank")$deprofile, - headerCache = NULL) -{ - stop("Support for this interface has been discontinued. Use findMsMsHR with the same parameters instead (use named parameter msRaw)") -} - -#' Read in mz-files using XCMS -#' -#' Picks peaks from mz-files and returns the pseudospectra that CAMERA creates with the help of XCMS -#' -#' @aliases findMsMsHRperxcms.direct findMsMsHRperxcms -#' @param fileName The path to the mz-file that should be read -#' @param cpdID The compoundID(s) of the compound that has been used for the file -#' @param mode The ionization mode that has been used for the spectrum represented by the peaklist -#' @param findPeaksArgs A list of arguments that will be handed to the xcms-method findPeaks via do.call -#' @param plots A parameter that determines whether the spectra should be plotted or not -#' @param MSe A boolean value that determines whether the spectra were recorded using MSe or not -#' @return The spectra generated from XCMS -#' @seealso \code{\link{msmsWorkflow}} \code{\link{toRMB}} -#' @author Erik Mueller -#' @examples \dontrun{ -#' fileList <- list.files(system.file("XCMSinput", package = "RMassBank"), "Glucolesquerellin", full.names=TRUE)[3] -#' loadList(system.file("XCMSinput/compoundList.csv",package="RMassBank")) -#' psp <- findMsMsHRperxcms(fileList,2184) -#' } -#' @export -findMsMsHRperxcms <- function(fileName, cpdID, mode="pH", findPeaksArgs = NULL, plots = FALSE, MSe = FALSE){ - - # Find mz - mzLimits <- findMz(cpdID, mode) - mz <- mzLimits$mzCenter - - - # If there are more files than cpdIDs - if(length(fileName) > 1){ - fspectra <- list() - - for(i in 1:length(fileName)){ - fspectra[[i]] <- findMsMsHRperxcms.direct(fileName[i], cpdID, mode=mode, findPeaksArgs = findPeaksArgs, plots = plots, MSe = MSe) - } - - spectra <- toRMB(unlist(unlist(fspectra, FALSE),FALSE), cpdID, mode) - - } else if(length(cpdID) > 1){ # If there are more cpdIDs than files - - spectra <- findMsMsHRperxcms.direct(fileName, cpdID, mode=mode, findPeaksArgs = findPeaksArgs, plots = plots, MSe = MSe) - - P <- lapply(1:length(spectra), function(i){ - sp <- toRMB(spectra[[i]], cpdID[i], mode) - sp@id <- as.character(as.integer(cpdID[i])) - sp@name <- findName(cpdID[i]) - sp@formula <- findFormula(cpdID[i]) - sp@mode <- mode - return(sp) - }) - return(P) - - } else { # There is a file for every cpdID - spectra <- toRMB(unlist(findMsMsHRperxcms.direct(fileName, cpdID, mode=mode, findPeaksArgs = NULL, plots = FALSE, MSe = FALSE),FALSE)) - } - - sp <- spectra - - #sp@mz <- mzLimits - sp@id <- as.character(as.integer(cpdID)) - sp@name <- findName(cpdID) - sp@formula <- findFormula(cpdID) - sp@mode <- mode - - return(sp) -} - -#' @describeIn findMsMsHRperxcms A submethod of findMsMsHrperxcms that retrieves basic spectrum data -#' @export -findMsMsHRperxcms.direct <- function(fileName, cpdID, mode="pH", findPeaksArgs = NULL, plots = FALSE, MSe = FALSE) { - - requireNamespace("CAMERA",quietly=TRUE) - requireNamespace("xcms",quietly=TRUE) - - ## - ## getRT function - ## - - getRT <- function(xa) { - rt <- sapply(xa@pspectra, function(x) {median(xcms::peaks(xa@xcmsSet)[x, "rt"])}) - } - - ## - ## MSMS - ## - - # Read file - suppressWarnings(xrmsms <- xcms::xcmsRaw(fileName, includeMSn=TRUE)) - - - # If file is not MSe, split by collision energy - if(MSe == FALSE){ - # Also, fake MS1 from the MSn data - suppressWarnings(xrs <- split(xcms::msn2xcmsRaw(xrmsms), f = xrmsms@msnCollisionEnergy)) - } else{ - # Else, MSn data will already be in MS1 - xrs <- list() - xrs[[1]] <- xrmsms - } - - # Fake a simplistic xcmsSet - suppressWarnings(setReplicate <- xcms::xcmsSet(files=fileName, method="MS1")) - xsmsms <- as.list(replicate(length(xrs),setReplicate)) - - mzabs <- 0.1 - - # Definitions - whichmissing <- vector() - metaspec <- list() - - ## - ## Retrieval over all supplied cpdIDs - ## - - for(ID in 1:length(cpdID)){ - - # Find all relevant information for the current cpdID - XCMSspectra <- list() - RT <- findRt(cpdID[ID])$RT * 60 - parentMass <- findMz(cpdID[ID], mode=mode)$mzCenter - - # Is the information in the compound list? - if(is.na(parentMass)){ - stop(paste("There was no matching entry to the supplied cpdID", cpdID[ID] ,"\n Please check the cpdIDs and the compoundlist.")) - } - - # Go over every collision energy of the MS2 - for(i in 1:length(xrs)){ - - suppressWarnings(capture.output(xcms::peaks(xsmsms[[i]]) <- do.call(xcms::findPeaks,c(findPeaksArgs, object = xrs[[i]])))) - - if (nrow(xcms::peaks(xsmsms[[i]])) == 0) { - XCMSspectra[[i]] <- matrix(0,2,7) - next - } else{ - - # Get the peaklist - pl <- xcms::peaks(xsmsms[[i]])[,c("mz", "rt"), drop=FALSE] - - # Find precursor peak within limits - candidates <- which( pl[,"mz", drop=FALSE] < parentMass + mzabs & pl[,"mz", drop=FALSE] > parentMass - mzabs - & pl[,"rt", drop=FALSE] < RT * 1.1 & pl[,"rt", drop=FALSE] > RT * 0.9 ) - - # Annotate and group by FWHM (full width at half maximum) - capture.output(anmsms <- CAMERA::xsAnnotate(xsmsms[[i]])) - capture.output(anmsms <- CAMERA::groupFWHM(anmsms)) - - # If a candidate fulfills the condition, choose the closest and retrieve the index of those pesudospectra - if(length(candidates) > 0){ - closestCandidate <- which.min(abs(RT - pl[candidates, "rt", drop=FALSE])) - pspIndex <- which(sapply(anmsms@pspectra, function(x) {candidates[[i]][closestCandidate] %in% x})) - } else{ - # Else choose the candidate with the closest RT - pspIndex <- which.min(abs(getRT(anmsms) - RT)) - } - - # 2nd best: Spectrum closest to MS1 - # pspIndex <- which.min( abs(getRT(anmsms) - actualRT)) - - # If the plot parameter was supplied, plot it - if((plots == TRUE) && (length(pspIndex) > 0)){ - CAMERA::plotPsSpectrum(anmsms, pspIndex, log=TRUE, mzrange=c(0, findMz(cpdID)[[3]]), maxlabel=10) - } - - # If there is a number of indexes, retrieve the pseudospectra - if(length(pspIndex) != 0){ - XCMSspectra[[i]] <- CAMERA::getpspectra(anmsms, pspIndex) - } else { - # Else note the spectrum as missing - whichmissing <- c(whichmissing,i) - } - } - } - - # If XCMSspectra were found but there are some missing for some collision energies, fill these XCMSspectra - if((length(XCMSspectra) != 0) && length(whichmissing)){ - for(i in whichmissing){ - XCMSspectra[[i]] <- matrix(0,2,7) - } - } - - metaspec[[ID]] <- XCMSspectra - } - - return(metaspec) -} - -################################################################################ -## new -findMsMsHRperMsp <- function(fileName, cpdIDs, mode="pH"){ - # Find mz - #mzLimits <- findMz(cpdIDs, mode) - #mz <- mzLimits$mzCenter - - # If there are more files than cpdIDs - if(length(fileName) > 1){ - fspectra <- list() - - for(i in 1:length(fileName)){ - fspectra[[i]] <- findMsMsHRperMsp.direct(fileName[i], cpdIDs, mode=mode) - } - - spectra <- toRMB(unlist(unlist(fspectra, FALSE),FALSE), cpdIDs, mode) - - } else if(length(cpdIDs) > 1){ # If there are more cpdIDs than files - - spectra <- findMsMsHRperMsp.direct(fileName = fileName, cpdIDs = cpdIDs, mode=mode) - - P <- lapply(1:length(spectra), function(i){ - sp <- toRMB(msmsXCMSspecs = spectra[[i]], cpdID = cpdIDs[i], mode = mode) - sp@id <- as.character(as.integer(cpdIDs[i])) - sp@name <- findName(cpdIDs[i]) - sp@formula <- findFormula(cpdIDs[i]) - sp@mode <- mode - - if(length(sp@children) == 1){ - sp@children[[1]]@rawOK <- rep(x = TRUE, times = sp@children[[1]]@peaksCount) - sp@children[[1]]@good <- rep(x = TRUE, times = sp@children[[1]]@peaksCount) - #sp@children[[1]]@good <- TRUE - } - - return(sp) - }) - return(P) - - } else { # There is a file for every cpdID - spectra <- toRMB(unlist(findMsMsHRperMsp.direct(fileName, cpdIDs, mode=mode),FALSE)) - } - - sp <- spectra - - #sp@mz <- mzLimits - sp@id <- as.character(as.integer(cpdIDs)) - sp@name <- findName(cpdIDs) - sp@formula <- findFormula(cpdIDs) - sp@mode <- mode - - return(sp) -} - -#' @describeIn findMsMsHRperMsp A submethod of findMsMsHrperxcms that retrieves basic spectrum data -#' @export -findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { - - #requireNamespace("CAMERA",quietly=TRUE) - #requireNamespace("xcms",quietly=TRUE) - - ## - ## MSMS - ## - - # Read file - suppressWarnings(xrmsms <- read.msp(file = fileName)) - - ## If file is not MSe, split by collision energy - #if(MSe == FALSE){ - # # Also, fake MS1 from the MSn data - # suppressWarnings(xrs <- split(xcms::msn2xcmsRaw(xrmsms), f = xrmsms@msnCollisionEnergy)) - #} else{ - # # Else, MSn data will already be in MS1 - # xrs <- list() - # xrs[[1]] <- xrmsms - #} - #xrs <- xrmsms - - mzabs <- 0.1 - - # Definitions - whichmissing <- vector() - metaspec <- list() - - precursorTable <- data.frame(stringsAsFactors = FALSE, - mz = as.numeric(unlist(lapply(X = xrmsms, FUN = function(x){ x$PRECURSORMZ }))), - rt = as.numeric(unlist(lapply(X = xrmsms, FUN = function(x){ x$RETENTIONTIME }))) - ) - - ## - ## Retrieval over all supplied cpdIDs - ## - - for(idIdx in seq_along(cpdIDs)){ - - # Find all relevant information for the current cpdID - spectrum <- NULL - RT <- findRt(cpdIDs[[idIdx]])$RT * 60 - parentMass <- findMz(cpdIDs[[idIdx]], mode=mode)$mzCenter - - # Is the information in the compound list? - if(is.na(parentMass)){ - stop(paste("There was no matching entry to the supplied cpdID", cpdIDs[[idIdx]] ,"\n Please check the cpdIDs and the compoundlist.")) - } - - # Go over every collision energy of the MS2 - #for(i in seq_along(xrs)){ - - - - if (nrow(precursorTable) == 0) { - ## no peaks there - #spectrum <- matrix(0,2,7) - next - } else{ - ## at least one peak there - - # Get the peaklist - #pl <- xrs[[i]]$pspectrum - #pl <- data.frame("mz" = pl[, "mz"], "rt" = xrs[[i]]$RETENTIONTIME, stringsAsFactors = F) - - mzMatch <- - precursorTable[,"mz", drop=FALSE] < parentMass + mzabs & - precursorTable[,"mz", drop=FALSE] > parentMass - mzabs - rtMatch <- - precursorTable[,"rt", drop=FALSE] < RT * 1.1 & - precursorTable[,"rt", drop=FALSE] > RT * 0.9 - - if(is.na(RT)) - rtMatch <- TRUE - - # Find precursor peak within limits - candidates <- which( mzMatch & rtMatch ) - - # Annotate and group by FWHM (full width at half maximum) - #capture.output(anmsms <- CAMERA::xsAnnotate(xsmsms[[i]])) - #capture.output(anmsms <- CAMERA::groupFWHM(anmsms)) - - # If a candidate fulfills the condition, choose the closest and retrieve the index of those pesudospectra - if(length(candidates) > 0){ - if(is.na(RT)){ - pspIndex <- candidates[[1]] - } else { - closestCandidate <- which.min(abs(RT - precursorTable[candidates, "rt", drop=FALSE])) - pspIndex <- candidates[[closestCandidate]] - } - } else{ - # Else choose the candidate with the closest RT - pspIndex <- which.min(abs(precursorTable[,"rt"] - RT)) - } - - # 2nd best: Spectrum closest to MS1 - # pspIndex <- which.min( abs(getRT(anmsms) - actualRT)) - - ## If the plot parameter was supplied, plot it - #if((plots == TRUE) && (length(pspIndex) > 0)){ - # CAMERA::plotPsSpectrum(anmsms, pspIndex, log=TRUE, mzrange=c(0, findMz(cpdIDs)[[3]]), maxlabel=10) - #} - - # If there is a number of indexes, retrieve the pseudospectra - if(length(pspIndex) != 0){ - spectrum <- xrmsms[[pspIndex]] - } else { - # Else note the spectrum as missing - whichmissing <- c(whichmissing,idIdx) - #spectrum <- matrix(0,2,7) - } - } - #} - - # If XCMSspectra were found but there are some missing for some collision energies, fill these XCMSspectra - #if((length(XCMSspectra) != 0) && length(whichmissing)){ - # for(i in whichmissing){ - # XCMSspectra[[idIdx]] <- matrix(0,2,7) - # } - #} - - if(is.null(spectrum)){ - metaspec[[idIdx]] <- list(matrix(0,1,7)) - } else { - metaspec[[idIdx]] <- list(data.frame( - stringsAsFactors = F, - "mz" = as.numeric(spectrum$pspectrum[, "mz"]), - "mzmin" = as.numeric(spectrum$pspectrum[, "mz"]), - "mzmax" = as.numeric(spectrum$pspectrum[, "mz"]), - "rt" = as.numeric(spectrum$RETENTIONTIME), - "rtmin" = as.numeric(spectrum$RETENTIONTIME), - "rtmax" = as.numeric(spectrum$RETENTIONTIME), - "into" = as.numeric(spectrum$pspectrum[, "intensity"]) - )) - } - } - - return(metaspec) -} - -## adapted from the Bioconductor package 'metaMS' (method 'read.msp') -read.msp <- function(file){ - get.text.value <- function(x, field, do.err = TRUE) { - woppa <- strsplit(x, field) - woppa.lengths <- sapply(woppa, length) - if (all(woppa.lengths == 2)) { - sapply(woppa, function(y) gsub("^ +", "", y[2])) - } - else { - if (do.err) { - stop(paste("Invalid field", field, "in", x[woppa.lengths != 2])) - } - else { - NULL - } - } - } - read.compound <- function(strs) { - fields.idx <- grep(":", strs) - fields <- sapply(strsplit(strs[fields.idx], ":"), "[[", 1) - pk.idx <- which(fields == "Num Peaks") - if (length(pk.idx) == 0) - stop("No spectrum found") - cmpnd <- lapply(fields.idx[-pk.idx], function(x) get.text.value(strs[x], paste(fields[x], ":", sep = ""))) - names(cmpnd) <- fields[-pk.idx] - nlines <- length(strs) - npeaks <- as.numeric(get.text.value(strs[pk.idx], "Num Peaks:")) - peaks.idx <- (pk.idx + 1):nlines - pks <- gsub("^ +", "", unlist(strsplit(strs[peaks.idx], ";"))) - pks <- pks[pks != ""] - if (length(pks) != npeaks) - stop("Not the right number of peaks in compound", cmpnd$Name) - pklst <- strsplit(x = pks, split = "\t| ") - pklst <- lapply(pklst, function(x) x[x != ""]) - cmz <- as.numeric(sapply(pklst, "[[", 1)) - cintens <- as.numeric(sapply(pklst, "[[", 2)) - finaltab <- matrix(c(cmz, cintens), ncol = 2) - if (any(table(cmz) > 1)) { - warning("Duplicate mass in compound ", cmpnd$Name, " (CAS ", cmpnd$CAS, ")... summing up intensities") - finaltab <- aggregate(finaltab[, 2], by = list(finaltab[, 1]), FUN = sum) - } - colnames(finaltab) <- c("mz", "intensity") - c(cmpnd, list(pspectrum = finaltab)) - } - huhn <- readLines(con = file) - starts <- which(regexpr("(Name:)|(NAME:) ", huhn) == 1) - ends <- c(starts[-1] - 1, length(huhn)) - lapply(1:length(starts), function(i){ - read.compound(huhn[starts[[i]]:ends[[i]]]) - }) -} -## new -################################################################################ - -# Finds the EIC for a mass trace with a window of x ppm. -# (For ppm = 10, this is +5 / -5 ppm from the non-recalibrated mz.) -#' Extract EICs -#' -#' Extract EICs from raw data for a determined mass window. -#' -#' @param msRaw The mzR file handle -#' @param mz The mass or mass range to extract the EIC for: either a single mass -#' (with the range specified by \code{limit} below) or a mass range -#' in the form of \code{c(min, max)}. -#' @param limit If a single mass was given for \code{mz}: the mass window to extract. -#' A limit of 0.001 means that the EIC will be returned for \code{[mz - 0.001, mz + 0.001]}. -#' @param rtLimit If given, the retention time limits in form \code{c(rtmin, rtmax)} in seconds. -#' @param headerCache If present, the complete \code{mzR::header(msRaw)}. Passing -#' this value is useful if spectra for multiple compounds should be -#' extracted from the same mzML file, since it avoids getting the data -#' freshly from \code{msRaw} for every compound. -#' @param peaksCache If present, the complete output of \code{mzR::peaks(msRaw)}. This speeds up the lookup -#' if multiple compounds should be searched in the same file. -#' @param floatingRecalibration -#' A fitting function that \code{predict()}s a mass shift based on the retention time. Can be used -#' if a lockmass calibration is known (however you have to build the calibration yourself.) -#' @return A \code{[rt, intensity, scan]} matrix (\code{scan} being the scan number.) -#' @author Michael A. Stravs, Eawag -#' @seealso findMsMsHR -#' @export -findEIC <- function(msRaw, mz, limit = NULL, rtLimit = NA, headerCache = NULL, floatingRecalibration = NULL, - peaksCache = NULL) -{ - # calculate mz upper and lower limits for "integration" - if(all(c("mzMin", "mzMax") %in% names(mz))) - mzlimits <- c(mz$mzMin, mz$mzMax) - else - mzlimits <- c(mz - limit, mz + limit) - # Find peaklists for all MS1 scans - if(!is.null(headerCache)) - headerData <- as.data.frame(headerCache) - else - headerData <- as.data.frame(header(msRaw)) - # Add row numbering because I'm not sure if seqNum or acquisitionNum correspond to anything really - if(nrow(headerData) > 0) - headerData$rowNum <- 1:nrow(headerData) - else - headerData$rowNum <- integer(0) - - # If RT limit is already given, retrieve only candidates in the first place, - # since this makes everything much faster. - if(all(!is.na(rtLimit))) - headerMS1 <- headerData[ - (headerData$msLevel == 1) & (headerData$retentionTime >= rtLimit[[1]]) - & (headerData$retentionTime <= rtLimit[[2]]) - ,] - else - headerMS1 <- headerData[headerData$msLevel == 1,] - if(is.null(peaksCache)) - pks <- mzR::peaks(msRaw, headerMS1$seqNum) - else - pks <- peaksCache[headerMS1$rowNum] - - # Sum intensities in the given mass window for each scan - if(is.null(floatingRecalibration)) - { - headerMS1$mzMin <- mzlimits[[1]] - headerMS1$mzMax <- mzlimits[[2]] - } - else - { - headerMS1$mzMin <- mzlimits[[1]] + predict(floatingRecalibration, headerMS1$retentionTime) - headerMS1$mzMax <- mzlimits[[2]] + predict(floatingRecalibration, headerMS1$retentionTime) - } - intensity <- unlist(lapply(1:nrow(headerMS1), function(row){ - peaktable <- pks[[row]] - sum(peaktable[ - which((peaktable[,1] >= headerMS1[row,"mzMin"]) & (peaktable[,1] <= headerMS1[row,"mzMax"])),2 - ]) - - })) - return(data.frame(rt = headerMS1$retentionTime, intensity=intensity, scan=headerMS1$acquisitionNum)) -} - - -#' Generate peaks cache -#' -#' Generates a peak cache table for use with \code{\link{findMsMsHR}} functions. -#' -#' @param msRaw the input raw datafile (opened) -#' @param headerCache the cached header, or subset thereof for which peaks should be extracted. Peak extraction goes -#' by \code{seqNum}. -#' @return A list of dataframes as from \code{mzR::peaks}. -#' -#' @author stravsmi -#' @export -makePeaksCache <- function(msRaw, headerCache) -{ - mzR::peaks(msRaw, headerCache$seqNum) -} - -#' Conversion of XCMS-pseudospectra into RMassBank-spectra -#' -#' Converts a pseudospectrum extracted from XCMS using CAMERA into the msmsWorkspace(at)spectrum-format that RMassBank uses -#' -#' @usage toRMB(msmsXCMSspecs, cpdID, mode, MS1spec) -#' @param msmsXCMSspecs The compoundID of the compound that has been used for the peaklist -#' @param cpdID The compound ID of the substance of the given spectrum -#' @param mode The ionization mode that has been used for the spectrum -#' @param MS1spec The MS1-spectrum from XCMS, which can be optionally supplied -#' @return One list element of the (at)specs-entry from an msmsWorkspace -#' @seealso \code{\link{msmsWorkspace-class}} -#' @author Erik Mueller -#' @examples \dontrun{ -#' XCMSpspectra <- findmsmsHRperxcms.direct("Glucolesquerellin_2184_1.mzdata", 2184) -#' wspecs <- toRMB(XCMSpspectra) -#' } -#' @export -toRMB <- function(msmsXCMSspecs = NA, cpdID = NA, mode="pH", MS1spec = NA){ - - - ##Basic parameters - mz <- findMz(cpdID,mode=mode)$mzCenter - id <- cpdID - formula <- findFormula(cpdID) - - if(length(msmsXCMSspecs) == 0){ - return(new("RmbSpectraSet",found=FALSE)) - } - - foundOK <- !any(sapply(msmsXCMSspecs, function(x) all(x == 0))) - - - if(!foundOK){ - return(new("RmbSpectraSet",found=FALSE)) - } - - if(suppressWarnings(is.na(msmsXCMSspecs)[1])){ - stop("You need a readable spectrum!") - } - - if(is.na(cpdID)){ - stop("Please supply the compoundID!") - } - - mockAcqnum <- 1 - mockenv <- environment() - - msmsSpecs <- lapply(msmsXCMSspecs, function(spec){ - ## Mock acquisition num - mockenv$mockAcqnum <- mockenv$mockAcqnum + 1 - - ## Find peak table - pks <- matrix(nrow = length(spec[,1]), ncol = 2) - colnames(pks) <- c("mz","int") - pks[,1] <- spec[,1] - pks[,2] <- spec[,7] - - ## Deprofiling not necessary for XCMS - - ## New spectrum object - return(new("RmbSpectrum2", - mz = pks[,"mz"], - intensity = pks[,"int"], - precScanNum = as.integer(1), - precursorMz = findMz(cpdID)$mzCenter, - precursorIntensity = 0, - precursorCharge = as.integer(1), - collisionEnergy = 0, - tic = 0, - peaksCount = nrow(spec), - rt = median(spec[,4]), - acquisitionNum = as.integer(mockenv$mockAcqnum), - centroided = TRUE - )) - }) - - msmsSpecs <- as(do.call(c, msmsSpecs), "SimpleList") - - ##Build the new objects - masterSpec <- new("Spectrum1", - mz = findMz(cpdID,mode=mode)$mzCenter, - intensity = 100, - polarity = as.integer(0), - peaksCount = as.integer(1), - rt = msmsSpecs[[1]]@rt, - acquisitionNum = as.integer(1), - tic = 0, - centroided = TRUE - ) - - spectraSet <- new("RmbSpectraSet", - parent = masterSpec, - children = msmsSpecs, - found = TRUE, - #complete = NA, - #empty = NA, - #formula = character(), - mz = mz - #name = character(), - #annotations = list() - ) - - return(spectraSet) -} - -#' Addition of manual peaklists -#' -#' Adds a manual peaklist in matrix-format -#' -#' @usage addPeaksManually(w, cpdID, handSpec, mode) -#' @param w The msmsWorkspace that the peaklist should be added to. -#' @param cpdID The compoundID of the compound that has been used for the peaklist -#' @param handSpec A peaklist with 2 columns, one with "mz", one with "int" -#' @param mode The ionization mode that has been used for the spectrum represented by the peaklist -#' @return The \code{msmsWorkspace} with the additional peaklist added to the right spectrum -#' @seealso \code{\link{msmsWorkflow}} -#' @author Erik Mueller -#' @examples \dontrun{ -#' handSpec <- cbind(mz=c(274.986685367956, 259.012401087427, 95.9493025990907, 96.9573002472772), -#' int=c(357,761, 2821, 3446)) -#' addPeaksManually(w, cpdID, handSpec) -#' } -#' @export -addPeaksManually <- function(w, cpdID = NA, handSpec, mode = "pH"){ - - - if(is.na(cpdID)){ - stop("Please supply the compoundID!") - } - - # For the case that the cpdID turns up for the first time - # a new spectrumset needs to be created - if(!(cpdID %in% sapply(w@spectra,function(s) s@id))){ - - # Create fake MS1 spectrum - masterSpec <- new("Spectrum1", - mz = findMz(cpdID,mode=mode)$mzCenter, - intensity = 100, - polarity = as.integer(0), - peaksCount = as.integer(1), - rt = findRt(cpdID)$RT, - acquisitionNum = as.integer(1), - tic = 0, - centroided = TRUE - ) - - # Create fake spectrumset - spectraSet <- new("RmbSpectraSet", - parent = masterSpec, - found = TRUE, - #complete = NA, - #empty = NA, - id = as.character(as.integer(cpdID)), - formula = findFormula(cpdID), - mz = findMz(cpdID,mode=mode)$mzCenter, - name = findName(cpdID), - mode = mode - #annotations = list() - ) - - w@spectra[[length(w@spectra) + 1]] <- spectraSet - } - - specIndex <- which(cpdID == sapply(w@spectra, function(s) s@id)) - - # New spectrum object - w@spectra[[specIndex]]@children[[length(w@spectra[[specIndex]]@children) + 1]] <- new("RmbSpectrum2", - mz = handSpec[,"mz"], - intensity = handSpec[,"int"], - precScanNum = as.integer(1), - precursorMz = findMz(cpdID)$mzCenter, - precursorIntensity = 0, - precursorCharge = as.integer(1), - collisionEnergy = 0, - tic = 0, - peaksCount = nrow(handSpec), - rt = findRt(cpdID)$RT, - acquisitionNum = as.integer(length(w@spectra[[specIndex]]@children) + 2), - centroided = TRUE) - return(w) -} - - -createSpecsFromPeaklists <- function(w, cpdIDs, filenames, mode="pH"){ - for(j in 1:length(filenames)){ - w <- addPeaksManually(w,cpdIDs[j],as.matrix(read.csv(filenames[j]), header=TRUE),mode) - } - - return(w) -} - - -#' MassBank-record Addition -#' -#' Adds the peaklist of a MassBank-Record to the specs of an msmsWorkspace -#' -#' @aliases addMB -#' @usage addMB(w, cpdID, fileName, mode) -#' @param w The msmsWorkspace that the peaklist should be added to. -#' @param cpdID The compoundID of the compound that has been used for the record -#' @param fileName The path to the record -#' @param mode The ionization mode that has been used to create the record -#' @return The \code{msmsWorkspace} with the additional peaklist from the record -#' @seealso \code{\link{addPeaksManually}} -#' @author Erik Mueller -#' @examples \dontrun{ -#' addMB("filepath_to_records/RC00001.txt") -#' } -#' @export -addMB <- function(w, cpdID, fileName, mode){ - mb <- parseMassBank(fileName) - peaklist <- list() - peaklist[[1]] <- mb@compiled_ok[[1]][["PK$PEAK"]][,1:2] - w <- addPeaksManually(w, cpdID, peaklist[[1]], mode) - return(w) -} - +## For generating the NAMESPACE +#' @import mzR + +#' @import Rcpp +## Was not in manually written NAMESPACE ? +#' @import RCurl +#' @import XML +#' @import methods +#' @import mzR +#' @import rcdk +#' @import rjson +#' @import yaml +#' @import digest +NULL # This is required so that roxygen knows where the first manpage starts + + +# # importClassesFrom mzR ## Causes error +# # importMethodsFrom mzR + +#' Extract MS/MS spectra for specified precursor +#' +#' Extracts MS/MS spectra from LC-MS raw data for a specified precursor, specified +#' either via the RMassBank compound list (see \code{\link{loadList}}) or via a mass. +#' +#' Different versions of the function get the data from different sources. Note that +#' findMsMsHR and findMsMsHR.direct differ mainly in that findMsMsHR opens a file +#' whereas findMsMs.direct uses an open file handle - both are intended to be used +#' in a full process which involves compound lists etc. In contrast, findMsMsHR.mass +#' is a low-level function which uses the mass directly for lookup and is intended for +#' use as a standalone function in unrelated applications. +#' +#' @note \code{findMsMs.direct} is deactivated +#' +## # @usage findMsMsHR(fileName, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE, +## # ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine, +## # mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse, +## # fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, +## # rtMargin = getOption("RMassBank")$rtMargin, +## # deprofile = getOption("RMassBank")$deprofile, +## # headerCache = NULL, +## # peaksCache = NULL) +## # +## # findMsMsHR.mass(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA, +## # headerCache = NULL, fillPrecursorScan = FALSE, +## # deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL, cpdID = NA) +#' +#' +#' @aliases findMsMsHR.mass findMsMsHR +#' @param fileName The file to open and search the MS2 spectrum in. +#' @param msRaw The opened raw file (mzR file handle) to search the MS2 spectrum in. Specify either this +#' or \code{fileName}. +#' @param cpdID The compound ID in the compound list (see \code{\link{loadList}}) +#' to use for formula lookup. Note: In \\code{findMsMsHR.mass}, this is entirely optional and +#' used only in case a warning must be displayed; compound lookup is done via mass only. +#' @param mz The mass to use for spectrum search. +#' @param ppmFine The limit in ppm to use for fine limit (see below) calculation. +#' @param mzCoarse The coarse limit to use for locating potential MS2 scans: +#' this tolerance is used when finding scans with a suitable precursor +#' ion value. +#' @param limit.fine The fine limit to use for locating MS2 scans: this tolerance +#' is used when locating an appropriate analyte peak in the MS1 precursor +#' spectrum. +#' @param limit.coarse Parameter in \code{findMsMsHR.mass} corresponding to \code{mzCoarse}. +#' (The parameters are distinct to clearly conceptually distinguish findMsMsHR.mass +#' (a standalone useful function) from the cpdID based functions (workflow functions).) +#' @param mode The processing mode (determines which ion/adduct is searched): +#' \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). +#' @param confirmMode Whether to use the highest-intensity precursor (=0), second- +#' highest (=1), third-highest (=2)... +#' @param useRtLimit Whether to respect retention time limits from the compound list. +#' @param rtLimits \code{c(min, max)}: Minimum and maximum retention time to use +#' when locating the MS2 scans. +#' @param headerCache If present, the complete \code{mzR::header(msRaw)}. Passing +#' this value is useful if spectra for multiple compounds should be +#' extracted from the same mzML file, since it avoids getting the data +#' freshly from \code{msRaw} for every compound. +#' @param peaksCache If present, the complete output of \code{mzR::peaks(msRaw)}. This speeds up the lookup +#' if multiple compounds should be searched in the same file. +#' @param maxCount The maximal number of spectra groups to return. One spectra group +#' consists of all data-dependent scans from the same precursor whose precursor +#' mass matches the specified search mass. +#' @param fillPrecursorScan If \code{TRUE}, the precursor scan will be filled from MS1 data. +#' To be used for data where the precursor scan is not stored in the raw data. +#' @param rtMargin The retention time tolerance to use. +#' @param deprofile Whether deprofiling should take place, and what method should be +#' used (cf. \code{\link{deprofile}}) +#' @param retrieval A value that determines whether the files should be handled either as "standard", +#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" +#' if the only know thing is the m/z +#' @return An \code{RmbSpectraSet} (for \code{findMsMsHR}). Contains parent MS1 spectrum (\code{@@parent}), a block of dependent MS2 spectra ((\code{@@children}) +#' and some metadata (\code{id},\code{mz},\code{name},\code{mode} in which the spectrum was acquired. +#' +#' For \code{findMsMsHR.mass}: a list of \code{RmbSpectraSet}s as defined above, sorted +#' by decreasing precursor intensity. +#' +#' @examples \dontrun{ +#' loadList("mycompoundlist.csv") +#' # if Atrazine has compound ID 1: +#' msms_atrazine <- findMsMsHR(fileName = "Atrazine_0001_pos.mzML", cpdID = 1, mode = "pH") +#' # Or alternatively: +#' msRaw <- openMSfile("Atrazine_0001_pos.mzML") +#' msms_atrazine <- findMsMsHR(msRaw=msRaw, cpdID = 1, mode = "pH") +#' # Or directly by mass (this will return a list of spectra sets): +#' mz <- findMz(1)$mzCenter +#' msms_atrazine_all <- findMsMsHR.mass(msRaw, mz, 1, ppm(msRaw, 10, p=TRUE)) +#' msms_atrazine <- msms_atrazine_all[[1]] +#' } +#' @author Michael A. Stravs, Eawag +#' @seealso findEIC +#' @export +findMsMsHR <- function(fileName = NULL, msRaw = NULL, cpdID, mode="pH",confirmMode =0, useRtLimit = TRUE, + ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine, + mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse, + fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, + rtMargin = getOption("RMassBank")$rtMargin, + deprofile = getOption("RMassBank")$deprofile, + headerCache = NULL, + peaksCache = NULL, + retrieval="standard") +{ + + # access data directly for finding the MS/MS data. This is done using + # mzR. + if(!is.null(fileName) & !is.null(msRaw)) + stop("Both MS raw data and MS filename given. Only one can be handled at the same time.") + if(!is.null(fileName)) + msRaw <- openMSfile(fileName) + + mzLimits <- findMz(cpdID, mode, retrieval=retrieval) + mz <- mzLimits$mzCenter + limit.fine <- ppm(mz, ppmFine, p=TRUE) + if(!useRtLimit) + rtLimits <- NA + else + { + dbRt <- findRt(cpdID) + rtLimits <- c(dbRt$RT - rtMargin, dbRt$RT + rtMargin) * 60 + } + spectra <- findMsMsHR.mass(msRaw, mz, mzCoarse, limit.fine, rtLimits, confirmMode + 1,headerCache + ,fillPrecursorScan, deprofile, peaksCache, cpdID) + # check whether a) spectrum was found and b) enough spectra were found + if(length(spectra) < (confirmMode + 1)) + sp <- new("RmbSpectraSet", found=FALSE) + else + sp <- spectra[[confirmMode + 1]] + + #sp@mz <- mzLimits + sp@id <- as.character(as.integer(cpdID)) + sp@name <- findName(cpdID) + ENV <- environment() + if(retrieval == "unknown"){ + sp@formula <- "" + } else{ + sp@formula <- findFormula(cpdID, retrieval=retrieval) + } + sp@mode <- mode + + # If we had to open the file, we have to close it again + if(!is.null(fileName)) + mzR::close(msRaw) + + return(sp) +} + +#' @describeIn findMsMsHR A submethod of find MsMsHR that retrieves basic spectrum data +#' @export +findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, maxCount = NA, + headerCache = NULL, fillPrecursorScan = FALSE, + deprofile = getOption("RMassBank")$deprofile, peaksCache = NULL, cpdID = NA) +{ + eic <- findEIC(msRaw, mz, limit.fine, rtLimits, headerCache=headerCache, + peaksCache=peaksCache) + # if(!is.na(rtLimits)) + # { + # eic <- subset(eic, rt >= rtLimits[[1]] & rt <= rtLimits[[2]]) + # } + if(!is.null(headerCache)) + headerData <- headerCache + else + headerData <- as.data.frame(header(msRaw)) + + + ###If no precursor scan number, fill the number + if(length(unique(headerData$precursorScanNum)) == 1){ + fillPrecursorScan <- TRUE + } + + if(fillPrecursorScan == TRUE) + { + # reset the precursor scan number. first set to NA, then + # carry forward the precursor scan number from the last parent scan + headerData$precursorScanNum <- NA + headerData[which(headerData$msLevel == 1),"precursorScanNum"] <- + headerData[which(headerData$msLevel == 1),"acquisitionNum"] + headerData[,"precursorScanNum"] <- .locf(headerData[,"precursorScanNum"]) + # Clear the actual MS1 precursor scan number again + headerData[which(headerData$msLevel == 1),"precursorScanNum"] <- 0 + } + + # Find MS2 spectra with precursors which are in the allowed + # scan filter (coarse limit) range + findValidPrecursors <- headerData[ + (headerData$precursorMZ > mz - limit.coarse) & + (headerData$precursorMZ < mz + limit.coarse),] + # Find the precursors for the found spectra + validPrecursors <- unique(findValidPrecursors$precursorScanNum) + # check whether the precursors are real: must be within fine limits! + # previously even "bad" precursors were taken. e.g. 1-benzylpiperazine + which_OK <- lapply(validPrecursors, function(pscan) + { + pplist <- as.data.frame( + mzR::peaks(msRaw, which(headerData$acquisitionNum == pscan))) + colnames(pplist) <- c("mz","int") + pplist <- pplist[(pplist$mz >= mz -limit.fine) + & (pplist$mz <= mz + limit.fine),] + if(nrow(pplist) > 0) + return(TRUE) + return(FALSE) + }) + validPrecursors <- validPrecursors[which(which_OK==TRUE)] + if(length(validPrecursors) == 0){ + if(!is.na(cpdID)) + warning(paste0("No precursor was detected for compound, ", cpdID, " with m/z ", mz, ". Please check the mass and retention time window.")) + else + warning(paste0("No precursor was detected for m/z ", mz, ". Please check the mass and retention time window.")) + } + # Crop the "EIC" to the valid precursor scans + eic <- eic[eic$scan %in% validPrecursors,] + # Order by intensity, descending + eic <- eic[order(eic$intensity, decreasing=TRUE),] + if(nrow(eic) == 0) + return(list( + new("RmbSpectraSet", + found=FALSE))) + if(!is.na(maxCount)) + { + spectraCount <- min(maxCount, nrow(eic)) + eic <- eic[1:spectraCount,] + } + # Construct all spectra groups in decreasing intensity order + spectra <- lapply(eic$scan, function(masterScan) + { + masterHeader <- headerData[headerData$acquisitionNum == masterScan,] + childHeaders <- headerData[(headerData$precursorScanNum == masterScan) + & (headerData$precursorMZ > mz - limit.coarse) + & (headerData$precursorMZ < mz + limit.coarse) ,] + childScans <- childHeaders$seqNum + + msPeaks <- mzR::peaks(msRaw, masterHeader$seqNum) + # if deprofile option is set: run deprofiling + deprofile.setting <- deprofile + if(!is.na(deprofile.setting)) + msPeaks <- deprofile.scan( + msPeaks, method = deprofile.setting, noise = NA, colnames = FALSE + ) + colnames(msPeaks) <- c("mz","int") + + msmsSpecs <- apply(childHeaders, 1, function(line) + { + pks <- mzR::peaks(msRaw, line["seqNum"]) + + if(!is.na(deprofile.setting)) + { + pks <- deprofile.scan( + pks, method = deprofile.setting, noise = NA, colnames = FALSE + ) + } + + new("RmbSpectrum2", + mz = pks[,1], + intensity = pks[,2], + precScanNum = as.integer(line["precursorScanNum"]), + precursorMz = line["precursorMZ"], + precursorIntensity = line["precursorIntensity"], + precursorCharge = as.integer(line["precursorCharge"]), + collisionEnergy = line["collisionEnergy"], + tic = line["totIonCurrent"], + peaksCount = line["peaksCount"], + rt = line["retentionTime"], + acquisitionNum = as.integer(line["seqNum"]), + centroided = TRUE + ) + }) + msmsSpecs <- as(do.call(c, msmsSpecs), "SimpleList") + + + + # build the new objects + masterSpec <- new("Spectrum1", + mz = msPeaks[,"mz"], + intensity = msPeaks[,"int"], + polarity = as.integer(masterHeader$polarity), + peaksCount = as.integer(masterHeader$peaksCount), + rt = masterHeader$retentionTime, + acquisitionNum = as.integer(masterHeader$seqNum), + tic = masterHeader$totIonCurrent, + centroided = TRUE + ) + + spectraSet <- new("RmbSpectraSet", + parent = masterSpec, + children = msmsSpecs, + found = TRUE, + #complete = NA, + #empty = NA, + #formula = character(), + mz = mz + #name = character(), + #annotations = list() + ) + return(spectraSet) + }) + names(spectra) <- eic$acquisitionNum + return(spectra) +} + + +#' Discontinued: find MS/MS spectrum from open raw file +#' +#' This interface has been discontinued. \code{\link{findMsMsHR}} now supports the same parameters (use named +#' parameters). +#' +#' @param msRaw x +#' @param cpdID x +#' @param mode x +#' @param confirmMode x +#' @param useRtLimit x +#' @param ppmFine x +#' @param mzCoarse x +#' @param fillPrecursorScan x +#' @param rtMargin x +#' @param deprofile x +#' @param headerCache x +#' @return an error +#' +#' @author stravsmi +#' @export +findMsMsHR.direct <- function(msRaw, cpdID, mode = "pH", confirmMode = 0, useRtLimit = TRUE, + ppmFine = getOption("RMassBank")$findMsMsRawSettings$ppmFine, + mzCoarse = getOption("RMassBank")$findMsMsRawSettings$mzCoarse, + fillPrecursorScan = getOption("RMassBank")$findMsMsRawSettings$fillPrecursorScan, + rtMargin = getOption("RMassBank")$rtMargin, + deprofile = getOption("RMassBank")$deprofile, + headerCache = NULL) +{ + stop("Support for this interface has been discontinued. Use findMsMsHR with the same parameters instead (use named parameter msRaw)") +} + +#' Read in mz-files using XCMS +#' +#' Picks peaks from mz-files and returns the pseudospectra that CAMERA creates with the help of XCMS +#' +#' @aliases findMsMsHRperxcms.direct findMsMsHRperxcms +#' @param fileName The path to the mz-file that should be read +#' @param cpdID The compoundID(s) of the compound that has been used for the file +#' @param mode The ionization mode that has been used for the spectrum represented by the peaklist +#' @param findPeaksArgs A list of arguments that will be handed to the xcms-method findPeaks via do.call +#' @param plots A parameter that determines whether the spectra should be plotted or not +#' @param MSe A boolean value that determines whether the spectra were recorded using MSe or not +#' @return The spectra generated from XCMS +#' @seealso \code{\link{msmsWorkflow}} \code{\link{toRMB}} +#' @author Erik Mueller +#' @examples \dontrun{ +#' fileList <- list.files(system.file("XCMSinput", package = "RMassBank"), "Glucolesquerellin", full.names=TRUE)[3] +#' loadList(system.file("XCMSinput/compoundList.csv",package="RMassBank")) +#' psp <- findMsMsHRperxcms(fileList,2184) +#' } +#' @export +findMsMsHRperxcms <- function(fileName, cpdID, mode="pH", findPeaksArgs = NULL, plots = FALSE, MSe = FALSE){ + + # Find mz + mzLimits <- findMz(cpdID, mode) + mz <- mzLimits$mzCenter + + + # If there are more files than cpdIDs + if(length(fileName) > 1){ + fspectra <- list() + + for(i in 1:length(fileName)){ + fspectra[[i]] <- findMsMsHRperxcms.direct(fileName[i], cpdID, mode=mode, findPeaksArgs = findPeaksArgs, plots = plots, MSe = MSe) + } + + spectra <- toRMB(unlist(unlist(fspectra, FALSE),FALSE), cpdID, mode) + + } else if(length(cpdID) > 1){ # If there are more cpdIDs than files + + spectra <- findMsMsHRperxcms.direct(fileName, cpdID, mode=mode, findPeaksArgs = findPeaksArgs, plots = plots, MSe = MSe) + + P <- lapply(1:length(spectra), function(i){ + sp <- toRMB(spectra[[i]], cpdID[i], mode) + sp@id <- as.character(as.integer(cpdID[i])) + sp@name <- findName(cpdID[i]) + sp@formula <- findFormula(cpdID[i]) + sp@mode <- mode + return(sp) + }) + return(P) + + } else { # There is a file for every cpdID + spectra <- toRMB(unlist(findMsMsHRperxcms.direct(fileName, cpdID, mode=mode, findPeaksArgs = NULL, plots = FALSE, MSe = FALSE),FALSE)) + } + + sp <- spectra + + #sp@mz <- mzLimits + sp@id <- as.character(as.integer(cpdID)) + sp@name <- findName(cpdID) + sp@formula <- findFormula(cpdID) + sp@mode <- mode + + return(sp) +} + +#' @describeIn findMsMsHRperxcms A submethod of findMsMsHrperxcms that retrieves basic spectrum data +#' @export +findMsMsHRperxcms.direct <- function(fileName, cpdID, mode="pH", findPeaksArgs = NULL, plots = FALSE, MSe = FALSE) { + + requireNamespace("CAMERA",quietly=TRUE) + requireNamespace("xcms",quietly=TRUE) + + ## + ## getRT function + ## + + getRT <- function(xa) { + rt <- sapply(xa@pspectra, function(x) {median(xcms::peaks(xa@xcmsSet)[x, "rt"])}) + } + + ## + ## MSMS + ## + + # Read file + suppressWarnings(xrmsms <- xcms::xcmsRaw(fileName, includeMSn=TRUE)) + + + # If file is not MSe, split by collision energy + if(MSe == FALSE){ + # Also, fake MS1 from the MSn data + suppressWarnings(xrs <- split(xcms::msn2xcmsRaw(xrmsms), f = xrmsms@msnCollisionEnergy)) + } else{ + # Else, MSn data will already be in MS1 + xrs <- list() + xrs[[1]] <- xrmsms + } + + # Fake a simplistic xcmsSet + suppressWarnings(setReplicate <- xcms::xcmsSet(files=fileName, method="MS1")) + xsmsms <- as.list(replicate(length(xrs),setReplicate)) + + mzabs <- 0.1 + + # Definitions + whichmissing <- vector() + metaspec <- list() + + ## + ## Retrieval over all supplied cpdIDs + ## + + for(ID in 1:length(cpdID)){ + + # Find all relevant information for the current cpdID + XCMSspectra <- list() + RT <- findRt(cpdID[ID])$RT * 60 + parentMass <- findMz(cpdID[ID], mode=mode)$mzCenter + + # Is the information in the compound list? + if(is.na(parentMass)){ + stop(paste("There was no matching entry to the supplied cpdID", cpdID[ID] ,"\n Please check the cpdIDs and the compoundlist.")) + } + + # Go over every collision energy of the MS2 + for(i in 1:length(xrs)){ + + suppressWarnings(capture.output(xcms::peaks(xsmsms[[i]]) <- do.call(xcms::findPeaks,c(findPeaksArgs, object = xrs[[i]])))) + + if (nrow(xcms::peaks(xsmsms[[i]])) == 0) { + XCMSspectra[[i]] <- matrix(0,2,7) + next + } else{ + + # Get the peaklist + pl <- xcms::peaks(xsmsms[[i]])[,c("mz", "rt"), drop=FALSE] + + # Find precursor peak within limits + candidates <- which( pl[,"mz", drop=FALSE] < parentMass + mzabs & pl[,"mz", drop=FALSE] > parentMass - mzabs + & pl[,"rt", drop=FALSE] < RT * 1.1 & pl[,"rt", drop=FALSE] > RT * 0.9 ) + + # Annotate and group by FWHM (full width at half maximum) + capture.output(anmsms <- CAMERA::xsAnnotate(xsmsms[[i]])) + capture.output(anmsms <- CAMERA::groupFWHM(anmsms)) + + # If a candidate fulfills the condition, choose the closest and retrieve the index of those pesudospectra + if(length(candidates) > 0){ + closestCandidate <- which.min(abs(RT - pl[candidates, "rt", drop=FALSE])) + pspIndex <- which(sapply(anmsms@pspectra, function(x) {candidates[[i]][closestCandidate] %in% x})) + } else{ + # Else choose the candidate with the closest RT + pspIndex <- which.min(abs(getRT(anmsms) - RT)) + } + + # 2nd best: Spectrum closest to MS1 + # pspIndex <- which.min( abs(getRT(anmsms) - actualRT)) + + # If the plot parameter was supplied, plot it + if((plots == TRUE) && (length(pspIndex) > 0)){ + CAMERA::plotPsSpectrum(anmsms, pspIndex, log=TRUE, mzrange=c(0, findMz(cpdID)[[3]]), maxlabel=10) + } + + # If there is a number of indexes, retrieve the pseudospectra + if(length(pspIndex) != 0){ + XCMSspectra[[i]] <- CAMERA::getpspectra(anmsms, pspIndex) + } else { + # Else note the spectrum as missing + whichmissing <- c(whichmissing,i) + } + } + } + + # If XCMSspectra were found but there are some missing for some collision energies, fill these XCMSspectra + if((length(XCMSspectra) != 0) && length(whichmissing)){ + for(i in whichmissing){ + XCMSspectra[[i]] <- matrix(0,2,7) + } + } + + metaspec[[ID]] <- XCMSspectra + } + + return(metaspec) +} + +################################################################################ +## new +findMsMsHRperMsp <- function(fileName, cpdIDs, mode="pH"){ + # Find mz + #mzLimits <- findMz(cpdIDs, mode) + #mz <- mzLimits$mzCenter + + # If there are more files than cpdIDs + if(length(fileName) > 1){ + fspectra <- list() + + for(i in 1:length(fileName)){ + fspectra[[i]] <- findMsMsHRperMsp.direct(fileName[i], cpdIDs, mode=mode) + } + + spectra <- toRMB(unlist(unlist(fspectra, FALSE),FALSE), cpdIDs, mode) + + } else if(length(cpdIDs) > 1){ # If there are more cpdIDs than files + + spectra <- findMsMsHRperMsp.direct(fileName = fileName, cpdIDs = cpdIDs, mode=mode) + + P <- lapply(1:length(spectra), function(i){ + sp <- toRMB(msmsXCMSspecs = spectra[[i]], cpdID = cpdIDs[i], mode = mode) + sp@id <- as.character(as.integer(cpdIDs[i])) + sp@name <- findName(cpdIDs[i]) + sp@formula <- findFormula(cpdIDs[i]) + sp@mode <- mode + + if(length(sp@children) == 1){ + sp@children[[1]]@rawOK <- rep(x = TRUE, times = sp@children[[1]]@peaksCount) + sp@children[[1]]@good <- rep(x = TRUE, times = sp@children[[1]]@peaksCount) + #sp@children[[1]]@good <- TRUE + } + + return(sp) + }) + return(P) + + } else { # There is a file for every cpdID + spectra <- toRMB(unlist(findMsMsHRperMsp.direct(fileName, cpdIDs, mode=mode),FALSE)) + } + + sp <- spectra + + #sp@mz <- mzLimits + sp@id <- as.character(as.integer(cpdIDs)) + sp@name <- findName(cpdIDs) + sp@formula <- findFormula(cpdIDs) + sp@mode <- mode + + return(sp) +} + +#' @describeIn findMsMsHRperMsp A submethod of findMsMsHrperxcms that retrieves basic spectrum data +#' @export +findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { + + #requireNamespace("CAMERA",quietly=TRUE) + #requireNamespace("xcms",quietly=TRUE) + + ## + ## MSMS + ## + + # Read file + suppressWarnings(xrmsms <- read.msp(file = fileName)) + + ## If file is not MSe, split by collision energy + #if(MSe == FALSE){ + # # Also, fake MS1 from the MSn data + # suppressWarnings(xrs <- split(xcms::msn2xcmsRaw(xrmsms), f = xrmsms@msnCollisionEnergy)) + #} else{ + # # Else, MSn data will already be in MS1 + # xrs <- list() + # xrs[[1]] <- xrmsms + #} + #xrs <- xrmsms + + mzabs <- 0.1 + + # Definitions + whichmissing <- vector() + metaspec <- list() + + precursorTable <- data.frame(stringsAsFactors = FALSE, + mz = as.numeric(unlist(lapply(X = xrmsms, FUN = function(x){ x$PRECURSORMZ }))), + rt = as.numeric(unlist(lapply(X = xrmsms, FUN = function(x){ x$RETENTIONTIME }))) + ) + + ## + ## Retrieval over all supplied cpdIDs + ## + + for(idIdx in seq_along(cpdIDs)){ + + # Find all relevant information for the current cpdID + spectrum <- NULL + RT <- findRt(cpdIDs[[idIdx]])$RT * 60 + parentMass <- findMz(cpdIDs[[idIdx]], mode=mode)$mzCenter + + # Is the information in the compound list? + if(is.na(parentMass)){ + stop(paste("There was no matching entry to the supplied cpdID", cpdIDs[[idIdx]] ,"\n Please check the cpdIDs and the compoundlist.")) + } + + # Go over every collision energy of the MS2 + #for(i in seq_along(xrs)){ + + + + if (nrow(precursorTable) == 0) { + ## no peaks there + #spectrum <- matrix(0,2,7) + next + } else{ + ## at least one peak there + + # Get the peaklist + #pl <- xrs[[i]]$pspectrum + #pl <- data.frame("mz" = pl[, "mz"], "rt" = xrs[[i]]$RETENTIONTIME, stringsAsFactors = F) + + mzMatch <- + precursorTable[,"mz", drop=FALSE] < parentMass + mzabs & + precursorTable[,"mz", drop=FALSE] > parentMass - mzabs + rtMatch <- + precursorTable[,"rt", drop=FALSE] < RT * 1.1 & + precursorTable[,"rt", drop=FALSE] > RT * 0.9 + + if(is.na(RT)) + rtMatch <- TRUE + + # Find precursor peak within limits + candidates <- which( mzMatch & rtMatch ) + + # Annotate and group by FWHM (full width at half maximum) + #capture.output(anmsms <- CAMERA::xsAnnotate(xsmsms[[i]])) + #capture.output(anmsms <- CAMERA::groupFWHM(anmsms)) + + # If a candidate fulfills the condition, choose the closest and retrieve the index of those pesudospectra + if(length(candidates) > 0){ + if(is.na(RT)){ + pspIndex <- candidates[[1]] + } else { + closestCandidate <- which.min(abs(RT - precursorTable[candidates, "rt", drop=FALSE])) + pspIndex <- candidates[[closestCandidate]] + } + } else{ + # Else choose the candidate with the closest RT + pspIndex <- which.min(abs(precursorTable[,"rt"] - RT)) + } + + # 2nd best: Spectrum closest to MS1 + # pspIndex <- which.min( abs(getRT(anmsms) - actualRT)) + + ## If the plot parameter was supplied, plot it + #if((plots == TRUE) && (length(pspIndex) > 0)){ + # CAMERA::plotPsSpectrum(anmsms, pspIndex, log=TRUE, mzrange=c(0, findMz(cpdIDs)[[3]]), maxlabel=10) + #} + + # If there is a number of indexes, retrieve the pseudospectra + if(length(pspIndex) != 0){ + spectrum <- xrmsms[[pspIndex]] + } else { + # Else note the spectrum as missing + whichmissing <- c(whichmissing,idIdx) + #spectrum <- matrix(0,2,7) + } + } + #} + + # If XCMSspectra were found but there are some missing for some collision energies, fill these XCMSspectra + #if((length(XCMSspectra) != 0) && length(whichmissing)){ + # for(i in whichmissing){ + # XCMSspectra[[idIdx]] <- matrix(0,2,7) + # } + #} + + if(is.null(spectrum)){ + metaspec[[idIdx]] <- list(matrix(0,1,7)) + } else { + metaspec[[idIdx]] <- list(data.frame( + stringsAsFactors = F, + "mz" = as.numeric(spectrum$pspectrum[, "mz"]), + "mzmin" = as.numeric(spectrum$pspectrum[, "mz"]), + "mzmax" = as.numeric(spectrum$pspectrum[, "mz"]), + "rt" = as.numeric(spectrum$RETENTIONTIME), + "rtmin" = as.numeric(spectrum$RETENTIONTIME), + "rtmax" = as.numeric(spectrum$RETENTIONTIME), + "into" = as.numeric(spectrum$pspectrum[, "intensity"]) + )) + } + } + + return(metaspec) +} + +## adapted from the Bioconductor package 'metaMS' (method 'read.msp') +read.msp <- function(file){ + get.text.value <- function(x, field, do.err = TRUE) { + woppa <- strsplit(x, field) + woppa.lengths <- sapply(woppa, length) + if (all(woppa.lengths == 2)) { + sapply(woppa, function(y) gsub("^ +", "", y[2])) + } + else { + if (do.err) { + stop(paste("Invalid field", field, "in", x[woppa.lengths != 2])) + } + else { + NULL + } + } + } + read.compound <- function(strs) { + fields.idx <- grep(":", strs) + fields <- sapply(strsplit(strs[fields.idx], ":"), "[[", 1) + pk.idx <- which(fields == "Num Peaks") + if (length(pk.idx) == 0) + stop("No spectrum found") + cmpnd <- lapply(fields.idx[-pk.idx], function(x) get.text.value(strs[x], paste(fields[x], ":", sep = ""))) + names(cmpnd) <- fields[-pk.idx] + nlines <- length(strs) + npeaks <- as.numeric(get.text.value(strs[pk.idx], "Num Peaks:")) + peaks.idx <- (pk.idx + 1):nlines + pks <- gsub("^ +", "", unlist(strsplit(strs[peaks.idx], ";"))) + pks <- pks[pks != ""] + if (length(pks) != npeaks) + stop("Not the right number of peaks in compound", cmpnd$Name) + pklst <- strsplit(x = pks, split = "\t| ") + pklst <- lapply(pklst, function(x) x[x != ""]) + cmz <- as.numeric(sapply(pklst, "[[", 1)) + cintens <- as.numeric(sapply(pklst, "[[", 2)) + finaltab <- matrix(c(cmz, cintens), ncol = 2) + if (any(table(cmz) > 1)) { + warning("Duplicate mass in compound ", cmpnd$Name, " (CAS ", cmpnd$CAS, ")... summing up intensities") + finaltab <- aggregate(finaltab[, 2], by = list(finaltab[, 1]), FUN = sum) + } + colnames(finaltab) <- c("mz", "intensity") + c(cmpnd, list(pspectrum = finaltab)) + } + huhn <- readLines(con = file) + starts <- which(regexpr("(Name:)|(NAME:) ", huhn) == 1) + ends <- c(starts[-1] - 1, length(huhn)) + lapply(1:length(starts), function(i){ + read.compound(huhn[starts[[i]]:ends[[i]]]) + }) +} +## new +################################################################################ + +# Finds the EIC for a mass trace with a window of x ppm. +# (For ppm = 10, this is +5 / -5 ppm from the non-recalibrated mz.) +#' Extract EICs +#' +#' Extract EICs from raw data for a determined mass window. +#' +#' @param msRaw The mzR file handle +#' @param mz The mass or mass range to extract the EIC for: either a single mass +#' (with the range specified by \code{limit} below) or a mass range +#' in the form of \code{c(min, max)}. +#' @param limit If a single mass was given for \code{mz}: the mass window to extract. +#' A limit of 0.001 means that the EIC will be returned for \code{[mz - 0.001, mz + 0.001]}. +#' @param rtLimit If given, the retention time limits in form \code{c(rtmin, rtmax)} in seconds. +#' @param headerCache If present, the complete \code{mzR::header(msRaw)}. Passing +#' this value is useful if spectra for multiple compounds should be +#' extracted from the same mzML file, since it avoids getting the data +#' freshly from \code{msRaw} for every compound. +#' @param peaksCache If present, the complete output of \code{mzR::peaks(msRaw)}. This speeds up the lookup +#' if multiple compounds should be searched in the same file. +#' @param floatingRecalibration +#' A fitting function that \code{predict()}s a mass shift based on the retention time. Can be used +#' if a lockmass calibration is known (however you have to build the calibration yourself.) +#' @return A \code{[rt, intensity, scan]} matrix (\code{scan} being the scan number.) +#' @author Michael A. Stravs, Eawag +#' @seealso findMsMsHR +#' @export +findEIC <- function(msRaw, mz, limit = NULL, rtLimit = NA, headerCache = NULL, floatingRecalibration = NULL, + peaksCache = NULL) +{ + # calculate mz upper and lower limits for "integration" + if(all(c("mzMin", "mzMax") %in% names(mz))) + mzlimits <- c(mz$mzMin, mz$mzMax) + else + mzlimits <- c(mz - limit, mz + limit) + # Find peaklists for all MS1 scans + if(!is.null(headerCache)) + headerData <- as.data.frame(headerCache) + else + headerData <- as.data.frame(header(msRaw)) + # Add row numbering because I'm not sure if seqNum or acquisitionNum correspond to anything really + if(nrow(headerData) > 0) + headerData$rowNum <- 1:nrow(headerData) + else + headerData$rowNum <- integer(0) + + # If RT limit is already given, retrieve only candidates in the first place, + # since this makes everything much faster. + if(all(!is.na(rtLimit))) + headerMS1 <- headerData[ + (headerData$msLevel == 1) & (headerData$retentionTime >= rtLimit[[1]]) + & (headerData$retentionTime <= rtLimit[[2]]) + ,] + else + headerMS1 <- headerData[headerData$msLevel == 1,] + if(is.null(peaksCache)) + pks <- mzR::peaks(msRaw, headerMS1$seqNum) + else + pks <- peaksCache[headerMS1$rowNum] + + # Sum intensities in the given mass window for each scan + if(is.null(floatingRecalibration)) + { + headerMS1$mzMin <- mzlimits[[1]] + headerMS1$mzMax <- mzlimits[[2]] + } + else + { + headerMS1$mzMin <- mzlimits[[1]] + predict(floatingRecalibration, headerMS1$retentionTime) + headerMS1$mzMax <- mzlimits[[2]] + predict(floatingRecalibration, headerMS1$retentionTime) + } + intensity <- unlist(lapply(1:nrow(headerMS1), function(row){ + peaktable <- pks[[row]] + sum(peaktable[ + which((peaktable[,1] >= headerMS1[row,"mzMin"]) & (peaktable[,1] <= headerMS1[row,"mzMax"])),2 + ]) + + })) + return(data.frame(rt = headerMS1$retentionTime, intensity=intensity, scan=headerMS1$acquisitionNum)) +} + + +#' Generate peaks cache +#' +#' Generates a peak cache table for use with \code{\link{findMsMsHR}} functions. +#' +#' @param msRaw the input raw datafile (opened) +#' @param headerCache the cached header, or subset thereof for which peaks should be extracted. Peak extraction goes +#' by \code{seqNum}. +#' @return A list of dataframes as from \code{mzR::peaks}. +#' +#' @author stravsmi +#' @export +makePeaksCache <- function(msRaw, headerCache) +{ + mzR::peaks(msRaw, headerCache$seqNum) +} + +#' Conversion of XCMS-pseudospectra into RMassBank-spectra +#' +#' Converts a pseudospectrum extracted from XCMS using CAMERA into the msmsWorkspace(at)spectrum-format that RMassBank uses +#' +#' @usage toRMB(msmsXCMSspecs, cpdID, mode, MS1spec) +#' @param msmsXCMSspecs The compoundID of the compound that has been used for the peaklist +#' @param cpdID The compound ID of the substance of the given spectrum +#' @param mode The ionization mode that has been used for the spectrum +#' @param MS1spec The MS1-spectrum from XCMS, which can be optionally supplied +#' @return One list element of the (at)specs-entry from an msmsWorkspace +#' @seealso \code{\link{msmsWorkspace-class}} +#' @author Erik Mueller +#' @examples \dontrun{ +#' XCMSpspectra <- findmsmsHRperxcms.direct("Glucolesquerellin_2184_1.mzdata", 2184) +#' wspecs <- toRMB(XCMSpspectra) +#' } +#' @export +toRMB <- function(msmsXCMSspecs = NA, cpdID = NA, mode="pH", MS1spec = NA){ + + + ##Basic parameters + mz <- findMz(cpdID,mode=mode)$mzCenter + id <- cpdID + formula <- findFormula(cpdID) + + if(length(msmsXCMSspecs) == 0){ + return(new("RmbSpectraSet",found=FALSE)) + } + + foundOK <- !any(sapply(msmsXCMSspecs, function(x) all(x == 0))) + + + if(!foundOK){ + return(new("RmbSpectraSet",found=FALSE)) + } + + if(suppressWarnings(is.na(msmsXCMSspecs)[1])){ + stop("You need a readable spectrum!") + } + + if(is.na(cpdID)){ + stop("Please supply the compoundID!") + } + + mockAcqnum <- 1 + mockenv <- environment() + + msmsSpecs <- lapply(msmsXCMSspecs, function(spec){ + ## Mock acquisition num + mockenv$mockAcqnum <- mockenv$mockAcqnum + 1 + + ## Find peak table + pks <- matrix(nrow = length(spec[,1]), ncol = 2) + colnames(pks) <- c("mz","int") + pks[,1] <- spec[,1] + pks[,2] <- spec[,7] + + ## Deprofiling not necessary for XCMS + + ## New spectrum object + return(new("RmbSpectrum2", + mz = pks[,"mz"], + intensity = pks[,"int"], + precScanNum = as.integer(1), + precursorMz = findMz(cpdID)$mzCenter, + precursorIntensity = 0, + precursorCharge = as.integer(1), + collisionEnergy = 0, + tic = 0, + peaksCount = nrow(spec), + rt = median(spec[,4]), + acquisitionNum = as.integer(mockenv$mockAcqnum), + centroided = TRUE + )) + }) + + msmsSpecs <- as(do.call(c, msmsSpecs), "SimpleList") + + ##Build the new objects + masterSpec <- new("Spectrum1", + mz = findMz(cpdID,mode=mode)$mzCenter, + intensity = 100, + polarity = as.integer(0), + peaksCount = as.integer(1), + rt = msmsSpecs[[1]]@rt, + acquisitionNum = as.integer(1), + tic = 0, + centroided = TRUE + ) + + spectraSet <- new("RmbSpectraSet", + parent = masterSpec, + children = msmsSpecs, + found = TRUE, + #complete = NA, + #empty = NA, + #formula = character(), + mz = mz + #name = character(), + #annotations = list() + ) + + return(spectraSet) +} + +#' Addition of manual peaklists +#' +#' Adds a manual peaklist in matrix-format +#' +#' @usage addPeaksManually(w, cpdID, handSpec, mode) +#' @param w The msmsWorkspace that the peaklist should be added to. +#' @param cpdID The compoundID of the compound that has been used for the peaklist +#' @param handSpec A peaklist with 2 columns, one with "mz", one with "int" +#' @param mode The ionization mode that has been used for the spectrum represented by the peaklist +#' @return The \code{msmsWorkspace} with the additional peaklist added to the right spectrum +#' @seealso \code{\link{msmsWorkflow}} +#' @author Erik Mueller +#' @examples \dontrun{ +#' handSpec <- cbind(mz=c(274.986685367956, 259.012401087427, 95.9493025990907, 96.9573002472772), +#' int=c(357,761, 2821, 3446)) +#' addPeaksManually(w, cpdID, handSpec) +#' } +#' @export +addPeaksManually <- function(w, cpdID = NA, handSpec, mode = "pH"){ + + + if(is.na(cpdID)){ + stop("Please supply the compoundID!") + } + + # For the case that the cpdID turns up for the first time + # a new spectrumset needs to be created + if(!(cpdID %in% sapply(w@spectra,function(s) s@id))){ + + # Create fake MS1 spectrum + masterSpec <- new("Spectrum1", + mz = findMz(cpdID,mode=mode)$mzCenter, + intensity = 100, + polarity = as.integer(0), + peaksCount = as.integer(1), + rt = findRt(cpdID)$RT, + acquisitionNum = as.integer(1), + tic = 0, + centroided = TRUE + ) + + # Create fake spectrumset + spectraSet <- new("RmbSpectraSet", + parent = masterSpec, + found = TRUE, + #complete = NA, + #empty = NA, + id = as.character(as.integer(cpdID)), + formula = findFormula(cpdID), + mz = findMz(cpdID,mode=mode)$mzCenter, + name = findName(cpdID), + mode = mode + #annotations = list() + ) + + w@spectra[[length(w@spectra) + 1]] <- spectraSet + } + + specIndex <- which(cpdID == sapply(w@spectra, function(s) s@id)) + + # New spectrum object + w@spectra[[specIndex]]@children[[length(w@spectra[[specIndex]]@children) + 1]] <- new("RmbSpectrum2", + mz = handSpec[,"mz"], + intensity = handSpec[,"int"], + precScanNum = as.integer(1), + precursorMz = findMz(cpdID)$mzCenter, + precursorIntensity = 0, + precursorCharge = as.integer(1), + collisionEnergy = 0, + tic = 0, + peaksCount = nrow(handSpec), + rt = findRt(cpdID)$RT, + acquisitionNum = as.integer(length(w@spectra[[specIndex]]@children) + 2), + centroided = TRUE) + return(w) +} + + +createSpecsFromPeaklists <- function(w, cpdIDs, filenames, mode="pH"){ + for(j in 1:length(filenames)){ + w <- addPeaksManually(w,cpdIDs[j],as.matrix(read.csv(filenames[j]), header=TRUE),mode) + } + + return(w) +} + + +#' MassBank-record Addition +#' +#' Adds the peaklist of a MassBank-Record to the specs of an msmsWorkspace +#' +#' @aliases addMB +#' @usage addMB(w, cpdID, fileName, mode) +#' @param w The msmsWorkspace that the peaklist should be added to. +#' @param cpdID The compoundID of the compound that has been used for the record +#' @param fileName The path to the record +#' @param mode The ionization mode that has been used to create the record +#' @return The \code{msmsWorkspace} with the additional peaklist from the record +#' @seealso \code{\link{addPeaksManually}} +#' @author Erik Mueller +#' @examples \dontrun{ +#' addMB("filepath_to_records/RC00001.txt") +#' } +#' @export +addMB <- function(w, cpdID, fileName, mode){ + mb <- parseMassBank(fileName) + peaklist <- list() + peaklist[[1]] <- mb@compiled_ok[[1]][["PK$PEAK"]][,1:2] + w <- addPeaksManually(w, cpdID, peaklist[[1]], mode) + return(w) +} + diff --git a/R/webAccess.R b/R/webAccess.R index 91d67b7..e075e65 100755 --- a/R/webAccess.R +++ b/R/webAccess.R @@ -1,547 +1,547 @@ -#' @import XML RCurl rjson -NULL -## library(XML) -## library(RCurl) - - - -#' Retrieve information from Cactus -#' -#' Retrieves information from the Cactus Chemical Identifier Resolver -#' (PubChem). -#' -#' It is not necessary to specify in which format the \code{identifier} is. -#' Somehow, cactus does this automatically. -#' -#' @usage getCactus(identifier, representation) -#' @param identifier Any identifier interpreted by the resolver, e.g. an InChI -#' key or a SMILES code. -#' @param representation The desired representation, as required from the -#' resolver. e.g. \code{stdinchikey}, \code{chemspider_id}, \code{formula}... -#' Refer to the webpage for details. -#' @return The result of the query, in plain text. Can be NA, or one or -#' multiple lines (character array) of results. -#' @note Note that the InChI key is retrieved with a prefix (\code{InChIkey=}), -#' which must be removed for most database searches in other databases (e.g. -#' CTS). -#' @author Michael Stravs -#' @seealso \code{\link{getCtsRecord}}, \code{\link{getPcId}} -#' @references cactus Chemical Identifier Resolver: -#' \url{http://cactus.nci.nih.gov/chemical/structure} -#' @examples -#' -#' # Benzene: -#' getCactus("C1=CC=CC=C1", "cas") -#' getCactus("C1=CC=CC=C1", "stdinchikey") -#' getCactus("C1=CC=CC=C1", "chemspider_id") -#' -#' @export -getCactus <- function(identifier, representation) -{ - - ret <- tryCatch( - getURLContent(paste( - "https://cactus.nci.nih.gov/chemical/structure/", - URLencode(identifier), "/", representation, sep='')), - error = function(e) NA) - if(is.na(ret)) - return(NA) - if(ret=="

Page not found (404)

\n") - return(NA) - return(unlist(strsplit(ret, "\n"))) -} - -#' Search Pubchem CID -#' -#' Retrieves PubChem CIDs for a search term. -#' -#' Only the first result is returned currently. \bold{The function should be -#' regarded as experimental and has not thoroughly been tested.} -#' -#' @usage getPcId(query, from = "inchikey") -#' @param query ID to be converted -#' @param from Type of input ID -#' @return The PubChem CID (in string type). -#' @author Michael Stravs, Erik Mueller -#' @seealso \code{\link{getCtsRecord}}, \code{\link{getCactus}} -#' @references PubChem search: \url{http://pubchem.ncbi.nlm.nih.gov/} -#' -#' Pubchem REST: -#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} -#' @examples -#' getPcId("MKXZASYAUGDDCJ-NJAFHUGGSA-N") -#' -#' @export -getPcId <- function(query, from = "inchikey") -{ - baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "description", "json", sep="/") - - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url),timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - # This happens if the InChI key is not found: - r <- fromJSON(data) - - if(!is.null(r$Fault)) - return(NA) - - titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) - - titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] - - PcID <- r$InformationList$Information[[titleEntry]]$CID - - if(is.null(PcID)){ - return(NA) - } else{ - return(PcID) - } -} - -# The following function is unfinished. -# getPcRecord <- function(pcid) -# { -# baseUrl <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" -# term <- paste(baseUrl, "esummary.fcgi?db=pccompound&id=", URLencode(as.character(pcid)), -# -# sep='') -# ret <- getURL(term) -# xml <- xmlParseDoc(ret,asText=TRUE) -# browser() -# } - - -# Note: some of the CHEBI codes returned are erroneous. (When the entry in -# CTS starts with "CHEBI:" instead of just the number, the XML output) -# Also, there is no ChemSpider ID in the XML output, unfortunately. - - - - - - - - -#' Retrieve information from CTS -#' -#' Retrieves a complete CTS record from the InChI key. -#' -#' @usage getCtsRecord(key) -#' -#' @param key The InChI key. -#' @return Returns a list with all information from CTS: \code{inchikey, -#' inchicode, formula, exactmass} contain single values. \code{synonyms} contains -#' an unordered list of scored synonyms (\code{type, name, score}, where \code{type} -#' indicates either a normal name or a specific IUPAC name, see below). -#' \code{externalIds} contains an unordered list of identifiers of the compound in -#' various databases (\code{name, value}, where \code{name} is the database name and -#' \code{value} the identifier in that database.) -#' -#' @note Currently, the CTS results are still incomplete; the name scores are all 0, -#' formula and exact mass return zero. -#' @references Chemical Translation Service: -#' \url{http://cts.fiehnlab.ucdavis.edu} -#' -#' @examples -#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' # show all synonym "types" -#' types <- unique(unlist(lapply(data$synonyms, function(i) i$type))) -#' \dontrun{print(types)} -#' -#' @author Michele Stravs, Eawag -#' @export -getCtsRecord <- function(key) -{ - baseURL <- "http://cts.fiehnlab.ucdavis.edu/service/compound/" - - errorvar <- 0 - currEnvir <- environment() - - ##tryCatch a CTS timeout - ## - tryCatch( - { - data <- getURL(paste0(baseURL,key), timeout=7) - }, - error=function(e){ - currEnvir$errorvar <- 1 - } - ) - - if(errorvar){ - warning("CTS seems to be currently unavailable or incapable of interpreting your request") - return(NULL) - } - - r <- fromJSON(data) - if(length(r) == 1) - if(r == "You entered an invalid InChIKey") - return(list()) - return(r) -} - -#' Convert a single ID to another using CTS. -#' -#' @usage getCtsKey(query, from = "Chemical Name", to = "InChIKey") -#' @param query ID to be converted -#' @param from Type of input ID -#' @param to Desired output ID -#' @return An unordered array with the resulting converted key(s). -#' -#' @examples -#' k <- getCtsKey("benzene", "Chemical Name", "InChIKey") -#' @author Michele Stravs, Eawag -#' @export -getCtsKey <- function(query, from = "Chemical Name", to = "InChIKey") -{ - baseURL <- "http://cts.fiehnlab.ucdavis.edu/service/convert" - url <- paste(baseURL, from, to, query, sep='/') - errorvar <- 0 - currEnvir <- environment() - - ##tryCatch a CTS timeout - ## - tryCatch( - { - data <- getURL(URLencode(url), timeout=7) - }, - error=function(e){ - currEnvir$errorvar <- 1 - } - ) - - if(errorvar){ - warning("CTS seems to be currently unavailable or incapable of interpreting your request") - return(NULL) - } - - - r <- fromJSON(data) - if(length(r) == 0) - return(NULL) - else - { - # read out the results in simplest form: - results <- unlist(lapply(r, function(row) row$result)) - return(results) - } -} - -#' Select a subset of external IDs from a CTS record. -#' -#' @usage CTS.externalIdSubset(data, database) -#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. -#' @param database The database for which keys should be returned. -#' @return Returns an array of all external identifiers stored in the record for the -#' given database. -#' -#' @examples -#' -#' \dontrun{ -#' # Return all CAS registry numbers stored for benzene. -#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' cas <- CTS.externalIdSubset(data, "CAS") -#' } -#' -#' @author Michele Stravs, Eawag -#' @export -CTS.externalIdSubset <- function(data, database) -{ - select <- which(unlist(lapply(data$externalIds, function(id) - { - id[["name"]] == database - }))) - keyEntries <- data$externalIds[select] - keys <- unlist(lapply(keyEntries, function(e) e[["value"]])) -} - -#' Find all available databases for a CTS record -#' -#' @usage CTS.externalIdTypes(data) -#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. -#' @return Returns an array of all database names for which there are external -#' identifiers stored in the record. -#' -#' @examples -#' -#' \dontrun{ -#' # Return all databases for which the benzene entry has -#' # links in the CTS record. -#' -#' data <- getCTS("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' databases <- CTS.externalIdTypes(data) -#' } -#' -#' @author Michele Stravs, Eawag -#' @export -CTS.externalIdTypes <- function(data) -{ - unique(unlist(lapply(data$externalIds, function(id) - { - id[["name"]] - }))) -} - -.pubChemOnline <- function(){ - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, "inchikey", "QEIXBXXKTUNWDK-UHFFFAOYSA-N", "description", "json", sep="/") - - errorvar <- 0 - currEnvir <- environment() - tryCatch( - ret <- getURL(URLencode(url), timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - warning("Pubchem is currently offline") - return(FALSE) - } else{ - return(TRUE) - } -} - - - -getPcCHEBI <- function(query, from = "inchikey") -{ - # Get the JSON-Data from Pubchem - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "synonyms", "json", sep="/") - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url),timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - r <- fromJSON(data) - - # This happens if the InChI key is not found: - if(!is.null(r$Fault)) - return(NA) - - # Find the entries which contain Chebi-links - synonymEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Synonym)))) - synonymList <- r$InformationList$Information[[synonymEntry]]$Synonym - matchChebi <- which(grepl("CHEBI:", synonymList, fixed=TRUE)) - - # It doesn't matter if the db is down or if chebi isn't found, so return NA also - if(length(matchChebi) == 0){ - return (NA) - } else { - return (sapply(matchChebi, function(x) synonymList[[x]])) - } -} - -#' Retrieve the Chemspider ID for a given compound -#' -#' Given an InChIKey, this function queries the chemspider web API to retrieve -#' the Chemspider ID of he compound with that InChIkey. -#' -#' @usage getCSID(query) -#' -#' @param query The InChIKey of the compound -#' @return Returns the chemspide -#' -#' @examples -#' -#' \dontrun{ -#' # Return all CAS registry numbers stored for benzene. -#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' cas <- CTS.externalIdSubset(data, "CAS") -#' } -#' -#' @author Michele Stravs, Eawag -#' @author Erik Mueller, UFZ -#' @export -getCSID <- function(query) -{ - baseURL <- "http://www.chemspider.com/InChI.asmx/InChIKeyToCSID?inchi_key=" - url <- paste0(baseURL, query) - - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url), timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - warning("Chemspider is currently offline") - return(NA) - } - - xml <- xmlParseDoc(data,asText=TRUE) - # the returned XML document contains only the root node called "string" which contains the correct CSID - idNodes <- getNodeSet(xml, "/") - id <- xmlValue(idNodes[[1]]) - return(id) -} - -##This function returns a sensible name for the compound -getPcSynonym <- function (query, from = "inchikey") -{ - # Get the JSON-Data from Pubchem - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "description", "json", sep="/") - - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url),timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - r <- fromJSON(data) - - # This happens if the InChI key is not found: - if(!is.null(r$Fault)) - return(NA) - - # Find the synonym - - titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) - - titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] - - title <- r$InformationList$Information[[titleEntry]]$Title - - if(is.null(title)){ - return(NA) - } else{ - return(title) - } -} - - -##A function to retrieve a IUPAC Name from Pubchem -getPcIUPAC <- function (query, from = "inchikey") -{ - # Get the JSON-Data from Pubchem - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "record", "json", sep="/") - - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url),timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - r <- fromJSON(data) - - # This happens if the InChI key is not found: - if(!is.null(r$Fault)) - return(NA) - - # Find the IUPAC-Names - if(!is.null(r$PC_Compounds[[1]]$props)){ - IUPACIndex <- which(unlist(lapply(r$PC_Compounds[[1]]$props, function(i) (i$urn$label == "IUPAC Name")))) - if(length(IUPACIndex) > 0){ - # Retrieve all IUPAC-Names - IUPACEntries <- lapply(IUPACIndex, function(x) r$PC_Compounds[[1]]$props[[x]]) - if(!is.null(IUPACEntries)){ - # Is there a preferred IUPAC-Name? If yes, retrieve that - PrefIUPAC <- which(unlist(lapply(IUPACEntries, function(x) x$urn$name == "Preferred"))) - } else{return(NA)} - } else{return(NA)} - } else{return(NA)} - - - if(length(PrefIUPAC) == 1){ - return(IUPACEntries[[PrefIUPAC]]$value$sval) - } else{ - # Else it doesn't matter which - return(IUPACEntries[[1]]$value$sval) - } -} - -getPcInchiKey <- function(query, from = "smiles"){ - # Get the JSON-Data from Pubchem - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "record", "json", sep="/") - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url),timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - r <- fromJSON(data) - - # This happens if the InChI key is not found: - if(!is.null(r$Fault)) - return(NA) - - # Find the entries which contain Chebi-links - if(!is.null(r$PC_Compounds[[1]]$props)){ - INKEYindex <- which(sapply(r$PC_Compounds[[1]]$props, function(x) x$urn$label) == "InChIKey") - if(length(INKEYindex) > 0){ - return(r$PC_Compounds[[1]]$props[[INKEYindex]]$value$sval) - } else{return(NA)} - } else{return(NA)} - - -} - -getPcSDF <- function(query, from = "smiles"){ - baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" - url <- paste(baseURL, from, query, "sdf", sep="/") - - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url),timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ - return(NA) - } - - molEnd <- regexpr(data,pattern="M END",fixed=TRUE)+5 - data <- c(strsplit(substring(data,1,molEnd),"\n")[[1]],"$$$$") - return(data) -} - +#' @import XML RCurl rjson +NULL +## library(XML) +## library(RCurl) + + + +#' Retrieve information from Cactus +#' +#' Retrieves information from the Cactus Chemical Identifier Resolver +#' (PubChem). +#' +#' It is not necessary to specify in which format the \code{identifier} is. +#' Somehow, cactus does this automatically. +#' +#' @usage getCactus(identifier, representation) +#' @param identifier Any identifier interpreted by the resolver, e.g. an InChI +#' key or a SMILES code. +#' @param representation The desired representation, as required from the +#' resolver. e.g. \code{stdinchikey}, \code{chemspider_id}, \code{formula}... +#' Refer to the webpage for details. +#' @return The result of the query, in plain text. Can be NA, or one or +#' multiple lines (character array) of results. +#' @note Note that the InChI key is retrieved with a prefix (\code{InChIkey=}), +#' which must be removed for most database searches in other databases (e.g. +#' CTS). +#' @author Michael Stravs +#' @seealso \code{\link{getCtsRecord}}, \code{\link{getPcId}} +#' @references cactus Chemical Identifier Resolver: +#' \url{http://cactus.nci.nih.gov/chemical/structure} +#' @examples +#' +#' # Benzene: +#' getCactus("C1=CC=CC=C1", "cas") +#' getCactus("C1=CC=CC=C1", "stdinchikey") +#' getCactus("C1=CC=CC=C1", "chemspider_id") +#' +#' @export +getCactus <- function(identifier, representation) +{ + + ret <- tryCatch( + getURLContent(paste( + "https://cactus.nci.nih.gov/chemical/structure/", + URLencode(identifier), "/", representation, sep='')), + error = function(e) NA) + if(is.na(ret)) + return(NA) + if(ret=="

Page not found (404)

\n") + return(NA) + return(unlist(strsplit(ret, "\n"))) +} + +#' Search Pubchem CID +#' +#' Retrieves PubChem CIDs for a search term. +#' +#' Only the first result is returned currently. \bold{The function should be +#' regarded as experimental and has not thoroughly been tested.} +#' +#' @usage getPcId(query, from = "inchikey") +#' @param query ID to be converted +#' @param from Type of input ID +#' @return The PubChem CID (in string type). +#' @author Michael Stravs, Erik Mueller +#' @seealso \code{\link{getCtsRecord}}, \code{\link{getCactus}} +#' @references PubChem search: \url{http://pubchem.ncbi.nlm.nih.gov/} +#' +#' Pubchem REST: +#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} +#' @examples +#' getPcId("MKXZASYAUGDDCJ-NJAFHUGGSA-N") +#' +#' @export +getPcId <- function(query, from = "inchikey") +{ + baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "description", "json", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + # This happens if the InChI key is not found: + r <- fromJSON(data) + + if(!is.null(r$Fault)) + return(NA) + + titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) + + titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] + + PcID <- r$InformationList$Information[[titleEntry]]$CID + + if(is.null(PcID)){ + return(NA) + } else{ + return(PcID) + } +} + +# The following function is unfinished. +# getPcRecord <- function(pcid) +# { +# baseUrl <- "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/" +# term <- paste(baseUrl, "esummary.fcgi?db=pccompound&id=", URLencode(as.character(pcid)), +# +# sep='') +# ret <- getURL(term) +# xml <- xmlParseDoc(ret,asText=TRUE) +# browser() +# } + + +# Note: some of the CHEBI codes returned are erroneous. (When the entry in +# CTS starts with "CHEBI:" instead of just the number, the XML output) +# Also, there is no ChemSpider ID in the XML output, unfortunately. + + + + + + + + +#' Retrieve information from CTS +#' +#' Retrieves a complete CTS record from the InChI key. +#' +#' @usage getCtsRecord(key) +#' +#' @param key The InChI key. +#' @return Returns a list with all information from CTS: \code{inchikey, +#' inchicode, formula, exactmass} contain single values. \code{synonyms} contains +#' an unordered list of scored synonyms (\code{type, name, score}, where \code{type} +#' indicates either a normal name or a specific IUPAC name, see below). +#' \code{externalIds} contains an unordered list of identifiers of the compound in +#' various databases (\code{name, value}, where \code{name} is the database name and +#' \code{value} the identifier in that database.) +#' +#' @note Currently, the CTS results are still incomplete; the name scores are all 0, +#' formula and exact mass return zero. +#' @references Chemical Translation Service: +#' \url{http://cts.fiehnlab.ucdavis.edu} +#' +#' @examples +#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") +#' # show all synonym "types" +#' types <- unique(unlist(lapply(data$synonyms, function(i) i$type))) +#' \dontrun{print(types)} +#' +#' @author Michele Stravs, Eawag +#' @export +getCtsRecord <- function(key) +{ + baseURL <- "http://cts.fiehnlab.ucdavis.edu/service/compound/" + + errorvar <- 0 + currEnvir <- environment() + + ##tryCatch a CTS timeout + ## + tryCatch( + { + data <- getURL(paste0(baseURL,key), timeout=7) + }, + error=function(e){ + currEnvir$errorvar <- 1 + } + ) + + if(errorvar){ + warning("CTS seems to be currently unavailable or incapable of interpreting your request") + return(NULL) + } + + r <- fromJSON(data) + if(length(r) == 1) + if(r == "You entered an invalid InChIKey") + return(list()) + return(r) +} + +#' Convert a single ID to another using CTS. +#' +#' @usage getCtsKey(query, from = "Chemical Name", to = "InChIKey") +#' @param query ID to be converted +#' @param from Type of input ID +#' @param to Desired output ID +#' @return An unordered array with the resulting converted key(s). +#' +#' @examples +#' k <- getCtsKey("benzene", "Chemical Name", "InChIKey") +#' @author Michele Stravs, Eawag +#' @export +getCtsKey <- function(query, from = "Chemical Name", to = "InChIKey") +{ + baseURL <- "http://cts.fiehnlab.ucdavis.edu/service/convert" + url <- paste(baseURL, from, to, query, sep='/') + errorvar <- 0 + currEnvir <- environment() + + ##tryCatch a CTS timeout + ## + tryCatch( + { + data <- getURL(URLencode(url), timeout=7) + }, + error=function(e){ + currEnvir$errorvar <- 1 + } + ) + + if(errorvar){ + warning("CTS seems to be currently unavailable or incapable of interpreting your request") + return(NULL) + } + + + r <- fromJSON(data) + if(length(r) == 0) + return(NULL) + else + { + # read out the results in simplest form: + results <- unlist(lapply(r, function(row) row$result)) + return(results) + } +} + +#' Select a subset of external IDs from a CTS record. +#' +#' @usage CTS.externalIdSubset(data, database) +#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. +#' @param database The database for which keys should be returned. +#' @return Returns an array of all external identifiers stored in the record for the +#' given database. +#' +#' @examples +#' +#' \dontrun{ +#' # Return all CAS registry numbers stored for benzene. +#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") +#' cas <- CTS.externalIdSubset(data, "CAS") +#' } +#' +#' @author Michele Stravs, Eawag +#' @export +CTS.externalIdSubset <- function(data, database) +{ + select <- which(unlist(lapply(data$externalIds, function(id) + { + id[["name"]] == database + }))) + keyEntries <- data$externalIds[select] + keys <- unlist(lapply(keyEntries, function(e) e[["value"]])) +} + +#' Find all available databases for a CTS record +#' +#' @usage CTS.externalIdTypes(data) +#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. +#' @return Returns an array of all database names for which there are external +#' identifiers stored in the record. +#' +#' @examples +#' +#' \dontrun{ +#' # Return all databases for which the benzene entry has +#' # links in the CTS record. +#' +#' data <- getCTS("UHOVQNZJYSORNB-UHFFFAOYSA-N") +#' databases <- CTS.externalIdTypes(data) +#' } +#' +#' @author Michele Stravs, Eawag +#' @export +CTS.externalIdTypes <- function(data) +{ + unique(unlist(lapply(data$externalIds, function(id) + { + id[["name"]] + }))) +} + +.pubChemOnline <- function(){ + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, "inchikey", "QEIXBXXKTUNWDK-UHFFFAOYSA-N", "description", "json", sep="/") + + errorvar <- 0 + currEnvir <- environment() + tryCatch( + ret <- getURL(URLencode(url), timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + warning("Pubchem is currently offline") + return(FALSE) + } else{ + return(TRUE) + } +} + + + +getPcCHEBI <- function(query, from = "inchikey") +{ + # Get the JSON-Data from Pubchem + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "synonyms", "json", sep="/") + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the entries which contain Chebi-links + synonymEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Synonym)))) + synonymList <- r$InformationList$Information[[synonymEntry]]$Synonym + matchChebi <- which(grepl("CHEBI:", synonymList, fixed=TRUE)) + + # It doesn't matter if the db is down or if chebi isn't found, so return NA also + if(length(matchChebi) == 0){ + return (NA) + } else { + return (sapply(matchChebi, function(x) synonymList[[x]])) + } +} + +#' Retrieve the Chemspider ID for a given compound +#' +#' Given an InChIKey, this function queries the chemspider web API to retrieve +#' the Chemspider ID of he compound with that InChIkey. +#' +#' @usage getCSID(query) +#' +#' @param query The InChIKey of the compound +#' @return Returns the chemspide +#' +#' @examples +#' +#' \dontrun{ +#' # Return all CAS registry numbers stored for benzene. +#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") +#' cas <- CTS.externalIdSubset(data, "CAS") +#' } +#' +#' @author Michele Stravs, Eawag +#' @author Erik Mueller, UFZ +#' @export +getCSID <- function(query) +{ + baseURL <- "http://www.chemspider.com/InChI.asmx/InChIKeyToCSID?inchi_key=" + url <- paste0(baseURL, query) + + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url), timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + warning("Chemspider is currently offline") + return(NA) + } + + xml <- xmlParseDoc(data,asText=TRUE) + # the returned XML document contains only the root node called "string" which contains the correct CSID + idNodes <- getNodeSet(xml, "/") + id <- xmlValue(idNodes[[1]]) + return(id) +} + +##This function returns a sensible name for the compound +getPcSynonym <- function (query, from = "inchikey") +{ + # Get the JSON-Data from Pubchem + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "description", "json", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the synonym + + titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) + + titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] + + title <- r$InformationList$Information[[titleEntry]]$Title + + if(is.null(title)){ + return(NA) + } else{ + return(title) + } +} + + +##A function to retrieve a IUPAC Name from Pubchem +getPcIUPAC <- function (query, from = "inchikey") +{ + # Get the JSON-Data from Pubchem + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "record", "json", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the IUPAC-Names + if(!is.null(r$PC_Compounds[[1]]$props)){ + IUPACIndex <- which(unlist(lapply(r$PC_Compounds[[1]]$props, function(i) (i$urn$label == "IUPAC Name")))) + if(length(IUPACIndex) > 0){ + # Retrieve all IUPAC-Names + IUPACEntries <- lapply(IUPACIndex, function(x) r$PC_Compounds[[1]]$props[[x]]) + if(!is.null(IUPACEntries)){ + # Is there a preferred IUPAC-Name? If yes, retrieve that + PrefIUPAC <- which(unlist(lapply(IUPACEntries, function(x) x$urn$name == "Preferred"))) + } else{return(NA)} + } else{return(NA)} + } else{return(NA)} + + + if(length(PrefIUPAC) == 1){ + return(IUPACEntries[[PrefIUPAC]]$value$sval) + } else{ + # Else it doesn't matter which + return(IUPACEntries[[1]]$value$sval) + } +} + +getPcInchiKey <- function(query, from = "smiles"){ + # Get the JSON-Data from Pubchem + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "record", "json", sep="/") + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + r <- fromJSON(data) + + # This happens if the InChI key is not found: + if(!is.null(r$Fault)) + return(NA) + + # Find the entries which contain Chebi-links + if(!is.null(r$PC_Compounds[[1]]$props)){ + INKEYindex <- which(sapply(r$PC_Compounds[[1]]$props, function(x) x$urn$label) == "InChIKey") + if(length(INKEYindex) > 0){ + return(r$PC_Compounds[[1]]$props[[INKEYindex]]$value$sval) + } else{return(NA)} + } else{return(NA)} + + +} + +getPcSDF <- function(query, from = "smiles"){ + baseURL <- "http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" + url <- paste(baseURL, from, query, "sdf", sep="/") + + errorvar <- 0 + currEnvir <- environment() + + tryCatch( + data <- getURL(URLencode(url),timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + molEnd <- regexpr(data,pattern="M END",fixed=TRUE)+5 + data <- c(strsplit(substring(data,1,molEnd),"\n")[[1]],"$$$$") + return(data) +} + From 55215a0fb3b6721e55991f97b7b6c1b38e43be7e Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 27 Jul 2018 12:33:46 +0200 Subject: [PATCH 09/71] minor changes --- R/leCsvAccess.R | 17 +++---- R/leMsmsRaw.R | 2 +- R/msmsRead.R | 131 ++++++++++++++++++------------------------------ 3 files changed, 57 insertions(+), 93 deletions(-) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index 7af0405..965c94b 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -33,14 +33,14 @@ assign("listEnv", NULL, envir=.listEnvEnv) #' @export loadList <- function(path, listEnv = NULL, check = TRUE) { - if(is.null(listEnv)) - listEnv <- .listEnvEnv - if(!file.exists(path)) - stop("The supplied file does not exist, please supply a correct path") - + if(is.null(listEnv)) + listEnv <- .listEnvEnv + if(!file.exists(path)) + stop("The supplied file does not exist, please supply a correct path") + # Try out if the file is comma- or semicolon-separated compoundList <- read.csv(path, stringsAsFactors=FALSE, check.names=FALSE) - n <- colnames(compoundList) + n <- colnames(compoundList) if(!("ID" %in% n)){ # If no ID column, it must be semicolon separated compoundList <- read.csv2(path, stringsAsFactors=FALSE, check.names=FALSE) n <- colnames(compoundList) @@ -81,7 +81,6 @@ loadList <- function(path, listEnv = NULL, check = TRUE) # If "level" is in the compound list we have to check several things: if(newList){ - # a) Are the levels part of the defined levels? # b) Are the values ok for every level? (i.e. all necessary values supplied for each line in the compound list?) @@ -141,7 +140,7 @@ loadList <- function(path, listEnv = NULL, check = TRUE) # If level is "3" or "3a", a valid smiles or formula must be supplied if(level %in% c("3","3a")){ - + if(!is.na(findSmiles(compoundList[i,"ID"]))){ tryCatch( findMz(compoundList[i,"ID"]), @@ -196,7 +195,7 @@ loadList <- function(path, listEnv = NULL, check = TRUE) if(length(d)>0){ stop("Some columns are missing in the compound list. Needs at least ID, Name, SMILES, RT, CAS.") } - + ### ###Test if all the IDs work ### diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index 18fc7b4..522239f 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -582,7 +582,7 @@ findMsMsHRperMsp <- function(fileName, cpdIDs, mode="pH"){ return(P) } else { # There is a file for every cpdID - spectra <- toRMB(unlist(findMsMsHRperMsp.direct(fileName, cpdIDs, mode=mode),FALSE)) + spectra <- toRMB(msmsXCMSspecs = unlist(findMsMsHRperMsp.direct(fileName, cpdIDs, mode=mode),FALSE), cpdID = cpdIDs) } sp <- spectra diff --git a/R/msmsRead.R b/R/msmsRead.R index 0b79035..113bf2e 100644 --- a/R/msmsRead.R +++ b/R/msmsRead.R @@ -48,7 +48,7 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, ##Read the files and cpdids according to the definition ##All cases are silently accepted, as long as they can be handled according to one definition if(!any(mode %in% c("pH","pNa","pM","pNH4","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) - + if(is.null(filetable)){ ##If no filetable is supplied, filenames must be named explicitly if(is.null(files)) @@ -136,13 +136,6 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, return(spec) } ), "SimpleList") names(w@spectra) <- basename(as.character(w@files)) - - if(RMassBank.env$verbose.output) - for(specIdx in seq_along(w@spectra)) - if(!w@spectra[[specIdx]]@found) - cat(paste("### Warning ### No precursor ion detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) - - return(w) } ##xcms-readmethod @@ -175,41 +168,27 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, return(spec) }),FALSE),"SimpleList") - - if(RMassBank.env$verbose.output) - for(specIdx in seq_along(w@spectra)) - if(!w@spectra[[specIdx]]@found) - cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) - - return(w) + } else { + ##Routine for the other cases + w@spectra <- as(lapply(uIDs, function(ID){ + # Find files corresponding to the compoundID + currentFile <- w@files[which(cpdids == ID)] + + # Retrieve spectrum data + spec <- findMsMsHRperxcms(currentFile, ID, mode=mode, findPeaksArgs=Args, plots, MSe = MSe) + gc() + + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(spec) + }),"SimpleList") + ##If there are more files than unique cpdIDs, only remember the first file for every cpdID + w@files <- w@files[sapply(uIDs, function(ID){ + return(which(cpdids == ID)[1]) + })] } - - ##Routine for the other cases - w@spectra <- as(lapply(uIDs, function(ID){ - # Find files corresponding to the compoundID - currentFile <- w@files[which(cpdids == ID)] - - # Retrieve spectrum data - spec <- findMsMsHRperxcms(currentFile, ID, mode=mode, findPeaksArgs=Args, plots, MSe = MSe) - gc() - - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(spec) - }),"SimpleList") - ##If there are more files than unique cpdIDs, only remember the first file for every cpdID - w@files <- w@files[sapply(uIDs, function(ID){ - return(which(cpdids == ID)[1]) - })] - - if(RMassBank.env$verbose.output) - for(specIdx in seq_along(w@spectra)) - if(!w@spectra[[specIdx]]@found) - cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) - - return(w) } ##Peaklist-readmethod @@ -225,13 +204,6 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, w@files <- sapply(files,function(file){return(file[1])}) message("Peaks read") - - if(RMassBank.env$verbose.output) - for(specIdx in seq_along(w@spectra)) - if(!w@spectra[[specIdx]]@found) - cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) - - return(w) } ##MSP-readmethod @@ -259,43 +231,36 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, return(spec) }),FALSE),"SimpleList") - - if(RMassBank.env$verbose.output) - for(specIdx in seq_along(w@spectra)) - if(!w@spectra[[specIdx]]@found) - cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) - - return(w) + } else { + ##Routine for the other cases + w@spectra <- as(lapply(uIDs, function(ID){ + # Find files corresponding to the compoundID + currentFile <- w@files[which(cpdids == ID)] + + # Retrieve spectrum data + spec <- findMsMsHRperMsp(currentFile, ID, mode=mode) + gc() + + # Progress: + nProg <<- nProg + 1 + pb <- do.call(progressbar, list(object=pb, value= nProg)) + + return(spec) + }),"SimpleList") + ##If there are more files than unique cpdIDs, only remember the first file for every cpdID + w@files <- w@files[sapply(uIDs, function(ID){ + return(which(cpdids == ID)[1]) + })] } - - ##Routine for the other cases - w@spectra <- as(lapply(uIDs, function(ID){ - # Find files corresponding to the compoundID - currentFile <- w@files[which(cpdids == ID)] - - # Retrieve spectrum data - spec <- findMsMsHRperMsp(currentFile, ID, mode=mode) - gc() - - # Progress: - nProg <<- nProg + 1 - pb <- do.call(progressbar, list(object=pb, value= nProg)) - - return(spec) - }),"SimpleList") - ##If there are more files than unique cpdIDs, only remember the first file for every cpdID - w@files <- w@files[sapply(uIDs, function(ID){ - return(which(cpdids == ID)[1]) - })] - - if(RMassBank.env$verbose.output) - for(specIdx in seq_along(w@spectra)) - if(!w@spectra[[specIdx]]@found) - cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[specIdx]]@id, "'\n", sep = "")) - - return(w) } + ## verbose output + if(RMassBank.env$verbose.output) + for(parentIdx in seq_along(w@spectra)) + if(!w@spectra[[parentIdx]]@found) + cat(paste("### Warning ### No precursor ion was detected for ID '", w@spectra[[parentIdx]]@id, "'\n", sep = "")) + + return(w) } #' From 93ba17c64b149a87a47360fd81760e8a7a6cc549 Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 27 Jul 2018 12:34:20 +0200 Subject: [PATCH 10/71] added fix to keep the properties of child-spectra valid and a minor fix to aggregate spectra without error --- R/leMsMs.r | 561 ++++++++++++++++++++++++++++------------------------- 1 file changed, 299 insertions(+), 262 deletions(-) diff --git a/R/leMsMs.r b/R/leMsMs.r index 308ea19..8faa38f 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -248,7 +248,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec if(6 %in% steps) { message("msmsWorkflow: Step 6. Aggregate recalibrated results") - w@aggregated <- aggregateSpectra(w@spectra, addIncomplete=TRUE) + w@aggregated <- aggregateSpectra(spec = w@spectra, addIncomplete=TRUE) if(RMassBank.env$verbose.output){ numberOfPeaksThere <- sum(unlist(lapply(X = w@spectra, FUN = function(spec){ sum(unlist(lapply(X = spec@children, FUN = function(child){ child@peaksCount }))) }))) @@ -278,8 +278,9 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec if(RMassBank.env$verbose.output){ isNoFormula <- is.na(w@aggregated$formula) & is.na(w@aggregated$reanalyzed.formula) noFormulaCount <- sum(isNoFormula) + numberOfPeaksThere <- sum(unlist(lapply(X = w@spectra, FUN = function(spec){ sum(unlist(lapply(X = spec@children, FUN = function(child){ child@peaksCount }))) }))) if(noFormulaCount > 0){ - cat(paste("### Warning ### ", noFormulaCount, " / ", nrow(unique(x = w@aggregated[, c("mzFound", "intensity", "formulaCount", "dppmBest")])), " peaks have no molecular formula:\n", sep = "")) + cat(paste("### Warning ### ", noFormulaCount, " / ", numberOfPeaksThere, " peaks have no molecular formula:\n", sep = "")) print(w@aggregated[isNoFormula, c("mzFound","intensity","cpdID")]) } } @@ -473,13 +474,11 @@ analyzeMsMs <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", if(method=="formula") { - r <- (analyzeMsMs.formula(msmsPeaks, mode, detail, run, filterSettings - )) + r <- analyzeMsMs.formula(msmsPeaks, mode, detail, run, filterSettings) } else if(method == "intensity") { - r <- (analyzeMsMs.intensity(msmsPeaks, mode, detail, run, filterSettings - )) + r <- analyzeMsMs.intensity(msmsPeaks, mode, detail, run, filterSettings) } # Add the spectrum labels to the spectra here. @@ -513,10 +512,9 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi { cut <- 0 cut_ratio <- 0 - if(run=="preliminary") - { + if(run=="preliminary"){ filterMode <- "coarse" - cut <- filterSettings$prelimCut + cut <- filterSettings$prelimCut if(is.na(cut)) { if(mode %in% c("pH", "pM", "pNa", "pNH4")) @@ -525,10 +523,8 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi cut <- 0 else stop(paste("The ionization mode", mode, "is unknown.")) } - cutRatio <- filterSettings$prelimCutRatio - } - else - { + cutRatio <- filterSettings$prelimCutRatio + } else { filterMode <- "fine" cut <- filterSettings$fineCut cut_ratio <- filterSettings$fineCutRatio @@ -547,13 +543,13 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi # with insufficient match accuracy or no match. analyzeTandemShot <- function(child) { - childIdx <- which(sapply(X = seq_along(msmsPeaks@children), FUN = function(i){ - all(child@mz == msmsPeaks@children[[i]]@mz) & all(child@rt == msmsPeaks@children[[i]]@rt) & all(child@intensity == msmsPeaks@children[[i]]@intensity) } - )) - shot <- getData(child) - shot$row <- which(!is.na(shot$mz)) - - + childIdx <- which(sapply(X = seq_along(msmsPeaks@children), FUN = function(i){ + all(child@mz == msmsPeaks@children[[i]]@mz) & all(child@rt == msmsPeaks@children[[i]]@rt) & all(child@intensity == msmsPeaks@children[[i]]@intensity) } + )) + shot <- getData(child) + shot$row <- which(!is.na(shot$mz)) + + # Filter out low intensity peaks: child@low <- (shot$intensity < cut) | (shot$intensity < max(shot$intensity)*cut_ratio) shot <- shot[!child@low,,drop=FALSE] @@ -561,261 +557,282 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi # Is there still anything left? if(length(which(!child@low))==0) - { - child@ok <- FALSE - - if(RMassBank.env$verbose.output) - cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' contains only low-intensity peaks\n", sep = "")) - - return(child) - } - + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' contains only low-intensity peaks\n", sep = "")) + + return(child) + } + # Filter out satellite peaks: shot <- filterPeakSatellites(shot, filterSettings) - child@satellite <- rep(TRUE, child@peaksCount) - child@satellite[which(child@low == TRUE)] <- NA - child@satellite[shot$row] <- FALSE - + child@satellite <- rep(TRUE, child@peaksCount) + child@satellite[which(child@low == TRUE)] <- NA + child@satellite[shot$row] <- FALSE + # Is there still anything left? if(nrow(shot)==0) - { - child@ok <- FALSE - - if(RMassBank.env$verbose.output) - cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' contains no peaks after satellite filtering\n", sep = "")) - - return(child) - } - + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' contains no peaks after satellite filtering\n", sep = "")) + + return(child) + } + if(max(shot$intensity) < as.numeric(filterSettings$specOkLimit)) - { - child@ok <- FALSE - - if(RMassBank.env$verbose.output) - cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is discarded due to parameter 'specOkLimit'\n", sep = "")) - - return(child) - } - + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is discarded due to parameter 'specOkLimit'\n", sep = "")) + + return(child) + } + # Crop to 4 digits (necessary because of the recalibrated values) - # this was done for the MOLGEN MSMS type analysis, is not necessary anymore now (23.1.15 MST) + # this was done for the MOLGEN MSMS type analysis, is not necessary anymore now (23.1.15 MST) # shot[,mzColname] <- round(shot[,mzColname], 5) + + # here follows the Rcdk analysis + #------------------------------------ + parentPeaks <- data.frame(mzFound=msmsPeaks@mz, + formula=msmsPeaks@formula, + dppm=0, + x1=0,x2=0,x3=0) + + # define the adduct additions + if(mode == "pH") { + allowed_additions <- "H" + mode.charge <- 1 + } else if(mode == "pNa") { + allowed_additions <- "Na" + mode.charge <- 1 + } else if(mode == "pM") { + allowed_additions <- "" + mode.charge <- 1 + } else if(mode == "mM") { + allowed_additions <- "" + mode.charge <- -1 + } else if(mode == "mH") { + allowed_additions <- "H-1" + mode.charge <- -1 + } else if(mode == "mFA") { + allowed_additions <- "C2H3O2" + mode.charge <- -1 + } else if(mode == "pNH4") { + allowed_additions <- "NH4" + mode.charge <- 1 + } else{ + stop("mode = \"", mode, "\" not defined") + } + + # the ppm range is two-sided here. + # The range is slightly expanded because dppm calculation of + # generate.formula starts from empirical mass, but dppm cal- + # culation of the evaluation starts from theoretical mass. + # So we don't miss the points on 'the border'. + + if(run=="preliminary") + ppmlimit <- 2 * max(filterSettings$ppmLowMass, filterSettings$ppmHighMass) + else + ppmlimit <- 2.25 * filterSettings$ppmFine + + parent_formula <- add.formula(msmsPeaks@formula, allowed_additions) + dbe_parent <- dbe(parent_formula) + # check whether the formula is valid, i.e. has no negative or zero element numbers. + #print(parent_formula) + if(!is.valid.formula(parent_formula)) + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The precursor ion formula of spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is invalid\n", sep = "")) + + return(child) + } + + limits <- to.limits.rcdk(parent_formula) + + peakmatrix <- lapply(split(shot,shot$row), function(shot.row) { + # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae + # finally back-correct calculated masses for the charge + mass <- shot.row[["mz"]] + mass.calc <- mass + mode.charge * .emass + peakformula <- tryCatch(suppressWarnings( + rcdk::generate.formula(mass = mass.calc, window = ppm(mass.calc, ppmlimit, p=TRUE), elements = limits, charge=0) + ), error=function(e) + ## in case of zero formulas: Error in .jcall(mfSet, "I", "size") : RcallMethod: invalid object parameter + NA + ) + #peakformula <- tryCatch( + # generate.formula(mass, + # ppm(mass, ppmlimit, p=TRUE), + # limits, charge=1), + #error= function(e) list()) + + if(!is.list(peakformula)) + return(t(c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, + formula=NA, mzCalc=NA))) + else + { + return(t(sapply(peakformula, function(f) + { + mzCalc <- f@mass - mode.charge * .emass + c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, + formula=f@string, + mzCalc=mzCalc) + }))) + } + }) + + childPeaks <- as.data.frame(do.call(rbind, peakmatrix)) + + # Reformat the deformatted output correctly (why doesn't R have a better way to do this, e.g. avoid deformatting?) + + childPeaks$row <- as.numeric(as.character(childPeaks$row)) + childPeaks$intensity <- as.numeric(as.character(childPeaks$intensity)) + childPeaks$mz <- as.numeric(as.character(childPeaks$mz)) + childPeaks$formula <- as.character(childPeaks$formula) + childPeaks$mzCalc <- as.numeric(as.character(childPeaks$mzCalc)) + childPeaks$dppm <- (childPeaks$mz / childPeaks$mzCalc - 1) * 1e6 + childPeaks$dbe <- unlist(lapply(childPeaks$formula, dbe)) + + # childPeaks now contains all the good and unmatched peaks + # but not the ones which were cut as satellites or below threshold. + + ## child@mzFound <- rep(NA, child@peaksCount) + ## child@mzFound[childPeaks$row] <- as.numeric(as.character(childPeaks$mzFound)) + ## + ## child@formula <- rep(NA, child@peaksCount) + ## child@formula[childPeaks$row] <- as.character(childPeaks$formula) + ## + ## child@mzCalc <- rep(NA, child@peaksCount) + ## child@mzCalc[childPeaks$row] <- as.numeric(as.character(childPeaks$mzCalc)) + ## + ## child@dppm<- rep(NA, child@peaksCount) + ## child@dppm[childPeaks$row] <- (childPeaks$mzFound / childPeaks$mzCalc - 1) * 1e6 + # delete the NA data out again, because MolgenMsMs doesn't have them + # here and they will be re-added later + # (this is just left like this for "historical" reasons) + #childPeaks <- childPeaks[!is.na(childPeaks$formula),] + # check if a peak was recognized (here for the first time, + # otherwise the next command would fail) + + if(nrow(childPeaks)==0) + { + child@ok <- FALSE + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is empty\n", sep = "")) + + return(child) + } - # here follows the Rcdk analysis - #------------------------------------ - parentPeaks <- data.frame(mzFound=msmsPeaks@mz, - formula=msmsPeaks@formula, - dppm=0, - x1=0,x2=0,x3=0) - - # define the adduct additions - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else if(mode == "pNH4") { - allowed_additions <- "NH4" - mode.charge <- 1 - } else{ - stop("mode = \"", mode, "\" not defined") - } - - # the ppm range is two-sided here. - # The range is slightly expanded because dppm calculation of - # generate.formula starts from empirical mass, but dppm cal- - # culation of the evaluation starts from theoretical mass. - # So we don't miss the points on 'the border'. - - if(run=="preliminary") - ppmlimit <- 2 * max(filterSettings$ppmLowMass, filterSettings$ppmHighMass) - else - ppmlimit <- 2.25 * filterSettings$ppmFine - - parent_formula <- add.formula(msmsPeaks@formula, allowed_additions) - dbe_parent <- dbe(parent_formula) - # check whether the formula is valid, i.e. has no negative or zero element numbers. - #print(parent_formula) - if(!is.valid.formula(parent_formula)) - { - child@ok <- FALSE - - if(RMassBank.env$verbose.output) - cat(paste("### Warning ### The precursor ion formula of spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is invalid\n", sep = "")) - - return(child) - } - - limits <- to.limits.rcdk(parent_formula) - - peakmatrix <- lapply( - split(shot,shot$row) - , function(shot.row) { - # Circumvent bug in rcdk: correct the mass for the charge first, then calculate uncharged formulae - # finally back-correct calculated masses for the charge - mass <- shot.row[["mz"]] - mass.calc <- mass + mode.charge * .emass - peakformula <- tryCatch(suppressWarnings(generate.formula(mass = mass.calc, window = ppm(mass.calc, ppmlimit, p=TRUE), - elements = limits, charge=0)), error=function(e) NA) - #peakformula <- tryCatch( - # generate.formula(mass, - # ppm(mass, ppmlimit, p=TRUE), - # limits, charge=1), - #error= function(e) list()) - - if(!is.list(peakformula)) - return(t(c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, - formula=NA, mzCalc=NA))) - else - { - return(t(sapply(peakformula, function(f) - { - mzCalc <- f@mass - mode.charge * .emass - c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, - formula=f@string, - mzCalc=mzCalc) - }))) - } - }) - - childPeaks <- as.data.frame(do.call(rbind, peakmatrix)) - - # Reformat the deformatted output correctly (why doesn't R have a better way to do this, e.g. avoid deformatting?) - - childPeaks$row <- as.numeric(as.character(childPeaks$row)) - childPeaks$intensity <- as.numeric(as.character(childPeaks$intensity)) - childPeaks$mz <- as.numeric(as.character(childPeaks$mz)) - childPeaks$formula <- as.character(childPeaks$formula) - childPeaks$mzCalc <- as.numeric(as.character(childPeaks$mzCalc)) - childPeaks$dppm <- (childPeaks$mz / childPeaks$mzCalc - 1) * 1e6 - childPeaks$dbe <- unlist(lapply(childPeaks$formula, dbe)) - - # childPeaks now contains all the good and unmatched peaks - # but not the ones which were cut as satellites or below threshold. - - ## child@mzFound <- rep(NA, child@peaksCount) - ## child@mzFound[childPeaks$row] <- as.numeric(as.character(childPeaks$mzFound)) - ## - ## child@formula <- rep(NA, child@peaksCount) - ## child@formula[childPeaks$row] <- as.character(childPeaks$formula) - ## - ## child@mzCalc <- rep(NA, child@peaksCount) - ## child@mzCalc[childPeaks$row] <- as.numeric(as.character(childPeaks$mzCalc)) - ## - ## child@dppm<- rep(NA, child@peaksCount) - ## child@dppm[childPeaks$row] <- (childPeaks$mzFound / childPeaks$mzCalc - 1) * 1e6 - # delete the NA data out again, because MolgenMsMs doesn't have them - # here and they will be re-added later - # (this is just left like this for "historical" reasons) - #childPeaks <- childPeaks[!is.na(childPeaks$formula),] - # check if a peak was recognized (here for the first time, - # otherwise the next command would fail) - - if(nrow(childPeaks)==0) - { - child@ok <- FALSE - - if(RMassBank.env$verbose.output) - cat(paste("### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' is empty\n", sep = "")) - - return(child) - } - - # now apply the rule-based filters to get rid of total junk: - # dbe >= -0.5, dbe excess over mother cpd < 3 - # dbe() has been adapted to return NA for NA input - #iff_rcdk_pM_eln$maxvalence <- unlist(lapply(diff_rcdk_pM_eln$formula.rcdk, maxvalence)) - temp.child.ok <- (childPeaks$dbe >= filterSettings$dbeMinLimit) - # & dbe < dbe_parent + 3) - # check if a peak was recognized - if(length(which(temp.child.ok)) == 0) - { - child@ok <- FALSE - return(child) - } + if(all(is.na(childPeaks$formula))) + { + child@ok <- FALSE + child@good <- rep(FALSE, length(childPeaks$formula)) + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' comprises no peaks which could be assiged to a molecular formula\n", sep = "")) + + return(child) + } + + # now apply the rule-based filters to get rid of total junk: + # dbe >= -0.5, dbe excess over mother cpd < 3 + # dbe() has been adapted to return NA for NA input + #iff_rcdk_pM_eln$maxvalence <- unlist(lapply(diff_rcdk_pM_eln$formula.rcdk, maxvalence)) + temp.child.ok <- (childPeaks$dbe >= filterSettings$dbeMinLimit) + # & dbe < dbe_parent + 3) + # check if a peak was recognized + if(length(which(temp.child.ok)) == 0) + { + child@ok <- FALSE + child@good <- rep(FALSE, length(temp.child.ok)) + + if(RMassBank.env$verbose.output) + cat(paste("\n### Warning ### The spectrum '#", childIdx, "' for ID '", msmsPeaks@id, "' comprises no peaks which fulfil the dbeMinLimit criterion\n", sep = "")) + + return(child) + } #browser() - # find the best ppm value - bestPpm <- aggregate(as.data.frame(childPeaks[!is.na(childPeaks$dppm),"dppm"]), - list(childPeaks[!is.na(childPeaks$dppm),"row"]), - function(dppm) dppm[[which.min(abs(dppm))]]) + # find the best ppm value + bestPpm <- aggregate( + x = as.data.frame(childPeaks[!is.na(childPeaks$dppm),"dppm"]), + by = list(childPeaks[!is.na(childPeaks$dppm),"row"]), + FUN = function(dppm) dppm[[which.min(abs(dppm))]] + ) colnames(bestPpm) <- c("row", "dppmBest") childPeaks <- merge(childPeaks, bestPpm, by="row", all.x=TRUE) - - # Deactivated the following lines because we never actually want to look at the "old" formula count. - # To be verified (cf Refiltering, failpeak list and comparable things) - - ## # count formulas found per mass - ## countFormulasTab <- xtabs( ~formula + mz, data=childPeaks) - ## countFormulas <- colSums(countFormulasTab) - ## childPeaks$formulaCount <- countFormulas[as.character(childPeaks$row)] - + + # Deactivated the following lines because we never actually want to look at the "old" formula count. + # To be verified (cf Refiltering, failpeak list and comparable things) + + ## # count formulas found per mass + ## countFormulasTab <- xtabs( ~formula + mz, data=childPeaks) + ## countFormulas <- colSums(countFormulasTab) + ## childPeaks$formulaCount <- countFormulas[as.character(childPeaks$row)] + # filter results childPeaksFilt <- filterLowaccResults(childPeaks, filterMode, filterSettings) childPeaksGood <- childPeaksFilt[["TRUE"]] childPeaksBad <- childPeaksFilt[["FALSE"]] - if(is.null(childPeaksGood)){ - childPeaksGood <- childPeaks[c(),,drop=FALSE] - childPeaksGood$good <- logical(0) + if(is.null(childPeaksGood)){ + childPeaksGood <- childPeaks[c(),,drop=FALSE] + childPeaksGood$good <- logical(0) } - if(is.null(childPeaksBad)) - childPeaksBad <- childPeaks[c(),,drop=FALSE] - childPeaksUnassigned <- childPeaks[is.na(childPeaks$dppm),,drop=FALSE] - childPeaksUnassigned$good <- rep(FALSE, nrow(childPeaksUnassigned)) - # count formulas within new limits + if(is.null(childPeaksBad)) + childPeaksBad <- childPeaks[c(),,drop=FALSE] + childPeaksUnassigned <- childPeaks[is.na(childPeaks$dppm),,drop=FALSE] + childPeaksUnassigned$good <- rep(FALSE, nrow(childPeaksUnassigned)) + + # count formulas within new limits # (the results of the "old" count stay in childPeaksInt and are returned # in $childPeaks) - countFormulasTab <- xtabs( ~formula + mz, data=childPeaksGood) - countFormulas <- colSums(countFormulasTab) - childPeaksGood$formulaCount <- countFormulas[as.character(childPeaksGood$mz)] - - childPeaksUnassigned$formulaCount <- rep(NA, nrow(childPeaksUnassigned)) - childPeaksBad$formulaCount <- rep(NA, nrow(childPeaksBad)) - childPeaksBad$good <- rep(FALSE, nrow(childPeaksBad)) - - # Now: childPeaksGood (containing the new, recounted peaks with good = TRUE), and childPeaksBad (containing the - # peaks with good=FALSE, i.e. outside filter criteria, with the old formula count even though it is worthless) - # are bound together. - childPeaksBad <- childPeaksBad[,colnames(childPeaksGood),drop=FALSE] - childPeaksUnassigned <- childPeaksUnassigned[,colnames(childPeaksGood),drop=FALSE] - childPeaks <- rbind(childPeaksGood, childPeaksBad, childPeaksUnassigned) - - # Now let's cross fingers. Add a good=NA column to the unmatched peaks and reorder the columns - # to match order in childPeaks. After that, setData to the child slot. - - childPeaksOmitted <- getData(child) - childPeaksOmitted <- childPeaksOmitted[child@low | child@satellite,,drop=FALSE] - childPeaksOmitted$rawOK <- rep(FALSE, nrow(childPeaksOmitted)) - childPeaksOmitted$good <- rep(FALSE, nrow(childPeaksOmitted)) - childPeaksOmitted$dppm <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$formula <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$mzCalc <- rep(NA, nrow(childPeaksOmitted)) - childPeaksOmitted$dbe <- rep(NA, nrow(childPeaksOmitted)) + countFormulasTab <- xtabs( ~formula + mz, data=childPeaksGood) + countFormulas <- colSums(countFormulasTab) + childPeaksGood$formulaCount <- countFormulas[as.character(childPeaksGood$mz)] + + childPeaksUnassigned$formulaCount <- rep(NA, nrow(childPeaksUnassigned)) + childPeaksBad$formulaCount <- rep(NA, nrow(childPeaksBad)) + childPeaksBad$good <- rep(FALSE, nrow(childPeaksBad)) + + # Now: childPeaksGood (containing the new, recounted peaks with good = TRUE), and childPeaksBad (containing the + # peaks with good=FALSE, i.e. outside filter criteria, with the old formula count even though it is worthless) + # are bound together. + childPeaksBad <- childPeaksBad[,colnames(childPeaksGood),drop=FALSE] + childPeaksUnassigned <- childPeaksUnassigned[,colnames(childPeaksGood),drop=FALSE] + childPeaks <- rbind(childPeaksGood, childPeaksBad, childPeaksUnassigned) + + # Now let's cross fingers. Add a good=NA column to the unmatched peaks and reorder the columns + # to match order in childPeaks. After that, setData to the child slot. + + childPeaksOmitted <- getData(child) + childPeaksOmitted <- childPeaksOmitted[child@low | child@satellite,,drop=FALSE] + childPeaksOmitted$rawOK <- rep(FALSE, nrow(childPeaksOmitted)) + childPeaksOmitted$good <- rep(FALSE, nrow(childPeaksOmitted)) + childPeaksOmitted$dppm <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$formula <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$mzCalc <- rep(NA, nrow(childPeaksOmitted)) + childPeaksOmitted$dbe <- rep(NA, nrow(childPeaksOmitted)) childPeaksOmitted$dppmBest <- rep(NA, nrow(childPeaksOmitted)) childPeaksOmitted$formulaCount <- rep(0, nrow(childPeaksOmitted)) - childPeaks$satellite <- rep(FALSE, nrow(childPeaks)) - childPeaks$low <- rep(FALSE, nrow(childPeaks)) - childPeaks$rawOK <- rep(TRUE, nrow(childPeaks)) - - childPeaks <- childPeaks[,colnames(childPeaksOmitted), drop=FALSE] - - childPeaksTotal <- rbind(childPeaks, childPeaksOmitted) - child <- setData(child, childPeaksTotal) - child@ok <- TRUE - - return(child) + childPeaks$satellite <- rep(FALSE, nrow(childPeaks)) + childPeaks$low <- rep(FALSE, nrow(childPeaks)) + childPeaks$rawOK <- rep(TRUE, nrow(childPeaks)) + + childPeaks <- childPeaks[,colnames(childPeaksOmitted), drop=FALSE] + + childPeaksTotal <- rbind(childPeaks, childPeaksOmitted) + child <- setData(child, childPeaksTotal) + child@ok <- TRUE + + return(child) } # I believe these lines were fixed to remove a warning but in the refactored workflow "mzranges" doesn't exist anymore. @@ -832,9 +849,20 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi ## mzmax <- max(mzranges[,2], na.rm=TRUE) children <- lapply(msmsPeaks@children, analyzeTandemShot) + ## correct fields in case of invalid data + children <- lapply(children, function(child){ + if(child@ok) return(child) + child@rawOK <- rep(x = TRUE, times = child@peaksCount) + child@good <- rep(x = FALSE, times = child@peaksCount) + child@mzCalc <- as.numeric( rep(x = NA, times = child@peaksCount)) + child@formula <- as.character(rep(x = NA, times = child@peaksCount)) + child@dbe <- as.numeric( rep(x = NA, times = child@peaksCount)) + child@formulaCount <- as.integer( rep(x = NA, times = child@peaksCount)) + child@dppm <- as.numeric( rep(x = NA, times = child@peaksCount)) + child@dppmBest <- as.numeric( rep(x = NA, times = child@peaksCount)) + return(child) + }) - - ## shots <- mapply(function(shot, scan, info) ## { ## shot$scan <- scan @@ -1110,6 +1138,15 @@ aggregateSpectra <- function(spec, addIncomplete=FALSE) return(table.c) }) table.cpd <- do.call(rbind, tables.c) + + ## complete missing columns if necessary + ## mz intensity good scan cpdID parentScan + ## mz intensity good mzCalc formula dbe formulaCount dppm dppmBest scan cpdID parentScan + columnNames <- c("mzCalc", "formula", "dbe", "formulaCount", "dppm", "dppmBest") + if(all(!(columnNames %in% colnames(table.cpd)))) + for(columnName in columnNames) + table.cpd[, columnName] <- as.numeric(rep(x = NA, times = nrow(table.cpd))) + table.cpd$cpdID <- rep(s@id, nrow(table.cpd)) table.cpd$parentScan <- rep(s@parent@acquisitionNum, nrow(table.cpd)) return(table.cpd) @@ -1360,7 +1397,7 @@ makeRecalibration <- function(w, mode, stop("No spectra present to generate recalibration curve.") rcdata <- peaksMatched(w) - rcdata <- rcdata[rcdata$formulaCount == 1, ,drop=FALSE] + rcdata <- rcdata[!is.na(rcdata$formulaCount) & rcdata$formulaCount == 1, ,drop=FALSE] rcdata <- rcdata[,c("mzFound", "dppm", "mzCalc")] @@ -1399,9 +1436,9 @@ makeRecalibration <- function(w, mode, # plot the model par(mfrow=c(2,2)) if(nrow(rcdata)>0) - plotRecalibration.direct(rcdata, rc, rc.ms1, "MS2", - range(rcdata$mzFound), - recalibrateBy) + plotRecalibration.direct(rcdata = rcdata, rc = rc, rc.ms1 = rc.ms1, title = "MS2", + mzrange = range(rcdata$mzFound), + recalibrateBy = recalibrateBy) if(nrow(ms1data)>0) plotRecalibration.direct(ms1data, rc, rc.ms1, "MS1", range(ms1data$mzFound), @@ -1821,7 +1858,7 @@ reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, #print(parent_formula) limits <- to.limits.rcdk(parent_formula) - peakformula <- tryCatch(suppressWarnings(generate.formula(mass, ppm(mass, ppmlimit, p=TRUE), + peakformula <- tryCatch(suppressWarnings(rcdk::generate.formula(mass, ppm(mass, ppmlimit, p=TRUE), limits, charge=mode.charge)), error=function(e) NA) # was a formula found? If not, return empty result if(!is.list(peakformula)) From 97087e6392f9aaa7d499eb0e16619c319b4d4568 Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 27 Jul 2018 12:34:57 +0200 Subject: [PATCH 11/71] added export of invalid records, decomposed the export of records and molfiles --- R/RmbWorkspace.R | 2 + R/createMassBank.R | 145 +++++++++++++++++++++++++++++++-------------- R/zzz.R | 5 +- 3 files changed, 107 insertions(+), 45 deletions(-) diff --git a/R/RmbWorkspace.R b/R/RmbWorkspace.R index 72fe617..05b01d2 100755 --- a/R/RmbWorkspace.R +++ b/R/RmbWorkspace.R @@ -144,7 +144,9 @@ setClass("mbWorkspace", # output data: compiled = "list", compiled_ok = "list", + compiled_notOk = "list", mbfiles = "list", + mbfiles_notOk = "list", molfile = "list", ok = "integer", problems = "integer" diff --git a/R/createMassBank.R b/R/createMassBank.R index 872bcb7..50d23f6 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -75,26 +75,32 @@ loadInfolist <- function(mb, fileName) "COMMENT.ID" # Clear from padding spaces and NAs - mbdata_new <- as.data.frame(t(apply(mbdata_new, 1, function(r) + mbdata_new <- as.data.frame(x = t(apply(mbdata_new, 1, function(r) { # Substitute empty spaces by real NA values r[which(r == "")] <- NA # Trim spaces (in all non-NA fields) r[which(!is.na(r))] <- sub("^ *([^ ]+) *$", "\\1", r[which(!is.na(r))]) return(r) - }))) + })), stringsAsFactors = FALSE) # use only the columns present in mbdata_archive, no other columns added in excel mbdata_new <- mbdata_new[, colnames(mb@mbdata_archive)] # substitute the old entires with the ones from our files # then find the new (previously inexistent) entries, and rbind them to the table new_entries <- setdiff(mbdata_new$id, mb@mbdata_archive$id) old_entries <- intersect(mbdata_new$id, mb@mbdata_archive$id) + + for(colname in colnames(mb@mbdata_archive)) + mb@mbdata_archive[, colname] <- as.character(mb@mbdata_archive[, colname]) + for(entry in old_entries) mb@mbdata_archive[mb@mbdata_archive$id == entry,] <- mbdata_new[mbdata_new$id == entry,] - mb@mbdata_archive <- rbind(mb@mbdata_archive, - mbdata_new[mbdata_new$id==new_entries,]) + mb@mbdata_archive <- rbind(mb@mbdata_archive, mbdata_new[mbdata_new$id==new_entries,]) + + for(colname in colnames(mb@mbdata_archive)) + mb@mbdata_archive[, colname] <- as.factor(mb@mbdata_archive[, colname]) + return(mb) - } @@ -229,9 +235,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c if(4 %in% steps) { message("mbWorkflow: Step 4. Spectra compilation") - mb@compiled <- lapply( - selectSpectra(mb@spectra, "found", "object"), - function(r) { + mb@compiled <- lapply(X = selectSpectra(mb@spectra, "found", "object"), FUN = function(r) { message(paste("Compiling: ", r@name, sep="")) mbdata <- mb@mbdata_relisted[[which(mb@mbdata_archive$id == as.numeric(r@id))]] if(nrow(mb@additionalPeaks) > 0) @@ -241,41 +245,86 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c return(res) }) # check which compounds have useful spectra - mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) + ok <- unlist(lapply(X = selectSpectra(mb@spectra, "found", "object"), FUN = function(spec){unlist(lapply(X = spec@children, FUN = function(child){child@ok}))})) + mb@ok <- which(ok) + #mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) mb@problems <- which(is.na(mb@compiled)) - mb@compiled_ok <- mb@compiled[mb@ok] + mb@compiled_ok <- mb@compiled[mb@ok] + mb@compiled_notOk <- mb@compiled[!ok] } # Step 5: Convert the internal tree-like representation of the MassBank data into # flat-text string arrays (basically, into text-file style, but still in memory) if(5 %in% steps) { message("mbWorkflow: Step 5. Flattening records") - mb@mbfiles <- lapply(mb@compiled_ok, function(c) lapply(c, toMassbank)) + mb@mbfiles <- lapply(mb@compiled_ok, function(c) lapply(c, toMassbank)) + mb@mbfiles_notOk <- lapply(mb@compiled_notOk, function(c) lapply(c, toMassbank)) } # Step 6: For all OK records, generate a corresponding molfile with the structure # of the compound, based on the SMILES entry from the MassBank record. (This molfile # is still in memory only, not yet a physical file) if(6 %in% steps) { - message("mbWorkflow: Step 6. Generate molfiles") - mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(as.numeric(c[[1]][['COMMENT']][[getOption("RMassBank")$annotations$internal_id_fieldname]]))) + if(RMassBank.env$export.molfiles){ + message("mbWorkflow: Step 6. Generate molfiles") + mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(as.numeric(c[[1]][['COMMENT']][[getOption("RMassBank")$annotations$internal_id_fieldname]]))) + } else + warning("RMassBank is configured not to export molfiles (RMassBank.env$export.molfiles). Step 6 is therefore ignored.") } # Step 7: If necessary, generate the appropriate subdirectories, and actually write # the files to disk. if(7 %in% steps) { message("mbWorkflow: Step 7. Generate subdirs and export") - dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "moldata", sep='/'),recursive=TRUE) - dir.create(paste(getOption("RMassBank")$annotations$entry_prefix, "recdata", sep='/'),recursive=TRUE) - for(cnt in seq_along(mb@compiled_ok)) - exportMassbank(mb@compiled_ok[[cnt]], mb@mbfiles[[cnt]], mb@molfile[[cnt]]) + + ## create folder + filePath_recData_valid <- file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata") + filePath_recData_invalid <- file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata_invalid") + filePath_molData <- file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata") + + dir.create(filePath_recData_valid,recursive=TRUE) + if(RMassBank.env$export.molfiles) + dir.create(filePath_molData,recursive=TRUE) + if(RMassBank.env$export.invalid) + dir.create(filePath_recData_invalid,recursive=TRUE) + + if(length(mb@molfile) == 0) + mb@molfile <- as.list(rep(x = NA, times = length(mb@compiled_ok))) + + ## export valid spectra + for(cnt in seq_along(mb@compiled_ok)){ + exportMassbank_recdata( + accessions = unlist(lapply(X = mb@compiled_ok[[cnt]], FUN = "[", "ACCESSION")), + files = mb@mbfiles[[cnt]], + recDataFolder = filePath_recData_valid + ) + + if(findLevel(mb@compiled_ok[[cnt]][[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]],TRUE)=="standard" & RMassBank.env$export.molfiles) + exportMassbank_moldata( + cpdID = as.numeric(mb@compiled_ok[[cnt]][[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]]), + molfile = mb@molfile[[cnt]], + molDataFolder = filePath_molData + ) + } + + ## export invalid spectra + if(RMassBank.env$export.invalid) + for(cnt in seq_along(mb@compiled_notOk)) + exportMassbank_recdata( + accessions = unlist(lapply(X = mb@compiled_notOk[[cnt]], FUN = "[", "ACCESSION")), + files = mb@mbfiles_notOk[[cnt]], + recDataFolder = filePath_recData_invalid + ) } # Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank # to attribute substances to their corresponding structure molfiles. if(8 %in% steps) { - message("mbWorkflow: Step 8. Create list.tsv") - makeMollist(mb@compiled_ok) + if(RMassBank.env$export.molfiles){ + message("mbWorkflow: Step 8. Create list.tsv") + makeMollist(compiled = mb@compiled_ok) + } else + warning("RMassBank is configured not to export molfiles (RMassBank.env$export.molfiles). Step 8 is therefore ignored.") } return(mb) } @@ -1289,20 +1338,21 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP { # If the spectrum is not filled, return right now. All "NA" spectra will # not be treated further. - if(msmsdata@ok == FALSE) + if(msmsdata@ok == FALSE & !RMassBank.env$export.invalid) return(NA) # get data scan <- msmsdata@acquisitionNum id <- spec@id # Further fill the ac_ms datasets, and add the ms$focused_ion with spectrum-specific data: precursor_types <- list( - "pH" = "[M+H]+", - "pNa" = "[M+Na]+", - "mH" = "[M-H]-", - "mFA" = "[M+HCOO-]-", - "pM" = "[M]+", - "mM" = "[M]-", - "pNH4" = "[M+NH4]+") + "pH" = "[M+H]+", + "pNa" = "[M+Na]+", + "mH" = "[M-H]-", + "mFA" = "[M+HCOO-]-", + "pM" = "[M]+", + "mM" = "[M]-", + "pNH4" = "[M+NH4]+" + ) ac_ms[['FRAGMENTATION_MODE']] <- msmsdata@info$mode #ac_ms['PRECURSOR_TYPE'] <- precursor_types[spec$mode] ac_ms[['COLLISION_ENERGY']] <- msmsdata@info$ce @@ -1902,28 +1952,35 @@ toMassbank <- function (mbdata) #' } #' #' @export -exportMassbank <- function(compiled, files, molfile) +exportMassbank <- function(compiled, files, molfile){ + exportMassbank_recdata( + accessions = unlist(lapply(X = compiled, FUN = "[", "ACCESSION")), + files, + recDataFolder = file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata") + ) + exportMassbank_moldata( + cpdID = as.numeric(compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]]), + molfile, + molDataFolder = file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata") + ) +} + +exportMassbank_recdata <- function(accessions, files, recDataFolder) { - molnames <- c() - for(file in 1:length(compiled)) + for(fileIdx in 1:length(accessions)) { - # Read the accession no. from the corresponding "compiled" entry - filename <- compiled[[file]]["ACCESSION"] # use this accession no. as filename - filename <- paste(filename, ".txt", sep="") - write(files[[file]], - file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata",filename) - ) + filename <- paste(accessions[[fileIdx]], ".txt", sep="") + filePath <- file.path(recDataFolder,filename) + write(files[[fileIdx]], filePath) } +} +exportMassbank_moldata <- function(cpdID, molfile, molDataFolder) +{ # Use internal ID for naming the molfiles - if(findLevel(compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]],TRUE)=="standard"){ - molname <- sprintf("%04d", as.numeric( - compiled[[1]][["COMMENT"]][[getOption("RMassBank")$annotations$internal_id_fieldname]][[1]])) - molname <- paste(molname, ".mol", sep="") - write(molfile, - file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata",molname) - ) - } + molname <- sprintf("%04d", cpdID) + molname <- paste(molname, ".mol", sep="") + write(molfile,file.path(molDataFolder,molname)) } # Makes a list.tsv with molfile -> massbank ch$name attribution. diff --git a/R/zzz.R b/R/zzz.R index 3a53d8c..b43f08a 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -2,7 +2,10 @@ RMassBank.env <<- new.env() RMassBank.env$ReadAnnotation <- FALSE RMassBank.env$testnumber <- 1 - RMassBank.env$verbose.output <- TRUE + ## new variables + RMassBank.env$verbose.output <- FALSE + RMassBank.env$export.invalid <- FALSE + RMassBank.env$export.molfiles <- TRUE mb <- list() attach(RMassBank.env) From 72707c6c6acf700e28b14c76c681de8ca27c103d Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 27 Jul 2018 12:37:24 +0200 Subject: [PATCH 12/71] Minor changes --- R/SpectrumMethods.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/SpectrumMethods.R b/R/SpectrumMethods.R index dc7b71e..e5144a0 100644 --- a/R/SpectrumMethods.R +++ b/R/SpectrumMethods.R @@ -21,7 +21,8 @@ setMethod("getData", c("RmbSpectrum2"), function(s) { peaks <- s@peaksCount cols <- c("mz", "intensity", "satellite", "low", "rawOK", "good", "mzCalc", "formula", "dbe", "formulaCount", "dppm", "dppmBest") - cols.isFilled <- unlist(lapply(cols, function(col) length(slot(s, col)) == peaks)) + slotLength <- unlist(lapply(cols, function(col) length(slot(s, col)))) + cols.isFilled <- slotLength == peaks cols.filled <- cols[cols.isFilled] data <- lapply(cols.filled, function(col) slot(s, col)) data$stringsAsFactors <- FALSE From 6cf6d07091fa3c6d923827c7cf6bbd30a51a6cf2 Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 27 Jul 2018 15:54:56 +0200 Subject: [PATCH 13/71] Added the option to do a strict selection of MS/MS spectra from the raw data --- R/leMsmsRaw.R | 38 +++++++++++++++++++++++++++++++------- R/zzz.R | 1 + 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index 522239f..69e7870 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -509,7 +509,11 @@ findMsMsHRperxcms.direct <- function(fileName, cpdID, mode="pH", findPeaksArgs = pspIndex <- which(sapply(anmsms@pspectra, function(x) {candidates[[i]][closestCandidate] %in% x})) } else{ # Else choose the candidate with the closest RT - pspIndex <- which.min(abs(getRT(anmsms) - RT)) + if(RMassBank.env$strictMsMsSpectraSelection){ + pspIndex <- NULL + } else { + pspIndex <- which.min(abs(getRT(anmsms) - RT)) + } } # 2nd best: Spectrum closest to MS1 @@ -664,12 +668,17 @@ findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { #pl <- xrs[[i]]$pspectrum #pl <- data.frame("mz" = pl[, "mz"], "rt" = xrs[[i]]$RETENTIONTIME, stringsAsFactors = F) + maximumParentMass <- parentMass + mzabs + minimumParentMass <- parentMass - mzabs + maximumRT <- RT * 1.1 + minimumRT <- RT * 0.9 + mzMatch <- - precursorTable[,"mz", drop=FALSE] < parentMass + mzabs & - precursorTable[,"mz", drop=FALSE] > parentMass - mzabs + precursorTable[,"mz", drop=FALSE] < maximumParentMass & + precursorTable[,"mz", drop=FALSE] > minimumParentMass rtMatch <- - precursorTable[,"rt", drop=FALSE] < RT * 1.1 & - precursorTable[,"rt", drop=FALSE] > RT * 0.9 + precursorTable[,"rt", drop=FALSE] < maximumRT & + precursorTable[,"rt", drop=FALSE] > minimumRT if(is.na(RT)) rtMatch <- TRUE @@ -685,13 +694,24 @@ findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { if(length(candidates) > 0){ if(is.na(RT)){ pspIndex <- candidates[[1]] + + if(RMassBank.env$verbose.output) + cat(paste("\n### Info ### Compound ", cpdIDs[[idIdx]], ": RT is not given. ", length(candidates), " candidates in range. Taking the first hit: mz[", minimumParentMass, ", ", maximumParentMass , "] vs mz ", precursorTable[pspIndex,"mz"], ".\n", sep = "")) } else { - closestCandidate <- which.min(abs(RT - precursorTable[candidates, "rt", drop=FALSE])) + closestCandidate <- which.min(abs(RT - precursorTable[candidates, "rt"])) pspIndex <- candidates[[closestCandidate]] + if(RMassBank.env$verbose.output) + cat(paste("\n### Info ### Compound ", cpdIDs[[idIdx]], ": ", length(candidates), " candidates in range. Taking the closest hit regarding RT (", RT, "): mz[", minimumParentMass, ", ", maximumParentMass , "] x rt[", minimumRT, ", ", maximumRT, "] vs (mz ", precursorTable[pspIndex,"mz"], ", rt ", precursorTable[pspIndex,"rt"], ")\n", sep = "")) } } else{ # Else choose the candidate with the closest RT - pspIndex <- which.min(abs(precursorTable[,"rt"] - RT)) + if(RMassBank.env$strictMsMsSpectraSelection){ + pspIndex <- NULL + cat(paste("\n### Warning ### Compound ", cpdIDs[[idIdx]], ": No candidates in range.\n", sep = "")) + } else { + pspIndex <- which.min(abs(RT - precursorTable[, "rt"])) + cat(paste("\n### Warning ### Compound ", cpdIDs[[idIdx]], ": No candidates in range. Taking the closest hit regarding RT (", RT, "): mz[", minimumParentMass, ", ", maximumParentMass , "] x rt[", minimumRT, ", ", maximumRT, "] vs (mz ", precursorTable[pspIndex,"mz"], ", rt ", precursorTable[pspIndex,"rt"], ")\n", sep = "")) + } } # 2nd best: Spectrum closest to MS1 @@ -764,6 +784,10 @@ read.msp <- function(file){ stop("No spectrum found") cmpnd <- lapply(fields.idx[-pk.idx], function(x) get.text.value(strs[x], paste(fields[x], ":", sep = ""))) names(cmpnd) <- fields[-pk.idx] + + ## minutes to seconds + cmpnd$RETENTIONTIME <- as.numeric(cmpnd$RETENTIONTIME) * 60 + nlines <- length(strs) npeaks <- as.numeric(get.text.value(strs[pk.idx], "Num Peaks:")) peaks.idx <- (pk.idx + 1):nlines diff --git a/R/zzz.R b/R/zzz.R index b43f08a..7fe2be5 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -6,6 +6,7 @@ RMassBank.env$verbose.output <- FALSE RMassBank.env$export.invalid <- FALSE RMassBank.env$export.molfiles <- TRUE + RMassBank.env$strictMsMsSpectraSelection <- FALSE mb <- list() attach(RMassBank.env) From 1d25b28a170fa30b3422a1dc8d2b6945e0d7c8c4 Mon Sep 17 00:00:00 2001 From: Treutler Date: Mon, 30 Jul 2018 14:06:52 +0200 Subject: [PATCH 14/71] Added SP to MB workflow --- R/createMassBank.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index 50d23f6..d991e8d 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -117,12 +117,12 @@ resetInfolists <- function(mb) CH.IUPAC = character(0), CH.LINK.CAS = character(0), CH.LINK.CHEBI = integer(0), CH.LINK.HMDB = character(0), CH.LINK.KEGG = character(0), CH.LINK.LIPIDMAPS = character(0), CH.LINK.PUBCHEM = character(0), CH.LINK.INCHIKEY = character(0), - CH.LINK.CHEMSPIDER = integer(0)), .Names = c("X", "id", "dbcas", + CH.LINK.CHEMSPIDER = integer(0), SP.SAMPLE = character(0)), .Names = c("X", "id", "dbcas", "dbname", "dataused", "COMMENT.CONFIDENCE", "COMMENT.ID", "CH.NAME1", "CH.NAME2", "CH.NAME3", "CH.COMPOUND_CLASS", "CH.FORMULA", "CH.EXACT_MASS", "CH.SMILES", "CH.IUPAC", "CH.LINK.CAS", "CH.LINK.CHEBI", "CH.LINK.HMDB", "CH.LINK.KEGG", "CH.LINK.LIPIDMAPS", "CH.LINK.PUBCHEM", - "CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER"), row.names = integer(0), class = "data.frame") + "CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER", "SP.SAMPLE"), row.names = integer(0), class = "data.frame") return(mb) } @@ -1218,6 +1218,8 @@ readMbdata <- function(row) # again, these constants are read from the options: mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type + if(all(nchar(row[["SP.SAMPLE"]]) > 0, row[["SP.SAMPLE"]] != "NA", !is.na(row[["SP.SAMPLE"]]), na.rm = TRUE)) + mbdata[['SP$SAMPLE']] <- row[["SP.SAMPLE"]] return(mbdata) From a930e59704093ae89da31dac8ba3a68294262edc Mon Sep 17 00:00:00 2001 From: Treutler Date: Mon, 30 Jul 2018 14:07:50 +0200 Subject: [PATCH 15/71] Fixed RT handling and the carry-over of Precursor intensities --- R/leMsmsRaw.R | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index 69e7870..a3d4392 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -635,6 +635,7 @@ findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { mz = as.numeric(unlist(lapply(X = xrmsms, FUN = function(x){ x$PRECURSORMZ }))), rt = as.numeric(unlist(lapply(X = xrmsms, FUN = function(x){ x$RETENTIONTIME }))) ) + precursorTable[, "rt"] <- precursorTable[, "rt"] * 60 ## ## Retrieval over all supplied cpdIDs @@ -751,7 +752,8 @@ findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { "rt" = as.numeric(spectrum$RETENTIONTIME), "rtmin" = as.numeric(spectrum$RETENTIONTIME), "rtmax" = as.numeric(spectrum$RETENTIONTIME), - "into" = as.numeric(spectrum$pspectrum[, "intensity"]) + "into" = as.numeric(spectrum$pspectrum[, "intensity"]), + "into_parent" = as.numeric(spectrum$INTENSITY) )) } } @@ -786,7 +788,7 @@ read.msp <- function(file){ names(cmpnd) <- fields[-pk.idx] ## minutes to seconds - cmpnd$RETENTIONTIME <- as.numeric(cmpnd$RETENTIONTIME) * 60 + #cmpnd$RETENTIONTIME <- as.numeric(cmpnd$RETENTIONTIME) * 60 nlines <- length(strs) npeaks <- as.numeric(get.text.value(strs[pk.idx], "Num Peaks:")) @@ -966,10 +968,10 @@ toRMB <- function(msmsXCMSspecs = NA, cpdID = NA, mode="pH", MS1spec = NA){ mockenv$mockAcqnum <- mockenv$mockAcqnum + 1 ## Find peak table - pks <- matrix(nrow = length(spec[,1]), ncol = 2) + pks <- matrix(nrow = nrow(spec), ncol = 2) colnames(pks) <- c("mz","int") - pks[,1] <- spec[,1] - pks[,2] <- spec[,7] + pks[,1] <- spec[,"mz"] + pks[,2] <- spec[,"into"] ## Deprofiling not necessary for XCMS @@ -979,23 +981,23 @@ toRMB <- function(msmsXCMSspecs = NA, cpdID = NA, mode="pH", MS1spec = NA){ intensity = pks[,"int"], precScanNum = as.integer(1), precursorMz = findMz(cpdID)$mzCenter, - precursorIntensity = 0, + precursorIntensity = ifelse(test = "into_parent" %in% colnames(spec), yes = spec[,"into_parent"], no = 0), precursorCharge = as.integer(1), collisionEnergy = 0, tic = 0, peaksCount = nrow(spec), - rt = median(spec[,4]), + rt = median(spec[,"rt"]), acquisitionNum = as.integer(mockenv$mockAcqnum), centroided = TRUE )) }) msmsSpecs <- as(do.call(c, msmsSpecs), "SimpleList") - + ##Build the new objects masterSpec <- new("Spectrum1", mz = findMz(cpdID,mode=mode)$mzCenter, - intensity = 100, + intensity = ifelse(test = msmsSpecs[[1]]@precursorIntensity != 0, yes = msmsSpecs[[1]]@precursorIntensity, no = 100), polarity = as.integer(0), peaksCount = as.integer(1), rt = msmsSpecs[[1]]@rt, From 2d97fa5dd2fa5e4ba07689154efd301911f94065 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 14 Aug 2018 13:24:01 +0200 Subject: [PATCH 16/71] Minor change --- R/leMsMs.r | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/R/leMsMs.r b/R/leMsMs.r index 8faa38f..7b513e3 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -457,6 +457,8 @@ analyzeMsMs <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", ## ); .checkMbSettings() + if(msmsPeaks@found == FALSE) + return(msmsPeaks) # Check whether the spectra can be fitted to the spectra list correctly! if(length(msmsPeaks@children) != length(spectraList)) @@ -469,9 +471,6 @@ analyzeMsMs <- function(msmsPeaks, mode="pH", detail=FALSE, run="preliminary", } - if(msmsPeaks@found == FALSE) - return(msmsPeaks) - if(method=="formula") { r <- analyzeMsMs.formula(msmsPeaks, mode, detail, run, filterSettings) From f31d57018e2a9baec1d7b65a21fce6705530c440 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 14 Aug 2018 13:27:45 +0200 Subject: [PATCH 17/71] Added the optional export of invalid spectra to records; added the export of the preceursor-intensity into records; added the export of generic AC and AC information into records --- R/createMassBank.R | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index d991e8d..be9286e 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -80,7 +80,7 @@ loadInfolist <- function(mb, fileName) # Substitute empty spaces by real NA values r[which(r == "")] <- NA # Trim spaces (in all non-NA fields) - r[which(!is.na(r))] <- sub("^ *([^ ]+) *$", "\\1", r[which(!is.na(r))]) + r[which(!is.na(r))] <- sub(pattern = "^ *([^ ]+) *$", replacement = "\\1", x = r[which(!is.na(r))]) return(r) })), stringsAsFactors = FALSE) # use only the columns present in mbdata_archive, no other columns added in excel @@ -239,7 +239,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c message(paste("Compiling: ", r@name, sep="")) mbdata <- mb@mbdata_relisted[[which(mb@mbdata_archive$id == as.numeric(r@id))]] if(nrow(mb@additionalPeaks) > 0) - res <-compileRecord(r, mbdata, mb@aggregated, mb@additionalPeaks) + res <-compileRecord(spec = r, mbdata = mbdata, aggregated = mb@aggregated, additionalPeaks = mb@additionalPeaks) else res <-compileRecord(spec = r, mbdata = mbdata, aggregated = mb@aggregated, additionalPeaks = NULL, retrieval=findLevel(r@id,TRUE)) return(res) @@ -282,11 +282,11 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c filePath_recData_invalid <- file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata_invalid") filePath_molData <- file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata") - dir.create(filePath_recData_valid,recursive=TRUE) + if(!file.exists(filePath_recData_valid)) if(!dir.create(filePath_recData_valid,recursive=TRUE)) stop(paste("Could not create folder", filePath_recData_valid)) if(RMassBank.env$export.molfiles) - dir.create(filePath_molData,recursive=TRUE) - if(RMassBank.env$export.invalid) - dir.create(filePath_recData_invalid,recursive=TRUE) + if(!file.exists(filePath_molData)) if(!dir.create(filePath_molData,recursive=TRUE)) stop(paste("Could not create folder", filePath_molData)) + if(RMassBank.env$export.invalid & length(mb@mbfiles_notOk) > 0) + if(!file.exists(filePath_recData_invalid)) if(!dir.create(filePath_recData_invalid,recursive=TRUE)) stop(paste("Could not create folder", filePath_recData_invalid)) if(length(mb@molfile) == 0) mb@molfile <- as.list(rep(x = NA, times = length(mb@compiled_ok))) @@ -1299,7 +1299,16 @@ gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval=" ac_ms[['MS_TYPE']] <- getOption("RMassBank")$annotations$ms_type ac_ms[['IONIZATION']] <- getOption("RMassBank")$annotations$ionization ac_ms[['ION_MODE']] <- mode - + + ## add generic AC$MASS_SPECTROMETRY information + properties <- names(getOption("RMassBank")$annotations) + theseProperties <- grepl(x = properties, pattern = "^AC\\$MASS_SPECTROMETRY_") + properties2 <- gsub(x = properties, pattern = "^AC\\$MASS_SPECTROMETRY_", replacement = "") + presentProperties <- names(ac_ms)#c('MS_TYPE', 'IONIZATION', 'ION_MODE')#, 'FRAGMENTATION_MODE', 'COLLISION_ENERGY', 'RESOLUTION') + theseProperties <- theseProperties & !(properties2 %in% presentProperties) + theseProperties <- theseProperties & (unlist(getOption("RMassBank")$annotations) != "NA") + ac_ms[properties2[theseProperties]] <- unlist(getOption("RMassBank")$annotations[theseProperties]) + # This list could be made customizable. ac_lc <- list(); rt <- spec@parent@rt / 60 @@ -1309,7 +1318,16 @@ gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval=" ac_lc[['RETENTION_TIME']] <- sprintf("%.3f min", rt) ac_lc[['SOLVENT A']] <- getOption("RMassBank")$annotations$lc_solvent_a ac_lc[['SOLVENT B']] <- getOption("RMassBank")$annotations$lc_solvent_b - + + ## add generic AC$CHROMATOGRAPHY information + #properties <- names(getOption("RMassBank")$annotations) + theseProperties <- grepl(x = properties, pattern = "^AC\\$CHROMATOGRAPHY_") + properties2 <- gsub(x = properties, pattern = "^AC\\$CHROMATOGRAPHY_", replacement = "") + presentProperties <- names(ac_lc)#c('COLUMN_NAME', 'FLOW_GRADIENT', 'FLOW_RATE', 'RETENTION_TIME', 'SOLVENT A', 'SOLVENT B') + theseProperties <- theseProperties & !(properties2 %in% presentProperties) + theseProperties <- theseProperties & (unlist(getOption("RMassBank")$annotations) != "NA") + ac_lc[properties2[theseProperties]] <- unlist(getOption("RMassBank")$annotations[theseProperties]) + # Go through all child spectra, and fill our skeleton with scan data! # Pass them the AC_LC and AC_MS data, which are added at the right place # directly in there. @@ -1367,6 +1385,8 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP ms_fi[['BASE_PEAK']] <- round(mz(spec@parent)[which.max(intensity(spec@parent))],4) ms_fi[['PRECURSOR_M/Z']] <- round(precursorMz$mzCenter,4) ms_fi[['PRECURSOR_TYPE']] <- precursor_types[spec@mode] + if(all(!is.na(spec@parent@intensity), spec@parent@intensity != 0, spec@parent@intensity != 100, na.rm = TRUE)) + ms_fi[['PRECURSOR_INTENSITY']] <- spec@parent@intensity # Select all peaks which belong to this spectrum (correct cpdID and scan no.) # from peaksOK From 7fdfc02851d488379e009dfbbb6ee1258059757c Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 14 Aug 2018 14:41:24 +0200 Subject: [PATCH 18/71] Minor change --- R/leMsMs.r | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/leMsMs.r b/R/leMsMs.r index 7b513e3..ed47a7e 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -300,7 +300,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec if(RMassBank.env$verbose.output){ multiplicityNotOkCount <- sum(!w@aggregated$filterOK) if(multiplicityNotOkCount > 0) - cat(paste("### Warning ### ", multiplicityNotOkCount, " / ", nrow(w@aggregated[, c("mzFound", "intensity", "formulaCount", "dppmBest")]), " peaks do not fulfill the multiplicity criterion\n", sep = "")) + cat(paste("### Warning ### ", multiplicityNotOkCount, " / ", nrow(w@aggregated), " peaks do not fulfill the multiplicity criterion\n", sep = "")) } w@aggregated <- processProblematicPeaks(w, mode, archivename) From 635d86d703c7c4dffdb316a7cc854ff660a62f83 Mon Sep 17 00:00:00 2001 From: Treutler Date: Wed, 15 Aug 2018 08:34:14 +0200 Subject: [PATCH 19/71] Fixed bug when multiplicityFilter is not set --- R/leMsMs.r | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/leMsMs.r b/R/leMsMs.r index ed47a7e..563f962 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -293,6 +293,10 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec message("msmsWorkflow: Step 8. Peak multiplicity filtering") if (is.null(settings$multiplicityFilter)) { message("msmsWorkflow: Step 8. Peak multiplicity filtering skipped because multiplicityFilter parameter is not set.") + w@aggregated <- addProperty(w@aggregated, "formulaMultiplicity", "integer", 1) + w@aggregated <- addProperty(w@aggregated, "filterOK", "logical", FALSE) + w@aggregated$filterOK <- !((is.na(w@aggregated$formulaCount) | w@aggregated$formulaCount==0) & (is.na(w@aggregated$reanalyzed.formulaCount) | w@aggregated$reanalyzed.formulaCount==0)) + w@aggregated <- addProperty(w@aggregated, "problematicPeak", "logical", FALSE) } else { # apply heuristic filter w@aggregated <- filterMultiplicity(w = w, archivename = archivename, mode = mode, multiplicityFilter = settings$multiplicityFilter) From c562f8c60bbb8ab600e7bacfdf65a2311dec5c59 Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 17 Aug 2018 09:01:50 +0200 Subject: [PATCH 20/71] Fixed parsing of msp data in case of empty fields --- R/leMsmsRaw.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index a3d4392..1d73878 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -764,6 +764,7 @@ findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { ## adapted from the Bioconductor package 'metaMS' (method 'read.msp') read.msp <- function(file){ get.text.value <- function(x, field, do.err = TRUE) { + if(trimws(x) == field) return("") woppa <- strsplit(x, field) woppa.lengths <- sapply(woppa, length) if (all(woppa.lengths == 2)) { From 16eb71d407caae7cb3d479db8d1bc02007753d01 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 28 Aug 2018 12:03:51 +0200 Subject: [PATCH 21/71] Bug fixes --- R/leMsMs.r | 9 +++++---- R/leMsmsRaw.R | 5 +++-- R/msmsRead.R | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/R/leMsMs.r b/R/leMsMs.r index 563f962..55935ad 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -1143,9 +1143,9 @@ aggregateSpectra <- function(spec, addIncomplete=FALSE) table.cpd <- do.call(rbind, tables.c) ## complete missing columns if necessary - ## mz intensity good scan cpdID parentScan - ## mz intensity good mzCalc formula dbe formulaCount dppm dppmBest scan cpdID parentScan - columnNames <- c("mzCalc", "formula", "dbe", "formulaCount", "dppm", "dppmBest") + ## mz intensity good scan cpdID parentScan + ## mz intensity good mzCalc formula dbe formulaCount dppm dppmBest scan cpdID parentScan + columnNames <- c( "mzCalc", "formula", "dbe", "formulaCount", "dppm", "dppmBest") if(all(!(columnNames %in% colnames(table.cpd)))) for(columnName in columnNames) table.cpd[, columnName] <- as.numeric(rep(x = NA, times = nrow(table.cpd))) @@ -1156,6 +1156,7 @@ aggregateSpectra <- function(spec, addIncomplete=FALSE) }) #return(compoundTables) aggTable <- do.call(rbind, compoundTables) + if(is.null(aggTable)) aggTable <- data.frame("mz"=numeric(), "intensity"=numeric(), "good"=logical(), "mzCalc"=numeric(), "formula"=character(), "dbe"=numeric(), "formulaCount"=integer(), "dppm"=numeric(), "dppmBest"=numeric(), "scan"=integer(), "cpdID"=integer(), "parentScan"=integer(), stringsAsFactors=FALSE) colnames(aggTable)[1] <- "mzFound" aggTable <- addProperty(aggTable, "dppmRc", "numeric") @@ -1305,7 +1306,7 @@ processProblematicPeaks <- function(w, mode, archivename = NA) # Add the info to specs specs <- addProperty(specs, "problematicPeak", "logical", FALSE) - specs[match(fp$index, specs$index),"problematicPeak"] <- TRUE + if(nrow(specs) > 0) specs[match(fp$index, specs$index),"problematicPeak"] <- TRUE # Select the columns for output into the failpeaks file fp <- fp[,c("OK", "name", "cpdID", "scan", "mzFound", "formula", diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index 1d73878..77192d2 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -586,7 +586,7 @@ findMsMsHRperMsp <- function(fileName, cpdIDs, mode="pH"){ return(P) } else { # There is a file for every cpdID - spectra <- toRMB(msmsXCMSspecs = unlist(findMsMsHRperMsp.direct(fileName, cpdIDs, mode=mode),FALSE), cpdID = cpdIDs) + spectra <- toRMB(msmsXCMSspecs = unlist(findMsMsHRperMsp.direct(fileName = fileName, cpdIDs = cpdIDs, mode=mode),FALSE), cpdID = cpdIDs) } sp <- spectra @@ -787,6 +787,7 @@ read.msp <- function(file){ stop("No spectrum found") cmpnd <- lapply(fields.idx[-pk.idx], function(x) get.text.value(strs[x], paste(fields[x], ":", sep = ""))) names(cmpnd) <- fields[-pk.idx] + if(!("INTENSITY" %in% names(cmpnd))) cmpnd$"INTENSITY" <- 100 ## minutes to seconds #cmpnd$RETENTIONTIME <- as.numeric(cmpnd$RETENTIONTIME) * 60 @@ -797,7 +798,7 @@ read.msp <- function(file){ pks <- gsub("^ +", "", unlist(strsplit(strs[peaks.idx], ";"))) pks <- pks[pks != ""] if (length(pks) != npeaks) - stop("Not the right number of peaks in compound", cmpnd$Name) + stop(paste("Not the right number of peaks in compound '", cmpnd$Name, "' (", npeaks, " vs ", length(pks), ") in file '", file, "'", sep = "")) pklst <- strsplit(x = pks, split = "\t| ") pklst <- lapply(pklst, function(x) x[x != ""]) cmz <- as.numeric(sapply(pklst, "[[", 1)) diff --git a/R/msmsRead.R b/R/msmsRead.R index 113bf2e..09f4232 100644 --- a/R/msmsRead.R +++ b/R/msmsRead.R @@ -238,7 +238,7 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, currentFile <- w@files[which(cpdids == ID)] # Retrieve spectrum data - spec <- findMsMsHRperMsp(currentFile, ID, mode=mode) + spec <- findMsMsHRperMsp(fileName = currentFile, cpdIDs = ID, mode=mode) gc() # Progress: From 5b374ad612e09dd7048161eb51f9e77933d447cb Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 28 Aug 2018 12:07:41 +0200 Subject: [PATCH 22/71] Refined verbose output --- R/leMsMs.r | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/leMsMs.r b/R/leMsMs.r index 55935ad..88b0e9e 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -302,9 +302,11 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec w@aggregated <- filterMultiplicity(w = w, archivename = archivename, mode = mode, multiplicityFilter = settings$multiplicityFilter) if(RMassBank.env$verbose.output){ - multiplicityNotOkCount <- sum(!w@aggregated$filterOK) + peakDfs <- split(x = msmsList@aggregated, f = list("mzFound"=unique(msmsList@aggregated$mzFound), "cpdID"=unique(msmsList@aggregated$cpdID))) + numberOfPeaks <- length(peakDfs) + multiplicityNotOkCount <- numberOfPeaks - sum(unlist(lapply(X = peakDfs, FUN = function(x){any(x$filterOK)}))) if(multiplicityNotOkCount > 0) - cat(paste("### Warning ### ", multiplicityNotOkCount, " / ", nrow(w@aggregated), " peaks do not fulfill the multiplicity criterion\n", sep = "")) + cat(paste("### Warning ### ", multiplicityNotOkCount, " / ", numberOfPeaks, " peaks do not fulfill the multiplicity criterion\n", sep = "")) } w@aggregated <- processProblematicPeaks(w, mode, archivename) From 8c9270d656f8cc6ed63659e98dd8b18340ff7212 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 28 Aug 2018 12:47:51 +0200 Subject: [PATCH 23/71] Fixed that compounds can be found in a msp file if the retention time is not given in the msp file --- R/leMsmsRaw.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index 77192d2..d536323 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -681,6 +681,7 @@ findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { precursorTable[,"rt", drop=FALSE] < maximumRT & precursorTable[,"rt", drop=FALSE] > minimumRT + mzMatch[is.na(mzMatch)] <- TRUE ## RT not given if(is.na(RT)) rtMatch <- TRUE From a1913cb773b0998bfd86e1927b31de8006057d58 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 28 Aug 2018 12:48:08 +0200 Subject: [PATCH 24/71] Refined verbose output --- R/leMsMs.r | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/leMsMs.r b/R/leMsMs.r index 88b0e9e..955c147 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -302,7 +302,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec w@aggregated <- filterMultiplicity(w = w, archivename = archivename, mode = mode, multiplicityFilter = settings$multiplicityFilter) if(RMassBank.env$verbose.output){ - peakDfs <- split(x = msmsList@aggregated, f = list("mzFound"=unique(msmsList@aggregated$mzFound), "cpdID"=unique(msmsList@aggregated$cpdID))) + peakDfs <- split(x = w@aggregated, f = list("mzFound"=unique(w@aggregated$mzFound), "cpdID"=unique(w@aggregated$cpdID))) numberOfPeaks <- length(peakDfs) multiplicityNotOkCount <- numberOfPeaks - sum(unlist(lapply(X = peakDfs, FUN = function(x){any(x$filterOK)}))) if(multiplicityNotOkCount > 0) From 15d799ea1317bffacb183fe7b6c49462d4e726a5 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 28 Aug 2018 12:53:03 +0200 Subject: [PATCH 25/71] Refined verbose output --- R/leMsMs.r | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/leMsMs.r b/R/leMsMs.r index 955c147..e154d4d 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -302,7 +302,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec w@aggregated <- filterMultiplicity(w = w, archivename = archivename, mode = mode, multiplicityFilter = settings$multiplicityFilter) if(RMassBank.env$verbose.output){ - peakDfs <- split(x = w@aggregated, f = list("mzFound"=unique(w@aggregated$mzFound), "cpdID"=unique(w@aggregated$cpdID))) + peakDfs <- split(x = w@aggregated, f = list("mzFound"=w@aggregated$mzFound, "cpdID"=w@aggregated$cpdID)) numberOfPeaks <- length(peakDfs) multiplicityNotOkCount <- numberOfPeaks - sum(unlist(lapply(X = peakDfs, FUN = function(x){any(x$filterOK)}))) if(multiplicityNotOkCount > 0) From ee6122a2e6d0bf5bd3cf69ca613430bd1a4b486c Mon Sep 17 00:00:00 2001 From: Treutler Date: Wed, 29 Aug 2018 14:51:30 +0200 Subject: [PATCH 26/71] Excluded empty spectra from the spectrum selection --- R/leMsmsRaw.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index d536323..0f378e6 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -613,6 +613,7 @@ findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { # Read file suppressWarnings(xrmsms <- read.msp(file = fileName)) + xrmsms <- xrmsms[unlist(lapply(X = xrmsms, FUN = function(spectrum){nrow(spectrum$pspectrum)})) > 0] ## If file is not MSe, split by collision energy #if(MSe == FALSE){ From 66cf177f589eadf6c316f4d0760b1b6027cbe69e Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 31 Aug 2018 10:45:44 +0200 Subject: [PATCH 27/71] Fixed bug for modes other than pH by passing the mode parameter to findMz --- R/leMsMs.r | 2 +- R/leMsmsRaw.R | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/leMsMs.r b/R/leMsMs.r index e154d4d..25c4862 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -2162,7 +2162,7 @@ filterMultiplicity <- function(w, archivename=NA, mode="pH", recalcBest = TRUE, # Kick the M+H+ satellites out of peaksReanOK: peaksReanOK$mzCenter <- as.numeric( - unlist(lapply(peaksReanOK$cpdID, function(id) findMz(id, retrieval=findLevel(id,TRUE))$mzCenter)) ) + unlist(lapply(peaksReanOK$cpdID, function(id) findMz(id, mode=mode, retrieval=findLevel(id,TRUE))$mzCenter)) ) peaksReanBad <- peaksReanOK[ !((peaksReanOK$mzFound < peaksReanOK$mzCenter - 1) | (peaksReanOK$mzFound > peaksReanOK$mzCenter + 1)),] diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index 0f378e6..9df5ace 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -586,7 +586,7 @@ findMsMsHRperMsp <- function(fileName, cpdIDs, mode="pH"){ return(P) } else { # There is a file for every cpdID - spectra <- toRMB(msmsXCMSspecs = unlist(findMsMsHRperMsp.direct(fileName = fileName, cpdIDs = cpdIDs, mode=mode),FALSE), cpdID = cpdIDs) + spectra <- toRMB(msmsXCMSspecs = unlist(findMsMsHRperMsp.direct(fileName = fileName, cpdIDs = cpdIDs, mode=mode),FALSE), cpdID = cpdIDs, mode = mode) } sp <- spectra @@ -984,7 +984,7 @@ toRMB <- function(msmsXCMSspecs = NA, cpdID = NA, mode="pH", MS1spec = NA){ mz = pks[,"mz"], intensity = pks[,"int"], precScanNum = as.integer(1), - precursorMz = findMz(cpdID)$mzCenter, + precursorMz = findMz(cpdID, mode=mode)$mzCenter, precursorIntensity = ifelse(test = "into_parent" %in% colnames(spec), yes = spec[,"into_parent"], no = 0), precursorCharge = as.integer(1), collisionEnergy = 0, @@ -1090,7 +1090,7 @@ addPeaksManually <- function(w, cpdID = NA, handSpec, mode = "pH"){ mz = handSpec[,"mz"], intensity = handSpec[,"int"], precScanNum = as.integer(1), - precursorMz = findMz(cpdID)$mzCenter, + precursorMz = findMz(cpdID,mode=mode)$mzCenter, precursorIntensity = 0, precursorCharge = as.integer(1), collisionEnergy = 0, From e92f82be087e4b9b420948a272dd39806373b055 Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 31 Aug 2018 10:52:08 +0200 Subject: [PATCH 28/71] Fixed bug: Corrected charge for adduct pNH4 --- R/leCsvAccess.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index 965c94b..2ca2efb 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -348,7 +348,7 @@ findMz.formula <- function(formula, mode="pH", ppm=10, deltaMz=0) if (mode == "pM") mzopt <- list(addition = "", charge = 1) if (mode == "pNH4") - mzopt <- list(addition = "NH4", charge = -1) + mzopt <- list(addition = "NH4", charge = 1) if (mode == "mH") mzopt <- list(addition = "H-1", charge = -1) if (mode == "mFA") From 43f49f0fdce2515d3a05079d02e8b71db18d72ca Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 31 Aug 2018 12:49:36 +0200 Subject: [PATCH 29/71] Added 11 additional adducts: [M+K]+, [M+ACN+H]+, [M+ACN+Na]+, [M+2Na-H]+, [2M+H]+, [2M+K]+, [2M+Na]+, [2M+NH4]+, [2M+ACN+H]+, [M+ACN+2H]2+, [M+2H]2+ --- R/leCsvAccess.R | 30 +++++++++++++++++++++++++++--- R/msmsRead.R | 2 +- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index 2ca2efb..b051f45 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -337,18 +337,42 @@ getMolecule <- function(smiles) #' @export findMz.formula <- function(formula, mode="pH", ppm=10, deltaMz=0) { - if (!any(mode %in% c("pH", "pNa", "pM", "pNH4", "mH", "mFA", - "mM", ""))) + if (!any(mode %in% c("pH", "pNa", "pK", "pM", "pNH4", "pACN_pH", "pACN_pNa", "p2Na_mH", "pM_pH", "pM_pK", "pM_pNa", "pM_pNH4", "pM_pACN_pH", "pACN_p2H", "p2H", "mH", "mFA", "mM", ""))) stop(paste("The ionization mode", mode, "is unknown.")) mzopt <- list(addition = "", charge = 0) + ## M+X if (mode == "pH") mzopt <- list(addition = "H", charge = 1) if (mode == "pNa") - mzopt <- list(addition = "Na", charge = 1) + mzopt <- list(addition = "Na", charge = 1) + if (mode == "pK") + mzopt <- list(addition = "K", charge = 1) if (mode == "pM") mzopt <- list(addition = "", charge = 1) if (mode == "pNH4") mzopt <- list(addition = "NH4", charge = 1) + if (mode == "p2Na_mH") + mzopt <- list(addition = "Na2H-1", charge = 1) + if (mode == "pACN_pH") + mzopt <- list(addition = "C2H4N1", charge = 1) + if (mode == "pACN_pNa") + mzopt <- list(addition = "C2H3N1Na1", charge = 1) + if (mode == "p2H") + mzopt <- list(addition = "H2", charge = 2) + if (mode == "pACN_p2H") + mzopt <- list(addition = "C2H5N1", charge = 2) + ## 2M+X + if (mode == "pM_pH") + mzopt <- list(addition = add.formula(formula, "H1"), charge = 1) + if (mode == "pM_pK") + mzopt <- list(addition = add.formula(formula, "K1"), charge = 1) + if (mode == "pM_pNa") + mzopt <- list(addition = add.formula(formula, "Na1"), charge = 1) + if (mode == "pM_pNH4") + mzopt <- list(addition = add.formula(formula, "N1H4"), charge = 1) + if (mode == "pM_pACN_pH") + mzopt <- list(addition = add.formula(formula, "C2H4N1"), charge = 1) + ## M-X if (mode == "mH") mzopt <- list(addition = "H-1", charge = -1) if (mode == "mFA") diff --git a/R/msmsRead.R b/R/msmsRead.R index 09f4232..59ec892 100644 --- a/R/msmsRead.R +++ b/R/msmsRead.R @@ -47,7 +47,7 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, .checkMbSettings() ##Read the files and cpdids according to the definition ##All cases are silently accepted, as long as they can be handled according to one definition - if(!any(mode %in% c("pH","pNa","pM","pNH4","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) + if(!any(mode %in% c("pH", "pNa", "pK", "pM", "pNH4", "pACN_pH", "pACN_pNa", "p2Na_mH", "pM_pH", "pM_pK", "pM_pNa", "pM_pNH4", "pM_pACN_pH", "pACN_p2H", "p2H", "mH", "mFA", "mM", ""))) stop(paste("The ionization mode", mode, "is unknown.")) if(is.null(filetable)){ ##If no filetable is supplied, filenames must be named explicitly From e3d7cd59a2479f9bf3a69a36570f5272567be0d0 Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 31 Aug 2018 13:01:17 +0200 Subject: [PATCH 30/71] Updated the adduct recognition check in a unified way --- R/leCsvAccess.R | 6 +++++- R/leMsMs.r | 2 +- R/msmsRead.R | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index b051f45..c95e417 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -323,6 +323,10 @@ getMolecule <- function(smiles) return(mol) } +knownAdducts <- function(){ + return(c("pH", "pNa", "pK", "pM", "pNH4", "pACN_pH", "pACN_pNa", "p2Na_mH", "pM_pH", "pM_pK", "pM_pNa", "pM_pNH4", "pM_pACN_pH", "pACN_p2H", "p2H", "mH", "mFA", "mM", "")) +} + #' Find the exact mass +/- a given margin for a given formula or its ions and adducts. #' #' @param formula The molecular formula in text or list format (see \code{\link{formulastring.to.list}} @@ -337,7 +341,7 @@ getMolecule <- function(smiles) #' @export findMz.formula <- function(formula, mode="pH", ppm=10, deltaMz=0) { - if (!any(mode %in% c("pH", "pNa", "pK", "pM", "pNH4", "pACN_pH", "pACN_pNa", "p2Na_mH", "pM_pH", "pM_pK", "pM_pNa", "pM_pNH4", "pM_pACN_pH", "pACN_p2H", "p2H", "mH", "mFA", "mM", ""))) + if (!any(mode %in% knownAdducts())) stop(paste("The ionization mode", mode, "is unknown.")) mzopt <- list(addition = "", charge = 0) ## M+X diff --git a/R/leMsMs.r b/R/leMsMs.r index 25c4862..5818ec7 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -79,7 +79,7 @@ msmsWorkflow <- function(w, mode="pH", steps=c(1:8), confirmMode = FALSE, newRec progressbar = "progressBarHook", MSe = FALSE) { .checkMbSettings() - if(!any(mode %in% c("pH","pNa","pNH4","pM","mH","mFA","mM",""))) stop(paste("The ionization mode", mode, "is unknown.")) + if(!any(mode %in% knownAdducts())) stop(paste("The ionization mode", mode, "is unknown.")) if(!is.na(archivename)) w@archivename <- archivename diff --git a/R/msmsRead.R b/R/msmsRead.R index 59ec892..68be5d0 100644 --- a/R/msmsRead.R +++ b/R/msmsRead.R @@ -47,7 +47,7 @@ msmsRead <- function(w, filetable = NULL, files = NULL, cpdids = NULL, .checkMbSettings() ##Read the files and cpdids according to the definition ##All cases are silently accepted, as long as they can be handled according to one definition - if(!any(mode %in% c("pH", "pNa", "pK", "pM", "pNH4", "pACN_pH", "pACN_pNa", "p2Na_mH", "pM_pH", "pM_pK", "pM_pNa", "pM_pNH4", "pM_pACN_pH", "pACN_p2H", "p2H", "mH", "mFA", "mM", ""))) stop(paste("The ionization mode", mode, "is unknown.")) + if(!any(mode %in% knownAdducts())) stop(paste("The ionization mode", mode, "is unknown.")) if(is.null(filetable)){ ##If no filetable is supplied, filenames must be named explicitly From 3a27615e7d6b2f12e46da339148be5bcf8b463f8 Mon Sep 17 00:00:00 2001 From: Treutler Date: Mon, 3 Sep 2018 12:42:17 +0200 Subject: [PATCH 31/71] Fixed bug in the unified adduct recognition check --- R/leCsvAccess.R | 102 +++++++++++++++++++++++++++--------------------- R/leMsMs.r | 29 ++------------ 2 files changed, 61 insertions(+), 70 deletions(-) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index c95e417..2f64d43 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -207,7 +207,10 @@ loadList <- function(path, listEnv = NULL, check = TRUE) tryCatch( findMz(x), error = function(e){ - currEnvir$wrongID <- c(currEnvir$wrongID, x) + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### Error finding SMILES for ID '", x, "': ", e, sep = "")) + + currEnvir$wrongID <- c(currEnvir$wrongID, x) } ) }) @@ -326,6 +329,56 @@ getMolecule <- function(smiles) knownAdducts <- function(){ return(c("pH", "pNa", "pK", "pM", "pNH4", "pACN_pH", "pACN_pNa", "p2Na_mH", "pM_pH", "pM_pK", "pM_pNa", "pM_pNH4", "pM_pACN_pH", "pACN_p2H", "p2H", "mH", "mFA", "mM", "")) } +getAdductProperties <- function(mode, formula = NULL){ + if(grepl(x = "pN_pH", pattern = "^pM") & is.null(formula)) + stop("Cannot calculate pM adduct without formula") + + mzopt <- NULL + ## M+X + if (mode == "pH") + mzopt <- list(addition = "H", charge = 1) + if (mode == "pNa") + mzopt <- list(addition = "Na", charge = 1) + if (mode == "pK") + mzopt <- list(addition = "K", charge = 1) + if (mode == "pM") + mzopt <- list(addition = "", charge = 1) + if (mode == "pNH4") + mzopt <- list(addition = "NH4", charge = 1) + if (mode == "p2Na_mH") + mzopt <- list(addition = "Na2H-1", charge = 1) + if (mode == "pACN_pH") + mzopt <- list(addition = "C2H4N1", charge = 1) + if (mode == "pACN_pNa") + mzopt <- list(addition = "C2H3N1Na1", charge = 1) + if (mode == "p2H") + mzopt <- list(addition = "H2", charge = 2) + if (mode == "pACN_p2H") + mzopt <- list(addition = "C2H5N1", charge = 2) + ## 2M+X + if (mode == "pM_pH") + mzopt <- list(addition = add.formula(formula, "H1"), charge = 1) + if (mode == "pM_pK") + mzopt <- list(addition = add.formula(formula, "K1"), charge = 1) + if (mode == "pM_pNa") + mzopt <- list(addition = add.formula(formula, "Na1"), charge = 1) + if (mode == "pM_pNH4") + mzopt <- list(addition = add.formula(formula, "N1H4"), charge = 1) + if (mode == "pM_pACN_pH") + mzopt <- list(addition = add.formula(formula, "C2H4N1"), charge = 1) + ## M-X + if (mode == "mH") + mzopt <- list(addition = "H-1", charge = -1) + if (mode == "mFA") + mzopt <- list(addition = "C1O2", charge = -1) + if (mode == "mM") + mzopt <- list(addition = "", charge = -1) + if (mode == "") + mzopt <- list(addition = "", charge = 0) + if(is.null(mzopt)) stop("mode = \"", mode, "\" not defined") + + return(mzopt) +} #' Find the exact mass +/- a given margin for a given formula or its ions and adducts. #' @@ -343,48 +396,7 @@ findMz.formula <- function(formula, mode="pH", ppm=10, deltaMz=0) { if (!any(mode %in% knownAdducts())) stop(paste("The ionization mode", mode, "is unknown.")) - mzopt <- list(addition = "", charge = 0) - ## M+X - if (mode == "pH") - mzopt <- list(addition = "H", charge = 1) - if (mode == "pNa") - mzopt <- list(addition = "Na", charge = 1) - if (mode == "pK") - mzopt <- list(addition = "K", charge = 1) - if (mode == "pM") - mzopt <- list(addition = "", charge = 1) - if (mode == "pNH4") - mzopt <- list(addition = "NH4", charge = 1) - if (mode == "p2Na_mH") - mzopt <- list(addition = "Na2H-1", charge = 1) - if (mode == "pACN_pH") - mzopt <- list(addition = "C2H4N1", charge = 1) - if (mode == "pACN_pNa") - mzopt <- list(addition = "C2H3N1Na1", charge = 1) - if (mode == "p2H") - mzopt <- list(addition = "H2", charge = 2) - if (mode == "pACN_p2H") - mzopt <- list(addition = "C2H5N1", charge = 2) - ## 2M+X - if (mode == "pM_pH") - mzopt <- list(addition = add.formula(formula, "H1"), charge = 1) - if (mode == "pM_pK") - mzopt <- list(addition = add.formula(formula, "K1"), charge = 1) - if (mode == "pM_pNa") - mzopt <- list(addition = add.formula(formula, "Na1"), charge = 1) - if (mode == "pM_pNH4") - mzopt <- list(addition = add.formula(formula, "N1H4"), charge = 1) - if (mode == "pM_pACN_pH") - mzopt <- list(addition = add.formula(formula, "C2H4N1"), charge = 1) - ## M-X - if (mode == "mH") - mzopt <- list(addition = "H-1", charge = -1) - if (mode == "mFA") - mzopt <- list(addition = "C1O2", charge = -1) - if (mode == "mM") - mzopt <- list(addition = "", charge = -1) - if (mode == "") - mzopt <- list(addition = "", charge = 0) + mzopt <- getAdductProperties(mode, formula) formula <- add.formula(formula, mzopt$addition) # Since in special cases we want to use this with negative and zero number of atoms, we account for this case # by splitting up the formula into positive and negative atom counts (this eliminates the zeroes.) @@ -514,8 +526,8 @@ findSmiles <- function(cpdID) { stop("Compound list must be loaded first.") if(!exists("compoundList", where=.listEnvEnv$listEnv)) stop("Compound list must be loaded first.") - if(.listEnvEnv$listEnv$compoundList[match(cpdID, .listEnvEnv$listEnv$compoundList$ID),"SMILES"] == "") - return(NA) + if(.listEnvEnv$listEnv$compoundList[match(cpdID, .listEnvEnv$listEnv$compoundList$ID),"SMILES"] == "") + return(NA) return(.listEnvEnv$listEnv$compoundList[match(cpdID, .listEnvEnv$listEnv$compoundList$ID),"SMILES"]) } diff --git a/R/leMsMs.r b/R/leMsMs.r index 5818ec7..67c5169 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -609,31 +609,10 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi dppm=0, x1=0,x2=0,x3=0) - # define the adduct additions - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else if(mode == "pNH4") { - allowed_additions <- "NH4" - mode.charge <- 1 - } else{ - stop("mode = \"", mode, "\" not defined") - } + # get the adduct additions + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) + allowed_additions <- adductProperties$addition + mode.charge <- adductProperties$charge # the ppm range is two-sided here. # The range is slightly expanded because dppm calculation of From ec91645efdc367c8441c70429ecf08b78a3b6cb1 Mon Sep 17 00:00:00 2001 From: Treutler Date: Mon, 3 Sep 2018 13:16:07 +0200 Subject: [PATCH 32/71] Distributed the unified adduct recognition check to all relevant places --- R/Isotopic_Annotation.R | 30 ++++-------------------------- R/alternateAnalyze.R | 30 ++++-------------------------- R/leCsvAccess.R | 39 ++++++++++++--------------------------- R/leMsMs.r | 29 ++++------------------------- 4 files changed, 24 insertions(+), 104 deletions(-) diff --git a/R/Isotopic_Annotation.R b/R/Isotopic_Annotation.R index 760f208..4c76f33 100644 --- a/R/Isotopic_Annotation.R +++ b/R/Isotopic_Annotation.R @@ -53,32 +53,10 @@ checkIsotopes <- function(w, mode = "pH", intensity_cutoff = 0, intensity_precis # Load filtersettings filterSettings = settings$filterSettings - # Assign formula additions according to code - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else if(mode == "pNH4") { - allowed_additions <- "NH4" - mode.charge <- 1 - } else{ - stop("mode = \"", mode, "\" not defined") - } - + # get the adduct additions + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) + allowed_additions <- adductProperties$addition + mode.charge <- adductProperties$charge # "default" isotopes (i.e. those with the highest abundance) defIsotopes <- c("107Ag", "27Al", "40Ar", "75As", "197Au", "11B", "138Ba", "9Be", "209Bi", diff --git a/R/alternateAnalyze.R b/R/alternateAnalyze.R index 15f7d58..beb07e3 100644 --- a/R/alternateAnalyze.R +++ b/R/alternateAnalyze.R @@ -137,32 +137,10 @@ analyzeMsMs.formula.optimized <- function(msmsPeaks, mode="pH", detail=FALSE, ru dppm=0, x1=0,x2=0,x3=0) - # define the adduct additions - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else if(mode == "pNH4") { - allowed_additions <- "NH4" - mode.charge <- 1 - } else{ - stop("mode = \"", mode, "\" not defined") - } - + # get the adduct additions + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) + allowed_additions <- adductProperties$addition + mode.charge <- adductProperties$charge # the ppm range is two-sided here. # The range is slightly expanded because dppm calculation of diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index 2f64d43..36843d1 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -329,18 +329,23 @@ getMolecule <- function(smiles) knownAdducts <- function(){ return(c("pH", "pNa", "pK", "pM", "pNH4", "pACN_pH", "pACN_pNa", "p2Na_mH", "pM_pH", "pM_pK", "pM_pNa", "pM_pNH4", "pM_pACN_pH", "pACN_p2H", "p2H", "mH", "mFA", "mM", "")) } +getMonoisotopicMass <- function(formula){ + enviPat::isopattern(isotopes = isotopes, chemforms = formula, threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]] +} getAdductProperties <- function(mode, formula = NULL){ - if(grepl(x = "pN_pH", pattern = "^pM") & is.null(formula)) + if(grepl(x = "pN_pH", pattern = "^pM_") & is.null(formula)) stop("Cannot calculate pM adduct without formula") + if(!exists("isotopes")) data(isotopes) + mzopt <- NULL ## M+X if (mode == "pH") - mzopt <- list(addition = "H", charge = 1) + mzopt <- list(addition = "H", charge = 1) if (mode == "pNa") mzopt <- list(addition = "Na", charge = 1) if (mode == "pK") - mzopt <- list(addition = "K", charge = 1) + mzopt <- list(addition = "K", charge = 1) if (mode == "pM") mzopt <- list(addition = "", charge = 1) if (mode == "pNH4") @@ -624,30 +629,10 @@ findMass <- function(cpdID_or_smiles, retrieval="standard", mode = "pH") { # Must calculate mass manually if no formula is given if(retrieval == "unknown"){ - if(mode == "pH") { - mass <- 1.00784 - mode.charge <- 1 - } else if(mode == "pNa") { - mass <- 22.989769 - mode.charge <- 1 - } else if(mode == "pM") { - mass <- 0 - mode.charge <- 1 - } else if(mode == "mM") { - mass <- 0 - mode.charge <- -1 - } else if(mode == "mH") { - mass <- -1.00784 - mode.charge <- -1 - } else if(mode == "mFA") { - mass <- 59.0440 - mode.charge <- -1 - } else if(mode == "pNH4") { - mass <- 18.03846 - mode.charge <- 1 - } else{ - stop("mode = \"", mode, "\" not defined") - } + adductProperties <- getAdductProperties(mode, rcdk::get.formula(findFormula(cpdID_or_smiles))) + allowed_additions <- adductProperties$addition + mode.charge <- adductProperties$charge + mass <- getMonoisotopicMass(allowed_additions) return(findMz(cpdID_or_smiles, mode=mode, retrieval=retrieval)$mzCenter - mass + mode.charge * .emass) } diff --git a/R/leMsMs.r b/R/leMsMs.r index 67c5169..0f50da6 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -1801,31 +1801,10 @@ reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, # here follows the Rcdk analysis #------------------------------------ - # define the adduct additions - if(mode == "pH") { - allowed_additions <- "H" - mode.charge <- 1 - } else if(mode == "pNa") { - allowed_additions <- "Na" - mode.charge <- 1 - } else if(mode == "pM") { - allowed_additions <- "" - mode.charge <- 1 - } else if(mode == "mM") { - allowed_additions <- "" - mode.charge <- -1 - } else if(mode == "mH") { - allowed_additions <- "H-1" - mode.charge <- -1 - } else if(mode == "mFA") { - allowed_additions <- "C2H3O2" - mode.charge <- -1 - } else if(mode == "pNH4") { - allowed_additions <- "NH4" - mode.charge <- 1 - } else { - stop("mode = \"", mode, "\" not defined") - } + # get the adduct additions + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) + allowed_additions <- adductProperties$addition + mode.charge <- adductProperties$charge # the ppm range is two-sided here. # The range is slightly expanded because dppm calculation of From 14739aae09871fc3f81837c5155b6ac11fb179a8 Mon Sep 17 00:00:00 2001 From: Treutler Date: Mon, 3 Sep 2018 13:41:12 +0200 Subject: [PATCH 33/71] Added enviPat to the dependencies and fixed bug in the unified adduct recognition check --- DESCRIPTION | 10 ++++++---- NAMESPACE | 1 + R/leCsvAccess.R | 2 +- R/leMsMs.r | 10 +++++----- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d6ed2fe..0786195 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,7 +12,9 @@ Authors@R: c( "sneumann@ipb-halle.de"), person(given = "Erik", family = "Muller", role = "aut", email = "erik.mueller@student.uni-halle.de"), person(given = "Tobias", family = "Schulze", role = "ctb", email = - "tobias.schulze@ufz.de") ) + "tobias.schulze@ufz.de"), person(given = + "Hendrik", family = "Treutler", role = "ctb", email = + "hendrik.treutler@gmail.com") ) Author: Michael Stravs, Emma Schymanski, Steffen Neumann, Erik Mueller, with contributions from Tobias Schulze Maintainer: RMassBank at Eawag @@ -29,13 +31,13 @@ Depends: Encoding: UTF-8 Imports: XML,RCurl,rjson,S4Vectors,digest, - rcdk,yaml,mzR,methods,Biobase,MSnbase + rcdk,yaml,mzR,methods,Biobase,MSnbase, + enviPat Suggests: gplots,RMassBankData, xcms (>= 1.37.1), CAMERA, - RUnit, - enviPat + RUnit Collate: 'alternateAnalyze.R' 'createMassBank.R' diff --git a/NAMESPACE b/NAMESPACE index 5cdd5f1..99ec8ed 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -139,3 +139,4 @@ import(mzR) import(rcdk) import(rjson) import(yaml) +import(enviPat) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index 36843d1..4be4737 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -336,7 +336,7 @@ getAdductProperties <- function(mode, formula = NULL){ if(grepl(x = "pN_pH", pattern = "^pM_") & is.null(formula)) stop("Cannot calculate pM adduct without formula") - if(!exists("isotopes")) data(isotopes) + if(!exists("isotopes")) data("isotopes", package = "enviPat") mzopt <- NULL ## M+X diff --git a/R/leMsMs.r b/R/leMsMs.r index 0f50da6..59aaad8 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -1801,11 +1801,6 @@ reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, # here follows the Rcdk analysis #------------------------------------ - # get the adduct additions - adductProperties <- getAdductProperties(mode, msmsPeaks@formula) - allowed_additions <- adductProperties$addition - mode.charge <- adductProperties$charge - # the ppm range is two-sided here. # The range is slightly expanded because dppm calculation of # generate.formula starts from empirical mass, but dppm cal- @@ -1814,6 +1809,11 @@ reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, db_formula <- findFormula(cpdID, retrieval=findLevel(cpdID,TRUE)) + # get the adduct additions + adductProperties <- getAdductProperties(mode, db_formula) + allowed_additions <- adductProperties$addition + mode.charge <- adductProperties$charge + ppmlimit <- 2.25 * filterSettings$ppmFine parent_formula <- add.formula(db_formula, allowed_additions) parent_formula <- add.formula(parent_formula, custom_additions) From 373a370e097864aaff4c4eee93d9777818fc2026 Mon Sep 17 00:00:00 2001 From: Treutler Date: Mon, 3 Sep 2018 15:45:17 +0200 Subject: [PATCH 34/71] Distributed the unified adduct recognition check to all remaining relevant places --- R/alternateAnalyze.R | 8 +++----- R/createMassBank.R | 43 ++++++++++++++----------------------------- R/leCsvAccess.R | 38 +++++++++++++++++++------------------- R/leMsMs.r | 16 ++++++---------- 4 files changed, 42 insertions(+), 63 deletions(-) diff --git a/R/alternateAnalyze.R b/R/alternateAnalyze.R index beb07e3..ad25c1f 100644 --- a/R/alternateAnalyze.R +++ b/R/alternateAnalyze.R @@ -100,11 +100,9 @@ analyzeMsMs.formula.optimized <- function(msmsPeaks, mode="pH", detail=FALSE, ru cut <- filterSettings$prelimCut if(is.na(cut)) { - if(mode %in% c("pH", "pM", "pNa", "pNH4")) - cut <- 1e4 - else if(mode %in% c("mH", "mFA","mM")) - cut <- 0 - else stop(paste("The ionization mode", mode, "is unknown.")) + adductProperties <- getAdductProperties(mode) + if(adductProperties$charge > 0) cut <- 1e4 + if(adductProperties$charge < 0) cut <- 0 } cutRatio <- filterSettings$prelimCutRatio } else{ diff --git a/R/createMassBank.R b/R/createMassBank.R index be9286e..0a2f82c 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -1284,15 +1284,10 @@ gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval=" imode <- spec@mode # define positive or negative, based on processing mode. - ion_modes <- list( - "pH" = "POSITIVE", - "pNa" = "POSITIVE", - "mH" = "NEGATIVE", - "mFA" = "NEGATIVE", - "pM" = "POSITIVE", - "mM" = "NEGATIVE", - "pNH4" = "POSITIVE") - mode <- ion_modes[[imode]] + adductProperties <- getAdductProperties(imode) + mode <- NULL + if(adductProperties$charge > 0) mode <- "POSITIVE" + if(adductProperties$charge < 0) mode <- "NEGATIVE" # for format 2.01 ac_ms <- list(); @@ -1364,17 +1359,12 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP scan <- msmsdata@acquisitionNum id <- spec@id # Further fill the ac_ms datasets, and add the ms$focused_ion with spectrum-specific data: - precursor_types <- list( - "pH" = "[M+H]+", - "pNa" = "[M+Na]+", - "mH" = "[M-H]-", - "mFA" = "[M+HCOO-]-", - "pM" = "[M]+", - "mM" = "[M]-", - "pNH4" = "[M+NH4]+" - ) + + adductProperties <- getAdductProperties(spec@mode) + adductString <- adductProperties$adductString + ac_ms[['FRAGMENTATION_MODE']] <- msmsdata@info$mode - #ac_ms['PRECURSOR_TYPE'] <- precursor_types[spec$mode] + #ac_ms['PRECURSOR_TYPE'] <- adductString ac_ms[['COLLISION_ENERGY']] <- msmsdata@info$ce ac_ms[['RESOLUTION']] <- msmsdata@info$res @@ -1384,7 +1374,7 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP ms_fi <- list() ms_fi[['BASE_PEAK']] <- round(mz(spec@parent)[which.max(intensity(spec@parent))],4) ms_fi[['PRECURSOR_M/Z']] <- round(precursorMz$mzCenter,4) - ms_fi[['PRECURSOR_TYPE']] <- precursor_types[spec@mode] + ms_fi[['PRECURSOR_TYPE']] <- adductString if(all(!is.na(spec@parent@intensity), spec@parent@intensity != 0, spec@parent@intensity != 100, na.rm = TRUE)) ms_fi[['PRECURSOR_INTENSITY']] <- spec@parent@intensity @@ -1471,15 +1461,10 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP # add + or - to fragment formulas - formula_tag <- list( - "pH" = "+", - "pNa" = "+", - "mH" = "-", - "mFA" = "-", - "pM" = "+", - "mM" = "-", - "pNH4" = "+") - type <- formula_tag[[spec@mode]] + adductProperties <- getAdductProperties(spec@mode) + type <- NULL + if(adductProperties$charge > 0) type <- "+" + if(adductProperties$charge < 0) type <- "-" annotator <- getOption("RMassBank")$annotator if(is.null(annotator)) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index 4be4737..5039594 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -341,45 +341,45 @@ getAdductProperties <- function(mode, formula = NULL){ mzopt <- NULL ## M+X if (mode == "pH") - mzopt <- list(addition = "H", charge = 1) + mzopt <- list(addition = "H", charge = 1, adductString = "[M+H]+") if (mode == "pNa") - mzopt <- list(addition = "Na", charge = 1) + mzopt <- list(addition = "Na", charge = 1, adductString = "[M+Na]+") if (mode == "pK") - mzopt <- list(addition = "K", charge = 1) + mzopt <- list(addition = "K", charge = 1, adductString = "[M+K]+") if (mode == "pM") - mzopt <- list(addition = "", charge = 1) + mzopt <- list(addition = "", charge = 1, adductString = "[M]+") if (mode == "pNH4") - mzopt <- list(addition = "NH4", charge = 1) + mzopt <- list(addition = "NH4", charge = 1, adductString = "[M+NH4]+") if (mode == "p2Na_mH") - mzopt <- list(addition = "Na2H-1", charge = 1) + mzopt <- list(addition = "Na2H-1", charge = 1, adductString = "[M+2Na-H]+") if (mode == "pACN_pH") - mzopt <- list(addition = "C2H4N1", charge = 1) + mzopt <- list(addition = "C2H4N1", charge = 1, adductString = "[M+ACN+H]+") if (mode == "pACN_pNa") - mzopt <- list(addition = "C2H3N1Na1", charge = 1) + mzopt <- list(addition = "C2H3N1Na1", charge = 1, adductString = "[M+ACN+Na]+") if (mode == "p2H") - mzopt <- list(addition = "H2", charge = 2) + mzopt <- list(addition = "H2", charge = 2, adductString = "[M+2H]2+") if (mode == "pACN_p2H") - mzopt <- list(addition = "C2H5N1", charge = 2) + mzopt <- list(addition = "C2H5N1", charge = 2, adductString = "[M+ACN+2H]2+") ## 2M+X if (mode == "pM_pH") - mzopt <- list(addition = add.formula(formula, "H1"), charge = 1) + mzopt <- list(addition = add.formula(formula, "H1"), charge = 1, adductString = "[2M+H]+") if (mode == "pM_pK") - mzopt <- list(addition = add.formula(formula, "K1"), charge = 1) + mzopt <- list(addition = add.formula(formula, "K1"), charge = 1, adductString = "[2M+K]+") if (mode == "pM_pNa") - mzopt <- list(addition = add.formula(formula, "Na1"), charge = 1) + mzopt <- list(addition = add.formula(formula, "Na1"), charge = 1, adductString = "[2M+Na]+") if (mode == "pM_pNH4") - mzopt <- list(addition = add.formula(formula, "N1H4"), charge = 1) + mzopt <- list(addition = add.formula(formula, "N1H4"), charge = 1, adductString = "[2M+NH4]+") if (mode == "pM_pACN_pH") - mzopt <- list(addition = add.formula(formula, "C2H4N1"), charge = 1) + mzopt <- list(addition = add.formula(formula, "C2H4N1"), charge = 1, adductString = "[2M+ACN+H]+") ## M-X if (mode == "mH") - mzopt <- list(addition = "H-1", charge = -1) + mzopt <- list(addition = "H-1", charge = -1, adductString = "[M-H]-") if (mode == "mFA") - mzopt <- list(addition = "C1O2", charge = -1) + mzopt <- list(addition = "C1O2", charge = -1, adductString = "[M+HCOO-]-") if (mode == "mM") - mzopt <- list(addition = "", charge = -1) + mzopt <- list(addition = "", charge = -1, adductString = "[M]-") if (mode == "") - mzopt <- list(addition = "", charge = 0) + mzopt <- list(addition = "", charge = 0, adductString = "[M]") if(is.null(mzopt)) stop("mode = \"", mode, "\" not defined") return(mzopt) diff --git a/R/leMsMs.r b/R/leMsMs.r index 59aaad8..a3871ff 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -522,11 +522,9 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi cut <- filterSettings$prelimCut if(is.na(cut)) { - if(mode %in% c("pH", "pM", "pNa", "pNH4")) - cut <- 1e4 - else if(mode %in% c("mH", "mFA","mM")) - cut <- 0 - else stop(paste("The ionization mode", mode, "is unknown.")) + adductProperties <- getAdductProperties(mode) + if(adductProperties$charge > 0) cut <- 1e4 + if(adductProperties$charge < 0) cut <- 0 } cutRatio <- filterSettings$prelimCutRatio } else { @@ -872,11 +870,9 @@ analyzeMsMs.intensity <- function(msmsPeaks, mode="pH", detail=FALSE, run="preli cut <- filterSettings$prelimCut if(is.na(cut)) { - if(mode %in% c("pH", "pM", "pNa", "pNH4")) - cut <- 1e4 - else if(mode %in% c("mH", "mFA", "mM")) - cut <- 0 - else stop(paste("The ionization mode", mode, "is unknown.")) + adductProperties <- getAdductProperties(mode) + if(adductProperties$charge > 0) cut <- 1e4 + if(adductProperties$charge < 0) cut <- 0 } cutRatio <- filterSettings$prelimCutRatio } From a1b845040eae82339b6ec950d6ae605bbf853194 Mon Sep 17 00:00:00 2001 From: Treutler Date: Mon, 3 Sep 2018 15:59:42 +0200 Subject: [PATCH 35/71] Passed the molecular formula to the the unified adduct recognition --- R/alternateAnalyze.R | 2 +- R/createMassBank.R | 6 +++--- R/leMsMs.r | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/alternateAnalyze.R b/R/alternateAnalyze.R index ad25c1f..db020f5 100644 --- a/R/alternateAnalyze.R +++ b/R/alternateAnalyze.R @@ -100,7 +100,7 @@ analyzeMsMs.formula.optimized <- function(msmsPeaks, mode="pH", detail=FALSE, ru cut <- filterSettings$prelimCut if(is.na(cut)) { - adductProperties <- getAdductProperties(mode) + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) if(adductProperties$charge > 0) cut <- 1e4 if(adductProperties$charge < 0) cut <- 0 } diff --git a/R/createMassBank.R b/R/createMassBank.R index 0a2f82c..6560a13 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -1284,7 +1284,7 @@ gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval=" imode <- spec@mode # define positive or negative, based on processing mode. - adductProperties <- getAdductProperties(imode) + adductProperties <- getAdductProperties(imode, spec@formula) mode <- NULL if(adductProperties$charge > 0) mode <- "POSITIVE" if(adductProperties$charge < 0) mode <- "NEGATIVE" @@ -1360,7 +1360,7 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP id <- spec@id # Further fill the ac_ms datasets, and add the ms$focused_ion with spectrum-specific data: - adductProperties <- getAdductProperties(spec@mode) + adductProperties <- getAdductProperties(spec@mode, spec@formula) adductString <- adductProperties$adductString ac_ms[['FRAGMENTATION_MODE']] <- msmsdata@info$mode @@ -1461,7 +1461,7 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP # add + or - to fragment formulas - adductProperties <- getAdductProperties(spec@mode) + adductProperties <- getAdductProperties(spec@mode, spec@formula) type <- NULL if(adductProperties$charge > 0) type <- "+" if(adductProperties$charge < 0) type <- "-" diff --git a/R/leMsMs.r b/R/leMsMs.r index a3871ff..ca63011 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -522,7 +522,7 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi cut <- filterSettings$prelimCut if(is.na(cut)) { - adductProperties <- getAdductProperties(mode) + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) if(adductProperties$charge > 0) cut <- 1e4 if(adductProperties$charge < 0) cut <- 0 } @@ -870,7 +870,7 @@ analyzeMsMs.intensity <- function(msmsPeaks, mode="pH", detail=FALSE, run="preli cut <- filterSettings$prelimCut if(is.na(cut)) { - adductProperties <- getAdductProperties(mode) + adductProperties <- getAdductProperties(mode, msmsPeaks@formula) if(adductProperties$charge > 0) cut <- 1e4 if(adductProperties$charge < 0) cut <- 0 } From e76cf96d2d4de3cdf120d760f0a582bcf2416a24 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 4 Sep 2018 08:54:30 +0200 Subject: [PATCH 36/71] Fixed bug for the export of invalid records --- R/createMassBank.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index 6560a13..3c86450 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -250,7 +250,7 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c #mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) mb@problems <- which(is.na(mb@compiled)) mb@compiled_ok <- mb@compiled[mb@ok] - mb@compiled_notOk <- mb@compiled[!ok] + mb@compiled_notOk <- mb@compiled[!ok & unlist(lapply(X = mb@compiled[!ok], FUN = length)) > 0] } # Step 5: Convert the internal tree-like representation of the MassBank data into # flat-text string arrays (basically, into text-file style, but still in memory) From 23f364fbd50e2c7670a031b36462e94d40fe22cc Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 4 Sep 2018 14:52:52 +0200 Subject: [PATCH 37/71] Added a check whether the elements in the molecular formulas are covered in the dbe calculation to convert strange error to explicit error --- R/leMsMs.r | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/R/leMsMs.r b/R/leMsMs.r index ca63011..102234c 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -673,6 +673,11 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi childPeaks <- as.data.frame(do.call(rbind, peakmatrix)) + presentElements <- unique(unlist(lapply(X = lapply(X = childPeaks$formula, FUN = formulastring.to.list), FUN = names))) + atomDBEs <- sapply(X = presentElements, FUN = dbe) + unknownElements <- names(atomDBEs)[sapply(X = atomDBEs, FUN = function(atomDBE){length(atomDBE)==0})] + if(length(unknownElements) > 0) stop(paste("Element(s)", paste(unknownElements), "cannot be assigned a DBE")) + # Reformat the deformatted output correctly (why doesn't R have a better way to do this, e.g. avoid deformatting?) childPeaks$row <- as.numeric(as.character(childPeaks$row)) @@ -1925,10 +1930,10 @@ filterPeaksMultiplicity <- function(peaks, formulacol, recalcBest = TRUE) peaks <- cbind(peaks, data.frame(formulaMultiplicity=numeric())) if(recalcBest){ if(formulacol == "formula"){ - warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") + cat(paste("### Warning ### filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.")) } if(formulacol == "reanalyzed.formula"){ - warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point if this error message also shows for reanalyzed peaks.") + warning("filterPeaksMultiplicity: All peaks have been filtered. The workflow can not be continued beyond this point.") } peaks$fM_factor <- as.factor(peaks$formulaMultiplicity) return(peaks) From e3c37d9a55022ff2c4d153ceefc5a96de68397e4 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 4 Sep 2018 14:53:29 +0200 Subject: [PATCH 38/71] Added K to the elements of the dbe calculation --- R/formulaCalculator.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/formulaCalculator.R b/R/formulaCalculator.R index a552058..a0b4117 100755 --- a/R/formulaCalculator.R +++ b/R/formulaCalculator.R @@ -179,7 +179,8 @@ dbe <- function(formula) "I" = -0.5, "As" = 2.5, "Hg" = 0, - "Na" = -0.5 + "Na" = -0.5, + "K" = -0.5 ) count <- 1 for(element in names(formula)) From ba570aa6eca40fb384f0e1abebf253695af4c645 Mon Sep 17 00:00:00 2001 From: Treutler Date: Thu, 6 Sep 2018 08:53:44 +0200 Subject: [PATCH 39/71] Correction for the accessibility of isotope information --- R/leCsvAccess.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index 5039594..c9004a8 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -330,14 +330,13 @@ knownAdducts <- function(){ return(c("pH", "pNa", "pK", "pM", "pNH4", "pACN_pH", "pACN_pNa", "p2Na_mH", "pM_pH", "pM_pK", "pM_pNa", "pM_pNH4", "pM_pACN_pH", "pACN_p2H", "p2H", "mH", "mFA", "mM", "")) } getMonoisotopicMass <- function(formula){ + if(!exists("isotopes")) data("isotopes", package = "enviPat") enviPat::isopattern(isotopes = isotopes, chemforms = formula, threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]] } getAdductProperties <- function(mode, formula = NULL){ if(grepl(x = "pN_pH", pattern = "^pM_") & is.null(formula)) stop("Cannot calculate pM adduct without formula") - if(!exists("isotopes")) data("isotopes", package = "enviPat") - mzopt <- NULL ## M+X if (mode == "pH") From 007eae54a456c202bc1ee384384a27275bb945df Mon Sep 17 00:00:00 2001 From: Treutler Date: Thu, 6 Sep 2018 09:11:35 +0200 Subject: [PATCH 40/71] Stored adduct information in data.frame instead of if-clauses --- R/leCsvAccess.R | 80 ++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 45 deletions(-) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index c9004a8..f15dd58 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -333,54 +333,44 @@ getMonoisotopicMass <- function(formula){ if(!exists("isotopes")) data("isotopes", package = "enviPat") enviPat::isopattern(isotopes = isotopes, chemforms = formula, threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]] } -getAdductProperties <- function(mode, formula = NULL){ - if(grepl(x = "pN_pH", pattern = "^pM_") & is.null(formula)) +getAdductInformation <- function(formula){ + adductDf <- as.data.frame(rbind( + ## M+X + c(mode = "pH", addition = "H", charge = 1, adductString = "[M+H]+"), + c(mode = "pNa", addition = "Na", charge = 1, adductString = "[M+Na]+"), + c(mode = "pK", addition = "K", charge = 1, adductString = "[M+K]+"), + c(mode = "pM", addition = "", charge = 1, adductString = "[M]+"), + c(mode = "pNH4", addition = "NH4", charge = 1, adductString = "[M+NH4]+"), + c(mode = "p2Na_mH", addition = "Na2H-1", charge = 1, adductString = "[M+2Na-H]+"), + c(mode = "pACN_pH", addition = "C2H4N1", charge = 1, adductString = "[M+ACN+H]+"), + c(mode = "pACN_pNa", addition = "C2H3N1Na1", charge = 1, adductString = "[M+ACN+Na]+"), + c(mode = "p2H", addition = "H2", charge = 2, adductString = "[M+2H]2+"), + c(mode = "pACN_p2H", addition = "C2H5N1", charge = 2, adductString = "[M+ACN+2H]2+"), + ## 2M+X + c(mode = "pM_pH", addition = add.formula(formula, "H1"), charge = 1, adductString = "[2M+H]+"), + c(mode = "pM_pK", addition = add.formula(formula, "K1"), charge = 1, adductString = "[2M+K]+"), + c(mode = "pM_pNa", addition = add.formula(formula, "Na1"), charge = 1, adductString = "[2M+Na]+"), + c(mode = "pM_pNH4", addition = add.formula(formula, "N1H4"), charge = 1, adductString = "[2M+NH4]+"), + c(mode = "pM_pACN_pH", addition = add.formula(formula, "C2H4N1"), charge = 1, adductString = "[2M+ACN+H]+"), + ## M-X + c(mode = "mH", addition = "H-1", charge = -1, adductString = "[M-H]-"), + c(mode = "mFA", addition = "C1O2", charge = -1, adductString = "[M+HCOO-]-"), + c(mode = "mM", addition = "", charge = -1, adductString = "[M]-"), + c(mode = "", addition = "", charge = 0, adductString = "[M]") + ), stringsAsFactors = F) + adductDf$charge <- as.integer(adductDf$charge) +} +getAdductProperties <- function(mode, formula){ + if(grepl(x = mode, pattern = "^pM") & is.null(formula)) stop("Cannot calculate pM adduct without formula") + else if(is.null(formula)) formula <- "" + + adductDf <- getAdductInformation(formula) - mzopt <- NULL - ## M+X - if (mode == "pH") - mzopt <- list(addition = "H", charge = 1, adductString = "[M+H]+") - if (mode == "pNa") - mzopt <- list(addition = "Na", charge = 1, adductString = "[M+Na]+") - if (mode == "pK") - mzopt <- list(addition = "K", charge = 1, adductString = "[M+K]+") - if (mode == "pM") - mzopt <- list(addition = "", charge = 1, adductString = "[M]+") - if (mode == "pNH4") - mzopt <- list(addition = "NH4", charge = 1, adductString = "[M+NH4]+") - if (mode == "p2Na_mH") - mzopt <- list(addition = "Na2H-1", charge = 1, adductString = "[M+2Na-H]+") - if (mode == "pACN_pH") - mzopt <- list(addition = "C2H4N1", charge = 1, adductString = "[M+ACN+H]+") - if (mode == "pACN_pNa") - mzopt <- list(addition = "C2H3N1Na1", charge = 1, adductString = "[M+ACN+Na]+") - if (mode == "p2H") - mzopt <- list(addition = "H2", charge = 2, adductString = "[M+2H]2+") - if (mode == "pACN_p2H") - mzopt <- list(addition = "C2H5N1", charge = 2, adductString = "[M+ACN+2H]2+") - ## 2M+X - if (mode == "pM_pH") - mzopt <- list(addition = add.formula(formula, "H1"), charge = 1, adductString = "[2M+H]+") - if (mode == "pM_pK") - mzopt <- list(addition = add.formula(formula, "K1"), charge = 1, adductString = "[2M+K]+") - if (mode == "pM_pNa") - mzopt <- list(addition = add.formula(formula, "Na1"), charge = 1, adductString = "[2M+Na]+") - if (mode == "pM_pNH4") - mzopt <- list(addition = add.formula(formula, "N1H4"), charge = 1, adductString = "[2M+NH4]+") - if (mode == "pM_pACN_pH") - mzopt <- list(addition = add.formula(formula, "C2H4N1"), charge = 1, adductString = "[2M+ACN+H]+") - ## M-X - if (mode == "mH") - mzopt <- list(addition = "H-1", charge = -1, adductString = "[M-H]-") - if (mode == "mFA") - mzopt <- list(addition = "C1O2", charge = -1, adductString = "[M+HCOO-]-") - if (mode == "mM") - mzopt <- list(addition = "", charge = -1, adductString = "[M]-") - if (mode == "") - mzopt <- list(addition = "", charge = 0, adductString = "[M]") - if(is.null(mzopt)) stop("mode = \"", mode, "\" not defined") + if(!(mode %in% adductDf$mode)) + stop("mode = \"", mode, "\" not defined") + mzopt <- as.list(adductDf[adductDf$mode==mode,]) return(mzopt) } From e1e7fbcff7b9c3ae3dcdb370709127ad0a7e6cbd Mon Sep 17 00:00:00 2001 From: Treutler Date: Thu, 6 Sep 2018 09:42:07 +0200 Subject: [PATCH 41/71] Fixed bug in the stored adduct information --- R/leCsvAccess.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index f15dd58..5dbff22 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -359,6 +359,7 @@ getAdductInformation <- function(formula){ c(mode = "", addition = "", charge = 0, adductString = "[M]") ), stringsAsFactors = F) adductDf$charge <- as.integer(adductDf$charge) + return(adductDf) } getAdductProperties <- function(mode, formula){ if(grepl(x = mode, pattern = "^pM") & is.null(formula)) From e4893aba8bc89d47bd872265640857774ec826cc Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 14 Sep 2018 15:03:29 +0200 Subject: [PATCH 42/71] Added additional adducts --- R/leCsvAccess.R | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index 5dbff22..8e93128 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -327,7 +327,7 @@ getMolecule <- function(smiles) } knownAdducts <- function(){ - return(c("pH", "pNa", "pK", "pM", "pNH4", "pACN_pH", "pACN_pNa", "p2Na_mH", "pM_pH", "pM_pK", "pM_pNa", "pM_pNH4", "pM_pACN_pH", "pACN_p2H", "p2H", "mH", "mFA", "mM", "")) + return(getAdductInformation("")$mode) } getMonoisotopicMass <- function(formula){ if(!exists("isotopes")) data("isotopes", package = "enviPat") @@ -337,6 +337,7 @@ getAdductInformation <- function(formula){ adductDf <- as.data.frame(rbind( ## M+X c(mode = "pH", addition = "H", charge = 1, adductString = "[M+H]+"), + c(mode = "pLi", addition = "Li", charge = 1, adductString = "[M+Li]+"), c(mode = "pNa", addition = "Na", charge = 1, adductString = "[M+Na]+"), c(mode = "pK", addition = "K", charge = 1, adductString = "[M+K]+"), c(mode = "pM", addition = "", charge = 1, adductString = "[M]+"), @@ -344,6 +345,7 @@ getAdductInformation <- function(formula){ c(mode = "p2Na_mH", addition = "Na2H-1", charge = 1, adductString = "[M+2Na-H]+"), c(mode = "pACN_pH", addition = "C2H4N1", charge = 1, adductString = "[M+ACN+H]+"), c(mode = "pACN_pNa", addition = "C2H3N1Na1", charge = 1, adductString = "[M+ACN+Na]+"), + c(mode = "pH_mH2O", addition = "H-1O-1", charge = 2, adductString = "[M-H2O+H]+"), c(mode = "p2H", addition = "H2", charge = 2, adductString = "[M+2H]2+"), c(mode = "pACN_p2H", addition = "C2H5N1", charge = 2, adductString = "[M+ACN+2H]2+"), ## 2M+X @@ -353,16 +355,25 @@ getAdductInformation <- function(formula){ c(mode = "pM_pNH4", addition = add.formula(formula, "N1H4"), charge = 1, adductString = "[2M+NH4]+"), c(mode = "pM_pACN_pH", addition = add.formula(formula, "C2H4N1"), charge = 1, adductString = "[2M+ACN+H]+"), ## M-X - c(mode = "mH", addition = "H-1", charge = -1, adductString = "[M-H]-"), - c(mode = "mFA", addition = "C1O2", charge = -1, adductString = "[M+HCOO-]-"), - c(mode = "mM", addition = "", charge = -1, adductString = "[M]-"), - c(mode = "", addition = "", charge = 0, adductString = "[M]") + c(mode = "mH", addition = "H-1", charge = -1, adductString = "[M-H]-"), + c(mode = "mFA", addition = "C1O2", charge = -1, adductString = "[M+HCOOH-H]-"), + c(mode = "mH_mH2O", addition = "H-3O-1", charge = -1, adductString = "[M-H2O-H]-"), + c(mode = "m2H_pNa", addition = "H-2Na1", charge = -1, adductString = "[M+Na-2H]-"), + c(mode = "mM", addition = "", charge = -1, adductString = "[M]-"), + c(mode = "m2H", addition = "H-2", charge = -1, adductString = "[M-2H]-"), ## in case of positively charged compounds + ## 2M-X + c(mode = "mH_pM", addition = add.formula(formula, "H-1"), charge = -1, adductString = "[2M-H]-"), + c(mode = "mFA_pM", addition = add.formula(formula, "C1O2"), charge = -1, adductString = "[2M+HCOOH-H]-"), + c(mode = "mH_pM_mH2O", addition = add.formula(formula, "H-3O-1"), charge = -1, adductString = "[2M-H2O-H]-"), + c(mode = "m2H_pM_pNa", addition = add.formula(formula, "H-2Na1"), charge = -1, adductString = "[2M+Na-2H]-"), + ## ??? + c(mode = "", addition = "", charge = 0, adductString = "[M]") ), stringsAsFactors = F) adductDf$charge <- as.integer(adductDf$charge) return(adductDf) } getAdductProperties <- function(mode, formula){ - if(grepl(x = mode, pattern = "^pM") & is.null(formula)) + if(grepl(x = mode, pattern = "pM") & is.null(formula)) stop("Cannot calculate pM adduct without formula") else if(is.null(formula)) formula <- "" From dae5b04efa9ba75efae5d9ee472c931960becdb3 Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 14 Sep 2018 15:10:22 +0200 Subject: [PATCH 43/71] Fixed bug in the record compilation in case of empty spectra --- R/createMassBank.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index 3c86450..eebd635 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -246,11 +246,13 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c }) # check which compounds have useful spectra ok <- unlist(lapply(X = selectSpectra(mb@spectra, "found", "object"), FUN = function(spec){unlist(lapply(X = spec@children, FUN = function(child){child@ok}))})) + notEmpty <- unlist(lapply(X = mb@compiled, FUN = length)) > 0 + ok <- ok & notEmpty mb@ok <- which(ok) #mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) mb@problems <- which(is.na(mb@compiled)) mb@compiled_ok <- mb@compiled[mb@ok] - mb@compiled_notOk <- mb@compiled[!ok & unlist(lapply(X = mb@compiled[!ok], FUN = length)) > 0] + mb@compiled_notOk <- mb@compiled[!ok & notEmpty] } # Step 5: Convert the internal tree-like representation of the MassBank data into # flat-text string arrays (basically, into text-file style, but still in memory) From 47239b1f923816d0aae3773282fa5cc2e4684ffe Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 14 Sep 2018 15:10:43 +0200 Subject: [PATCH 44/71] Fixed the parsing of msp files with decimal delimiter comma instead of point --- R/leMsmsRaw.R | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index 9df5ace..93316c3 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -632,9 +632,11 @@ findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { whichmissing <- vector() metaspec <- list() + mzs <- unlist(lapply(X = xrmsms, FUN = function(x){ x$PRECURSORMZ })) + rts <- unlist(lapply(X = xrmsms, FUN = function(x){ if(x$RETENTIONTIME == "NA") return(NA) else return(x$RETENTIONTIME) })) precursorTable <- data.frame(stringsAsFactors = FALSE, - mz = as.numeric(unlist(lapply(X = xrmsms, FUN = function(x){ x$PRECURSORMZ }))), - rt = as.numeric(unlist(lapply(X = xrmsms, FUN = function(x){ x$RETENTIONTIME }))) + mz = as.numeric(mzs), + rt = as.numeric(rts) ) precursorTable[, "rt"] <- precursorTable[, "rt"] * 60 @@ -746,14 +748,16 @@ findMsMsHRperMsp.direct <- function(fileName, cpdIDs, mode="pH") { if(is.null(spectrum)){ metaspec[[idIdx]] <- list(matrix(0,1,7)) } else { + mz <- as.numeric(spectrum$pspectrum[, "mz"]) + rt <- as.numeric(ifelse(test = spectrum$RETENTIONTIME=="NA", yes = NA, no = spectrum$RETENTIONTIME)) metaspec[[idIdx]] <- list(data.frame( stringsAsFactors = F, - "mz" = as.numeric(spectrum$pspectrum[, "mz"]), - "mzmin" = as.numeric(spectrum$pspectrum[, "mz"]), - "mzmax" = as.numeric(spectrum$pspectrum[, "mz"]), - "rt" = as.numeric(spectrum$RETENTIONTIME), - "rtmin" = as.numeric(spectrum$RETENTIONTIME), - "rtmax" = as.numeric(spectrum$RETENTIONTIME), + "mz" = mz, + "mzmin" = mz, + "mzmax" = mz, + "rt" = rt, + "rtmin" = rt, + "rtmax" = rt, "into" = as.numeric(spectrum$pspectrum[, "intensity"]), "into_parent" = as.numeric(spectrum$INTENSITY) )) @@ -791,6 +795,10 @@ read.msp <- function(file){ names(cmpnd) <- fields[-pk.idx] if(!("INTENSITY" %in% names(cmpnd))) cmpnd$"INTENSITY" <- 100 + cmpnd$PRECURSORMZ <- gsub(x = cmpnd$PRECURSORMZ, pattern = ",", replacement = ".") + cmpnd$RETENTIONTIME <- gsub(x = cmpnd$RETENTIONTIME, pattern = ",", replacement = ".") + cmpnd$INTENSITY <- gsub(x = cmpnd$INTENSITY, pattern = ",", replacement = ".") + ## minutes to seconds #cmpnd$RETENTIONTIME <- as.numeric(cmpnd$RETENTIONTIME) * 60 @@ -803,8 +811,8 @@ read.msp <- function(file){ stop(paste("Not the right number of peaks in compound '", cmpnd$Name, "' (", npeaks, " vs ", length(pks), ") in file '", file, "'", sep = "")) pklst <- strsplit(x = pks, split = "\t| ") pklst <- lapply(pklst, function(x) x[x != ""]) - cmz <- as.numeric(sapply(pklst, "[[", 1)) - cintens <- as.numeric(sapply(pklst, "[[", 2)) + cmz <- as.numeric(gsub(x = sapply(pklst, "[[", 1), pattern = ",", replacement = ".")) + cintens <- as.numeric(gsub(x = sapply(pklst, "[[", 2), pattern = ",", replacement = ".")) finaltab <- matrix(c(cmz, cintens), ncol = 2) if (any(table(cmz) > 1)) { warning("Duplicate mass in compound ", cmpnd$Name, " (CAS ", cmpnd$CAS, ")... summing up intensities") From 907bf3face1eaf08b77b8d8582c2102b57442d62 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 18 Sep 2018 08:25:12 +0200 Subject: [PATCH 45/71] Refined the calculation of monoisotopic masses additions using enviPat --- R/leCsvAccess.R | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index 8e93128..a20394a 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -331,7 +331,30 @@ knownAdducts <- function(){ } getMonoisotopicMass <- function(formula){ if(!exists("isotopes")) data("isotopes", package = "enviPat") - enviPat::isopattern(isotopes = isotopes, chemforms = formula, threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]] + + if(formula == "") return(0) + + if(grepl(x = formula, pattern = "-")){ + starts <- gregexpr(text = formula, pattern = "[A-Z]")[[1]] + subFormulas <- sapply(X = seq_along(starts), FUN = function(startIdx){ + ifelse( + test = startIdx < length(starts), + yes = substr(x = formula, start = starts[[startIdx]], stop = starts[[startIdx + 1]] - 1), + no = substr(x = formula, start = starts[[startIdx]], stop = nchar(formula)) + ) + }) + + monoisotopicMass <- sum(sapply(X = subFormulas, FUN = function(subFormula){ + ifelse( + test = grepl(x = subFormula, pattern = "-"), + yes = -enviPat::isopattern(isotopes = isotopes, chemforms = gsub(x = subFormula, pattern = "-", replacement = ""), threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]], + no = enviPat::isopattern(isotopes = isotopes, chemforms = subFormula, threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]] + ) + })) + } else { + monoisotopicMass <- enviPat::isopattern(isotopes = isotopes, chemforms = formula, threshold=0.1, charge = FALSE, verbose = FALSE)[[1]][[1,1]] + } + return(monoisotopicMass) } getAdductInformation <- function(formula){ adductDf <- as.data.frame(rbind( From f679035ec3e30594b586fb4c61265f0ca716cdbb Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 2 Oct 2018 09:26:15 +0200 Subject: [PATCH 46/71] Added Li to DBE calculation --- R/formulaCalculator.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/formulaCalculator.R b/R/formulaCalculator.R index a0b4117..c045172 100755 --- a/R/formulaCalculator.R +++ b/R/formulaCalculator.R @@ -179,6 +179,7 @@ dbe <- function(formula) "I" = -0.5, "As" = 2.5, "Hg" = 0, + "Li" = -0.5, "Na" = -0.5, "K" = -0.5 ) From 2ee66712a03b0db6d7013d05bb7c3a8cf1a5ddb5 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 2 Oct 2018 09:26:50 +0200 Subject: [PATCH 47/71] Added further adducts --- R/leCsvAccess.R | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index a20394a..b7e5ba9 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -358,6 +358,40 @@ getMonoisotopicMass <- function(formula){ } getAdductInformation <- function(formula){ adductDf <- as.data.frame(rbind( + + ## strange negative adducts + c(mode = "mpM", addition = formula, charge = -1, adductString = "[2M]-"), + c(mode = "m2H_pHCOOH_pNa", addition = "Na1C1O2", charge = -1, adductString = "[M+HCOOH+Na-2H]-"), + c(mode = "mH_p2H", addition = "H2", charge = -1, adductString = "[M+3H-H]-"), + c(mode = "mH_pH2O", addition = "H1O1", charge = -1, adductString = "[M+H2O-H]-"), + c(mode = "m4H_pM_p3Na", addition = add.formula(formula, "Na3H-4"), charge = -1, adductString = "[2M+3Na-4H]-"), + c(mode = "m2H_mNH3_pNa", addition = add.formula(formula, "Na1N-1H-5"), charge = -1, adductString = "[2M-NH3+Na-2H]-"), + c(mode = "m3H_pM_p2Na", addition = add.formula(formula, "Na2H-3"), charge = -1, adductString = "[2M+2Na-3H]-"), + c(mode = "m3H_pM", addition = add.formula(formula, "H-3"), charge = -1, adductString = "[2M-3H]-"), + c(mode = "mH_p2M", addition = add.formula(formula, add.formula(formula, "H-1")), charge = -1, adductString = "[3M-H]-"), + + ## strange positive adducts + c(mode = "pCOONa", addition = "C1O2Na1", charge = 1, adductString = "[M+COONa]+"), + c(mode = "p3H_c1", addition = "H3", charge = 1, adductString = "[M+3H]+"), + c(mode = "pH2O_c1", addition = "H2O1", charge = 1, adductString = "[M+H2O]+"), + c(mode = "pH_m2H2O", addition = "H-3O-2", charge = 1, adductString = "[M-2H2O+H]+"), + c(mode = "pH_mNH3", addition = "N-1H-2", charge = 1, adductString = "[M-NH3+H]+"), + c(mode = "p2H_c1", addition = "H2", charge = 1, adductString = "[M+2H]+"), + c(mode = "p_mNH3_c1", addition = "N-1H-3", charge = 1, adductString = "[M-NH2-H]+"), + c(mode = "pM_p2Na_m3H_c1", addition = add.formula(formula, "Na2H-3"), charge = 1, adductString = "[2M+2Na-3H]+"), + c(mode = "pM_pNa_m2H_c1", addition = add.formula(formula, "Na1H-2"), charge = 1, adductString = "[2M+Na-2H]+"), + c(mode = "pM_pNa_mH_c1", addition = add.formula(formula, "Na1H-1"), charge = 1, adductString = "[2M+Na-H]+"), + c(mode = "pM_p2Na_m2H_c1", addition = add.formula(formula, "Na2H-2"), charge = 1, adductString = "[2M+2Na-2H]+"), + c(mode = "pM_pH_m2H2O_c1", addition = add.formula(formula, "H-3O-2"), charge = 1, adductString = "[2M-2H2O+H]+"), + c(mode = "pM_pH_mH2O", addition = add.formula(formula, "H-1O-1"), charge = 1, adductString = "[2M-H2O+H]+"), + c(mode = "pM_m2H_c1", addition = add.formula(formula, "H-2"), charge = 1, adductString = "[2M-2H]+"), + c(mode = "pM_pLi", addition = add.formula(formula, "Li1"), charge = 1, adductString = "[2M+Li]+"), + c(mode = "pM_pH_m2O", addition = add.formula(formula, "O-2H1"), charge = 1, adductString = "[2M-2O+H]+"), + c(mode = "pM_pNa_m2O", addition = add.formula(formula, "O-2Na1"), charge = 1, adductString = "[2M-2O+Na]+"), + c(mode = "pM_pH_m3O", addition = add.formula(formula, "O-3H1"), charge = 1, adductString = "[2M-3O+H]+"), + c(mode = "pM_mH_c1", addition = add.formula(formula, "H-1"), charge = 1, adductString = "[2M-H]+"), + c(mode = "pH_c2", addition = "H1", charge = 2, adductString = "[M+H]2+"), + ## M+X c(mode = "pH", addition = "H", charge = 1, adductString = "[M+H]+"), c(mode = "pLi", addition = "Li", charge = 1, adductString = "[M+Li]+"), @@ -379,20 +413,23 @@ getAdductInformation <- function(formula){ c(mode = "pM_pACN_pH", addition = add.formula(formula, "C2H4N1"), charge = 1, adductString = "[2M+ACN+H]+"), ## M-X c(mode = "mH", addition = "H-1", charge = -1, adductString = "[M-H]-"), - c(mode = "mFA", addition = "C1O2", charge = -1, adductString = "[M+HCOOH-H]-"), + c(mode = "mFA", addition = "C1O2H", charge = -1, adductString = "[M+HCOOH-H]-"), c(mode = "mH_mH2O", addition = "H-3O-1", charge = -1, adductString = "[M-H2O-H]-"), c(mode = "m2H_pNa", addition = "H-2Na1", charge = -1, adductString = "[M+Na-2H]-"), c(mode = "mM", addition = "", charge = -1, adductString = "[M]-"), c(mode = "m2H", addition = "H-2", charge = -1, adductString = "[M-2H]-"), ## in case of positively charged compounds ## 2M-X c(mode = "mH_pM", addition = add.formula(formula, "H-1"), charge = -1, adductString = "[2M-H]-"), - c(mode = "mFA_pM", addition = add.formula(formula, "C1O2"), charge = -1, adductString = "[2M+HCOOH-H]-"), + c(mode = "mFA_pM", addition = add.formula(formula, "C1O2H"), charge = -1, adductString = "[2M+HCOOH-H]-"), c(mode = "mH_pM_mH2O", addition = add.formula(formula, "H-3O-1"), charge = -1, adductString = "[2M-H2O-H]-"), c(mode = "m2H_pM_pNa", addition = add.formula(formula, "H-2Na1"), charge = -1, adductString = "[2M+Na-2H]-"), ## ??? c(mode = "", addition = "", charge = 0, adductString = "[M]") ), stringsAsFactors = F) adductDf$charge <- as.integer(adductDf$charge) + + if(any(any(duplicated(adductDf$mode)), any(duplicated(adductDf$adductString)))) stop("Invalid adduct table") + return(adductDf) } getAdductProperties <- function(mode, formula){ From 04032d118093daaae0ebb6a5a4eb07c20af4c9a3 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 2 Oct 2018 09:27:29 +0200 Subject: [PATCH 48/71] Corrected the position of SP in records --- R/createMassBank.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index eebd635..d4d97aa 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -1217,11 +1217,12 @@ readMbdata <- function(row) link[["CHEMSPIDER"]] = row[["CH.LINK.CHEMSPIDER"]] link[which(is.na(link))] <- NULL mbdata[["CH$LINK"]] <- link + ## SP$SAMPLE + if(all(nchar(row[["SP.SAMPLE"]]) > 0, row[["SP.SAMPLE"]] != "NA", !is.na(row[["SP.SAMPLE"]]), na.rm = TRUE)) + mbdata[['SP$SAMPLE']] <- row[["SP.SAMPLE"]] # again, these constants are read from the options: mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument mbdata[['AC$INSTRUMENT_TYPE']] <- getOption("RMassBank")$annotations$instrument_type - if(all(nchar(row[["SP.SAMPLE"]]) > 0, row[["SP.SAMPLE"]] != "NA", !is.na(row[["SP.SAMPLE"]]), na.rm = TRUE)) - mbdata[['SP$SAMPLE']] <- row[["SP.SAMPLE"]] return(mbdata) From 80cdf2d1f922d681315695057018e3a9ec7c04bd Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 2 Oct 2018 10:02:40 +0200 Subject: [PATCH 49/71] Changed order of AC: IONIZATION and AC: ION_MODE as the record specification demands it --- R/createMassBank.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index d4d97aa..3b4497a 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -1295,8 +1295,8 @@ gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval=" # for format 2.01 ac_ms <- list(); ac_ms[['MS_TYPE']] <- getOption("RMassBank")$annotations$ms_type - ac_ms[['IONIZATION']] <- getOption("RMassBank")$annotations$ionization ac_ms[['ION_MODE']] <- mode + ac_ms[['IONIZATION']] <- getOption("RMassBank")$annotations$ionization ## add generic AC$MASS_SPECTROMETRY information properties <- names(getOption("RMassBank")$annotations) From 5aa961500d0e8f0d4c7b3a1cb2c0692290eff546 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 20 Nov 2018 08:12:26 +0100 Subject: [PATCH 50/71] http://cts.fiehnlab.ucdavis.edu/service/compound/ -> https://cts.fiehnlab.ucdavis.edu/service/compound/ --- R/webAccess.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/webAccess.R b/R/webAccess.R index 38433f3..78b77d1 100755 --- a/R/webAccess.R +++ b/R/webAccess.R @@ -151,7 +151,7 @@ getPcId <- function(query, from = "inchikey") #' @note Currently, the CTS results are still incomplete; the name scores are all 0, #' formula and exact mass return zero. #' @references Chemical Translation Service: -#' \url{http://cts.fiehnlab.ucdavis.edu} +#' \url{https://cts.fiehnlab.ucdavis.edu} #' #' @examples #' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") @@ -163,7 +163,7 @@ getPcId <- function(query, from = "inchikey") #' @export getCtsRecord <- function(key) { - baseURL <- "http://cts.fiehnlab.ucdavis.edu/service/compound/" + baseURL <- "https://cts.fiehnlab.ucdavis.edu/service/compound/" errorvar <- 0 currEnvir <- environment() From b918f80382cd937cb29d6ae3481f221bf5b0ccd6 Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 20 Nov 2018 08:12:48 +0100 Subject: [PATCH 51/71] Added adducts --- R/leCsvAccess.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index b7e5ba9..a5d8526 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -363,6 +363,7 @@ getAdductInformation <- function(formula){ c(mode = "mpM", addition = formula, charge = -1, adductString = "[2M]-"), c(mode = "m2H_pHCOOH_pNa", addition = "Na1C1O2", charge = -1, adductString = "[M+HCOOH+Na-2H]-"), c(mode = "mH_p2H", addition = "H2", charge = -1, adductString = "[M+3H-H]-"), + c(mode = "mH_pH", addition = "H1", charge = -1, adductString = "[M+2H-H]-"), c(mode = "mH_pH2O", addition = "H1O1", charge = -1, adductString = "[M+H2O-H]-"), c(mode = "m4H_pM_p3Na", addition = add.formula(formula, "Na3H-4"), charge = -1, adductString = "[2M+3Na-4H]-"), c(mode = "m2H_mNH3_pNa", addition = add.formula(formula, "Na1N-1H-5"), charge = -1, adductString = "[2M-NH3+Na-2H]-"), @@ -375,21 +376,27 @@ getAdductInformation <- function(formula){ c(mode = "p3H_c1", addition = "H3", charge = 1, adductString = "[M+3H]+"), c(mode = "pH2O_c1", addition = "H2O1", charge = 1, adductString = "[M+H2O]+"), c(mode = "pH_m2H2O", addition = "H-3O-2", charge = 1, adductString = "[M-2H2O+H]+"), + c(mode = "pH_pH2O", addition = "H3O1", charge = 1, adductString = "[M+H2O+H]+"), c(mode = "pH_mNH3", addition = "N-1H-2", charge = 1, adductString = "[M-NH3+H]+"), c(mode = "p2H_c1", addition = "H2", charge = 1, adductString = "[M+2H]+"), c(mode = "p_mNH3_c1", addition = "N-1H-3", charge = 1, adductString = "[M-NH2-H]+"), + c(mode = "p_mNH2_pH_c1", addition = "N-1H-1", charge = 1, adductString = "[M-NH2+H]+"), c(mode = "pM_p2Na_m3H_c1", addition = add.formula(formula, "Na2H-3"), charge = 1, adductString = "[2M+2Na-3H]+"), c(mode = "pM_pNa_m2H_c1", addition = add.formula(formula, "Na1H-2"), charge = 1, adductString = "[2M+Na-2H]+"), c(mode = "pM_pNa_mH_c1", addition = add.formula(formula, "Na1H-1"), charge = 1, adductString = "[2M+Na-H]+"), c(mode = "pM_p2Na_m2H_c1", addition = add.formula(formula, "Na2H-2"), charge = 1, adductString = "[2M+2Na-2H]+"), c(mode = "pM_pH_m2H2O_c1", addition = add.formula(formula, "H-3O-2"), charge = 1, adductString = "[2M-2H2O+H]+"), c(mode = "pM_pH_mH2O", addition = add.formula(formula, "H-1O-1"), charge = 1, adductString = "[2M-H2O+H]+"), + c(mode = "pM_pNa_mH2O", addition = add.formula(formula, "H-2O-1Na1"), charge = 1, adductString = "[2M-H2O+Na]+"), c(mode = "pM_m2H_c1", addition = add.formula(formula, "H-2"), charge = 1, adductString = "[2M-2H]+"), + c(mode = "pM_mH_c2", addition = add.formula(formula, "H-1"), charge = 1, adductString = "[2M-2H+H]+"), c(mode = "pM_pLi", addition = add.formula(formula, "Li1"), charge = 1, adductString = "[2M+Li]+"), c(mode = "pM_pH_m2O", addition = add.formula(formula, "O-2H1"), charge = 1, adductString = "[2M-2O+H]+"), c(mode = "pM_pNa_m2O", addition = add.formula(formula, "O-2Na1"), charge = 1, adductString = "[2M-2O+Na]+"), c(mode = "pM_pH_m3O", addition = add.formula(formula, "O-3H1"), charge = 1, adductString = "[2M-3O+H]+"), + c(mode = "pM_pNa_m3O", addition = add.formula(formula, "O-3Na1"), charge = 1, adductString = "[2M-3O+Na]+"), c(mode = "pM_mH_c1", addition = add.formula(formula, "H-1"), charge = 1, adductString = "[2M-H]+"), + c(mode = "pM_mH_pH", addition = formula, charge = 1, adductString = "[2M-H+H]+"), c(mode = "pH_c2", addition = "H1", charge = 2, adductString = "[M+H]2+"), ## M+X @@ -403,6 +410,7 @@ getAdductInformation <- function(formula){ c(mode = "pACN_pH", addition = "C2H4N1", charge = 1, adductString = "[M+ACN+H]+"), c(mode = "pACN_pNa", addition = "C2H3N1Na1", charge = 1, adductString = "[M+ACN+Na]+"), c(mode = "pH_mH2O", addition = "H-1O-1", charge = 2, adductString = "[M-H2O+H]+"), + c(mode = "pNa_mH2O", addition = "H-2O-1Na1", charge = 2, adductString = "[M-H2O+Na]+"), c(mode = "p2H", addition = "H2", charge = 2, adductString = "[M+2H]2+"), c(mode = "pACN_p2H", addition = "C2H5N1", charge = 2, adductString = "[M+ACN+2H]2+"), ## 2M+X @@ -414,6 +422,7 @@ getAdductInformation <- function(formula){ ## M-X c(mode = "mH", addition = "H-1", charge = -1, adductString = "[M-H]-"), c(mode = "mFA", addition = "C1O2H", charge = -1, adductString = "[M+HCOOH-H]-"), + c(mode = "mFA_pH", addition = "C1O2H2", charge = -1, adductString = "[M+HCOOH]-"), c(mode = "mH_mH2O", addition = "H-3O-1", charge = -1, adductString = "[M-H2O-H]-"), c(mode = "m2H_pNa", addition = "H-2Na1", charge = -1, adductString = "[M+Na-2H]-"), c(mode = "mM", addition = "", charge = -1, adductString = "[M]-"), From f93312d0ace28865dc72923833073ce6305f1ffd Mon Sep 17 00:00:00 2001 From: Treutler Date: Tue, 20 Nov 2018 13:49:15 +0100 Subject: [PATCH 52/71] Fixed bug and implemented restarts of web queries for more error tolerance --- R/createMassBank.R | 2 +- R/webAccess.R | 58 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index 3b4497a..7794ede 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -763,7 +763,7 @@ gatherData <- function(id) } link[["INCHIKEY"]] <- inchikey_split - if(length(csid)>0) if(any(!is.na(csid))) link[["CHEMSPIDER"]] <- min(as.numeric(as.character(csid))) + if(length(csid)>0) if(any(!is.na(csid))) link[["CHEMSPIDER"]] <- min(as.numeric(as.character(csid[!is.na(csid)]))) mbdata[['CH$LINK']] <- link mbdata[['AC$INSTRUMENT']] <- getOption("RMassBank")$annotations$instrument diff --git a/R/webAccess.R b/R/webAccess.R index 78b77d1..3e9a56f 100755 --- a/R/webAccess.R +++ b/R/webAccess.R @@ -4,6 +4,38 @@ NULL ## library(RCurl) +retrieveDataWithRetry <- function(url, timeout, maximumNumberOfRetries = 5, retryDelayInSeconds = 3){ + #data <- getURL(URLencode(url), timeout=5) + + data <- NULL + queryIsSuccessful <- FALSE + numberOfRetries <- 0 + while(!queryIsSuccessful & numberOfRetries < maximumNumberOfRetries){ + data <- tryCatch( + expr = { + data <- getURL(url = url, timeout = timeout) + queryIsSuccessful <- TRUE + data + }, + warning=function(w){ + numberOfRetries <<- numberOfRetries + 1 + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### Web query failed (", numberOfRetries, " / ", maximumNumberOfRetries, ") for url '", url, "' because of warning '", w, "'\n", sep = "")) + if(numberOfRetries < maximumNumberOfRetries) + Sys.sleep(time = retryDelayInSeconds) + }, + error=function(e){ + numberOfRetries <<- numberOfRetries + 1 + if(RMassBank.env$verbose.output) + cat(paste("### Warning ### Web query failed (", numberOfRetries, " / ", maximumNumberOfRetries, ") for url '", url, "' because of error '", e, "'\n", sep = "")) + if(numberOfRetries < maximumNumberOfRetries) + Sys.sleep(time = retryDelayInSeconds) + } + ) + } + + return(data) +} #' Retrieve information from Cactus #' @@ -378,16 +410,22 @@ getCSID <- function(query) baseURL <- "http://www.chemspider.com/InChI.asmx/InChIKeyToCSID?inchi_key=" url <- paste0(baseURL, query) - errorvar <- 0 - currEnvir <- environment() - - tryCatch( - data <- getURL(URLencode(url), timeout=5), - error=function(e){ - currEnvir$errorvar <- 1 - }) - - if(errorvar){ + #errorvar <- 0 + #currEnvir <- environment() + # + #tryCatch( + # data <- getURL(URLencode(url), timeout=5), + # error=function(e){ + # currEnvir$errorvar <- 1 + #}) + # + #if(errorvar){ + # warning("Chemspider is currently offline") + # return(NA) + #} + + data <- retrieveDataWithRetry(url = URLencode(url), timeout = 5) + if(is.null(data)){ warning("Chemspider is currently offline") return(NA) } From 89cba7687ba7856e15a6e5a2e08b38598eaecf2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Meier?= Date: Wed, 5 Dec 2018 11:14:23 +0100 Subject: [PATCH 53/71] Do not write out empty PUBLICATION tags. --- R/createMassBank.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index 7e324b8..6ef3a0f 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -1114,7 +1114,9 @@ readMbdata <- function(row) mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - mbdata[['PUBLICATION']] <- getOption("RMassBank")$annotations$publication + if(getOption("RMassBank")$annotations$publication!="") { + mbdata[['PUBLICATION']] <- getOption("RMassBank")$annotations$publication + } # Read all determined fields from the file # This is not very flexible, as you can see... From 278036488c9202b19d8d22aa15d28a43421ae17a Mon Sep 17 00:00:00 2001 From: Kayla-Morrell Date: Fri, 4 Jan 2019 13:46:55 -0500 Subject: [PATCH 54/71] Add ImmunoOncology biocViews term --- DESCRIPTION | 126 ++++++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index a50a037..3fc975f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,63 +1,63 @@ -Package: RMassBank -Type: Package -Title: Workflow to process tandem MS files and build MassBank records -Version: 2.11.0 -Authors@R: c( - person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", - role=c("cre")), - person(given = "Michael A.", family = "Stravs", email = - "michael.stravs@eawag.ch", role=c("aut")), person(given = "Emma L.", - family = "Schymanski", email = "emma.schymanski@eawag.ch", role=c("aut")), - person(given = "Steffen", family = "Neumann", role = "aut", email = - "sneumann@ipb-halle.de"), person(given = "Erik", family = "Muller", role = - "aut", email = "erik.mueller@student.uni-halle.de"), person(given = - "Tobias", family = "Schulze", role = "ctb", email = - "tobias.schulze@ufz.de") ) -Author: Michael Stravs, Emma Schymanski, Steffen Neumann, Erik Mueller, with - contributions from Tobias Schulze -Maintainer: RMassBank at Eawag -Description: Workflow to process tandem MS files and build MassBank records. - Functions include automated extraction of tandem MS spectra, formula - assignment to tandem MS fragments, recalibration of tandem MS spectra with - assigned fragments, spectrum cleanup, automated retrieval of compound - information from Internet databases, and export to MassBank records. -License: Artistic-2.0 -SystemRequirements: OpenBabel -biocViews: Bioinformatics, MassSpectrometry, Metabolomics, Software -Depends: - Rcpp -Encoding: UTF-8 -Imports: - XML,RCurl,rjson,S4Vectors,digest, - rcdk,yaml,mzR,methods,Biobase,MSnbase,httr -Suggests: - gplots,RMassBankData, - xcms (>= 1.37.1), - CAMERA, - RUnit, - enviPat -Collate: - 'alternateAnalyze.R' - 'createMassBank.R' - 'formulaCalculator.R' - 'getSplash.R' - 'leCsvAccess.R' - 'leMsMs.r' - 'leMsmsRaw.R' - 'msmsRawExtensions.r' - 'settings_example.R' - 'webAccess.R' - 'deprofile.R' - 'parseMassBank.R' - 'SpectrumClasses.R' - 'SpectrumMethods.R' - 'RmbWorkspace.R' - 'RmbWorkspaceUpdate.R' - 'SpectraSetMethods.R' - 'AggregateMethods.R' - 'validateMassBank.R' - 'tools.R' - 'msmsRead.R' - 'Isotopic_Annotation.R' - 'zzz.R' -RoxygenNote: 6.1.0 +Package: RMassBank +Type: Package +Title: Workflow to process tandem MS files and build MassBank records +Version: 2.11.1 +Authors@R: c( + person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", + role=c("cre")), + person(given = "Michael A.", family = "Stravs", email = + "michael.stravs@eawag.ch", role=c("aut")), person(given = "Emma L.", + family = "Schymanski", email = "emma.schymanski@eawag.ch", role=c("aut")), + person(given = "Steffen", family = "Neumann", role = "aut", email = + "sneumann@ipb-halle.de"), person(given = "Erik", family = "Muller", role = + "aut", email = "erik.mueller@student.uni-halle.de"), person(given = + "Tobias", family = "Schulze", role = "ctb", email = + "tobias.schulze@ufz.de") ) +Author: Michael Stravs, Emma Schymanski, Steffen Neumann, Erik Mueller, with + contributions from Tobias Schulze +Maintainer: RMassBank at Eawag +Description: Workflow to process tandem MS files and build MassBank records. + Functions include automated extraction of tandem MS spectra, formula + assignment to tandem MS fragments, recalibration of tandem MS spectra with + assigned fragments, spectrum cleanup, automated retrieval of compound + information from Internet databases, and export to MassBank records. +License: Artistic-2.0 +SystemRequirements: OpenBabel +biocViews: ImmunoOncology, Bioinformatics, MassSpectrometry, Metabolomics, Software +Depends: + Rcpp +Encoding: UTF-8 +Imports: + XML,RCurl,rjson,S4Vectors,digest, + rcdk,yaml,mzR,methods,Biobase,MSnbase,httr +Suggests: + gplots,RMassBankData, + xcms (>= 1.37.1), + CAMERA, + RUnit, + enviPat +Collate: + 'alternateAnalyze.R' + 'createMassBank.R' + 'formulaCalculator.R' + 'getSplash.R' + 'leCsvAccess.R' + 'leMsMs.r' + 'leMsmsRaw.R' + 'msmsRawExtensions.r' + 'settings_example.R' + 'webAccess.R' + 'deprofile.R' + 'parseMassBank.R' + 'SpectrumClasses.R' + 'SpectrumMethods.R' + 'RmbWorkspace.R' + 'RmbWorkspaceUpdate.R' + 'SpectraSetMethods.R' + 'AggregateMethods.R' + 'validateMassBank.R' + 'tools.R' + 'msmsRead.R' + 'Isotopic_Annotation.R' + 'zzz.R' +RoxygenNote: 6.1.0 From 32e21fa2548f0a0f3868423b00a9b77ff2648599 Mon Sep 17 00:00:00 2001 From: Egon Willighagen Date: Wed, 3 Apr 2019 11:16:32 +0200 Subject: [PATCH 55/71] Typo :) --- vignettes/RMassBank.Rnw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/RMassBank.Rnw b/vignettes/RMassBank.Rnw index 2df68b6..cd20fea 100755 --- a/vignettes/RMassBank.Rnw +++ b/vignettes/RMassBank.Rnw @@ -1,5 +1,5 @@ % \VignetteIndexEntry{RMassBank walkthrough} -% \VignettePackage{rcdk} +% \VignettePackage{RMassBank} % \VignetteKeywords{} %% To generate the Latex code %library(RMassBank) From 28cc175e7b708097f53ee6550fecc53202e1e14b Mon Sep 17 00:00:00 2001 From: Steffen Neumann Date: Wed, 3 Apr 2019 20:47:35 +0200 Subject: [PATCH 56/71] Bump version --- DESCRIPTION | 2 +- inst/NEWS | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index adeaffc..cf00f27 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 2.11.2 +Version: 2.11.3 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")), diff --git a/inst/NEWS b/inst/NEWS index 9952729..3fdbfba 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,7 @@ +Changes in version 2.11.3 + +- Avoid writing out empty PUBLICATIONS + Changes in version 2.11.1 - Fix error in workflow steps 2 and 7 with rcdk >= 3.4.9 From 31f79498573f2929adac210b4012c0bb422969c0 Mon Sep 17 00:00:00 2001 From: Steffen Neumann Date: Wed, 3 Apr 2019 20:49:54 +0200 Subject: [PATCH 57/71] Bump version --- DESCRIPTION | 2 +- inst/NEWS | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3fc975f..78f2eb6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 2.11.1 +Version: 2.11.2 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")), diff --git a/inst/NEWS b/inst/NEWS index 3fdbfba..1a5c7bd 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,4 +1,4 @@ -Changes in version 2.11.3 +Changes in version 2.11.2 - Avoid writing out empty PUBLICATIONS From 60b00a19c28d1abe28e92907128de93d750620cf Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 5 Apr 2019 07:06:30 +0200 Subject: [PATCH 58/71] Added adducts --- R/leCsvAccess.R | 103 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 35 deletions(-) diff --git a/R/leCsvAccess.R b/R/leCsvAccess.R index a5d8526..bb74462 100755 --- a/R/leCsvAccess.R +++ b/R/leCsvAccess.R @@ -359,19 +359,49 @@ getMonoisotopicMass <- function(formula){ getAdductInformation <- function(formula){ adductDf <- as.data.frame(rbind( - ## strange negative adducts - c(mode = "mpM", addition = formula, charge = -1, adductString = "[2M]-"), - c(mode = "m2H_pHCOOH_pNa", addition = "Na1C1O2", charge = -1, adductString = "[M+HCOOH+Na-2H]-"), - c(mode = "mH_p2H", addition = "H2", charge = -1, adductString = "[M+3H-H]-"), - c(mode = "mH_pH", addition = "H1", charge = -1, adductString = "[M+2H-H]-"), - c(mode = "mH_pH2O", addition = "H1O1", charge = -1, adductString = "[M+H2O-H]-"), - c(mode = "m4H_pM_p3Na", addition = add.formula(formula, "Na3H-4"), charge = -1, adductString = "[2M+3Na-4H]-"), - c(mode = "m2H_mNH3_pNa", addition = add.formula(formula, "Na1N-1H-5"), charge = -1, adductString = "[2M-NH3+Na-2H]-"), - c(mode = "m3H_pM_p2Na", addition = add.formula(formula, "Na2H-3"), charge = -1, adductString = "[2M+2Na-3H]-"), - c(mode = "m3H_pM", addition = add.formula(formula, "H-3"), charge = -1, adductString = "[2M-3H]-"), - c(mode = "mH_p2M", addition = add.formula(formula, add.formula(formula, "H-1")), charge = -1, adductString = "[3M-H]-"), - - ## strange positive adducts + ## positive: M+X + c(mode = "pH", addition = "H", charge = 1, adductString = "[M+H]+"), + c(mode = "pLi", addition = "Li", charge = 1, adductString = "[M+Li]+"), + c(mode = "pNa", addition = "Na", charge = 1, adductString = "[M+Na]+"), + c(mode = "pNa_mO3S_mH", addition = "Na1O-3S-1H-1", charge = 1, adductString = "[M-O3S-H+Na]+"), + c(mode = "pK", addition = "K", charge = 1, adductString = "[M+K]+"), + c(mode = "pM", addition = "", charge = 1, adductString = "[M]+"), + c(mode = "pM_mC7H11NO9S2", addition = "C-7H-11NO-9S-2", charge = 1, adductString = "[M-C7H11NO9S2]+"), + c(mode = "pM_mC7H12NO9S2", addition = "C-7H-12NO-9S-2", charge = 1, adductString = "[M-C7H12NO9S2]+"), + c(mode = "pNH4", addition = "NH4", charge = 1, adductString = "[M+NH4]+"), + c(mode = "p2Na_mH", addition = "Na2H-1", charge = 1, adductString = "[M+2Na-H]+"), + c(mode = "pACN_pH", addition = "C2H4N1", charge = 1, adductString = "[M+ACN+H]+"), + c(mode = "pACN_pNa", addition = "C2H3N1Na1", charge = 1, adductString = "[M+ACN+Na]+"), + c(mode = "pH_mC7H6O", addition = "C-7H-5O-1", charge = 1, adductString = "[M-C7H6O+H]+"), + c(mode = "pH_mC18H30O14", addition = "C-18H-29O-14", charge = 1, adductString = "[M-C18H30O14+H]+"), + c(mode = "pH_mC6H10O5", addition = "C-6H-9O-5", charge = 1, adductString = "[M-C6H10O5+H]+"), + c(mode = "pH_mC12H20O9", addition = "C-12H-19O-9", charge = 1, adductString = "[M-C12H20O9+H]+"), + c(mode = "pH_mC9H8O4_mH2O", addition = "C-9H-9O-5", charge = 1, adductString = "[M-C9H8O4-H2O+H]+"), + c(mode = "pH_mC6H10O5_mH2O", addition = "C-6H-11O-6", charge = 1, adductString = "[M-C6H10O5-H2O+H]+"), + c(mode = "pH_mC5H8NO4", addition = "C-5H-7N-1O-4", charge = 1, adductString = "[M-C5H8NO4+H]+"), + c(mode = "pH_mO3S", addition = "O-3S-1H1", charge = 1, adductString = "[M-O3S+H]+"), + c(mode = "pH_mC6H10O8S", addition = "C-6H-9O-8S-1", charge = 1, adductString = "[M-C6H10O8S+H]+"), + c(mode = "pH_mC5H10N2O", addition = "C-5H-9N-2O-1", charge = 1, adductString = "[M-C5H10N2O+H]+"), + c(mode = "pH_mHO3P", addition = "O-3P-1", charge = 1, adductString = "[M-HO3P+H]+"), + c(mode = "pH_mC4H7", addition = "C-4H-6", charge = 1, adductString = "[M-C4H7+H]+"), + c(mode = "pH_mC6H10O4", addition = "C-6H-9O-4", charge = 1, adductString = "[M-C6H10O4+H]+"), + c(mode = "pH_mC5H8O3", addition = "C-5H-7O-3", charge = 1, adductString = "[M-C5H8O3+H]+"), + c(mode = "pH_mCO", addition = "H-1C-1O-1", charge = 1, adductString = "[M-CO+H]+"), + c(mode = "pH_mO3", addition = "H-1O-3", charge = 1, adductString = "[M-O3+H]+"), + c(mode = "pH_mC3H6", addition = "C-3H-5", charge = 1, adductString = "[M-C3H6+H]+"), + c(mode = "pH_mC4H3O5", addition = "C-4H-2O-5", charge = 1, adductString = "[M-C4H3O5+H]+"), + c(mode = "pH_mC6H11O6", addition = "C-6H-10O-6", charge = 1, adductString = "[M-C6H11O6+H]+"), + c(mode = "pH_mH2O", addition = "H-1O-1", charge = 2, adductString = "[M-H2O+H]+"), + c(mode = "pNa_mH2O", addition = "H-2O-1Na1", charge = 2, adductString = "[M-H2O+Na]+"), + c(mode = "p2H", addition = "H2", charge = 2, adductString = "[M+2H]2+"), + c(mode = "pACN_p2H", addition = "C2H5N1", charge = 2, adductString = "[M+ACN+2H]2+"), + ## positive: 2M+X + c(mode = "pM_pH", addition = add.formula(formula, "H1"), charge = 1, adductString = "[2M+H]+"), + c(mode = "pM_pK", addition = add.formula(formula, "K1"), charge = 1, adductString = "[2M+K]+"), + c(mode = "pM_pNa", addition = add.formula(formula, "Na1"), charge = 1, adductString = "[2M+Na]+"), + c(mode = "pM_pNH4", addition = add.formula(formula, "N1H4"), charge = 1, adductString = "[2M+NH4]+"), + c(mode = "pM_pACN_pH", addition = add.formula(formula, "C2H4N1"), charge = 1, adductString = "[2M+ACN+H]+"), + ## positive: strange positive adducts c(mode = "pCOONa", addition = "C1O2Na1", charge = 1, adductString = "[M+COONa]+"), c(mode = "p3H_c1", addition = "H3", charge = 1, adductString = "[M+3H]+"), c(mode = "pH2O_c1", addition = "H2O1", charge = 1, adductString = "[M+H2O]+"), @@ -399,39 +429,42 @@ getAdductInformation <- function(formula){ c(mode = "pM_mH_pH", addition = formula, charge = 1, adductString = "[2M-H+H]+"), c(mode = "pH_c2", addition = "H1", charge = 2, adductString = "[M+H]2+"), - ## M+X - c(mode = "pH", addition = "H", charge = 1, adductString = "[M+H]+"), - c(mode = "pLi", addition = "Li", charge = 1, adductString = "[M+Li]+"), - c(mode = "pNa", addition = "Na", charge = 1, adductString = "[M+Na]+"), - c(mode = "pK", addition = "K", charge = 1, adductString = "[M+K]+"), - c(mode = "pM", addition = "", charge = 1, adductString = "[M]+"), - c(mode = "pNH4", addition = "NH4", charge = 1, adductString = "[M+NH4]+"), - c(mode = "p2Na_mH", addition = "Na2H-1", charge = 1, adductString = "[M+2Na-H]+"), - c(mode = "pACN_pH", addition = "C2H4N1", charge = 1, adductString = "[M+ACN+H]+"), - c(mode = "pACN_pNa", addition = "C2H3N1Na1", charge = 1, adductString = "[M+ACN+Na]+"), - c(mode = "pH_mH2O", addition = "H-1O-1", charge = 2, adductString = "[M-H2O+H]+"), - c(mode = "pNa_mH2O", addition = "H-2O-1Na1", charge = 2, adductString = "[M-H2O+Na]+"), - c(mode = "p2H", addition = "H2", charge = 2, adductString = "[M+2H]2+"), - c(mode = "pACN_p2H", addition = "C2H5N1", charge = 2, adductString = "[M+ACN+2H]2+"), - ## 2M+X - c(mode = "pM_pH", addition = add.formula(formula, "H1"), charge = 1, adductString = "[2M+H]+"), - c(mode = "pM_pK", addition = add.formula(formula, "K1"), charge = 1, adductString = "[2M+K]+"), - c(mode = "pM_pNa", addition = add.formula(formula, "Na1"), charge = 1, adductString = "[2M+Na]+"), - c(mode = "pM_pNH4", addition = add.formula(formula, "N1H4"), charge = 1, adductString = "[2M+NH4]+"), - c(mode = "pM_pACN_pH", addition = add.formula(formula, "C2H4N1"), charge = 1, adductString = "[2M+ACN+H]+"), - ## M-X + + ## negative: M-X c(mode = "mH", addition = "H-1", charge = -1, adductString = "[M-H]-"), + c(mode = "mCl", addition = "Cl-1", charge = -1, adductString = "[M+Cl]-"), c(mode = "mFA", addition = "C1O2H", charge = -1, adductString = "[M+HCOOH-H]-"), + c(mode = "mH_pTFA", addition = "C2F3O2", charge = -1, adductString = "[M+CF3CO2H-H]-"), + + c(mode = "mH_mC6H10O5", addition = "C-6H-11O-5", charge = -1, adductString = "[M-C6H10O5-H]-"), + c(mode = "mFA_pH", addition = "C1O2H2", charge = -1, adductString = "[M+HCOOH]-"), c(mode = "mH_mH2O", addition = "H-3O-1", charge = -1, adductString = "[M-H2O-H]-"), + c(mode = "mCO2", addition = "C-1O-2", charge = -1, adductString = "[M-CO2]-"), + c(mode = "mH_mCH3", addition = "C-1H-4", charge = -1, adductString = "[M-CH3-H]-"), + c(mode = "mH_mCO2", addition = "C-1H-1O-2", charge = -1, adductString = "[M-CO2-H]-"), + c(mode = "mCH3", addition = "C-1H-3", charge = -1, adductString = "[M-CH3]-"), c(mode = "m2H_pNa", addition = "H-2Na1", charge = -1, adductString = "[M+Na-2H]-"), c(mode = "mM", addition = "", charge = -1, adductString = "[M]-"), c(mode = "m2H", addition = "H-2", charge = -1, adductString = "[M-2H]-"), ## in case of positively charged compounds - ## 2M-X + c(mode = "m2H_c2", addition = "H-2", charge = -2, adductString = "[M-2H]2-"), + ## negative: 2M-X c(mode = "mH_pM", addition = add.formula(formula, "H-1"), charge = -1, adductString = "[2M-H]-"), c(mode = "mFA_pM", addition = add.formula(formula, "C1O2H"), charge = -1, adductString = "[2M+HCOOH-H]-"), c(mode = "mH_pM_mH2O", addition = add.formula(formula, "H-3O-1"), charge = -1, adductString = "[2M-H2O-H]-"), c(mode = "m2H_pM_pNa", addition = add.formula(formula, "H-2Na1"), charge = -1, adductString = "[2M+Na-2H]-"), + ## negative: strange adducts + c(mode = "mpM", addition = formula, charge = -1, adductString = "[2M]-"), + c(mode = "m2H_pHCOOH_pNa", addition = "Na1C1O2", charge = -1, adductString = "[M+HCOOH+Na-2H]-"), + c(mode = "mH_p2H", addition = "H2", charge = -1, adductString = "[M+3H-H]-"), + c(mode = "mH_pH", addition = "H1", charge = -1, adductString = "[M+2H-H]-"), + c(mode = "mH_pH2O", addition = "H1O1", charge = -1, adductString = "[M+H2O-H]-"), + c(mode = "m4H_pM_p3Na", addition = add.formula(formula, "Na3H-4"), charge = -1, adductString = "[2M+3Na-4H]-"), + c(mode = "m2H_mNH3_pNa", addition = add.formula(formula, "Na1N-1H-5"), charge = -1, adductString = "[2M-NH3+Na-2H]-"), + c(mode = "m3H_pM_p2Na", addition = add.formula(formula, "Na2H-3"), charge = -1, adductString = "[2M+2Na-3H]-"), + c(mode = "m3H_pM", addition = add.formula(formula, "H-3"), charge = -1, adductString = "[2M-3H]-"), + c(mode = "mH_p2M", addition = add.formula(formula, add.formula(formula, "H-1")), charge = -1, adductString = "[3M-H]-"), + ## ??? c(mode = "", addition = "", charge = 0, adductString = "[M]") ), stringsAsFactors = F) From 260b0d5cc8848dbe7a228061309249b063560fe9 Mon Sep 17 00:00:00 2001 From: Treutler Date: Fri, 5 Apr 2019 07:07:11 +0200 Subject: [PATCH 59/71] Added the possibility to add comments with any sub tag --- R/createMassBank.R | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index 7794ede..c40ced7 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -84,7 +84,11 @@ loadInfolist <- function(mb, fileName) return(r) })), stringsAsFactors = FALSE) # use only the columns present in mbdata_archive, no other columns added in excel - mbdata_new <- mbdata_new[, colnames(mb@mbdata_archive)] + colNames <- colnames(mb@mbdata_archive) + commentColNames <- colnames(mbdata_new)[grepl(x = colnames(mbdata_new), pattern = "^COMMENT\\.(?!CONFIDENCE)(?!ID)", perl = TRUE)] + colNames <- c(colNames, commentColNames) + + mbdata_new <- mbdata_new[, colNames] # substitute the old entires with the ones from our files # then find the new (previously inexistent) entries, and rbind them to the table new_entries <- setdiff(mbdata_new$id, mb@mbdata_archive$id) @@ -629,7 +633,7 @@ gatherData <- function(id) # COMMENT: EAWAG_UCHEM_ID 1234 # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" mbdata[["COMMENT"]] <- list() - if(findLevel(id) == "0"){ + if(findLevel(id) == "0"){ mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment } else{ level <- findLevel(id) @@ -668,7 +672,16 @@ gatherData <- function(id) } } - mbdata[["COMMENT"]][["ID"]] = id + mbdata[["COMMENT"]][["ID"]] = id + + ## add generic COMMENT information + rowIdx <- which(.listEnvEnv$listEnv$compoundList$ID == id) + properties <- colnames(.listEnvEnv$listEnv$compoundList) + properties2 <- gsub(x = properties, pattern = "^COMMENT ", replacement = "") + theseProperties <- grepl(x = properties, pattern = "^COMMENT ") + theseProperties <- theseProperties & (!(unlist(.listEnvEnv$listEnv$compoundList[rowIdx, ]) == "NA" | is.na(unlist(.listEnvEnv$listEnv$compoundList[rowIdx, ])))) + mbdata[["COMMENT"]][properties2[theseProperties]] <- unlist(.listEnvEnv$listEnv$compoundList[rowIdx, theseProperties]) + # here compound info starts mbdata[['CH$NAME']] <- names # Currently we use a fixed value for Compound Class, since there is no useful @@ -1098,16 +1111,20 @@ flatten <- function(mbdata) { .checkMbSettings() + colNames <- names(unlist(mbdata[[1]])) + commentNames <- colNames[grepl(x = colNames, pattern = "^COMMENT\\.")] + colList <- c( "id", "dbcas", "dbname", "dataused", - "COMMENT.CONFIDENCE", + commentNames, + #"COMMENT.CONFIDENCE", # Note: The field name of the internal id field is replaced with the real name # at "compilation" time. Therefore, functions DOWNSTREAM from compileRecord() # must use the full name including the info from options("RMassBank"). - "COMMENT.ID", + #"COMMENT.ID", "CH$NAME1", "CH$NAME2", "CH$NAME3", @@ -1168,11 +1185,15 @@ readMbdata <- function(row) mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright mbdata[['PUBLICATION']] <- getOption("RMassBank")$annotations$publication + commentNames <- names(row)[grepl(x = names(row), pattern = "^COMMENT\\.")] + commentNames <- commentNames[!is.na(row[commentNames])] + # Read all determined fields from the file # This is not very flexible, as you can see... colList <- c( - "COMMENT.CONFIDENCE", - "COMMENT.ID", + commentNames, + #"COMMENT.CONFIDENCE", + #"COMMENT.ID", "CH$NAME1", "CH$NAME2", "CH$NAME3", @@ -1190,11 +1211,11 @@ readMbdata <- function(row) "CH$LINK.INCHIKEY", "CH$LINK.CHEMSPIDER") mbdata[["COMMENT"]] = list() - mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT.CONFIDENCE"]] + #mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT.CONFIDENCE"]] # Again, our ID field. + #mbdata[["COMMENT"]][["ID"]] <- row[["COMMENT.ID"]] + mbdata[["COMMENT"]][gsub(x = commentNames, pattern = "^COMMENT\\.", replacement = "")] <- row[commentNames] - mbdata[["COMMENT"]][["ID"]]<- - row[["COMMENT.ID"]] names = c(row[["CH.NAME1"]], row[["CH.NAME2"]], row[["CH.NAME3"]]) names = names[which(!is.na(names))] @@ -1300,9 +1321,10 @@ gatherCompound <- function(spec, aggregated, additionalPeaks = NULL, retrieval=" ## add generic AC$MASS_SPECTROMETRY information properties <- names(getOption("RMassBank")$annotations) + presentProperties <- names(ac_ms)#c('MS_TYPE', 'IONIZATION', 'ION_MODE')#, 'FRAGMENTATION_MODE', 'COLLISION_ENERGY', 'RESOLUTION') + theseProperties <- grepl(x = properties, pattern = "^AC\\$MASS_SPECTROMETRY_") properties2 <- gsub(x = properties, pattern = "^AC\\$MASS_SPECTROMETRY_", replacement = "") - presentProperties <- names(ac_ms)#c('MS_TYPE', 'IONIZATION', 'ION_MODE')#, 'FRAGMENTATION_MODE', 'COLLISION_ENERGY', 'RESOLUTION') theseProperties <- theseProperties & !(properties2 %in% presentProperties) theseProperties <- theseProperties & (unlist(getOption("RMassBank")$annotations) != "NA") ac_ms[properties2[theseProperties]] <- unlist(getOption("RMassBank")$annotations[theseProperties]) From e2989309782a5fed2d226d2c78a4dd462777ea38 Mon Sep 17 00:00:00 2001 From: meowcat Date: Fri, 5 Apr 2019 14:40:38 +0200 Subject: [PATCH 60/71] description fixed --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 06fd930..295bb08 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,4 +62,4 @@ Collate: 'msmsRead.R' 'Isotopic_Annotation.R' 'zzz.R' -RoxygenNote: 6.1.0 +RoxygenNote: 6.1.1 From 1fab56e75ed55b86c4dd9c5e803fa9274cbbd9c7 Mon Sep 17 00:00:00 2001 From: meowcat Date: Fri, 5 Apr 2019 15:35:05 +0200 Subject: [PATCH 61/71] better way to handle childIdx "transmission", fixed code for rcdk 3.4.9.2 --- R/leMsMs.r | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/R/leMsMs.r b/R/leMsMs.r index 2e9aa87..cf37e6e 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -544,11 +544,9 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi # filtering out low-intensity (<1e4) and shoulder peaks (deltam/z < 0.5, intensity # < 5%) and subsequently matching the peaks to formulas using Rcdk, discarding peaks # with insufficient match accuracy or no match. - analyzeTandemShot <- function(child) + analyzeTandemShot <- function(child, childIdx = 0) { - childIdx <- which(sapply(X = seq_along(msmsPeaks@children), FUN = function(i){ - all(child@mz == msmsPeaks@children[[i]]@mz) & all(child@rt == msmsPeaks@children[[i]]@rt) & all(child@intensity == msmsPeaks@children[[i]]@intensity) } - )) + shot <- getData(child) shot$row <- which(!is.na(shot$mz)) @@ -644,12 +642,9 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi # finally back-correct calculated masses for the charge mass <- shot.row[["mz"]] mass.calc <- mass + mode.charge * .emass - peakformula <- tryCatch(suppressWarnings( + peakformula <- suppressWarnings( rcdk::generate.formula(mass = mass.calc, window = ppm(mass.calc, ppmlimit, p=TRUE), elements = limits, charge=0) - ), error=function(e) - ## in case of zero formulas: Error in .jcall(mfSet, "I", "size") : RcallMethod: invalid object parameter - NA - ) + ) #peakformula <- tryCatch( # generate.formula(mass, # ppm(mass, ppmlimit, p=TRUE), @@ -834,7 +829,9 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi ## ## mzmin <- min(mzranges[,1], na.rm=TRUE) ## mzmax <- max(mzranges[,2], na.rm=TRUE) - children <- lapply(msmsPeaks@children, analyzeTandemShot) + children <- lapply(seq_along(msmsPeaks@children), + function(i) analyzeTandemShot(msmsPeaks@children[[i]], + childIdx = i)) ## correct fields in case of invalid data children <- lapply(children, function(child){ @@ -1823,8 +1820,9 @@ reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, #print(parent_formula) limits <- to.limits.rcdk(parent_formula) - peakformula <- tryCatch(suppressWarnings(rcdk::generate.formula(mass, ppm(mass, ppmlimit, p=TRUE), - limits, charge=mode.charge)) + peakformula <- tryCatch(suppressWarnings(rcdk::generate.formula( + mass = mass, window = ppm(mass, ppmlimit, p=TRUE), + elements = limits, charge=mode.charge))) # was a formula found? If not, return empty result if(length(peakformula)==0) return(as.data.frame( From b95b9399d033403cc69234c57d1d24e29422d778 Mon Sep 17 00:00:00 2001 From: meowcat Date: Wed, 10 Apr 2019 18:36:01 +0200 Subject: [PATCH 62/71] Fixes to Treutler workflow, vignette now builds almost completely --- R/createMassBank.R | 12 ++++++++---- R/settings_example.R | 1 + inst/RMB_options.ini | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index bd6455f..27dfcf4 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -121,12 +121,16 @@ resetInfolists <- function(mb) CH.IUPAC = character(0), CH.LINK.CAS = character(0), CH.LINK.CHEBI = integer(0), CH.LINK.HMDB = character(0), CH.LINK.KEGG = character(0), CH.LINK.LIPIDMAPS = character(0), CH.LINK.PUBCHEM = character(0), CH.LINK.INCHIKEY = character(0), - CH.LINK.CHEMSPIDER = integer(0), SP.SAMPLE = character(0)), .Names = c("X", "id", "dbcas", + CH.LINK.CHEMSPIDER = integer(0)), .Names = c("X", "id", "dbcas", "dbname", "dataused", "COMMENT.CONFIDENCE", "COMMENT.ID", "CH.NAME1", "CH.NAME2", "CH.NAME3", "CH.COMPOUND_CLASS", "CH.FORMULA", "CH.EXACT_MASS", "CH.SMILES", "CH.IUPAC", "CH.LINK.CAS", "CH.LINK.CHEBI", "CH.LINK.HMDB", "CH.LINK.KEGG", "CH.LINK.LIPIDMAPS", "CH.LINK.PUBCHEM", - "CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER", "SP.SAMPLE"), row.names = integer(0), class = "data.frame") + "CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER"), row.names = integer(0), class = "data.frame") + if(getOption("RMassBank")$include_sp_tags) + { + mb@mbdata_archive["SP.SAMPLE"] <- character(0) + } return(mb) } @@ -1401,8 +1405,8 @@ gatherSpectrum <- function(spec, msmsdata, ac_ms, ac_lc, aggregated, additionalP ms_fi[['BASE_PEAK']] <- round(mz(spec@parent)[which.max(intensity(spec@parent))],4) ms_fi[['PRECURSOR_M/Z']] <- round(precursorMz$mzCenter,4) ms_fi[['PRECURSOR_TYPE']] <- adductString - if(all(!is.na(spec@parent@intensity), spec@parent@intensity != 0, spec@parent@intensity != 100, na.rm = TRUE)) - ms_fi[['PRECURSOR_INTENSITY']] <- spec@parent@intensity + if(all(!is.na(msmsdata@precursorIntensity), msmsdata@precursorIntensity != 0, msmsdata@precursorIntensity != 100, na.rm = TRUE)) + ms_fi[['PRECURSOR_INTENSITY']] <- msmsdata@precursorIntensity # Select all peaks which belong to this spectrum (correct cpdID and scan no.) # from peaksOK diff --git a/R/settings_example.R b/R/settings_example.R index b8fb266..b3ada46 100755 --- a/R/settings_example.R +++ b/R/settings_example.R @@ -215,6 +215,7 @@ NULL "RECALIBRATE" = "loess on assigned fragments and MS1" ) ), + include_sp_tags = FALSE, # List of data-dependent scans in their order (relative to the parent scan) # list(mode, ces, ce, res): # mode: fragmentation mode diff --git a/inst/RMB_options.ini b/inst/RMB_options.ini index 10574b1..3718cd9 100755 --- a/inst/RMB_options.ini +++ b/inst/RMB_options.ini @@ -64,6 +64,8 @@ annotations: ms_dataprocessing: RECALIBRATE: loess on assigned fragments and MS1 +include_sp_tags: FALSE + # Annotator: # by default, "annotator.default" is used. # If you want to build your custom annotator (check ?annotator.default and the source code), From 49518a43b3d024bc64407e86ddf21c47a38a78dd Mon Sep 17 00:00:00 2001 From: meowcat Date: Thu, 18 Apr 2019 09:56:57 +0200 Subject: [PATCH 63/71] Fixing formula generation for current CRAN rcdk 3.4.7.1 and rcdklibs 2.0.0 --- DESCRIPTION | 2 +- R/leMsMs.r | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 78f2eb6..39e0628 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 2.11.2 +Version: 2.11.3 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")), diff --git a/R/leMsMs.r b/R/leMsMs.r index 81c1a57..3401c87 100755 --- a/R/leMsMs.r +++ b/R/leMsMs.r @@ -612,8 +612,10 @@ analyzeMsMs.formula <- function(msmsPeaks, mode="pH", detail=FALSE, run="prelimi # finally back-correct calculated masses for the charge mass <- shot.row[["mz"]] mass.calc <- mass + mode.charge * .emass - peakformula <- suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), - limits, charge=0)) + peakformula <- tryCatch( + suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), + limits, charge=0)), + error = function(e) list()) if(length(peakformula)==0) return(t(c(row=shot.row[["row"]], intensity = shot.row[["intensity"]], mz=mass, @@ -1758,8 +1760,10 @@ reanalyzeFailpeak <- function(custom_additions, mass, cpdID, counter, pb = NULL, #print(parent_formula) limits <- to.limits.rcdk(parent_formula) - peakformula <- suppressWarnings(generate.formula(mass, ppm(mass, ppmlimit, p=TRUE), - limits, charge=mode.charge)) + peakformula <- tryCatch( + suppressWarnings(generate.formula(mass.calc, ppm(mass.calc, ppmlimit, p=TRUE), + limits, charge=0)), + error = function(e) list()) # was a formula found? If not, return empty result if(length(peakformula)==0) return(as.data.frame( From 424295b3294d454155ed65be346729256e3fab38 Mon Sep 17 00:00:00 2001 From: Nitesh Turaga Date: Thu, 2 May 2019 11:53:37 -0400 Subject: [PATCH 64/71] bump x.y.z versions to even y prior to creation of RELEASE_3_9 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 39e0628..1e3edd6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 2.11.3 +Version: 2.12.0 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")), From fc1a11f0bd863718e64aecfedc165c5f0d5bf477 Mon Sep 17 00:00:00 2001 From: Nitesh Turaga Date: Thu, 2 May 2019 12:11:14 -0400 Subject: [PATCH 65/71] bump x.y.z versions to odd y after creation of RELEASE_3_9 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1e3edd6..5ab1353 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 2.12.0 +Version: 2.13.0 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")), From f8a6a866ac8757d2bee827075c4142ed0985580c Mon Sep 17 00:00:00 2001 From: Adelene Lai Date: Tue, 28 May 2019 14:57:39 +0200 Subject: [PATCH 66/71] Add getCompTox to Mbworkflow #215 getCompTox retrieves DTXSID from EPA webservices using InChiKey. Modify generation of infolists and of final Mbrecord to include DTXSID. Resolves #215 --- R/createMassBank.R | 24 ++++++++++++++++++------ R/webAccess.R | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/R/createMassBank.R b/R/createMassBank.R index 7e324b8..2807945 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -111,17 +111,17 @@ resetInfolists <- function(mb) CH.IUPAC = character(0), CH.LINK.CAS = character(0), CH.LINK.CHEBI = integer(0), CH.LINK.HMDB = character(0), CH.LINK.KEGG = character(0), CH.LINK.LIPIDMAPS = character(0), CH.LINK.PUBCHEM = character(0), CH.LINK.INCHIKEY = character(0), - CH.LINK.CHEMSPIDER = integer(0)), .Names = c("X", "id", "dbcas", + CH.LINK.CHEMSPIDER = integer(0), CH.LINK.COMPTOX = character(0)), .Names = c("X", "id", "dbcas", "dbname", "dataused", "COMMENT.CONFIDENCE", "COMMENT.ID", "CH.NAME1", "CH.NAME2", "CH.NAME3", "CH.COMPOUND_CLASS", "CH.FORMULA", "CH.EXACT_MASS", "CH.SMILES", "CH.IUPAC", "CH.LINK.CAS", "CH.LINK.CHEBI", - "CH.LINK.HMDB", "CH.LINK.KEGG", "CH.LINK.LIPIDMAPS", "CH.LINK.PUBCHEM", - "CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER"), row.names = integer(0), class = "data.frame") + "CH.LINK.HMDB", "CH.LINK.KEGG", "CH.LINK.LIPIDMAPS", "CH.LINK.PUBCHEM", + "CH.LINK.INCHIKEY", "CH.LINK.CHEMSPIDER", "CH.LINK.COMPTOX"), row.names = integer(0), class = "data.frame") return(mb) } -# The workflow function, i.e. (almost) the only thing you actually need to call. +# The workflow function, i.e. (almost) the only thing you actually need to call. # See below for explanation of steps. #' MassBank record creation workflow #' @@ -527,6 +527,13 @@ gatherData <- function(id) csid <- getCactus(inchikey_split, 'chemspider_id') } + ##Get CompTox + comptox <- getCompTox(inchikey_split) + + if(is.null(comptox)){ + comptox <- NA + } + ##Use CTS to retrieve information CTSinfo <- getCtsRecord(inchikey_split) @@ -711,6 +718,7 @@ gatherData <- function(id) } link[["INCHIKEY"]] <- inchikey_split + link[["COMPTOX"]] <- comptox if(length(csid)>0) if(any(!is.na(csid))) link[["CHEMSPIDER"]] <- min(as.numeric(as.character(csid))) mbdata[['CH$LINK']] <- link @@ -1071,7 +1079,9 @@ flatten <- function(mbdata) "CH$LINK.LIPIDMAPS", "CH$LINK.PUBCHEM", "CH$LINK.INCHIKEY", - "CH$LINK.CHEMSPIDER") + "CH$LINK.CHEMSPIDER", + "CH$LINK.COMPTOX" + ) # make an empty data frame with the right length rows <- length(mbdata) cols <- length(colList) @@ -1136,7 +1146,8 @@ readMbdata <- function(row) "CH$LINK.LIPIDMAPS", "CH$LINK.PUBCHEM", "CH$LINK.INCHIKEY", - "CH$LINK.CHEMSPIDER") + "CH$LINK.CHEMSPIDER", + "CH$LINK.COMPTOX") mbdata[["COMMENT"]] = list() mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT.CONFIDENCE"]] # Again, our ID field. @@ -1163,6 +1174,7 @@ readMbdata <- function(row) link[["PUBCHEM"]] = row[["CH.LINK.PUBCHEM"]] link[["INCHIKEY"]] = row[["CH.LINK.INCHIKEY"]] link[["CHEMSPIDER"]] = row[["CH.LINK.CHEMSPIDER"]] + link[["COMPTOX"]] = row[["CH.LINK.COMPTOX"]] link[which(is.na(link))] <- NULL mbdata[["CH$LINK"]] <- link # again, these constants are read from the options: diff --git a/R/webAccess.R b/R/webAccess.R index c2e3fdc..b0ddc9a 100755 --- a/R/webAccess.R +++ b/R/webAccess.R @@ -2,6 +2,7 @@ NULL ## library(XML) ## library(RCurl) +## library(jsonlite) @@ -352,6 +353,45 @@ getPcCHEBI <- function(query, from = "inchikey") } } +#' Retrieves DTXSID (if it exists) from EPA Comptox Dashboard +#' +#' @usage getCompTox(query) +#' @param query The InChIKey of the compound. +#' @return Returns the DTXSID. +#' +#' +#' @examples +#' +#' \dontrun{ +#' # getCompTox("MKXZASYAUGDDCJ-NJAFHUGGSA-N") +#' } +#' +#' @author Adelene Lai +#' @export + +getCompTox <- function(query) +{ + baseURL <- "https://actorws.epa.gov/actorws/chemIdentifier/v01/resolve.json?identifier=" + url <- paste0(baseURL,query) + errorvar <- 0 + currEnvir <- environment() + tryCatch( + data <- getURL(URLencode(url), timeout=5), + error=function(e){ + currEnvir$errorvar <- 1 #TRUE? + } + ) + + if(errorvar){ #if TRUE? + warning("EPA web service is currently offline") + return(NA) + } + + r <- fromJSON(data) #returns list + return(r$DataRow$dtxsid) + + } + #' Retrieve the Chemspider ID for a given compound #' #' Given an InChIKey, this function queries the chemspider web API to retrieve From c579455cff45d1971c029a9af9189a9e65429e16 Mon Sep 17 00:00:00 2001 From: meowcat Date: Tue, 25 Jun 2019 14:21:49 +0200 Subject: [PATCH 67/71] Fixed the edited compiled_ok step for multiple spectra --- DESCRIPTION | 2 +- R/createMassBank.R | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index df45f01..9c345ff 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 2.11.3 +Version: 2.11.4 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")), diff --git a/R/createMassBank.R b/R/createMassBank.R index 27dfcf4..3ac5f74 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -253,7 +253,8 @@ mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.c return(res) }) # check which compounds have useful spectra - ok <- unlist(lapply(X = selectSpectra(mb@spectra, "found", "object"), FUN = function(spec){unlist(lapply(X = spec@children, FUN = function(child){child@ok}))})) + ok <- unlist(lapply(X = selectSpectra(mb@spectra, "found", "object"), + FUN = function(spec){any(unlist(lapply(X = spec@children, FUN = function(child){child@ok})))})) notEmpty <- unlist(lapply(X = mb@compiled, FUN = length)) > 0 ok <- ok & notEmpty mb@ok <- which(ok) From 2cfce2172508294fb4e659ab84934f827b32391f Mon Sep 17 00:00:00 2001 From: Nitesh Turaga Date: Tue, 29 Oct 2019 13:08:33 -0400 Subject: [PATCH 68/71] bump x.y.z version to even y prior to creation of RELEASE_3_10 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5ab1353..cd5f112 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 2.13.0 +Version: 2.14.0 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")), From f220d44290c1cd03c568bacfc0ea8cace43b0d42 Mon Sep 17 00:00:00 2001 From: Nitesh Turaga Date: Tue, 29 Oct 2019 13:36:20 -0400 Subject: [PATCH 69/71] bump x.y.z version to odd y after creation of RELEASE_3_10 branch --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index cd5f112..0352ff8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 2.14.0 +Version: 2.15.0 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")), From 7d45afa54e06df6122feed0a88ae680fbd663b11 Mon Sep 17 00:00:00 2001 From: jorainer Date: Fri, 15 Nov 2019 14:28:10 +0100 Subject: [PATCH 70/71] Fix mzR import after mzR::header reporting NA for missing data - Fix the mzR import functionality to match the changes in recent mzR::header reporting `NA` instead of `0` for not set/available spectra variables. --- DESCRIPTION | 2 +- R/leMsmsRaw.R | 22 ++++++++++++---------- R/msmsRawExtensions.r | 14 ++++++++------ 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9c345ff..689d429 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 2.11.4 +Version: 2.11.5 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")), diff --git a/R/leMsmsRaw.R b/R/leMsmsRaw.R index dbf0d4d..0f0c43c 100644 --- a/R/leMsmsRaw.R +++ b/R/leMsmsRaw.R @@ -198,14 +198,14 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, headerData[which(headerData$msLevel == 1),"precursorScanNum"] <- 0 } # bugfix 201803: PRM scans that were performed before the first full scan (found in some files) - headerData <- headerData[ - !((headerData$msLevel == 2) & (headerData$precursorScanNum == 0)),,drop=FALSE - ] + headerData <- headerData[!((headerData$msLevel == 2) & + (is.na(headerData$precursorScanNum))),, + drop = FALSE] # Find MS2 spectra with precursors which are in the allowed - # scan filter (coarse limit) range - findValidPrecursors <- headerData[ - (headerData$precursorMZ > mz - limit.coarse) & - (headerData$precursorMZ < mz + limit.coarse),] + # scan filter (coarse limit) range; which to get rid of NAs + findValidPrecursors <- headerData[which( + headerData$precursorMZ > (mz - limit.coarse) & + headerData$precursorMZ < (mz + limit.coarse)), ] # Find the precursors for the found spectra validPrecursors <- unique(findValidPrecursors$precursorScanNum) # check whether the precursors are real: must be within fine limits! @@ -245,9 +245,11 @@ findMsMsHR.mass <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA, spectra <- lapply(eic$scan, function(masterScan) { masterHeader <- headerData[headerData$acquisitionNum == masterScan,] - childHeaders <- headerData[(headerData$precursorScanNum == masterScan) - & (headerData$precursorMZ > mz - limit.coarse) - & (headerData$precursorMZ < mz + limit.coarse) ,] + childHeaders <- headerData[ + which(headerData$precursorScanNum == masterScan + & headerData$precursorMZ > (mz - limit.coarse) + & headerData$precursorMZ < (mz + limit.coarse)) , , + drop = FALSE] # Fix 9.10.17: headers now include non-numeric columns, leading to errors in data conversion. # Remove non-numeric columns diff --git a/R/msmsRawExtensions.r b/R/msmsRawExtensions.r index 5cdc1c7..dafc6fd 100644 --- a/R/msmsRawExtensions.r +++ b/R/msmsRawExtensions.r @@ -62,8 +62,8 @@ findMsMsHR.ticms2 <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA # Find MS2 spectra with precursors which are in the allowed # scan filter (coarse limit) range findValidPrecursors <- headerData[ - (headerData$precursorMZ > mz - limit.coarse) & - (headerData$precursorMZ < mz + limit.coarse),] + which(headerData$precursorMZ > (mz - limit.coarse) & + headerData$precursorMZ < (mz + limit.coarse)),] # Find the precursors for the found spectra @@ -94,10 +94,12 @@ findMsMsHR.ticms2 <- function(msRaw, mz, limit.coarse, limit.fine, rtLimits = NA ret$parentHeader[1,4:20] <- 0 ret$parentHeader[1,6] <- NA - childHeaders <- headerData[(headerData$acquisitionNum == masterScan) - & (headerData$precursorMZ > mz - limit.coarse) - & (headerData$precursorMZ < mz + limit.coarse) ,] - + childHeaders <- headerData[ + which(headerData$acquisitionNum == masterScan + & headerData$precursorMZ > (mz - limit.coarse) + & headerData$precursorMZ < (mz + limit.coarse)), , + drop = FALSE] + childScans <- childHeaders$acquisitionNum ret$parentScan <- min(childScans)-1 ret$parentHeader[1,1:3] <- min(childScans)-1 From c8b00e9c3acfc909f12e6e377f6b940ca8334a3c Mon Sep 17 00:00:00 2001 From: meowcat Date: Mon, 18 Nov 2019 15:59:57 +0100 Subject: [PATCH 71/71] Bump version number --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index a34427a..4935599 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: RMassBank Type: Package Title: Workflow to process tandem MS files and build MassBank records -Version: 2.15.0 +Version: 2.15.1 Authors@R: c( person(given = "RMassBank at Eawag", email = "massbank@eawag.ch", role=c("cre")),