diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..06f46ec --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,5 @@ +^.*\.Rproj$ +^\.Rproj\.user$ +^\.github$ +^\.travis\.yml$ +^LICENSE\.md$ diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml index bc4398a..b7bad7d 100644 --- a/.github/workflows/check-bioc.yml +++ b/.github/workflows/check-bioc.yml @@ -1,310 +1,310 @@ -## Read more about GitHub actions the features of this GitHub Actions workflow -## at https://lcolladotor.github.io/biocthis/articles/biocthis.html#use_bioc_github_action -## -## For more details, check the biocthis developer notes vignette at -## https://lcolladotor.github.io/biocthis/articles/biocthis_dev_notes.html -## -## You can add this workflow to other packages using: -## > biocthis::use_bioc_github_action() -## -## Using GitHub Actions exposes you to many details about how R packages are -## compiled and installed in several operating system.s -### If you need help, please follow the steps listed at -## https://github.com/r-lib/actions#where-to-find-help -## -## If you found an issue specific to biocthis's GHA workflow, please report it -## with the information that will make it easier for others to help you. -## Thank you! - -## Acronyms: -## * GHA: GitHub Action -## * OS: operating system - -on: - push: - pull_request: - -name: R-CMD-check-bioc - -## These environment variables control whether to run GHA code later on that is -## specific to testthat, covr, and pkgdown. -## -## If you need to clear the cache of packages, update the number inside -## cache-version as discussed at https://github.com/r-lib/actions/issues/86. -## Note that you can always run a GHA test without the cache by using the word -## "/nocache" in the commit message. 
-env: - has_testthat: 'true' - run_covr: 'true' - run_pkgdown: 'true' - has_RUnit: 'false' - has_BiocCheck: 'false' - cache-version: 'cache-v1' - -jobs: - build-check: - runs-on: ${{ matrix.config.os }} - name: ${{ matrix.config.os }} (${{ matrix.config.r }}) - container: ${{ matrix.config.cont }} - ## Environment variables unique to this job. - - strategy: - fail-fast: false - matrix: - config: - - { os: ubuntu-latest, r: 'devel', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } - #- { os: macOS-latest, r: 'devel', bioc: '3.19'} - #- { os: windows-latest, r: 'devel', bioc: '3.19'} - env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - RSPM: ${{ matrix.config.rspm }} - NOT_CRAN: true - TZ: UTC - VDIFFR_RUN_TESTS: false - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - - steps: - - ## Set the R library to the directory matching the - ## R packages cache step further below when running on Docker (Linux). - - name: Set R Library home on Linux - if: runner.os == 'Linux' - run: | - mkdir /__w/_temp/Library - echo ".libPaths('/__w/_temp/Library')" > ~/.Rprofile - - ## Most of these steps are the same as the ones in - ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml - ## If they update their steps, we will also need to update ours. 
- - name: Checkout Repository - uses: actions/checkout@v2 - - ## R is already included in the Bioconductor docker images - - name: Setup R from r-lib - if: runner.os != 'Linux' - uses: r-lib/actions/setup-r@v2 - with: - r-version: ${{ matrix.config.r }} - - ## pandoc is already included in the Bioconductor docker images - - name: Setup pandoc from r-lib - if: runner.os != 'Linux' - uses: r-lib/actions/setup-pandoc@v2 - - - name: Query dependencies - run: | - install.packages('remotes') - saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) - shell: Rscript {0} - - - name: Cache R packages - if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'" - uses: actions/cache@v3 - with: - path: ${{ env.R_LIBS_USER }} - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel- - - - name: Cache R packages on Linux - if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " - uses: actions/cache@v3 - with: - path: /home/runner/work/_temp/Library - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel-${{ hashFiles('.github/depends.Rds') }} - restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel- - - - name: Install Linux system dependencies - if: runner.os == 'Linux' - run: | - sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "20.04")), collapse = " "))') - echo $sysreqs - sudo -s eval "$sysreqs" - - # in addition install tesseract - - sudo apt-get install -y libtesseract-dev libleptonica-dev - - - name: Install macOS system dependencies - if: matrix.config.os == 'macOS-latest' - run: | - ## Enable installing XML from source if needed - brew install libxml2 - echo 
"XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV - - ## Required to install magick as noted at - ## https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2 - brew install imagemagick@6 - - ## For textshaping, required by ragg, and required by pkgdown - brew install harfbuzz fribidi - - brew install libgit2 - ## Helps compile RCurl from source - ## brew uninstall curl - - ## required for ncdf4 - can not use the homebrew one because that uses GCC - ## Use pre-compiled libraries from https://mac.r-project.org/libs-4/ - curl -O https://mac.r-project.org/libs-4/netcdf-4.7.4-darwin.17-x86_64.tar.gz - tar fvxzm netcdf-4.7.4-darwin.17-x86_64.tar.gz -C / - rm netcdf-4.7.4-darwin.17-x86_64.tar.gz - curl -O https://mac.r-project.org/libs-4/hdf5-1.12.0-darwin.17-x86_64.tar.gz - tar fvxzm hdf5-1.12.0-darwin.17-x86_64.tar.gz -C / - rm hdf5-1.12.0-darwin.17-x86_64.tar.gz - curl -O https://mac.r-project.org/libs-4/szip-2.1.1-darwin.17-x86_64.tar.gz - tar fvxzm szip-2.1.1-darwin.17-x86_64.tar.gz -C / - rm szip-2.1.1-darwin.17-x86_64.tar.gz - - - name: Install Windows system dependencies - if: runner.os == 'Windows' - run: | - ## Edit below if you have any Windows system dependencies - shell: Rscript {0} - - - name: Install BiocManager - run: | - message(paste('****', Sys.time(), 'installing BiocManager ****')) - remotes::install_cran("BiocManager") - shell: Rscript {0} - - - name: Set BiocVersion - run: | - BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE) - shell: Rscript {0} - - - name: Install dependencies - run: | - ## Try installing the package dependencies in steps. First the local - ## dependencies, then any remaining dependencies to avoid the - ## issues described at - ## https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html - ## https://github.com/r-lib/remotes/issues/296 - ## Ideally, all dependencies should get installed in the first pass. 
- - ## Pass #1 at installing dependencies - BiocManager::install("RforMassSpectrometry/ProtGenerics", ask = FALSE,force = TRUE) - message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****')) - remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE) - BiocManager::install("RforMassSpectrometry/MsCoreUtils", ask = FALSE, force = TRUE) - ## Pass #2 at installing dependencies - message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****')) - - ## Temporaly use the development version of RMassBankData - ## remotes::install_github(repo = "MassBank/RMassBankData") - - remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE) - - ## Manually install required packages - BiocManager::install('rhdf5', dependencies = TRUE, ask = FALSE, update = FALSE, INSTALL_opts = '--force-biarch') - BiocManager::install("sneumann/mzR", dependencies = TRUE, ask = FALSE, update=FALSE) - BiocManager::install("SummarizedExperiment", ask = FALSE, update = FALSE) - BiocManager::install("msdata") - BiocManager::install("magick", dependencies = TRUE, ask = FALSE, update=FALSE) - - # BiocManager::install(c("devtools", "usethis", "vdiffr"), dependencies = TRUE, ask = FALSE, update = FALSE) - ## For running the checks - message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****')) - remotes::install_cran("rcmdcheck") - BiocManager::install("BiocCheck") - shell: Rscript {0} - - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - - - name: Install BiocGenerics - if: env.has_RUnit == 'true' - run: | - ## Install BiocGenerics - BiocManager::install("BiocGenerics") - shell: Rscript {0} - - - name: Install covr - if: github.ref == 'refs/heads/main' && env.run_covr == 'true' && runner.os == 'Linux' - run: | - remotes::install_cran("covr") - shell: Rscript {0} - - - name: Install 
pkgdown - if: github.ref == 'refs/heads/main' && env.run_pkgdown == 'true' && runner.os == 'Linux' - run: | - remotes::install_github("r-lib/pkgdown") - shell: Rscript {0} - - - name: Session info - run: | - options(width = 100) - pkgs <- installed.packages()[, "Package"] - sessioninfo::session_info(pkgs, include_base = TRUE) - shell: Rscript {0} - - - name: Save cache (Linux) - uses: actions/cache/save@v3 - if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " - with: - path: /home/runner/work/_temp/Library - key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel-${{ hashFiles('.github/depends.Rds') }} - - - name: Run CMD check - if: "!contains(github.event.head_commit.message, '/nocheck')" - env: - _R_CHECK_CRAN_INCOMING_: false - run: | - rcmdcheck::rcmdcheck( - args = c("--no-build-vignettes", "--no-manual", "--timings"), - build_args = c("--no-manual", "--no-resave-data"), - error_on = "warning", - check_dir = "check" - ) - shell: Rscript {0} - - ## Might need an to add this to the if: && runner.os == 'Linux' - - name: Reveal testthat details - if: "env.has_testthat == 'true' && !contains(github.event.head_commit.message, '/nocheck')" - run: find . 
-name testthat.Rout -exec cat '{}' ';' - - - name: Run RUnit tests - if: "env.has_RUnit == 'true' && !contains(github.event.head_commit.message, '/nocheck')" - run: | - BiocGenerics:::testPackage() - shell: Rscript {0} - - - name: Run BiocCheck - if: "env.has_BiocCheck == 'true' && !contains(github.event.head_commit.message, '/nocheck')" - run: | - BiocCheck::BiocCheck( - dir('check', 'tar.gz$', full.names = TRUE), - `quit-with-status` = TRUE, - `no-check-R-ver` = TRUE, - `no-check-bioc-help` = TRUE - ) - shell: Rscript {0} - - - name: Test coverage - if: github.ref == 'refs/heads/main' && env.run_covr == 'true' && runner.os == 'Linux' - run: | - covr::codecov() - shell: Rscript {0} - - - name: Install package - if: github.ref == 'refs/heads/main' && env.run_pkgdown == 'true' && runner.os == 'Linux' - run: R CMD INSTALL . - - - name: Deploy package - if: github.ref == 'refs/heads/main' && env.run_pkgdown == 'true' && runner.os == 'Linux' - run: | - git config --global user.email "actions@github.com" - git config --global user.name "GitHub Actions" - git config --global --add safe.directory /__w/RMassBank/RMassBank - Rscript -e "pkgdown::deploy_to_branch(new_process = FALSE)" - shell: bash {0} - ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE) - ## at least one locally before this will work. This creates the gh-pages - ## branch (erasing anything you haven't version controlled!) and - ## makes the git history recognizable by pkgdown. 
- - - name: Upload check results - if: failure() - uses: actions/upload-artifact@master - with: - name: ${{ runner.os }}-biocversion-devel-r-devel-results - path: check +## Read more about GitHub actions the features of this GitHub Actions workflow +## at https://lcolladotor.github.io/biocthis/articles/biocthis.html#use_bioc_github_action +## +## For more details, check the biocthis developer notes vignette at +## https://lcolladotor.github.io/biocthis/articles/biocthis_dev_notes.html +## +## You can add this workflow to other packages using: +## > biocthis::use_bioc_github_action() +## +## Using GitHub Actions exposes you to many details about how R packages are +## compiled and installed in several operating system.s +### If you need help, please follow the steps listed at +## https://github.com/r-lib/actions#where-to-find-help +## +## If you found an issue specific to biocthis's GHA workflow, please report it +## with the information that will make it easier for others to help you. +## Thank you! + +## Acronyms: +## * GHA: GitHub Action +## * OS: operating system + +on: + push: + pull_request: + +name: R-CMD-check-bioc + +## These environment variables control whether to run GHA code later on that is +## specific to testthat, covr, and pkgdown. +## +## If you need to clear the cache of packages, update the number inside +## cache-version as discussed at https://github.com/r-lib/actions/issues/86. +## Note that you can always run a GHA test without the cache by using the word +## "/nocache" in the commit message. +env: + has_testthat: 'true' + run_covr: 'true' + run_pkgdown: 'true' + has_RUnit: 'false' + has_BiocCheck: 'false' + cache-version: 'cache-v1' + +jobs: + build-check: + runs-on: ${{ matrix.config.os }} + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + container: ${{ matrix.config.cont }} + ## Environment variables unique to this job. 
+ + strategy: + fail-fast: false + matrix: + config: + - { os: ubuntu-latest, r: 'devel', bioc: '3.23', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" } + #- { os: macOS-latest, r: 'devel', bioc: '3.19'} + #- { os: windows-latest, r: 'devel', bioc: '3.19'} + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + NOT_CRAN: true + TZ: UTC + VDIFFR_RUN_TESTS: false + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + steps: + + ## Set the R library to the directory matching the + ## R packages cache step further below when running on Docker (Linux). + - name: Set R Library home on Linux + if: runner.os == 'Linux' + run: | + mkdir /__w/_temp/Library + echo ".libPaths('/__w/_temp/Library')" > ~/.Rprofile + + ## Most of these steps are the same as the ones in + ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml + ## If they update their steps, we will also need to update ours. 
+ - name: Checkout Repository + uses: actions/checkout@v2 + + ## R is already included in the Bioconductor docker images + - name: Setup R from r-lib + if: runner.os != 'Linux' + uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + + ## pandoc is already included in the Bioconductor docker images + - name: Setup pandoc from r-lib + if: runner.os != 'Linux' + uses: r-lib/actions/setup-pandoc@v2 + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + shell: Rscript {0} + + - name: Cache R packages + if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'" + uses: actions/cache@v3 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel- + + - name: Cache R packages on Linux + if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " + uses: actions/cache@v3 + with: + path: /home/runner/work/_temp/Library + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel- + + - name: Install Linux system dependencies + if: runner.os == 'Linux' + run: | + sysreqs=$(Rscript -e 'cat("apt-get update -y && apt-get install -y", paste(gsub("apt-get install -y ", "", remotes::system_requirements("ubuntu", "20.04")), collapse = " "))') + echo $sysreqs + sudo -s eval "$sysreqs" + + # in addition install tesseract + + sudo apt-get install -y libtesseract-dev libleptonica-dev + + - name: Install macOS system dependencies + if: matrix.config.os == 'macOS-latest' + run: | + ## Enable installing XML from source if needed + brew install libxml2 + echo 
"XML_CONFIG=/usr/local/opt/libxml2/bin/xml2-config" >> $GITHUB_ENV + + ## Required to install magick as noted at + ## https://github.com/r-lib/usethis/commit/f1f1e0d10c1ebc75fd4c18fa7e2de4551fd9978f#diff-9bfee71065492f63457918efcd912cf2 + brew install imagemagick@6 + + ## For textshaping, required by ragg, and required by pkgdown + brew install harfbuzz fribidi + + brew install libgit2 + ## Helps compile RCurl from source + ## brew uninstall curl + + ## required for ncdf4 - can not use the homebrew one because that uses GCC + ## Use pre-compiled libraries from https://mac.r-project.org/libs-4/ + curl -O https://mac.r-project.org/libs-4/netcdf-4.7.4-darwin.17-x86_64.tar.gz + tar fvxzm netcdf-4.7.4-darwin.17-x86_64.tar.gz -C / + rm netcdf-4.7.4-darwin.17-x86_64.tar.gz + curl -O https://mac.r-project.org/libs-4/hdf5-1.12.0-darwin.17-x86_64.tar.gz + tar fvxzm hdf5-1.12.0-darwin.17-x86_64.tar.gz -C / + rm hdf5-1.12.0-darwin.17-x86_64.tar.gz + curl -O https://mac.r-project.org/libs-4/szip-2.1.1-darwin.17-x86_64.tar.gz + tar fvxzm szip-2.1.1-darwin.17-x86_64.tar.gz -C / + rm szip-2.1.1-darwin.17-x86_64.tar.gz + + - name: Install Windows system dependencies + if: runner.os == 'Windows' + run: | + ## Edit below if you have any Windows system dependencies + shell: Rscript {0} + + - name: Install BiocManager + run: | + message(paste('****', Sys.time(), 'installing BiocManager ****')) + remotes::install_cran("BiocManager") + shell: Rscript {0} + + - name: Set BiocVersion + run: | + BiocManager::install(version = "${{ matrix.config.bioc }}", ask = FALSE) + shell: Rscript {0} + + - name: Install dependencies + run: | + ## Try installing the package dependencies in steps. First the local + ## dependencies, then any remaining dependencies to avoid the + ## issues described at + ## https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html + ## https://github.com/r-lib/remotes/issues/296 + ## Ideally, all dependencies should get installed in the first pass. 
+ + ## Pass #1 at installing dependencies + BiocManager::install("RforMassSpectrometry/ProtGenerics", ask = FALSE,force = TRUE) + message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****')) + remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE) + BiocManager::install("RforMassSpectrometry/MsCoreUtils", ask = FALSE, force = TRUE) + ## Pass #2 at installing dependencies + message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****')) + + ## Temporaly use the development version of RMassBankData + ## remotes::install_github(repo = "MassBank/RMassBankData") + + remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = TRUE, upgrade = TRUE) + + ## Manually install required packages + BiocManager::install('rhdf5', dependencies = TRUE, ask = FALSE, update = FALSE, INSTALL_opts = '--force-biarch') + BiocManager::install("sneumann/mzR", dependencies = TRUE, ask = FALSE, update=FALSE) + BiocManager::install("SummarizedExperiment", ask = FALSE, update = FALSE) + BiocManager::install("msdata") + BiocManager::install("magick", dependencies = TRUE, ask = FALSE, update=FALSE) + + # BiocManager::install(c("devtools", "usethis", "vdiffr"), dependencies = TRUE, ask = FALSE, update = FALSE) + ## For running the checks + message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****')) + remotes::install_cran("rcmdcheck") + BiocManager::install("BiocCheck") + shell: Rscript {0} + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + + - name: Install BiocGenerics + if: env.has_RUnit == 'true' + run: | + ## Install BiocGenerics + BiocManager::install("BiocGenerics") + shell: Rscript {0} + + - name: Install covr + if: github.ref == 'refs/heads/main' && env.run_covr == 'true' && runner.os == 'Linux' + run: | + remotes::install_cran("covr") + shell: Rscript {0} + + - name: Install 
pkgdown + if: github.ref == 'refs/heads/main' && env.run_pkgdown == 'true' && runner.os == 'Linux' + run: | + remotes::install_github("r-lib/pkgdown") + shell: Rscript {0} + + - name: Session info + run: | + options(width = 100) + pkgs <- installed.packages()[, "Package"] + sessioninfo::session_info(pkgs, include_base = TRUE) + shell: Rscript {0} + + - name: Save cache (Linux) + uses: actions/cache/save@v3 + if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' " + with: + path: /home/runner/work/_temp/Library + key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-devel-${{ hashFiles('.github/depends.Rds') }} + + - name: Run CMD check + if: "!contains(github.event.head_commit.message, '/nocheck')" + env: + _R_CHECK_CRAN_INCOMING_: false + run: | + rcmdcheck::rcmdcheck( + args = c("--no-build-vignettes", "--no-manual", "--timings"), + build_args = c("--no-manual", "--no-resave-data"), + error_on = "warning", + check_dir = "check" + ) + shell: Rscript {0} + + ## Might need an to add this to the if: && runner.os == 'Linux' + - name: Reveal testthat details + if: "env.has_testthat == 'true' && !contains(github.event.head_commit.message, '/nocheck')" + run: find . 
-name testthat.Rout -exec cat '{}' ';' + + - name: Run RUnit tests + if: "env.has_RUnit == 'true' && !contains(github.event.head_commit.message, '/nocheck')" + run: | + BiocGenerics:::testPackage() + shell: Rscript {0} + + - name: Run BiocCheck + if: "env.has_BiocCheck == 'true' && !contains(github.event.head_commit.message, '/nocheck')" + run: | + BiocCheck::BiocCheck( + dir('check', 'tar.gz$', full.names = TRUE), + `quit-with-status` = TRUE, + `no-check-R-ver` = TRUE, + `no-check-bioc-help` = TRUE + ) + shell: Rscript {0} + + - name: Test coverage + if: github.ref == 'refs/heads/main' && env.run_covr == 'true' && runner.os == 'Linux' + run: | + covr::codecov() + shell: Rscript {0} + + - name: Install package + if: github.ref == 'refs/heads/main' && env.run_pkgdown == 'true' && runner.os == 'Linux' + run: R CMD INSTALL . + + - name: Deploy package + if: github.ref == 'refs/heads/main' && env.run_pkgdown == 'true' && runner.os == 'Linux' + run: | + git config --global user.email "actions@github.com" + git config --global user.name "GitHub Actions" + git config --global --add safe.directory /__w/RMassBank/RMassBank + Rscript -e "pkgdown::deploy_to_branch(new_process = FALSE)" + shell: bash {0} + ## Note that you need to run pkgdown::deploy_to_branch(new_process = FALSE) + ## at least once locally before this will work. This creates the gh-pages + ## branch (erasing anything you haven't version controlled!) and + ## makes the git history recognizable by pkgdown. 
+ + - name: Upload check results + if: failure() + uses: actions/upload-artifact@master + with: + name: ${{ runner.os }}-biocversion-devel-r-devel-results + path: check diff --git a/.gitignore b/.gitignore index 62bcd8e..2d902e9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ .RData -.Rbuildignore .Rhistory .Rproj.user *.Rproj diff --git a/DESCRIPTION b/DESCRIPTION index d927b8a..ca25b3e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: RMassBank Title: Workflow to process tandem MS files and build MassBank records -Version: 3.15.3 +Version: 3.22.2 Authors@R: c( person("RMassBank at Eawag", , , "massbank@eawag.ch", role = "cre"), person("Michael A.", "Stravs", , "michael.stravs@eawag.ch", role = "aut", @@ -27,7 +27,8 @@ Description: Workflow to process tandem MS files and build MassBank of compound information from Internet databases, and export to MassBank records. License: Artistic-2.0 -Depends: +Depends: + R (>= 4.1.0), Rcpp Imports: assertthat, @@ -51,6 +52,7 @@ Imports: readr, rjson, S4Vectors, + stringr, tibble, tidyselect, webchem, @@ -71,7 +73,7 @@ VignetteBuilder: biocViews: ImmunoOncology, Bioinformatics, MassSpectrometry, Metabolomics, Software Encoding: UTF-8 -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.3 SystemRequirements: OpenBabel Collate: 'alternateAnalyze.R' diff --git a/NAMESPACE b/NAMESPACE index 3d8737d..01bae52 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -176,6 +176,7 @@ import(rcdk) import(readJDX) import(readr) import(rjson) +import(stringr) import(tibble) import(yaml) importFrom(Biobase,"classVersion<-") diff --git a/R/createMassBank.R b/R/createMassBank.R index ca6b11e..1ba155a 100755 --- a/R/createMassBank.R +++ b/R/createMassBank.R @@ -1,2058 +1,2062 @@ -# Script for writing MassBank files - -#testtest change -#' Load MassBank compound information lists -#' -#' Loads MassBank compound information lists (i.e. 
#' the lists which were created
#' in the first two steps of the MassBank \code{\link{mbWorkflow}} and
#' subsequently edited by hand).
#'
#' \code{resetInfolists} clears the information lists, i.e. it creates a new
#' empty list in \code{mbdata_archive}. \code{loadInfolist} loads a single CSV
#' file, whereas \code{loadInfolists} loads a whole directory.
#'
#' @aliases loadInfolists loadInfolist resetInfolists
#' @usage loadInfolists(mb, path)
#'
#' loadInfolist(mb, fileName)
#'
#' resetInfolists(mb)
#' @param path Directory in which the namelists reside. All files in this
#'   directory whose name ends in \code{.csv} will be loaded.
#' @param fileName A single namelist to be loaded.
#' @param mb The \code{mbWorkspace} to load/reset the lists in.
#' @return The new workspace with loaded/reset lists.
#' @author Michael Stravs, Tobias Schulze
#' @examples
#'
#' #
#' \dontrun{mb <- resetInfolists(mb)
#' 	mb <- loadInfolist(mb, "my_csv_infolist.csv")}
#'
#' @export
loadInfolists <- function(mb, path)
{
  # list.files() interprets `pattern` as a regular expression. Escape the dot
  # and anchor the extension at the end of the name, otherwise names such as
  # "foo.csv.bak" or "mycsv.txt" would be picked up as well.
  archivefiles <- list.files(path, pattern = "\\.csv$", full.names = TRUE)

  # Each file is merged into the workspace in turn; loadInfolist() returns the
  # updated workspace, which is fed into the next iteration.
  for (afile in archivefiles) {
    mb <- loadInfolist(mb, afile)
  }
  return(mb)
}

# Load an "infolist". This loads a CSV file which should contain the entries
# edited and controlled by hand. All compound infos from fileName are added into the
# global mbdata_archive. Entries with a cpdID which was already present, are substituted
# by new entries from the fileName file.
#' @export
loadInfolist <- function(mb, fileName)
{
  # Reads one infolist file (comma-separated, with a semicolon-separated
  # fallback) and merges it into mb@mbdata_archive: rows whose id is already
  # present in the archive are overwritten, all other rows are appended.
  # Stops with an error if the columns "id", "dbcas" and "dataused" cannot be
  # found with either separator. Returns the modified workspace.

  # Prime a new infolist if it doesn't exist
  if (ncol(mb@mbdata_archive) == 0) {
    mb <- resetInfolists(mb)
  }

  # Import infolist, trim whitespace and transform NAs
  mbdata_new <- readr::read_csv(
    file = fileName,
    na = "",
    trim_ws = TRUE,
    show_col_types = FALSE
  )

  # Fix legacy infolist column names
  # Firstly, remove the artifact first column.
  # identical() instead of `==` so that a table without any columns (where
  # names(...)[1] is NA) does not break the if() condition.
  if (identical(names(mbdata_new)[1], "...1")) {
    mbdata_new <- mbdata_new |>
      dplyr::select(-`...1`)
  }

  # Secondly, replace the dots by underscores
  if (any(grepl("\\.", colnames(mbdata_new)))) {
    mbdata_new <- mbdata_new |>
      dplyr::rename_with(~ gsub("\\.", "_", .), tidyselect::everything())
  }

  mbdata_new <- as.data.frame(mbdata_new, stringsAsFactors = FALSE)

  # Legacy check for loading the Uchem format files.
  # Even if dbname_* are not used downstream of here, it's still good to keep them
  # for debugging reasons.
  required_cols <- c("id", "dbcas", "dataused")

  # Check if comma-separated or semicolon-separated: if the required columns
  # are not all there after the comma-separated read, retry with ";".
  if (length(setdiff(required_cols, colnames(mbdata_new))) > 0) {

    # Import infolist, trim whitespace and transform NAs
    # NOTE(review): the legacy column-name fixes above are not re-applied on
    # this fallback path - confirm whether that is intended.
    mbdata_new <- readr::read_delim(
      file = fileName,
      delim = ";",
      na = "",
      trim_ws = TRUE,
      show_col_types = FALSE
    )

    mbdata_new <- as.data.frame(mbdata_new, stringsAsFactors = FALSE)

    if (length(setdiff(required_cols, colnames(mbdata_new))) > 0) {
      stop("Some columns are missing in the infolist.")
    }
  }

  if ("dbname_d" %in% colnames(mbdata_new)) {
    colnames(mbdata_new)[[which(colnames(mbdata_new) == "dbname_d")]] <- "dbname"
    # dbname_e will be dropped by the column selection below.
  }

  if ("COMMENT_EAWAG_UCHEM_ID" %in% colnames(mbdata_new)) {
    colnames(mbdata_new)[[which(colnames(mbdata_new) == "COMMENT_EAWAG_UCHEM_ID")]] <-
      "COMMENT_ID"
  }

  # use only the columns present in mbdata_archive, no other columns added in excel
  # (exception: additional COMMENT_* columns other than COMMENT_CONFIDENCE* and
  # COMMENT_ID* are carried along as well)
  col_names <- colnames(mb@mbdata_archive)
  comment_colnames <- colnames(mbdata_new)[
    grepl(
      x = colnames(mbdata_new),
      pattern = "^COMMENT\\_(?!CONFIDENCE)(?!ID)",
      perl = TRUE
    )
  ]
  col_names <- c(col_names, comment_colnames)

  ## The read infolists might not have all required / expected columns.
  ## Add each missing column filled with NA, sized to the table's own row
  ## count, rather than binding a one-row matrix and relying on recycling.
  missing_colnames <- col_names[!col_names %in% colnames(mbdata_new)]
  if (length(missing_colnames) > 0) {
    for (missing_col in missing_colnames) {
      mbdata_new[[missing_col]] <- rep(NA, nrow(mbdata_new))
    }
  }

  mbdata_new <- mbdata_new[, col_names]
  # substitute the old entries with the ones from our files
  # then find the new (previously inexistent) entries, and rbind them to the table
  new_entries <- setdiff(mbdata_new$id, mb@mbdata_archive$id)
  old_entries <- intersect(mbdata_new$id, mb@mbdata_archive$id)

  # The archive columns are stored as factors (see the end of this function);
  # convert them to character so that rows can be overwritten with values that
  # are not existing factor levels.
  for (colname in colnames(mb@mbdata_archive)) {
    mb@mbdata_archive[, colname] <- as.character(mb@mbdata_archive[, colname])
  }

  for (entry in old_entries) {
    mb@mbdata_archive[mb@mbdata_archive$id == entry, ] <-
      mbdata_new[mbdata_new$id == entry, ]
  }

  # Membership test, not `==`: `mbdata_new$id == new_entries` would compare the
  # two vectors element by element (with recycling) and silently drop or
  # mis-select rows as soon as the file also contains ids already archived.
  mb@mbdata_archive <- rbind(
    mb@mbdata_archive,
    mbdata_new[mbdata_new$id %in% new_entries, ]
  )

  for (colname in colnames(mb@mbdata_archive)) {
    mb@mbdata_archive[, colname] <- as.factor(mb@mbdata_archive[, colname])
  }

  return(mb)
}


# Resets the mbdata_archive to an empty version.
#' @export
resetInfolists <- function(mb)
{
  # Replaces mb@mbdata_archive with an empty (zero-row) data frame that has the
  # standard infolist columns, and returns the modified workspace.
  # check.names = FALSE keeps the "CH$..." column names verbatim; each name is
  # declared exactly once, together with the type of its column.
  mb@mbdata_archive <- data.frame(
    id = integer(0),
    dbcas = character(0),
    dbname = character(0),
    dataused = character(0),
    COMMENT_CONFIDENCE = character(0),
    COMMENT_ID = integer(0),
    `CH$NAME1` = character(0),
    `CH$NAME2` = character(0),
    `CH$NAME3` = character(0),
    `CH$NAME4` = character(0),
    `CH$NAME5` = character(0),
    `CH$COMPOUND_CLASS` = character(0),
    `CH$FORMULA` = character(0),
    `CH$EXACT_MASS` = numeric(0),
    `CH$SMILES` = character(0),
    `CH$IUPAC` = character(0),
    `CH$LINK_CAS` = character(0),
    `CH$LINK_CHEBI` = integer(0),
    `CH$LINK_HMDB` = character(0),
    `CH$LINK_KEGG` = character(0),
    `CH$LINK_LIPIDMAPS` = character(0),
    `CH$LINK_PUBCHEM` = character(0),
    `CH$LINK_INCHIKEY` = character(0),
    `CH$LINK_CHEMSPIDER` = integer(0),
    `CH$LINK_COMPTOX` = character(0),
    AUTHORS = character(0),
    COPYRIGHT = character(0),
    PUBLICATION = character(0),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  # The SP$SAMPLE column is only added when the RMassBank settings ask for it.
  # Guard against the option (or the field) not being set: if() on NULL fails
  # with "argument is of length zero".
  include_sp_tags <- getOption("RMassBank")$include_sp_tags
  if (!is.null(include_sp_tags) && include_sp_tags) {
    mb@mbdata_archive["SP$SAMPLE"] <- character(0)
  }
  return(mb)

}

# The workflow function, i.e. (almost) the only thing you actually need to call.
# See below for explanation of steps.
#' MassBank record creation workflow
#'
#' Uses data generated by \code{\link{msmsWorkflow}} to create MassBank records.
#'
#' See the vignette \code{vignette("RMassBank")} for detailed information about the usage.
#'
#' Steps:
#'
#' Step 1: Find which compounds don't have annotation information yet.
For these -#' compounds, pull information from several databases (using gatherData). -#' -#' Step 2: If new compounds were found, then export the infolist.csv and stop the workflow. -#' Otherwise, continue. -#' -#' Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. -#' -#' Step 4: Compile the spectra. Using the skeletons from the archive data, create -#' MassBank records per compound and fill them with peak data for each spectrum. -#' Also, assign accession numbers based on scan mode and relative scan no. -#' -#' Step 5: Convert the internal tree-like representation of the MassBank data into -#' flat-text string arrays (basically, into text-file style, but still in memory) -#' -#' Step 6: For all OK records, generate a corresponding molfile with the structure -#' of the compound, based on the SMILES entry from the MassBank record. (This molfile -#' is still in memory only, not yet a physical file) -#' -#' Step 7: If necessary, generate the appropriate subdirectories, and actually write -#' the files to disk. -#' -#' Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank -#' to attribute substances to their corresponding structure molfiles. -#' -#' @param steps Which steps in the workflow to perform. -#' @param infolist_path A path where to store newly downloaded compound informations, -#' which should then be manually inspected. -#' @param mb The \code{mbWorkspace} to work in. -#' @param gatherData A variable denoting whether to retrieve information using several online databases \code{gatherData= "online"} -#' or to use the local babel installation \code{gatherData= "babel"}. Note that babel is used either way, if a directory is given -#' in the settings. 
This setting will be ignored if retrieval is set to "standard" -#' @param filter If \code{TRUE}, the peaks will be filtered according to the standard processing workflow in RMassBank - -#' only the best formula for a peak is retained, and only peaks passing multiplicity filtering are retained. If FALSE, it is assumed -#' that the user has already done filtering, and all peaks in the spectrum should be printed in the record (with or without formula.) -#' @return The processed \code{mbWorkspace}. -#' @seealso \code{\link{mbWorkspace-class}} -#' @author Michael A. Stravs, Eawag -#' @examples \dontrun{ -#' mb <- newMbWorkspace(w) # w being a msmsWorkspace -#' mb <- loadInfolists(mb, "D:/myInfolistPath") -#' mb <- mbWorkflow(mb, steps=c(1:3), "newinfos.csv") -#' -#' } -#' @export -mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.csv", gatherData = "online", filter = TRUE) -{ - # Step 1: Find which compounds don't have annotation information yet. For these - # compounds, pull information from CTS (using gatherData). - if(1 %in% steps) - { - mbdata_ids <- lapply(selectSpectra(mb@spectra, "found", "object"), function(spec) spec@id) - rmb_log_info("mbWorkflow: Step 1. Gather info from several databases") - # Which IDs are not in mbdata_archive yet? - new_ids <- setdiff(as.numeric(unlist(mbdata_ids)), mb@mbdata_archive$id) - mb@mbdata <- lapply(new_ids, function(id) - { - if(findLevel(id, TRUE) == "standard"){ - if(gatherData == "online"){ - - d <- gatherData(id) - } - if(gatherData == "babel"){ - # message("mbWorkflow: Step 1. Gather info using babel") - d <- gatherDataBabel(id) - } - } else{ - # message("mbWorkflow: Step 1. Gather no info - Unknown structure") - d <- gatherDataUnknown(id, mb@spectra[[1]]@mode, retrieval = findLevel(id, TRUE)) - } - rmb_log_info(paste(id, ": ", d$dataused, sep = '')) - return(d) - }) - } - # Step 2: If new compounds were found, then export the infolist.csv and stop the workflow. - # Otherwise, continue! 
- if(2 %in% steps) - { - rmb_log_info("mbWorkflow: Step 2. Export infolist (if required)") - if(length(mb@mbdata)>0) - { - mbdata <- flatten(mb@mbdata) - readr::write_csv(x = mbdata, file = infolist_path, col_names = TRUE, na = "", quote = "needed") - rmb_log_info(paste("The file", infolist_path, "was generated with new compound information. Please check and edit the table, and add it to your infolist folder.")) - return(mb) - } - else - rmb_log_info("No new data added.") - } - # Step 3: Take the archive data (in table format) and reformat it to MassBank tree format. - if(3 %in% steps) - { - rmb_log_info("mbWorkflow: Step 3. Data reformatting") - mb@mbdata_relisted <- apply(mb@mbdata_archive, 1, readMbdata) - } - # Step 4: Compile the spectra! Using the skeletons from the archive data, create - # MassBank records per compound and fill them with peak data for each spectrum. - # Also, assign accession numbers based on scan mode and relative scan no. - if(4 %in% steps) - { - rmb_log_info("mbWorkflow: Step 4. 
Spectra compilation") - mb@compiled <- lapply( - selectSpectra(mb@spectra, "found", "object"), - function(r) { - # guard against NSE warnings from "filter" - filterOK <- NULL - best <- NULL - rmb_log_info(paste("Compiling: ", r@name, sep="")) - mbdata <- mb@mbdata_relisted[[which(mb@mbdata_archive$id == as.numeric(r@id))]] - if(filter) - res <- buildRecord(r, mbdata=mbdata, additionalPeaks=mb@additionalPeaks, filter = filterOK & best) - else - res <- buildRecord(r, mbdata=mbdata, additionalPeaks=mb@additionalPeaks) - return(res) - }) - # check which compounds have useful spectra - mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) - #mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0)) - mb@problems <- which(is.na(mb@compiled)) - mb@compiled_ok <- mb@compiled[mb@ok] - mb@compiled_notOk <- mb@compiled[!mb@ok] - } - # Step 5: Convert the internal tree-like representation of the MassBank data into - # flat-text string arrays (basically, into text-file style, but still in memory) - if(5 %in% steps) - { - rmb_log_info("mbWorkflow: [Legacy Step 5. Flattening records] ignored") - #mb@mbfiles <- lapply(mb@compiled_ok, function(cpd) toMassbank(cpd, mb@additionalPeaks)) - #mb@mbfiles_notOk <- lapply(mb@compiled_notOk, function(c) lapply(c, toMassbank)) - } - # Step 6: For all OK records, generate a corresponding molfile with the structure - # of the compound, based on the SMILES entry from the MassBank record. (This molfile - # is still in memory only, not yet a physical file) - if(6 %in% steps) - { - if(RMassBank.env$export.molfiles){ - rmb_log_info("mbWorkflow: Step 6. Generate molfiles") - mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(as.numeric(c@id))) - } else - warning("RMassBank is configured not to export molfiles (RMassBank.env$export.molfiles). Step 6 is therefore ignored.") - } - # Step 7: If necessary, generate the appropriate subdirectories, and actually write - # the files to disk. 
- if(7 %in% steps) - { - rmb_log_info("mbWorkflow: Step 7. Generate subdirs and export") - - ## create folder - filePath_recData_valid <- file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata") - filePath_recData_invalid <- file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata_invalid") - filePath_molData <- file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata") - - if(!file.exists(filePath_recData_valid)) if(!dir.create(filePath_recData_valid,recursive=TRUE)) stop(paste("Could not create folder", filePath_recData_valid)) - if(RMassBank.env$export.molfiles) - if(!file.exists(filePath_molData)) if(!dir.create(filePath_molData,recursive=TRUE)) stop(paste("Could not create folder", filePath_molData)) - if(RMassBank.env$export.invalid & length(mb@mbfiles_notOk) > 0) - if(!file.exists(filePath_recData_invalid)) if(!dir.create(filePath_recData_invalid,recursive=TRUE)) stop(paste("Could not create folder", filePath_recData_invalid)) - - if(length(mb@molfile) == 0) - mb@molfile <- as.list(rep(x = NA, times = length(mb@compiled_ok))) - - ## export valid spectra - for(cnt in seq_along(mb@compiled_ok)){ - exportMassbank_recdata( - mb@compiled_ok[[cnt]], - recDataFolder = filePath_recData_valid - ) - if(RMassBank.env$export.molfiles) - exportMassbank_moldata( - mb@compiled_ok[[cnt]], - molfile = mb@molfile[[cnt]], - molDataFolder = filePath_molData - ) - } - - ## export invalid spectra - for(cnt in seq_along(mb@compiled_notOk)) - exportMassbank_recdata( - compiled = mb@mbfiles_notOk[[cnt]], - recDataFolder = filePath_recData_invalid - ) - } - # Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank - # to attribute substances to their corresponding structure molfiles. - if(8 %in% steps) - { - if(RMassBank.env$export.molfiles){ - rmb_log_info("mbWorkflow: Step 8. 
Create list.tsv") - makeMollist(compiled = mb@compiled_ok) - } else - warning("RMassBank is configured not to export molfiles (RMassBank.env$export.molfiles). Step 8 is therefore ignored.") - } - return(mb) -} - - -# Calls openbabel and converts the SMILES code string (or retrieves the SMILES code from -# the ID, and then calls openbabel) to create a molfile in text format. -# If fileName is given, the file is directly stored. Otherwise, it is returned as a -# character array. -#' Create MOL file for a chemical structure -#' -#' Creates a MOL file (in memory or on disk) for a compound specified by the -#' compound ID or by a SMILES code. -#' -#' The function invokes OpenBabel (and therefore needs a correctly set -#' OpenBabel path in the RMassBank settings), using the SMILES code retrieved -#' with \code{findSmiles} or using the SMILES code directly. The current -#' implementation of the workflow uses the latter version, reading the SMILES -#' code directly from the MassBank record itself. -#' -#' @usage createMolfile(id_or_smiles, fileName = FALSE) -#' @param id_or_smiles The compound ID or a SMILES code. -#' @param fileName If the filename is set, the file is written directly to disk -#' using the specified filename. Otherwise, it is returned as a text array. -#' @return A character array containing the MOL/SDF format file, ready to be -#' written to disk. -#' @author Michael Stravs -#' @seealso \code{\link{findSmiles}} -#' @references OpenBabel: \url{http://openbabel.org} -#' @examples -#' -#' # Benzene: -#' \dontrun{ -#' createMolfile("C1=CC=CC=C1") -#' } -#' -#' @export -createMolfile <- function(id_or_smiles, fileName = FALSE) -{ - .checkMbSettings() - babeldir <- getOption("RMassBank")$babeldir - - if(!is.numeric(id_or_smiles)){ - smiles <- id_or_smiles - } else{ - if(findLevel(id_or_smiles,TRUE) != "standard"){ - return(c(" ","$$$$")) - } - smiles <- findSmiles(id_or_smiles) - } - # if no babeldir was set, get the result from cactus. 
- if(is.na(babeldir)) - { - res <- getCactus(smiles, "sdf") - - if(any(is.na(res))){ - res <- getPcSDF(smiles) - } - if(any(is.na(res))){ - stop("Pubchem and Cactus both seem to be down.") - } - if(is.character(fileName)) - writeLines(res, fileName) - } - # otherwise use the better-tested OpenBabel toolkit. - else - { - if(!is.character(fileName)) - cmd <- paste(babeldir, "obabel -ismi -osdf -d -b --gen2D", sep='') - else - cmd <- paste(babeldir, "obabel -ismi -osdf ", fileName , " -d -b --gen2D", sep='') - res <- system(cmd, intern=TRUE, input=smiles, ignore.stderr=TRUE) - # If we wrote to a file, read it back as return value. - if(is.character(fileName)) - res <- readLines(fileName) - } - #return(c(" ","$$$$")) - return(res) -} - - - -# Retrieve annotation data for a compound, from the internet service Pubchem -#' Retrieve supplemental annotation data from Pubchem -#' -#' Retrieves annotation data for a compound from the internet service Pubchem -#' based on the inchikey generated by babel or Cactus -#' -#' The data retrieved is the Pubchem CID, a synonym from the Pubchem database, -#' the IUPAC name (using the preferred if available) and a Chebi link -#' -#' @usage gatherPubChem(key) -#' @param key An Inchi-Key -#' @return Returns a list with 4 slots: -#' \code{PcID} The Pubchem CID -#' \code{Synonym} An arbitrary synonym for the compound name -#' \code{IUPAC} A IUPAC-name (preferred if available) -#' \code{Chebi} The identification number of the chebi database -#' @author Erik Mueller -#' @seealso \code{\link{mbWorkflow}} -#' @references Pubchem REST: -#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} -#' Chebi: -#' \url{http://www.ebi.ac.uk/chebi} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherPubChem("QEIXBXXKTUNWDK-UHFFFAOYSA-N")} -#' -#' @export -gatherPubChem <- function(key){ - - PubChemData <- list() - - ##Trycatches are there because pubchem has connection issues 1 in 50 times. 
- ##Write NA into the respective fields if something goes wrong with the conenction or the data. - - ##Retrieve Pubchem CID - tryCatch( - PubChemData$PcID <- getPcId(key), - error=function(e){ - PubChemData$PcID <<- NA - }) - - ##Retrieve a synonym to the name - tryCatch( - PubChemData$Synonym <- getPcSynonym(key), - error=function(e){ - PubChemData$Synonym <<- NA - }) - - ##Retrieve the IUPAC-name - tryCatch( - PubChemData$IUPAC <- getPcIUPAC(key), - error=function(e){ - PubChemData$IUPAC <<- NA - }) - - ##Retrieve the Chebi-ID - tryCatch( - PubChemData$Chebi <- getPcCHEBI(key), - error=function(e){ - PubChemData$Chebi <<- NA - }) - - return(PubChemData) -} - -# Retrieve annotation data for a compound, from the internet service US EPA CCTE -#' Retrieve supplemental annotation data from US EPA -#' -#' Retrieves annotation data for a compound from the internet service US EPA CCTE -#' based on the inchikey generated by babel or Cactus -#' -#' The data retrieved is the US EPA DTXSID, the US EPA chemical dashboard -#' substance ID, the CAS-RN, the DTX preferred name, and the DTXCID (chemical ID). -#' -#' @usage gatherCCTE(key, api_key) -#' @param key An Inchi-Key or other chemical identifier (e.g. 
Chemical name, DTXSID, CASRN, InChIKey, DTXCID) -#' @param api_key An US EPA CCTE API key (personal or application) -#' @return Returns a list with 5 slots: -#' \code{dtxsid} The US EPA chemical dashboard substance id -#' \code{dtxcid} The US EPA chemical dashboard chemical id -#' \code{preferredName} The US EPA chemical dashboard preferred name -#' \code{casrn} The latest CAS registration number -#' \code{smiles} The SMILES annotation of the structure -#' @author Tobias Schulze -#' @seealso \code{\link{mbWorkflow}} -#' @references CCTE REST: -#' \url{https://api-ccte.epa.gov/docs/} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherCCTE("QEIXBXXKTUNWDK-UHFFFAOYSA-N", api_key = NA)} -#' -#' @export -gatherCCTE <- function(key, api_key = NA) { - - # Check if the API key is provided, if not return an empty object - if (is.na(api_key)) { - CCTE_data <- list() - CTTE_data$dtxsid <- NA - CTTE_data$dtxcid <- NA - CTTE_data$preferredname <- NA - CTTE_data$casrn <- NA - CTTE_data$smiles <- NA - return(CCTE_data) - } - - CCTE_data <- list() - - ##Trycatches are there because pubchem has connection issues 1 in 50 times. - ##Write NA into the respective fields if something goes wrong with the conenction or the data. 
- - ##Retrieve DXTSID - tryCatch( - CCTE_data$dtxsid <- getDTXSID(key, api_key), - error=function(e){ - CCTE_data$dtxsid <<- NA - }) - - ##Retrieve DXTCID - tryCatch( - CCTE_data$dtxcid <- getDTXCID(key, api_key), - error=function(e){ - CCTE_data$dtxcid <<- NA - }) - - ##Retrieve preferred name - tryCatch( - CCTE_data$preferredname <- getPrefName(key, api_key), - error=function(e){ - CCTE_data$preferredname <<- NA - }) - - ##Retrieve latest CAS RN - tryCatch( - CCTE_data$casrn <- getCASRN(key, api_key), - error=function(e){ - CCTE_data$casrn <<- NA - }) - - ##Retrieve latest CAS RN - tryCatch( - CCTE_data$smiles <- getDTXSMILES(key, api_key), - error=function(e){ - CCTE_data$smiles <<- NA - }) - - return(CCTE_data) -} - - - -# Retrieve annotation data for a compound, from the internet services Cactvs, Pubchem, Chemspider and CTS. -#' Retrieve annotation data -#' -#' Retrieves annotation data for a compound from the internet services CTS, Pubchem, Chemspider and -#' Cactvs, based on the SMILES code and name of the compounds stored in the -#' compound list. -#' -#' Composes the "upper part" of a MassBank record filled with chemical data -#' about the compound: name, exact mass, structure, CAS no., links to PubChem, -#' KEGG, ChemSpider. The instrument type is also written into this block (even -#' if not strictly part of the chemical information). Additionally, index -#' fields are added at the start of the record, which will be removed later: -#' \code{id, dbcas, dbname} from the compound list, \code{dataused} to indicate -#' the used identifier for CTS search (\code{smiles} or \code{dbname}). -#' -#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are -#' inserted empty and will be filled later on. -#' -#' @usage gatherData(id) -#' @aliases gatherData -#' @param id The compound ID. -#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., -#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... 
-#' @author Michael Stravs -#' @seealso \code{\link{mbWorkflow}} -#' @references Chemical Translation Service: -#' \url{http://uranus.fiehnlab.ucdavis.edu:8080/cts/homePage} -#' cactus Chemical Identifier Resolver: -#' \url{http://cactus.nci.nih.gov/chemical/structure} -#' MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' Pubchem REST: -#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} -#' Chemspider InChI conversion: -#' \url{https://www.chemspider.com/InChI.asmx} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherData(131)} -#' -#' @export -gatherData <- function(id) -{ - ##Preamble: Is a babeldir supplied? - ##If yes, use it - - .checkMbSettings() - usebabel=TRUE - babeldir <- getOption("RMassBank")$babeldir - - if(is.na(babeldir)){ - usebabel=FALSE - } - - - ##Get all useful information from the local "database" (from the CSV sheet) - - smiles <- findSmiles(id) - mass <- findMass(smiles) - dbcas <- findCAS(id) - dbname <- findName(id) - if(is.na(dbname)) dbname <- "" - if(is.na(dbcas)) dbcas <- "" - iupacName <- dbname - synonym <- dbname - formula <- findFormula(id) - - ##Convert SMILES to InChI key via Cactvs or babel. CTS doesn't "interpret" the SMILES per se, - ##it just matches identical known SMILES, so we need to convert to a "searchable" and - ##standardized format beforehand. Other databases are able to interpret the smiles. 
- - if(usebabel){ - cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey') - inchikey_split <- system(cmdinchikey, intern = TRUE, input = smiles, ignore.stderr = TRUE) - } else { - inchikey <- getCactus(identifier = smiles, representation = "stdinchikey") - - if(is.na(inchikey)) { - inchikey <- getPcInchiKey(query = smiles, from = "smiles") - } - - if(!is.na(inchikey)){ - ##Split the "InChiKey=" part off the key - inchikey_split <- strsplit(inchikey, "=", fixed = TRUE)[[1]][[2]] - } else { - inchikey_split <- getPcInchiKey(query = smiles, from = "smiles") - } - } - - ##Use Pubchem to retrieve information - PcInfo <- gatherPubChem(inchikey_split) - - if(!is.null(PcInfo$Synonym) & !is.na(PcInfo$Synonym)){ - synonym <- PcInfo$Synonym - } - - if(!is.null(PcInfo$IUPAC) & !is.na(PcInfo$IUPAC)){ - iupacName <- PcInfo$IUPAC - } - - ##Get Chemspider-ID - csid <- getCSID(inchikey_split) - - if(is.na(csid)){ - ##Get ChemSpider ID from Cactus if the Chemspider page is down - csid <- getCactus(inchikey_split, 'chemspider_id') - } - - ## Get DTXSID - - # Get the api key from the settings - api_key = getOption("RMassBank")$settings$ccte_api_key - - if(!is.null(api_key)) { - dtxsid <- getDTXSID(key = inchikey_split, api_key = api_key) - - if(is.null(dtxsid)){ - dtxsid <- NA - } - } - else { - dtxsid <- NA - } - - - - - ##Use CTS to retrieve information - CTSinfo <- getCtsRecord(inchikey_split) - - if((CTSinfo[1] == "Sorry, we couldn't find any matching results") || is.null(CTSinfo[1])) - { - CTSinfo <- NA - } - - ##List the names - if(iupacName == ""){ - warning(paste0("Compound ID ",id,": no IUPAC name could be identified.")) - } - - if(toupper(dbname) == toupper(synonym)){ - synonym <- dbname - } - - if(toupper(dbname) == toupper(iupacName)){ - iupacName <- dbname - } - - if(toupper(synonym) == toupper(iupacName)){ - synonym <- iupacName - } - - names <- as.list(unique(c(dbname, synonym, iupacName))) - - ##If no name is found, it must be supplied in one way or 
another - if(all(sapply(names, function(x) x == ""))){ - stop("RMassBank wasn't able to extract a usable name for this compound from any database. Please supply a name manually.") - } - - # Start to fill the MassBank record. - # The top 4 entries will not go into the final record; they are used to identify - # the record and also to facilitate manual editing of the exported record table. - mbdata <- list() - mbdata[['id']] <- id - mbdata[['dbcas']] <- dbcas - mbdata[['dbname']] <- dbname - mbdata[['dataused']] <- "smiles" - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors - mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license - mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - # Confidence annotation and internal ID annotation. - # The ID of the compound will be written like: - # COMMENT: EAWAG_UCHEM_ID 1234 - # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" - mbdata[["COMMENT"]] <- list() - if(findLevel(id) == "0"){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment - } else{ - level <- findLevel(id) - if(level %in% c("1","1a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" - } - if(level == c("2")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" - } - if(level == c("2a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" - } - if(level == c("2b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" - } - if(level == c("3")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" - } - if(level == c("3a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure 
(Level 3)" - } - if(level == c("3b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" - } - if(level == c("3c")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" - } - if(level == c("3d")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" - } - if(level == c("4")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" - } - if(level == c("5")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" - } - } - - mbdata[["COMMENT"]][["ID"]] = id - - ## add generic COMMENT information - rowIdx <- which(.listEnvEnv$listEnv$compoundList$ID == id) - properties <- colnames(.listEnvEnv$listEnv$compoundList) - properties2 <- gsub(x = properties, pattern = "^COMMENT ", replacement = "") - theseProperties <- grepl(x = properties, pattern = "^COMMENT ") - theseProperties <- theseProperties & (!(unlist(.listEnvEnv$listEnv$compoundList[rowIdx, ]) == "NA" | is.na(unlist(.listEnvEnv$listEnv$compoundList[rowIdx, ])))) - mbdata[["COMMENT"]][properties2[theseProperties]] <- unlist(.listEnvEnv$listEnv$compoundList[rowIdx, theseProperties]) - - # here compound info starts - mbdata[['CH$NAME']] <- names - # Currently we use a fixed value for Compound Class, since there is no useful - # convention of what should go there and what shouldn't, and the field is not used - # in search queries. 
- mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class - mbdata[['CH$FORMULA']] <- formula - mbdata[['CH$EXACT_MASS']] <- mass - mbdata[['CH$SMILES']] <- smiles - - if(usebabel){ - cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi') - mbdata[['CH$IUPAC']] <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE) - } else{ - mbdata[['CH$IUPAC']] <- getCactus(smiles, "stdinchi") - } - - - - # Add all CH$LINK fields present in the compound datasets - link <- list() - # CAS - if(!is.na(CTSinfo[1])){ - if("CAS" %in% CTS.externalIdTypes(CTSinfo)) - { - # Prefer database CAS if it is also listed in the CTS results. - # otherwise take the shortest one. - cas <- CTS.externalIdSubset(CTSinfo,"CAS") - if(dbcas %in% cas) - link[["CAS"]] <- dbcas - else - link[["CAS"]] <- cas[[which.min(nchar(cas))]] - } else{ - if(dbcas != ""){ - link[["CAS"]] <- dbcas - } - } - } else{ - if(dbcas != ""){ - link[["CAS"]] <- dbcas - } - } - - - # CHEBI - if(is.na(PcInfo$Chebi[1])){ - if(!is.na(CTSinfo[1])){ - if("ChEBI" %in% CTS.externalIdTypes(CTSinfo)) - { - # Cut off front "CHEBI:" if present - chebi <- CTS.externalIdSubset(CTSinfo,"ChEBI") - chebi <- chebi[[which.min(nchar(chebi))]] - chebi <- strsplit(chebi,":")[[1]] - link[["CHEBI"]] <- chebi[[length(chebi)]] - } - } - } else{ - chebi <- PcInfo$Chebi - chebi <- chebi[[which.min(nchar(chebi))]] - chebi <- strsplit(chebi,":")[[1]] - link[["CHEBI"]] <- chebi[[length(chebi)]] - } - # HMDB - if(!is.na(CTSinfo[1])){ - if("Human Metabolome Database" %in% CTS.externalIdTypes(CTSinfo)) - link[["HMDB"]] <- CTS.externalIdSubset(CTSinfo,"HMDB")[[1]] - # KEGG - if("KEGG" %in% CTS.externalIdTypes(CTSinfo)) - link[["KEGG"]] <- CTS.externalIdSubset(CTSinfo,"KEGG")[[1]] - # LipidMAPS - if("LipidMAPS" %in% CTS.externalIdTypes(CTSinfo)) - link[["LIPIDMAPS"]] <- CTS.externalIdSubset(CTSinfo,"LipidMAPS")[[1]] - } - # PubChem CID - if(is.na(PcInfo$PcID[1])){ - if(!is.na(CTSinfo[1])){ - if("PubChem CID" %in% 
CTS.externalIdTypes(CTSinfo)) - { - pc <- CTS.externalIdSubset(CTSinfo,"PubChem CID") - link[["PUBCHEM"]] <- paste0(min(pc)) - } - } - } else{ - link[["PUBCHEM"]] <- PcInfo$PcID[1] - } - - - if(!is.null(link[["PUBCHEM"]])){ - if(substr(link[["PUBCHEM"]],1,4) != "CID:"){ - link[["PUBCHEM"]] <- paste0("CID:", link[["PUBCHEM"]]) - } - } - - link[["INCHIKEY"]] <- inchikey_split - link[["COMPTOX"]] <- dtxsid - if(length(csid)>0) if(any(!is.na(csid))) link[["CHEMSPIDER"]] <- min(as.numeric(as.character(csid[!is.na(csid)]))) - mbdata[['CH$LINK']] <- link - - return(mbdata) -} - -# Retrieve annotation data for a compound, using only babel -#' Retrieve annotation data -#' -#' Retrieves annotation data for a compound by using babel, -#' based on the SMILES code and name of the compounds stored in the -#' compound list. -#' -#' Composes the "upper part" of a MassBank record filled with chemical data -#' about the compound: name, exact mass, structure, CAS no.. -#' The instrument type is also written into this block (even -#' if not strictly part of the chemical information). Additionally, index -#' fields are added at the start of the record, which will be removed later: -#' \code{id, dbcas, dbname} from the compound list. -#' -#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are -#' inserted empty and will be filled later on. -#' -#' This function is an alternative to gatherData, in case CTS is down or if information -#' on one or more of the compounds in the compound list are sparse -#' -#' @usage gatherDataBabel(id) -#' @param id The compound ID. -#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., -#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... 
-#' @author Michael Stravs, Erik Mueller -#' @seealso \code{\link{mbWorkflow}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherDataBabel(131)} -#' -#' @export -gatherDataBabel <- function(id){ - .checkMbSettings() - babeldir <- getOption("RMassBank")$babeldir - smiles <- findSmiles(id) - - - # if no babeldir was set, throw an error that says that either CTS or babel have to be used - if(is.na(babeldir)) - { - stop("No babeldir supplied; It is currently not possible to convert the information without either babel or CTS") - } else { - ###Babel conversion - cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey') - inchikey <- system(cmdinchikey, intern=TRUE, input=smiles, ignore.stderr=TRUE) - cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi') - inchi <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE) - - ##Read from Compoundlist - smiles <- findSmiles(id) - mass <- findMass(smiles) - dbcas <- findCAS(id) - dbname <- findName(id) - if(is.na(dbname)) dbname <- "" - if(is.na(dbcas)) dbcas <- "" - formula <- findFormula(id) - - ##Create - mbdata <- list() - mbdata[['id']] <- id - mbdata[['dbcas']] <- dbcas - mbdata[['dbname']] <- dbname - mbdata[['dataused']] <- "smiles" - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors - mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license - mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - # Confidence annotation and internal ID annotation. 
- # The ID of the compound will be written like: - # COMMENT: EAWAG_UCHEM_ID 1234 - # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" - mbdata[["COMMENT"]] <- list() - if(findLevel(id) == "0"){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment - } else{ - level <- findLevel(id) - if(level %in% c("1","1a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" - } - if(level == c("2")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" - } - if(level == c("2a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" - } - if(level == c("2b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" - } - if(level == c("3")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" - } - if(level == c("3a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" - } - if(level == c("3b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" - } - if(level == c("3c")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" - } - if(level == c("3d")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" - } - if(level == c("4")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" - } - if(level == c("5")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" - } - } - mbdata[["COMMENT"]][["ID"]] <- id - - # here compound info starts - mbdata[['CH$NAME']] <- as.list(dbname) - - # Currently we use a fixed value for Compound Class, since there is no useful - # convention of what should go there and what 
shouldn't, and the field is not used - # in search queries. - mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class - mbdata[['CH$FORMULA']] <- formula - mbdata[['CH$EXACT_MASS']] <- mass - mbdata[['CH$SMILES']] <- smiles - mbdata[['CH$IUPAC']] <- inchi - - link <- list() - if(dbcas != "") - link[["CAS"]] <- dbcas - link[["INCHIKEY"]] <- inchikey - mbdata[['CH$LINK']] <- link - } - return(mbdata) -} - -# Retrieve annotation data for a compound, using only babel -#' Retrieve annotation data -#' -#' Retrieves annotation data for an unknown compound by using basic information present -#' -#' Composes the "upper part" of a MassBank record filled with chemical data -#' about the compound: name, exact mass, structure, CAS no.. -#' The instrument type is also written into this block (even -#' if not strictly part of the chemical information). Additionally, index -#' fields are added at the start of the record, which will be removed later: -#' \code{id, dbcas, dbname} from the compound list. -#' -#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are -#' inserted empty and will be filled later on. -#' -#' This function is used to generate the data in case a substance is unknown, -#' i.e. not enough information is present to derive anything about formulas or links -#' -#' @usage gatherDataUnknown(id, mode, retrieval) -#' @param id The compound ID. -#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions -#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-). -#' @param retrieval A value that determines whether the files should be handled either as "standard", -#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown" -#' if the only know thing is the m/z -#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., -#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... 
-#' @author Michael Stravs, Erik Mueller -#' @seealso \code{\link{mbWorkflow}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' -#' # Gather data for compound ID 131 -#' \dontrun{gatherDataUnknown(131,"pH")} -#' -#' @export -gatherDataUnknown <- function(id, mode, retrieval){ - .checkMbSettings() - - ##Read from Compoundlist - smiles <- "" - if(retrieval == "unknown"){ - mass <- findMass(id, "unknown", mode) - formula <- "" - } - if(retrieval == "tentative"){ - mass <- findMass(id, "tentative", mode) - formula <- findFormula(id, "tentative") - } - dbcas <- NA - dbname <- findName(id) - if(is.na(dbname)) dbname <- paste("Unknown ID:",id) - if(is.na(dbcas)) dbcas <- "" - - - - ##Create - mbdata <- list() - mbdata[['id']] <- id - mbdata[['dbcas']] <- dbcas - mbdata[['dbname']] <- dbname - mbdata[['dataused']] <- "none" - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors - mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license - mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - # Confidence annotation and internal ID annotation. 
- # The ID of the compound will be written like: - # COMMENT: EAWAG_UCHEM_ID 1234 - # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" - mbdata[["COMMENT"]] <- list() - if(findLevel(id) == "0"){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment - } else{ - level <- findLevel(id) - if(level %in% c("1","1a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Reference Standard (Level 1)" - } - if(level == c("2")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure, tentative identification (Level 2)" - } - if(level == c("2a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via library match, tentative identification (Level 2a)" - } - if(level == c("2b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Probable structure via diagnostic evidence, tentative identification (Level 2b)" - } - if(level == c("3")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification only (Level 3)" - } - if(level == c("3a")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: most likely structure (Level 3)" - } - if(level == c("3b")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: isomers possible (Level 3)" - } - if(level == c("3c")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: substance class known (Level 3)" - } - if(level == c("3d")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: best match only (Level 3)" - } - if(level == c("4")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: molecular formula only (Level 4)" - } - if(level == c("5")){ - mbdata[["COMMENT"]][["CONFIDENCE"]] <- "Tentative identification: structure and formula unknown (Level 5)" - } - } - mbdata[["COMMENT"]][["ID"]] <- id - - # here compound info starts - mbdata[['CH$NAME']] <- as.list(dbname) - - # Currently we use a fixed value for Compound Class, since there is no useful - # convention of what should go there and what 
shouldn't, and the field is not used - # in search queries. - mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class - mbdata[['CH$FORMULA']] <- formula - mbdata[['CH$EXACT_MASS']] <- mass - mbdata[['CH$SMILES']] <- "" - mbdata[['CH$IUPAC']] <- "" - - link <- list() - mbdata[['CH$LINK']] <- link - - return(mbdata) -} - -# Flatten the internal tree-like representation of MassBank data to a flat table. -# Note that this limits us, in that the fields should be constant over all records! -# Therefore, e.g. the fixed number of 3 names which may be filled. -# If anybody has a cooler solution, I'll be happy to hear from you :) -# -# Note: the records from gatherData have additional information which is discarded, like -# author, copyright etc. They will be re-filled automatically when reading the file. -#' Flatten, or re-read, MassBank header blocks -#' -#' \code{flatten} converts a list of MassBank compound information sets (as -#' retrieved by \code{\link{gatherData}}) to a flat table, to be exported into -#' an \link[=loadInfolist]{infolist}. \code{readMbdata} reads a single record -#' from an infolist flat table back into a MassBank (half-)entry. -#' -#' Neither the flattening system itself nor the implementation are particularly -#' fantastic, but since hand-checking of records is a necessary evil, there is -#' currently no alternative (short of coding a complete GUI for this and -#' working directly on the records.) -#' -#' @aliases flatten readMbdata -#' @usage flatten(mbdata) -#' -#' readMbdata(row) -#' @param mbdata A list of MassBank compound information sets as returned from -#' \code{\link{gatherData}}. -#' @param row One row of MassBank compound information retrieved from an -#' infolist. -#' @return \code{flatten} returns a tibble (not a data frame or matrix) to be written to -#' CSV. -#' -#' \code{readMbdata} returns a list of type \code{list(id= \var{compoundID}, -#' ..., 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. 
-#' @author Michael Stravs -#' @seealso \code{\link{gatherData}},\code{\link{loadInfolist}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples \dontrun{ -#' # Collect some data to flatten -#' ids <- c(40,50,60,70) -#' data <- lapply(ids, gatherData) -#' # Flatten the data trees to a table -#' flat.table <- flatten(data) -#' # reimport the table into a tree -#' data.reimported <- apply(flat.table, 1, readMbdata) -#' } -#' -#' @export -#' -flatten <- function(mbdata) -{ - .checkMbSettings() - - colNames <- names(unlist(mbdata[[1]])) - commentNames <- colNames[grepl(x = colNames, pattern = "^COMMENT\\_")] - if(!is.null(mbdata[[1]]$COMMENT)) { - commentNames <- c(commentNames, glue::glue("COMMENT_{names(mbdata[[1]]$COMMENT)}")) - } - - colList <- c( - "id", - "dbcas", - "dbname", - "dataused", - commentNames, - #"COMMENT_CONFIDENCE", - # Note: The field name of the internal id field is replaced with the real name - # at "compilation" time. Therefore, functions DOWNSTREAM from compileRecord() - # must use the full name including the info from options("RMassBank"). - #"COMMENT_ID", - "CH$NAME1", - "CH$NAME2", - "CH$NAME3", - "CH$NAME4", - "CH$NAME5", - "CH$COMPOUND_CLASS", - "CH$FORMULA", - "CH$EXACT_MASS", - "CH$SMILES", - "CH$IUPAC", - "CH$LINK_CAS", - "CH$LINK_CHEBI", - "CH$LINK_HMDB", - "CH$LINK_KEGG", - "CH$LINK_LIPIDMAPS", - "CH$LINK_PUBCHEM", - "CH$LINK_INCHIKEY", - "CH$LINK_CHEMSPIDER", - "CH$LINK_COMPTOX" - ) - # make an empty data frame with the right length - rows <- length(mbdata) - cols <- length(colList) - - mbtbl <- tibble::tibble(!!!colList, .rows = 0, .name_repair = ~ colList) - - - #mbframe <- matrix(data = NA, nrow = rows, ncol = cols) - #colnames(mbframe) <- colList - #browser() - for(i in 1:rows) { - # fill in all the data into the dataframe: all columns which - # a) exist in the target dataframe and b) exist in the (unlisted) MB record - # are written into the dataframe. 
- data <- unlist(mbdata[[i]], use.names = TRUE) - names(data) <- gsub("\\.", "_", names(data)) - # bugfix for the case of only one name - if(!("CH$NAME1" %in% names(data))) { - data[["CH$NAME1"]] <- data[["CH$NAME"]] - } - datacols <- intersect(colList, names(data)) - - mbtbl <- mbtbl |> dplyr::bind_rows(data[datacols]) - - } - - return(mbtbl) - -} - -# Read data from a flat-table MassBank record row and feed it into a -# MassBank tree-like record. Also, prime the ACCESSION and RECORD_TITLE fields in the -# correct position in the record. -#' @export -readMbdata <- function(row) -{ - .checkMbSettings() - - # Listify the table row. Lists are just cooler to work with :) - row <- as.list(row) - - mbdata <- list() - # Accession and title are added empty for now, to have them in the right place. - # Constants are read from the options or generated. - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors - mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license - mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright - if(getOption("RMassBank")$annotations$publication!="") { - mbdata[['PUBLICATION']] <- getOption("RMassBank")$annotations$publication - } - if(!is.na(row[["PUBLICATION"]])) - mbdata[["PUBLICATION"]] = row[["PUBLICATION"]] - commentNames <- names(row)[grepl(x = names(row), pattern = "^COMMENT\\.")] - commentNames <- c(commentNames, names(row)[grepl(x = names(row), pattern = "^COMMENT\\_")]) - commentNames <- commentNames[!is.na(row[commentNames])] - - # Read all determined fields from the file - # This is not very flexible, as you can see... 
- colList <- c( - commentNames, - #"COMMENT_CONFIDENCE", - #"COMMENT_ID", - "CH$NAME1", - "CH$NAME2", - "CH$NAME3", - "CH$NAME4", - "CH$NAME5", - "CH$COMPOUND_CLASS", - "CH$FORMULA", - "CH$EXACT_MASS", - "CH$SMILES", - "CH$IUPAC", - "CH$LINK_CAS", - "CH$LINK_CHEBI", - "CH$LINK_HMDB", - "CH$LINK_KEGG", - "CH$LINK_LIPIDMAPS", - "CH$LINK_PUBCHEM", - "CH$LINK_INCHIKEY", - "CH$LINK_CHEMSPIDER", - "CH$LINK_COMPTOX") - mbdata[["COMMENT"]] = list() - #mbdata[["COMMENT"]][["CONFIDENCE"]] <- row[["COMMENT_CONFIDENCE"]] - # Again, our ID field. - #mbdata[["COMMENT"]][["ID"]] <- row[["COMMENT_D"]] - mbdata[["COMMENT"]][gsub(x = commentNames, pattern = "^COMMENT\\_", replacement = "")] <- row[commentNames] - - names = c(row[["CH$NAME1"]], row[["CH$NAME2"]], row[["CH$NAME3"]], row[["CH$NAME4"]], row[["CH$NAME5"]]) - names = names[which(!is.na(names))] - - names <- gsub("'", "`", names) - mbdata[["CH$NAME"]] = names - mbdata[["CH$COMPOUND_CLASS"]] = row[["CH$COMPOUND_CLASS"]] - mbdata[["CH$FORMULA"]] = row[["CH$FORMULA"]] - mbdata[["CH$EXACT_MASS"]] = row[["CH$EXACT_MASS"]] - mbdata[["CH$SMILES"]] = row[["CH$SMILES"]] - mbdata[["CH$IUPAC"]] = row[["CH$IUPAC"]] - # Add all links and then eliminate the NA values from the tree. 
- link = list() - link[["CAS"]] = row[["CH$LINK_CAS"]] - link[["CHEBI"]] = row[["CH$LINK_CHEBI"]] - link[["HMDB"]] = row[["CH$LINK_HMDB"]] - link[["KEGG"]] = row[["CH$LINK_KEGG"]] - link[["LIPIDMAPS"]] = row[["CH$LINK_LIPIDMAPS"]] - link[["PUBCHEM"]] = row[["CH$LINK_PUBCHEM"]] - link[["INCHIKEY"]] = row[["CH$LINK_INCHIKEY"]] - link[["CHEMSPIDER"]] = row[["CH$LINK_CHEMSPIDER"]] - link[["COMPTOX"]] = row[["CH$LINK_COMPTOX"]] - link[which(is.na(link))] <- NULL - mbdata[["CH$LINK"]] <- link - - ## SP$SAMPLE - if(all(nchar(row[["SP_SAMPLE"]]) > 0, row[["SP_SAMPLE"]] != "NA", !is.na(row[["SP_SAMPLE"]]), na.rm = TRUE)) - mbdata[['SP$SAMPLE']] <- row[["SP_SAMPLE"]] - - if(!is.na(row[["AUTHORS"]])) - mbdata[["AUTHORS"]] = row[["AUTHORS"]] - - if(!is.na(row[["COPYRIGHT"]])) - mbdata[["COPYRIGHT"]] = row[["COPYRIGHT"]] - - - - - - return(mbdata) - -} - -#' Generate peak annotation from peaklist -#' -#' Generates the PK$ANNOTATION entry from the peaklist obtained. This function is -#' overridable by using the "annotator" option in the settings file. -#' -#' @param annotation A peak list to be annotated. Contains columns: -#' \code{"cpdID","formula","mzFound" ,"scan","mzCalc","dppm", -#' "dbe","mz","int","formulaCount","parentScan","fM_factor","dppmBest", -#' "formulaMultiplicity","intrel","mzSpec"} -#' -#' @param formulaTag The ion type to be added to annotated formulas ("+" or "-" usually) -#' -#' @return The annotated peak table. Table \code{colnames()} will be used for the -#' titles (preferrably don't use spaces in the column titles; however no format is -#' strictly enforced by the MassBank data format. 
-#' -#' @examples -#' \dontrun{ -#' annotation <- annotator.default(annotation) -#' } -#' @author Michele Stravs, Eawag -#' @export -annotator.default <- function(annotation, formulaTag) -{ - if(!is.null(formulaTag)) - type <- formulaTag - else - type <- "" - - annotation <- annotation[!is.na(annotation$formula),,drop=FALSE] - annotation <- annotation[annotation$formula != "",,drop=FALSE] - - annotation$formula <- paste(annotation$formula, rep(type, length(annotation$formula)), sep='') - # Select the right columns and name them correctly for output. - annotation <- annotation[,c("mz","formula", "formulaCount", "mzCalc", "dppm")] - colnames(annotation) <- c("m/z", "tentative_formula", "formula_count", "mass", "error(ppm)") - - return(annotation) -} - -#' Parse record title -#' -#' Parses a title for a single MassBank record using the title format -#' specified in the option titleFormat. Internally used, not exported. -#' -#' If the option is not set, a standard title format is used (for record definition -#' version 1 or 2). -#' -#' @usage .parseTitleString(mbdata) -#' @param mbdata list -#' The information data block for the record header, as stored in -#' \code{mbdata_relisted} after loading an infolist. -#' @return A string with the title. -#' @author Michael Stravs, Eawag -#' @seealso \code{\link{buildRecord}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' \dontrun{ -#' # used in buildRecord() -#' title <- .parseTitleString(mbdata) -#' } -#' -#' -#' -.parseTitleString <- function(mbdata) -{ - - varlist <- getOption("RMassBank")$titleFormat - - # Set the standard title format. 
- if(is.null(varlist)) - { - if(getOption("RMassBank")$use_version == 2) - { - varlist <- c( - "{CH$NAME}", - "{AC$INSTRUMENT_TYPE}", - "{AC$MASS_SPECTROMETRY: MS_TYPE}", - "CE: {RECORD_TITLE_CE}", - "R={AC$MASS_SPECTROMETRY: RESOLUTION}", - "{MS$FOCUSED_ION: PRECURSOR_TYPE}" - ) - } - else - { - varlist <- c( - "{CH$NAME}", - "{AC$INSTRUMENT_TYPE}", - "{AC$ANALYTICAL_CONDITION: MS_TYPE}", - "CE: {RECORD_TITLE_CE}", - "R={AC$ANALYTICAL_CONDITION: RESOLUTION}", - "{MS$FOCUSED_ION: PRECURSOR_TYPE}" - ) - } - } - - - # Extract a {XXX} argument from each title section. - # check that every title has one and only one match - args <- regexec("\\{(.*)\\}", varlist) - arglist <- regmatches(varlist, args) - if(any(unlist(lapply(arglist, length)) != 2)) - stop("Title format is incorrectly specified: a section with not exactly 1 parameters") - - parsedVars <- lapply(varlist, function(var) - { - # Extract the specified parameter inside the {}. - # I.e. from a string like "R={BLA: BLUB}" return "BLA: BLUB" - args <- regexec("\\{(.*)\\}", var) - arg <- regmatches(var, args)[[1]][[2]] - # Split the parameter by colon if necessary - splitVar <- strsplit(arg, ": ")[[1]] - # Read the parameter value from the record - if(length(splitVar) == 2) - replaceVar <- mbdata[[splitVar[[1]]]][[splitVar[[2]]]] - else if(length(splitVar) == 1) - replaceVar <- mbdata[[splitVar]] - else - stop(paste( - "Title format is incorrectly specified:", var) - ) - # Fix problems: NULL returns - if(is.null(replaceVar)) - replaceVar <- "" - # Fix problems: Names will have >= 1 match. Take the first - if(length(replaceVar) > 1) - replaceVar <- replaceVar[[1]] - - # Fix problems: Unknowns might have no name - if(!length(replaceVar)){ - replaceVar <- "" - } - - # Substitute the parameter value into the string - parsedVar <- sub("\\{(.*)\\}", replaceVar, var) - return(parsedVar) - }) - title <- paste(parsedVars, collapse="; ") - return(title) -} - - -# This converts the tree-like list (as obtained e.g. 
from compileRecord()) -# into a plain text array, which can then be dumped to a file suitable for -# MassBank upload. - -#' Write MassBank record into character array -#' -#' Writes a MassBank record in list format to a text array. -#' -#' The function is a general conversion tool for the MassBank format; i.e. the -#' field names are not fixed. \code{mbdata} must be a named list, and the -#' entries can be as follows: \itemize{ -#' \item A single text line: -#' -#' \code{'CH\$EXACT_MASS' = '329.1023'} -#' -#' is written as -#' -#' \code{CH\$EXACT_MASS: 329.1023} -#' \item A character array: -#' -#' \code{'CH\$NAME' = c('2-Aminobenzimidazole', '1H-Benzimidazol-2-amine')} -#' -#' is written as -#' -#' \code{CH\$NAME: 2-Aminobenzimidazole} -#' -#' \code{CH\$NAME: 1H-Benzimidazol-2-amine} -#' -#' \item A named list of strings: -#' -#' \code{'CH\$LINK' = list('CHEBI' = "27822", "KEGG" = "C10901")} -#' -#' is written as -#' -#' \code{CH\$LINK: CHEBI 27822} -#' -#' \code{CH\$LINK: KEGG C10901} -#' -#' \item A data frame (e.g. the peak table) is written as specified in -#' the MassBank record format (Section 2.6.3): the column names are used as -#' headers for the first line, all data rows are printed space-separated. -#' } -#' -#' @usage toMassbank(o, ...) -#' @param o An object to convert to MassBank record format. This may be -#' a single `RmbSpectrum2`, or a complete compound (an `RmbSpectraSet`), -#' @param ... Parameters passed to the implementation, -#' in particular `addAnnotation` -#' @param addAnnotation `logical`, whether to add peak annotations (putative formulas) to the record. -#' -#' @return The result is a text array, which is ready to be written to the disk -#' as a file. -#' @note The function iterates over the list item names. 
\bold{This means that -#' duplicate entries in \code{mbdata} are (partially) discarded!} The correct -#' way to add them is by making a character array (as specified above): Instead -#' of \code{'CH\$NAME' = 'bla', 'CH\$NAME' = 'blub'} specify \code{'CH\$NAME' = -#' c('bla','blub')}. -#' @author Michael Stravs -#' @seealso \code{\link{buildRecord}}, \code{\link{mbWorkflow}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @examples -#' \dontrun{ -#' # Read just the compound info skeleton from the Internet for some compound ID -#' id <- 35 -#' mbdata <- gatherData(id) -#' #' # Export the mbdata blocks to line arrays -#' # (there is no spectrum information, just the compound info...) -#' mbtext <- toMassbank(mbdata) -#' } -#' -#' -#' @export -setGeneric("toMassbank", function(o, ...) standardGeneric("toMassbank")) - - -#' @rdname toMassbank -#' @export -setMethod("toMassbank", "RmbSpectraSet", function(o, addAnnotation = getOption("RMassBank")$add_annotation) - { - lapply(o@children, function(s) toMassbank(s, addAnnotation)) - }) - -#' @rdname toMassbank -#' @export -setMethod("toMassbank", "RmbSpectrum2", function(o, addAnnotation = getOption("RMassBank")$add_annotation) - { - .toMassbank(o, addAnnotation) - }) - -.toMassbank <- function (s, addAnnotation = getOption("RMassBank")$add_annotation) -{ - - peaks <- getData(s) - # check that peaks were normalized - if(!("intrel" %in% colnames(peaks))) - { - s <- normalize(s, slot="intrel") - peaks <- getData(s) - } - - # Keep only peaks with relative intensity >= 1 o/oo, since the MassBank record - # makes no sense otherwise. Also, keep only the columns needed in the output. 
- peaks <- peaks[ peaks$intrel >= 1,,drop=FALSE] - - peaks$mz <- round(peaks$mz, 4) - # Also format the other values, which are used in the annotation - peaks$dppm <- round(peaks$dppm, 2) - peaks$mzCalc <- round(peaks$mzCalc, 4) - peaks$intensity <- round(peaks$intensity, 1) - - # Get polarity from Spectrum2 now! - formulaTag <- "" - if(s@polarity == 1) formulaTag <- "+" - if(s@polarity == 0) formulaTag <- "-" - # if polarity is -1, leave it unspecified. the "specs" seem to be 1 for +, 0 for - and -1 for ??? - # (when reading mzML I often get -1, when reading mzXML I get 1 and 0 respectively) - - annotator <- getOption("RMassBank")$annotator - if(is.null(annotator)) - annotator <- "annotator.default" - - annotation <- do.call(annotator, list(annotation= peaks, formulaTag = formulaTag)) - - peaks <- peaks[,c("mz", "intensity", "intrel")] - peaks <- unique(peaks) - # Name the columns correctly. - colnames(peaks) <- c("m/z", "int.", "rel.int.") - peaknum <- nrow(peaks) - - mbdata <- s@info - - mbdata[["PK$SPLASH"]] <- list(SPLASH = getSplash(peaks[,c("m/z", "int.")])) - - # Annotation: - if(addAnnotation && (nrow(annotation) > 0)) - mbdata[["PK$ANNOTATION"]] <- annotation - - # Peak table - mbdata[["PK$NUM_PEAK"]] <- peaknum - mbdata[["PK$PEAK"]] <- peaks - - # mbf is an array of lines and count is the line counter. - # Very old-school, but it works. :) - mbf <- character(0) - count <- 1 - lapply(names(mbdata), function(entry) - { - # If entry is a char line, add it to the file. - # If it is a named sublist, add each subentry with name - # If it is an unnamed sublist, add each subentry without name - # if it is a dataframe, write in PEAKS mode - - # Note: this is were I liked "lapply" a little too much. "for" would - # be more idiomatic, and wouldn't need the <<- assignments. - - # Data frame: table mode. A header line and one space-separated line for - # each data frame row. 
- if(is.data.frame(mbdata[[entry]])) - { - mbf[[count]] <<- paste(entry,": " , - paste(colnames(mbdata[[entry]]), collapse=" "), - sep='') - count <<- count+1 - for(row in 1:nrow(mbdata[[entry]])) - { - mbf[[count]] <<- paste(" ", - paste( - prettyNum(mbdata[[entry]][row,], scientific = FALSE, digits = 12), - collapse=" "), - sep="") - count <<- count+1 - } - #browser() - } - # List with named items: Write every entry like CH$LINK: CAS 12-345-678 - else if(is.list(mbdata[[entry]]) & !is.null(names(mbdata[[entry]]))) - { - - lapply(names(mbdata[[entry]]), function(subentry) - { - if(subentry != "SPLASH"){ - mbf[[count]] <<- paste(entry,": ",subentry, " ", mbdata[[entry]][[subentry]], sep='') - } else { - mbf[[count]] <<- paste(entry,": ", mbdata[[entry]][[subentry]], sep='') - } - #print(mbf) - count <<- count + 1 - }) - } - # Array (or list) of unnamed items: Write every entry like CH$NAME: Paracetamol - # (iterative entry without subindices) - else if (length(mbdata[[entry]]) > 1 & is.null(names(mbdata[[entry]]))) - { - lapply(mbdata[[entry]], function(subentry) - { - mbf[[count]] <<- paste(entry,": ",subentry, sep='') - #print(mbf) - count <<- count + 1 - }) - } - # Length is 1: just write the entry like PK$NUM_PEAKS: 131 - else - { - mbf[[count]] <<- paste(entry,": ",mbdata[[entry]], sep='') - count <<- count + 1 - } - } - ) # End of lapply block (per child spectrum) - # Add mandatory EOF marker - mbf[[count]] <- "//" - return(mbf) -} - -# Exports compiled and massbanked spectra, with their associated molfiles, to physical files. -# "compiled" is still used here, because we need an accessible accession number. -# In the plain text arrays, the accession number is already "hidden". -# compiled: is ONE "compiled" entry, i.e. ONE compound with e.g. 14 spectra. -# files: is a return value from lapply(toMassbank), i.e. 
contains 14 plain-text arrays -# (for a 14-spectra method) -# molfile: a molfile from createMolfile - -#' Export internally stored MassBank data to files -#' -#' Exports MassBank recfile data arrays and corresponding molfiles to physical -#' files on hard disk, for one compound. -#' -#' The data from \code{compiled} is still used here, because it contains the -#' "visible" accession number. In the plain-text format contained in -#' \code{files}, the accession number is not "accessible" anymore since it's in -#' the file. -#' -#' @usage exportMassbank(compiled, molfile = NULL) -#' @param compiled \code{RmbSpectraSet} -#' the spectra of one compound for which files should be exported -#' @param molfile A molfile from \code{\link{createMolfile}}; -#' deprecated since molfiles are not used by MassBank anymore. -#' @return No return value. -#' @note An improvement would be to write the accession numbers into -#' \code{names(compiled)} and later into \code{names(files)} so \code{compiled} -#' wouldn't be needed here anymore. (The compound ID would have to go into -#' \code{names(molfile)}, since it is also retrieved from \code{compiled}.) 
-#' @author Michael Stravs -#' @seealso \code{\link{createMolfile}}, \code{\link{toMassbank}}, -#' \code{\link{mbWorkflow}} -#' @references MassBank record format: -#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} -#' @export -exportMassbank <- function(compiled, molfile = NULL) -{ - exportMassbank_recdata( - compiled, - recDataFolder = file.path(getOption("RMassBank")$annotations$entry_prefix, "recdata") - ) - if(!is.null(molfile)) { - exportMassbank_moldata( - compiled, - molfile, - molDataFolder = file.path(getOption("RMassBank")$annotations$entry_prefix, "moldata") - ) - } -} - -exportMassbank_recdata <- function(compiled, recDataFolder) -{ - #mb@mbfiles <- lapply(mb@compiled_ok, function(cpd) toMassbank(cpd, mb@additionalPeaks)) - - files <- toMassbank(compiled) - names(files) <- lapply(compiled@children, function(c) c@info[["ACCESSION"]] ) - - molnames <- c() - for(file in seq_len(length(files))) - { - # Read the accession no. from the corresponding "compiled" entry - fileName <- names(files)[[file]] - # use this accession no. as fileName - fileName <- paste(fileName, ".txt", sep="") - filePath <- file.path(recDataFolder,fileName) - write(files[[file]], filePath) - } -} - -exportMassbank_moldata <- function(compiled, molfile, molDataFolder) -{ - # Use internal ID for naming the molfiles - if(findLevel(compiled@id,TRUE)=="standard"){ - molname <- sprintf("%04d", as.numeric(compiled@id)) - molname <- paste(molname, ".mol", sep="") - write(molfile, file.path(molDataFolder,molname)) - } -} - - - - - -# Makes a list.tsv with molfile -> massbank ch$name attribution. - -#' Write list.tsv file -#' -#' Makes a list.tsv file in the "moldata" folder. -#' -#' Generates the list.tsv file which is needed by MassBank to connect records with -#' their respective molfiles. The first compound name is linked to a mol-file with -#' the compound ID (e.g. 2334.mol for ID 2334). 
-#' -#' @param compiled list of \code{RmbSpectraSet} -#' compiled spectra for multiple compounds (one \code{RmbSpectraSet} each). -#' @return No return value. -#' @author Michael A. Stravs, Eawag -#' @export -makeMollist <- function(compiled) -{ - # For every "compiled" entry (here, compiled is not one "compiled" entry but the total - # list of all compiled spectra), extract the uppermost CH$NAME and the ID (from the - # first spectrum.) Make the ID into 0000 format. - - emptySpectra <- unlist(lapply(compiled, function(cpd) length(cpd@children) == 0)) - compiled <- compiled[!emptySpectra] - - tsvlist <- t(sapply(compiled, function(entry) - { - name <- entry@children[[1]]@info[["CH$NAME"]][[1]] - id <- sprintf("%04d", as.numeric(entry@id)) - molfilename <- paste(id,".mol",sep='') - return(c(name,molfilename)) - })) - - IDs <- sapply(compiled, function(entry) return( sprintf("%04d", as.numeric( - entry@id)))) - level <- sapply(IDs, findLevel, compact=TRUE) - validentries <- which(level == "standard") - # Write the file with the - write.table(tsvlist[validentries,], - paste(getOption("RMassBank")$annotations$entry_prefix,"/moldata/list.tsv", sep=''), - quote = FALSE, - sep="\t", - row.names=FALSE, - col.names=FALSE - ) -} - - -# Load a dataframe or file into additional_peaks (or add additional points in there.) -# The columns cpdID, scan, mzFound, int, OK are mandatory. OK=1 means that the peaks -# will be added into the spectrum. mzFound and int will be taken for the table. -# No annotation will be written. -# Add peaks to the spectra by hand - -#' Add additional peaks to spectra -#' -#' Loads a table with additional peaks to add to the MassBank spectra. Required -#' columns are \code{cpdID, scan, int, mzFound, OK}. -#' -#' All peaks with OK=1 will be included in the spectra. -#' -#' @usage addPeaks(mb, filename_or_dataframe) -#' @param mb The \code{mbWorkspace} to load the peaks into. 
-#' @param filename_or_dataframe Filename of the csv file, or name of the R -#' dataframe containing the peaklist. -#' @return The \code{mbWorkspace} with loaded additional peaks. -#' @author Michael Stravs -#' @seealso \code{\link{mbWorkflow}} -#' @examples -#' -#' \dontrun{addPeaks("myrun_additionalPeaks.csv")} -#' -#' @export -addPeaks <- function(mb, filename_or_dataframe) -{ - - errorvar <- 0 - currEnvir <- environment() - d <- 1 - - if(is.data.frame(filename_or_dataframe)) - df <- filename_or_dataframe - else - tryCatch( - df <- readr::read_csv(filename_or_dataframe), - df <- as.data.frame(df), - error=function(e){ - currEnvir$errorvar <- 1 - }) - # I change your heuristic fix to another heuristic fix, because I will have to test for a column name change... - - if(!errorvar){ - - if(ncol(df) < 2){ - df <- readr::read_delim(file = filename_or_dataframe, delim = ";") - df <- as.data.frame(df) - } - # here: the column int was renamed to intensity, and we need to be able to read old files. sorry. - if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df))) - df$intensity <- df$int - - cols <- c("cpdID", "scan", "mzFound", "intensity", "OK") - n <- colnames(df) - # Check if comma-separated or semicolon-separated - d <- setdiff(cols, n) - if(length(d)>0){ - stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, intensity, OK.") - } - } - - culled_df <- df[,c("cpdID", "scan", "mzFound", "intensity", "OK")] - - - if(nrow(mb@additionalPeaks) == 0) - mb@additionalPeaks <- culled_df - else - mb@additionalPeaks <- rbind(mb@additionalPeaks, culled_df) - return(mb) -} - - - -gatherDataMinimal.cpd <- function(cpd){ - - ##Read from Compoundlist - if(length(cpd@smiles) == 1) smiles <- cpd@smiles - else - smiles <- "" - - ##Create - mbdata <- list() - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - # Confidence annotation and internal ID annotation. 
- # The ID of the compound will be written like: - # COMMENT: EAWAG_UCHEM_ID 1234 - # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" - if(length(cpd@id) > 0) - mbdata[["COMMENT"]][["ID"]] <- cpd@id - - # here compound info starts - mbdata[['CH$NAME']] <- cpd@name - - # Currently we use a fixed value for Compound Class, since there is no useful - # convention of what should go there and what shouldn't, and the field is not used - # in search queries. - mbdata[['CH$FORMULA']] <- cpd@formula - mbdata[['CH$EXACT_MASS']] <- round(findMz.formula(cpd@formula, "")$mzCenter, 4) - - if(cpd@smiles != "") - mbdata[['CH$SMILES']] <- cpd@smiles - - link <- list() - mbdata[['CH$LINK']] <- link - - return(mbdata) -} - - - -gatherDataMinimal.spectrum <- function(spectrum){ - - - smiles <- "" - - ##Create - mbdata <- list() - mbdata[['ACCESSION']] <- "" - mbdata[['RECORD_TITLE']] <- "" - mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") - # Confidence annotation and internal ID annotation. - # The ID of the compound will be written like: - # COMMENT: EAWAG_UCHEM_ID 1234 - # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" - - # here compound info starts - mbdata[['CH$NAME']] <- paste("parent", spectrum@precursorMz, "at RT", spectrum@rt, "- CE", spectrum@collisionEnergy) - - # Currently we use a fixed value for Compound Class, since there is no useful - # convention of what should go there and what shouldn't, and the field is not used - # in search queries. - - return(mbdata) -} - - +# Script for writing MassBank files + +#testtest change +#' Load MassBank compound information lists +#' +#' Loads MassBank compound information lists (i.e. the lists which were created +#' in the first two steps of the MassBank \code{\link{mbWorkflow}} and +#' subsequently edited by hand.). +#' +#' \code{resetInfolists} clears the information lists, i.e. it creates a new +#' empty list in \code{mbdata_archive}. 
#' \code{loadInfolist} loads a single CSV
#' file, whereas \code{loadInfolists} loads a whole directory.
#'
#' @aliases loadInfolists loadInfolist resetInfolists
#' @usage loadInfolists(mb, path)
#'
#' loadInfolist(mb, fileName)
#'
#' resetInfolists(mb)
#' @param path Directory in which the namelists reside. All files in this
#' directory whose name ends in \code{.csv} will be loaded.
#' @param fileName A single namelist to be loaded.
#' @param mb The \code{mbWorkspace} to load/reset the lists in.
#' @return The new workspace with loaded/reset lists.
#' @author Michael Stravs, Tobias Schulze
#' @examples
#'
#' #
#' \dontrun{mb <- resetInfolists(mb)
#' mb <- loadInfolist(mb, "my_csv_infolist.csv")}
#'
#' @export
loadInfolists <- function(mb, path)
{
  # list.files() interprets `pattern` as a regular expression, so the dot has
  # to be escaped and the match anchored at the end of the name. A bare ".csv"
  # would also pick up names such as "x.csv.bak" or "notcsv.txt".
  archivefiles <- list.files(path, pattern = "\\.csv$", full.names = TRUE)

  # Load the files one after the other; each call returns the updated
  # workspace, which is handed on to the next call.
  for (afile in archivefiles) {
    mb <- loadInfolist(mb, afile)
  }
  return(mb)
}

# Load an "infolist". This loads a CSV file which should contain the entries
# edited and controlled by hand. All compound infos from fileName are added into the
# global mbdata_archive. Entries with a cpdID which was already present, are substituted
# by new entries from the fileName file.
#' @export
loadInfolist <- function(mb, fileName)
{
  # Prime a new infolist if it doesn't exist
  if (ncol(mb@mbdata_archive) == 0) {
    mb <- resetInfolists(mb)
  }

  # Fix legacy infolist column names and return a plain data.frame:
  # drop the artifact first column ("...1") and replace dots by underscores.
  # Applied to both the comma- and the semicolon-separated read so that the
  # two code paths produce the same column names.
  fix_legacy_names <- function(tbl) {
    cn <- colnames(tbl)
    if (length(cn) > 0 && identical(cn[[1]], "...1")) {
      tbl <- tbl[, -1, drop = FALSE]
    }
    colnames(tbl) <- gsub("\\.", "_", colnames(tbl))
    as.data.frame(tbl, stringsAsFactors = FALSE)
  }

  # Columns every infolist has to provide.
  required_cols <- c("id", "dbcas", "dataused")

  # Import infolist, trim whitespace and transform NAs
  mbdata_new <- fix_legacy_names(
    readr::read_csv(
      file = fileName,
      na = "",
      trim_ws = TRUE,
      show_col_types = FALSE
    )
  )

  # Legacy check for loading the Uchem format files.
  # If the required columns are missing, the file is probably
  # semicolon-separated: read it again with ";" as delimiter.
  if (length(setdiff(required_cols, colnames(mbdata_new))) > 0) {
    mbdata_new <- fix_legacy_names(
      readr::read_delim(
        file = fileName,
        delim = ";",
        na = "",
        trim_ws = TRUE,
        show_col_types = FALSE
      )
    )
    if (length(setdiff(required_cols, colnames(mbdata_new))) > 0) {
      stop("Some columns are missing in the infolist.")
    }
  }

  # Even if dbname_* are not used downstream of here, it's still good to keep
  # them for debugging reasons.
  # dbname_e will be dropped because of the column selection below.
  colnames(mbdata_new)[colnames(mbdata_new) == "dbname_d"] <- "dbname"
  colnames(mbdata_new)[colnames(mbdata_new) == "COMMENT_EAWAG_UCHEM_ID"] <-
    "COMMENT_ID"

  # use only the columns present in mbdata_archive (plus free COMMENT_*
  # columns), no other columns added in excel. unique() prevents duplicated
  # columns when the archive already carries a COMMENT_* column from an
  # earlier load.
  comment_colnames <- grep(
    pattern = "^COMMENT\\_(?!CONFIDENCE)(?!ID)",
    x = colnames(mbdata_new),
    perl = TRUE,
    value = TRUE
  )
  col_names <- unique(c(colnames(mb@mbdata_archive), comment_colnames))

  ## The read infolists might not have all required / expected columns.
  ## Columns are added one by one so that this also works for 0-row tables.
  for (missing_col in setdiff(col_names, colnames(mbdata_new))) {
    mbdata_new[[missing_col]] <- rep(NA, nrow(mbdata_new))
  }
  mbdata_new <- mbdata_new[, col_names, drop = FALSE]

  # substitute the old entries with the ones from our files
  # then find the new (previously inexistent) entries, and rbind them to the table
  new_entries <- setdiff(mbdata_new$id, mb@mbdata_archive$id)
  old_entries <- intersect(mbdata_new$id, mb@mbdata_archive$id)

  for (colname in colnames(mb@mbdata_archive)) {
    mb@mbdata_archive[, colname] <- as.character(mb@mbdata_archive[, colname])
  }

  # Make the archive as wide as the new table, otherwise the row replacement
  # and the rbind below fail when the file brings new COMMENT_* columns.
  for (extra_col in setdiff(col_names, colnames(mb@mbdata_archive))) {
    mb@mbdata_archive[[extra_col]] <- rep(NA_character_, nrow(mb@mbdata_archive))
  }

  for (entry in old_entries) {
    mb@mbdata_archive[mb@mbdata_archive$id == entry, ] <-
      mbdata_new[mbdata_new$id == entry, ]
  }

  # %in% (not ==): `==` recycles new_entries against the id column and only
  # works by accident when every row of the file is new.
  mb@mbdata_archive <- rbind(
    mb@mbdata_archive,
    mbdata_new[mbdata_new$id %in% new_entries, , drop = FALSE]
  )

  for (colname in colnames(mb@mbdata_archive)) {
    mb@mbdata_archive[, colname] <- as.factor(mb@mbdata_archive[, colname])
  }

  return(mb)
}


# Resets the mbdata_archive to an empty version.
#' @export
resetInfolists <- function(mb)
{
  # Empty (0-row) archive table with the fixed set of infolist columns.
  # check.names = FALSE keeps the "CH$..." names untouched.
  mb@mbdata_archive <- data.frame(
    id = integer(0),
    dbcas = character(0),
    dbname = character(0),
    dataused = character(0),
    COMMENT_CONFIDENCE = character(0),
    COMMENT_ID = integer(0),
    `CH$NAME1` = character(0),
    `CH$NAME2` = character(0),
    `CH$NAME3` = character(0),
    `CH$NAME4` = character(0),
    `CH$NAME5` = character(0),
    `CH$COMPOUND_CLASS` = character(0),
    `CH$FORMULA` = character(0),
    `CH$EXACT_MASS` = numeric(0),
    `CH$SMILES` = character(0),
    `CH$IUPAC` = character(0),
    `CH$LINK_CAS` = character(0),
    `CH$LINK_CHEBI` = integer(0),
    `CH$LINK_HMDB` = character(0),
    `CH$LINK_KEGG` = character(0),
    `CH$LINK_LIPIDMAPS` = character(0),
    `CH$LINK_PUBCHEM` = character(0),
    `CH$LINK_INCHIKEY` = character(0),
    `CH$LINK_CHEMSPIDER` = integer(0),
    `CH$LINK_COMPTOX` = character(0),
    AUTHORS = character(0),
    COPYRIGHT = character(0),
    PUBLICATION = character(0),
    check.names = FALSE,
    stringsAsFactors = FALSE
  )

  # The option may be unset (NULL); a bare if() on it would then fail with
  # "argument is of length zero".
  include_sp_tags <- getOption("RMassBank")$include_sp_tags
  if (isTRUE(as.logical(include_sp_tags))) {
    mb@mbdata_archive["SP$SAMPLE"] <- character(0)
  }
  return(mb)
}

# The workflow function, i.e. (almost) the only thing you actually need to call.
# See below for explanation of steps.
#' MassBank record creation workflow
#'
#' Uses data generated by \code{\link{msmsWorkflow}} to create MassBank records.
#'
#' See the vignette \code{vignette("RMassBank")} for detailed informations about the usage.
#'
#' Steps:
#'
#' Step 1: Find which compounds don't have annotation information yet.
#' For these compounds, pull information from several databases (using gatherData).
#'
#' Step 2: If new compounds were found, then export the infolist.csv and stop the workflow.
#' Otherwise, continue.
#'
#' Step 3: Take the archive data (in table format) and reformat it to MassBank tree format.
#'
#' Step 4: Compile the spectra. Using the skeletons from the archive data, create
#' MassBank records per compound and fill them with peak data for each spectrum.
#' Also, assign accession numbers based on scan mode and relative scan no.
#'
#' Step 5: Convert the internal tree-like representation of the MassBank data into
#' flat-text string arrays (basically, into text-file style, but still in memory)
#'
#' Step 6: For all OK records, generate a corresponding molfile with the structure
#' of the compound, based on the SMILES entry from the MassBank record. (This molfile
#' is still in memory only, not yet a physical file)
#'
#' Step 7: If necessary, generate the appropriate subdirectories, and actually write
#' the files to disk.
#'
#' Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank
#' to attribute substances to their corresponding structure molfiles.
#'
#' @param steps Which steps in the workflow to perform.
#' @param infolist_path A path where to store newly downloaded compound informations,
#' 			which should then be manually inspected.
#' @param mb The \code{mbWorkspace} to work in.
#' @param gatherData A variable denoting whether to retrieve information using several online databases \code{gatherData= "online"}
#' or to use the local babel installation \code{gatherData= "babel"}. Note that babel is used either way, if a directory is given
#' in the settings. This setting will be ignored if retrieval is set to "standard"
#' @param filter If \code{TRUE}, the peaks will be filtered according to the standard processing workflow in RMassBank -
#' only the best formula for a peak is retained, and only peaks passing multiplicity filtering are retained. If FALSE, it is assumed
#' that the user has already done filtering, and all peaks in the spectrum should be printed in the record (with or without formula.)
#' @return The processed \code{mbWorkspace}.
#' @seealso \code{\link{mbWorkspace-class}}
#' @author Michael A. Stravs, Eawag
#' @examples \dontrun{
#' 		mb <- newMbWorkspace(w) # w being a msmsWorkspace
#' 		mb <- loadInfolists(mb, "D:/myInfolistPath")
#' 		mb <- mbWorkflow(mb, steps=c(1:3), "newinfos.csv")
#'
#' }
#' @export
mbWorkflow <- function(mb, steps=c(1,2,3,4,5,6,7,8), infolist_path="./infolist.csv", gatherData = "online", filter = TRUE)
{
  # Step 1: Find which compounds don't have annotation information yet. For these
  # compounds, pull information from several databases (using gatherData).
  if (1 %in% steps) {
    mbdata_ids <- lapply(selectSpectra(mb@spectra, "found", "object"), function(spec) spec@id)
    rmb_log_info("mbWorkflow: Step 1. Gather info from several databases")
    # Which IDs are not in mbdata_archive yet?
    new_ids <- setdiff(as.numeric(unlist(mbdata_ids)), mb@mbdata_archive$id)
    mb@mbdata <- lapply(new_ids, function(id) {
      level <- findLevel(id, TRUE)
      if (level == "standard") {
        # `gatherData` is both the mode argument (a string) and the name of the
        # retrieval function; R resolves the call below to the function.
        # An unknown mode is reported explicitly instead of failing later with
        # "object 'd' not found".
        d <- switch(gatherData,
          online = gatherData(id),
          babel = gatherDataBabel(id),
          stop("Unknown value for 'gatherData': use \"online\" or \"babel\".")
        )
      } else {
        # Unknown structure: no database lookup.
        d <- gatherDataUnknown(id, mb@spectra[[1]]@mode, retrieval = level)
      }
      rmb_log_info(paste(id, ": ", d$dataused, sep = ''))
      return(d)
    })
  }
  # Step 2: If new compounds were found, then export the infolist.csv and stop the workflow.
  # Otherwise, continue!
  if (2 %in% steps) {
    rmb_log_info("mbWorkflow: Step 2. Export infolist (if required)")
    if (length(mb@mbdata) > 0) {
      mbdata <- flatten(mb@mbdata)
      readr::write_csv(x = mbdata, file = infolist_path, col_names = TRUE, na = "", quote = "needed")
      rmb_log_info(paste("The file", infolist_path, "was generated with new compound information. Please check and edit the table, and add it to your infolist folder."))
      return(mb)
    } else {
      rmb_log_info("No new data added.")
    }
  }
  # Step 3: Take the archive data (in table format) and reformat it to MassBank tree format.
  if (3 %in% steps) {
    rmb_log_info("mbWorkflow: Step 3. Data reformatting")
    mb@mbdata_relisted <- apply(mb@mbdata_archive, 1, readMbdata)
  }
  # Step 4: Compile the spectra! Using the skeletons from the archive data, create
  # MassBank records per compound and fill them with peak data for each spectrum.
  # Also, assign accession numbers based on scan mode and relative scan no.
  if (4 %in% steps) {
    rmb_log_info("mbWorkflow: Step 4. Spectra compilation")
    mb@compiled <- lapply(
      selectSpectra(mb@spectra, "found", "object"),
      function(r) {
        # guard against NSE warnings from "filter"
        filterOK <- NULL
        best <- NULL
        rmb_log_info(paste("Compiling: ", r@name, sep=""))
        mbdata <- mb@mbdata_relisted[[which(mb@mbdata_archive$id == as.numeric(r@id))]]
        if (filter) {
          res <- buildRecord(r, mbdata=mbdata, additionalPeaks=mb@additionalPeaks, filter = filterOK & best)
        } else {
          res <- buildRecord(r, mbdata=mbdata, additionalPeaks=mb@additionalPeaks)
        }
        return(res)
      })
    # check which compounds have useful spectra
    mb@ok <- which(!is.na(mb@compiled) & !(lapply(mb@compiled, length)==0))
    mb@problems <- which(is.na(mb@compiled))
    mb@compiled_ok <- mb@compiled[mb@ok]
    # mb@ok holds integer positions, so `!mb@ok` is all FALSE and would always
    # select nothing; take the complementary positions instead.
    mb@compiled_notOk <- mb@compiled[setdiff(seq_along(mb@compiled), mb@ok)]
  }
  # Step 5: Convert the internal tree-like representation of the MassBank data into
  # flat-text string arrays (basically, into text-file style, but still in memory)
  if (5 %in% steps) {
    rmb_log_info("mbWorkflow: [Legacy Step 5. Flattening records] ignored")
    #mb@mbfiles <- lapply(mb@compiled_ok, function(cpd) toMassbank(cpd, mb@additionalPeaks))
    #mb@mbfiles_notOk <- lapply(mb@compiled_notOk, function(c) lapply(c, toMassbank))
  }
  # Step 6: For all OK records, generate a corresponding molfile with the structure
  # of the compound, based on the SMILES entry from the MassBank record. (This molfile
  # is still in memory only, not yet a physical file)
  if (6 %in% steps) {
    if (RMassBank.env$export.molfiles) {
      rmb_log_info("mbWorkflow: Step 6. Generate molfiles")
      mb@molfile <- lapply(mb@compiled_ok, function(c) createMolfile(as.numeric(c@id)))
    } else {
      warning("RMassBank is configured not to export molfiles (RMassBank.env$export.molfiles). Step 6 is therefore ignored.")
    }
  }
  # Step 7: If necessary, generate the appropriate subdirectories, and actually write
  # the files to disk.
  if (7 %in% steps) {
    rmb_log_info("mbWorkflow: Step 7. Generate subdirs and export")

    # Create a folder (recursively) unless it already exists.
    ensure_dir <- function(path) {
      if (!file.exists(path) && !dir.create(path, recursive = TRUE)) {
        stop(paste("Could not create folder", path))
      }
    }

    ## create folder
    entry_prefix <- getOption("RMassBank")$annotations$entry_prefix
    filePath_recData_valid   <- file.path(entry_prefix, "recdata")
    filePath_recData_invalid <- file.path(entry_prefix, "recdata_invalid")
    filePath_molData         <- file.path(entry_prefix, "moldata")

    # Invalid records are exported only if configured and if there are any.
    # The loop further below uses the same condition and the same slot, so the
    # folder always exists when something is written to it.
    export_invalid <- isTRUE(RMassBank.env$export.invalid) &&
      length(mb@mbfiles_notOk) > 0

    ensure_dir(filePath_recData_valid)
    if (RMassBank.env$export.molfiles) {
      ensure_dir(filePath_molData)
    }
    if (export_invalid) {
      ensure_dir(filePath_recData_invalid)
    }

    if (length(mb@molfile) == 0) {
      mb@molfile <- as.list(rep(x = NA, times = length(mb@compiled_ok)))
    }

    ## export valid spectra
    for (cnt in seq_along(mb@compiled_ok)) {
      exportMassbank_recdata(
        mb@compiled_ok[[cnt]],
        recDataFolder = filePath_recData_valid
      )
      if (RMassBank.env$export.molfiles) {
        exportMassbank_moldata(
          mb@compiled_ok[[cnt]],
          molfile = mb@molfile[[cnt]],
          molDataFolder = filePath_molData
        )
      }
    }

    ## export invalid spectra
    if (export_invalid) {
      for (cnt in seq_along(mb@mbfiles_notOk)) {
        exportMassbank_recdata(
          compiled = mb@mbfiles_notOk[[cnt]],
          recDataFolder = filePath_recData_invalid
        )
      }
    }
  }
  # Step 8: Create the list.tsv in the molfiles folder, which is required by MassBank
  # to attribute substances to their corresponding structure molfiles.
  if (8 %in% steps) {
    if (RMassBank.env$export.molfiles) {
      rmb_log_info("mbWorkflow: Step 8. Create list.tsv")
      makeMollist(compiled = mb@compiled_ok)
    } else {
      warning("RMassBank is configured not to export molfiles (RMassBank.env$export.molfiles). Step 8 is therefore ignored.")
    }
  }
  return(mb)
}


# Calls openbabel and converts the SMILES code string (or retrieves the SMILES code from
# the ID, and then calls openbabel) to create a molfile in text format.
# If fileName is given, the file is directly stored. Otherwise, it is returned as a
# character array.
#' Create MOL file for a chemical structure
#'
#' Creates a MOL file (in memory or on disk) for a compound specified by the
#' compound ID or by a SMILES code.
#'
#' The function invokes OpenBabel (and therefore needs a correctly set
#' OpenBabel path in the RMassBank settings), using the SMILES code retrieved
#' with \code{findSmiles} or using the SMILES code directly. The current
#' implementation of the workflow uses the latter version, reading the SMILES
#' code directly from the MassBank record itself.
#'
#' @usage createMolfile(id_or_smiles, fileName = FALSE)
#' @param id_or_smiles The compound ID or a SMILES code.
#' @param fileName If the filename is set, the file is written directly to disk
#' using the specified filename. Otherwise, it is returned as a text array.
#' @return A character array containing the MOL/SDF format file, ready to be
#' written to disk.
#' @author Michael Stravs
#' @seealso \code{\link{findSmiles}}
#' @references OpenBabel: \url{http://openbabel.org}
#' @examples
#'
#' # Benzene:
#' \dontrun{
#' createMolfile("C1=CC=CC=C1")
#' }
#'
#' @export
createMolfile <- function(id_or_smiles, fileName = FALSE)
{
  .checkMbSettings()
  babeldir <- getOption("RMassBank")$babeldir

  # A numeric argument is a compound ID, anything else is taken as SMILES.
  if (!is.numeric(id_or_smiles)) {
    smiles <- id_or_smiles
  } else {
    # Compounds that are not at the "standard" level get a placeholder record.
    if (findLevel(id_or_smiles, TRUE) != "standard") {
      return(c(" ","$$$$"))
    }
    smiles <- findSmiles(id_or_smiles)
  }

  write_to_file <- is.character(fileName)

  if (is.na(babeldir)) {
    # if no babeldir was set, get the result from cactus, then from Pubchem.
    res <- getCactus(smiles, "sdf")

    if (any(is.na(res))) {
      res <- getPcSDF(smiles)
    }
    if (any(is.na(res))) {
      stop("Pubchem and Cactus both seem to be down.")
    }
    if (write_to_file) {
      writeLines(res, fileName)
    }
  } else {
    # otherwise use the better-tested OpenBabel toolkit.
    if (!write_to_file) {
      cmd <- paste(babeldir, "obabel -ismi -osdf -d -b --gen2D", sep='')
    } else {
      cmd <- paste(babeldir, "obabel -ismi -osdf ", fileName , " -d -b --gen2D", sep='')
    }
    res <- system(cmd, intern=TRUE, input=smiles, ignore.stderr=TRUE)
    # If we wrote to a file, read it back as return value.
    if (write_to_file) {
      res <- readLines(fileName)
    }
  }
  return(res)
}



# Retrieve annotation data for a compound, from the internet service Pubchem
#' Retrieve supplemental annotation data from Pubchem
#'
#' Retrieves annotation data for a compound from the internet service Pubchem
#' based on the inchikey generated by babel or Cactus
#'
#' The data retrieved is the Pubchem CID, a synonym from the Pubchem database,
#' the IUPAC name (using the preferred if available) and a Chebi link
#'
#' @usage gatherPubChem(key)
#' @param key An Inchi-Key
#' @return Returns a list with 4 slots:
#' \code{PcID} The Pubchem CID
#' \code{Synonym} An arbitrary synonym for the compound name
#' \code{IUPAC} A IUPAC-name (preferred if available)
#' \code{Chebi} The identification number of the chebi database
#' @author Erik Mueller
#' @seealso \code{\link{mbWorkflow}}
#' @references Pubchem REST:
#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html}
#' Chebi:
#' \url{http://www.ebi.ac.uk/chebi}
#' @examples
#'
#' # Gather data for compound ID 131
#' \dontrun{gatherPubChem("QEIXBXXKTUNWDK-UHFFFAOYSA-N")}
#'
#' @export
gatherPubChem <- function(key){

  ## The tryCatch is there because pubchem has connection issues 1 in 50 times.
  ## Write NA into the respective field if something goes wrong with the
  ## connection or the data.
  query_or_na <- function(getter) {
    tryCatch(getter(key), error = function(e) NA)
  }

  PubChemData <- list()

  ##Retrieve Pubchem CID
  PubChemData$PcID <- query_or_na(getPcId)

  ##Retrieve a synonym to the name
  PubChemData$Synonym <- query_or_na(getPcSynonym)

  ##Retrieve the IUPAC-name
  PubChemData$IUPAC <- query_or_na(getPcIUPAC)

  ##Retrieve the Chebi-ID
  PubChemData$Chebi <- query_or_na(getPcCHEBI)

  return(PubChemData)
}

# Retrieve annotation data for a compound, from the internet service US EPA CCTE
#' Retrieve supplemental annotation data from US EPA
#'
#' Retrieves annotation data for a compound from the internet service US EPA CCTE
#' based on the inchikey generated by babel or Cactus
#'
#' The data retrieved is the US EPA DTXSID, the US EPA chemical dashboard
#' substance ID, the CAS-RN, the DTX preferred name, and the DTXCID (chemical ID).
#'
#' @usage gatherCCTE(key, api_key = NA)
#' @param key An Inchi-Key or other chemical identifier (e.g.
#' Chemical name, DTXSID, CASRN, InChIKey, DTXCID)
#' @param api_key An US EPA CCTE API key (personal or application)
#' @return Returns a list with 5 slots:
#' \code{dtxsid} The US EPA chemical dashboard substance id
#' \code{dtxcid} The US EPA chemical dashboard chemical id
#' \code{preferredname} The US EPA chemical dashboard preferred name
#' \code{casrn} The latest CAS registration number
#' \code{smiles} The SMILES annotation of the structure
#' Every slot is \code{NA} if no API key is given or if the respective
#' request fails.
#' @author Tobias Schulze
#' @seealso \code{\link{mbWorkflow}}
#' @references CCTE REST:
#' \url{https://api-ccte.epa.gov/docs/}
#' @examples
#'
#' # Gather data for compound ID 131
#' \dontrun{gatherCCTE("QEIXBXXKTUNWDK-UHFFFAOYSA-N", api_key = NA)}
#'
#' @export
gatherCCTE <- function(key, api_key = NA) {

  # Check if the API key is provided, if not return an object with all
  # slots set to NA (no request is sent).
  if (is.null(api_key) || (length(api_key) == 1L && is.na(api_key))) {
    return(list(
      dtxsid = NA,
      dtxcid = NA,
      preferredname = NA,
      casrn = NA,
      smiles = NA
    ))
  }

  ## The tryCatch is there because the web service may have connection issues.
  ## Write NA into the respective field if something goes wrong with the
  ## connection or the data. A NULL answer is stored as NA as well, so that
  ## the returned list always has all five slots.
  query_or_na <- function(getter) {
    tryCatch(
      {
        value <- getter(key, api_key)
        if (is.null(value)) NA else value
      },
      error = function(e) NA
    )
  }

  CCTE_data <- list()

  ##Retrieve DTXSID
  CCTE_data$dtxsid <- query_or_na(getDTXSID)

  ##Retrieve DTXCID
  CCTE_data$dtxcid <- query_or_na(getDTXCID)

  ##Retrieve preferred name
  CCTE_data$preferredname <- query_or_na(getPrefName)

  ##Retrieve latest CAS RN
  CCTE_data$casrn <- query_or_na(getCASRN)

  ##Retrieve SMILES
  CCTE_data$smiles <- query_or_na(getDTXSMILES)

  return(CCTE_data)
}



# Retrieve annotation data for a compound, from the internet services Cactvs, Pubchem, Chemspider and CTS.
#' Retrieve annotation data
#'
#' Retrieves annotation data for a compound from the internet services CTS, Pubchem, Chemspider and
#' Cactvs, based on the SMILES code and name of the compounds stored in the
#' compound list.
#'
#' Composes the "upper part" of a MassBank record filled with chemical data
#' about the compound: name, exact mass, structure, CAS no., links to PubChem,
#' KEGG, ChemSpider. The instrument type is also written into this block (even
#' if not strictly part of the chemical information). Additionally, index
#' fields are added at the start of the record, which will be removed later:
#' \code{id, dbcas, dbname} from the compound list, \code{dataused} to indicate
#' the used identifier for CTS search (\code{smiles} or \code{dbname}).
#'
#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are
#' inserted empty and will be filled later on.
#'
#' @usage gatherData(id)
#' @aliases gatherData
#' @param id The compound ID.
#' @return Returns a list of type \code{list(id= \var{compoundID}, ...,
#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ...
#' @author Michael Stravs
#' @seealso \code{\link{mbWorkflow}}
#' @references Chemical Translation Service:
#' \url{http://uranus.fiehnlab.ucdavis.edu:8080/cts/homePage}
#' cactus Chemical Identifier Resolver:
#' \url{http://cactus.nci.nih.gov/chemical/structure}
#' MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' Pubchem REST:
#' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html}
#' Chemspider InChI conversion:
#' \url{https://www.chemspider.com/InChI.asmx}
#' @examples
#'
#' # Gather data for compound ID 131
#' \dontrun{gatherData(131)}
#'
#' @export
gatherData <- function(id)
{
  ##Preamble: Is a babeldir supplied?
  ##If yes, use it

  .checkMbSettings()
  babeldir <- getOption("RMassBank")$babeldir
  usebabel <- !is.na(babeldir)

  # TRUE if x holds a usable (non-NULL, non-empty, non-NA) first value.
  has_value <- function(x) {
    !is.null(x) && length(x) > 0 && !is.na(x[1])
  }

  ##Get all useful information from the local "database" (from the CSV sheet)

  smiles <- findSmiles(id)
  mass <- findMass(smiles)
  dbcas <- findCAS(id)
  dbname <- findName(id)
  if (is.na(dbname)) dbname <- ""
  if (is.na(dbcas)) dbcas <- ""
  iupacName <- dbname
  synonym <- dbname
  formula <- findFormula(id)

  ##Convert SMILES to InChI key via Cactvs or babel. CTS doesn't "interpret" the SMILES per se,
  ##it just matches identical known SMILES, so we need to convert to a "searchable" and
  ##standardized format beforehand. Other databases are able to interpret the smiles.

  if (usebabel) {
    cmdinchikey <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchikey')
    inchikey_split <- system(cmdinchikey, intern = TRUE, input = smiles, ignore.stderr = TRUE)
  } else {
    inchikey <- getCactus(identifier = smiles, representation = "stdinchikey")

    if (is.na(inchikey)) {
      inchikey <- getPcInchiKey(query = smiles, from = "smiles")
    }

    if (!is.na(inchikey)) {
      ##Split the "InChiKey=" part off the key. Take the last piece, so that a
      ##key delivered without the prefix is accepted as well.
      key_parts <- strsplit(inchikey, "=", fixed = TRUE)[[1]]
      inchikey_split <- key_parts[[length(key_parts)]]
    } else {
      inchikey_split <- getPcInchiKey(query = smiles, from = "smiles")
    }
  }

  ##Use Pubchem to retrieve information
  PcInfo <- gatherPubChem(inchikey_split)

  if (has_value(PcInfo$Synonym)) {
    synonym <- PcInfo$Synonym
  }

  if (has_value(PcInfo$IUPAC)) {
    iupacName <- PcInfo$IUPAC
  }

  ##Get Chemspider-ID
  # Get the api key from the settings
  rcs_api_key <- getOption("RMassBank")$settings$rcs_api_key

  csid <- NA
  if (!is.null(rcs_api_key)) {
    csid <- getCSID(key = inchikey_split, identifier = "inchikey", api_key = rcs_api_key)
    if (is.null(csid)) {
      csid <- NA
    }
  }

  ## Get DTXSID
  # Get the api key from the settings
  ccte_api_key <- getOption("RMassBank")$settings$ccte_api_key

  dtxsid <- NA
  if (!is.null(ccte_api_key)) {
    dtxsid <- getDTXSID(key = inchikey_split, api_key = ccte_api_key)
    if (is.null(dtxsid)) {
      dtxsid <- NA
    }
  }

  ##Use CTS to retrieve information
  CTSinfo <- getCtsRecord(inchikey_split)

  # The NULL/empty test has to come first: comparing NULL with a string gives
  # logical(0), which is not a valid condition.
  if (is.null(CTSinfo) || length(CTSinfo) == 0 ||
      isTRUE(CTSinfo[1] == "Sorry, we couldn't find any matching results")) {
    CTSinfo <- NA
  }
  cts_ok <- !is.na(CTSinfo[1])
  cts_types <- if (cts_ok) CTS.externalIdTypes(CTSinfo) else character(0)

  ##List the names
  if (iupacName == "") {
    warning(paste0("Compound ID ",id,": no IUPAC name could be identified."))
  }

  # Names differing only in case count as the same name; keep one spelling.
  if (toupper(dbname) == toupper(synonym)) {
    synonym <- dbname
  }

  if (toupper(dbname) == toupper(iupacName)) {
    iupacName <- dbname
  }

  if (toupper(synonym) == toupper(iupacName)) {
    synonym <- iupacName
  }

  cpd_names <- as.list(unique(c(dbname, synonym, iupacName)))

  ##If no name is found, it must be supplied in one way or another
  if (all(vapply(cpd_names, function(x) x == "", logical(1)))) {
    stop("RMassBank wasn't able to extract a usable name for this compound from any database. Please supply a name manually.")
  }

  # Start to fill the MassBank record.
  # The top 4 entries will not go into the final record; they are used to identify
  # the record and also to facilitate manual editing of the exported record table.
  mbdata <- list()
  mbdata[['id']] <- id
  mbdata[['dbcas']] <- dbcas
  mbdata[['dbname']] <- dbname
  mbdata[['dataused']] <- "smiles"
  mbdata[['ACCESSION']] <- ""
  mbdata[['RECORD_TITLE']] <- ""
  mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d")
  mbdata[['AUTHORS']] <- getOption("RMassBank")$annotations$authors
  mbdata[['LICENSE']] <- getOption("RMassBank")$annotations$license
  mbdata[['COPYRIGHT']] <- getOption("RMassBank")$annotations$copyright
  # Confidence annotation and internal ID annotation.
  # The ID of the compound will be written like:
  # COMMENT: EAWAG_UCHEM_ID 1234
  # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID"
  mbdata[["COMMENT"]] <- list()

  # Confidence comment per identification level. Level "0" uses the comment
  # from the settings; levels not listed here get no confidence comment.
  confidence_by_level <- c(
    "1"  = "Reference Standard (Level 1)",
    "1a" = "Reference Standard (Level 1)",
    "2"  = "Probable structure, tentative identification (Level 2)",
    "2a" = "Probable structure via library match, tentative identification (Level 2a)",
    "2b" = "Probable structure via diagnostic evidence, tentative identification (Level 2b)",
    "3"  = "Tentative identification only (Level 3)",
    "3a" = "Tentative identification: most likely structure (Level 3)",
    "3b" = "Tentative identification: isomers possible (Level 3)",
    "3c" = "Tentative identification: substance class known (Level 3)",
    "3d" = "Tentative identification: best match only (Level 3)",
    "4"  = "Tentative identification: molecular formula only (Level 4)",
    "5"  = "Tentative identification: structure and formula unknown (Level 5)"
  )
  level <- as.character(findLevel(id))
  if (isTRUE(level == "0")) {
    mbdata[["COMMENT"]][["CONFIDENCE"]] <- getOption("RMassBank")$annotations$confidence_comment
  } else if (isTRUE(level %in% names(confidence_by_level))) {
    mbdata[["COMMENT"]][["CONFIDENCE"]] <- confidence_by_level[[level]]
  }

  mbdata[["COMMENT"]][["ID"]] <- id

  ## add generic COMMENT information: every compound list column whose name
  ## starts with "COMMENT " and which has a value for this compound
  compound_list <- .listEnvEnv$listEnv$compoundList
  rowIdx <- which(compound_list$ID == id)
  properties      <- colnames(compound_list)
  properties2     <- gsub(x = properties, pattern = "^COMMENT ", replacement = "")
  theseProperties <- grepl(x = properties, pattern = "^COMMENT ")
  row_values      <- unlist(compound_list[rowIdx, ])
  theseProperties <- theseProperties & (!(row_values == "NA" | is.na(row_values)))
  mbdata[["COMMENT"]][properties2[theseProperties]] <- unlist(compound_list[rowIdx, theseProperties])

  # here compound info starts
  mbdata[['CH$NAME']] <- cpd_names
  # Currently we use a fixed value for Compound Class, since there is no useful
  # convention of what should go there and what shouldn't, and the field is not used
  # in search queries.
  mbdata[['CH$COMPOUND_CLASS']] <- getOption("RMassBank")$annotations$compound_class
  mbdata[['CH$FORMULA']] <- formula
  mbdata[['CH$EXACT_MASS']] <- mass
  mbdata[['CH$SMILES']] <- smiles

  if (usebabel) {
    cmdinchi <- paste0(babeldir, 'obabel -:"',smiles,'" ', '-oinchi')
    mbdata[['CH$IUPAC']] <- system(cmdinchi, intern=TRUE, input=smiles, ignore.stderr=TRUE)
  } else {
    mbdata[['CH$IUPAC']] <- getCactus(smiles, "stdinchi")
  }

  # Add all CH$LINK fields present in the compound datasets
  link <- list()

  # CAS
  if ("CAS" %in% cts_types) {
    # Prefer database CAS if it is also listed in the CTS results.
    # otherwise take the shortest one.
    cas <- CTS.externalIdSubset(CTSinfo,"CAS")
    if (dbcas %in% cas) {
      link[["CAS"]] <- dbcas
    } else {
      link[["CAS"]] <- cas[[which.min(nchar(cas))]]
    }
  } else if (dbcas != "") {
    link[["CAS"]] <- dbcas
  }

  # CHEBI: prefer the Pubchem result, fall back to CTS
  chebi <- NULL
  if (has_value(PcInfo$Chebi)) {
    chebi <- PcInfo$Chebi
  } else if ("ChEBI" %in% cts_types) {
    chebi <- CTS.externalIdSubset(CTSinfo,"ChEBI")
  }
  if (!is.null(chebi)) {
    # Take the shortest ID and cut off front "CHEBI:" if present
    chebi <- as.character(chebi[[which.min(nchar(chebi))]])
    chebi <- strsplit(chebi,":")[[1]]
    link[["CHEBI"]] <- chebi[[length(chebi)]]
  }

  # HMDB
  # The type is listed as "Human Metabolome Database"; look the IDs up under
  # "HMDB" first (as before) and under the listed type name otherwise, and
  # skip the link if neither lookup yields an ID.
  if ("Human Metabolome Database" %in% cts_types) {
    hmdb <- CTS.externalIdSubset(CTSinfo,"HMDB")
    if (length(hmdb) == 0) {
      hmdb <- CTS.externalIdSubset(CTSinfo,"Human Metabolome Database")
    }
    if (length(hmdb) > 0) {
      link[["HMDB"]] <- hmdb[[1]]
    }
  }
  # KEGG
  if ("KEGG" %in% cts_types) {
    link[["KEGG"]] <- CTS.externalIdSubset(CTSinfo,"KEGG")[[1]]
  }
  # LipidMAPS
  if ("LipidMAPS" %in% cts_types) {
    link[["LIPIDMAPS"]] <- CTS.externalIdSubset(CTSinfo,"LipidMAPS")[[1]]
  }

  # PubChem CID: prefer the Pubchem result, fall back to CTS
  if (has_value(PcInfo$PcID)) {
    link[["PUBCHEM"]] <- PcInfo$PcID[1]
  } else if ("PubChem CID" %in% cts_types) {
    pc <- CTS.externalIdSubset(CTSinfo,"PubChem CID")
    link[["PUBCHEM"]] <- paste0(min(pc))
  }

  if (!is.null(link[["PUBCHEM"]])) {
    if (substr(link[["PUBCHEM"]],1,4) != "CID:") {
      link[["PUBCHEM"]] <- paste0("CID:", link[["PUBCHEM"]])
    }
  }

  link[["INCHIKEY"]] <- inchikey_split
  link[["COMPTOX"]] <- dtxsid
  if (length(csid) > 0 && any(!is.na(csid))) {
    link[["CHEMSPIDER"]] <- min(as.numeric(as.character(csid[!is.na(csid)])))
  }
  mbdata[['CH$LINK']] <- link

  return(mbdata)
}

# Retrieve annotation data for a compound, using only babel
#' Retrieve annotation data
#'
#' Retrieves annotation data for
a compound by using babel, +#' based on the SMILES code and name of the compounds stored in the +#' compound list. +#' +#' Composes the "upper part" of a MassBank record filled with chemical data +#' about the compound: name, exact mass, structure, CAS no.. +#' The instrument type is also written into this block (even +#' if not strictly part of the chemical information). Additionally, index +#' fields are added at the start of the record, which will be removed later: +#' \code{id, dbcas, dbname} from the compound list. +#' +#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are +#' inserted empty and will be filled later on. +#' +#' This function is an alternative to gatherData, in case CTS is down or if information +#' on one or more of the compounds in the compound list are sparse +#' +#' @usage gatherDataBabel(id) +#' @param id The compound ID. +#' @return Returns a list of type \code{list(id= \var{compoundID}, ..., +#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ... 
#' @author Michael Stravs, Erik Mueller
#' @seealso \code{\link{mbWorkflow}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples
#'
#' # Gather data for compound ID 131
#' \dontrun{gatherDataBabel(131)}
#'
#' @export
gatherDataBabel <- function(id) {
  .checkMbSettings()
  settings <- getOption("RMassBank")
  annotations <- settings$annotations
  babeldir <- settings$babeldir
  smiles <- findSmiles(id)

  # If no babeldir was set, neither CTS nor babel is available for conversion.
  # An unset option is NULL rather than NA, and is.na(NULL) is logical(0),
  # which would make a bare `if (is.na(babeldir))` fail with an unrelated
  # "argument is of length zero" error. Check both cases explicitly.
  if (is.null(babeldir) || length(babeldir) != 1 || is.na(babeldir)) {
    stop("No babeldir supplied; It is currently not possible to convert the information without either babel or CTS")
  }

  ## Babel conversion
  # NOTE(review): the SMILES is interpolated into a shell command line. It comes
  # from the local compound list; confirm that list is trusted input.
  run_obabel <- function(out_format) {
    cmd <- paste0(babeldir, 'obabel -:"', smiles, '" ', "-o", out_format)
    system(cmd, intern = TRUE, input = smiles, ignore.stderr = TRUE)
  }
  inchikey <- run_obabel("inchikey")
  inchi <- run_obabel("inchi")

  ## Read from compound list
  mass <- findMass(smiles)
  dbcas <- findCAS(id)
  dbname <- findName(id)
  if (is.na(dbname)) dbname <- ""
  if (is.na(dbcas)) dbcas <- ""
  formula <- findFormula(id)

  ## Create the record skeleton
  mbdata <- list()
  mbdata[["id"]] <- id
  mbdata[["dbcas"]] <- dbcas
  mbdata[["dbname"]] <- dbname
  mbdata[["dataused"]] <- "smiles"
  mbdata[["ACCESSION"]] <- ""
  mbdata[["RECORD_TITLE"]] <- ""
  mbdata[["DATE"]] <- format(Sys.Date(), "%Y.%m.%d")
  mbdata[["AUTHORS"]] <- annotations$authors
  mbdata[["LICENSE"]] <- annotations$license
  mbdata[["COPYRIGHT"]] <- annotations$copyright

  # Confidence annotation and internal ID annotation.
  # The ID of the compound will be written like:
  # COMMENT: EAWAG_UCHEM_ID 1234
  # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID"
  confidence_by_level <- c(
    "1" = "Reference Standard (Level 1)",
    "1a" = "Reference Standard (Level 1)",
    "2" = "Probable structure, tentative identification (Level 2)",
    "2a" = "Probable structure via library match, tentative identification (Level 2a)",
    "2b" = "Probable structure via diagnostic evidence, tentative identification (Level 2b)",
    "3" = "Tentative identification only (Level 3)",
    "3a" = "Tentative identification: most likely structure (Level 3)",
    "3b" = "Tentative identification: isomers possible (Level 3)",
    "3c" = "Tentative identification: substance class known (Level 3)",
    "3d" = "Tentative identification: best match only (Level 3)",
    "4" = "Tentative identification: molecular formula only (Level 4)",
    "5" = "Tentative identification: structure and formula unknown (Level 5)"
  )
  mbdata[["COMMENT"]] <- list()
  # Look the level up once; as.character() keeps a numeric level from being
  # used as a positional index into the lookup table.
  level <- as.character(findLevel(id))
  if (isTRUE(level == "0")) {
    # Level 0: use the free-text confidence comment from the settings
    mbdata[["COMMENT"]][["CONFIDENCE"]] <- annotations$confidence_comment
  } else if (length(level) == 1 && level %in% names(confidence_by_level)) {
    mbdata[["COMMENT"]][["CONFIDENCE"]] <- confidence_by_level[[level]]
  }
  mbdata[["COMMENT"]][["ID"]] <- id

  # here compound info starts
  mbdata[["CH$NAME"]] <- as.list(dbname)

  # Currently we use a fixed value for Compound Class, since there is no useful
  # convention of what should go there and what shouldn't, and the field is not used
  # in search queries.
  mbdata[["CH$COMPOUND_CLASS"]] <- annotations$compound_class
  mbdata[["CH$FORMULA"]] <- formula
  mbdata[["CH$EXACT_MASS"]] <- mass
  mbdata[["CH$SMILES"]] <- smiles
  mbdata[["CH$IUPAC"]] <- inchi

  link <- list()
  if (dbcas != "") {
    link[["CAS"]] <- dbcas
  }
  link[["INCHIKEY"]] <- inchikey
  mbdata[["CH$LINK"]] <- link

  return(mbdata)
}

# Retrieve annotation data for an unknown compound, using only the compound list
#' Retrieve annotation data
#'
#' Retrieves annotation data for an unknown compound by using basic information present
#'
#' Composes the "upper part" of a MassBank record filled with chemical data
#' about the compound: name, exact mass, structure, CAS no..
#' The instrument type is also written into this block (even
#' if not strictly part of the chemical information). Additionally, index
#' fields are added at the start of the record, which will be removed later:
#' \code{id, dbcas, dbname} from the compound list.
#'
#' Additionally, the fields \code{ACCESSION} and \code{RECORD_TITLE} are
#' inserted empty and will be filled later on.
#'
#' This function is used to generate the data in case a substance is unknown,
#' i.e. not enough information is present to derive anything about formulas or links
#'
#' @usage gatherDataUnknown(id, mode, retrieval)
#' @param id The compound ID.
#' @param mode \code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions
#' ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-).
#' @param retrieval A value that determines whether the files should be handled either as "standard",
#' if the compoundlist is complete, "tentative", if at least a formula is present or "unknown"
#' if the only known thing is the m/z
#' @return Returns a list of type \code{list(id= \var{compoundID}, ...,
#' 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc. %% ...
#' @author Michael Stravs, Erik Mueller
#' @seealso \code{\link{mbWorkflow}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples
#'
#' # Gather data for compound ID 131
#' \dontrun{gatherDataUnknown(131, "pH", "unknown")}
#'
#' @export
gatherDataUnknown <- function(id, mode, retrieval) {
  .checkMbSettings()
  annotations <- getOption("RMassBank")$annotations

  # Only these two retrieval modes are handled below. Any other value used to
  # leave `mass` undefined and let `formula` resolve to the stats::formula
  # function, so reject it up front with a clear message.
  if (length(retrieval) != 1 || !(retrieval %in% c("unknown", "tentative"))) {
    stop("gatherDataUnknown: 'retrieval' must be \"unknown\" or \"tentative\"")
  }

  ## Read from compound list
  if (retrieval == "unknown") {
    mass <- findMass(id, "unknown", mode)
    formula <- ""
  } else {
    mass <- findMass(id, "tentative", mode)
    formula <- findFormula(id, "tentative")
  }
  # No CAS number is looked up here; the index field is always empty.
  dbcas <- ""
  dbname <- findName(id)
  if (is.na(dbname)) dbname <- paste("Unknown ID:", id)

  ## Create the record skeleton
  mbdata <- list()
  mbdata[["id"]] <- id
  mbdata[["dbcas"]] <- dbcas
  mbdata[["dbname"]] <- dbname
  mbdata[["dataused"]] <- "none"
  mbdata[["ACCESSION"]] <- ""
  mbdata[["RECORD_TITLE"]] <- ""
  mbdata[["DATE"]] <- format(Sys.Date(), "%Y.%m.%d")
  mbdata[["AUTHORS"]] <- annotations$authors
  mbdata[["LICENSE"]] <- annotations$license
  mbdata[["COPYRIGHT"]] <- annotations$copyright

  # Confidence annotation and internal ID annotation.
  # The ID of the compound will be written like:
  # COMMENT: EAWAG_UCHEM_ID 1234
  # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID"
  confidence_by_level <- c(
    "1" = "Reference Standard (Level 1)",
    "1a" = "Reference Standard (Level 1)",
    "2" = "Probable structure, tentative identification (Level 2)",
    "2a" = "Probable structure via library match, tentative identification (Level 2a)",
    "2b" = "Probable structure via diagnostic evidence, tentative identification (Level 2b)",
    "3" = "Tentative identification only (Level 3)",
    "3a" = "Tentative identification: most likely structure (Level 3)",
    "3b" = "Tentative identification: isomers possible (Level 3)",
    "3c" = "Tentative identification: substance class known (Level 3)",
    "3d" = "Tentative identification: best match only (Level 3)",
    "4" = "Tentative identification: molecular formula only (Level 4)",
    "5" = "Tentative identification: structure and formula unknown (Level 5)"
  )
  mbdata[["COMMENT"]] <- list()
  # Look the level up once; as.character() keeps a numeric level from being
  # used as a positional index into the lookup table.
  level <- as.character(findLevel(id))
  if (isTRUE(level == "0")) {
    # Level 0: use the free-text confidence comment from the settings
    mbdata[["COMMENT"]][["CONFIDENCE"]] <- annotations$confidence_comment
  } else if (length(level) == 1 && level %in% names(confidence_by_level)) {
    mbdata[["COMMENT"]][["CONFIDENCE"]] <- confidence_by_level[[level]]
  }
  mbdata[["COMMENT"]][["ID"]] <- id

  # here compound info starts
  mbdata[["CH$NAME"]] <- as.list(dbname)

  # Currently we use a fixed value for Compound Class, since there is no useful
  # convention of what should go there and what shouldn't, and the field is not used
  # in search queries.
  mbdata[["CH$COMPOUND_CLASS"]] <- annotations$compound_class
  mbdata[["CH$FORMULA"]] <- formula
  mbdata[["CH$EXACT_MASS"]] <- mass
  mbdata[["CH$SMILES"]] <- ""
  mbdata[["CH$IUPAC"]] <- ""

  # Nothing is known that could be linked to an external database.
  mbdata[["CH$LINK"]] <- list()

  return(mbdata)
}

# Flatten the internal tree-like representation of MassBank data to a flat table.
# Note that this limits us, in that the fields should be constant over all records!
# Therefore, e.g. the fixed number of 5 names which may be filled.
# If anybody has a cooler solution, I'll be happy to hear from you :)
#
# Note: the records from gatherData have additional information which is discarded, like
# author, copyright etc. They will be re-filled automatically when reading the file.
#' Flatten, or re-read, MassBank header blocks
#'
#' \code{flatten} converts a list of MassBank compound information sets (as
#' retrieved by \code{\link{gatherData}}) to a flat table, to be exported into
#' an \link[=loadInfolist]{infolist}. \code{readMbdata} reads a single record
#' from an infolist flat table back into a MassBank (half-)entry.
#'
#' Neither the flattening system itself nor the implementation are particularly
#' fantastic, but since hand-checking of records is a necessary evil, there is
#' currently no alternative (short of coding a complete GUI for this and
#' working directly on the records.)
#'
#' @aliases flatten readMbdata
#' @usage flatten(mbdata)
#'
#' readMbdata(row)
#' @param mbdata A list of MassBank compound information sets as returned from
#' \code{\link{gatherData}}.
#' @param row One row of MassBank compound information retrieved from an
#' infolist.
#' @return \code{flatten} returns a tibble (not a data frame or matrix) to be written to
#' CSV.
#'
#' \code{readMbdata} returns a list of type \code{list(id= \var{compoundID},
#' ..., 'ACCESSION' = '', 'RECORD_TITLE' = '', )} etc.
#' @author Michael Stravs
#' @seealso \code{\link{gatherData}},\code{\link{loadInfolist}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples \dontrun{
#' # Collect some data to flatten
#' ids <- c(40,50,60,70)
#' data <- lapply(ids, gatherData)
#' # Flatten the data trees to a table
#' flat.table <- flatten(data)
#' # reimport the table into a tree
#' data.reimported <- apply(flat.table, 1, readMbdata)
#' }
#'
#' @export
#'
flatten <- function(mbdata) {
  .checkMbSettings()

  # The COMMENT columns are derived from the first record only; all records
  # are expected to carry the same set of comment fields. An empty input has
  # no first record and therefore no comment columns.
  commentNames <- character(0)
  if (length(mbdata) > 0) {
    first <- mbdata[[1]]
    colNames <- names(unlist(first))
    commentNames <- colNames[grepl(x = colNames, pattern = "^COMMENT\\_")]
    if (!is.null(first[["COMMENT"]])) {
      commentNames <- c(commentNames, glue::glue("COMMENT_{names(first[['COMMENT']])}"))
    }
  }

  colList <- c(
    "id",
    "dbcas",
    "dbname",
    "dataused",
    # Note: The field name of the internal id field is replaced with the real name
    # at "compilation" time. Therefore, functions DOWNSTREAM from compileRecord()
    # must use the full name including the info from options("RMassBank").
    commentNames,
    "CH$NAME1",
    "CH$NAME2",
    "CH$NAME3",
    "CH$NAME4",
    "CH$NAME5",
    "CH$COMPOUND_CLASS",
    "CH$FORMULA",
    "CH$EXACT_MASS",
    "CH$SMILES",
    "CH$IUPAC",
    "CH$LINK_CAS",
    "CH$LINK_CHEBI",
    "CH$LINK_HMDB",
    "CH$LINK_KEGG",
    "CH$LINK_LIPIDMAPS",
    "CH$LINK_PUBCHEM",
    "CH$LINK_INCHIKEY",
    "CH$LINK_CHEMSPIDER",
    "CH$LINK_COMPTOX"
  )

  # Empty table that fixes the column set and the column order.
  mbtbl <- tibble::tibble(!!!colList, .rows = 0, .name_repair = ~ colList)

  # One named character vector per record: all columns which
  # a) exist in the target table and b) exist in the (unlisted) MB record.
  rows <- lapply(mbdata, function(record) {
    data <- unlist(record, use.names = TRUE)
    names(data) <- gsub("\\.", "_", names(data))
    # With only one name, unlist() yields "CH$NAME" instead of "CH$NAME1".
    if (!("CH$NAME1" %in% names(data)) && ("CH$NAME" %in% names(data))) {
      data[["CH$NAME1"]] <- data[["CH$NAME"]]
    }
    data[intersect(colList, names(data))]
  })

  # Bind everything in one call instead of growing the table row by row.
  mbtbl <- do.call(dplyr::bind_rows, c(list(mbtbl), rows))

  return(mbtbl)
}

# Read data from a flat-table MassBank record row and feed it into a
# MassBank tree-like record. Also, prime the ACCESSION and RECORD_TITLE fields in the
# correct position in the record.
#' @export
readMbdata <- function(row) {
  .checkMbSettings()
  annotations <- getOption("RMassBank")$annotations

  # TRUE only for a single non-missing value. Columns that are absent from the
  # row are NULL, and a bare `if (!is.na(NULL))` would stop with
  # "argument is of length zero".
  has_value <- function(x) length(x) == 1 && !is.na(x)

  # Listify the table row. Lists are just cooler to work with :)
  row <- as.list(row)

  mbdata <- list()
  # Accession and title are added empty for now, to have them in the right place.
  # Constants are read from the options or generated.
  mbdata[["ACCESSION"]] <- ""
  mbdata[["RECORD_TITLE"]] <- ""
  mbdata[["DATE"]] <- format(Sys.Date(), "%Y.%m.%d")
  mbdata[["AUTHORS"]] <- annotations$authors
  mbdata[["LICENSE"]] <- annotations$license
  mbdata[["COPYRIGHT"]] <- annotations$copyright
  if (has_value(annotations$publication) && annotations$publication != "") {
    mbdata[["PUBLICATION"]] <- annotations$publication
  }
  # A per-record publication overrides the one from the settings.
  if (has_value(row[["PUBLICATION"]])) {
    mbdata[["PUBLICATION"]] <- row[["PUBLICATION"]]
  }

  # Comment columns may be called COMMENT.xxx or COMMENT_xxx; keep the filled ones.
  rowNames <- names(row)
  commentNames <- c(
    rowNames[grepl(x = rowNames, pattern = "^COMMENT\\.")],
    rowNames[grepl(x = rowNames, pattern = "^COMMENT\\_")]
  )
  commentNames <- commentNames[!is.na(row[commentNames])]

  mbdata[["COMMENT"]] <- list()
  # Strip the prefix for both spellings, so a "COMMENT.xxx" column does not end
  # up as a subfield literally named "COMMENT.xxx".
  mbdata[["COMMENT"]][sub("^COMMENT[._]", "", commentNames)] <- row[commentNames]

  compoundNames <- c(
    row[["CH$NAME1"]], row[["CH$NAME2"]], row[["CH$NAME3"]],
    row[["CH$NAME4"]], row[["CH$NAME5"]]
  )
  compoundNames <- compoundNames[which(!is.na(compoundNames))]
  compoundNames <- gsub("'", "`", compoundNames)

  mbdata[["CH$NAME"]] <- compoundNames
  mbdata[["CH$COMPOUND_CLASS"]] <- row[["CH$COMPOUND_CLASS"]]
  mbdata[["CH$FORMULA"]] <- row[["CH$FORMULA"]]
  mbdata[["CH$EXACT_MASS"]] <- row[["CH$EXACT_MASS"]]
  mbdata[["CH$SMILES"]] <- row[["CH$SMILES"]]
  mbdata[["CH$IUPAC"]] <- row[["CH$IUPAC"]]

  # Add all links and then eliminate the NA values from the tree.
  linkKeys <- c(
    "CAS", "CHEBI", "HMDB", "KEGG", "LIPIDMAPS",
    "PUBCHEM", "INCHIKEY", "CHEMSPIDER", "COMPTOX"
  )
  link <- list()
  for (key in linkKeys) {
    # Assigning NULL (column absent) leaves the list unchanged.
    link[[key]] <- row[[paste0("CH$LINK_", key)]]
  }
  link[which(is.na(link))] <- NULL
  mbdata[["CH$LINK"]] <- link

  ## SP$SAMPLE
  if (all(nchar(row[["SP_SAMPLE"]]) > 0, row[["SP_SAMPLE"]] != "NA", !is.na(row[["SP_SAMPLE"]]), na.rm = TRUE)) {
    mbdata[["SP$SAMPLE"]] <- row[["SP_SAMPLE"]]
  }

  # Per-record authors / copyright override the values from the settings.
  if (has_value(row[["AUTHORS"]])) {
    mbdata[["AUTHORS"]] <- row[["AUTHORS"]]
  }
  if (has_value(row[["COPYRIGHT"]])) {
    mbdata[["COPYRIGHT"]] <- row[["COPYRIGHT"]]
  }

  return(mbdata)
}

#' Generate peak annotation from peaklist
#'
#' Generates the PK$ANNOTATION entry from the peaklist obtained. This function is
#' overridable by using the "annotator" option in the settings file.
#'
#' @param annotation A peak list to be annotated. Contains columns:
#' \code{"cpdID","formula","mzFound" ,"scan","mzCalc","dppm",
#' "dbe","mz","int","formulaCount","parentScan","fM_factor","dppmBest",
#' "formulaMultiplicity","intrel","mzSpec"}
#'
#' @param formulaTag The ion type to be added to annotated formulas ("+" or "-" usually)
#'
#' @return The annotated peak table. Table \code{colnames()} will be used for the
#' titles (preferrably don't use spaces in the column titles; however no format is
#' strictly enforced by the MassBank data format.
#'
#' @examples
#' \dontrun{
#' annotation <- annotator.default(annotation)
#' }
#' @author Michele Stravs, Eawag
#' @export
annotator.default <- function(annotation, formulaTag = NULL) {
  # formulaTag defaults to NULL so that the documented one-argument call works;
  # NULL means "append nothing" to the formulas.
  type <- if (is.null(formulaTag)) "" else formulaTag

  # Only peaks that carry a formula can be annotated.
  annotation <- annotation[!is.na(annotation$formula), , drop = FALSE]
  annotation <- annotation[annotation$formula != "", , drop = FALSE]

  # rep() keeps the result zero-length when no annotated peaks are left.
  annotation$formula <- paste0(annotation$formula, rep(type, length(annotation$formula)))
  # Select the right columns and name them correctly for output.
  annotation <- annotation[, c("mz", "formula", "formulaCount", "mzCalc", "dppm")]
  colnames(annotation) <- c("m/z", "tentative_formula", "formula_count", "mass", "error(ppm)")

  return(annotation)
}

#' Parse record title
#'
#' Parses a title for a single MassBank record using the title format
#' specified in the option titleFormat. Internally used, not exported.
#'
#' If the option is not set, a standard title format is used (for record definition
#' version 1 or 2).
#'
#' @usage .parseTitleString(mbdata)
#' @param mbdata list
#' The information data block for the record header, as stored in
#' \code{mbdata_relisted} after loading an infolist.
#' @return A string with the title.
#' @author Michael Stravs, Eawag
#' @seealso \code{\link{buildRecord}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples
#' \dontrun{
#' # used in buildRecord()
#' title <- .parseTitleString(mbdata)
#' }
#'
#'
#'
.parseTitleString <- function(mbdata) {
  settings <- getOption("RMassBank")
  varlist <- settings$titleFormat

  # Set the standard title format. Record definition version 2 keeps the MS
  # parameters under AC$MASS_SPECTROMETRY, version 1 under AC$ANALYTICAL_CONDITION.
  # An unset use_version falls back to the version 1 layout.
  if (is.null(varlist)) {
    acBlock <- if (isTRUE(settings$use_version == 2)) {
      "AC$MASS_SPECTROMETRY"
    } else {
      "AC$ANALYTICAL_CONDITION"
    }
    varlist <- c(
      "{CH$NAME}",
      "{AC$INSTRUMENT_TYPE}",
      paste0("{", acBlock, ": MS_TYPE}"),
      "CE: {RECORD_TITLE_CE}",
      paste0("R={", acBlock, ": RESOLUTION}"),
      "{MS$FOCUSED_ION: PRECURSOR_TYPE}"
    )
  }

  # Extract a {XXX} argument from each title section.
  # check that every title has one and only one match
  placeholder <- "\\{(.*)\\}"
  arglist <- regmatches(varlist, regexec(placeholder, varlist))
  if (any(lengths(arglist) != 2)) {
    stop("Title format is incorrectly specified: a section with not exactly 1 parameters")
  }

  parsedVars <- vapply(seq_along(varlist), function(i) {
    var <- varlist[[i]]
    # The specified parameter inside the {}.
    # I.e. from a string like "R={BLA: BLUB}" this is "BLA: BLUB"
    arg <- arglist[[i]][[2]]
    # Split the parameter by colon if necessary
    splitVar <- strsplit(arg, ": ")[[1]]
    # Read the parameter value from the record
    if (length(splitVar) == 2) {
      replaceVar <- mbdata[[splitVar[[1]]]][[splitVar[[2]]]]
    } else if (length(splitVar) == 1) {
      replaceVar <- mbdata[[splitVar]]
    } else {
      stop(paste("Title format is incorrectly specified:", var))
    }
    # Names will have >= 1 match. Take the first
    if (length(replaceVar) > 1) {
      replaceVar <- replaceVar[[1]]
    }
    # NULL returns, and unknowns without a name, become an empty string
    if (length(replaceVar) == 0) {
      replaceVar <- ""
    }

    # Substitute the parameter value into the string. The value is inserted
    # literally: sub() would interpret backslashes in it (e.g. "\\1") as
    # escapes or backreferences and corrupt such compound names.
    hit <- regexpr(placeholder, var)
    regmatches(var, hit) <- as.character(replaceVar)
    var
  }, character(1))

  title <- paste(parsedVars, collapse = "; ")
  return(title)
}


# This converts the tree-like list (as obtained e.g.
# from compileRecord())
# into a plain text array, which can then be dumped to a file suitable for
# MassBank upload.

#' Write MassBank record into character array
#'
#' Writes a MassBank record in list format to a text array.
#'
#' The function is a general conversion tool for the MassBank format; i.e. the
#' field names are not fixed. \code{mbdata} must be a named list, and the
#' entries can be as follows: \itemize{
#' \item A single text line:
#'
#' \code{'CH\$EXACT_MASS' = '329.1023'}
#'
#' is written as
#'
#' \code{CH\$EXACT_MASS: 329.1023}
#' \item A character array:
#'
#' \code{'CH\$NAME' = c('2-Aminobenzimidazole', '1H-Benzimidazol-2-amine')}
#'
#' is written as
#'
#' \code{CH\$NAME: 2-Aminobenzimidazole}
#'
#' \code{CH\$NAME: 1H-Benzimidazol-2-amine}
#'
#' \item A named list of strings:
#'
#' \code{'CH\$LINK' = list('CHEBI' = "27822", "KEGG" = "C10901")}
#'
#' is written as
#'
#' \code{CH\$LINK: CHEBI 27822}
#'
#' \code{CH\$LINK: KEGG C10901}
#'
#' \item A data frame (e.g. the peak table) is written as specified in
#' the MassBank record format (Section 2.6.3): the column names are used as
#' headers for the first line, all data rows are printed space-separated.
#' }
#'
#' @usage toMassbank(o, ...)
#' @param o An object to convert to MassBank record format. This may be
#' a single `RmbSpectrum2`, or a complete compound (an `RmbSpectraSet`).
#' @param ... Parameters passed to the implementation,
#' in particular `addAnnotation`
#' @param addAnnotation `logical`, whether to add peak annotations (putative formulas) to the record.
#'
#' @return The result is a text array, which is ready to be written to the disk
#' as a file.
#' @note The function iterates over the list item names. \bold{This means that
#' duplicate entries in \code{mbdata} are (partially) discarded!} The correct
#' way to add them is by making a character array (as specified above): Instead
#' of \code{'CH\$NAME' = 'bla', 'CH\$NAME' = 'blub'} specify \code{'CH\$NAME' =
#' c('bla','blub')}.
#' @author Michael Stravs
#' @seealso \code{\link{buildRecord}}, \code{\link{mbWorkflow}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @examples
#' \dontrun{
#' # Read just the compound info skeleton from the Internet for some compound ID
#' id <- 35
#' mbdata <- gatherData(id)
#' # Export the mbdata blocks to line arrays
#' # (there is no spectrum information, just the compound info...)
#' mbtext <- toMassbank(mbdata)
#' }
#'
#'
#' @export
setGeneric("toMassbank", function(o, ...) standardGeneric("toMassbank"))


#' @rdname toMassbank
#' @export
setMethod("toMassbank", "RmbSpectraSet", function(o, addAnnotation = getOption("RMassBank")$add_annotation) {
  # One text record per child spectrum of the compound
  lapply(o@children, function(s) toMassbank(s, addAnnotation))
})

#' @rdname toMassbank
#' @export
setMethod("toMassbank", "RmbSpectrum2", function(o, addAnnotation = getOption("RMassBank")$add_annotation) {
  .toMassbank(o, addAnnotation)
})

# Converts one spectrum (with its info block in s@info) into the lines of a
# MassBank record, terminated by the "//" end-of-record marker.
.toMassbank <- function(s, addAnnotation = getOption("RMassBank")$add_annotation) {
  peaks <- getData(s)
  # check that peaks were normalized
  if (!("intrel" %in% colnames(peaks))) {
    s <- normalize(s, slot = "intrel")
    peaks <- getData(s)
  }

  # Keep only peaks with relative intensity >= 1 o/oo, since the MassBank record
  # makes no sense otherwise.
  peaks <- peaks[peaks$intrel >= 1, , drop = FALSE]

  peaks$mz <- round(peaks$mz, 4)
  # Also format the other values, which are used in the annotation
  peaks$dppm <- round(peaks$dppm, 2)
  peaks$mzCalc <- round(peaks$mzCalc, 4)
  peaks$intensity <- round(peaks$intensity, 1)

  # Get polarity from Spectrum2 now!
  # if polarity is -1, leave it unspecified. the "specs" seem to be 1 for +, 0 for - and -1 for ???
  # (when reading mzML I often get -1, when reading mzXML I get 1 and 0 respectively)
  formulaTag <- ""
  if (s@polarity == 1) formulaTag <- "+"
  if (s@polarity == 0) formulaTag <- "-"

  # The annotator is looked up by name so that it can be overridden in the settings.
  annotator <- getOption("RMassBank")$annotator
  if (is.null(annotator)) {
    annotator <- "annotator.default"
  }
  annotation <- do.call(annotator, list(annotation = peaks, formulaTag = formulaTag))

  # Keep only the columns needed in the peak table and name them correctly.
  peaks <- peaks[, c("mz", "intensity", "intrel")]
  peaks <- unique(peaks)
  colnames(peaks) <- c("m/z", "int.", "rel.int.")
  peaknum <- nrow(peaks)

  mbdata <- s@info

  mbdata[["PK$SPLASH"]] <- list(SPLASH = getSplash(peaks[, c("m/z", "int.")]))

  # Annotation: an unset add_annotation option (NULL) means "no annotation"
  # instead of an error in `&&`.
  if (isTRUE(as.logical(addAnnotation)) && nrow(annotation) > 0) {
    mbdata[["PK$ANNOTATION"]] <- annotation
  }

  # Peak table
  mbdata[["PK$NUM_PEAK"]] <- peaknum
  mbdata[["PK$PEAK"]] <- peaks

  # Indentation of table rows below their header line.
  # NOTE(review): the reviewed view of this file collapses runs of whitespace,
  # so the width of this literal could not be read from it. Two spaces are used
  # on the assumption that this matches the MassBank record format; confirm
  # against the canonical source and the format specification.
  tableIndent <- "  "

  # Turns one top-level entry into its record lines:
  # * data frame: a header line plus one space-separated line per row
  # * named list: one line per subentry, like CH$LINK: CAS 12-345-678
  # * unnamed vector/list with several items: one line per item (CH$NAME: ...)
  # * anything else: a single line like PK$NUM_PEAK: 131
  entryLines <- function(entry) {
    value <- mbdata[[entry]]
    if (is.data.frame(value)) {
      header <- paste0(entry, ": ", paste(colnames(value), collapse = " "))
      # seq_len() writes no rows at all for an empty table
      rows <- vapply(seq_len(nrow(value)), function(i) {
        paste0(
          tableIndent,
          paste(prettyNum(value[i, ], scientific = FALSE, digits = 12), collapse = " ")
        )
      }, character(1))
      return(c(header, rows))
    }
    if (is.list(value) && !is.null(names(value))) {
      return(vapply(names(value), function(subentry) {
        if (subentry != "SPLASH") {
          paste0(entry, ": ", subentry, " ", value[[subentry]])
        } else {
          # The SPLASH is written without its subentry name
          paste0(entry, ": ", value[[subentry]])
        }
      }, character(1), USE.NAMES = FALSE))
    }
    if (length(value) > 1 && is.null(names(value))) {
      return(vapply(value, function(subentry) {
        paste0(entry, ": ", subentry)
      }, character(1), USE.NAMES = FALSE))
    }
    paste0(entry, ": ", value)
  }

  # Entries are looked up by name, so only the first of several identically
  # named entries is ever written (see the @note of toMassbank).
  mbf <- unlist(lapply(names(mbdata), entryLines), use.names = FALSE)
  # Add mandatory EOF marker
  mbf <- c(mbf, "//")
  return(mbf)
}

# Exports compiled and massbanked spectra, with their associated molfiles, to physical files.
# "compiled" is still used here, because we need an accessible accession number.
# In the plain text arrays, the accession number is already "hidden".
# compiled: is ONE "compiled" entry, i.e. ONE compound with e.g. 14 spectra.
# files: is a return value from lapply(toMassbank), i.e.
# contains 14 plain-text arrays
# (for a 14-spectra method)
# molfile: a molfile from createMolfile

#' Export internally stored MassBank data to files
#'
#' Exports MassBank recfile data arrays and corresponding molfiles to physical
#' files on hard disk, for one compound.
#'
#' The data from \code{compiled} is still used here, because it contains the
#' "visible" accession number. In the plain-text format contained in
#' \code{files}, the accession number is not "accessible" anymore since it's in
#' the file.
#'
#' @usage exportMassbank(compiled, molfile = NULL)
#' @param compiled \code{RmbSpectraSet}
#' the spectra of one compound for which files should be exported
#' @param molfile A molfile from \code{\link{createMolfile}};
#' deprecated since molfiles are not used by MassBank anymore.
#' @return No return value.
#' @note An improvement would be to write the accession numbers into
#' \code{names(compiled)} and later into \code{names(files)} so \code{compiled}
#' wouldn't be needed here anymore. (The compound ID would have to go into
#' \code{names(molfile)}, since it is also retrieved from \code{compiled}.)
#' @author Michael Stravs
#' @seealso \code{\link{createMolfile}}, \code{\link{toMassbank}},
#' \code{\link{mbWorkflow}}
#' @references MassBank record format:
#' \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf}
#' @export
exportMassbank <- function(compiled, molfile = NULL) {
  # Both output folders live below the folder named after the entry prefix.
  prefix <- getOption("RMassBank")$annotations$entry_prefix
  exportMassbank_recdata(
    compiled,
    recDataFolder = file.path(prefix, "recdata")
  )
  if (!is.null(molfile)) {
    exportMassbank_moldata(
      compiled,
      molfile,
      molDataFolder = file.path(prefix, "moldata")
    )
  }
}

# Writes one <ACCESSION>.txt record file per child spectrum of `compiled`
# into recDataFolder.
exportMassbank_recdata <- function(compiled, recDataFolder) {
  files <- toMassbank(compiled)
  # Read the accession no. from the corresponding "compiled" entry; it is used
  # as the file name. vapply() fails loudly if a spectrum has no single
  # ACCESSION value instead of silently producing an odd file name.
  accessions <- vapply(
    compiled@children,
    function(child) as.character(child@info[["ACCESSION"]]),
    character(1)
  )
  names(files) <- accessions

  # Without the folder, write() fails with an opaque "cannot open file" error.
  if (!dir.exists(recDataFolder)) {
    dir.create(recDataFolder, recursive = TRUE)
  }

  for (i in seq_along(files)) {
    fileName <- paste0(names(files)[[i]], ".txt")
    write(files[[i]], file.path(recDataFolder, fileName))
  }
}

# Writes the molfile as <zero-padded ID>.mol into molDataFolder, but only for
# compounds whose level is "standard".
exportMassbank_moldata <- function(compiled, molfile, molDataFolder) {
  # Use internal ID for naming the molfiles
  if (findLevel(compiled@id, TRUE) == "standard") {
    if (!dir.exists(molDataFolder)) {
      dir.create(molDataFolder, recursive = TRUE)
    }
    molname <- paste0(sprintf("%04d", as.numeric(compiled@id)), ".mol")
    write(molfile, file.path(molDataFolder, molname))
  }
}





# Makes a list.tsv with molfile -> massbank ch$name attribution.

#' Write list.tsv file
#'
#' Makes a list.tsv file in the "moldata" folder.
#'
#' Generates the list.tsv file which is needed by MassBank to connect records with
#' their respective molfiles. The first compound name is linked to a mol-file with
#' the compound ID (e.g. 2334.mol for ID 2334).
+#' +#' @param compiled list of \code{RmbSpectraSet} +#' compiled spectra for multiple compounds (one \code{RmbSpectraSet} each). +#' @return No return value. +#' @author Michael A. Stravs, Eawag +#' @export +makeMollist <- function(compiled) +{ + # For every "compiled" entry (here, compiled is not one "compiled" entry but the total + # list of all compiled spectra), extract the uppermost CH$NAME and the ID (from the + # first spectrum.) Make the ID into 0000 format. + + emptySpectra <- unlist(lapply(compiled, function(cpd) length(cpd@children) == 0)) + compiled <- compiled[!emptySpectra] + + tsvlist <- t(sapply(compiled, function(entry) + { + name <- entry@children[[1]]@info[["CH$NAME"]][[1]] + id <- sprintf("%04d", as.numeric(entry@id)) + molfilename <- paste(id,".mol",sep='') + return(c(name,molfilename)) + })) + + IDs <- sapply(compiled, function(entry) return( sprintf("%04d", as.numeric( + entry@id)))) + level <- sapply(IDs, findLevel, compact=TRUE) + validentries <- which(level == "standard") + # Write the file with the + write.table(tsvlist[validentries,], + paste(getOption("RMassBank")$annotations$entry_prefix,"/moldata/list.tsv", sep=''), + quote = FALSE, + sep="\t", + row.names=FALSE, + col.names=FALSE + ) +} + + +# Load a dataframe or file into additional_peaks (or add additional points in there.) +# The columns cpdID, scan, mzFound, int, OK are mandatory. OK=1 means that the peaks +# will be added into the spectrum. mzFound and int will be taken for the table. +# No annotation will be written. +# Add peaks to the spectra by hand + +#' Add additional peaks to spectra +#' +#' Loads a table with additional peaks to add to the MassBank spectra. Required +#' columns are \code{cpdID, scan, int, mzFound, OK}. +#' +#' All peaks with OK=1 will be included in the spectra. +#' +#' @usage addPeaks(mb, filename_or_dataframe) +#' @param mb The \code{mbWorkspace} to load the peaks into. 
+#' @param filename_or_dataframe Filename of the csv file, or name of the R +#' dataframe containing the peaklist. +#' @return The \code{mbWorkspace} with loaded additional peaks. +#' @author Michael Stravs +#' @seealso \code{\link{mbWorkflow}} +#' @examples +#' +#' \dontrun{addPeaks("myrun_additionalPeaks.csv")} +#' +#' @export +addPeaks <- function(mb, filename_or_dataframe) +{ + + errorvar <- 0 + currEnvir <- environment() + d <- 1 + + if(is.data.frame(filename_or_dataframe)) + df <- filename_or_dataframe + else + tryCatch( + df <- readr::read_csv(filename_or_dataframe), + df <- as.data.frame(df), + error=function(e){ + currEnvir$errorvar <- 1 + }) + # I change your heuristic fix to another heuristic fix, because I will have to test for a column name change... + + if(!errorvar){ + + if(ncol(df) < 2){ + df <- readr::read_delim(file = filename_or_dataframe, delim = ";") + df <- as.data.frame(df) + } + # here: the column int was renamed to intensity, and we need to be able to read old files. sorry. + if(!("intensity" %in% colnames(df)) & ("int" %in% colnames(df))) + df$intensity <- df$int + + cols <- c("cpdID", "scan", "mzFound", "intensity", "OK") + n <- colnames(df) + # Check if comma-separated or semicolon-separated + d <- setdiff(cols, n) + if(length(d)>0){ + stop("Some columns are missing in the additional peak list. Needs at least cpdID, scan, mzFound, intensity, OK.") + } + } + + culled_df <- df[,c("cpdID", "scan", "mzFound", "intensity", "OK")] + + + if(nrow(mb@additionalPeaks) == 0) + mb@additionalPeaks <- culled_df + else + mb@additionalPeaks <- rbind(mb@additionalPeaks, culled_df) + return(mb) +} + + + +gatherDataMinimal.cpd <- function(cpd){ + + ##Read from Compoundlist + if(length(cpd@smiles) == 1) smiles <- cpd@smiles + else + smiles <- "" + + ##Create + mbdata <- list() + mbdata[['ACCESSION']] <- "" + mbdata[['RECORD_TITLE']] <- "" + mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") + # Confidence annotation and internal ID annotation. 
+ # The ID of the compound will be written like: + # COMMENT: EAWAG_UCHEM_ID 1234 + # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" + if(length(cpd@id) > 0) + mbdata[["COMMENT"]][["ID"]] <- cpd@id + + # here compound info starts + mbdata[['CH$NAME']] <- cpd@name + + # Currently we use a fixed value for Compound Class, since there is no useful + # convention of what should go there and what shouldn't, and the field is not used + # in search queries. + mbdata[['CH$FORMULA']] <- cpd@formula + mbdata[['CH$EXACT_MASS']] <- round(findMz.formula(cpd@formula, "")$mzCenter, 4) + + if(cpd@smiles != "") + mbdata[['CH$SMILES']] <- cpd@smiles + + link <- list() + mbdata[['CH$LINK']] <- link + + return(mbdata) +} + + + +gatherDataMinimal.spectrum <- function(spectrum){ + + + smiles <- "" + + ##Create + mbdata <- list() + mbdata[['ACCESSION']] <- "" + mbdata[['RECORD_TITLE']] <- "" + mbdata[['DATE']] <- format(Sys.Date(), "%Y.%m.%d") + # Confidence annotation and internal ID annotation. + # The ID of the compound will be written like: + # COMMENT: EAWAG_UCHEM_ID 1234 + # if annotations$internal_id_fieldname is set to "EAWAG_UCHEM_ID" + + # here compound info starts + mbdata[['CH$NAME']] <- paste("parent", spectrum@precursorMz, "at RT", spectrum@rt, "- CE", spectrum@collisionEnergy) + + # Currently we use a fixed value for Compound Class, since there is no useful + # convention of what should go there and what shouldn't, and the field is not used + # in search queries. + + return(mbdata) +} + + diff --git a/R/settings_example.R b/R/settings_example.R index 120e6ab..7a26fe7 100755 --- a/R/settings_example.R +++ b/R/settings_example.R @@ -1,457 +1,475 @@ -.checkMbSettings <- function() -{ - o <- getOption("RMassBank", NULL) - if(is.null(o)){ - stop("Please load your settings before using the RMassBank workflow.") - } - -} - - -#' RMassBank settings -#' -#' Describes all settings for the RMassBank settings file. 
-#' -#' \itemize{ -#' \item{\code{deprofile}}{ -#' Whether and how to deprofile input raw files. Leave the -#' setting empty if your raw files are already in "centroid" mode. If your -#' input files are in profile mode, you have the choice between algorithms -#' \code{\link{deprofile}.spline, deprofile.fwhm, deprofile.localMax}; refer to -#' the individual manpages for more information.} -#' \item{\code{rtMargin, rtShift}}{ -#' The allowed retention time deviation relative to the -#' values specified in your compound list (see \code{\link{loadList}}), and the systematic -#' shift (due to the use of, e.g., pre-columns or other special equipment.} -#' \item{\code{babeldir}}{ -#' Directory to OpenBabel. Required for creating molfiles for MassBank export. -#' If no OpenBabel directory is given, RMassBank will attempt to use the CACTUS webservice -#' for SDF generation. It is strongly advised to install OpenBabel; the CACTUS structures -#' have explicit hydrogen atoms. -#' The path should point to the directory where babel.exe (or the Linux "babel" equivalent) lies. -#' } -#' \item{\code{use_version}}{ -#' Which MassBank record format to use; version 2 is strongly advised, -#' version 1 is considered outdated and should be used only if for some reason you are running -#' old servers and an upgrade is not feasible.} -#' \item{\code{use_rean_peaks}}{ -#' Whether to include peaks from reanalysis (see -#' \code{\link{reanalyzeFailpeaks}}) in the MassBank records. Boolean, TRUE or FALSE. -#' } -#' \item{\code{annotations}}{ -#' A list of constant annotations to use in the MassBank records. The entries -#' \code{authors, copyright, license, instrument, instrument_type, compound_class} -#' correspond to the MassBank entries \code{AUTHORS, COPYRIGHT, PUBLICATION, LICENSE, AC$INSTRUMENT, -#' AC$INSTRUMENT_TYPE, CH$COMPOUND_CLASS}. The entry \code{confidence_comment} is added as -#' \code{COMMENT: CONFIDENCE} entry. 
-#' -#' The entry \code{internal_id_fieldname} is used to name -#' the MassBank entry which will keep a reference to the internal compound ID used in -#' the workflow: for \code{internal_id_fieldname = MYID} and e.g. compound 1234, an -#' entry will be added to the MassBank record with -#' \code{COMMENT: MYID 1234}. The internal fieldname should not be left empty! -#' -#' The entries \code{lc_gradient, lc_flow, lc_solvent_a, lc_solvent_b, lc_column} correspond -#' to the MassBank entries \code{AC$CHROMATOGRAPHY: FLOW_GRADIENT, FLOW_RATE, -#' SOLVENT A, SOLVENT B, COLUMN_NAME}. -#' -#' \code{ms_type, ionization} correspond to \code{AC$MASS_SPECTROMETRY: MS_TYPE, IONIZATION}. -#' -#' \code{entry_prefix} is the two-letter prefix used when building MassBank accession codes. -#' -#' Entries under \code{ms_dataprocessing} are added as \code{MS$DATA_PROCESSING:} entries, -#' in addition to the default \code{WHOLE: RMassBank}. -#' } -#' \item{\code{annotator}}{ -#' For advanced users: option to select your own custom annotator. -#' Check \code{\link{annotator.default}} and the source code for details.} -#' \item{\code{spectraList}}{ -#' This setting describes the experimental annotations for the single -#' data-dependent scans. For every data-dependent scan event, a \code{spectraList} entry with -#' \code{mode, ces, ce, res} denoting collision mode, collision energy in short and verbose -#' notation, and FT resolution.} -#' \item{\code{accessionNumberShifts}}{ -#' This denotes the starting points for accession numbers -#' for different ion types. 
For example, \code{pH: 0, mH: 50} means that [M+H]+ spectra will -#' start at \code{XX123401} (\code{XX} being the \code{entry_prefix} and \code{1234} the compound -#' id) and [M-H]- will start at \code{XX123451}.} -#' \item{\code{electronicNoise, electronicNoiseWidth}}{ -#' Known electronic noise peaks and the window -#' to be used by \code{\link{cleanElnoise}}} -#' \item{\code{recalibrateBy}}{ -#' \code{dppm} or \code{dmz} to recalibrate either by delta ppm or by -#' delta mz.} -#' \item{\code{recalibrateMS1}}{ -#' \code{common} or \code{separate} to recalibrate MS1 data points together -#' or separately from MS2 data points.} -#' \item{\code{recalibrator: MS1, MS2}}{ -#' The functions to use for recalibration of MS1 and MS2 data points. -#' Note that the \code{MS1} setting is only meaningful if \code{recalibrateMS1: separate}, otherwise -#' the \code{MS2} setting is used for a common recalibration curve. See \code{\link{recalibrate.loess}} -#' for details.} -#' \item{\code{multiplicityFilter}}{ -#' Define the multiplicity filtering level. Default is 2, a value of 1 -#' is off (no filtering) and >2 is harsher filtering.} -#' \item{\code{titleFormat}}{ -#' The title of MassBank records is a mini-summary -#' of the record, for example "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+". -#' By default, the first compound name \code{CH$NAME}, instrument type -#' \code{AC$INSTRUMENT_TYPE}, MS/MS type \code{AC$MASS_SPECTROMETRY: MS_TYPE}, -#' collision energy \code{RECORD_TITLE_CE}, resolution \code{AC$MASS_SPECTROMETRY: RESOLUTION} -#' and precursor \code{MS$FOCUSED_ION: PRECURSOR_TYPE} are used. If alternative -#' information is relevant to differentiate acquired spectra, the title should be adjusted. -#' For example, many TOFs do not have a resolution setting. -#' See MassBank documentation for more.} -#' \item{\code{filterSettings}}{ -#' A list of settings that affect the MS/MS processing. 
The entries -#' \code{ppmHighMass, ppmLowMass, massRangeDivision} set values for -#' pre-processing, prior to recalibration. \code{ppmHighMass} defines the -#' ppm error for the high mass range (default 10 ppm for Orbitraps), -#' \code{ppmLowMass} is the error for the low mass range (default 15 ppm -#' for Orbitraps) and \code{massRangeDivision} is the m/z value defining -#' the split between the high and low mass range (default m/z = 120). -#' -#' The entry \code{ppmFine} defines the ppm cut-off post recalibration. -#' The default value of 5 ppm is recommended for Orbitraps. For other -#' instruments this can be interpreted from the recalibration plot. -#' All ppm limits are one-sided (e.g. this includes values to +5 ppm or -5 ppm -#' deviation from the exact mass). -#' -#' The entries \code{prelimCut, prelimCutRatio} define the intensity cut-off and -#' cut-off ratio (in % of the most intense peak) for pre-processing. This affects -#' the peak selection for the recalibration only. Careful: the default value -#' 1e4 for Orbitrap LTQ positive mode could remove all peaks for TOF data -#' and will remove too many peaks for Orbitrap LTQ negative mode spectra! -#' -#' The entry \code{specOKLimit} defines the intensity limit to include MS/MS spectra. -#' MS/MS spectra must have at least one peak above this limit to proceed through -#' the workflow. -#' -#' \code{dbeMinLimit} defines the minimum allowable ring and double bond equivalents (DBE) -#' allowed for assigned formulas. This assumes maximum valuences for elements with -#' multiple valence states. The default is -0.5 (accounting for fragments being ions). -#' -#' The entries \code{satelliteMzLimit, satelliteIntLimit} define the cut-off m/z and -#' intensity values for satellite peak removal (an artefact of Fourier Transform -#' processing). All peaks within the m/z limit (default 0.5) and intensity ratio -#' (default 0.05 or 5 %) of the respective peak will be removed. 
Applicable to -#' Fourier Transform instruments only (e.g. Orbitrap). -#' } -#' \item{\code{filterSettings}}{ -#' Parameters for adjusting the raw data retrieval. -#' The entry \code{ppmFine} defines the ppm error to look for the precursor in -#' the MS1 (parent) spectrum. Default is 10 ppm for Orbitrap. -#' -#' \code{mzCoarse} defines the error to search for the precursor specification -#' in the MS2 spectrum. This is often only saved to 2 decimal places and thus -#' can be quite inaccurate. The accuracy also depends on the isolation window used. -#' The default settings (for e.g. Orbitrap) is 0.5 (Da, or Th for m/z). -#' -#' The entry \code{fillPrecursorScan} is largely untested. The default value -#' (FALSE) assumes all necessary precursor information is available in the mzML file. -#' A setting ot TRUE tries to fill in the precursor data scan number if it is missing. -#' Only tested on one case study so far - feedback welcome! -#' } -#' } -#' -#' -#' @author Michael Stravs, Emma Schymanski -#' @seealso \code{\link{loadRmbSettings}} -#' @rdname RmbSettings -#' @name RmbSettings -NULL - -.settingsList <- list( - # Deprofile input data? - # NA if input data is already in "centroid" mode, - # "deprofile.fwhm" or "deprofile.localMax" to convert the input data with the - # corresponding algorithm. See ?deprofile - deprofile = NA, - # Deviation (in minutes) allowed the for retention time - rtMargin = 0.4, - # Systematic RT shift - rtShift = -0.3, - # Directory to OpenBabel. Required for MassBank export - babeldir = NA, - # Which MassBank format should be used? Version 2 is advised. - use_version = 2, - # Include reanalyzed peaks? - use_rean_peaks = TRUE, - # annotate the spectra files with (putative) molecular formulas for fragments? 
- add_annotation = TRUE, - # Annotations for the spectrum: - annotations = list( - authors = "Nomen Nescio, The Unseen University", - copyright = "Copyright (C) XXX", - publication = "", - license = "CC BY", - instrument = "LTQ Orbitrap XL Thermo Scientific", - instrument_type = "LC-ESI-ITFT", - confidence_comment = "standard compound", - compound_class = "N/A; Environmental Standard", - internal_id_fieldname = "INTERNAL_ID", - # - # HPLC annotations: - # - # example: lc_gradient = "90/10 at 0 min, 50/50 at 4 min, 5/95 at 17 min, 5/95 at 25 min, 90/10 at 25.1 min, 90/10 at 30 min"" - lc_gradient = "", - # example: lc_flow = "200 uL/min", - lc_flow = "", - # example: lc_solvent_a = 'water with 0.1% formic acid', - lc_solvent_a = '', - lc_solvent_b = '', - # example: lc_column "XBridge C18 3.5um, 2.1x50mm, Waters", - lc_column = "", - # - # Prefix for MassBank accession IDs - # - entry_prefix = "XX", - contributor_prefix = "CONTRIBUTOR", - ms_type = "MS2", - ionization = "ESI", - ms_dataprocessing = list( - "RECALIBRATE" = "loess on assigned fragments and MS1" - ) - ), - include_sp_tags = FALSE, - # List of data-dependent scans in their order (relative to the parent scan) - # list(mode, ces, ce, res): - # mode: fragmentation mode - # ces: "short" format collision energy (for record title) - # ce: "long" format collision energy (for annotation field) - # res: FT resolution - spectraList = list( - list(mode="CID", ces = "35%", ce = "35 % (nominal)", res = 7500), - list(mode="HCD", ces = "15%", ce = "15 % (nominal)", res = 7500), - list(mode="HCD", ces = "30%", ce = "30 % (nominal)", res = 7500), - list(mode="HCD", ces = "45%",ce = "45 % (nominal)", res = 7500), - list(mode="HCD", ces = "60%",ce = "60 % (nominal)", res = 7500), - list(mode="HCD", ces = "75%",ce = "75 % (nominal)", res = 7500), - list(mode="HCD", ces = "90%",ce = "90 % (nominal)", res = 7500), - list(mode="HCD", ces = "15%",ce = "15 % (nominal)", res = 15000), - list(mode="HCD", ces = "30%",ce = "30 % 
(nominal)", res = 15000), - list(mode="HCD", ces = "45%",ce = "45 % (nominal)", res = 15000), - list(mode="HCD", ces = "60%",ce = "60 % (nominal)", res = 15000), - list(mode="HCD", ces = "75%", ce = "75 % (nominal)", res = 15000), - list(mode="HCD", ces = "90%", ce = "90 % (nominal)", res = 15000), - list(mode="CID", ces = "35%", ce = "35 % (nominal)", res = 15000) - ), - accessionNumberShifts = list( - "pH" = 0, # [M+H]+: Accession numbers 1-14 - "pM" = 16, # [M]+: 17-30 - "pNa" = 32, # [M+Na]+: 33-46 - "mH" = 50, # [M-H]-: 51-64 - "mFA" = 66, # [M+FA]-: 67-80 - "mM" = 80 # [M]-: 81-94 - ), - accessionBuilderType = NULL, - accessionBuilder = "MSBNK-{contributor_prefix}-{entry_prefix}{compound_id(4)}{scan_id(2)}", - # Validate accession? Set to FALSE to bypass accession validation - accessionValidate = TRUE, - # Known electronic noise peaks in the Orbitrap data - electronicNoise = c(189.825, 201.725,196.875), - # Exclusion width of electronic noise peaks (from unmatched peaks, prior to - # reanalysis) - electronicNoiseWidth = 0.3, - # recalibration settings: - # recalibrate by: dppm or dmz - recalibrateBy = "dppm", - # recalibrate MS1: - # separately ("separate") - # with common curve ("common") - # do not recalibrate ("none") - recalibrateMS1 = "common", - # Custom recalibration function: You can overwrite the recal function by - # making any function which takes rcdata$recalfield ~ rcdata$mzFound. - # The settings define which recal function is used - recalibrator = list( - MS1 = "recalibrate.loess", - MS2 = "recalibrate.loess"), -# Window width to look for MS1 peaks to recalibrate (in ppm) - recalibrateMS1Window= 15, - - # Define the multiplicity filtering level - # Default is 2 (peak occurs at least twice) - # Set this to 1 if you want to turn this option off. - # Set this to anything > 2 if you want harder filtering - multiplicityFilter = 2, - # Define the title format. 
- # You can use all entries from MassBank records as tokens - # plus the additional token RECORD_TITLE_CE, which is a shortened - # version of the collision energy specifically for use in the title. - # Every line is one entry and must have one token in curly brackets - # e.g. {CH$NAME} or {AC$MASS_SPECTROMETRY: MS_TYPE} plus optionally - # additional text in front or behind e.g. - # R={AC$MASS_SPECTROMETRY: RESOLUTION} - # If this is not specified, it defaults to a title of the format - # "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+" - titleFormat = c( - "{CH$NAME}", - "{AC$INSTRUMENT_TYPE}", - "{AC$MASS_SPECTROMETRY: MS_TYPE}", - "CE: {RECORD_TITLE_CE}", - "R={AC$MASS_SPECTROMETRY: RESOLUTION}", - "{MS$FOCUSED_ION: PRECURSOR_TYPE}" - ), -# Define filter settings. -# For Orbitrap, settings of 15 ppm in low mass range, 10 ppm in high -# mass range, m/z = 120 as mass range division and 5 ppm for recalibrated -# data overall are recommended. - filterSettings = list( - ppmHighMass = 10, - ppmLowMass = 15, - massRangeDivision= 120, - ppmFine= 5, - prelimCut= 1e4, - prelimCutRatio= 0, - fineCut= 0, - fineCutRatio= 0, - specOkLimit= 1e4, - dbeMinLimit= -0.5, - satelliteMzLimit= 0.5, - satelliteIntLimit= 0.05 - ), - - findMsMsRawSettings = list( - ppmFine= 10, - mzCoarse= 0.5, - fillPrecursorScan= FALSE) - ) - -# Writes a file with sample settings which the user can adjust with his values. -#' @export -RmbSettingsTemplate <- function(target) -{ - blub <- file.copy(from=system.file("RMB_options.ini", package="RMassBank"), to=target) -} - -# Loads settings from a file or from an object. -#' @export -loadRmbSettings <- function(file_or_list) -{ - # If the object exists in R, it is assumed to be the list itself - # Otherwise, it's assumed to be a file name and loaded. - # It will be either an INI file in YAML format or an R file to be directly sourced. 
- if(is.list(file_or_list)) - options(RMassBank = file_or_list) - else if(exists(file_or_list, inherits=TRUE)) - options(RMassBank = get(file_or_list)) - else if(file.exists(file_or_list)) - { - # Check if the file has an INI extension: - # If yes, load it with YAML - isIni <- grepl("\\.[iI][nN][iI]$", file_or_list, perl = T) - isIni <- isIni || grepl("\\.[yY][mM][lL]$", file_or_list, perl = T) - isR <- grepl("\\.[rR]$", file_or_list, perl = T) - if(isIni) - { - o <- yaml.load_file(file_or_list) - # Fix the YAML file to suit our needs - if(is.null(o$deprofile)) - o$deprofile <- NA - if(is.null(o$babeldir)){ - o$babeldir <- NA - } else{ - ##Check if babeldir exists - babelcheck <- gsub('\"','',o$babeldir) - if(substring(babelcheck, nchar(babelcheck)) == "\\"){ - babelexists <- file.exists(substring(babelcheck, 1, nchar(babelcheck)-1)) - } else{ - babelexists <- file.exists(babelcheck) - } - - if(!babelexists){ - stop("The babeldir does not exist. Please check the babeldir in the settings and adjust it accordingly.") - } - } - - for(name in names(o$annotations)) - { - if(is.null(o$annotations[[name]])) - o$annotations[[name]] <- "" - } - if (!is.null(o$logging_file)) { - appender_obj <- logger::appender_file(o$logging_file) - # This implicitly creates a new namespace in the - # logger package, that is used to treat calls from - # RMassBank differently - log_appender(appender_obj, namespace='RMassBank') - } - options(RMassBank = o) - } - else if (isR) - { - ov <- source(file_or_list) - o <- ov$value - options(RMassBank = o) - } - else - stop("Options format not recognized. Use YAML (.ini, .yml) or R file (.R) format.") - - } - else - stop("The file path supplied for the options does not exist.") - - # Settings are loaded, now check if they are up to date - o <- getOption("RMassBank") - curr <- names(.settingsList) - problem <- length(setdiff(curr, names(o))) > 0 - # Hesch es problem? He? - if(problem) - { - warning("Your settings are outdated. 
Missing will be replaced by default values.") - o <- updateSettings(o) - options(RMassBank = o) - } -} - -#' @export -loadRmbSettingsFromEnv <- function(env = .GlobalEnv) -{ - loadRmbSettings(env$RmbSettings) -} - -#' RMassBank settings -#' -#' Load, set and reset settings for RMassBank. -#' -#' \code{RmbSettingsTemplate} creates a template file in which you can adjust the -#' settings as you like. Before using RMassBank, you must then load the -#' settings file using \code{loadRmbSettings}. \code{RmbDefaultSettings} loads -#' the default settings. \code{loadRmbSettingsFromEnv} loads the settings -#' stored in env$RmbSettings, which is useful when reloading archives with -#' saved settings inside. -#' -#' Note: no settings are loaded upon loading MassBank! -#' This is intended, so that one never forgets to load the correct settings. -#' -#' The settings are described in \code{\link{RmbSettings}}. -#' -#' @aliases loadRmbSettings RmbDefaultSettings RmbSettingsTemplate loadRmbSettingsFromEnv -#' @usage loadRmbSettings(file_or_list) -#' -#' loadRmbSettingsFromEnv(env = .GlobalEnv) -#' -#' RmbDefaultSettings() -#' -#' RmbSettingsTemplate(target) -#' @param file_or_list The file (YML or R format) or R \code{list} with the settings to load. -#' @param target The path where the template setting file should be stored. -#' @param env The environment to load the settings from. -#' @return None. 
-#' @note \bold{The default settings will not work for you unless you have, by -#' chance, installed OpenBabel into the same directory as I have!} -#' @author Michael Stravs -#' @seealso \code{\link{RmbSettings}} -#' @examples -#' -#' # Create a standard settings file and load it (unedited) -#' RmbSettingsTemplate("mysettings.ini") -#' loadRmbSettings("mysettings.ini") -#' unlink("mysettings.ini") -#' -#' @export -RmbDefaultSettings <- function() -{ - options("RMassBank" = .settingsList) -} +.checkMbSettings <- function() +{ + o <- getOption("RMassBank", NULL) + if(is.null(o)){ + stop("Please load your settings before using the RMassBank workflow.") + } + +} + + +#' RMassBank settings +#' +#' Describes all settings for the RMassBank settings file. +#' +#' \itemize{ +#' \item{\code{deprofile}}{ +#' Whether and how to deprofile input raw files. Leave the +#' setting empty if your raw files are already in "centroid" mode. If your +#' input files are in profile mode, you have the choice between algorithms +#' \code{\link{deprofile}.spline, deprofile.fwhm, deprofile.localMax}; refer to +#' the individual manpages for more information.} +#' \item{\code{rtMargin, rtShift}}{ +#' The allowed retention time deviation relative to the +#' values specified in your compound list (see \code{\link{loadList}}), and the systematic +#' shift (due to the use of, e.g., pre-columns or other special equipment.} +#' \item{\code{babeldir}}{ +#' Directory to OpenBabel. Required for creating molfiles for MassBank export. +#' If no OpenBabel directory is given, RMassBank will attempt to use the CACTUS webservice +#' for SDF generation. It is strongly advised to install OpenBabel; the CACTUS structures +#' have explicit hydrogen atoms. +#' The path should point to the directory where babel.exe (or the Linux "babel" equivalent) lies. 
+#' } +#' \item{\code{use_version}}{ +#' Which MassBank record format to use; version 2 is strongly advised, +#' version 1 is considered outdated and should be used only if for some reason you are running +#' old servers and an upgrade is not feasible.} +#' \item{\code{use_rean_peaks}}{ +#' Whether to include peaks from reanalysis (see +#' \code{\link{reanalyzeFailpeaks}}) in the MassBank records. Boolean, TRUE or FALSE. +#' } +#' \item{\code{annotations}}{ +#' A list of constant annotations to use in the MassBank records. The entries +#' \code{authors, copyright, publication, license, instrument, instrument_type, compound_class} +#' correspond to the MassBank entries \code{AUTHORS, COPYRIGHT, PUBLICATION, LICENSE, AC$INSTRUMENT, +#' AC$INSTRUMENT_TYPE, CH$COMPOUND_CLASS}. The entry \code{confidence_comment} is added as a +#' \code{COMMENT: CONFIDENCE} entry. +#' +#' The entry \code{internal_id_fieldname} is used to name +#' the MassBank entry which will keep a reference to the internal compound ID used in +#' the workflow: for \code{internal_id_fieldname = MYID} and e.g. compound 1234, an +#' entry will be added to the MassBank record with +#' \code{COMMENT: MYID 1234}. The internal fieldname should not be left empty! +#' +#' The entries \code{lc_gradient, lc_flow, lc_solvent_a, lc_solvent_b, lc_column} correspond +#' to the MassBank entries \code{AC$CHROMATOGRAPHY: FLOW_GRADIENT, FLOW_RATE, +#' SOLVENT A, SOLVENT B, COLUMN_NAME}. +#' +#' \code{ms_type, ionization} correspond to \code{AC$MASS_SPECTROMETRY: MS_TYPE, IONIZATION}. +#' +#' \code{entry_prefix} is the two-letter prefix used when building MassBank accession codes. +#' +#' Entries under \code{ms_dataprocessing} are added as \code{MS$DATA_PROCESSING:} entries, +#' in addition to the default \code{WHOLE: RMassBank}. +#' } +#' \item{\code{annotator}}{ +#' For advanced users: option to select your own custom annotator.
+#' Check \code{\link{annotator.default}} and the source code for details.} +#' \item{\code{spectraList}}{ +#' This setting describes the experimental annotations for the single +#' data-dependent scans. For every data-dependent scan event, a \code{spectraList} entry with +#' \code{mode, ces, ce, res} denoting collision mode, collision energy in short and verbose +#' notation, and FT resolution.} +#' \item{\code{accessionNumberShifts}}{ +#' This denotes the starting points for accession numbers +#' for different ion types. For example, \code{pH: 0, mH: 50} means that [M+H]+ spectra will +#' start at \code{XX123401} (\code{XX} being the \code{entry_prefix} and \code{1234} the compound +#' id) and [M-H]- will start at \code{XX123451}.} +#' \item{\code{electronicNoise, electronicNoiseWidth}}{ +#' Known electronic noise peaks and the window +#' to be used by \code{\link{cleanElnoise}}} +#' \item{\code{recalibrateBy}}{ +#' \code{dppm} or \code{dmz} to recalibrate either by delta ppm or by +#' delta mz.} +#' \item{\code{recalibrateMS1}}{ +#' \code{common} or \code{separate} to recalibrate MS1 data points together +#' or separately from MS2 data points.} +#' \item{\code{recalibrator: MS1, MS2}}{ +#' The functions to use for recalibration of MS1 and MS2 data points. +#' Note that the \code{MS1} setting is only meaningful if \code{recalibrateMS1: separate}, otherwise +#' the \code{MS2} setting is used for a common recalibration curve. See \code{\link{recalibrate.loess}} +#' for details.} +#' \item{\code{multiplicityFilter}}{ +#' Define the multiplicity filtering level. Default is 2, a value of 1 +#' is off (no filtering) and >2 is harsher filtering.} +#' \item{\code{titleFormat}}{ +#' The title of MassBank records is a mini-summary +#' of the record, for example "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+". 
+#' By default, the first compound name \code{CH$NAME}, instrument type +#' \code{AC$INSTRUMENT_TYPE}, MS/MS type \code{AC$MASS_SPECTROMETRY: MS_TYPE}, +#' collision energy \code{RECORD_TITLE_CE}, resolution \code{AC$MASS_SPECTROMETRY: RESOLUTION} +#' and precursor \code{MS$FOCUSED_ION: PRECURSOR_TYPE} are used. If alternative +#' information is relevant to differentiate acquired spectra, the title should be adjusted. +#' For example, many TOFs do not have a resolution setting. +#' See MassBank documentation for more.} +#' \item{\code{filterSettings}}{ +#' A list of settings that affect the MS/MS processing. The entries +#' \code{ppmHighMass, ppmLowMass, massRangeDivision} set values for +#' pre-processing, prior to recalibration. \code{ppmHighMass} defines the +#' ppm error for the high mass range (default 10 ppm for Orbitraps), +#' \code{ppmLowMass} is the error for the low mass range (default 15 ppm +#' for Orbitraps) and \code{massRangeDivision} is the m/z value defining +#' the split between the high and low mass range (default m/z = 120). +#' +#' The entry \code{ppmFine} defines the ppm cut-off post recalibration. +#' The default value of 5 ppm is recommended for Orbitraps. For other +#' instruments this can be interpreted from the recalibration plot. +#' All ppm limits are one-sided (e.g. this includes values to +5 ppm or -5 ppm +#' deviation from the exact mass). +#' +#' The entries \code{prelimCut, prelimCutRatio} define the intensity cut-off and +#' cut-off ratio (in % of the most intense peak) for pre-processing. This affects +#' the peak selection for the recalibration only. Careful: the default value +#' 1e4 for Orbitrap LTQ positive mode could remove all peaks for TOF data +#' and will remove too many peaks for Orbitrap LTQ negative mode spectra! +#' +#' The entry \code{specOkLimit} defines the intensity limit to include MS/MS spectra. +#' MS/MS spectra must have at least one peak above this limit to proceed through +#' the workflow.
+#' +#' \code{dbeMinLimit} defines the minimum ring and double bond equivalents (DBE) +#' allowed for assigned formulas. This assumes maximum valences for elements with +#' multiple valence states. The default is -0.5 (accounting for fragments being ions). +#' +#' The entries \code{satelliteMzLimit, satelliteIntLimit} define the cut-off m/z and +#' intensity values for satellite peak removal (an artefact of Fourier Transform +#' processing). All peaks within the m/z limit (default 0.5) and intensity ratio +#' (default 0.05 or 5 %) of the respective peak will be removed. Applicable to +#' Fourier Transform instruments only (e.g. Orbitrap). +#' } +#' \item{\code{findMsMsRawSettings}}{ +#' Parameters for adjusting the raw data retrieval. +#' The entry \code{ppmFine} defines the ppm error to look for the precursor in +#' the MS1 (parent) spectrum. Default is 10 ppm for Orbitrap. +#' +#' \code{mzCoarse} defines the error to search for the precursor specification +#' in the MS2 spectrum. This is often only saved to 2 decimal places and thus +#' can be quite inaccurate. The accuracy also depends on the isolation window used. +#' The default setting (e.g. for Orbitrap) is 0.5 (Da, or Th for m/z). +#' +#' The entry \code{fillPrecursorScan} is largely untested. The default value +#' (FALSE) assumes all necessary precursor information is available in the mzML file. +#' A setting of TRUE tries to fill in the precursor data scan number if it is missing. +#' Only tested on one case study so far - feedback welcome! +#' } +#' } +#' +#' +#' @author Michael Stravs, Emma Schymanski +#' @seealso \code{\link{loadRmbSettings}} +#' @rdname RmbSettings +#' @name RmbSettings +NULL + +.settingsList <- list( + # Deprofile input data? + # NA if input data is already in "centroid" mode, + # "deprofile.fwhm" or "deprofile.localMax" to convert the input data with the + # corresponding algorithm.
See ?deprofile + deprofile = NA, + # Deviation (in minutes) allowed the for retention time + rtMargin = 0.4, + # Systematic RT shift + rtShift = -0.3, + # Directory to OpenBabel. Required for MassBank export + babeldir = NA, + # Which MassBank format should be used? Version 2 is advised. + use_version = 2, + # Include reanalyzed peaks? + use_rean_peaks = TRUE, + # annotate the spectra files with (putative) molecular formulas for fragments? + add_annotation = TRUE, + # Annotations for the spectrum: + annotations = list( + authors = "Nomen Nescio, The Unseen University", + copyright = "Copyright (C) XXX", + publication = "", + license = "CC BY", + instrument = "LTQ Orbitrap XL Thermo Scientific", + instrument_type = "LC-ESI-ITFT", + confidence_comment = "standard compound", + compound_class = "N/A; Environmental Standard", + internal_id_fieldname = "INTERNAL_ID", + # + # HPLC annotations: + # + # example: lc_gradient = "90/10 at 0 min, 50/50 at 4 min, 5/95 at 17 min, 5/95 at 25 min, 90/10 at 25.1 min, 90/10 at 30 min"" + lc_gradient = "", + # example: lc_flow = "200 uL/min", + lc_flow = "", + # example: lc_solvent_a = 'water with 0.1% formic acid', + lc_solvent_a = '', + lc_solvent_b = '', + # example: lc_column "XBridge C18 3.5um, 2.1x50mm, Waters", + lc_column = "", + # + # Prefix for MassBank accession IDs + # + entry_prefix = "XX", + contributor_prefix = "CONTRIBUTOR", + ms_type = "MS2", + ionization = "ESI", + ms_dataprocessing = list( + "RECALIBRATE" = "loess on assigned fragments and MS1" + ) + ), + include_sp_tags = FALSE, + # List of data-dependent scans in their order (relative to the parent scan) + # list(mode, ces, ce, res): + # mode: fragmentation mode + # ces: "short" format collision energy (for record title) + # ce: "long" format collision energy (for annotation field) + # res: FT resolution + spectraList = list( + list(mode="CID", ces = "35%", ce = "35 % (nominal)", res = 7500), + list(mode="HCD", ces = "15%", ce = "15 % (nominal)", res = 7500), + 
list(mode="HCD", ces = "30%", ce = "30 % (nominal)", res = 7500), + list(mode="HCD", ces = "45%",ce = "45 % (nominal)", res = 7500), + list(mode="HCD", ces = "60%",ce = "60 % (nominal)", res = 7500), + list(mode="HCD", ces = "75%",ce = "75 % (nominal)", res = 7500), + list(mode="HCD", ces = "90%",ce = "90 % (nominal)", res = 7500), + list(mode="HCD", ces = "15%",ce = "15 % (nominal)", res = 15000), + list(mode="HCD", ces = "30%",ce = "30 % (nominal)", res = 15000), + list(mode="HCD", ces = "45%",ce = "45 % (nominal)", res = 15000), + list(mode="HCD", ces = "60%",ce = "60 % (nominal)", res = 15000), + list(mode="HCD", ces = "75%", ce = "75 % (nominal)", res = 15000), + list(mode="HCD", ces = "90%", ce = "90 % (nominal)", res = 15000), + list(mode="CID", ces = "35%", ce = "35 % (nominal)", res = 15000) + ), + accessionNumberShifts = list( + "pH" = 0, # [M+H]+: Accession numbers 1-14 + "pM" = 16, # [M]+: 17-30 + "pNa" = 32, # [M+Na]+: 33-46 + "mH" = 50, # [M-H]-: 51-64 + "mFA" = 66, # [M+FA]-: 67-80 + "mM" = 80 # [M]-: 81-94 + ), + accessionBuilderType = NULL, + accessionBuilder = "MSBNK-{contributor_prefix}-{entry_prefix}{compound_id(4)}{scan_id(2)}", + # Validate accession? Set to FALSE to bypass accession validation + accessionValidate = TRUE, + # Known electronic noise peaks in the Orbitrap data + electronicNoise = c(189.825, 201.725,196.875), + # Exclusion width of electronic noise peaks (from unmatched peaks, prior to + # reanalysis) + electronicNoiseWidth = 0.3, + # recalibration settings: + # recalibrate by: dppm or dmz + recalibrateBy = "dppm", + # recalibrate MS1: + # separately ("separate") + # with common curve ("common") + # do not recalibrate ("none") + recalibrateMS1 = "common", + # Custom recalibration function: You can overwrite the recal function by + # making any function which takes rcdata$recalfield ~ rcdata$mzFound. 
+ # The settings define which recal function is used + recalibrator = list( + MS1 = "recalibrate.loess", + MS2 = "recalibrate.loess"), + # Window width to look for MS1 peaks to recalibrate (in ppm) + recalibrateMS1Window= 15, + + # Define the multiplicity filtering level + # Default is 2 (peak occurs at least twice) + # Set this to 1 if you want to turn this option off. + # Set this to anything > 2 if you want harder filtering + multiplicityFilter = 2, + # Define the title format. + # You can use all entries from MassBank records as tokens + # plus the additional token RECORD_TITLE_CE, which is a shortened + # version of the collision energy specifically for use in the title. + # Every line is one entry and must have one token in curly brackets + # e.g. {CH$NAME} or {AC$MASS_SPECTROMETRY: MS_TYPE} plus optionally + # additional text in front or behind e.g. + # R={AC$MASS_SPECTROMETRY: RESOLUTION} + # If this is not specified, it defaults to a title of the format + # "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+" + titleFormat = c( + "{CH$NAME}", + "{AC$INSTRUMENT_TYPE}", + "{AC$MASS_SPECTROMETRY: MS_TYPE}", + "CE: {RECORD_TITLE_CE}", + "R={AC$MASS_SPECTROMETRY: RESOLUTION}", + "{MS$FOCUSED_ION: PRECURSOR_TYPE}" + ), + # Define filter settings. + # For Orbitrap, settings of 15 ppm in low mass range, 10 ppm in high + # mass range, m/z = 120 as mass range division and 5 ppm for recalibrated + # data overall are recommended. + filterSettings = list( + ppmHighMass = 10, + ppmLowMass = 15, + massRangeDivision= 120, + ppmFine= 5, + prelimCut= 1e4, + prelimCutRatio= 0, + fineCut= 0, + fineCutRatio= 0, + specOkLimit= 1e4, + dbeMinLimit= -0.5, + satelliteMzLimit= 0.5, + satelliteIntLimit= 0.05 + ), + # Define raw MS retrieval settings. + # fillPrecursorScan is FALSE for "good" mzML files which have all the info needed. + # However, for example AB Sciex files will have missing precursor scan information, + # in which case fillPrecursorScan = TRUE is needed. 
Try it out. + findMsMsRawSettings = list( + ppmFine= 10, + mzCoarse= 0.5, + fillPrecursorScan= FALSE), + + # Select how to treat unknown compound masses: + # "charged" (the default, also if no option set) treats unknown (level 5) compound masses as the m/z, + # "neutral" treats unknown (level 5) compound masses as the neutral mass and applies [M+H]+ and [M-H]- calculations accordingly. + unknownMass = "charged", + + # Add the CCTE api key to retrieve information from https://api-ccte.epa.gov/docs + # Be aware, this is confidential information, so do not share with unauthorized + # persons + ccte_api_key = NULL, + + # Add the RSC api key to retrieve information from https://developer.rsc.org/api-reference + # Be aware, this is confidential information, so do not share with unauthorized + # persons + rcs_api_key = NULL +) + +# Writes a file with sample settings which the user can adjust with his values. +#' @export +RmbSettingsTemplate <- function(target) +{ + blub <- file.copy(from=system.file("RMB_options.ini", package="RMassBank"), to=target) +} + +# Loads settings from a file or from an object. +#' @export +loadRmbSettings <- function(file_or_list) +{ + # If the object exists in R, it is assumed to be the list itself + # Otherwise, it's assumed to be a file name and loaded. + # It will be either an INI file in YAML format or an R file to be directly sourced. 
+ if(is.list(file_or_list)) + options(RMassBank = file_or_list) + else if(exists(file_or_list, inherits=TRUE)) + options(RMassBank = get(file_or_list)) + else if(file.exists(file_or_list)) + { + # Check if the file has an INI extension: + # If yes, load it with YAML + isIni <- grepl("\\.[iI][nN][iI]$", file_or_list, perl = T) + isIni <- isIni || grepl("\\.[yY][mM][lL]$", file_or_list, perl = T) + isR <- grepl("\\.[rR]$", file_or_list, perl = T) + if(isIni) + { + o <- yaml.load_file(file_or_list) + # Fix the YAML file to suit our needs + if(is.null(o$deprofile)) + o$deprofile <- NA + if(is.null(o$babeldir)){ + o$babeldir <- NA + } else{ + ##Check if babeldir exists + babelcheck <- gsub('\"','',o$babeldir) + if(substring(babelcheck, nchar(babelcheck)) == "\\"){ + babelexists <- file.exists(substring(babelcheck, 1, nchar(babelcheck)-1)) + } else{ + babelexists <- file.exists(babelcheck) + } + + if(!babelexists){ + stop("The babeldir does not exist. Please check the babeldir in the settings and adjust it accordingly.") + } + } + + for(name in names(o$annotations)) + { + if(is.null(o$annotations[[name]])) + o$annotations[[name]] <- "" + } + if (!is.null(o$logging_file)) { + appender_obj <- logger::appender_file(o$logging_file) + # This implicitly creates a new namespace in the + # logger package, that is used to treat calls from + # RMassBank differently + log_appender(appender_obj, namespace='RMassBank') + } + options(RMassBank = o) + } + else if (isR) + { + ov <- source(file_or_list) + o <- ov$value + options(RMassBank = o) + } + else + stop("Options format not recognized. Use YAML (.ini, .yml) or R file (.R) format.") + + } + else + stop("The file path supplied for the options does not exist.") + + # Settings are loaded, now check if they are up to date + o <- getOption("RMassBank") + curr <- names(.settingsList) + problem <- length(setdiff(curr, names(o))) > 0 + # Hesch es problem? He? + if(problem) + { + warning("Your settings are outdated. 
Missing will be replaced by default values.") + o <- updateSettings(o) + options(RMassBank = o) + } +} + +#' @export +loadRmbSettingsFromEnv <- function(env = .GlobalEnv) +{ + loadRmbSettings(env$RmbSettings) +} + +#' RMassBank settings +#' +#' Load, set and reset settings for RMassBank. +#' +#' \code{RmbSettingsTemplate} creates a template file in which you can adjust the +#' settings as you like. Before using RMassBank, you must then load the +#' settings file using \code{loadRmbSettings}. \code{RmbDefaultSettings} loads +#' the default settings. \code{loadRmbSettingsFromEnv} loads the settings +#' stored in env$RmbSettings, which is useful when reloading archives with +#' saved settings inside. +#' +#' Note: no settings are loaded upon loading MassBank! +#' This is intended, so that one never forgets to load the correct settings. +#' +#' The settings are described in \code{\link{RmbSettings}}. +#' +#' @aliases loadRmbSettings RmbDefaultSettings RmbSettingsTemplate loadRmbSettingsFromEnv +#' @usage loadRmbSettings(file_or_list) +#' +#' loadRmbSettingsFromEnv(env = .GlobalEnv) +#' +#' RmbDefaultSettings() +#' +#' RmbSettingsTemplate(target) +#' @param file_or_list The file (YML or R format) or R \code{list} with the settings to load. +#' @param target The path where the template setting file should be stored. +#' @param env The environment to load the settings from. +#' @return None. 
+#' @note \bold{The default settings will not work for you unless you have, by +#' chance, installed OpenBabel into the same directory as I have!} +#' @author Michael Stravs +#' @seealso \code{\link{RmbSettings}} +#' @examples +#' +#' # Create a standard settings file and load it (unedited) +#' RmbSettingsTemplate("mysettings.ini") +#' loadRmbSettings("mysettings.ini") +#' unlink("mysettings.ini") +#' +#' @export +RmbDefaultSettings <- function() +{ + options("RMassBank" = .settingsList) +} diff --git a/R/webAccess.R b/R/webAccess.R index c0d9d33..85bdaf1 100755 --- a/R/webAccess.R +++ b/R/webAccess.R @@ -8,7 +8,7 @@ retrieveDataWithRetry <- function(url, timeout, maximumNumberOfRetries = 5, retr expr = { res <- GET(utils::URLencode(url)) data <- httr::content(res, type="text", encoding="UTF-8") - + queryIsSuccessful <- TRUE data }, @@ -28,18 +28,18 @@ retrieveDataWithRetry <- function(url, timeout, maximumNumberOfRetries = 5, retr } ) } - + return(data) } #' Retrieve information from Cactus -#' +#' #' Retrieves information from the Cactus Chemical Identifier Resolver #' (PubChem). -#' +#' #' It is not necessary to specify in which format the \code{identifier} is. #' Somehow, cactus does this automatically. -#' +#' #' @usage getCactus(identifier, representation) #' @param identifier Any identifier interpreted by the resolver, e.g. an InChI #' key or a SMILES code. 
@@ -56,15 +56,15 @@ retrieveDataWithRetry <- function(url, timeout, maximumNumberOfRetries = 5, retr #' @references cactus Chemical Identifier Resolver: #' \url{http://cactus.nci.nih.gov/chemical/structure} #' @examples -#' +#' #' # Benzene: #' getCactus("C1=CC=CC=C1", "cas") #' getCactus("C1=CC=CC=C1", "stdinchikey") #' getCactus("C1=CC=CC=C1", "chemspider_id") -#' -#' @export -#' -#' +#' +#' @export +#' +#' getCactus <- function(identifier, representation){ identifier <- gsub('#', '%23', identifier) ret <- tryCatch(httr::GET(paste("https://cactus.nci.nih.gov/chemical/structure/", @@ -76,38 +76,38 @@ getCactus <- function(identifier, representation){ return(NA) ret <- httr::content(ret) return(unlist(strsplit(ret, "\n"))) - + } #' Search Pubchem CID -#' +#' #' Retrieves PubChem CIDs for a search term. -#' +#' #' Only the first result is returned currently. \bold{The function should be #' regarded as experimental and has not thoroughly been tested.} -#' +#' #' @usage getPcId(query, from = "inchikey") #' @param query ID to be converted #' @param from Type of input ID #' @return The PubChem CID (in string type). 
#' @author Michael Stravs, Erik Mueller #' @seealso \code{\link{getCtsRecord}}, \code{\link{getCactus}} -#' @references PubChem search: \url{http://pubchem.ncbi.nlm.nih.gov/} -#' +#' @references PubChem search: \url{http://pubchem.ncbi.nlm.nih.gov/} +#' #' Pubchem REST: #' \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} #' @examples #' getPcId("MKXZASYAUGDDCJ-NJAFHUGGSA-N") -#' +#' #' @export getPcId <- function(query, from = "inchikey") { baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" url <- paste(baseURL, from, query, "description", "json", sep="/") - + errorvar <- 0 currEnvir <- environment() - + tryCatch({ res <- GET(utils::URLencode(url)) data <- httr::content(res, type="text", encoding="UTF-8") @@ -115,23 +115,23 @@ getPcId <- function(query, from = "inchikey") error=function(e){ currEnvir$errorvar <- 1 }) - + if(errorvar){ return(NA) } - + # This happens if the InChI key is not found: r <- fromJSON(data) - + if(!is.null(r$Fault)) return(NA) - + titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) - + titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] PcID <- r$InformationList$Information[[titleEntry]]$CID - + if(is.null(PcID)){ return(NA) } else{ @@ -141,19 +141,19 @@ getPcId <- function(query, from = "inchikey") #' Search CCTE DTXSID -#' +#' #' Retrieves CCTE DTXSID from US EPA for a search term. -#' +#' #' Only the first result is returned currently. 
\bold{The function should be #' regarded as experimental and has not thoroughly been tested.} -#' +#' #' @usage getDTXSID(key, api_key) #' @param key ID to be converted #' @param api_key API key for CCTE #' @return The DTXSID (in string type) #' @author Tobias Schulze -#' @references CCTE search: \url{https://api-ccte.epa.gov/docs} -#' +#' @references CCTE search: \url{https://api-ccte.epa.gov/docs} +#' #' CCTE REST: #' \url{https://api-ccte.epa.gov/docs} #' @examples @@ -162,29 +162,29 @@ getPcId <- function(query, from = "inchikey") #' } #' @export getDTXSID <- function(key, api_key) - + { errorvar <- 0 currEnvir <- environment() - + tryCatch({ - base_url <- paste0("https://api-ccte.epa.gov/chemical/search/equal/", key) + base_url <- stringr::str_c("https://comptox.epa.gov/ctx-api/chemical/search/equal/", key) url <- httr2::request(base_url) url <- url |> httr2::req_headers("x-api-key" = api_key, "accept" = "application/json") resp <- httr2::req_perform(url) data <- resp |> httr2::resp_body_json() - + }, error=function(e){ currEnvir$errorvar <- 1 }) - + if(errorvar){ return(NA) } - + dtxsid <- data[[1]]$dtxsid - + if(is.null(dtxsid)){ return(NA) } else{ @@ -193,19 +193,19 @@ getDTXSID <- function(key, api_key) } #' Search CCTE DTXCID -#' +#' #' Retrieves CCTE DTXCID from US EPA for a search term. -#' +#' #' Only the first result is returned currently. 
\bold{The function should be #' regarded as experimental and has not thoroughly been tested.} -#' +#' #' @usage getDTXCID(key, api_key) #' @param key ID to be converted #' @param api_key API key for CCTE #' @return The DTXCID (in string type) #' @author Tobias Schulze -#' @references CCTE search: \url{https://api-ccte.epa.gov/docs} -#' +#' @references CCTE search: \url{https://api-ccte.epa.gov/docs} +#' #' CCTE REST: #' \url{https://api-ccte.epa.gov/docs} #' @examples @@ -214,28 +214,28 @@ getDTXSID <- function(key, api_key) #' } #' @export getDTXCID <- function(key, api_key) - + { errorvar <- 0 currEnvir <- environment() - + tryCatch({ - base_url <- paste0("https://api-ccte.epa.gov/chemical/search/equal/", key) + base_url <- stringr::str_c("https://comptox.epa.gov/ctx-api/chemical/search/equal/", key) url <- httr2::request(base_url) url <- url |> httr2::req_headers("x-api-key" = api_key, "accept" = "application/json") resp <- httr2::req_perform(url) data <- resp |> httr2::resp_body_json() - + }, error=function(e){ }) - + if(errorvar){ return(NA) } - + dtxcid <- data[[1]]$dtxcid - + if(is.null(dtxcid)){ return(NA) } else{ @@ -244,52 +244,52 @@ getDTXCID <- function(key, api_key) } #' Search CCTE Preferred Name -#' +#' #' Retrieves CCTE Preferred Name from US EPA for a search term. -#' +#' #' Only the first result is returned currently. 
\bold{The function should be #' regarded as experimental and has not thoroughly been tested.} -#' +#' #' @usage getPrefName(key, api_key) #' @param key ID to be converted #' @param api_key API key for CCTE #' @return The CCTE Preferred Name (in string type) #' @author Tobias Schulze -#' @references CCTE search: \url{https://api-ccte.epa.gov/docs} -#' +#' @references CCTE search: \url{https://api-ccte.epa.gov/docs} +#' #' CCTE REST: #' \url{https://api-ccte.epa.gov/docs} #' @examples #' \dontrun{ #' getPrefName("MKXZASYAUGDDCJ-NJAFHUGGSA-N") #' } -#' +#' #' @export getPrefName <- function(key, api_key) - + { errorvar <- 0 currEnvir <- environment() - + tryCatch({ - base_url <- paste0("https://api-ccte.epa.gov/chemical/search/equal/", key) + base_url <- stringr::str_c("https://comptox.epa.gov/ctx-api/chemical/search/equal/", key) url <- httr2::request(base_url) url <- url |> httr2::req_headers("x-api-key" = api_key, "accept" = "application/json") url |> httr2::req_dry_run() resp <- httr2::req_perform(url) data <- resp |> httr2::resp_body_json() - + }, error=function(e){ currEnvir$errorvar <- 1 }) - + if(errorvar){ return(NA) } - + pref_name <- data[[1]]$preferredName - + if(is.null(pref_name)){ return(NA) } else{ @@ -298,51 +298,51 @@ getPrefName <- function(key, api_key) } #' Search CCTE CAS registration number -#' +#' #' Retrieves CCTE CASRN from US EPA for a search term. -#' +#' #' Only the first result is returned currently. 
\bold{The function should be #' regarded as experimental and has not thoroughly been tested.} -#' +#' #' @usage getCASRN(key, api_key) #' @param key ID to be converted #' @param api_key API key for CCTE #' @return The CCTE CAS RN (in string type) #' @author Tobias Schulze -#' @references CCTE search: \url{https://api-ccte.epa.gov/docs} -#' +#' @references CCTE search: \url{https://api-ccte.epa.gov/docs} +#' #' CCTE REST: #' \url{https://api-ccte.epa.gov/docs} #' @examples #' \dontrun{ #' getCASRN("MKXZASYAUGDDCJ-NJAFHUGGSA-N") #' } -#' +#' #' @export getCASRN <- function(key, api_key) - + { errorvar <- 0 currEnvir <- environment() - + tryCatch({ - base_url <- paste0("https://api-ccte.epa.gov/chemical/search/equal/", key) + base_url <- stringr::str_c("https://comptox.epa.gov/ctx-api/chemical/search/equal/", key) url <- httr2::request(base_url) url <- url |> httr2::req_headers("x-api-key" = api_key, "accept" = "application/json") resp <- httr2::req_perform(url) data <- resp |> httr2::resp_body_json() - + }, error=function(e){ currEnvir$errorvar <- 1 }) - + if(errorvar){ return(NA) } - + cas_rn <- data[[1]]$casrn - + if(is.null(cas_rn)){ return(NA) } else{ @@ -351,51 +351,51 @@ getCASRN <- function(key, api_key) } #' Search CCTE SMILES -#' +#' #' Retrieves CCTE SMILES from US EPA for a search term. -#' +#' #' Only the first result is returned currently. 
\bold{The function should be #' regarded as experimental and has not thoroughly been tested.} -#' +#' #' @usage getDTXSMILES(key, api_key) #' @param key ID to be converted #' @param api_key API key for CCTE #' @return The SMILES (in string type) #' @author Tobias Schulze -#' @references CCTE search: \url{https://api-ccte.epa.gov/docs} -#' +#' @references CCTE search: \url{https://api-ccte.epa.gov/docs} +#' #' CCTE REST: #' \url{https://api-ccte.epa.gov/docs} #' @examples #' \dontrun{ #' getDTXSMILES("MKXZASYAUGDDCJ-NJAFHUGGSA-N") #' } -#' +#' #' @export getDTXSMILES <- function(key, api_key) - + { errorvar <- 0 currEnvir <- environment() - + tryCatch({ - base_url <- paste0("https://api-ccte.epa.gov/chemical/search/equal/", key) + base_url <- stringr::str_c("https://comptox.epa.gov/ctx-api/chemical/search/equal/", key) url <- httr2::request(base_url) url <- url |> httr2::req_headers("x-api-key" = api_key, "accept" = "application/json") resp <- httr2::req_perform(url) data <- resp |> httr2::resp_body_json() - + }, error=function(e){ currEnvir$errorvar <- 1 }) - + if(errorvar){ return(NA) } - + smiles <- data[[1]]$smiles - + if(is.null(smiles)){ return(NA) } else{ @@ -404,40 +404,40 @@ getDTXSMILES <- function(key, api_key) } #' Retrieve information from CTS -#' +#' #' Retrieves a complete CTS record from the InChI key. -#' +#' #' @usage getCtsRecord(key) -#' -#' @param key The InChI key. -#' @return Returns a list with all information from CTS: \code{inchikey, +#' +#' @param key The InChI key. +#' @return Returns a list with all information from CTS: \code{inchikey, #' inchicode, formula, exactmass} contain single values. \code{synonyms} contains #' an unordered list of scored synonyms (\code{type, name, score}, where \code{type} #' indicates either a normal name or a specific IUPAC name, see below). 
-#' \code{externalIds} contains an unordered list of identifiers of the compound in +#' \code{externalIds} contains an unordered list of identifiers of the compound in #' various databases (\code{name, value}, where \code{name} is the database name and #' \code{value} the identifier in that database.) -#' +#' #' @note Currently, the CTS results are still incomplete; the name scores are all 0, #' formula and exact mass return zero. #' @references Chemical Translation Service: #' \url{https://cts.fiehnlab.ucdavis.edu} -#' +#' #' @examples #' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") #' # show all synonym "types" #' types <- unique(unlist(lapply(data$synonyms, function(i) i$type))) #' \dontrun{print(types)} -#' +#' #' @author Michele Stravs, Eawag #' @export getCtsRecord <- function(key) { baseURL <- "https://cts.fiehnlab.ucdavis.edu/service/compound/" - + errorvar <- 0 currEnvir <- environment() - + ##tryCatch a CTS timeout ## tryCatch({ @@ -449,7 +449,7 @@ getCtsRecord <- function(key) currEnvir$errorvar <- 1 } ) - + if(errorvar){ warning("CTS seems to be currently unavailable or incapable of interpreting your request") return(NULL) @@ -463,14 +463,14 @@ getCtsRecord <- function(key) } #' Convert a single ID to another using CTS. -#' +#' #' @usage getCtsKey(query, from = "Chemical Name", to = "InChIKey") #' @param query ID to be converted #' @param from Type of input ID -#' @param to Desired output ID -#' @return An unordered array with the resulting converted key(s). -#' -#' @examples +#' @param to Desired output ID +#' @return An unordered array with the resulting converted key(s). 
+#' +#' @examples #' k <- getCtsKey("benzene", "Chemical Name", "InChIKey") #' @author Michele Stravs, Eawag #' @export @@ -480,7 +480,7 @@ getCtsKey <- function(query, from = "Chemical Name", to = "InChIKey") url <- paste(baseURL, from, to, query, sep='/') errorvar <- 0 currEnvir <- environment() - + ##tryCatch a CTS timeout ## tryCatch({ @@ -491,7 +491,7 @@ getCtsKey <- function(query, from = "Chemical Name", to = "InChIKey") currEnvir$errorvar <- 1 } ) - + if(errorvar){ warning("CTS seems to be currently unavailable or incapable of interpreting your request") return(NULL) @@ -501,7 +501,7 @@ getCtsKey <- function(query, from = "Chemical Name", to = "InChIKey") warning(paste("CTS has return code", res$status_code)) return(NULL) } - + r <- fromJSON(data) if(length(r) == 0) return(NULL) @@ -514,21 +514,21 @@ getCtsKey <- function(query, from = "Chemical Name", to = "InChIKey") } #' Select a subset of external IDs from a CTS record. -#' +#' #' @usage CTS.externalIdSubset(data, database) -#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. -#' @param database The database for which keys should be returned. +#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. +#' @param database The database for which keys should be returned. #' @return Returns an array of all external identifiers stored in the record for the #' given database. -#' -#' @examples -#' +#' +#' @examples +#' #' \dontrun{ #' # Return all CAS registry numbers stored for benzene. #' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") #' cas <- CTS.externalIdSubset(data, "CAS") -#' } -#' +#' } +#' #' @author Michele Stravs, Eawag #' @export CTS.externalIdSubset <- function(data, database) @@ -542,22 +542,22 @@ CTS.externalIdSubset <- function(data, database) } #' Find all available databases for a CTS record -#' +#' #' @usage CTS.externalIdTypes(data) -#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. 
-#' @return Returns an array of all database names for which there are external +#' @param data The complete CTS record as retrieved by \code{\link{getCtsRecord}}. +#' @return Returns an array of all database names for which there are external #' identifiers stored in the record. -#' -#' @examples -#' +#' +#' @examples +#' #' \dontrun{ #' # Return all databases for which the benzene entry has #' # links in the CTS record. -#' +#' #' data <- getCTS("UHOVQNZJYSORNB-UHFFFAOYSA-N") #' databases <- CTS.externalIdTypes(data) -#' } -#' +#' } +#' #' @author Michele Stravs, Eawag #' @export CTS.externalIdTypes <- function(data) @@ -571,7 +571,7 @@ CTS.externalIdTypes <- function(data) .pubChemOnline <- function(){ baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" url <- paste(baseURL, "inchikey", "QEIXBXXKTUNWDK-UHFFFAOYSA-N", "description", "json", sep="/") - + errorvar <- 0 currEnvir <- environment() tryCatch({ @@ -581,7 +581,7 @@ CTS.externalIdTypes <- function(data) error=function(e){ currEnvir$errorvar <- 1 }) - + if(errorvar){ warning("Pubchem is currently offline") return(FALSE) @@ -599,7 +599,7 @@ getPcCHEBI <- function(query, from = "inchikey") url <- paste(baseURL, from, query, "synonyms", "json", sep="/") errorvar <- 0 currEnvir <- environment() - + tryCatch({ res <- GET(utils::URLencode(url)) data <- httr::content(res, type="text", encoding="UTF-8") @@ -607,68 +607,30 @@ getPcCHEBI <- function(query, from = "inchikey") error=function(e){ currEnvir$errorvar <- 1 }) - + if(errorvar){ return(NA) } - + r <- fromJSON(data) - + # This happens if the InChI key is not found: if(!is.null(r$Fault)) return(NA) - + # Find the entries which contain Chebi-links synonymEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Synonym)))) synonymList <- r$InformationList$Information[[synonymEntry]]$Synonym matchChebi <- which(grepl("CHEBI:", synonymList, fixed=TRUE)) - + # It doesn't matter if the db is down or if chebi isn't found, so 
return NA also if(length(matchChebi) == 0){ - return (NA) + return (NA) } else { return (sapply(matchChebi, function(x) synonymList[[x]])) } } -#' Retrieve the Chemspider ID for a given compound -#' -#' Given an InChIKey, this function queries the chemspider web API to retrieve -#' the Chemspider ID of he compound with that InChIkey. -#' -#' @usage getCSID(query) -#' -#' @param query The InChIKey of the compound -#' @return Returns the chemspide -#' -#' @examples -#' -#' \dontrun{ -#' # Return all CAS registry numbers stored for benzene. -#' data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") -#' cas <- CTS.externalIdSubset(data, "CAS") -#' } -#' -#' @author Michele Stravs, Eawag -#' @author Erik Mueller, UFZ -#' @export -getCSID <- function(query) -{ - baseURL <- "http://legacy.chemspider.com/InChI.asmx/InChIKeyToCSID?inchi_key=" - url <- paste0(baseURL, query) - - data <- retrieveDataWithRetry(url = utils::URLencode(url), timeout=8) - if(is.null(data)) { - warning("Chemspider is currently offline") - return(NA) - } - - xml <- xmlParseDoc(data,asText=TRUE) - # the returned XML document contains only the root node called "string" which contains the correct CSID - idNodes <- getNodeSet(xml, "/") - id <- xmlValue(idNodes[[1]]) - return(id) -} ##This function returns a sensible name for the compound getPcSynonym <- function (query, from = "inchikey") @@ -676,10 +638,10 @@ getPcSynonym <- function (query, from = "inchikey") # Get the JSON-Data from Pubchem baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" url <- paste(baseURL, from, query, "description", "json", sep="/") - + errorvar <- 0 currEnvir <- environment() - + tryCatch({ res <- GET(utils::URLencode(url)) data <- httr::content(res, type="text", encoding="UTF-8") @@ -687,31 +649,31 @@ getPcSynonym <- function (query, from = "inchikey") error=function(e){ currEnvir$errorvar <- 1 }) - + if(errorvar){ return(NA) } - + r <- fromJSON(data) - + # This happens if the InChI key is not found: 
if(!is.null(r$Fault)) return(NA) - + # Find the synonym - + titleEntry <- which(unlist(lapply(r$InformationList$Information, function(i) !is.null(i$Title)))) - + titleEntry <- titleEntry[which.min(sapply(titleEntry, function(x)r$InformationList$Information[[x]]$CID))] - + title <- r$InformationList$Information[[titleEntry]]$Title - + if(is.null(title)){ return(NA) } else{ return(title) } -} +} ##A function to retrieve a IUPAC Name from Pubchem @@ -720,10 +682,10 @@ getPcIUPAC <- function (query, from = "inchikey") # Get the JSON-Data from Pubchem baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" url <- paste(baseURL, from, query, "record", "json", sep="/") - + errorvar <- 0 currEnvir <- environment() - + tryCatch({ res <- GET(utils::URLencode(url)) data <- httr::content(res, type="text", encoding="UTF-8") @@ -731,17 +693,17 @@ getPcIUPAC <- function (query, from = "inchikey") error=function(e){ currEnvir$errorvar <- 1 }) - + if(errorvar){ return(NA) } - + r <- fromJSON(data) - + # This happens if the InChI key is not found: if(!is.null(r$Fault)) return(NA) - + # Find the IUPAC-Names if(!is.null(r$PC_Compounds[[1]]$props)){ IUPACIndex <- which(unlist(lapply(r$PC_Compounds[[1]]$props, function(i) (i$urn$label == "IUPAC Name")))) @@ -754,7 +716,7 @@ getPcIUPAC <- function (query, from = "inchikey") } else{return(NA)} } else{return(NA)} } else{return(NA)} - + if(length(PrefIUPAC) == 1){ return(IUPACEntries[[PrefIUPAC]]$value$sval) @@ -762,7 +724,7 @@ getPcIUPAC <- function (query, from = "inchikey") # Else it doesn't matter which return(IUPACEntries[[1]]$value$sval) } -} +} getPcInchiKey <- function(query, from = "smiles"){ # Get the JSON-Data from Pubchem @@ -770,7 +732,7 @@ getPcInchiKey <- function(query, from = "smiles"){ url <- paste(baseURL, from, query, "json", sep="/") errorvar <- 0 currEnvir <- environment() - + tryCatch({ res <- httr::GET(utils::URLencode(url)) data <- httr::content(res, type="text", encoding="UTF-8") @@ -778,17 +740,17 @@ 
getPcInchiKey <- function(query, from = "smiles"){ error=function(e){ currEnvir$errorvar <- 1 }) - + if(errorvar){ return(NA) } - + r <- rjson::fromJSON(data) - + # This happens if the InChI key is not found: if(!is.null(r$Fault)) return(NA) - + # Find the entries which contain Chebi-links if(!is.null(r$PC_Compounds[[1]]$props)){ INKEYindex <- which(sapply(r$PC_Compounds[[1]]$props, function(x) x$urn$label) == "InChIKey") @@ -797,16 +759,16 @@ getPcInchiKey <- function(query, from = "smiles"){ } else{return(NA)} } else{return(NA)} - + } getPcSDF <- function(query, from = "smiles"){ baseURL <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound" url <- paste(baseURL, from, query, "sdf", sep="/") - + errorvar <- 0 currEnvir <- environment() - + tryCatch({ res <- GET(utils::URLencode(url)) data <- httr::content(res, type="text", encoding="UTF-8") @@ -814,13 +776,97 @@ getPcSDF <- function(query, from = "smiles"){ error=function(e){ currEnvir$errorvar <- 1 }) - + if(errorvar){ return(NA) } - + molEnd <- regexpr(data,pattern="M END",fixed=TRUE)+5 data <- c(strsplit(substring(data,1,molEnd),"\n")[[1]],"$$$$") return(data) } +#' Search ChemSpider CSID +#' +#' Retrieves ChemSpider CSID from UK RSC for a search term. 
+#' +#' Requires a valid API key +#' @usage getCSID(key, identifier, api_key) +#' @param key search term to look up (a compound name or an InChIKey, matching \code{identifier}) +#' @param identifier type of \code{key}: "name" or "inchikey" (lowercase, as compared in the code) +#' @param api_key API key for ChemSpider (to be created on the developer site) +#' @return The first entry of the search results (the CSID), or NA if the request fails or no result is returned +#' @author Tobias Schulze +#' @references ChemSpider search: \url{https://developer.rsc.org/api-reference#} +#' @references ChemSpider developer site: \url{https://developer.rsc.org} +#' +#' ChemSpider REST: +#' \url{https://developer.rsc.org/api-reference#} +#' @examples +#' \dontrun{ +#' getCSID(key = "MKXZASYAUGDDCJ-NJAFHUGGSA-N", identifier = "inchikey", api_key = "your RSC API key") +#' } +#' @export +getCSID <- function(key, identifier, api_key) + +{ + errorvar <- 0 + currEnvir <- environment() + + tryCatch({ + base_url <- stringr::str_c("https://api.rsc.org/compounds/v1/filter/", identifier) + + if (identifier == "inchikey") { + payload <- + stringr::str_c( + "{\n \"inchikey\":\"", key, "\"\n}" + ) + } + + if (identifier == "name") { + payload <- + stringr::str_c( + "{\n \"name\": \"", key,"\",\n \"orderBy\": \"default\",\n \"orderDirection\": \"default\"\n}" + ) + } + + resp_1 <- httr::VERB("POST", + url = base_url, + body = payload, + add_headers('apikey' = api_key), + content_type("application/json"), + accept("application/json"), + encode = "json" + ) + + query_id <- httr::content(resp_1, "parsed") + + query_url <- stringr::str_c("https://api.rsc.org/compounds/v1/filter/", query_id[[1]], "/results") + + resp_2 <- httr::VERB("GET", + query_url, + add_headers('apikey' = api_key), + content_type("application/octet-stream"), + accept("application/json")) + + resp <- httr::content(resp_2, "parsed") + + data <- resp$results[[1]] + + }, + error=function(e){ + currEnvir$errorvar <- 1 + }) + + if(errorvar){ + return(NA) + } + + csid <- data + + if(is.null(csid)){ + return(NA) + } else{ + return(csid) + } +} diff --git a/R/zzz.R b/R/zzz.R index b64f86b..e5f7726 100644 --- a/R/zzz.R +++ 
b/R/zzz.R @@ -1,56 +1,57 @@ -# Central import section - -#' @importFrom assertthat assert_that has_args -#' @importFrom Biobase isVersioned isCurrent classVersion<- classVersion -#' @importFrom ChemmineR smiles2sdf validSDF write.SDF -#' @importFrom data.table fread fwrite -#' @import digest -#' @importFrom dplyr rename_with select -#' @import glue -#' @import httr -#' @import httr2 -#' @import logger -#' @importFrom methods setGeneric setMethod -#' @import mzR -#' @import rcdk -#' @import Rcpp -#' @import readJDX -#' @import readr -#' @import rjson -#' @import S4Vectors -#' @importFrom stats lm loess median predict smooth.spline -#' @import tibble -#' @importFrom tidyselect everything -#' @importFrom utils URLencode capture.output data flush.console -#' @importFrom utils packageVersion read.csv read.csv2 setTxtProgressBar -#' @importFrom utils str txtProgressBar type.convert write.csv write.table -#' @importFrom utils globalVariables -#' @importFrom webchem cir_query -#' @import XML -#' @import yaml - - -.onLoad <- function(libname, pkgname) { - RMassBank.env <<- new.env() - RMassBank.env$ReadAnnotation <- FALSE - RMassBank.env$testnumber <- 1 - ## new variables - RMassBank.env$verbose.output <- FALSE - RMassBank.env$export.invalid <- FALSE - RMassBank.env$export.molfiles <- TRUE - RMassBank.env$strictMsMsSpectraSelection <- FALSE - - mb <- list() - attach(RMassBank.env) -} - -utils::globalVariables(c("cpdID", - "isotopes", - "mzCalc", - "...1", - "occurrenceMatrix", - "c.msmsWSspecs", - "mass.calc", - "updateObjectFromSlots")) - - +# Central import section + +#' @importFrom assertthat assert_that has_args +#' @importFrom Biobase isVersioned isCurrent classVersion<- classVersion +#' @importFrom ChemmineR smiles2sdf validSDF write.SDF +#' @importFrom data.table fread fwrite +#' @import digest +#' @importFrom dplyr rename_with select +#' @import glue +#' @import httr +#' @import httr2 +#' @import logger +#' @importFrom methods setGeneric setMethod +#' @import mzR 
+#' @import rcdk +#' @import Rcpp +#' @import readJDX +#' @import readr +#' @import rjson +#' @import S4Vectors +#' @importFrom stats lm loess median predict smooth.spline +#' @import stringr +#' @import tibble +#' @importFrom tidyselect everything +#' @importFrom utils URLencode capture.output data flush.console +#' @importFrom utils packageVersion read.csv read.csv2 setTxtProgressBar +#' @importFrom utils str txtProgressBar type.convert write.csv write.table +#' @importFrom utils globalVariables +#' @importFrom webchem cir_query +#' @import XML +#' @import yaml + + +.onLoad <- function(libname, pkgname) { + RMassBank.env <<- new.env() + RMassBank.env$ReadAnnotation <- FALSE + RMassBank.env$testnumber <- 1 + ## new variables + RMassBank.env$verbose.output <- FALSE + RMassBank.env$export.invalid <- FALSE + RMassBank.env$export.molfiles <- TRUE + RMassBank.env$strictMsMsSpectraSelection <- FALSE + + mb <- list() + attach(RMassBank.env) +} + +utils::globalVariables(c("cpdID", + "isotopes", + "mzCalc", + "...1", + "occurrenceMatrix", + "c.msmsWSspecs", + "mass.calc", + "updateObjectFromSlots")) + + diff --git a/inst/RMB_options.ini b/inst/RMB_options.ini index 22dc0fd..14c77b4 100755 --- a/inst/RMB_options.ini +++ b/inst/RMB_options.ini @@ -1,289 +1,293 @@ -# Sample configuration file for RMassBank. -# Adapt this file to your needs. -# NOTE: Do not indent with TAB characters! Use only spaces. -# (If your editor converts TAB to a certain number of spaces, it's OK.) -# Use a space after the colon. - -# Deprofile input data? -# Leave empty if input data is already in "centroid" mode. -# Use values deprofile.spline, deprofile.fwhm or deprofile.localMax to convert the input data with the -# corresponding algorithm. See ?deprofile -deprofile: - -# Deviation (in minutes) allowed the for retention time -rtMargin: 0.4 -# Systematic retention time shift -rtShift: 0.0 - -# Directory to OpenBabel. Required for creating molfiles for MassBank export. 
-# If no OpenBabel directory is given, RMassBank will attempt to use the CACTUS webservice -# for SDF generation. You really should install OpenBabel though; the CACTUS structures -# have explicit hydrogen atoms... -# Points to the directory where babel.exe (or the Linux "babel" equivalent) lies. -babeldir: -# Example: -# babeldir: '"C:\Program Files (x86)\OpenBabel-2.3.1"\' - -# Which MassBank record version to use; version 2 is advised. -use_version: 2 - -# Include reanalyzed peaks? -use_rean_peaks: TRUE - -# annotate the spectra files with (putative) molecular formulas for fragments? -add_annotation: TRUE - -# Annotations for the spectrum: -annotations: - # Author etc. annotation - authors: Nomen Nescio, The Unseen University - copyright: Copyright (C) XXX - publication: - license: CC BY - instrument: LTQ Orbitrap XL Thermo Scientific - instrument_type: LC-ESI-ITFT - confidence_comment: standard compound - compound_class: N/A; Environmental Standard - internal_id_fieldname: INTERNAL_ID - # - # HPLC annotations: - # - # example: lc_gradient: 90/10 at 0 min, 50/50 at 4 min, 5/95 at 17 min, 5/95 at 25 min, 90/10 at 25.1 min, 90/10 at 30 min - lc_gradient: - # example: lc_flow: 200 uL/min - lc_flow: - lc_solvents: - # example: lc_solvent_a: water with 0.1% formic acid - lc_solvent_a: - lc_solvent_b: - # example: lc_column: XBridge C18 3.5um, 2.1x50mm, Waters - lc_column: - # Prefix for MassBank accession IDs - contributor_prefix: CONTRIBUTOR - entry_prefix: XX - ms_type: MS2 - ionization: ESI - ms_dataprocessing: - RECALIBRATE: loess on assigned fragments and MS1 - -include_sp_tags: FALSE - -# Annotator: -# by default, "annotator.default" is used. -# If you want to build your custom annotator (check ?annotator.default and the source code), -# select it here by using e.g. 
-# annotator: annotator.myown -# for a function annotator.myown(annotation) - -# List of data-dependent scans in their order (relative to the parent scan), for annotation of the MassBank records -# For every data-dependent scan event, specify an element with: -# mode: fragmentation mode, e.g. CID -# ces: "short" format collision energy (for record title) -# ce: "long" format collision energy (for annotation field) -# res: FT resolution -spectraList: - # First scan: CID 35% NCE, resolution 7500 -- mode: CID - ces: 35% - ce: 35 % (nominal) - res: 7500 - # Second scan: HCD 15% NCE, resolution 7500 -- mode: HCD - ces: 15% - ce: 15 % (nominal) - res: 7500 - # Third scan, etc. -- mode: HCD - ces: 30% - ce: 30 % (nominal) - res: 7500 -- mode: HCD - ces: 45% - ce: 45 % (nominal) - res: 7500 -- mode: HCD - ces: 60% - ce: 60 % (nominal) - res: 7500 -- mode: HCD - ces: 75% - ce: 75 % (nominal) - res: 7500 -- mode: HCD - ces: 90% - ce: 90 % (nominal) - res: 7500 -- mode: HCD - ces: 15% - ce: 15 % (nominal) - res: 15000 -- mode: HCD - ces: 30% - ce: 30 % (nominal) - res: 15000 -- mode: HCD - ces: 45% - ce: 45 % (nominal) - res: 15000 -- mode: HCD - ces: 60% - ce: 60 % (nominal) - res: 15000 -- mode: HCD - ces: 75% - ce: 75 % (nominal) - res: 15000 -- mode: HCD - ces: 90% - ce: 90 % (nominal) - res: 15000 -- mode: CID - ces: 35% - ce: 35 % (nominal) - res: 15000 - -# Shifts of the starting points for RMassBank accession numbers. 
-# Change these if you measure different adducts -accessionNumberShifts: - pH: 0 # [M+H]+: Accession numbers 1-14 - pM: 16 # [M]+: 17-30 - pNa: 32 # [M+Na]+: 33-46 - mH: 50 # [M-H]-: 51-64 - mFA: 66 # [M+FA]-: 67-80 - -# How to build ACCESSION for records: - -# Predefined accession builders: -# 'standard': MSBNK-{contributor prefix}-{entry prefix}{compound id}{shifted subscan} -# 'simple': MSBNK-{contributor prefix}-{entry prefix}{accessionNumberStart + subscan} -# 'legacy': {entry prefix}{compound id}{shifted subscan} -accessionBuilderType: -# If 'accessionBuilderType' is empty: define a formatstring to build ACCESSION -# Available variables and functions: -# Zero-padded values for cpd id, (shifted) scan id and incremental id -# * compound_id(n) -# * scan_id(n) -# * incremental_id(n) -# Values from configuration: -# * contributor_prefix -# * entry_prefix -# Values from spectrum and compound -# * collision_energy_raw: collision energy as stored in the spectrum -# (a simple integer, not formatted like the spectraList info) -# * info("KEY"): an uppercased, slugified (with underscore) value of -# - any subtag from AC$MASS_SPECTROMETRY, AC$CHROMATOGRAPHY, CH$LINK, -# MS$FOCUSED_ION (e.g. info("RESOLUTION") to get AC$MASS_SPECTROMETRY: RESOLUTION) -# - any other tag except from the above-mentioned ones or MS$DATA_PROCESSING. -# e.g. info("CH$AUTHORS") to get the author list. -# Note that this is provided mostly for technological reason and makes it easier -# to provide `info_hash`. It is a bad idea to use most of these options directly. -# * info_hash("KEY", digits): A hash generated from the result of info("KEY"), -# which is `digits` characters long. E.g. `info_hash("AUTHORS", 3)` generates "E4A" -# for `AUTHORS: Michele Michele` -# Special value info("INCHIKEY2D") is the 14-character 2D block of the inchi key. 
-# * mode: the acquisition mode (pH, mH, pM etc) -# * mode_hash: a four-letter hash corresponding to the mode; the md5 hash of -# "{mode}${adductString}" -# (so it would also work with arbitrary adductString values in the future) -# * condition_hash: a hopefully unique four-letter hash encoding the mass spectrometry -# conditions: {INSTRUMENT_TYPE}${MS_TYPE}${ION_MODE}${IONIZATION}${FRAGMENTATION_MODE}${COLLISION_ENERGY}" -# (we left resolution out for now) -# * polarity(n): n-letter polarity string; e.g. polarity(1) = "P", polarity(4) = "POSI" -accessionBuilder: "MSBNK-{contributor_prefix}-{entry_prefix}{compound_id(4)}{scan_id(2)}" -# Validate accession? Set to FALSE to bypass accession validation -accessionValidate: true - - -# A list of known electronic noise peaks -electronicNoise: -- 189.825 -- 201.725 -- 196.875 -# Exclusion width of electronic noise peaks (from unmatched peaks, prior to -# reanalysis) -electronicNoiseWidth: 0.3 - -# recalibration settings: -# recalibrate by: dppm or dmz -recalibrateBy: dppm - -# recalibrate MS1: -# separately (separate) -# with common curve (common) -# do not recalibrate (none) -recalibrateMS1: common -# Window width to look for MS1 peaks to recalibrate (in ppm) -recalibrateMS1Window: 15 - -# Custom recalibration function: You can overwrite the recal function by -# making any function which takes rcdata$recalfield ~ rcdata$mzFound. -# The settings define which recal function is used. -# Note: if recalibrateMS1 is "common", the setting "recalibrator: MS1" is meaningless -# because the MS1 points will be recalibrated together with the MS2 points with -# the MS2 recalibration function. -recalibrator: - MS1: recalibrate.loess - MS2: recalibrate.loess - -# Define the multiplicity filtering level -# Default is 2 (peak occurs at least twice) -# Set this to 1 if you want to turn this option off. -# Set this to anything > 2 if you want harder filtering -multiplicityFilter: 2 - -# Define the title format. 
-# You can use all entries from MassBank records as tokens -# plus the additional token RECORD_TITLE_CE, which is a shortened -# version of the collision energy specifically for use in the title. -# Every line is one entry and must have one token in curly brackets -# e.g. {CH$NAME} or {AC$MASS_SPECTROMETRY: MS_TYPE} plus optionally -# additional text in front or behind e.g. -# R={AC$MASS_SPECTROMETRY: RESOLUTION} -# If this is not specified, it defaults to a title of the format -# "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+" -# Note how everything must be in "" here because otherwise the : are getting mangled! -titleFormat: -- "{CH$NAME}" -- "{AC$INSTRUMENT_TYPE}" -- "{AC$MASS_SPECTROMETRY: MS_TYPE}" -- "CE: {RECORD_TITLE_CE}" -- "R={AC$MASS_SPECTROMETRY: RESOLUTION}" -- "{MS$FOCUSED_ION: PRECURSOR_TYPE}" - -# Define filter settings. -# For Orbitrap, settings of 15 ppm in low mass range, 10 ppm in high -# mass range, m/z = 120 as mass range division and 5 ppm for recalibrated -# data overall are recommended. -filterSettings: - ppmHighMass: 10 - ppmLowMass: 15 - massRangeDivision: 120 - ppmFine: 5 - prelimCut: 1000 - prelimCutRatio: 0 - fineCut: 0 - fineCutRatio: 0 - specOkLimit: 1000 - dbeMinLimit: -0.5 - satelliteMzLimit: 0.5 - satelliteIntLimit: 0.05 - - # Define raw MS retrieval settings. -findMsMsRawSettings: - ppmFine: 10 - mzCoarse: 0.5 - # fillPrecursorScan is FALSE for "good" mzML files which have all the info needed. - # However, for example AB Sciex files will have missing precursor scan information, - # in which case fillPrecursorScan = TRUE is needed. Try it out. - fillPrecursorScan: FALSE - -# Select how to treat unknown compound masses: -# "charged" (the default, also if no option set) treats unknown (level 5) compound masses as the m/z, -# "neutral" treats unknown (level 5) compound masses as the neutral mass and applies [M+H]+ and [M-H]- calculations accordingly. 
-unknownMass: charged - - -# Add the CCTE api key to retrieve information from https://api-ccte.epa.gov/docs -# Be aware, this is confidential information, so do not share with unauthorized -# persons -ccte_api_key: +# Sample configuration file for RMassBank. +# Adapt this file to your needs. +# NOTE: Do not indent with TAB characters! Use only spaces. +# (If your editor converts TAB to a certain number of spaces, it's OK.) +# Use a space after the colon. + +# Deprofile input data? +# Leave empty if input data is already in "centroid" mode. +# Use values deprofile.spline, deprofile.fwhm or deprofile.localMax to convert the input data with the +# corresponding algorithm. See ?deprofile +deprofile: + +# Deviation (in minutes) allowed the for retention time +rtMargin: 0.4 +# Systematic retention time shift +rtShift: 0.0 + +# Directory to OpenBabel. Required for creating molfiles for MassBank export. +# If no OpenBabel directory is given, RMassBank will attempt to use the CACTUS webservice +# for SDF generation. You really should install OpenBabel though; the CACTUS structures +# have explicit hydrogen atoms... +# Points to the directory where babel.exe (or the Linux "babel" equivalent) lies. +babeldir: +# Example: +# babeldir: '"C:\Program Files (x86)\OpenBabel-2.3.1"\' + +# Which MassBank record version to use; version 2 is advised. +use_version: 2 + +# Include reanalyzed peaks? +use_rean_peaks: TRUE + +# annotate the spectra files with (putative) molecular formulas for fragments? +add_annotation: TRUE + +# Annotations for the spectrum: +annotations: + # Author etc. 
annotation + authors: Nomen Nescio, The Unseen University + copyright: Copyright (C) XXX + publication: + license: CC BY + instrument: LTQ Orbitrap XL Thermo Scientific + instrument_type: LC-ESI-ITFT + confidence_comment: standard compound + compound_class: N/A; Environmental Standard + internal_id_fieldname: INTERNAL_ID + # + # HPLC annotations: + # + # example: lc_gradient: 90/10 at 0 min, 50/50 at 4 min, 5/95 at 17 min, 5/95 at 25 min, 90/10 at 25.1 min, 90/10 at 30 min + lc_gradient: + # example: lc_flow: 200 uL/min + lc_flow: + lc_solvents: + # example: lc_solvent_a: water with 0.1% formic acid + lc_solvent_a: + lc_solvent_b: + # example: lc_column: XBridge C18 3.5um, 2.1x50mm, Waters + lc_column: + # Prefix for MassBank accession IDs + contributor_prefix: CONTRIBUTOR + entry_prefix: XX + ms_type: MS2 + ionization: ESI + ms_dataprocessing: + RECALIBRATE: loess on assigned fragments and MS1 + +include_sp_tags: FALSE + +# Annotator: +# by default, "annotator.default" is used. +# If you want to build your custom annotator (check ?annotator.default and the source code), +# select it here by using e.g. +# annotator: annotator.myown +# for a function annotator.myown(annotation) + +# List of data-dependent scans in their order (relative to the parent scan), for annotation of the MassBank records +# For every data-dependent scan event, specify an element with: +# mode: fragmentation mode, e.g. CID +# ces: "short" format collision energy (for record title) +# ce: "long" format collision energy (for annotation field) +# res: FT resolution +spectraList: + # First scan: CID 35% NCE, resolution 7500 +- mode: CID + ces: 35% + ce: 35 % (nominal) + res: 7500 + # Second scan: HCD 15% NCE, resolution 7500 +- mode: HCD + ces: 15% + ce: 15 % (nominal) + res: 7500 + # Third scan, etc. 
+- mode: HCD + ces: 30% + ce: 30 % (nominal) + res: 7500 +- mode: HCD + ces: 45% + ce: 45 % (nominal) + res: 7500 +- mode: HCD + ces: 60% + ce: 60 % (nominal) + res: 7500 +- mode: HCD + ces: 75% + ce: 75 % (nominal) + res: 7500 +- mode: HCD + ces: 90% + ce: 90 % (nominal) + res: 7500 +- mode: HCD + ces: 15% + ce: 15 % (nominal) + res: 15000 +- mode: HCD + ces: 30% + ce: 30 % (nominal) + res: 15000 +- mode: HCD + ces: 45% + ce: 45 % (nominal) + res: 15000 +- mode: HCD + ces: 60% + ce: 60 % (nominal) + res: 15000 +- mode: HCD + ces: 75% + ce: 75 % (nominal) + res: 15000 +- mode: HCD + ces: 90% + ce: 90 % (nominal) + res: 15000 +- mode: CID + ces: 35% + ce: 35 % (nominal) + res: 15000 + +# Shifts of the starting points for RMassBank accession numbers. +# Change these if you measure different adducts +accessionNumberShifts: + pH: 0 # [M+H]+: Accession numbers 1-14 + pM: 16 # [M]+: 17-30 + pNa: 32 # [M+Na]+: 33-46 + mH: 50 # [M-H]-: 51-64 + mFA: 66 # [M+FA]-: 67-80 + +# How to build ACCESSION for records: + +# Predefined accession builders: +# 'standard': MSBNK-{contributor prefix}-{entry prefix}{compound id}{shifted subscan} +# 'simple': MSBNK-{contributor prefix}-{entry prefix}{accessionNumberStart + subscan} +# 'legacy': {entry prefix}{compound id}{shifted subscan} +accessionBuilderType: +# If 'accessionBuilderType' is empty: define a formatstring to build ACCESSION +# Available variables and functions: +# Zero-padded values for cpd id, (shifted) scan id and incremental id +# * compound_id(n) +# * scan_id(n) +# * incremental_id(n) +# Values from configuration: +# * contributor_prefix +# * entry_prefix +# Values from spectrum and compound +# * collision_energy_raw: collision energy as stored in the spectrum +# (a simple integer, not formatted like the spectraList info) +# * info("KEY"): an uppercased, slugified (with underscore) value of +# - any subtag from AC$MASS_SPECTROMETRY, AC$CHROMATOGRAPHY, CH$LINK, +# MS$FOCUSED_ION (e.g. 
info("RESOLUTION") to get AC$MASS_SPECTROMETRY: RESOLUTION) +# - any other tag except from the above-mentioned ones or MS$DATA_PROCESSING. +# e.g. info("CH$AUTHORS") to get the author list. +# Note that this is provided mostly for technological reason and makes it easier +# to provide `info_hash`. It is a bad idea to use most of these options directly. +# * info_hash("KEY", digits): A hash generated from the result of info("KEY"), +# which is `digits` characters long. E.g. `info_hash("AUTHORS", 3)` generates "E4A" +# for `AUTHORS: Michele Michele` +# Special value info("INCHIKEY2D") is the 14-character 2D block of the inchi key. +# * mode: the acquisition mode (pH, mH, pM etc) +# * mode_hash: a four-letter hash corresponding to the mode; the md5 hash of +# "{mode}${adductString}" +# (so it would also work with arbitrary adductString values in the future) +# * condition_hash: a hopefully unique four-letter hash encoding the mass spectrometry +# conditions: {INSTRUMENT_TYPE}${MS_TYPE}${ION_MODE}${IONIZATION}${FRAGMENTATION_MODE}${COLLISION_ENERGY}" +# (we left resolution out for now) +# * polarity(n): n-letter polarity string; e.g. polarity(1) = "P", polarity(4) = "POSI" +accessionBuilder: "MSBNK-{contributor_prefix}-{entry_prefix}{compound_id(4)}{scan_id(2)}" +# Validate accession? 
Set to FALSE to bypass accession validation +accessionValidate: true + + +# A list of known electronic noise peaks +electronicNoise: +- 189.825 +- 201.725 +- 196.875 +# Exclusion width of electronic noise peaks (from unmatched peaks, prior to +# reanalysis) +electronicNoiseWidth: 0.3 + +# recalibration settings: +# recalibrate by: dppm or dmz +recalibrateBy: dppm + +# recalibrate MS1: +# separately (separate) +# with common curve (common) +# do not recalibrate (none) +recalibrateMS1: common +# Window width to look for MS1 peaks to recalibrate (in ppm) +recalibrateMS1Window: 15 + +# Custom recalibration function: You can overwrite the recal function by +# making any function which takes rcdata$recalfield ~ rcdata$mzFound. +# The settings define which recal function is used. +# Note: if recalibrateMS1 is "common", the setting "recalibrator: MS1" is meaningless +# because the MS1 points will be recalibrated together with the MS2 points with +# the MS2 recalibration function. +recalibrator: + MS1: recalibrate.loess + MS2: recalibrate.loess + +# Define the multiplicity filtering level +# Default is 2 (peak occurs at least twice) +# Set this to 1 if you want to turn this option off. +# Set this to anything > 2 if you want harder filtering +multiplicityFilter: 2 + +# Define the title format. +# You can use all entries from MassBank records as tokens +# plus the additional token RECORD_TITLE_CE, which is a shortened +# version of the collision energy specifically for use in the title. +# Every line is one entry and must have one token in curly brackets +# e.g. {CH$NAME} or {AC$MASS_SPECTROMETRY: MS_TYPE} plus optionally +# additional text in front or behind e.g. +# R={AC$MASS_SPECTROMETRY: RESOLUTION} +# If this is not specified, it defaults to a title of the format +# "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+" +# Note how everything must be in "" here because otherwise the : are getting mangled! 
+titleFormat: +- "{CH$NAME}" +- "{AC$INSTRUMENT_TYPE}" +- "{AC$MASS_SPECTROMETRY: MS_TYPE}" +- "CE: {RECORD_TITLE_CE}" +- "R={AC$MASS_SPECTROMETRY: RESOLUTION}" +- "{MS$FOCUSED_ION: PRECURSOR_TYPE}" + +# Define filter settings. +# For Orbitrap, settings of 15 ppm in low mass range, 10 ppm in high +# mass range, m/z = 120 as mass range division and 5 ppm for recalibrated +# data overall are recommended. +filterSettings: + ppmHighMass: 10 + ppmLowMass: 15 + massRangeDivision: 120 + ppmFine: 5 + prelimCut: 1000 + prelimCutRatio: 0 + fineCut: 0 + fineCutRatio: 0 + specOkLimit: 1000 + dbeMinLimit: -0.5 + satelliteMzLimit: 0.5 + satelliteIntLimit: 0.05 + + # Define raw MS retrieval settings. +findMsMsRawSettings: + ppmFine: 10 + mzCoarse: 0.5 + # fillPrecursorScan is FALSE for "good" mzML files which have all the info needed. + # However, for example AB Sciex files will have missing precursor scan information, + # in which case fillPrecursorScan = TRUE is needed. Try it out. + fillPrecursorScan: FALSE + +# Select how to treat unknown compound masses: +# "charged" (the default, also if no option set) treats unknown (level 5) compound masses as the m/z, +# "neutral" treats unknown (level 5) compound masses as the neutral mass and applies [M+H]+ and [M-H]- calculations accordingly. +unknownMass: charged + +# Add the CCTE api key to retrieve information from https://api-ccte.epa.gov/docs +# Be aware, this is confidential information, so do not share with unauthorized +# persons +ccte_api_key: + +# Add the RSC api key to retrieve information from https://developer.rsc.org/api-reference +# Be aware, this is confidential information, so do not share with unauthorized +# persons +rcs_api_key: diff --git a/man/CTS.externalIdSubset.Rd b/man/CTS.externalIdSubset.Rd index 5877a26..91b9b08 100644 --- a/man/CTS.externalIdSubset.Rd +++ b/man/CTS.externalIdSubset.Rd @@ -24,7 +24,7 @@ Select a subset of external IDs from a CTS record. # Return all CAS registry numbers stored for benzene. 
data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") cas <- CTS.externalIdSubset(data, "CAS") -} +} } \author{ diff --git a/man/CTS.externalIdTypes.Rd b/man/CTS.externalIdTypes.Rd index a9a7513..bb99493 100644 --- a/man/CTS.externalIdTypes.Rd +++ b/man/CTS.externalIdTypes.Rd @@ -10,7 +10,7 @@ CTS.externalIdTypes(data) \item{data}{The complete CTS record as retrieved by \code{\link{getCtsRecord}}.} } \value{ -Returns an array of all database names for which there are external +Returns an array of all database names for which there are external identifiers stored in the record. } \description{ @@ -24,7 +24,7 @@ Find all available databases for a CTS record data <- getCTS("UHOVQNZJYSORNB-UHFFFAOYSA-N") databases <- CTS.externalIdTypes(data) -} +} } \author{ diff --git a/man/RmbDefaultSettings.Rd b/man/RmbDefaultSettings.Rd index b4f80fb..1a088a9 100755 --- a/man/RmbDefaultSettings.Rd +++ b/man/RmbDefaultSettings.Rd @@ -7,7 +7,7 @@ \alias{loadRmbSettingsFromEnv} \title{RMassBank settings} \usage{ -loadRmbSettings(file_or_list) +loadRmbSettings(file_or_list) loadRmbSettingsFromEnv(env = .GlobalEnv) @@ -32,9 +32,9 @@ Load, set and reset settings for RMassBank. \code{RmbSettingsTemplate} creates a template file in which you can adjust the settings as you like. Before using RMassBank, you must then load the settings file using \code{loadRmbSettings}. \code{RmbDefaultSettings} loads -the default settings. \code{loadRmbSettingsFromEnv} loads the settings +the default settings. \code{loadRmbSettingsFromEnv} loads the settings stored in env$RmbSettings, which is useful when reloading archives with -saved settings inside. +saved settings inside. Note: no settings are loaded upon loading MassBank! This is intended, so that one never forgets to load the correct settings. diff --git a/man/RmbSettings.Rd b/man/RmbSettings.Rd index 87e0862..c7ef474 100755 --- a/man/RmbSettings.Rd +++ b/man/RmbSettings.Rd @@ -9,7 +9,7 @@ Describes all settings for the RMassBank settings file. 
\details{ \itemize{ \item{\code{deprofile}}{ - Whether and how to deprofile input raw files. Leave the + Whether and how to deprofile input raw files. Leave the setting empty if your raw files are already in "centroid" mode. If your input files are in profile mode, you have the choice between algorithms \code{\link{deprofile}.spline, deprofile.fwhm, deprofile.localMax}; refer to @@ -30,7 +30,7 @@ Describes all settings for the RMassBank settings file. version 1 is considered outdated and should be used only if for some reason you are running old servers and an upgrade is not feasible.} \item{\code{use_rean_peaks}}{ - Whether to include peaks from reanalysis (see + Whether to include peaks from reanalysis (see \code{\link{reanalyzeFailpeaks}}) in the MassBank records. Boolean, TRUE or FALSE. } \item{\code{annotations}}{ @@ -38,32 +38,32 @@ Describes all settings for the RMassBank settings file. \code{authors, copyright, license, instrument, instrument_type, compound_class} correspond to the MassBank entries \code{AUTHORS, COPYRIGHT, PUBLICATION, LICENSE, AC$INSTRUMENT, AC$INSTRUMENT_TYPE, CH$COMPOUND_CLASS}. The entry \code{confidence_comment} is added as - \code{COMMENT: CONFIDENCE} entry. + \code{COMMENT: CONFIDENCE} entry. The entry \code{internal_id_fieldname} is used to name - the MassBank entry which will keep a reference to the internal compound ID used in - the workflow: for \code{internal_id_fieldname = MYID} and e.g. compound 1234, an - entry will be added to the MassBank record with + the MassBank entry which will keep a reference to the internal compound ID used in + the workflow: for \code{internal_id_fieldname = MYID} and e.g. compound 1234, an + entry will be added to the MassBank record with \code{COMMENT: MYID 1234}. The internal fieldname should not be left empty! 
- + The entries \code{lc_gradient, lc_flow, lc_solvent_a, lc_solvent_b, lc_column} correspond - to the MassBank entries \code{AC$CHROMATOGRAPHY: FLOW_GRADIENT, FLOW_RATE, - SOLVENT A, SOLVENT B, COLUMN_NAME}. + to the MassBank entries \code{AC$CHROMATOGRAPHY: FLOW_GRADIENT, FLOW_RATE, + SOLVENT A, SOLVENT B, COLUMN_NAME}. \code{ms_type, ionization} correspond to \code{AC$MASS_SPECTROMETRY: MS_TYPE, IONIZATION}. \code{entry_prefix} is the two-letter prefix used when building MassBank accession codes. Entries under \code{ms_dataprocessing} are added as \code{MS$DATA_PROCESSING:} entries, - in addition to the default \code{WHOLE: RMassBank}. + in addition to the default \code{WHOLE: RMassBank}. } \item{\code{annotator}}{ - For advanced users: option to select your own custom annotator. + For advanced users: option to select your own custom annotator. Check \code{\link{annotator.default}} and the source code for details.} \item{\code{spectraList}}{ This setting describes the experimental annotations for the single data-dependent scans. For every data-dependent scan event, a \code{spectraList} entry with - \code{mode, ces, ce, res} denoting collision mode, collision energy in short and verbose + \code{mode, ces, ce, res} denoting collision mode, collision energy in short and verbose notation, and FT resolution.} \item{\code{accessionNumberShifts}}{ This denotes the starting points for accession numbers @@ -85,68 +85,68 @@ Describes all settings for the RMassBank settings file. the \code{MS2} setting is used for a common recalibration curve. See \code{\link{recalibrate.loess}} for details.} \item{\code{multiplicityFilter}}{ - Define the multiplicity filtering level. Default is 2, a value of 1 + Define the multiplicity filtering level. Default is 2, a value of 1 is off (no filtering) and >2 is harsher filtering.} \item{\code{titleFormat}}{ The title of MassBank records is a mini-summary - of the record, for example "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+". 
- By default, the first compound name \code{CH$NAME}, instrument type - \code{AC$INSTRUMENT_TYPE}, MS/MS type \code{AC$MASS_SPECTROMETRY: MS_TYPE}, + of the record, for example "Dinotefuran; LC-ESI-QFT; MS2; CE: 35%; R=35000; [M+H]+". + By default, the first compound name \code{CH$NAME}, instrument type + \code{AC$INSTRUMENT_TYPE}, MS/MS type \code{AC$MASS_SPECTROMETRY: MS_TYPE}, collision energy \code{RECORD_TITLE_CE}, resolution \code{AC$MASS_SPECTROMETRY: RESOLUTION} - and precursor \code{MS$FOCUSED_ION: PRECURSOR_TYPE} are used. If alternative + and precursor \code{MS$FOCUSED_ION: PRECURSOR_TYPE} are used. If alternative information is relevant to differentiate acquired spectra, the title should be adjusted. - For example, many TOFs do not have a resolution setting. + For example, many TOFs do not have a resolution setting. See MassBank documentation for more.} \item{\code{filterSettings}}{ A list of settings that affect the MS/MS processing. The entries - \code{ppmHighMass, ppmLowMass, massRangeDivision} set values for - pre-processing, prior to recalibration. \code{ppmHighMass} defines the - ppm error for the high mass range (default 10 ppm for Orbitraps), - \code{ppmLowMass} is the error for the low mass range (default 15 ppm - for Orbitraps) and \code{massRangeDivision} is the m/z value defining + \code{ppmHighMass, ppmLowMass, massRangeDivision} set values for + pre-processing, prior to recalibration. \code{ppmHighMass} defines the + ppm error for the high mass range (default 10 ppm for Orbitraps), + \code{ppmLowMass} is the error for the low mass range (default 15 ppm + for Orbitraps) and \code{massRangeDivision} is the m/z value defining the split between the high and low mass range (default m/z = 120). - The entry \code{ppmFine} defines the ppm cut-off post recalibration. - The default value of 5 ppm is recommended for Orbitraps. For other + The entry \code{ppmFine} defines the ppm cut-off post recalibration. 
+ The default value of 5 ppm is recommended for Orbitraps. For other instruments this can be interpreted from the recalibration plot. - All ppm limits are one-sided (e.g. this includes values to +5 ppm or -5 ppm + All ppm limits are one-sided (e.g. this includes values to +5 ppm or -5 ppm deviation from the exact mass). - - The entries \code{prelimCut, prelimCutRatio} define the intensity cut-off and - cut-off ratio (in % of the most intense peak) for pre-processing. This affects - the peak selection for the recalibration only. Careful: the default value - 1e4 for Orbitrap LTQ positive mode could remove all peaks for TOF data + + The entries \code{prelimCut, prelimCutRatio} define the intensity cut-off and + cut-off ratio (in % of the most intense peak) for pre-processing. This affects + the peak selection for the recalibration only. Careful: the default value + 1e4 for Orbitrap LTQ positive mode could remove all peaks for TOF data and will remove too many peaks for Orbitrap LTQ negative mode spectra! The entry \code{specOKLimit} defines the intensity limit to include MS/MS spectra. - MS/MS spectra must have at least one peak above this limit to proceed through + MS/MS spectra must have at least one peak above this limit to proceed through the workflow. - \code{dbeMinLimit} defines the minimum allowable ring and double bond equivalents (DBE) - allowed for assigned formulas. This assumes maximum valuences for elements with + \code{dbeMinLimit} defines the minimum allowable ring and double bond equivalents (DBE) + allowed for assigned formulas. This assumes maximum valences for elements with multiple valence states. The default is -0.5 (accounting for fragments being ions). - The entries \code{satelliteMzLimit, satelliteIntLimit} define the cut-off m/z and - intensity values for satellite peak removal (an artefact of Fourier Transform - processing).
All peaks within the m/z limit (default 0.5) and intensity ratio - (default 0.05 or 5 %) of the respective peak will be removed. Applicable to - Fourier Transform instruments only (e.g. Orbitrap). - } + The entries \code{satelliteMzLimit, satelliteIntLimit} define the cut-off m/z and + intensity values for satellite peak removal (an artefact of Fourier Transform + processing). All peaks within the m/z limit (default 0.5) and intensity ratio + (default 0.05 or 5 %) of the respective peak will be removed. Applicable to + Fourier Transform instruments only (e.g. Orbitrap). + } \item{\code{filterSettings}}{ - Parameters for adjusting the raw data retrieval. - The entry \code{ppmFine} defines the ppm error to look for the precursor in + Parameters for adjusting the raw data retrieval. + The entry \code{ppmFine} defines the ppm error to look for the precursor in the MS1 (parent) spectrum. Default is 10 ppm for Orbitrap. - \code{mzCoarse} defines the error to search for the precursor specification - in the MS2 spectrum. This is often only saved to 2 decimal places and thus - can be quite inaccurate. The accuracy also depends on the isolation window used. + \code{mzCoarse} defines the error to search for the precursor specification + in the MS2 spectrum. This is often only saved to 2 decimal places and thus + can be quite inaccurate. The accuracy also depends on the isolation window used. The default settings (for e.g. Orbitrap) is 0.5 (Da, or Th for m/z). - The entry \code{fillPrecursorScan} is largely untested. The default value + The entry \code{fillPrecursorScan} is largely untested. The default value (FALSE) assumes all necessary precursor information is available in the mzML file. A setting ot TRUE tries to fill in the precursor data scan number if it is missing. - Only tested on one case study so far - feedback welcome! - } + Only tested on one case study so far - feedback welcome! 
+ } } } \seealso{ diff --git a/man/exportMassbank.Rd b/man/exportMassbank.Rd index 40ce169..bf00ea6 100755 --- a/man/exportMassbank.Rd +++ b/man/exportMassbank.Rd @@ -10,7 +10,7 @@ exportMassbank(compiled, molfile = NULL) \item{compiled}{\code{RmbSpectraSet} the spectra of one compound for which files should be exported} -\item{molfile}{A molfile from \code{\link{createMolfile}}; +\item{molfile}{A molfile from \code{\link{createMolfile}}; deprecated since molfiles are not used by MassBank anymore.} } \value{ diff --git a/man/flatten.Rd b/man/flatten.Rd index 2d60a06..f917255 100755 --- a/man/flatten.Rd +++ b/man/flatten.Rd @@ -5,7 +5,7 @@ \alias{readMbdata} \title{Flatten, or re-read, MassBank header blocks} \usage{ -flatten(mbdata) +flatten(mbdata) readMbdata(row) } diff --git a/man/gatherCCTE.Rd b/man/gatherCCTE.Rd index 6de7299..1826761 100644 --- a/man/gatherCCTE.Rd +++ b/man/gatherCCTE.Rd @@ -20,7 +20,7 @@ Returns a list with 5 slots: \code{smiles} The SMILES annotation of the structure } \description{ -Retrieves annotation data for a compound from the internet service US EPA CCTE +Retrieves annotation data for a compound from the internet service US EPA CCTE based on the inchikey generated by babel or Cactus } \details{ diff --git a/man/gatherData.Rd b/man/gatherData.Rd index 89e1a63..2e8992d 100755 --- a/man/gatherData.Rd +++ b/man/gatherData.Rd @@ -38,8 +38,8 @@ inserted empty and will be filled later on. } \references{ Chemical Translation Service: -\url{http://uranus.fiehnlab.ucdavis.edu:8080/cts/homePage} -cactus Chemical Identifier Resolver: +\url{http://uranus.fiehnlab.ucdavis.edu:8080/cts/homePage} +cactus Chemical Identifier Resolver: \url{http://cactus.nci.nih.gov/chemical/structure} MassBank record format: \url{http://www.massbank.jp/manuals/MassBankRecord_en.pdf} diff --git a/man/gatherDataBabel.Rd b/man/gatherDataBabel.Rd index 84737d0..d6e78d5 100644 --- a/man/gatherDataBabel.Rd +++ b/man/gatherDataBabel.Rd @@ -20,7 +20,7 @@ compound list. 
} \details{ Composes the "upper part" of a MassBank record filled with chemical data -about the compound: name, exact mass, structure, CAS no.. +about the compound: name, exact mass, structure, CAS no.. The instrument type is also written into this block (even if not strictly part of the chemical information). Additionally, index fields are added at the start of the record, which will be removed later: diff --git a/man/gatherDataUnknown.Rd b/man/gatherDataUnknown.Rd index 7f5903a..4c36eb5 100644 --- a/man/gatherDataUnknown.Rd +++ b/man/gatherDataUnknown.Rd @@ -9,7 +9,7 @@ gatherDataUnknown(id, mode, retrieval) \arguments{ \item{id}{The compound ID.} -\item{mode}{\code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions +\item{mode}{\code{"pH", "pNa", "pM", "pNH4", "mH", "mM", "mFA"} for different ions ([M+H]+, [M+Na]+, [M]+, [M+NH4]+, [M-H]-, [M]-, [M+FA]-).} \item{retrieval}{A value that determines whether the files should be handled either as "standard", @@ -25,7 +25,7 @@ Retrieves annotation data for an unknown compound by using basic information pre } \details{ Composes the "upper part" of a MassBank record filled with chemical data -about the compound: name, exact mass, structure, CAS no.. +about the compound: name, exact mass, structure, CAS no.. The instrument type is also written into this block (even if not strictly part of the chemical information). 
Additionally, index fields are added at the start of the record, which will be removed later: diff --git a/man/gatherPubChem.Rd b/man/gatherPubChem.Rd index ca21fc2..71c423c 100644 --- a/man/gatherPubChem.Rd +++ b/man/gatherPubChem.Rd @@ -17,7 +17,7 @@ Returns a list with 4 slots: \code{Chebi} The identification number of the chebi database } \description{ -Retrieves annotation data for a compound from the internet service Pubchem +Retrieves annotation data for a compound from the internet service Pubchem based on the inchikey generated by babel or Cactus } \details{ diff --git a/man/getCASRN.Rd b/man/getCASRN.Rd index 5d412a3..7b40630 100644 --- a/man/getCASRN.Rd +++ b/man/getCASRN.Rd @@ -28,7 +28,7 @@ getCASRN("MKXZASYAUGDDCJ-NJAFHUGGSA-N") } \references{ -CCTE search: \url{https://api-ccte.epa.gov/docs} +CCTE search: \url{https://api-ccte.epa.gov/docs} CCTE REST: \url{https://api-ccte.epa.gov/docs} diff --git a/man/getCSID.Rd b/man/getCSID.Rd index c629808..472b4e7 100644 --- a/man/getCSID.Rd +++ b/man/getCSID.Rd @@ -2,31 +2,39 @@ % Please edit documentation in R/webAccess.R \name{getCSID} \alias{getCSID} -\title{Retrieve the Chemspider ID for a given compound} +\title{Search ChemSpider CSID} \usage{ -getCSID(query) +getCSID(key, identifier, api_key) } \arguments{ -\item{query}{The InChIKey of the compound} +\item{key}{ID to be converted} + +\item{identifier}{identifier (name, inchikey)} + +\item{api_key}{API key for ChemSpider (to be created on the developer site)} } \value{ -Returns the chemspide +The CSID (in string type) } \description{ -Given an InChIKey, this function queries the chemspider web API to retrieve -the Chemspider ID of he compound with that InChIkey. +Retrieves ChemSpider CSID from UK RSC for a search term. +} +\details{ +Requires a valid API key } \examples{ - \dontrun{ -# Return all CAS registry numbers stored for benzene. 
-data <- getCtsRecord("UHOVQNZJYSORNB-UHFFFAOYSA-N") -cas <- CTS.externalIdSubset(data, "CAS") -} +getCSID(key = "MKXZASYAUGDDCJ-NJAFHUGGSA-N", identifier = "InChIKey", api_key = "your RSC API key") +} +} +\references{ +ChemSpider search: \url{https://developer.rsc.org/api-reference#} +ChemSpider developer site: \url{https://developer.rsc.org} + +ChemSpider REST: +\url{https://developer.rsc.org/api-reference#} } \author{ -Michele Stravs, Eawag - -Erik Mueller, UFZ +Tobias Schulze } diff --git a/man/getCtsRecord.Rd b/man/getCtsRecord.Rd index d5bde0b..4baf8ba 100755 --- a/man/getCtsRecord.Rd +++ b/man/getCtsRecord.Rd @@ -10,11 +10,11 @@ getCtsRecord(key) \item{key}{The InChI key.} } \value{ -Returns a list with all information from CTS: \code{inchikey, +Returns a list with all information from CTS: \code{inchikey, inchicode, formula, exactmass} contain single values. \code{synonyms} contains an unordered list of scored synonyms (\code{type, name, score}, where \code{type} indicates either a normal name or a specific IUPAC name, see below). - \code{externalIds} contains an unordered list of identifiers of the compound in + \code{externalIds} contains an unordered list of identifiers of the compound in various databases (\code{name, value}, where \code{name} is the database name and \code{value} the identifier in that database.) 
} diff --git a/man/getDTXCID.Rd b/man/getDTXCID.Rd index 64620c0..bd64429 100644 --- a/man/getDTXCID.Rd +++ b/man/getDTXCID.Rd @@ -27,7 +27,7 @@ getDTXCID("MKXZASYAUGDDCJ-NJAFHUGGSA-N") } } \references{ -CCTE search: \url{https://api-ccte.epa.gov/docs} +CCTE search: \url{https://api-ccte.epa.gov/docs} CCTE REST: \url{https://api-ccte.epa.gov/docs} diff --git a/man/getDTXSID.Rd b/man/getDTXSID.Rd index cdd574e..f231a4f 100644 --- a/man/getDTXSID.Rd +++ b/man/getDTXSID.Rd @@ -27,7 +27,7 @@ getDTXSID("MKXZASYAUGDDCJ-NJAFHUGGSA-N") } } \references{ -CCTE search: \url{https://api-ccte.epa.gov/docs} +CCTE search: \url{https://api-ccte.epa.gov/docs} CCTE REST: \url{https://api-ccte.epa.gov/docs} diff --git a/man/getDTXSMILES.Rd b/man/getDTXSMILES.Rd index c580c05..e0fa0c5 100644 --- a/man/getDTXSMILES.Rd +++ b/man/getDTXSMILES.Rd @@ -28,7 +28,7 @@ getDTXSMILES("MKXZASYAUGDDCJ-NJAFHUGGSA-N") } \references{ -CCTE search: \url{https://api-ccte.epa.gov/docs} +CCTE search: \url{https://api-ccte.epa.gov/docs} CCTE REST: \url{https://api-ccte.epa.gov/docs} diff --git a/man/getPcId.Rd b/man/getPcId.Rd index 090f553..2a20b20 100755 --- a/man/getPcId.Rd +++ b/man/getPcId.Rd @@ -26,7 +26,7 @@ getPcId("MKXZASYAUGDDCJ-NJAFHUGGSA-N") } \references{ -PubChem search: \url{http://pubchem.ncbi.nlm.nih.gov/} +PubChem search: \url{http://pubchem.ncbi.nlm.nih.gov/} Pubchem REST: \url{https://pubchem.ncbi.nlm.nih.gov/pug_rest/PUG_REST.html} diff --git a/man/getPrefName.Rd b/man/getPrefName.Rd index e340041..4ac9064 100644 --- a/man/getPrefName.Rd +++ b/man/getPrefName.Rd @@ -28,7 +28,7 @@ getPrefName("MKXZASYAUGDDCJ-NJAFHUGGSA-N") } \references{ -CCTE search: \url{https://api-ccte.epa.gov/docs} +CCTE search: \url{https://api-ccte.epa.gov/docs} CCTE REST: \url{https://api-ccte.epa.gov/docs} diff --git a/man/mbWorkflow.Rd b/man/mbWorkflow.Rd index f7a5524..681464c 100755 --- a/man/mbWorkflow.Rd +++ b/man/mbWorkflow.Rd @@ -21,10 +21,10 @@ mbWorkflow( which should then be manually inspected.} 
\item{gatherData}{A variable denoting whether to retrieve information using several online databases \code{gatherData= "online"} -or to use the local babel installation \code{gatherData= "babel"}. Note that babel is used either way, if a directory is given +or to use the local babel installation \code{gatherData= "babel"}. Note that babel is used either way, if a directory is given in the settings. This setting will be ignored if retrieval is set to "standard"} -\item{filter}{If \code{TRUE}, the peaks will be filtered according to the standard processing workflow in RMassBank - +\item{filter}{If \code{TRUE}, the peaks will be filtered according to the standard processing workflow in RMassBank - only the best formula for a peak is retained, and only peaks passing multiplicity filtering are retained. If FALSE, it is assumed that the user has already done filtering, and all peaks in the spectrum should be printed in the record (with or without formula.)} } @@ -69,7 +69,7 @@ Step 8: Create the list.tsv in the molfiles folder, which is required by MassBan mb <- newMbWorkspace(w) # w being a msmsWorkspace mb <- loadInfolists(mb, "D:/myInfolistPath") mb <- mbWorkflow(mb, steps=c(1:3), "newinfos.csv") - + } } \seealso{ diff --git a/man/toMassbank.Rd b/man/toMassbank.Rd index 9385ba4..d6f9706 100755 --- a/man/toMassbank.Rd +++ b/man/toMassbank.Rd @@ -16,7 +16,7 @@ toMassbank(o, ...) \item{o}{An object to convert to MassBank record format. 
This may be a single `RmbSpectrum2`, or a complete compound (an `RmbSpectraSet`),} -\item{...}{Parameters passed to the implementation, +\item{...}{Parameters passed to the implementation, in particular `addAnnotation`} \item{addAnnotation}{`logical`, whether to add peak annotations (putative formulas) to the record.} @@ -38,10 +38,10 @@ entries can be as follows: \itemize{ is written as - \code{CH\$EXACT_MASS: 329.1023} + \code{CH\$EXACT_MASS: 329.1023} \item A character array: - \code{'CH\$NAME' = c('2-Aminobenzimidazole', '1H-Benzimidazol-2-amine')} + \code{'CH\$NAME' = c('2-Aminobenzimidazole', '1H-Benzimidazol-2-amine')} is written as @@ -49,19 +49,19 @@ is written as \code{CH\$NAME: 1H-Benzimidazol-2-amine} -\item A named list of strings: +\item A named list of strings: - \code{'CH\$LINK' = list('CHEBI' = "27822", "KEGG" = "C10901")} + \code{'CH\$LINK' = list('CHEBI' = "27822", "KEGG" = "C10901")} -is written as +is written as \code{CH\$LINK: CHEBI 27822} -\code{CH\$LINK: KEGG C10901} +\code{CH\$LINK: KEGG C10901} \item A data frame (e.g. the peak table) is written as specified in the MassBank record format (Section 2.6.3): the column names are used as -headers for the first line, all data rows are printed space-separated. +headers for the first line, all data rows are printed space-separated. } } \note{ @@ -76,7 +76,7 @@ c('bla','blub')}. # Read just the compound info skeleton from the Internet for some compound ID id <- 35 mbdata <- gatherData(id) -#' # Export the mbdata blocks to line arrays +# Export the mbdata blocks to line arrays # (there is no spectrum information, just the compound info...) mbtext <- toMassbank(mbdata) }