diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..a8cf84d
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,10 @@
+# Jupyter notebooks contain images, tables, and parsed text,
+# which inflates their share of the language statistics unrealistically;
+# 'Jupyter Notebook' then suddenly shows up as a major language of the repo.
+
+# Since GitHub Linguist won't parse notebooks more precisely
+# (marked won't-fix: https://github.com/github/linguist/issues/3496),
+# simply exclude these files from the language count:
+
+notebooks/*.ipynb linguist-generated=true
+stream_viz/tutorial/*.ipynb linguist-generated=true
diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
new file mode 100644
index 0000000..b04fb15
--- /dev/null
+++ b/.github/workflows/black.yml
@@ -0,0 +1,10 @@
+name: Lint
+
+on: [push, pull_request]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: psf/black@stable
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..0ad2115
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,37 @@
+name: Unittests
+
+on: [pull_request]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip setuptools wheel
+          python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          python -m pip install -e .
+
+      - name: Display Python & Installed Packages
+        run: |
+          python --version
+          pip freeze
+
+      - name: Run Unit Tests
+        run: python -m unittest discover -s tests/unit -v
+        env:
+          ACTIONS_STEP_DEBUG: true # Enable debug logs
+          ACTIONS_RUNNER_DEBUG: true # Additional debug logs from GitHub Actions itself
diff --git a/.github/workflows/token_consistency.yaml b/.github/workflows/token_consistency.yaml
new file mode 100644
index 0000000..df9d8b6
--- /dev/null
+++ b/.github/workflows/token_consistency.yaml
@@ -0,0 +1,110 @@
+name: Check consistency of tokens.txt file
+
+# Define the file paths under `paths` to trigger this check only when specific files are modified.
+# The script then runs its checks only on the files that actually changed, rather than on every file listed in `paths`.
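+#
+# Example: a change that deletes or reorders an existing line in one of the
+# listed tokens.txt files fails this check; new tokens appended at the end pass.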
+
+# **Note**: To add a new token file to these checks, include its path in:
+# - the `on` -> `push` and `pull_request` sections
+# - `jobs` -> `check_tokens` -> `steps` -> "Set global variable for multiple tokens.txt paths" -> `TOKENS_FILES`
+
+on:
+  push:
+    paths:
+      - "chebai/preprocessing/bin/protein_token/tokens.txt"
+      - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
+  pull_request:
+    paths:
+      - "chebai/preprocessing/bin/protein_token/tokens.txt"
+      - "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
+
+jobs:
+  check_tokens:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Get list of changed files
+        id: changed_files
+        run: |
+          git fetch origin dev
+
+          # Get the list of changed files compared to origin/dev and save them to a file
+          git diff --name-only origin/dev > changed_files.txt
+
+          # Print the names of changed files on separate lines
+          echo "Changed files:"
+          while read -r line; do
+            echo "Changed file: $line"
+          done < changed_files.txt
+
+      - name: Set global variable for multiple tokens.txt paths
+        run: |
+          # All token files that need to be checked must be included here too, same as in `paths`.
+          TOKENS_FILES=(
+            "chebai/preprocessing/bin/protein_token/tokens.txt"
+            "chebai/preprocessing/bin/protein_token_3_gram/tokens.txt"
+          )
+          echo "TOKENS_FILES=${TOKENS_FILES[*]}" >> $GITHUB_ENV
+
+      - name: Process only changed tokens.txt files
+        run: |
+          # Convert the TOKENS_FILES environment variable into an array
+          TOKENS_FILES=(${TOKENS_FILES})
+
+          # Iterate over each token file path
+          for TOKENS_FILE_PATH in "${TOKENS_FILES[@]}"; do
+            # Check if the current token file path is in the list of changed files
+            if grep -q "$TOKENS_FILE_PATH" changed_files.txt; then
+              echo "----------------------- Processing $TOKENS_FILE_PATH -----------------------"
+
+              # Get the previous tokens.txt version
+              git fetch origin dev
+              git diff origin/dev -- $TOKENS_FILE_PATH > tokens_diff.txt || echo "No previous tokens.txt found for $TOKENS_FILE_PATH"
+
+              # Check for deleted or added lines in tokens.txt
+              if [ -f tokens_diff.txt ]; then
+
+                # Check for deleted lines (lines starting with '-')
+                deleted_lines=$(grep '^-' tokens_diff.txt | grep -v '^---' | sed 's/^-//' || true)
+                if [ -n "$deleted_lines" ]; then
+                  echo "Error: Lines have been deleted from $TOKENS_FILE_PATH."
+                  echo -e "Deleted lines: \n$deleted_lines"
+                  exit 1
+                fi
+
+                # Check for added lines (lines starting with '+')
+                added_lines=$(grep '^+' tokens_diff.txt | grep -v '^+++' | sed 's/^+//' || true)
+                if [ -n "$added_lines" ]; then
+
+                  # Count how many lines have been added
+                  num_added_lines=$(echo "$added_lines" | wc -l)
+
+                  # Get the last `n` lines (equal to num_added_lines) of tokens.txt
+                  last_lines=$(tail -n "$num_added_lines" $TOKENS_FILE_PATH)
+
+                  # Check if the added lines are at the end of the file
+                  if [ "$added_lines" != "$last_lines" ]; then
+
+                    # Find lines that were added but not appended at the end of the file
+                    non_appended_lines=$(diff <(echo "$added_lines") <(echo "$last_lines") | grep '^<' | sed 's/^< //')
+
+                    echo "Error: New lines have been added to $TOKENS_FILE_PATH, but they are not at the end of the file."
+                    echo -e "Added lines that are not at the end of the file: \n$non_appended_lines"
+                    exit 1
+                  fi
+                fi
+
+                if [ "$added_lines" == "" ]; then
+                  echo "$TOKENS_FILE_PATH validation successful: No lines were deleted, and no new lines were added."
+                else
+                  echo "$TOKENS_FILE_PATH validation successful: No lines were deleted, and new lines were correctly appended at the end."
+                fi
+              else
+                echo "No previous version of $TOKENS_FILE_PATH found."
+              fi
+            else
+              echo "$TOKENS_FILE_PATH was not changed, skipping."
+            fi
+          done
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..05cdfb7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,172 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+docs/build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.
+#  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# configs/  # not ignored, as new configs can be added as part of a feature
+
+/.idea
+/data
+/logs
+/results_buffer
+electra_pretrained.ckpt
+.jupyter
+.virtual_documents
+.isort.cfg
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..108b91d
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,25 @@
+repos:
+- repo: https://github.com/psf/black
+  rev: "24.2.0"
+  hooks:
+    - id: black
+    - id: black-jupyter # for formatting Jupyter notebooks
+
+- repo: https://github.com/pycqa/isort
+  rev: 5.13.2
+  hooks:
+    - id: isort
+      name: isort (python)
+      args: ["--profile=black"]
+
+- repo: https://github.com/asottile/seed-isort-config
+  rev: v2.2.0
+  hooks:
+    - id: seed-isort-config
+
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.6.0
+  hooks:
+    - id: check-yaml
+    - id: end-of-file-fixer
+    - id: trailing-whitespace
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0ad25db
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,661 @@
+                    GNU AFFERO GENERAL PUBLIC LICENSE
+                       Version 3, 19 November 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU Affero General Public License is a free, copyleft license for
+software and other kinds of works, specifically designed to ensure
+cooperation with the community in the case of network server software.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+our General Public Licenses are intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  Developers that use our General Public Licenses protect your rights
+with two steps: (1) assert copyright on the software, and (2) offer
+you this License which gives you legal permission to copy, distribute
+and/or modify the software.
+
+  A secondary benefit of defending all users' freedom is that
+improvements made in alternate versions of the program, if they
+receive widespread use, become available for other developers to
+incorporate.  Many developers of free software are heartened and
+encouraged by the resulting cooperation.  However, in the case of
+software used on network servers, this result may fail to come about.
+The GNU General Public License permits making a modified version and
+letting the public access it on a server without ever releasing its
+source code to the public.
+
+  The GNU Affero General Public License is designed specifically to
+ensure that, in such cases, the modified source code becomes available
+to the community.  It requires the operator of a network server to
+provide the source code of the modified version running there to the
+users of that server.
Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. 
A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+ + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. 
+ + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. 
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published
+    by the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If your software can interact with users remotely through a computer
+network, you should also make sure that it provides a way for users to
+get its source.  For example, if your program is a web application, its
+interface could display a "Source" link that leads users to an archive
+of the code.  There are many ways you could offer source, and different
+solutions will be better for different programs; see section 13 for the
+specific requirements.
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU AGPL, see
+<https://www.gnu.org/licenses/>.
diff --git a/README.md b/README.md
index b954a79..f0033da 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,146 @@
-# python-chebai-proteins
-Protein-related extension of the chebai framework
+
+# 🧪 ChEB-AI Proteins
+
+`python-chebai-proteins` is a repository for protein prediction and classification, built on top of the [`python-chebai`](https://github.com/ChEB-AI/python-chebai) codebase.
+
+
+## 🔧 Installation
+
+
+To install, follow these steps:
+
+1. Clone the repository:
+```
+git clone https://github.com/ChEB-AI/python-chebai-proteins.git
+```
+
+2. Install the package:
+
+```
+cd python-chebai-proteins
+pip install .
+```
+
+## 🗂 Recommended Folder Structure
+
+To combine configuration files from both `python-chebai` and `python-chebai-proteins`, structure your project like this:
+
+```
+my_projects/
+├── python-chebai/
+│   ├── chebai/
+│   ├── configs/
+│   └── ...
+└── python-chebai-proteins/
+    ├── chebai_proteins/
+    ├── configs/
+    └── ...
+```
+
+This setup enables shared access to data and model configurations.
+
+
+
+## 🚀 Training & Pretraining Guide
+
+### ⚠️ Important Setup Instructions
+
+Before running any training scripts, ensure the environment is correctly configured:
+
+* Either:
+
+  * Install the `python-chebai` repository as a package using:
+
+    ```bash
+    pip install .
+    ```
+* **OR**
+
+  * Manually set the `PYTHONPATH` environment variable if working across multiple directories (`python-chebai` and `python-chebai-proteins`):
+
+    * If your current working directory is `python-chebai-proteins`, set:
+
+      ```bash
+      export PYTHONPATH=path/to/python-chebai
+      ```
+      or vice versa.
+
+    * If you're working within both repositories simultaneously or running into module-not-found errors, we **recommend configuring both directories**:
+
+      ```bash
+      # Linux/macOS
+      export PYTHONPATH=path/to/python-chebai:path/to/python-chebai-proteins
+
+      # Windows (use a semicolon instead of a colon)
+      set PYTHONPATH=path\to\python-chebai;path\to\python-chebai-proteins
+      ```
+
+> 🔎 See the [PYTHONPATH Explained](#-pythonpath-explained) section below for more details.
+
+
+### 📊 SCOPe hierarchy prediction
+
+Assuming your current working directory is `python-chebai-proteins`, run the following command to start training:
+```bash
+python -m chebai fit --trainer=../configs/training/default_trainer.yml --trainer.callbacks=../configs/training/default_callbacks.yml --trainer.logger.init_args.name=scope50 --trainer.accumulate_grad_batches=4 --trainer.logger=../configs/training/wandb_logger.yml --trainer.min_epochs=100 --trainer.max_epochs=100 --data=configs/data/scope/scope50.yml --data.init_args.batch_size=32 --data.init_args.num_workers=10 --model=../configs/model/electra.yml --model.train_metrics=../configs/metrics/micro-macro-f1.yml --model.test_metrics=../configs/metrics/micro-macro-f1.yml --model.val_metrics=../configs/metrics/micro-macro-f1.yml --model.pass_loss_kwargs=false --model.criterion=../configs/loss/bce.yml --model.criterion.init_args.beta=0.99
+```
+
+The same command can be used for **DeepGO** by changing the data config path, as shown in the example below.
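+
+For illustration, only the `--data` flag changes; the config filename below is an assumption — check `configs/data/` in this repository for the actual DeepGO config:
+
+```bash
+# identical to the SCOPe command above except for --data;
+# "configs/data/deepgo/deepgo.yml" is a placeholder filename,
+# and "..." stands for the unchanged flags from the SCOPe example
+python -m chebai fit ... --data=configs/data/deepgo/deepgo.yml ...
+```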
+
+
+## 🧭 PYTHONPATH Explained
+
+### What is `PYTHONPATH`?
+
+`PYTHONPATH` is an environment variable that tells Python where to search for modules that aren't installed via `pip` and aren't in your current working directory.
+
+### Why You Need It
+
+If your config refers to a custom module like:
+
+```yaml
+class_path: chebai_proteins.preprocessing.datasets.scope.scope.SCOPe50
+```
+
+...and you're running the code from `python-chebai`, Python won't know where to find `chebai_proteins` (which lives in the other repository, `python-chebai-proteins/`) unless you add it to `PYTHONPATH`.
+
+
+### How Python Finds Modules
+
+Python looks for imports in this order:
+
+1. Current directory
+2. Standard library
+3. Paths in `PYTHONPATH`
+4. Installed packages (`site-packages`)
+
+You can inspect the full search path:
+
+```bash
+python -c "import sys; print(sys.path)"
+```
+
+
+### ✅ Setting `PYTHONPATH`
+
+#### 🐧 Linux / macOS
+
+```bash
+export PYTHONPATH=/path/to/python-chebai-proteins
+echo $PYTHONPATH
+```
+
+#### 🪟 Windows CMD
+
+```cmd
+set PYTHONPATH=C:\path\to\python-chebai-proteins
+echo %PYTHONPATH%
+```
+
+> 💡 Note: This is temporary for your terminal session. To make it permanent, add it to your system environment variables.
diff --git a/chebai_proteins/__init__.py b/chebai_proteins/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/chebai_proteins/preprocessing/__init__.py b/chebai_proteins/preprocessing/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/chebai_proteins/preprocessing/bin/protein_token/tokens.txt b/chebai_proteins/preprocessing/bin/protein_token/tokens.txt
new file mode 100644
index 0000000..c31c5b7
--- /dev/null
+++ b/chebai_proteins/preprocessing/bin/protein_token/tokens.txt
@@ -0,0 +1,21 @@
+M
+S
+I
+G
+A
+T
+R
+L
+Q
+N
+D
+K
+Y
+P
+C
+F
+W
+E
+V
+H
+X
diff --git a/chebai_proteins/preprocessing/bin/protein_token_3_gram/tokens.txt b/chebai_proteins/preprocessing/bin/protein_token_3_gram/tokens.txt
new file mode 100644
index 0000000..534e5db
--- /dev/null
+++ b/chebai_proteins/preprocessing/bin/protein_token_3_gram/tokens.txt
@@ -0,0 +1,8359 @@
+MAT
+ATP
+TPG
+PGA
+GAS
+ASS
+SSA
+SAR
+ARD
+RDE
+DEF
+EFV
+FVY
+VYM
+YMA
+MAK
+AKL
+KLA
+LAE
+AEQ
+EQA
+QAE
+AER
+ERY
+RYE
+YEE
+EEM
+EMV
+MVE
+VEF
+EFM
+FME
+MEK
+EKV
+KVA
+VAK
+AKA
+KAV
+AVD
+VDK
+DKD
+KDE
+DEL
+ELT
+LTV
+TVE
+VEE
+EER
+ERN
+RNL
+NLL
+LLS
+LSV
+SVA
+VAY
+AYK
+YKN
+KNV
+NVI
+VIG
+IGA
+GAR
+ARR
+RRA
+RAS
+ASW
+SWR
+WRI
+RII
+IIS
+ISS
+SSI
+SIE
+IEQ
+EQK
+QKE
+KEE
+EES
+ESR
+SRG
+RGN
+GND
+NDD
+DDH
+DHV
+HVS
+VSL
+SLI
+LIR
+IRD
+RDY
+DYR
+YRS
+RSK
+SKI
+KIE
+IET
+ETE
+TEL
+ELS
+LSD
+SDI
+DIC
+ICD
+CDG
+DGI
+GIL
+ILK
+LKL
+KLL
+LLD
+LDT
+DTI
+TIL
+ILV
+LVP
+VPA
+PAA
+AAA
+AAS
+ASG
+SGD
+GDS
+DSK
+SKV
+KVF
+VFY
+FYL
+YLK
+LKM
+KMK
+MKG
+KGD
+GDY
+DYH
+YHR
+HRY
+RYL
+YLA
+AEF
+EFK
+FKS
+KSG
+SGQ
+GQE
+QER
+ERK
+RKD
+KDA
+DAA
+AAE
+AEH
+EHT
+HTL
+TLT
+LTA
+TAY
+YKA
+KAA
+AAQ
+AQD
+QDI
+DIA
+IAN
+ANS
+NSE
+SEL
+ELA
+LAP
+APT
+PTH
+THP
+HPI
+PIR
+IRL
+RLG
+LGL
+GLA
+LAL
+ALN
+LNF
+NFS
+FSV
+SVF
+FYY
+YYE
+YEI
+EIL
+ILN
+LNS
+NSP
+SPD
+PDR
+DRA
+RAC
+ACN
+CNL
+NLA
+LAK
+AKQ
+KQA
+QAF
+AFD
+FDE
+DEA
+EAI
+AIA
+IAE
+AEL
+ELD
+DTL
+TLG
+LGE
+GEE
+ESY
+SYK
+YKD
+KDS
+DST
+STL
+TLI
+LIM
+IMQ
+MQL
+QLL
+LLR
+LRD
+RDN
+DNL
+NLT
+LTL
+TLW
+LWT
+WTS
+TSD
+SDM
+DMQ
+MQD
+QDD
+DDV
+DVA
+VAD
+ADD
+DDI
+DIK
+IKE
+KEA
+EAA
+AAP
+APA
+AAK
+AKP
+KPA
+PAD
+ADE
+DEQ
+EQQ
+QQS
+MSD
+SDT
+DTV
+EEL
+ELV
+LVQ
+VQR
+QRA
+RAK
+RYD
+YDD
+DDM
+DMA
+MAA
+AAM
+AMK
+MKK
+KKV
+KVT
+VTE
+TEQ
+EQG
+QGQ
+QEL
+LSN
+SNE
+NEE
+NVV
+VVG
+VGA
+RRS
+RSS
+SSW
+WRV
+RVI
+VIS
+QKT
+KTE
+TEG
+EGS
+GSE
+SEK
+EKK
+KKQ
+KQQ
+QQL
+QLA
+AKE
+KEY
+EYR
+YRV
+RVK
+KVE
+VEQ
+EQE
+ELN
+LND
+NDI
+ICQ
+CQD
+QDV
+DVL
+VLK
+LDE
+EFL
+FLI
+LIV
+IVK
+VKA
+KAG
+AGA
+GAA
+AES
+ESK
+DYY
+YYR
+YRY
+AEV
+EVA
+VAS
+ASE
+SED
+EDR
+RAA
+AAV
+AVV
+VVE
+VEK
+EKS
+KSQ
+SQK
+QKA
+KAY
+AYQ
+YQE
+QEA
+EAL
+ALD
+LDI
+IAK
+AKD
+KDK
+DKM
+KMQ
+MQP
+QPT
+LNT
+NTP
+TPE
+PEH
+EHA
+HAC
+ACQ
+CQL
+FDD
+DDA
+DAI
+TLN
+LNE
+NED
+EDS
+DSY
+SDV
+DVG
+GAE
+AED
+EDQ
+DQE
+QEQ
+QEG
+EGN
+GNQ
+NQE
+EAG
+AGN
+MAS
+ASA
+SAE
+LSR
+SRE
+REE
+EEN
+ENV
+NVY
+AKT
+KTV
+TVD
+VDS
+DSE
+SEE
+EEG
+EGR
+GRG
+GNE
+DRV
+RVT
+VTL
+LIK
+IKD
+KDY
+YRG
+RGK
+GKI
+LTK
+TKI
+KIC
+LLE
+LET
+ETH
+THL
+HLV
+VPS +PSS +SST +STA +TAP +APE +PES +FKT +KTG +TGA +AEN +ENT +NTM +TMV +MVA +IAL +ALA +ACS +CSL +SLA +AIS +ISE +TLS +LSE +DIS +EDP +DPA +PAE +AEE +EEI +EIR +IRE +REA +EAP +APK +PKR +KRD +RDS +DSS +SSE +SEG +EGQ +LES +ESH +SHL +LLH +LHD +HDN +PKH +KHD +HDL +DLS +MST +STR +TRE +VDV +DVE +SVE +SKG +KGN +EDH +HVA +VAI +AII +IIK +IES +ESE +LSK +LNV +NVL +VLE +LEA +EAH +AHL +HLI +LIP +IPS +PSA +SAS +ASP +SPA +FKA +RKE +EST +TLV +LVA +YKS +KSA +ASD +IAT +ATA +TAE +DMT +MTD +TDE +AGD +GDE +DEI +EIK +EAS +ASK +SKP +KPD +PDG +DGA +MAE +RED +EDC +DCV +CVF +VFL +FLS +SKL +EQS +QSE +SER +YDE +DEM +MVQ +VQY +QYM +YMK +MKQ +KQV +QVA +VAA +AAL +NTE +IGS +GSR +SRR +IIT +ITS +TSL +SLE +LEQ +KEQ +QAK +AKG +NDK +DKH +KHV +HVE +VEI +EII +IKG +KGY +GYR +YRA +AKI +IED +EDE +AKY +KYC +YCD +CDD +LKV +KVI +VIK +KEN +ENL +LLP +LPN +PNA +NAS +AST +STS +TSE +SES +FYK +YKK +KKM +KME +MEG +EGD +RYY +YYA +YAE +EFT +FTV +VDE +DEK +EKR +KRQ +RQE +QEV +ADK +DKS +KSL +LAA +AAY +AYT +YTE +TEA +EAT +ATE +TEI +EIS +ISN +SNA +NAD +ADL +DLA +EIM +IMN +MND +NDA +DAD +DKA +KAC +DDS +DSI +SIA +KLD +DEV +EVP +VPE +ESS +SSY +DTA +TAD +DEE +AAT +ATL +LGR +GRD +RDQ +DQY +QYV +YVY +VQF +QFM +MEQ +EQL +QLV +LVT +VTG +GAT +TPA +GSL +SLR +LRA +AAW +AWR +RIV +IVS +VSS +SRK +RKN +KND +NDE +DEH +EHV +SLV +LVK +VKD +VES +LSS +SSV +SVC +VCS +CSG +SGI +LDS +DSH +SAG +RYM +DER +RKT +KTA +TAA +EDT +DTM +TML +MLA +LAY +IAA +AAD +ADM +MAP +NSS +SSD +SDK +CNM +NMA +AFE +FEE +EEA +MQE +EQM +QMD +MDE +ATT +TTL +SRD +LVS +VSG +SGA +PAG +AGE +GEL +KNE +EEH +VET +SIC +ICS +ILR +LRL +RLL +SAT +TAS +TMI +MIA +IAY +VAV +AVA +EKA +CSM +SMA +MTM +TMD +MDK +KSE +VQK +KAK +MKA +AVT +QGH +GHE +HEL +TER +RNE +NEK +QQM +QMG +MGK +GKE +YRE +REK +EKI +IEA +EAE +ELQ +LQD +ICN +CND +NDV +LEL +ELL +LDK +DKY +KYL +YLI +IPN +NAT +ATQ +TQP +QPE +DYF +YFR +FRY +YLS +SEV +GDN +DNK +NKQ +KQT +QTT +TTV +TVS +VSN +SNS +NSQ +SQQ +QQA +QAY +EAF +FEI +ISK +SKK +KKE +KEM +EMQ +SPE +PEK +TAF +SEN +ENQ +NQG +QGD +DEG +GDA +DAG +GEG +EGE +GEN +LIL +LNA +TQA +SGE +ENK +CSD +ATH +THA +HAE +MTE +ERE +REN +ENN +NNV +VYK +VEA +EAM +ASM +SMD +MDV +VEL +TSI +NKG +KGA +EEK +EKL +KLE +LEM +EMI +MIK +IKT +KTY +TYR +RGQ +GQV +QVE +EKE +KEL +ELR +RDI +DIL +LEK +EKH +KHL +IPC +PCA +CAT +ATS +TSG +GES +YYK +YKM +EFA +FAT +ATG +TGS +GSD +SDR +DRK +ENS +NSL +LIA +IAM +AMN +NDL +DLP +LPP +PPT +ACR +CRL +RLA +AAF +MQA +EEV +EVD +VDP +DPN +NAG +GDG +DGE +GEP +EPK +PKE +EQI +QIQ +IQD +VED +DQD +DVS +MDD +DDR +DRE +EDL +DLV +LVY +VYQ +YQA +ESM +SMK +VAG +AGM +GMD +KGG +GGE +GED +EDK +DKL +KLK +KMI +MIR +REY +YRQ +RQM +QMV +ELK +KLI +LIC +ICC +CCD +CDI +ILD +LDV +VLD +IPA +AAN +ANT +NTG +TGE +TGN +NDR +AMT +ELP +MQG +EEQ +EQN +QNK +NKE +ALQ +DEN +MGD +GDR +REQ +LLQ +LQR +RAR +ARL +SAM +NEP +EPL +PLS +DRN +KTM +TMA +MAD +ADG +DGN +KKL +KVK +AYR +IEK +ELE +ETV +TVC +VCN +VLS +LSL +SLL +DKF +KFL +IKN +KNC +NCN +NDF +DFQ +FQY +QYE +YES +GEK +KKN +KNS +NSV +SVV +SEA +YKE +SKE +QMQ +EIQ +IQN +QNA +NAP +PEQ +QAC +ACL +CLL +LLA +SDQ +DQQ +QQD +QDE +VLA +ALL +KEH +EHM +HMQ +MVD +VDR +KAR +MKN +NVT +KTS +TSA +SAD +KKI +IEM +MVR +VRA +RAY +EAV +AVC +VCQ +LDN +DNY +NYL +NCS +CSE +SET +ETQ +TQY +VAT +KRA +RAT +ATV +TVV +AYS +YSE +AHE +HEI +LNY +NYS +YSV +ACH +CHL +HLA +DDD +DDG +DGG +GNN +MER +ERA +ASL +LIQ +IQK +YED +EDM +AFM +FMK +MKS +SAV +AVE +EKG +KGE +LSC +SCE +CEE +VGG +GGQ +GQR +RVL +QKS +KSN +KGP +GPE +PEV +EVK +VKE +LRG +RGV +GVC +VCD +CDT +TVL +VLG +GLL +GAG +DAE +SRV +RVF +TGD +GDD +DDK +DKK +KKR +KRI +IID +IDS +DSA +ARS +RSA +SAY +AMD +MDI +EMP +MPP +PTN +TNP +NPI +VFH +FHY +HYE +EIA +PEE +ISL +KTT +TTF +TFD 
+AMA +DLH +LHT +WTA +ADS +EGG +GEA +EEP +EPQ +PQS +EKT +ELI +ATC +TCM +CMK +QGA +GGR +GRR +SAW +KTD +TDT +DTS +KLQ +LQL +QLI +LRS +RSI +ICT +CTT +ANA +ATN +NPE +VAC +ACG +CGD +RKQ +QTI +TID +IDN +DNS +SQG +GAY +FDI +LNN +NNP +PEL +LAC +ACT +CTL +TLA +SDS +EEC +ECD +CDA +AEG +EGA +TIE +IEN +STV +DKE +MAQ +AQA +QAM +KSV +SVT +TET +ETG +TGV +GVE +ARK +LAR +ARE +RER +ERV +RVE +LRE +REI +EIC +ICY +CYE +YEV +EVL +IPK +PKA +KAS +ASN +SNP +DAR +ARN +RNT +NTV +VVD +VDD +DSQ +SQT +QTA +YQD +QDA +DAF +KGK +GKM +PDK +DTQ +TQG +AEP +PQE +GGD +DKN +NEL +AAC +ACM +RVV +VVS +AEK +QMA +MAR +EKF +ASQ +SQA +AAG +KKG +KGI +GIV +IVD +VDQ +DQS +QSQ +AEA +SQP +MPA +PAS +ASR +DSV +SVY +VYL +VEN +ENM +NMK +SSG +EAK +NES +ESQ +SQV +VAL +ALI +ICE +CED +EDI +ILS +SVL +SDH +DHL +LIT +SAQ +AQT +QTG +FAI +KRK +EAY +DAV +DLE +ETL +WTD +TDL +TEE +QQQ +QSS +SSQ +QAP +AQP +PTE +EGK +GKA +KAD +ADQ +MTR +VAE +NEN +ENH +NHV +HVK +VKK +KIK +EYK +YKC +KCK +CKV +LTD +TDI +ILE +LEV +GNP +NPR +PRK +SSL +IAV +DVH +VHN +HNM +NME +EKN +KNQ +NQD +QDG +DGD +DDQ +DQN +QNE +EPG +PGM +AFT +FTR +EDY +DYV +YVF +VFM +FMA +AQL +QLN +ENA +NAE +ETM +TMR +MRK +RKI +KIS +ISG +SGM +GME +KER +IGP +GPR +PRR +KEK +KGR +GRQ +RQK +QKP +KPN +NAK +AKR +RIE +QIR +IRV +RVY +VYR +QKI +LQE +EQF +QFV +FVP +VPR +PRS +RST +STN +TNA +ADA +DAK +AKV +AEY +EYS +YSS +KIA +IAG +AGS +GSA +SAL +NAY +AYN +YNS +NSA +SAF +ISQ +QLP +ILA +LAS +ACE +CEL +RKA +KAF +FDA +AAI +AIT +ITD +DLD +KLT +LTE +NLN +LNL +NLW +LWV +WVT +VTD +TDS +DDN +DNA +NEA +ALS +VLN +DNF +NFL +NCG +CGE +GET +TQH +QHE +HES +KSY +SYS +DDE +MVS +VSQ +QVV +VVA +EKP +KPQ +PQL +KKA +AGC +GCN +CNS +NSH +SHG +HGQ +GQD +QDS +SYF +YFL +FLG +LGW +GWQ +WQE +QEY +EYE +YEK +KNP +NPF +PFD +FDP +DPV +PVS +NPS +PSG +GII +IIQ +IQM +MGL +NQL +QLS +LSF +SFD +FDL +DLL +LEE +EEW +EWL +WLE +NPH +PHA +HAL +ALG +GLR +LRR +RRE +REG +GGG +GGA +ASV +VFR +FRE +REL +ALF +LFQ +FQD +QDY +YHG +HGL +GLP +LPA +PAF +AFK +FKN +KNA +NAL +ARF +RFM +FMS +MSE +SEQ +EQR +QRG +RGY +GYK +YKV +KVV +VVF +VFD +DPS +PSN +SNI +NIV +IVL +VLT +TAG +SAN +ANE +ALM +LMF +MFC +FCL +CLA +LAD +ADH +DHG +HGD +AFL +IPT +PTP +TPY +PYY +YYP +YPG +PGF +GFD +FDR +DRD +RDL +DLK +LKW +KWR +WRT +RTG +AEI +EIV +IVP +VPV +PVH +VHC +HCA +CAS +ANG +NGF +GFR +FRV +VTR +TRP +RPA +PAL +LDD +DAY +YRR +RAQ +AQK +QKR +KRR +RRL +RLR +LRV +VKG +KGV +GVL +VLI +ITN +NPL +PLG +LGT +GTA +SPR +PRA +RAD +ETI +TIV +VDF +DFV +FVA +GIH +IHL +LIS +ISD +SDE +EIY +IYA +YAG +AGT +AFA +FAE +EPP +PPA +AGF +GFV +FVS +VSA +ALE +EVV +AGR +RDG +GAD +ADV +VSD +RVH +VHV +HVV +VVY +VYS +YSL +SLS +SKD +KDL +DLG +LPG +RVG +GAI +AIY +IYS +YSA +NAA +SAA +ATK +TKM +KMS +MSS +SSF +SFG +FGL +GLV +QTQ +QYL +YLL +LLG +LGD +RDF +DFT +TRS +RSY +SYV +YVA +NKR +RRI +RIK +ERH +RHD +HDQ +DQL +LVD +VDG +DGL +EIG +IGI +GIG +IGC +GCL +CLP +LPS +AGL +GLF +LFC +FCW +CWV +WVD +VDM +DMS +MSH +HLM +LMR +MRS +RSR +SRS +RSF +SFA +FAG +GEM +EME +MEL +ELW +LWK +WKK +VFE +FEV +EVG +VGL +GLN +LNI +NIS +ISP +SPG +PGS +GSS +SSC +SCH +CHC +HCR +CRE +REP +PGW +GWF +WFR +RVC +VCF +CFA +FAN +ANM +NMS +MSA +SAK +KTL +TLD +VAM +AMQ +MQR +QRL +SFV +FVD +TGG +ALR +AVP +PVR +VRS +RSV +SVS +VSC +SCP +CPL +PLA +LAI +AIK +IKW +KWA +WAL +RLT +LTP +TPS +PSI +IAD +ADR +KAE +MAY +YQG +QGI +GID +IDL +LST +STK +TKA +HGE +YFD +FDG +DGW +GWK +WKA +AYD +YDT +DTN +DLR +LRH +RHN +HNR +NRG +RGG +GGV +GVI +VIQ +SLD +LDL +DLI +LIE +IEE +EWS +WSK +SKN +KNH +NHP +HPE +PEA +ASI +CTP +PEG +EGV +GVS +SQF +QFK +FKR +RIA +ANF +NFQ +LPE +PEF +EFR +FRK +KAM +AQF +FMG +MGQ +QVR +VRG +GGK +KAT +ATF +DPD +VVM +VMS +MSG +SGG +GAQ +AQE +QET +LAF +AFC +LAN +ANP +NPG +PGE 
+FLV +VPT +YPA +RDC +DCC +CCW +CWR +WRS +RSG +GIK +IKL +LPI +PIE +IEC +ECH +CHS +HSF +SFN +FND +DFR +FRL +TKE +ALV +YDG +RRQ +RQG +GIS +ISV +SVK +ILI +GTI +TIT +TDR +RDT +LAM +AML +LAT +TFA +TEH +EHR +HRV +VHL +LVC +CDE +GSV +VFA +PEY +EYV +YVS +VSI +EVI +VIE +IER +ERD +RDV +DVP +VPW +PWC +WCN +CNR +NRD +LIH +IHV +KDF +DFG +VGI +IIY +YSY +SYN +YND +AAR +RRM +RMS +QYF +FLA +ARM +RML +MLS +EEF +EFI +FIG +IGR +GRF +RFL +FLQ +QES +SKC +KCR +RLV +VAR +ARH +RHE +HER +ERF +RFT +FTS +SGL +REV +CLR +GNA +LFS +FSW +SWM +WMD +MDL +MLR +LWR +VIV +IVH +VHQ +HQV +QVK +VKL +KLN +NVS +VSP +PGT +GTS +TSF +SFH +FHC +VCH +CHA +HAN +NMD +DET +TME +MEV +GRI +RIH +IHD +HDF +FVR +VRQ +RQH +QHQ +HQQ +QQR +QRR +RRV +ERW +RWA +WAA +ANR +NRQ +RQL +QLR +RLS +SLP +LPH +PHH +HHH +HHL +HLS +LSP +PAH +SSP +SPL +SPQ +QSP +SPM +PMV +KQL +TKV +VTS +TSN +SNG +NGH +GHG +GWE +WEE +EEY +NPY +PYD +NPN +PNG +NGM +GMI +MIQ +QLC +LCF +CFD +ESW +SWL +WLT +TKN +NPD +PDA +SLK +LKR +KRN +RNG +NGQ +GQS +QSI +SIF +IFR +HGM +GMP +MPE +FKK +MEE +IRG +GNR +NRV +VTF +DPK +PKK +KIV +GST +NET +TLM +PGD +FLL +LPT +VPI +PIH +IHC +HCS +CSS +SSS +SSN +GFQ +FQI +QIT +ITE +TES +ESA +LQQ +YQQ +QAQ +QKL +VLV +VTN +TAL +ALT +LTR +TRR +LLV +DFI +FIT +TSK +KNI +NIH +YSG +SGT +GTM +TMF +MFG +FGF +GFE +FEQ +QFI +FIS +SVM +VMD +LKD +LED +DTE +TEV +EVS +VSK +SKR +KRV +YSN +SND +MIV +LSA +KKF +KFT +TSQ +SQY +YLE +NQK +KRL +RLK +LKS +KSR +SRQ +RQR +GLE +AGI +GIT +ITC +TCL +RSN +DMR +MRH +RHL +HLL +TNT +NTF +TFE +FEA +DLW +IVY +VYN +YNV +NVK +HCT +CTE +TEP +ALK +LKT +KTF +TFV +FVE +STD +TDC +DCG +CGR +GRM +RMI +MIS +ISR +SSH +SHE +ERL +LRK +RKK +KKT +SNW +NWV +WVF +RVS +VSW +SWT +RVP +VPD +PDE +VAF +TEK +KQD +QDL +DLN +IAS +DGH +AYE +ENP +PFH +FHP +PID +IDR +DRP +RPD +DGV +LCG +GDL +DLM +RKW +KWV +WVL +LKH +KHP +CTS +GVN +VNQ +NQF +QFS +FSD +IAI +AIF +IFQ +FRQ +RQA +QAV +AKF +KFM +KTR +TRN +RNN +NNK +NKV +VKF +KFD +DRI +IVM +GAH +HET +TVA +DGF +GFL +LRW +RWR +VNL +NLV +PVT +VTC +TCH +HSS +GFK +FKI +KIT +ITV +YEN +NAR +RKS +NIP +IPV +PVK +KGL +GTT +LDR +REC +ECL +CLK +LVN +VNF +NFT +FTN +TND +DKG +YAA +TFG +FGQ +SEF +EIE +DCN +IHI +HIV +KDM +DMG +PGL +VVQ +VQI +QIA +IAR +RKM +QHL +AKM +KML +FIR +RES +KLR +RHA +EIT +ITT +TTG +TGL +GLD +LDG +GLG +LGI +IGW +GWL +WLK +LKA +LFL +FLW +LWM +LRN +LLK +TAT +FDS +PGG +GGS +GSF +HCH +CHE +HEP +MDH +DHK +HKT +MET +ETA +LER +ERI +RIR +VFT +SQL +QLE +EEE +EET +ETK +TKP +KPM +PMA +TTM +TMM +MMA +AKK +KKK +KKC +KCW +CWQ +WQS +QSN +SNL +NLR +SFS +DTR +RRF +RFD +GFF +FFS +FSP +SPH +PHS +HSP +SPV +PVP +VPP +PPS +PSP +PLV +LVR +RKV +NAH +AHG +NGI +ETW +TWL +WLA +AKN +GLK +LKK +KKD +KDG +DGQ +IFK +FKE +KAL +PSK +MLT +GTV +TVF +VFG +VSV +KNL +NLE +LEN +VHI +MVV +TST +STY +TYL +YLD +LKI +KIR +IRQ +QKK +KLV +VYD +YDV +DVK +MKR +LKE +YVE +DSR +SKS +KSS +SHD +HDR +IKS +RKR +KRT +RTV +MHG +HGS +GSG +SGH +GHS +HSL +SLT +LTG +GAP +APH +PHQ +HQI +QIP +IPP +PPP +PPR +PRT +RTQ +GQQ +TAN +ANQ +DKI +KID +IDP +DPF +FHN +HNK +KRG +RGT +TSR +LRI +RIN +INN +NNS +SSR +SRY +RYN +NVD +VQL +KDT +NEQ +EQP +QPA +LVI +VQC +QCQ +CQH +QHV +HVF +FDF +DFY +FYD +YDP +PVA +VAQ +QLK +LKC +CKE +KEI +IKR +LID +IDH +DHI +HIT +TKG +AIV +IVE +TIY +IYP +PAV +AVI +IKM +KMV +NIF +VLP +PSE +ENC +NCE +CEF +EFD +DPE +EED +DEP +EPT +PTL +TLE +SWP +WPH +PHL +HLQ +VYE +YEL +ELF +FLR +LRF +FLE +ESP +PDF +FQA +QAS +SIG +IGK +GKK +KKY +KYI +YID +IDQ +DQR +QRF +RFV +FVL +DLF +LFD +DPR +PRE +DFL +FLK +VLH +LHR +HRI +RIY +IYG +YGK +GKF +RAF +AFI +IRK +RKH +KHI +HIN +NNM +NMF +MFL +YET +ETD +DSF +FNG +NGV +GVG +VGE +LEI +ILG +LGS +GSI +SII +IIN +ING +GFA +FAL +ALP +LPL +PLK +LKQ +KQE 
+QEH +EHK +HKV +KVL +VLL +PLH +LHK +HKP +KPK +PKC +KCL +CLS +SLY +LYH +YHA +HAQ +AYC +YCV +CVV +FIE +EKD +TPQ +PQV +QVF +LKF +KFW +FWP +WPR +RTC +TCS +SSK +KEV +EVM +VMF +GEV +EVE +DII +IIE +IEP +EPE +KII +DPL +PLF +LFR +AKC +KCV +CVS +PHF +HFQ +FQV +RAL +ALY +LYF +YFW +FWN +WNN +NNE +NEY +EYI +YIL +TSS +LVM +VMP +MPI +PIM +IMF +MFP +FPA +LYR +YRI +RIS +EHW +HWN +WNQ +NQT +IVA +TFM +MEM +EMN +MNG +NGK +GKL +KLF +LTS +TYK +YKG +GER +EKQ +KQR +QRE +KDR +RDA +AFW +FWK +MEA +LNP +NPP +EVT +VTP +PSL +SLF +LFP +FPE +TDY +DYL +DGP +GPN +PNM +NMT +MTP +TPL +PLP +LPV +AGG +GDK +KSP +SPS +PSV +VVK +KKS +STG +ETT +TTT +TTP +PAK +TKL +KLP +STP +TPT +PTS +TSP +GLS +PPD +DKV +KVD +GFS +FSR +RSL +ARP +RPR +RSH +SHS +QFR +RYQ +YQS +SNQ +NQQ +QQE +PLL +KDV +ELH +LHE +RKL +LAQ +AQC +QCG +CGV +GVM +MFD +FLD +LDC +CVA +LKG +VKR +LVE +VEC +ECV +CVG +VGS +TRG +EPV +PVY +VYP +YPD +PDI +IIR +IRM +SVN +VNI +FRT +RTL +TLP +EPN +PNL +LEP +EPS +PSW +YEF +EFF +FFL +FQP +QPS +KRY +RYV +YVD +DQK +QKF +KFV +VLM +LML +MLL +EYL +KTI +ILH +VYG +AYI +YIR +KQC +QCN +CNH +NHI +HIF +IFL +RFI +FIY +IYE +LEH +EHF +HFN +GVA +HKQ +KQF +QFL +VRV +IPL +LHS +HSV +VKS +FHA +DAT +HVI +VIR +RGL +LKY +KYW +YWP +WPK +PKT +KTC +TCT +CTQ +TQK +DVI +PSQ +FVK +VKI +KIQ +IQE +QEP +LFK +FKQ +ARC +RCV +EDN +DNC +NCH +CHT +HTV +AVF +FGT +GTL +TLY +LYQ +YQV +QVS +LIY +IYN +ASY +YKL +QQK +KAQ +ERQ +WRG +RLQ +LQG +QGT +GTQ +GAK +APV +PRP +RPT +MPY +PYK +KEP +PPK +PKV +KCT +CTA +TAK +KPS +SGK +GKD +EAQ +QPQ +PQP +PQA +AQS +QPP +SNK +KRP +RPS +NST +TPP +PTQ +TQL +IKY +KYS +GGP +GPQ +PQI +QIV +ERR +RQS +SRF +RFN +FNL +NLS +KNR +NRE +LQK +DSP +SPT +TQE +LFI +FIQ +LRQ +RQC +QCC +CCV +CVL +VLF +SDP +SDL +KFK +RAG +NEM +VEY +YIT +ITH +THS +HSR +DVV +VVT +YPE +VTM +MFS +NLF +NPT +PTG +AWP +QPN +PNI +NIA +IRR +RQI +QIN +INH +IFY +FYR +YRF +EHH +HHN +HNG +GIA +HKM +KMF +VYH +YHP +HPQ +KES +PVI +IVG +KTH +SPK +FLN +EFS +FSK +KVM +VME +MEP +LYY +YYW +YWN +YIM +IMS +MSL +SDN +ARV +YRN +RNS +NSK +KSH +SHW +WNK +NKT +TIH +IHG +GLI +YNA +LFM +MNQ +DDC +DCT +TQQ +QQY +QYK +KQK +QKG +RFR +FRM +RMK +MKE +EMW +MWQ +WQK +RLN +NPQ +PQY +QYP +YPM +PMF +MFR +FRA +RAP +APP +PPL +PPV +YSM +SME +ETP +PTA +DIQ +IQL +AVQ +VQM +QML +MLK +KDI +IKK +RRK +LPQ +PQD +DVY +VYT +YTI +TIK +IKA +AHK +HKR +RAE +FLT +SQE +MMR +MRG +RGF +RLI +STT +TTS +KKP +HGT +TTH +GSK +KST +TTE +GKQ +KQS +QSG +SGS +SVP +QGK +GKH +KHH +HHS +SKT +KTK +TKT +VSR +TKK +RKG +KGQ +QSK +SKQ +QQP +SQS +QKQ +KQG +QGS +AIM +MNP +TPV +PVL +TVT +VTK +TKD +KDD +DHA +HAH +AHP +HPT +TLL +LGA +GAV +AVS +SPI +PIS +TAV +ENG +NGN +GNS +NSN +SNN +NNN +NMN +MNI +NIN +INT +NTS +SNT +NTQ +TQD +DAN +ANH +NHA +HAS +SID +IDI +DIP +IPR +SFE +FER +RLP +PTK +PDT +DTD +KTP +PQR +QRH +RHS +RFE +FEP +PSR +RYT +YTP +PLT +PNF +NFN +FNE +NEV +RIP +FIA +DQC +CNT +DFN +NDP +PSF +IQG +KRS +IEF +TNR +NRF +FTY +TYT +YTN +TNE +EMY +MYA +YAH +AHV +VVN +VNM +MFK +KIN +INL +FRP +RPI +PIP +PVN +VNP +NPV +PVG +VGD +GDI +DIY +IYD +DED +VNE +LAW +PHM +AVY +FNH +NHQ +KQY +QYI +QDF +FIL +DIR +DCL +TLH +SFI +RSM +SMN +MNN +NNI +LQF +KFN +VRI +RIL +KVR +VRC +RCL +YCI +CIV +IVQ +KDP +LLT +VMG +LRY +RYW +PKI +INS +NEI +DIF +IFE +PLE +LEF +FIK +IKV +VEV +VPL +LFV +FVQ +KCI +CIS +LSY +SYW +EYF +NLC +LCI +CIE +VIL +ILP +PII +IIF +IFP +LYE +NGE +SIS +DPY +PYM +YML +MLV +QAI +AIN +NSG +GSW +SWN +WNR +NRA +RAI +AIH +IHA +HAM +MAF +KIF +ETN +VLY +CNA +LYL +KET +QRK +KVQ +ENW +NWS +YVK +VKN +NND +KDQ +QYT +NSF +FNT +NTA +NNT +NTL +ENE +END +NDC +DCD +CDS +SEI +IKQ +KQI +QIF +IFG +FGK +LPR +RKP +SHN +HND +NDS +DSN +VNS +NSY +SYY +YYI +YIP +PNS +NGA +GAN +NGT +TVI +VIA 
+IAP +APS +SNR +NRT +RTN +TNQ +NQV +QVN +VNG +GVY +YEA +SFR +FRD +KLS +LSM +SMC +MCC +RQT +QTL +VDY +DYI +YIA +VST +SDA +QEI +RTF +TFP +FPS +NHE +KIL +DVD +EPA +PAW +LQV +LLL +PMT +TDA +RYI +DHS +FMV +MVH +VHR +HRP +RPF +PFI +KAI +FIF +FET +KHN +HKL +IRA +RPK +KCA +AYH +YHQ +SYC +DFK +FKL +ADT +WPV +TNS +QAA +EFQ +FQR +QRC +RCM +CMV +MVP +CLN +SHF +LWN +NDH +HIR +IRN +NLI +ITQ +TQN +QNH +NHK +VIM +IMP +PIV +IVF +VFP +PAM +AME +NTR +RGH +GHW +NQA +VQS +QSL +NVR +VRK +VMA +AET +TDQ +DQI +QIL +ILF +DEC +KFQ +FQE +QED +EAN +KRE +ATW +TWK +WKL +AVL +PRF +RFS +FSS +TGK +GKT +LTC +TCN +CNK +NKA +SRM +RMV +VDA +NGP +GPF +PFQ +QPV +PVV +VVL +LHI +QEK +KWK +WKE +SEM +THN +NRN +RNV +VIT +EPI +PIY +VVH +VHM +HMF +MFA +FAV +AVN +VLQ +HKI +MAL +KIM +IME +THW +QQF +EAW +AWV +WVK +KAN +YTV +TVY +YSQ +STM +TMS +MSI +SIP +TDG +GPL +LFE +FED +EDV +DVQ +TVK +AHQ +HQA +QKD +RPL +QDP +DPH +PHT +HTK +AHC +CRA +SQD +DGR +MSV +ATD +TDD +DAL +LYP +YPI +PIA +IDE +DVT +TLR +NSI +SIR +STI +TIA +LGV +VER +ERT +RTR +IQF +LVL +QLG +LGN +GNF +FTP +LVG +GPD +PDH +HVH +HCL +VVR +VRD +RDK +ESL +KHS +HFV +VPM +PML +GDW +DWF +WFT +SRT +RTS +SAC +CGL +YPR +PRV +PAI +KSM +SMF +TLC +LCR +CRD +RDD +DDT +DTP +TPM +VRR +KLG +GEF +FAK +FEK +IEG +EGL +GLH +LHV +HVD +EQD +SVR +VRL +SAI +IAF +AFG +ANK +NKK +PIL +IEL +KSW +RVR +VRY +YMV +IEI +QNV +DMD +MDT +DTT +NMY +MYT +TNL +EVR +RCA +CAA +TQR +QEF +NLP +PED +DKR +RQN +QNI +NII +IIC +LLN +NVA +LAG +AGV +IMG +APL +PLI +LIG +EQT +QTV +VSE +IYM +YMQ +NDQ +DQT +QTP +KVN +EDG +DGK +GKW +FMP +MPL +LGQ +FFD +PLC +LCL +LNW +NWL +TDH +VFS +FSI +IMK +LTQ +KFG +FGG +GQW +QWA +WAS +TNI +VPK +PKM +MQK +TNY +YLQ +QRM +RMT +MTC +CLF +MTQ +EDD +VPN +PNV +VRF +FNA +AKS +RIG +GKN +PST +VKP +KPL +LGK +DSD +SDF +DFD +FDV +DVR +RYF +YFS +FSE +SLG +SVD +DSL +LKN +SIK +RSE +IPF +PFL +FAM +AMY +MYL +LRT +EHS +HSA +EIH +VVP +TLQ +VCY +CYP +VTQ +RAN +NFR +KLC +LCQ +NKL +TEY +KSD +NFV +LAV +EAC +ACV +IAQ +VEH +EHL +QCA +VDL +DLQ +AVG +VGP +PEI +ITR +TRV +RVD +AFQ +DFC +FCA +CAN +ANL +NLD +QVQ +QII +IIL +SIL +LPY +PYV +YVR +PNP +PHV +SVI +MLG +YQT +ECP +CPE +CVN +VND +GIQ +IQQ +LSQ +SKW +IEY +EYM +YMP +AGQ +GQL +FDQ +GLC +LCM +CMG +MGW +WLN +HVY +VYA +YAI +AIR +LNM +QFG +FGA +APW +PWA +WAE +IIP +IPM +PMI +MIL +MSR +SRN +RNK +NKN +KNY +YLH +HRM +EVC +VCG +CGT +GTD +DIT +TTK +PTV +ADP +VAN +ANV +FNV +SPF +VID +IDA +DAQ +AQV +KPT +NTD +TDV +VKH +KHF +HFA +FAA +LPF +GTF +TFT +FTT +YVH +ISH +HEH +PSD +AHF +AVK +RQY +FRN +LCS +SDD +DNV +FSN +MPT +FTE +ITK +FQN +QNL +NLM +LMK +MKD +KDC +DCE +CEA +ASH +SHK +KEF +EFC +FCE +CEN +ADC +DCR +MSQ +SQI +LPC +PCI +CIK +NQH +KDN +DNT +NTI +IEH +GIR +EDA +AKW +SLC +CMA +MAW +AWL +WLV +VDH +NLK +KEW +EWA +WAH +AHA +HAT +ATI +TII +AMS +GDP +PNY +MTT +TLF +FCI +CIN +INV +CGQ +TKH +KHM +HML +MLP +VLR +LRM +RMA +MAG +SLQ +KIG +GPI +LQS +KPI +QDQ +VKY +KYF +YFA +FAQ +TTA +YPL +LLM +LMD +HDD +LGP +PER +EVF +VPY +PYI +YIG +IGG +QYA +YAT +ILL +VRE +SLN +QLF +ADW +WFS +KVS +IVR +NIL +MVK +RAV +VGK +NLG +EDW +DWD +WDY +YIS +FQK +IND +NDN +DNQ +VDC +CLI +ISI +KFF +FFN +DES +SHT +HTQ +IGD +DRF +VQP +QPF +LCE +DNE +NEG +GDV +SGF +LNK +NKI +VQN +TVR +NKD +DQV +QVI +VIN +NNF +FLP +NML +EFP +FPD +PDV +IIA +GIE +DVN +VNW +NWR +VRM +MAI +IPI +LGM +GMQ +MQF +QFF +DLC +LSW +WLW +LWD +WDT +YSI +VNN +NNL +EIF +FGS +SDW +DWC +WCR +SRL +ENF +FTI +LTT +GVP +NIR +IRF +SYA +YAV +KYD +YDA +KNT +LQT +AEC +ECQ +CQE +MVM +SQN +QNQ +NQP +AND +FDM +EGP +ETF +PVD +INW +NWK +WKF +FNQ +GNI +NID +VHT +HTE +EAD +ISC +SCV +CVE +FSH +HDG +GEY +GRV +VVI +VIF +QRD +GKY +KYV +GVR +EYN +YST +STF +TFQ +FQS +QSH +FDY +EID +INQ 
+NQI +IRW +RWL +NFI +DKT +KLW +WKI +DAW +AWN +WNL +NRI +FRG +RGR +GRL +LQI +SIV +PME +YGN +AHT +HTY +TYH +YHV +HVN +NSD +TFL +DDL +RVN +ESF +FNI +VDI +IKP +PAN +ITA +EFH +TQC +CNW +NWF +WFV +KGS +RLC +LCD +CDM +MRD +RDR +ALC +AYA +YAK +DPQ +QSR +SFF +KFS +NGR +GRY +TRD +YLT +KVW +VWD +WDL +MES +PVE +ETY +TYP +YPV +HNY +YLR +RTK +LCA +CAL +IFD +FDK +KFE +FEC +CDW +DWS +WSG +HIL +ILT +GSY +SYH +YHN +HNL +FRS +YAR +ARG +NNQ +KTW +TWE +WEA +EAR +RPQ +EPH +HSQ +FVV +QLQ +QFD +HTA +TAW +AWH +WHP +HPK +PKD +DNI +TNN +NLY +LYI +YIF +IFS +MGR +GRW +RWG +WGR +PDP +PQM +MQT +FMR +MRQ +SIT +IGN +GNM +MLN +TAI +INI +SWC +WCF +CFS +FSQ +QIK +GAL +ADI +EFN +NHD +RDP +SKA +RRG +RGE +INK +WLQ +QKN +VHF +HFL +WKV +KSF +GGY +GYN +YNT +NTK +NGL +PQN +VTA +VKQ +RRT +YHI +LWH +WHL +HLE +NQS +QSY +YNI +TNM +TEC +ECN +CNV +NVF +VFV +KGT +TIR +CDR +DRH +HSK +QFE +PEN +NRS +SGR +YMI +LSI +LHM +HME +VHE +HEY +DCI +CIF +ECC +CWN +WNG +SIM +IMT +MTG +YNN +NFF +FFR +LKP +KPR +KVC +VCT +CTG +GKR +CLD +LDF +FNK +ENI +QDK +DID +IDT +TRK +SFL +RDH +HSY +IST +NHT +HTG +QVH +HRR +WLP +PQQ +QQN +AYF +RPE +EGY +YNL +PAT +LRP +RPM +PMD +LMV +TPR +SDY +DYE +TYM +YMS +WNF +NFE +QSF +HPH +HHC +HCN +MRA +RHT +TKF +FFE +HSG +MEN +ENR +NRP +RPV +TYQ +VHD +HDY +CVW +VWN +NGS +RMF +TKR +AIL +VCV +DFS +HPS +MRF +RFC +FCV +AWF +WFF +FFP +FPN +NTT +TTR +VFW +FWD +WDA +AFS +SNF +FTG +TGC +GCH +CHH +HHG +GQN +GLY +YFQ +RFG +FGY +GYI +IPE +PET +TFS +FSG +SGN +FTD +DDF +ELY +QTN +TNF +LDA +LTI +TIQ +IQH +QHI +IVI +VIP +PRC +RCG +CGN +SLM +LMH +HGG +EVN +RTH +HLH +LHA +HAV +YTL +FPG +EPR +PRW +RWP +PRN +RNR +NRR +RRD +DLT +LTY +TYA +YAF +PKN +SRA +FGR +RWS +WSD +FTL +FST +ITI +TIG +IGF +GFY +FYT +YTG +GDH +EPF +LAH +HAF +SPP +KFH +FHL +HLD +WVV +ESV +AVH +IGH +GHL +LGH +ESI +IMY +MYP +YPT +PTI +LTN +VEG +EGI +IQY +YLY +LYG +YGA +KHQ +HQR +DTG +GGF +FSA +RID +IDG +DGS +TVG +VLW +LWF +WFL +MGS +PLR +KPG +TSW +WNS +VRT +TQV +EYG +YGC +GCF +CFE +KGH +LNG +GNK +NKP +KPE +EYD +GFT +EGM +GMG +MGV +VGR +RIT +LMW +MWP +WPE +CET +SYG +KRM +KMM +MMV +MVF +FES +FGM +HFD +SFC +CES +LHF +HFM +MRY +QPG +PGK +GRS +RSP +SLH +HKD +KSI +IVN +NQN +QND +EFE +GEW +EWI +WIL +ADN +DNH +GDC +DCF +CFM +AWS +WSN +RLH +QAR +FSF +SFP +FPK +EHP +HPL +LLF +LFN +FNP +PFE +YCF +CFT +FTK +KEG +CDL +PAQ +PFR +FRI +QGP +ERP +RQQ +QQC +QCS +CSQ +SQR +QRI +RIQ +QGE +NQC +QCR +CRS +RSQ +SQM +QSC +SCC +CCQ +LQN +NVE +EQC +CQC +MPG +GWS +WSC +SCL +CLV +FVG +VGQ +VQE +QTK +MLE +LEG +AQY +CQG +VIH +IHT +IDV +VSH +SHV +HVL +PRQ +IYC +YCS +CST +AGP +HEE +HHE +STW +TWS +AYP +YPY +PYS +YSK +KNG +NGG +GGT +HTC +TCA +PMY +MYI +YIY +YGE +ERS +VMI +KNK +VYV +YVG +VGN +GNV +VAW +AWA +AHI +NVQ +VQG +GQF +QFY +TPH +HQS +SYD +LNC +NCT +EWG +WGL +RLD +SWS +WSL +LLY +LYW +YWL +VSF +PFY +FYN +YNY +NYR +YRP +RPP +PPF +PFN +FNC +SKF +FTF +FSY +AQR +LGY +GYV +YVP +SWE +SEW +WIG +IGT +EQH +QHR +HRE +RET +DTK +TKS +GGL +AFR +QNR +TAC +ACI +CII +DVF +FGV +GVT +VTH +THR +MNV +NVN +VNV +CVQ +VQA +PVF +VFI +IYT +YTS +IEV +QNG +NTW +TWP +WPT +PYP +NGW +GWN +NGD +GDT +LYT +YTC +PTY +TYI +SIN +INE +NNG +SVG +TVN +KAP +YDN +NYI +EFG +SRW +LMY +MYW +YWI +SYQ +YQP +FNR +NRH +YKP +PLY +LYS +YSW +VEW +EWV +WVG +RHK +HKE +TLK +KSK +KTQ +YRT +KHK +VTV +RGD +DIV +QGM +GMS +VII +IIH +DAC +TFH +FHT +MVN +VNR +KNN +KRH +SIQ +NYT +WGF +GFC +MVT +VTI +TIS +ISY +GYE +YEP +QVP +YLV +GGC +GCG +CGF +GEH +EHI +LEW +EWE +WEP +PRL +LHL +TGP +GPV +PVQ +VQV +QVT +AIQ +QAH +HEV +GSH +IHK +VQT +TGT +GTR +TRL +SSM +GHP +HPF +PYE +IHR +HRH +RHP +HPY +YPC +PCS +CSK +GRK +RLF +AIP +EHG +HGR +AWM +WMH +MHI +LMG +MGG +QVY +VYF +YFC +FCY +CYD 
+YDK +SPY +SYE +EDF +FNM +MEF +SPC +PCG +GTH +PYW +WLL +LQW +QWL +PYT +TNK +RHF +HFG +ART +RTI +IHW +HWV +WVQ +RMG +DAS +ELG +VTT +DRG +WVR +DVC +VCA +TIF +IFH +ELM +DEY +QRS +NVG +GTE +TEN +HAG +GVQ +YTD +DLY +AQN +GVD +DGM +GML +CAI +IRP +GIW +IWG +WGN +GNG +GDQ +QTM +GHV +HGF +GFI +AAH +DGT +APG +PGQ +GQA +YFI +FIN +PIN +INM +MFE +FEF +FAR +QRW +KMR +MRI +SGP +GPA +AVR +VRW +RWV +WVM +VMT +TGW +WQR +HFR +FRF +GFP +PAP +RLY +NYF +LFT +TTQ +QAL +YYV +QMK +ARA +MMK +QLH +RMR +GRT +RTP +RLE +AHN +HNI +LQA +CLQ +PLM +LMA +SFK +LDP +PDS +SMG +EMS +MSC +SCA +ARI +FEM +EMT +MTL +LQP +QPL +HKK +DWN +WNT +QAT +QGL +LGG +GSP +HSH +HTT +MAN +YHF +FVT +KED +YAN +ANY +IQA +QAD +ADY +NHG +PSM +SMT +MTA +THF +HFP +FPR +YGV +GRE +CVM +VMM +MML +GMK +FCS +SYL +PEP +LMT +MTF +LYD +DDW +DWM +WMR +CSR +PPE +YLM +MKF +VNK +NKM +KMT +LLW +LWP +WPP +DQA +QLD +IQV +VGV +GVV +IQS +QSA +DIN +INF +QDT +DRL +RTE +PAR +PTM +TMP +PPQ +PPG +GTP +TVP +PGP +NPA +QVD +SGV +QPR +HNV +NVH +VHK +TAM +PLN +LNR +NRL +HTH +THM +HMA +QCK +CKD +HFS +YFT +FTH +HRK +NHS +APF +PFS +QEE +MTS +ALH +HDV +QEN +FNN +GIF +APQ +QQV +MTV +LPK +PKP +PTD +VGT +PCP +CPA +SNM +NMP +DQG +TED +GGH +HPP +PRG +EMH +MHW +HWP +PMK +AIG +LTM +AGY +GYL +KWP +WPL +FVI +KRC +CVY +VYY +YYF +YFK +PQG +GAF +FSL +LSG +SGY +YNR +RVM +VMR +FPF +PFK +HIS +KKH +KHR +HRT +RTW +TWF +WMA +GHF +HFH +FHE +HEK +PLD +SFY +FYG +TDN +YEH +EHD +EPD +PGR +MHP +PAY +YPP +DMP +MPR +RAH +AHS +SFT +GPG +KHG +LPD +LCP +CPR +EPC +DPP +KPP +PPC +PCF +CFR +EPW +PWT +WTP +PGH +HGA +GAC +IMA +RNC +NCD +CDK +RGP +GPP +SEP +PKF +AMP +VAP +APR +RQP +KVP +FVN +VNT +ESC +CEV +LYC +CIR +GKV +LVV +VVW +WDE +ETS +VRN +RNY +RIF +KFY +GSM +SMV +EHY +HYH +YHT +THV +PSH +SHQ +PYG +YGY +GYT +IQI +QIE +EIN +TFR +GNC +NCI +RPY +AQI +CQK +HAA +MSN +HEW +EWQ +WQF +FDN +NAW +AWQ +QEM +EML +LNH +QKV +MDA +DCH +EHQ +FRR +NKS +SRP +PYF +YFE +QVC +TYS +DIH +HRQ +GDF +DFP +FPT +PGV +FQL +EKC +KCD +CDY +DYP +YPS +GSQ +QMS +ACD +DYD +VRP +DVW +VWE +WEH +EHE +LDH +LMM +QQT +STE +QRP +RHC +HCD +CDV +TSC +HHQ +HQL +NHL +TPI +PIK +VSM +SMR +MRE +DRS +RRR +PRI +LNQ +QST +INR +ARQ +KFR +KPY +YWE +RVA +RQF +QRV +LVH +ARY +AMG +FEL +KYY +YVQ +KMA +IHE +MGP +RGC +TSV +DSC +SCS +CSN +TQS +QSV +GPT +MPD +PDQ +DQF +QFP +RPG +GMM +MMF +FPV +SEC +ECS +PEC +ECE +ERG +ANN +NNR +NRM +LQC +QIG +ISA +REH +HKA +LQM +GKS +TRM +GCD +GVK +YHS +HSN +WDD +YGD +HAD +IGE +IFN +FNS +QLW +WMV +VDN +FQT +QTE +YWS +WSE +LGF +LHG +HGY +FEH +HFK +FKD +DQM +QFT +FTA +NDT +QTR +VFN +AFP +KFA +AYL +YRW +RWH +WHS +SYI +TPD +FHS +QCL +CLW +WRW +RWW +WWK +WGC +GCP +LTF +TFI +IRH +RHR +EFY +IDM +DMV +VKT +DMY +MYD +DTF +KRW +RWD +WDP +MVL +EMA +QGR +AEW +WIA +TGY +PTF +FEN +GHR +QPI +PFP +FPH +HHI +ILQ +IDF +NDY +DYA +YAC +CSI +TRC +RCY +CYK +ASC +SCT +SCY +CYM +STQ +MIE +NWE +WEF +PDN +DNN +NNA +API +KHA +AFN +LHH +HHF +HFY +YRD +DGY +GYS +LDY +QFA +SVQ +VQQ +CVK +AQW +QWI +SCI +DNP +DMI +YMR +LIN +CLG +GSC +SCN +DFA +CGY +GYA +IVC +CFW +HSD +GQK +III +GGI +RGA +YER +GLQ +GPH +PHG +HGW +GWR +WRM +SWG +LDQ +IVV +YLP +FQQ +QQH +QHY +HYG +YGG +HRS +RSD +KLH +LHN +DIE +IHS +DAP +AEM +EMK +IGY +HFI +QRY +RTA +DWG +YNH +NHC +CDP +QDR +WRN +NNW +NWW +WWQ +WQM +HAP +PLQ +LQY +AVM +MAM +MED +LFA +GNL +LDW +DWE +RRP +RCS +SRI +IQT +RFW +FWG +WGE +WHV +EGT +TAR +WFI +YAD +DWL +LWG +WGY +GYD +HIA +MPQ +EWR +WRY +RYA +YAL +NWQ +WQP +PPY +YDW +WSW +WML +IPD +CNP +PGC +GCV +CVD +QGV +QLY +YIC +ICF +CFP +LPM +MTI +TIP +IPG +MKT +QTF +PGI +RWT +RGW +WQA +PDD +DDY +RFP +GMT +RRY +RWK +WKP +KPW +PWR +HIW +IWY +WYT +EGW +QPD +RIC +ICV +LFF +FFA +FAP +RNA +NPW +PWN 
+AGK +LYM +FQH +QHF +NAV +VEM +MYQ +YQR +QRN +RNF +TMH +MHS +RFH +KHY +HYS +YSF +TRW +RWE +FYS +GPM +PMR +MRT +TGH +NWI +WIV +IRT +TGR +TTD +DSG +SDG +QYY +FWI +WII +FLY +YDL +ACW +CWA +WAP +LFG +IWI +WIP +NYD +YDQ +GYM +CVR +RGM +GMA +AYV +SKM +GIP +IPY +PYR +RAM +KYA +YPH +PHI +HIE +RTM +MDP +MRP +PGN +HSM +SML +GIM +IML +YPW +DRR +MWC +VQD +QRQ +QQI +INA +RNQ +EMR +YLN +PTR +NPC +QYG +DAH +AHR +HRA +QAW +GRA +AHH +HGC +GCS +SRH +GVH +VHG +AWI +ASF +QNP +NPM +PMG +LMP +VYW +YWK +WKG +RRW +KIW +IWR +WRA +EYA +GGN +DRY +YYG +FYA +YAM +AMR +MRL +RLW +WPG +GEI +GTK +FAF +MVG +GKP +MFY +FYM +YMT +TGQ +VVV +GMV +HQG +PHY +GVW +VWI +PNN +RKY +HAI +IIG +DTY +PEM +LCW +WVP +VPG +PGY +YSD +VEP +KPF +PDL +PMN +MNM +NMV +VMQ +MQQ +HPR +KVG +TWG +WGK +VGM +IGL +LYV +GIY +IYV +RHG +HGV +EHN +HNE +QMR +MRV +KYQ +PIT +TEW +EWT +WTV +LME +AWW +WWG +WGP +PWF +WFA +IIV +KRF +FMN +MNE +SMP +HHM +HMY +MYG +GQY +YGQ +GQG +WLI +LIF +QYR +IFA +KWL +ESG +DFH +FHR +HRG +YDR +DPT +IKH +HGP +RTD +LYA +PVM +MGH +GHT +TVQ +RTY +HGI +KHT +HTP +KMC +MCW +GRP +AYG +MKV +TMW +MWA +WAK +HEA +CGG +LVF +RYR +WLD +NAF +VGH +SAP +QAG +QDW +DWT +YTA +AQG +GLT +TTI +SIW +IWL +RQD +NIE +PDY +RMD +INP +DIG +GRC +CTK +DRM +MIG +QNF +NFA +PRY +MHA +FEG +AIW +IWS +WSM +GPS +ATR +RRN +VPQ +TSH +CSP +DNG +SFM +FMI +MIF +DCP +CPP +AQH +QHC +CRK +RCR +AFF +FFC +FCP +PPN +AIE +AID +GNT +FYP +AMV +SYR +QDM +MIC +CYN +YNQ +PTT +GQC +QCY +DHR +GCA +CAC +ACP +CPN +CCS +KCN +YKT +TCP +LCY +MFM +GCI +CID +CPK +YVC +VCC +CCN +DRC +RCN +VCL +KCY +CYV +TQT +QTC +CEK +EKY +VSY +YFH +FHD +YEC +ECT +CHR +GPY +PYN +NVC +LCN +MGE +THT +HTI +HTS +HLN +KFI +ITY +EIP +NAN +LII +DFF +FCN +TSM +TYF +LLC +LCT +CTF +FLH +HHP +LHQ +HQT +FPL +PMS +LFY +YRK +KTN +TNV +YKH +NMR +YGP +LSH +PHD +HDT +HEC +FLC +CFG +AQQ +SGC +GCR +CRF +LWL +EMD +EGF +VGF +TWV +PQK +HDA +THC +HCG +CGW +WSS +GWP +MPM +IYI +HLP +RPC +PCL +NNH +HIY +YTY +TIM +IMI +FVF +MGA +YLG +ACF +CFV +VIC +ICI +EGC +CIH +IHF +HDI +QSD +PKG +VML +LTH +THK +HKG +YMH +HSE +LMC +MCV +LFH +FHI +QFC +KYK +PFV +PPI +TVM +IKF +KFP +QGY +GYG +YGM +AMC +MCL +MKI +QIM +TRT +IDK +WLH +DND +FIV +KGF +HPN +AFV +YKR +VFF +FFV +PKS +TKQ +PNH +QIY +NSR +IQP +QPK +IVT +CHV +CLH +QAN +NEH +YIH +DVM +MLC +LCV +IQR +RYK +WLY +IDD +TFY +FID +SPN +VVC +TTN +EMM +TGM +MSK +SHR +HRN +GEQ +FIC +CTV +MFH +YGL +YGS +MHE +HEM +MMS +SMH +MHT +VLC +KYP +TGI +RYG +YGT +GQI +GPK +PKQ +GYF +WLR +CYI +IFV +GYQ +FPM +YVV +APY +IKI +MDS +HAR +PNT +HVT +VNH +HPD +NIK +ESD +FHV +TFW +WPD +PDM +DMK +KYN +TWY +IHQ +SHP +EYP +YPK +PKL +IRS +RSC +CSA +LMS +PHK +KPV +VCI +FGW +WFH +FDT +KYG +INC +NCA +CAV +FCK +CKK +FKV +DYS +TRI +RKF +FLM +MEC +ECR +CRN +PRD +PPM +HLR +GHQ +HQP +DYC +YCT +PCH +MIT +DPI +PIQ +QMP +EVY +RGS +SNV +NVP +PSC +TPF +RKC +CVP +QFQ +MDR +KCP +CPH +PHR +YTK +YDS +EKW +KWH +WHA +KDH +HRL +REF +FGD +FGE +RND +SYT +PEW +EWF +TGF +CNG +NEF +VPC +SMI +RWF +PHE +QNS +GNY +MLQ +PFM +GDM +TMK +MSP +GQP +GLM +VFQ +TRA +PIG +FQG +GMR +TAQ +AQM +NFY +FYQ +GFG +DRT +KMY +MYE +NRY +VPH +HVP +LHP +HPG +VHP +PQH +SHA +HMH +KWF +WFG +LEY +DYK +APM +PMH +NPK +QTS +THQ +MPH +SHC +HCV +SDC +CVT +KQP +QPM +MNA +GWV +LFW +FWL +WLG +QKW +KWW +WWH +WHT +HKN +QTD +QID +NFG +TPN +NSW +SWF +VDT +EFW +WQN +NIT +LLI +GTN +ESN +NRW +RWC +WCS +CSW +YQL +QLM +MLF +MLW +DPG +RHW +HWD +WDQ +NER +HEG +FPY +PYA +QMN +MNL +KLY +FAD +TKC +KCH +QKH +YKI +NDM +MVI +SHI +HIQ +ECK +CKY +KYE +RQV +KLM +MKL +YVT +VKM +DHY +HYA +DME +VFC +CIT +PIF +IFF +FFF +KIP +WFK +KSC +SCK +CKG +CAY +CKS +LQH +QHP +PWV +WVE +MRM +MLH +SHM +NSM +QGN +GYY +YYD +KGW +RYP +YSP 
+PND +ITP +IFC +NAC +QVL +NKW +KWT +WTL +TCD +LCC +CCT +HLC +YWA +WAI +TDP +IDY +YVN +LTW +CTI +AFY +FYI +YGR +TRH +RNW +WRL +EVH +TPC +CAP +IIM +MGT +ILC +CWL +PFF +FFI +PFC +CHM +HMP +VIY +YAY +YFN +IKC +CKF +KFC +FCR +CRQ +WNI +WRR +RRC +RCP +CPV +YQI +FGN +CVI +IFI +ITW +TWI +CRI +ILM +DTC +VHH +HHY +HYV +LHC +HCK +CKP +ETC +IQC +HNC +IYQ +LPW +PWK +ITL +TMY +CDF +DFW +WLS +TCC +IMH +MHL +TPK +LVW +VWV +FFW +FWR +WRQ +PNK +VCW +FII +PIC +ICK +CWF +FHM +FNW +YTM +AFH +FHK +RFK +FKC +PNQ +GAW +AWD +YTT +TWN +DIW +IWV +WVS +AGH +GHA +AMI +AVW +TAH +QIS +STC +TCG +CGA +ILY +ITG +ICW +ICR +SCW +CWI +WIH +IHP +HPA +FFT +FTW +TNC +EKM +MLI +ICM +CMT +YIV +DRW +EVW +VWL +CTC +NAI +LMI +TVW +VWT +WTI +ISM +SQC +QCT +QHD +HDH +IYH +YQK +FAS +CKL +TFC +TEF +IRI +DHP +SIY +ADF +IRC +MPS +NWT +CEG +KNW +WSA +MAV +DML +MPV +WIY +IYL +IHH +VFK +FIP +IMV +SIH +TMQ +MQS +ACK +FLF +VMW +WCP +CPF +NIM +CNE +FVW +YIQ +CQY +KQH +QHS +LMQ +DCS +MEI +PTC +NRK +QVG +CTD +FVH +LHW +HWA +FVM +AMW +WLF +NQY +QYN +TCV +DFM +FML +VTY +MLD +MRR +CNQ +CNY +KIY +IYF +RNP +FFK +GIN +EQV +SEH +CDH +TVH +VHW +IWP +PHN +TCE +HRD +NDG +RTT +VNA +VGY +NYQ +KCS +YFG +MFQ +NWA +VMV +DWP +CPI +PIW +CIA +NYP +PHP +ITF +SNH +IMM +MMI +MII +KVY +SYP +TMG +SMQ +CGP +GPC +PCD +ANI +RLM +SWV +TRY +THI +LGC +NCK +CKQ +VHS +VWQ +FKF +QNW +NWP +WPA +HNA +YDY +YVW +VWP +YLC +PVW +WIS +IVW +VWA +IGV +TTC +TYC +YCL +CLT +YVL +HGH +KCC +CCK +CKR +IMW +IYR +SNY +LRC +NYK +NIY +YRH +HTN +MQV +TFN +FQF +NCC +CCC +APN +CGK +SKY +RCD +GKG +HNS +RDW +DWR +WRK +TTY +YIW +WYR +QFW +FWT +QWN +WNP +VRH +RHQ +EVQ +QNY +YNF +NFP +QNC +SLW +WEL +FYV +YFV +VCM +PLW +QLT +MGN +GNH +MCG +ETR +LWS +WSV +SVW +VWH +WHY +QYW +YWT +VYI +FSM +NYY +IAW +CSH +HMG +DFE +WSI +IWQ +WQY +CIP +IPQ +HST +QWT +GVF +PDW +MAH +HVG +IWH +LWA +WAC +CIL +DTH +THH +YHL +QKY +KYH +YHK +YNW +WTK +VHA +VWY +WYQ +WND +IWA +APD +ENY +TFK +HDK +QFN +RRH +PNC +NCR +HFF +TIC +HDE +VWS +WSQ +YLF +ALW +WGG +TRQ +QYH +CVH +SMW +MWY +RNH +QTY +WQL +GCT +APC +CAE +FWF +WFQ +LCH +CHF +WSR +GGW +TIN +DQH +QHG +PFT +ISF +WDN +NWN +KEC +DKP +FYF +YFP +DSM +WEI +MSM +YII +MMN +PMP +RCC +CCP +CPT +SGW +GWT +NCP +CPG +GQH +IWN +WNC +NCY +CYS +YSR +HTF +HHA +RPW +PWH +WHN +HNQ +VQW +ECG +SMS +ANW +RAW +SFQ +QIH +HHR +REW +YEQ +CRP +FKM +GQM +WTR +ICL +MVW +ISW +IRY +LDM +RNI +QTH +THG +YVI +VPF +DAM +CWD +WDR +DRQ +MPF +PFG +CCI +AIC +CQP +GCW +WVI +QGW +NIG +CSV +AYY +STH +MGC +CFC +CLC +DYT +QVW +YIN +GQT +QWE +WES +QCH +CHP +VCR +WAY +EMF +MFI +TWC +WCV +FCF +SRC +RCH +HLT +TFF +IWF +WFY +NHN +PQT +QDN +TNH +FAW +RWQ +LWI +IAC +WNV +RHM +MEY +EYT +NVM +TWA +GWG +CQV +PSY +IYK +DTW +TWR +WRE +CSC +SCD +IWK +WKS +NYN +KNF +PNR +SQH +WFP +SWD +WDI +AWG +WVA +VMC +GWH +WHE +YCR +GMF +MFF +VTW +CDC +GYC +YCN +TMN +PCV +HCP +CCL +TQI +FCC +CLE +PRH +WST +MMD +YGH +SWA +PTW +TWD +INY +NYG +NCL +KWI +WIF +FGH +GKC +KCM +GWA +WAQ +FMY +MYY +YYQ +AKH +HKF +ECA +KHE +ICG +GMH +CTR +DHH +HNW +MIH +HPV +WMP +RVQ +DHC +GHD +AWE +WET +CHI +HIG +NTY +QNN +GAM +HLY +GWM +WMI +TCI +RYS +HNT +NQW +QWS +DPW +YSC +MPK +EWK +YAQ +AYW +MLY +CAR +KYM +YME +EYQ +MTN +NHY +ATM +FFQ +RPH +QMF +YGI +IFT +MMP +TDF +CHN +EWY +HSI +NFM +FMD +MDF +FKG +NKH +YMD +YDI +MCP +QYQ +TIW +VAH +HEF +FPI +YPF +HGN +SCR +NMG +MIN +HTD +MFW +WNH +MCI +WEN +SVH +CRV +KMD +FQM +QMI +HDS +CHG +WAV +PKW +PQW +QWP +TSY +QWR +TCR +SEY +SHY +HYQ +SFW +FWA +WAM +GFN +VMK +DYW +YWY +WYS +HWQ +WQT +VMH +MHF +PKY +VKW +WWS +HMM +MMT +MTY +KWD +TKW +KWS +HPM +MKP +KPH +HNH +QHA +MVY +YYT +EYY +WDF +GYH +WVY +YGF +GFH +HDP +NEW +WYE +YKY +YIE +EWN 
+GTG +YAS +CSY +GMN +RIW +IMD +TYE +DHF +HIK +PCC +CCE +CEW +GHY +DIM +TDK +KWN +QGF +GFM +ATY +TYG +YGW +MWR +NKF +WTG +PYL +YQN +QNT +RGI +GHI +HID +REM +YRC +QYS +DYG +CRY +GWD +GMC +MCA +CAF +EMC +MCK +DMM +MID +MFV +VGC +FGP +KAW +HLK +DWV +KYR +WTC +TCF +DSW +GTW +WVH +TNW +ICA +CAK +HIM +YCA +MLM +FRW +WGI +QFH +FHH +HHT +SCF +CFL +NHH +IDW +DWA +WAR +ARW +WHF +FWV +NTN +KKW +SAH +CGS +VWF +LWW +WWR +WRP +HRF +SHH +HGK +NYE +YEG +FSC +NAM +MKM +KMG +IGM +TWT +WTM +SWQ +WQD +PMM +FPP +FIH +HHD +RFA +NQM +CIG +IGQ +YSH +IIW +IPW +PWY +WYL +HKY +LWE +WEG +YDF +FGI +KGM +HSC +GTC +WPF +WWL +HQD +WGD +FKP +NYV +YMM +WGH +HQF +GGM +YDM +QGG +QDC +CDN +FDC +EQY +MQM +GYP +WLM +FEW +FAY +HLG +DGC +HLF +AHD +GFW +FWW +YTH +THY +HYK +TQF +FMH +DMF +YFF +VWG +WGV +TEM +RMP +FNF +RHY +HYC +YCE +MQI +FFG +YCK +CKH +YYN +PPH +CPS +GHH +KQN +FTC +GCK +CRG +QHH +QYC +MGM +DPM +MGI +EPY +QCM +CMQ +NIC +EYC +CRT +YPN +TRF +RDM +WPY +LYK +MNC +NCV +SCQ +CQA +GHN +HNN +AMF +LYN +NIW +ECF +FAC +ACA +RHV +EWM +WME +FEY +DDP +TMT +MTW +IPH +CNI +NIQ +NVW +VWK +DHE +RWN +FCT +PCW +DMW +YWF +FHG +YMG +DHW +HWK +VCK +SWK +RFF +CAG +MFT +VMY +MYN +PRM +RMC +CPM +THD +CKI +FWQ +WQV +MDY +RME +WGA +AYM +YYL +FFM +KNM +YCP +CPD +WDG +HNF +MGY +LEC +CLY +SWI +NLQ +GSN +PQC +QCV +PNW +IYY +NMI +EGH +HIP +WQG +MWS +HII +YNG +RMH +WKR +RHI +YNK +HQE +RWM +DHT +WEV +KCE +IWT +YRL +KCG +WSP +KGC +CKA +HDM +MPN +DKC +DNR +LCK +YRM +RFQ +FMM +WMS +QIW +HVC +NWD +MDG +FIW +DPC +HED +KWQ +WQQ +PVC +HLW +TYN +GHK +EWD +WDS +PHC +HCI +CIQ +IWD +WDV +AWC +IDC +DCA +CRR +WPM +QYD +HYR +HCW +CWS +WHI +WRD +KMP +VIW +AWK +WKH +HYP +FFY +QCI +CIY +FAH +NWG +FYE +TDW +DWK +CEP +HMV +DKW +YFY +VDW +HKS +MQH +FDH +CCM +MMM +DYN +WKQ +WCL +IHN +NNY +NYH +VNY +HTR +MNR +DMH +MHY +GTY +IFM +QRT +GCE +CEI +YEM +RMM +YTR +YAP +YMN +WGQ +WNM +GHM +WQI +NFD +WEY +FKY +HYD +HVW +AMM +RMY +QWV +MMQ +MSF +HFE +WER +HQM +VQH +YPQ +PQF +GHC +HMN +MNT +FFH +MMH +FIM +MIY +IYW +YWH +FMC +MCS +DWY +WYA +MWK +CMR +HYN +GWI +WIW +KWC +PWQ +RYC +THE +YQF +ERM +EWP +SWY +WYN +WKY +WEC +ECM +CME +RVW +VWC +WCK +RFY +NHM +KHC +GWC +HRW +RWI +WIK +FRC +HIH +RCW +CWP +CGI +FVC +VMN +KDW +AMH +MHQ +NQR +TCY +CYT +YTQ +HHV +AHY +QSM +LMN +FMT +MTH +GRN +NMQ +NGY +AWT +FCG +NMH +MHM +YCQ +NWC +CKT +VCE +HWH +NLH +VWW +PCR +RWY +WYF +YCM +QVM +QHT +HVR +RMN +QPW +FRH +HQK +YKF +MQN +KWE +TYV +HMR +ICH +KYT +TDM +CEY +CVC +PAC +NFK +KCF +YNC +QSW +WEW +WPW +YQH +NFH +MSY +YNP +DQP +HKH +MTK +KAH +VKC +YKW +GWW +WWP +MWG +VYC +YCG +HSW +WNE +CFI +CLM +CHK +RCQ +TCQ +PFA +NNC +QGC +MNY +NYM +KQM +QME +NCF +PDC +WAN +RPN +VCP +WIN +PPW +PWL +CRH +PWD +SYM +FGC +YIK +VNC +YTF +SNC +QHM +MEH +CQT +ITM +EYH +CQF +DYM +SMM +QMH +CYA +MAC +WVN +WAT +FWM +WMT +CCG +CYG +WAF +EPM +MVC +HWG +ELC +RCI +WQH +FWH +QWQ +AGW +NWY +WYC +CRW +CQS +LIW +CAQ +QMW +MWT +CER +ERC +VGW +IAH +NAQ +WIM +MKC +FQC +MWE +TQM +YHW +HWS +NYA +WMM +MMW +MWN +WNW +NWM +YEY +PCQ +HFW +FNY +NHR +NSC +TNG +HVM +HQW +EYW +IWE +HCE +PYH +YHD +YKQ +SWH +HAY +QMY +KIH +WFN +CSF +RCE +YCH +GRH +YNE +HQN +QPH +HYL +MHV +WIT +SCG +SPW +FHF +CIW +WAG +CTW +YAW +RHH +NFW +MNK +GEC +AHM +CYY +HEQ +MWV +IMR +FCD +HQC +CYF +MHC +PMC +HQY +WTH +QKC +HRC +HYF +CYL +HKC +WPS +WDC +FMQ +QHK +CFK +NEC +DNM +CQM +QMT +MDN +DCK +WDW +LHY +TKY +FPC +MDM +QWF +MDW +DWW +WWE +GLW +TWM +MSW +WEQ +WKN +PMQ +WAW +WMQ +DCY +CYR +CFH +HMS +IWW +WWI +PFW +WVC +ACY +MNS +CGC +GCM +TYY +YYS +MIM +MKW +HMI +FWE +MKH +MEW +SMY +MYH +HYI +CKN +NMM +RIM +SKH +YEW +CQR +RYH +HTM +WKT +KMN +FKH +TCK +WYI +HNP +NGC +MRN +FHW +EIW 
+KVH +WFE +YCY +AHW +TYW +YWR +WNA +EMG +CFF +HYT +FHQ +NKY +HHK +PCE +FCM +CMY +DHM +QQW +QWY +WYM +MRW +FPQ +MME +MYR +LWQ +GWY +WYD +HPW +YWD +CAH +EQW +QWK +WSH +NMC +PNE +FYH +QKM +HWE +WHD +RQW +SWW +WWA +MYS +KQW +WWT +CPQ +WIE +ACC +CCH +WEK +GMY +HFT +WTY +MMG +WTN +YYM +NTH +YCC +CCF +DYQ +WEM +WGT +NHF +CMS +WGS +MIW +YQM +IHM +QDH +TWQ +CAD +GNW +NWH +YYH +YYY +YFM +TPW +WED +MCR +YNM +WWD +MYV +YWM +SCM +CMM +NRC +RCT +CTN +YHM +QWC +WCT +TTW +TWW +WWY +WMG +YYC +WID +YVM +WIR +FYC +FWS +FYW +WTW +RCF +QQG +HMD +HEN +CKM +MKY +HCF +SQW +TYD +GIC +FQW +IFW +YQY +CCY +WAD +WSF +MYK +NDW +MIP +QWG +TCW +CWW +YLW +TQW +IHY +MQC +QCD +WTQ +MWW +VWM +WMK +GMW +MQW +NCQ +CQI +MRC +PWP +WTF +HVQ +HMC +DWQ +ILW +PWS +YHH +CPC +YHE +HAK +RNM +CEH +CMF +QHN +QCE +MDQ +DHQ +YTW +WLC +MCF +WFC +CFQ +YCW +CWE +MPW +WYK +MGF +FTM +CWK +HWF +PCT +MHN +HKW +WYV +DCW +CYQ +CAW +HWC +HWR +RSW +PYC +FKW +WFW +FMF +YMY +DCM +YDH +LWY +WKD +WRF +DKQ +QEC +WTE +CEM +GCY +MNH +CEQ +HYY +PYQ +QIC +GPW +PWW +MCD +WHR +NYW +QWM +CQQ +YHC +FCH +CHQ +QCF +NFC +PCN +PWG +CMI +CTM +QCP +WWN +TMC +CYW +EHC +CCR +FTQ +CNF +FDW +DWI +PWM +YWG +KMH +PWE +KWG +WGM +WHM +WPQ +CHY +VWR +WRH +CYC +AWY +DHN +CIC +CPW +ICP +QWD +CQW +CTY +WRC +WYW +MWL +CGH +HPC +PCY +EWH +QNM +PCM +QMM +WMY +WPN +WCE +HQH +CNN +CMW +PCK +QWH +NTC +HIC +CMC +MCQ +KHW +KCQ +MHK +CWG +HMT +WFM +IWC +CML +HWT +MHR +DQW +IQW +WVW +WPC +WHG +WYH +IEW +VHY +YQW +WDH +CHD +QPY +WKC +YDC +NHW +WDM +QPC +CKW +KWY +NCM +CQN +MYF +YMW +MMC +KMW +MWI +MHD +ECI +CMD +WCI +CGM +GCQ +MCE +WWF +WTT +HDC +FCQ +DMN +PWI +RMQ +WGW +WYP +MYM +HCC +CDQ +MNW +CMP +RCK +MWD +FPW +QTW +WNY +MCT +MHH +IWM +CFY +HYW +PHW +HWW +CFN +MWF +HCM +MWH +GYW +HAW +DWH +YWV +NMW +QEW +CNC +WDK +NKC +GCC +MPC +MCN +CCA +KWM +MCM +HWL +WSY +CKC +WMF +CWY +HCQ +WCA +HMK +DHD +YHY +DNW +WCD +WPI +WFD +WHW +WHC +HCY +WHQ +IMC +KPC +YMC +CRC +MCY +ECY +MCH +HWI +DCQ +PMW +LWC +CRM +DMC +MNF +HWY +YWW +YWC +WYY +EWC +FWC +FWY +WMN +WWV +EWW +WCM +CAM +WKM +WHH +YMF +WCQ +WIQ +MFN +ANC +ECW +WCG +CIM +WQC +CMH +MYC +CTH +HHW +QWW +WIC +CPY +MDC +NYC +CMN +WHK +MMY +DEW +QHW +WQW +CEC +TWH +HFC +WKW +HWM +MQY +HDW +WYG +CWM +CYH +HYM +QMC +QCW +NCW +YQC +FMW +WMC +WWW +HMW +RMW +CHW +WCW +HTW +CWC +WCY +YWQ +WMW +CWT +CWH +MWM +WWC +WCC +WCH +WWM +TAX +AXD +XDR +IEX +EXV +QAX +AXX +XXE +XES +MXN +XNF +NRX +RXX +XXX +XXR +XRI +SAX +AXG +XGG +PRX +RXR +XRX +RXE +XEF +QEX +EXQ +XQR +REX +EXR +RXQ +XQQ +DRX +RXP +XPG +QMX +MXT +XTX +TXR +XRM +APX +PXX +XXG +XGI +NLX +LXX +XXM +XMA +LNX +NXE +XEA +GTX +TXN +XND +LIX +IXI +XIM +MVX +VXX +XXK +XKT +GLX +LXP +XPP +QGX +GXD +XDL +XAP +QNX +NXM +XMN +VAX +XGV +IKX +KXY +KEX +EXL +XLY +GQX +QXE +XEP +PLX +XKC +PVX +XKE +RXI +XIR +AXL +XLN +LLX +LXD +XDA +AXE +XEL +GGX +GXG +KAX +XXA +XAG +XWS +SPX +PXC +XCD +GWX +WXH +XHF +MPX +ESX +SXN +XNK +DLX +LXN +XNS +QXG +XGD +ITX +XRG +NEX +EXA +XAL +LDX +DXI +XII +TPX +PXM +XMR +NXG +XGY +ASX +SXV +XVE +TKX +KXA +KRX +XXT +XTL +IDX +DXX +XXL +XLV +AKX +KXX +QHX +HXV +XVN +NSX +SXX +XKX +XDP +DAX +AXK +XKQ +PIX +IXX +XXF +VLX +XDI +DIX +IXL +XLK +LKX +KXV +XVA +DNX +NXD +ILX +LXK +XKV +VYX +YXE +XEI +RXS +XSH +KGX +XGF +AVX +VXY +XYG +HVX +XXI +XID +TVX +XXS +XSA +ENX +NXX +XMD +IIX +XMQ +AEX +EXX +XME +PGX +GXP +XPR +SKX +KXF +XFT +HRX +XSW +PQX +XGR +QQX +VTX +XRP +PSX +SXP +XPL +VGX +GXY +RSX +SXS +XSL +VSX +XST +AXV +XVL +AGX +GXX +XTK +KLX +LXR +XRV +AHX +HXC +XCS +LVX +VXN +XNR +NGX +GXL +TSX +SXQ +XQN +KXL +XLL +VIX +IXG +XGA +GFX +FXG +XGL +PTX +TXT +XTS +EMX +MXQ +SXY +XYA +IQX +QXY +XYR 
+TXK +IGX +XPS +PXT +XTG +NXQ +VKX +KXS +XSN +GVX +VXE +GRX +XRE +YKX +KXE +XEE +EEX +EXT +XTI +EHX +HXN +XNL +NDX +DXD +IAX +KSX +SXL +RRX +XRK +DDX +DXE +RXG +VXL +XLS +DTX +TXG +VXF +XFA +XIG +VXT +XTA +ISX +SXR +XRY +VQX +QXP +XPC +LGX +GXS +HGX +XGH +XXD +XDD +KKX +XXV +PKX +XLT +XSP +XLD +RAX +AXS +XSI +IYX +YXX +XXP +XPI +MSX +SXT +GEX +XHP +LFX +FXX +VXI +XIW +QTX +TXX +XXQ +XQA +FLX +DXN +XNC +MXS +XSR +YLX +EQX +QXS +TMX +MXC +XCY +NXA +XAV +EXE +XEQ +HPX +PXP +LMX +MXX +KTX +XKK +XXH +XHS +MKX +XIH +WRX +XKS +EXY +XYQ +QKX diff --git a/chebai_proteins/preprocessing/datasets/__init__.py b/chebai_proteins/preprocessing/datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chebai_proteins/preprocessing/datasets/deepGO/__init__.py b/chebai_proteins/preprocessing/datasets/deepGO/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chebai_proteins/preprocessing/datasets/deepGO/go_uniprot.py b/chebai_proteins/preprocessing/datasets/deepGO/go_uniprot.py new file mode 100644 index 0000000..dbdf93e --- /dev/null +++ b/chebai_proteins/preprocessing/datasets/deepGO/go_uniprot.py @@ -0,0 +1,1002 @@ +# References for this file : +# Reference 1: +# Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf; +# DeepGO: Predicting protein functions from sequence and interactions +# using a deep ontology-aware classifier, Bioinformatics, 2017. +# https://doi.org/10.1093/bioinformatics/btx624 +# Github: https://github.com/bio-ontology-research-group/deepgo + +# Reference 2: +# https://www.ebi.ac.uk/GOA/downloads +# https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt +# https://www.uniprot.org/uniprotkb + +# Reference 3: +# Kulmanov, M., GuzmΓ‘n-Vega, F.J., Duek Roggli, +# P. et al. Protein function prediction as approximate semantic entailment. Nat Mach Intell 6, 220–228 (2024). 
+# https://doi.org/10.1038/s42256-024-00795-w
+# https://github.com/bio-ontology-research-group/deepgo2
+
+__all__ = [
+    "GOUniProtOver250",
+    "GOUniProtOver50",
+    "EXPERIMENTAL_EVIDENCE_CODES",
+    "AMBIGUOUS_AMINO_ACIDS",
+    "DeepGO1MigratedData",
+    "DeepGO2MigratedData",
+]
+
+import gzip
+import itertools
+import os
+import shutil
+from abc import ABC, abstractmethod
+from collections import OrderedDict
+from tempfile import NamedTemporaryFile
+from typing import Any, Dict, Generator, List, Optional, Tuple, Union
+
+import fastobo
+import networkx as nx
+import pandas as pd
+import requests
+import torch
+import tqdm
+from Bio import SwissProt
+from chebai.preprocessing.datasets.base import _DynamicDataset
+
+from chebai_proteins.preprocessing import reader as dr
+
+# fmt: off
+# https://github.com/bio-ontology-research-group/deepgo/blob/master/utils.py#L15
+EXPERIMENTAL_EVIDENCE_CODES = {
+    "EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "TAS", "IC",
+    # New evidence codes added in the latest paper (2024), see Reference 3:
+    # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/utils.py#L24-L26
+    "HTP", "HDA", "HMP", "HGI", "HEP",
+}
+# fmt: on
+
+# https://github.com/bio-ontology-research-group/deepgo/blob/d97447a05c108127fee97982fd2c57929b2cf7eb/aaindex.py#L8
+# https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L10
+# `X` is now considered a valid amino acid, as per the latest paper (2024), see Reference 3.
+AMBIGUOUS_AMINO_ACIDS = {"B", "O", "J", "U", "Z", "*"}
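These overlapping amino-acid 3-grams are exactly what the tokens.txt files added earlier in this diff accumulate. As a minimal sketch (not the actual `ProteinDataReader` from `chebai_proteins.preprocessing.reader`; `sequence_to_3grams` is a hypothetical helper), a sequence that passes the ambiguity filter above could be tokenized like this:

def sequence_to_3grams(sequence: str) -> list:
    # Hypothetical helper, illustrative only: reject ambiguous residues, then emit overlapping 3-grams.
    if any(aa in AMBIGUOUS_AMINO_ACIDS for aa in sequence):
        raise ValueError("sequence contains ambiguous amino acid codes")
    return [sequence[i : i + 3] for i in range(len(sequence) - 2)]

print(sequence_to_3grams("MAQAM"))  # ['MAQ', 'AQA', 'QAM'] -- all three appear in the token files above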
+
+
+class _GOUniProtDataExtractor(_DynamicDataset, ABC):
+    """
+    A class for extracting and processing data from the Gene Ontology (GO) dataset and the Swiss UniProt dataset.
+
+    Args:
+        dynamic_data_split_seed (int, optional): The seed for random data splitting. Defaults to 42.
+        splits_file_path (str, optional): Path to the splits CSV file. Defaults to None.
+        max_sequence_length (int, optional): Specifies the maximum allowed sequence length for a protein, with a
+            default of 1002. During data preprocessing, any proteins exceeding this length will be excluded from
+            further processing.
+        **kwargs: Additional keyword arguments passed to DynamicDataset and XYBaseDataModule.
+
+    Attributes:
+        dynamic_data_split_seed (int): The seed for random data splitting, default is 42.
+        max_sequence_length (int, optional): Specifies the maximum allowed sequence length for a protein, with a
+            default of 1002. During data preprocessing, any proteins exceeding this length will be excluded from
+            further processing.
+        splits_file_path (Optional[str]): Path to the CSV file containing split assignments.
+    """
+
+    _GO_DATA_INIT = "GO"
+    _SWISS_DATA_INIT = "SWISS"
+
+    # -- Index of the columns of the processed `data.pkl` (derived from `_get_swiss_to_go_mapping` &
+    # `_graph_to_raw_dataset`):
+    # "swiss_id" at row index 0
+    # "accession" at row index 1
+    # "go_ids" at row index 2
+    # "sequence" at row index 3
+    # labels starting from row index 4
+    _ID_IDX: int = 0
+    _DATA_REPRESENTATION_IDX: int = 3  # here the `sequence` column
+    _LABELS_START_IDX: int = 4
+
+    _GO_DATA_URL: str = "https://purl.obolibrary.org/obo/go/go-basic.obo"
+    _SWISS_DATA_URL: str = (
+        "https://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/complete/uniprot_sprot.dat.gz"
+    )
+
+    # Gene Ontology (GO) has three major branches, one for biological processes (BP), molecular functions (MF) and
+    # cellular components (CC). The value "all" takes data related to all three branches into account.
+    _ALL_GO_BRANCHES: str = "all"
+    _GO_BRANCH_NAMESPACE: Dict[str, str] = {
+        "BP": "biological_process",
+        "MF": "molecular_function",
+        "CC": "cellular_component",
+    }
+
+    def __init__(self, **kwargs):
+        self.go_branch: str = self._get_go_branch(**kwargs)
+
+        self.max_sequence_length: int = int(kwargs.get("max_sequence_length", 1002))
+        assert (
+            self.max_sequence_length >= 1
+        ), "Max sequence length should be greater than or equal to 1."
+
+        super(_GOUniProtDataExtractor, self).__init__(**kwargs)
+
+        if self.reader.n_gram is not None:
+            assert self.max_sequence_length >= self.reader.n_gram, (
+                f"max_sequence_length ({self.max_sequence_length}) must be greater than "
+                f"or equal to n_gram ({self.reader.n_gram})."
+            )
+
+    @classmethod
+    def _get_go_branch(cls, **kwargs) -> str:
+        """
+        Retrieves the Gene Ontology (GO) branch based on the provided keyword arguments,
+        checking that a valid GO branch value is given.
+
+        Args:
+            **kwargs: Arbitrary keyword arguments. Specifically looks for:
+                - "go_branch" (str): The desired GO branch.
+
+        Returns:
+            str: The GO branch value. This will be one of the allowed values.
+
+        Raises:
+            ValueError: If the provided 'go_branch' value is not in the allowed list of values.
+        """
+        go_branch_value = kwargs.get("go_branch", cls._ALL_GO_BRANCHES)
+        allowed_values = list(cls._GO_BRANCH_NAMESPACE.keys()) + [cls._ALL_GO_BRANCHES]
+        if go_branch_value not in allowed_values:
+            raise ValueError(
+                f"Invalid value for go_branch: {go_branch_value}, Allowed values: {allowed_values}"
+            )
+        return go_branch_value
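For illustration, the branch validation above behaves as follows (a sketch, assuming the module is imported; the classmethod only touches class-level constants, so it can be exercised directly):

assert _GOUniProtDataExtractor._get_go_branch(go_branch="MF") == "MF"
assert _GOUniProtDataExtractor._get_go_branch() == "all"  # defaults to all three branches
try:
    _GOUniProtDataExtractor._get_go_branch(go_branch="XX")
except ValueError as err:
    print(err)  # Invalid value for go_branch: XX, Allowed values: ['BP', 'MF', 'CC', 'all']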
+
+    # ------------------------------ Phase: Prepare data -----------------------------------
+    def _download_required_data(self) -> str:
+        """
+        Downloads the required raw data related to the Gene Ontology (GO) and Swiss-UniProt datasets.
+
+        Returns:
+            str: Path to the downloaded data.
+        """
+        self._download_swiss_uni_prot_data()
+        return self._download_gene_ontology_data()
+
+    def _download_gene_ontology_data(self) -> str:
+        """
+        Download the Gene Ontology data `.obo` file.
+
+        Note:
+            Quote from https://geneontology.org/docs/download-ontology/ :
+            Three versions of the ontology are available; the one used in this method is described below:
+            https://purl.obolibrary.org/obo/go/go-basic.obo
+            The basic version of the GO, filtered such that the graph is guaranteed to be acyclic and annotations
+            can be propagated up the graph. The relations included are `is a`, `part of`, `regulates`,
+            `negatively regulates` and `positively regulates`. This version excludes relationships that cross the
+            3 GO hierarchies. This version should be used with most GO-based annotation tools.
+
+        Returns:
+            str: The file path of the loaded Gene Ontology data.
+        """
+        go_path = os.path.join(self.raw_dir, self.raw_file_names_dict["GO"])
+        os.makedirs(os.path.dirname(go_path), exist_ok=True)
+
+        if not os.path.isfile(go_path):
+            print("Missing Gene Ontology raw data")
+            print("Downloading Gene Ontology data...")
+            r = requests.get(self._GO_DATA_URL, allow_redirects=True)
+            r.raise_for_status()  # Check if the request was successful
+            with open(go_path, "wb") as f:
+                f.write(r.content)
+        return go_path
+
+    def _download_swiss_uni_prot_data(self) -> Optional[str]:
+        """
+        Download the Swiss-Prot data file from the UniProt Knowledgebase.
+
+        Note:
+            The UniProt Knowledgebase is a collection of functional information on proteins, with accurate,
+            consistent and rich annotation.
+
+            Swiss-Prot contains manually-annotated records with information extracted from literature and
+            curator-evaluated computational analysis.
+
+        Returns:
+            str: The file path of the loaded Swiss-Prot data file.
+        """
+        uni_prot_file_path = os.path.join(
+            self.raw_dir, self.raw_file_names_dict["SwissUniProt"]
+        )
+        os.makedirs(os.path.dirname(uni_prot_file_path), exist_ok=True)
+
+        if not os.path.isfile(uni_prot_file_path):
+            print("Downloading Swiss UniProt data...")
+
+            # Create a temporary file
+            with NamedTemporaryFile(delete=False) as tf:
+                temp_filename = tf.name
+                print(f"Downloading to temporary file {temp_filename}")
+
+                # Download the file
+                response = requests.get(self._SWISS_DATA_URL, stream=True)
+                response.raise_for_status()
+                with open(temp_filename, "wb") as temp_file:
+                    shutil.copyfileobj(response.raw, temp_file)
+
+                print(f"Downloaded to {temp_filename}")
+
+            # Unpack the gzipped file
+            try:
+                print("Unzipping the file...")
+                with gzip.open(temp_filename, "rb") as f_in:
+                    output_file_path = uni_prot_file_path
+                    with open(output_file_path, "wb") as f_out:
+                        shutil.copyfileobj(f_in, f_out)
+                print(f"Unpacked and saved to {output_file_path}")
+
+            except Exception as e:
+                print(f"Failed to unpack the file: {e}")
+            finally:
+                # Clean up the temporary file
+                os.remove(temp_filename)
+                print(f"Removed temporary file {temp_filename}")
+
+        return uni_prot_file_path
+
+    def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
+        """
+        Extracts the class hierarchy from the GO ontology.
+        Constructs a directed graph (DiGraph) using NetworkX, where nodes are annotated with GO term data.
+
+        Args:
+            data_path (str): The path to the GO ontology.
+
+        Returns:
+            nx.DiGraph: A directed graph representing the class hierarchy, where nodes are GO terms and edges
+            represent parent-child relationships.
+        """
+        print("Extracting class hierarchy...")
+        elements = []
+        for term in fastobo.load(data_path):
+            if isinstance(term, fastobo.typedef.TypedefFrame):
+                # ---- To skip typedef frames of the following format/structure ----
+                # [Typedef]
+                # id: part_of
+                # name: part of
+                # namespace: external
+                # xref: BFO:0000050
+                # is_transitive: true
+                continue
+
+            if (
+                term
+                and isinstance(term.id, fastobo.id.PrefixedIdent)
+                and term.id.prefix == self._GO_DATA_INIT
+            ):
+                # Consider only terms with an id in the following format - GO:2001271
+                term_dict = self.term_callback(term)
+                if term_dict:
+                    elements.append(term_dict)
+
+        g = nx.DiGraph()
+
+        # Add GO term nodes to the graph along with their hierarchical ontology
+        for n in elements:
+            g.add_node(n["go_id"], **n)
+        g.add_edges_from(
+            [
+                (parent_id, node_id)
+                for node_id in g.nodes
+                for parent_id in g.nodes[node_id]["parents"]
+                if parent_id in g.nodes
+            ]
+        )
+
+        print("Compute transitive closure")
+        return nx.transitive_closure_dag(g)
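Downstream code relies on the transitive closure computed here: in the closed graph, `g.predecessors(term)` returns all ancestors, not only the direct parents. A small self-contained illustration with toy integer IDs:

import networkx as nx

toy = nx.DiGraph()
toy.add_edges_from([(1, 2), (2, 3)])  # edges point from parent to child, as in the method above
closed = nx.transitive_closure_dag(toy)
print(sorted(closed.predecessors(3)))  # [1, 2] -- all ancestors, not just the direct parent 2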
+ """ + parents = [] + name = None + + for clause in term: + if isinstance(clause, fastobo.term.NamespaceClause): + if ( + self.go_branch != self._ALL_GO_BRANCHES + and clause.namespace.escaped + != self._GO_BRANCH_NAMESPACE[self.go_branch] + ): + # if the term document is not related to given go branch (except `all`), skip this document. + return False + + if isinstance(clause, fastobo.term.IsObsoleteClause): + if clause.obsolete: + # if the term document contains clause as obsolete as true, skips this document. + return False + + if isinstance(clause, fastobo.term.IsAClause): + parents.append(self._parse_go_id(clause.term)) + elif isinstance(clause, fastobo.term.NameClause): + name = clause.name + + return { + "go_id": self._parse_go_id(term.id), + "parents": parents, + "name": name, + } + + @staticmethod + def _parse_go_id(go_id: str) -> int: + """ + Helper function to parse and normalize GO term IDs. + + Args: + go_id: The raw GO term ID string. + + Returns: + str: The parsed and normalized GO term ID. + """ + # `is_a` clause has GO id in the following formats: + # GO:0009968 ! negative regulation of signal transduction + # GO:0046780 + return int(str(go_id).split(":")[1].split("!")[0].strip()) + + def _graph_to_raw_dataset(self, g: nx.DiGraph) -> pd.DataFrame: + """ + Processes a directed acyclic graph (DAG) to create a raw dataset in DataFrame format. The dataset includes + Swiss-Prot protein data and their associations with Gene Ontology (GO) terms. + + Note: + - GO classes are used as labels in the dataset. Each GO term is represented as a column, and its value + indicates whether a Swiss-Prot protein is associated with that GO term. + - Swiss-Prot proteins serve as samples. There is no 1-to-1 correspondence between Swiss-Prot proteins + and GO terms. + + Data Format: pd.DataFrame + - Column 0 : swiss_id (Identifier for SwissProt protein) + - Column 1 : Accession of the protein + - Column 2 : GO IDs (associated GO terms) + - Column 3 : Sequence of the protein + - Column 4 to Column "n": Each column corresponding to a class with value True/False indicating whether the + protein is associated with this GO term. + + Args: + g (nx.DiGraph): The class hierarchy graph. + + Returns: + pd.DataFrame: The raw dataset created from the graph. + """ + print(f"Processing graph") + + data_df = self._get_swiss_to_go_mapping() + # add ancestors to go ids + data_df["go_ids"] = data_df["go_ids"].apply( + lambda go_ids: sorted( + set( + itertools.chain.from_iterable( + [ + [go_id] + list(g.predecessors(go_id)) + for go_id in go_ids + if go_id in g.nodes + ] + ) + ) + ) + ) + # Initialize the GO term labels/columns to False + selected_classes = self.select_classes(g, data_df=data_df) + if not selected_classes: + raise ValueError( + f"No classes selected for given threshold {self.THRESHOLD}" + ) + new_label_columns = pd.DataFrame( + False, index=data_df.index, columns=selected_classes + ) + data_df = pd.concat([data_df, new_label_columns], axis=1) + + # Set True for the corresponding GO IDs in the DataFrame go labels/columns + for index, row in data_df.iterrows(): + for go_id in row["go_ids"]: + if go_id in data_df.columns: + data_df.at[index, go_id] = True + + # This filters the DataFrame to include only the rows where at least one value in the row from 5th column + # onwards is True/non-zero. 
+
+    def _graph_to_raw_dataset(self, g: nx.DiGraph) -> pd.DataFrame:
+        """
+        Processes a directed acyclic graph (DAG) to create a raw dataset in DataFrame format. The dataset includes
+        Swiss-Prot protein data and their associations with Gene Ontology (GO) terms.
+
+        Note:
+            - GO classes are used as labels in the dataset. Each GO term is represented as a column, and its value
+              indicates whether a Swiss-Prot protein is associated with that GO term.
+            - Swiss-Prot proteins serve as samples. There is no 1-to-1 correspondence between Swiss-Prot proteins
+              and GO terms.
+
+        Data Format: pd.DataFrame
+            - Column 0 : swiss_id (Identifier for the SwissProt protein)
+            - Column 1 : Accession of the protein
+            - Column 2 : GO IDs (associated GO terms)
+            - Column 3 : Sequence of the protein
+            - Column 4 to Column "n" : Each column corresponds to a class, with a True/False value indicating
+              whether the protein is associated with this GO term.
+
+        Args:
+            g (nx.DiGraph): The class hierarchy graph.
+
+        Returns:
+            pd.DataFrame: The raw dataset created from the graph.
+        """
+        print("Processing graph")
+
+        data_df = self._get_swiss_to_go_mapping()
+        # add ancestors to the go ids
+        data_df["go_ids"] = data_df["go_ids"].apply(
+            lambda go_ids: sorted(
+                set(
+                    itertools.chain.from_iterable(
+                        [
+                            [go_id] + list(g.predecessors(go_id))
+                            for go_id in go_ids
+                            if go_id in g.nodes
+                        ]
+                    )
+                )
+            )
+        )
+        # Initialize the GO term labels/columns to False
+        selected_classes = self.select_classes(g, data_df=data_df)
+        if not selected_classes:
+            raise ValueError(
+                f"No classes selected for given threshold {self.THRESHOLD}"
+            )
+        new_label_columns = pd.DataFrame(
+            False, index=data_df.index, columns=selected_classes
+        )
+        data_df = pd.concat([data_df, new_label_columns], axis=1)
+
+        # Set True for the corresponding GO IDs in the DataFrame's GO label columns
+        for index, row in data_df.iterrows():
+            for go_id in row["go_ids"]:
+                if go_id in data_df.columns:
+                    data_df.at[index, go_id] = True
+
+        # This filters the DataFrame to include only the rows where at least one value in the row from the 5th
+        # column onwards is True/non-zero.
+        # Quote from the DeepGO paper: `For training and testing, we use proteins which have been annotated with
+        # at least one GO term from the set of the GO terms for the model`
+        data_df = data_df[data_df.iloc[:, self._LABELS_START_IDX :].any(axis=1)]
+        return data_df
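The label-matrix mechanics in miniature (hypothetical toy data with two GO terms as label columns; this mirrors the True-setting loop and the `any(axis=1)` row filter above):

import pandas as pd

toy = pd.DataFrame({"swiss_id": ["P1", "P2"], "go_ids": [[3674], []]})
for go_id in [3674, 8150]:  # selected classes become boolean label columns
    toy[go_id] = toy["go_ids"].apply(lambda ids: go_id in ids)
toy = toy[toy.iloc[:, 2:].any(axis=1)]  # drop proteins with no selected annotation
print(toy)  # only P1 survives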
+
+    def _get_swiss_to_go_mapping(self) -> pd.DataFrame:
+        """
+        Parses the Swiss-Prot data and returns a DataFrame mapping Swiss-Prot records to Gene Ontology (GO) data.
+
+        The DataFrame includes the following columns:
+            - "swiss_id": The unique identifier for each Swiss-Prot record.
+            - "accession": Comma-separated list of accession numbers.
+            - "go_ids": List of GO IDs associated with the Swiss-Prot record.
+            - "sequence": The protein sequence.
+
+        Note:
+            This mapping is necessary because the GO data does not include the protein sequence representation.
+            We select proteins whose annotations have experimental evidence codes, as specified in
+            `EXPERIMENTAL_EVIDENCE_CODES`, filter the proteins by `max_sequence_length` (default 1002), and ignore
+            proteins with ambiguous amino acid codes (specified in `AMBIGUOUS_AMINO_ACIDS`) in their sequence.
+
+            Check the link below for keyword details:
+            https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt
+
+        Returns:
+            pd.DataFrame: A DataFrame where each row corresponds to a Swiss-Prot record with its associated GO data.
+        """
+        print("Parsing Swiss UniProt raw data...")
+
+        swiss_ids, sequences, accessions, go_ids_list = [], [], [], []
+
+        with open(
+            os.path.join(self.raw_dir, self.raw_file_names_dict["SwissUniProt"]),
+            "r",
+        ) as swiss_file:
+            swiss_data = SwissProt.parse(swiss_file)
+
+            for record in swiss_data:
+                if record.data_class != "Reviewed":
+                    # Consider only manually-annotated (reviewed) Swiss-Prot records
+                    continue
+
+                if not record.sequence or len(record.sequence) > self.max_sequence_length:
+                    # Consider only proteins that have a sequence representation which does not exceed the
+                    # maximum sequence length
+                    continue
+
+                if any(aa in AMBIGUOUS_AMINO_ACIDS for aa in record.sequence):
+                    # Skip proteins with ambiguous amino acid codes
+                    continue
+
+                go_ids = []
+
+                for cross_ref in record.cross_references:
+                    if cross_ref[0] == self._GO_DATA_INIT:
+                        # One Swiss-Prot protein can correspond to many GO data instances
+
+                        if len(cross_ref) <= 3:
+                            # No evidence code
+                            continue
+
+                        # https://github.com/bio-ontology-research-group/deepgo/blob/master/get_functions.py#L63-L66
+                        evidence_code = cross_ref[3].split(":")[0]
+                        if evidence_code not in EXPERIMENTAL_EVIDENCE_CODES:
+                            # Skip GO ids without the required experimental evidence codes
+                            continue
+
+                        go_ids.append(self._parse_go_id(cross_ref[1]))
+
+                if not go_ids:
+                    # Skip Swiss-Prot proteins without a mapping to GO data
+                    continue
+
+                swiss_ids.append(record.entry_name)
+                sequences.append(record.sequence)
+                accessions.append(",".join(record.accessions))
+                go_ids.sort()
+                go_ids_list.append(go_ids)
+
+        data_dict = OrderedDict(
+            swiss_id=swiss_ids,  # swiss_id column at index 0
+            accession=accessions,  # accession column at index 1
+            go_ids=go_ids_list,  # go_ids (data representation) column at index 2
+            sequence=sequences,  # sequence column at index 3
+        )
+
+        return pd.DataFrame(data_dict)
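The cross-reference filter above hinges on the prefix of the evidence code. A minimal sketch with a made-up Biopython-style `cross_references` tuple:

cross_ref = ("GO", "GO:0005576", "C:extracellular region", "HDA:UniProtKB")  # made-up example record
evidence_code = cross_ref[3].split(":")[0]  # -> "HDA"
print(evidence_code in EXPERIMENTAL_EVIDENCE_CODES)  # True: HDA is one of the accepted codes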
+
+    # ------------------------------ Phase: Setup data -----------------------------------
+    def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]:
+        """
+        Loads data from a pickled file and yields individual dictionaries for each row.
+
+        The pickled file is expected to contain rows with the following structure:
+            - Data at row index `self._ID_IDX`: ID of the data instance (the swiss_id)
+            - Data at row index `self._DATA_REPRESENTATION_IDX`: Sequence representation of the protein
+            - Data from row index `self._LABELS_START_IDX` onwards: Labels
+
+        This method is used by `_load_data_from_file` to generate dictionaries that are then
+        processed and converted into a list of dictionaries containing the features and labels.
+
+        Args:
+            input_file_path (str): The path to the pickled input file.
+
+        Yields:
+            Dict[str, Any]: A dictionary containing:
+                - `features` (str): The sequence data from the file.
+                - `labels` (np.ndarray): A boolean array of labels starting from row index 4.
+                - `ident` (Any): The identifier from row index 0.
+        """
+        with open(input_file_path, "rb") as input_file:
+            df = pd.read_pickle(input_file)
+            for row in df.values:
+                labels = row[self._LABELS_START_IDX :].astype(bool)
+                # chebai.preprocessing.reader.DataReader only needs features, labels, ident, group
+                # "group" is set to None by default, as there is no such entity for this data
+                yield dict(
+                    features=row[self._DATA_REPRESENTATION_IDX],
+                    labels=labels,
+                    ident=row[self._ID_IDX],
+                )
+
+    # ------------------------------ Phase: Dynamic Splits -----------------------------------
+    def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        """
+        Loads encoded data and generates training, validation, and test splits.
+
+        This method attempts to load encoded data from a file named `data.pt`. It then splits this data into
+        training, validation, and test sets.
+
+        Raises:
+            FileNotFoundError: If the `data.pt` file does not exist. Ensure that the `prepare_data` and/or
+            `setup` methods are called to generate the necessary dataset files.
+
+        Returns:
+            Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three DataFrames:
+                - Training set
+                - Validation set
+                - Test set
+        """
+        try:
+            filename = self.processed_file_names_dict["data"]
+            data_go = torch.load(
+                os.path.join(self.processed_dir, filename), weights_only=False
+            )
+        except FileNotFoundError:
+            raise FileNotFoundError(
+                "File data.pt doesn't exist. "
+                "Please call the 'prepare_data' and/or 'setup' methods to generate the dataset files."
+            )
+
+        df_go_data = pd.DataFrame(data_go)
+        train_df_go, df_test = self.get_test_split(
+            df_go_data, seed=self.dynamic_data_split_seed
+        )
+
+        # Get all splits
+        df_train, df_val = self.get_train_val_splits_given_test(
+            train_df_go,
+            df_test,
+            seed=self.dynamic_data_split_seed,
+        )
+
+        return df_train, df_val, df_test
+
+    # ------------------------------ Phase: Raw Properties -----------------------------------
+    @property
+    def base_dir(self) -> str:
+        """
+        Returns the base directory path for storing GO-UniProt data.
+
+        Returns:
+            str: The path to the base directory, which is "data/GO_UniProt".
+        """
+        return os.path.join("data", "GO_UniProt")
+
+    @property
+    def raw_file_names_dict(self) -> dict:
+        """
+        Returns a dictionary of raw file names used in data processing.
+
+        Returns:
+            dict: A dictionary mapping dataset names to their respective file names.
+            For example, {"GO": "go-basic.obo", "SwissUniProt": "uniprot_sprot.dat"}.
+        """
+        return {"GO": "go-basic.obo", "SwissUniProt": "uniprot_sprot.dat"}
+
+
+class _GOUniProtOverX(_GOUniProtDataExtractor, ABC):
+    """
+    A class for extracting data from the Gene Ontology (GO) dataset with a threshold for selecting classes based on
+    the number of annotations.
+
+    This class is designed to filter GO classes based on a specified threshold, selecting only those classes
+    which are annotated at least a certain number of times across the dataset.
+
+    Attributes:
+        READER (dr.ProteinDataReader): The reader used for reading the dataset.
+        THRESHOLD (Optional[int]): The threshold for selecting classes based on the number of annotations.
+    """
+
+    READER: dr.ProteinDataReader = dr.ProteinDataReader
+    THRESHOLD: Optional[int] = None
+
+    @property
+    def _name(self) -> str:
+        """
+        Returns the name of the dataset.
+
+        'max_sequence_length' in the name indicates that proteins with sequence lengths exceeding this value are
+        ignored in the dataset.
+
+        Returns:
+            str: The dataset name, formatted with the current threshold value and/or the given go_branch.
+        """
+        if self.go_branch != self._ALL_GO_BRANCHES:
+            return f"GO{self.THRESHOLD}_{self.go_branch}_{self.max_sequence_length}"
+
+        return f"GO{self.THRESHOLD}_{self.max_sequence_length}"
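Concretely, with the default `max_sequence_length` of 1002 (values shown for illustration only):

# GOUniProtOver250(go_branch="MF")._name  ->  "GO250_MF_1002"
# GOUniProtOver50()._name                 ->  "GO50_1002"  (go_branch defaults to "all")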
+ ) + + print(f"Selecting GO terms based on given threshold: {self.THRESHOLD} ...") + + # https://github.com/bio-ontology-research-group/deepgo/blob/master/get_functions.py#L59-L77 + go_term_annot: Dict[int, int] = {} + for idx, row in data_df.iterrows(): + # Count the annotations for each go_id **`per protein`** + for go_id in row["go_ids"]: + if go_id not in go_term_annot: + go_term_annot[go_id] = 0 + go_term_annot[go_id] += 1 + + # Select GO terms that meet or exceed the threshold of annotations + selected_nodes: List[int] = [ + go_id + for go_id in g.nodes + if go_id in go_term_annot and go_term_annot[go_id] >= self.THRESHOLD + ] + + # Sort the selected nodes (optional but often useful for consistent output) + selected_nodes.sort() + + # Write the selected node IDs/classes to the file + filename = "classes.txt" + with open(os.path.join(self.processed_dir_main, filename), "wt") as fout: + fout.writelines(str(node) + "\n" for node in selected_nodes) + + return selected_nodes + + +class GOUniProtOver250(_GOUniProtOverX): + """ + A class for extracting data from the Gene Ontology (GO) dataset with a threshold of 250 for selecting classes. + + Inherits from `_GOUniProtOverX` and sets the threshold for selecting classes to 250. + + Attributes: + THRESHOLD (int): The threshold for selecting classes (250). + """ + + THRESHOLD: int = 250 + + +class GOUniProtOver50(_GOUniProtOverX): + """ + A class for extracting data from the Gene Ontology (GO) dataset with a threshold of 50 for selecting classes. + + Inherits from `_GOUniProtOverX` and sets the threshold for selecting classes to 50. + + Attributes: + THRESHOLD (int): The threshold for selecting classes (50). + """ + + THRESHOLD: int = 50 + + +class _DeepGOMigratedData(_GOUniProtDataExtractor, ABC): + """ + Base class for use of the migrated DeepGO data with common properties, name formatting, and file paths. + + Attributes: + READER (dr.ProteinDataReader): Protein data reader class. + THRESHOLD (Optional[int]): Threshold value for GO class selection, + determined by the GO branch type in derived classes. + """ + + READER: dr.ProteinDataReader = dr.ProteinDataReader + THRESHOLD: Optional[int] = None + + # Mapping from GO branch conventions used in DeepGO to our conventions + GO_BRANCH_MAPPING: dict = { + "cc": "CC", + "mf": "MF", + "bp": "BP", + } + + @property + def _name(self) -> str: + """ + Generates a unique identifier for the migrated data based on the GO + branch and max sequence length, optionally including a threshold. + + Returns: + str: A formatted name string for the data. + """ + threshold_part = f"GO{self.THRESHOLD}_" if self.THRESHOLD is not None else "GO_" + + if self.go_branch != self._ALL_GO_BRANCHES: + return f"{threshold_part}{self.go_branch}_{self.max_sequence_length}" + + return f"{threshold_part}{self.max_sequence_length}" + + # ------------------------------ Phase: Prepare data ----------------------------------- + def _perform_data_preparation(self, *args: Any, **kwargs: Any) -> None: + """ + Checks for the existence of migrated DeepGO data in the specified directory. + Raises an error if the required data file is not found, prompting + migration from DeepGO to this data structure. + + Args: + *args (Any): Additional positional arguments. + **kwargs (Any): Additional keyword arguments. + + Raises: + FileNotFoundError: If the processed data file does not exist. 
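+
+        Example (hypothetical invocation; the exact CLI flags are defined by the
+        jsonargparse-based migration scripts, so treat this as a sketch):
+            python chebai_proteins/preprocessing/migration/deep_go/migrate_deep_go_1_data.py \
+                migrate --data_dir=<deepgo1_data_dir> --go_branch=mf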
+ """ + print("Checking for processed data in", self.processed_dir_main) + + processed_name = self.processed_main_file_names_dict["data"] + if not os.path.isfile(os.path.join(self.processed_dir_main, processed_name)): + raise FileNotFoundError( + f"File {processed_name} not found.\n" + f"You must run the appropriate DeepGO migration script " + f"(chebai/preprocessing/migration/deep_go) before executing this configuration " + f"to migrate data from DeepGO to this data structure." + ) + + def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> List: + # Selection of GO classes not needed for migrated data + pass + + # ------------------------------ Phase: Raw Properties ----------------------------------- + @property + @abstractmethod + def processed_main_file_names_dict(self) -> Dict[str, str]: + """ + Abstract property for defining main processed file names. + These files are stored in the same directory as the generated data files + but have distinct names to differentiate them during training. + + Returns: + dict: A dictionary with key-value pairs for main processed file names. + """ + pass + + @property + @abstractmethod + def processed_file_names_dict(self) -> Dict[str, str]: + """ + Abstract property for defining additional processed file names. + These files are stored in the same directory as the generated data files + but have distinct names to differentiate them during training. + + Returns: + dict: A dictionary with key-value pairs for processed file names. + """ + pass + + +class DeepGO1MigratedData(_DeepGOMigratedData): + """ + Migrated data class specific to DeepGO1. Sets threshold values according + to the research paper based on the GO branch. + + Note: + Refer reference number 1 at the top of this file for the corresponding research paper. + + Args: + **kwargs: Arbitrary keyword arguments passed to the superclass. + + Raises: + ValueError: If an unsupported GO branch is provided. + """ + + def __init__(self, **kwargs): + # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11 + assert int(kwargs.get("max_sequence_length")) == 1002 + + # Set threshold based on GO branch, as per DeepGO1 paper and its data. + if kwargs.get("go_branch") in ["CC", "MF"]: + self.THRESHOLD = 50 + elif kwargs.get("go_branch") == "BP": + self.THRESHOLD = 250 + else: + raise ValueError( + f"DeepGO1 paper has no defined threshold for branch {self.go_branch}" + ) + + super(_DeepGOMigratedData, self).__init__(**kwargs) + + @property + def processed_main_file_names_dict(self) -> Dict[str, str]: + """ + Returns main processed file names specific to DeepGO1. + + Returns: + dict: Dictionary with the main data file name for DeepGO1. + """ + return {"data": "data_deep_go1.pkl"} + + @property + def processed_file_names_dict(self) -> Dict[str, str]: + """ + Returns processed file names specific to DeepGO1. + + Returns: + dict: Dictionary with data file name for DeepGO1. + """ + return {"data": "data_deep_go1.pt"} + + +class DeepGO2MigratedData(_DeepGOMigratedData): + """ + Migrated data class specific to DeepGO2, inheriting from DeepGO1MigratedData + with different processed file names. + + Note: + Refer reference number 3 at the top of this file for the corresponding research paper. + + Returns: + dict: Dictionary with file names specific to DeepGO2. 
+ """ + + _LABELS_START_IDX: int = 5 # additional esm2_embeddings column in the dataframe + _ESM_EMBEDDINGS_COL_IDX: int = 4 + + def __init__(self, use_esm2_embeddings=False, **kwargs): + # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11 + assert int(kwargs.get("max_sequence_length")) == 1000 + self.use_esm2_embeddings: bool = use_esm2_embeddings + super(_DeepGOMigratedData, self).__init__(**kwargs) + + # ------------------------------ Phase: Setup data ----------------------------------- + def _load_data_from_file(self, path: str) -> List[Dict[str, Any]]: + """ + Load and process data from a file into a list of dictionaries containing features and labels. + + This method processes data differently based on the `use_esm2_embeddings` flag: + - If `use_esm2_embeddings` is True, raw dictionaries from `_load_dict` are returned, _load_dict already returns + the numerical features (esm2 embeddings) from the data file, hence no reader is required. + - Otherwise, a reader is used to process the data (generate numerical features). + + Args: + path (str): The path to the input file. + + Returns: + List[Dict[str, Any]]: A list of dictionaries with the following keys: + - `features`: Sequence or embedding data, depending on the context. + - `labels`: A boolean array of labels. + - `ident`: The identifier for the sequence. + """ + lines = self._get_data_size(path) + print(f"Processing {lines} lines...") + + if self.use_esm2_embeddings: + data = [ + d + for d in tqdm.tqdm(self._load_dict(path), total=lines) + if d["features"] is not None + ] + else: + data = [ + self.reader.to_data(d) + for d in tqdm.tqdm(self._load_dict(path), total=lines) + if d["features"] is not None + ] + + # filter for missing features in resulting data + data = [val for val in data if val["features"] is not None] + + return data + + def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]: + """ + Loads data from a pickled file and yields individual dictionaries for each row. + + The pickled file is expected to contain rows with the following structure: + - Data at row index `self._ID_IDX`: ID of go data instance + - Data at row index `self._DATA_REPRESENTATION_IDX`: Sequence representation of protein + - Data at row index `self._ESM2_EMBEDDINGS_COL_IDX`: ESM2 embeddings of the protein + - Data from row index `self._LABELS_START_IDX` onwards: Labels + + The method adapts based on the `use_esm2_embeddings` flag: + - If `use_esm2_embeddings` is True, features are loaded from the column specified by `self._ESM_EMBEDDINGS_COL_IDX`. + - Otherwise, features are loaded from the column specified by `self._DATA_REPRESENTATION_IDX`. + + Args: + input_file_path (str): The path to the pickled input file. + + Yields: + Dict[str, Any]: A dictionary containing: + - `features` (Any): Sequence or embedding data for the instance. + - `labels` (np.ndarray): A boolean array of labels starting from row index 4. + - `ident` (Any): The identifier from row index 0. 
+ """ + with open(input_file_path, "rb") as input_file: + df = pd.read_pickle(input_file) + + if self.use_esm2_embeddings: + features_idx = self._ESM_EMBEDDINGS_COL_IDX + else: + features_idx = self._DATA_REPRESENTATION_IDX + + for row in df.values: + labels = row[self._LABELS_START_IDX :].astype(bool) + yield dict( + features=row[features_idx], + labels=labels, + ident=row[self._ID_IDX], + ) + + # ------------------------------ Phase: Raw Properties ----------------------------------- + @property + def processed_main_file_names_dict(self) -> Dict[str, str]: + """ + Returns main processed file names specific to DeepGO2. + + Returns: + dict: Dictionary with the main data file name for DeepGO2. + """ + return {"data": "data_deep_go2.pkl"} + + @property + def processed_file_names_dict(self) -> Dict[str, str]: + """ + Returns processed file names specific to DeepGO2. + + Returns: + dict: Dictionary with data file name for DeepGO2. + """ + return {"data": "data_deep_go2.pt"} + + @property + def identifier(self) -> tuple: + """Identifier for the dataset.""" + if self.use_esm2_embeddings: + return (dr.ESM2EmbeddingReader.name(),) + return (self.reader.name(),) diff --git a/chebai_proteins/preprocessing/datasets/deepGO/protein_pretraining.py b/chebai_proteins/preprocessing/datasets/deepGO/protein_pretraining.py new file mode 100644 index 0000000..8c39d86 --- /dev/null +++ b/chebai_proteins/preprocessing/datasets/deepGO/protein_pretraining.py @@ -0,0 +1,279 @@ +__all__ = ["SwissProteinPretrain"] + +import os +from abc import ABC +from collections import OrderedDict +from typing import Any, Dict, Generator, List, Tuple + +import networkx as nx +import pandas as pd +import torch +from Bio import SwissProt +from chebai.preprocessing.datasets.base import _DynamicDataset +from sklearn.model_selection import train_test_split + +from chebai_proteins.preprocessing.datasets.deepGO.go_uniprot import ( + AMBIGUOUS_AMINO_ACIDS, + EXPERIMENTAL_EVIDENCE_CODES, + GOUniProtOver250, +) +from chebai_proteins.preprocessing.reader import ProteinDataReader + + +class _ProteinPretrainingData(_DynamicDataset, ABC): + """ + Data module for pretraining protein sequences, specifically designed for Swiss-UniProt data. It includes methods for + data preparation, loading, and dynamic splitting of protein sequences. + The data is parsed and filtered to only select proteins with no associated `valid` Gene Ontology (GO) labels. + A valid GO label is the one which has one of evidence codes defined in `EXPERIMENTAL_EVIDENCE_CODES`. + """ + + _ID_IDX: int = 0 + _DATA_REPRESENTATION_IDX: int = 1 # Index of `sequence` column + + def __init__(self, **kwargs): + """ + Initializes the data module with any GOUniProt extractor class object. + + Args: + **kwargs: Additional arguments for the superclass initialization. + """ + self._go_uniprot_extractor = GOUniProtOver250() + assert self._go_uniprot_extractor.go_branch == GOUniProtOver250._ALL_GO_BRANCHES + + self.max_sequence_length: int = int(kwargs.get("max_sequence_length", 1002)) + assert ( + self.max_sequence_length >= 1 + ), "Max sequence length should be greater than or equal to 1." + + super(_ProteinPretrainingData, self).__init__(**kwargs) + + if self.reader.n_gram is not None: + assert self.max_sequence_length >= self.reader.n_gram, ( + f"max_sequence_length ({self.max_sequence_length}) must be greater than " + f"or equal to n_gram ({self.reader.n_gram})." 
+            )
+
+    # ------------------------------ Phase: Prepare data -----------------------------------
+    def _perform_data_preparation(self, *args: Any, **kwargs: Any) -> None:
+        """
+        Prepares the data by downloading and parsing Swiss-Prot data if not already available. Saves the processed data
+        for further use.
+
+        Args:
+            *args: Additional positional arguments.
+            **kwargs: Additional keyword arguments.
+        """
+        processed_name = self.processed_main_file_names_dict["data"]
+        if not os.path.isfile(os.path.join(self.processed_dir_main, processed_name)):
+            print("Missing processed data file (`data.pkl` file)")
+            os.makedirs(self.processed_dir_main, exist_ok=True)
+            self._download_required_data()
+            protein_df = self._parse_protein_data_for_pretraining()
+            self.save_processed(protein_df, processed_name)
+
+    def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
+        # method not required, as Swiss-UniProt has no ontological data
+        pass
+
+    def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame:
+        # method not required, as Swiss-UniProt has no ontological data
+        pass
+
+    def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> List:
+        # method not required, as Swiss-UniProt has no ontological data
+        pass
+
+    def _download_required_data(self) -> str:
+        """
+        Downloads the required Swiss-Prot data using the GOUniProt extractor class.
+
+        Returns:
+            str: Path to the downloaded data.
+        """
+        return self._go_uniprot_extractor._download_swiss_uni_prot_data()
+
+    def _parse_protein_data_for_pretraining(self) -> pd.DataFrame:
+        """
+        Parses the Swiss-Prot data and returns a DataFrame containing Swiss-Prot proteins that do not have any valid
+        Gene Ontology (GO) label. A valid GO label is one with an evidence code listed in
+        `EXPERIMENTAL_EVIDENCE_CODES`.
+
+        The DataFrame includes the following columns:
+            - "swiss_id": The unique identifier for each Swiss-Prot record.
+            - "sequence": The protein sequence.
+
+        Note:
+            We ignore proteins whose sequence contains any of the ambiguous amino acids specified in
+            `AMBIGUOUS_AMINO_ACIDS`.
+
+        Returns:
+            pd.DataFrame: A DataFrame where each row corresponds to a Swiss-Prot record with no associated valid GO label.
+        """
+        print("Parsing Swiss-Prot raw data....")
+
+        swiss_ids, sequences = [], []
+
+        swiss_data = SwissProt.parse(
+            open(
+                os.path.join(
+                    self._go_uniprot_extractor.raw_dir,
+                    self._go_uniprot_extractor.raw_file_names_dict["SwissUniProt"],
+                ),
+                "r",
+            )
+        )
+
+        for record in swiss_data:
+            if record.data_class != "Reviewed":
+                # Consider only manually annotated Swiss-Prot data
+                continue
+
+            if not record.sequence:
+                # Skip records without a sequence representation
+                continue
+
+            if len(record.sequence) > self.max_sequence_length:
+                # Skip proteins whose sequence length exceeds the maximum sequence length
+                continue
+
+            if any(aa in AMBIGUOUS_AMINO_ACIDS for aa in record.sequence):
+                # Skip proteins with ambiguous amino acid codes
+                continue
+
+            has_valid_associated_go_label = False
+            for cross_ref in record.cross_references:
+                if cross_ref[0] == self._go_uniprot_extractor._GO_DATA_INIT:
+
+                    if len(cross_ref) <= 3:
+                        # No evidence code
+                        continue
+
+                    # https://github.com/bio-ontology-research-group/deepgo/blob/master/get_functions.py#L63-L66
+                    evidence_code = cross_ref[3].split(":")[0]
+                    if evidence_code in EXPERIMENTAL_EVIDENCE_CODES:
+                        has_valid_associated_go_label = True
+                        break
+
+            if has_valid_associated_go_label:
+                # Skip proteins that have at least one valid associated GO label
+                continue
+
+            swiss_ids.append(record.entry_name)
+            sequences.append(record.sequence)
+
+        data_dict = OrderedDict(
+            swiss_id=swiss_ids,  # swiss_id column at index 0
+            sequence=sequences,  # Sequence column at index 1
+        )
+
+        return pd.DataFrame(data_dict)
+
+    # ------------------------------ Phase: Setup data -----------------------------------
+    def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]:
+        """
+        Loads data from a pickled file and yields individual dictionaries for each row.
+
+        The pickled file is expected to contain rows with the following structure:
+            - Data at row index `self._ID_IDX`: ID of the data instance
+            - Data at row index `self._DATA_REPRESENTATION_IDX`: Sequence representation of the protein
+
+        This method is used by `_load_data_from_file` to generate dictionaries that are then
+        processed and converted into a list of dictionaries containing the features and labels.
+
+        Args:
+            input_file_path (str): The path to the pickled input file.
+
+        Yields:
+            Dict[str, Any]: A dictionary containing:
+                - `features` (str): The sequence data from the file.
+                - `ident` (Any): The identifier from row index `self._ID_IDX`.
+                - `labels`: Set to None
+        """
+        with open(input_file_path, "rb") as input_file:
+            df = pd.read_pickle(input_file)
+            for row in df.values:
+                yield dict(
+                    features=row[self._DATA_REPRESENTATION_IDX],
+                    ident=row[self._ID_IDX],
+                    labels=None,
+                )
+
+    # ------------------------------ Phase: Dynamic Splits -----------------------------------
+    def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
+        """
+        Loads encoded data and generates training, validation, and test splits.
+
+        This method attempts to load encoded data from a file named `data.pt`. It then splits this data into
+        training, validation, and test sets.
+
+        Raises:
+            FileNotFoundError: If the `data.pt` file does not exist. Ensure that `prepare_data` and/or
+            `setup` methods are called to generate the necessary dataset files.
+
+        Returns:
+            Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three DataFrames:
+                - Training set
+                - Validation set
+                - Test set
+        """
+        try:
+            filename = self.processed_file_names_dict["data"]
+            data_go = torch.load(
+                os.path.join(self.processed_dir, filename), weights_only=False
+            )
+        except FileNotFoundError:
+            raise FileNotFoundError(
+                f"File data.pt doesn't exist. "
" + f"Please call 'prepare_data' and/or 'setup' methods to generate the dataset files" + ) + + df_go_data = pd.DataFrame(data_go) + train_df_go, df_test = train_test_split( + df_go_data, + train_size=self.train_split, + random_state=self.dynamic_data_split_seed, + ) + + # Get all splits + df_train, df_val = train_test_split( + train_df_go, + train_size=self.train_split, + random_state=self.dynamic_data_split_seed, + ) + + return df_train, df_val, df_test + + # ------------------------------ Phase: Raw Properties ----------------------------------- + @property + def base_dir(self) -> str: + """ + str: The base directory for pretraining data storage. + """ + return os.path.join(self._go_uniprot_extractor.base_dir, "Pretraining") + + @property + def raw_dir(self) -> str: + """Name of the directory where the raw data is stored.""" + return self._go_uniprot_extractor.raw_dir + + +class SwissProteinPretrain(_ProteinPretrainingData): + """ + Data module for Swiss-Prot protein pretraining, inheriting from `_ProteinPretrainingData`. + This class is specifically designed to handle data processing and loading for Swiss-Prot-based protein datasets. + + Attributes: + READER (Type): The data reader class used to load and process protein pretraining data. + """ + + READER = ProteinDataReader + + @property + def _name(self) -> str: + """ + The name identifier for this data module. + + Returns: + str: A string identifier, "SwissProteinPretrain", representing the name of this data module. + """ + return f"Swiss_{self.max_sequence_length}" diff --git a/chebai_proteins/preprocessing/datasets/scope/__init__.py b/chebai_proteins/preprocessing/datasets/scope/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chebai_proteins/preprocessing/datasets/scope/scope.py b/chebai_proteins/preprocessing/datasets/scope/scope.py new file mode 100644 index 0000000..bf3540e --- /dev/null +++ b/chebai_proteins/preprocessing/datasets/scope/scope.py @@ -0,0 +1,974 @@ +# References for this file : + +# Reference 1: +# John-Marc Chandonia, Naomi K Fox, Steven E Brenner, SCOPe: classification of large macromolecular structures +# in the structural classification of proteinsβ€”extended database, Nucleic Acids Research, Volume 47, +# Issue D1, 08 January 2019, Pages D475–D481, https://doi.org/10.1093/nar/gky1134 +# https://scop.berkeley.edu/about/ver=2.08 + +# Reference 2: +# Murzin AG, Brenner SE, Hubbard TJP, Chothia C. 1995. SCOP: a structural classification of proteins database for +# the investigation of sequences and structures. Journal of Molecular Biology 247:536-540 + +import gzip +import os +import re +import shutil +from abc import ABC, abstractmethod +from tempfile import NamedTemporaryFile +from typing import Any, Dict, Generator, List, Optional, Tuple + +import networkx as nx +import pandas as pd +import requests +import torch +from Bio import SeqIO +from chebai.preprocessing.datasets.base import _DynamicDataset + +from chebai_proteins.preprocessing.reader import ESM2EmbeddingReader, ProteinDataReader + + +class _SCOPeDataExtractor(_DynamicDataset, ABC): + """ + A class for extracting and processing data from the SCOPe (Structural Classification of Proteins - extended) dataset. + + This class is designed to handle the parsing, preprocessing, and hierarchical structure extraction from various + SCOPe dataset files, such as classification (CLA), hierarchy (HIE), and description (DES) files. + Additionally, it supports downloading related data like PDB sequence files. 
+ + Args: + scope_version (str): The SCOPe version to use. + scope_version_train (Optional[str]): The training SCOPe version, if different. + dynamic_data_split_seed (int, optional): The seed for random data splitting. Defaults to 42. + splits_file_path (str, optional): Path to the splits CSV file. Defaults to None. + **kwargs: Additional keyword arguments passed to DynamicDataset and XYBaseDataModule. + """ + + # -- Index for columns of processed `data.pkl` (derived from `_graph_to_raw_dataset`) + # "id" at row index 0 + # "sids" at row index 1 + # "sequence" at row index 2 + # labels starting from row index 3 + _ID_IDX: int = 0 + _DATA_REPRESENTATION_IDX: int = 2 # here `sequence` column + _LABELS_START_IDX: int = 3 + + _SCOPE_GENERAL_URL = "https://scop.berkeley.edu/downloads/parse/dir.{data_type}.scope.{version_number}-stable.txt" + _PDB_SEQUENCE_DATA_URL = ( + "https://files.rcsb.org/pub/pdb/derived_data/pdb_seqres.txt.gz" + ) + + SCOPE_HIERARCHY: Dict[str, str] = { + "cl": "class", + "cf": "fold", + "sf": "superfamily", + "fa": "family", + "dm": "protein", + "sp": "species", + "px": "domain", + } + + def __init__( + self, + scope_version: str, + scope_version_train: Optional[str] = None, + max_sequence_len: int = 1000, + **kwargs, + ): + self.scope_version: str = scope_version + self.scope_version_train: str = scope_version_train + self.max_sequence_len: int = max_sequence_len + + super(_SCOPeDataExtractor, self).__init__(**kwargs) + + if self.scope_version_train is not None: + # Instantiate another same class with "scope_version" as "scope_version_train", if train_version is given + # This is to get the data from respective directory related to "scope_version_train" + _init_kwargs = kwargs + _init_kwargs["scope_version"] = self.scope_version_train + self._scope_version_train_obj = self.__class__( + **_init_kwargs, + ) + + @staticmethod + def _get_scope_url(data_type: str, version_number: str) -> str: + """ + Generates the URL for downloading SCOPe files. + + Args: + data_type (str): The type of data (e.g., 'cla', 'hie', 'des'). + version_number (str): The version of the SCOPe file. + + Returns: + str: The formatted SCOPe file URL. + """ + return _SCOPeDataExtractor._SCOPE_GENERAL_URL.format( + data_type=data_type, version_number=version_number + ) + + # ------------------------------ Phase: Prepare data ----------------------------------- + def _download_required_data(self) -> str: + """ + Downloads the required raw data for SCOPe and PDB sequence datasets. + + Returns: + str: Path to the downloaded data. + """ + self._download_pdb_sequence_data() + return self._download_scope_raw_data() + + def _download_pdb_sequence_data(self) -> None: + """ + Downloads and unzips the PDB sequence dataset from the RCSB PDB repository. + + The file is downloaded as a temporary gzip file, which is then extracted to the + specified directory. 
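+
+        Example (minimal sketch of the same download-and-extract pattern; the local
+        file names here are illustrative):
+            >>> import gzip, shutil, requests
+            >>> r = requests.get(
+            ...     "https://files.rcsb.org/pub/pdb/derived_data/pdb_seqres.txt.gz", stream=True
+            ... )
+            >>> with open("pdb_seqres.txt.gz", "wb") as tmp:
+            ...     shutil.copyfileobj(r.raw, tmp)
+            >>> with gzip.open("pdb_seqres.txt.gz", "rb") as f_in, open("pdb_seqres.txt", "wb") as f_out:
+            ...     shutil.copyfileobj(f_in, f_out)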
+ """ + pdb_seq_file_path = os.path.join( + self.scope_root_dir, self.raw_file_names_dict["PDB"] + ) + os.makedirs(os.path.dirname(pdb_seq_file_path), exist_ok=True) + + if not os.path.isfile(pdb_seq_file_path): + print(f"Missing PDB raw data, Downloading PDB sequence data....") + + # Create a temporary file + with NamedTemporaryFile(delete=False) as tf: + temp_filename = tf.name + print(f"Downloading to temporary file {temp_filename}") + + # Download the file + response = requests.get(self._PDB_SEQUENCE_DATA_URL, stream=True) + with open(temp_filename, "wb") as temp_file: + shutil.copyfileobj(response.raw, temp_file) + + print(f"Downloaded to {temp_filename}") + + # Unpack the gzipped file + try: + print(f"Unzipping the file....") + with gzip.open(temp_filename, "rb") as f_in: + output_file_path = pdb_seq_file_path + with open(output_file_path, "wb") as f_out: + shutil.copyfileobj(f_in, f_out) + print(f"Unpacked and saved to {output_file_path}") + + except Exception as e: + print(f"Failed to unpack the file: {e}") + finally: + # Clean up the temporary file + os.remove(temp_filename) + print(f"Removed temporary file {temp_filename}") + + def _download_scope_raw_data(self) -> str: + """ + Downloads the raw SCOPe dataset files (CLA, HIE, DES, and COM). + + Each file is downloaded from the SCOPe repository and saved to the specified directory. + Files are only downloaded if they do not already exist. + + Returns: + str: A dummy path to indicate completion (can be extended for custom behavior). + """ + os.makedirs(self.raw_dir, exist_ok=True) + for data_type in ["CLA", "HIE", "DES"]: + data_file_name = self.raw_file_names_dict[data_type] + scope_path = os.path.join(self.raw_dir, data_file_name) + if not os.path.isfile(scope_path): + print(f"Missing Scope: {data_file_name} raw data, Downloading...") + r = requests.get( + self._get_scope_url(data_type.lower(), self.scope_version), + allow_redirects=False, + verify=False, # Disable SSL verification + ) + r.raise_for_status() # Check if the request was successful + open(scope_path, "wb").write(r.content) + return "dummy/path" + + def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph: + """ + Extracts the class hierarchy from SCOPe data and computes its transitive closure. + + Args: + data_path (str): Path to the processed SCOPe dataset. + + Returns: + nx.DiGraph: A directed acyclic graph representing the SCOPe class hierarchy. 
+ """ + print("Extracting class hierarchy...") + df_scope = self._get_scope_data() + pdb_chain_df = self._parse_pdb_sequence_file() + pdb_id_set = set(pdb_chain_df["pdb_id"]) # Search time complexity - O(1) + + # Initialize sets and dictionaries for storing edges and attributes + parent_node_edges, node_child_edges = set(), set() + node_attrs = {} + px_level_nodes = set() + sequence_nodes = dict() + px_to_seq_edges = set() + required_graph_nodes = set() + + # Create a lookup dictionary for PDB chain sequences + lookup_dict = ( + pdb_chain_df.groupby("pdb_id")[["chain_id", "sequence"]] + .apply(lambda x: dict(zip(x["chain_id"], x["sequence"]))) + .to_dict() + ) + + def add_sequence_nodes_edges(chain_sequence, px_sun_id): + """Adds sequence nodes and edges connecting px-level nodes to sequence nodes.""" + if chain_sequence not in sequence_nodes: + sequence_nodes[chain_sequence] = f"seq_{len(sequence_nodes)}" + px_to_seq_edges.add((px_sun_id, sequence_nodes[chain_sequence])) + + # Step 1: Build the graph structure and store node attributes + for row in df_scope.itertuples(index=False): + if row.level == "px": + + pdb_id, chain_id = row.sid[1:5], row.sid[5] + + if pdb_id not in pdb_id_set or chain_id == "_": + # Don't add domain level nodes that don't have pdb_id in pdb_sequences.txt file + # Also chain_id with "_" which corresponds to no chain + continue + px_level_nodes.add(row.sunid) + + # Add edges between px-level nodes and sequence nodes + if chain_id != ".": + if chain_id not in lookup_dict[pdb_id]: + continue + add_sequence_nodes_edges(lookup_dict[pdb_id][chain_id], row.sunid) + else: + # If chain_id is '.', connect all chains of this PDB ID + for chain, chain_sequence in lookup_dict[pdb_id].items(): + add_sequence_nodes_edges(chain_sequence, row.sunid) + else: + required_graph_nodes.add(row.sunid) + + node_attrs[row.sunid] = {"sid": row.sid, "level": row.level} + + if row.parent_sunid != -1: + parent_node_edges.add((row.parent_sunid, row.sunid)) + + for child_id in row.children_sunids: + node_child_edges.add((row.sunid, child_id)) + + del df_scope, pdb_chain_df, pdb_id_set + + g = nx.DiGraph() + g.add_nodes_from(node_attrs.items()) + # Note - `add_edges` internally create a node, if a node doesn't exist already + g.add_edges_from({(p, c) for p, c in parent_node_edges if p in node_attrs}) + g.add_edges_from({(p, c) for p, c in node_child_edges if c in node_attrs}) + + seq_nodes = set(sequence_nodes.values()) + g.add_nodes_from([(seq_id, {"level": "sequence"}) for seq_id in seq_nodes]) + g.add_edges_from( + { + (px_node, seq_node) + for px_node, seq_node in px_to_seq_edges + if px_node in node_attrs and seq_node in seq_nodes + } + ) + + # Step 2: Count sequence successors for required graph nodes only + for node in required_graph_nodes: + num_seq_successors = sum( + g.nodes[child]["level"] == "sequence" + for child in nx.descendants(g, node) + ) + g.nodes[node]["num_seq_successors"] = num_seq_successors + + # Step 3: Remove nodes which are not required before computing transitive closure for better efficiency + g.remove_nodes_from(px_level_nodes | seq_nodes) + + print("Computing Transitive Closure.........") + # Transitive closure is not needed in `select_classes` method but is required in _SCOPeOverXPartial + return nx.transitive_closure_dag(g) + + def _get_scope_data(self) -> pd.DataFrame: + """ + Merges and preprocesses the SCOPe classification, hierarchy, and description files into a unified DataFrame. 
+ + Returns: + pd.DataFrame: A DataFrame containing combined SCOPe data with classification and hierarchy details. + """ + df_cla = self._get_classification_data() + df_hie = self._get_hierarchy_data() + df_des = self._get_node_description_data() + df_hie_with_cla = pd.merge(df_hie, df_cla, how="left", on="sunid") + df_all = pd.merge( + df_hie_with_cla, + df_des.drop(columns=["sid"], axis=1), + how="left", + on="sunid", + ) + return df_all + + def _get_classification_data(self) -> pd.DataFrame: + """ + Parses and processes the SCOPe CLA (classification) file. + + Returns: + pd.DataFrame: A DataFrame containing classification details, including hierarchy levels. + """ + df_cla = pd.read_csv( + os.path.join(self.raw_dir, self.raw_file_names_dict["CLA"]), + sep="\t", + header=None, + comment="#", + ) + df_cla.columns = [ + "sid", + "PDB_ID", + "description", + "sccs", + "sunid", + "hie_levels", + ] + + # Convert to dict - {cl:46456, cf:46457, sf:46458, fa:46459, dm:46460, sp:116748, px:113449} + df_cla["hie_levels"] = df_cla["hie_levels"].apply( + lambda x: {k: int(v) for k, v in (item.split("=") for item in x.split(","))} + ) + + # Split ancestor_nodes into separate columns and assign values + for key in self.SCOPE_HIERARCHY.keys(): + df_cla[self.SCOPE_HIERARCHY[key]] = df_cla["hie_levels"].apply( + lambda x: x[key] + ) + + df_cla["sunid"] = df_cla["sunid"].astype("int64") + + return df_cla + + def _get_hierarchy_data(self) -> pd.DataFrame: + """ + Parses and processes the SCOPe HIE (hierarchy) file. + + Returns: + pd.DataFrame: A DataFrame containing hierarchy details, including parent-child relationships. + """ + df_hie = pd.read_csv( + os.path.join(self.raw_dir, self.raw_file_names_dict["HIE"]), + sep="\t", + header=None, + comment="#", + low_memory=False, + ) + df_hie.columns = ["sunid", "parent_sunid", "children_sunids"] + + # if not parent id, then insert -1 + df_hie["parent_sunid"] = df_hie["parent_sunid"].replace("-", -1).astype(int) + # convert children ids to list of ids + df_hie["children_sunids"] = df_hie["children_sunids"].apply( + lambda x: list(map(int, x.split(","))) if x != "-" else [] + ) + + # Ensure the 'sunid' column in both DataFrames has the same type + df_hie["sunid"] = df_hie["sunid"].astype("int64") + return df_hie + + def _get_node_description_data(self) -> pd.DataFrame: + """ + Parses and processes the SCOPe DES (description) file. + + Returns: + pd.DataFrame: A DataFrame containing node-level descriptions from the SCOPe dataset. + """ + df_des = pd.read_csv( + os.path.join(self.raw_dir, self.raw_file_names_dict["DES"]), + sep="\t", + header=None, + comment="#", + low_memory=False, + ) + df_des.columns = ["sunid", "level", "scss", "sid", "description"] + df_des.loc[len(df_des)] = {"sunid": 0, "level": "root"} + + # Ensure the 'sunid' column in both DataFrames has the same type + df_des["sunid"] = df_des["sunid"].astype("int64") + return df_des + + def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame: + """ + Processes a directed acyclic graph (DAG) to generate a raw dataset in DataFrame format. This dataset includes + chain-level sequences and their corresponding labels based on the hierarchical structure of the associated domains. + + The process: + - Extracts SCOPe domain identifiers (sids) from the graph. + - Retrieves class labels for each domain based on all applicable taxonomy levels. + - Fetches the chain-level sequences from the Protein Data Bank (PDB) for each domain. 
+          - For each sequence, identifies all domains associated with the same chain and assigns their corresponding labels.
+
+        Notes:
+            - SCOPe hierarchy levels are used as labels, with each level represented by a column. The value in each column
+              indicates whether a PDB chain is associated with that particular hierarchy level.
+            - PDB chains are treated as samples. The method considers only domains that are mapped to the selected hierarchy levels.
+
+        Data Format: pd.DataFrame
+            - Column 0 : id (Unique identifier for each sequence entry)
+            - Column 1 : sids (List of domain identifiers associated with the sequence)
+            - Column 2 : sequence (Amino acid sequence of the chain)
+            - Column 3 to Column "n": Each column corresponds to a SCOPe class hierarchy level with a value
+              of True/False indicating whether the chain is associated with the corresponding level.
+
+        Args:
+            graph (nx.DiGraph): The class hierarchy graph.
+
+        Returns:
+            pd.DataFrame: The raw dataset created from the graph.
+
+        Raises:
+            RuntimeError: If no sunids are selected.
+        """
+        print("Processing graph...")
+
+        selected_sun_ids_per_lvl = self.select_classes(graph)
+
+        if not selected_sun_ids_per_lvl:
+            raise RuntimeError("No sunid selected.")
+
+        df_cla = self._get_classification_data()
+        hierarchy_levels = list(self.SCOPE_HIERARCHY.values())
+        hierarchy_levels.remove("domain")
+
+        df_cla = df_cla[["sid", "sunid"] + hierarchy_levels]
+
+        # Initialize selected target columns
+        df_encoded = df_cla[["sid", "sunid"]].copy()
+
+        # Collect all new columns in a dictionary first (avoids fragmentation)
+        encoded_df_columns = {}
+
+        lvl_to_target_cols_mapping = {}
+        # Iterate over only the selected sun_ids (nodes) to one-hot encode them
+        for level, selected_sun_ids in selected_sun_ids_per_lvl.items():
+            level_column = self.SCOPE_HIERARCHY[level]
+            if level_column in df_cla.columns:
+                # Create binary encoding for only relevant sun_ids
+                for sun_id in selected_sun_ids:
+                    col_name = f"{level_column}_{sun_id}"
+                    encoded_df_columns[col_name] = (
+                        df_cla[level_column] == sun_id
+                    ).astype(bool)
+
+                    lvl_to_target_cols_mapping.setdefault(level_column, []).append(
+                        col_name
+                    )
+
+        # Convert the dictionary into a DataFrame and concatenate at once (prevents fragmentation)
+        df_encoded = pd.concat([df_encoded, pd.DataFrame(encoded_df_columns)], axis=1)
+
+        encoded_target_columns = []
+        for level in hierarchy_levels:
+            if level in lvl_to_target_cols_mapping:
+                encoded_target_columns.extend(lvl_to_target_cols_mapping[level])
+
+        print(
+            f"{len(encoded_target_columns)} labels have been selected for the specified threshold."
+        )
+        print("Constructing data.pkl file .....")
+
+        df_encoded = df_encoded[["sid", "sunid"] + encoded_target_columns]
+
+        # Keep only domains that map to at least one selected sunid at any level
+        df_encoded = df_encoded[df_encoded.iloc[:, 2:].any(axis=1)]
+
+        df_encoded["pdb_id"] = df_encoded["sid"].str[1:5]
+        df_encoded["chain_id"] = df_encoded["sid"].str[5]
+
+        # "_" (underscore) means it has no chain
+        df_encoded = df_encoded[df_encoded["chain_id"] != "_"]
+
+        pdb_chain_df = self._parse_pdb_sequence_file()
+
+        # Handle chain_id == "." - the multiple-chain case
+        # Split df_encoded into two: One for specific chains, one for "multiple chains" (".")
+        df_specific_chains = df_encoded[df_encoded["chain_id"] != "."]
+        df_multiple_chains = df_encoded[df_encoded["chain_id"] == "."].drop(
+            columns=["chain_id"]
+        )
+
+        # Merge specific chains normally
+        merged_specific = df_specific_chains.merge(
+            pdb_chain_df, on=["pdb_id", "chain_id"], how="left"
+        )
+
+        # Merge all chains case -> Join by pdb_id (not chain_id)
+        merged_all_chains = df_multiple_chains.merge(
+            pdb_chain_df, on="pdb_id", how="left"
+        )
+
+        # Combine both cases
+        sequence_hierarchy_df = pd.concat(
+            [merged_specific, merged_all_chains], ignore_index=True
+        ).dropna(subset=["sequence"])
+
+        # Vectorized aggregation instead of row-wise updates
+        sequence_hierarchy_df = (
+            sequence_hierarchy_df.groupby("sequence", as_index=False)
+            .agg(
+                {
+                    "sid": list,  # Collect all SIDs per sequence
+                    **{
+                        col: "max" for col in encoded_target_columns
+                    },  # Max works as Bitwise OR for labels
+                }
+            )
+            .rename(columns={"sid": "sids"})  # Rename for clarity
+        )
+
+        sequence_hierarchy_df = sequence_hierarchy_df.assign(
+            id=range(1, len(sequence_hierarchy_df) + 1)
+        )[["id", "sids", "sequence"] + encoded_target_columns]
+
+        # Ensure at least one label is true for each protein sequence
+        sequence_hierarchy_df = sequence_hierarchy_df[
+            sequence_hierarchy_df.iloc[:, self._LABELS_START_IDX :].any(axis=1)
+        ]
+
+        with open(os.path.join(self.processed_dir_main, "classes.txt"), "wt") as fout:
+            fout.writelines(str(sun_id) + "\n" for sun_id in encoded_target_columns)
+
+        return sequence_hierarchy_df
+
+    def _parse_pdb_sequence_file(self) -> pd.DataFrame:
+        """
+        Parses the PDB sequence file and returns a DataFrame containing PDB IDs, chain IDs, and sequences.
+
+        Returns:
+            pd.DataFrame: A DataFrame with columns ["pdb_id", "chain_id", "sequence"].
+        """
+        records = []
+        valid_amino_acids = "".join(ProteinDataReader.AA_LETTER)
+
+        for record in SeqIO.parse(
+            os.path.join(self.scope_root_dir, self.raw_file_names_dict["PDB"]), "fasta"
+        ):
+
+            if not record.seq or len(record.seq) > self.max_sequence_len:
+                continue
+
+            pdb_id, chain = record.id.split("_")
+            sequence = re.sub(f"[^{valid_amino_acids}]", "X", str(record.seq))
+
+            # Store as a dictionary entry (list of dicts -> DataFrame later)
+            records.append(
+                {
+                    "pdb_id": pdb_id.lower(),
+                    "chain_id": chain.lower(),
+                    "sequence": sequence,
+                }
+            )
+
+        # Convert list of dictionaries to a DataFrame
+        pdb_chain_df = pd.DataFrame.from_records(records)
+
+        return pdb_chain_df
+
+    @abstractmethod
+    def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> Dict[str, List[int]]:
+        # Override the return type of the method from superclass
+        pass
+
+    # ------------------------------ Phase: Setup data -----------------------------------
+    def setup_processed(self) -> None:
+        """
+        Transform and prepare processed data for the SCOPe dataset.
+
+        The main function of this method is to transform `data.pkl` into a model input data format (`data.pt`),
+        ensuring that the data is in a format compatible with model input.
+        The transformed data must contain the following keys: `ident`, `features`, `labels`, and `group`.
+        This method uses a subclass of Data Reader to perform the transformation.
+
+        It will also transform the data related to `scope_version_train`, if specified.
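+
+        Example (typical call order, sketched; the class and version strings are illustrative):
+            >>> ds = SCOPeOver2000(scope_version="2.08", scope_version_train="2.07")
+            >>> ds.prepare_data()  # builds data.pkl
+            >>> ds.setup()         # builds data.pt, for both versions if a train version is set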
+ """ + super().setup_processed() + + # Transform the data related to "scope_version_train" to encoded data, if it doesn't exist + if self.scope_version_train is not None and not os.path.isfile( + os.path.join( + self._scope_version_train_obj.processed_dir, + self._scope_version_train_obj.processed_file_names_dict["data"], + ) + ): + print( + f"Missing encoded data related to train version: {self.scope_version_train}" + ) + print("Calling the setup method related to it") + self._scope_version_train_obj.setup() + + def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]: + """ + Loads data from a pickled file and yields individual dictionaries for each row. + + The pickled file is expected to contain rows with the following structure: + - Data at row index `self._ID_IDX`: ID of go data instance + - Data at row index `self._DATA_REPRESENTATION_IDX`: Sequence representation of protein + - Data from row index `self._LABELS_START_IDX` onwards: Labels + + This method is used by `_load_data_from_file` to generate dictionaries that are then + processed and converted into a list of dictionaries containing the features and labels. + + Args: + input_file_path (str): The path to the pickled input file. + + Yields: + Dict[str, Any]: A dictionary containing: + - `features` (str): The sequence data from the file. + - `labels` (np.ndarray): A boolean array of labels starting from row index 4. + - `ident` (Any): The identifier from row index 0. + """ + with open(input_file_path, "rb") as input_file: + df = pd.read_pickle(input_file) + for row in df.values: + labels = row[self._LABELS_START_IDX :].astype(bool) + # chebai.preprocessing.reader.DataReader only needs features, labels, ident, group + # "group" set to None, by default as no such entity for this data + yield dict( + features=row[self._DATA_REPRESENTATION_IDX], + labels=labels, + ident=row[self._ID_IDX], + ) + + # ------------------------------ Phase: Dynamic Splits ----------------------------------- + def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """ + Loads encoded/transformed data and generates training, validation, and test splits. + + This method first loads encoded data from a file named `data.pt`, which is derived from either + `scope_version` or `scope_version_train`. It then splits the data into training, validation, and test sets. + + If `scope_version_train` is provided: + - Loads additional encoded data from `scope_version_train`. + - Splits this data into training and validation sets, while using the test set from `scope_version`. + - Prunes the test set from `scope_version` to include only labels that exist in `scope_version_train`. + + If `scope_version_train` is not provided: + - Splits the data from `scope_version` into training, validation, and test sets without modification. + + Raises: + FileNotFoundError: If the required `data.pt` file(s) do not exist. Ensure that `prepare_data` + and/or `setup` methods have been called to generate the dataset files. + + Returns: + Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing three DataFrames: + - Training set + - Validation set + - Test set + """ + try: + filename = self.processed_file_names_dict["data"] + data_scope_version = torch.load( + os.path.join(self.processed_dir, filename), weights_only=False + ) + except FileNotFoundError: + raise FileNotFoundError( + f"File data.pt doesn't exists. 
" + f"Please call 'prepare_data' and/or 'setup' methods to generate the dataset files" + ) + + df_scope_version = pd.DataFrame(data_scope_version) + train_df_scope_ver, df_test_scope_ver = self.get_test_split( + df_scope_version, seed=self.dynamic_data_split_seed + ) + + if self.scope_version_train is not None: + # Load encoded data derived from "scope_version_train" + try: + filename_train = ( + self._scope_version_train_obj.processed_file_names_dict["data"] + ) + data_scope_train_version = torch.load( + os.path.join( + self._scope_version_train_obj.processed_dir, filename_train + ), + weights_only=False, + ) + except FileNotFoundError: + raise FileNotFoundError( + f"File data.pt doesn't exists related to scope_version_train {self.scope_version_train}." + f"Please call 'prepare_data' and/or 'setup' methods to generate the dataset files" + ) + + df_scope_train_version = pd.DataFrame(data_scope_train_version) + # Get train/val split of data based on "scope_version_train", but + # using test set from "scope_version" + df_train, df_val = self.get_train_val_splits_given_test( + df_scope_train_version, + df_test_scope_ver, + seed=self.dynamic_data_split_seed, + ) + # Modify test set from "scope_version" to only include the labels that + # exists in "scope_version_train", all other entries remains same. + df_test = self._setup_pruned_test_set(df_test_scope_ver) + else: + # Get all splits based on "scope_version" + df_train, df_val = self.get_train_val_splits_given_test( + train_df_scope_ver, + df_test_scope_ver, + seed=self.dynamic_data_split_seed, + ) + df_test = df_test_scope_ver + + return df_train, df_val, df_test + + def _setup_pruned_test_set( + self, df_test_scope_version: pd.DataFrame + ) -> pd.DataFrame: + """ + Create a test set with the same leaf nodes, but use only classes that appear in the training set. + + Args: + df_test_scope_version (pd.DataFrame): The test dataset. + + Returns: + pd.DataFrame: The pruned test dataset. 
+ """ + # TODO: find a more efficient way to do this + filename_old = "classes.txt" + + # Load original classes (from the current SCOPe version - scope_version) + with open(os.path.join(self.processed_dir_main, filename_old), "r") as file: + orig_classes = file.readlines() + + # Load new classes (from the training SCOPe version - scope_version_train) + with open( + os.path.join( + self._scope_version_train_obj.processed_dir_main, filename_old + ), + "r", + ) as file: + new_classes = file.readlines() + + # Create a mapping which give index of a class from scope_version, if the corresponding + # class exists in scope_version_train, Size = Number of classes in scope_version + mapping = [ + None if or_class not in new_classes else new_classes.index(or_class) + for or_class in orig_classes + ] + + # Iterate over each data instance in the test set which is derived from scope_version + for _, row in df_test_scope_version.iterrows(): + # Size = Number of classes in scope_version_train + new_labels = [False for _ in new_classes] + for ind, label in enumerate(row["labels"]): + # If the scope_version class exists in the scope_version_train and has a True label, + # set the corresponding label in new_labels to True + if mapping[ind] is not None and label: + new_labels[mapping[ind]] = label + # Update the labels from test instance of scope_version to new labels, which are compatible to both versions + row["labels"] = new_labels + + return df_test_scope_version + + # ------------------------------ Phase: Raw Properties ----------------------------------- + @property + def scope_root_dir(self) -> str: + """ + Returns the root directory of scope data. + + Returns: + str: The path to the base directory, which is "data/GO_UniProt". + """ + return os.path.join("data", "SCOPe") + + @property + def base_dir(self) -> str: + """ + Returns the base directory path for storing SCOPe data. + + Returns: + str: The path to the base directory, which is "data/GO_UniProt". + """ + return os.path.join(self.scope_root_dir, f"version_{self.scope_version}") + + @property + def raw_file_names_dict(self) -> dict: + """ + Returns a dictionary of raw file names used in data processing. + + Returns: + dict: A dictionary mapping dataset names to their respective file names. + """ + return { + "CLA": "cla.txt", + "DES": "des.txt", + "HIE": "hie.txt", + "PDB": "pdb_sequences.txt", + } + + +class _SCOPeOverX(_SCOPeDataExtractor, ABC): + """ + A class for extracting data from the SCOPe dataset with a threshold for selecting classes/labels based on + the number of subclasses. + + This class is designed to filter SCOPe classes/labels based on a specified threshold, selecting only those classes + which have a certain number of subclasses in the hierarchy. + + Attributes: + READER (dr.ProteinDataReader): The reader used for reading the dataset. + THRESHOLD (int): The threshold for selecting classes/labels based on the number of subclasses. + + """ + + READER = ProteinDataReader + THRESHOLD: int = None + + @property + def _name(self) -> str: + """ + Returns the name of the dataset. + + Returns: + str: The dataset name, formatted with the current threshold. + """ + return f"SCOPe{self.THRESHOLD}" + + def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> Dict[str, List[int]]: + """ + Selects classes from the SCOPe dataset based on the number of successors meeting a specified threshold. + + This method iterates over the nodes in the graph, counting the number of successors for each node. 
+ Nodes with a number of successors greater than or equal to the defined threshold are selected. + + Note: + The input graph must be transitive closure of a directed acyclic graph. + + Args: + g (nx.Graph): The graph representing the dataset. + *args: Additional positional arguments (not used). + **kwargs: Additional keyword arguments (not used). + + Returns: + Dict: A dict containing selected nodes at each hierarchy level. + + Notes: + - The `THRESHOLD` attribute should be defined in the subclass of this class. + """ + selected_sunids_for_level = {} + for node, attr_dict in g.nodes(data=True): + if attr_dict["level"] in {"root", "px", "sequence"}: + # Skip nodes with level "root", "px", or "sequence" + continue + + # Check if the number of "sequence"-level successors meets or exceeds the threshold + if g.nodes[node]["num_seq_successors"] >= self.THRESHOLD: + selected_sunids_for_level.setdefault(attr_dict["level"], []).append( + node + ) + return selected_sunids_for_level + + +class _SCOPeOverXPartial(_SCOPeOverX, ABC): + """ + Dataset that doesn't use the full SCOPe dataset, but extracts a part of SCOPe (subclasses of a given top class) + + Attributes: + top_class_sunid (int): The Sun-ID of the top class from which to extract subclasses. + """ + + def __init__(self, top_class_sunid: int, **kwargs): + """ + Initializes the _SCOPeOverXPartial dataset. + + Args: + top_class_sunid (int): The Sun-ID of the top class from which to extract subclasses. + **kwargs: Additional keyword arguments passed to the superclass initializer. + """ + if "top_class_sunid" not in kwargs: + kwargs["top_class_sunid"] = top_class_sunid + + self.top_class_sunid: int = top_class_sunid + super().__init__(**kwargs) + + @property + def processed_dir_main(self) -> str: + """ + Returns the main processed data directory specific to the top class. + + Returns: + str: The processed data directory path. + """ + return os.path.join( + self.base_dir, + self._name, + f"partial_{self.top_class_sunid}", + "processed", + ) + + def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph: + """ + Extracts a subset of SCOPe based on subclasses of the top class ID. + + This method calls the superclass method to extract the full class hierarchy, + then extracts the subgraph containing only the descendants of the top class ID, including itself. + + Args: + data_path (str): The file path to the SCOPe ontology file. + + Returns: + nx.DiGraph: The extracted class hierarchy as a directed graph, limited to the + descendants of the top class ID. + """ + g = super()._extract_class_hierarchy(data_path) + g = g.subgraph( + list(g.successors(self.top_class_sunid)) + [self.top_class_sunid] + ) + return g + + +class SCOPeOver2000(_SCOPeOverX): + """ + A class for extracting data from the SCOPe dataset with a threshold of 2000 for selecting classes. + + Inherits from `_SCOPeOverX` and sets the threshold for selecting classes to 2000. + + Attributes: + THRESHOLD (int): The threshold for selecting classes (2000). + """ + + THRESHOLD: int = 2000 + + +class SCOPeOver50(_SCOPeOverX): + + THRESHOLD = 50 + + +class SCOPeOverPartial2000(_SCOPeOverXPartial): + """ + A class for extracting data from the SCOPe dataset with a threshold of 2000 for selecting classes. + + Inherits from `_SCOPeOverXPartial` and sets the threshold for selecting classes to 2000. + + Attributes: + THRESHOLD (int): The threshold for selecting classes (2000). 
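+
+    Example (illustrative; the top-class sunid is hypothetical, and the check assumes POSIX paths):
+        >>> ds = SCOPeOverPartial2000(scope_version="2.08", top_class_sunid=46456)
+        >>> ds.processed_dir_main.endswith("partial_46456/processed")
+        True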
+ """ + + THRESHOLD: int = 2000 + + +class SCOPeOver50ESM(SCOPeOver50): + READER = ESM2EmbeddingReader + + +if __name__ == "__main__": + scope = SCOPeOver50(scope_version="2.08") + + # g = scope._extract_class_hierarchy("dummy/path") + # # Save graph + # import pickle + # with open("graph.gpickle", "wb") as f: + # pickle.dump(g, f) + + # Load graph + import pickle + + with open("graph.gpickle", "rb") as f: + g = pickle.load(f) + + # print(len([node for node in g.nodes() if g.out_degree(node) > 10000])) + scope._graph_to_raw_dataset(g) diff --git a/chebai_proteins/preprocessing/migration/__init__.py b/chebai_proteins/preprocessing/migration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chebai_proteins/preprocessing/migration/deep_go/__init__.py b/chebai_proteins/preprocessing/migration/deep_go/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/chebai_proteins/preprocessing/migration/deep_go/migrate_deep_go_1_data.py b/chebai_proteins/preprocessing/migration/deep_go/migrate_deep_go_1_data.py new file mode 100644 index 0000000..fb5beb4 --- /dev/null +++ b/chebai_proteins/preprocessing/migration/deep_go/migrate_deep_go_1_data.py @@ -0,0 +1,316 @@ +import os +from collections import OrderedDict +from typing import List, Literal, Optional, Tuple + +import pandas as pd +from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit +from jsonargparse import CLI + +from chebai_proteins.preprocessing.datasets.deepGO.go_uniprot import DeepGO1MigratedData + + +class DeepGo1DataMigration: + """ + A class to handle data migration and processing for the DeepGO project. + It migrates the DeepGO data to our data structure followed for GO-UniProt data. + + This class handles migration of data from the DeepGO paper below: + Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf, + DeepGO: predicting protein functions from sequence and interactions using a deep ontology-aware classifier, + Bioinformatics, Volume 34, Issue 4, February 2018, Pages 660–668 + (https://doi.org/10.1093/bioinformatics/btx624). + """ + + # Max sequence length as per DeepGO1 + _MAXLEN = 1002 + _LABELS_START_IDX = DeepGO1MigratedData._LABELS_START_IDX + + def __init__(self, data_dir: str, go_branch: Literal["cc", "mf", "bp"]): + """ + Initializes the data migration object with a data directory and GO branch. + + Args: + data_dir (str): Directory containing the data files. + go_branch (Literal["cc", "mf", "bp"]): GO branch to use. + """ + valid_go_branches = list(DeepGO1MigratedData.GO_BRANCH_MAPPING.keys()) + if go_branch not in valid_go_branches: + raise ValueError(f"go_branch must be one of {valid_go_branches}") + self._go_branch = go_branch + + self._data_dir: str = rf"{data_dir}" + self._train_df: Optional[pd.DataFrame] = None + self._test_df: Optional[pd.DataFrame] = None + self._validation_df: Optional[pd.DataFrame] = None + self._terms_df: Optional[pd.DataFrame] = None + self._classes: Optional[List[str]] = None + + def migrate(self) -> None: + """ + Executes the data migration by loading, processing, and saving the data. + """ + print("Starting the migration process...") + self._load_data() + if not all( + df is not None + for df in [ + self._train_df, + self._validation_df, + self._test_df, + self._terms_df, + ] + ): + raise Exception( + "Data splits or terms data is not available in instance variables." 
+ ) + splits_df = self._record_splits() + data_with_labels_df = self._extract_required_data_from_splits() + + if not all( + var is not None for var in [data_with_labels_df, splits_df, self._classes] + ): + raise Exception( + "Data splits or terms data is not available in instance variables." + ) + + self.save_migrated_data(data_with_labels_df, splits_df) + + def _load_data(self) -> None: + """ + Loads the test, train, validation, and terms data from the pickled files + in the data directory. + """ + try: + print(f"Loading data files from directory: {self._data_dir}") + self._test_df = pd.DataFrame( + pd.read_pickle( + os.path.join(self._data_dir, f"test-{self._go_branch}.pkl") + ) + ) + + # DeepGO 1 lacks a validation split, so we will create one by further splitting the training set. + # Although this reduces the training data slightly compared to the original DeepGO setup, + # given the data size, the impact should be minimal. + train_df = pd.DataFrame( + pd.read_pickle( + os.path.join(self._data_dir, f"train-{self._go_branch}.pkl") + ) + ) + + self._train_df, self._validation_df = self._get_train_val_split(train_df) + + self._terms_df = pd.DataFrame( + pd.read_pickle(os.path.join(self._data_dir, f"{self._go_branch}.pkl")) + ) + + except FileNotFoundError as e: + raise FileNotFoundError( + f"Data file not found in directory: {e}. " + "Please ensure all required files are available in the specified directory." + ) + + @staticmethod + def _get_train_val_split( + train_df: pd.DataFrame, + ) -> Tuple[pd.DataFrame, pd.DataFrame]: + """ + Splits the training data into a smaller training set and a validation set. + + Args: + train_df (pd.DataFrame): Original training DataFrame. + + Returns: + Tuple[pd.DataFrame, pd.DataFrame]: Training and validation DataFrames. + """ + labels_list_train = train_df["labels"].tolist() + train_split = 0.85 + test_size = ((1 - train_split) ** 2) / train_split + + splitter = MultilabelStratifiedShuffleSplit( + n_splits=1, test_size=test_size, random_state=42 + ) + + train_indices, validation_indices = next( + splitter.split(labels_list_train, labels_list_train) + ) + + df_validation = train_df.iloc[validation_indices] + df_train = train_df.iloc[train_indices] + return df_train, df_validation + + def _record_splits(self) -> pd.DataFrame: + """ + Creates a DataFrame that stores the IDs and their corresponding data splits. + + Returns: + pd.DataFrame: A combined DataFrame containing split assignments. + """ + print("Recording data splits for train, validation, and test sets.") + split_assignment_list: List[pd.DataFrame] = [ + pd.DataFrame({"id": self._train_df["proteins"], "split": "train"}), + pd.DataFrame( + {"id": self._validation_df["proteins"], "split": "validation"} + ), + pd.DataFrame({"id": self._test_df["proteins"], "split": "test"}), + ] + + combined_split_assignment = pd.concat(split_assignment_list, ignore_index=True) + return combined_split_assignment + + def _extract_required_data_from_splits(self) -> pd.DataFrame: + """ + Extracts required columns from the combined data splits. + + Returns: + pd.DataFrame: A DataFrame containing the essential columns for processing. 
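+
+        Example:
+            Illustrative layout of the returned frame (one row per protein,
+            plus one label column per selected GO term, appended via
+            `_get_labels_columns`; actual GO IDs depend on the terms file):
+
+                swiss_id | accession | go_ids | sequence | 4 | 5 | ...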
+ """ + print("Combining data splits into a single DataFrame with required columns.") + required_columns = [ + "proteins", + "accessions", + "sequences", + "gos", + "labels", + ] + + new_df = pd.concat( + [ + self._train_df[required_columns], + self._validation_df[required_columns], + self._test_df[required_columns], + ], + ignore_index=True, + ) + new_df["go_ids"] = new_df.apply( + lambda row: self.extract_go_id(row["gos"]), axis=1 + ) + + labels_df = self._get_labels_columns(new_df) + + data_df = pd.DataFrame( + OrderedDict( + swiss_id=new_df["proteins"], + accession=new_df["accessions"], + go_ids=new_df["go_ids"], + sequence=new_df["sequences"], + ) + ) + + df = pd.concat([data_df, labels_df], axis=1) + + return df + + @staticmethod + def extract_go_id(go_list: List[str]) -> List[int]: + """ + Extracts and parses GO IDs from a list of GO annotations. + + Args: + go_list (List[str]): List of GO annotation strings. + + Returns: + List[int]: List of parsed GO IDs. + """ + return [DeepGO1MigratedData._parse_go_id(go_id_str) for go_id_str in go_list] + + def _get_labels_columns(self, data_df: pd.DataFrame) -> pd.DataFrame: + """ + Generates columns for labels based on provided selected terms. + + Args: + data_df (pd.DataFrame): DataFrame with GO annotations and labels. + + Returns: + pd.DataFrame: DataFrame with label columns. + """ + print("Generating label columns from provided selected terms.") + parsed_go_ids: pd.Series = self._terms_df["functions"].apply( + lambda gos: DeepGO1MigratedData._parse_go_id(gos) + ) + all_go_ids_list = parsed_go_ids.values.tolist() + self._classes = all_go_ids_list + + new_label_columns = pd.DataFrame( + data_df["labels"].tolist(), index=data_df.index, columns=all_go_ids_list + ) + + return new_label_columns + + def save_migrated_data( + self, data_df: pd.DataFrame, splits_df: pd.DataFrame + ) -> None: + """ + Saves the processed data and split information. + + Args: + data_df (pd.DataFrame): Data with GO labels. + splits_df (pd.DataFrame): Split assignment DataFrame. + """ + print("Saving transformed data files.") + + deepgo_migr_inst: DeepGO1MigratedData = DeepGO1MigratedData( + go_branch=DeepGO1MigratedData.GO_BRANCH_MAPPING[self._go_branch], + max_sequence_length=self._MAXLEN, + ) + + # Save data file + deepgo_migr_inst.save_processed( + data_df, deepgo_migr_inst.processed_main_file_names_dict["data"] + ) + print( + f"{deepgo_migr_inst.processed_main_file_names_dict['data']} saved to {deepgo_migr_inst.processed_dir_main}" + ) + + # Save splits file + splits_df.to_csv( + os.path.join(deepgo_migr_inst.processed_dir_main, "splits_deep_go1.csv"), + index=False, + ) + print(f"splits_deep_go1.csv saved to {deepgo_migr_inst.processed_dir_main}") + + # Save classes file + classes = sorted(self._classes) + with open( + os.path.join(deepgo_migr_inst.processed_dir_main, "classes_deep_go1.txt"), + "wt", + ) as fout: + fout.writelines(str(node) + "\n" for node in classes) + print(f"classes_deep_go1.txt saved to {deepgo_migr_inst.processed_dir_main}") + + print("Migration process completed!") + + +class Main: + """ + Main class to handle the migration process for DeepGo1DataMigration. + + Methods: + migrate(data_dir: str, go_branch: Literal["cc", "mf", "bp"]): + Initiates the migration process for the specified data directory and GO branch. + """ + + @staticmethod + def migrate(data_dir: str, go_branch: Literal["cc", "mf", "bp"]) -> None: + """ + Initiates the migration process by creating a DeepGoDataMigration instance + and invoking its migrate method. 
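+
+        For example (the directory is a placeholder), migrating the
+        molecular-function branch amounts to:
+
+            DeepGo1DataMigration("data/deep_go1", "mf").migrate()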
+ + Args: + data_dir (str): Directory containing the data files. + go_branch (Literal["cc", "mf", "bp"]): GO branch to use + ("cc" for cellular_component, + "mf" for molecular_function, + or "bp" for biological_process). + """ + DeepGo1DataMigration(data_dir, go_branch).migrate() + + +if __name__ == "__main__": + # Example: python script_name.py migrate --data_dir="data/deep_go1" --go_branch="mf" + # --data_dir specifies the directory containing the data files. + # --go_branch specifies the GO branch (cc, mf, or bp) you want to use for the migration. + CLI( + Main, + description="DeepGo1DataMigration CLI tool to handle migration of GO data for specified branches (cc, mf, bp).", + as_positional=False, + ) diff --git a/chebai_proteins/preprocessing/migration/deep_go/migrate_deep_go_2_data.py b/chebai_proteins/preprocessing/migration/deep_go/migrate_deep_go_2_data.py new file mode 100644 index 0000000..01d9b3b --- /dev/null +++ b/chebai_proteins/preprocessing/migration/deep_go/migrate_deep_go_2_data.py @@ -0,0 +1,366 @@ +import os +import re +from collections import OrderedDict +from typing import List, Literal, Optional + +import pandas as pd +from jsonargparse import CLI + +from chebai_proteins.preprocessing.datasets.deepGO.go_uniprot import DeepGO2MigratedData +from chebai_proteins.preprocessing.reader import ProteinDataReader + + +class DeepGo2DataMigration: + """ + A class to handle data migration and processing for the DeepGO project. It migrates the data from the DeepGO-SE + data structure to our data structure followed for GO-UniProt data. + + This class handles migration of data from the DeepGO paper below: + Maxat Kulmanov, Mohammed Asif Khan, Robert Hoehndorf, + DeepGO: predicting protein functions from sequence and interactions using a deep ontology-aware classifier, + Bioinformatics, Volume 34, Issue 4, February 2018, Pages 660–668 + (https://doi.org/10.1093/bioinformatics/btx624) + """ + + _LABELS_START_IDX = DeepGO2MigratedData._LABELS_START_IDX + + def __init__( + self, data_dir: str, go_branch: Literal["cc", "mf", "bp"], max_len: int = 1000 + ): + """ + Initializes the data migration object with a data directory and GO branch. + + Args: + data_dir (str): Directory containing the data files. + go_branch (Literal["cc", "mf", "bp"]): GO branch to use. + max_len (int): Used to truncate the sequence to this length. Default is 1000. + # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11 + """ + valid_go_branches = list(DeepGO2MigratedData.GO_BRANCH_MAPPING.keys()) + if go_branch not in valid_go_branches: + raise ValueError(f"go_branch must be one of {valid_go_branches}") + self._go_branch = go_branch + + self._data_dir: str = os.path.join(rf"{data_dir}", go_branch) + self._max_len: int = max_len + + self._train_df: Optional[pd.DataFrame] = None + self._test_df: Optional[pd.DataFrame] = None + self._validation_df: Optional[pd.DataFrame] = None + self._terms_df: Optional[pd.DataFrame] = None + self._classes: Optional[List[str]] = None + + def migrate(self) -> None: + """ + Executes the data migration by loading, processing, and saving the data. + """ + print("Starting the migration process...") + self._load_data() + if not all( + df is not None + for df in [ + self._train_df, + self._validation_df, + self._test_df, + self._terms_df, + ] + ): + raise Exception( + "Data splits or terms data is not available in instance variables." 
+ ) + splits_df = self._record_splits() + + data_df = self._extract_required_data_from_splits() + data_with_labels_df = self._generate_labels(data_df) + + if not all( + var is not None for var in [data_with_labels_df, splits_df, self._classes] + ): + raise Exception( + "Data splits or terms data is not available in instance variables." + ) + + self.save_migrated_data(data_with_labels_df, splits_df) + + def _load_data(self) -> None: + """ + Loads the test, train, validation, and terms data from the pickled files + in the data directory. + """ + + try: + print(f"Loading data from directory: {self._data_dir}......") + + print( + "Pre-processing the data before loading them into instance variables\n" + f"2-Steps preprocessing: \n" + f"\t 1: Truncating every sequence to {self._max_len}\n" + f"\t 2: Replacing every amino acid which is not in {ProteinDataReader.AA_LETTER}" + ) + + self._test_df = self._pre_process_data( + pd.DataFrame( + pd.read_pickle(os.path.join(self._data_dir, "test_data.pkl")) + ) + ) + self._train_df = self._pre_process_data( + pd.DataFrame( + pd.read_pickle(os.path.join(self._data_dir, "train_data.pkl")) + ) + ) + self._validation_df = self._pre_process_data( + pd.DataFrame( + pd.read_pickle(os.path.join(self._data_dir, "valid_data.pkl")) + ) + ) + + self._terms_df = pd.DataFrame( + pd.read_pickle(os.path.join(self._data_dir, "terms.pkl")) + ) + + except FileNotFoundError as e: + raise FileNotFoundError( + f"Data file not found in directory: {e}. " + "Please ensure all required files are available in the specified directory." + ) + + def _pre_process_data(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Pre-processes the input dataframe by truncating sequences to the maximum + length and replacing invalid amino acids with 'X'. + + Args: + df (pd.DataFrame): The dataframe to preprocess. + + Returns: + pd.DataFrame: The processed dataframe. + """ + df = self._truncate_sequences(df) + df = self._replace_invalid_amino_acids(df) + return df + + def _truncate_sequences( + self, df: pd.DataFrame, column: str = "sequences" + ) -> pd.DataFrame: + """ + Truncate sequences in a specified column of a dataframe to the maximum length. + + https://github.com/bio-ontology-research-group/deepgo2/blob/main/train_cnn.py#L206-L217 + + Args: + df (pd.DataFrame): The input dataframe containing the data to be processed. + column (str, optional): The column containing sequences to truncate. + Defaults to "sequences". + + Returns: + pd.DataFrame: The dataframe with sequences truncated to `self._max_len`. + """ + df[column] = df[column].apply(lambda x: x[: self._max_len]) + return df + + @staticmethod + def _replace_invalid_amino_acids( + df: pd.DataFrame, column: str = "sequences" + ) -> pd.DataFrame: + """ + Replaces invalid amino acids in a sequence with 'X' using regex. + + https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L26-L33 + https://github.com/ChEB-AI/python-chebai/pull/64#issuecomment-2517067073 + + Args: + df (pd.DataFrame): The dataframe containing the sequences to be processed. + column (str, optional): The column containing the sequences. Defaults to "sequences". + + Returns: + pd.DataFrame: The dataframe with invalid amino acids replaced by 'X'. 
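+
+        Example:
+            An illustrative sketch ('B' and '*' are not in
+            `ProteinDataReader.AA_LETTER`, so both are replaced):
+
+            >>> df = pd.DataFrame({"sequences": ["MKTB*AY"]})
+            >>> DeepGo2DataMigration._replace_invalid_amino_acids(df)["sequences"][0]
+            'MKTXXAY'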
+ """ + valid_amino_acids = "".join(ProteinDataReader.AA_LETTER) + # Replace any character not in the valid set with 'X' + df[column] = df[column].apply( + lambda x: re.sub(f"[^{valid_amino_acids}]", "X", x) + ) + return df + + def _record_splits(self) -> pd.DataFrame: + """ + Creates a DataFrame that stores the IDs and their corresponding data splits. + + Returns: + pd.DataFrame: A combined DataFrame containing split assignments. + """ + print("Recording data splits for train, validation, and test sets.") + split_assignment_list: List[pd.DataFrame] = [ + pd.DataFrame({"id": self._train_df["proteins"], "split": "train"}), + pd.DataFrame( + {"id": self._validation_df["proteins"], "split": "validation"} + ), + pd.DataFrame({"id": self._test_df["proteins"], "split": "test"}), + ] + + combined_split_assignment = pd.concat(split_assignment_list, ignore_index=True) + return combined_split_assignment + + def _extract_required_data_from_splits(self) -> pd.DataFrame: + """ + Extracts required columns from the combined data splits. + + Returns: + pd.DataFrame: A DataFrame containing the essential columns for processing. + """ + print("Combining the data splits with required data..... ") + required_columns = [ + "proteins", + "accessions", + "sequences", + # https://github.com/bio-ontology-research-group/deepgo2/blob/main/gendata/uni2pandas.py#L60-L69 + "prop_annotations", # Direct and Transitively associated GO ids + "esm2", + ] + + new_df = pd.concat( + [ + self._train_df[required_columns], + self._validation_df[required_columns], + self._test_df[required_columns], + ], + ignore_index=True, + ) + new_df["go_ids"] = new_df["prop_annotations"].apply( + lambda x: self.extract_go_id(x) + ) + + data_df = pd.DataFrame( + OrderedDict( + swiss_id=new_df["proteins"], + accession=new_df["accessions"], + go_ids=new_df["go_ids"], + sequence=new_df["sequences"], + esm2_embeddings=new_df["esm2"], + ) + ) + return data_df + + @staticmethod + def extract_go_id(go_list: List[str]) -> List[int]: + """ + Extracts and parses GO IDs from a list of GO annotations. + + Args: + go_list (List[str]): List of GO annotation strings. + + Returns: + List[str]: List of parsed GO IDs. + """ + return [DeepGO2MigratedData._parse_go_id(go_id_str) for go_id_str in go_list] + + def _generate_labels(self, data_df: pd.DataFrame) -> pd.DataFrame: + """ + Generates label columns for each GO term in the dataset. + + Args: + data_df (pd.DataFrame): DataFrame containing data with GO IDs. + + Returns: + pd.DataFrame: DataFrame with new label columns. + """ + print("Generating labels based on terms.pkl file.......") + parsed_go_ids: pd.Series = self._terms_df["gos"].apply( + DeepGO2MigratedData._parse_go_id + ) + all_go_ids_list = parsed_go_ids.values.tolist() + self._classes = all_go_ids_list + new_label_columns = pd.DataFrame( + False, index=data_df.index, columns=all_go_ids_list + ) + data_df = pd.concat([data_df, new_label_columns], axis=1) + + for index, row in data_df.iterrows(): + for go_id in row["go_ids"]: + if go_id in data_df.columns: + data_df.at[index, go_id] = True + + data_df = data_df[data_df.iloc[:, self._LABELS_START_IDX :].any(axis=1)] + return data_df + + def save_migrated_data( + self, data_df: pd.DataFrame, splits_df: pd.DataFrame + ) -> None: + """ + Saves the processed data and split information. + + Args: + data_df (pd.DataFrame): Data with GO labels. + splits_df (pd.DataFrame): Split assignment DataFrame. 
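+
+        Side effects (files written under `processed_dir_main`):
+            - the processed data file (`processed_main_file_names_dict["data"]`)
+            - splits_deep_go2.csv
+            - classes_deep_go2.txt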
+ """ + print("Saving transformed data......") + deepgo_migr_inst: DeepGO2MigratedData = DeepGO2MigratedData( + go_branch=DeepGO2MigratedData.GO_BRANCH_MAPPING[self._go_branch], + max_sequence_length=self._max_len, + ) + + # Save data file + deepgo_migr_inst.save_processed( + data_df, deepgo_migr_inst.processed_main_file_names_dict["data"] + ) + print( + f"{deepgo_migr_inst.processed_main_file_names_dict['data']} saved to {deepgo_migr_inst.processed_dir_main}" + ) + + # Save split file + splits_df.to_csv( + os.path.join(deepgo_migr_inst.processed_dir_main, "splits_deep_go2.csv"), + index=False, + ) + print(f"splits_deep_go2.csv saved to {deepgo_migr_inst.processed_dir_main}") + + # Save classes.txt file + classes = sorted(self._classes) + with open( + os.path.join(deepgo_migr_inst.processed_dir_main, "classes_deep_go2.txt"), + "wt", + ) as fout: + fout.writelines(str(node) + "\n" for node in classes) + print(f"classes_deep_go2.txt saved to {deepgo_migr_inst.processed_dir_main}") + + print("Migration completed!") + + +class Main: + """ + Main class to handle the migration process for DeepGoDataMigration. + + Methods: + migrate(data_dir: str, go_branch: Literal["cc", "mf", "bp"]): + Initiates the migration process for the specified data directory and GO branch. + """ + + @staticmethod + def migrate( + data_dir: str, go_branch: Literal["cc", "mf", "bp"], max_len: int = 1000 + ) -> None: + """ + Initiates the migration process by creating a DeepGoDataMigration instance + and invoking its migrate method. + + Args: + data_dir (str): Directory containing the data files. + go_branch (Literal["cc", "mf", "bp"]): GO branch to use + ("cc" for cellular_component, + "mf" for molecular_function, + or "bp" for biological_process). + max_len (int): Used to truncate the sequence to this length. Default is 1000. + # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L11 + """ + DeepGo2DataMigration(data_dir, go_branch, max_len).migrate() + + +if __name__ == "__main__": + # Example: python script_name.py migrate --data_dir="data/deep_go_se_training_data" --go_branch="bp" + # --data_dir specifies the directory containing the data files. + # --go_branch specifies the GO branch (cc, mf, or bp) you want to use for the migration. + CLI( + Main, + description="DeepGoDataMigration CLI tool to handle migration of GO data for specified branches (cc, mf, bp).", + as_positional=False, + ) diff --git a/chebai_proteins/preprocessing/reader.py b/chebai_proteins/preprocessing/reader.py new file mode 100644 index 0000000..21bdaea --- /dev/null +++ b/chebai_proteins/preprocessing/reader.py @@ -0,0 +1,395 @@ +import os +from pathlib import Path +from typing import List, Optional, Tuple +from urllib.error import HTTPError + +import torch +from chebai.preprocessing.collate import RaggedCollator +from chebai.preprocessing.reader import DataReader, TokenIndexerReader +from esm import Alphabet +from esm.model.esm2 import ESM2 +from esm.pretrained import _has_regression_weights # noqa +from esm.pretrained import load_model_and_alphabet_core + + +class ProteinDataReader(TokenIndexerReader): + """ + Data reader for protein sequences using amino acid tokens. This class processes raw protein sequences into a format + suitable for model input by tokenizing them and assigning unique indices to each token. 
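+
+    Example:
+        An illustrative sketch (actual indices depend on the token file):
+        with n_gram=None, the sequence "MKTAY" is read as the tokens
+        ["M", "K", "T", "A", "Y"]; with n_gram=3, it is read as the
+        overlapping tokens ["MKT", "KTA", "TAY"].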
+
+    Note:
+        For amino acid sequence notation, see: https://en.wikipedia.org/wiki/Protein_primary_structure
+
+    Args:
+        n_gram (Optional[int]): If provided, sequences are tokenized into overlapping n-grams of this
+            size instead of single amino-acid letters. Must be >= 2.
+        collator_kwargs (Optional[Dict[str, Any]]): Optional dict of keyword arguments for configuring the collator.
+        token_path (Optional[str]): Path to the token file. If not provided, it will be created automatically.
+        kwargs: Additional keyword arguments.
+    """
+
+    COLLATOR = RaggedCollator
+
+    # fmt: off
+    # The 20 natural amino acids in one-letter notation, plus "X"
+    AA_LETTER = {
+        "A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
+        "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V",
+        # https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L3-L5
+        "X",  # Considered valid in the latest (2024) paper; see reference 3 in go_uniprot.py
+    }
+    # fmt: on
+
+    def name(self) -> str:
+        """
+        Returns the name of the data reader. This method identifies the specific type of data reader.
+
+        Returns:
+            str: The name of the data reader: "protein_token", or "protein_token_{n}_gram" if n-grams are used.
+        """
+        if self.n_gram is not None:
+            return f"protein_token_{self.n_gram}_gram"
+
+        return "protein_token"
+
+    def __init__(self, *args, n_gram: Optional[int] = None, **kwargs):
+        """
+        Initializes the ProteinDataReader, loading existing tokens from the specified token file.
+
+        Args:
+            *args: Additional positional arguments passed to the base class.
+            n_gram (Optional[int]): Size of the overlapping n-grams to tokenize into; if None,
+                sequences are tokenized one amino acid at a time.
+            **kwargs: Additional keyword arguments passed to the base class.
+        """
+        if n_gram is not None:
+            assert (
+                int(n_gram) >= 2
+            ), "n_gram must be greater than or equal to 2 if provided."
+            self.n_gram = int(n_gram)
+        else:
+            self.n_gram = None
+
+        super().__init__(*args, **kwargs)
+
+    def _get_token_index(self, token: str) -> int:
+        """
+        Returns a unique index for each token (amino acid). If the token is not already in the cache, it is added.
+
+        Args:
+            token (str): The amino acid token to retrieve or add.
+
+        Returns:
+            int: The index of the token, offset by the predefined EMBEDDING_OFFSET.
+        """
+        error_str = (
+            f"Please ensure that the input only contains valid amino acids. "
+            f"Valid letters (20 natural amino acids plus 'X'): {self.AA_LETTER}. "
+            f"Refer to the amino acid sequence details here: "
+            f"https://en.wikipedia.org/wiki/Protein_primary_structure"
+        )
+
+        if self.n_gram is None:
+            # Single-letter amino acid token check
+            if str(token) not in self.AA_LETTER:
+                raise KeyError(f"Invalid token '{token}' encountered. " + error_str)
+        else:
+            # n-gram token validation: ensure that each component of the n-gram is valid
+            for aa in token:
+                if aa not in self.AA_LETTER:
+                    raise KeyError(
+                        f"Invalid token '{token}' encountered as part of a {self.n_gram}-gram. "
+                        + error_str
+                    )
+
+        return super()._get_token_index(token)
+
+    def _read_data(self, raw_data: str) -> List[int]:
+        """
+        Reads and tokenizes raw protein sequence data into a list of token indices.
+
+        Args:
+            raw_data (str): The raw protein sequence to be tokenized (e.g., "MKTFF...").
+
+        Returns:
+            List[int]: A list of integers representing the indices of the amino acid tokens.
+        """
+        if self.n_gram is not None:
+            # Tokenize the sequence into overlapping n-grams
+            tokens = [
+                raw_data[i : i + self.n_gram]
+                for i in range(len(raw_data) - self.n_gram + 1)
+            ]
+            return [self._get_token_index(gram) for gram in tokens]
+
+        # If n_gram is None, tokenize the sequence at the amino acid level (single-letter representation)
+        return [self._get_token_index(aa) for aa in raw_data]
+
+
+class ESM2EmbeddingReader(DataReader):
+    """
+    A data reader to process protein sequences using the ESM2 model for embeddings.
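+
+    Each input sequence is mapped to a single fixed-size vector: the token
+    representations from `repr_layer` are mean-pooled over the (truncated)
+    sequence, e.g. a 2560-dimensional vector for the default
+    esm2_t36_3B_UR50D model.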
+
+    References:
+        https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/extract_esm.py
+
+    Note:
+        For layer availability by model, please check the link below:
+        https://github.com/facebookresearch/esm?tab=readme-ov-file#pre-trained-models-
+
+        To test this reader, try lighter models:
+        esm2_t6_8M_UR50D: 6 layers (valid layers: 1–6), (~28 MB) - A tiny 8M parameter model.
+        esm2_t12_35M_UR50D: 12 layers (valid layers: 1–12), (~128 MB) - A slightly larger, 35M parameter model.
+        These smaller models are good for testing and debugging purposes.
+
+    """
+
+    COLLATOR = RaggedCollator
+
+    # https://github.com/facebookresearch/esm/blob/main/esm/pretrained.py#L53
+    _MODELS_URL = "https://dl.fbaipublicfiles.com/fair-esm/models/{}.pt"
+    _REGRESSION_URL = (
+        "https://dl.fbaipublicfiles.com/fair-esm/regression/{}-contact-regression.pt"
+    )
+
+    def __init__(
+        self,
+        # --------- Default Parameters as per DeepGO2 ------------
+        save_model_dir: str = os.path.join("data", "esm2_reader"),
+        model_name: str = "esm2_t36_3B_UR50D",
+        device: Optional[torch.device] = None,
+        truncation_length: int = 1022,
+        toks_per_batch: int = 4096,
+        return_contacts: bool = False,
+        repr_layer: int = 36,
+        *args,
+        **kwargs,
+    ):
+        """
+        Initialize the ESM2EmbeddingReader class.
+
+        Args:
+            save_model_dir (str): Directory to save/load the pretrained ESM model.
+            model_name (str): Name of the pretrained model. Defaults to "esm2_t36_3B_UR50D".
+            device (torch.device or str, optional): Device for computation (e.g., 'cpu', 'cuda').
+            truncation_length (int): Maximum sequence length for truncation. Defaults to 1022.
+            toks_per_batch (int): Tokens per batch for data processing. Defaults to 4096.
+            return_contacts (bool): Whether to return contact maps. Defaults to False.
+            repr_layer (int): Layer number to extract representations from. Defaults to 36.
+        """
+        self.save_model_dir = save_model_dir
+        # Ensure the directory for cached model weights exists
+        os.makedirs(self.save_model_dir, exist_ok=True)
+        self.model_name = model_name
+        self.device = device
+        self.truncation_length = truncation_length
+        self.toks_per_batch = toks_per_batch
+        self.return_contacts = return_contacts
+        self.repr_layer = repr_layer
+
+        self._model: Optional[ESM2] = None
+        self._alphabet: Optional[Alphabet] = None
+
+        self._model, self._alphabet = self.load_model_and_alphabet()
+        self._model.eval()
+
+        if self.device:
+            self._model = self._model.to(device)
+
+        super().__init__(*args, **kwargs)
+
+    def load_model_and_alphabet(self) -> Tuple[ESM2, Alphabet]:
+        """
+        Load the ESM2 model and its alphabet.
+
+        References:
+            https://github.com/facebookresearch/esm/blob/main/esm/pretrained.py#L24-L28
+
+        Returns:
+            Tuple[ESM2, Alphabet]: Loaded model and alphabet.
+        """
+        model_location = os.path.join(self.save_model_dir, f"{self.model_name}.pt")
+        if os.path.exists(model_location):
+            return self.load_model_and_alphabet_local(model_location)
+        else:
+            return self.load_model_and_alphabet_hub()
+
+    @staticmethod
+    def load_model_and_alphabet_local(model_location):
+        """Load from a local path. The regression weights need to be co-located."""
+        model_location = Path(model_location)
+        model_data = torch.load(
+            str(model_location), map_location="cpu", weights_only=False
+        )
+        model_name = model_location.stem
+        if _has_regression_weights(model_name):
+            regression_location = (
+                str(model_location.with_suffix("")) + "-contact-regression.pt"
+            )
+            regression_data = torch.load(
+                regression_location, map_location="cpu", weights_only=False
+            )
+        else:
+            regression_data = None
+        return load_model_and_alphabet_core(model_name, model_data, regression_data)
+
+    def load_model_and_alphabet_hub(self) -> Tuple[ESM2, Alphabet]:
+        """
+        Load the model and alphabet from the hub URL.
+
+        References:
+            https://github.com/facebookresearch/esm/blob/main/esm/pretrained.py#L62-L64
+
+        Returns:
+            Tuple[ESM2, Alphabet]: Loaded model and alphabet.
+        """
+        model_url = self._MODELS_URL.format(self.model_name)
+        model_data = self.load_hub_workaround(model_url)
+        regression_data = None
+        if _has_regression_weights(self.model_name):
+            regression_url = self._REGRESSION_URL.format(self.model_name)
+            regression_data = self.load_hub_workaround(regression_url)
+        return load_model_and_alphabet_core(
+            self.model_name, model_data, regression_data
+        )
+
+    def load_hub_workaround(self, url):
+        """
+        Workaround to load models from the PyTorch Hub.
+
+        References:
+            https://github.com/facebookresearch/esm/blob/main/esm/pretrained.py#L31-L43
+
+        Returns:
+            The loaded model data (a state dict, as returned by `torch.load`).
+        """
+        try:
+            data = torch.hub.load_state_dict_from_url(
+                url,
+                self.save_model_dir,
+                progress=True,
+                map_location=self.device,
+                weights_only=False,
+            )
+
+        except RuntimeError:
+            # Handle PyTorch version issues
+            fn = Path(url).name
+            data = torch.load(
+                f"{torch.hub.get_dir()}/checkpoints/{fn}",
+                map_location="cpu",
+                weights_only=False,
+            )
+        except HTTPError as e:
+            raise Exception(
+                f"Could not load {url}. Did you specify the correct model name? \n Error: {e}"
+            )
+        return data
+
+    @classmethod
+    def name(cls) -> str:
+        """
+        Returns the name of the data reader. This method identifies the specific type of data reader.
+
+        Returns:
+            str: The name of the data reader, which is "esm2_embedding".
+        """
+        return "esm2_embedding"
+
+    @property
+    def token_path(self) -> None:
+        """
+        Not used, as no token file is created for this reader.
+
+        Returns:
+            None: This reader has no token file path.
+        """
+        return
+
+    def _read_data(self, raw_data: str) -> List[float]:
+        """
+        Reads protein sequence data and generates embeddings.
+
+        Args:
+            raw_data (str): The protein sequence.
+
+        Returns:
+            List[float]: The mean-pooled ESM2 embedding generated for the sequence.
+        """
+        alp_tokens_idx = self._sequence_to_alphabet_tokens_idx(raw_data)
+        return self._alphabet_tokens_to_esm_embedding(alp_tokens_idx).tolist()
+
+    def _sequence_to_alphabet_tokens_idx(self, sequence: str) -> torch.Tensor:
+        """
+        Converts a protein sequence into ESM alphabet token indices.
+
+        Args:
+            sequence (str): Protein sequence.
+
+        References:
+            https://github.com/facebookresearch/esm/blob/2b369911bb5b4b0dda914521b9475cad1656b2ac/esm/data.py#L249-L250
+            https://github.com/facebookresearch/esm/blob/2b369911bb5b4b0dda914521b9475cad1656b2ac/esm/data.py#L262-L297
+
+        Returns:
+            torch.Tensor: Tokenized sequence with special tokens (BOS/EOS) included.
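+
+        Example:
+            An illustrative sketch (actual indices depend on the ESM alphabet):
+            with `prepend_bos` and `append_eos` set, the sequence "MKT" becomes
+            a tensor of shape (1, 5): [[cls_idx, idx(M), idx(K), idx(T), eos_idx]].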
+ """ + seq_encoded = self._alphabet.encode(sequence) + tokens = [] + + # Add BOS token if configured + if self._alphabet.prepend_bos: + tokens.append(self._alphabet.cls_idx) + + # Add the main sequence + tokens.extend(seq_encoded) + + # Add EOS token if configured + if self._alphabet.append_eos: + tokens.append(self._alphabet.eos_idx) + + # Convert to PyTorch tensor and return + return torch.tensor([tokens], dtype=torch.int64) + + def _alphabet_tokens_to_esm_embedding(self, tokens: torch.Tensor) -> torch.Tensor: + """ + Converts alphabet tokens into ESM embeddings. + + Args: + tokens (torch.Tensor): Tokenized protein sequences. + + References: + https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/extract_esm.py#L82-L107 + + Returns: + torch.Tensor: Protein embedding from the specified representation layer. + """ + if self.device: + tokens = tokens.to(self.device, non_blocking=True) + + with torch.no_grad(): + out = self._model( + tokens, + repr_layers=[ + self.repr_layer, + ], + return_contacts=self.return_contacts, + ) + + # Extract representations and compute the mean embedding for each layer + representations = { + layer: t.to(self.device) for layer, t in out["representations"].items() + } + truncate_len = min(self.truncation_length, tokens.size(1)) + + result = { + "mean_representations": { + layer: t[0, 1 : truncate_len + 1].mean(0).clone() + for layer, t in representations.items() + } + } + return result["mean_representations"][self.repr_layer] + + def on_finish(self) -> None: + """ + Not used here as no token file exists for this reader. + + Returns: + None + """ + pass diff --git a/configs/data/deepGO/deepgo2_esm2.yml b/configs/data/deepGO/deepgo2_esm2.yml new file mode 100644 index 0000000..4c9d200 --- /dev/null +++ b/configs/data/deepGO/deepgo2_esm2.yml @@ -0,0 +1,5 @@ +class_path: chebai_proteins.preprocessing.datasets.deepGO.go_uniprot.DeepGO2MigratedData +init_args: + go_branch: "MF" + max_sequence_length: 1000 + use_esm2_embeddings: True diff --git a/configs/data/deepGO/deepgo_1_migrated_data.yml b/configs/data/deepGO/deepgo_1_migrated_data.yml new file mode 100644 index 0000000..5d7d237 --- /dev/null +++ b/configs/data/deepGO/deepgo_1_migrated_data.yml @@ -0,0 +1,4 @@ +class_path: chebai_proteins.preprocessing.datasets.deepGO.go_uniprot.DeepGO1MigratedData +init_args: + go_branch: "MF" + max_sequence_length: 1002 diff --git a/configs/data/deepGO/deepgo_2_migrated_data.yml b/configs/data/deepGO/deepgo_2_migrated_data.yml new file mode 100644 index 0000000..4c9d200 --- /dev/null +++ b/configs/data/deepGO/deepgo_2_migrated_data.yml @@ -0,0 +1,5 @@ +class_path: chebai_proteins.preprocessing.datasets.deepGO.go_uniprot.DeepGO2MigratedData +init_args: + go_branch: "MF" + max_sequence_length: 1000 + use_esm2_embeddings: True diff --git a/configs/data/deepGO/go250.yml b/configs/data/deepGO/go250.yml new file mode 100644 index 0000000..2d694b4 --- /dev/null +++ b/configs/data/deepGO/go250.yml @@ -0,0 +1,3 @@ +class_path: chebai_proteins.preprocessing.datasets.go_uniprot.deepGO.GOUniProtOver250 +init_args: + go_branch: "BP" diff --git a/configs/data/deepGO/go50.yml b/configs/data/deepGO/go50.yml new file mode 100644 index 0000000..495a923 --- /dev/null +++ b/configs/data/deepGO/go50.yml @@ -0,0 +1 @@ +class_path: chebai_proteins.preprocessing.datasets.deepGO.go_uniprot.GOUniProtOver50 diff --git a/configs/data/scope/scope2000.yml b/configs/data/scope/scope2000.yml new file mode 100644 index 0000000..ca1789b --- /dev/null +++ b/configs/data/scope/scope2000.yml @@ 
-0,0 +1,3 @@ +class_path: chebai_proteins.preprocessing.datasets.scope.scope.SCOPeOver2000 +init_args: + scope_version: "2.08" diff --git a/configs/data/scope/scope50.yml b/configs/data/scope/scope50.yml new file mode 100644 index 0000000..477d71b --- /dev/null +++ b/configs/data/scope/scope50.yml @@ -0,0 +1,3 @@ +class_path: chebai_proteins.preprocessing.datasets.scope.scope.SCOPeOver50 +init_args: + scope_version: "2.08" diff --git a/configs/data/scope/scope50_esm.yml b/configs/data/scope/scope50_esm.yml new file mode 100644 index 0000000..8575b98 --- /dev/null +++ b/configs/data/scope/scope50_esm.yml @@ -0,0 +1,6 @@ +class_path: chebai_proteins.preprocessing.datasets.scope.scope.SCOPeOver50ESM +init_args: + scope_version: "2.08" + reader_kwargs: { + truncation_length: 1000 + } diff --git a/configs/loss/BCELoss.yml b/configs/loss/BCELoss.yml new file mode 100644 index 0000000..6ee636d --- /dev/null +++ b/configs/loss/BCELoss.yml @@ -0,0 +1 @@ +class_path: torch.nn.BCELoss diff --git a/configs/metrics/MultilabelAUROC.yml b/configs/metrics/MultilabelAUROC.yml new file mode 100644 index 0000000..8ee2ae8 --- /dev/null +++ b/configs/metrics/MultilabelAUROC.yml @@ -0,0 +1,5 @@ +class_path: torchmetrics.MetricCollection +init_args: + metrics: + balanced-accuracy: + class_path: torchmetrics.classification.MultilabelAUROC diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..034dc5b --- /dev/null +++ b/setup.cfg @@ -0,0 +1,7 @@ +[tool:isort] +profile = black +from_first = True +line_length = 79 +known_first_party = chem +default_section = THIRDPARTY +skip = .tox,.eggs,ci/bootstrap.py,ci/templates,build,dist diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..27284a0 --- /dev/null +++ b/setup.py @@ -0,0 +1,24 @@ +from setuptools import find_packages, setup + +packages = find_packages() +print(packages) +setup( + name="chebai-proteins", + version="0.0.2.dev0", + packages=packages, + package_data={"": ["**/*.txt", "**/*.json"]}, + include_package_data=True, + url="", + license="", + author="MGlauer", + author_email="martin.glauer@ovgu.de", + description="", + zip_safe=False, + python_requires=">=3.9, <3.13", + install_requires=[ + "chebai @ git+https://github.com/ChEB-AI/python-chebai.git", + "biopython", + "fair-esm", + ], + extras_require={"dev": ["black", "isort", "pre-commit"]}, +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 0000000..6640a69 --- /dev/null +++ b/tests/unit/__init__.py @@ -0,0 +1,4 @@ +""" +This directory contains unit tests, which focus on individual functions and methods, ensuring they work as +expected in isolation. 
+""" diff --git a/tests/unit/dataset_classes/__init__.py b/tests/unit/dataset_classes/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py new file mode 100644 index 0000000..8cee8f8 --- /dev/null +++ b/tests/unit/dataset_classes/testGOUniProDataExtractor.py @@ -0,0 +1,231 @@ +import unittest +from collections import OrderedDict +from unittest.mock import PropertyMock, mock_open, patch + +import fastobo +import networkx as nx +import pandas as pd + +from chebai_proteins.preprocessing.datasets.deepGO.go_uniprot import ( + _GOUniProtDataExtractor, +) +from chebai_proteins.preprocessing.reader import ProteinDataReader +from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData + + +class TestGOUniProtDataExtractor(unittest.TestCase): + """ + Unit tests for the _GOUniProtDataExtractor class. + """ + + @classmethod + @patch.multiple(_GOUniProtDataExtractor, __abstractmethods__=frozenset()) + @patch.object(_GOUniProtDataExtractor, "base_dir", new_callable=PropertyMock) + @patch.object(_GOUniProtDataExtractor, "_name", new_callable=PropertyMock) + @patch("os.makedirs", return_value=None) + def setUpClass( + cls, + mock_makedirs, + mock_name_property: PropertyMock, + mock_base_dir_property: PropertyMock, + ) -> None: + """ + Class setup for mocking abstract properties of _GOUniProtDataExtractor. + """ + mock_base_dir_property.return_value = "MockedBaseDirPropGOUniProtDataExtractor" + mock_name_property.return_value = "MockedNamePropGOUniProtDataExtractor" + + _GOUniProtDataExtractor.READER = ProteinDataReader + + cls.extractor = _GOUniProtDataExtractor() + + def test_term_callback(self) -> None: + """ + Test the term_callback method for correct parsing and filtering of GO terms. + """ + self.extractor.go_branch = "all" + term_mapping = {} + for term in fastobo.loads(GOUniProtMockData.get_GO_raw_data()): + if isinstance(term, fastobo.typedef.TypedefFrame): + continue + term_mapping[self.extractor._parse_go_id(term.id)] = term + + # Test individual term callback + term_dict = self.extractor.term_callback(term_mapping[4]) + expected_dict = {"go_id": 4, "parents": [3, 2], "name": "GO_4"} + self.assertEqual( + term_dict, + expected_dict, + "The term_callback did not return the expected dictionary.", + ) + + # Test filtering valid terms + valid_terms_docs = set() + for term_id, term_doc in term_mapping.items(): + if self.extractor.term_callback(term_doc): + valid_terms_docs.add(term_id) + + self.assertEqual( + valid_terms_docs, + set(GOUniProtMockData.get_nodes()), + "The valid terms do not match expected nodes.", + ) + + # Test that obsolete terms are filtered out + self.assertFalse( + any( + self.extractor.term_callback(term_mapping[obs_id]) + for obs_id in GOUniProtMockData.get_obsolete_nodes_ids() + ), + "Obsolete terms should not be present.", + ) + + # Test filtering by GO branch (e.g., BP) + self.extractor.go_branch = "BP" + BP_terms = { + term_id + for term_id, term in term_mapping.items() + if self.extractor.term_callback(term) + } + self.assertEqual( + BP_terms, {2, 4}, "The BP terms do not match the expected set." + ) + + @patch( + "fastobo.load", return_value=fastobo.loads(GOUniProtMockData.get_GO_raw_data()) + ) + def test_extract_class_hierarchy(self, mock_load) -> None: + """ + Test the extraction of the class hierarchy from the ontology. 
+ """ + graph = self.extractor._extract_class_hierarchy("fake_path") + + # Validate the graph structure + self.assertIsInstance( + graph, nx.DiGraph, "The result should be a directed graph." + ) + + # Check nodes + actual_nodes = set(graph.nodes) + self.assertEqual( + set(GOUniProtMockData.get_nodes()), + actual_nodes, + "The graph nodes do not match the expected nodes.", + ) + + # Check edges + actual_edges = set(graph.edges) + self.assertEqual( + GOUniProtMockData.get_edges_of_transitive_closure_graph(), + actual_edges, + "The graph edges do not match the expected edges.", + ) + + # Check number of nodes and edges + self.assertEqual( + GOUniProtMockData.get_number_of_nodes(), + len(actual_nodes), + "The number of nodes should match the actual number of nodes in the graph.", + ) + + self.assertEqual( + GOUniProtMockData.get_number_of_transitive_edges(), + len(actual_edges), + "The number of transitive edges should match the actual number of transitive edges in the graph.", + ) + + @patch( + "builtins.open", + new_callable=mock_open, + read_data=GOUniProtMockData.get_UniProt_raw_data(), + ) + def test_get_swiss_to_go_mapping(self, mock_open) -> None: + """ + Test the extraction of SwissProt to GO term mapping. + """ + mapping_df = self.extractor._get_swiss_to_go_mapping() + expected_df = pd.DataFrame( + OrderedDict( + swiss_id=["Swiss_Prot_1", "Swiss_Prot_2"], + accession=["Q6GZX4", "DCGZX4"], + go_ids=[[2, 3, 5], [2, 5]], + sequence=list(GOUniProtMockData.protein_sequences().values()), + ) + ) + + pd.testing.assert_frame_equal( + mapping_df, + expected_df, + obj="The SwissProt to GO mapping DataFrame does not match the expected DataFrame.", + ) + + @patch( + "fastobo.load", return_value=fastobo.loads(GOUniProtMockData.get_GO_raw_data()) + ) + @patch( + "builtins.open", + new_callable=mock_open, + read_data=GOUniProtMockData.get_UniProt_raw_data(), + ) + @patch.object( + _GOUniProtDataExtractor, + "select_classes", + return_value=GOUniProtMockData.get_nodes(), + ) + def test_graph_to_raw_dataset( + self, mock_select_classes, mock_open, mock_load + ) -> None: + """ + Test the conversion of the class hierarchy graph to a raw dataset. + """ + graph = self.extractor._extract_class_hierarchy("fake_path") + actual_df = self.extractor._graph_to_raw_dataset(graph) + expected_df = GOUniProtMockData.get_data_in_dataframe() + + pd.testing.assert_frame_equal( + actual_df, + expected_df, + obj="The raw dataset DataFrame does not match the expected DataFrame.", + ) + + @patch("builtins.open", new_callable=mock_open, read_data=b"Mocktestdata") + @patch("pandas.read_pickle") + def test_load_dict( + self, mock_read_pickle: PropertyMock, mock_open: mock_open + ) -> None: + """ + Test the loading of the dictionary from a DataFrame. 
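+
+        Each yielded entry is expected to be a dict with the keys "features"
+        (the sequence), "labels" (one value per class column), and "ident"
+        (the Swiss-Prot ID).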
+ """ + mock_df = GOUniProtMockData.get_data_in_dataframe() + mock_read_pickle.return_value = mock_df + + generator = self.extractor._load_dict("data/tests") + result = list(generator) + + # Convert NumPy arrays to lists for comparison + for item in result: + item["labels"] = list(item["labels"]) + + # Expected output for comparison + expected_result = [ + { + "features": mock_df["sequence"][0], + "labels": mock_df.iloc[0, 4:].to_list(), + "ident": mock_df["swiss_id"][0], + }, + { + "features": mock_df["sequence"][1], + "labels": mock_df.iloc[1, 4:].to_list(), + "ident": mock_df["swiss_id"][1], + }, + ] + + self.assertEqual( + result, + expected_result, + "The loaded dictionary does not match the expected structure.", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/dataset_classes/testGoUniProtOverX.py b/tests/unit/dataset_classes/testGoUniProtOverX.py new file mode 100644 index 0000000..ccd2d66 --- /dev/null +++ b/tests/unit/dataset_classes/testGoUniProtOverX.py @@ -0,0 +1,140 @@ +import unittest +from typing import List +from unittest.mock import mock_open, patch + +import networkx as nx +import pandas as pd + +from chebai_proteins.preprocessing.datasets.deepGO.go_uniprot import _GOUniProtOverX +from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData + + +class TestGOUniProtOverX(unittest.TestCase): + @classmethod + @patch.multiple(_GOUniProtOverX, __abstractmethods__=frozenset()) + @patch("os.makedirs", return_value=None) + def setUpClass(cls, mock_makedirs) -> None: + """ + Set up the class for tests by initializing the extractor, graph, and input DataFrame. + """ + cls.extractor = _GOUniProtOverX() + cls.test_graph: nx.DiGraph = GOUniProtMockData.get_transitively_closed_graph() + cls.input_df: pd.DataFrame = GOUniProtMockData.get_data_in_dataframe().iloc[ + :, :4 + ] + + @patch("builtins.open", new_callable=mock_open) + def test_select_classes(self, mock_open_file: mock_open) -> None: + """ + Test the `select_classes` method to ensure it selects classes based on the threshold. + + Args: + mock_open_file (mock_open): Mocked open function to intercept file operations. + """ + # Set threshold for testing + self.extractor.THRESHOLD = 2 + selected_classes: List[int] = self.extractor.select_classes( + self.test_graph, data_df=self.input_df + ) + + # Expected result: GO terms 1, 2, and 5 should be selected based on the threshold + expected_selected_classes: List[int] = sorted([1, 2, 5]) + + # Check if the selected classes are as expected + self.assertEqual( + selected_classes, + expected_selected_classes, + msg="The selected classes do not match the expected output for threshold 2.", + ) + + # Expected data as string + expected_lines: str = "\n".join(map(str, expected_selected_classes)) + "\n" + + # Extract the generator passed to writelines + written_generator = mock_open_file().writelines.call_args[0][0] + written_lines: str = "".join(written_generator) + + # Ensure the data matches + self.assertEqual( + written_lines, + expected_lines, + msg="The written lines do not match the expected lines for the given threshold of 2.", + ) + + @patch("builtins.open", new_callable=mock_open) + def test_no_classes_meet_threshold(self, mock_open_file: mock_open) -> None: + """ + Test the `select_classes` method when no nodes meet the successor threshold. + + Args: + mock_open_file (mock_open): Mocked open function to intercept file operations. 
+ """ + self.extractor.THRESHOLD = 5 + selected_classes: List[int] = self.extractor.select_classes( + self.test_graph, data_df=self.input_df + ) + + # Expected result: No classes should meet the threshold of 5 + expected_selected_classes: List[int] = [] + + # Check if the selected classes are as expected + self.assertEqual( + selected_classes, + expected_selected_classes, + msg="The selected classes list should be empty when no nodes meet the threshold of 5.", + ) + + # Expected data as string + expected_lines: str = "" + + # Extract the generator passed to writelines + written_generator = mock_open_file().writelines.call_args[0][0] + written_lines: str = "".join(written_generator) + + # Ensure the data matches + self.assertEqual( + written_lines, + expected_lines, + msg="The written lines do not match the expected lines when no nodes meet the threshold of 5.", + ) + + @patch("builtins.open", new_callable=mock_open) + def test_all_nodes_meet_threshold(self, mock_open_file: mock_open) -> None: + """ + Test the `select_classes` method when all nodes meet the successor threshold. + + Args: + mock_open_file (mock_open): Mocked open function to intercept file operations. + """ + self.extractor.THRESHOLD = 0 + selected_classes: List[int] = self.extractor.select_classes( + self.test_graph, data_df=self.input_df + ) + + # Expected result: All nodes except those not referenced by any protein (4 and 6) should be selected + expected_classes: List[int] = sorted([1, 2, 3, 5]) + + # Check if the returned selected classes match the expected list + self.assertListEqual( + selected_classes, + expected_classes, + msg="The selected classes do not match the expected output when all nodes meet the threshold of 0.", + ) + + # Expected data as string + expected_lines: str = "\n".join(map(str, expected_classes)) + "\n" + + # Extract the generator passed to writelines + written_generator = mock_open_file().writelines.call_args[0][0] + written_lines: str = "".join(written_generator) + + # Ensure the data matches + self.assertEqual( + written_lines, + expected_lines, + msg="The written lines do not match the expected lines when all nodes meet the threshold of 0.", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/dataset_classes/testProteinPretrainingData.py b/tests/unit/dataset_classes/testProteinPretrainingData.py new file mode 100644 index 0000000..6c5044c --- /dev/null +++ b/tests/unit/dataset_classes/testProteinPretrainingData.py @@ -0,0 +1,76 @@ +import unittest +from unittest.mock import PropertyMock, mock_open, patch + +from chebai_proteins.preprocessing.datasets.deepGO.protein_pretraining import ( + _ProteinPretrainingData, +) +from chebai_proteins.preprocessing.reader import ProteinDataReader +from tests.unit.mock_data.ontology_mock_data import GOUniProtMockData + + +class TestProteinPretrainingData(unittest.TestCase): + """ + Unit tests for the _ProteinPretrainingData class. + Tests focus on data parsing and validation checks for protein pretraining. + """ + + @classmethod + @patch.multiple(_ProteinPretrainingData, __abstractmethods__=frozenset()) + @patch.object(_ProteinPretrainingData, "base_dir", new_callable=PropertyMock) + @patch.object(_ProteinPretrainingData, "_name", new_callable=PropertyMock) + @patch("os.makedirs", return_value=None) + def setUpClass( + cls, + mock_makedirs, + mock_name_property: PropertyMock, + mock_base_dir_property: PropertyMock, + ) -> None: + """ + Class setup for mocking abstract properties of _ProteinPretrainingData. 
+ + Mocks the required abstract properties and sets up the data extractor. + """ + mock_base_dir_property.return_value = "MockedBaseDirPropProteinPretrainingData" + mock_name_property.return_value = "MockedNameProp_ProteinPretrainingData" + + # Set the READER class for the pretraining data + _ProteinPretrainingData.READER = ProteinDataReader + + # Initialize the extractor instance + cls.extractor = _ProteinPretrainingData() + + @patch( + "builtins.open", + new_callable=mock_open, + read_data=GOUniProtMockData.get_UniProt_raw_data(), + ) + def test_parse_protein_data_for_pretraining( + self, mock_open_file: mock_open + ) -> None: + """ + Tests the _parse_protein_data_for_pretraining method. + + Verifies that: + - The parsed DataFrame contains the expected protein IDs. + - The protein sequences are not empty. + """ + # Parse the pretraining data + pretrain_df = self.extractor._parse_protein_data_for_pretraining() + list_of_pretrain_swiss_ids = GOUniProtMockData.proteins_for_pretraining() + + # Assert that all expected Swiss-Prot IDs are present in the DataFrame + self.assertEqual( + set(pretrain_df["swiss_id"]), + set(list_of_pretrain_swiss_ids), + msg="The parsed DataFrame does not contain the expected Swiss-Prot IDs for pretraining.", + ) + + # Assert that all sequences are not empty + self.assertTrue( + pretrain_df["sequence"].str.len().gt(0).all(), + msg="Some protein sequences in the pretraining DataFrame are empty.", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/mock_data/__init__.py b/tests/unit/mock_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/mock_data/ontology_mock_data.py b/tests/unit/mock_data/ontology_mock_data.py new file mode 100644 index 0000000..552d291 --- /dev/null +++ b/tests/unit/mock_data/ontology_mock_data.py @@ -0,0 +1,813 @@ +from abc import ABC, abstractmethod +from collections import OrderedDict +from typing import Dict, List, Set, Tuple + +import networkx as nx +import pandas as pd + + +class MockOntologyGraphData(ABC): + """ + Abstract base class for mocking ontology graph data. + + This class provides a set of static methods that must be implemented by subclasses + to return various elements of an ontology graph such as nodes, edges, and dataframes. + """ + + @staticmethod + @abstractmethod + def get_nodes() -> List[int]: + """ + Get a list of node IDs in the ontology graph. + + Returns: + List[int]: A list of node IDs. + """ + pass + + @staticmethod + @abstractmethod + def get_number_of_nodes() -> int: + """ + Get the number of nodes in the ontology graph. + + Returns: + int: The total number of nodes. + """ + pass + + @staticmethod + @abstractmethod + def get_edges() -> Set[Tuple[int, int]]: + """ + Get the set of edges in the ontology graph. + + Returns: + Set[Tuple[int, int]]: A set of tuples where each tuple represents an edge between two nodes. + """ + pass + + @staticmethod + @abstractmethod + def get_number_of_edges() -> int: + """ + Get the number of edges in the ontology graph. + + Returns: + int: The total number of edges. + """ + pass + + @staticmethod + @abstractmethod + def get_edges_of_transitive_closure_graph() -> Set[Tuple[int, int]]: + """ + Get the set of edges in the transitive closure of the ontology graph. + + Returns: + Set[Tuple[int, int]]: A set of tuples representing the transitive closure edges. 
+ """ + pass + + @staticmethod + @abstractmethod + def get_number_of_transitive_edges() -> int: + """ + Get the number of edges in the transitive closure of the ontology graph. + + Returns: + int: The total number of transitive edges. + """ + pass + + @staticmethod + @abstractmethod + def get_obsolete_nodes_ids() -> Set[int]: + """ + Get the set of obsolete node IDs in the ontology graph. + + Returns: + Set[int]: A set of obsolete node IDs. + """ + pass + + @staticmethod + @abstractmethod + def get_transitively_closed_graph() -> nx.DiGraph: + """ + Get the transitive closure of the ontology graph. + + Returns: + nx.DiGraph: A directed graph representing the transitive closure of the ontology graph. + """ + pass + + @staticmethod + @abstractmethod + def get_data_in_dataframe() -> pd.DataFrame: + """ + Get the ontology data as a Pandas DataFrame. + + Returns: + pd.DataFrame: A DataFrame containing ontology data. + """ + pass + + +class ChebiMockOntology(MockOntologyGraphData): + """ + A mock ontology representing a simplified ChEBI (Chemical Entities of Biological Interest) structure. + This class is used for testing purposes and includes nodes and edges representing chemical compounds + and their relationships in a graph structure. + + Nodes: + - CHEBI:12345 (Compound A) + - CHEBI:54321 (Compound B) + - CHEBI:67890 (Compound C) + - CHEBI:11111 (Compound D) + - CHEBI:22222 (Compound E) + - CHEBI:99999 (Compound F) + - CHEBI:77533 (Compound G, Obsolete node) + - CHEBI:77564 (Compound H, Obsolete node) + - CHEBI:88888 (Compound I) + + Valid Edges: + - CHEBI:54321 -> CHEBI:12345 + - CHEBI:67890 -> CHEBI:12345 + - CHEBI:67890 -> CHEBI:88888 + - CHEBI:11111 -> CHEBI:54321 + - CHEBI:22222 -> CHEBI:67890 + - CHEBI:12345 -> CHEBI:99999 + + The class also includes methods to retrieve nodes, edges, and transitive closure of the graph. + + Visual Representation Graph with Valid Nodes and Edges: + + 22222 + / + 11111 67890 + \\ / \ + 54321 / 88888 + \\ / + 12345 + \ + 99999 + """ + + @staticmethod + def get_nodes() -> List[int]: + """ + Get the set of valid node IDs in the mock ontology. + + Returns: + - Set[int]: A set of integers representing the valid ChEBI node IDs. + """ + return [11111, 12345, 22222, 54321, 67890, 88888, 99999] + + @staticmethod + def get_number_of_nodes() -> int: + """ + Get the number of valid nodes in the mock ontology. + + Returns: + - int: The number of valid nodes. + """ + return len(ChebiMockOntology.get_nodes()) + + @staticmethod + def get_edges() -> Set[Tuple[int, int]]: + """ + Get the set of valid edges in the mock ontology. + + Returns: + - Set[Tuple[int, int]]: A set of tuples representing the directed edges + between ChEBI nodes. + """ + return { + (54321, 12345), + (67890, 12345), + (67890, 88888), + (11111, 54321), + (22222, 67890), + (12345, 99999), + } + + @staticmethod + def get_number_of_edges() -> int: + """ + Get the number of valid edges in the mock ontology. + + Returns: + - int: The number of valid edges. + """ + return len(ChebiMockOntology.get_edges()) + + @staticmethod + def get_edges_of_transitive_closure_graph() -> Set[Tuple[int, int]]: + """ + Get the set of edges derived from the transitive closure of the mock ontology graph. + + Returns: + - Set[Tuple[int, int]]: A set of tuples representing the directed edges + in the transitive closure of the ChEBI graph. 
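+          An edge (u, w) is present whenever the base graph has a directed
+          path from u to w, e.g. (11111, 12345) via 54321.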
+ """ + return { + (54321, 12345), + (54321, 99999), + (67890, 12345), + (67890, 99999), + (67890, 88888), + (11111, 54321), + (11111, 12345), + (11111, 99999), + (22222, 67890), + (22222, 12345), + (22222, 99999), + (22222, 88888), + (12345, 99999), + } + + @staticmethod + def get_number_of_transitive_edges() -> int: + """ + Get the number of edges in the transitive closure of the mock ontology graph. + + Returns: + - int: The number of edges in the transitive closure graph. + """ + return len(ChebiMockOntology.get_edges_of_transitive_closure_graph()) + + @staticmethod + def get_obsolete_nodes_ids() -> Set[int]: + """ + Get the set of obsolete node IDs in the mock ontology. + + Returns: + - Set[int]: A set of integers representing the obsolete ChEBI node IDs. + """ + return {77533, 77564} + + @staticmethod + def get_raw_data() -> str: + """ + Get the raw data representing the mock ontology in OBO format. + + Returns: + - str: A string containing the raw OBO data for the mock ChEBI terms. + """ + return """ + [Term] + id: CHEBI:12345 + name: Compound A + subset: 2_STAR + property_value: http://purl.obolibrary.org/obo/chebi/formula "C26H35ClN4O6S" xsd:string + property_value: http://purl.obolibrary.org/obo/chebi/charge "0" xsd:string + property_value: http://purl.obolibrary.org/obo/chebi/monoisotopicmass "566.19658" xsd:string + property_value: http://purl.obolibrary.org/obo/chebi/mass "567.099" xsd:string + property_value: http://purl.obolibrary.org/obo/chebi/inchikey "ROXPMFGZZQEKHB-IUKKYPGJSA-N" xsd:string + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1" xsd:string + property_value: http://purl.obolibrary.org/obo/chebi/inchi "InChI=1S/C26H35ClN4O6S/c1-16(2)28-26(34)30(5)14-23-17(3)13-31(18(4)15-32)25(33)21-7-6-8-22(24(21)37-23)29-38(35,36)20-11-9-19(27)10-12-20/h6-12,16-18,23,29,32H,13-15H2,1-5H3,(H,28,34)/t17-,18-,23+/m0/s1" xsd:string + xref: LINCS:LSM-20139 + is_a: CHEBI:54321 + is_a: CHEBI:67890 + + [Term] + id: CHEBI:54321 + name: Compound B + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1O" xsd:string + is_a: CHEBI:11111 + is_a: CHEBI:77564 + + [Term] + id: CHEBI:67890 + name: Compound C + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1N" xsd:string + is_a: CHEBI:22222 + + [Term] + id: CHEBI:11111 + name: Compound D + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1F" xsd:string + + [Term] + id: CHEBI:22222 + name: Compound E + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1Cl" xsd:string + + [Term] + id: CHEBI:99999 + name: Compound F + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1Br" xsd:string + is_a: CHEBI:12345 + + [Term] + id: CHEBI:77533 + name: Compound G + is_a: CHEBI:99999 + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=C1Br" xsd:string + is_obsolete: true + + [Term] + id: CHEBI:77564 + name: Compound H + property_value: http://purl.obolibrary.org/obo/chebi/smiles "CC=C1Br" xsd:string + is_obsolete: true + + [Typedef] + id: has_major_microspecies_at_pH_7_3 + name: has major microspecies at pH 7.3 + is_cyclic: true + is_transitive: false + + [Term] + id: CHEBI:88888 + name: Compound I + property_value: http://purl.obolibrary.org/obo/chebi/smiles "C1=CC=CC=C1[Mg+]" xsd:string + is_a: CHEBI:67890 + """ + + @staticmethod + def get_data_in_dataframe() -> pd.DataFrame: + data = OrderedDict( + id=[ + 12345, + 54321, + 67890, + 11111, + 22222, + 99999, + 88888, + ], + name=[ + "Compound A", + "Compound B", + 
"Compound C", + "Compound D", + "Compound E", + "Compound F", + "Compound I", + ], + SMILES=[ + "C1=CC=CC=C1", + "C1=CC=CC=C1O", + "C1=CC=CC=C1N", + "C1=CC=CC=C1F", + "C1=CC=CC=C1Cl", + "C1=CC=CC=C1Br", + "C1=CC=CC=C1[Mg+]", + ], + **{ + # -row- [12345, 54321, 67890, 11111, 22222, 99999, 88888] + 11111: [True, True, False, True, False, True, False], + 12345: [True, False, False, False, False, True, False], + 22222: [True, False, True, False, True, True, True], + 54321: [True, True, False, False, False, True, False], + 67890: [True, False, True, False, False, True, True], + 88888: [False, False, False, False, False, False, True], + 99999: [False, False, False, False, False, True, False], + }, + ) + + data_df = pd.DataFrame(data) + + # ------------- Code Approach ------- + # ancestors_of_nodes = {} + # for parent, child in ChebiMockOntology.get_edges_of_transitive_closure_graph(): + # if child not in ancestors_of_nodes: + # ancestors_of_nodes[child] = set() + # if parent not in ancestors_of_nodes: + # ancestors_of_nodes[parent] = set() + # ancestors_of_nodes[child].add(parent) + # ancestors_of_nodes[child].add(child) + # + # # For each node in the ontology, create a column to check if it's an ancestor of any other node or itself + # for node in ChebiMockOntology.get_nodes(): + # data_df[node] = data_df['id'].apply( + # lambda x: (x == node) or (node in ancestors_of_nodes[x]) + # ) + + return data_df + + @staticmethod + def get_transitively_closed_graph() -> nx.DiGraph: + """ + Create a directed graph, compute its transitive closure, and return it. + + Returns: + g (nx.DiGraph): A transitively closed directed graph. + """ + g = nx.DiGraph() + + for node in ChebiMockOntology.get_nodes(): + g.add_node(node, **{"smiles": "test_smiles_placeholder"}) + + g.add_edges_from(ChebiMockOntology.get_edges_of_transitive_closure_graph()) + + return g + + +class GOUniProtMockData(MockOntologyGraphData): + """ + A mock ontology representing a simplified version of the Gene Ontology (GO) structure with nodes and edges + representing GO terms and their relationships in a directed acyclic graph (DAG). + + Nodes: + - GO_1 + - GO_2 + - GO_3 + - GO_4 + - GO_5 + - GO_6 + + Edges (Parent-Child Relationships): + - GO_1 -> GO_2 + - GO_1 -> GO_3 + - GO_2 -> GO_4 + - GO_2 -> GO_5 + - GO_3 -> GO_4 + - GO_4 -> GO_6 + + This mock ontology structure is useful for testing methods related to GO hierarchy, graph extraction, and transitive + closure operations. + + The class also includes methods to retrieve nodes, edges, and transitive closure of the graph. + + Visual Representation Graph with Valid Nodes and Edges: + + GO_1 + / \ + GO_2 GO_3 + / \ / + GO_5 GO_4 + \ + GO_6 + + Valid Swiss Proteins with mapping to valid GO ids + Swiss_Prot_1 -> GO_2, GO_3, GO_5 + Swiss_Prot_2 -> GO_2, GO_5 + """ + + @staticmethod + def get_nodes() -> List[int]: + """ + Get a sorted list of node IDs. + + Returns: + List[int]: A sorted list of node IDs in the ontology graph. + """ + return sorted([1, 2, 3, 4, 5, 6]) + + @staticmethod + def get_number_of_nodes() -> int: + """ + Get the total number of nodes in the ontology graph. + + Returns: + int: The number of nodes. + """ + return len(GOUniProtMockData.get_nodes()) + + @staticmethod + def get_edges() -> Set[Tuple[int, int]]: + """ + Get the set of edges in the ontology graph. + + Returns: + Set[Tuple[int, int]]: A set of tuples where each tuple represents an edge between two nodes. 
+ """ + return {(1, 2), (1, 3), (2, 4), (2, 5), (3, 4), (4, 6)} + + @staticmethod + def get_number_of_edges() -> int: + """ + Get the total number of edges in the ontology graph. + + Returns: + int: The number of edges. + """ + return len(GOUniProtMockData.get_edges()) + + @staticmethod + def get_edges_of_transitive_closure_graph() -> Set[Tuple[int, int]]: + """ + Get the set of edges in the transitive closure of the ontology graph. + + Returns: + Set[Tuple[int, int]]: A set of tuples representing edges in the transitive closure graph. + """ + return { + (1, 2), + (1, 3), + (1, 4), + (1, 5), + (1, 6), + (2, 4), + (2, 5), + (2, 6), + (3, 4), + (3, 6), + (4, 6), + } + + @staticmethod + def get_number_of_transitive_edges() -> int: + """ + Get the total number of edges in the transitive closure graph. + + Returns: + int: The number of transitive edges. + """ + return len(GOUniProtMockData.get_edges_of_transitive_closure_graph()) + + @staticmethod + def get_obsolete_nodes_ids() -> Set[int]: + """ + Get the set of obsolete node IDs in the ontology graph. + + Returns: + Set[int]: A set of node IDs representing obsolete nodes. + """ + return {7, 8} + + @staticmethod + def get_GO_raw_data() -> str: + """ + Get raw data in string format for a basic Gene Ontology (GO) structure. + + This data simulates a basic GO ontology format typically used for testing purposes. + The data will include valid and obsolete GO terms with various relationships between them. + + Scenarios covered: + - Obsolete terms being the parent of valid terms. + - Valid terms being the parent of obsolete terms. + - Both direct and indirect hierarchical relationships between terms. + + The data is designed to help test the proper handling of obsolete and valid GO terms, + ensuring that the ontology parser can correctly manage both cases. + + Returns: + str: The raw GO data in string format, structured as test input. + """ + return """ + [Term] + id: GO:0000001 + name: GO_1 + namespace: molecular_function + def: "OBSOLETE. Assists in the correct assembly of ribosomes or ribosomal subunits in vivo, but is not a component of the assembled ribosome when performing its normal biological function." [GOC:jl, PMID:12150913] + comment: This term was made obsolete because it refers to a class of gene products and a biological process rather than a molecular function. + synonym: "ribosomal chaperone activity" EXACT [] + xref: MetaCyc:BETAGALACTOSID-RXN + xref: Reactome:R-HSA-189062 "lactose + H2O => D-glucose + D-galactose" + xref: Reactome:R-HSA-5658001 "Defective LCT does not hydrolyze Lac" + xref: RHEA:10076 + + [Term] + id: GO:0000002 + name: GO_2 + namespace: biological_process + is_a: GO:0000001 ! hydrolase activity, hydrolyzing O-glycosyl compounds + is_a: GO:0000008 ! hydrolase activity, hydrolyzing O-glycosyl compounds + + [Term] + id: GO:0000003 + name: GO_3 + namespace: cellular_component + is_a: GO:0000001 ! regulation of DNA recombination + + [Term] + id: GO:0000004 + name: GO_4 + namespace: biological_process + is_a: GO:0000003 ! regulation of DNA recombination + is_a: GO:0000002 ! hydrolase activity, hydrolyzing O-glycosyl compounds + + [Term] + id: GO:0000005 + name: GO_5 + namespace: molecular_function + is_a: GO:0000002 ! regulation of DNA recombination + + [Term] + id: GO:0000006 + name: GO_6 + namespace: cellular_component + is_a: GO:0000004 ! glucoside transport + + [Term] + id: GO:0000007 + name: GO_7 + namespace: biological_process + is_a: GO:0000003 ! 
glucoside transport + is_obsolete: true + + [Term] + id: GO:0000008 + name: GO_8 + namespace: molecular_function + is_obsolete: true + + [Typedef] + id: term_tracker_item + name: term tracker item + namespace: external + xref: IAO:0000233 + is_metadata_tag: true + is_class_level: true + """ + + @staticmethod + def protein_sequences() -> Dict[str, str]: + """ + Get the protein sequences for Swiss-Prot proteins. + + Returns: + Dict[str, str]: A dictionary where keys are Swiss-Prot IDs and values are their respective sequences. + """ + return { + "Swiss_Prot_1": "MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK".replace( + " ", "" + ), + "Swiss_Prot_2": "EKGLIVGHFS GIKYKGEKAQ ASEVDVNKMC CWVSKFKDAM RRYQGIQTCK".replace( + " ", "" + ), + } + + @staticmethod + def proteins_for_pretraining() -> List[str]: + """ + Returns a list of protein IDs which will be used for pretraining based on mock UniProt data. + + Proteins include those with: + - No GO classes or invalid GO classes (missing required evidence codes). + + Returns: + List[str]: A list of protein IDs that do not meet validation criteria. + """ + return [ + "Swiss_Prot_5", # No GO classes associated + "Swiss_Prot_6", # GO class with no evidence code + "Swiss_Prot_7", # GO class with invalid evidence code + ] + + @staticmethod + def get_UniProt_raw_data() -> str: + """ + Get raw data in string format for UniProt proteins. + + This mock data contains eleven Swiss-Prot proteins with different properties: + - **Swiss_Prot_1**: A valid protein with three valid GO classes and one invalid GO class. + - **Swiss_Prot_2**: Another valid protein with two valid GO classes and one invalid. + - **Swiss_Prot_3**: Contains valid GO classes but has a sequence length > 1002. + - **Swiss_Prot_4**: Has valid GO classes but contains an invalid amino acid, 'B'. + - **Swiss_Prot_5**: Has a sequence but no GO classes associated. + - **Swiss_Prot_6**: Has GO classes without any associated evidence codes. + - **Swiss_Prot_7**: Has a GO class with an invalid evidence code. + - **Swiss_Prot_8**: Has a sequence length > 1002 and has only invalid GO class. + - **Swiss_Prot_9**: Has no GO classes but contains an invalid amino acid, 'B', in its sequence. + - **Swiss_Prot_10**: Has a valid GO class but lacks a sequence. + - **Swiss_Prot_11**: Has only Invalid GO class but lacks a sequence. + + Note: + A valid GO label is the one which has one of the following evidence code specified in + go_uniprot.py->`EXPERIMENTAL_EVIDENCE_CODES`. + Invalid amino acids are specified in go_uniprot.py->`AMBIGUOUS_AMINO_ACIDS`. + + Returns: + str: The raw UniProt data in string format. + """ + protein_sq_1 = GOUniProtMockData.protein_sequences()["Swiss_Prot_1"] + protein_sq_2 = GOUniProtMockData.protein_sequences()["Swiss_Prot_2"] + raw_str = ( + # Below protein with 3 valid associated GO class and one invalid GO class + f"ID Swiss_Prot_1 Reviewed; {len(protein_sq_1)} AA. 
\n" + "AC Q6GZX4;\n" + "DR GO; GO:0000002; C:membrane; EXP:UniProtKB-KW.\n" + "DR GO; GO:0000003; C:membrane; IDA:UniProtKB-KW.\n" + "DR GO; GO:0000005; P:regulation of viral transcription; IPI:InterPro.\n" + "DR GO; GO:0000004; P:regulation of viral transcription; IEA:SGD.\n" + f"SQ SEQUENCE {len(protein_sq_1)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + f" {protein_sq_1}\n" + "//\n" + # Below protein with 2 valid associated GO class and one invalid GO class + f"ID Swiss_Prot_2 Reviewed; {len(protein_sq_2)} AA.\n" + "AC DCGZX4;\n" + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" + "DR GO; GO:0000002; P:regulation of viral transcription; IMP:InterPro.\n" + "DR GO; GO:0000005; P:regulation of viral transcription; IGI:InterPro.\n" + "DR GO; GO:0000006; P:regulation of viral transcription; IEA:PomBase.\n" + f"SQ SEQUENCE {len(protein_sq_2)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + f" {protein_sq_2}\n" + "//\n" + # Below protein with all valid associated GO class but sequence length greater than 1002 + f"ID Swiss_Prot_3 Reviewed; {len(protein_sq_1 * 25)} AA.\n" + "AC Q6GZX4;\n" + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" + "DR GO; GO:0000002; P:regulation of viral transcription; IEP:InterPro.\n" + "DR GO; GO:0000005; P:regulation of viral transcription; TAS:InterPro.\n" + "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n" + f"SQ SEQUENCE {len(protein_sq_1 * 25)} AA; 129118 MW; FE2984658CED53A8 CRC64;\n" + f" {protein_sq_1 * 25}\n" + "//\n" + # Below protein has valid go class association but invalid amino acid `X` in its sequence + "ID Swiss_Prot_4 Reviewed; 60 AA.\n" + "AC Q6GZX4;\n" + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" + "DR GO; GO:0000002; P:regulation of viral transcription; EXP:InterPro.\n" + "DR GO; GO:0000005; P:regulation of viral transcription; IEA:InterPro.\n" + "DR GO; GO:0000006; P:regulation of viral transcription; EXP:PomBase.\n" + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + " BAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" + "//\n" + # Below protein with sequence string but has no GO class + "ID Swiss_Prot_5 Reviewed; 60 AA.\n" + "AC Q6GZX4;\n" + "DR EMBL; AY548484; AAT09660.1; -; Genomic_DNA.\n" + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" + "//\n" + # Below protein with sequence string and with NO `valid` associated GO class (no evidence code) + "ID Swiss_Prot_6 Reviewed; 60 AA.\n" + "AC Q6GZX4;\n" + "DR GO; GO:0000023; P:regulation of viral transcription;\n" + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" + "//\n" + # Below protein with sequence string and with NO `valid` associated GO class (invalid evidence code) + "ID Swiss_Prot_7 Reviewed; 60 AA.\n" + "AC Q6GZX4;\n" + "DR GO; GO:0000024; P:regulation of viral transcription; IEA:SGD.\n" + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + " MAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" + "//\n" + # Below protein with sequence length greater than 1002 but with `Invalid` associated GO class + f"ID Swiss_Prot_8 Reviewed; {len(protein_sq_2 * 25)} AA.\n" + "AC Q6GZX4;\n" + "DR GO; GO:0000025; P:regulation of viral transcription; IC:Inferred.\n" + f"SQ SEQUENCE {len(protein_sq_2 * 25)} AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + f" {protein_sq_2 * 25}\n" + "//\n" + # Below protein with sequence string but invalid amino acid `X` in its sequence + 
"ID Swiss_Prot_9 Reviewed; 60 AA.\n" + "AC Q6GZX4;\n" + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + " BAFSAEDVLK EYDRRRRMEA LLLSLYYPND RKLLDYKEWS PPRVQVECPK APVEWNNPPS\n" + "//\n" + # Below protein with a `valid` associated GO class but without sequence string + "ID Swiss_Prot_10 Reviewed; 60 AA.\n" + "AC Q6GZX4;\n" + "DR GO; GO:0000027; P:regulation of viral transcription; EXP:InterPro.\n" + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + " \n" + "//\n" + # Below protein with a `Invalid` associated GO class but without sequence string + "ID Swiss_Prot_11 Reviewed; 60 AA.\n" + "AC Q6GZX4;\n" + "DR GO; GO:0000028; P:regulation of viral transcription; ND:NoData.\n" + "SQ SEQUENCE 60 AA; 29735 MW; B4840739BF7D4121 CRC64;\n" + " \n" + "//\n" + ) + + return raw_str + + @staticmethod + def get_data_in_dataframe() -> pd.DataFrame: + """ + Get a mock DataFrame representing UniProt data. + + The DataFrame contains Swiss-Prot protein data, including identifiers, accessions, GO terms, sequences, + and binary label columns representing whether each protein is associated with certain GO classes. + + Returns: + pd.DataFrame: A DataFrame containing mock UniProt data with columns for 'swiss_id', 'accession', 'go_ids', 'sequence', + and binary labels for GO classes. + """ + expected_data = OrderedDict( + swiss_id=["Swiss_Prot_1", "Swiss_Prot_2"], + accession=["Q6GZX4", "DCGZX4"], + go_ids=[[1, 2, 3, 5], [1, 2, 5]], + sequence=list(GOUniProtMockData.protein_sequences().values()), + **{ + # SP_1, SP_2 + 1: [True, True], + 2: [True, True], + 3: [True, False], + 4: [False, False], + 5: [True, True], + 6: [False, False], + }, + ) + return pd.DataFrame(expected_data) + + @staticmethod + def get_transitively_closed_graph() -> nx.DiGraph: + """ + Get the transitive closure of the ontology graph. + + Returns: + nx.DiGraph: A directed graph representing the transitive closure of the ontology graph. + """ + g = nx.DiGraph() + g.add_nodes_from(node for node in ChebiMockOntology.get_nodes()) + g.add_edges_from(GOUniProtMockData.get_edges_of_transitive_closure_graph()) + return g diff --git a/tests/unit/readers/__init__.py b/tests/unit/readers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/readers/testProteinDataReader.py b/tests/unit/readers/testProteinDataReader.py new file mode 100644 index 0000000..f097aab --- /dev/null +++ b/tests/unit/readers/testProteinDataReader.py @@ -0,0 +1,192 @@ +import unittest +from typing import List +from unittest.mock import mock_open, patch + +from chebai.preprocessing.reader import EMBEDDING_OFFSET + +from chebai_proteins.preprocessing.reader import ProteinDataReader + + +class TestProteinDataReader(unittest.TestCase): + """ + Unit tests for the ProteinDataReader class. + """ + + @classmethod + @patch( + "chebai.preprocessing.reader.open", + new_callable=mock_open, + read_data="M\nK\nT\nF\nR\nN", + ) + def setUpClass(cls, mock_file: mock_open) -> None: + """ + Set up the test environment by initializing a ProteinDataReader instance with a mocked token file. + + Args: + mock_file: Mock object for file operations. + """ + cls.reader = ProteinDataReader(token_path="/mock/path") + # After initializing, cls.reader.cache should now be set to ['M', 'K', 'T', 'F', 'R', 'N'] + assert list(cls.reader.cache.items()) == list( + { + "M": 0, + "K": 1, + "T": 2, + "F": 3, + "R": 4, + "N": 5, + }.items() + ), "Initial cache does not match expected values or the order doesn't match." 
+ + def test_read_data(self) -> None: + """ + Test the _read_data method with a protein sequence to ensure it correctly tokenizes the sequence. + """ + raw_data = "MKTFFRN" + + # Expected output based on the cached tokens + expected_output: List[int] = [ + EMBEDDING_OFFSET + 0, # M + EMBEDDING_OFFSET + 1, # K + EMBEDDING_OFFSET + 2, # T + EMBEDDING_OFFSET + 3, # F + EMBEDDING_OFFSET + 3, # F (repeated token) + EMBEDDING_OFFSET + 4, # R + EMBEDDING_OFFSET + 5, # N + ] + result = self.reader._read_data(raw_data) + self.assertEqual( + result, + expected_output, + "The _read_data method did not produce the expected tokenized output.", + ) + + def test_read_data_with_new_token(self) -> None: + """ + Test the _read_data method with a protein sequence that includes a new token. + Ensure that the new token is added to the cache and processed correctly. + """ + raw_data = "MKTFY" + + # 'Y' is not in the initial cache and should be added. + expected_output: List[int] = [ + EMBEDDING_OFFSET + 0, # M + EMBEDDING_OFFSET + 1, # K + EMBEDDING_OFFSET + 2, # T + EMBEDDING_OFFSET + 3, # F + EMBEDDING_OFFSET + len(self.reader.cache), # Y (new token) + ] + + result = self.reader._read_data(raw_data) + self.assertEqual( + result, + expected_output, + "The _read_data method did not correctly handle a new token.", + ) + + # Verify that 'Y' was added to the cache + self.assertIn( + "Y", self.reader.cache, "The new token 'Y' was not added to the cache." + ) + # Ensure it's at the correct index + self.assertEqual( + self.reader.cache["Y"], + len(self.reader.cache) - 1, + "The new token 'Y' was not added at the correct index in the cache.", + ) + + def test_read_data_with_invalid_token(self) -> None: + """ + Test the _read_data method with an invalid amino acid token to ensure it raises a KeyError. + """ + raw_data = "MKTFZ" # 'Z' is not a valid amino acid token + + with self.assertRaises(KeyError) as context: + self.reader._read_data(raw_data) + + self.assertIn( + "Invalid token 'Z' encountered", + str(context.exception), + "The KeyError did not contain the expected message for an invalid token.", + ) + + def test_read_data_with_empty_sequence(self) -> None: + """ + Test the _read_data method with an empty protein sequence to ensure it returns an empty list. + """ + raw_data = "" + + result = self.reader._read_data(raw_data) + self.assertEqual( + result, + [], + "The _read_data method did not return an empty list for an empty input sequence.", + ) + + def test_read_data_with_repeated_tokens(self) -> None: + """ + Test the _read_data method with repeated amino acid tokens to ensure it handles them correctly. + """ + raw_data = "MMMMM" + + expected_output: List[int] = [EMBEDDING_OFFSET + 0] * 5 # All tokens are 'M' + + result = self.reader._read_data(raw_data) + self.assertEqual( + result, + expected_output, + "The _read_data method did not correctly handle repeated tokens.", + ) + + @patch("builtins.open", new_callable=mock_open) + def test_finish_method_for_new_tokens(self, mock_file: mock_open) -> None: + """ + Test the on_finish method to ensure it appends only the new tokens to the token file in order. 
+ """ + # Simulate that some tokens were already loaded + self.reader._loaded_tokens_count = 6 # 6 tokens already loaded + self.reader.cache = { + "M": 0, + "K": 1, + "T": 2, + "F": 3, + "R": 4, + "N": 5, + "W": 6, # New token 1 + "Y": 7, # New token 2 + "V": 8, # New token 3 + "Q": 9, # New token 4 + "E": 10, # New token 5 + } + + # Run the on_finish method + self.reader.on_finish() + + # Check that the file was opened in append mode ('a') + mock_file.assert_called_with(self.reader.token_path, "a") + + # Verify the new tokens were written in the correct order + mock_file().writelines.assert_called_with(["W\n", "Y\n", "V\n", "Q\n", "E\n"]) + + def test_finish_method_no_new_tokens(self) -> None: + """ + Test the on_finish method when no new tokens are added (cache is the same). + """ + self.reader._loaded_tokens_count = 6 # No new tokens + self.reader.cache = { + "M": 0, + "K": 1, + "T": 2, + "F": 3, + "R": 4, + "N": 5, + } + + with patch("builtins.open", new_callable=mock_open) as mock_file: + self.reader.on_finish() + # Check that no new tokens were written + mock_file().writelines.assert_not_called() + + +if __name__ == "__main__": + unittest.main() diff --git a/tutorials/data_exploration_go.ipynb b/tutorials/data_exploration_go.ipynb new file mode 100644 index 0000000..1822c2f --- /dev/null +++ b/tutorials/data_exploration_go.ipynb @@ -0,0 +1,1343 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "da687d32ba48b188", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "This notebook serves as a guide for new developers using the `chebai` package. If you just want to run the experiments, you can refer to the [README.md](https://github.com/ChEB-AI/python-chebai/blob/dev/README.md) and the [wiki](https://github.com/ChEB-AI/python-chebai/wiki) for the basic commands. This notebook explains what happens under the hood for the GO-UniProt dataset. It covers\n", + "- how to instantiate a data class and generate data\n", + "- how the data is processed and stored\n", + "- and how to work with different molecule encodings.\n", + "\n", + "The chebai package simplifies the handling of these datasets by **automatically creating** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. This feature ensures that the right data is available and formatted properly. You can however provide your own data files, for instance if you want to replicate a specific experiment.\n", + "\n", + "---\n" + ] + }, + { + "cell_type": "markdown", + "id": "0bd07c91-bb02-48d4-b759-aa35ecb224bd", + "metadata": {}, + "source": [ + "# 1. Instantiation of a Data Class\n", + "\n", + "To start working with `chebai`, you first need to instantiate a GO-UniProt data class. 
This class is responsible for managing, interacting with, and preprocessing the GO and UniProt data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "a4d590fb-9a83-456e-9cb4-303caa8203e8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Already in the project root directory: G:\\github-aditya0by0\\python-chebai-proteins\n"
+     ]
+    }
+   ],
+   "source": [
+    "# To run this notebook, you need to change the working directory of the Jupyter notebook to the root directory of the project.\n",
+    "import os\n",
+    "\n",
+    "# Root directory name of the project\n",
+    "expected_root_dir = \"python-chebai-proteins\"\n",
+    "\n",
+    "# Check if the current directory ends with the expected root directory name\n",
+    "if not os.getcwd().endswith(expected_root_dir):\n",
+    "    os.chdir(\"..\")  # Move up one directory level\n",
+    "    if os.getcwd().endswith(expected_root_dir):\n",
+    "        print(\"Changed to project root directory:\", os.getcwd())\n",
+    "    else:\n",
+    "        print(\"Warning: Directory change unsuccessful. Current directory:\", os.getcwd())\n",
+    "else:\n",
+    "    print(\"Already in the project root directory:\", os.getcwd())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "440f203ceaf7e4b7",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T21:25:03.920610Z",
+     "start_time": "2024-09-30T21:25:03.622407Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from chebai_proteins.preprocessing.datasets.deepGO.go_uniprot import GOUniProtOver250"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a648346d81d0dc5e",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T21:25:08.863132Z",
+     "start_time": "2024-09-30T21:25:08.387739Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "go_class = GOUniProtOver250(go_branch=\"BP\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "64585012b0d7f66f",
+   "metadata": {},
+   "source": [
+    "### Inheritance Hierarchy\n",
+    "\n",
+    "GO_UniProt data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22). Specifically:\n",
+    "\n",
+    "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n",
+    "\n",
+    "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n",
+    "\n",
+    "In summary, GO_UniProt data classes are designed to manage and preprocess protein data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n",
+    "\n",
+    "\n",
+    "### Configuration Parameters\n",
+    "\n",
+    "Data classes related to proteins can be configured using the following main parameters:\n",
+    "\n",
+    "- **`go_branch (str)`**: The Gene Ontology (GO) branch. The default value is `\"all\"`, which includes all branches of GO in the dataset. Restricting it to a single branch allows for more specific datasets focused on a particular aspect of gene function.\n",
+    "    - **`\"BP\"`**: Biological Process branch.\n",
+    "    - **`\"MF\"`**: Molecular Function branch.\n",
+    "    - **`\"CC\"`**: Cellular Component branch.\n",
+    "\n",
+    "- **`max_sequence_length (int)`**: Specifies the maximum allowed sequence length for a protein, with a default of `1002`. During data preprocessing, any proteins exceeding this length will be excluded from further processing.\n",
+    "\n",
+    "- **`splits_file_path (str, optional)`**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. The default is `None`.\n",
+    "\n",
+    "### Additional Input Parameters\n",
+    "\n",
+    "To get more control over various aspects of data loading, processing, and splitting, you can refer to the documentation of additional parameters in the docstrings of the respective classes: [`_GOUniProtDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py#L33), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n",
+    "\n",
+    "\n",
+    "# Available Data Classes\n",
+    "\n",
+    "__Note__: Check the code implementation of the classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/go_uniprot.py).\n",
+    "\n",
+    "There is a range of available dataset classes for the GO-UniProt data. Usually, you want to use `GOUniProtOver250` or `GOUniProtOver50`. Both inherit from `_GOUniProtOverX`. The number indicates the threshold for selecting label classes. The selection process is based on the annotations of the GO terms, together with their ancestors, across the dataset. For instance, `GOUniProtOver50` will only select labels that have at least 50 samples in the dataset.\n",
+    "\n",
+    "Refer to the `select_classes` method of `_GOUniProtOverX` for more details on the selection process.\n",
+    "\n",
+    "If you need a different threshold, you can create your own subclass."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "651ab5c39833bd2c",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a52b4363-7398-44aa-a4cc-8bba14bdd966",
+   "metadata": {},
+   "source": [
+    "# 2. Preparation / Setup Methods\n",
+    "\n",
+    "Once a GOUniProt data class instance is created, it typically requires preparation before use. This step generates the actual dataset."
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9f77351090560bc4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking for processed data in data\\GO_UniProt\\GO250_BP_1002\\processed\n", + "Missing processed data file (`data.pkl` file)\n", + "Downloading Swiss UniProt data....\n", + "Downloading to temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n", + "Downloaded to C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n", + "Unzipping the file....\n", + "Unpacked and saved to data\\GO_UniProt\\raw\\uniprot_sprot.dat\n", + "Removed temporary file C:\\Users\\HP\\AppData\\Local\\Temp\\tmp7pp677ik\n", + "Missing Gene Ontology raw data\n", + "Downloading Gene Ontology data....\n", + "Extracting class hierarchy...\n", + "Compute transitive closure\n", + "Processing graph\n", + "Parsing swiss uniprot raw data....\n", + "Selecting GO terms based on given threshold: 250 ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Check for processed data in data\\GO_UniProt\\GO250_BP_1002\\processed\\protein_token\n", + "Cross-validation enabled: False\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Missing transformed data (`data.pt` file). Transforming data.... \n", + "Processing 53604 lines...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 53604/53604 [01:18<00:00, 678.84it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving 20 tokens to G:\\github-aditya0by0\\python-chebai\\chebai\\preprocessing\\bin\\protein_token\\tokens.txt...\n", + "First 10 tokens: ['M', 'S', 'I', 'G', 'A', 'T', 'R', 'L', 'Q', 'N']\n" + ] + } + ], + "source": [ + "go_class.prepare_data()\n", + "go_class.setup()" + ] + }, + { + "cell_type": "markdown", + "id": "2328e824c4dafb2d", + "metadata": {}, + "source": [ + "### Automatic Execution: \n", + "These methods are executed automatically within the data class instance. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n", + "\n", + "\n", + "### Why is Preparation Needed?\n", + "\n", + "- **Data Availability**: The preparation step ensures that the required GOUniProt data files are downloaded or loaded, which are essential for analysis.\n", + "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n", + "\n", + "### Main Methods for Data Preprocessing\n", + "\n", + "The data preprocessing in a data class involves two main methods:\n", + "\n", + "1. **`prepare_data` Method**:\n", + " - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels.\n", + " - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n", + "\n", + "2. **`setup` Method**:\n", + " - **Purpose**: This method sets up the data module for training, validation, and testing. 
It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n",
+    "   - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), ensuring that the data is in a format compatible with the model. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`. This method uses a subclass of Data Reader to perform the transformation.\n",
+    "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n",
+    "\n",
+    "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "db5b58f2d96823fc",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ee174b61b36c71aa",
+   "metadata": {},
+   "source": [
+    "# 3. Overview of the 3 preprocessing stages\n",
+    "\n",
+    "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n",
+    "\n",
+    "1. **Raw Data Stage**:\n",
+    "   - **File**: `go-basic.obo` and `uniprot_sprot.dat`\n",
+    "   - **Description**: This stage contains the raw GO ontology data and raw Swiss-UniProt data, serving as the initial input for further processing.\n",
+    "   - **File Paths**:\n",
+    "     - `data/GO_UniProt/raw/go-basic.obo`\n",
+    "     - `data/GO_UniProt/raw/uniprot_sprot.dat`\n",
+    "\n",
+    "2. **Processed Data Stage 1**:\n",
+    "   - **File**: `data.pkl`\n",
+    "   - **Description**: This stage includes the data after initial processing. It contains sequence strings, class columns, and metadata but lacks data splits.\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n",
+    "   - **Additional File**: `classes.txt` - A file listing the relevant GO classes.\n",
+    "\n",
+    "3. **Processed Data Stage 2**:\n",
+    "   - **File**: `data.pt`\n",
+    "   - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n",
+    "   - **File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n",
+    "   - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n",
+    "\n",
+    "**Note**: If `go_branch` is specified, the `dataset_name` will include the branch name in the format `${dataset_name}_${go_branch}`. Otherwise, it will just be `${dataset_name}`.\n",
+    "\n",
+    "### Summary of File Paths\n",
+    "\n",
+    "- **Raw Data**: `data/GO_UniProt/raw`\n",
+    "- **Processed Data 1**: `data/GO_UniProt/${dataset_name}/processed`\n",
+    "- **Processed Data 2**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}`\n",
+    "\n",
+    "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. 
It also facilitates reproducibility and traceability across different experiments.\n", + "\n", + "### Data Splits\n", + "\n", + "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n", + "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n" + ] + }, + { + "cell_type": "markdown", + "id": "a927ad484c930960", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "3f92b58e460c08fd", + "metadata": {}, + "source": [ + "# 4. Data Files and their structure\n", + "\n", + "`chebai` creates and manages several data files during its operation. These files store various chemical data and metadata essential for different tasks. Let’s explore these files and their content.\n" + ] + }, + { + "cell_type": "markdown", + "id": "cca75d881cb8bade", + "metadata": {}, + "source": [ + "## go-basic.obo File\n", + "\n", + "**Description**: The `go-basic.obo` file is a key resource in the Gene Ontology (GO) dataset, containing the ontology data that defines various biological processes, molecular functions, and cellular components, as well as their relationships. This file is downloaded directly from the Gene Ontology Consortium and serves as the foundational raw data for further processing in GO-based applications.\n", + "\n", + "#### Example of a Term Document\n", + "\n", + "```plaintext\n", + "[Term]\n", + "id: GO:0000032\n", + "name: cell wall mannoprotein biosynthetic process\n", + "namespace: biological_process\n", + "def: \"The chemical reactions and pathways resulting in the formation of cell wall mannoproteins, any cell wall protein that contains covalently bound mannose residues.\" [GOC:ai]\n", + "synonym: \"cell wall mannoprotein anabolism\" EXACT []\n", + "is_a: GO:0006057 ! mannoprotein biosynthetic process\n", + "is_a: GO:0031506 ! cell wall glycoprotein biosynthetic process\n", + "```\n", + "\n", + "**File Path**: `data/GO_UniProt/raw/go-basic.obo`\n", + "\n", + "### Structure of `go-basic.obo`\n", + "\n", + "The `go-basic.obo` file is organized into blocks of text known as \"term documents.\" Each block starts with a `[Term]` header and contains various attributes that describe a specific biological process, molecular function, or cellular component within the GO ontology. These attributes include identifiers, names, relationships to other terms, and more.\n", + "\n", + "\n", + "\n", + "### Breakdown of Attributes\n", + "\n", + "Each term document in the `go-basic.obo` file consists of the following key attributes:\n", + "\n", + "- **`[Term]`**: \n", + " - **Description**: Indicates the beginning of a new term in the ontology. 
Each term represents a distinct biological process, molecular function, or cellular component.\n", + "\n", + "- **`id: GO:0000032`**: \n", + " - **Description**: A unique identifier for the biological term within the GO ontology.\n", + " - **Example**: `GO:0000032` refers to the term \"cell wall mannoprotein biosynthetic process.\"\n", + "\n", + "- **`name: cell wall mannoprotein biosynthetic process`**: \n", + " - **Description**: The name of the biological process, molecular function, or cellular component being described.\n", + " - **Example**: The name \"cell wall mannoprotein biosynthetic process\" is a descriptive label for the GO term with the identifier `GO:0000032`.\n", + "\n", + "- **`namespace: biological_process`**: \n", + " - **Description**: Specifies which ontology the term belongs to. The main namespaces are `biological_process`, `molecular_function`, and `cellular_component`.\n", + "\n", + "- **`is_a: GO:0006057`**: \n", + " - **Description**: Defines hierarchical relationships to other terms within the ontology. The `is_a` attribute indicates that the current term is a subclass or specific instance of the referenced term.\n", + " - **Example**: The term `GO:0000032` (\"cell wall mannoprotein biosynthetic process\") is a subclass of `GO:0006057` and subclass of `GO:0031506`.\n" + ] + }, + { + "cell_type": "markdown", + "id": "87c841de7d80beef", + "metadata": {}, + "source": [ + "## uniprot_sprot.dat File\n", + "\n", + "**Description**: The `uniprot_sprot.dat` file is a key component of the UniProtKB/Swiss-Prot dataset. It contains curated protein sequences with detailed annotations. Each entry in the file corresponds to a reviewed protein sequence, complete with metadata about its biological function, taxonomy, gene name, cross-references to other databases, and more. 
Below is a breakdown of the structure and key attributes in the file, using the provided example.\n", + "\n", + "\n", + "### Example of a Protein Entry\n", + "\n", + "```plaintext\n", + "ID 002L_FRG3G Reviewed; 320 AA.\n", + "AC Q6GZX3;\n", + "DT 28-JUN-2011, integrated into UniProtKB/Swiss-Prot.\n", + "DT 19-JUL-2004, sequence version 1.\n", + "DT 08-NOV-2023, entry version 46.\n", + "DE RecName: Full=Uncharacterized protein 002L;\n", + "GN ORFNames=FV3-002L;\n", + "OS Frog virus 3 (isolate Goorha) (FV-3).\n", + "OC Viruses; Varidnaviria; Bamfordvirae; Nucleocytoviricota; Megaviricetes;\n", + "OX NCBI_TaxID=654924;\n", + "OH NCBI_TaxID=8404; Lithobates pipiens (Northern leopard frog) (Rana pipiens).\n", + "RN [1]\n", + "RP NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA].\n", + "RX PubMed=15165820; DOI=10.1016/j.virol.2004.02.019;\n", + "RA Tan W.G., Barkman T.J., Gregory Chinchar V., Essani K.;\n", + "RT \"Comparative genomic analyses of frog virus 3, type species of the genus\n", + "RT Ranavirus (family Iridoviridae).\";\n", + "RL Virology 323:70-84(2004).\n", + "CC -!- SUBCELLULAR LOCATION: Host membrane {ECO:0000305}; Single-pass membrane\n", + "CC protein {ECO:0000305}.\n", + "DR EMBL; AY548484; AAT09661.1; -; Genomic_DNA.\n", + "DR RefSeq; YP_031580.1; NC_005946.1.\n", + "DR GeneID; 2947774; -.\n", + "DR KEGG; vg:2947774; -.\n", + "DR Proteomes; UP000008770; Segment.\n", + "DR GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.\n", + "DR GO; GO:0016020; C:membrane; IEA:UniProtKB-KW.\n", + "PE 4: Predicted;\n", + "KW Host membrane; Membrane; Reference proteome; Transmembrane;\n", + "KW Transmembrane helix.\n", + "FT CHAIN 1..320\n", + "FT /note=\"Uncharacterized protein 002L\"\n", + "FT /id=\"PRO_0000410509\"\n", + "SQ SEQUENCE 320 AA; 34642 MW; 9E110808B6E328E0 CRC64;\n", + " MSIIGATRLQ NDKSDTYSAG PCYAGGCSAF TPRGTCGKDW DLGEQTCASG FCTSQPLCAR\n", + " IKKTQVCGLR YSSKGKDPLV SAEWDSRGAP YVRCTYDADL IDTQAQVDQF VSMFGESPSL\n", + " AERYCMRGVK NTAGELVSRV SSDADPAGGW CRKWYSAHRG PDQDAALGSF CIKNPGAADC\n", + " KCINRASDPV YQKVKTLHAY PDQCWYVPCA ADVGELKMGT QRDTPTNCPT QVCQIVFNML\n", + " DDGSVTMDDV KNTINCDFSK YVPPPPPPKP TPPTPPTPPT PPTPPTPPTP PTPRPVHNRK\n", + " VMFFVAGAVL VAILISTVRW\n", + "//\n", + "```\n", + "\n", + "**File Path**: `data/GO_UniProt/raw/uniprot_sprot.dat`\n", + "\n", + "\n", + "## Structure of `uniprot_sprot.dat`\n", + "\n", + "The `uniprot_sprot.dat` file is organized into blocks of text, each representing a single protein entry. These blocks contain specific tags and fields that describe different aspects of the protein, including its sequence, function, taxonomy, and cross-references to external databases.\n", + "\n", + "### Breakdown of Attributes\n", + "\n", + "Each protein entry in the `uniprot_sprot.dat` file is structured with specific tags and sections that describe the protein in detail. 
Here's a breakdown of the key attributes:\n", + "\n", + "- **`ID`**: \n", + " - **Description**: Contains the unique identifier for the protein and its status (e.g., `Reviewed` indicates the sequence has been manually curated).\n", + " - **Example**: `002L_FRG3G` is the identifier for the protein from Frog virus 3.\n", + "\n", + "- **`AC`**: \n", + " - **Description**: Accession number, a unique identifier for the protein sequence.\n", + " - **Example**: `Q6GZX3` is the accession number for this entry.\n", + "\n", + "- **`DR`**: \n", + " - **Description**: Cross-references to other databases like EMBL, RefSeq, KEGG, and GeneID.\n", + " - **Example**: This entry is cross-referenced with the EMBL database, RefSeq, GO, etc.\n", + "\n", + "- **`GO`**: \n", + " - **Description**: Gene Ontology annotations that describe the cellular component, biological process, or molecular function associated with the protein.\n", + " - **Example**: The protein is associated with the GO terms `GO:0033644` (host cell membrane) and `GO:0016020` (membrane).\n", + "\n", + "- **`SQ`**: \n", + " - **Description**: The amino acid sequence of the protein.\n", + " - **Example**: The sequence consists of 320 amino acids.\n", + "\n", + "__Note__: For more detailed information refer [here](https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/keywlist.txt\n", + "). \n", + "\n", + "Consider the below line from above example: \n", + "```plaintext\n", + "DR GO; GO:0033644; C:host cell membrane; IEA:UniProtKB-SubCell.\n", + "```\n", + "\n", + "The line contains a **Gene Ontology (GO) annotation** describing the protein's subcellular location. Here's a detailed breakdown:\n", + "\n", + "- **`GO:0033644`**: This is the specific **GO term** identifier for \"host cell membrane,\" which indicates that the protein is associated with or located at the membrane of the host cell.\n", + "\n", + "- **`IEA`**: This stands for **Inferred from Electronic Annotation**, which is part of the **GO Evidence Codes**. **IEA** indicates that the annotation was automatically generated based on computational methods rather than direct experimental evidence. While **IEA** annotations are useful, they are generally considered less reliable than manually curated or experimentally verified evidence codes.\n", + "\n", + "__Note__: For more details on evidence codes check section 5.2" + ] + }, + { + "cell_type": "markdown", + "id": "b7687078-f6b8-4fbf-afa7-dfda89061a5e", + "metadata": {}, + "source": [ + "## data.pkl File\n", + "\n", + "**Description**: This file is generated by the `prepare_data` method and contains the processed GO data in a dataframe format. It includes protein IDs, data representations (such as sequence strings), and class columns with boolean values." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b4da7e73e251e1d1", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:08:33.990378Z", + "start_time": "2024-09-30T14:08:33.959459Z" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b66fbb9b720d053c", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:10:12.796911Z", + "start_time": "2024-09-30T14:10:06.052276Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size of the data (rows x columns): (53604, 902)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
swiss_idaccessiongo_idssequence4175122165209226...1990778200002620001452000146200014720002412000243200114120012332001234
111S1_CARILB5KVH4[3006, 8150, 9791, 10431, 21700, 22414, 32501,...MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
311S2_SESINQ9XHP0[3006, 8150, 10431, 21700, 22414, 32502, 48609]MVAFKFLLALSLSLLVSAAIAQTREPRLTQGQQCRFQRISGAQPSL...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
614310_ARATHP48347,Q9LME5[7165, 8150, 9742, 9755, 9987, 43401, 50789, 5...MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
814331_ARATHP42643,Q945M2,Q9M0S7[8150, 19222, 50789, 65007]MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT...FalseFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
914331_CAEELP41932,Q21537[132, 226, 1708, 6611, 6810, 6886, 6913, 6950,...MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL...FalseFalseFalseFalseFalseTrue...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
\n", + "

5 rows Γ— 902 columns

\n", + "
" + ], + "text/plain": [ + " swiss_id accession \\\n", + "1 11S1_CARIL B5KVH4 \n", + "3 11S2_SESIN Q9XHP0 \n", + "6 14310_ARATH P48347,Q9LME5 \n", + "8 14331_ARATH P42643,Q945M2,Q9M0S7 \n", + "9 14331_CAEEL P41932,Q21537 \n", + "\n", + " go_ids \\\n", + "1 [3006, 8150, 9791, 10431, 21700, 22414, 32501,... \n", + "3 [3006, 8150, 10431, 21700, 22414, 32502, 48609] \n", + "6 [7165, 8150, 9742, 9755, 9987, 43401, 50789, 5... \n", + "8 [8150, 19222, 50789, 65007] \n", + "9 [132, 226, 1708, 6611, 6810, 6886, 6913, 6950,... \n", + "\n", + " sequence 41 75 122 \\\n", + "1 MAKPILLSIYLCLIIVALFNGCLAQSGGRQQHKFGQCQLNRLDALE... False False False \n", + "3 MVAFKFLLALSLSLLVSAAIAQTREPRLTQGQQCRFQRISGAQPSL... False False False \n", + "6 MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV... False False False \n", + "8 MATPGASSARDEFVYMAKLAEQAERYEEMVEFMEKVAKAVDKDELT... False False False \n", + "9 MSDTVEELVQRAKLAEQAERYDDMAAAMKKVTEQGQELSNEERNLL... False False False \n", + "\n", + " 165 209 226 ... 1990778 2000026 2000145 2000146 2000147 \\\n", + "1 False False False ... False False False False False \n", + "3 False False False ... False False False False False \n", + "6 False False False ... False False False False False \n", + "8 False False False ... False False False False False \n", + "9 False False True ... False False False False False \n", + "\n", + " 2000241 2000243 2001141 2001233 2001234 \n", + "1 False False False False False \n", + "3 False False False False False \n", + "6 False False False False False \n", + "8 False False False False False \n", + "9 False False False False False \n", + "\n", + "[5 rows x 902 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pkl_df = pd.DataFrame(\n", + " pd.read_pickle(\n", + " os.path.join(\n", + " go_class.processed_dir_main,\n", + " go_class.processed_dir_main_file_names_dict[\"data\"],\n", + " )\n", + " )\n", + ")\n", + "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", + "pkl_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "735844f0b2474ad6", + "metadata": {}, + "source": [ + "**File Path**: `data/GO_UniProt/${dataset_name}/processed/data.pkl`\n", + "\n", + "\n", + "### Structure of `data.pkl`\n", + "`data.pkl` as following structure: \n", + "- **Column 0**: Contains the Identifier from Swiss-UniProt Dataset for each Swiss Protein data instance.\n", + "- **Column 1**: Contains the accession of each Protein data instance.\n", + "- **Column 2**: Contains the list of GO-IDs (Identifiers from Gene Ontology) which maps each Swiss Protein to the Gene Ontology instance.\n", + "- **Column 3**: Contains the sequence representation for the Swiss Protein using Amino Acid notation.\n", + "- **Column 4 and onwards**: Contains the labels, starting from column 4.\n", + "\n", + "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" + ] + }, + { + "cell_type": "markdown", + "id": "2c9b17f6-93bd-4cc3-8967-7ab1d2e06e51", + "metadata": {}, + "source": [ + "## data.pt File\n", + "\n", + "**Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library. It includes keys such as `ident`, `features`, `labels`, and `group`, making it ready for model input." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "85b097601fb242d6", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:10:35.034002Z", + "start_time": "2024-09-30T14:10:35.018342Z" + } + }, + "outputs": [], + "source": [ + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "289a54a71dec20fb", + "metadata": { + "ExecuteTime": { + "end_time": "2024-09-30T14:11:36.443693Z", + "start_time": "2024-09-30T14:11:34.199285Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type of loaded data: \n", + "Content of the data file: \n", + " {'features': [10, 14, 21, 23, 12, 17, 17, 11, 12, 22, 17, 24, 17, 12, 12, 28, 14, 17, 25, 19, 13, 24, 17, 14, 18, 11, 13, 13, 16, 18, 18, 29, 21, 25, 13, 18, 24, 18, 17, 19, 16, 17, 20, 14, 17, 27, 23, 15, 19, 16, 12, 27, 14, 27, 14, 13, 28, 12, 27, 11, 26, 20, 23, 19, 29, 18, 18, 17, 18, 24, 14, 13, 28, 14, 28, 28, 16, 16, 15, 12, 27, 23, 19, 13, 17, 17, 17, 23, 29, 22, 11, 19, 14, 23, 18, 17, 28, 22, 12, 14, 16, 13, 16, 13, 12, 15, 13, 28, 17, 25, 23, 13, 24, 23, 27, 15, 25, 27, 27, 11, 18, 16, 18, 11, 18, 18, 13, 18, 16, 16, 27, 25, 18, 18, 20, 16, 29, 18, 21, 12, 16, 29, 25, 16, 27, 13, 20, 12, 12, 14, 25, 23, 14, 13, 28, 14, 29, 26, 24, 22, 19, 20, 13, 11, 11, 23, 28, 28, 14, 12, 25, 17, 17, 20, 15, 29, 19, 19, 14, 19, 18, 17, 20, 18, 19, 23, 16, 19, 25, 22, 17, 14, 13, 19, 23, 20, 20, 27, 25, 16, 23, 18, 13, 18, 18, 27, 22, 27, 18, 29, 16, 16, 18, 18, 18, 29, 18, 18, 16, 16, 13, 27, 29, 13, 27, 18, 18, 16, 20, 17, 13, 19, 19, 28, 25, 11, 13, 25, 20, 14, 27, 25, 17, 14, 20, 14, 25, 19, 28, 20, 15, 27, 15, 14, 16, 16, 17, 18, 11, 27, 19, 20, 29, 16, 13, 11, 12, 28, 16, 28, 27, 13, 16, 18, 17, 18, 28, 12, 16, 23, 16, 26, 11, 16, 27, 27, 18, 27, 29, 27, 27, 16, 21, 27, 16, 27, 16, 27, 16, 27, 11, 27, 11, 27, 16, 16, 18, 11, 16, 16, 13, 13, 16, 20, 20, 19, 13, 17, 27, 27, 15, 12, 24, 15, 17, 11, 17, 16, 27, 19, 12, 13, 20, 23, 11, 16, 14, 20, 12, 22, 15, 27, 27, 14, 13, 16, 12, 11, 15, 28, 19, 11, 29, 19, 17, 23, 12, 17, 16, 26, 17, 18, 17, 11, 14, 27, 16, 13, 14, 17, 22, 11, 20, 14, 17, 22, 28, 23, 29, 26, 19, 17, 19, 14, 29, 11, 28, 28, 22, 14, 17, 16, 13, 16, 14, 27, 28, 18, 28, 28, 20, 19, 25, 13, 18, 15, 28, 25, 20, 20, 27, 17, 16, 27, 13, 18, 17, 17, 15, 12, 23, 18, 19, 25, 14, 28, 28, 21, 16, 14, 16, 20, 27, 13, 25, 27, 26, 28, 11, 25, 21, 15, 19, 27, 19, 14, 10, 28, 11, 23, 17, 14, 13, 16, 15, 11, 14, 12, 16, 14, 17, 23, 27, 27, 28, 17, 28, 19, 14, 25, 18, 12, 23, 16, 27, 20, 14, 16, 16, 17, 21, 25, 19, 16, 18, 27, 11, 15, 17, 28, 16, 11, 16, 11, 16, 11, 11, 16, 11, 27, 16, 16, 14, 27, 28], 'labels': array([False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, True, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, 
False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, True, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, True, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, True, False, False, False, False, False, True,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, True, True, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, 
False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, True,\n", + " True, False, False, False, False, False, False, False, False,\n", + " True, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False, False, False,\n", + " False, False, False, False, False, False, False]), 'ident': '11S1_CARIL', 'group': None}\n" + ] + } + ], + "source": [ + "data_pt = torch.load(\n", + " os.path.join(go_class.processed_dir, go_class.processed_file_names_dict[\"data\"]),\n", + " weights_only=False,\n", + ")\n", + "print(\"Type of loaded data:\", type(data_pt))\n", + "print(\"Content of the data file: \\n\", data_pt[0])" + ] + }, + { + "cell_type": "markdown", + "id": "2c9f23883c66b48d", + "metadata": {}, + "source": [ + "**File Path**: `data/GO_UniProt/${dataset_name}/processed/${reader_name}/data.pt`\n", + "\n", + "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", + "\n", + "- **`features`**: \n", + " - **Description**: This key holds the input features for the model. 
The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n",
+    "\n",
+    "- **`labels`**: \n",
+    "  - **Description**: This key contains the labels or target values associated with each instance. Labels are also stored as tensors and are used by the model to learn and make predictions.\n",
+    "\n",
+    "- **`ident`**: \n",
+    "  - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "36aed0b8-ab05-428d-8833-2a24deebacc3",
+   "metadata": {},
+   "source": [
+    "## classes.txt File\n",
+    "\n",
+    "**Description**: This file lists the GO classes that are used as labels. It can be used to match labels in `data.pt` with GO classes: for position `i` in the label tensor, the GO-ID is in line `i` of `classes.txt`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "19200f7ff9a6ebba",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T21:30:34.344202Z",
+     "start_time": "2024-09-30T21:30:34.328318Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "41\n",
+      "75\n",
+      "122\n",
+      "165\n",
+      "209\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open(os.path.join(go_class.processed_dir_main, \"classes.txt\"), \"r\") as file:\n",
+    "    for i in range(5):\n",
+    "        line = file.readline()\n",
+    "        print(line.strip())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f69012b3540fd1b6",
+   "metadata": {},
+   "source": [
+    "**File Path**: `data/GO_UniProt/${dataset_name}/processed/classes.txt`\n",
+    "\n",
+    "The `classes.txt` file lists the selected GO classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique GO class ID, identifying a specific GO class that is used as a label."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b81ea34f-cfa8-4ffa-8b88-b54ca96afd84",
+   "metadata": {},
+   "source": [
+    "## splits.csv File\n",
+    "\n",
+    "**Description**: This file contains saved data splits from previous runs. During subsequent runs, it is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "88c3ea8f01ba9fac",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-09-30T21:30:41.586616Z",
+     "start_time": "2024-09-30T21:30:39.318598Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsplit
014331_ARATHtrain
114331_CAEELtrain
214331_MAIZEtrain
314332_MAIZEtrain
414333_ARATHtrain
\n", + "
" + ], + "text/plain": [ + " id split\n", + "0 14331_ARATH train\n", + "1 14331_CAEEL train\n", + "2 14331_MAIZE train\n", + "3 14332_MAIZE train\n", + "4 14333_ARATH train" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csv_df = pd.read_csv(os.path.join(go_class.processed_dir_main, \"splits.csv\"))\n", + "csv_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "6661dc11247e9753", + "metadata": {}, + "source": [ + "**File Path**: `data/GO_UniProt/${dataset_name}/processed/splits.csv`\n", + "\n", + "To reuse an existing split, you can use the `splits_file_path` argument. This way, you can reuse the same datasplit across several runs." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2b02d8b4-c2de-4b8e-b680-ec67b40d9a30", + "metadata": {}, + "outputs": [], + "source": [ + "# You can specify a literal path for the `splits_file_path`, or if another `go_class` instance is already defined,\n", + "# you can use its existing `splits_file_path` attribute for consistency.\n", + "go_class_with_splits = GOUniProtOver250(\n", + " go_branch=\"BP\",\n", + " # splits_file_path=\"data/GO_UniProt/GO250_BP_1002/processed/splits.csv\", # Literal path option\n", + " splits_file_path=go_class.splits_file_path, # Use path from an existing `go_class` instance\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e6b1f184a5091b83", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "481b8c0271ec9636", + "metadata": {}, + "source": [ + "## 5.1 Protein Representation Using Amino Acid Sequence Notation\n", + "\n", + "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n", + "\n", + "### Example Protein Sequence\n", + "\n", + "Protein: **Lysozyme C** from **Gallus gallus** (Chicken). \n", + "[Lysozyme C - UniProtKB P00698](https://www.uniprot.org/uniprotkb/P00698/entry#function)\n", + "\n", + "- **Sequence**: `MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQATNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDGNGMNAWVAWRNRCKGTDVQAWIRGCRL`\n", + "- **Sequence Length**: 147\n", + "\n", + "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n", + "\n", + "### Tokenization and Encoding\n", + "\n", + "To tokenize and numerically encode this protein sequence, the `ProteinDataReader` class is used. This class allows for n-gram tokenization, where the `n_gram` parameter defines the size of the tokenized units. If `n_gram` is not provided (default is `None`), each amino acid letter is treated as a single token.\n", + "\n", + "For more details, you can explore the implementation of the `ProteinDataReader` class in the source code [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/reader.py)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "e0cf4fb6-2ca4-4b85-a4e7-0cfbac5cd6c1", + "metadata": {}, + "outputs": [], + "source": [ + "from chebai_proteins.preprocessing.reader import ProteinDataReader" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "e8343d83-0be3-44df-9224-bba8d5c32336", + "metadata": {}, + "outputs": [], + "source": [ + "protein_dr_3gram = ProteinDataReader(n_gram=3)\n", + "protein_dr = ProteinDataReader()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "8a18dc27-f308-4dde-b1ae-b03a20fb0d45", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[10, 16, 11, 17, 17, 12, 17, 28, 17, 24, 25, 17, 23, 17, 14, 14, 17, 13, 21]\n", + "[30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]\n" + ] + } + ], + "source": [ + "protein = \"MRSLLILVLCFLPLAALGK\"\n", + "print(protein_dr._read_data(protein))\n", + "print(protein_dr_3gram._read_data(protein))" + ] + }, + { + "cell_type": "markdown", + "id": "7e95738a-0b2d-4c56-ac97-f3b24c1de18f", + "metadata": {}, + "source": [ + "The numbers mentioned above refer to the index of each individual token from the [`tokens.txt`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/bin/protein_token/tokens.txt) file, which is used by the `ProteinDataReader` class. \n", + "\n", + "Each token in the `tokens.txt` file corresponds to a specific amino-acid letter, and these tokens are referenced by their index. Additionally, the index values are offset by the `EMBEDDING_OFFSET`, ensuring that the token embeddings are adjusted appropriately during processing." + ] + }, + { + "cell_type": "markdown", + "id": "fd54ca4a-743c-496e-9e89-cff2d8226eb2", + "metadata": {}, + "source": [ + "### The 20 Amino Acids and Their One-Letter Notations\n", + "\n", + "Here is a list of the 20 standard amino acids, along with their one-letter notations and descriptions:\n", + "\n", + "| One-Letter Notation | Amino Acid Name | Description |\n", + "|---------------------|----------------------|---------------------------------------------------------|\n", + "| **A** | Alanine | Non-polar, aliphatic amino acid. |\n", + "| **C** | Cysteine | Polar, contains a thiol group, forms disulfide bonds. |\n", + "| **D** | Aspartic Acid | Acidic, negatively charged at physiological pH. |\n", + "| **E** | Glutamic Acid | Acidic, negatively charged at physiological pH. |\n", + "| **F** | Phenylalanine | Aromatic, non-polar. |\n", + "| **G** | Glycine | Smallest amino acid, non-polar. |\n", + "| **H** | Histidine | Polar, positively charged, can participate in enzyme active sites. |\n", + "| **I** | Isoleucine | Non-polar, aliphatic. |\n", + "| **K** | Lysine | Basic, positively charged at physiological pH. |\n", + "| **L** | Leucine | Non-polar, aliphatic. |\n", + "| **M** | Methionine | Non-polar, contains sulfur, start codon in mRNA translation. |\n", + "| **N** | Asparagine | Polar, uncharged. |\n", + "| **P** | Proline | Non-polar, introduces kinks in protein chains. |\n", + "| **Q** | Glutamine | Polar, uncharged. |\n", + "| **R** | Arginine | Basic, positively charged, involved in binding phosphate groups. |\n", + "| **S** | Serine | Polar, can be phosphorylated. |\n", + "| **T** | Threonine | Polar, can be phosphorylated. |\n", + "| **V** | Valine | Non-polar, aliphatic. |\n", + "| **W** | Tryptophan | Aromatic, non-polar, largest amino acid. |\n", + "| **Y** | Tyrosine | Aromatic, polar, can be phosphorylated. 
|\n", + "\n", + "### Understanding Protein Sequences\n", + "\n", + "In the example sequence, each letter represents one of the above amino acids. The sequence reflects the specific order of amino acids in the protein, which is critical for its structure and function.\n", + "\n", + "This notation is used extensively in various bioinformatics tools and databases to study protein structure, function, and interactions.\n", + "\n", + "\n", + "_Note_: Refer for amino acid sequence: https://en.wikipedia.org/wiki/Protein_primary_structure" + ] + }, + { + "cell_type": "markdown", + "id": "db6d7f2cc446e6f9", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "7f42b928364e5cd1", + "metadata": {}, + "source": [ + "## 5.2 More on GO Evidence Codes\n", + "\n", + "The **Gene Ontology (GO) Evidence Codes** provide a way to indicate the level of evidence supporting a GO annotation. Here's a list of the GO evidence codes with brief descriptions:\n", + "\n", + "| **Evidence Code** | **Description** |\n", + "|-----------------------|-----------------|\n", + "| **EXP** | [Inferred from Experiment (EXP)](http://wiki.geneontology.org/index.php/Inferred_from_Experiment_(EXP)) |\n", + "| **IDA** | [Inferred from Direct Assay (IDA)](http://wiki.geneontology.org/index.php/Inferred_from_Direct_Assay_(IDA)) |\n", + "| **IPI** | [Inferred from Physical Interaction (IPI)](http://wiki.geneontology.org/index.php/Inferred_from_Physical_Interaction_(IPI)) |\n", + "| **IMP** | [Inferred from Mutant Phenotype (IMP)](http://wiki.geneontology.org/index.php/Inferred_from_Mutant_Phenotype_(IMP)) |\n", + "| **IGI** | [Inferred from Genetic Interaction (IGI)](http://wiki.geneontology.org/index.php/Inferred_from_Genetic_Interaction_(IGI)) |\n", + "| **IEP** | [Inferred from Expression Pattern (IEP)](http://wiki.geneontology.org/index.php/Inferred_from_Expression_Pattern_(IEP)) |\n", + "| **HTP** | [Inferred from High Throughput Experiment (HTP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Experiment_(HTP) ) |\n", + "| **HDA** | [Inferred from High Throughput Direct Assay (HDA)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Direct_Assay_(HDA)) |\n", + "| **HMP** | [Inferred from High Throughput Mutant Phenotype (HMP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Mutant_Phenotype_(HMP)) |\n", + "| **HGI** | [Inferred from High Throughput Genetic Interaction (HGI)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Genetic_Interaction_(HGI)) |\n", + "| **HEP** | [Inferred from High Throughput Expression Pattern (HEP)](http://wiki.geneontology.org/index.php/Inferred_from_High_Throughput_Expression_Pattern_(HEP)) |\n", + "| **IBA** | [Inferred from Biological aspect of Ancestor (IBA)](http://wiki.geneontology.org/index.php/Inferred_from_Biological_aspect_of_Ancestor_(IBA)) |\n", + "| **IBD** | [Inferred from Biological aspect of Descendant (IBD)](http://wiki.geneontology.org/index.php/Inferred_from_Biological_aspect_of_Descendant_(IBD)) |\n", + "| **IKR** | [Inferred from Key Residues (IKR)](http://wiki.geneontology.org/index.php/Inferred_from_Key_Residues_(IKR)) |\n", + "| **IRD** | [Inferred from Rapid Divergence (IRD)](http://wiki.geneontology.org/index.php/Inferred_from_Rapid_Divergence(IRD)) |\n", + "| **ISS** | [Inferred from Sequence or Structural Similarity (ISS)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_or_structural_Similarity_(ISS)) |\n", + "| **ISO** | [Inferred from Sequence 
Orthology (ISO)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_Orthology_(ISO)) |\n", + "| **ISA** | [Inferred from Sequence Alignment (ISA)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_Alignment_(ISA)) |\n", + "| **ISM** | [Inferred from Sequence Model (ISM)](http://wiki.geneontology.org/index.php/Inferred_from_Sequence_Model_(ISM)) |\n", + "| **RCA** | [Inferred from Reviewed Computational Analysis (RCA)](http://wiki.geneontology.org/index.php/Inferred_from_Reviewed_Computational_Analysis_(RCA)) |\n", + "| **IEA** | [Inferred from Electronic Annotation (IEA)](http://wiki.geneontology.org/index.php/Inferred_from_Electronic_Annotation_(IEA)) |\n", + "| **TAS** | [Traceable Author Statement (TAS)](http://wiki.geneontology.org/index.php/Traceable_Author_Statement_(TAS)) |\n", + "| **NAS** | [Non-traceable Author Statement (NAS)](http://wiki.geneontology.org/index.php/Non-traceable_Author_Statement_(NAS)) |\n", + "| **IC** | [Inferred by Curator (IC)](http://wiki.geneontology.org/index.php/Inferred_by_Curator_(IC)) |\n", + "| **ND** | [No Biological Data Available (ND)](http://wiki.geneontology.org/index.php/No_biological_Data_available_(ND)_evidence_code) |\n", + "| **NR** | Not Recorded |\n", + "\n", + "\n", + "### **Grouping of Codes**:\n", + "\n", + "- **Experimental Evidence Codes**:\n", + " - **EXP**, **IDA**, **IPI**, **IMP**, **IGI**, **IEP**\n", + " \n", + "- **High-Throughput Experimental Codes**:\n", + " - **HTP**, **HDA**, **HMP**, **HGI**, **HEP**\n", + "\n", + "- **Phylogenetically-Inferred Codes**:\n", + " - **IBA**, **IBD**, **IKR**, **IRD**\n", + "\n", + "- **Author/Curator Inferred Codes**:\n", + " - **TAS**, **IC**, **NAS**\n", + "\n", + "- **Computational Evidence Codes**:\n", + " - **IEA**, **ISS**, **ISA**, **ISM**, **ISO**, **RCA**\n", + "\n", + "- **Others**:\n", + " - **ND** (No Biological Data Available), **NR** (Not Recorded)\n", + "\n", + "\n", + "These evidence codes ensure transparency and give researchers an understanding of how confident they can be in a particular GO annotation.\n", + "\n", + "__Note__ : For more information on GO evidence codes please check [here](https://geneontology.org/docs/guide-go-evidence-codes/) " + ] + }, + { + "cell_type": "markdown", + "id": "1c11d6f520b02434", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/data_exploration_scope.ipynb b/tutorials/data_exploration_scope.ipynb new file mode 100644 index 0000000..c7d17b6 --- /dev/null +++ b/tutorials/data_exploration_scope.ipynb @@ -0,0 +1,1182 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0bd757ea-a6a0-43f8-8701-cafb44f20f6b", + "metadata": {}, + "source": [ + "# Introduction\n", + "\n", + "This notebook serves as a guide for new developers using the `chebai` package. If you just want to run the experiments, you can refer to the [README.md](https://github.com/ChEB-AI/python-chebai/blob/dev/README.md) and the [wiki](https://github.com/ChEB-AI/python-chebai/wiki) for the basic commands. This notebook explains what happens under the hood for the SCOPe dataset. 
It covers\n",
+    "- how to instantiate a data class and generate data\n",
+    "- how the data is processed and stored\n",
+    "- and how to work with different protein encodings.\n",
+    "\n",
+    "The `chebai` package simplifies the handling of these datasets by **automatically downloading and processing** them as needed. This means that you do not have to input any data manually; the package will generate and organize the data files based on the parameters and encodings selected. You can, however, provide your own data files, for instance if you want to replicate a specific experiment.\n",
+    "\n",
+    "---\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cca637ce-d4ea-4365-acd9-657418e0640f",
+   "metadata": {},
+   "source": [
+    "### Overview of SCOPe Data and its Usage in Protein-Related Tasks\n",
+    "\n",
+    "#### **What is SCOPe?**\n",
+    "\n",
+    "The **Structural Classification of Proteins — extended (SCOPe)** is a comprehensive database that extends the original SCOP (Structural Classification of Proteins) database. SCOPe offers a detailed classification of protein domains based on their structural and evolutionary relationships.\n",
+    "\n",
+    "The SCOPe database, like SCOP, organizes proteins into a hierarchy of domains based on structural similarities, which is crucial for understanding evolutionary patterns and functional aspects of proteins. This hierarchical structure is comparable to taxonomy in biology, where species are classified based on shared characteristics.\n",
+    "\n",
+    "#### **SCOPe Hierarchy:**\n",
+    "By analogy with taxonomy, SCOP was created as a hierarchy of several levels where the fundamental unit of classification is a **domain** in the experimentally determined protein structure. Starting at the bottom, the hierarchy of SCOP domains comprises the following levels:\n",
+    "\n",
+    "1. **Species**: Representing distinct protein sequences and their naturally occurring or artificially created variants.\n",
+    "2. **Protein**: Groups together similar sequences with essentially the same functions. These can originate from different biological species or represent isoforms within the same species.\n",
+    "3. **Family**: Contains proteins with similar sequences but typically distinct functions.\n",
+    "4. **Superfamily**: Bridges protein families with common functional and structural features, often inferred from a shared evolutionary ancestor.\n",
+    "5. **Fold**: Groups structurally similar superfamilies. \n",
+    "6. **Class**: Based on secondary structure content and organization. This level classifies proteins based on their secondary structure properties, such as alpha-helices and beta-sheets.\n",
+    "\n",
+    "\n",
+    "\n",
+    "For more details, you can refer to the [SCOPe documentation](https://scop.berkeley.edu/help/ver=2.08).\n",
+    "\n",
+    "---\n",
+    "\n",
+    "#### **Why are We Using SCOPe?**\n",
+    "\n",
+    "We are integrating the SCOPe data into our pipeline as part of an ontology pretraining task for protein-related models. SCOPe is a great fit for our goal because it is primarily **structure-based**, unlike other protein-related databases such as the Gene Ontology (GO), which focuses more on functional classes.\n",
+    "\n",
+    "Our primary objective is to reproduce **ontology pretraining** on a protein-related task, and SCOPe provides the structural ontology that we need for this.
The steps in our pipeline are aligned as follows:\n", + "\n", + "| **Stage** | **Chemistry Task** | **Proteins Task** |\n", + "|--------------------------|-------------------------------------|------------------------------------------------|\n", + "| **Unsupervised Pretraining** | Mask pretraining (ELECTRA) | Mask pretraining (ESM2, optional) |\n", + "| **Ontology Pretraining** | ChEBI | SCOPe |\n", + "| **Finetuning Task** | Toxicity, Solubility, etc. | GO (MF, BP, CC branches) |\n", + "\n", + " \n", + "This integration will allow us to use **SCOPe** for tasks such as **protein classification** and will contribute to the success of **pretraining models** for protein structures. The data will be processed with the same approach as the GO data, with **different labels** corresponding to the SCOPe classification system.\n", + "\n", + "---\n", + "\n", + "#### **Why SCOPe is Suitable for Our Task**\n", + "\n", + "1. **Structure-Based Classification**: SCOPe is primarily concerned with the structural characteristics of proteins, making it ideal for protein structure pretraining tasks. This contrasts with other ontology databases like **GO**, which categorize proteins based on more complex functional relationships.\n", + " \n", + "2. **Manageable Size**: SCOPe contains around **140,000 entries**, making it a manageable dataset for training models. This is similar in size to **ChEBI**, which is used in the chemical domain, and ensures we can work with it effectively for pretraining." + ] + }, + { + "cell_type": "markdown", + "id": "338e452f-426c-493d-bec2-5bd51e24e4aa", + "metadata": {}, + "source": [ + "\n", + "### Protein Data Bank (PDB)\n", + "\n", + "The **Protein Data Bank (PDB)** is a global repository that stores 3D structural data of biological macromolecules like proteins and nucleic acids. It contains information obtained through experimental methods such as **X-ray crystallography**, **NMR spectroscopy**, and **cryo-EM**. The data includes atomic coordinates, secondary structure details, and experimental conditions.\n", + "\n", + "The PDB is an essential resource for **structural biology**, **bioinformatics**, and **drug discovery**, enabling scientists to understand protein functions, interactions, and mechanisms at the molecular level.\n", + "\n", + "For more details, visit the [RCSB PDB website](https://www.rcsb.org/).\n" + ] + }, + { + "cell_type": "markdown", + "id": "f6c25706-251c-438c-9915-e8002647eb94", + "metadata": {}, + "source": [ + "### Understanding [SCOPe](https://scop.berkeley.edu/) and [PDB](https://www.rcsb.org/) \n", + "\n", + "\n", + "1. **Protein domains form chains.** \n", + "2. **Chains form complexes** (protein complexes or structures). \n", + "3. These **complexes are the entries in PDB**, represented by unique identifiers like `\"1A3N\"`. \n", + "\n", + "---\n", + "\n", + "#### **Protein Domain** \n", + "A **protein domain** is a **structural and functional unit** of a protein. \n", + "\n", + "\n", + "##### Key Characteristics:\n", + "- **Domains are part of a protein chain.** \n", + "- A domain can span: \n", + " 1. **The entire chain** (single-domain protein): \n", + " - In this case, the protein domain is equivalent to the chain itself. \n", + " - Example: \n", + " - All chains of the **PDB structure \"1A3N\"** are single-domain proteins. \n", + " - Each chain has a SCOPe domain identifier. \n", + " - For example, Chain **A**: \n", + " - Domain identifier: `d1a3na_` \n", + " - Breakdown of the identifier: \n", + " - `d`: Denotes domain. 
\n", + " - `1a3n`: Refers to the PDB protein structure identifier. \n", + " - `a`: Specifies the chain within the structure. (`_` for None and `.` for multiple chains)\n", + " - `_`: Indicates the domain spans the entire chain (single-domain protein). \n", + " - Example: [PDB Structure 1A3N - Chain A](https://www.rcsb.org/sequence/1A3N#A)\n", + " 2. **A specific portion of the chain** (multi-domain protein): \n", + " - Here, a single chain contains multiple domains. \n", + " - Example: Chain **A** of the **PDB structure \"1PKN\"** contains three domains: `d1pkna1`, `d1pkna2`, `d1pkna3`. \n", + " - Example: [PDB Structure 1PKN - Chain A](https://www.rcsb.org/annotations/1PKN). \n", + "\n", + "---\n", + "\n", + "#### **Protein Chain** \n", + "A **protein chain** refers to the entire **polypeptide chain** observed in a protein's 3D structure (as described in PDB files). \n", + "\n", + "##### Key Points:\n", + "- A chain can consist of **one or multiple domains**:\n", + " - **Single-domain chain**: The chain and domain are identical. \n", + " - Example: Myoglobin. \n", + " - **Multi-domain chain**: Contains several domains, each with distinct structural and functional roles. \n", + "- Chains assemble to form **protein complexes** or **structures**. \n", + "\n", + "\n", + "---\n", + "\n", + "#### **Key Observations About SCOPe** \n", + "- The **fundamental classification unit** in SCOPe is the **protein domain**, not the entire protein. \n", + "- _**The taxonomy in SCOPe is not for the entire protein (i.e., the full-length amino acid sequence as encoded by a gene) but for protein domains, which are smaller, structurally and functionally distinct regions of the protein.**_\n", + "\n", + "\n", + "--- \n", + "\n", + "**SCOPe 2.08 Data Analysis:**\n", + "\n", + "The current SCOPe version (2.08) includes the following statistics based on analysis for relevant data:\n", + "\n", + "- **Classes**: 12\n", + "- **Folds**: 1485\n", + "- **Superfamilies**: 2368\n", + "- **Families**: 5431\n", + "- **Proteins**: 13,514\n", + "- **Species**: 30,294\n", + "- **Domains**: 344,851\n", + "\n", + "For more detailed statistics, please refer to the official SCOPe website:\n", + "\n", + "- [SCOPe 2.08 Statistics](https://scop.berkeley.edu/statistics/ver=2.08)\n", + "- [SCOPe 2.08 Release](https://scop.berkeley.edu/ver=2.08)\n", + "\n", + "---\n", + "\n", + "## SCOPe Labeling \n", + "\n", + "- Use SCOPe labels for protein domains.\n", + "- Map them back to their **protein-chain** sequences (protein sequence label = sum of all domain labels).\n", + "- Train on protein sequences.\n", + "- This pretraining task would be comparable to GO-based training.\n", + "\n", + "--- " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "990cc6f2-6b4a-4fa7-905f-dda183c3ec4c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Changed to project root directory: G:\\github-aditya0by0\\python-chebai-proteins\n" + ] + } + ], + "source": [ + "# To run this notebook, you need to change the working directory of the jupyter notebook to root dir of the project.\n", + "import os\n", + "\n", + "# Root directory name of the project\n", + "expected_root_dir = \"python-chebai-proteins\"\n", + "\n", + "# Check if the current directory ends with the expected root directory name\n", + "if not os.getcwd().endswith(expected_root_dir):\n", + " os.chdir(\"..\") # Move up one directory level\n", + " if os.getcwd().endswith(expected_root_dir):\n", + " print(\"Changed to project root 
directory:\", os.getcwd())\n", + " else:\n", + " print(\"Warning: Directory change unsuccessful. Current directory:\", os.getcwd())\n", + "else:\n", + " print(\"Already in the project root directory:\", os.getcwd())" + ] + }, + { + "cell_type": "markdown", + "id": "4550d01fc7af5ae4", + "metadata": {}, + "source": [ + "# 1. Instantiation of a Data Class\n", + "\n", + "To start working with `chebai`, you first need to instantiate a SCOPe data class. This class is responsible for managing, interacting with, and preprocessing the ChEBI chemical data." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f3a66e07-edc9-4aa2-9cd0-d4ea58914d22", + "metadata": {}, + "outputs": [], + "source": [ + "from chebai_proteins.preprocessing.datasets.scope.scope import SCOPeOver50" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a71b7301-6195-4155-a439-f5eb3183d0f3", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:07:26.371796Z", + "start_time": "2024-10-05T21:07:26.058728Z" + } + }, + "outputs": [], + "source": [ + "scope_class = SCOPeOver50(scope_version=\"2.08\")" + ] + }, + { + "cell_type": "markdown", + "id": "b810d7c9-4f7f-4725-9bc2-452ff2c3a89d", + "metadata": {}, + "source": [ + "\n", + "### Inheritance Hierarchy\n", + "\n", + "SCOPe data classes inherit from [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L598), which in turn inherits from [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L23). Specifically:\n", + "\n", + "- **`_DynamicDataset`**: This class serves as an intermediate base class that provides additional functionality or customization for datasets that require dynamic behavior. It inherits from `XYBaseDataModule`, which provides the core methods for data loading and processing.\n", + "\n", + "- **`XYBaseDataModule`**: This is the base class for data modules, providing foundational properties and methods for handling and processing datasets, including data splitting, loading, and preprocessing.\n", + "\n", + "In summary, ChEBI data classes are designed to manage and preprocess chemical data effectively by leveraging the capabilities provided by `XYBaseDataModule` through the `_DynamicDataset` intermediary.\n", + "\n", + "\n", + "### Input parameters\n", + "A SCOPe data class can be configured with a range of parameters, including:\n", + "\n", + "- **scope_version (str)**: Specifies the version of the ChEBI database to be used. Specifying a version ensures the reproducibility of your experiments by using a consistent dataset.\n", + "\n", + "- **scope_version_train (str, optional)**: The version of ChEBI to use specifically for training and validation. If not set, the `scope_version` specified will be used for all data splits, including training, validation, and test. Defaults to `None`.\n", + "\n", + "- **splits_file_path (str, optional)**: Path to a CSV file containing data splits. If not provided, the class will handle splits internally. 
Defaults to `None`.\n",
+    "\n",
+    "### Additional Input Parameters\n",
+    "\n",
+    "To get more control over various aspects of data loading, processing, and splitting, you can refer to the documentation of additional parameters in the docstrings of the respective classes: [`_SCOPeDataExtractor`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/scope/scope.py#L31), [`XYBaseDataModule`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L22), [`_DynamicDataset`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/base.py#L597), etc.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8578b7aa-1bd9-4e50-9eee-01bfc6d5464a",
+   "metadata": {},
+   "source": [
+    "# Available SCOPe Data Classes\n",
+    "\n",
+    "__Note__: Check the code implementation of classes [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/datasets/scope/scope.py):\n",
+    "\n",
+    "There is a range of available dataset classes for SCOPe. Usually, you want to use `SCOPeOver2000` or `SCOPeOver50`. The number indicates the threshold for selecting label classes: SCOPe classes which have at least 2000 / 50 subclasses will be used as labels.\n",
+    "\n",
+    "Both inherit from `SCOPeOverX`. If you need a different threshold, you can create your own subclass, as sketched below. By default, `SCOPeOverX` uses the Protein encoding (see Section 5).\n",
+    "\n",
+    "Finally, `SCOPeOver2000Partial` extracts a part of SCOPe based on a given top class, with a threshold of 2000 for selecting labels.\n",
+    "This class inherits from `SCOPEOverXPartial`.\n"
+   ]
+  },
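+  {
+   "cell_type": "markdown",
+   "id": "3f2a1b4c5d6e7f80",
+   "metadata": {},
+   "source": [
+    "A custom threshold class could look like the following sketch. It assumes that `SCOPeOverX` exposes the selection threshold as a `THRESHOLD` class attribute (analogous to the `ChEBIOverX` subclasses in `python-chebai`); check the linked source for the actual attribute name.\n",
+    "\n",
+    "```python\n",
+    "from chebai_proteins.preprocessing.datasets.scope.scope import SCOPeOverX\n",
+    "\n",
+    "\n",
+    "class SCOPeOver500(SCOPeOverX):\n",
+    "    # Hypothetical subclass: keep only SCOPe classes with at least\n",
+    "    # 500 members as labels (THRESHOLD is an assumed attribute name).\n",
+    "    THRESHOLD = 500\n",
+    "\n",
+    "\n",
+    "scope_class_500 = SCOPeOver500(scope_version=\"2.08\")\n",
+    "```"
+   ]
+  },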
+  {
+   "cell_type": "markdown",
+   "id": "8456b545-88c5-401d-baa5-47e8ae710f04",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ed973fb59df11849",
+   "metadata": {},
+   "source": [
+    "# 2. Preparation / Setup Methods\n",
+    "\n",
+    "Now we have a SCOPe data class with all the relevant parameters. Next, we need to generate the actual dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "11f2208e-fa40-44c9-bfe7-576ca23ad366",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Checking for processed data in data\SCOPe\version_2.08\SCOPe50\processed\n",
+      "Missing processed data file (`data.pkl` file)\n",
+      "Missing PDB raw data, Downloading PDB sequence data....\n",
+      "Downloading to temporary file C:\Users\HP\AppData\Local\Temp\tmpsif7r129\n",
+      "Downloaded to C:\Users\HP\AppData\Local\Temp\tmpsif7r129\n",
+      "Unzipping the file....\n",
+      "Unpacked and saved to data\SCOPe\pdb_sequences.txt\n",
+      "Removed temporary file C:\Users\HP\AppData\Local\Temp\tmpsif7r129\n",
+      "Missing Scope: cla.txt raw data, Downloading...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "G:\anaconda3\envs\env_chebai\lib\site-packages\urllib3\connectionpool.py:1099: InsecureRequestWarning: Unverified HTTPS request is being made to host 'scop.berkeley.edu'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n",
+      "warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Missing Scope: hie.txt raw data, Downloading...\n",
+      "Missing Scope: des.txt raw data, Downloading...\n",
+      "Extracting class hierarchy...\n",
+      "Computing transitive closure\n",
+      "Process graph\n",
+      "101 labels has been selected for specified threshold, \n",
+      "Constructing data.pkl file .....\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Check for processed data in data\SCOPe\version_2.08\SCOPe50\processed\protein_token\n",
+      "Cross-validation enabled: False\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Missing transformed data (`data.pt` file). Transforming data.... \n",
+      "Processing 60298 lines...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████████████████████████████████████████████████████████████████| 60298/60298 [00:53<00:00, 1119.10it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saving 21 tokens to G:\github-aditya0by0\python-chebai\chebai\preprocessing\bin\protein_token\tokens.txt...\n",
+      "First 10 tokens: ['M', 'S', 'I', 'G', 'A', 'T', 'R', 'L', 'Q', 'N']\n"
+     ]
+    }
+   ],
+   "source": [
+    "scope_class.prepare_data()\n",
+    "scope_class.setup()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1655d489-25fe-46de-9feb-eeca5d36936f",
+   "metadata": {},
+   "source": [
+    "\n",
+    "### Automatic Execution: \n",
+    "These methods are executed automatically when using the training command `chebai fit`. Users do not need to call them explicitly, as the code internally manages the preparation and setup of data, ensuring that it is ready for subsequent use in training and validation processes.\n",
+    "\n",
+    "### Why is Preparation Needed?\n",
+    "\n",
+    "- **Data Availability**: The preparation step ensures that the required SCOPe data files are downloaded or loaded, which are essential for analysis.\n",
+    "- **Data Integrity**: It ensures that the data files are transformed into a compatible format required for model input.\n",
+    "\n",
+    "### Main Methods for Data Preprocessing\n",
+    "\n",
+    "The data preprocessing in a data class involves two main methods:\n",
+    "\n",
+    "1. **`prepare_data` Method**:\n",
+    "   - **Purpose**: This method checks for the presence of raw data in the specified directory. If the raw data is missing, it fetches the ontology, creates a dataframe, and saves it to a file (`data.pkl`). The dataframe includes columns such as IDs, data representations, and labels. This step is independent of input encodings.\n",
+    "   - **Documentation**: [PyTorch Lightning - `prepare_data`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#prepare-data)\n",
+    "\n",
+    "2. **`setup` Method**:\n",
+    "   - **Purpose**: This method sets up the data module for training, validation, and testing. It checks for the processed data and, if necessary, performs additional setup to ensure the data is ready for model input. It also handles cross-validation settings if enabled.\n",
+    "   - **Description**: Transforms `data.pkl` into a model input data format (`data.pt`), tokenizing the input according to the specified encoding. The transformed data contains the following keys: `ident`, `features`, `labels`, and `group`.
This method uses a subclass of Data Reader to perform the tokenization.\n",
+    "   - **Documentation**: [PyTorch Lightning - `setup`](https://lightning.ai/docs/pytorch/stable/data/datamodule.html#setup)\n",
+    "\n",
+    "These methods ensure that the data is correctly prepared and set up for subsequent use in training and validation processes."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f5aaa12d-5f01-4b74-8b59-72562af953bf",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bb6e9a81554368f7",
+   "metadata": {},
+   "source": [
+    "# 3. Overview of the 3 preprocessing stages\n",
+    "\n",
+    "The `chebai` library follows a three-stage preprocessing pipeline, which is reflected in its file structure:\n",
+    "\n",
+    "1. **Raw Data Stage**:\n",
+    "   - **Files**: `cla.txt`, `des.txt` and `hie.txt`. Please find a description of each file [here](https://scop.berkeley.edu/help/ver=2.08#parseablefiles-2.08).\n",
+    "   - **Description**: This stage contains the raw SCOPe data in txt format, serving as the initial input for further processing.\n",
+    "   - **File Path**: `data/SCOPe/version_${scope_version}/raw/${filename}.txt`\n",
+    "\n",
+    "2. **Processed Data Stage 1**:\n",
+    "   - **File**: `data.pkl`\n",
+    "   - **Description**: This stage includes the data after initial processing. It contains protein sequence strings, class columns, and metadata but lacks data splits.\n",
+    "   - **File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/data.pkl`\n",
+    "   - **Additional File**: `classes.txt` - A file listing the relevant SCOPe classes.\n",
+    "\n",
+    "3. **Processed Data Stage 2**:\n",
+    "   - **File**: `data.pt`\n",
+    "   - **Description**: This final stage includes the encoded data in a format compatible with PyTorch, ready for model input. This stage also references data splits when available.\n",
+    "   - **File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/${reader_name}/data.pt`\n",
+    "   - **Additional File**: `splits.csv` - Contains saved splits for reproducibility.\n",
+    "\n",
+    "The directory attributes corresponding to these stages can be inspected directly on the data class, as sketched below.\n",
+    "\n",
+    "This structured approach to data management ensures that each stage of data processing is well-organized and documented, from raw data acquisition to the preparation of model-ready inputs. It also facilitates reproducibility and traceability across different experiments.\n",
+    "\n",
+    "### Data Splits\n",
+    "\n",
+    "- **Creation**: Data splits are generated dynamically \"on the fly\" during training and evaluation to ensure flexibility and adaptability to different tasks.\n",
+    "- **Reproducibility**: To maintain consistency across different runs, splits can be reproduced by comparing hashes with a fixed seed value.\n"
+   ]
+  },
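+  {
+   "cell_type": "markdown",
+   "id": "0a1b2c3d4e5f6071",
+   "metadata": {},
+   "source": [
+    "A quick way to see where these stages live on disk is to print the directory attributes of the data class. This sketch only uses attributes that also appear in the cells further below; the exact paths depend on your `scope_version` and dataset class:\n",
+    "\n",
+    "```python\n",
+    "import os\n",
+    "\n",
+    "# Processed Data Stage 1: directory with data.pkl, classes.txt and splits.csv\n",
+    "print(scope_class.processed_dir_main)\n",
+    "\n",
+    "# Processed Data Stage 2: directory with the encoded data.pt for the chosen reader\n",
+    "print(scope_class.processed_dir)\n",
+    "\n",
+    "# Full paths of the two processed files\n",
+    "print(\n",
+    "    os.path.join(\n",
+    "        scope_class.processed_dir_main,\n",
+    "        scope_class.processed_main_file_names_dict[\"data\"],  # data.pkl\n",
+    "    )\n",
+    ")\n",
+    "print(\n",
+    "    os.path.join(\n",
+    "        scope_class.processed_dir,\n",
+    "        scope_class.processed_file_names_dict[\"data\"],  # data.pt\n",
+    "    )\n",
+    ")\n",
+    "```"
+   ]
+  },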
+  {
+   "cell_type": "markdown",
+   "id": "7e172c0d1e8bb93f",
+   "metadata": {},
+   "source": [
+    "# 4. Data Files and their structure\n",
+    "\n",
+    "`chebai` creates and manages several data files during its operation. These files store various protein data and metadata essential for different tasks. Let’s explore these files and their content.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "43329709-5134-4ce5-88e7-edd2176bf84d",
+   "metadata": {},
+   "source": [
+    "## raw files\n",
+    "- cla.txt, des.txt and hie.txt\n",
+    "\n",
+    "For a detailed description of the raw files and their structures, please refer to the official website [here](https://scop.berkeley.edu/help/ver=2.08#parseablefiles-2.08).\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "558295e5a7ded456",
+   "metadata": {},
+   "source": [
+    "## data.pkl File\n",
+    "\n",
+    "**Description**: Generated by the `prepare_data` method, this file contains the processed data in a dataframe format. It includes an id, the `sids` (the SCOPe domain identifiers used to label the corresponding sequence), the protein-chain sequence, and one boolean column per label."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "fd490270-59b8-4c1c-8b09-204defddf592",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-05T21:09:01.622317Z",
+     "start_time": "2024-10-05T21:09:01.606698Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "d7d16247-092c-4e8d-96c2-ab23931cf766",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-05T21:11:51.296162Z",
+     "start_time": "2024-10-05T21:11:44.559304Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Size of the data (rows x columns):  (60424, 1035)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsidssequenceclass_46456class_48724class_51349class_53931class_56572class_56835class_56992...species_187294species_56257species_186882species_56690species_161316species_57962species_58067species_267696species_311502species_311501
01[d4oq9a_, d4oq9b_, d4oq9c_, d4oq9d_, d4niaa_, ...AAAAAAAAAAFalseTrueFalseFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
12[d7dxhc_]AAAAAAAAAAAAAAAAAAAAAAAFalseFalseFalseFalseFalseTrueFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
23[d1gkub1, d1gkub2, d1gkub3, d1gkub4]AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASLCLFPEDFLLKEF...FalseFalseTrueFalseTrueFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
34[d3c9wa2, d3c9wb2, d3c9wa3, d3c9wb3]AAAAAAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNLNKV...FalseFalseFalseTrueFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
45[d1xwaa1, d1xwab_, d1xwac_, d1xwad_, d1xwaa2]AAAAAMVYQVKDKADLDGQLTKASGKLVVLDFFATWCGPCKMISPK...FalseFalseTrueFalseFalseFalseFalse...FalseFalseFalseFalseFalseFalseFalseFalseFalseTrue
\n", + "

5 rows × 1035 columns</p>

\n", + "
" + ], + "text/plain": [ + " id sids \\\n", + "0 1 [d4oq9a_, d4oq9b_, d4oq9c_, d4oq9d_, d4niaa_, ... \n", + "1 2 [d7dxhc_] \n", + "2 3 [d1gkub1, d1gkub2, d1gkub3, d1gkub4] \n", + "3 4 [d3c9wa2, d3c9wb2, d3c9wa3, d3c9wb3] \n", + "4 5 [d1xwaa1, d1xwab_, d1xwac_, d1xwad_, d1xwaa2] \n", + "\n", + " sequence class_46456 \\\n", + "0 AAAAAAAAAA False \n", + "1 AAAAAAAAAAAAAAAAAAAAAAA False \n", + "2 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAASLCLFPEDFLLKEF... False \n", + "3 AAAAAAGPEMVRGQVFDVGPRYTNLSYIGEGAYGMVCSAYDNLNKV... False \n", + "4 AAAAAMVYQVKDKADLDGQLTKASGKLVVLDFFATWCGPCKMISPK... False \n", + "\n", + " class_48724 class_51349 class_53931 class_56572 class_56835 \\\n", + "0 True False False False False \n", + "1 False False False False True \n", + "2 False True False True False \n", + "3 False False True False False \n", + "4 False True False False False \n", + "\n", + " class_56992 ... species_187294 species_56257 species_186882 \\\n", + "0 False ... False False False \n", + "1 False ... False False False \n", + "2 False ... False False False \n", + "3 False ... False False False \n", + "4 False ... False False False \n", + "\n", + " species_56690 species_161316 species_57962 species_58067 \\\n", + "0 False False False False \n", + "1 False False False False \n", + "2 False False False False \n", + "3 False False False False \n", + "4 False False False False \n", + "\n", + " species_267696 species_311502 species_311501 \n", + "0 False False False \n", + "1 False False False \n", + "2 False False True \n", + "3 False False True \n", + "4 False False True \n", + "\n", + "[5 rows x 1035 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pkl_df = pd.DataFrame(\n", + " pd.read_pickle(\n", + " os.path.join(\n", + " scope_class.processed_dir_main,\n", + " scope_class.processed_main_file_names_dict[\"data\"],\n", + " )\n", + " )\n", + ")\n", + "print(\"Size of the data (rows x columns): \", pkl_df.shape)\n", + "pkl_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "322bc926-69ff-4b93-9e95-5e8b85869c38", + "metadata": {}, + "source": [ + "**File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/data.pkl`\n", + "\n", + "\n", + "### Structure of `data.pkl`\n", + "`data.pkl` as following structure: \n", + "- **Column 0**: Contains the ID of eachdata instance.\n", + "- **Column 1**: Contains the `sids` which are associated with corresponding protein-chain sequence.\n", + "- **Column 2**: Contains the protein-chain sequence.\n", + "- **Column 3 and onwards**: Contains the labels, starting from column 3.\n", + "\n", + "This structure ensures that the data is organized and ready for further processing, such as further encoding.\n" + ] + }, + { + "cell_type": "markdown", + "id": "ba019d2d4324bd0b", + "metadata": {}, + "source": [ + "## data.pt File\n", + "\n", + "\n", + "**Description**: Generated by the `setup` method, this file contains encoded data in a format compatible with the PyTorch library, specifically as a list of dictionaries. Each dictionary in this list includes keys such as `ident`, `features`, `labels`, and `group`, ready for model input." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "977ddd83-b469-4b58-ab1a-8574fb8769b4", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:12:49.338943Z", + "start_time": "2024-10-05T21:12:49.323319Z" + } + }, + "outputs": [], + "source": [ + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3266ade9-efdc-49fe-ae07-ed52b2eb52d0", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:14:12.892845Z", + "start_time": "2024-10-05T21:13:59.859953Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Type of loaded data: \n" + ] + } + ], + "source": [ + "data_pt = torch.load(\n", + " os.path.join(\n", + " scope_class.processed_dir, scope_class.processed_file_names_dict[\"data\"]\n", + " ),\n", + " weights_only=False,\n", + ")\n", + "print(\"Type of loaded data:\", type(data_pt))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "84cfa3e6-f60d-47c0-9f82-db3d5673d1e7", + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-05T21:14:21.185027Z", + "start_time": "2024-10-05T21:14:21.169358Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'features': [14, 14, 14, 14, 20, 15, 15, 28, 15, 18, 25, 17, 18, 11, 25, 21, 27, 19, 14, 27, 19, 13, 14, 17, 16, 21, 25, 22, 27, 28, 12, 10, 20, 19, 13, 13, 14, 28, 17, 20, 20, 12, 19, 11, 17, 15, 27, 28, 15, 12, 17, 14, 23, 11, 19, 27, 14, 26, 19, 11, 11, 19, 12, 19, 19, 28, 17, 16, 20, 16, 19, 21, 10, 16, 18, 12, 17, 19, 10, 29, 12, 12, 21, 20, 16, 17, 19, 28, 20, 21, 12, 16, 18, 21, 19, 14, 19, 17, 12, 14, 18, 28, 23, 15, 28, 19, 19, 19, 15, 25, 17, 22, 25, 19, 28, 16, 13, 27, 13, 11, 20, 15, 28, 12, 15, 28, 27, 13, 13, 13, 28, 19, 14, 15, 28, 12, 18, 14, 20, 28, 14, 18, 15, 19, 13, 22, 28, 29, 12, 12, 20, 29, 28, 17, 13, 28, 23, 22, 15, 15, 28, 17, 13, 21, 17, 27, 11, 20, 23, 10, 10, 11, 20, 15, 22, 21, 10, 13, 21, 25, 11, 29, 25, 19, 20, 18, 17, 19, 19, 15, 18, 16, 16, 25, 15, 22, 25, 28, 23, 16, 20, 21, 13, 26, 18, 21, 15, 27, 17, 20, 22, 23, 11, 14, 29, 21, 21, 17, 25, 10, 14, 20, 25, 11, 22, 29, 11, 21, 11, 12, 17, 27, 16, 29, 17, 14, 12, 11, 20, 21, 27, 22, 15, 10, 21, 20, 17, 28, 21, 25, 11, 18, 27, 11, 13, 11, 28, 12, 17, 23, 15, 25, 16, 20, 11, 17, 11, 12, 16, 28, 27, 27, 27, 14, 13, 16, 22, 28, 12, 12, 26, 19, 22, 21, 21, 12, 19, 28, 22, 16, 23, 20, 28, 27, 24, 15, 19, 13, 12, 12, 29, 28, 12, 20, 22, 23, 17, 17, 27, 27, 21, 20, 28, 28, 28, 14, 13, 13, 11, 14, 14, 14, 14, 14], 'labels': array([False, True, False, ..., False, False, False]), 'ident': 6, 'group': None}\n" + ] + } + ], + "source": [ + "for i in range(5, 6):\n", + " print(data_pt[i])" + ] + }, + { + "cell_type": "markdown", + "id": "0d80ffbb-5f1e-4489-9bc8-d688c9be1d07", + "metadata": {}, + "source": [ + "**File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/${reader_name}/data.pt`\n", + "\n", + "\n", + "### Structure of `data.pt`\n", + "\n", + "The `data.pt` file is a list where each element is a dictionary with the following keys:\n", + "\n", + "- **`features`**: \n", + " - **Description**: This key holds the input features for the model. The features are typically stored as tensors and represent the attributes used by the model for training and evaluation.\n", + "\n", + "- **`labels`**: \n", + " - **Description**: This key contains the labels or target values associated with each instance. 
Labels are also stored as tensors and are used by the model to learn and make predictions.\n",
+    "\n",
+    "- **`ident`**: \n",
+    "  - **Description**: This key holds identifiers for each data instance. These identifiers help track and reference the individual samples in the dataset.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "186ec6f0eed6ecf7",
+   "metadata": {},
+   "source": [
+    "## classes.txt File\n",
+    "\n",
+    "**Description**: A file containing the list of selected SCOPe **labels** based on the specified threshold. This file is crucial for ensuring that only relevant **labels** are included in the dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "8d1fbe6c-beb8-4038-93d4-c56bc7628716",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-05T21:15:19.146285Z",
+     "start_time": "2024-10-05T21:15:18.503284Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "class_48724\n",
+      "class_53931\n",
+      "class_310555\n",
+      "fold_48725\n",
+      "fold_56111\n",
+      "fold_56234\n",
+      "fold_310573\n",
+      "superfamily_48726\n",
+      "superfamily_56112\n",
+      "superfamily_56235\n",
+      "superfamily_310607\n",
+      "family_48942\n",
+      "family_56251\n",
+      "family_191359\n",
+      "family_191470\n"
+     ]
+    }
+   ],
+   "source": [
+    "with open(os.path.join(scope_class.processed_dir_main, \"classes.txt\"), \"r\") as file:\n",
+    "    for i in range(15):\n",
+    "        line = file.readline()\n",
+    "        print(line.strip())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "861da1c3-0401-49f0-a22f-109814ed95d5",
+   "metadata": {},
+   "source": [
+    "\n",
+    "**File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/classes.txt`\n",
+    "\n",
+    "The `classes.txt` file lists the selected SCOPe classes. These classes are chosen based on a specified threshold, which is typically used for filtering or categorizing the dataset. Each line in the file corresponds to a unique SCOPe class ID, identifying a specific class within the SCOPe ontology along with its hierarchy level.\n",
+    "\n",
+    "This file is essential for organizing the data and ensuring that only relevant classes, as defined by the threshold, are included in subsequent processing and analysis tasks.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fb72be449e52b63f",
+   "metadata": {},
+   "source": [
+    "## splits.csv File\n",
+    "\n",
+    "**Description**: Contains saved data splits from previous runs. During subsequent runs, this file is used to reconstruct the train, validation, and test splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "3ebdcae4-4344-46bd-8fc0-a82ef5d40da5",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-10-05T21:15:54.575116Z",
+     "start_time": "2024-10-05T21:15:53.945139Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idsplit
01train
13train
24train
36train
49train
\n", + "
" + ], + "text/plain": [ + " id split\n", + "0 1 train\n", + "1 3 train\n", + "2 4 train\n", + "3 6 train\n", + "4 9 train" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csv_df = pd.read_csv(os.path.join(scope_class.processed_dir_main, \"splits.csv\"))\n", + "csv_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "b058714f-e434-4367-89b9-74c129ac727f", + "metadata": {}, + "source": [ + "\n", + "\n", + "**File Path**: `data/SCOPe/version_${scope_version}/${dataset_name}/processed/splits.csv`\n", + "\n", + "The `splits.csv` file contains the saved data splits from previous runs, including the train, validation, and test sets. During subsequent runs, this file is used to reconstruct these splits by filtering the encoded data (`data.pt`) based on the IDs stored in `splits.csv`. This ensures consistency and reproducibility in data splitting, allowing for reliable evaluation and comparison of model performance across different run.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6dc3fd6c-7cf6-47ef-812f-54319a0cdeb9", + "metadata": {}, + "outputs": [], + "source": [ + "# You can specify a literal path for the `splits_file_path`, or if another `scope_class` instance is already defined,\n", + "# you can use its existing `splits_file_path` attribute for consistency.\n", + "scope_class_with_splits = SCOPeOver2000(\n", + " scope_version=\"2.08\",\n", + " # splits_file_path=\"data/chebi_v231/ChEBI50/processed/splits.csv\", # Literal path option\n", + " splits_file_path=scope_class.splits_file_path, # Use path from an existing `chebi_class` instance\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d", + "metadata": {}, + "source": [ + "## 5.1 Protein Representation Using Amino Acid Sequence Notation\n", + "\n", + "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n", + "\n", + "### Example Protein Sequence\n", + "\n", + "Protein-Chain: PDB ID:**1cph** Chain ID:**B** mol:protein length:30 INSULIN (PH 10)\n", + "
+  {
+   "cell_type": "markdown",
+   "id": "a5eb482c-ce5b-4efc-b2ec-85ac7b1a78ee",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ab110764-216d-4d52-a9d1-4412c8ac8c9d",
+   "metadata": {},
+   "source": [
+    "## 5.1 Protein Representation Using Amino Acid Sequence Notation\n",
+    "\n",
+    "Proteins are composed of chains of amino acids, and these sequences can be represented using a one-letter notation for each amino acid. This notation provides a concise way to describe the primary structure of a protein.\n",
+    "\n",
+    "### Example Protein Sequence\n",
+    "\n",
+    "Protein-Chain: PDB ID:**1cph** Chain ID:**B** mol:protein length:30 INSULIN (PH 10)\n",
+    "Reference: [1cph_B](https://www.rcsb.org/sequence/1CPH)\n",
+    "\n",
+    "- **Sequence**: `FVNQHLCGSHLVEALYLVCGERGFFYTPKA`\n",
+    "- **Sequence Length**: 30\n",
+    "\n",
+    "In this sequence, each letter corresponds to a specific amino acid. This notation is widely used in bioinformatics and molecular biology to represent protein sequences.\n",
+    "\n",
+    "### Tokenization and Encoding\n",
+    "\n",
+    "To tokenize and numerically encode this protein sequence, the `ProteinDataReader` class is used. This class supports n-gram tokenization, where the `n_gram` parameter defines the size of the tokenized units. If `n_gram` is not provided (the default is `None`), each amino acid letter is treated as a single token.\n",
+    "\n",
+    "For more details, you can explore the implementation of the `ProteinDataReader` class in the source code [here](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/reader.py)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "da47d47e-4560-46af-b246-235596f27d82",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from chebai_proteins.preprocessing.reader import ProteinDataReader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "8bdbf309-29ec-4aab-a6dc-9e09bc6961a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "protein_dr_3gram = ProteinDataReader(n_gram=3)\n",
+    "protein_dr = ProteinDataReader()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "68e5c87c-79c3-4d5f-91e6-635399a84d3d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[25, 28, 19, 18, 29, 17, 24, 13, 11, 29, 17, 28, 27, 14, 17, 22, 17, 28, 24, 13, 27, 16, 13, 25, 25, 22, 15, 23, 21, 14]\n",
+      "[5023, 2218, 3799, 2290, 6139, 2208, 6917, 4674, 484, 439, 2737, 851, 365, 2624, 3240, 4655, 1904, 3737, 1453, 2659, 5160, 3027, 2355, 7163, 4328, 3115, 6207, 1234]\n"
+     ]
+    }
+   ],
+   "source": [
+    "protein = \"FVNQHLCGSHLVEALYLVCGERGFFYTPKA\"\n",
+    "print(protein_dr._read_data(protein))\n",
+    "print(protein_dr_3gram._read_data(protein))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5b7211ee-2ccc-46d3-8e8f-790f344726ba",
+   "metadata": {},
+   "source": [
+    "The numbers shown above are the indices of the individual tokens in the [`tokens.txt`](https://github.com/ChEB-AI/python-chebai/blob/dev/chebai/preprocessing/bin/protein_token/tokens.txt) file used by the `ProteinDataReader` class; the 3-gram reader uses its own token file under `protein_token_3_gram`.\n",
+    "\n",
+    "Each token in the `tokens.txt` file corresponds to a specific amino acid letter (or n-gram), and these tokens are referenced by their index. Additionally, the index values are offset by `EMBEDDING_OFFSET`, ensuring that the token embeddings are adjusted appropriately during processing."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "93e328cf-09f9-4694-b175-28320590937d",
+   "metadata": {},
+   "source": [
+    "---"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}