diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
deleted file mode 100644
index b04fb15..0000000
--- a/.github/workflows/black.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-name: Lint
-
-on: [push, pull_request]
-
-jobs:
-  lint:
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@v2
-    - uses: psf/black@stable
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 0000000..bb9154f
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,26 @@
+name: Lint
+
+on: [push, pull_request]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10' # or any version your project uses
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install black==25.1.0 ruff==0.12.2
+
+      - name: Run Black
+        run: black --check .
+
+      - name: Run Ruff (no formatting)
+        run: ruff check . --no-fix
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0ad2115..d01839c 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12"]

     steps:
       - uses: actions/checkout@v4
diff --git a/.gitignore b/.gitignore
index 05cdfb7..587b4f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -170,3 +170,4 @@ electra_pretrained.ckpt
 .jupyter
 .virtual_documents
 .isort.cfg
+.vscode
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 108b91d..cbb7284 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,6 +1,6 @@
 repos:
 - repo: https://github.com/psf/black
-  rev: "24.2.0"
+  rev: "25.1.0"
   hooks:
   - id: black
   - id: black-jupyter # for formatting jupyter-notebook
@@ -23,3 +23,9 @@
   - id: check-yaml
   - id: end-of-file-fixer
   - id: trailing-whitespace
+
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.12.2
+  hooks:
+  - id: ruff
+    args: [--fix]
diff --git a/README.md b/README.md
index f0033da..cd8069a 100644
--- a/README.md
+++ b/README.md
@@ -7,19 +7,22 @@

 ## 🔧 Installation

-To install, follow these steps:
+To install this repository, download [`python-chebai`](https://github.com/ChEB-AI/python-chebai) and this repository, then run

-1. Clone the repository:
 ```
-git clone https://github.com/ChEB-AI/python-chebai-proteins.git
+cd python-chebai
+pip install .
+
+cd python-chebai-proteins
+pip install .
 ```

-2. Install the package:
+_Note for developers_: If you want to install the package in editable mode, use the following command instead:
+```bash
+pip install -e .
 ```
-cd python-chebai
-pip install .
-```
+

 ## 🗂 Recommended Folder Structure
@@ -43,39 +46,6 @@
 This setup enables shared access to data and model configurations.

 ## 🚀 Training & Pretraining Guide

-### ⚠️ Important Setup Instructions
-
-Before running any training scripts, ensure the environment is correctly configured:
-
-* Either:
-
-  * Install the `python-chebai` repository as a package using:
-
-    ```bash
-    pip install .
-    ```
-* **OR**
-
-  * Manually set the `PYTHONPATH` environment variable if working across multiple directories (`python-chebai` and `python-chebai-proteins`):
-
-    * If your current working directory is `python-chebai-proteins`, set:
-
-      ```bash
-      export PYTHONPATH=path/to/python-chebai
-      ```
-      or vice versa.
-
-    * If you're working within both repositories simultaneously or facing module not found errors, we **recommend configuring both directories**:
-
-      ```bash
-      # Linux/macOS
-      export PYTHONPATH=path/to/python-chebai:path/to/python-chebai-proteins
-
-      # Windows (use semicolon instead of colon)
-      set PYTHONPATH=path\to\python-chebai;path\to\python-chebai-proteins
-      ```
-
-> 🔎 See the [PYTHONPATH Explained](#-pythonpath-explained) section below for more details.

 ### 📊 SCOPE hierarchy prediction

 ```
 python -m chebai fit --trainer=../configs/training/default_trainer.yml --trainer...
 ```

 Same command can be used for **DeepGO** just by changing the config path for data.
-
-
-
-
-
-
-
-## 🧭 PYTHONPATH Explained
-
-### What is `PYTHONPATH`?
-
-`PYTHONPATH` is an environment variable that tells Python where to search for modules that aren't installed via `pip` or not in your current working directory.
-
-### Why You Need It
-
-If your config refers to a custom module like:
-
-```yaml
-class_path: chebai_proteins.preprocessing.datasets.scope.scope.SCOPe50
-```
-
-...and you're running the code from `python-chebai`, Python won't know where to find `chebai_proteins` (from another repo like `python-chebai-proteins/`) unless you add it to `PYTHONPATH`.
-
-
-### How Python Finds Modules
-
-Python looks for imports in this order:
-
-1. Current directory
-2. Standard library
-3. Paths in `PYTHONPATH`
-4. Installed packages (`site-packages`)
-
-You can inspect the full search paths:
-
-```bash
-python -c "import sys; print(sys.path)"
-```
-
-
-
-### ✅ Setting `PYTHONPATH`
-
-#### 🐧 Linux / macOS
-
-```bash
-export PYTHONPATH=/path/to/python-chebai-graph
-echo $PYTHONPATH
-```
-
-#### 🪟 Windows CMD
-
-```cmd
-set PYTHONPATH=C:\path\to\python-chebai-graph
-echo %PYTHONPATH%
-```
-
-> 💡 Note: This is temporary for your terminal session. To make it permanent, add it to your system environment variables.
diff --git a/chebai_proteins/loss/bce_logits_loss.py b/chebai_proteins/loss/bce_logits_loss.py
new file mode 100644
index 0000000..dd629a4
--- /dev/null
+++ b/chebai_proteins/loss/bce_logits_loss.py
@@ -0,0 +1,7 @@
+import torch
+
+
+class WrappedBCEWithLogitsLoss(torch.nn.BCEWithLogitsLoss):
+    def forward(self, input, target, **kwargs):
+        # As the custom passed kwargs are not used in BCEWithLogitsLoss, we can ignore them
+        return super().forward(input, target)
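A minimal usage sketch of the wrapper added above (assuming `chebai-proteins` is installed): any extra keyword arguments a caller forwards are dropped before delegating to `torch.nn.BCEWithLogitsLoss`. The `current_epoch` keyword below is only a hypothetical example of such a kwarg, not something this patch passes.

```python
import torch

from chebai_proteins.loss.bce_logits_loss import WrappedBCEWithLogitsLoss

loss_fn = WrappedBCEWithLogitsLoss()

logits = torch.randn(4, 3)                     # raw model outputs, no sigmoid applied
targets = torch.randint(0, 2, (4, 3)).float()  # multi-label 0/1 targets

# Extra keyword arguments are silently ignored by the wrapper.
loss = loss_fn(logits, targets, current_epoch=0)
print(loss.item())
```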
diff --git a/chebai_proteins/preprocessing/datasets/deepGO/go_uniprot.py b/chebai_proteins/preprocessing/datasets/deepGO/go_uniprot.py
index dbdf93e..e9a7ac0 100644
--- a/chebai_proteins/preprocessing/datasets/deepGO/go_uniprot.py
+++ b/chebai_proteins/preprocessing/datasets/deepGO/go_uniprot.py
@@ -102,31 +102,43 @@ class _GOUniProtDataExtractor(_DynamicDataset, ABC):
     # Gene Ontology (GO) has three major branches, one for biological processes (BP), molecular functions (MF) and
     # cellular components (CC). The value "all" will take data related to all three branches into account.
+    # TODO: should we be really allowing all branches for single dataset?
     _ALL_GO_BRANCHES: str = "all"
     _GO_BRANCH_NAMESPACE: Dict[str, str] = {
-        "BP": "biological_process",
-        "MF": "molecular_function",
-        "CC": "cellular_component",
+        "BP": "biological_process",  # Huge branch, with 20,000+ GO terms
+        "MF": "molecular_function",  # smaller branch, with 6000+ GO terms
+        "CC": "cellular_component",  # smallest branch, with 2,000+ GO terms
     }

-    def __init__(self, **kwargs):
-        self.go_branch: str = self._get_go_branch(**kwargs)
+    READER = None
+
+    def __init__(
+        self,
+        go_branch: str,
+        max_sequence_len: int = 1002,
+        use_esm2_embeddings: bool = False,
+        **kwargs,
+    ):
+        if bool(use_esm2_embeddings):
+            self.READER = dr.ESM2EmbeddingReader

-        self.max_sequence_length: int = int(kwargs.get("max_sequence_length", 1002))
+        self.go_branch: str = self._get_go_branch(go_branch)
+
+        self.max_sequence_length: int = int(max_sequence_len)
         assert (
             self.max_sequence_length >= 1
         ), "Max sequence length should be greater than or equal to 1."

         super(_GOUniProtDataExtractor, self).__init__(**kwargs)

-        if self.reader.n_gram is not None:
+        if hasattr(self.reader, "n_gram") and self.reader.n_gram is not None:
             assert self.max_sequence_length >= self.reader.n_gram, (
                 f"max_sequence_length ({self.max_sequence_length}) must be greater than "
                 f"or equal to n_gram ({self.reader.n_gram})."
             )

     @classmethod
-    def _get_go_branch(cls, **kwargs) -> str:
+    def _get_go_branch(cls, go_branch_value: str, **kwargs) -> str:
         """
         Retrieves the Gene Ontology (GO) branch based on provided keyword arguments.

         This method checks if a valid GO branch value is provided in the keyword arguments.
@@ -141,7 +153,6 @@ def _get_go_branch(cls, **kwargs) -> str:
         Raises:
             ValueError: If the provided 'go_branch' value is not in the allowed list of values.
         """
-        go_branch_value = kwargs.get("go_branch", cls._ALL_GO_BRANCHES)
         allowed_values = list(cls._GO_BRANCH_NAMESPACE.keys()) + [cls._ALL_GO_BRANCHES]
         if go_branch_value not in allowed_values:
             raise ValueError(
@@ -181,7 +192,7 @@ def _download_gene_ontology_data(self) -> str:

         if not os.path.isfile(go_path):
             print("Missing Gene Ontology raw data")
-            print(f"Downloading Gene Ontology data....")
+            print("Downloading Gene Ontology data....")
             r = requests.get(self._GO_DATA_URL, allow_redirects=True)
             r.raise_for_status()  # Check if the request was successful
             open(go_path, "wb").write(r.content)
@@ -207,7 +218,7 @@ def _download_swiss_uni_prot_data(self) -> Optional[str]:
         os.makedirs(os.path.dirname(uni_prot_file_path), exist_ok=True)

         if not os.path.isfile(uni_prot_file_path):
-            print(f"Downloading Swiss UniProt data....")
+            print("Downloading Swiss UniProt data....")

             # Create a temporary file
             with NamedTemporaryFile(delete=False) as tf:
@@ -223,7 +234,7 @@

             # Unpack the gzipped file
             try:
-                print(f"Unzipping the file....")
+                print("Unzipping the file....")
                 with gzip.open(temp_filename, "rb") as f_in:
                     output_file_path = uni_prot_file_path
                     with open(output_file_path, "wb") as f_out:
@@ -375,7 +386,7 @@ def _graph_to_raw_dataset(self, g: nx.DiGraph) -> pd.DataFrame:
         Returns:
             pd.DataFrame: The raw dataset created from the graph.
         """
-        print(f"Processing graph")
+        print("Processing graph")

         data_df = self._get_swiss_to_go_mapping()
         # add ancestors to go ids
@@ -457,6 +468,14 @@ def _get_swiss_to_go_mapping(self) -> pd.DataFrame:

             if not record.sequence or len(record.sequence) > self.max_sequence_length:
                 # Consider protein with only sequence representation and seq. length not greater than max seq. length
+
+                # DeepGO1 paper ignores proteins with sequence length greater than 1002: https://github.com/bio-ontology-research-group/deepgo/blob/master/aaindex.py#L9-L14
+                # But DeepGO2 paper truncates the sequence to 1000: https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L26-L33
+                # Latest Discussion: https://github.com/ChEB-AI/python-chebai/issues/36#issuecomment-2385693976
+                # So, we ignore proteins with sequence length greater than max_sequence_length
+                # The rationale is that with only a partial representation of the protein sequence, the model may not learn effectively.
+                # Also, proteins longer than 1002 are only 3.32% of the total proteins in Swiss-Prot dataset.
+                # https://github.com/ChEB-AI/python-chebai/issues/36#issuecomment-2431460448
                 continue

             if any(aa in AMBIGUOUS_AMINO_ACIDS for aa in record.sequence):
@@ -559,8 +578,8 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
             )
         except FileNotFoundError:
             raise FileNotFoundError(
-                f"File data.pt doesn't exists. "
-                f"Please call 'prepare_data' and/or 'setup' methods to generate the dataset files"
+                "File data.pt doesn't exists. "
+                "Please call 'prepare_data' and/or 'setup' methods to generate the dataset files"
             )

         df_go_data = pd.DataFrame(data_go)
@@ -586,7 +605,7 @@ def base_dir(self) -> str:
         Returns:
             str: The path to the base directory, which is "data/GO_UniProt".
         """
-        return os.path.join("data", f"GO_UniProt")
+        return os.path.join("data", "GO_UniProt")

     @property
     def raw_file_names_dict(self) -> dict:
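As a rough sketch of the reworked constructor interface: `go_branch` is now an explicit argument validated against `BP`, `MF`, `CC` and `all`, and `use_esm2_embeddings=True` swaps the class-level `READER` to the ESM2 embedding reader. Only `go_branch="all"` appears verbatim elsewhere in this patch (`protein_pretraining.py` and the tests); the import path, the `"MF"` value and the absence of further required kwargs are assumptions based on the surrounding code.

```python
from chebai_proteins.preprocessing.datasets.deepGO.go_uniprot import GOUniProtOver250

# Single-branch dataset; an invalid go_branch value raises ValueError.
mf_data = GOUniProtOver250(go_branch="MF", max_sequence_len=1002)

# Same dataset, but read via ESM2 embeddings instead of the default protein token reader.
mf_esm_data = GOUniProtOver250(go_branch="MF", use_esm2_embeddings=True)
```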
diff --git a/chebai_proteins/preprocessing/datasets/deepGO/protein_pretraining.py b/chebai_proteins/preprocessing/datasets/deepGO/protein_pretraining.py
index 8c39d86..d1f615a 100644
--- a/chebai_proteins/preprocessing/datasets/deepGO/protein_pretraining.py
+++ b/chebai_proteins/preprocessing/datasets/deepGO/protein_pretraining.py
@@ -38,7 +38,7 @@ def __init__(self, **kwargs):
         Args:
             **kwargs: Additional arguments for the superclass initialization.
         """
-        self._go_uniprot_extractor = GOUniProtOver250()
+        self._go_uniprot_extractor = GOUniProtOver250(go_branch="all")
         assert self._go_uniprot_extractor.go_branch == GOUniProtOver250._ALL_GO_BRANCHES

         self.max_sequence_length: int = int(kwargs.get("max_sequence_length", 1002))
@@ -143,7 +143,6 @@ def _parse_protein_data_for_pretraining(self) -> pd.DataFrame:
             has_valid_associated_go_label = False
             for cross_ref in record.cross_references:
                 if cross_ref[0] == self._go_uniprot_extractor._GO_DATA_INIT:
-
                     if len(cross_ref) <= 3:
                         # No evidence code
                         continue
@@ -223,8 +222,8 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
             )
         except FileNotFoundError:
             raise FileNotFoundError(
-                f"File data.pt doesn't exists. "
-                f"Please call 'prepare_data' and/or 'setup' methods to generate the dataset files"
+                "File data.pt doesn't exists. "
+                "Please call 'prepare_data' and/or 'setup' methods to generate the dataset files"
             )

         df_go_data = pd.DataFrame(data_go)
diff --git a/chebai_proteins/preprocessing/datasets/scope/scope.py b/chebai_proteins/preprocessing/datasets/scope/scope.py
index bf3540e..842ce92 100644
--- a/chebai_proteins/preprocessing/datasets/scope/scope.py
+++ b/chebai_proteins/preprocessing/datasets/scope/scope.py
@@ -67,17 +67,22 @@ class _SCOPeDataExtractor(_DynamicDataset, ABC):
         "sp": "species",
         "px": "domain",
     }
+    READER = None

     def __init__(
         self,
         scope_version: str,
         scope_version_train: Optional[str] = None,
-        max_sequence_len: int = 1000,
+        max_sequence_len: int = 1002,
+        use_esm2_embeddings: bool = False,
         **kwargs,
     ):
+        if bool(use_esm2_embeddings):
+            self.READER = ESM2EmbeddingReader
+
         self.scope_version: str = scope_version
         self.scope_version_train: str = scope_version_train
-        self.max_sequence_len: int = max_sequence_len
+        self.max_sequence_len: int = int(max_sequence_len)

         super(_SCOPeDataExtractor, self).__init__(**kwargs)
@@ -130,7 +135,7 @@ def _download_pdb_sequence_data(self) -> None:
         os.makedirs(os.path.dirname(pdb_seq_file_path), exist_ok=True)

         if not os.path.isfile(pdb_seq_file_path):
-            print(f"Missing PDB raw data, Downloading PDB sequence data....")
+            print("Missing PDB raw data, Downloading PDB sequence data....")

             # Create a temporary file
             with NamedTemporaryFile(delete=False) as tf:
@@ -146,7 +151,7 @@

             # Unpack the gzipped file
             try:
-                print(f"Unzipping the file....")
+                print("Unzipping the file....")
                 with gzip.open(temp_filename, "rb") as f_in:
                     output_file_path = pdb_seq_file_path
                     with open(output_file_path, "wb") as f_out:
@@ -224,7 +229,6 @@ def add_sequence_nodes_edges(chain_sequence, px_sun_id):
         # Step 1: Build the graph structure and store node attributes
         for row in df_scope.itertuples(index=False):
             if row.level == "px":
-
                 pdb_id, chain_id = row.sid[1:5], row.sid[5]

                 if pdb_id not in pdb_id_set or chain_id == "_":
@@ -422,7 +426,7 @@ def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame:
         Raises:
             RuntimeError: If no sunids are selected.
         """
-        print(f"Process graph")
+        print("Process graph")

         selected_sun_ids_per_lvl = self.select_classes(graph)

@@ -546,7 +550,6 @@ def _parse_pdb_sequence_file(self) -> pd.DataFrame:
         for record in SeqIO.parse(
             os.path.join(self.scope_root_dir, self.raw_file_names_dict["PDB"]), "fasta"
         ):
-
             if not record.seq or len(record.seq) > self.max_sequence_len:
                 continue

@@ -665,8 +668,8 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
             )
         except FileNotFoundError:
             raise FileNotFoundError(
-                f"File data.pt doesn't exists. "
-                f"Please call 'prepare_data' and/or 'setup' methods to generate the dataset files"
+                "File data.pt doesn't exists. "
+                "Please call 'prepare_data' and/or 'setup' methods to generate the dataset files"
             )

         df_scope_version = pd.DataFrame(data_scope_version)
@@ -934,7 +937,6 @@ class SCOPeOver2000(_SCOPeOverX):


 class SCOPeOver50(_SCOPeOverX):
-
     THRESHOLD = 50


@@ -951,10 +953,6 @@ class SCOPeOverPartial2000(_SCOPeOverXPartial):
     THRESHOLD: int = 2000


-class SCOPeOver50ESM(SCOPeOver50):
-    READER = ESM2EmbeddingReader
-
-
 if __name__ == "__main__":
     scope = SCOPeOver50(scope_version="2.08")
diff --git a/chebai_proteins/preprocessing/reader.py b/chebai_proteins/preprocessing/reader.py
index 21bdaea..640b9c5 100644
--- a/chebai_proteins/preprocessing/reader.py
+++ b/chebai_proteins/preprocessing/reader.py
@@ -181,7 +181,7 @@ def __init__(
         self.truncation_length = truncation_length
         self.toks_per_batch = toks_per_batch
         self.return_contacts = return_contacts
-        self.repr_layer = repr_layer
+        self.repr_layer = int(repr_layer)

         self._model: Optional[ESM2] = None
         self._alphabet: Optional[Alphabet] = None
@@ -355,6 +355,7 @@ def _alphabet_tokens_to_esm_embedding(self, tokens: torch.Tensor) -> torch.Tenso

         References:
             https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/extract_esm.py#L82-L107
+            https://github.com/facebookresearch/esm?tab=readme-ov-file#usage-

         Returns:
             torch.Tensor: Protein embedding from the specified representation layer.
@@ -393,3 +394,16 @@ def on_finish(self) -> None:
             None
         """
         pass
+
+
+if __name__ == "__main__":
+    reader = ProteinDataReader()
+    sample_sequence = "MKTFFVAGVILLLLPLVSSQCVNLTTRTQSRGDPTQKARPEPT"
+    token_indices = reader._read_data(sample_sequence)
+    print(f"Token indices for the sequence: {token_indices}")
+
+    esm_reader = ESM2EmbeddingReader(
+        model_name="esm2_t6_8M_UR50D", repr_layer="6", device=torch.device("cpu")
+    )
+    embeddings = esm_reader._read_data(sample_sequence)
+    print(f"ESM2 embeddings shape: {len(embeddings)}")
diff --git a/configs/loss/BCELoss.yml b/configs/loss/BCELoss.yml
deleted file mode 100644
index 6ee636d..0000000
--- a/configs/loss/BCELoss.yml
+++ /dev/null
@@ -1 +0,0 @@
-class_path: torch.nn.BCELoss
diff --git a/configs/loss/BCEWithLogitsLoss.yml b/configs/loss/BCEWithLogitsLoss.yml
new file mode 100644
index 0000000..606cbcb
--- /dev/null
+++ b/configs/loss/BCEWithLogitsLoss.yml
@@ -0,0 +1 @@
+class_path: chebai_proteins.loss.bce_logits_loss.WrappedBCEWithLogitsLoss
diff --git a/configs/model/electra.yml b/configs/model/electra.yml
new file mode 100644
index 0000000..da1dbf4
--- /dev/null
+++ b/configs/model/electra.yml
@@ -0,0 +1,14 @@
+class_path: chebai.models.Electra
+init_args:
+  optimizer_kwargs:
+    lr: 1e-3
+  config:
+    vocab_size: 31 # 21 unique + embedding offset (10)
+    # For classification: [Maximum sequence length (1002) (padding will be also upto 1002)] + 1 for CLS token
+    # For pretraining: [Maximum sequence length (1002) (padding will be also upto 1002)] + 10 embedding offset (includes all special tokens)
+    # Hence, use max of (classification, pretraining): max_position_embeddings = 1002 + 10 = 1012
+    max_position_embeddings: 1012
+    num_attention_heads: 8
+    num_hidden_layers: 6
+    type_vocab_size: 1
+    hidden_size: 256
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..8075481
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,33 @@
+[build-system]
+requires = ["setuptools >= 77.0.3", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "chebai-proteins"
+version = "0.0.2"
+description = "Repository for protein prediction and classification, built on top of the python-chebai codebase"
+authors = []
+readme = "README.md"
+license = { text = "AGPL-3.0" }
+requires-python = ">=3.10, <3.13"
+dependencies = [
+    "chebai @ git+https://github.com/ChEB-AI/python-chebai.git",
+    "biopython",
+    "fair-esm",
+]
+
+[project.optional-dependencies]
+dev = ["black", "isort", "pre-commit"]
+plot = ["matplotlib", "seaborn"]
+wandb = ["wandb"]
+
+[tool.setuptools]
+include-package-data = true
+license-files = ["LICEN[CS]E*"]
+
+[tool.setuptools.packages.find]
+where = ["."]
+exclude = ["tests*"]
+
+[tool.setuptools.package-data]
+"*" = ["**/*.txt", "**/*.json"]
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 27284a0..0000000
--- a/setup.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from setuptools import find_packages, setup
-
-packages = find_packages()
-print(packages)
-setup(
-    name="chebai-proteins",
-    version="0.0.2.dev0",
-    packages=packages,
-    package_data={"": ["**/*.txt", "**/*.json"]},
-    include_package_data=True,
-    url="",
-    license="",
-    author="MGlauer",
-    author_email="martin.glauer@ovgu.de",
-    description="",
-    zip_safe=False,
-    python_requires=">=3.9, <3.13",
-    install_requires=[
-        "chebai @ git+https://github.com/ChEB-AI/python-chebai.git",
-        "biopython",
-        "fair-esm",
-    ],
-    extras_require={"dev": ["black", "isort", "pre-commit"]},
-)
diff --git a/tests/unit/dataset_classes/testGOUniProDataExtractor.py b/tests/unit/dataset_classes/testGOUniProDataExtractor.py
index 8cee8f8..d23f8e7 100644
--- a/tests/unit/dataset_classes/testGOUniProDataExtractor.py
+++ b/tests/unit/dataset_classes/testGOUniProDataExtractor.py
@@ -37,7 +37,7 @@ def setUpClass(

         _GOUniProtDataExtractor.READER = ProteinDataReader

-        cls.extractor = _GOUniProtDataExtractor()
+        cls.extractor = _GOUniProtDataExtractor(go_branch="all")

     def test_term_callback(self) -> None:
         """
diff --git a/tests/unit/dataset_classes/testGoUniProtOverX.py b/tests/unit/dataset_classes/testGoUniProtOverX.py
index ccd2d66..2ec70da 100644
--- a/tests/unit/dataset_classes/testGoUniProtOverX.py
+++ b/tests/unit/dataset_classes/testGoUniProtOverX.py
@@ -17,7 +17,7 @@ def setUpClass(cls, mock_makedirs) -> None:
         """
         Set up the class for tests by initializing the extractor, graph, and input DataFrame.
         """
-        cls.extractor = _GOUniProtOverX()
+        cls.extractor = _GOUniProtOverX(go_branch="all")
         cls.test_graph: nx.DiGraph = GOUniProtMockData.get_transitively_closed_graph()
         cls.input_df: pd.DataFrame = GOUniProtMockData.get_data_in_dataframe().iloc[
             :, :4
diff --git a/tutorials/data_exploration_scope.ipynb b/tutorials/data_exploration_scope.ipynb
index c7d17b6..a083ad5 100644
--- a/tutorials/data_exploration_scope.ipynb
+++ b/tutorials/data_exploration_scope.ipynb
@@ -1049,13 +1049,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": null,
    "id": "6dc3fd6c-7cf6-47ef-812f-54319a0cdeb9",
    "metadata": {},
    "outputs": [],
    "source": [
    "# You can specify a literal path for the `splits_file_path`, or if another `scope_class` instance is already defined,\n",
    "# you can use its existing `splits_file_path` attribute for consistency.\n",
+    "from chebai_proteins.preprocessing.datasets.scope.scope import SCOPeOver2000\n",
+    "\n",
    "scope_class_with_splits = SCOPeOver2000(\n",
    "    scope_version=\"2.08\",\n",
    "    # splits_file_path=\"data/chebi_v231/ChEBI50/processed/splits.csv\", # Literal path option\n",
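Taken together with the `scope.py` changes above, the removed `SCOPeOver50ESM` class is replaced by the `use_esm2_embeddings` flag. A usage sketch: the first call mirrors the `__main__` block in `scope.py`, while the flag-based call is an assumption built from the new constructor signature, not code from this patch.

```python
from chebai_proteins.preprocessing.datasets.scope.scope import SCOPeOver50

# Default token-based protein reader, as in scope.py's __main__ block.
scope = SCOPeOver50(scope_version="2.08")

# Same dataset, but with the class-level READER switched to ESM2EmbeddingReader.
scope_esm = SCOPeOver50(scope_version="2.08", use_esm2_embeddings=True)
```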