diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 330e857..3906c84 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ exclude: '^docs/conf.py' repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: trailing-whitespace - id: check-added-large-files @@ -18,26 +18,26 @@ repos: args: ['--fix=auto'] # replace 'auto' with 'lf' to enforce Linux/Mac line endings or 'crlf' for Windows - repo: https://github.com/pycqa/isort - rev: 5.13.2 + rev: 7.0.0 hooks: - id: isort args: ["--profile", "black", "--filter-files"] - repo: https://github.com/psf/black - rev: 24.10.0 + rev: 26.1.0 hooks: - id: black language_version: python3 - repo: https://github.com/PyCQA/flake8 - rev: 7.1.1 + rev: 7.3.0 hooks: - id: flake8 ## You can add flake8 plugins via `additional_dependencies`: # additional_dependencies: [flake8-bugbear] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.13.0 # Use the sha / tag you want to point at + rev: v1.19.1 # Use the sha / tag you want to point at hooks: - id: mypy additional_dependencies: ['types-requests'] diff --git a/LICENSE.txt b/LICENSE.txt index 760fb8d..f2d6f40 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -4,4 +4,4 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file +THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/readme.md b/readme.md index ccfa8c6..633502e 100644 --- a/readme.md +++ b/readme.md @@ -6,7 +6,7 @@ The package is not yet deployed to PyPI. Only an editable (development) install 2. Clone the repository `git clonehttps://github.com/ClimateCompatibleGrowth/research_index_backend.git` 3. Change directory `cd research_index_backend` 4. Install the package `pip install -e .` as an editable package (development install) -5. Obtain an OpenAIRE Graph refresh token and create a .env file with the following parameters: +5. Obtain an OpenAIRE Graph refresh token and create a .env file with the following parameters: ```MG_HOST= MG_PORT= MG_PORT_ALT= @@ -30,7 +30,7 @@ The package is not yet deployed to PyPI. Only an editable (development) install research_index --help usage: research_index [-h] [-i] [-l LIMIT] [-u] list_of_dois - + positional arguments: list_of_dois Path to CSV file containing list of DOIs @@ -39,7 +39,7 @@ The package is not yet deployed to PyPI. 
Only an editable (development) install -i, --initialise Delete existing data and create new database -l, --limit N Limit number of DOIs to process (default: 50) -u, --update-metadata Update metadata for existing DOIs - -w, --write-metadata Save JSON responses to disk + -w, --write-metadata Save JSON responses to disk Examples: -> Process 10 DOIs from file: diff --git a/src/research_index_backend/create_graph_from_doi.py b/src/research_index_backend/create_graph_from_doi.py index 4bcd29c..153013c 100644 --- a/src/research_index_backend/create_graph_from_doi.py +++ b/src/research_index_backend/create_graph_from_doi.py @@ -192,7 +192,9 @@ def upload_article_to_memgraph(output: AnonymousArticle) -> bool: return True -def main(list_of_dois: list, limit: int, update_metadata: bool, write_metadata: bool): +def main( + list_of_dois: list, limit: int, update_metadata: bool, write_metadata: bool +): try: doi_manager = DOIManager( list_of_dois, limit=limit, update_metadata=update_metadata @@ -343,8 +345,12 @@ def entry_point(db: Driver) -> None: logger.info("Deleted graph") load_initial_data(join("data", "init")) - doi_manager = main(list_of_dois, limit=args.limit, update_metadata=args.update_metadata, - write_metadata=args.write_metadata) + doi_manager = main( + list_of_dois, + limit=args.limit, + update_metadata=args.update_metadata, + write_metadata=args.write_metadata, + ) add_country_relations() metrics, processed_dois = doi_manager.ingestion_metrics() @@ -357,17 +363,18 @@ def entry_point(db: Driver) -> None: print(f"{key.ljust(max_key_length)} | {value}") print("\nProcessing Results:") - print(f"\n• Failed metadata DOIs ({metrics['metadata_failure']}):") - for doi in processed_dois['metadata_failure']: + print(f"\n• Failed metadata DOIs ({metrics['metadata_failure']}):") + for doi in processed_dois["metadata_failure"]: print(f" - {doi}") - - print(f"\n• Invalid pattern DOIs ({metrics['invalid_pattern_dois']}):") - for doi in processed_dois['invalid_pattern_dois']: + + 
print(f"\n• Invalid pattern DOIs ({metrics['invalid_pattern_dois']}):") + for doi in processed_dois["invalid_pattern_dois"]: print(f" - {doi}") - + print(f"\n• Duplicated Submissions ({metrics['duplicated_submissions']}):") - for doi in processed_dois['duplicated_submissions']: + for doi in processed_dois["duplicated_submissions"]: print(f" - {doi}") - + + if __name__ == "__main__": entry_point() diff --git a/src/research_index_backend/doi.py b/src/research_index_backend/doi.py index dd4375a..2812265 100644 --- a/src/research_index_backend/doi.py +++ b/src/research_index_backend/doi.py @@ -11,13 +11,15 @@ from collections import Counter from logging import getLogger from re import IGNORECASE, compile -from typing import Dict, List +from typing import Any, Dict, List, Tuple from neo4j import Driver + +# https://neo4j.com/docs/api/python-driver/current/api.html#errors from neo4j.exceptions import ( - ServiceUnavailable, Neo4jError, -) # https://neo4j.com/docs/api/python-driver/current/api.html#errors + ServiceUnavailable, +) from pydantic import BaseModel from .session import connect_to_db @@ -38,16 +40,12 @@ class DOI(BaseModel): ingestion_success: bool = False -class DOITracker(BaseModel): - doi_tracker: Dict[str, DOI] - - class DOIManager: - """Manages the validation and ingestion tracking of Digital Object Identifiers (DOIs). + """Tracks the validation and ingestion of Digital Object Identifiers (DOIs) - This class handles DOI validation, database existence checks, and metadata tracking. - It processes DOIs up to a specified limit and can optionally update metadata - for existing entries. + This class handles DOI validation, database existence checks, and metadata + tracking. It processes DOIs up to a specified limit and can optionally + update metadata for existing entries. 
Parameters ---------- """ def __init__( - self, list_of_dois: List[str], limit: int, update_metadata: bool = False + self, + list_of_dois: List[str], + limit: int, + update_metadata: bool = False, ) -> None: self._validate_inputs(list_of_dois, limit, update_metadata) @@ -104,9 +105,10 @@ def __init__( limit if limit < len(self.list_of_dois) else len(self.list_of_dois) ) self.update_metadata = update_metadata - self.doi_tracker: DOITracker = { + self.doi_tracker: Dict[str, DOI] = { doi: DOI(doi=doi) for doi in self.list_of_dois[: self.limit] } + self.PATTERN = compile(DOI_PATTERN, IGNORECASE) def _validate_inputs( @@ -184,26 +186,32 @@ def search_dois(self, db: Driver) -> None: self.num_existing_dois = len(self.existing_dois) logger.info( - f"Found {self.num_existing_dois} existing and {self.num_new_dois} new DOIs" + f"Found {self.num_existing_dois} existing and " + + f"{self.num_new_dois} new DOIs" ) - def validate_dois(self) -> Dict[str, List[str]]: + @connect_to_db + def validate_dois(self, db: Driver) -> Dict[str, DOI]: try: self.pattern_check() - self.search_dois() + self.search_dois(db) return self.doi_tracker except Exception as e: logger.error(f"DOI validation failed: {e}") raise - def ingestion_metrics(self) -> Dict[str, int]: + def ingestion_metrics(self) -> Tuple[Dict[str, Any], Dict[str, Any]]: total_time = self.end_time - self.start_time processed_dois = ( self.valid_pattern_dois if self.update_metadata else self.new_dois ) - - duplicated_submissions = [doi for doi, count in Counter(self.list_of_dois).items() if count > 1] + + duplicated_submissions = [ + doi + for doi, count in Counter(self.list_of_dois).items() + if count > 1 + ] metadata_pass = [ doi diff --git a/tests/fixtures/authors.json b/tests/fixtures/authors.json index 0dd06f1..529dc3b 100644 --- a/tests/fixtures/authors.json +++ b/tests/fixtures/authors.json @@ -128,4 +128,4 @@ "$": "Rogner, Holger" } ] -} \ No newline at end of file +} diff --git 
a/tests/fixtures/zenodo.json b/tests/fixtures/zenodo.json index f48f394..b660a00 100644 --- a/tests/fixtures/zenodo.json +++ b/tests/fixtures/zenodo.json @@ -682,4 +682,4 @@ }, "browseResults": null } -} \ No newline at end of file +} diff --git a/tests/test_dois.py b/tests/test_dois.py index 6f163ab..3aec965 100644 --- a/tests/test_dois.py +++ b/tests/test_dois.py @@ -1,4 +1,5 @@ import pytest + from research_index_backend.doi import DOIManager valid_dois = [ @@ -22,7 +23,7 @@ raw_dois = [ "10.1371/journal.pclm.0000331", "doi.org/10.5281/zenodo.11395843", - "doi.org/10.5281/zenodo.11396572", + "doi.org/10.5281/zenodo.11396572", "10.5281/zenodo.11396370", "https://doi.org/10.5281/zenodo.11395518", "10.5281/zenodo.11395518.", @@ -33,26 +34,33 @@ "10.1371/journal.pclm.0000331", "10.5281/zenodo.11395843", "10.5281/zenodo.11396572", - "10.5281/zenodo.11396370", + "10.5281/zenodo.11396370", "10.5281/zenodo.11395518", "10.5281/zenodo.11395518", "10.5281/zenodo.11395519", ] + def test_valid_dois(): """Test that valid DOI patterns are correctly identified.""" - doi_manager = DOIManager(valid_dois, limit=len(valid_dois), update_metadata=False) + doi_manager = DOIManager( + valid_dois, limit=len(valid_dois), update_metadata=False + ) doi_manager.pattern_check() for doi in doi_manager.doi_tracker: assert doi_manager.doi_tracker[doi].valid_pattern + def test_invalid_dois(): """Test that invalid DOI patterns are correctly identified.""" - doi_manager = DOIManager(invalid_dois, limit=len(invalid_dois), update_metadata=False) + doi_manager = DOIManager( + invalid_dois, limit=len(invalid_dois), update_metadata=False + ) doi_manager.pattern_check() for doi in doi_manager.doi_tracker: assert not doi_manager.doi_tracker[doi].valid_pattern + def test_mixed_dois(): """Test processing of mixed valid and invalid DOIs.""" doi_manager = DOIManager( @@ -61,58 +69,82 @@ def test_mixed_dois(): update_metadata=False, ) doi_manager.pattern_check() - valid_count = sum(1 for doi in 
doi_manager.doi_tracker.values() if doi.valid_pattern) - invalid_count = sum(1 for doi in doi_manager.doi_tracker.values() if not doi.valid_pattern) - + valid_count = sum( + 1 for doi in doi_manager.doi_tracker.values() if doi.valid_pattern + ) + invalid_count = sum( + 1 for doi in doi_manager.doi_tracker.values() if not doi.valid_pattern + ) + assert valid_count == len(valid_dois) assert invalid_count == len(invalid_dois) + def test_doi_objects(): """Test DOI object initialization and default values.""" - doi_manager = DOIManager(valid_dois, limit=len(valid_dois), update_metadata=False) + doi_manager = DOIManager( + valid_dois, limit=len(valid_dois), update_metadata=False + ) doi_manager.pattern_check() - + for doi in doi_manager.doi_tracker: doi_obj = doi_manager.doi_tracker[doi] assert doi_obj.doi == doi, "DOI string mismatch" assert doi_obj.valid_pattern, "Pattern should be valid" assert not doi_obj.already_exists, "Should not exist by default" - assert not doi_obj.openalex_metadata, "Should not have OpenAlex metadata" - assert not doi_obj.openaire_metadata, "Should not have OpenAire metadata" + assert ( + not doi_obj.openalex_metadata + ), "Should not have OpenAlex metadata" + assert ( + not doi_obj.openaire_metadata + ), "Should not have OpenAire metadata" assert not doi_obj.ingestion_success, "Should not be ingested" + def test_pattern_cleaner(): """Test DOI pattern cleaning functionality.""" - doi_manager = DOIManager(raw_dois, limit=len(raw_dois), update_metadata=False) + doi_manager = DOIManager( + raw_dois, limit=len(raw_dois), update_metadata=False + ) assert doi_manager.list_of_dois == cleaned_dois, "DOI cleaning failed" + def test_case_insensitive_pattern(): """Test that DOI pattern matching is case insensitive.""" doi_manager = DOIManager( ["10.5281/zenodo.8140241", "10.5281/ZENODO.8140241"], limit=2, - update_metadata=False + update_metadata=False, ) doi_manager.pattern_check() assert all(doi.valid_pattern for doi in 
doi_manager.doi_tracker.values()) - + + def test_invalid_limit(): - """Test that providing an invalid (negative) limit raises a ValueError.""" + """Providing an invalid (negative) limit raises a ValueError""" with pytest.raises(ValueError): # Expect DOIManager to raise an error upon invalid limit input. - doi_manager = DOIManager(["10.5281/zenodo.8140241"], limit=-5, update_metadata=False) + doi_manager = DOIManager( + ["10.5281/zenodo.8140241"], limit=-5, update_metadata=False + ) doi_manager.validate_dois() + def test_wrong_type_for_doi_list(): - """Test that providing a wrong type (non-iterable) for DOI list raises a TypeError.""" + """Providing a wrong type (non-iterable) for DOI list raises a TypeError""" with pytest.raises(TypeError): # Passing a single string instead of a list should raise a TypeError. DOIManager("10.5281/zenodo.8140241", limit=1, update_metadata=False) - + + def test_wrong_tyoe_for_update_metadata(): - """Test that providing a wrong type for update_metadata raises a TypeError.""" + """Providing a wrong type for update_metadata raises a TypeError""" with pytest.raises(TypeError): # Passing a string instead of a boolean should raise a TypeError. - DOIManager(["10.5281/zenodo.8140241"], limit=1, update_metadata="False") - -# TODO: should the elements of the list of DOIs be checked for type or this is handled in the entry point? \ No newline at end of file + DOIManager( + ["10.5281/zenodo.8140241"], limit=1, update_metadata="False" + ) + + +# TODO: should the elements of the list of DOIs be checked for type +# or this is handled in the entry point? diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 81a6f70..21bb8ae 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,6 +1,7 @@ """ -These tests call the OpenAire API and require a REFRESH_TOKEN to be defined in the environment variables. +These tests call the OpenAire API and require a REFRESH_TOKEN to be +defined in the environment variables. 
Obtain a refresh token from https://develop.openaire.eu/personal-token """ @@ -66,12 +67,17 @@ def test_api_403_response(self, session, monkeypatch): monkeypatch.setattr(session, "get", dummy_get_403) with pytest.raises(ValueError) as e: MetadataFetcher(session=session).get_metadata_from_openaire("doi") - expected = "OpenAire refresh token is invalid or expired. Please update token and try again." + expected = ( + "OpenAire refresh token is invalid or expired. " + + "Please update token and try again." + ) assert str(e.value) == expected def test_openaire_v2(self, session, monkeypatch): fixture_path = os.path.join("tests", "fixtures", "openaire_v2.json") - monkeypatch.setattr(session, "get", make_dummy_get_success(fixture_path)) + monkeypatch.setattr( + session, "get", make_dummy_get_success(fixture_path) + ) fetcher = MetadataFetcher(session=session) actual = fetcher.get_metadata_from_openaire("10.5281/zenodo.4650794")