Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ exclude: '^docs/conf.py'

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v6.0.0
hooks:
- id: trailing-whitespace
- id: check-added-large-files
Expand All @@ -18,26 +18,26 @@ repos:
args: ['--fix=auto'] # replace 'auto' with 'lf' to enforce Linux/Mac line endings or 'crlf' for Windows

- repo: https://github.com/pycqa/isort
rev: 5.13.2
rev: 7.0.0
hooks:
- id: isort
args: ["--profile", "black", "--filter-files"]

- repo: https://github.com/psf/black
rev: 24.10.0
rev: 26.1.0
hooks:
- id: black
language_version: python3

- repo: https://github.com/PyCQA/flake8
rev: 7.1.1
rev: 7.3.0
hooks:
- id: flake8
## You can add flake8 plugins via `additional_dependencies`:
# additional_dependencies: [flake8-bugbear]

- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.13.0 # Use the sha / tag you want to point at
rev: v1.19.1 # Use the sha / tag you want to point at
hooks:
- id: mypy
additional_dependencies: ['types-requests']
2 changes: 1 addition & 1 deletion LICENSE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ Permission is hereby granted, free of charge, to any person obtaining a copy of

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
6 changes: 3 additions & 3 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ The package is not yet deployed to PyPI. Only an editable (development) install
2. Clone the repository `git clone https://github.com/ClimateCompatibleGrowth/research_index_backend.git`
3. Change directory `cd research_index_backend`
4. Install the package `pip install -e .` as an editable package (development install)
5. Obtain an OpenAIRE Graph refresh token and create a .env file with the following parameters:
5. Obtain an OpenAIRE Graph refresh token and create a .env file with the following parameters:
```MG_HOST=
MG_PORT=
MG_PORT_ALT=
Expand All @@ -30,7 +30,7 @@ The package is not yet deployed to PyPI. Only an editable (development) install

research_index --help
usage: research_index [-h] [-i] [-l LIMIT] [-u] list_of_dois

positional arguments:
list_of_dois Path to CSV file containing list of DOIs

Expand All @@ -39,7 +39,7 @@ The package is not yet deployed to PyPI. Only an editable (development) install
-i, --initialise Delete existing data and create new database
-l, --limit N Limit number of DOIs to process (default: 50)
-u, --update-metadata Update metadata for existing DOIs
-w, --write-metadata Save JSON responses to disk
-w, --write-metadata Save JSON responses to disk

Examples:
-> Process 10 DOIs from file:
Expand Down
29 changes: 18 additions & 11 deletions src/research_index_backend/create_graph_from_doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,9 @@ def upload_article_to_memgraph(output: AnonymousArticle) -> bool:
return True


def main(list_of_dois: list, limit: int, update_metadata: bool, write_metadata: bool):
def main(
list_of_dois: list, limit: int, update_metadata: bool, write_metadata: bool
):
try:
doi_manager = DOIManager(
list_of_dois, limit=limit, update_metadata=update_metadata
Expand Down Expand Up @@ -343,8 +345,12 @@ def entry_point(db: Driver) -> None:
logger.info("Deleted graph")
load_initial_data(join("data", "init"))

doi_manager = main(list_of_dois, limit=args.limit, update_metadata=args.update_metadata,
write_metadata=args.write_metadata)
doi_manager = main(
list_of_dois,
limit=args.limit,
update_metadata=args.update_metadata,
write_metadata=args.write_metadata,
)
add_country_relations()
metrics, processed_dois = doi_manager.ingestion_metrics()

Expand All @@ -357,17 +363,18 @@ def entry_point(db: Driver) -> None:
print(f"{key.ljust(max_key_length)} | {value}")

print("\nProcessing Results:")
print(f"\n• Failed metadata DOIs ({metrics['metadata_failure']}):")
for doi in processed_dois['metadata_failure']:
print(f"\n• Failed metadata DOIs ({metrics['metadata_failure']}):")
for doi in processed_dois["metadata_failure"]:
print(f" - {doi}")
print(f"\n• Invalid pattern DOIs ({metrics['invalid_pattern_dois']}):")
for doi in processed_dois['invalid_pattern_dois']:

print(f"\n• Invalid pattern DOIs ({metrics['invalid_pattern_dois']}):")
for doi in processed_dois["invalid_pattern_dois"]:
print(f" - {doi}")

print(f"\n• Duplicated Submissions ({metrics['duplicated_submissions']}):")
for doi in processed_dois['duplicated_submissions']:
for doi in processed_dois["duplicated_submissions"]:
print(f" - {doi}")



if __name__ == "__main__":
entry_point()
46 changes: 27 additions & 19 deletions src/research_index_backend/doi.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@
from collections import Counter
from logging import getLogger
from re import IGNORECASE, compile
from typing import Dict, List
from typing import Any, Dict, List, Tuple

from neo4j import Driver

# https://neo4j.com/docs/api/python-driver/current/api.html#errors
from neo4j.exceptions import (
ServiceUnavailable,
Neo4jError,
) # https://neo4j.com/docs/api/python-driver/current/api.html#errors
ServiceUnavailable,
)
from pydantic import BaseModel

from .session import connect_to_db
Expand All @@ -38,16 +40,12 @@ class DOI(BaseModel):
ingestion_success: bool = False


class DOITracker(BaseModel):
doi_tracker: Dict[str, DOI]


class DOIManager:
"""Manages the validation and ingestion tracking of Digital Object Identifiers (DOIs).
"""Tracks the validation and ingestion of Digital Object Identifiers (DOIs)

This class handles DOI validation, database existence checks, and metadata tracking.
It processes DOIs up to a specified limit and can optionally update metadata
for existing entries.
This class handles DOI validation, database existence checks, and metadata
tracking. It processes DOIs up to a specified limit and can optionally
update metadata for existing entries.

Parameters
----------
Expand Down Expand Up @@ -89,7 +87,10 @@ class DOIManager:
"""

def __init__(
self, list_of_dois: List[str], limit: int, update_metadata: bool = False
self,
list_of_dois: List[str],
limit: int,
update_metadata: bool = False,
) -> None:

self._validate_inputs(list_of_dois, limit, update_metadata)
Expand All @@ -104,9 +105,10 @@ def __init__(
limit if limit < len(self.list_of_dois) else len(self.list_of_dois)
)
self.update_metadata = update_metadata
self.doi_tracker: DOITracker = {
self.doi_tracker: Dict[str, DOI] = {
doi: DOI(doi=doi) for doi in self.list_of_dois[: self.limit]
}

self.PATTERN = compile(DOI_PATTERN, IGNORECASE)

def _validate_inputs(
Expand Down Expand Up @@ -184,26 +186,32 @@ def search_dois(self, db: Driver) -> None:
self.num_existing_dois = len(self.existing_dois)

logger.info(
f"Found {self.num_existing_dois} existing and {self.num_new_dois} new DOIs"
f"Found {self.num_existing_dois} existing and "
            + f"{self.num_new_dois} new DOIs"
)

def validate_dois(self) -> Dict[str, List[str]]:
@connect_to_db
def validate_dois(self, db: Driver) -> Dict[str, DOI]:
try:
self.pattern_check()
self.search_dois()
self.search_dois(db)
return self.doi_tracker
except Exception as e:
logger.error(f"DOI validation failed: {e}")
raise

def ingestion_metrics(self) -> Dict[str, int]:
def ingestion_metrics(self) -> Tuple[Dict[str, Any], Dict[str, Any]]:
total_time = self.end_time - self.start_time

processed_dois = (
self.valid_pattern_dois if self.update_metadata else self.new_dois
)

duplicated_submissions = [doi for doi, count in Counter(self.list_of_dois).items() if count > 1]

duplicated_submissions = [
doi
for doi, count in Counter(self.list_of_dois).items()
if count > 1
]

metadata_pass = [
doi
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/authors.json
Original file line number Diff line number Diff line change
Expand Up @@ -128,4 +128,4 @@
"$": "Rogner, Holger"
}
]
}
}
2 changes: 1 addition & 1 deletion tests/fixtures/zenodo.json
Original file line number Diff line number Diff line change
Expand Up @@ -682,4 +682,4 @@
},
"browseResults": null
}
}
}
76 changes: 54 additions & 22 deletions tests/test_dois.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pytest

from research_index_backend.doi import DOIManager

valid_dois = [
Expand All @@ -22,7 +23,7 @@
raw_dois = [
"10.1371/journal.pclm.0000331",
"doi.org/10.5281/zenodo.11395843",
"doi.org/10.5281/zenodo.11396572",
"doi.org/10.5281/zenodo.11396572",
"10.5281/zenodo.11396370",
"https://doi.org/10.5281/zenodo.11395518",
"10.5281/zenodo.11395518.",
Expand All @@ -33,26 +34,33 @@
"10.1371/journal.pclm.0000331",
"10.5281/zenodo.11395843",
"10.5281/zenodo.11396572",
"10.5281/zenodo.11396370",
"10.5281/zenodo.11396370",
"10.5281/zenodo.11395518",
"10.5281/zenodo.11395518",
"10.5281/zenodo.11395519",
]


def test_valid_dois():
"""Test that valid DOI patterns are correctly identified."""
doi_manager = DOIManager(valid_dois, limit=len(valid_dois), update_metadata=False)
doi_manager = DOIManager(
valid_dois, limit=len(valid_dois), update_metadata=False
)
doi_manager.pattern_check()
for doi in doi_manager.doi_tracker:
assert doi_manager.doi_tracker[doi].valid_pattern


def test_invalid_dois():
"""Test that invalid DOI patterns are correctly identified."""
doi_manager = DOIManager(invalid_dois, limit=len(invalid_dois), update_metadata=False)
doi_manager = DOIManager(
invalid_dois, limit=len(invalid_dois), update_metadata=False
)
doi_manager.pattern_check()
for doi in doi_manager.doi_tracker:
assert not doi_manager.doi_tracker[doi].valid_pattern


def test_mixed_dois():
"""Test processing of mixed valid and invalid DOIs."""
doi_manager = DOIManager(
Expand All @@ -61,58 +69,82 @@ def test_mixed_dois():
update_metadata=False,
)
doi_manager.pattern_check()
valid_count = sum(1 for doi in doi_manager.doi_tracker.values() if doi.valid_pattern)
invalid_count = sum(1 for doi in doi_manager.doi_tracker.values() if not doi.valid_pattern)

valid_count = sum(
1 for doi in doi_manager.doi_tracker.values() if doi.valid_pattern
)
invalid_count = sum(
1 for doi in doi_manager.doi_tracker.values() if not doi.valid_pattern
)

assert valid_count == len(valid_dois)
assert invalid_count == len(invalid_dois)


def test_doi_objects():
"""Test DOI object initialization and default values."""
doi_manager = DOIManager(valid_dois, limit=len(valid_dois), update_metadata=False)
doi_manager = DOIManager(
valid_dois, limit=len(valid_dois), update_metadata=False
)
doi_manager.pattern_check()

for doi in doi_manager.doi_tracker:
doi_obj = doi_manager.doi_tracker[doi]
assert doi_obj.doi == doi, "DOI string mismatch"
assert doi_obj.valid_pattern, "Pattern should be valid"
assert not doi_obj.already_exists, "Should not exist by default"
assert not doi_obj.openalex_metadata, "Should not have OpenAlex metadata"
assert not doi_obj.openaire_metadata, "Should not have OpenAire metadata"
assert (
not doi_obj.openalex_metadata
), "Should not have OpenAlex metadata"
assert (
not doi_obj.openaire_metadata
), "Should not have OpenAire metadata"
assert not doi_obj.ingestion_success, "Should not be ingested"


def test_pattern_cleaner():
"""Test DOI pattern cleaning functionality."""
doi_manager = DOIManager(raw_dois, limit=len(raw_dois), update_metadata=False)
doi_manager = DOIManager(
raw_dois, limit=len(raw_dois), update_metadata=False
)
assert doi_manager.list_of_dois == cleaned_dois, "DOI cleaning failed"


def test_case_insensitive_pattern():
"""Test that DOI pattern matching is case insensitive."""
doi_manager = DOIManager(
["10.5281/zenodo.8140241", "10.5281/ZENODO.8140241"],
limit=2,
update_metadata=False
update_metadata=False,
)
doi_manager.pattern_check()
assert all(doi.valid_pattern for doi in doi_manager.doi_tracker.values())



def test_invalid_limit():
"""Test that providing an invalid (negative) limit raises a ValueError."""
"""Providing an invalid (negative) limit raises a ValueError"""
with pytest.raises(ValueError):
# Expect DOIManager to raise an error upon invalid limit input.
doi_manager = DOIManager(["10.5281/zenodo.8140241"], limit=-5, update_metadata=False)
doi_manager = DOIManager(
["10.5281/zenodo.8140241"], limit=-5, update_metadata=False
)
doi_manager.validate_dois()


def test_wrong_type_for_doi_list():
"""Test that providing a wrong type (non-iterable) for DOI list raises a TypeError."""
"""Providing a wrong type (non-iterable) for DOI list raises a TypeError"""
with pytest.raises(TypeError):
# Passing a single string instead of a list should raise a TypeError.
DOIManager("10.5281/zenodo.8140241", limit=1, update_metadata=False)



def test_wrong_type_for_update_metadata():
"""Test that providing a wrong type for update_metadata raises a TypeError."""
"""Providing a wrong type for update_metadata raises a TypeError"""
with pytest.raises(TypeError):
# Passing a string instead of a boolean should raise a TypeError.
DOIManager(["10.5281/zenodo.8140241"], limit=1, update_metadata="False")

# TODO: should the elements of the list of DOIs be checked for type or this is handled in the entry point?
DOIManager(
["10.5281/zenodo.8140241"], limit=1, update_metadata="False"
)


# TODO: should the elements of the list of DOIs be checked for type
# or this is handled in the entry point?
Loading