From 1e725e3f3d6ccbb210cfe6307a35958e932f4c4c Mon Sep 17 00:00:00 2001 From: Francesco Casalegno Date: Fri, 25 Mar 2022 12:11:49 +0100 Subject: [PATCH] Add download support for PMC oa_other --- src/bluesearch/database/download.py | 19 +++++++++++-------- .../entrypoint/database/download.py | 3 ++- tests/unit/database/test_download.py | 1 + .../unit/entrypoint/database/test_download.py | 1 + 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/bluesearch/database/download.py b/src/bluesearch/database/download.py index b3be9d07e..303b3b85f 100644 --- a/src/bluesearch/database/download.py +++ b/src/bluesearch/database/download.py @@ -91,7 +91,7 @@ def generate_pmc_urls( Parameters ---------- - component : {"author_manuscript", "oa_comm", "oa_noncomm"} + component : {"author_manuscript", "oa_comm", "oa_noncomm", "oa_other"} Part of the PMC to download. start_date Starting date to download the incremental files. @@ -108,17 +108,20 @@ def generate_pmc_urls( ValueError If the chosen component does not exist on PMC. """ - base_url = "https://ftp.ncbi.nlm.nih.gov/pub/pmc/" - if component in {"oa_comm", "oa_noncomm"}: - base_url += f"oa_bulk/{component}/xml/" - elif component == "author_manuscript": - base_url += "manuscript/xml/" - else: + avail_components = {"author_manuscript", "oa_comm", "oa_noncomm", "oa_other"} + if component not in avail_components: raise ValueError( f"Unexcepted component {component}. " - "Only {'author_manuscript', 'oa_comm', 'oa_noncomm'} are supported." + f"Only {avail_components} " + "are supported." ) + base_url = "https://ftp.ncbi.nlm.nih.gov/pub/pmc/" + if component == "author_manuscript": + base_url += "manuscript/xml/" + else: + base_url += f"oa_bulk/{component}/xml/" + days_list = get_daterange_list(start_date=start_date, end_date=end_date) url_list = [] diff --git a/src/bluesearch/entrypoint/database/download.py b/src/bluesearch/entrypoint/database/download.py index d25914ffd..2421686f6 100644 --- a/src/bluesearch/entrypoint/database/download.py +++ b/src/bluesearch/entrypoint/database/download.py @@ -143,7 +143,8 @@ def run(source: str, from_month: datetime, output_dir: Path, dry_run: bool) -> i if article_source == ArticleSource.PMC: url_dict = {} - for component in {"author_manuscript", "oa_comm", "oa_noncomm"}: + avail_components = ["author_manuscript", "oa_comm", "oa_noncomm", "oa_other"] + for component in avail_components: url_dict[component] = generate_pmc_urls(component, from_month) if dry_run: diff --git a/tests/unit/database/test_download.py b/tests/unit/database/test_download.py index 3981dc758..db6e5bfc5 100644 --- a/tests/unit/database/test_download.py +++ b/tests/unit/database/test_download.py @@ -74,6 +74,7 @@ def test_delta_wrong(self): ("author_manuscript", "https://ftp.ncbi.nlm.nih.gov/pub/pmc/manuscript/xml/"), ("oa_comm", "https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_comm/xml/"), ("oa_noncomm", "https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_noncomm/xml/"), + ("oa_other", "https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_other/xml/"), ], ) def test_generate_pmc_urls(monkeypatch, component, expected_url_start): diff --git a/tests/unit/entrypoint/database/test_download.py b/tests/unit/entrypoint/database/test_download.py index a317be77a..5c79c7735 100644 --- a/tests/unit/entrypoint/database/test_download.py +++ b/tests/unit/entrypoint/database/test_download.py @@ -80,6 +80,7 @@ def fake_download_articles_func(url_list, output_dir): "author_manuscript", "oa_comm", "oa_noncomm", + "oa_other", } for sub_dir in pmc_path.iterdir(): assert len(list(sub_dir.iterdir())) == 2