From 19911085daca6cb0c98e0601c59743b9cee460fc Mon Sep 17 00:00:00 2001
From: Alejandro de la Vega
Date: Tue, 19 Aug 2025 16:39:08 -0500
Subject: [PATCH 1/2] Extract PMCID in both ways

---
 src/pubget/_utils.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/pubget/_utils.py b/src/pubget/_utils.py
index 30c2ae9..a1c88a0 100644
--- a/src/pubget/_utils.py
+++ b/src/pubget/_utils.py
@@ -132,9 +132,20 @@ def load_stylesheet(stylesheet_name: str) -> etree.XSLT:
 
 def get_pmcid(article: Union[etree.ElementTree, etree.Element]) -> int:
     """Extract the PubMedCentral ID from an XML article."""
-    return int(
-        article.find("front/article-meta/article-id[@pub-id-type='pmc']").text
+    pmc = article.find("front/article-meta/article-id[@pub-id-type='pmc']")
+    pmcid = article.find(
+        "front/article-meta/article-id[@pub-id-type='pmcid']"
     )
+    if pmc is None and pmcid is None:
+        raise ValueError("No PMC ID found in the article XML.")
+    # Compare against None explicitly: an element's truth value reflects
+    # whether it has children, so a text-only <article-id> is falsy.
+    if pmc is not None:
+        val = pmc.text
+    else:
+        val = pmcid.text.replace("PMC", "")
+
+    return int(val)
 
 
 def get_pmcid_from_article_dir(article_dir: Path) -> int:

From e0d365619174780cde53c79245684089dbb132ab Mon Sep 17 00:00:00 2001
From: Alejandro de la Vega
Date: Tue, 19 Aug 2025 17:00:38 -0500
Subject: [PATCH 2/2] Extract pmcid correctly in article

---
 src/pubget/_text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pubget/_text.py b/src/pubget/_text.py
index 48efeb4..847544d 100644
--- a/src/pubget/_text.py
+++ b/src/pubget/_text.py
@@ -39,5 +39,5 @@ def extract(
         for part_name in self.fields:
             elem = transformed.find(part_name)
             result[part_name] = elem.text
-        result["pmcid"] = int(result["pmcid"])
+        result["pmcid"] = _utils.get_pmcid(article)
         return result