diff --git a/src/pubget/_text.py b/src/pubget/_text.py
index 48efeb4..847544d 100644
--- a/src/pubget/_text.py
+++ b/src/pubget/_text.py
@@ -39,5 +39,5 @@ def extract(
         for part_name in self.fields:
             elem = transformed.find(part_name)
             result[part_name] = elem.text
-        result["pmcid"] = int(result["pmcid"])
+        result["pmcid"] = _utils.get_pmcid(article)
         return result
diff --git a/src/pubget/_utils.py b/src/pubget/_utils.py
index 30c2ae9..a1c88a0 100644
--- a/src/pubget/_utils.py
+++ b/src/pubget/_utils.py
@@ -132,9 +132,20 @@ def load_stylesheet(stylesheet_name: str) -> etree.XSLT:
 
 def get_pmcid(article: Union[etree.ElementTree, etree.Element]) -> int:
     """Extract the PubMedCentral ID from an XML article."""
-    return int(
-        article.find("front/article-meta/article-id[@pub-id-type='pmc']").text
+    pmc = article.find("front/article-meta/article-id[@pub-id-type='pmc']")
+    pmcid = article.find(
+        "front/article-meta/article-id[@pub-id-type='pmcid']"
     )
+    if pmc is None and pmcid is None:
+        raise ValueError("No PMC ID found in the article XML.")
+    # Compare against None: an element without children is falsy, so
+    # `if pmc:` would skip a present <article-id pub-id-type="pmc">.
+    if pmc is not None:
+        val = pmc.text
+    else:
+        val = pmcid.text.replace("PMC", "")
+
+    return int(val)
 
 
 def get_pmcid_from_article_dir(article_dir: Path) -> int: