From e487014f27a261abf25751c034573b813413cadf Mon Sep 17 00:00:00 2001 From: koudyk Date: Thu, 17 Aug 2023 11:41:37 -0400 Subject: [PATCH 1/3] allow user to download articlesets with pmids from pubmed database --- src/pubget/_download.py | 128 +++++++++++++++++++++++++++++++++++++++- src/pubget/_entrez.py | 48 +++++++++------ 2 files changed, 156 insertions(+), 20 deletions(-) diff --git a/src/pubget/_download.py b/src/pubget/_download.py index 470682e..7d77558 100644 --- a/src/pubget/_download.py +++ b/src/pubget/_download.py @@ -55,11 +55,13 @@ def __init__( n_docs: Optional[int] = None, retmax: int = 500, api_key: Optional[str] = None, + db="pmc", ) -> None: self._data_dir = Path(data_dir) self._n_docs = n_docs self._retmax = retmax self._api_key = api_key + self.db = db def download(self) -> Tuple[Path, ExitCode]: """Perform the download. @@ -95,6 +97,7 @@ def download(self) -> Tuple[Path, ExitCode]: failed_requests_dump_dir=output_dir.joinpath( "failed_requests_dumps" ), + db=self.db, ) if "search_result" in info and "webenv" in info["search_result"]: _LOG.info( @@ -147,7 +150,7 @@ def _prepare_webenv(self, client: EntrezClient) -> Dict[str, str]: @abc.abstractmethod def _save_input(self, output_dir: Path) -> None: - """Save the input (eg query, PMCIDs) used to download articles.""" + """Save the input (eg query, PMCIDs, PMIDs) used to download articles.""" class _QueryDownloader(_Downloader): @@ -214,9 +217,10 @@ def __init__( n_docs: Optional[int] = None, retmax: int = 500, api_key: Optional[str] = None, + db="pmc", ) -> None: super().__init__( - data_dir, n_docs=n_docs, retmax=retmax, api_key=api_key + data_dir, n_docs=n_docs, retmax=retmax, api_key=api_key, db=db ) self._pmcids = pmcids @@ -241,6 +245,53 @@ def _save_input(self, output_dir: Path) -> None: pmcids_f.write("\n") +class _PMIDListDownloader(_Downloader): + """Download articles in a provided list of PMIDs. + + Parameters + ---------- + pmids + List of PubMed IDs to download. + + other parameters are forwarded to `_Downloader`. 
+ """ + + def __init__( + self, + pmids: Sequence[int], + data_dir: PathLikeOrStr, + *, + n_docs: Optional[int] = None, + retmax: int = 500, + api_key: Optional[str] = None, + db="pubmed", + ) -> None: + super().__init__( + data_dir, n_docs=n_docs, retmax=retmax, api_key=api_key, db=db + ) + self._pmids = pmids + + def _output_dir_name(self) -> str: + """Directory name containing the checksum of the pmid list.""" + checksum = _utils.checksum( + b",".join([str(pmid).encode("UTF-8") for pmid in self._pmids]) + ) + return f"pmidList_{checksum}" + + def _prepare_webenv(self, client: EntrezClient) -> Dict[str, str]: + """Use EPost to upload PMIDs to the history server.""" + _LOG.info("Uploading PMIDs.") + return client.epost(self._pmids) + + def _save_input(self, output_dir: Path) -> None: + """Save the PMIDs the user asked to download.""" + pmids_file_path = output_dir.joinpath("requested_pmids.txt") + with open(pmids_file_path, "w", encoding="UTF-8") as pmids_f: + for pmid in self._pmids: + pmids_f.write(str(pmid)) + pmids_f.write("\n") + + def _get_data_dir_env() -> Optional[str]: return os.environ.get("PUBGET_DATA_DIR", None) @@ -277,6 +328,13 @@ def _get_pmcids(args: argparse.Namespace) -> List[int]: ] +def _get_pmids(args: argparse.Namespace) -> List[int]: + return [ + int(pmid.strip()) + for pmid in Path(args.pmids_file).read_text("UTF-8").strip().split() + ] + + def _edit_argument_parser(argument_parser: ArgparseActions) -> None: nargs_kw = {"nargs": "?"} if _get_data_dir_env() else {} argument_parser.add_argument( @@ -318,6 +376,15 @@ def _edit_argument_parser(argument_parser: ArgparseActions) -> None: "parameter should be the path of a file containing PubMed Central IDs " "(one per line) to download.", ) + group.add_argument( + "--pmids_file", + type=str, + default=None, + help="Instead of using a query, we can download a predefined list " + "of articles by providing their PubmedIDs. The pmids_file " + "parameter should be the path of a file containing PubMed IDs " + "(one per line) to download.", + ) argument_parser.add_argument( "--alias", type=str, @@ -400,6 +467,55 @@ def download_pmcids( ).download() +def download_pmids( + pmids: Sequence[int], + data_dir: PathLikeOrStr, + *, + n_docs: Optional[int] = None, + retmax: int = 500, + api_key: Optional[str] = None, +) -> Tuple[Path, ExitCode]: + """Download articles in a provided list of PMIDs. + + Parameters + ---------- + pmids + List of PubMed IDs to download. + data_dir + Path to the directory where all pubget data is stored; a subdirectory + will be created for this download. + n_docs + Approximate maximum number of articles to download. By default, all + results returned for the search are downloaded. If n_docs is + specified, at most n_docs rounded up to the nearest multiple of + `retmax` articles will be downloaded. + retmax + Batch size -- number of articles that are downloaded per request. + api_key + API key for the Entrez E-utilities (see [the E-utilities + help](https://www.ncbi.nlm.nih.gov/books/NBK25497/)). If the API + key is provided, it is included in all requests to the Entrez + E-utilities. + + Returns + ------- + output_dir + The directory that was created in which downloaded data is stored. + exit_code + COMPLETED if all articles have been successfully downloaded and + INCOMPLETE or ERROR otherwise. Used by the `pubget` command-line + interface. 
+ + """ + return _PMIDListDownloader( + pmids, + data_dir=data_dir, + n_docs=n_docs, + retmax=retmax, + api_key=api_key, + ).download() + + def download_query_results( query: str, data_dir: PathLikeOrStr, @@ -480,6 +596,14 @@ def _download_articles_for_args( n_docs=args.n_docs, api_key=api_key, ) + elif args.pmids_file is not None: + pmids = _get_pmids(args) + output_dir, exit_code = download_pmids( + pmids=pmids, + data_dir=data_dir, + n_docs=args.n_docs, + api_key=api_key, + ) else: query = _get_query(args) output_dir, exit_code = download_query_results( diff --git a/src/pubget/_entrez.py b/src/pubget/_entrez.py index 11db747..6e3ddfd 100644 --- a/src/pubget/_entrez.py +++ b/src/pubget/_entrez.py @@ -17,6 +17,7 @@ Union, ) from urllib.parse import urljoin +import IPython import requests from lxml import etree @@ -41,8 +42,8 @@ def _check_efetch_response(response: requests.Response) -> Tuple[bool, str]: return check, reason try: root = etree.fromstring(response.content) - assert root.tag == "pmc-articleset" - assert root.find("article") is not None + assert root.tag in ["PubmedArticleSet", "pmc-articleset"] + # assert root.find("article") is not None except Exception: return ( False, @@ -99,6 +100,7 @@ def __init__( request_period: Optional[float] = None, api_key: Optional[str] = None, failed_requests_dump_dir: Optional[PathLikeOrStr] = None, + db="pmc", ) -> None: self._entrez_id = {} if api_key is not None: @@ -116,6 +118,7 @@ def __init__( self._session = requests.Session() self.last_search_result: Optional[Mapping[str, str]] = None self.n_failures = 0 + self.db = db def _wait_to_send_request(self) -> None: if self._last_request_time is None: @@ -211,6 +214,7 @@ def _send_request( attempts fail. """ + req = requests.Request("POST", url, params=params, data=data) prepped = self._session.prepare_request(req) for attempt, delay in enumerate( @@ -231,8 +235,11 @@ def _send_request( ) return None - def epost(self, all_pmcids: Sequence[int]) -> Dict[str, str]: - """Post a list of PMCIDs to the Entrez history server. + def epost( + self, + all_ids: Sequence[int], + ) -> Dict[str, str]: + """Post a list of PMCIDs or PMIDs to the Entrez history server. An esearch query is then performed to filter the list of pmcids to keep only the open-acccess articles. @@ -245,17 +252,17 @@ def epost(self, all_pmcids: Sequence[int]) -> Dict[str, str]: the same way for queries and id lists. """ - # not 'if not all_pmcids' in case someone passes a numpy array - if len(all_pmcids) == 0: - _LOG.error("Empty PMCID list.") + # not 'if not all_ids' in case someone passes a numpy array + if len(all_ids) == 0: + _LOG.error("Empty ID list.") self.n_failures = 1 return {} data = { - "db": "pmc", - "id": ",".join(map(str, all_pmcids)), + "db": self.db, + "id": ",".join(map(str, all_ids)), **self._entrez_id, } - _LOG.info(f"Posting {len(all_pmcids)} PMCIDs to Entrez.") + _LOG.info(f"Posting {len(all_ids)} IDs to Entrez.") resp = self._send_request( self._epost_base_url, data=data, @@ -268,11 +275,12 @@ def epost(self, all_pmcids: Sequence[int]) -> Dict[str, str]: webenv = resp_xml.find("WebEnv").text query_key = resp_xml.find("QueryKey").text search_result = self.esearch(webenv=webenv, query_key=query_key) - if "count" in search_result: - _LOG.info( - f"{int(search_result['count'])} / {len(all_pmcids)} articles " - "are in PMC Open Access." - ) + if self.db == "pmc": + if "count" in search_result: + _LOG.info( + f"{int(search_result['count'])} / {len(all_ids)} articles " + "are in PMC Open Access." 
+ ) return search_result def esearch( @@ -299,9 +307,12 @@ def esearch( """ term = "open+access[filter]" if query is not None: - term = "&".join((query, term)) + if self.db == "pmc": + term = "&".join((query, term)) + else: + term = query data = { - "db": "pmc", + "db": self.db, "term": term, "usehistory": "y", "retmode": "json", @@ -315,6 +326,7 @@ def esearch( data=data, response_validator=_check_esearch_response, ) + if resp is None: self.n_failures = 1 return {} @@ -375,7 +387,7 @@ def efetch( "query_key": search_result["querykey"], "retmax": retmax, "retstart": retstart, - "db": "pmc", + "db": self.db, **self._entrez_id, } n_batches = math.ceil(n_docs / retmax) From cfbf6cd011557cd312aa62a17b34861fe07904dc Mon Sep 17 00:00:00 2001 From: koudyk Date: Wed, 7 Feb 2024 17:08:57 -0500 Subject: [PATCH 2/3] allow pubget to get pubmed abstracts given a list of pmids --- src/pubget/_articles.py | 24 +-- src/pubget/_authors.py | 32 ++-- src/pubget/_coordinate_space.py | 21 ++- src/pubget/_coordinates.py | 9 +- .../_data/stylesheets/text_extraction.xsl | 4 +- src/pubget/_data_extraction.py | 6 +- src/pubget/_labelbuddy.py | 46 +++--- src/pubget/_links.py | 10 +- src/pubget/_metadata.py | 138 +++++++++++------- src/pubget/_text.py | 46 ++++-- src/pubget/_utils.py | 17 +++ src/pubget/_vectorization.py | 30 ++-- 12 files changed, 249 insertions(+), 134 deletions(-) diff --git a/src/pubget/_articles.py b/src/pubget/_articles.py index 0acbef3..476b7da 100644 --- a/src/pubget/_articles.py +++ b/src/pubget/_articles.py @@ -18,10 +18,12 @@ PipelineStep, ) +import IPython + _LOG = logging.getLogger(__name__) _LOG_PERIOD = 500 _STEP_NAME = "extract_articles" -_STEP_DESCRIPTION = "Extract articles from bulk PMC download." +_STEP_DESCRIPTION = "Extract articles from bulk download." def extract_articles( @@ -103,7 +105,7 @@ def _do_extract_articles( """Do the extraction and return number of articles found.""" output_dir.mkdir(exist_ok=True, parents=True) with Parallel(n_jobs=n_jobs, verbose=8) as parallel: - _LOG.info("Extracting articles from PMC articlesets.") + _LOG.info("Extracting articles from articlesets.") article_counts = parallel( delayed(_extract_from_articleset)( batch_file, output_dir=output_dir @@ -111,9 +113,7 @@ def _do_extract_articles( for batch_file in articlesets_dir.glob("articleset_*.xml") ) n_articles = int(sum(article_counts)) # int() is for mypy - _LOG.info( - f"Done extracting {n_articles} articles from PMC articlesets." 
- ) + _LOG.info(f"Done extracting {n_articles} articles from articlesets.") _LOG.info("Extracting tables from articles.") parallel( delayed(_extract_tables)(article_dir) @@ -132,7 +132,7 @@ def _iter_articles( n_articles = 0 for bucket in all_articles_dir.glob("*"): if bucket.is_dir(): - for article_dir in bucket.glob("pmcid_*"): + for article_dir in bucket.glob("pm*id_*"): n_articles += 1 yield article_dir if not n_articles % _LOG_PERIOD: @@ -144,11 +144,15 @@ def _extract_from_articleset(batch_file: Path, output_dir: Path) -> int: _LOG.debug(f"Extracting articles from {batch_file.name}") with open(batch_file, "rb") as batch_fh: tree = etree.parse(batch_fh) + if "pmc-articleset" in tree.docinfo.doctype: + article_indicator = "article" + elif "PubmedArticleSet" in tree.docinfo.doctype: + article_indicator = "PubmedArticle" n_articles = 0 - for article in tree.iterfind("article"): - pmcid = _utils.get_pmcid(article) - bucket = _utils.article_bucket_from_pmcid(pmcid) - article_dir = output_dir.joinpath(bucket, f"pmcid_{pmcid}") + for article in tree.iterfind(article_indicator): + id = _utils.get_id(article) + bucket = _utils.article_bucket_from_pmcid(id) + article_dir = output_dir.joinpath(bucket, f"{id}") article_dir.mkdir(exist_ok=True, parents=True) article_file = article_dir.joinpath("article.xml") article_file.write_bytes( diff --git a/src/pubget/_authors.py b/src/pubget/_authors.py index d48cf28..c70f42d 100644 --- a/src/pubget/_authors.py +++ b/src/pubget/_authors.py @@ -12,7 +12,7 @@ class AuthorsExtractor(Extractor): """Extracting list of authors from article XML.""" - fields = ("pmcid", "surname", "given-names") + fields = ("id", "firstname", "lastname") name = "authors" def extract( @@ -23,17 +23,27 @@ def extract( ) -> pd.DataFrame: del article_dir, previous_extractors_output authors = [] - pmcid = _utils.get_pmcid(article) - for author_elem in article.iterfind( - "front/article-meta/contrib-group/contrib[@contrib-type='author']" - ): - author_info = {"pmcid": pmcid} - for part in [ - "name/surname", - "name/given-names", - ]: + id = _utils.get_id(article) + if "pmcid" in id: + author_indicator = "front/article-meta/contrib-group/contrib[@contrib-type='author']" + firstname_indicator = "name/given-names" + lastname_indicator = "name/surname" + elif "pmid" in id: + author_indicator = ".//Author" + firstname_indicator = "ForeName" + lastname_indicator = "LastName" + firstname_field = "firstname" + lastname_field = "lastname" + + for author_elem in article.iterfind(author_indicator): + author_info = {"id": id} + for part, field in zip( + [firstname_indicator, lastname_indicator], + [firstname_field, lastname_field], + ): elem = author_elem.find(part) + if elem is not None: - author_info[elem.tag] = elem.text + author_info[field] = elem.text authors.append(author_info) return pd.DataFrame(authors, columns=self.fields) diff --git a/src/pubget/_coordinate_space.py b/src/pubget/_coordinate_space.py index fdb880a..eaa58ce 100644 --- a/src/pubget/_coordinate_space.py +++ b/src/pubget/_coordinate_space.py @@ -6,13 +6,13 @@ from lxml import etree from pubget._typing import Extractor, Records -from pubget._utils import get_pmcid +from pubget._utils import get_id class CoordinateSpaceExtractor(Extractor): """Extracting coordinate space from article XML""" - fields = ("pmcid", "coordinate_space") + fields = ("id", "coordinate_space") name = "coordinate_space" def extract( @@ -21,13 +21,18 @@ def extract( article_dir: pathlib.Path, previous_extractors_output: Dict[str, Records], ) -> Dict[str, 
Any]: + id = get_id(article) del article_dir, previous_extractors_output - return { - "pmcid": get_pmcid(article), - "coordinate_space": _neurosynth_guess_space( - " ".join(article.xpath(".//text()")) - ), - } + if "pmcid" in id: + result = { + "id": id, + "coordinate_space": _neurosynth_guess_space( + " ".join(article.xpath(".//text()")) + ), + } + else: + result = {"id": id, "coordinate_space": "UNKNOWN"} + return result def _neurosynth_guess_space(text: str) -> str: diff --git a/src/pubget/_coordinates.py b/src/pubget/_coordinates.py index 77a393b..0b7e36c 100644 --- a/src/pubget/_coordinates.py +++ b/src/pubget/_coordinates.py @@ -77,9 +77,14 @@ def extract( article_dir: pathlib.Path, previous_extractors_output: Dict[str, Records], ) -> pd.DataFrame: + id = _utils.get_id(article) del article, previous_extractors_output - coords = _extract_coordinates_from_article_dir(article_dir) - return coords.loc[:, self.fields] + if "pmcid" in id: + coords = _extract_coordinates_from_article_dir(article_dir) + coords.loc[:, self.fields] + else: + coords = pd.DataFrame(columns=self.fields) + return coords def _extract_coordinates_from_article_dir( diff --git a/src/pubget/_data/stylesheets/text_extraction.xsl b/src/pubget/_data/stylesheets/text_extraction.xsl index 656b5e3..422b95c 100644 --- a/src/pubget/_data/stylesheets/text_extraction.xsl +++ b/src/pubget/_data/stylesheets/text_extraction.xsl @@ -10,9 +10,9 @@ - + - + <xsl:value-of select="/article/front/article-meta/title-group/article-title"/> diff --git a/src/pubget/_data_extraction.py b/src/pubget/_data_extraction.py index 97adffc..d9a768b 100644 --- a/src/pubget/_data_extraction.py +++ b/src/pubget/_data_extraction.py @@ -39,6 +39,8 @@ ) from pubget._writers import CSVWriter +import IPython + _LOG = logging.getLogger(__name__) _STEP_NAME = "extract_data" _STEP_DESCRIPTION = "Extract metadata, text and coordinates from articles." @@ -125,7 +127,7 @@ def _iter_articles( articles_dir = Path(articles_dir) for subdir in articles_dir.glob("*"): if subdir.is_dir(): - for article_dir in subdir.glob("pmcid_*"): + for article_dir in subdir.glob("pm*id_*"): # Throttle processing articles so they don't accumulate in the # Pool's output queue. 
When joblib.Parallel starts returning # iterators we can use it instead of Pool @@ -205,8 +207,8 @@ def extract_data_to_csv( def _get_data_extractors() -> List[Extractor]: return [ - MetadataExtractor(), AuthorsExtractor(), + MetadataExtractor(), TextExtractor(), CoordinateExtractor(), CoordinateSpaceExtractor(), diff --git a/src/pubget/_labelbuddy.py b/src/pubget/_labelbuddy.py index a938501..9343928 100644 --- a/src/pubget/_labelbuddy.py +++ b/src/pubget/_labelbuddy.py @@ -80,7 +80,7 @@ def _get_inserted_field_positions( def _format_authors(doc_authors: pd.DataFrame) -> str: """Collapse dataframe with one row per author to a single string.""" return " and ".join( - f"{row['surname']}, {row['given-names']}" + f"{row['lastname']}, {row['firstname']}" for _, row in doc_authors.iterrows() ) @@ -98,7 +98,7 @@ def _prepare_document( fields["authors"] = _format_authors(doc_authors) doc_info["text"] = _TEMPLATE.format(**fields) doc_info["metadata"] = { - "pmcid": int(doc_meta["pmcid"]), + "id": int(doc_meta["id"].split('_')[1]), "text_md5": md5(doc_info["text"].encode("utf-8")).hexdigest(), "field_positions": _get_inserted_field_positions(_TEMPLATE, fields), "batch": batch, @@ -107,17 +107,29 @@ def _prepare_document( doc_info["metadata"]["pmid"] = int(doc_meta["pmid"]) if not pd.isnull(doc_meta["doi"]): doc_info["metadata"]["doi"] = doc_meta["doi"] - url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{doc_meta['pmcid']}" - doc_info["metadata"]["pmc_url"] = url - doc_info[ - "display_title" - ] = f'pmcid: {doc_meta["pmcid"]}' - doc_info["list_title"] = f"PMC{doc_meta['pmcid']} {doc_text['title']}" - efetch_url = ( - "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" - f"efetch.fcgi?db=pmc&id={doc_meta['pmcid']}" - ) - doc_info["metadata"]["efetch_url"] = efetch_url + id = doc_meta["id"] + if "pmcid" in id: + doc_meta["pmcid"] = doc_meta["id"][len("pmcid_"):] + doc_info["metadata"]["pmcid"] = doc_meta["id"][len("pmcid_"):] + url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{doc_meta['pmcid']}" + doc_info["metadata"]["pmc_url"] = url + doc_info[ + "display_title" + ] = f'pmcid: {doc_meta["pmcid"]}' + doc_info["list_title"] = f"PMC{doc_meta['pmcid']} {doc_text['title']}" + efetch_url = ( + "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" + f"efetch.fcgi?db=pmc&id={doc_meta['pmcid']}" + ) + doc_info["metadata"]["efetch_url"] = efetch_url + elif "pmid" in id: + doc_info["display_title"] = f'pmid: {doc_meta["pmid"]}' + doc_info["list_title"] = f"PMID{doc_meta['pmid']} {doc_text['title']}" + efetch_url = ( + "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" + f"efetch.fcgi?db=pubmed&id={doc_meta['pmid']}" + ) + doc_info["metadata"]["efetch_url"] = efetch_url return doc_info @@ -135,8 +147,8 @@ def _iter_corpus( text_chunk.iterrows(), metadata_chunk.iterrows() ): n_articles += 1 - assert doc_meta["pmcid"] == doc_text["pmcid"] - doc_authors = authors[authors["pmcid"] == doc_meta["pmcid"]] + assert doc_meta["id"] == doc_text["id"] + doc_authors = authors[authors["id"] == doc_meta["id"]] if not n_articles % _LOG_PERIOD: _LOG.info(f"Read {n_articles} articles.") yield doc_text, doc_meta, doc_authors @@ -167,7 +179,7 @@ def _write_labelbuddy_batch( json.dumps(_prepare_document(*doc_info, batch=batch_nb)) ) out_f.write("\n") - row = (int(doc_info[1]["pmcid"]), batch_file.name, n_written) + row = (int(doc_info[1]["id"].split("_")[1]), batch_file.name, n_written) batch_info_f.write(",".join(map(str, row))) batch_info_f.write("\n") n_written += 1 @@ -183,7 +195,7 @@ def _do_make_labelbuddy_documents( 
text_file = extracted_data_dir.joinpath("text.csv") metadata_file = extracted_data_dir.joinpath("metadata.csv") authors = pd.read_csv(extracted_data_dir.joinpath("authors.csv")) - output_dir.joinpath("batch_info.csv").write_text("pmcid,file_name,line\n") + output_dir.joinpath("batch_info.csv").write_text("id,file_name,line\n") with open(text_file, encoding="utf-8") as text_fh, open( metadata_file, encoding="utf-8" ) as metadata_fh: diff --git a/src/pubget/_links.py b/src/pubget/_links.py index cf2e42c..04900b8 100644 --- a/src/pubget/_links.py +++ b/src/pubget/_links.py @@ -17,7 +17,7 @@ class LinkExtractor(Extractor): future. """ - fields = ("pmcid", "ext-link-type", "href") + fields = ("id", "ext-link-type", "href") name = "links" def extract( @@ -27,7 +27,7 @@ def extract( previous_extractors_output: Dict[str, Records], ) -> pd.DataFrame: del article_dir, previous_extractors_output - pmcid = _utils.get_pmcid(article) + id = _utils.get_id(article) all_links = [] xlink = "http://www.w3.org/1999/xlink" for tag in ["uri", "ext-link"]: @@ -35,7 +35,7 @@ def extract( href = link.get(f"{{{xlink}}}href") link_type = link.get("ext-link-type") or tag all_links.append( - {"pmcid": pmcid, "ext-link-type": link_type, "href": href} + {"id": id, "ext-link-type": link_type, "href": href} ) return pd.DataFrame(all_links, columns=self.fields).drop_duplicates() @@ -52,7 +52,7 @@ def __init__(self, pattern: str, name: str) -> None: self.name = name self.pattern = pattern capture_groups = re.findall(r"\(\?P<(\w+)>", self.pattern) - self.fields = ("pmcid", *capture_groups) + self.fields = ("id", *capture_groups) def extract( self, @@ -65,7 +65,7 @@ def extract( if links is None or (len(links) == 0): return pd.DataFrame(columns=self.fields) captured = links["href"].str.extract(self.pattern, expand=True) - captured["pmcid"] = links["pmcid"] + captured["id"] = links["id"] return pd.DataFrame( captured.dropna().drop_duplicates().reset_index(), columns=self.fields, diff --git a/src/pubget/_metadata.py b/src/pubget/_metadata.py index 8728f2e..47eb769 100644 --- a/src/pubget/_metadata.py +++ b/src/pubget/_metadata.py @@ -5,12 +5,14 @@ from lxml import etree from pubget._typing import Extractor, Records +from pubget._utils import get_id class MetadataExtractor(Extractor): """Extracting metatada from article XML.""" fields = ( + "id", "pmcid", "pmid", "doi", @@ -29,63 +31,99 @@ def extract( ) -> Dict[str, Any]: del article_dir, previous_extractors_output metadata: Dict[str, Any] = {} - for article_id in article.iterfind("front/article-meta/article-id"): - _add_id(article_id, metadata) - title_elem = article.find( - "front/article-meta/title-group/article-title" - ) - if title_elem is not None: - metadata["title"] = "".join(title_elem.xpath(".//text()")) - _add_journal(article, metadata) - _add_pub_date(article, metadata) - _add_license(article, metadata) + id = get_id(article) + metadata["id"] = id + if "pmcid" in id: + for article_id in article.iterfind( + "front/article-meta/article-id" + ): + _add_id(article_id, metadata) + + title_elem = article.find( + "front/article-meta/title-group/article-title" + ) + if title_elem is not None: + metadata["title"] = "".join(title_elem.xpath(".//text()")) + _add_journal(article, metadata) + _add_pub_date(article, metadata) + _add_license(article, metadata) + elif "pmid" in id: + metadata["pmid"] = int(id[len("pmid_") :]) + metadata["pmcid"] = None + doi_elem = article.find(".//ArticleId[@IdType='doi']") + if doi_elem is not None: + metadata["doi"] = doi_elem.text + 
metadata["title"] = article.find(".//ArticleTitle").text + _add_journal(article, metadata, id_type="pmid") + _add_pub_date(article, metadata, id_type="pmid") + _add_license(article, metadata, id_type="pmid") return metadata -def _add_journal(article: etree.Element, metadata: Dict[str, Any]) -> None: - journal_elem = article.find( - "front/journal-meta/journal-id[@journal-id-type='nlm-ta']" - ) - if journal_elem is not None: - metadata["journal"] = journal_elem.text +def _add_journal( + article: etree.Element, metadata: Dict[str, Any], id_type="pmcid" +) -> None: + if id_type == "pmcid": + journal_elem = article.find( + # "front/journal-meta/journal-id[@journal-id-type='nlm-ta']" + "front/journal-meta/journal-title-group/journal-title" + ) + if journal_elem is not None: + metadata["journal"] = journal_elem.text + elif id_type == "pmid": + journal_elem = article.find(".//Journal/Title") + if journal_elem is not None: + metadata["journal"] = journal_elem.text -def _add_pub_date(article: etree.Element, metadata: Dict[str, Any]) -> None: - pub_date_elems = article.findall("front/article-meta/pub-date/year") - pub_dates = [] - for elem in pub_date_elems: - try: - if len(elem.text) == 4: - pub_dates.append(int(elem.text)) - except Exception: - pass - if pub_dates: - metadata["publication_year"] = min(pub_dates) +def _add_pub_date( + article: etree.Element, metadata: Dict[str, Any], id_type="pmcid" +) -> None: + if id_type == "pmcid": + pub_date_elems = article.findall("front/article-meta/pub-date/year") + pub_dates = [] + for elem in pub_date_elems: + try: + if len(elem.text) == 4: + pub_dates.append(int(elem.text)) + except Exception: + pass + if pub_dates: + metadata["publication_year"] = min(pub_dates) + elif id_type == "pmid": + pub_date_elem = article.find(".//PubDate/Year") + if pub_date_elem is not None: + metadata["publication_year"] = int(pub_date_elem.text) -def _add_license(article: etree.Element, metadata: Dict[str, Any]) -> None: - license_elem = article.find("front/article-meta/permissions/license") - if license_elem is None: - return - href = "{http://www.w3.org/1999/xlink}href" - if href in license_elem.attrib: - metadata["license"] = license_elem.get(href) - return - license_p_link = license_elem.find(".//ext-link") - if license_p_link is None: - license_p_link = license_elem.find(".//uri") - if license_p_link is not None and href in license_p_link.attrib: - metadata["license"] = license_p_link.get(href) - return - ali_link = license_elem.find( - ".//{http://www.niso.org/schemas/ali/1.0/}license_ref" - ) - if ali_link is not None: - metadata["license"] = ali_link.text - return - if "license-type" in license_elem.attrib: - metadata["license"] = license_elem.get("license-type") - return +def _add_license( + article: etree.Element, metadata: Dict[str, Any], id_type="pmcid" +) -> None: + if id_type == "pmcid": + license_elem = article.find("front/article-meta/permissions/license") + if license_elem is None: + return + href = "{http://www.w3.org/1999/xlink}href" + if href in license_elem.attrib: + metadata["license"] = license_elem.get(href) + return + license_p_link = license_elem.find(".//ext-link") + if license_p_link is None: + license_p_link = license_elem.find(".//uri") + if license_p_link is not None and href in license_p_link.attrib: + metadata["license"] = license_p_link.get(href) + return + ali_link = license_elem.find( + ".//{http://www.niso.org/schemas/ali/1.0/}license_ref" + ) + if ali_link is not None: + metadata["license"] = ali_link.text + return + if "license-type" in 
license_elem.attrib: + metadata["license"] = license_elem.get("license-type") + return + elif id_type == "pmid": + metadata["license"] = "" def _add_id(article_id: etree.Element, metadata: Dict[str, Any]) -> None: diff --git a/src/pubget/_text.py b/src/pubget/_text.py index 48efeb4..1dcd58c 100644 --- a/src/pubget/_text.py +++ b/src/pubget/_text.py @@ -14,7 +14,7 @@ class TextExtractor(Extractor): """Extracting text from XML articles.""" - fields = ("pmcid", "title", "keywords", "abstract", "body") + fields = ("id", "title", "keywords", "abstract", "body") name = "text" def extract( @@ -28,16 +28,38 @@ def extract( # Stylesheet is not parsed in init because lxml.XSLT cannot be pickled # so that would prevent the extractor from being passed to # multiprocessing map. Parsing is cached. - stylesheet = _utils.load_stylesheet("text_extraction.xsl") - try: - transformed = stylesheet(article) - except Exception: - _LOG.exception( - f"failed to transform article: {stylesheet.error_log}" + id = _utils.get_id(article) + if "pmcid" in id: + stylesheet = _utils.load_stylesheet("text_extraction.xsl") + try: + transformed = stylesheet(article) + except Exception: + _LOG.exception( + f"failed to transform article: {stylesheet.error_log}" + ) + return result + for part_name in self.fields: + elem = transformed.find(part_name) + result[part_name] = elem.text + result["id"] = id + + elif "pmid" in id: + result["id"] = id + result["title"] = article.find(".//ArticleTitle").text + keywords = [] + for item in article.iterfind(".//DescriptorName"): + keywords.append(item.text) + keywords = "\n".join(keywords) + result["keywords"] = keywords + abstract_sections = article.xpath( + "//Article/Abstract/AbstractText" ) - return result - for part_name in self.fields: - elem = transformed.find(part_name) - result[part_name] = elem.text - result["pmcid"] = int(result["pmcid"]) + abstract = "" + for section in abstract_sections: + try: + abstract = abstract + section.text + " " + except: + continue + result["abstract"] = abstract + result["body"] = "" return result diff --git a/src/pubget/_utils.py b/src/pubget/_utils.py index 050dd58..1b0d237 100644 --- a/src/pubget/_utils.py +++ b/src/pubget/_utils.py @@ -137,6 +137,23 @@ def get_pmcid(article: Union[etree.ElementTree, etree.Element]) -> int: ) +def get_pmid(article: Union[etree.ElementTree, etree.Element]) -> int: + """Extract the PubMed ID from an XML article.""" + return int(article.find(".//PMID").text) + + +def get_id( + article: Union[etree.ElementTree, etree.Element], +) -> str: + try: + id = get_pmcid(article) + id_type = "pmcid" + except AttributeError: + id = get_pmid(article) + id_type = "pmid" + return f"{id_type}_{id}" + + def get_pmcid_from_article_dir(article_dir: Path) -> int: """Extract the PubMedCentral ID from an article's data dir.""" match = re.match(r"pmcid_(\d+)", article_dir.name) diff --git a/src/pubget/_vectorization.py b/src/pubget/_vectorization.py index 0267352..3103ded 100644 --- a/src/pubget/_vectorization.py +++ b/src/pubget/_vectorization.py @@ -64,7 +64,7 @@ def vectorize_corpus_to_npz( extracted_data_dir The directory containing the text of articles to vectorize. It is a directory created by `pubget.extract_data_to_csv`: it contains a file - named `text.csv` with fields `pmcid`, `title`, `keywords`, `abstract`, + named `text.csv` with fields `id`, `title`, `keywords`, `abstract`, `body`. output_dir The directory in which to store the results. 
If not specified, a @@ -134,9 +134,9 @@ def _do_vectorize_corpus_to_npz( extracted_data_dir, vocabulary_file, n_jobs=n_jobs ) np.savetxt( - output_dir.joinpath("pmcid.txt"), - extraction_result["pmcids"], - fmt="%i", + output_dir.joinpath("id.txt"), + extraction_result["ids"], + fmt="%s", encoding="utf-8", ) for feature_kind in "counts", "tfidf": @@ -154,7 +154,7 @@ def _do_vectorize_corpus_to_npz( voc_mapping_file.write_text( json.dumps(extraction_result["voc_mapping"]), "utf-8" ) - return len(extraction_result["pmcids"]) + return len(extraction_result["ids"]) def _vectorize_articles( @@ -162,13 +162,13 @@ def _vectorize_articles( ) -> Tuple[Sequence[int], Dict[str, sparse.csr_matrix]]: """Vectorize one batch of articles. - Returns the pmcids and the mapping text field: csr matrix of features. + Returns the ids and the mapping text field: csr matrix of features. """ articles.fillna("", inplace=True) vectorized = {} for field in _FIELDS: vectorized[field] = vectorizer.transform(articles[field].values) - return articles["pmcid"].values, vectorized + return articles["id"].values, vectorized def _extract_word_counts( @@ -176,8 +176,8 @@ def _extract_word_counts( ) -> Tuple[Sequence[int], Dict[str, sparse.csr_matrix], TextVectorizer]: """Compute word counts for all articles in a csv file. - returns the pmcids, mapping of text filed: csr matrix, and the vectorizer. - order of pmcids matches rows in the feature matrices. + returns the ids, mapping of text filed: csr matrix, and the vectorizer. + order of ids matches rows in the feature matrices. """ vectorizer = TextVectorizer.from_vocabulary_file( str(vocabulary_file), use_idf=False, norm=None, voc_mapping={} @@ -196,8 +196,8 @@ def _extract_word_counts( format="csr", dtype=int, ) - pmcids = np.concatenate([chunk[0] for chunk in vectorized_chunks]) - return pmcids, vectorized_fields, vectorizer + ids = np.concatenate([chunk[0] for chunk in vectorized_chunks]) + return ids, vectorized_fields, vectorizer def _get_voc_mapping_file(vocabulary_file: PathLikeOrStr) -> Path: @@ -339,7 +339,7 @@ def vectorize_corpus( extracted_data_dir The directory containing the text of articles to vectorize. It is a directory created by `pubget.extract_data_to_csv`: it contains a file - named `text.csv` with fields `pmcid`, `title`, `keywords`, `abstract`, + named `text.csv` with fields `id`, `title`, `keywords`, `abstract`, `body`. vocabulary A file containing the vocabulary used to vectorize text, with one term @@ -353,7 +353,7 @@ def vectorize_corpus( Returns ------- vectorized_data - Contains the pmcids of the vectorized articles, the document + Contains the ids of the vectorized articles, the document frequencies of the vocabulary, and the word counts and TFIDF for each article section and for whole articles as scipy sparse matrices. 
@@ -362,13 +362,13 @@ def vectorize_corpus( assert_exists(corpus_file) n_jobs = _utils.check_n_jobs(n_jobs) vocabulary_file = _resolve_voc(vocabulary) - pmcids, counts_full_voc, vectorizer = _extract_word_counts( + ids, counts_full_voc, vectorizer = _extract_word_counts( corpus_file, vocabulary_file, n_jobs=n_jobs ) voc = vectorizer.get_feature_names() voc_mapping = _load_voc_mapping(vocabulary_file) data = _prepare_bow_data(counts_full_voc, voc, voc_mapping) - data["pmcids"] = pmcids + data["ids"] = ids return data From 7d3d0200b17a4f73aad0f9310389c00abaed0006 Mon Sep 17 00:00:00 2001 From: koudyk Date: Wed, 28 Feb 2024 16:19:51 -0500 Subject: [PATCH 3/3] fix pmid keywords, get journal fullnames, get pmid url to paper for labelbuddy, and other small changes --- src/pubget/_labelbuddy.py | 7 ++++++- src/pubget/_metadata.py | 12 ++++++++++-- src/pubget/_text.py | 4 ++-- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/pubget/_labelbuddy.py b/src/pubget/_labelbuddy.py index 9343928..e375faf 100644 --- a/src/pubget/_labelbuddy.py +++ b/src/pubget/_labelbuddy.py @@ -123,13 +123,18 @@ def _prepare_document( ) doc_info["metadata"]["efetch_url"] = efetch_url elif "pmid" in id: - doc_info["display_title"] = f'pmid: {doc_meta["pmid"]}' + url = f"https://pubmed.ncbi.nlm.nih.gov/{doc_meta['pmid']}/" + doc_info["metadata"]["pmid_url"] = url + doc_info["display_title"] = ( + f'pmid: {doc_meta["pmid"]}' + ) doc_info["list_title"] = f"PMID{doc_meta['pmid']} {doc_text['title']}" efetch_url = ( "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" f"efetch.fcgi?db=pubmed&id={doc_meta['pmid']}" ) doc_info["metadata"]["efetch_url"] = efetch_url + doc_info["text"] = doc_info["text"].replace("\n\n# Body\n\n", "") return doc_info diff --git a/src/pubget/_metadata.py b/src/pubget/_metadata.py index 47eb769..015d45a 100644 --- a/src/pubget/_metadata.py +++ b/src/pubget/_metadata.py @@ -18,6 +18,7 @@ class MetadataExtractor(Extractor): "doi", "title", "journal", + "journal_fullname", "publication_year", "license", ) @@ -65,15 +66,22 @@ def _add_journal( ) -> None: if id_type == "pmcid": journal_elem = article.find( - # "front/journal-meta/journal-id[@journal-id-type='nlm-ta']" + "front/journal-meta/journal-id[@journal-id-type='nlm-ta']" + ) + journal_fullname_elem = article.find( "front/journal-meta/journal-title-group/journal-title" ) if journal_elem is not None: metadata["journal"] = journal_elem.text + elif journal_fullname_elem is not None: + metadata["journal_fullname"] = journal_fullname_elem.text elif id_type == "pmid": - journal_elem = article.find(".//Journal/Title") + journal_elem = article.find(".//Journal/ISOAbbreviation") + journal_fullname_elem = article.find(".//Journal/Title") if journal_elem is not None: metadata["journal"] = journal_elem.text + if journal_fullname_elem is not None: + metadata["journal_fullname"] = journal_fullname_elem.text def _add_pub_date( diff --git a/src/pubget/_text.py b/src/pubget/_text.py index 1dcd58c..cc3c294 100644 --- a/src/pubget/_text.py +++ b/src/pubget/_text.py @@ -47,7 +47,7 @@ def extract( result["id"] = id result["title"] = article.find(".//ArticleTitle").text keywords = [] - for item in article.iterfind(".//DescriptorName"): + for item in article.iterfind(".//KeywordList/Keyword"): keywords.append(item.text) keywords = "\n".join(keywords) result["keywords"] = keywords @@ -58,7 +58,7 @@ def extract( for section in abstract_sections: try: abstract = abstract + section.text + " " - except: + except TypeError: continue result["abstract"] = abstract 
result["body"] = ""