24 changes: 14 additions & 10 deletions src/pubget/_articles.py
@@ -18,10 +18,12 @@
PipelineStep,
)

import IPython

_LOG = logging.getLogger(__name__)
_LOG_PERIOD = 500
_STEP_NAME = "extract_articles"
_STEP_DESCRIPTION = "Extract articles from bulk PMC download."
_STEP_DESCRIPTION = "Extract articles from bulk download."


def extract_articles(
@@ -103,17 +105,15 @@ def _do_extract_articles(
"""Do the extraction and return number of articles found."""
output_dir.mkdir(exist_ok=True, parents=True)
with Parallel(n_jobs=n_jobs, verbose=8) as parallel:
_LOG.info("Extracting articles from PMC articlesets.")
_LOG.info("Extracting articles from articlesets.")
article_counts = parallel(
delayed(_extract_from_articleset)(
batch_file, output_dir=output_dir
)
for batch_file in articlesets_dir.glob("articleset_*.xml")
)
n_articles = int(sum(article_counts)) # int() is for mypy
_LOG.info(
f"Done extracting {n_articles} articles from PMC articlesets."
)
_LOG.info(f"Done extracting {n_articles} articles from articlesets.")
_LOG.info("Extracting tables from articles.")
parallel(
delayed(_extract_tables)(article_dir)
@@ -132,7 +132,7 @@ def _iter_articles(
n_articles = 0
for bucket in all_articles_dir.glob("*"):
if bucket.is_dir():
for article_dir in bucket.glob("pmcid_*"):
for article_dir in bucket.glob("pm*id_*"):
n_articles += 1
yield article_dir
if not n_articles % _LOG_PERIOD:
@@ -144,11 +144,15 @@ def _extract_from_articleset(batch_file: Path, output_dir: Path) -> int:
_LOG.debug(f"Extracting articles from {batch_file.name}")
with open(batch_file, "rb") as batch_fh:
tree = etree.parse(batch_fh)
if "pmc-articleset" in tree.docinfo.doctype:
article_indicator = "article"
elif "PubmedArticleSet" in tree.docinfo.doctype:
article_indicator = "PubmedArticle"
n_articles = 0
for article in tree.iterfind("article"):
pmcid = _utils.get_pmcid(article)
bucket = _utils.article_bucket_from_pmcid(pmcid)
article_dir = output_dir.joinpath(bucket, f"pmcid_{pmcid}")
for article in tree.iterfind(article_indicator):
id = _utils.get_id(article)
bucket = _utils.article_bucket_from_pmcid(id)
article_dir = output_dir.joinpath(bucket, f"{id}")
article_dir.mkdir(exist_ok=True, parents=True)
article_file = article_dir.joinpath("article.xml")
article_file.write_bytes(
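A quick standalone check of the doctype dispatch added above (the XML and DTD reference are abridged and made up; real PubMed exports carry a PUBLIC identifier and many more fields):

```python
import io

from lxml import etree

# Abridged PubMed-style article set; the doctype string is what the new branch keys on.
xml = b"""<!DOCTYPE PubmedArticleSet SYSTEM "pubmed.dtd">
<PubmedArticleSet>
  <PubmedArticle><MedlineCitation><PMID>12345</PMID></MedlineCitation></PubmedArticle>
</PubmedArticleSet>"""

tree = etree.parse(io.BytesIO(xml))
doctype = tree.docinfo.doctype or ""
article_indicator = (
    "article" if "pmc-articleset" in doctype else "PubmedArticle"
)
print([el.tag for el in tree.iterfind(article_indicator)])
# ['PubmedArticle']
```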
32 changes: 21 additions & 11 deletions src/pubget/_authors.py
@@ -12,7 +12,7 @@
class AuthorsExtractor(Extractor):
"""Extracting list of authors from article XML."""

fields = ("pmcid", "surname", "given-names")
fields = ("id", "firstname", "lastname")
name = "authors"

def extract(
@@ -23,17 +23,27 @@ def extract(
) -> pd.DataFrame:
del article_dir, previous_extractors_output
authors = []
pmcid = _utils.get_pmcid(article)
for author_elem in article.iterfind(
"front/article-meta/contrib-group/contrib[@contrib-type='author']"
):
author_info = {"pmcid": pmcid}
for part in [
"name/surname",
"name/given-names",
]:
id = _utils.get_id(article)
firstname_field = "firstname"
lastname_field = "lastname"
if "pmcid" in id:
    author_indicator = "front/article-meta/contrib-group/contrib[@contrib-type='author']"
    firstname_indicator = "name/given-names"
    lastname_indicator = "name/surname"
elif "pmid" in id:
    author_indicator = ".//Author"
    firstname_indicator = "ForeName"
    lastname_indicator = "LastName"
else:
    # Unknown id type: no known way to locate authors in this XML.
    return pd.DataFrame(columns=self.fields)

for author_elem in article.iterfind(author_indicator):
    author_info = {"id": id}
    for part, field in zip(
        [firstname_indicator, lastname_indicator],
        [firstname_field, lastname_field],
    ):
elem = author_elem.find(part)

if elem is not None:
author_info[elem.tag] = elem.text
author_info[field] = elem.text
authors.append(author_info)
return pd.DataFrame(authors, columns=self.fields)
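For reference, a self-contained sketch of what the new PubMed branch yields for a made-up record — `ForeName`/`LastName` map onto the renamed `firstname`/`lastname` columns (the `id` value and the abridged XML are hypothetical):

```python
import pandas as pd
from lxml import etree

author_list = etree.fromstring(
    b"""<AuthorList>
      <Author><LastName>Doe</LastName><ForeName>Jane</ForeName></Author>
      <Author><LastName>Smith</LastName><ForeName>John</ForeName></Author>
    </AuthorList>"""
)
rows = []
for author in author_list.iterfind(".//Author"):
    row = {"id": "pmid_12345"}  # hypothetical id value
    for part, field in zip(["ForeName", "LastName"], ["firstname", "lastname"]):
        elem = author.find(part)
        if elem is not None:
            row[field] = elem.text
    rows.append(row)
print(pd.DataFrame(rows, columns=("id", "firstname", "lastname")))
```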
21 changes: 13 additions & 8 deletions src/pubget/_coordinate_space.py
@@ -6,13 +6,13 @@
from lxml import etree

from pubget._typing import Extractor, Records
from pubget._utils import get_pmcid
from pubget._utils import get_id


class CoordinateSpaceExtractor(Extractor):
"""Extracting coordinate space from article XML"""

fields = ("pmcid", "coordinate_space")
fields = ("id", "coordinate_space")
name = "coordinate_space"

def extract(
@@ -21,13 +21,18 @@ def extract(
article_dir: pathlib.Path,
previous_extractors_output: Dict[str, Records],
) -> Dict[str, Any]:
id = get_id(article)
del article_dir, previous_extractors_output
return {
"pmcid": get_pmcid(article),
"coordinate_space": _neurosynth_guess_space(
" ".join(article.xpath(".//text()"))
),
}
if "pmcid" in id:
result = {
"id": id,
"coordinate_space": _neurosynth_guess_space(
" ".join(article.xpath(".//text()"))
),
}
else:
result = {"id": id, "coordinate_space": "UNKNOWN"}
return result


def _neurosynth_guess_space(text: str) -> str:
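Roughly how this extractor now behaves end to end (paths and the returned space value are hypothetical; `get_id` returning a `pmcid_…`-style string is an assumption based on the directory naming elsewhere in this PR):

```python
import pathlib

from lxml import etree

from pubget._coordinate_space import CoordinateSpaceExtractor

# Hypothetical article directory produced by the extract_articles step.
article_dir = pathlib.Path("articles/000/pmcid_1234567")
article = etree.parse(str(article_dir / "article.xml"))
row = CoordinateSpaceExtractor().extract(article, article_dir, {})
print(row)  # e.g. {'id': 'pmcid_1234567', 'coordinate_space': 'MNI'}
# PubMed records (pmid_* ids) have no full text, so they fall back to "UNKNOWN".
```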
9 changes: 7 additions & 2 deletions src/pubget/_coordinates.py
@@ -77,9 +77,14 @@ def extract(
article_dir: pathlib.Path,
previous_extractors_output: Dict[str, Records],
) -> pd.DataFrame:
id = _utils.get_id(article)
del article, previous_extractors_output
coords = _extract_coordinates_from_article_dir(article_dir)
return coords.loc[:, self.fields]
if "pmcid" in id:
coords = _extract_coordinates_from_article_dir(article_dir)
coords = coords.loc[:, self.fields]
else:
coords = pd.DataFrame(columns=self.fields)
return coords


def _extract_coordinates_from_article_dir(
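Worth noting for the `pmcid` branch above: `DataFrame.loc[:, fields]` returns a new frame rather than narrowing `coords` in place, so the selection has to be assigned back (as fixed above). A minimal illustration:

```python
import pandas as pd

fields = ["id", "x", "y", "z"]
coords = pd.DataFrame(
    {"id": ["pmcid_1"], "x": [10.0], "y": [-2.0], "z": [4.0], "extra": [0]}
)
coords.loc[:, fields]           # returns a new frame; coords still has "extra"
coords = coords.loc[:, fields]  # keep only the declared fields
print(list(coords.columns))     # ['id', 'x', 'y', 'z']
```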
4 changes: 2 additions & 2 deletions src/pubget/_data/stylesheets/text_extraction.xsl
@@ -10,9 +10,9 @@

<xsl:template match="/">
<extracted-text>
<pmcid>
<id>
<xsl:value-of select="/article/front /article-meta/article-id[@pub-id-type='pmc']"/>
</pmcid>
</id>
<title>
<xsl:value-of select="/article/front/article-meta/title-group/article-title"/>
</title>
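A rough sketch of applying the stylesheet with lxml, mainly to show where the renamed element ends up (paths are hypothetical):

```python
from lxml import etree

xslt = etree.XSLT(
    etree.parse("src/pubget/_data/stylesheets/text_extraction.xsl")
)
article = etree.parse("articles/000/pmcid_1234567/article.xml")
extracted = xslt(article)
# The PMC id is now exposed as <extracted-text><id> instead of <pmcid>.
print(extracted.find("id").text)
```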
6 changes: 4 additions & 2 deletions src/pubget/_data_extraction.py
@@ -39,6 +39,8 @@
)
from pubget._writers import CSVWriter

import IPython

_LOG = logging.getLogger(__name__)
_STEP_NAME = "extract_data"
_STEP_DESCRIPTION = "Extract metadata, text and coordinates from articles."
@@ -125,7 +127,7 @@ def _iter_articles(
articles_dir = Path(articles_dir)
for subdir in articles_dir.glob("*"):
if subdir.is_dir():
for article_dir in subdir.glob("pmcid_*"):
for article_dir in subdir.glob("pm*id_*"):
# Throttle processing articles so they don't accumulate in the
# Pool's output queue. When joblib.Parallel starts returning
# iterators we can use it instead of Pool
@@ -205,8 +207,8 @@ def extract_data_to_csv(

def _get_data_extractors() -> List[Extractor]:
return [
MetadataExtractor(),
AuthorsExtractor(),
MetadataExtractor(),
TextExtractor(),
CoordinateExtractor(),
CoordinateSpaceExtractor(),
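Finally, a sketch of driving the extractor list over a single article directory. Passing the accumulated outputs as `previous_extractors_output`, keyed by each extractor's `name`, is an assumption based on the parameter names in this PR, not a verified description of pubget's internal loop:

```python
import pathlib

from lxml import etree

from pubget._data_extraction import _get_data_extractors

# Hypothetical directory produced by the extract_articles step.
article_dir = pathlib.Path("articles/000/pmid_12345")
article = etree.parse(str(article_dir / "article.xml"))

outputs = {}
for extractor in _get_data_extractors():
    outputs[extractor.name] = extractor.extract(article, article_dir, outputs)
print(sorted(outputs))  # e.g. ['authors', 'coordinate_space', ...]
```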