24 changes: 14 additions & 10 deletions src/pubget/_articles.py
@@ -18,10 +18,12 @@
PipelineStep,
)

import IPython

_LOG = logging.getLogger(__name__)
_LOG_PERIOD = 500
_STEP_NAME = "extract_articles"
_STEP_DESCRIPTION = "Extract articles from bulk PMC download."
_STEP_DESCRIPTION = "Extract articles from bulk download."


def extract_articles(
@@ -103,17 +105,15 @@ def _do_extract_articles(
"""Do the extraction and return number of articles found."""
output_dir.mkdir(exist_ok=True, parents=True)
with Parallel(n_jobs=n_jobs, verbose=8) as parallel:
_LOG.info("Extracting articles from PMC articlesets.")
_LOG.info("Extracting articles from articlesets.")
article_counts = parallel(
delayed(_extract_from_articleset)(
batch_file, output_dir=output_dir
)
for batch_file in articlesets_dir.glob("articleset_*.xml")
)
n_articles = int(sum(article_counts)) # int() is for mypy
_LOG.info(
f"Done extracting {n_articles} articles from PMC articlesets."
)
_LOG.info(f"Done extracting {n_articles} articles from articlesets.")
_LOG.info("Extracting tables from articles.")
parallel(
delayed(_extract_tables)(article_dir)
@@ -132,7 +132,7 @@ def _iter_articles(
n_articles = 0
for bucket in all_articles_dir.glob("*"):
if bucket.is_dir():
for article_dir in bucket.glob("pmcid_*"):
for article_dir in bucket.glob("pm*id_*"):
n_articles += 1
yield article_dir
if not n_articles % _LOG_PERIOD:
@@ -144,11 +144,15 @@ def _extract_from_articleset(batch_file: Path, output_dir: Path) -> int:
_LOG.debug(f"Extracting articles from {batch_file.name}")
with open(batch_file, "rb") as batch_fh:
tree = etree.parse(batch_fh)
if "pmc-articleset" in tree.docinfo.doctype:
article_indicator = "article"
elif "PubmedArticleSet" in tree.docinfo.doctype:
article_indicator = "PubmedArticle"
n_articles = 0
for article in tree.iterfind("article"):
pmcid = _utils.get_pmcid(article)
bucket = _utils.article_bucket_from_pmcid(pmcid)
article_dir = output_dir.joinpath(bucket, f"pmcid_{pmcid}")
for article in tree.iterfind(article_indicator):
id = _utils.get_id(article)
bucket = _utils.article_bucket_from_pmcid(id)
article_dir = output_dir.joinpath(bucket, f"{id}")
article_dir.mkdir(exist_ok=True, parents=True)
article_file = article_dir.joinpath("article.xml")
article_file.write_bytes(
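A quick standalone check of the doctype dispatch added above (the XML and DTD reference are abridged and made up; real PubMed exports carry a PUBLIC identifier and many more fields):

```python
import io

from lxml import etree

# Abridged PubMed-style article set; the doctype string is what the new branch keys on.
xml = b"""<!DOCTYPE PubmedArticleSet SYSTEM "pubmed.dtd">
<PubmedArticleSet>
  <PubmedArticle><MedlineCitation><PMID>12345</PMID></MedlineCitation></PubmedArticle>
</PubmedArticleSet>"""

tree = etree.parse(io.BytesIO(xml))
doctype = tree.docinfo.doctype or ""
article_indicator = (
    "article" if "pmc-articleset" in doctype else "PubmedArticle"
)
print([el.tag for el in tree.iterfind(article_indicator)])
# ['PubmedArticle']
```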
32 changes: 21 additions & 11 deletions src/pubget/_authors.py
@@ -12,7 +12,7 @@
class AuthorsExtractor(Extractor):
"""Extracting list of authors from article XML."""

fields = ("pmcid", "surname", "given-names")
fields = ("id", "firstname", "lastname")
name = "authors"

def extract(
@@ -23,17 +23,27 @@ def extract(
) -> pd.DataFrame:
del article_dir, previous_extractors_output
authors = []
pmcid = _utils.get_pmcid(article)
for author_elem in article.iterfind(
"front/article-meta/contrib-group/contrib[@contrib-type='author']"
):
author_info = {"pmcid": pmcid}
for part in [
"name/surname",
"name/given-names",
]:
id = _utils.get_id(article)
firstname_field = "firstname"
lastname_field = "lastname"
if "pmcid" in id:
    author_indicator = "front/article-meta/contrib-group/contrib[@contrib-type='author']"
    firstname_indicator = "name/given-names"
    lastname_indicator = "name/surname"
elif "pmid" in id:
    author_indicator = ".//Author"
    firstname_indicator = "ForeName"
    lastname_indicator = "LastName"
else:
    # Unknown id type: no known way to locate authors in this XML.
    return pd.DataFrame(columns=self.fields)

for author_elem in article.iterfind(author_indicator):
    author_info = {"id": id}
    for part, field in zip(
        [firstname_indicator, lastname_indicator],
        [firstname_field, lastname_field],
    ):
elem = author_elem.find(part)

if elem is not None:
author_info[elem.tag] = elem.text
author_info[field] = elem.text
authors.append(author_info)
return pd.DataFrame(authors, columns=self.fields)
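For reference, a self-contained sketch of what the new PubMed branch yields for a made-up record — `ForeName`/`LastName` map onto the renamed `firstname`/`lastname` columns (the `id` value and the abridged XML are hypothetical):

```python
import pandas as pd
from lxml import etree

author_list = etree.fromstring(
    b"""<AuthorList>
      <Author><LastName>Doe</LastName><ForeName>Jane</ForeName></Author>
      <Author><LastName>Smith</LastName><ForeName>John</ForeName></Author>
    </AuthorList>"""
)
rows = []
for author in author_list.iterfind(".//Author"):
    row = {"id": "pmid_12345"}  # hypothetical id value
    for part, field in zip(["ForeName", "LastName"], ["firstname", "lastname"]):
        elem = author.find(part)
        if elem is not None:
            row[field] = elem.text
    rows.append(row)
print(pd.DataFrame(rows, columns=("id", "firstname", "lastname")))
```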
21 changes: 13 additions & 8 deletions src/pubget/_coordinate_space.py
@@ -6,13 +6,13 @@
from lxml import etree

from pubget._typing import Extractor, Records
from pubget._utils import get_pmcid
from pubget._utils import get_id


class CoordinateSpaceExtractor(Extractor):
"""Extracting coordinate space from article XML"""

fields = ("pmcid", "coordinate_space")
fields = ("id", "coordinate_space")
name = "coordinate_space"

def extract(
@@ -21,13 +21,18 @@ def extract(
article_dir: pathlib.Path,
previous_extractors_output: Dict[str, Records],
) -> Dict[str, Any]:
id = get_id(article)
del article_dir, previous_extractors_output
return {
"pmcid": get_pmcid(article),
"coordinate_space": _neurosynth_guess_space(
" ".join(article.xpath(".//text()"))
),
}
if "pmcid" in id:
result = {
"id": id,
"coordinate_space": _neurosynth_guess_space(
" ".join(article.xpath(".//text()"))
),
}
else:
result = {"id": id, "coordinate_space": "UNKNOWN"}
return result


def _neurosynth_guess_space(text: str) -> str:
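Roughly how this extractor now behaves end to end (paths and the returned space value are hypothetical; `get_id` returning a `pmcid_…`-style string is an assumption based on the directory naming elsewhere in this PR):

```python
import pathlib

from lxml import etree

from pubget._coordinate_space import CoordinateSpaceExtractor

# Hypothetical article directory produced by the extract_articles step.
article_dir = pathlib.Path("articles/000/pmcid_1234567")
article = etree.parse(str(article_dir / "article.xml"))
row = CoordinateSpaceExtractor().extract(article, article_dir, {})
print(row)  # e.g. {'id': 'pmcid_1234567', 'coordinate_space': 'MNI'}
# PubMed records (pmid_* ids) have no full text, so they fall back to "UNKNOWN".
```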
9 changes: 7 additions & 2 deletions src/pubget/_coordinates.py
@@ -77,9 +77,14 @@ def extract(
article_dir: pathlib.Path,
previous_extractors_output: Dict[str, Records],
) -> pd.DataFrame:
id = _utils.get_id(article)
del article, previous_extractors_output
coords = _extract_coordinates_from_article_dir(article_dir)
return coords.loc[:, self.fields]
if "pmcid" in id:
coords = _extract_coordinates_from_article_dir(article_dir)
coords = coords.loc[:, self.fields]
else:
coords = pd.DataFrame(columns=self.fields)
return coords


def _extract_coordinates_from_article_dir(
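Worth noting for the `pmcid` branch above: `DataFrame.loc[:, fields]` returns a new frame rather than narrowing `coords` in place, so the selection has to be assigned back (as fixed above). A minimal illustration:

```python
import pandas as pd

fields = ["id", "x", "y", "z"]
coords = pd.DataFrame(
    {"id": ["pmcid_1"], "x": [10.0], "y": [-2.0], "z": [4.0], "extra": [0]}
)
coords.loc[:, fields]           # returns a new frame; coords still has "extra"
coords = coords.loc[:, fields]  # keep only the declared fields
print(list(coords.columns))     # ['id', 'x', 'y', 'z']
```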
4 changes: 2 additions & 2 deletions src/pubget/_data/stylesheets/text_extraction.xsl
@@ -10,9 +10,9 @@

<xsl:template match="/">
<extracted-text>
<pmcid>
<id>
<xsl:value-of select="/article/front /article-meta/article-id[@pub-id-type='pmc']"/>
</pmcid>
</id>
<title>
<xsl:value-of select="/article/front/article-meta/title-group/article-title"/>
</title>
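A rough sketch of applying the stylesheet with lxml, mainly to show where the renamed element ends up (paths are hypothetical):

```python
from lxml import etree

xslt = etree.XSLT(
    etree.parse("src/pubget/_data/stylesheets/text_extraction.xsl")
)
article = etree.parse("articles/000/pmcid_1234567/article.xml")
extracted = xslt(article)
# The PMC id is now exposed as <extracted-text><id> instead of <pmcid>.
print(extracted.find("id").text)
```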
6 changes: 4 additions & 2 deletions src/pubget/_data_extraction.py
@@ -39,6 +39,8 @@
)
from pubget._writers import CSVWriter

import IPython

_LOG = logging.getLogger(__name__)
_STEP_NAME = "extract_data"
_STEP_DESCRIPTION = "Extract metadata, text and coordinates from articles."
@@ -125,7 +127,7 @@ def _iter_articles(
articles_dir = Path(articles_dir)
for subdir in articles_dir.glob("*"):
if subdir.is_dir():
for article_dir in subdir.glob("pmcid_*"):
for article_dir in subdir.glob("pm*id_*"):
# Throttle processing articles so they don't accumulate in the
# Pool's output queue. When joblib.Parallel starts returning
# iterators we can use it instead of Pool
@@ -205,8 +207,8 @@ def extract_data_to_csv(

def _get_data_extractors() -> List[Extractor]:
return [
MetadataExtractor(),
AuthorsExtractor(),
MetadataExtractor(),
TextExtractor(),
CoordinateExtractor(),
CoordinateSpaceExtractor(),
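Finally, a sketch of driving the extractor list over a single article directory. Passing the accumulated outputs as `previous_extractors_output`, keyed by each extractor's `name`, is an assumption based on the parameter names in this PR, not a verified description of pubget's internal loop:

```python
import pathlib

from lxml import etree

from pubget._data_extraction import _get_data_extractors

# Hypothetical directory produced by the extract_articles step.
article_dir = pathlib.Path("articles/000/pmid_12345")
article = etree.parse(str(article_dir / "article.xml"))

outputs = {}
for extractor in _get_data_extractors():
    outputs[extractor.name] = extractor.extract(article, article_dir, outputs)
print(sorted(outputs))  # e.g. ['authors', 'coordinate_space', ...]
```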