Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 146 additions & 34 deletions adsingestp/parsers/ieee.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# IEEE parser for metadata-only (not full-text) conference XML files
# /proj/ads_abstracts/sources/IEEE/IEEEcnf/MetadataXML/*

# Parser assumes XML structured per:
# IEEE XML documentation v.5.14, July 2024
# https://www.ieee.org/content/dam/ieee-org/ieee/web/org/pubs/ieee-data-delivery-documentation.pdf

import logging
import re

Expand All @@ -20,15 +27,19 @@ def __init__(self):
self.article = None

def _parse_ids(self):
self.base_metadata["ids"] = {}

# ISSN
self.base_metadata["issn"] = []
for i in self.publicationinfo.find_all("issn"):
self.base_metadata["issn"].append((i["mediatype"], i.get_text()))

if self.article.find("doi"):
self.base_metadata["ids"]["doi"] = self.article.find("doi").get_text()
# IDs
self.base_metadata["ids"] = {}

# DOI for article
if self.article.find("articledoi"):
self.base_metadata["ids"]["doi"] = self.article.find("articledoi").get_text()

# DOI for Conference
self.base_metadata["ids"]["pub-id"] = []
if self.publicationinfo.find("publicationdoi"):
self.base_metadata["ids"]["pub-id"].append(
Expand All @@ -38,26 +49,98 @@ def _parse_ids(self):
}
)

# IEEE unique ID for article
if self.article.find("articleinfo"):
articleinfo = self.article.find("articleinfo")
# Article sequence number
if articleinfo.find("articleseqnum"):
articleid = articleinfo.find("articleseqnum").get_text()
self.base_metadata["electronic_id"] = articleid
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally, if firstPage, lastPage and articleseqnum are all present, I'd like to populate all three fields. Right now, the existing test cases have a firstPage and lastPage under pagination, so I'd like to add electronicID alongside them rather than using it exclusively.

# This next bit probably unnecessary? Unlikely to be >9999 articles in a conf proceedings
#if len(articleid) > 4:
# self.base_metadata["page_first"] = articleid[-4:] # rightmost 4 chars
#else:
# self.base_metadata["page_first"] = articleid

def _parse_pub(self):
# Conference name
if self.publication.find("title"):
t = self.publication.find("title")
self.base_metadata["publication"] = self._clean_output(
title = self._clean_output(
self._detag(t, self.HTML_TAGSET["title"]).strip()
)
self.base_metadata["publication"] = title

# Conference volume number
if self.volumeinfo:
self.base_metadata["volume"] = self.volumeinfo.find("volumenum").get_text()
self.base_metadata["issue"] = self.volumeinfo.find("issue").find("issuenum").get_text()
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Restore L50, because you're losing issue numbers when available.

if self.volumeinfo.find("volumenum"):
self.base_metadata["volume"] = self.volumeinfo.find("volumenum").text
else:
self.base_metadata["volume"] = ""

# Conferences don't have an issue number
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally, the parser should be able to handle both conferences and journal articles without giving special instructions to the parser.


# Conference abbreviation
self.base_metadata["comments"] = []
ieeeabbrev = self.publicationinfo.find("ieeeabbrev").text or ""
if ieeeabbrev:
cleanabbrev = re.sub(r"[^A-Za-z]", "", ieeeabbrev)
confabbrev = cleanabbrev[:4].ljust(4, '.') # leftmost 4 chars, or pad if <4
self.base_metadata["comments"].append({"text": confabbrev})
else:
self.base_metadata["comments"].append({"text": "ieee."})

# Conference location
if self.publicationinfo.find("conflocation") is not None:
confloc = self.publicationinfo.find("conflocation").text
self.base_metadata["conf_location"] = confloc
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will fail if self.publicationinfo.find("conflocation") is None -- confloc is only assigned inside that if-block, so the variable will be undefined when it is used at L96.


# Conference dates
confdate = ""
if self.publicationinfo.find("confdate", {"confdatetype": "End"}) is not None:
confend = self.publicationinfo.find("confdate", {"confdatetype": "End"})
end_year = confend.find("year").text if confend.find("year") else None
end_month = confend.find("month").text if confend.find("month") else None
end_day = confend.find("day").text if confend.find("day") else None
if self.publicationinfo.find("confdate", {"confdatetype": "Start"}) is not None:
confstart = self.publicationinfo.find("confdate", {"confdatetype": "Start"})
start_year = confstart.find("year").text if confstart.find("year") else None
start_month = confstart.find("month").text if confstart.find("month") else None
start_day = confstart.find("day").text if confstart.find("day") else None
confdate = f"{start_day} {start_month} {start_year} - {end_day} {end_month} {end_year}"
self.base_metadata["conf_date"] = confdate

# Conference topics
# Use for %W
collections = []
for pubtopicset in self.publicationinfo.find_all("pubtopicalbrowseset"):
for pubtopic in pubtopicset.find_all("pubtopicalbrowse"):
if pubtopic.get_text() == "Aerospace":
collections.append("astronomy")
elif pubtopic.get_text() == "Geoscience":
collections.append("earthscience")
else:
collections.append("physics") # Default for IEEE pubs is collection = physics
colls_uniq = list(set(collections))
# We don't yet have a JSON object in the ingest data model in which to pass the collection
#if colls_uniq:
# self.base_metadata["collection"] = colls_uniq

# TO DO: append confDates & confLocation to %J
if confdate:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You need a branch that leaves confloc out of the output when it is None or "".

self.base_metadata["publication"] = f"{title}, {confdate}, {confloc}"
else:
self.base_metadata["publication"] = f"{title}, {confloc}"

def _parse_page(self):
n = self.article.find("artpagenums", None)
if n:
self.base_metadata["page_first"] = self.base_metadata["page_first"] = self._detag(
n.get("startpage", None), []
)
self.base_metadata["page_last"] = self.base_metadata["page_last"] = self._detag(
n.get("endpage", None), []
)
if self.article.find("artpagenums"):
startpage = self.article.find("artpagenums").get("startpage")
endpage = self.article.find("artpagenums").get("endpage")
# Using articleid as page_first, to avoid duplicate bibcodes
# See IEEE unique ID section above
# Because multiple papers in conferences use startpage = 1
#self.base_metadata["page_first"] = self._detag(startpage, []) if startpage else None
#self.base_metadata["page_last"] = self._detag(endpage, []) if endpage else None

def _parse_pubdate(self):
# Look for publication dates in article section
Expand All @@ -69,6 +152,7 @@ def _parse_pubdate(self):
year = date.find("year").get_text()
else:
year = "0000"
self.year = year

if date.find("month"):
month_raw = date.find("month").get_text()
Expand All @@ -78,7 +162,8 @@ def _parse_pubdate(self):
month_name = month_raw[0:3].lower()
month = utils.MONTH_TO_NUMBER[month_name]
else:
month_raw == "00"
month_raw = "00"
month = "00"

if date.find("day"):
day = date.find("day").get_text()
Expand All @@ -88,7 +173,6 @@ def _parse_pubdate(self):
# Format date string
pubdate = year + "-" + month + "-" + day

# Assign to appropriate metadata field based on date type
if date_type == "OriginalPub":
self.base_metadata["pubdate_print"] = pubdate
elif date_type == "ePub":
Expand All @@ -114,15 +198,27 @@ def _parse_permissions(self):
if self.article.find("articleinfo"):
articleinfo = self.article.find("articleinfo")

# Get copyright holder and year
# Copyright holder and year for article
if articleinfo.find("articlecopyright"):
copyright = articleinfo.find("articlecopyright")
copyright_holder = self._clean_output(copyright.get_text())
if copyright_holder == "":
copyright_holder = "IEEE"

copyright_year = copyright.get("year", "")
copyright_statement = self._detag(
articleinfo.find("article_copyright_statement").get_text(),
self.HTML_TAGSET["license"],
)
if copyright_year == "0":
copyright_year = self.year

# Sadly <article_copyright_statement> doesn't seem to exist in IEEE conference metadata
if articleinfo.find("article_copyright_statement"):
copyright_statement = self._detag(
articleinfo.find("article_copyright_statement").get_text(),
self.HTML_TAGSET["license"],
)
else:
copyright_statement = ""

# Copyright holder and year for publication is in <copyrightgroup>

# Format copyright string
copyright_text = (
Expand Down Expand Up @@ -184,7 +280,9 @@ def _parse_keywords(self):
# Parse IEEE keywords from keywordset elements
keywords = []

# Handle both IEEE and IEEEFree keyword types
# Handle all keyword types in <articleinfo>
# IEEE and IEEEFree keywordtype
# DOE & PACS don't exist in this collection?
for keywordset in self.article.find_all("keywordset"):
keyword_type = keywordset.get("keywordtype", "")

Expand All @@ -196,11 +294,28 @@ def _parse_keywords(self):
"string": self._clean_output(keyword.string.strip()),
}
)

'''
# <pubtopicalbrowse> = topic browse categories in IEEE Xplore
# NOTE: Some values of <pubtopicalbrowse> contain commas
# How to deal with this in %K ?
# Get all pub-level topics in <publicationinfo>
for pubtopicset in self.publicationinfo.find_all("pubtopicalbrowseset"):
for pubtopic in pubtopicset.find_all("pubtopicalbrowse"):
keywords.append(
{
"system": "XploreTopic",
"string": self._clean_output(pubtopic.string.strip()),
}
)
'''

if keywords:
self.base_metadata["keywords"] = keywords

def _parse_references(self):
# TODO: check if IEEE gives us references at all
# IEEE conferences do not provide references
# Check value of <articlereferenceflag>
references = []
if self.article.find("references"):
for ref in self.article.find_all("reference"):
Expand All @@ -213,10 +328,8 @@ def _parse_references(self):
def _parse_funding(self):
funding = []

# Look for funding info in article metadata

articleinfo = self.article.find("articleinfo")
# import pdb; pdb.set_trace()

if articleinfo.find("fundrefgrp"):
funding_sections = articleinfo.find("fundrefgrp").find_all("fundref", [])

Expand All @@ -228,7 +341,7 @@ def _parse_funding(self):
if funder_name:
funder.setdefault("agencyname", self._clean_output(funder_name.get_text()))

# Get award/grant numbers
# Get award/grant number(s)
award_nums = funding_section.find_all("grant_number")
if award_nums:
# Join multiple award numbers with comma if present
Expand All @@ -241,12 +354,11 @@ def _parse_funding(self):
if funding:
self.base_metadata["funding"] = funding


# Parse IEEE XML into standard JSON format
# :param text: string, contents of XML file
# :return: parsed file contents in JSON format
def parse(self, text):
"""
Parse IEEE XML into standard JSON format
:param text: string, contents of XML file
:return: parsed file contents in JSON format
"""
try:
d = self.bsstrtodict(text, parser="lxml-xml")
except Exception as err:
Expand All @@ -270,7 +382,7 @@ def parse(self, text):
self._parse_permissions()
self._parse_authors()
self._parse_keywords()
self._parse_references()
#self._parse_references()
self._parse_funding()

output = self.format(self.base_metadata, format="IEEE")
Expand Down
Loading
Loading