From 4bdee3af8492cb0b7c835b33d946bf3b5d7195c2 Mon Sep 17 00:00:00 2001
From: ADS Administration <ads@cfa.harvard.edu>
Date: Fri, 12 Dec 2025 10:59:20 -0500
Subject: [PATCH] Release candidate for IEEE parser

modified:   ieee.py
---
 adsingestp/parsers/ieee.py     | 180 +++++++++++++++++----
 adsingestp/parsers/ieeeOrig.py | 278 +++++++++++++++++++++++++++++++++
 2 files changed, 424 insertions(+), 34 deletions(-)
 create mode 100644 adsingestp/parsers/ieeeOrig.py

diff --git a/adsingestp/parsers/ieee.py b/adsingestp/parsers/ieee.py
index 031db214..1ed2340d 100644
--- a/adsingestp/parsers/ieee.py
+++ b/adsingestp/parsers/ieee.py
@@ -1,3 +1,10 @@
+# IEEE parser for metadata-only (not full-text) conference XML files
+# /proj/ads_abstracts/sources/IEEE/IEEEcnf/MetadataXML/*
+
+# Parser assumes XML structured per:
+# IEEE XML documentation v.5.14, July 2024
+# https://www.ieee.org/content/dam/ieee-org/ieee/web/org/pubs/ieee-data-delivery-documentation.pdf
+
 import logging
 import re
 
@@ -20,15 +27,19 @@ def __init__(self):
         self.article = None
 
     def _parse_ids(self):
-        self.base_metadata["ids"] = {}
-
+        # ISSN
         self.base_metadata["issn"] = []
         for i in self.publicationinfo.find_all("issn"):
             self.base_metadata["issn"].append((i["mediatype"], i.get_text()))
 
-        if self.article.find("doi"):
-            self.base_metadata["ids"]["doi"] = self.article.find("doi").get_text()
+        # IDs
+        self.base_metadata["ids"] = {}
+
+        # DOI for article
+        if self.article.find("articledoi"):
+            self.base_metadata["ids"]["doi"] = self.article.find("articledoi").get_text()
 
+        # DOI for Conference
         self.base_metadata["ids"]["pub-id"] = []
         if self.publicationinfo.find("publicationdoi"):
             self.base_metadata["ids"]["pub-id"].append(
@@ -38,26 +49,98 @@ def _parse_ids(self):
                 }
             )
 
+        # IEEE unique ID for article
+        if self.article.find("articleinfo"):
+            articleinfo = self.article.find("articleinfo")
+            # Article sequence number
+            if articleinfo.find("articleseqnum"):
+                articleid = articleinfo.find("articleseqnum").get_text()
+                self.base_metadata["electronic_id"] = articleid
+                # This next bit probably unnecessary? Unlikely to be >9999 articles in a conf proceedings
+                #if len(articleid) > 4:
+                #    self.base_metadata["page_first"] = articleid[-4:]  # rightmost 4 chars
+                #else:
+                #    self.base_metadata["page_first"] = articleid
+
     def _parse_pub(self):
+        """
+        Record conference-level metadata from <publication>/<publicationinfo>.
+
+        Keys written to self.base_metadata:
+          "publication"   - title, then conference dates and location when known
+          "volume"        - <volumenum> text, or "" when <volumeinfo> lacks it
+          "comments"      - one entry holding the conference abbreviation
+                            (4 characters, or the fallback "ieee.")
+          "conf_location" - <conflocation> text, when present
+          "conf_date"     - "start - end" dates, when a start date is present
+
+        Every element is optional; a missing one is skipped, where the unguarded
+        lookups used to raise NameError/AttributeError on sparse records.
+        """
+        # Conference name; stays "" when <publication> has no <title>
+        title = ""
         if self.publication.find("title"):
             t = self.publication.find("title")
-            self.base_metadata["publication"] = self._clean_output(
+            title = self._clean_output(
                 self._detag(t, self.HTML_TAGSET["title"]).strip()
             )
 
+        # Conference volume number (conferences don't have an issue number)
         if self.volumeinfo:
-            self.base_metadata["volume"] = self.volumeinfo.find("volumenum").get_text()
-            self.base_metadata["issue"] = self.volumeinfo.find("issue").find("issuenum").get_text()
+            volumenum = self.volumeinfo.find("volumenum")
+            self.base_metadata["volume"] = volumenum.text if volumenum else ""
+
+        # Conference abbreviation: letters only, leftmost 4 chars, "."-padded if
+        # shorter; "ieee." when the record has no <ieeeabbrev> or no letters in it
+        abbrev_tag = self.publicationinfo.find("ieeeabbrev")
+        ieeeabbrev = abbrev_tag.text if abbrev_tag else ""
+        cleanabbrev = re.sub(r"[^A-Za-z]", "", ieeeabbrev)
+        if cleanabbrev:
+            confabbrev = cleanabbrev[:4].ljust(4, ".")
+        else:
+            confabbrev = "ieee."
+        self.base_metadata["comments"] = [{"text": confabbrev}]
+
+        # Conference location
+        confloc = ""
+        loc_tag = self.publicationinfo.find("conflocation")
+        if loc_tag is not None:
+            confloc = loc_tag.text
+            self.base_metadata["conf_location"] = confloc
+
+        # Conference dates: <confdate confdatetype="Start"|"End"> is rendered as
+        # "day month year"; absent fields are left out, not printed as "None"
+        dates = {}
+        for kind in ("Start", "End"):
+            tag = self.publicationinfo.find("confdate", {"confdatetype": kind})
+            fields = [tag.find(name) for name in ("day", "month", "year")] if tag else []
+            dates[kind] = " ".join(field.text for field in fields if field)
+
+        # As before, a conference date is only reported when a start date exists
+        confdate = ""
+        if dates["Start"]:
+            confdate = " - ".join(d for d in (dates["Start"], dates["End"]) if d)
+            self.base_metadata["conf_date"] = confdate
+
+        # TODO: derive the collection (%W) from <pubtopicalbrowse> once the ingest
+        # data model has a field for it: "Aerospace" -> astronomy, "Geoscience" ->
+        # earthscience, anything else -> physics (the default for IEEE pubs)
+
+        # %J: title, then conference dates and location; empty parts are dropped
+        # so that no dangling ", " separators end up in the string
+        self.base_metadata["publication"] = ", ".join(
+            part for part in (title, confdate, confloc) if part
+        )
 
     def _parse_page(self):
-        n = self.article.find("artpagenums", None)
-        if n:
-            self.base_metadata["page_first"] = self.base_metadata["page_first"] = self._detag(
-                n.get("startpage", None), []
-            )
-            self.base_metadata["page_last"] = self.base_metadata["page_last"] = self._detag(
-                n.get("endpage", None), []
-            )
+        """
+        Deliberately record no page numbers for conference articles.
+
+        Multiple papers in a conference use startpage = 1, so a page_first taken
+        from <artpagenums> leads to duplicate bibcodes. The IEEE article sequence
+        number, stored as "electronic_id" by _parse_ids(), identifies the paper
+        instead, which is why <artpagenums> is not read here.
+        """
 
     def _parse_pubdate(self):
         # Look for publication dates in article section
@@ -69,6 +152,7 @@ def _parse_pubdate(self):
                 year = date.find("year").get_text()
             else:
                 year = "0000"
+            self.year = year
 
             if date.find("month"):
                 month_raw = date.find("month").get_text()
@@ -78,7 +162,8 @@ def _parse_pubdate(self):
                     month_name = month_raw[0:3].lower()
                     month = utils.MONTH_TO_NUMBER[month_name]
             else:
-                month_raw == "00"
+                month_raw = "00"
+                month = "00"
 
             if date.find("day"):
                 day = date.find("day").get_text()
@@ -88,7 +173,6 @@ def _parse_pubdate(self):
             # Format date string
             pubdate = year + "-" + month + "-" + day
 
-            # Assign to appropriate metadata field based on date type
             if date_type == "OriginalPub":
                 self.base_metadata["pubdate_print"] = pubdate
             elif date_type == "ePub":
@@ -114,15 +198,27 @@ def _parse_permissions(self):
         if self.article.find("articleinfo"):
             articleinfo = self.article.find("articleinfo")
 
-            # Get copyright holder and year
+            # Copyright holder and year for article
             if articleinfo.find("articlecopyright"):
                 copyright = articleinfo.find("articlecopyright")
                 copyright_holder = self._clean_output(copyright.get_text())
+                if copyright_holder == "":
+                    copyright_holder = "IEEE"
+
                 copyright_year = copyright.get("year", "")
-                copyright_statement = self._detag(
-                    articleinfo.find("article_copyright_statement").get_text(),
-                    self.HTML_TAGSET["license"],
-                )
+                if copyright_year == "0":
+                    copyright_year = self.year
+
+                # Sadly <article_copyright_statement> doesn't seem to exist in IEEE conference metadata
+                if articleinfo.find("article_copyright_statement"):
+                    copyright_statement = self._detag(
+                        articleinfo.find("article_copyright_statement").get_text(),
+                        self.HTML_TAGSET["license"],
+                    )
+                else:
+                    copyright_statement = ""
+
+            # Copyright holder and year for publication is in <copyrightgroup>
 
                 # Format copyright string
                 copyright_text = (
@@ -184,7 +280,9 @@ def _parse_keywords(self):
         # Parse IEEE keywords from keywordset elements
         keywords = []
 
-        # Handle both IEEE and IEEEFree keyword types
+        # Handle all keyword types in <articleinfo>
+        # IEEE and IEEEFree keywordtype
+        # DOE & PACS don't exist in this collection?
         for keywordset in self.article.find_all("keywordset"):
             keyword_type = keywordset.get("keywordtype", "")
 
@@ -196,11 +294,28 @@ def _parse_keywords(self):
                             "string": self._clean_output(keyword.string.strip()),
                         }
                     )
+
+        '''
+        # <pubtopicalbrowse> = topic browse categories in IEEE Xplore
+        # NOTE: Some values of <pubtopicalbrowse> contain commas
+        # How to deal with this in %K ?
+        # Get all pub-level topics in <publicationinfo>
+        for pubtopicset in self.publicationinfo.find_all("pubtopicalbrowseset"):
+            for pubtopic in pubtopicset.find_all("pubtopicalbrowse"):
+                keywords.append(
+                    {
+                        "system": "XploreTopic",
+                        "string": self._clean_output(pubtopic.string.strip()),
+                    }
+                )
+        '''
+
         if keywords:
             self.base_metadata["keywords"] = keywords
 
     def _parse_references(self):
-        # TODO: check if IEEE gives us references at all
+        # IEEE conferences do not provide references
+        # Check value of <articlereferenceflag>
         references = []
         if self.article.find("references"):
             for ref in self.article.find_all("reference"):
@@ -213,10 +328,8 @@ def _parse_references(self):
     def _parse_funding(self):
         funding = []
 
-        # Look for funding info in article metadata
-
         articleinfo = self.article.find("articleinfo")
-        # import pdb; pdb.set_trace()
+
         if articleinfo.find("fundrefgrp"):
             funding_sections = articleinfo.find("fundrefgrp").find_all("fundref", [])
 
@@ -228,7 +341,7 @@ def _parse_funding(self):
                 if funder_name:
                     funder.setdefault("agencyname", self._clean_output(funder_name.get_text()))
 
-                # Get award/grant numbers
+                # Get award/grant number(s)
                 award_nums = funding_section.find_all("grant_number")
                 if award_nums:
                     # Join multiple award numbers with comma if present
@@ -241,12 +354,11 @@ def _parse_funding(self):
         if funding:
             self.base_metadata["funding"] = funding
 
+
+    # Parse IEEE XML into standard JSON format
+    # :param text: string, contents of XML file
+    # :return: parsed file contents in JSON format
     def parse(self, text):
-        """
-        Parse IEEE XML into standard JSON format
-        :param text: string, contents of XML file
-        :return: parsed file contents in JSON format
-        """
         try:
             d = self.bsstrtodict(text, parser="lxml-xml")
         except Exception as err:
@@ -270,7 +382,7 @@ def parse(self, text):
         self._parse_permissions()
         self._parse_authors()
         self._parse_keywords()
-        self._parse_references()
+        #self._parse_references()
         self._parse_funding()
 
         output = self.format(self.base_metadata, format="IEEE")
diff --git a/adsingestp/parsers/ieeeOrig.py b/adsingestp/parsers/ieeeOrig.py
new file mode 100644
index 00000000..031db214
--- /dev/null
+++ b/adsingestp/parsers/ieeeOrig.py
@@ -0,0 +1,278 @@
+import logging
+import re
+
+from adsingestp import utils
+from adsingestp.ingest_exceptions import XmlLoadException
+from adsingestp.parsers.base import BaseBeautifulSoupParser
+
+logger = logging.getLogger(__name__)
+
+orcid_format = re.compile(r"(\d{4}-){3}\d{3}(\d|X)")
+
+
+class IEEEParser(BaseBeautifulSoupParser):
+    def __init__(self):
+        """Start with empty metadata and no XML sections located yet."""
+        super(BaseBeautifulSoupParser, self).__init__()
+        # The section handles stay None until parse() finds them in a document
+        self.publication = self.publicationinfo = None
+        self.volumeinfo = self.article = None
+        self.base_metadata = {}
+
+    def _parse_ids(self):
+        """Collect ISSNs, the article DOI and the publication-level DOI."""
+        ids = self.base_metadata["ids"] = {}
+
+        # Each ISSN is kept as a (mediatype, value) tuple, in document order
+        self.base_metadata["issn"] = [
+            (issn["mediatype"], issn.get_text())
+            for issn in self.publicationinfo.find_all("issn")
+        ]
+
+        article_doi = self.article.find("doi")
+        if article_doi:
+            ids["doi"] = article_doi.get_text()
+        # "pub-id" always exists; it stays empty without a <publicationdoi>
+        pub_ids = ids["pub-id"] = []
+        pub_doi = self.publicationinfo.find("publicationdoi")
+        if pub_doi:
+            pub_ids.append({"attribute": "doi", "Identifier": pub_doi.get_text()})
+
+    def _parse_pub(self):
+        """Record title, volume and issue; absent elements are skipped instead of raising."""
+        t = self.publication.find("title")
+        if t:
+            title = self._detag(t, self.HTML_TAGSET["title"]).strip()
+            self.base_metadata["publication"] = self._clean_output(title)
+        # <issuenum> is matched anywhere under <volumeinfo>, not only via <issue>
+        for key, tag in (("volume", "volumenum"), ("issue", "issuenum")):
+            if self.volumeinfo and self.volumeinfo.find(tag):
+                self.base_metadata[key] = self.volumeinfo.find(tag).get_text()
+
+    def _parse_page(self):
+        """Store the article's first and last page from <artpagenums>, when given."""
+        pagenums = self.article.find("artpagenums")
+        if pagenums:
+            for key, attr in (("page_first", "startpage"), ("page_last", "endpage")):
+                value = pagenums.get(attr)
+                # A missing attribute is skipped rather than handing None to _detag
+                if value is not None:
+                    self.base_metadata[key] = self._detag(value, [])
+
+    def _parse_pubdate(self):
+        """
+        Record the print and electronic publication dates of the article.
+
+        Each <date> is rendered as "year-month-day": a missing year becomes
+        "0000", a missing month or day "00". Dates with datetype "OriginalPub"
+        go to "pubdate_print", those with "ePub" to "pubdate_electronic"; any
+        other date type is ignored.
+        """
+        targets = {"OriginalPub": "pubdate_print", "ePub": "pubdate_electronic"}
+
+        for date in self.article.find_all("date"):
+            year_tag = date.find("year")
+            year = year_tag.get_text() if year_tag else "0000"
+
+            # Fix: the fallback was the no-op comparison `month_raw == "00"`, so
+            # `month` stayed unbound (or stale from the previous <date>)
+            month = "00"
+            month_tag = date.find("month")
+            if month_tag:
+                month_raw = month_tag.get_text()
+                if month_raw.isdigit():
+                    month = month_raw
+                else:
+                    # Month names are looked up by their first three letters
+                    month = utils.MONTH_TO_NUMBER[month_raw[0:3].lower()]
+
+            day_tag = date.find("day")
+            day = day_tag.get_text() if day_tag else "00"
+
+            # Only the two date types listed in `targets` are stored
+            key = targets.get(date.get("datetype", ""))
+            if key:
+                self.base_metadata[key] = year + "-" + month + "-" + day
+
+    def _parse_title_abstract(self):
+        """Store the cleaned article title and its "Regular" abstract, when present."""
+        title_tag = self.article.find("title")
+        if title_tag:
+            raw_title = self._detag(title_tag, self.HTML_TAGSET["title"])
+            self.base_metadata["title"] = self._clean_output(raw_title.strip())
+
+        articleinfo = self.article.find("articleinfo")
+        for abstract in articleinfo.find_all("abstract") if articleinfo else []:
+            # Other abstract types are skipped; the last "Regular" one wins
+            if abstract.get("abstracttype") != "Regular":
+                continue
+            raw_abstract = self._detag(abstract, self.HTML_TAGSET["abstract"])
+            self.base_metadata["abstract"] = self._clean_output(raw_abstract.strip())
+
+    def _parse_permissions(self):
+        """
+        Record the copyright string and the open-access flag from <articleinfo>.
+
+        "copyright" is built as "<year> <holder>. <statement>"; "open" is set to
+        True under "openAccess" only when <articleopenaccess> contains "T".
+        """
+        articleinfo = self.article.find("articleinfo")
+        if not articleinfo:
+            return
+        copyright_tag = articleinfo.find("articlecopyright")
+        if copyright_tag:
+            holder = self._clean_output(copyright_tag.get_text())
+            year = copyright_tag.get("year", "")
+            # <article_copyright_statement> may be absent; without this guard
+            # such a record raised AttributeError on None.get_text()
+            statement_tag = articleinfo.find("article_copyright_statement")
+            statement = ""
+            if statement_tag:
+                statement = self._detag(statement_tag.get_text(), self.HTML_TAGSET["license"])
+            self.base_metadata["copyright"] = year + " " + holder + ". " + statement
+
+        open_access = articleinfo.find("articleopenaccess")
+        if open_access and open_access.get_text() == "T":
+            self.base_metadata.setdefault("openAccess", {}).setdefault("open", True)
+
+    def _parse_authors(self):
+        """
+        Build base_metadata["authors"] from the <author> entries of <authorgroup>.
+
+        Every author becomes a dict holding whichever of "given", "surname",
+        "aff"/"xaff", "email", "orcid" and "corresp" the record provides. Nothing
+        is stored when <articleinfo> or <authorgroup> is missing or has no authors.
+        """
+        articleinfo = self.article.find("articleinfo")
+        authorgroup = articleinfo.find("authorgroup") if articleinfo else None
+        if not authorgroup:
+            return
+
+        authors = []
+        for author in authorgroup.find_all("author"):
+            entry = {}
+            # Name parts, affiliation and email all go through _clean_output
+            given, surname = author.find("firstname"), author.find("surname")
+            if given:
+                entry["given"] = self._clean_output(given.get_text())
+            if surname:
+                entry["surname"] = self._clean_output(surname.get_text())
+
+            affiliation = author.find("affiliation")
+            if affiliation:
+                # Only the first <affiliation> is used; "xaff" is left empty
+                entry["aff"] = [self._clean_output(affiliation.get_text())]
+                entry["xaff"] = []
+
+            email = author.find("email")
+            if email:
+                entry["email"] = self._clean_output(email.get_text())
+
+            # The ORCID is stored exactly as given in the XML
+            orcid = author.find("orcid")
+            if orcid:
+                entry["orcid"] = orcid.get_text()
+
+            if author.get("role") == "corresponding":
+                entry["corresp"] = True
+            authors.append(entry)
+
+        if authors:
+            self.base_metadata["authors"] = authors
+
+    def _parse_keywords(self):
+        """
+        Gather keywords from every <keywordset> of the article.
+
+        Each entry pairs the set's "keywordtype" attribute ("system") with the
+        cleaned term text ("string"); terms without a .string are dropped.
+        """
+        keywords = []
+        for kw_set in self.article.find_all("keywordset"):
+            system = kw_set.get("keywordtype", "")
+            for term in kw_set.find_all("keywordterm"):
+                if not term.string:
+                    continue
+                text = self._clean_output(term.string.strip())
+                keywords.append({"system": system, "string": text})
+
+        if keywords:
+            self.base_metadata["keywords"] = keywords
+
+    def _parse_references(self):
+        """Store every <reference> as one line of raw XML, if <references> exists."""
+        # TODO: check if IEEE gives us references at all
+        if not self.article.find("references"):
+            return
+        # Raw XML is kept for the reference service to parse later
+        self.base_metadata["references"] = [
+            str(ref.extract()).replace("\n", " ").replace("\xa0", " ")
+            for ref in self.article.find_all("reference")
+        ]
+
+    def _parse_funding(self):
+        """
+        Record funders and award numbers from <fundrefgrp> in <articleinfo>.
+
+        Each <fundref> yields a dict with "agencyname" (from <funder_name>) and/or
+        "awardnumber" (all <grant_number> values joined with ", "). Entries with
+        neither are dropped, and "funding" is only set when something was found.
+        """
+        # <articleinfo> is guarded like in the sibling _parse_* methods; it used
+        # to be dereferenced unconditionally and could raise AttributeError
+        articleinfo = self.article.find("articleinfo")
+        fundrefgrp = articleinfo.find("fundrefgrp") if articleinfo else None
+        if not fundrefgrp:
+            return
+
+        funding = []
+        for fundref in fundrefgrp.find_all("fundref"):
+            funder = {}
+            funder_name = fundref.find("funder_name")
+            if funder_name:
+                funder["agencyname"] = self._clean_output(funder_name.get_text())
+
+            awards = [self._clean_output(a.get_text()) for a in fundref.find_all("grant_number")]
+            if awards:
+                funder["awardnumber"] = ", ".join(awards)
+            if funder:
+                funding.append(funder)
+
+        if funding:
+            self.base_metadata["funding"] = funding
+
+    def parse(self, text):
+        """
+        Parse IEEE XML into standard JSON format
+        :param text: string, contents of XML file
+        :return: parsed file contents in JSON format
+        :raises XmlLoadException: if bsstrtodict fails on the text
+        """
+        try:
+            d = self.bsstrtodict(text, parser="lxml-xml")
+        except Exception as err:
+            raise XmlLoadException(err)
+
+        # Locate the document sections that the _parse_* steps read from
+        publication = d.find("publication", None)
+        if publication:
+            self.publication = publication
+            publicationinfo = publication.find("publicationinfo", None)
+            if publicationinfo:
+                self.publicationinfo = publicationinfo
+            volume = publication.find("volume", None)
+            if volume:
+                self.volumeinfo = volume.find("volumeinfo", None)
+                self.article = volume.find("article", None)
+
+        self._parse_ids()
+        self._parse_pub()
+        self._parse_page()
+        self._parse_pubdate()
+        self._parse_title_abstract()
+        self._parse_permissions()
+        self._parse_authors()
+        self._parse_keywords()
+        self._parse_references()
+        self._parse_funding()
+        return self.format(self.base_metadata, format="IEEE")