Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Reads flat/classic files with (mostly) non-bibliographic data and
sends nonbib and metrics protobufs to master pipeline.

Note: although these files are sorted case-insensitively in the Classic back office, the local copies created in this pipeline's container are sorted case-sensitively. That conversion is not part of the code in this GitHub repo; it lives in the repository used to build and deploy the containers.

# Overview
There are ~30 input files. Each row in every file begins with a bibcode. It is followed by a tab character and then
data from that bibcode.
Expand Down
2 changes: 2 additions & 0 deletions adsdata/file_defs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
# use dict to hold each input file and its properties and idiosyncrasies

data_files = OrderedDict()
data_files['mention'] = {'path': 'links/mention/all.links', 'default_value': [], 'multiline': True}
data_files['credit'] = {'path': 'links/credit/all.links', 'default_value': [], 'multiline': True}
data_files['canonical'] = {'path': 'bibcodes.list.can', 'default_value': ''}
data_files['author'] = {'path': 'links/facet_authors/all.links', 'default_value': []}
data_files['bibgroup'] = {'path': 'links/bibgroups/all.links', 'default_value': [], 'multiline': True}
Expand Down
4 changes: 4 additions & 0 deletions adsdata/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,12 @@ def _convert(self, passed):
def _add_citation_count_fields(self, return_value, original):
author_count = len(original.get('author', ()))
citation_count = len(return_value.get('citation', ()))
mention_count = len(return_value.get('mention', ()))
credit_count = len(return_value.get('credit', ()))
return_value['citation_count'] = citation_count
return_value['citation_count_norm'] = citation_count / float(max(author_count, 1))
return_value['mention_count'] = mention_count
return_value['credit_count'] = credit_count

def _add_refereed_property(self, return_value):
if'REFEREED' not in return_value['property']:
Expand Down
11 changes: 9 additions & 2 deletions adsdata/reader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

from adsdata import tasks
from adsputils import load_config

Expand Down Expand Up @@ -82,6 +81,11 @@ def read_value_for(self, bibcode):
# next, skip over lines in file until we:
# either find the passed bibcode or determine it isn't in the file
skip_count = 0
# If you are wondering why this works: while the index files in the Classic
# Back Office are sorted case INsensitively, their sorting is converted to
# case sensitive when local copies are created in the pipeline container
# (see the script "copy_input_files.sh" located in the "root/app/bin" directory
# of the image for backoffice-data-pipeline in BeeHive).
while len(current_line) != 0 and self._get_bibcode(current_line) < bibcode:
current_line = self._readline()
skip_count = skip_count + 1
Expand All @@ -95,18 +99,21 @@ def read_value_for(self, bibcode):

if isinstance(self.file_info['default_value'], bool):
return self._convert_value(True) # boolean files only hold bibcodes, all values are True

# at this point, we have the first line with the bibcode in it
# roll up possible other values on adjacent lines in file
value = []
if 'gpn/' in self.file_info['path'] or 'uat/' in self.file_info['path']:
value.append("/".join(self._get_rest(current_line).split("\t")))
elif 'mention/' in self.file_info['path'] or 'credit/' in self.file_info['path']:
value.append(self._get_rest(current_line).split("\t")[0])
else:
value.append(self._get_rest(current_line))
current_line = self._readline()
while self.file_info.get('multiline', False) and (current_line not in [None, '']) and (bibcode == self._get_bibcode(current_line)):
if 'gpn/' in self.file_info['path'] or 'uat/' in self.file_info['path']:
value.append("/".join(self._get_rest(current_line).split("\t")))
elif 'mention/' in self.file_info['path'] or 'credit/' in self.file_info['path']:
value.append(self._get_rest(current_line).split("\t")[0])
else:
value.append(self._get_rest(current_line))
current_line = self._readline()
Expand Down
5,167 changes: 5,167 additions & 0 deletions adsdata/tests/data1/config/links/credit/all.links

Large diffs are not rendered by default.

5,166 changes: 5,166 additions & 0 deletions adsdata/tests/data1/config/links/mention/all.links

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion adsdata/tests/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def test_nonbib_record(self):
"data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']},
{"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']},
{"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}],
"esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0}
"esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'credit_count': 0, 'mention': ['2020xxxx.soft.....X', '2021yyyy.soft.....Y'], 'mention_count': 2,'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0}
self.assertEqual(a, n)

d = processor._read_next_bibcode('2004MNRAS.354L..31M')
Expand All @@ -114,6 +114,10 @@ def test_nonbib_record(self):
"data": ["CDS:1", "NED:1953", "SIMBAD:1", "Vizier:1"],
"citation_count_norm": 49.5,
"citation_count": 99,
"credit": ["2001CoPhC.136..319S"],
"credit_count": 1,
"mention": ["2020xxxx.soft.....X"],
"mention_count": 1,
"property": ["ADS_OPENACCESS", "ARTICLE", "ASSOCIATED", "DATA", "EPRINT_OPENACCESS", "ESOURCE", "INSPIRE", "OPENACCESS", "PUB_OPENACCESS", "REFEREED"],
"total_link_counts": 1956,
"esource": ["ADS_PDF", "ADS_SCAN", "EPRINT_HTML", "EPRINT_PDF", "PUB_HTML", "PUB_PDF"],
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
adsputils==1.5.5
adsputils==1.5.11