Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Reads flat/classic files with (mostly) non-bibliographic data and
sends nonbib and metrics protobufs to master pipeline.

Note: although these files are sorted case-insensitively in the Classic back office, the local copies created in this pipeline's container are sorted case-sensitively. That conversion is not part of the code in this GitHub repo; it lives in the repository used to build and deploy the containers.

# Overview
There are ~30 input files. Each row in every file begins with a bibcode. It is followed by a tab character and then
data from that bibcode.
Expand Down
2 changes: 2 additions & 0 deletions adsdata/file_defs.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
# use dict to hold each input file and its properties and idiosyncrasies

data_files = OrderedDict()
data_files['mention'] = {'path': 'links/mention/all.links', 'default_value': [], 'multiline': True}
data_files['credit'] = {'path': 'links/credit/all.links', 'default_value': [], 'multiline': True}
data_files['canonical'] = {'path': 'bibcodes.list.can', 'default_value': ''}
data_files['author'] = {'path': 'links/facet_authors/all.links', 'default_value': []}
data_files['bibgroup'] = {'path': 'links/bibgroups/all.links', 'default_value': [], 'multiline': True}
Expand Down
4 changes: 4 additions & 0 deletions adsdata/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,12 @@ def _convert(self, passed):
def _add_citation_count_fields(self, return_value, original):
author_count = len(original.get('author', ()))
citation_count = len(return_value.get('citation', ()))
mention_count = len(return_value.get('mention', ()))
credit_count = len(return_value.get('credit', ()))
return_value['citation_count'] = citation_count
return_value['citation_count_norm'] = citation_count / float(max(author_count, 1))
return_value['mention_count'] = mention_count
return_value['credit_count'] = credit_count

def _add_refereed_property(self, return_value):
if'REFEREED' not in return_value['property']:
Expand Down
11 changes: 9 additions & 2 deletions adsdata/reader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

from adsdata import tasks
from adsputils import load_config

Expand Down Expand Up @@ -82,6 +81,11 @@ def read_value_for(self, bibcode):
# next, skip over lines in file until we:
# either find the passed bibcode or determine it isn't in the file
skip_count = 0
# If you are wondering why this works: while the index files in the Classic
# Back Office are sorted case INsensitively, their sorting is converted to
# case sensitive when local copies are created in the pipeline container
# (see the script "copy_input_files.sh" located in the "root/app/bin" directory
# of the image for backoffice-data-pipeline in BeeHive).
while len(current_line) != 0 and self._get_bibcode(current_line) < bibcode:
current_line = self._readline()
skip_count = skip_count + 1
Expand All @@ -95,18 +99,21 @@ def read_value_for(self, bibcode):

if isinstance(self.file_info['default_value'], bool):
return self._convert_value(True) # boolean files only hold bibcodes, all values are True

# at this point, we have the first line with the bibcode in it
# roll up possible other values on adjacent lines in file
value = []
if 'gpn/' in self.file_info['path'] or 'uat/' in self.file_info['path']:
value.append("/".join(self._get_rest(current_line).split("\t")))
elif 'mention/' in self.file_info['path'] or 'credit/' in self.file_info['path']:
value.append(self._get_rest(current_line).split("\t")[0])
else:
value.append(self._get_rest(current_line))
current_line = self._readline()
while self.file_info.get('multiline', False) and (current_line not in [None, '']) and (bibcode == self._get_bibcode(current_line)):
if 'gpn/' in self.file_info['path'] or 'uat/' in self.file_info['path']:
value.append("/".join(self._get_rest(current_line).split("\t")))
elif 'mention/' in self.file_info['path'] or 'credit/' in self.file_info['path']:
value.append(self._get_rest(current_line).split("\t")[0])
else:
value.append(self._get_rest(current_line))
current_line = self._readline()
Expand Down
5,167 changes: 5,167 additions & 0 deletions adsdata/tests/data1/config/links/credit/all.links

Large diffs are not rendered by default.

5,166 changes: 5,166 additions & 0 deletions adsdata/tests/data1/config/links/mention/all.links

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion adsdata/tests/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def test_nonbib_record(self):
"data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']},
{"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']},
{"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}],
"esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0}
"esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'credit_count': 0, 'mention': ['2020xxxx.soft.....X', '2021yyyy.soft.....Y'], 'mention_count': 2,'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0}
self.assertEqual(a, n)

d = processor._read_next_bibcode('2004MNRAS.354L..31M')
Expand All @@ -114,6 +114,10 @@ def test_nonbib_record(self):
"data": ["CDS:1", "NED:1953", "SIMBAD:1", "Vizier:1"],
"citation_count_norm": 49.5,
"citation_count": 99,
"credit": ["2001CoPhC.136..319S"],
"credit_count": 1,
"mention": ["2020xxxx.soft.....X"],
"mention_count": 1,
"property": ["ADS_OPENACCESS", "ARTICLE", "ASSOCIATED", "DATA", "EPRINT_OPENACCESS", "ESOURCE", "INSPIRE", "OPENACCESS", "PUB_OPENACCESS", "REFEREED"],
"total_link_counts": 1956,
"esource": ["ADS_PDF", "ADS_SCAN", "EPRINT_HTML", "EPRINT_PDF", "PUB_HTML", "PUB_PDF"],
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
adsputils==1.5.5
adsputils==1.5.11