def _get_master_nonbib_dict(self):
    """Return a fresh, empty template for the new nonbib protobuf structure.

    Every call builds new containers, so a caller may fill the returned
    template in place without affecting the template handed to any other
    caller.

    Returns:
        dict: ``{"identifier": [], "links": {...}}`` where ``links`` holds

        - ``ARXIV`` and ``DOI``: empty lists,
        - ``DATA`` and ``ESOURCE``: empty dicts,
        - ``ASSOCIATED``, ``INSPIRE``, ``LIBRARYCATALOG`` and
          ``PRESENTATION``: ``{"url": [], "title": [], "count": 0}``,
        - ``ABSTRACT``, ``CITATIONS``, ``GRAPHICS``, ``METRICS``,
          ``OPENURL``, ``REFERENCES``, ``TOC`` and ``COREAD``: ``False``.
    """
    # NOTE(review): "identifier", "ARXIV", "DOI" and the boolean flags were
    # tagged "Master Pipeline" in the hand-written template; presumably they
    # are filled in downstream rather than here -- confirm.
    record_link_types = (
        "ASSOCIATED",
        "INSPIRE",
        "LIBRARYCATALOG",
        "PRESENTATION",
    )
    flag_link_types = (
        "ABSTRACT",
        "CITATIONS",
        "GRAPHICS",
        "METRICS",
        "OPENURL",
        "REFERENCES",
        "TOC",
        "COREAD",
    )

    links = {"ARXIV": [], "DOI": [], "DATA": {}, "ESOURCE": {}}
    for name in record_link_types:
        # One new record per link type: sharing a single dict would make
        # urls added for one type show up under every other type.
        links[name] = {"url": [], "title": [], "count": 0}
    links.update(dict.fromkeys(flag_link_types, False))

    return {"identifier": [], "links": links}
nonbib protobuf. + + Data links values are read from separate files and merged into one field. + The method handles: + - Data link processing and merging + - Property aggregation + - Field summarization and copying + - Computed field generation + - Cleanup of unused fields + + Args: + passed (dict): Raw data dictionary containing all input fields + + Returns: + dict: Processed data ready for nonbib protobuf """ - return_value = {} - return_value['data_links_rows'] = [] - return_value['property'] = set() - return_value['esource'] = set() + # Initialize return structure + return_value = { + "data_links_rows": [], + "property": set(), + "esource": set() + } + for filetype, value in passed.items(): - file_properties = self.data_dict[filetype] #data_files[filetype] + file_properties = self.data_dict[filetype] + default_value = file_properties.get('default_value') + extra_values = file_properties.get('extra_values', {}) + + # Handle special cases first if filetype == 'canonical': return_value['bibcode'] = passed['canonical'] - if (value is dict and dict and 'property' in value[filetype]): - return_value['property'].update(value[filetype]['property']) - if (type(file_properties['default_value']) is bool): + continue + + if filetype == 'relevance': + return_value.update(passed[filetype]) + continue + + # Handle boolean fields and TOC + if isinstance(default_value, bool): return_value[filetype] = value[filetype] value = value[filetype] - if ('extra_values' in file_properties and 'link_type' in file_properties['extra_values'] and value != file_properties['default_value']): - # here with one or more real datalinks value(s) - # add each data links dict to existing list of dicts - # tweak some values (e.g., sub_link_type) in original dict - if type(value) is bool or type(value) is dict: - d = self._convert_data_link(filetype, value) - return_value['data_links_rows'].append(d) - elif type(value) is list: - for v in value: - d = self._convert_data_link(filetype, v) - 
return_value['data_links_rows'].append(d) + + # Process data links + if 'link_type' in extra_values and value != default_value: + # Convert and add data links + if isinstance(value, (bool, dict)): + return_value['data_links_rows'].append( + self._convert_data_link(filetype, value)) + elif isinstance(value, list): + return_value['data_links_rows'].extend( + self._convert_data_link(filetype, v) for v in value) else: - self.logger.error('serious error in process._convert with {} {} {}'.format(filetype, type(value), value)) - - if file_properties['extra_values']['link_type'] == 'ESOURCE': - return_value['esource'].add(file_properties['extra_values']['link_sub_type']) - return_value['property'].add(file_properties['extra_values']['link_type']) - return_value['property'].update(file_properties['extra_values'].get('property', [])) - elif ('extra_values' in file_properties and value != file_properties['default_value']): - if 'property' in file_properties['extra_values']: - return_value['property'].update(file_properties['extra_values']['property']) - - elif value != file_properties['default_value'] or file_properties.get('copy_default', False): - # otherwise, copy value + self.logger.error( + f'serious error in process._convert with {filetype} {type(value)} {value}') + continue + + # Update esource and properties + link_type = extra_values['link_type'] + if link_type == 'ESOURCE': + return_value['esource'].add(extra_values['link_sub_type']) + return_value['property'].add(link_type) + return_value['property'].update(extra_values.get('property', [])) + + # Handle properties + elif extra_values and value != default_value: + if 'property' in extra_values: + return_value['property'].update(extra_values['property']) + + # Copy remaining fields if needed + elif value != default_value or file_properties.get('copy_default', False): return_value[filetype] = passed[filetype] - if filetype == 'relevance': - for k in passed[filetype]: - # simply add all dict value to top level - 
return_value[k] = passed[filetype][k] - + + # Add computed properties self._add_refereed_property(return_value) self._add_article_property(return_value, passed) + self._add_data_summary(return_value) + self._add_citation_count_fields(return_value, passed) + + # Sort sets return_value['property'] = sorted(return_value['property']) return_value['esource'] = sorted(return_value['esource']) - self._add_data_summary(return_value) + + # Merge and process data links return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows']) - self._add_citation_count_fields(return_value, passed) - # time for computed fields - for k, v in computed_fields.items(): - f = getattr(self, v['converter_function'], None) - if f is None: - self.logger.error('serious error in process._covert, expected converter_function {} for field {} not found'.format(v['converter_function'], k)) + master_template = self._get_master_nonbib_dict() + + # Populate the new protobuf structure with link data + self._populate_new_links_structure(return_value['data_links_rows'], master_template) + + # Add computed fields + for field_name, field_config in computed_fields.items(): + converter = getattr(self, field_config['converter_function'], None) + if converter: + return_value.update(converter(return_value)) else: - x = f(return_value) - return_value.update(x) - - # finally, delete the keys not in the nonbib protobuf - not_needed = ['author', 'canonical', 'citation', 'deleted', 'deprecated_citation_count', 'doi', 'download', 'item_count', 'nonarticle', - 'ocrabstract', 'preprint', 'private', 'pub_openaccess', 'pub2arxiv', - 'reads', 'refereed', 'relevance', 'toc'] - for n in not_needed: - return_value.pop(n, None) + self.logger.error( + f'serious error in process._convert, expected converter_function ' + f'{field_config["converter_function"]} for field {field_name} not found') + + # Remove unused fields + unused_fields = { + 'author', 'canonical', 'citation', 'deleted', 
def _convert_data_link(self, filetype, value):
    """Convert one raw data-link value into a flat data-links row.

    Args:
        filetype: Key into ``self.data_dict``. The entry's ``extra_values``
            must provide ``link_type`` and may provide a default
            ``link_sub_type``.
        value: Either a bool (the link exists but carries no details) or a
            dict that may hold ``url``, ``title``, ``item_count``,
            ``link_sub_type`` and ``subparts``.

    Returns:
        dict: A row with the keys ``link_type``, ``link_sub_type``, ``url``
        (list), ``title`` (list) and ``item_count``. For a bool, and for a
        value of any unexpected type (which is also logged as an error),
        ``url`` and ``title`` are ``['']`` and ``item_count`` is ``0``.

    Raises:
        KeyError: If ``filetype`` is unknown or its ``extra_values`` has no
            ``link_type``.
    """
    # Lazy %-style arguments: the message is only rendered when debug
    # logging is actually enabled.
    self.logger.debug('Converting data link: %s', value)

    extra_values = self.data_dict[filetype]['extra_values']
    details = value if isinstance(value, dict) else {}

    # A sub type carried by the row itself is more specific than the
    # per-file default, so it wins; the default is only the fallback.
    if 'link_sub_type' in details:
        link_sub_type = details['link_sub_type']
    else:
        link_sub_type = extra_values.get('link_sub_type', '')

    # Append " <item_count>" only when the sub parts really carry a count.
    # The separating space must survive, and a missing count (or sub parts
    # that are not a dict) must leave the sub type untouched.
    subparts = details.get('subparts')
    if isinstance(subparts, dict) and 'item_count' in subparts:
        link_sub_type += ' ' + str(subparts['item_count'])

    link_data = {
        'link_type': extra_values['link_type'],
        'link_sub_type': link_sub_type,
        'url': [''],
        'title': [''],
        'item_count': 0,
    }

    if isinstance(value, dict):
        link_data['url'] = value.get('url', [''])
        link_data['title'] = value.get('title', [''])
        link_data['item_count'] = value.get('item_count', 0)

        self.logger.debug('Link data before conversion: %s', link_data)
        # url and title are always lists in a row; wrap bare strings.
        for key in ('url', 'title'):
            if isinstance(link_data[key], str):
                link_data[key] = [link_data[key]]
        self.logger.debug('Link data after conversion: %s', link_data)
    elif not isinstance(value, bool):
        # Best effort: report the problem but still return the placeholder
        # row so one bad value does not abort the whole record.
        self.logger.error(
            'Serious error in process.convert_data_link: unexpected type for value, filetype = %s, '
            'value = %s, type of value = %s',
            filetype, value, type(value),
        )

    self.logger.debug('Converted data link: %s', link_data)
    return link_data
master_template['links'][mapped_type]['count'] = row['item_count'] + self.logger.debug('Populated new links structure: {}'.format(master_template)) + return master_template \ No newline at end of file diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py index 3cbb484..080c349 100644 --- a/adsdata/tests/test_process.py +++ b/adsdata/tests/test_process.py @@ -1,4 +1,3 @@ - import unittest from mock import patch, mock_open from datetime import datetime @@ -85,45 +84,60 @@ def test_nonbib_record(self): with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): d = processor._read_next_bibcode('2003ASPC..295..361M') n = processor._convert(d) - a = {"read_count": 4, "bibcode": "2003ASPC..295..361M", - 'bibgroup': ['Chandra Technical'], 'bibgroup_facet': ['Chandra Technical'], - "data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']}, - {"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']}, - {"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}], - "esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'reference_count': 0, 'credit_count': 0, 'mention': ['2020xxxx.soft.....X', '2021yyyy.soft.....Y'], 'mention_count': 2,'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0} + a = {'property': ['ADS_OPENACCESS', 'ARTICLE', 'ESOURCE', 'NOT REFEREED', 'OPENACCESS', 'TOC'], 'esource': ['ADS_PDF', 'ADS_SCAN'], + 'bibcode': '2003ASPC..295..361M', 'bibgroup': ['Chandra Technical'], 'boost': 0.15, 'reference_count': 0, 'credit_count': 0, 'mention': ['2020xxxx.soft.....X', 
'2021yyyy.soft.....Y'], 'mention_count': 2,'read_count': 4, 'norm_cites': 0, 'data': [], + 'total_link_counts': 0, 'citation_count': 0, 'citation_count_norm': 0.0, + 'bibgroup_facet': ['Chandra Technical'], 'identifier': [], + 'links': {'ARXIV': [], 'DOI': [], 'DATA': {}, + 'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'count': 0}, + 'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M'], 'title': [''], 'count': 0}}, + 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, + 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, + 'ABSTRACT': False, # Master Pipeline will set to True + 'CITATIONS': False, + 'GRAPHICS': False, # Master Pipeline will set to True + 'METRICS': False, + 'OPENURL': False, # Master Pipeline will set to True + 'REFERENCES': False, + 'TOC': False, + 'COREAD': False}} # Master Pipeline will set to True self.assertEqual(a, n) + self._validate_nonbib_structure(n) d = processor._read_next_bibcode('2004MNRAS.354L..31M') v = processor._convert(d) - a = {"bibcode": "2004MNRAS.354L..31M", - "simbad_objects": ["3253618 G"], - "read_count": 20, - "data_links_rows": [{"url": ["http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x"], "link_type": "ESOURCE", "link_sub_type": "PUB_HTML", 'item_count': 0, 'title': ['']}, - {"url": ["https://arxiv.org/abs/astro-ph/0405472"], "link_type": "ESOURCE", "link_sub_type": "EPRINT_HTML", 'item_count': 0, 'title': ['']}, - {"url": ["https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x"], "link_type": "ESOURCE", "link_sub_type": "PUB_PDF", 'item_count': 0, 'title': ['']}, - {"url": ["http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']}, - {"url": ["https://arxiv.org/pdf/astro-ph/0405472"], "link_type": 
"ESOURCE", "link_sub_type": "EPRINT_PDF", 'item_count': 0, 'title': ['']}, - {"url": ["http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']}, - {"url": ["2004MNRAS.354L..31M", "2005yCat..73549031M"], "title": ["Source Paper", "Catalog Description"], "link_type": "ASSOCIATED", "link_sub_type": "NA", 'item_count': 0}, - {"url": ["http://inspirehep.net/search?p=find+j+MNRAA,354,L31"], "link_type": "INSPIRE", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}, - {"url": ["http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31"], "item_count": 1, "link_type": "DATA", "link_sub_type": "CDS", 'title': ['']}, - {"url": ["https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M"], "title": ["NED Objects (1953)"], "item_count": 1953, "link_type": "DATA", "link_sub_type": "NED"}, - {"url": ["http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M"], "title": ["SIMBAD Objects (1)"], "item_count": 1, "link_type": "DATA", "link_sub_type": "SIMBAD"}, - {"url": ["http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31"], "item_count": 1, "link_type": "DATA", "link_sub_type": "Vizier", 'title': ['']}], - "norm_cites": 10000, - "data": ["CDS:1", "NED:1953", "SIMBAD:1", "Vizier:1"], - "citation_count_norm": 49.5, - "citation_count": 99, + a = {'property': ['ADS_OPENACCESS', 'ARTICLE', 'ASSOCIATED', 'DATA', 'EPRINT_OPENACCESS', 'ESOURCE', 'INSPIRE', 'OPENACCESS', 'PUB_OPENACCESS', 'REFEREED'], "reference": ["2004PhRvL..92q6804N", "1989TSF...171....5T"], "reference_count": 2, "credit": ["2001CoPhC.136..319S"], "credit_count": 1, "mention": ["2020xxxx.soft.....X"], "mention_count": 1, - "property": ["ADS_OPENACCESS", "ARTICLE", "ASSOCIATED", "DATA", "EPRINT_OPENACCESS", "ESOURCE", "INSPIRE", "OPENACCESS", "PUB_OPENACCESS", "REFEREED"], - "total_link_counts": 1956, - "esource": ["ADS_PDF", "ADS_SCAN", "EPRINT_HTML", "EPRINT_PDF", "PUB_HTML", "PUB_PDF"], - "boost": 
0.4399999976158142} + 'esource': ['ADS_PDF', 'ADS_SCAN', 'EPRINT_HTML', 'EPRINT_PDF', 'PUB_HTML', 'PUB_PDF'], + 'bibcode': '2004MNRAS.354L..31M', 'boost': 0.44, 'read_count': 20, 'norm_cites': 10000, + 'simbad_objects': ['3253618 G'], 'data': ['CDS:1', 'NED:1953', 'SIMBAD:1', 'Vizier:1'], + 'total_link_counts': 1956, 'citation_count': 99, 'citation_count_norm': 49.5, 'identifier': [], + 'links': {'ARXIV': [], 'DOI': [], 'DATA': {'CDS': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}, + 'NED': {'url': ['https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M'], 'title': ['NED Objects (1953)'], 'count': 1953}, + 'SIMBAD': {'url': ['http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M'], 'title': ['SIMBAD Objects (1)'], 'count': 1}, + 'Vizier': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}}, + 'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}, + 'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}, + 'PUB_HTML': {'url': ['http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, + 'EPRINT_HTML': {'url': ['https://arxiv.org/abs/astro-ph/0405472'], 'title': [''], 'count': 0}, + 'PUB_PDF': {'url': ['https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, + 'EPRINT_PDF': {'url': ['https://arxiv.org/pdf/astro-ph/0405472'], 'title': [''], 'count': 0}}, + 'ASSOCIATED': {'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'count': 0}, + 'INSPIRE': {'url': ['http://inspirehep.net/search?p=find+j+MNRAA,354,L31'], 'title': [''], 'count': 0}, 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, + 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, + 'ABSTRACT': False, # Master Pipeline will set to 
def _validate_nonbib_structure(self, record):
    """Assert that a converted nonbib record has the expected shape.

    Checks that the required top-level fields exist with the right types,
    that optional list fields are lists when present, and that
    ``record['links']`` has the hierarchical layout: list-valued ``ARXIV``
    and ``DOI``, sub-type keyed ``DATA`` and ``ESOURCE``, four direct link
    records and eight boolean flags.

    Args:
        record: The dict to check.

    Raises:
        AssertionError: Through the ``self.assert*`` helpers, on the first
            field that is missing or has the wrong type.
    """

    def check_int(value, message):
        # bool is a subclass of int, so isinstance(True, int) holds; a
        # count of True/False must still be rejected.
        self.assertIsInstance(value, int, message)
        self.assertNotIsInstance(value, bool, message)

    def check_link_record(link_record, label):
        # Shared shape of every link record: url list, title list, count.
        self.assertIsInstance(link_record, dict, f"Links {label} should be a dict")
        for key in ('url', 'title'):
            self.assertIn(key, link_record)
            self.assertIsInstance(link_record[key], list,
                                  f"Links {label}.{key} should be a list")
        self.assertIn('count', link_record)
        check_int(link_record['count'], f"Links {label}.count should be int")

    # Required string field
    self.assertIn('bibcode', record)
    self.assertIsInstance(record['bibcode'], str)

    # Required numeric fields
    numeric_fields = {
        'boost': float,
        'citation_count': int,
        'read_count': int,
        'total_link_counts': int,
        'norm_cites': int,
        'citation_count_norm': float,
    }
    for field, expected_type in numeric_fields.items():
        self.assertIn(field, record)
        message = f"Field {field} should be {expected_type.__name__}"
        if expected_type is int:
            check_int(record[field], message)
        else:
            self.assertIsInstance(record[field], expected_type, message)

    # Required array fields ('identifier' is a list, checked once, here)
    for field in ('property', 'esource', 'data', 'identifier'):
        self.assertIn(field, record)
        self.assertIsInstance(record[field], list,
                              f"Field {field} should be a list")

    # Optional array fields are only type-checked when present
    optional_array_fields = (
        'simbad_objects',
        'grants',
        'readers',
        'reference',
        'ned_objects',
        'bibgroup',
        'bibgroup_facet',
        'gpn',
        'uat',
    )
    for field in optional_array_fields:
        if field in record:
            self.assertIsInstance(record[field], list,
                                  f"Field {field} should be a list")

    # Validate links structure
    self.assertIn('links', record)
    links = record['links']
    self.assertIsInstance(links, dict)

    # Direct link arrays
    for field in ('ARXIV', 'DOI'):
        self.assertIn(field, links)
        self.assertIsInstance(links[field], list)

    # Link types keyed by sub type; each sub type holds one link record
    for field in ('DATA', 'ESOURCE'):
        self.assertIn(field, links)
        self.assertIsInstance(links[field], dict)
        for subtype, link_record in links[field].items():
            check_link_record(link_record, f"{field}.{subtype}")

    # Link types that hold a single link record directly
    for field in ('ASSOCIATED', 'INSPIRE', 'LIBRARYCATALOG', 'PRESENTATION'):
        self.assertIn(field, links)
        check_link_record(links[field], field)

    # Boolean flags
    boolean_flags = (
        'ABSTRACT', 'CITATIONS', 'GRAPHICS', 'METRICS',
        'OPENURL', 'REFERENCES', 'TOC', 'COREAD',
    )
    for field in boolean_flags:
        self.assertIn(field, links)
        self.assertIsInstance(links[field], bool, f"Links field {field} should be a boolean")
don't leak between bibcodes when processing sequentially""" + self.maxDiff = None + + with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): + # Process bibcode A - has ADS_PDF and ADS_SCAN esources + bibcode_a = '2003ASPC..295..361M' + d_a = processor._read_next_bibcode(bibcode_a) + result_a = processor._convert(d_a) + + # Verify A has only its own ESOURCE links + self.assertIn('ESOURCE', result_a['links']) + esource_a = result_a['links']['ESOURCE'] + self.assertIn('ADS_PDF', esource_a) + self.assertIn('ADS_SCAN', esource_a) + + # Store A's link counts for later comparison + ads_pdf_urls_a = list(esource_a['ADS_PDF']['url']) + ads_scan_urls_a = list(esource_a['ADS_SCAN']['url']) + + # Verify A has only one URL per link type (its own) + self.assertEqual(len(ads_pdf_urls_a), 1, + f"Bibcode A should have exactly 1 ADS_PDF URL, got {len(ads_pdf_urls_a)}") + self.assertEqual(len(ads_scan_urls_a), 1, + f"Bibcode A should have exactly 1 ADS_SCAN URL, got {len(ads_scan_urls_a)}") + + # Verify URLs contain the correct bibcode + self.assertIn(bibcode_a, ads_pdf_urls_a[0]) + self.assertIn(bibcode_a, ads_scan_urls_a[0]) + + # Now process bibcode B - has different esources (includes PUB_HTML, EPRINT_HTML, etc.) 
+ bibcode_b = '2004MNRAS.354L..31M' + d_b = processor._read_next_bibcode(bibcode_b) + result_b = processor._convert(d_b) + + # Verify B has its own ESOURCE links + self.assertIn('ESOURCE', result_b['links']) + esource_b = result_b['links']['ESOURCE'] + + # B should have ADS_PDF and ADS_SCAN (from its own data) + self.assertIn('ADS_PDF', esource_b) + self.assertIn('ADS_SCAN', esource_b) + + # B should have only its own URLs, NOT A's URLs + ads_pdf_urls_b = esource_b['ADS_PDF']['url'] + ads_scan_urls_b = esource_b['ADS_SCAN']['url'] + + # Check that B's URLs don't contain A's bibcode + for url in ads_pdf_urls_b: + self.assertNotIn(bibcode_a, url, + f"Bibcode B's ADS_PDF links leaked bibcode A's URL: {url}") + + for url in ads_scan_urls_b: + self.assertNotIn(bibcode_a, url, + f"Bibcode B's ADS_SCAN links leaked bibcode A's URL: {url}") + + # Verify B has its own bibcode in its URLs + b_pdf_has_own_bibcode = any(bibcode_b in url for url in ads_pdf_urls_b) + b_scan_has_own_bibcode = any(bibcode_b in url for url in ads_scan_urls_b) + + self.assertTrue(b_pdf_has_own_bibcode, + f"Bibcode B should have its own bibcode in ADS_PDF URLs") + self.assertTrue(b_scan_has_own_bibcode, + f"Bibcode B should have its own bibcode in ADS_SCAN URLs") + + # Also verify DATA links don't leak + # A has no DATA links, B has DATA links (CDS, NED, SIMBAD, Vizier) + data_a = result_a['links']['DATA'] + data_b = result_b['links']['DATA'] + + self.assertEqual(len(data_a), 0, "Bibcode A should have no DATA links") + self.assertGreater(len(data_b), 0, "Bibcode B should have DATA links") + + # Verify DATA subtypes in B + self.assertIn('CDS', data_b) + self.assertIn('NED', data_b) + self.assertIn('SIMBAD', data_b) + self.assertIn('Vizier', data_b) + + print(f"\n✅ Link leakage test passed!") + print(f" Bibcode A processed: {len(ads_pdf_urls_a)} ADS_PDF URLs, {len(ads_scan_urls_a)} ADS_SCAN URLs") + print(f" Bibcode B processed: {len(ads_pdf_urls_b)} ADS_PDF URLs, {len(ads_scan_urls_b)} ADS_SCAN URLs") 
+ print(f" No links from A leaked into B")