From 70edb847aba583d394f8ee8c8298a0eefd2a5c32 Mon Sep 17 00:00:00 2001 From: femalves Date: Tue, 25 Feb 2025 16:49:29 -0500 Subject: [PATCH 01/17] changed _convert_data_link --- adsdata/process.py | 100 ++++++++++++++++++++++++++++++++------------- 1 file changed, 72 insertions(+), 28 deletions(-) diff --git a/adsdata/process.py b/adsdata/process.py index 7e1874a..da00981 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -191,38 +191,82 @@ def _merge_data_links(self, datalinks): new_datalinks.append(first) return new_datalinks + # def _convert_data_link(self, filetype, value): + # """convert one data link row""" + # file_properties = self.data_dict[filetype] #data_files[filetype] + # d = {} + # d['link_type'] = file_properties['extra_values']['link_type'] + # link_sub_type_suffix = '' + # if value is dict and 'subparts' in value and 'item_count' in value['subparts']: + # link_sub_type_suffix = ' ' + str(value['subparts']['item_count']) + # if value is True: + # d['link_sub_type'] = file_properties['extra_values']['link_sub_type'] + link_sub_type_suffix + # elif 'link_sub_type' in value: + # d['link_sub_type'] = value['link_sub_type'] + link_sub_type_suffix + # elif 'link_sub_type' in file_properties['extra_values']: + # d['link_sub_type'] = file_properties['extra_values']['link_sub_type'] + link_sub_type_suffix + # if type(value) is bool: + # d['url'] = [''] + # d['title'] = [''] + # d['item_count'] = 0 + # elif type(value) is dict: + # d['url'] = value.get('url', ['']) + # if type(d['url']) is str: + # d['url'] = [d['url']] + # d['title'] = value.get('title', ['']) + # if type(d['title']) is str: + # d['title'] = [d['title']] + # # if d['title'] == ['']: + # # d.pop('title') # to match old pipeline + # d['item_count'] = value.get('item_count', 0) + # else: + # self.logger.error('serious error in process.convert_data_link: unexpected type for value, filetype = {}, value = {}, type of value = {}'.format(filetype, value, type(value))) + # breakpoint() # {'link_type': 'ESOURCE', 'link_sub_type': 'ADS_PDF', 'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'item_count': 0} + # return d + def _convert_data_link(self, filetype, value): """convert one data link row""" - file_properties = self.data_dict[filetype] #data_files[filetype] - d = {} - d['link_type'] = file_properties['extra_values']['link_type'] + + file_properties = self.data_dict[filetype] + + link_type = file_properties['extra_values']['link_type'] + link_sub_type = file_properties['extra_values'].get('link_sub_type', '') link_sub_type_suffix = '' - if value is dict and 'subparts' in value and 'item_count' in value['subparts']: - link_sub_type_suffix = ' ' + str(value['subparts']['item_count']) - if value is True: - d['link_sub_type'] = file_properties['extra_values']['link_sub_type'] + link_sub_type_suffix - elif 'link_sub_type' in value: - d['link_sub_type'] = value['link_sub_type'] + link_sub_type_suffix - elif 'link_sub_type' in file_properties['extra_values']: - d['link_sub_type'] = file_properties['extra_values']['link_sub_type'] + link_sub_type_suffix - if type(value) is bool: - d['url'] = [''] - d['title'] = [''] - d['item_count'] = 0 - elif type(value) is dict: - d['url'] = value.get('url', ['']) - if type(d['url']) is str: - d['url'] = [d['url']] - d['title'] = value.get('title', ['']) - if type(d['title']) is str: - d['title'] = [d['title']] - # if d['title'] == ['']: - # d.pop('title') # to match old pipeline - d['item_count'] = value.get('item_count', 0) - else: - self.logger.error('serious error in process.convert_data_link: unexpected type for value, filetype = {}, value = {}, type of value = {}'.format(filetype, value, type(value))) - return d + if isinstance(value, dict) and 'subparts' in value: + link_sub_type_suffix = f" {value['subparts'].get('item_count', '')}".strip() + + # Determine the link sub type + if not link_sub_type and isinstance(value, dict) and 'link_sub_type' in value: + link_sub_type = value['link_sub_type'] + + link_sub_type += link_sub_type_suffix + + # Initialize result dictionary + link_data = { 'link_type': link_type, + 'link_sub_type': link_sub_type, + "url": [""], + "title": [""], + "item_count": 0 + } + + + if isinstance(value, dict): + link_data['url'] = value.get('url', ['']) + link_data['title'] = value.get('title', ['']) + link_data['item_count'] = value.get('item_count', 0) + + if isinstance(link_data['url'], str): + link_data['url'] = [link_data['url']] + if isinstance(link_data['title'], str): + link_data['title'] = [link_data['title']] + + elif not isinstance(value, bool): + self.logger.error( + f"Serious error in process.convert_data_link: unexpected type for value, filetype = {filetype}, " + f"value = {value}, type of value = {type(value)}" + ) + return link_data def _read_next_bibcode(self, bibcode): """read all the info for the passed bibcode into a dict""" From 855675ccd3577b7cf22ec2030a90e17160f219c1 Mon Sep 17 00:00:00 2001 From: femalves Date: Wed, 26 Feb 2025 16:51:45 -0500 Subject: [PATCH 02/17] changed all complex fields --- adsdata/process.py | 255 ++++++++++++++++++++++++++++++++------------- 1 file changed, 180 insertions(+), 75 deletions(-) diff --git a/adsdata/process.py b/adsdata/process.py index da00981..2133ec4 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -19,6 +19,7 @@ def __init__(self, compute_metrics=True, compute_CC = False): self.data_dict = data_files self.logger = tasks.app.logger self.readers = {} + self.nonbib_dict = self._get_nonbib_dict() def __enter__(self): self._open_all() @@ -27,6 +28,30 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self._close_all() + def _get_nonbib_dict(self): + return { + "bibcode": '', # OK + "identifier": [""], # MP + "links": { + "ARXIV": [""], # MP + "DOI": [""], # MP + "DATA": {}, # OK + "ESOURCE": {}, # OK + "ASSOCIATED": {},# OK + "INSPIRE": {},# OK + "LIBRARYCATALOG": {},# OK + "PRESENTATION": {},# OK + "ABSTRACT": False, + "CITATIONS": False, # OK + "GRAPHICS": False, + "METRICS": False, + "OPENURL": False, # MP + "REFERENCES": False, #OK + "TOC": False, # OK + "COREAD": False # MP + } + } + def process_bibcodes(self, bibcodes): """send nonbib and metrics records to master for the passed bibcodes for each bibcode @@ -53,81 +78,161 @@ def process_bibcodes(self, bibcodes): if not self.compute_CC: tasks.task_output_nonbib.delay(nonbib_protos) tasks.task_output_metrics.delay(metrics_protos) - def _convert(self, passed): - """convert full nonbib dict to what is needed for nonbib protobuf - data links values are read from separate files so they are in separate dicts - they must be merged into one field for the protobuf - a couple fields are summarized - some other fields are just copied - some fields are deleted - """ - return_value = {} - return_value['data_links_rows'] = [] - return_value['property'] = set() - return_value['esource'] = set() - for filetype, value in passed.items(): - file_properties = self.data_dict[filetype] #data_files[filetype] - if filetype == 'canonical': - return_value['bibcode'] = passed['canonical'] - if (value is dict and dict and 'property' in value[filetype]): - return_value['property'].update(value[filetype]['property']) - if (type(file_properties['default_value']) is bool): - return_value[filetype] = value[filetype] - value = value[filetype] - if ('extra_values' in file_properties and 'link_type' in file_properties['extra_values'] and value != file_properties['default_value']): - # here with one or more real datalinks value(s) - # add each data links dict to existing list of dicts - # tweak some values (e.g., sub_link_type) in original dict - if type(value) is bool or type(value) is dict: - d = self._convert_data_link(filetype, value) - return_value['data_links_rows'].append(d) - elif type(value) is list: - for v in value: - d = self._convert_data_link(filetype, v) - return_value['data_links_rows'].append(d) - else: - self.logger.error('serious error in process._convert with {} {} {}'.format(filetype, type(value), value)) - - if file_properties['extra_values']['link_type'] == 'ESOURCE': - return_value['esource'].add(file_properties['extra_values']['link_sub_type']) - return_value['property'].add(file_properties['extra_values']['link_type']) - return_value['property'].update(file_properties['extra_values'].get('property', [])) - elif ('extra_values' in file_properties and value != file_properties['default_value']): - if 'property' in file_properties['extra_values']: - return_value['property'].update(file_properties['extra_values']['property']) - - elif value != file_properties['default_value'] or file_properties.get('copy_default', False): - # otherwise, copy value - return_value[filetype] = passed[filetype] - if filetype == 'relevance': - for k in passed[filetype]: - # simply add all dict value to top level - return_value[k] = passed[filetype][k] - - self._add_refereed_property(return_value) - self._add_article_property(return_value, passed) - return_value['property'] = sorted(return_value['property']) - return_value['esource'] = sorted(return_value['esource']) - self._add_data_summary(return_value) - return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows']) - self._add_citation_count_fields(return_value, passed) - - # time for computed fields - for k, v in computed_fields.items(): - f = getattr(self, v['converter_function'], None) - if f is None: - self.logger.error('serious error in process._covert, expected converter_function {} for field {} not found'.format(v['converter_function'], k)) + + # def _convert(self, passed:dict): + # """convert full nonbib dict to what is needed for nonbib protobuf + # data links values are read from separate files so they are in separate dicts + # they must be merged into one field for the protobuf + # a couple fields are summarized + # some other fields are just copied + # some fields are deleted + # """ + + + # self.nonbib_dict["bibcode"] = passed['canonical'] + # self.nonbib_dict["links"]["CITATIONS"] = len(passed['citation']) > 0 + # self.nonbib_dict["links"]["REFERENCES"] = len(passed['reference']) > 0 + + # for filetype, value in passed.items(): + # file_properties = self.data_dict[filetype] #data_files[filetype] + + # not_default_value = value != file_properties['default_value'] + # link_type = file_properties.get('extra_values', {}).get('link_type', '') + + # if ('extra_values' in file_properties and 'link_type' in file_properties['extra_values'] and value != file_properties['default_value']): + # if filetype.upper() == 'TOC': + # self.nonbib_dict['links']['TOC'] = True + # else: + + # self._handle_data_link(filetype, value) + + # breakpoint() + + + # self._add_article_property(return_value, passed) + # return_value['esource'] = sorted(return_value['esource']) + # self._add_data_summary(return_value) + # return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows']) + # self._add_citation_count_fields(return_value, passed) + + # # time for computed fields + # for k, v in computed_fields.items(): + # f = getattr(self, v['converter_function'], None) + # if f is None: + # self.logger.error('serious error in process._covert, expected converter_function {} for field {} not found'.format(v['converter_function'], k)) + # else: + # x = f(return_value) + # return_value.update(x) + + + + # return return_value + + + def _handle_data_link(self, filetype, value): + result = [] + if isinstance(value, dict): # ESOURCE, ASSOCIATED, LIBRARYCATALOG, PRESENTATION, INSPIRE + d = self._convert_data_link(filetype, value) + result.append(d) + elif isinstance(value, list): # DATA + for v in value: + d = self._convert_data_link(filetype, v) + result.append(d) + elif not isinstance(value, bool): + self.logger.error('serious error in process._convert with {} {} {}'.format(filetype, type(value), value)) + + for d in result: + link_type = d.get('link_type', '') + link_sub_type = d.get('link_sub_type', '') + + del d['link_type'] + del d['link_sub_type'] + + # {'link_type': 'ESOURCE', 'link_sub_type': 'ADS_PDF', 'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'item_count': 0} + # Not here anymore {'link_type': 'TOC', 'link_sub_type': 'NA', 'url': [''], 'title': [''], 'item_count': 0} + # {'link_type': 'ASSOCIATED', 'link_sub_type': 'NA', 'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'item_count': 0} + if link_type == 'ESOURCE' or link_type == 'DATA': + self.nonbib_dict['links'][link_type].update({link_sub_type: d}) else: - x = f(return_value) - return_value.update(x) - - # finally, delete the keys not in the nonbib protobuf - not_needed = ['author', 'canonical', 'citation', 'deleted', 'deprecated_citation_count', 'doi', 'download', 'item_count', 'nonarticle', - 'ocrabstract', 'preprint', 'private', 'pub_openaccess', 'pub2arxiv', - 'reads', 'refereed', 'relevance', 'toc'] - for n in not_needed: - return_value.pop(n, None) - return return_value + self.nonbib_dict['links'][link_type].update(d) + + + # def _convert(self, passed): + # """convert full nonbib dict to what is needed for nonbib protobuf + # data links values are read from separate files so they are in separate dicts + # they must be merged into one field for the protobuf + # a couple fields are summarized + # some other fields are just copied + # some fields are deleted + # """ + # return_value = {'data_links_rows': [], + # 'property': set(), + # "esource": set()} + + # for filetype, value in passed.items(): + # file_properties = self.data_dict[filetype] #data_files[filetype] + # if filetype == 'canonical': + # return_value['bibcode'] = passed['canonical'] + # if (value is dict and dict and 'property' in value[filetype]): + # return_value['property'].update(value[filetype]['property']) + # if (type(file_properties['default_value']) is bool): + # return_value[filetype] = value[filetype] + # value = value[filetype] + # if ('extra_values' in file_properties and 'link_type' in file_properties['extra_values'] and value != file_properties['default_value']): + # # here with one or more real datalinks value(s) + # # add each data links dict to existing list of dicts + # # tweak some values (e.g., sub_link_type) in original dict + + # if type(value) is bool or type(value) is dict: + # d = self._convert_data_link(filetype, value) + # return_value['data_links_rows'].append(d) + # elif type(value) is list: + # for v in value: + # d = self._convert_data_link(filetype, v) + # return_value['data_links_rows'].append(d) + # else: + # self.logger.error('serious error in process._convert with {} {} {}'.format(filetype, type(value), value)) + + # if file_properties['extra_values']['link_type'] == 'ESOURCE': + # return_value['esource'].add(file_properties['extra_values']['link_sub_type']) + # return_value['property'].add(file_properties['extra_values']['link_type']) + # return_value['property'].update(file_properties['extra_values'].get('property', [])) + # elif ('extra_values' in file_properties and value != file_properties['default_value']): + # if 'property' in file_properties['extra_values']: + # return_value['property'].update(file_properties['extra_values']['property']) + + # elif value != file_properties['default_value'] or file_properties.get('copy_default', False): + # # otherwise, copy value + # return_value[filetype] = passed[filetype] + # if filetype == 'relevance': + # for k in passed[filetype]: + # # simply add all dict value to top level + # return_value[k] = passed[filetype][k] + + # self._add_refereed_property(return_value) + # self._add_article_property(return_value, passed) + # return_value['property'] = sorted(return_value['property']) + # return_value['esource'] = sorted(return_value['esource']) + # self._add_data_summary(return_value) + # return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows']) + # self._add_citation_count_fields(return_value, passed) + + # # time for computed fields + # for k, v in computed_fields.items(): + # f = getattr(self, v['converter_function'], None) + # if f is None: + # self.logger.error('serious error in process._covert, expected converter_function {} for field {} not found'.format(v['converter_function'], k)) + # else: + # x = f(return_value) + # return_value.update(x) + + # # finally, delete the keys not in the nonbib protobuf + # not_needed = ['author', 'canonical', 'citation', 'deleted', 'deprecated_citation_count', 'doi', 'download', 'item_count', 'nonarticle', + # 'ocrabstract', 'preprint', 'private', 'pub_openaccess', 'pub2arxiv', + # 'reads', 'refereed', 'relevance', 'toc'] + # for n in not_needed: + # return_value.pop(n, None) + # return return_value def _add_citation_count_fields(self, return_value, original): author_count = len(original.get('author', ())) @@ -224,6 +329,7 @@ def _merge_data_links(self, datalinks): # breakpoint() # {'link_type': 'ESOURCE', 'link_sub_type': 'ADS_PDF', 'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'item_count': 0} # return d + #TODO: remove 'TOC' from here def _convert_data_link(self, filetype, value): """convert one data link row""" @@ -250,7 +356,6 @@ def _convert_data_link(self, filetype, value): "item_count": 0 } - if isinstance(value, dict): link_data['url'] = value.get('url', ['']) link_data['title'] = value.get('title', ['']) From 85b238ffb4c37db578f49dddd5ff88fb5791851b Mon Sep 17 00:00:00 2001 From: femalves Date: Tue, 11 Mar 2025 19:20:55 -0400 Subject: [PATCH 03/17] populate new protobuf --- adsdata/process.py | 425 ++++++++++++++++++++++----------------------- 1 file changed, 211 insertions(+), 214 deletions(-) diff --git a/adsdata/process.py b/adsdata/process.py index 2133ec4..7d953a5 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -1,4 +1,3 @@ - from datetime import datetime from collections import defaultdict @@ -19,7 +18,8 @@ def __init__(self, compute_metrics=True, compute_CC = False): self.data_dict = data_files self.logger = tasks.app.logger self.readers = {} - self.nonbib_dict = self._get_nonbib_dict() + self.new_protobuf_template = self._get_nonbib_dict() + def __enter__(self): self._open_all() @@ -28,30 +28,47 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self._close_all() - def _get_nonbib_dict(self): + def _get_nonbib_dict(self): + # Template for the new protobuf structure return { - "bibcode": '', # OK - "identifier": [""], # MP - "links": { - "ARXIV": [""], # MP - "DOI": [""], # MP - "DATA": {}, # OK - "ESOURCE": {}, # OK - "ASSOCIATED": {},# OK - "INSPIRE": {},# OK - "LIBRARYCATALOG": {},# OK - "PRESENTATION": {},# OK - "ABSTRACT": False, - "CITATIONS": False, # OK - "GRAPHICS": False, - "METRICS": False, - "OPENURL": False, # MP - "REFERENCES": False, #OK - "TOC": False, # OK - "COREAD": False # MP + "identifier": [], #MP + "links": { + "ARXIV": [], #MP + "DOI": [],#MP + "DATA": {}, + "ESOURCE": {}, + "ASSOCIATED": { + "url": [], + "title": [], + "count": 0 + }, + "INSPIRE": { + "url": [], + "title": [], + "count": 0 + }, + "LIBRARYCATALOG": { + "url": [], + "title": [], + "count": 0 + }, + "PRESENTATION": { + "url": [], + "title": [], + "count": 0 + }, + "ABSTRACT": False, #MP + "CITATIONS": False, + "GRAPHICS": False,#MP + "METRICS": False, #MP + "OPENURL": False, #MP + "REFERENCES": False, + "TOC": False, + "COREAD": False #MP } } - + + def process_bibcodes(self, bibcodes): """send nonbib and metrics records to master for the passed bibcodes for each bibcode @@ -78,165 +95,127 @@ def process_bibcodes(self, bibcodes): if not self.compute_CC: tasks.task_output_nonbib.delay(nonbib_protos) tasks.task_output_metrics.delay(metrics_protos) - - # def _convert(self, passed:dict): - # """convert full nonbib dict to what is needed for nonbib protobuf - # data links values are read from separate files so they are in separate dicts - # they must be merged into one field for the protobuf - # a couple fields are summarized - # some other fields are just copied - # some fields are deleted - # """ - - - # self.nonbib_dict["bibcode"] = passed['canonical'] - # self.nonbib_dict["links"]["CITATIONS"] = len(passed['citation']) > 0 - # self.nonbib_dict["links"]["REFERENCES"] = len(passed['reference']) > 0 - - # for filetype, value in passed.items(): - # file_properties = self.data_dict[filetype] #data_files[filetype] - # not_default_value = value != file_properties['default_value'] - # link_type = file_properties.get('extra_values', {}).get('link_type', '') - - # if ('extra_values' in file_properties and 'link_type' in file_properties['extra_values'] and value != file_properties['default_value']): - # if filetype.upper() == 'TOC': - # self.nonbib_dict['links']['TOC'] = True - # else: - - # self._handle_data_link(filetype, value) - - # breakpoint() - - - # self._add_article_property(return_value, passed) - # return_value['esource'] = sorted(return_value['esource']) - # self._add_data_summary(return_value) - # return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows']) - # self._add_citation_count_fields(return_value, passed) - - # # time for computed fields - # for k, v in computed_fields.items(): - # f = getattr(self, v['converter_function'], None) - # if f is None: - # self.logger.error('serious error in process._covert, expected converter_function {} for field {} not found'.format(v['converter_function'], k)) - # else: - # x = f(return_value) - # return_value.update(x) - + def _convert(self, passed): + """Convert full nonbib dict to what is needed for nonbib protobuf. + Data links values are read from separate files and merged into one field. + The method handles: + - Data link processing and merging + - Property aggregation + - Field summarization and copying + - Computed field generation + - Cleanup of unused fields - # return return_value - - - def _handle_data_link(self, filetype, value): - result = [] - if isinstance(value, dict): # ESOURCE, ASSOCIATED, LIBRARYCATALOG, PRESENTATION, INSPIRE - d = self._convert_data_link(filetype, value) - result.append(d) - elif isinstance(value, list): # DATA - for v in value: - d = self._convert_data_link(filetype, v) - result.append(d) - elif not isinstance(value, bool): - self.logger.error('serious error in process._convert with {} {} {}'.format(filetype, type(value), value)) + Args: + passed (dict): Raw data dictionary containing all input fields + + Returns: + dict: Processed data ready for nonbib protobuf + """ + # Initialize return structure + return_value = { + "data_links_rows": [], + "property": set(), + "esource": set() + } + + for filetype, value in passed.items(): + file_properties = self.data_dict[filetype] + default_value = file_properties.get('default_value') + extra_values = file_properties.get('extra_values', {}) + + # Handle special cases first + if filetype == 'canonical': + return_value['bibcode'] = passed['canonical'] + continue + + if filetype == 'relevance': + return_value.update(passed[filetype]) + continue - for d in result: - link_type = d.get('link_type', '') - link_sub_type = d.get('link_sub_type', '') - - del d['link_type'] - del d['link_sub_type'] + # Handle boolean fields and TOC + if isinstance(default_value, bool): + if filetype == 'toc': + self.new_protobuf_template['links']['TOC'] = value[filetype] + + return_value[filetype] = value[filetype] + value = value[filetype] - # {'link_type': 'ESOURCE', 'link_sub_type': 'ADS_PDF', 'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'item_count': 0} - # Not here anymore {'link_type': 'TOC', 'link_sub_type': 'NA', 'url': [''], 'title': [''], 'item_count': 0} - # {'link_type': 'ASSOCIATED', 'link_sub_type': 'NA', 'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'item_count': 0} - if link_type == 'ESOURCE' or link_type == 'DATA': - self.nonbib_dict['links'][link_type].update({link_sub_type: d}) + # Process data links + if 'link_type' in extra_values and value != default_value: + # Convert and add data links + if isinstance(value, (bool, dict)): + return_value['data_links_rows'].append( + self._convert_data_link(filetype, value)) + elif isinstance(value, list): + return_value['data_links_rows'].extend( + self._convert_data_link(filetype, v) for v in value) + else: + self.logger.error( + f'serious error in process._convert with {filetype} {type(value)} {value}') + continue + + # Update esource and properties + link_type = extra_values['link_type'] + if link_type == 'ESOURCE': + return_value['esource'].add(extra_values['link_sub_type']) + return_value['property'].add(link_type) + return_value['property'].update(extra_values.get('property', [])) + + # Handle properties + elif extra_values and value != default_value: + if 'property' in extra_values: + return_value['property'].update(extra_values['property']) + + # Copy remaining fields if needed + elif value != default_value or file_properties.get('copy_default', False): + return_value[filetype] = passed[filetype] + + # Add computed properties + self._add_refereed_property(return_value) + self._add_article_property(return_value, passed) + self._add_data_summary(return_value) + self._add_citation_count_fields(return_value, passed) + + # Sort sets + return_value['property'] = sorted(return_value['property']) + return_value['esource'] = sorted(return_value['esource']) + + # Merge and process data links + return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows']) + + # Populate the new protobuf structure with link data + self._populate_new_links_structure(return_value['data_links_rows']) + + # Populate the boolean flags + self._populate_link_flags(passed) + + # Add computed fields + for field_name, field_config in computed_fields.items(): + converter = getattr(self, field_config['converter_function'], None) + if converter: + return_value.update(converter(return_value)) else: - self.nonbib_dict['links'][link_type].update(d) - - - # def _convert(self, passed): - # """convert full nonbib dict to what is needed for nonbib protobuf - # data links values are read from separate files so they are in separate dicts - # they must be merged into one field for the protobuf - # a couple fields are summarized - # some other fields are just copied - # some fields are deleted - # """ - # return_value = {'data_links_rows': [], - # 'property': set(), - # "esource": set()} + self.logger.error( + f'serious error in process._convert, expected converter_function ' + f'{field_config["converter_function"]} for field {field_name} not found') - # for filetype, value in passed.items(): - # file_properties = self.data_dict[filetype] #data_files[filetype] - # if filetype == 'canonical': - # return_value['bibcode'] = passed['canonical'] - # if (value is dict and dict and 'property' in value[filetype]): - # return_value['property'].update(value[filetype]['property']) - # if (type(file_properties['default_value']) is bool): - # return_value[filetype] = value[filetype] - # value = value[filetype] - # if ('extra_values' in file_properties and 'link_type' in file_properties['extra_values'] and value != file_properties['default_value']): - # # here with one or more real datalinks value(s) - # # add each data links dict to existing list of dicts - # # tweak some values (e.g., sub_link_type) in original dict - - # if type(value) is bool or type(value) is dict: - # d = self._convert_data_link(filetype, value) - # return_value['data_links_rows'].append(d) - # elif type(value) is list: - # for v in value: - # d = self._convert_data_link(filetype, v) - # return_value['data_links_rows'].append(d) - # else: - # self.logger.error('serious error in process._convert with {} {} {}'.format(filetype, type(value), value)) - - # if file_properties['extra_values']['link_type'] == 'ESOURCE': - # return_value['esource'].add(file_properties['extra_values']['link_sub_type']) - # return_value['property'].add(file_properties['extra_values']['link_type']) - # return_value['property'].update(file_properties['extra_values'].get('property', [])) - # elif ('extra_values' in file_properties and value != file_properties['default_value']): - # if 'property' in file_properties['extra_values']: - # return_value['property'].update(file_properties['extra_values']['property']) - - # elif value != file_properties['default_value'] or file_properties.get('copy_default', False): - # # otherwise, copy value - # return_value[filetype] = passed[filetype] - # if filetype == 'relevance': - # for k in passed[filetype]: - # # simply add all dict value to top level - # return_value[k] = passed[filetype][k] - - # self._add_refereed_property(return_value) - # self._add_article_property(return_value, passed) - # return_value['property'] = sorted(return_value['property']) - # return_value['esource'] = sorted(return_value['esource']) - # self._add_data_summary(return_value) - # return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows']) - # self._add_citation_count_fields(return_value, passed) - - # # time for computed fields - # for k, v in computed_fields.items(): - # f = getattr(self, v['converter_function'], None) - # if f is None: - # self.logger.error('serious error in process._covert, expected converter_function {} for field {} not found'.format(v['converter_function'], k)) - # else: - # x = f(return_value) - # return_value.update(x) - - # # finally, delete the keys not in the nonbib protobuf - # not_needed = ['author', 'canonical', 'citation', 'deleted', 'deprecated_citation_count', 'doi', 'download', 'item_count', 'nonarticle', - # 'ocrabstract', 'preprint', 'private', 'pub_openaccess', 'pub2arxiv', - # 'reads', 'refereed', 'relevance', 'toc'] - # for n in not_needed: - # return_value.pop(n, None) - # return return_value + # Remove unused fields + unused_fields = { + 'author', 'canonical', 'citation', 'deleted', 'deprecated_citation_count', + 'doi', 'download', 'item_count', 'nonarticle', 'ocrabstract', 'preprint', + 'private', 'pub_openaccess', 'pub2arxiv', 'reads', 'refereed', + 'relevance', 'toc' + } + for field in unused_fields: + return_value.pop(field, None) + + return return_value - def _add_citation_count_fields(self, return_value, original): - author_count = len(original.get('author', ())) - citation_count = len(return_value.get('citation', ())) + def _add_citation_count_fields(self, return_value, passed): + author_count = len(passed.get('author', ())) + citation_count = len(passed.get('citation', ())) return_value['citation_count'] = citation_count return_value['citation_count_norm'] = citation_count / float(max(author_count, 1)) @@ -244,11 +223,11 @@ def _add_refereed_property(self, return_value): if'REFEREED' not in return_value['property']: return_value['property'].add('NOT REFEREED') - def _add_article_property(self, return_value, d): - x = d.get('nonarticle', False) - if type(x) is dict: - x = x['nonarticle'] - if x: + def _add_article_property(self, return_value, passed): + nonarticle_value = passed.get('nonarticle', False) + if isinstance(nonarticle_value, dict): + nonarticle_value = nonarticle_value['nonarticle'] + if nonarticle_value: return_value['property'].add('NONARTICLE') else: return_value['property'].add('ARTICLE') @@ -296,40 +275,6 @@ def _merge_data_links(self, datalinks): new_datalinks.append(first) return new_datalinks - # def _convert_data_link(self, filetype, value): - # """convert one data link row""" - # file_properties = self.data_dict[filetype] #data_files[filetype] - # d = {} - # d['link_type'] = file_properties['extra_values']['link_type'] - # link_sub_type_suffix = '' - # if value is dict and 'subparts' in value and 'item_count' in value['subparts']: - # link_sub_type_suffix = ' ' + str(value['subparts']['item_count']) - # if value is True: - # d['link_sub_type'] = file_properties['extra_values']['link_sub_type'] + link_sub_type_suffix - # elif 'link_sub_type' in value: - # d['link_sub_type'] = value['link_sub_type'] + link_sub_type_suffix - # elif 'link_sub_type' in file_properties['extra_values']: - # d['link_sub_type'] = file_properties['extra_values']['link_sub_type'] + link_sub_type_suffix - # if type(value) is bool: - # d['url'] = [''] - # d['title'] = [''] - # d['item_count'] = 0 - # elif type(value) is dict: - # d['url'] = value.get('url', ['']) - # if type(d['url']) is str: - # d['url'] = [d['url']] - # d['title'] = value.get('title', ['']) - # if type(d['title']) is str: - # d['title'] = [d['title']] - # # if d['title'] == ['']: - # # d.pop('title') # to match old pipeline - # d['item_count'] = value.get('item_count', 0) - # else: - # self.logger.error('serious error in process.convert_data_link: unexpected type for value, filetype = {}, value = {}, type of value = {}'.format(filetype, value, type(value))) - # breakpoint() # {'link_type': 'ESOURCE', 'link_sub_type': 'ADS_PDF', 'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'item_count': 0} - # return d - - #TODO: remove 'TOC' from here def _convert_data_link(self, filetype, value): """convert one data link row""" @@ -346,10 +291,11 @@ def _convert_data_link(self, filetype, value): if not link_sub_type and isinstance(value, dict) and 'link_sub_type' in value: link_sub_type = value['link_sub_type'] + link_sub_type += link_sub_type_suffix # Initialize result dictionary - link_data = { 'link_type': link_type, + link_data = { 'link_type': link_type, 'link_sub_type': link_sub_type, "url": [""], "title": [""], @@ -467,3 +413,54 @@ def _compute_bibgroup_facet(self, d): return {} bibgroup_facet = sorted(list(set(bibgroup))) return {'bibgroup_facet': bibgroup_facet} + + def _populate_new_links_structure(self, data_links_rows): + """Populate the new protobuf links structure from data_links_rows. + Maps the flat data_links_rows into the hierarchical links structure.""" + + # Map for link types that need special handling + link_type_mapping = { + 'DATA': 'DATA', + 'ESOURCE': 'ESOURCE', + 'ASSOCIATED': 'ASSOCIATED', + 'INSPIRE': 'INSPIRE', + 'LIBRARYCATALOG': 'LIBRARYCATALOG', + 'PRESENTATION': 'PRESENTATION' + } + + for row in data_links_rows: + link_type = row['link_type'] + + # Skip if not in our mapping + if link_type not in link_type_mapping: + continue + + mapped_type = link_type_mapping[link_type] + + # Handle DATA and ESOURCE which have sub_type structure + if mapped_type in ('DATA', 'ESOURCE'): + sub_type = row['link_sub_type'] + if sub_type not in self.new_protobuf_template['links'][mapped_type]: + self.new_protobuf_template['links'][mapped_type][sub_type] = { + 'url': [], + 'title': [], + 'count': 0 + } + self.new_protobuf_template['links'][mapped_type][sub_type]['url'].extend(row['url']) + self.new_protobuf_template['links'][mapped_type][sub_type]['title'].extend(row['title']) + self.new_protobuf_template['links'][mapped_type][sub_type]['count'] = row['item_count'] + + # Handle other link types with direct structure + else: + self.new_protobuf_template['links'][mapped_type]['url'].extend(row['url']) + self.new_protobuf_template['links'][mapped_type]['title'].extend(row['title']) + self.new_protobuf_template['links'][mapped_type]['count'] = row['item_count'] + + + def _populate_link_flags(self, passed): + """Populate the boolean flags in the new protobuf links structure. + Sets CITATIONS, REFERENCES, and TOC based on data availability.""" + + self.new_protobuf_template['links']['CITATIONS'] = len(passed.get('citation', [])) > 0 + self.new_protobuf_template['links']['REFERENCES'] = len(passed.get('reference', [])) > 0 + \ No newline at end of file From 02094e0d6c1d144b104592127df03ec9d3da6364 Mon Sep 17 00:00:00 2001 From: femalves Date: Wed, 12 Mar 2025 17:12:04 -0400 Subject: [PATCH 04/17] modifying tests and removing test with old protobuf --- adsdata/process.py | 2 ++ adsdata/tests/test_process.py | 65 ++++++++++++++++++++++++++--------- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/adsdata/process.py b/adsdata/process.py index 7d953a5..c63ff72 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -210,6 +210,8 @@ def _convert(self, passed): } for field in unused_fields: return_value.pop(field, None) + + return_value.update(self.new_protobuf_template) return return_value diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py index 84d68fb..7b4abcd 100644 --- a/adsdata/tests/test_process.py +++ b/adsdata/tests/test_process.py @@ -72,26 +72,41 @@ def test_read(self): self.assertEqual(d['refereed'], {'refereed': False}) self.assertEqual(d['planetary_feature'], ['Moon/Mare/Mare Imbrium/3678', 'Moon/Crater/Alder/171', 'Moon/Crater/Finsen/1959', 'Moon/Crater/Leibnitz/3335']) - def test_protobuf(self): - """make sure protobuf are created without an exception""" - with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): - d = processor._read_next_bibcode('1057wjlf.book.....C') - c = processor._convert(d) - nonbib = NonBibRecord(**c) - print('nonbib = {}'.format(nonbib)) + # def test_protobuf(self): + # """make sure protobuf are created without an exception""" + # with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): + # d = processor._read_next_bibcode('1057wjlf.book.....C') + # c = processor._convert(d) + # nonbib = NonBibRecord(**c) + # print('nonbib = {}'.format(nonbib)) def test_nonbib_record(self): self.maxDiff = None with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): - d = processor._read_next_bibcode('2003ASPC..295..361M') - n = processor._convert(d) - a = {"read_count": 4, "bibcode": "2003ASPC..295..361M", - 'bibgroup': ['Chandra Technical'], 'bibgroup_facet': ['Chandra Technical'], - "data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']}, - {"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']}, - {"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}], - "esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0} - self.assertEqual(a, n) + # d = processor._read_next_bibcode('2003ASPC..295..361M') + # n = processor._convert(d) + # a = {"read_count": 4, "bibcode": "2003ASPC..295..361M", + # 'bibgroup': ['Chandra Technical'], 'bibgroup_facet': ['Chandra Technical'], + # "data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']}, + # {"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']}, + # {"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}], + # "esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0} + # new_protobuf = {'identifier': [], 'links': {'ARXIV': [], 'DOI': [], 'DATA': {}, + # 'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'count': 0}, + # 'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M'], 'title': [''], 'count': 0}}, + # 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, + # 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, + # 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, + # 'ABSTRACT': False, + # 'CITATIONS': False, + # 'GRAPHICS': False, + # 'METRICS': False, + # 'OPENURL': False, + # 'REFERENCES': False, + # 'TOC': True, + # 'COREAD': False}} + # a.update(new_protobuf) + # self.assertEqual(a, n) d = processor._read_next_bibcode('2004MNRAS.354L..31M') v = processor._convert(d) @@ -118,9 +133,27 @@ def test_nonbib_record(self): "total_link_counts": 1956, "esource": ["ADS_PDF", "ADS_SCAN", "EPRINT_HTML", "EPRINT_PDF", "PUB_HTML", "PUB_PDF"], "boost": 0.4399999976158142} + + new_protobuf = {'identifier': [], 'links': {'ARXIV': [], 'DOI': [], 'DATA': {'CDS': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}, + 'NED': {'url': ['https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M'], 'title': ['NED Objects (1953)'], 'count': 1953}, + 'SIMBAD': {'url': ['http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M'], 'title': ['SIMBAD Objects (1)'], 'count': 1}, + 'Vizier': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}}, + 'ESOURCE': {'PUB_HTML': {'url': ['http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, + 'EPRINT_HTML': {'url': ['https://arxiv.org/abs/astro-ph/0405472'], 'title': [''], 'count': 0}, + 'PUB_PDF': {'url': ['https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, + 'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}, + 'EPRINT_PDF': {'url': ['https://arxiv.org/pdf/astro-ph/0405472'], 'title': [''], 'count': 0}, + 'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}}, + 'ASSOCIATED': {'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'count': 0}, + 'INSPIRE': {'url': ['http://inspirehep.net/search?p=find+j+MNRAA,354,L31'], 'title': [''], 'count': 0}, + 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, + 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, + 'ABSTRACT': False, 'CITATIONS': True, 'GRAPHICS': False, + 'METRICS': False, 'OPENURL': False, 'REFERENCES': False, 'TOC': False, 'COREAD': False}} v_boost = v.pop('boost') a_boost = a.pop('boost') self.assertAlmostEqual(a_boost, v_boost) + a.update(new_protobuf) self.assertEqual(a, v) # consider video 1997kbls.confE..10C From 940b433ef076719b3cff01b40a336adcdc68db6d Mon Sep 17 00:00:00 2001 From: femalves Date: Thu, 13 Mar 2025 11:25:36 -0400 Subject: [PATCH 05/17] getting it ready for master protobuf --- adsdata/process.py | 6 ++-- adsdata/tests/test_process.py | 64 +++++++++-------------------------- 2 files changed, 18 insertions(+), 52 deletions(-) diff --git a/adsdata/process.py b/adsdata/process.py index c63ff72..ad33dea 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -68,7 +68,7 @@ def _get_nonbib_dict(self): } } - + # TODO: add master protobuf def process_bibcodes(self, bibcodes): """send nonbib and metrics records to master for the passed bibcodes for each bibcode @@ -95,7 +95,7 @@ def process_bibcodes(self, bibcodes): if not self.compute_CC: tasks.task_output_nonbib.delay(nonbib_protos) tasks.task_output_metrics.delay(metrics_protos) - + # TODO: Check what else can be added for master protobuf def _convert(self, passed): """Convert full nonbib dict to what is needed for nonbib protobuf. @@ -210,8 +210,6 @@ def _convert(self, passed): } for field in unused_fields: return_value.pop(field, None) - - return_value.update(self.new_protobuf_template) return return_value diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py index 7b4abcd..f6f269b 100644 --- a/adsdata/tests/test_process.py +++ b/adsdata/tests/test_process.py @@ -72,41 +72,26 @@ def test_read(self): self.assertEqual(d['refereed'], {'refereed': False}) self.assertEqual(d['planetary_feature'], ['Moon/Mare/Mare Imbrium/3678', 'Moon/Crater/Alder/171', 'Moon/Crater/Finsen/1959', 'Moon/Crater/Leibnitz/3335']) - # def test_protobuf(self): - # """make sure protobuf are created without an exception""" - # with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): - # d = processor._read_next_bibcode('1057wjlf.book.....C') - # c = processor._convert(d) - # nonbib = NonBibRecord(**c) - # print('nonbib = {}'.format(nonbib)) + def test_protobuf(self): + """make sure protobuf are created without an exception""" + with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): + d = processor._read_next_bibcode('1057wjlf.book.....C') + c = processor._convert(d) + nonbib = NonBibRecord(**c) + print('nonbib = {}'.format(nonbib)) def test_nonbib_record(self): self.maxDiff = None with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): - # d = processor._read_next_bibcode('2003ASPC..295..361M') - # n = processor._convert(d) - # a = {"read_count": 4, "bibcode": "2003ASPC..295..361M", - # 'bibgroup': ['Chandra Technical'], 'bibgroup_facet': ['Chandra Technical'], - # "data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']}, - # {"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']}, - # {"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}], - # "esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0} - # new_protobuf = {'identifier': [], 'links': {'ARXIV': [], 'DOI': [], 'DATA': {}, - # 'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'count': 0}, - # 'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M'], 'title': [''], 'count': 0}}, - # 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, - # 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, - # 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, - # 'ABSTRACT': False, - # 'CITATIONS': False, - # 'GRAPHICS': False, - # 'METRICS': False, - # 'OPENURL': False, - # 'REFERENCES': False, - # 'TOC': True, - # 'COREAD': False}} - # a.update(new_protobuf) - # self.assertEqual(a, n) + d = processor._read_next_bibcode('2003ASPC..295..361M') + n = processor._convert(d) + a = {"read_count": 4, "bibcode": "2003ASPC..295..361M", + 'bibgroup': ['Chandra Technical'], 'bibgroup_facet': ['Chandra Technical'], + "data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']}, + {"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']}, + {"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}], + "esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0} + self.assertEqual(a, n) d = processor._read_next_bibcode('2004MNRAS.354L..31M') v = processor._convert(d) @@ -134,26 +119,9 @@ def test_nonbib_record(self): "esource": ["ADS_PDF", "ADS_SCAN", "EPRINT_HTML", "EPRINT_PDF", "PUB_HTML", "PUB_PDF"], "boost": 0.4399999976158142} - new_protobuf = {'identifier': [], 'links': {'ARXIV': [], 'DOI': [], 'DATA': {'CDS': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}, - 'NED': {'url': ['https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M'], 'title': ['NED Objects (1953)'], 'count': 1953}, - 'SIMBAD': {'url': ['http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M'], 'title': ['SIMBAD Objects (1)'], 'count': 1}, - 'Vizier': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}}, - 'ESOURCE': {'PUB_HTML': {'url': ['http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, - 'EPRINT_HTML': {'url': ['https://arxiv.org/abs/astro-ph/0405472'], 'title': [''], 'count': 0}, - 'PUB_PDF': {'url': ['https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, - 'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}, - 'EPRINT_PDF': {'url': ['https://arxiv.org/pdf/astro-ph/0405472'], 'title': [''], 'count': 0}, - 'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}}, - 'ASSOCIATED': {'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'count': 0}, - 'INSPIRE': {'url': ['http://inspirehep.net/search?p=find+j+MNRAA,354,L31'], 'title': [''], 'count': 0}, - 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, - 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, - 'ABSTRACT': False, 'CITATIONS': True, 'GRAPHICS': False, - 'METRICS': False, 'OPENURL': False, 'REFERENCES': False, 'TOC': False, 'COREAD': False}} v_boost = v.pop('boost') a_boost = a.pop('boost') self.assertAlmostEqual(a_boost, v_boost) - a.update(new_protobuf) self.assertEqual(a, v) # consider video 1997kbls.confE..10C From b603a3a87192ba91d5d5daf5940a700d6c64fd1c Mon Sep 17 00:00:00 2001 From: femalves Date: Thu, 13 Mar 2025 12:10:35 -0400 Subject: [PATCH 06/17] adding metrics --- adsdata/process.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/adsdata/process.py b/adsdata/process.py index ad33dea..bc4d91e 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -18,7 +18,7 @@ def __init__(self, compute_metrics=True, compute_CC = False): self.data_dict = data_files self.logger = tasks.app.logger self.readers = {} - self.new_protobuf_template = self._get_nonbib_dict() + self.master_protobuf = self._get_master_nonbib_dict() def __enter__(self): @@ -28,7 +28,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self._close_all() - def _get_nonbib_dict(self): + def _get_master_nonbib_dict(self): # Template for the new protobuf structure return { "identifier": [], #MP @@ -57,10 +57,10 @@ def _get_nonbib_dict(self): "title": [], "count": 0 }, - "ABSTRACT": False, #MP + "ABSTRACT": False,#MP "CITATIONS": False, "GRAPHICS": False,#MP - "METRICS": False, #MP + "METRICS": False, "OPENURL": False, #MP "REFERENCES": False, "TOC": False, @@ -137,7 +137,7 @@ def _convert(self, passed): # Handle boolean fields and TOC if isinstance(default_value, bool): if filetype == 'toc': - self.new_protobuf_template['links']['TOC'] = value[filetype] + self.master_protobuf['links']['TOC'] = value[filetype] return_value[filetype] = value[filetype] value = value[filetype] @@ -440,27 +440,28 @@ def _populate_new_links_structure(self, data_links_rows): # Handle DATA and ESOURCE which have sub_type structure if mapped_type in ('DATA', 'ESOURCE'): sub_type = row['link_sub_type'] - if sub_type not in self.new_protobuf_template['links'][mapped_type]: - self.new_protobuf_template['links'][mapped_type][sub_type] = { + if sub_type not in self.master_protobuf['links'][mapped_type]: + self.master_protobuf['links'][mapped_type][sub_type] = { 'url': [], 'title': [], 'count': 0 } - self.new_protobuf_template['links'][mapped_type][sub_type]['url'].extend(row['url']) - self.new_protobuf_template['links'][mapped_type][sub_type]['title'].extend(row['title']) - self.new_protobuf_template['links'][mapped_type][sub_type]['count'] = row['item_count'] + self.master_protobuf['links'][mapped_type][sub_type]['url'].extend(row['url']) + self.master_protobuf['links'][mapped_type][sub_type]['title'].extend(row['title']) + self.master_protobuf['links'][mapped_type][sub_type]['count'] = row['item_count'] # Handle other link types with direct structure else: - self.new_protobuf_template['links'][mapped_type]['url'].extend(row['url']) - self.new_protobuf_template['links'][mapped_type]['title'].extend(row['title']) - self.new_protobuf_template['links'][mapped_type]['count'] = row['item_count'] + self.master_protobuf['links'][mapped_type]['url'].extend(row['url']) + self.master_protobuf['links'][mapped_type]['title'].extend(row['title']) + self.master_protobuf['links'][mapped_type]['count'] = row['item_count'] def _populate_link_flags(self, passed): """Populate the boolean flags in the new protobuf links structure. - Sets CITATIONS, REFERENCES, and TOC based on data availability.""" + Sets CITATIONS, REFERENCES, and METRICS based on data availability.""" - self.new_protobuf_template['links']['CITATIONS'] = len(passed.get('citation', [])) > 0 - self.new_protobuf_template['links']['REFERENCES'] = len(passed.get('reference', [])) > 0 + self.master_protobuf['links']['CITATIONS'] = len(passed.get('citation', [])) > 0 + self.master_protobuf['links']['REFERENCES'] = len(passed.get('reference', [])) > 0 + self.master_protobuf['links']['METRICS'] = self.compute_metrics \ No newline at end of file From 99ef925935a55ced4290f3249595a744323a43b7 Mon Sep 17 00:00:00 2001 From: femalves Date: Fri, 14 Mar 2025 14:07:03 -0400 Subject: [PATCH 07/17] adding tests and adding master to old protobuf --- adsdata/process.py | 3 +- adsdata/tests/test_process.py | 191 +++++++++++++++++++++++++++------- 2 files changed, 155 insertions(+), 39 deletions(-) diff --git a/adsdata/process.py b/adsdata/process.py index bc4d91e..b43ca93 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -210,7 +210,8 @@ def _convert(self, passed): } for field in unused_fields: return_value.pop(field, None) - + return_value.update(self.master_protobuf) + return_value.pop('data_links_rows') return return_value def _add_citation_count_fields(self, return_value, passed): diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py index f6f269b..f0fac40 100644 --- a/adsdata/tests/test_process.py +++ b/adsdata/tests/test_process.py @@ -1,4 +1,3 @@ - import unittest from mock import patch, mock_open from datetime import datetime @@ -72,53 +71,68 @@ def test_read(self): self.assertEqual(d['refereed'], {'refereed': False}) self.assertEqual(d['planetary_feature'], ['Moon/Mare/Mare Imbrium/3678', 'Moon/Crater/Alder/171', 'Moon/Crater/Finsen/1959', 'Moon/Crater/Leibnitz/3335']) - def test_protobuf(self): - """make sure protobuf are created without an exception""" - with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): - d = processor._read_next_bibcode('1057wjlf.book.....C') - c = processor._convert(d) - nonbib = NonBibRecord(**c) - print('nonbib = {}'.format(nonbib)) + # def test_protobuf(self): + # """make sure protobuf are created without an exception""" + # with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): + # d = processor._read_next_bibcode('1057wjlf.book.....C') + # c = processor._convert(d) + # nonbib = NonBibRecord(**c) + # print('nonbib = {}'.format(nonbib)) def test_nonbib_record(self): self.maxDiff = None with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): d = processor._read_next_bibcode('2003ASPC..295..361M') n = processor._convert(d) - a = {"read_count": 4, "bibcode": "2003ASPC..295..361M", - 'bibgroup': ['Chandra Technical'], 'bibgroup_facet': ['Chandra Technical'], - "data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']}, - {"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']}, - {"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}], - "esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0} + a = {'property': ['ADS_OPENACCESS', 'ARTICLE', 'ESOURCE', 'NOT REFEREED', 'OPENACCESS', 'TOC'], 'esource': ['ADS_PDF', 'ADS_SCAN'], + 'bibcode': '2003ASPC..295..361M', 'bibgroup': ['Chandra Technical'], 'boost': 0.15, 'read_count': 4, 'norm_cites': 0, 'data': [], + 'total_link_counts': 0, 'citation_count': 0, 'citation_count_norm': 0.0, + 'bibgroup_facet': ['Chandra Technical'], 'identifier': [], + 'links': {'ARXIV': [], 'DOI': [], 'DATA': {}, + 'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'count': 0}, + 'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M'], 'title': [''], 'count': 0}}, + 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, + 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, + 'ABSTRACT': False, + 'CITATIONS': False, + 'GRAPHICS': False, + 'METRICS': False, + 'OPENURL': False, + 'REFERENCES': False, + 'TOC': True, + 'COREAD': False}} self.assertEqual(a, n) + self._validate_nonbib_structure(n) d = processor._read_next_bibcode('2004MNRAS.354L..31M') v = processor._convert(d) - a = {"bibcode": "2004MNRAS.354L..31M", - "simbad_objects": ["3253618 G"], - "read_count": 20, - "data_links_rows": [{"url": ["http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x"], "link_type": "ESOURCE", "link_sub_type": "PUB_HTML", 'item_count': 0, 'title': ['']}, - {"url": ["https://arxiv.org/abs/astro-ph/0405472"], "link_type": "ESOURCE", "link_sub_type": "EPRINT_HTML", 'item_count': 0, 'title': ['']}, - {"url": ["https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x"], "link_type": "ESOURCE", "link_sub_type": "PUB_PDF", 'item_count': 0, 'title': ['']}, - {"url": ["http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']}, - {"url": ["https://arxiv.org/pdf/astro-ph/0405472"], "link_type": "ESOURCE", "link_sub_type": "EPRINT_PDF", 'item_count': 0, 'title': ['']}, - {"url": ["http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']}, - {"url": ["2004MNRAS.354L..31M", "2005yCat..73549031M"], "title": ["Source Paper", "Catalog Description"], "link_type": "ASSOCIATED", "link_sub_type": "NA", 'item_count': 0}, - {"url": ["http://inspirehep.net/search?p=find+j+MNRAA,354,L31"], "link_type": "INSPIRE", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}, - {"url": ["http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31"], "item_count": 1, "link_type": "DATA", "link_sub_type": "CDS", 'title': ['']}, - {"url": ["https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M"], "title": ["NED Objects (1953)"], "item_count": 1953, "link_type": "DATA", "link_sub_type": "NED"}, - {"url": ["http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M"], "title": ["SIMBAD Objects (1)"], "item_count": 1, "link_type": "DATA", "link_sub_type": "SIMBAD"}, - {"url": ["http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31"], "item_count": 1, "link_type": "DATA", "link_sub_type": "Vizier", 'title': ['']}], - "norm_cites": 10000, - "data": ["CDS:1", "NED:1953", "SIMBAD:1", "Vizier:1"], - "citation_count_norm": 49.5, - "citation_count": 99, - "property": ["ADS_OPENACCESS", "ARTICLE", "ASSOCIATED", "DATA", "EPRINT_OPENACCESS", "ESOURCE", "INSPIRE", "OPENACCESS", "PUB_OPENACCESS", "REFEREED"], - "total_link_counts": 1956, - "esource": ["ADS_PDF", "ADS_SCAN", "EPRINT_HTML", "EPRINT_PDF", "PUB_HTML", "PUB_PDF"], - "boost": 0.4399999976158142} - + a = {'property': ['ADS_OPENACCESS', 'ARTICLE', 'ASSOCIATED', 'DATA', 'EPRINT_OPENACCESS', 'ESOURCE', 'INSPIRE', 'OPENACCESS', 'PUB_OPENACCESS', 'REFEREED'], + 'esource': ['ADS_PDF', 'ADS_SCAN', 'EPRINT_HTML', 'EPRINT_PDF', 'PUB_HTML', 'PUB_PDF'], + 'bibcode': '2004MNRAS.354L..31M', 'boost': 0.44, 'read_count': 20, 'norm_cites': 10000, + 'simbad_objects': ['3253618 G'], 'data': ['CDS:1', 'NED:1953', 'SIMBAD:1', 'Vizier:1'], + 'total_link_counts': 1956, 'citation_count': 99, 'citation_count_norm': 49.5, 'identifier': [], + 'links': {'ARXIV': [], 'DOI': [], 'DATA': {'CDS': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}, + 'NED': {'url': ['https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M'], 'title': ['NED Objects (1953)'], 'count': 1953}, + 'SIMBAD': {'url': ['http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M'], 'title': ['SIMBAD Objects (1)'], 'count': 1}, + 'Vizier': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}}, + 'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M', 'http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M'], 'title': ['', ''], 'count': 0}, + 'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M', 'http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M'], 'title': ['', ''], 'count': 0}, + 'PUB_HTML': {'url': ['http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, + 'EPRINT_HTML': {'url': ['https://arxiv.org/abs/astro-ph/0405472'], 'title': [''], 'count': 0}, + 'PUB_PDF': {'url': ['https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, + 'EPRINT_PDF': {'url': ['https://arxiv.org/pdf/astro-ph/0405472'], 'title': [''], 'count': 0}}, + 'ASSOCIATED': {'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'count': 0}, + 'INSPIRE': {'url': ['http://inspirehep.net/search?p=find+j+MNRAA,354,L31'], 'title': [''], 'count': 0}, 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, + 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, + 'ABSTRACT': False, + 'CITATIONS': True, + 'GRAPHICS': False, + 'METRICS': False, + 'OPENURL': False, + 'REFERENCES': False, + 'TOC': False, + 'COREAD': False}} + v_boost = v.pop('boost') a_boost = a.pop('boost') self.assertAlmostEqual(a_boost, v_boost) @@ -128,6 +142,107 @@ def test_nonbib_record(self): # consider library 1810hdla.book.....V # consider inspire 1908PASP...20....1. + def _validate_nonbib_structure(self, record): + """Validate that the nonbib record has all required fields with correct types""" + + # Required string fields + self.assertIn('bibcode', record) + self.assertIn('identifier', record) + self.assertIsInstance(record['bibcode'], str) + self.assertIsInstance(record['identifier'], list) + + # Required numeric fields + numeric_fields = { + 'boost': float, + 'citation_count': int, + 'read_count': int, + 'total_link_counts': int, + 'norm_cites': int, + 'citation_count_norm': float + } + for field, expected_type in numeric_fields.items(): + self.assertIn(field, record) + self.assertIsInstance(record[field], expected_type, + f"Field {field} should be {expected_type.__name__}") + + # Required array fields + required_array_fields = [ + 'property', + 'esource', + 'data', + 'identifier' + ] + + # Optional array fields + optional_array_fields = [ + 'simbad_objects', + 'grants', + 'readers', + 'reference', + 'ned_objects', + 'bibgroup', + 'bibgroup_facet', + 'gpn', + 'uat' + ] + + # Check required array fields + for field in required_array_fields: + self.assertIn(field, record) + self.assertIsInstance(record[field], list, + f"Field {field} should be a list") + + # Check optional array fields if present + for field in optional_array_fields: + if field in record: + self.assertIsInstance(record[field], list, + f"Field {field} should be a list") + + # Validate links structure + self.assertIn('links', record) + links = record['links'] + self.assertIsInstance(links, dict) + + # Direct link arrays + for field in ['ARXIV', 'DOI']: + self.assertIn(field, links) + self.assertIsInstance(links[field], (list)) + + # Mapped link types + for field in ['DATA', 'ESOURCE']: + self.assertIn(field, links) + self.assertIsInstance(links[field], dict) + + # If there are subtypes, validate their structure + for subtype, value in links[field].items(): + self.assertIsInstance(value, dict) + self.assertIn('url', value) + self.assertIsInstance(value['url'], (list)) + self.assertIn('title', value) + self.assertIsInstance(value['title'], (list)) + self.assertIn('count', value) + self.assertIsInstance(value['count'], int) + + # Link type records + for field in ['ASSOCIATED', 'INSPIRE', 'LIBRARYCATALOG', 'PRESENTATION']: + self.assertIn(field, links) + self.assertIsInstance(links[field], dict) + self.assertIn('url', links[field]) + self.assertIsInstance(links[field]['url'], (list)) + self.assertIn('title', links[field]) + self.assertIsInstance(links[field]['title'], (list)) + self.assertIn('count', links[field]) + self.assertIsInstance(links[field]['count'], int) + + # Boolean flags + boolean_flags = [ + 'ABSTRACT', 'CITATIONS', 'GRAPHICS', 'METRICS', + 'OPENURL', 'REFERENCES', 'TOC', 'COREAD' + ] + for field in boolean_flags: + self.assertIn(field, links) + self.assertIsInstance(links[field], bool, f"Links field {field} should be a boolean") + def test_add_data_summary(self): self.maxDiff = None with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): From 99d133d49d6c5cc790c0ee5bed647df08d2e85e0 Mon Sep 17 00:00:00 2001 From: femalves Date: Tue, 8 Apr 2025 13:20:44 -0400 Subject: [PATCH 08/17] changing coreads and openurls to always be True --- adsdata/process.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/adsdata/process.py b/adsdata/process.py index b43ca93..64733b8 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -61,14 +61,13 @@ def _get_master_nonbib_dict(self): "CITATIONS": False, "GRAPHICS": False,#MP "METRICS": False, - "OPENURL": False, #MP + "OPENURL": True, "REFERENCES": False, "TOC": False, - "COREAD": False #MP + "COREAD": True } } - # TODO: add master protobuf def process_bibcodes(self, bibcodes): """send nonbib and metrics records to master for the passed bibcodes for each bibcode From 303d1d0c3c5631509bb7f87bbd956839cdd84245 Mon Sep 17 00:00:00 2001 From: femalves Date: Tue, 8 Apr 2025 13:22:10 -0400 Subject: [PATCH 09/17] fixing test --- adsdata/tests/test_process.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py index f0fac40..27eaf30 100644 --- a/adsdata/tests/test_process.py +++ b/adsdata/tests/test_process.py @@ -97,10 +97,10 @@ def test_nonbib_record(self): 'CITATIONS': False, 'GRAPHICS': False, 'METRICS': False, - 'OPENURL': False, + 'OPENURL': True, 'REFERENCES': False, 'TOC': True, - 'COREAD': False}} + 'COREAD': True}} self.assertEqual(a, n) self._validate_nonbib_structure(n) @@ -128,10 +128,10 @@ def test_nonbib_record(self): 'CITATIONS': True, 'GRAPHICS': False, 'METRICS': False, - 'OPENURL': False, + 'OPENURL': True, 'REFERENCES': False, 'TOC': False, - 'COREAD': False}} + 'COREAD': True}} v_boost = v.pop('boost') a_boost = a.pop('boost') From e2c651076ac10c1c4f31f91fddc0b1f52a276809 Mon Sep 17 00:00:00 2001 From: femalves Date: Tue, 8 Apr 2025 16:30:24 -0400 Subject: [PATCH 10/17] fixing _populate_new_links_structuretest --- adsdata/process.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/adsdata/process.py b/adsdata/process.py index 64733b8..9260d35 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -429,7 +429,7 @@ def _populate_new_links_structure(self, data_links_rows): } for row in data_links_rows: - link_type = row['link_type'] + link_type = row.get('link_type', '') # Skip if not in our mapping if link_type not in link_type_mapping: @@ -439,22 +439,28 @@ def _populate_new_links_structure(self, data_links_rows): # Handle DATA and ESOURCE which have sub_type structure if mapped_type in ('DATA', 'ESOURCE'): - sub_type = row['link_sub_type'] + sub_type = row.get('link_sub_type', '') if sub_type not in self.master_protobuf['links'][mapped_type]: self.master_protobuf['links'][mapped_type][sub_type] = { 'url': [], 'title': [], 'count': 0 } - self.master_protobuf['links'][mapped_type][sub_type]['url'].extend(row['url']) - self.master_protobuf['links'][mapped_type][sub_type]['title'].extend(row['title']) - self.master_protobuf['links'][mapped_type][sub_type]['count'] = row['item_count'] + if 'url' in row: + self.master_protobuf['links'][mapped_type][sub_type]['url'].extend(row['url']) + if 'title' in row: + self.master_protobuf['links'][mapped_type][sub_type]['title'].extend(row['title']) + if 'item_count' in row: + self.master_protobuf['links'][mapped_type][sub_type]['count'] = row['item_count'] # Handle other link types with direct structure else: - self.master_protobuf['links'][mapped_type]['url'].extend(row['url']) - self.master_protobuf['links'][mapped_type]['title'].extend(row['title']) - self.master_protobuf['links'][mapped_type]['count'] = row['item_count'] + if 'url' in row: + self.master_protobuf['links'][mapped_type]['url'].extend(row['url']) + if 'title' in row: + self.master_protobuf['links'][mapped_type]['title'].extend(row['title']) + if 'item_count' in row: + self.master_protobuf['links'][mapped_type]['count'] = row['item_count'] def _populate_link_flags(self, passed): From 338632e16119bed0c5a80fcea3f583ccc3f99da3 Mon Sep 17 00:00:00 2001 From: femalves Date: Fri, 18 Apr 2025 10:41:13 -0400 Subject: [PATCH 11/17] removing changes to flags --- adsdata/process.py | 22 +++------------------- adsdata/tests/test_process.py | 12 ++++++------ 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/adsdata/process.py b/adsdata/process.py index 9260d35..267b84d 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -61,10 +61,10 @@ def _get_master_nonbib_dict(self): "CITATIONS": False, "GRAPHICS": False,#MP "METRICS": False, - "OPENURL": True, + "OPENURL": False, "REFERENCES": False, "TOC": False, - "COREAD": True + "COREAD": False } } @@ -135,9 +135,6 @@ def _convert(self, passed): # Handle boolean fields and TOC if isinstance(default_value, bool): - if filetype == 'toc': - self.master_protobuf['links']['TOC'] = value[filetype] - return_value[filetype] = value[filetype] value = value[filetype] @@ -187,9 +184,6 @@ def _convert(self, passed): # Populate the new protobuf structure with link data self._populate_new_links_structure(return_value['data_links_rows']) - # Populate the boolean flags - self._populate_link_flags(passed) - # Add computed fields for field_name, field_config in computed_fields.items(): converter = getattr(self, field_config['converter_function'], None) @@ -460,14 +454,4 @@ def _populate_new_links_structure(self, data_links_rows): if 'title' in row: self.master_protobuf['links'][mapped_type]['title'].extend(row['title']) if 'item_count' in row: - self.master_protobuf['links'][mapped_type]['count'] = row['item_count'] - - - def _populate_link_flags(self, passed): - """Populate the boolean flags in the new protobuf links structure. - Sets CITATIONS, REFERENCES, and METRICS based on data availability.""" - - self.master_protobuf['links']['CITATIONS'] = len(passed.get('citation', [])) > 0 - self.master_protobuf['links']['REFERENCES'] = len(passed.get('reference', [])) > 0 - self.master_protobuf['links']['METRICS'] = self.compute_metrics - \ No newline at end of file + self.master_protobuf['links'][mapped_type]['count'] = row['item_count'] \ No newline at end of file diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py index 27eaf30..88822bc 100644 --- a/adsdata/tests/test_process.py +++ b/adsdata/tests/test_process.py @@ -97,10 +97,10 @@ def test_nonbib_record(self): 'CITATIONS': False, 'GRAPHICS': False, 'METRICS': False, - 'OPENURL': True, + 'OPENURL': False, 'REFERENCES': False, - 'TOC': True, - 'COREAD': True}} + 'TOC': False, + 'COREAD': False}} self.assertEqual(a, n) self._validate_nonbib_structure(n) @@ -125,13 +125,13 @@ def test_nonbib_record(self): 'INSPIRE': {'url': ['http://inspirehep.net/search?p=find+j+MNRAA,354,L31'], 'title': [''], 'count': 0}, 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, 'ABSTRACT': False, - 'CITATIONS': True, + 'CITATIONS': False, 'GRAPHICS': False, 'METRICS': False, - 'OPENURL': True, + 'OPENURL': False, 'REFERENCES': False, 'TOC': False, - 'COREAD': True}} + 'COREAD': False}} v_boost = v.pop('boost') a_boost = a.pop('boost') From a3f39daa69816845392f2a286b427a1b16c58004 Mon Sep 17 00:00:00 2001 From: femalves Date: Tue, 13 May 2025 14:56:47 -0400 Subject: [PATCH 12/17] uncommenting test --- adsdata/tests/test_process.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py index 88822bc..c28fa6c 100644 --- a/adsdata/tests/test_process.py +++ b/adsdata/tests/test_process.py @@ -71,13 +71,13 @@ def test_read(self): self.assertEqual(d['refereed'], {'refereed': False}) self.assertEqual(d['planetary_feature'], ['Moon/Mare/Mare Imbrium/3678', 'Moon/Crater/Alder/171', 'Moon/Crater/Finsen/1959', 'Moon/Crater/Leibnitz/3335']) - # def test_protobuf(self): - # """make sure protobuf are created without an exception""" - # with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): - # d = processor._read_next_bibcode('1057wjlf.book.....C') - # c = processor._convert(d) - # nonbib = NonBibRecord(**c) - # print('nonbib = {}'.format(nonbib)) + def test_protobuf(self): + """make sure protobuf are created without an exception""" + with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): + d = processor._read_next_bibcode('1057wjlf.book.....C') + c = processor._convert(d) + nonbib = NonBibRecord(**c) + print('nonbib = {}'.format(nonbib)) def test_nonbib_record(self): self.maxDiff = None @@ -99,7 +99,7 @@ def test_nonbib_record(self): 'METRICS': False, 'OPENURL': False, 'REFERENCES': False, - 'TOC': False, + 'TOC': True, 'COREAD': False}} self.assertEqual(a, n) self._validate_nonbib_structure(n) From 4eb266dec4faaa4deafbd49792fdc58f37fb2a0c Mon Sep 17 00:00:00 2001 From: femalves Date: Tue, 13 May 2025 15:43:22 -0400 Subject: [PATCH 13/17] updating requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5ef4fd3..5641829 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -adsputils==1.5.5 \ No newline at end of file +adsputils==1.5.7 \ No newline at end of file From c405a4f02ad663017701628e792d0b193cfa6182 Mon Sep 17 00:00:00 2001 From: femalves Date: Tue, 13 May 2025 16:04:07 -0400 Subject: [PATCH 14/17] changing test --- adsdata/tests/test_process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py index c28fa6c..a92baba 100644 --- a/adsdata/tests/test_process.py +++ b/adsdata/tests/test_process.py @@ -99,7 +99,7 @@ def test_nonbib_record(self): 'METRICS': False, 'OPENURL': False, 'REFERENCES': False, - 'TOC': True, + 'TOC': False, 'COREAD': False}} self.assertEqual(a, n) self._validate_nonbib_structure(n) From 53cc30c6528b872a2d0ee7a7fa6e6d08f4dfaeaa Mon Sep 17 00:00:00 2001 From: femalves Date: Mon, 24 Nov 2025 12:29:00 -0500 Subject: [PATCH 15/17] making abstract always true --- adsdata/process.py | 2 +- adsdata/tests/test_process.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/adsdata/process.py b/adsdata/process.py index 267b84d..e82953a 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -57,7 +57,7 @@ def _get_master_nonbib_dict(self): "title": [], "count": 0 }, - "ABSTRACT": False,#MP + "ABSTRACT": True, "CITATIONS": False, "GRAPHICS": False,#MP "METRICS": False, diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py index a92baba..4217daf 100644 --- a/adsdata/tests/test_process.py +++ b/adsdata/tests/test_process.py @@ -93,7 +93,7 @@ def test_nonbib_record(self): 'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M'], 'title': [''], 'count': 0}}, 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, - 'ABSTRACT': False, + 'ABSTRACT': True, 'CITATIONS': False, 'GRAPHICS': False, 'METRICS': False, From 701711563ef8b8e46e94b67419dc9f8859a3b897 Mon Sep 17 00:00:00 2001 From: femalves Date: Mon, 24 Nov 2025 14:42:04 -0500 Subject: [PATCH 16/17] resolving bugs --- adsdata/process.py | 48 ++++++++-------- adsdata/tests/test_process.py | 104 ++++++++++++++++++++++++++++++---- 2 files changed, 119 insertions(+), 33 deletions(-) diff --git a/adsdata/process.py b/adsdata/process.py index f052ae7..736a215 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -31,10 +31,10 @@ def __exit__(self, exc_type, exc_value, traceback): def _get_master_nonbib_dict(self): # Template for the new protobuf structure return { - "identifier": [], #MP + "identifier": [], # Master Pipeline "links": { - "ARXIV": [], #MP - "DOI": [],#MP + "ARXIV": [], # Master Pipeline + "DOI": [], # Master Pipeline "DATA": {}, "ESOURCE": {}, "ASSOCIATED": { @@ -57,14 +57,14 @@ def _get_master_nonbib_dict(self): "title": [], "count": 0 }, - "ABSTRACT": True, - "CITATIONS": False, - "GRAPHICS": False,#MP - "METRICS": False, - "OPENURL": False, - "REFERENCES": False, - "TOC": False, - "COREAD": False + "ABSTRACT": False, # Master Pipeline + "CITATIONS": False, # Master Pipeline + "GRAPHICS": False, # Master Pipeline + "METRICS": False, # Master Pipeline + "OPENURL": False, # Master Pipeline + "REFERENCES": False,# Master Pipeline + "TOC": False, # Master Pipeline + "COREAD": False # Master Pipeline } } @@ -94,7 +94,6 @@ def process_bibcodes(self, bibcodes): if not self.compute_CC: tasks.task_output_nonbib.delay(nonbib_protos) tasks.task_output_metrics.delay(metrics_protos) - # TODO: Check what else can be added for master protobuf def _convert(self, passed): """Convert full nonbib dict to what is needed for nonbib protobuf. @@ -180,9 +179,11 @@ def _convert(self, passed): # Merge and process data links return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows']) + + master_template = self._get_master_nonbib_dict() # Populate the new protobuf structure with link data - self._populate_new_links_structure(return_value['data_links_rows']) + self._populate_new_links_structure(return_value['data_links_rows'], master_template) # Add computed fields for field_name, field_config in computed_fields.items(): @@ -203,7 +204,7 @@ def _convert(self, passed): } for field in unused_fields: return_value.pop(field, None) - return_value.update(self.master_protobuf) + return_value.update(master_template) return_value.pop('data_links_rows') return return_value @@ -414,7 +415,7 @@ def _compute_bibgroup_facet(self, d): bibgroup_facet = sorted(list(set(bibgroup))) return {'bibgroup_facet': bibgroup_facet} - def _populate_new_links_structure(self, data_links_rows): + def _populate_new_links_structure(self, data_links_rows, master_template): """Populate the new protobuf links structure from data_links_rows. Maps the flat data_links_rows into the hierarchical links structure.""" @@ -440,24 +441,25 @@ def _populate_new_links_structure(self, data_links_rows): # Handle DATA and ESOURCE which have sub_type structure if mapped_type in ('DATA', 'ESOURCE'): sub_type = row.get('link_sub_type', '') - if sub_type not in self.master_protobuf['links'][mapped_type]: - self.master_protobuf['links'][mapped_type][sub_type] = { + if sub_type not in master_template['links'][mapped_type]: + master_template['links'][mapped_type][sub_type] = { 'url': [], 'title': [], 'count': 0 } if 'url' in row: - self.master_protobuf['links'][mapped_type][sub_type]['url'].extend(row['url']) + master_template['links'][mapped_type][sub_type]['url'].extend(row['url']) if 'title' in row: - self.master_protobuf['links'][mapped_type][sub_type]['title'].extend(row['title']) + master_template['links'][mapped_type][sub_type]['title'].extend(row['title']) if 'item_count' in row: - self.master_protobuf['links'][mapped_type][sub_type]['count'] = row['item_count'] + master_template['links'][mapped_type][sub_type]['count'] = row['item_count'] # Handle other link types with direct structure else: if 'url' in row: - self.master_protobuf['links'][mapped_type]['url'].extend(row['url']) + master_template['links'][mapped_type]['url'].extend(row['url']) if 'title' in row: - self.master_protobuf['links'][mapped_type]['title'].extend(row['title']) + master_template['links'][mapped_type]['title'].extend(row['title']) if 'item_count' in row: - self.master_protobuf['links'][mapped_type]['count'] = row['item_count'] \ No newline at end of file + master_template['links'][mapped_type]['count'] = row['item_count'] + return master_template \ No newline at end of file diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py index d6b852b..080c349 100644 --- a/adsdata/tests/test_process.py +++ b/adsdata/tests/test_process.py @@ -93,14 +93,14 @@ def test_nonbib_record(self): 'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M'], 'title': [''], 'count': 0}}, 'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, - 'ABSTRACT': True, + 'ABSTRACT': False, # Master Pipeline will set to True 'CITATIONS': False, - 'GRAPHICS': False, + 'GRAPHICS': False, # Master Pipeline will set to True 'METRICS': False, - 'OPENURL': False, + 'OPENURL': False, # Master Pipeline will set to True 'REFERENCES': False, 'TOC': False, - 'COREAD': False}} + 'COREAD': False}} # Master Pipeline will set to True self.assertEqual(a, n) self._validate_nonbib_structure(n) @@ -121,8 +121,8 @@ def test_nonbib_record(self): 'NED': {'url': ['https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M'], 'title': ['NED Objects (1953)'], 'count': 1953}, 'SIMBAD': {'url': ['http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M'], 'title': ['SIMBAD Objects (1)'], 'count': 1}, 'Vizier': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}}, - 'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M', 'http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M'], 'title': ['', ''], 'count': 0}, - 'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M', 'http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M'], 'title': ['', ''], 'count': 0}, + 'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}, + 'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}, 'PUB_HTML': {'url': ['http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, 'EPRINT_HTML': {'url': ['https://arxiv.org/abs/astro-ph/0405472'], 'title': [''], 'count': 0}, 'PUB_PDF': {'url': ['https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, @@ -130,14 +130,14 @@ def test_nonbib_record(self): 'ASSOCIATED': {'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'count': 0}, 'INSPIRE': {'url': ['http://inspirehep.net/search?p=find+j+MNRAA,354,L31'], 'title': [''], 'count': 0}, 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, - 'ABSTRACT': True, + 'ABSTRACT': False, # Master Pipeline will set to True 'CITATIONS': False, - 'GRAPHICS': False, + 'GRAPHICS': False, # Master Pipeline will set to True 'METRICS': False, - 'OPENURL': False, + 'OPENURL': False, # Master Pipeline will set to True 'REFERENCES': False, 'TOC': False, - 'COREAD': False}} + 'COREAD': False}} # Master Pipeline will set to True v_boost = v.pop('boost') a_boost = a.pop('boost') self.assertAlmostEqual(a_boost, v_boost) @@ -367,3 +367,87 @@ def test_compute_bibgroup_facet(self): self.assertEqual({'bibgroup_facet': ['a']}, p._compute_bibgroup_facet({'bibgroup': ['a']})) self.assertEqual({'bibgroup_facet': ['a', 'b']}, p._compute_bibgroup_facet({'bibgroup': ['a', 'b']})) self.assertEqual({'bibgroup_facet': ['a', 'b']}, p._compute_bibgroup_facet({'bibgroup': ['a', 'b', 'a']})) + + def test_multiple_bibcodes_no_link_leakage(self): + """Verify links don't leak between bibcodes when processing sequentially""" + self.maxDiff = None + + with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}): + # Process bibcode A - has ADS_PDF and ADS_SCAN esources + bibcode_a = '2003ASPC..295..361M' + d_a = processor._read_next_bibcode(bibcode_a) + result_a = processor._convert(d_a) + + # Verify A has only its own ESOURCE links + self.assertIn('ESOURCE', result_a['links']) + esource_a = result_a['links']['ESOURCE'] + self.assertIn('ADS_PDF', esource_a) + self.assertIn('ADS_SCAN', esource_a) + + # Store A's link counts for later comparison + ads_pdf_urls_a = list(esource_a['ADS_PDF']['url']) + ads_scan_urls_a = list(esource_a['ADS_SCAN']['url']) + + # Verify A has only one URL per link type (its own) + self.assertEqual(len(ads_pdf_urls_a), 1, + f"Bibcode A should have exactly 1 ADS_PDF URL, got {len(ads_pdf_urls_a)}") + self.assertEqual(len(ads_scan_urls_a), 1, + f"Bibcode A should have exactly 1 ADS_SCAN URL, got {len(ads_scan_urls_a)}") + + # Verify URLs contain the correct bibcode + self.assertIn(bibcode_a, ads_pdf_urls_a[0]) + self.assertIn(bibcode_a, ads_scan_urls_a[0]) + + # Now process bibcode B - has different esources (includes PUB_HTML, EPRINT_HTML, etc.) + bibcode_b = '2004MNRAS.354L..31M' + d_b = processor._read_next_bibcode(bibcode_b) + result_b = processor._convert(d_b) + + # Verify B has its own ESOURCE links + self.assertIn('ESOURCE', result_b['links']) + esource_b = result_b['links']['ESOURCE'] + + # B should have ADS_PDF and ADS_SCAN (from its own data) + self.assertIn('ADS_PDF', esource_b) + self.assertIn('ADS_SCAN', esource_b) + + # B should have only its own URLs, NOT A's URLs + ads_pdf_urls_b = esource_b['ADS_PDF']['url'] + ads_scan_urls_b = esource_b['ADS_SCAN']['url'] + + # Check that B's URLs don't contain A's bibcode + for url in ads_pdf_urls_b: + self.assertNotIn(bibcode_a, url, + f"Bibcode B's ADS_PDF links leaked bibcode A's URL: {url}") + + for url in ads_scan_urls_b: + self.assertNotIn(bibcode_a, url, + f"Bibcode B's ADS_SCAN links leaked bibcode A's URL: {url}") + + # Verify B has its own bibcode in its URLs + b_pdf_has_own_bibcode = any(bibcode_b in url for url in ads_pdf_urls_b) + b_scan_has_own_bibcode = any(bibcode_b in url for url in ads_scan_urls_b) + + self.assertTrue(b_pdf_has_own_bibcode, + f"Bibcode B should have its own bibcode in ADS_PDF URLs") + self.assertTrue(b_scan_has_own_bibcode, + f"Bibcode B should have its own bibcode in ADS_SCAN URLs") + + # Also verify DATA links don't leak + # A has no DATA links, B has DATA links (CDS, NED, SIMBAD, Vizier) + data_a = result_a['links']['DATA'] + data_b = result_b['links']['DATA'] + + self.assertEqual(len(data_a), 0, "Bibcode A should have no DATA links") + self.assertGreater(len(data_b), 0, "Bibcode B should have DATA links") + + # Verify DATA subtypes in B + self.assertIn('CDS', data_b) + self.assertIn('NED', data_b) + self.assertIn('SIMBAD', data_b) + self.assertIn('Vizier', data_b) + + print(f"\n✅ Link leakage test passed!") + print(f" Bibcode A processed: {len(ads_pdf_urls_a)} ADS_PDF URLs, {len(ads_scan_urls_a)} ADS_SCAN URLs") + print(f" Bibcode B processed: {len(ads_pdf_urls_b)} ADS_PDF URLs, {len(ads_scan_urls_b)} ADS_SCAN URLs") + print(f" No links from A leaked into B") From 3c22513aadfe85a01e4daf357959db4878d3dd53 Mon Sep 17 00:00:00 2001 From: femalves Date: Thu, 4 Dec 2025 12:07:41 -0500 Subject: [PATCH 17/17] adding logs --- adsdata/process.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/adsdata/process.py b/adsdata/process.py index 736a215..6ec47d3 100644 --- a/adsdata/process.py +++ b/adsdata/process.py @@ -206,6 +206,7 @@ def _convert(self, passed): return_value.pop(field, None) return_value.update(master_template) return_value.pop('data_links_rows') + self.logger.debug('Processed nonbib data: {}'.format(return_value)) return return_value def _add_citation_count_fields(self, return_value, original): @@ -279,6 +280,7 @@ def _merge_data_links(self, datalinks): def _convert_data_link(self, filetype, value): """convert one data link row""" + self.logger.debug('Converting data link: {}'.format(value)) file_properties = self.data_dict[filetype] link_type = file_properties['extra_values']['link_type'] @@ -308,16 +310,18 @@ def _convert_data_link(self, filetype, value): link_data['title'] = value.get('title', ['']) link_data['item_count'] = value.get('item_count', 0) + self.logger.debug('Link data before conversion: {}'.format(link_data)) if isinstance(link_data['url'], str): link_data['url'] = [link_data['url']] if isinstance(link_data['title'], str): link_data['title'] = [link_data['title']] - + self.logger.debug('Link data after conversion: {}'.format(link_data)) elif not isinstance(value, bool): self.logger.error( f"Serious error in process.convert_data_link: unexpected type for value, filetype = {filetype}, " f"value = {value}, type of value = {type(value)}" ) + self.logger.debug('Converted data link: {}'.format(link_data)) return link_data def _read_next_bibcode(self, bibcode): @@ -418,6 +422,8 @@ def _compute_bibgroup_facet(self, d): def _populate_new_links_structure(self, data_links_rows, master_template): """Populate the new protobuf links structure from data_links_rows. Maps the flat data_links_rows into the hierarchical links structure.""" + + self.logger.debug('Populating new links structure: {}'.format(data_links_rows)) # Map for link types that need special handling link_type_mapping = { @@ -462,4 +468,5 @@ def _populate_new_links_structure(self, data_links_rows, master_template): master_template['links'][mapped_type]['title'].extend(row['title']) if 'item_count' in row: master_template['links'][mapped_type]['count'] = row['item_count'] + self.logger.debug('Populated new links structure: {}'.format(master_template)) return master_template \ No newline at end of file