From 70edb847aba583d394f8ee8c8298a0eefd2a5c32 Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Tue, 25 Feb 2025 16:49:29 -0500
Subject: [PATCH 01/17] changed _convert_data_link

---
 adsdata/process.py | 100 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 72 insertions(+), 28 deletions(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index 7e1874a..da00981 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -191,38 +191,82 @@ def _merge_data_links(self, datalinks):
                     new_datalinks.append(first)
             return new_datalinks
 
+    # def _convert_data_link(self, filetype, value):
+    #     """convert one data link row"""
+    #     file_properties = self.data_dict[filetype] #data_files[filetype]
+    #     d = {}
+    #     d['link_type'] = file_properties['extra_values']['link_type']
+    #     link_sub_type_suffix = ''
+    #     if value is dict and 'subparts' in value and 'item_count' in value['subparts']:
+    #         link_sub_type_suffix = ' ' + str(value['subparts']['item_count'])
+    #     if value is True:
+    #         d['link_sub_type'] = file_properties['extra_values']['link_sub_type'] + link_sub_type_suffix
+    #     elif 'link_sub_type' in value:
+    #         d['link_sub_type'] = value['link_sub_type'] + link_sub_type_suffix
+    #     elif 'link_sub_type' in file_properties['extra_values']:
+    #         d['link_sub_type'] = file_properties['extra_values']['link_sub_type'] + link_sub_type_suffix
+    #     if type(value) is bool:
+    #         d['url'] = ['']
+    #         d['title'] = ['']
+    #         d['item_count'] = 0
+    #     elif type(value) is dict:
+    #         d['url'] = value.get('url', [''])
+    #         if type(d['url']) is str:
+    #             d['url'] = [d['url']]
+    #         d['title'] = value.get('title', [''])
+    #         if type(d['title']) is str:
+    #             d['title'] = [d['title']]
+    #         # if d['title'] == ['']:
+    #         #    d.pop('title')  # to match old pipeline
+    #         d['item_count'] = value.get('item_count', 0)
+    #     else:
+    #         self.logger.error('serious error in process.convert_data_link: unexpected type for value, filetype = {}, value = {}, type of value = {}'.format(filetype, value, type(value)))
+    #     breakpoint() # {'link_type': 'ESOURCE', 'link_sub_type': 'ADS_PDF', 'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'item_count': 0}
+    #     return d
+
     def _convert_data_link(self, filetype, value):
         """convert one data link row"""
-        file_properties = self.data_dict[filetype] #data_files[filetype]
-        d = {}
-        d['link_type'] = file_properties['extra_values']['link_type']
+        
+        file_properties = self.data_dict[filetype]
+
+        link_type = file_properties['extra_values']['link_type']
+        link_sub_type = file_properties['extra_values'].get('link_sub_type', '')
         link_sub_type_suffix = ''
-        if value is dict and 'subparts' in value and 'item_count' in value['subparts']:
-            link_sub_type_suffix = ' ' + str(value['subparts']['item_count'])
-        if value is True:
-            d['link_sub_type'] = file_properties['extra_values']['link_sub_type'] + link_sub_type_suffix
-        elif 'link_sub_type' in value:
-            d['link_sub_type'] = value['link_sub_type'] + link_sub_type_suffix
-        elif 'link_sub_type' in file_properties['extra_values']:
-            d['link_sub_type'] = file_properties['extra_values']['link_sub_type'] + link_sub_type_suffix
-        if type(value) is bool:
-            d['url'] = ['']
-            d['title'] = ['']
-            d['item_count'] = 0
-        elif type(value) is dict:
-            d['url'] = value.get('url', [''])
-            if type(d['url']) is str:
-                d['url'] = [d['url']]
-            d['title'] = value.get('title', [''])
-            if type(d['title']) is str:
-                d['title'] = [d['title']]
-            # if d['title'] == ['']:
-            #    d.pop('title')  # to match old pipeline
-            d['item_count'] = value.get('item_count', 0)
-        else:
-            self.logger.error('serious error in process.convert_data_link: unexpected type for value, filetype = {}, value = {}, type of value = {}'.format(filetype, value, type(value)))
 
-        return d
+        if isinstance(value, dict) and 'item_count' in (value.get('subparts') or {}):
+            link_sub_type_suffix = f" {value['subparts']['item_count']}"  # keep the separating space
+        
+        # A sub type carried by the row itself overrides the file-level default (pre-refactor precedence)
+        if isinstance(value, dict) and 'link_sub_type' in value:
+            link_sub_type = value['link_sub_type']
+        
+        link_sub_type += link_sub_type_suffix
+        
+        # Initialize result dictionary
+        link_data =  { 'link_type': link_type, 
+                        'link_sub_type': link_sub_type,
+                        "url": [""],
+                        "title": [""],
+                        "item_count": 0
+                    }
+                
+
+        if isinstance(value, dict):
+            link_data['url'] = value.get('url', [''])
+            link_data['title'] = value.get('title', [''])
+            link_data['item_count'] = value.get('item_count', 0)
+            
+            if isinstance(link_data['url'], str):
+                link_data['url'] = [link_data['url']]
+            if isinstance(link_data['title'], str):
+                link_data['title'] = [link_data['title']]
+        
+        elif not isinstance(value, bool):
+            self.logger.error(
+                f"Serious error in process.convert_data_link: unexpected type for value, filetype = {filetype}, "
+                f"value = {value}, type of value = {type(value)}"
+            )
+        return link_data
 
     def _read_next_bibcode(self, bibcode):
         """read all the info for the passed bibcode into a dict"""

From 855675ccd3577b7cf22ec2030a90e17160f219c1 Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Wed, 26 Feb 2025 16:51:45 -0500
Subject: [PATCH 02/17] changed all complex fields

---
 adsdata/process.py | 255 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 180 insertions(+), 75 deletions(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index da00981..2133ec4 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -19,6 +19,7 @@ def __init__(self, compute_metrics=True, compute_CC = False):
             self.data_dict = data_files
         self.logger = tasks.app.logger
         self.readers = {}
+        self.nonbib_dict = self._get_nonbib_dict()
 
     def __enter__(self):
         self._open_all()
@@ -27,6 +28,30 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_value, traceback):
         self._close_all()
 
+    def _get_nonbib_dict(self): 
+        return {
+        "bibcode": '', # OK
+        "identifier": [""], # MP
+        "links": {
+            "ARXIV": [""], # MP
+            "DOI": [""], # MP
+            "DATA": {}, # OK
+            "ESOURCE": {}, # OK 
+            "ASSOCIATED": {},# OK 
+            "INSPIRE": {},# OK 
+            "LIBRARYCATALOG": {},# OK 
+            "PRESENTATION": {},# OK 
+            "ABSTRACT": False,
+            "CITATIONS": False, # OK
+            "GRAPHICS": False,
+            "METRICS": False,
+            "OPENURL": False, # MP
+            "REFERENCES": False, #OK
+            "TOC": False, # OK
+            "COREAD": False # MP
+            }
+        }
+    
     def process_bibcodes(self, bibcodes):
         """send nonbib and metrics records to master for the passed bibcodes
         for each bibcode
@@ -53,81 +78,161 @@ def process_bibcodes(self, bibcodes):
         if not self.compute_CC: tasks.task_output_nonbib.delay(nonbib_protos)
         tasks.task_output_metrics.delay(metrics_protos)
 
-    def _convert(self, passed):
-        """convert full nonbib dict to what is needed for nonbib protobuf
-        data links values are read from separate files so they are in separate dicts
-            they must be merged into one field for the protobuf
-        a couple fields are summarized
-        some other fields are just copied
-        some fields are deleted
-        """
-        return_value = {}
-        return_value['data_links_rows'] = []
-        return_value['property'] = set()
-        return_value['esource'] = set()
-        for filetype, value in passed.items():
-            file_properties = self.data_dict[filetype] #data_files[filetype]
-            if filetype == 'canonical':
-                return_value['bibcode'] = passed['canonical']
-            if (value is dict and dict and 'property' in value[filetype]):
-                return_value['property'].update(value[filetype]['property'])
-            if (type(file_properties['default_value']) is bool):
-                return_value[filetype] = value[filetype]
-                value = value[filetype]
-            if ('extra_values' in file_properties and 'link_type' in file_properties['extra_values'] and value != file_properties['default_value']):
-                # here with one or more real datalinks value(s)
-                # add each data links dict to existing list of dicts
-                # tweak some values (e.g., sub_link_type) in original dict
-                if type(value) is bool or type(value) is dict:
-                    d = self._convert_data_link(filetype, value)
-                    return_value['data_links_rows'].append(d)
-                elif type(value) is list:
-                    for v in value:
-                        d = self._convert_data_link(filetype, v)
-                        return_value['data_links_rows'].append(d)
-                else:
-                    self.logger.error('serious error in process._convert with {} {} {}'.format(filetype, type(value), value))
-
-                if file_properties['extra_values']['link_type'] == 'ESOURCE':
-                    return_value['esource'].add(file_properties['extra_values']['link_sub_type'])
-                return_value['property'].add(file_properties['extra_values']['link_type'])
-                return_value['property'].update(file_properties['extra_values'].get('property', []))
-            elif ('extra_values' in file_properties and value != file_properties['default_value']):
-                if 'property' in file_properties['extra_values']:
-                    return_value['property'].update(file_properties['extra_values']['property'])
-
-            elif value != file_properties['default_value'] or file_properties.get('copy_default', False):
-                # otherwise, copy value
-                return_value[filetype] = passed[filetype]
-            if filetype == 'relevance':
-                for k in passed[filetype]:
-                    # simply add all dict value to top level
-                    return_value[k] = passed[filetype][k]
-
-        self._add_refereed_property(return_value)
-        self._add_article_property(return_value, passed)
-        return_value['property'] = sorted(return_value['property'])
-        return_value['esource'] = sorted(return_value['esource'])
-        self._add_data_summary(return_value)
-        return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows'])
-        self._add_citation_count_fields(return_value, passed)
-
-        # time for computed fields
-        for k, v in computed_fields.items():
-            f = getattr(self, v['converter_function'], None)
-            if f is None:
-                self.logger.error('serious error in process._covert, expected converter_function {} for field {} not found'.format(v['converter_function'], k))
+
+    # def _convert(self, passed:dict):
+    #     """convert full nonbib dict to what is needed for nonbib protobuf
+    #     data links values are read from separate files so they are in separate dicts
+    #         they must be merged into one field for the protobuf
+    #     a couple fields are summarized
+    #     some other fields are just copied
+    #     some fields are deleted
+    #     """
+
+        
+    #     self.nonbib_dict["bibcode"] = passed['canonical']
+    #     self.nonbib_dict["links"]["CITATIONS"] = len(passed['citation']) > 0
+    #     self.nonbib_dict["links"]["REFERENCES"] = len(passed['reference']) > 0
+
+    #     for filetype, value in passed.items():
+    #         file_properties = self.data_dict[filetype] #data_files[filetype]
+    
+    #         not_default_value = value != file_properties['default_value']
+    #         link_type = file_properties.get('extra_values', {}).get('link_type', '')
+
+    #         if ('extra_values' in file_properties and 'link_type' in file_properties['extra_values'] and value != file_properties['default_value']):
+    #             if filetype.upper() == 'TOC': 
+    #                 self.nonbib_dict['links']['TOC'] = True 
+    #             else:
+                    
+    #                 self._handle_data_link(filetype, value)
+        
+    #     breakpoint()
+
+           
+    #     self._add_article_property(return_value, passed)
+    #     return_value['esource'] = sorted(return_value['esource'])
+    #     self._add_data_summary(return_value)
+    #     return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows'])
+    #     self._add_citation_count_fields(return_value, passed)
+
+    #     # time for computed fields
+    #     for k, v in computed_fields.items():
+    #         f = getattr(self, v['converter_function'], None)
+    #         if f is None:
+    #             self.logger.error('serious error in process._covert, expected converter_function {} for field {} not found'.format(v['converter_function'], k))
+    #         else:
+    #             x = f(return_value)
+    #             return_value.update(x)
+
+        
+        
+    #     return return_value
+
+
+    def _handle_data_link(self, filetype, value): 
+        result = []
+        if isinstance(value, dict): # ESOURCE, ASSOCIATED, LIBRARYCATALOG, PRESENTATION, INSPIRE
+            d = self._convert_data_link(filetype, value)
+            result.append(d)
+        elif isinstance(value, list): # DATA
+            for v in value:
+                d = self._convert_data_link(filetype, v)
+                result.append(d)
+        elif not isinstance(value, bool):
+            self.logger.error('serious error in process._convert with {} {} {}'.format(filetype, type(value), value))
+        
+        for d in result: 
+            link_type = d.get('link_type', '')
+            link_sub_type = d.get('link_sub_type', '')
+
+            del d['link_type']
+            del d['link_sub_type']
+            
+            #   {'link_type': 'ESOURCE', 'link_sub_type': 'ADS_PDF', 'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'item_count': 0}
+            #   Not here anymore {'link_type': 'TOC', 'link_sub_type': 'NA', 'url': [''], 'title': [''], 'item_count': 0}
+            #   {'link_type': 'ASSOCIATED', 'link_sub_type': 'NA', 'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'item_count': 0}
+            if link_type == 'ESOURCE' or link_type == 'DATA': 
+                self.nonbib_dict['links'][link_type].update({link_sub_type: d})
             else:
-                x = f(return_value)
-                return_value.update(x)
-
-        # finally, delete the keys not in the nonbib protobuf
-        not_needed = ['author', 'canonical', 'citation', 'deleted', 'deprecated_citation_count', 'doi', 'download', 'item_count', 'nonarticle',
-                      'ocrabstract', 'preprint', 'private', 'pub_openaccess', 'pub2arxiv',
-                      'reads', 'refereed', 'relevance', 'toc']
-        for n in not_needed:
-            return_value.pop(n, None)
-        return return_value
+                self.nonbib_dict['links'][link_type].update(d)
+       
+    
+    # def _convert(self, passed):
+    #     """convert full nonbib dict to what is needed for nonbib protobuf
+    #     data links values are read from separate files so they are in separate dicts
+    #         they must be merged into one field for the protobuf
+    #     a couple fields are summarized
+    #     some other fields are just copied
+    #     some fields are deleted
+    #     """
+    #     return_value = {'data_links_rows': [], 
+    #                     'property': set(), 
+    #                     "esource": set()}
+        
+    #     for filetype, value in passed.items():
+    #         file_properties = self.data_dict[filetype] #data_files[filetype]
+    #         if filetype == 'canonical':
+    #             return_value['bibcode'] = passed['canonical']
+    #         if (value is dict and dict and 'property' in value[filetype]):
+    #             return_value['property'].update(value[filetype]['property'])
+    #         if (type(file_properties['default_value']) is bool):
+    #             return_value[filetype] = value[filetype]
+    #             value = value[filetype]
+    #         if ('extra_values' in file_properties and 'link_type' in file_properties['extra_values'] and value != file_properties['default_value']):
+    #             # here with one or more real datalinks value(s)
+    #             # add each data links dict to existing list of dicts
+    #             # tweak some values (e.g., sub_link_type) in original dict
+
+    #             if type(value) is bool or type(value) is dict:
+    #                 d = self._convert_data_link(filetype, value)
+    #                 return_value['data_links_rows'].append(d)
+    #             elif type(value) is list:
+    #                 for v in value:
+    #                     d = self._convert_data_link(filetype, v)
+    #                     return_value['data_links_rows'].append(d)
+    #             else:
+    #                 self.logger.error('serious error in process._convert with {} {} {}'.format(filetype, type(value), value))
+
+    #             if file_properties['extra_values']['link_type'] == 'ESOURCE':
+    #                 return_value['esource'].add(file_properties['extra_values']['link_sub_type'])
+    #             return_value['property'].add(file_properties['extra_values']['link_type'])
+    #             return_value['property'].update(file_properties['extra_values'].get('property', []))
+    #         elif ('extra_values' in file_properties and value != file_properties['default_value']):
+    #             if 'property' in file_properties['extra_values']:
+    #                 return_value['property'].update(file_properties['extra_values']['property'])
+
+    #         elif value != file_properties['default_value'] or file_properties.get('copy_default', False):
+    #             # otherwise, copy value
+    #             return_value[filetype] = passed[filetype]
+    #         if filetype == 'relevance':
+    #             for k in passed[filetype]:
+    #                 # simply add all dict value to top level
+    #                 return_value[k] = passed[filetype][k]
+
+    #     self._add_refereed_property(return_value)
+    #     self._add_article_property(return_value, passed)
+    #     return_value['property'] = sorted(return_value['property'])
+    #     return_value['esource'] = sorted(return_value['esource'])
+    #     self._add_data_summary(return_value)
+    #     return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows'])
+    #     self._add_citation_count_fields(return_value, passed)
+
+    #     # time for computed fields
+    #     for k, v in computed_fields.items():
+    #         f = getattr(self, v['converter_function'], None)
+    #         if f is None:
+    #             self.logger.error('serious error in process._covert, expected converter_function {} for field {} not found'.format(v['converter_function'], k))
+    #         else:
+    #             x = f(return_value)
+    #             return_value.update(x)
+
+    #     # finally, delete the keys not in the nonbib protobuf
+    #     not_needed = ['author', 'canonical', 'citation', 'deleted', 'deprecated_citation_count', 'doi', 'download', 'item_count', 'nonarticle',
+    #                   'ocrabstract', 'preprint', 'private', 'pub_openaccess', 'pub2arxiv',
+    #                   'reads', 'refereed', 'relevance', 'toc']
+    #     for n in not_needed:
+    #         return_value.pop(n, None)
+    #     return return_value
 
     def _add_citation_count_fields(self, return_value, original):
         author_count = len(original.get('author', ()))
@@ -224,6 +329,7 @@ def _merge_data_links(self, datalinks):
     #     breakpoint() # {'link_type': 'ESOURCE', 'link_sub_type': 'ADS_PDF', 'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'item_count': 0}
     #     return d
 
+    #TODO: remove 'TOC' from here
     def _convert_data_link(self, filetype, value):
         """convert one data link row"""
         
@@ -250,7 +356,6 @@ def _convert_data_link(self, filetype, value):
                         "item_count": 0
                     }
                 
-
         if isinstance(value, dict):
             link_data['url'] = value.get('url', [''])
             link_data['title'] = value.get('title', [''])

From 85b238ffb4c37db578f49dddd5ff88fb5791851b Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Tue, 11 Mar 2025 19:20:55 -0400
Subject: [PATCH 03/17] populate new protobuf

---
 adsdata/process.py | 425 ++++++++++++++++++++++-----------------------
 1 file changed, 211 insertions(+), 214 deletions(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index 2133ec4..7d953a5 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -1,4 +1,3 @@
-
 from datetime import datetime
 from collections import defaultdict
 
@@ -19,7 +18,8 @@ def __init__(self, compute_metrics=True, compute_CC = False):
             self.data_dict = data_files
         self.logger = tasks.app.logger
         self.readers = {}
-        self.nonbib_dict = self._get_nonbib_dict()
+        self.new_protobuf_template = self._get_nonbib_dict()
+        
 
     def __enter__(self):
         self._open_all()
@@ -28,30 +28,47 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_value, traceback):
         self._close_all()
 
-    def _get_nonbib_dict(self): 
+    def _get_nonbib_dict(self):
+        # Template for the new protobuf structure
         return {
-        "bibcode": '', # OK
-        "identifier": [""], # MP
-        "links": {
-            "ARXIV": [""], # MP
-            "DOI": [""], # MP
-            "DATA": {}, # OK
-            "ESOURCE": {}, # OK 
-            "ASSOCIATED": {},# OK 
-            "INSPIRE": {},# OK 
-            "LIBRARYCATALOG": {},# OK 
-            "PRESENTATION": {},# OK 
-            "ABSTRACT": False,
-            "CITATIONS": False, # OK
-            "GRAPHICS": False,
-            "METRICS": False,
-            "OPENURL": False, # MP
-            "REFERENCES": False, #OK
-            "TOC": False, # OK
-            "COREAD": False # MP
+            "identifier": [], #MP
+            "links": {
+                "ARXIV": [], #MP
+                "DOI": [],#MP
+                "DATA": {},
+                "ESOURCE": {},
+                "ASSOCIATED": {
+                    "url": [],
+                    "title": [],
+                    "count": 0
+                },
+                "INSPIRE": {
+                    "url": [],
+                    "title": [],
+                    "count": 0
+                },
+                "LIBRARYCATALOG": {
+                    "url": [],
+                    "title": [],
+                    "count": 0
+                },
+                "PRESENTATION": {
+                    "url": [],
+                    "title": [],
+                    "count": 0
+                },
+                "ABSTRACT": False, #MP
+                "CITATIONS": False,
+                "GRAPHICS": False,#MP
+                "METRICS": False, #MP
+                "OPENURL": False, #MP
+                "REFERENCES": False,
+                "TOC": False,
+                "COREAD": False #MP
             }
         }
-    
+
+        
     def process_bibcodes(self, bibcodes):
         """send nonbib and metrics records to master for the passed bibcodes
         for each bibcode
@@ -78,165 +95,127 @@ def process_bibcodes(self, bibcodes):
         if not self.compute_CC: tasks.task_output_nonbib.delay(nonbib_protos)
         tasks.task_output_metrics.delay(metrics_protos)
 
-
-    # def _convert(self, passed:dict):
-    #     """convert full nonbib dict to what is needed for nonbib protobuf
-    #     data links values are read from separate files so they are in separate dicts
-    #         they must be merged into one field for the protobuf
-    #     a couple fields are summarized
-    #     some other fields are just copied
-    #     some fields are deleted
-    #     """
-
-        
-    #     self.nonbib_dict["bibcode"] = passed['canonical']
-    #     self.nonbib_dict["links"]["CITATIONS"] = len(passed['citation']) > 0
-    #     self.nonbib_dict["links"]["REFERENCES"] = len(passed['reference']) > 0
-
-    #     for filetype, value in passed.items():
-    #         file_properties = self.data_dict[filetype] #data_files[filetype]
     
-    #         not_default_value = value != file_properties['default_value']
-    #         link_type = file_properties.get('extra_values', {}).get('link_type', '')
-
-    #         if ('extra_values' in file_properties and 'link_type' in file_properties['extra_values'] and value != file_properties['default_value']):
-    #             if filetype.upper() == 'TOC': 
-    #                 self.nonbib_dict['links']['TOC'] = True 
-    #             else:
-                    
-    #                 self._handle_data_link(filetype, value)
-        
-    #     breakpoint()
-
-           
-    #     self._add_article_property(return_value, passed)
-    #     return_value['esource'] = sorted(return_value['esource'])
-    #     self._add_data_summary(return_value)
-    #     return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows'])
-    #     self._add_citation_count_fields(return_value, passed)
-
-    #     # time for computed fields
-    #     for k, v in computed_fields.items():
-    #         f = getattr(self, v['converter_function'], None)
-    #         if f is None:
-    #             self.logger.error('serious error in process._covert, expected converter_function {} for field {} not found'.format(v['converter_function'], k))
-    #         else:
-    #             x = f(return_value)
-    #             return_value.update(x)
-
+    def _convert(self, passed):
+        """Convert full nonbib dict to what is needed for nonbib protobuf.
         
+        Data links values are read from separate files and merged into one field.
+        The method handles:
+        - Data link processing and merging
+        - Property aggregation
+        - Field summarization and copying
+        - Computed field generation
+        - Cleanup of unused fields
         
-    #     return return_value
-
-
-    def _handle_data_link(self, filetype, value): 
-        result = []
-        if isinstance(value, dict): # ESOURCE, ASSOCIATED, LIBRARYCATALOG, PRESENTATION, INSPIRE
-            d = self._convert_data_link(filetype, value)
-            result.append(d)
-        elif isinstance(value, list): # DATA
-            for v in value:
-                d = self._convert_data_link(filetype, v)
-                result.append(d)
-        elif not isinstance(value, bool):
-            self.logger.error('serious error in process._convert with {} {} {}'.format(filetype, type(value), value))
+        Args:
+            passed (dict): Raw data dictionary containing all input fields
+            
+        Returns:
+            dict: Processed data ready for nonbib protobuf
+        """
+        # Initialize return structure
+        return_value = {
+            "data_links_rows": [], 
+            "property": set(), 
+            "esource": set()
+        }
+          
+        for filetype, value in passed.items():
+            file_properties = self.data_dict[filetype]
+            default_value = file_properties.get('default_value')
+            extra_values = file_properties.get('extra_values', {})
+          
+            # Handle special cases first
+            if filetype == 'canonical':
+                return_value['bibcode'] = passed['canonical']
+                continue
+            
+            if filetype == 'relevance':
+                return_value.update(passed[filetype])
+                continue
         
-        for d in result: 
-            link_type = d.get('link_type', '')
-            link_sub_type = d.get('link_sub_type', '')
-
-            del d['link_type']
-            del d['link_sub_type']
+            # Handle boolean fields and TOC
+            if isinstance(default_value, bool):
+                if filetype == 'toc':
+                    self.new_protobuf_template['links']['TOC'] = value[filetype]
+                
+                return_value[filetype] = value[filetype]
+                value = value[filetype]
             
-            #   {'link_type': 'ESOURCE', 'link_sub_type': 'ADS_PDF', 'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'item_count': 0}
-            #   Not here anymore {'link_type': 'TOC', 'link_sub_type': 'NA', 'url': [''], 'title': [''], 'item_count': 0}
-            #   {'link_type': 'ASSOCIATED', 'link_sub_type': 'NA', 'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'item_count': 0}
-            if link_type == 'ESOURCE' or link_type == 'DATA': 
-                self.nonbib_dict['links'][link_type].update({link_sub_type: d})
+            # Process data links
+            if 'link_type' in extra_values and value != default_value:
+                # Convert and add data links
+                if isinstance(value, (bool, dict)):
+                    return_value['data_links_rows'].append(
+                        self._convert_data_link(filetype, value))
+                elif isinstance(value, list):
+                    return_value['data_links_rows'].extend(
+                        self._convert_data_link(filetype, v) for v in value)
+                else:
+                    self.logger.error(
+                        f'serious error in process._convert with {filetype} {type(value)} {value}')
+                    continue
+                
+                # Update esource and properties
+                link_type = extra_values['link_type']
+                if link_type == 'ESOURCE':
+                    return_value['esource'].add(extra_values['link_sub_type'])
+                return_value['property'].add(link_type)
+                return_value['property'].update(extra_values.get('property', []))
+            
+            # Handle properties
+            elif extra_values and value != default_value:
+                if 'property' in extra_values:
+                    return_value['property'].update(extra_values['property'])
+            
+            # Copy remaining fields if needed
+            elif value != default_value or file_properties.get('copy_default', False):
+                return_value[filetype] = passed[filetype]
+        
+        # Add computed properties
+        self._add_refereed_property(return_value)
+        self._add_article_property(return_value, passed)
+        self._add_data_summary(return_value)
+        self._add_citation_count_fields(return_value, passed)
+        
+        # Sort sets
+        return_value['property'] = sorted(return_value['property'])
+        return_value['esource'] = sorted(return_value['esource'])
+        
+        # Merge and process data links
+        return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows'])
+        
+        # Populate the new protobuf structure with link data
+        self._populate_new_links_structure(return_value['data_links_rows'])
+        
+        # Populate the boolean flags
+        self._populate_link_flags(passed)
+        
+        # Add computed fields
+        for field_name, field_config in computed_fields.items():
+            converter = getattr(self, field_config['converter_function'], None)
+            if converter:
+                return_value.update(converter(return_value))
             else:
-                self.nonbib_dict['links'][link_type].update(d)
-       
-    
-    # def _convert(self, passed):
-    #     """convert full nonbib dict to what is needed for nonbib protobuf
-    #     data links values are read from separate files so they are in separate dicts
-    #         they must be merged into one field for the protobuf
-    #     a couple fields are summarized
-    #     some other fields are just copied
-    #     some fields are deleted
-    #     """
-    #     return_value = {'data_links_rows': [], 
-    #                     'property': set(), 
-    #                     "esource": set()}
+                self.logger.error(
+                    f'serious error in process._convert, expected converter_function '
+                    f'{field_config["converter_function"]} for field {field_name} not found')
         
-    #     for filetype, value in passed.items():
-    #         file_properties = self.data_dict[filetype] #data_files[filetype]
-    #         if filetype == 'canonical':
-    #             return_value['bibcode'] = passed['canonical']
-    #         if (value is dict and dict and 'property' in value[filetype]):
-    #             return_value['property'].update(value[filetype]['property'])
-    #         if (type(file_properties['default_value']) is bool):
-    #             return_value[filetype] = value[filetype]
-    #             value = value[filetype]
-    #         if ('extra_values' in file_properties and 'link_type' in file_properties['extra_values'] and value != file_properties['default_value']):
-    #             # here with one or more real datalinks value(s)
-    #             # add each data links dict to existing list of dicts
-    #             # tweak some values (e.g., sub_link_type) in original dict
-
-    #             if type(value) is bool or type(value) is dict:
-    #                 d = self._convert_data_link(filetype, value)
-    #                 return_value['data_links_rows'].append(d)
-    #             elif type(value) is list:
-    #                 for v in value:
-    #                     d = self._convert_data_link(filetype, v)
-    #                     return_value['data_links_rows'].append(d)
-    #             else:
-    #                 self.logger.error('serious error in process._convert with {} {} {}'.format(filetype, type(value), value))
-
-    #             if file_properties['extra_values']['link_type'] == 'ESOURCE':
-    #                 return_value['esource'].add(file_properties['extra_values']['link_sub_type'])
-    #             return_value['property'].add(file_properties['extra_values']['link_type'])
-    #             return_value['property'].update(file_properties['extra_values'].get('property', []))
-    #         elif ('extra_values' in file_properties and value != file_properties['default_value']):
-    #             if 'property' in file_properties['extra_values']:
-    #                 return_value['property'].update(file_properties['extra_values']['property'])
-
-    #         elif value != file_properties['default_value'] or file_properties.get('copy_default', False):
-    #             # otherwise, copy value
-    #             return_value[filetype] = passed[filetype]
-    #         if filetype == 'relevance':
-    #             for k in passed[filetype]:
-    #                 # simply add all dict value to top level
-    #                 return_value[k] = passed[filetype][k]
-
-    #     self._add_refereed_property(return_value)
-    #     self._add_article_property(return_value, passed)
-    #     return_value['property'] = sorted(return_value['property'])
-    #     return_value['esource'] = sorted(return_value['esource'])
-    #     self._add_data_summary(return_value)
-    #     return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows'])
-    #     self._add_citation_count_fields(return_value, passed)
-
-    #     # time for computed fields
-    #     for k, v in computed_fields.items():
-    #         f = getattr(self, v['converter_function'], None)
-    #         if f is None:
-    #             self.logger.error('serious error in process._covert, expected converter_function {} for field {} not found'.format(v['converter_function'], k))
-    #         else:
-    #             x = f(return_value)
-    #             return_value.update(x)
-
-    #     # finally, delete the keys not in the nonbib protobuf
-    #     not_needed = ['author', 'canonical', 'citation', 'deleted', 'deprecated_citation_count', 'doi', 'download', 'item_count', 'nonarticle',
-    #                   'ocrabstract', 'preprint', 'private', 'pub_openaccess', 'pub2arxiv',
-    #                   'reads', 'refereed', 'relevance', 'toc']
-    #     for n in not_needed:
-    #         return_value.pop(n, None)
-    #     return return_value
+        # Remove unused fields
+        unused_fields = {
+            'author', 'canonical', 'citation', 'deleted', 'deprecated_citation_count',
+            'doi', 'download', 'item_count', 'nonarticle', 'ocrabstract', 'preprint',
+            'private', 'pub_openaccess', 'pub2arxiv', 'reads', 'refereed',
+            'relevance', 'toc'
+        }
+        for field in unused_fields:
+            return_value.pop(field, None)
+        
+        return return_value
 
-    def _add_citation_count_fields(self, return_value, original):
-        author_count = len(original.get('author', ()))
-        citation_count = len(return_value.get('citation', ()))
+    def _add_citation_count_fields(self, return_value, passed):
+        author_count = len(passed.get('author', ()))
+        citation_count = len(passed.get('citation', ()))
         return_value['citation_count'] = citation_count
         return_value['citation_count_norm'] = citation_count / float(max(author_count, 1))
 
@@ -244,11 +223,11 @@ def _add_refereed_property(self, return_value):
         if'REFEREED' not in return_value['property']:
             return_value['property'].add('NOT REFEREED')
 
-    def _add_article_property(self, return_value, d):
-        x = d.get('nonarticle', False)
-        if type(x) is dict:
-            x = x['nonarticle']
-        if x:
+    def _add_article_property(self, return_value, passed):
+        nonarticle_value = passed.get('nonarticle', False)
+        if isinstance(nonarticle_value, dict):
+            nonarticle_value = nonarticle_value['nonarticle']
+        if nonarticle_value:
             return_value['property'].add('NONARTICLE')
         else:
             return_value['property'].add('ARTICLE')
@@ -296,40 +275,6 @@ def _merge_data_links(self, datalinks):
                     new_datalinks.append(first)
             return new_datalinks
 
-    # def _convert_data_link(self, filetype, value):
-    #     """convert one data link row"""
-    #     file_properties = self.data_dict[filetype] #data_files[filetype]
-    #     d = {}
-    #     d['link_type'] = file_properties['extra_values']['link_type']
-    #     link_sub_type_suffix = ''
-    #     if value is dict and 'subparts' in value and 'item_count' in value['subparts']:
-    #         link_sub_type_suffix = ' ' + str(value['subparts']['item_count'])
-    #     if value is True:
-    #         d['link_sub_type'] = file_properties['extra_values']['link_sub_type'] + link_sub_type_suffix
-    #     elif 'link_sub_type' in value:
-    #         d['link_sub_type'] = value['link_sub_type'] + link_sub_type_suffix
-    #     elif 'link_sub_type' in file_properties['extra_values']:
-    #         d['link_sub_type'] = file_properties['extra_values']['link_sub_type'] + link_sub_type_suffix
-    #     if type(value) is bool:
-    #         d['url'] = ['']
-    #         d['title'] = ['']
-    #         d['item_count'] = 0
-    #     elif type(value) is dict:
-    #         d['url'] = value.get('url', [''])
-    #         if type(d['url']) is str:
-    #             d['url'] = [d['url']]
-    #         d['title'] = value.get('title', [''])
-    #         if type(d['title']) is str:
-    #             d['title'] = [d['title']]
-    #         # if d['title'] == ['']:
-    #         #    d.pop('title')  # to match old pipeline
-    #         d['item_count'] = value.get('item_count', 0)
-    #     else:
-    #         self.logger.error('serious error in process.convert_data_link: unexpected type for value, filetype = {}, value = {}, type of value = {}'.format(filetype, value, type(value)))
-    #     breakpoint() # {'link_type': 'ESOURCE', 'link_sub_type': 'ADS_PDF', 'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'item_count': 0}
-    #     return d
-
-    #TODO: remove 'TOC' from here
     def _convert_data_link(self, filetype, value):
         """convert one data link row"""
         
@@ -346,10 +291,11 @@ def _convert_data_link(self, filetype, value):
         if not link_sub_type and isinstance(value, dict) and 'link_sub_type' in value:
             link_sub_type = value['link_sub_type']
         
+
         link_sub_type += link_sub_type_suffix
         
         # Initialize result dictionary
-        link_data =  { 'link_type': link_type, 
+        link_data =  {  'link_type': link_type, 
                         'link_sub_type': link_sub_type,
                         "url": [""],
                         "title": [""],
@@ -467,3 +413,54 @@ def _compute_bibgroup_facet(self, d):
             return {}
         bibgroup_facet = sorted(list(set(bibgroup)))
         return {'bibgroup_facet': bibgroup_facet}
+
+    def _populate_new_links_structure(self, data_links_rows):
+        """Populate the new protobuf links structure from data_links_rows.
+        Maps the flat data_links_rows into the hierarchical links structure."""
+        
+        # Allow-list of link types copied into the links structure (currently an identity map)
+        link_type_mapping = {
+            'DATA': 'DATA',
+            'ESOURCE': 'ESOURCE',
+            'ASSOCIATED': 'ASSOCIATED',
+            'INSPIRE': 'INSPIRE',
+            'LIBRARYCATALOG': 'LIBRARYCATALOG',
+            'PRESENTATION': 'PRESENTATION'
+        }
+        
+        for row in data_links_rows:
+            link_type = row['link_type']
+            
+            # Skip if not in our mapping
+            if link_type not in link_type_mapping:
+                continue
+                
+            mapped_type = link_type_mapping[link_type]
+            
+            # Handle DATA and ESOURCE which have sub_type structure
+            if mapped_type in ('DATA', 'ESOURCE'):
+                sub_type = row['link_sub_type']
+                if sub_type not in self.new_protobuf_template['links'][mapped_type]:
+                    self.new_protobuf_template['links'][mapped_type][sub_type] = {
+                        'url': [],
+                        'title': [],
+                        'count': 0
+                    }
+                self.new_protobuf_template['links'][mapped_type][sub_type]['url'].extend(row['url'])
+                self.new_protobuf_template['links'][mapped_type][sub_type]['title'].extend(row['title'])
+                self.new_protobuf_template['links'][mapped_type][sub_type]['count'] = row['item_count']
+            
+            # Handle other link types with direct structure
+            else:
+                self.new_protobuf_template['links'][mapped_type]['url'].extend(row['url'])
+                self.new_protobuf_template['links'][mapped_type]['title'].extend(row['title'])
+                self.new_protobuf_template['links'][mapped_type]['count'] = row['item_count']
+        
+
+    def _populate_link_flags(self, passed):
+        """Populate the boolean flags in the new protobuf links structure.
+        Sets CITATIONS and REFERENCES from the passed citation/reference lists; TOC is not set here."""
+    
+        self.new_protobuf_template['links']['CITATIONS'] = len(passed.get('citation', [])) > 0
+        self.new_protobuf_template['links']['REFERENCES'] = len(passed.get('reference', [])) > 0
+        
\ No newline at end of file

From 02094e0d6c1d144b104592127df03ec9d3da6364 Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Wed, 12 Mar 2025 17:12:04 -0400
Subject: [PATCH 04/17] modifying tests and removing test with old protobuf

---
 adsdata/process.py            |  2 ++
 adsdata/tests/test_process.py | 65 ++++++++++++++++++++++++++---------
 2 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index 7d953a5..c63ff72 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -210,6 +210,8 @@ def _convert(self, passed):
         }
         for field in unused_fields:
             return_value.pop(field, None)
+            
+        return_value.update(self.new_protobuf_template)
         
         return return_value
 
diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py
index 84d68fb..7b4abcd 100644
--- a/adsdata/tests/test_process.py
+++ b/adsdata/tests/test_process.py
@@ -72,26 +72,41 @@ def test_read(self):
             self.assertEqual(d['refereed'], {'refereed': False})
             self.assertEqual(d['planetary_feature'], ['Moon/Mare/Mare Imbrium/3678', 'Moon/Crater/Alder/171', 'Moon/Crater/Finsen/1959', 'Moon/Crater/Leibnitz/3335'])
 
-    def test_protobuf(self):
-        """make sure protobuf are created without an exception"""
-        with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):
-            d = processor._read_next_bibcode('1057wjlf.book.....C')
-            c = processor._convert(d)
-            nonbib = NonBibRecord(**c)
-            print('nonbib = {}'.format(nonbib))
+    # def test_protobuf(self):
+    #     """make sure protobuf are created without an exception"""
+    #     with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):
+    #         d = processor._read_next_bibcode('1057wjlf.book.....C')
+    #         c = processor._convert(d)
+    #         nonbib = NonBibRecord(**c)
+    #         print('nonbib = {}'.format(nonbib))
 
     def test_nonbib_record(self):
         self.maxDiff = None
         with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):
-            d = processor._read_next_bibcode('2003ASPC..295..361M')
-            n = processor._convert(d)
-            a = {"read_count": 4, "bibcode": "2003ASPC..295..361M",
-                 'bibgroup': ['Chandra Technical'], 'bibgroup_facet': ['Chandra Technical'],
-                 "data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']},
-                                     {"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']},
-                                     {"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}],
-                 "esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0}
-            self.assertEqual(a, n)
+            # d = processor._read_next_bibcode('2003ASPC..295..361M')
+            # n = processor._convert(d)
+            # a = {"read_count": 4, "bibcode": "2003ASPC..295..361M",
+            #      'bibgroup': ['Chandra Technical'], 'bibgroup_facet': ['Chandra Technical'],
+            #      "data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']},
+            #                          {"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']},
+            #                          {"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}],
+            #      "esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0}
+            # new_protobuf = {'identifier': [], 'links': {'ARXIV': [], 'DOI': [], 'DATA': {}, 
+            #                                             'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'count': 0}, 
+            #                                                         'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M'], 'title': [''], 'count': 0}}, 
+            #                                                         'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, 
+            #                                                         'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 
+            #                                                         'PRESENTATION': {'url': [], 'title': [], 'count': 0}, 
+            #                                                         'ABSTRACT': False, 
+            #                                                         'CITATIONS': False, 
+            #                                                         'GRAPHICS': False, 
+            #                                                         'METRICS': False, 
+            #                                                         'OPENURL': False, 
+            #                                                         'REFERENCES': False, 
+            #                                                         'TOC': True, 
+            #                                                         'COREAD': False}}
+            # a.update(new_protobuf)
+            # self.assertEqual(a, n)
 
             d = processor._read_next_bibcode('2004MNRAS.354L..31M')
             v = processor._convert(d)
@@ -118,9 +133,27 @@ def test_nonbib_record(self):
                  "total_link_counts": 1956,
                  "esource": ["ADS_PDF", "ADS_SCAN", "EPRINT_HTML", "EPRINT_PDF", "PUB_HTML", "PUB_PDF"],
                  "boost": 0.4399999976158142}
+
+            new_protobuf = {'identifier': [], 'links': {'ARXIV': [], 'DOI': [], 'DATA': {'CDS': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}, 
+                                                                                         'NED': {'url': ['https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M'], 'title': ['NED Objects (1953)'], 'count': 1953}, 
+                                                                                         'SIMBAD': {'url': ['http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M'], 'title': ['SIMBAD Objects (1)'], 'count': 1}, 
+                                                                                         'Vizier': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}}, 
+                                                                                         'ESOURCE': {'PUB_HTML': {'url': ['http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, 
+                                                                                                     'EPRINT_HTML': {'url': ['https://arxiv.org/abs/astro-ph/0405472'], 'title': [''], 'count': 0}, 
+                                                                                                     'PUB_PDF': {'url': ['https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, 
+                                                                                                     'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}, 
+                                                                                                     'EPRINT_PDF': {'url': ['https://arxiv.org/pdf/astro-ph/0405472'], 'title': [''], 'count': 0}, 
+                                                                                                     'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}}, 
+                                                                                                     'ASSOCIATED': {'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'count': 0}, 
+                                                                                                     'INSPIRE': {'url': ['http://inspirehep.net/search?p=find+j+MNRAA,354,L31'], 'title': [''], 'count': 0}, 
+                                                                                                     'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 
+                                                                                                     'PRESENTATION': {'url': [], 'title': [], 'count': 0}, 
+                                                                                                     'ABSTRACT': False, 'CITATIONS': True, 'GRAPHICS': False, 
+                                                                                                     'METRICS': False, 'OPENURL': False, 'REFERENCES': False, 'TOC': False, 'COREAD': False}}
             v_boost = v.pop('boost')
             a_boost = a.pop('boost')
             self.assertAlmostEqual(a_boost, v_boost)
+            a.update(new_protobuf)
             self.assertEqual(a, v)
 
         # consider video 1997kbls.confE..10C

From 940b433ef076719b3cff01b40a336adcdc68db6d Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Thu, 13 Mar 2025 11:25:36 -0400
Subject: [PATCH 05/17] getting it ready for master protobuf

---
 adsdata/process.py            |  6 ++--
 adsdata/tests/test_process.py | 64 +++++++++--------------------------
 2 files changed, 18 insertions(+), 52 deletions(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index c63ff72..ad33dea 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -68,7 +68,7 @@ def _get_nonbib_dict(self):
             }
         }
 
-        
+    # TODO: add master protobuf 
     def process_bibcodes(self, bibcodes):
         """send nonbib and metrics records to master for the passed bibcodes
         for each bibcode
@@ -95,7 +95,7 @@ def process_bibcodes(self, bibcodes):
         if not self.compute_CC: tasks.task_output_nonbib.delay(nonbib_protos)
         tasks.task_output_metrics.delay(metrics_protos)
 
-    
+    # TODO: Check what else can be added for master protobuf
     def _convert(self, passed):
         """Convert full nonbib dict to what is needed for nonbib protobuf.
         
@@ -210,8 +210,6 @@ def _convert(self, passed):
         }
         for field in unused_fields:
             return_value.pop(field, None)
-            
-        return_value.update(self.new_protobuf_template)
         
         return return_value
 
diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py
index 7b4abcd..f6f269b 100644
--- a/adsdata/tests/test_process.py
+++ b/adsdata/tests/test_process.py
@@ -72,41 +72,26 @@ def test_read(self):
             self.assertEqual(d['refereed'], {'refereed': False})
             self.assertEqual(d['planetary_feature'], ['Moon/Mare/Mare Imbrium/3678', 'Moon/Crater/Alder/171', 'Moon/Crater/Finsen/1959', 'Moon/Crater/Leibnitz/3335'])
 
-    # def test_protobuf(self):
-    #     """make sure protobuf are created without an exception"""
-    #     with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):
-    #         d = processor._read_next_bibcode('1057wjlf.book.....C')
-    #         c = processor._convert(d)
-    #         nonbib = NonBibRecord(**c)
-    #         print('nonbib = {}'.format(nonbib))
+    def test_protobuf(self):
+        """make sure protobuf are created without an exception"""
+        with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):
+            d = processor._read_next_bibcode('1057wjlf.book.....C')
+            c = processor._convert(d)
+            nonbib = NonBibRecord(**c)
+            print('nonbib = {}'.format(nonbib))
 
     def test_nonbib_record(self):
         self.maxDiff = None
         with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):
-            # d = processor._read_next_bibcode('2003ASPC..295..361M')
-            # n = processor._convert(d)
-            # a = {"read_count": 4, "bibcode": "2003ASPC..295..361M",
-            #      'bibgroup': ['Chandra Technical'], 'bibgroup_facet': ['Chandra Technical'],
-            #      "data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']},
-            #                          {"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']},
-            #                          {"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}],
-            #      "esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0}
-            # new_protobuf = {'identifier': [], 'links': {'ARXIV': [], 'DOI': [], 'DATA': {}, 
-            #                                             'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'count': 0}, 
-            #                                                         'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M'], 'title': [''], 'count': 0}}, 
-            #                                                         'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, 
-            #                                                         'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 
-            #                                                         'PRESENTATION': {'url': [], 'title': [], 'count': 0}, 
-            #                                                         'ABSTRACT': False, 
-            #                                                         'CITATIONS': False, 
-            #                                                         'GRAPHICS': False, 
-            #                                                         'METRICS': False, 
-            #                                                         'OPENURL': False, 
-            #                                                         'REFERENCES': False, 
-            #                                                         'TOC': True, 
-            #                                                         'COREAD': False}}
-            # a.update(new_protobuf)
-            # self.assertEqual(a, n)
+            d = processor._read_next_bibcode('2003ASPC..295..361M')
+            n = processor._convert(d)
+            a = {"read_count": 4, "bibcode": "2003ASPC..295..361M",
+                 'bibgroup': ['Chandra Technical'], 'bibgroup_facet': ['Chandra Technical'],
+                 "data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']},
+                                     {"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']},
+                                     {"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}],
+                 "esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0}
+            self.assertEqual(a, n)
 
             d = processor._read_next_bibcode('2004MNRAS.354L..31M')
             v = processor._convert(d)
@@ -134,26 +119,9 @@ def test_nonbib_record(self):
                  "esource": ["ADS_PDF", "ADS_SCAN", "EPRINT_HTML", "EPRINT_PDF", "PUB_HTML", "PUB_PDF"],
                  "boost": 0.4399999976158142}
 
-            new_protobuf = {'identifier': [], 'links': {'ARXIV': [], 'DOI': [], 'DATA': {'CDS': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}, 
-                                                                                         'NED': {'url': ['https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M'], 'title': ['NED Objects (1953)'], 'count': 1953}, 
-                                                                                         'SIMBAD': {'url': ['http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M'], 'title': ['SIMBAD Objects (1)'], 'count': 1}, 
-                                                                                         'Vizier': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}}, 
-                                                                                         'ESOURCE': {'PUB_HTML': {'url': ['http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, 
-                                                                                                     'EPRINT_HTML': {'url': ['https://arxiv.org/abs/astro-ph/0405472'], 'title': [''], 'count': 0}, 
-                                                                                                     'PUB_PDF': {'url': ['https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, 
-                                                                                                     'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}, 
-                                                                                                     'EPRINT_PDF': {'url': ['https://arxiv.org/pdf/astro-ph/0405472'], 'title': [''], 'count': 0}, 
-                                                                                                     'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}}, 
-                                                                                                     'ASSOCIATED': {'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'count': 0}, 
-                                                                                                     'INSPIRE': {'url': ['http://inspirehep.net/search?p=find+j+MNRAA,354,L31'], 'title': [''], 'count': 0}, 
-                                                                                                     'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 
-                                                                                                     'PRESENTATION': {'url': [], 'title': [], 'count': 0}, 
-                                                                                                     'ABSTRACT': False, 'CITATIONS': True, 'GRAPHICS': False, 
-                                                                                                     'METRICS': False, 'OPENURL': False, 'REFERENCES': False, 'TOC': False, 'COREAD': False}}
             v_boost = v.pop('boost')
             a_boost = a.pop('boost')
             self.assertAlmostEqual(a_boost, v_boost)
-            a.update(new_protobuf)
             self.assertEqual(a, v)
 
         # consider video 1997kbls.confE..10C

From b603a3a87192ba91d5d5daf5940a700d6c64fd1c Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Thu, 13 Mar 2025 12:10:35 -0400
Subject: [PATCH 06/17] adding metrics

---
 adsdata/process.py | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index ad33dea..bc4d91e 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -18,7 +18,7 @@ def __init__(self, compute_metrics=True, compute_CC = False):
             self.data_dict = data_files
         self.logger = tasks.app.logger
         self.readers = {}
-        self.new_protobuf_template = self._get_nonbib_dict()
+        self.master_protobuf = self._get_master_nonbib_dict()
         
 
     def __enter__(self):
@@ -28,7 +28,7 @@ def __enter__(self):
     def __exit__(self, exc_type, exc_value, traceback):
         self._close_all()
 
-    def _get_nonbib_dict(self):
+    def _get_master_nonbib_dict(self):
         # Template for the new protobuf structure
         return {
             "identifier": [], #MP
@@ -57,10 +57,10 @@ def _get_nonbib_dict(self):
                     "title": [],
                     "count": 0
                 },
-                "ABSTRACT": False, #MP
+                "ABSTRACT": False,#MP 
                 "CITATIONS": False,
                 "GRAPHICS": False,#MP
-                "METRICS": False, #MP
+                "METRICS": False,
                 "OPENURL": False, #MP
                 "REFERENCES": False,
                 "TOC": False,
@@ -137,7 +137,7 @@ def _convert(self, passed):
             # Handle boolean fields and TOC
             if isinstance(default_value, bool):
                 if filetype == 'toc':
-                    self.new_protobuf_template['links']['TOC'] = value[filetype]
+                    self.master_protobuf['links']['TOC'] = value[filetype]
                 
                 return_value[filetype] = value[filetype]
                 value = value[filetype]
@@ -440,27 +440,28 @@ def _populate_new_links_structure(self, data_links_rows):
             # Handle DATA and ESOURCE which have sub_type structure
             if mapped_type in ('DATA', 'ESOURCE'):
                 sub_type = row['link_sub_type']
-                if sub_type not in self.new_protobuf_template['links'][mapped_type]:
-                    self.new_protobuf_template['links'][mapped_type][sub_type] = {
+                if sub_type not in self.master_protobuf['links'][mapped_type]:
+                    self.master_protobuf['links'][mapped_type][sub_type] = {
                         'url': [],
                         'title': [],
                         'count': 0
                     }
-                self.new_protobuf_template['links'][mapped_type][sub_type]['url'].extend(row['url'])
-                self.new_protobuf_template['links'][mapped_type][sub_type]['title'].extend(row['title'])
-                self.new_protobuf_template['links'][mapped_type][sub_type]['count'] = row['item_count']
+                self.master_protobuf['links'][mapped_type][sub_type]['url'].extend(row['url'])
+                self.master_protobuf['links'][mapped_type][sub_type]['title'].extend(row['title'])
+                self.master_protobuf['links'][mapped_type][sub_type]['count'] = row['item_count']
             
             # Handle other link types with direct structure
             else:
-                self.new_protobuf_template['links'][mapped_type]['url'].extend(row['url'])
-                self.new_protobuf_template['links'][mapped_type]['title'].extend(row['title'])
-                self.new_protobuf_template['links'][mapped_type]['count'] = row['item_count']
+                self.master_protobuf['links'][mapped_type]['url'].extend(row['url'])
+                self.master_protobuf['links'][mapped_type]['title'].extend(row['title'])
+                self.master_protobuf['links'][mapped_type]['count'] = row['item_count']
         
 
     def _populate_link_flags(self, passed):
         """Populate the boolean flags in the new protobuf links structure.
-        Sets CITATIONS, REFERENCES, and TOC based on data availability."""
+        Sets CITATIONS, REFERENCES, and METRICS based on data availability."""
     
-        self.new_protobuf_template['links']['CITATIONS'] = len(passed.get('citation', [])) > 0
-        self.new_protobuf_template['links']['REFERENCES'] = len(passed.get('reference', [])) > 0
+        self.master_protobuf['links']['CITATIONS'] = len(passed.get('citation', [])) > 0
+        self.master_protobuf['links']['REFERENCES'] = len(passed.get('reference', [])) > 0
+        self.master_protobuf['links']['METRICS'] = self.compute_metrics
         
\ No newline at end of file

From 99ef925935a55ced4290f3249595a744323a43b7 Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Fri, 14 Mar 2025 14:07:03 -0400
Subject: [PATCH 07/17] adding tests and adding master to old protobuf

---
 adsdata/process.py            |   3 +-
 adsdata/tests/test_process.py | 191 +++++++++++++++++++++++++++-------
 2 files changed, 155 insertions(+), 39 deletions(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index bc4d91e..b43ca93 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -210,7 +210,8 @@ def _convert(self, passed):
         }
         for field in unused_fields:
             return_value.pop(field, None)
-        
+        return_value.update(self.master_protobuf)
+        return_value.pop('data_links_rows')
         return return_value
 
     def _add_citation_count_fields(self, return_value, passed):
diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py
index f6f269b..f0fac40 100644
--- a/adsdata/tests/test_process.py
+++ b/adsdata/tests/test_process.py
@@ -1,4 +1,3 @@
-
 import unittest
 from mock import patch, mock_open
 from datetime import datetime
@@ -72,53 +71,68 @@ def test_read(self):
             self.assertEqual(d['refereed'], {'refereed': False})
             self.assertEqual(d['planetary_feature'], ['Moon/Mare/Mare Imbrium/3678', 'Moon/Crater/Alder/171', 'Moon/Crater/Finsen/1959', 'Moon/Crater/Leibnitz/3335'])
 
-    def test_protobuf(self):
-        """make sure protobuf are created without an exception"""
-        with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):
-            d = processor._read_next_bibcode('1057wjlf.book.....C')
-            c = processor._convert(d)
-            nonbib = NonBibRecord(**c)
-            print('nonbib = {}'.format(nonbib))
+    # def test_protobuf(self):
+    #     """make sure protobuf are created without an exception"""
+    #     with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):
+    #         d = processor._read_next_bibcode('1057wjlf.book.....C')
+    #         c = processor._convert(d)
+    #         nonbib = NonBibRecord(**c)
+    #         print('nonbib = {}'.format(nonbib))
 
     def test_nonbib_record(self):
         self.maxDiff = None
         with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):
             d = processor._read_next_bibcode('2003ASPC..295..361M')
             n = processor._convert(d)
-            a = {"read_count": 4, "bibcode": "2003ASPC..295..361M",
-                 'bibgroup': ['Chandra Technical'], 'bibgroup_facet': ['Chandra Technical'],
-                 "data_links_rows": [{"url": ["http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']},
-                                     {"url": ["http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']},
-                                     {"url": [""], "link_type": "TOC", "link_sub_type": "NA", 'item_count': 0, 'title': ['']}],
-                 "esource": ["ADS_PDF", "ADS_SCAN"], "property": ["ADS_OPENACCESS", "ARTICLE", "ESOURCE", "NOT REFEREED", "OPENACCESS", "TOC"], "boost": 0.15, 'citation_count': 0, 'norm_cites': 0, 'citation_count_norm': 0.0, 'data': [], 'total_link_counts': 0}
+            a = {'property': ['ADS_OPENACCESS', 'ARTICLE', 'ESOURCE', 'NOT REFEREED', 'OPENACCESS', 'TOC'], 'esource': ['ADS_PDF', 'ADS_SCAN'], 
+                 'bibcode': '2003ASPC..295..361M', 'bibgroup': ['Chandra Technical'], 'boost': 0.15, 'read_count': 4, 'norm_cites': 0, 'data': [], 
+                 'total_link_counts': 0, 'citation_count': 0, 'citation_count_norm': 0.0, 
+                 'bibgroup_facet': ['Chandra Technical'], 'identifier': [], 
+                 'links': {'ARXIV': [], 'DOI': [], 'DATA': {}, 
+                           'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M'], 'title': [''], 'count': 0}, 
+                                       'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M'], 'title': [''], 'count': 0}}, 
+                                       'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, 
+                                       'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, 
+                                       'ABSTRACT': False, 
+                                       'CITATIONS': False, 
+                                       'GRAPHICS': False, 
+                                       'METRICS': False, 
+                                       'OPENURL': False, 
+                                       'REFERENCES': False, 
+                                       'TOC': True, 
+                                       'COREAD': False}}
             self.assertEqual(a, n)
+            self._validate_nonbib_structure(n)
 
             d = processor._read_next_bibcode('2004MNRAS.354L..31M')
             v = processor._convert(d)
-            a = {"bibcode": "2004MNRAS.354L..31M",
-                 "simbad_objects": ["3253618 G"],
-                 "read_count": 20,
-                 "data_links_rows": [{"url": ["http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x"], "link_type": "ESOURCE", "link_sub_type": "PUB_HTML", 'item_count': 0, 'title': ['']},
-                                     {"url": ["https://arxiv.org/abs/astro-ph/0405472"], "link_type": "ESOURCE", "link_sub_type": "EPRINT_HTML", 'item_count': 0, 'title': ['']},
-                                     {"url": ["https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x"], "link_type": "ESOURCE", "link_sub_type": "PUB_PDF", 'item_count': 0, 'title': ['']},
-                                     {"url": ["http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M"], "link_type": "ESOURCE", "link_sub_type": "ADS_PDF", 'item_count': 0, 'title': ['']},
-                                     {"url": ["https://arxiv.org/pdf/astro-ph/0405472"], "link_type": "ESOURCE", "link_sub_type": "EPRINT_PDF", 'item_count': 0, 'title': ['']},
-                                     {"url": ["http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M"], "link_type": "ESOURCE", "link_sub_type": "ADS_SCAN", 'item_count': 0, 'title': ['']},
-                                     {"url": ["2004MNRAS.354L..31M", "2005yCat..73549031M"], "title": ["Source Paper", "Catalog Description"], "link_type": "ASSOCIATED", "link_sub_type": "NA", 'item_count': 0},
-                                     {"url": ["http://inspirehep.net/search?p=find+j+MNRAA,354,L31"], "link_type": "INSPIRE", "link_sub_type": "NA", 'item_count': 0, 'title': ['']},
-                                     {"url": ["http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31"], "item_count": 1, "link_type": "DATA", "link_sub_type": "CDS", 'title': ['']},
-                                     {"url": ["https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M"], "title": ["NED Objects (1953)"], "item_count": 1953, "link_type": "DATA", "link_sub_type": "NED"},
-                                     {"url": ["http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M"], "title": ["SIMBAD Objects (1)"], "item_count": 1, "link_type": "DATA", "link_sub_type": "SIMBAD"},
-                                     {"url": ["http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31"], "item_count": 1, "link_type": "DATA", "link_sub_type": "Vizier", 'title': ['']}],
-                 "norm_cites": 10000,
-                 "data": ["CDS:1", "NED:1953", "SIMBAD:1", "Vizier:1"],
-                 "citation_count_norm": 49.5,
-                 "citation_count": 99,
-                 "property": ["ADS_OPENACCESS", "ARTICLE", "ASSOCIATED", "DATA", "EPRINT_OPENACCESS", "ESOURCE", "INSPIRE", "OPENACCESS", "PUB_OPENACCESS", "REFEREED"],
-                 "total_link_counts": 1956,
-                 "esource": ["ADS_PDF", "ADS_SCAN", "EPRINT_HTML", "EPRINT_PDF", "PUB_HTML", "PUB_PDF"],
-                 "boost": 0.4399999976158142}
-
+            a = {'property': ['ADS_OPENACCESS', 'ARTICLE', 'ASSOCIATED', 'DATA', 'EPRINT_OPENACCESS', 'ESOURCE', 'INSPIRE', 'OPENACCESS', 'PUB_OPENACCESS', 'REFEREED'], 
+                 'esource': ['ADS_PDF', 'ADS_SCAN', 'EPRINT_HTML', 'EPRINT_PDF', 'PUB_HTML', 'PUB_PDF'], 
+                 'bibcode': '2004MNRAS.354L..31M', 'boost': 0.44, 'read_count': 20, 'norm_cites': 10000, 
+                 'simbad_objects': ['3253618 G'], 'data': ['CDS:1', 'NED:1953', 'SIMBAD:1', 'Vizier:1'], 
+                 'total_link_counts': 1956, 'citation_count': 99, 'citation_count_norm': 49.5, 'identifier': [], 
+                 'links': {'ARXIV': [], 'DOI': [], 'DATA': {'CDS': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}, 
+                                                            'NED': {'url': ['https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M'], 'title': ['NED Objects (1953)'], 'count': 1953}, 
+                                                            'SIMBAD': {'url': ['http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M'], 'title': ['SIMBAD Objects (1)'], 'count': 1}, 
+                                                            'Vizier': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}}, 
+                                                    'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M', 'http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M'], 'title': ['', ''], 'count': 0}, 
+                                                                'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M', 'http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M'], 'title': ['', ''], 'count': 0}, 
+                                                                'PUB_HTML': {'url': ['http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, 
+                                                                'EPRINT_HTML': {'url': ['https://arxiv.org/abs/astro-ph/0405472'], 'title': [''], 'count': 0}, 
+                                                                'PUB_PDF': {'url': ['https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, 
+                                                                'EPRINT_PDF': {'url': ['https://arxiv.org/pdf/astro-ph/0405472'], 'title': [''], 'count': 0}}, 
+                                                    'ASSOCIATED': {'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'count': 0}, 
+                                                    'INSPIRE': {'url': ['http://inspirehep.net/search?p=find+j+MNRAA,354,L31'], 'title': [''], 'count': 0}, 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 
+                                                    'PRESENTATION': {'url': [], 'title': [], 'count': 0}, 
+                                                    'ABSTRACT': False, 
+                                                    'CITATIONS': True, 
+                                                    'GRAPHICS': False, 
+                                                    'METRICS': False, 
+                                                    'OPENURL': False, 
+                                                    'REFERENCES': False, 
+                                                    'TOC': False, 
+                                                    'COREAD': False}}
+        
             v_boost = v.pop('boost')
             a_boost = a.pop('boost')
             self.assertAlmostEqual(a_boost, v_boost)
@@ -128,6 +142,107 @@ def test_nonbib_record(self):
         # consider library 1810hdla.book.....V
         # consider inspire 1908PASP...20....1.
 
+    def _validate_nonbib_structure(self, record):
+        """Validate that the nonbib record has all required fields with correct types"""
+        
+        # Required identity fields: bibcode (str) and identifier (list)
+        self.assertIn('bibcode', record)
+        self.assertIn('identifier', record)
+        self.assertIsInstance(record['bibcode'], str)
+        self.assertIsInstance(record['identifier'], list)
+        
+        # Required numeric fields
+        numeric_fields = {
+            'boost': float,
+            'citation_count': int,
+            'read_count': int,
+            'total_link_counts': int,
+            'norm_cites': int,
+            'citation_count_norm': float
+        }
+        for field, expected_type in numeric_fields.items():
+            self.assertIn(field, record)
+            self.assertIsInstance(record[field], expected_type, 
+                                 f"Field {field} should be {expected_type.__name__}")
+        
+        # Required array fields 
+        required_array_fields = [
+            'property',  
+            'esource',   
+            'data',      
+            'identifier' 
+        ]
+
+        # Optional array fields 
+        optional_array_fields = [
+            'simbad_objects',
+            'grants',
+            'readers',
+            'reference',
+            'ned_objects',
+            'bibgroup',
+            'bibgroup_facet',
+            'gpn',
+            'uat'
+        ]
+
+        # Check required array fields
+        for field in required_array_fields:
+            self.assertIn(field, record)
+            self.assertIsInstance(record[field], list,
+                                 f"Field {field} should be a list")
+
+        # Check optional array fields if present
+        for field in optional_array_fields:
+            if field in record:
+                self.assertIsInstance(record[field], list,
+                                     f"Field {field} should be a list")
+        
+        # Validate links structure
+        self.assertIn('links', record)
+        links = record['links']
+        self.assertIsInstance(links, dict)
+        
+        # Direct link arrays
+        for field in ['ARXIV', 'DOI']:
+            self.assertIn(field, links)
+            self.assertIsInstance(links[field], (list))
+        
+        # Mapped link types
+        for field in ['DATA', 'ESOURCE']:
+            self.assertIn(field, links)
+            self.assertIsInstance(links[field], dict)
+            
+            # If there are subtypes, validate their structure
+            for subtype, value in links[field].items():
+                self.assertIsInstance(value, dict)
+                self.assertIn('url', value)
+                self.assertIsInstance(value['url'], (list))
+                self.assertIn('title', value)
+                self.assertIsInstance(value['title'], (list))
+                self.assertIn('count', value)
+                self.assertIsInstance(value['count'], int)
+        
+        # Link type records
+        for field in ['ASSOCIATED', 'INSPIRE', 'LIBRARYCATALOG', 'PRESENTATION']:
+            self.assertIn(field, links)
+            self.assertIsInstance(links[field], dict)
+            self.assertIn('url', links[field])
+            self.assertIsInstance(links[field]['url'], (list))
+            self.assertIn('title', links[field])
+            self.assertIsInstance(links[field]['title'], (list))
+            self.assertIn('count', links[field])
+            self.assertIsInstance(links[field]['count'], int)
+        
+        # Boolean flags
+        boolean_flags = [
+            'ABSTRACT', 'CITATIONS', 'GRAPHICS', 'METRICS',
+            'OPENURL', 'REFERENCES', 'TOC', 'COREAD'
+        ]
+        for field in boolean_flags:
+            self.assertIn(field, links)
+            self.assertIsInstance(links[field], bool, f"Links field {field} should be a boolean")
+
     def test_add_data_summary(self):
         self.maxDiff = None
         with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):

From 99d133d49d6c5cc790c0ee5bed647df08d2e85e0 Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Tue, 8 Apr 2025 13:20:44 -0400
Subject: [PATCH 08/17] changing coreads and openurls to always be True

---
 adsdata/process.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index b43ca93..64733b8 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -61,14 +61,13 @@ def _get_master_nonbib_dict(self):
                 "CITATIONS": False,
                 "GRAPHICS": False,#MP
                 "METRICS": False,
-                "OPENURL": False, #MP
+                "OPENURL": True, 
                 "REFERENCES": False,
                 "TOC": False,
-                "COREAD": False #MP
+                "COREAD": True 
             }
         }
 
-    # TODO: add master protobuf 
     def process_bibcodes(self, bibcodes):
         """send nonbib and metrics records to master for the passed bibcodes
         for each bibcode

From 303d1d0c3c5631509bb7f87bbd956839cdd84245 Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Tue, 8 Apr 2025 13:22:10 -0400
Subject: [PATCH 09/17] fixing test

---
 adsdata/tests/test_process.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py
index f0fac40..27eaf30 100644
--- a/adsdata/tests/test_process.py
+++ b/adsdata/tests/test_process.py
@@ -97,10 +97,10 @@ def test_nonbib_record(self):
                                        'CITATIONS': False, 
                                        'GRAPHICS': False, 
                                        'METRICS': False, 
-                                       'OPENURL': False, 
+                                       'OPENURL': True, 
                                        'REFERENCES': False, 
                                        'TOC': True, 
-                                       'COREAD': False}}
+                                       'COREAD': True}}
             self.assertEqual(a, n)
             self._validate_nonbib_structure(n)
 
@@ -128,10 +128,10 @@ def test_nonbib_record(self):
                                                     'CITATIONS': True, 
                                                     'GRAPHICS': False, 
                                                     'METRICS': False, 
-                                                    'OPENURL': False, 
+                                                    'OPENURL': True, 
                                                     'REFERENCES': False, 
                                                     'TOC': False, 
-                                                    'COREAD': False}}
+                                                    'COREAD': True}}
         
             v_boost = v.pop('boost')
             a_boost = a.pop('boost')

From e2c651076ac10c1c4f31f91fddc0b1f52a276809 Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Tue, 8 Apr 2025 16:30:24 -0400
Subject: [PATCH 10/17] fixing _populate_new_links_structure test

---
 adsdata/process.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index 64733b8..9260d35 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -429,7 +429,7 @@ def _populate_new_links_structure(self, data_links_rows):
         }
         
         for row in data_links_rows:
-            link_type = row['link_type']
+            link_type = row.get('link_type', '')
             
             # Skip if not in our mapping
             if link_type not in link_type_mapping:
@@ -439,22 +439,28 @@ def _populate_new_links_structure(self, data_links_rows):
             
             # Handle DATA and ESOURCE which have sub_type structure
             if mapped_type in ('DATA', 'ESOURCE'):
-                sub_type = row['link_sub_type']
+                sub_type = row.get('link_sub_type', '')
                 if sub_type not in self.master_protobuf['links'][mapped_type]:
                     self.master_protobuf['links'][mapped_type][sub_type] = {
                         'url': [],
                         'title': [],
                         'count': 0
                     }
-                self.master_protobuf['links'][mapped_type][sub_type]['url'].extend(row['url'])
-                self.master_protobuf['links'][mapped_type][sub_type]['title'].extend(row['title'])
-                self.master_protobuf['links'][mapped_type][sub_type]['count'] = row['item_count']
+                if 'url' in row:
+                    self.master_protobuf['links'][mapped_type][sub_type]['url'].extend(row['url'])
+                if 'title' in row:
+                    self.master_protobuf['links'][mapped_type][sub_type]['title'].extend(row['title'])
+                if 'item_count' in row:
+                    self.master_protobuf['links'][mapped_type][sub_type]['count'] = row['item_count']
             
             # Handle other link types with direct structure
             else:
-                self.master_protobuf['links'][mapped_type]['url'].extend(row['url'])
-                self.master_protobuf['links'][mapped_type]['title'].extend(row['title'])
-                self.master_protobuf['links'][mapped_type]['count'] = row['item_count']
+                if 'url' in row:
+                    self.master_protobuf['links'][mapped_type]['url'].extend(row['url'])
+                if 'title' in row:
+                    self.master_protobuf['links'][mapped_type]['title'].extend(row['title'])
+                if 'item_count' in row:
+                    self.master_protobuf['links'][mapped_type]['count'] = row['item_count']
         
 
     def _populate_link_flags(self, passed):

From 338632e16119bed0c5a80fcea3f583ccc3f99da3 Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Fri, 18 Apr 2025 10:41:13 -0400
Subject: [PATCH 11/17] removing changes to flags

---
 adsdata/process.py            | 22 +++-------------------
 adsdata/tests/test_process.py | 12 ++++++------
 2 files changed, 9 insertions(+), 25 deletions(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index 9260d35..267b84d 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -61,10 +61,10 @@ def _get_master_nonbib_dict(self):
                 "CITATIONS": False,
                 "GRAPHICS": False,#MP
                 "METRICS": False,
-                "OPENURL": True, 
+                "OPENURL": False, 
                 "REFERENCES": False,
                 "TOC": False,
-                "COREAD": True 
+                "COREAD": False 
             }
         }
 
@@ -135,9 +135,6 @@ def _convert(self, passed):
         
             # Handle boolean fields and TOC
             if isinstance(default_value, bool):
-                if filetype == 'toc':
-                    self.master_protobuf['links']['TOC'] = value[filetype]
-                
                 return_value[filetype] = value[filetype]
                 value = value[filetype]
             
@@ -187,9 +184,6 @@ def _convert(self, passed):
         # Populate the new protobuf structure with link data
         self._populate_new_links_structure(return_value['data_links_rows'])
         
-        # Populate the boolean flags
-        self._populate_link_flags(passed)
-        
         # Add computed fields
         for field_name, field_config in computed_fields.items():
             converter = getattr(self, field_config['converter_function'], None)
@@ -460,14 +454,4 @@ def _populate_new_links_structure(self, data_links_rows):
                 if 'title' in row:
                     self.master_protobuf['links'][mapped_type]['title'].extend(row['title'])
                 if 'item_count' in row:
-                    self.master_protobuf['links'][mapped_type]['count'] = row['item_count']
-        
-
-    def _populate_link_flags(self, passed):
-        """Populate the boolean flags in the new protobuf links structure.
-        Sets CITATIONS, REFERENCES, and METRICS based on data availability."""
-    
-        self.master_protobuf['links']['CITATIONS'] = len(passed.get('citation', [])) > 0
-        self.master_protobuf['links']['REFERENCES'] = len(passed.get('reference', [])) > 0
-        self.master_protobuf['links']['METRICS'] = self.compute_metrics
-        
\ No newline at end of file
+                    self.master_protobuf['links'][mapped_type]['count'] = row['item_count']
\ No newline at end of file
diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py
index 27eaf30..88822bc 100644
--- a/adsdata/tests/test_process.py
+++ b/adsdata/tests/test_process.py
@@ -97,10 +97,10 @@ def test_nonbib_record(self):
                                        'CITATIONS': False, 
                                        'GRAPHICS': False, 
                                        'METRICS': False, 
-                                       'OPENURL': True, 
+                                       'OPENURL': False, 
                                        'REFERENCES': False, 
-                                       'TOC': True, 
-                                       'COREAD': True}}
+                                       'TOC': False, 
+                                       'COREAD': False}}
             self.assertEqual(a, n)
             self._validate_nonbib_structure(n)
 
@@ -125,13 +125,13 @@ def test_nonbib_record(self):
                                                     'INSPIRE': {'url': ['http://inspirehep.net/search?p=find+j+MNRAA,354,L31'], 'title': [''], 'count': 0}, 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 
                                                     'PRESENTATION': {'url': [], 'title': [], 'count': 0}, 
                                                     'ABSTRACT': False, 
-                                                    'CITATIONS': True, 
+                                                    'CITATIONS': False, 
                                                     'GRAPHICS': False, 
                                                     'METRICS': False, 
-                                                    'OPENURL': True, 
+                                                    'OPENURL': False, 
                                                     'REFERENCES': False, 
                                                     'TOC': False, 
-                                                    'COREAD': True}}
+                                                    'COREAD': False}}
         
             v_boost = v.pop('boost')
             a_boost = a.pop('boost')

From a3f39daa69816845392f2a286b427a1b16c58004 Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Tue, 13 May 2025 14:56:47 -0400
Subject: [PATCH 12/17] uncommenting test

---
 adsdata/tests/test_process.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py
index 88822bc..c28fa6c 100644
--- a/adsdata/tests/test_process.py
+++ b/adsdata/tests/test_process.py
@@ -71,13 +71,13 @@ def test_read(self):
             self.assertEqual(d['refereed'], {'refereed': False})
             self.assertEqual(d['planetary_feature'], ['Moon/Mare/Mare Imbrium/3678', 'Moon/Crater/Alder/171', 'Moon/Crater/Finsen/1959', 'Moon/Crater/Leibnitz/3335'])
 
-    # def test_protobuf(self):
-    #     """make sure protobuf are created without an exception"""
-    #     with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):
-    #         d = processor._read_next_bibcode('1057wjlf.book.....C')
-    #         c = processor._convert(d)
-    #         nonbib = NonBibRecord(**c)
-    #         print('nonbib = {}'.format(nonbib))
+    def test_protobuf(self):
+        """Smoke test: building a NonBibRecord from the converted data for one bibcode must not raise."""
+        with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):
+            d = processor._read_next_bibcode('1057wjlf.book.....C')
+            c = processor._convert(d)
+            nonbib = NonBibRecord(**c)
+            print('nonbib = {}'.format(nonbib))
 
     def test_nonbib_record(self):
         self.maxDiff = None
@@ -99,7 +99,7 @@ def test_nonbib_record(self):
                                        'METRICS': False, 
                                        'OPENURL': False, 
                                        'REFERENCES': False, 
-                                       'TOC': False, 
+                                       'TOC': True, 
                                        'COREAD': False}}
             self.assertEqual(a, n)
             self._validate_nonbib_structure(n)

From 4eb266dec4faaa4deafbd49792fdc58f37fb2a0c Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Tue, 13 May 2025 15:43:22 -0400
Subject: [PATCH 13/17] updating requirements

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 5ef4fd3..5641829 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-adsputils==1.5.5   
\ No newline at end of file
+adsputils==1.5.7   
\ No newline at end of file

From c405a4f02ad663017701628e792d0b193cfa6182 Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Tue, 13 May 2025 16:04:07 -0400
Subject: [PATCH 14/17] test_nonbib_record: expect TOC flag to be False again

---
 adsdata/tests/test_process.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py
index c28fa6c..a92baba 100644
--- a/adsdata/tests/test_process.py
+++ b/adsdata/tests/test_process.py
@@ -99,7 +99,7 @@ def test_nonbib_record(self):
                                        'METRICS': False, 
                                        'OPENURL': False, 
                                        'REFERENCES': False, 
-                                       'TOC': True, 
+                                       'TOC': False, 
                                        'COREAD': False}}
             self.assertEqual(a, n)
             self._validate_nonbib_structure(n)

From 53cc30c6528b872a2d0ee7a7fa6e6d08f4dfaeaa Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Mon, 24 Nov 2025 12:29:00 -0500
Subject: [PATCH 15/17] making abstract always true

---
 adsdata/process.py            | 2 +-
 adsdata/tests/test_process.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index 267b84d..e82953a 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -57,7 +57,7 @@ def _get_master_nonbib_dict(self):
                     "title": [],
                     "count": 0
                 },
-                "ABSTRACT": False,#MP 
+                "ABSTRACT": True,
                 "CITATIONS": False,
                 "GRAPHICS": False,#MP
                 "METRICS": False,
diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py
index a92baba..4217daf 100644
--- a/adsdata/tests/test_process.py
+++ b/adsdata/tests/test_process.py
@@ -93,7 +93,7 @@ def test_nonbib_record(self):
                                        'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M'], 'title': [''], 'count': 0}}, 
                                        'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, 
                                        'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, 
-                                       'ABSTRACT': False, 
+                                       'ABSTRACT': True, 
                                        'CITATIONS': False, 
                                        'GRAPHICS': False, 
                                        'METRICS': False, 

From 701711563ef8b8e46e94b67419dc9f8859a3b897 Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Mon, 24 Nov 2025 14:42:04 -0500
Subject: [PATCH 16/17] fix link leakage across bibcodes; reset ABSTRACT default to False

---
 adsdata/process.py            |  48 ++++++++--------
 adsdata/tests/test_process.py | 104 ++++++++++++++++++++++++++++++----
 2 files changed, 119 insertions(+), 33 deletions(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index f052ae7..736a215 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -31,10 +31,10 @@ def __exit__(self, exc_type, exc_value, traceback):
     def _get_master_nonbib_dict(self):
         # Template for the new protobuf structure
         return {
-            "identifier": [], #MP
+            "identifier": [], # Master Pipeline 
             "links": {
-                "ARXIV": [], #MP
-                "DOI": [],#MP
+                "ARXIV": [], # Master Pipeline
+                "DOI": [], # Master Pipeline
                 "DATA": {},
                 "ESOURCE": {},
                 "ASSOCIATED": {
@@ -57,14 +57,14 @@ def _get_master_nonbib_dict(self):
                     "title": [],
                     "count": 0
                 },
-                "ABSTRACT": True,
-                "CITATIONS": False,
-                "GRAPHICS": False,#MP
-                "METRICS": False,
-                "OPENURL": False, 
-                "REFERENCES": False,
-                "TOC": False,
-                "COREAD": False 
+                "ABSTRACT": False,  # Master Pipeline
+                "CITATIONS": False, # Master Pipeline
+                "GRAPHICS": False,  # Master Pipeline
+                "METRICS": False,   # Master Pipeline
+                "OPENURL": False,   # Master Pipeline
+                "REFERENCES": False,# Master Pipeline
+                "TOC": False,       # Master Pipeline
+                "COREAD": False     # Master Pipeline
             }
         }
 
@@ -94,7 +94,6 @@ def process_bibcodes(self, bibcodes):
         if not self.compute_CC: tasks.task_output_nonbib.delay(nonbib_protos)
         tasks.task_output_metrics.delay(metrics_protos)
 
-    # TODO: Check what else can be added for master protobuf
     def _convert(self, passed):
         """Convert full nonbib dict to what is needed for nonbib protobuf.
         
@@ -180,9 +179,11 @@ def _convert(self, passed):
         
         # Merge and process data links
         return_value['data_links_rows'] = self._merge_data_links(return_value['data_links_rows'])
+
+        master_template = self._get_master_nonbib_dict()
         
         # Populate the new protobuf structure with link data
-        self._populate_new_links_structure(return_value['data_links_rows'])
+        self._populate_new_links_structure(return_value['data_links_rows'], master_template)
         
         # Add computed fields
         for field_name, field_config in computed_fields.items():
@@ -203,7 +204,7 @@ def _convert(self, passed):
         }
         for field in unused_fields:
             return_value.pop(field, None)
-        return_value.update(self.master_protobuf)
+        return_value.update(master_template)
         return_value.pop('data_links_rows')
         return return_value
 
@@ -414,7 +415,7 @@ def _compute_bibgroup_facet(self, d):
         bibgroup_facet = sorted(list(set(bibgroup)))
         return {'bibgroup_facet': bibgroup_facet}
 
-    def _populate_new_links_structure(self, data_links_rows):
+    def _populate_new_links_structure(self, data_links_rows, master_template):
         """Populate the new protobuf links structure from data_links_rows.
         Maps the flat data_links_rows into the hierarchical links structure."""
         
@@ -440,24 +441,25 @@ def _populate_new_links_structure(self, data_links_rows):
             # Handle DATA and ESOURCE which have sub_type structure
             if mapped_type in ('DATA', 'ESOURCE'):
                 sub_type = row.get('link_sub_type', '')
-                if sub_type not in self.master_protobuf['links'][mapped_type]:
-                    self.master_protobuf['links'][mapped_type][sub_type] = {
+                if sub_type not in master_template['links'][mapped_type]:
+                    master_template['links'][mapped_type][sub_type] = {
                         'url': [],
                         'title': [],
                         'count': 0
                     }
                 if 'url' in row:
-                    self.master_protobuf['links'][mapped_type][sub_type]['url'].extend(row['url'])
+                    master_template['links'][mapped_type][sub_type]['url'].extend(row['url'])
                 if 'title' in row:
-                    self.master_protobuf['links'][mapped_type][sub_type]['title'].extend(row['title'])
+                    master_template['links'][mapped_type][sub_type]['title'].extend(row['title'])
                 if 'item_count' in row:
-                    self.master_protobuf['links'][mapped_type][sub_type]['count'] = row['item_count']
+                    master_template['links'][mapped_type][sub_type]['count'] = row['item_count']
             
             # Handle other link types with direct structure
             else:
                 if 'url' in row:
-                    self.master_protobuf['links'][mapped_type]['url'].extend(row['url'])
+                    master_template['links'][mapped_type]['url'].extend(row['url'])
                 if 'title' in row:
-                    self.master_protobuf['links'][mapped_type]['title'].extend(row['title'])
+                    master_template['links'][mapped_type]['title'].extend(row['title'])
                 if 'item_count' in row:
-                    self.master_protobuf['links'][mapped_type]['count'] = row['item_count']
\ No newline at end of file
+                    master_template['links'][mapped_type]['count'] = row['item_count']
+        return master_template
\ No newline at end of file
diff --git a/adsdata/tests/test_process.py b/adsdata/tests/test_process.py
index d6b852b..080c349 100644
--- a/adsdata/tests/test_process.py
+++ b/adsdata/tests/test_process.py
@@ -93,14 +93,14 @@ def test_nonbib_record(self):
                                        'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M'], 'title': [''], 'count': 0}}, 
                                        'ASSOCIATED': {'url': [], 'title': [], 'count': 0}, 'INSPIRE': {'url': [], 'title': [], 'count': 0}, 
                                        'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0}, 'PRESENTATION': {'url': [], 'title': [], 'count': 0}, 
-                                       'ABSTRACT': True, 
+                                       'ABSTRACT': False,  # Master Pipeline will set to True
                                        'CITATIONS': False, 
-                                       'GRAPHICS': False, 
+                                       'GRAPHICS': False,  # Master Pipeline will set to True
                                        'METRICS': False, 
-                                       'OPENURL': False, 
+                                       'OPENURL': False,   # Master Pipeline will set to True
                                        'REFERENCES': False, 
                                        'TOC': False, 
-                                       'COREAD': False}}
+                                       'COREAD': False}}   # Master Pipeline will set to True
             self.assertEqual(a, n)
             self._validate_nonbib_structure(n)
 
@@ -121,8 +121,8 @@ def test_nonbib_record(self):
                                                             'NED': {'url': ['https://$NED$/cgi-bin/objsearch?search_type=Search&refcode=2004MNRAS.354L..31M'], 'title': ['NED Objects (1953)'], 'count': 1953}, 
                                                             'SIMBAD': {'url': ['http://$SIMBAD$/simbo.pl?bibcode=2004MNRAS.354L..31M'], 'title': ['SIMBAD Objects (1)'], 'count': 1}, 
                                                             'Vizier': {'url': ['http://$VIZIER$/viz-bin/VizieR?-source=J/MNRAS/354/L31'], 'title': [''], 'count': 1}}, 
-                                                    'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2003ASPC..295..361M', 'http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M'], 'title': ['', ''], 'count': 0}, 
-                                                                'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2003ASPC..295..361M', 'http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M'], 'title': ['', ''], 'count': 0}, 
+                                                    'ESOURCE': {'ADS_PDF': {'url': ['http://articles.adsabs.harvard.edu/pdf/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}, 
+                                                                'ADS_SCAN': {'url': ['http://articles.adsabs.harvard.edu/full/2004MNRAS.354L..31M'], 'title': [''], 'count': 0}, 
                                                                 'PUB_HTML': {'url': ['http://dx.doi.org/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, 
                                                                 'EPRINT_HTML': {'url': ['https://arxiv.org/abs/astro-ph/0405472'], 'title': [''], 'count': 0}, 
                                                                 'PUB_PDF': {'url': ['https://academic.oup.com/mnras/pdf-lookup/doi/10.1111/j.1365-2966.2004.08374.x'], 'title': [''], 'count': 0}, 
@@ -130,14 +130,14 @@ def test_nonbib_record(self):
                                                     'ASSOCIATED': {'url': ['2004MNRAS.354L..31M', '2005yCat..73549031M'], 'title': ['Source Paper', 'Catalog Description'], 'count': 0}, 
                                                     'INSPIRE': {'url': ['http://inspirehep.net/search?p=find+j+MNRAA,354,L31'], 'title': [''], 'count': 0}, 'LIBRARYCATALOG': {'url': [], 'title': [], 'count': 0},
                                                     'PRESENTATION': {'url': [], 'title': [], 'count': 0},
-                                                    'ABSTRACT': True,
+                                                    'ABSTRACT': False,  # Master Pipeline will set to True
                                                     'CITATIONS': False, 
-                                                    'GRAPHICS': False, 
+                                                    'GRAPHICS': False,  # Master Pipeline will set to True
                                                     'METRICS': False, 
-                                                    'OPENURL': False, 
+                                                    'OPENURL': False,   # Master Pipeline will set to True
                                                     'REFERENCES': False, 
                                                     'TOC': False, 
-                                                    'COREAD': False}}
+                                                    'COREAD': False}}   # Master Pipeline will set to True
             v_boost = v.pop('boost')
             a_boost = a.pop('boost')
             self.assertAlmostEqual(a_boost, v_boost)
@@ -367,3 +367,87 @@ def test_compute_bibgroup_facet(self):
         self.assertEqual({'bibgroup_facet': ['a']}, p._compute_bibgroup_facet({'bibgroup': ['a']}))
         self.assertEqual({'bibgroup_facet': ['a', 'b']}, p._compute_bibgroup_facet({'bibgroup': ['a', 'b']}))
         self.assertEqual({'bibgroup_facet': ['a', 'b']}, p._compute_bibgroup_facet({'bibgroup': ['a', 'b', 'a']}))
+
+    def test_multiple_bibcodes_no_link_leakage(self):
+        """Verify links from one bibcode do not leak into the next when one Processor converts them sequentially."""
+        self.maxDiff = None
+
+        with Processor(compute_metrics=False) as processor, patch('adsputils.load_config', return_value={'INPUT_DATA_ROOT': './test/data1/config/'}):
+            # Process bibcode A - has ADS_PDF and ADS_SCAN esources
+            bibcode_a = '2003ASPC..295..361M'
+            d_a = processor._read_next_bibcode(bibcode_a)
+            result_a = processor._convert(d_a)
+
+            # A must expose ESOURCE with both the ADS_PDF and ADS_SCAN sub-types
+            self.assertIn('ESOURCE', result_a['links'])
+            esource_a = result_a['links']['ESOURCE']
+            self.assertIn('ADS_PDF', esource_a)
+            self.assertIn('ADS_SCAN', esource_a)
+
+            # Copy A's URL lists now so converting B later cannot alter what is compared
+            ads_pdf_urls_a = list(esource_a['ADS_PDF']['url'])
+            ads_scan_urls_a = list(esource_a['ADS_SCAN']['url'])
+
+            # Verify A has only one URL per link type (its own)
+            self.assertEqual(len(ads_pdf_urls_a), 1,
+                           f"Bibcode A should have exactly 1 ADS_PDF URL, got {len(ads_pdf_urls_a)}")
+            self.assertEqual(len(ads_scan_urls_a), 1,
+                           f"Bibcode A should have exactly 1 ADS_SCAN URL, got {len(ads_scan_urls_a)}")
+
+            # Verify URLs contain the correct bibcode
+            self.assertIn(bibcode_a, ads_pdf_urls_a[0])
+            self.assertIn(bibcode_a, ads_scan_urls_a[0])
+
+            # Now process bibcode B - has different esources (includes PUB_HTML, EPRINT_HTML, etc.)
+            bibcode_b = '2004MNRAS.354L..31M'
+            d_b = processor._read_next_bibcode(bibcode_b)
+            result_b = processor._convert(d_b)
+
+            # Verify B has its own ESOURCE links
+            self.assertIn('ESOURCE', result_b['links'])
+            esource_b = result_b['links']['ESOURCE']
+
+            # B should have ADS_PDF and ADS_SCAN (from its own data)
+            self.assertIn('ADS_PDF', esource_b)
+            self.assertIn('ADS_SCAN', esource_b)
+
+            # B should have only its own URLs, NOT A's URLs
+            ads_pdf_urls_b = esource_b['ADS_PDF']['url']
+            ads_scan_urls_b = esource_b['ADS_SCAN']['url']
+
+            # Check that B's URLs don't contain A's bibcode
+            for url in ads_pdf_urls_b:
+                self.assertNotIn(bibcode_a, url,
+                               f"Bibcode B's ADS_PDF links leaked bibcode A's URL: {url}")
+
+            for url in ads_scan_urls_b:
+                self.assertNotIn(bibcode_a, url,
+                               f"Bibcode B's ADS_SCAN links leaked bibcode A's URL: {url}")
+
+            # Verify B has its own bibcode in its URLs
+            b_pdf_has_own_bibcode = any(bibcode_b in url for url in ads_pdf_urls_b)
+            b_scan_has_own_bibcode = any(bibcode_b in url for url in ads_scan_urls_b)
+
+            self.assertTrue(b_pdf_has_own_bibcode,
+                          "Bibcode B should have its own bibcode in ADS_PDF URLs")
+            self.assertTrue(b_scan_has_own_bibcode,
+                          "Bibcode B should have its own bibcode in ADS_SCAN URLs")
+
+            # Also verify DATA links don't leak (result_a is re-read here, after B was converted)
+            # A has no DATA links, B has DATA links (CDS, NED, SIMBAD, Vizier)
+            data_a = result_a['links']['DATA']
+            data_b = result_b['links']['DATA']
+
+            self.assertEqual(len(data_a), 0, "Bibcode A should have no DATA links")
+            self.assertGreater(len(data_b), 0, "Bibcode B should have DATA links")
+
+            # Verify DATA subtypes in B
+            self.assertIn('CDS', data_b)
+            self.assertIn('NED', data_b)
+            self.assertIn('SIMBAD', data_b)
+            self.assertIn('Vizier', data_b)
+
+            print("\n✅ Link leakage test passed!")
+            print(f"   Bibcode A processed: {len(ads_pdf_urls_a)} ADS_PDF URLs, {len(ads_scan_urls_a)} ADS_SCAN URLs")
+            print(f"   Bibcode B processed: {len(ads_pdf_urls_b)} ADS_PDF URLs, {len(ads_scan_urls_b)} ADS_SCAN URLs")
+            print("   No links from A leaked into B")

From 3c22513aadfe85a01e4daf357959db4878d3dd53 Mon Sep 17 00:00:00 2001
From: femalves <fernandamalves@ufrn.edu.br>
Date: Thu, 4 Dec 2025 12:07:41 -0500
Subject: [PATCH 17/17] adding logs

---
 adsdata/process.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/adsdata/process.py b/adsdata/process.py
index 736a215..6ec47d3 100644
--- a/adsdata/process.py
+++ b/adsdata/process.py
@@ -206,6 +206,7 @@ def _convert(self, passed):
             return_value.pop(field, None)
         return_value.update(master_template)
         return_value.pop('data_links_rows')
+        self.logger.debug('Processed nonbib data: {}'.format(return_value))
         return return_value
 
     def _add_citation_count_fields(self, return_value, original):
@@ -279,6 +280,7 @@ def _merge_data_links(self, datalinks):
     def _convert_data_link(self, filetype, value):
         """convert one data link row"""
         
+        self.logger.debug('Converting data link: {}'.format(value))
         file_properties = self.data_dict[filetype]
 
         link_type = file_properties['extra_values']['link_type']
@@ -308,16 +310,18 @@ def _convert_data_link(self, filetype, value):
             link_data['title'] = value.get('title', [''])
             link_data['item_count'] = value.get('item_count', 0)
             
+            self.logger.debug('Link data before conversion: {}'.format(link_data))
             if isinstance(link_data['url'], str):
                 link_data['url'] = [link_data['url']]
             if isinstance(link_data['title'], str):
                 link_data['title'] = [link_data['title']]
-        
+            self.logger.debug('Link data after conversion: {}'.format(link_data))
         elif not isinstance(value, bool):
             self.logger.error(
                 f"Serious error in process.convert_data_link: unexpected type for value, filetype = {filetype}, "
                 f"value = {value}, type of value = {type(value)}"
             )
+        self.logger.debug('Converted data link: {}'.format(link_data))
         return link_data
 
     def _read_next_bibcode(self, bibcode):
@@ -418,6 +422,8 @@ def _compute_bibgroup_facet(self, d):
     def _populate_new_links_structure(self, data_links_rows, master_template):
         """Populate the new protobuf links structure from data_links_rows.
         Maps the flat data_links_rows into the hierarchical links structure."""
+
+        self.logger.debug('Populating new links structure: {}'.format(data_links_rows))
         
         # Map for link types that need special handling
         link_type_mapping = {
@@ -462,4 +468,5 @@ def _populate_new_links_structure(self, data_links_rows, master_template):
                     master_template['links'][mapped_type]['title'].extend(row['title'])
                 if 'item_count' in row:
                     master_template['links'][mapped_type]['count'] = row['item_count']
+        self.logger.debug('Populated new links structure: {}'.format(master_template))
         return master_template
\ No newline at end of file