From 00409f5441984307ca0837ba9342c7028ae1c2d0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 22 Nov 2025 19:41:54 +0000 Subject: [PATCH 1/8] Initial plan From e7686fdc30d5d76f5ef18d2805927c87f538eefb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 22 Nov 2025 19:50:00 +0000 Subject: [PATCH 2/8] feat: update URL instead of creating new records in MongoDB Co-authored-by: ocervell <9629314+ocervell@users.noreply.github.com> --- secator/hooks/mongodb.py | 53 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/secator/hooks/mongodb.py b/secator/hooks/mongodb.py index fe0123418..9e7e5fb28 100644 --- a/secator/hooks/mongodb.py +++ b/secator/hooks/mongodb.py @@ -113,9 +113,56 @@ def update_finding(self, item): finding = db['findings'].update_one({'_id': _id}, {'$set': update}) status = 'UPDATED' else: - finding = db['findings'].insert_one(update) - item._uuid = str(finding.inserted_id) - status = 'CREATED' + # Check if a duplicate already exists in MongoDB + # Build query based on fields that are used for comparison + from dataclasses import fields as dataclass_fields + query = {'_type': _type} + + # Add workspace_id to query if present to scope duplicates to workspace + workspace_id = item._context.get('workspace_id') + if workspace_id: + query['_context.workspace_id'] = workspace_id + + # Get fields that are used for comparison (compare != False) + item_class = type(item) + for field in dataclass_fields(item_class): + if field.compare and not field.name.startswith('_'): + field_value = getattr(item, field.name) + if field_value: # Only add non-empty values to query + query[field.name] = field_value + + # Look for existing finding with same identifying fields + existing = db['findings'].find_one(query) + + if existing: + # Update existing record instead of creating new one + _id = existing['_id'] + # Merge data: keep existing values for empty fields, update with new values for non-empty fields + merged_update = existing.copy() + for key, value in update.items(): + # Update field if new value is not empty or if it's a list/dict that's not empty + if value or (isinstance(value, (list, dict)) and len(value) > 0): + # For lists, merge them (avoiding duplicates) + if isinstance(value, list) and isinstance(merged_update.get(key), list): + # Merge lists, preserving order and removing duplicates + existing_list = merged_update[key] + for item_val in value: + if item_val not in existing_list: + existing_list.append(item_val) + merged_update[key] = existing_list + # For dicts, merge them + elif isinstance(value, dict) and isinstance(merged_update.get(key), dict): + merged_update[key].update(value) + else: + merged_update[key] = value + db['findings'].update_one({'_id': _id}, {'$set': merged_update}) + item._uuid = str(_id) + status = 'UPDATED' + debug('found existing finding in db, updating', sub='hooks.mongodb', id=str(_id), verbose=True) + else: + finding = db['findings'].insert_one(update) + item._uuid = str(finding.inserted_id) + status = 'CREATED' end_time = time.time() elapsed = end_time - start_time debug_obj = { From f61917305f0f0b4355ac79415d1fe8c61fb10d19 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 22 Nov 2025 19:52:34 +0000 Subject: [PATCH 3/8] fix: address code review feedback - move import to top, fix value checking logic, handle unhashable list items Co-authored-by: ocervell <9629314+ocervell@users.noreply.github.com> --- secator/hooks/mongodb.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/secator/hooks/mongodb.py b/secator/hooks/mongodb.py index 9e7e5fb28..c0fbc8aa6 100644 --- a/secator/hooks/mongodb.py +++ b/secator/hooks/mongodb.py @@ -1,5 +1,6 @@ import logging import time +from dataclasses import fields as dataclass_fields import pymongo from bson.objectid import ObjectId @@ -115,7 +116,6 @@ def update_finding(self, item): else: # Check if a duplicate already exists in MongoDB # Build query based on fields that are used for comparison - from dataclasses import fields as dataclass_fields query = {'_type': _type} # Add workspace_id to query if present to scope duplicates to workspace @@ -140,21 +140,31 @@ def update_finding(self, item): # Merge data: keep existing values for empty fields, update with new values for non-empty fields merged_update = existing.copy() for key, value in update.items(): - # Update field if new value is not empty or if it's a list/dict that's not empty - if value or (isinstance(value, (list, dict)) and len(value) > 0): - # For lists, merge them (avoiding duplicates) - if isinstance(value, list) and isinstance(merged_update.get(key), list): - # Merge lists, preserving order and removing duplicates - existing_list = merged_update[key] + # Skip if value is None or empty string (but not False or 0 which are valid) + if value is None or value == '': + continue + # Skip empty lists or dicts + if isinstance(value, (list, dict)) and len(value) == 0: + continue + # For lists, merge them (avoiding duplicates) + if isinstance(value, list) and isinstance(merged_update.get(key), list): + # Try to use set for O(1) lookup if items are hashable + try: + existing_set = set(merged_update[key]) for item_val in value: - if item_val not in existing_list: - existing_list.append(item_val) - merged_update[key] = existing_list - # For dicts, merge them - elif isinstance(value, dict) and isinstance(merged_update.get(key), dict): - merged_update[key].update(value) - else: - merged_update[key] = value + if item_val not in existing_set: + merged_update[key].append(item_val) + existing_set.add(item_val) + except TypeError: + # Items are not hashable (e.g., dicts), fall back to O(n) check + for item_val in value: + if item_val not in merged_update[key]: + merged_update[key].append(item_val) + # For dicts, merge them + elif isinstance(value, dict) and isinstance(merged_update.get(key), dict): + merged_update[key].update(value) + else: + merged_update[key] = value db['findings'].update_one({'_id': _id}, {'$set': merged_update}) item._uuid = str(_id) status = 'UPDATED' From 64c1dc8ab45840fd20b27fe3bf1ab071a482ee83 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 22 Nov 2025 19:54:36 +0000 Subject: [PATCH 4/8] fix: handle False/0 values in query, avoid shallow copy, improve merge logic Co-authored-by: ocervell <9629314+ocervell@users.noreply.github.com> --- secator/hooks/mongodb.py | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/secator/hooks/mongodb.py b/secator/hooks/mongodb.py index c0fbc8aa6..92023eae3 100644 --- a/secator/hooks/mongodb.py +++ b/secator/hooks/mongodb.py @@ -128,7 +128,8 @@ def update_finding(self, item): for field in dataclass_fields(item_class): if field.compare and not field.name.startswith('_'): field_value = getattr(item, field.name) - if field_value: # Only add non-empty values to query + # Include all non-None values in query, including False and 0 + if field_value is not None and field_value != '': query[field.name] = field_value # Look for existing finding with same identifying fields @@ -137,35 +138,42 @@ def update_finding(self, item): if existing: # Update existing record instead of creating new one _id = existing['_id'] - # Merge data: keep existing values for empty fields, update with new values for non-empty fields - merged_update = existing.copy() + # Merge data: start with existing record and selectively update with new values + # Note: We update the existing document in-place to avoid copying large fields for key, value in update.items(): - # Skip if value is None or empty string (but not False or 0 which are valid) - if value is None or value == '': + # Always update special fields that may be modified + if key in ('_timestamp', '_tagged', '_duplicate', '_related', '_source'): + existing[key] = value continue - # Skip empty lists or dicts - if isinstance(value, (list, dict)) and len(value) == 0: + # Skip None values - keep existing value + if value is None: + continue + # Skip empty strings unless they override a non-empty value + if value == '' and existing.get(key): + continue + # Skip empty lists or dicts unless they override a non-empty value + if isinstance(value, (list, dict)) and len(value) == 0 and existing.get(key): continue # For lists, merge them (avoiding duplicates) - if isinstance(value, list) and isinstance(merged_update.get(key), list): + if isinstance(value, list) and isinstance(existing.get(key), list): # Try to use set for O(1) lookup if items are hashable try: - existing_set = set(merged_update[key]) + existing_set = set(existing[key]) for item_val in value: if item_val not in existing_set: - merged_update[key].append(item_val) + existing[key].append(item_val) existing_set.add(item_val) except TypeError: # Items are not hashable (e.g., dicts), fall back to O(n) check for item_val in value: - if item_val not in merged_update[key]: - merged_update[key].append(item_val) + if item_val not in existing[key]: + existing[key].append(item_val) # For dicts, merge them - elif isinstance(value, dict) and isinstance(merged_update.get(key), dict): - merged_update[key].update(value) + elif isinstance(value, dict) and isinstance(existing.get(key), dict): + existing[key].update(value) else: - merged_update[key] = value - db['findings'].update_one({'_id': _id}, {'$set': merged_update}) + existing[key] = value + db['findings'].update_one({'_id': _id}, {'$set': existing}) item._uuid = str(_id) status = 'UPDATED' debug('found existing finding in db, updating', sub='hooks.mongodb', id=str(_id), verbose=True) From 259f69125be75bb67b714a837da5d52a550e6ac9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 22 Nov 2025 19:56:40 +0000 Subject: [PATCH 5/8] refactor: extract list merge logic and optimize MongoDB updates Co-authored-by: ocervell <9629314+ocervell@users.noreply.github.com> --- secator/hooks/mongodb.py | 62 ++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/secator/hooks/mongodb.py b/secator/hooks/mongodb.py index 92023eae3..417ad24f6 100644 --- a/secator/hooks/mongodb.py +++ b/secator/hooks/mongodb.py @@ -101,6 +101,31 @@ def update_runner(self): debug(f'in {elapsed:.4f}s', sub='hooks.mongodb', id=_id, obj=get_runner_dbg(self), obj_after=False) +def _merge_lists(existing_list, new_list): + """Merge two lists, avoiding duplicates while preserving order. + + Args: + existing_list: The existing list to merge into + new_list: The new list to merge from + + Returns: + The merged list + """ + # Try to use set for O(1) lookup if items are hashable + try: + existing_set = set(existing_list) + for item_val in new_list: + if item_val not in existing_set: + existing_list.append(item_val) + existing_set.add(item_val) + except TypeError: + # Items are not hashable (e.g., dicts), fall back to O(n) check + for item_val in new_list: + if item_val not in existing_list: + existing_list.append(item_val) + return existing_list + + def update_finding(self, item): if type(item) not in OUTPUT_TYPES: return item @@ -128,7 +153,8 @@ def update_finding(self, item): for field in dataclass_fields(item_class): if field.compare and not field.name.startswith('_'): field_value = getattr(item, field.name) - # Include all non-None values in query, including False and 0 + # Include all non-None, non-empty values in query (including False and 0) + # Empty strings are excluded as they're not meaningful for duplicate detection if field_value is not None and field_value != '': query[field.name] = field_value @@ -138,12 +164,12 @@ def update_finding(self, item): if existing: # Update existing record instead of creating new one _id = existing['_id'] - # Merge data: start with existing record and selectively update with new values - # Note: We update the existing document in-place to avoid copying large fields + # Track which fields actually changed + changed_fields = {} for key, value in update.items(): # Always update special fields that may be modified if key in ('_timestamp', '_tagged', '_duplicate', '_related', '_source'): - existing[key] = value + changed_fields[key] = value continue # Skip None values - keep existing value if value is None: @@ -156,24 +182,22 @@ def update_finding(self, item): continue # For lists, merge them (avoiding duplicates) if isinstance(value, list) and isinstance(existing.get(key), list): - # Try to use set for O(1) lookup if items are hashable - try: - existing_set = set(existing[key]) - for item_val in value: - if item_val not in existing_set: - existing[key].append(item_val) - existing_set.add(item_val) - except TypeError: - # Items are not hashable (e.g., dicts), fall back to O(n) check - for item_val in value: - if item_val not in existing[key]: - existing[key].append(item_val) + merged_list = _merge_lists(existing[key].copy(), value) + if merged_list != existing[key]: + changed_fields[key] = merged_list # For dicts, merge them elif isinstance(value, dict) and isinstance(existing.get(key), dict): - existing[key].update(value) + merged_dict = existing[key].copy() + merged_dict.update(value) + if merged_dict != existing[key]: + changed_fields[key] = merged_dict else: - existing[key] = value - db['findings'].update_one({'_id': _id}, {'$set': existing}) + # Only update if value is different + if existing.get(key) != value: + changed_fields[key] = value + # Only update if there are actual changes + if changed_fields: + db['findings'].update_one({'_id': _id}, {'$set': changed_fields}) item._uuid = str(_id) status = 'UPDATED' debug('found existing finding in db, updating', sub='hooks.mongodb', id=str(_id), verbose=True) From 204d8e34783dd9cfdbc3d67d6f6908417110d0db Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 22 Nov 2025 19:59:48 +0000 Subject: [PATCH 6/8] fix: improve empty value checking and list merge comparison logic Co-authored-by: ocervell <9629314+ocervell@users.noreply.github.com> --- secator/hooks/mongodb.py | 45 +++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/secator/hooks/mongodb.py b/secator/hooks/mongodb.py index 417ad24f6..26cbc86b6 100644 --- a/secator/hooks/mongodb.py +++ b/secator/hooks/mongodb.py @@ -154,8 +154,11 @@ def update_finding(self, item): if field.compare and not field.name.startswith('_'): field_value = getattr(item, field.name) # Include all non-None, non-empty values in query (including False and 0) - # Empty strings are excluded as they're not meaningful for duplicate detection + # Empty strings, lists, and dicts are excluded as they're not meaningful for duplicate detection if field_value is not None and field_value != '': + # Skip empty lists and dicts + if isinstance(field_value, (list, dict)) and len(field_value) == 0: + continue query[field.name] = field_value # Look for existing finding with same identifying fields @@ -174,22 +177,40 @@ def update_finding(self, item): # Skip None values - keep existing value if value is None: continue - # Skip empty strings unless they override a non-empty value - if value == '' and existing.get(key): - continue - # Skip empty lists or dicts unless they override a non-empty value - if isinstance(value, (list, dict)) and len(value) == 0 and existing.get(key): + # Skip empty strings if existing value is non-empty + existing_value = existing.get(key) + if value == '' and existing_value and existing_value != '': continue + # Skip empty lists or dicts if existing value is non-empty + if isinstance(value, (list, dict)) and len(value) == 0: + if isinstance(existing_value, (list, dict)) and len(existing_value) > 0: + continue # For lists, merge them (avoiding duplicates) - if isinstance(value, list) and isinstance(existing.get(key), list): - merged_list = _merge_lists(existing[key].copy(), value) - if merged_list != existing[key]: + if isinstance(value, list) and isinstance(existing_value, list): + # Check if there are any new items to add + has_new_items = False + try: + existing_set = set(existing_value) + for item_val in value: + if item_val not in existing_set: + has_new_items = True + break + except TypeError: + # Items not hashable, use slower check + for item_val in value: + if item_val not in existing_value: + has_new_items = True + break + # Only merge and mark as changed if there are new items + if has_new_items: + merged_list = _merge_lists(existing_value.copy(), value) changed_fields[key] = merged_list # For dicts, merge them - elif isinstance(value, dict) and isinstance(existing.get(key), dict): - merged_dict = existing[key].copy() + elif isinstance(value, dict) and isinstance(existing_value, dict): + merged_dict = existing_value.copy() merged_dict.update(value) - if merged_dict != existing[key]: + # Only mark as changed if dict actually changed + if merged_dict != existing_value: changed_fields[key] = merged_dict else: # Only update if value is different From 0293b1e1d7acb427cea37261d2738f274bb19388 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 22 Nov 2025 20:01:22 +0000 Subject: [PATCH 7/8] refactor: extract _has_new_items helper and improve docstrings Co-authored-by: ocervell <9629314+ocervell@users.noreply.github.com> --- secator/hooks/mongodb.py | 45 +++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/secator/hooks/mongodb.py b/secator/hooks/mongodb.py index 26cbc86b6..e7f389c91 100644 --- a/secator/hooks/mongodb.py +++ b/secator/hooks/mongodb.py @@ -101,15 +101,40 @@ def update_runner(self): debug(f'in {elapsed:.4f}s', sub='hooks.mongodb', id=_id, obj=get_runner_dbg(self), obj_after=False) +def _has_new_items(existing_list, new_list): + """Check if new_list contains items not in existing_list. + + Args: + existing_list: The existing list to check against + new_list: The new list to check + + Returns: + bool: True if new_list has items not in existing_list + """ + try: + existing_set = set(existing_list) + for item_val in new_list: + if item_val not in existing_set: + return True + except TypeError: + # Items not hashable, use slower check + for item_val in new_list: + if item_val not in existing_list: + return True + return False + + def _merge_lists(existing_list, new_list): """Merge two lists, avoiding duplicates while preserving order. + Note: This function modifies existing_list in-place by appending new items from new_list. + Args: - existing_list: The existing list to merge into + existing_list: The existing list to merge into (modified in-place) new_list: The new list to merge from Returns: - The merged list + The merged list (same object as existing_list) """ # Try to use set for O(1) lookup if items are hashable try: @@ -187,22 +212,8 @@ def update_finding(self, item): continue # For lists, merge them (avoiding duplicates) if isinstance(value, list) and isinstance(existing_value, list): - # Check if there are any new items to add - has_new_items = False - try: - existing_set = set(existing_value) - for item_val in value: - if item_val not in existing_set: - has_new_items = True - break - except TypeError: - # Items not hashable, use slower check - for item_val in value: - if item_val not in existing_value: - has_new_items = True - break # Only merge and mark as changed if there are new items - if has_new_items: + if _has_new_items(existing_value, value): merged_list = _merge_lists(existing_value.copy(), value) changed_fields[key] = merged_list # For dicts, merge them From eabaee008827ed65f1d4fde96e5f4dbb91fe310f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 22 Nov 2025 20:03:24 +0000 Subject: [PATCH 8/8] fix: add safety check for _context attribute access Co-authored-by: ocervell <9629314+ocervell@users.noreply.github.com> --- secator/hooks/mongodb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/secator/hooks/mongodb.py b/secator/hooks/mongodb.py index e7f389c91..c3aa2b80f 100644 --- a/secator/hooks/mongodb.py +++ b/secator/hooks/mongodb.py @@ -128,6 +128,7 @@ def _merge_lists(existing_list, new_list): """Merge two lists, avoiding duplicates while preserving order. Note: This function modifies existing_list in-place by appending new items from new_list. + Callers should pass a copy if they want to preserve the original list. Args: existing_list: The existing list to merge into (modified in-place) @@ -169,7 +170,7 @@ def update_finding(self, item): query = {'_type': _type} # Add workspace_id to query if present to scope duplicates to workspace - workspace_id = item._context.get('workspace_id') + workspace_id = getattr(item, '_context', {}).get('workspace_id') if hasattr(item, '_context') else None if workspace_id: query['_context.workspace_id'] = workspace_id