# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Regression tests for DeleteSources cascade deletion logic.

These tests verify that the Cypher queries generated by DeleteSources methods
contain the correct property names and relationship types, preventing
reintroduction of typos that caused incorrect orphan detection, missed cleanup,
and failed deletions.

Each method is tested with both Neo4j-style and Neptune-style mock graph stores
to ensure correctness on both backends.
"""

from unittest.mock import MagicMock

import pytest

from graphrag_toolkit.lexical_graph.storage.graph.graph_store import format_id, GraphStore
from graphrag_toolkit.lexical_graph.storage.graph.neptune_graph_stores import format_id_for_neptune
from graphrag_toolkit.lexical_graph.storage.vector import VectorStore
from graphrag_toolkit.lexical_graph.indexing.build.delete_sources import DeleteSources


def _make_mock_graph_store(id_formatter):
    """Build a GraphStore mock whose node_id() delegates to *id_formatter*.

    Both query entry points are replaced with mocks returning an empty result
    list, so the method under test runs without a backend while the Cypher
    text it submits stays available for inspection.
    """
    store = MagicMock(spec=GraphStore)
    store.node_id = id_formatter
    store.execute_query = MagicMock(return_value=[])
    store.execute_query_with_retry = MagicMock(return_value=[])
    return store


def _last_query(query_method):
    """Return the first positional argument of the latest call to *query_method*.

    Asserting that the mock was called first turns "no query was issued" into
    a readable assertion failure rather than a TypeError from subscripting a
    ``call_args`` of None.
    """
    query_method.assert_called()
    return query_method.call_args[0][0]


@pytest.fixture
def neo4j_mock_store():
    """Mock GraphStore that uses format_id() — preserves property names verbatim.

    For example, node_id("c.chunkId") returns a NodeId whose str() is "c.chunkId".
    """
    return _make_mock_graph_store(format_id)


@pytest.fixture
def neptune_mock_store():
    """Mock GraphStore that uses format_id_for_neptune() — collapses to id(node) form.

    For example, node_id("c.chunkId") returns a NodeId whose str() is "id(c)".
    """
    return _make_mock_graph_store(format_id_for_neptune)


@pytest.fixture
def mock_vector_store():
    """MagicMock of VectorStore — required by DeleteSources but not exercised."""
    return MagicMock(spec=VectorStore)


def make_delete_sources(graph_store, vector_store):
    """Instantiate DeleteSources with the given mock stores.

    DeleteSources is a Pydantic BaseComponent, so we use model_construct()
    to bypass validation and allow MagicMock instances for graph_store and
    vector_store.
    """
    return DeleteSources.model_construct(
        graph_store=graph_store,
        vector_store=vector_store,
        num_workers=10,
        batch_size=1000,
    )


class TestEntityOrphanDetectionRelTypes:
    """Regression tests for get_orphaned_entity_ids() relationship types.

    These tests verify the query contains the correct
    relationship types ``__SUBJECT__`` and ``__OBJECT__`` for both backends.
    """

    SAMPLE_ENTITY_IDS = ["entity-1", "entity-2"]

    def _query(self, graph_store, vector_store):
        """Run get_orphaned_entity_ids() and return the Cypher text it submitted."""
        ds = make_delete_sources(graph_store, vector_store)
        ds.get_orphaned_entity_ids(self.SAMPLE_ENTITY_IDS)
        return _last_query(graph_store.execute_query)

    def test_neo4j_query_contains_object_rel_type(
        self, neo4j_mock_store, mock_vector_store
    ):
        """__OBJECT__ relationship type must appear in the Neo4j Cypher query."""
        cypher_query = self._query(neo4j_mock_store, mock_vector_store)
        assert "__OBJECT__" in cypher_query

    def test_neo4j_query_contains_subject_rel_type(
        self, neo4j_mock_store, mock_vector_store
    ):
        """__SUBJECT__ relationship type must appear in the Neo4j Cypher query."""
        cypher_query = self._query(neo4j_mock_store, mock_vector_store)
        assert "__SUBJECT__" in cypher_query

    def test_neptune_query_contains_object_rel_type(
        self, neptune_mock_store, mock_vector_store
    ):
        """__OBJECT__ relationship type must appear in the Neptune Cypher query."""
        cypher_query = self._query(neptune_mock_store, mock_vector_store)
        assert "__OBJECT__" in cypher_query

    def test_neptune_query_contains_subject_rel_type(
        self, neptune_mock_store, mock_vector_store
    ):
        """__SUBJECT__ relationship type must appear in the Neptune Cypher query."""
        cypher_query = self._query(neptune_mock_store, mock_vector_store)
        assert "__SUBJECT__" in cypher_query

    def test_neo4j_query_does_not_contain_objecxt_typo(
        self, neo4j_mock_store, mock_vector_store
    ):
        """The misspelling __OBJECXT__ must not appear in the Neo4j Cypher query."""
        cypher_query = self._query(neo4j_mock_store, mock_vector_store)
        assert "__OBJECXT__" not in cypher_query

    def test_neptune_query_does_not_contain_objecxt_typo(
        self, neptune_mock_store, mock_vector_store
    ):
        """The misspelling __OBJECXT__ must not appear in the Neptune Cypher query."""
        cypher_query = self._query(neptune_mock_store, mock_vector_store)
        assert "__OBJECXT__" not in cypher_query


class TestEntityRetrievalRelTypes:
    """Regression tests for get_entity_ids() relationship types.

    These tests verify the query is correct for both backends.
    """

    SAMPLE_FACT_IDS = ["fact-1", "fact-2"]

    def _query(self, graph_store, vector_store):
        """Run get_entity_ids() and return the Cypher text it submitted."""
        ds = make_delete_sources(graph_store, vector_store)
        ds.get_entity_ids(self.SAMPLE_FACT_IDS)
        return _last_query(graph_store.execute_query)

    def test_neo4j_query_contains_subject_and_object_rel_types(
        self, neo4j_mock_store, mock_vector_store
    ):
        """Both __SUBJECT__ and __OBJECT__ must appear in the Neo4j Cypher query."""
        cypher_query = self._query(neo4j_mock_store, mock_vector_store)
        assert "__SUBJECT__" in cypher_query
        assert "__OBJECT__" in cypher_query

    def test_neptune_query_contains_subject_and_object_rel_types(
        self, neptune_mock_store, mock_vector_store
    ):
        """Both __SUBJECT__ and __OBJECT__ must appear in the Neptune Cypher query."""
        cypher_query = self._query(neptune_mock_store, mock_vector_store)
        assert "__SUBJECT__" in cypher_query
        assert "__OBJECT__" in cypher_query


class TestOrphanFactDetectionProperty:
    """Regression tests for get_orphaned_fact_ids() property name.

    These tests verify the correct property is used.
    """

    SAMPLE_FACT_IDS = ["fact-1", "fact-2"]

    def _query(self, graph_store, vector_store):
        """Run get_orphaned_fact_ids() and return the Cypher text it submitted."""
        ds = make_delete_sources(graph_store, vector_store)
        ds.get_orphaned_fact_ids(self.SAMPLE_FACT_IDS)
        return _last_query(graph_store.execute_query)

    def test_neo4j_query_contains_f_factId(
        self, neo4j_mock_store, mock_vector_store
    ):
        """The Neo4j Cypher query must reference f.factId in the WHERE clause."""
        cypher_query = self._query(neo4j_mock_store, mock_vector_store)
        assert "f.factId" in cypher_query

    def test_neo4j_query_does_not_contain_f_statementId(
        self, neo4j_mock_store, mock_vector_store
    ):
        """The Neo4j Cypher query must not reference f.statementId."""
        cypher_query = self._query(neo4j_mock_store, mock_vector_store)
        assert "f.statementId" not in cypher_query

    def test_neptune_query_contains_id_f(
        self, neptune_mock_store, mock_vector_store
    ):
        """The Neptune Cypher query must reference id(f) in the WHERE clause."""
        cypher_query = self._query(neptune_mock_store, mock_vector_store)
        assert "id(f)" in cypher_query


class TestChunkDeletionProperty:
    """Regression tests for delete_chunks() property name."""

    SAMPLE_CHUNK_IDS = ["chunk-1", "chunk-2"]

    def _queries(self, graph_store, vector_store):
        """Run delete_chunks() and return the Cypher text of each retry call.

        Exactly two execute_query_with_retry calls are expected; any other
        count fails here so the per-query assertions never run on a partial
        set.
        """
        ds = make_delete_sources(graph_store, vector_store)
        ds.delete_chunks(self.SAMPLE_CHUNK_IDS)

        calls = graph_store.execute_query_with_retry.call_args_list
        assert len(calls) == 2, f"Expected 2 queries, got {len(calls)}"
        return [call[0][0] for call in calls]

    def test_neo4j_both_queries_contain_c_chunkId(
        self, neo4j_mock_store, mock_vector_store
    ):
        """c.chunkId must appear in both Cypher queries on Neo4j."""
        queries = self._queries(neo4j_mock_store, mock_vector_store)
        for i, cypher_query in enumerate(queries, start=1):
            assert "c.chunkId" in cypher_query, (
                f"Query {i} missing c.chunkId: {cypher_query}"
            )

    def test_neo4j_neither_query_contains_c_chunkIds_typo(
        self, neo4j_mock_store, mock_vector_store
    ):
        """The misspelling c.chunkIds must not appear in either Neo4j Cypher query."""
        queries = self._queries(neo4j_mock_store, mock_vector_store)
        for i, cypher_query in enumerate(queries, start=1):
            assert "c.chunkIds" not in cypher_query, (
                f"Query {i} contains typo c.chunkIds: {cypher_query}"
            )

    def test_neptune_both_queries_contain_id_c(
        self, neptune_mock_store, mock_vector_store
    ):
        """id(c) must appear in both Cypher queries on Neptune."""
        queries = self._queries(neptune_mock_store, mock_vector_store)
        for i, cypher_query in enumerate(queries, start=1):
            assert "id(c)" in cypher_query, (
                f"Query {i} missing id(c): {cypher_query}"
            )


class TestChunkRetrievalProperty:
    """Regression tests for get_chunk_ids() property name."""

    SAMPLE_SOURCE_ID = "source-1"

    def _query(self, graph_store, vector_store):
        """Run get_chunk_ids() and return the Cypher text it submitted."""
        ds = make_delete_sources(graph_store, vector_store)
        ds.get_chunk_ids(self.SAMPLE_SOURCE_ID)
        return _last_query(graph_store.execute_query)

    def test_neo4j_query_contains_c_chunkId(
        self, neo4j_mock_store, mock_vector_store
    ):
        """c.chunkId must appear in the Neo4j Cypher query."""
        cypher_query = self._query(neo4j_mock_store, mock_vector_store)
        assert "c.chunkId" in cypher_query

    def test_neo4j_query_does_not_contain_c_chunkIds_typo(
        self, neo4j_mock_store, mock_vector_store
    ):
        """The misspelling c.chunkIds must not appear in the Neo4j Cypher query."""
        cypher_query = self._query(neo4j_mock_store, mock_vector_store)
        assert "c.chunkIds" not in cypher_query

    def test_neptune_query_contains_id_c(
        self, neptune_mock_store, mock_vector_store
    ):
        """id(c) must appear in the Neptune Cypher query."""
        cypher_query = self._query(neptune_mock_store, mock_vector_store)
        assert "id(c)" in cypher_query