Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
291 changes: 291 additions & 0 deletions lexical-graph/tests/unit/indexing/build/test_delete_sources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Regression tests for DeleteSources cascade deletion logic.

These tests verify that the Cypher queries generated by DeleteSources methods
contain the correct property names and relationship types, preventing
reintroduction of typos that caused incorrect orphan detection, missed cleanup,
and failed deletions.

Each method is tested with both Neo4j-style and Neptune-style mock graph stores
to ensure correctness on both backends.
"""

import pytest
from unittest.mock import MagicMock

from graphrag_toolkit.lexical_graph.storage.graph.graph_store import format_id, GraphStore
from graphrag_toolkit.lexical_graph.storage.graph.neptune_graph_stores import format_id_for_neptune
from graphrag_toolkit.lexical_graph.storage.vector import VectorStore
from graphrag_toolkit.lexical_graph.indexing.build.delete_sources import DeleteSources


@pytest.fixture
def neo4j_mock_store():
    """Provide a GraphStore-spec'd mock that formats ids via format_id().

    format_id() keeps property names verbatim; for example,
    node_id("c.chunkId") yields a NodeId whose str() is "c.chunkId".
    Both query-execution methods are stubbed to return an empty list.
    """
    def _node_id(field):
        # Delegate to the same formatter used for Neo4j-style ids.
        return format_id(field)

    graph_store = MagicMock(spec=GraphStore)
    graph_store.execute_query_with_retry = MagicMock(return_value=[])
    graph_store.execute_query = MagicMock(return_value=[])
    graph_store.node_id = _node_id
    return graph_store


@pytest.fixture
def neptune_mock_store():
    """Provide a GraphStore-spec'd mock that formats ids via format_id_for_neptune().

    format_id_for_neptune() collapses a property reference to the id(node)
    form; for example, node_id("c.chunkId") yields a NodeId whose str() is
    "id(c)". Both query-execution methods are stubbed to return an empty list.
    """
    def _node_id(field):
        # Delegate to the formatter used for Neptune-style ids.
        return format_id_for_neptune(field)

    graph_store = MagicMock(spec=GraphStore)
    graph_store.execute_query_with_retry = MagicMock(return_value=[])
    graph_store.execute_query = MagicMock(return_value=[])
    graph_store.node_id = _node_id
    return graph_store


@pytest.fixture
def mock_vector_store():
    """Provide a VectorStore-spec'd MagicMock.

    DeleteSources needs a vector store to be constructed, but these tests
    do not exercise it.
    """
    vector_store = MagicMock(spec=VectorStore)
    return vector_store


def make_delete_sources(graph_store, vector_store):
    """Build a DeleteSources instance around the supplied mock stores.

    DeleteSources is a Pydantic BaseComponent, so model_construct() is used
    to bypass validation; that lets MagicMock objects stand in for the
    graph_store and vector_store fields.
    """
    construct_kwargs = {
        "graph_store": graph_store,
        "vector_store": vector_store,
        "num_workers": 10,
        "batch_size": 1000,
    }
    return DeleteSources.model_construct(**construct_kwargs)


class TestEntityOrphanDetectionRelTypes:
    """Regression tests for get_orphaned_entity_ids() relationship types.

    These tests verify the query contains the correct relationship types
    ``__SUBJECT__`` and ``__OBJECT__`` for both backends, and that the
    misspelled ``__OBJECXT__`` type is absent.
    """

    SAMPLE_ENTITY_IDS = ["entity-1", "entity-2"]

    def _orphan_query(self, graph_store, vector_store):
        """Run get_orphaned_entity_ids() and return the Cypher text it executed.

        The query is taken from the first positional argument of the most
        recent ``execute_query`` call on ``graph_store``.
        """
        ds = make_delete_sources(graph_store, vector_store)
        ds.get_orphaned_entity_ids(self.SAMPLE_ENTITY_IDS)

        last_call = graph_store.execute_query.call_args
        # call_args is None until the mock has been called; check explicitly
        # so the failure names the real problem instead of a TypeError.
        assert last_call is not None, (
            "get_orphaned_entity_ids() did not call execute_query"
        )
        positional_args = last_call[0]
        assert positional_args, (
            "execute_query was called without a positional Cypher argument"
        )
        return positional_args[0]

    def test_neo4j_query_contains_object_rel_type(
        self, neo4j_mock_store, mock_vector_store
    ):
        """__OBJECT__ relationship type must appear in the Neo4j Cypher query."""
        cypher_query = self._orphan_query(neo4j_mock_store, mock_vector_store)
        assert "__OBJECT__" in cypher_query

    def test_neo4j_query_contains_subject_rel_type(
        self, neo4j_mock_store, mock_vector_store
    ):
        """__SUBJECT__ relationship type must appear in the Neo4j Cypher query."""
        cypher_query = self._orphan_query(neo4j_mock_store, mock_vector_store)
        assert "__SUBJECT__" in cypher_query

    def test_neptune_query_contains_object_rel_type(
        self, neptune_mock_store, mock_vector_store
    ):
        """__OBJECT__ relationship type must appear in the Neptune Cypher query."""
        cypher_query = self._orphan_query(neptune_mock_store, mock_vector_store)
        assert "__OBJECT__" in cypher_query

    def test_neptune_query_contains_subject_rel_type(
        self, neptune_mock_store, mock_vector_store
    ):
        """__SUBJECT__ relationship type must appear in the Neptune Cypher query."""
        cypher_query = self._orphan_query(neptune_mock_store, mock_vector_store)
        assert "__SUBJECT__" in cypher_query

    def test_neo4j_query_does_not_contain_objecxt_typo(
        self, neo4j_mock_store, mock_vector_store
    ):
        """The misspelled __OBJECXT__ type must not appear in the Neo4j Cypher query."""
        cypher_query = self._orphan_query(neo4j_mock_store, mock_vector_store)
        assert "__OBJECXT__" not in cypher_query

    def test_neptune_query_does_not_contain_objecxt_typo(
        self, neptune_mock_store, mock_vector_store
    ):
        """The misspelled __OBJECXT__ type must not appear in the Neptune Cypher query."""
        cypher_query = self._orphan_query(neptune_mock_store, mock_vector_store)
        assert "__OBJECXT__" not in cypher_query

class TestEntityRetrievalRelTypes:
    """Regression tests for get_entity_ids() relationship types.

    These tests verify that the executed query names both the ``__SUBJECT__``
    and ``__OBJECT__`` relationship types on both backends.
    """

    SAMPLE_FACT_IDS = ["fact-1", "fact-2"]

    def _entity_query(self, graph_store, vector_store):
        """Run get_entity_ids() and return the Cypher text it executed.

        The query is taken from the first positional argument of the most
        recent ``execute_query`` call on ``graph_store``.
        """
        ds = make_delete_sources(graph_store, vector_store)
        ds.get_entity_ids(self.SAMPLE_FACT_IDS)

        last_call = graph_store.execute_query.call_args
        # call_args is None until the mock has been called; check explicitly
        # so the failure names the real problem instead of a TypeError.
        assert last_call is not None, "get_entity_ids() did not call execute_query"
        positional_args = last_call[0]
        assert positional_args, (
            "execute_query was called without a positional Cypher argument"
        )
        return positional_args[0]

    def test_neo4j_query_contains_subject_and_object_rel_types(
        self, neo4j_mock_store, mock_vector_store
    ):
        """Both __SUBJECT__ and __OBJECT__ must appear in the Neo4j Cypher query."""
        cypher_query = self._entity_query(neo4j_mock_store, mock_vector_store)
        assert "__SUBJECT__" in cypher_query
        assert "__OBJECT__" in cypher_query

    def test_neptune_query_contains_subject_and_object_rel_types(
        self, neptune_mock_store, mock_vector_store
    ):
        """Both __SUBJECT__ and __OBJECT__ must appear in the Neptune Cypher query."""
        cypher_query = self._entity_query(neptune_mock_store, mock_vector_store)
        assert "__SUBJECT__" in cypher_query
        assert "__OBJECT__" in cypher_query

class TestOrphanFactDetectionProperty:
    """Regression tests for get_orphaned_fact_ids() property name.

    These tests verify that the executed query references the fact id
    (``f.factId`` on Neo4j, ``id(f)`` on Neptune) and not ``f.statementId``.
    """

    SAMPLE_FACT_IDS = ["fact-1", "fact-2"]

    def _orphan_fact_query(self, graph_store, vector_store):
        """Run get_orphaned_fact_ids() and return the Cypher text it executed.

        The query is taken from the first positional argument of the most
        recent ``execute_query`` call on ``graph_store``.
        """
        ds = make_delete_sources(graph_store, vector_store)
        ds.get_orphaned_fact_ids(self.SAMPLE_FACT_IDS)

        last_call = graph_store.execute_query.call_args
        # call_args is None until the mock has been called; check explicitly
        # so the failure names the real problem instead of a TypeError.
        assert last_call is not None, (
            "get_orphaned_fact_ids() did not call execute_query"
        )
        positional_args = last_call[0]
        assert positional_args, (
            "execute_query was called without a positional Cypher argument"
        )
        return positional_args[0]

    def test_neo4j_query_contains_f_factId(
        self, neo4j_mock_store, mock_vector_store
    ):
        """The Neo4j Cypher query must reference f.factId in the WHERE clause."""
        cypher_query = self._orphan_fact_query(neo4j_mock_store, mock_vector_store)
        assert "f.factId" in cypher_query

    def test_neo4j_query_does_not_contain_f_statementId(
        self, neo4j_mock_store, mock_vector_store
    ):
        """The Neo4j Cypher query must not reference f.statementId."""
        cypher_query = self._orphan_fact_query(neo4j_mock_store, mock_vector_store)
        assert "f.statementId" not in cypher_query

    def test_neptune_query_contains_id_f(
        self, neptune_mock_store, mock_vector_store
    ):
        """The Neptune Cypher query must reference id(f) in the WHERE clause."""
        cypher_query = self._orphan_fact_query(neptune_mock_store, mock_vector_store)
        assert "id(f)" in cypher_query

class TestChunkDeletionProperty:
    """Regression tests for the property name used by delete_chunks()."""

    SAMPLE_CHUNK_IDS = ["chunk-1", "chunk-2"]

    def _delete_and_get_calls(self, graph_store, vector_store):
        """Run delete_chunks() and return the two recorded retry-query calls."""
        ds = make_delete_sources(graph_store, vector_store)
        ds.delete_chunks(self.SAMPLE_CHUNK_IDS)

        recorded = graph_store.execute_query_with_retry.call_args_list
        assert len(recorded) == 2, f"Expected 2 queries, got {len(recorded)}"
        return recorded

    def test_neo4j_both_queries_contain_c_chunkId(
        self, neo4j_mock_store, mock_vector_store
    ):
        """c.chunkId must appear in both Cypher queries on Neo4j."""
        recorded = self._delete_and_get_calls(neo4j_mock_store, mock_vector_store)

        for position, invocation in enumerate(recorded, start=1):
            query_text = invocation[0][0]
            assert "c.chunkId" in query_text, (
                f"Query {position} missing c.chunkId: {query_text}"
            )

    def test_neo4j_neither_query_contains_c_chunkIds_typo(
        self, neo4j_mock_store, mock_vector_store
    ):
        """The c.chunkIds typo must not appear in either Cypher query on Neo4j."""
        recorded = self._delete_and_get_calls(neo4j_mock_store, mock_vector_store)

        for position, invocation in enumerate(recorded, start=1):
            query_text = invocation[0][0]
            assert "c.chunkIds" not in query_text, (
                f"Query {position} contains typo c.chunkIds: {query_text}"
            )

    def test_neptune_both_queries_contain_id_c(
        self, neptune_mock_store, mock_vector_store
    ):
        """id(c) must appear in both Cypher queries on Neptune."""
        recorded = self._delete_and_get_calls(neptune_mock_store, mock_vector_store)

        for position, invocation in enumerate(recorded, start=1):
            query_text = invocation[0][0]
            assert "id(c)" in query_text, (
                f"Query {position} missing id(c): {query_text}"
            )

class TestChunkRetrievalProperty:
    """Regression tests for get_chunk_ids() property name.

    These tests verify that the executed query references the chunk id
    (``c.chunkId`` on Neo4j, ``id(c)`` on Neptune) and not ``c.chunkIds``.
    """

    SAMPLE_SOURCE_ID = "source-1"

    def _chunk_query(self, graph_store, vector_store):
        """Run get_chunk_ids() and return the Cypher text it executed.

        The query is taken from the first positional argument of the most
        recent ``execute_query`` call on ``graph_store``.
        """
        ds = make_delete_sources(graph_store, vector_store)
        ds.get_chunk_ids(self.SAMPLE_SOURCE_ID)

        last_call = graph_store.execute_query.call_args
        # call_args is None until the mock has been called; check explicitly
        # so the failure names the real problem instead of a TypeError.
        assert last_call is not None, "get_chunk_ids() did not call execute_query"
        positional_args = last_call[0]
        assert positional_args, (
            "execute_query was called without a positional Cypher argument"
        )
        return positional_args[0]

    def test_neo4j_query_contains_c_chunkId(
        self, neo4j_mock_store, mock_vector_store
    ):
        """c.chunkId must appear in the Neo4j Cypher query."""
        cypher_query = self._chunk_query(neo4j_mock_store, mock_vector_store)
        assert "c.chunkId" in cypher_query

    def test_neo4j_query_does_not_contain_c_chunkIds_typo(
        self, neo4j_mock_store, mock_vector_store
    ):
        """The c.chunkIds typo must not appear in the Neo4j Cypher query."""
        cypher_query = self._chunk_query(neo4j_mock_store, mock_vector_store)
        assert "c.chunkIds" not in cypher_query

    def test_neptune_query_contains_id_c(
        self, neptune_mock_store, mock_vector_store
    ):
        """id(c) must appear in the Neptune Cypher query."""
        cypher_query = self._chunk_query(neptune_mock_store, mock_vector_store)
        assert "id(c)" in cypher_query
Loading