diff --git a/.gitignore b/.gitignore index 5e5bc30..3ad1bcc 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ env/ # Test / coverage .pytest_cache/ +backend/pytest-unit-results.xml coverage.xml htmlcov/ diff --git a/backend/app/adapters/repos/base.py b/backend/app/adapters/repos/base.py index 7d4739b..2dc89ad 100644 --- a/backend/app/adapters/repos/base.py +++ b/backend/app/adapters/repos/base.py @@ -32,9 +32,10 @@ async def list_gt_paginated( tags: list[str] | None = None, exclude_tags: list[str] | None = None, item_id: str | None = None, - ref_url: str | None = None, + plugin_filters: dict[str, str] | None = None, keyword: str | None = None, sort_by: SortField | None = None, + plugin_sort: str | None = None, sort_order: SortOrder | None = None, page: int = 1, limit: int = 25, diff --git a/backend/app/adapters/repos/cosmos_repo.py b/backend/app/adapters/repos/cosmos_repo.py index 2dfe5b4..5a394c7 100644 --- a/backend/app/adapters/repos/cosmos_repo.py +++ b/backend/app/adapters/repos/cosmos_repo.py @@ -21,6 +21,7 @@ from azure.cosmos.exceptions import CosmosHttpResponseError, CosmosResourceNotFoundError from app.adapters.repos.base import GroundTruthRepo +from app.domain.conversation_fields import answer_text_from_item, question_text_from_item from app.domain.models import ( AgenticGroundTruthEntry, Stats, @@ -32,6 +33,8 @@ ) from app.domain.enums import GroundTruthStatus, SortField, SortOrder from app.core.config import get_sampling_allocation +from app.plugins.base import PluginPackRegistry +from app.plugins.pack_registry import get_default_pack_registry _SMART_PUNCT_REPLACEMENTS: dict[str, str] = { @@ -51,19 +54,16 @@ **{ord(ch): " " for ch in (chr(i) for i in range(32)) if ch not in ("\n", "\r", "\t")}, ord("\u007f"): " ", } - # Cosmos DB SELECT clause for AgenticGroundTruthEntry fields used in several functions # list_gt_paginated, _list_gt_paginated_with_emulator, list_gt_by_dataset -# Note: legacy fields like synthQuestion, editedQuestion are still selected for compatibility -# during migration, but the model will access them via computed properties SELECT_CLAUSE_C = ( "SELECT c.id, c.datasetName, c.bucket, c.status, c.docType, c.schemaVersion, " - "c.synthQuestion, c.editedQuestion, c.answer, c.refs, c.tags, c.manualTags, c.computedTags, c.comment, c.plugins, " + "c.tags, c.manualTags, c.computedTags, c.comment, c.plugins, " "c.scenarioId, c.history, c.contextEntries, c.traceIds, c.toolCalls, c.expectedTools, " "c.feedback, c.metadata, c.createdBy, c.createdAt, c.tracePayload, " "c.contextUsedForGeneration, c.contextSource, c.modelUsedForGeneration, " "c.semanticClusterNumber, c.weight, c.samplingBucket, c.questionLength, " - "c.assignedTo, c.assignedAt, c.totalReferences, c.updatedAt, c.updatedBy, c.reviewedAt, c._etag " + "c.assignedTo, c.assignedAt, c.updatedAt, c.updatedBy, c.reviewedAt, c._etag " ) @@ -147,7 +147,7 @@ def _sanitize_string_for_cosmos(value: str) -> str: def _normalize_unicode_for_cosmos(obj: Any) -> Any: """ Recursively sanitize strings to work around Cosmos emulator Unicode bugs. - Also Base64-encodes 'content' fields in 'refs' arrays as a workaround. + Also Base64-encodes 'content' fields in reference arrays as a workaround. 
""" if not settings.COSMOS_DISABLE_UNICODE_ESCAPE: @@ -158,11 +158,11 @@ def _normalize_unicode_for_cosmos(obj: Any) -> Any: if isinstance(obj, dict): normalized = {} for k, v in obj.items(): - # Special handling for 'refs' array - encode content fields - if k == "refs" and isinstance(v, list): - # First normalize the refs + # Special handling for canonical reference arrays - encode content fields + if k == "references" and isinstance(v, list): + # First normalize the reference entries normalized_refs = [_normalize_unicode_for_cosmos(item) for item in v] - # Then Base64-encode content fields in refs + # Then Base64-encode content fields in references normalized[k] = _base64_encode_refs_content(normalized_refs) else: normalized[k] = _normalize_unicode_for_cosmos(v) @@ -175,7 +175,7 @@ def _normalize_unicode_for_cosmos(obj: Any) -> Any: def _restore_unicode_from_cosmos(obj: Any) -> Any: """ Reverse emulator-only sanitization markers after fetching documents. - Also Base64-decodes 'content' fields in 'refs' arrays. + Also Base64-decodes 'content' fields in reference arrays. """ if not settings.COSMOS_DISABLE_UNICODE_ESCAPE: @@ -188,8 +188,8 @@ def _restore_unicode_from_cosmos(obj: Any) -> Any: if isinstance(obj, dict): restored = {} for k, v in obj.items(): - # Special handling for 'refs' array - decode content fields - if k == "refs" and isinstance(v, list): + # Special handling for canonical reference arrays - decode content fields + if k == "references" and isinstance(v, list): # First decode Base64-encoded content fields decoded_refs = _base64_decode_refs_content(v) # Then restore backslash sentinels @@ -221,6 +221,7 @@ def __init__( connection_verify: bool | str | None = None, test_mode: bool = False, credential: Any | None = None, + plugin_pack_registry: PluginPackRegistry | None = None, ): # Defer CosmosClient creation to _init so the underlying aiohttp session binds # to the event loop of the running app (avoids cross-loop RuntimeError in tests). @@ -236,6 +237,7 @@ def __init__( self._db: DatabaseProxy | None = None self._gt_container: ContainerProxy | None = None self._assignments_container: ContainerProxy | None = None + self._plugin_pack_registry = plugin_pack_registry or get_default_pack_registry() # Track the event loop on which the aiohttp client/session was created to # guard against cross-loop usage during tests. 
self._loop: asyncio.AbstractEventLoop | None = None # set in _init on first use @@ -394,10 +396,6 @@ def _to_doc(self, item: AgenticGroundTruthEntry) -> dict[str, Any]: # Dump in JSON mode so datetimes/enums are serialized to strings d = item.model_dump(mode="json", by_alias=True) - # Ensure totalReferences is computed and persisted for sorting/querying - # Use the property getter which handles both explicit values and plugin storage - d["totalReferences"] = item.totalReferences - if d.get("bucket") is not None: d["bucket"] = str(d["bucket"]) # store UUID as string # Ensure updatedAt present as ISO string @@ -406,58 +404,23 @@ def _to_doc(self, item: AgenticGroundTruthEntry) -> dict[str, Any]: return d - @staticmethod - def _from_doc(doc: dict[str, Any]) -> AgenticGroundTruthEntry: + def _from_doc(self, doc: dict[str, Any]) -> AgenticGroundTruthEntry: # Normalize doc before validation normalized_doc = ( _restore_unicode_from_cosmos(doc) if settings.COSMOS_DISABLE_UNICODE_ESCAPE else doc ) - from app.plugins.packs.rag_compat import _LEGACY_PLUGIN_FIELDS - - allowed_keys = ( - {field_name for field_name in AgenticGroundTruthEntry.model_fields} - | { - field.alias - for field in AgenticGroundTruthEntry.model_fields.values() - if field.alias is not None - } - | { - # Include computed_fields that need to be preserved from Cosmos documents - "totalReferences" # Computed and persisted for sorting/querying - } - | set(_LEGACY_PLUGIN_FIELDS) - ) - normalized_doc = { - key: value for key, value in normalized_doc.items() if key in allowed_keys + allowed_keys = {field_name for field_name in AgenticGroundTruthEntry.model_fields} | { + field.alias + for field in AgenticGroundTruthEntry.model_fields.values() + if field.alias is not None } + transformed_doc: dict[str, Any] = dict(normalized_doc) + for transform in self._plugin_pack_registry.collect_import_transforms(): + transformed_doc = transform.transform(transformed_doc) - plugins = normalized_doc.get("plugins") - rag_plugin = plugins.get("rag-compat") if isinstance(plugins, dict) else None - rag_data = rag_plugin.get("data") if isinstance(rag_plugin, dict) else None - history_annotations = ( - rag_data.get("historyAnnotations") if isinstance(rag_data, dict) else None - ) - history = normalized_doc.get("history") - if isinstance(history, list) and isinstance(history_annotations, list): - merged_history: list[Any] = [] - for index, entry in enumerate(history): - if isinstance(entry, dict): - entry_dict = dict(entry) - annotation = ( - history_annotations[index] if index < len(history_annotations) else None - ) - if isinstance(annotation, dict): - if "refs" in annotation and "refs" not in entry_dict: - entry_dict["refs"] = annotation["refs"] - if ( - "expectedBehavior" in annotation - and "expectedBehavior" not in entry_dict - ): - entry_dict["expectedBehavior"] = annotation["expectedBehavior"] - merged_history.append(entry_dict) - else: - merged_history.append(entry) - normalized_doc["history"] = merged_history + normalized_doc = { + key: value for key, value in transformed_doc.items() if key in allowed_keys + } # Convert None to [] for history field (legacy data compatibility) if normalized_doc.get("history") is None: @@ -466,12 +429,6 @@ def _from_doc(doc: dict[str, Any]) -> AgenticGroundTruthEntry: # Pydantic will parse aliases automatically item = AgenticGroundTruthEntry.model_validate(normalized_doc) - # IMPORTANT: totalReferences is a @computed_field, so Pydantic won't deserialize it - # from the document. 
We need to manually set it in __dict__ so the property getter - can find it. This preserves the value we computed and persisted in _to_doc. - if "totalReferences" in normalized_doc: - item.__dict__["totalReferences"] = normalized_doc["totalReferences"] - return item async def _ensure_initialized(self) -> None: @@ -595,12 +552,7 @@ async def import_bulk_gt( status = getattr(e, "status_code", None) if status == 409: # Duplicate; report but continue others - article_num = ( - doc.get("refs", [{}])[0].get("url", "unknown") - if doc.get("refs") - else "unknown" - ) - message = f"exists (article: {article_num}, id: {doc.get('id', 'unknown')})" + message = f"exists (id: {doc.get('id', 'unknown')})" errors.append(message) persistence_errors.append( BulkImportPersistenceError( ) ) else: - article_num = ( - doc.get("refs", [{}])[0].get("url", "unknown") - if doc.get("refs") - else "unknown" - ) message = ( - f"create_failed (article: {article_num}, id: {doc.get('id', 'unknown')}): " + f"create_failed (id: {doc.get('id', 'unknown')}): " f"{getattr(e, 'message', str(e))}" ) errors.append(message) @@ -708,16 +655,6 @@ def _build_query_filter( ) params.append({"name": pname, "value": tag}) - # Ref URL filtering only if not using the Cosmos Emulator as it does not support EXISTS - # include_ref_url set to True when Comsomus Emulator is not used - if include_ref_url and ref_url: - clauses.append( - "(EXISTS(SELECT VALUE r FROM r IN c.refs WHERE CONTAINS(r.url, @refUrl)) " - "OR EXISTS(SELECT VALUE h FROM h IN c.history " - "WHERE EXISTS(SELECT VALUE r FROM r IN h.refs WHERE CONTAINS(r.url, @refUrl))))" - ) - params.append({"name": "@refUrl", "value": ref_url}) - where_clause = " WHERE " + " AND ".join(clauses) if clauses else "" return where_clause, params @@ -747,7 +684,17 @@ def _resolve_sort( return field, direction @staticmethod - def _sort_key(item: AgenticGroundTruthEntry, field: SortField) -> tuple[Any, ...]: + def _sort_key( + item: AgenticGroundTruthEntry, + field: SortField, + plugin_sort: str | None = None, + plugin_pack_registry: PluginPackRegistry | None = None, + ) -> tuple[Any, ...]: + if plugin_sort: + if plugin_pack_registry is None: + return (-1, item.id) + plugin_value = plugin_pack_registry.plugin_sort_value(item, plugin_sort) + return (plugin_value if plugin_value is not None else -1, item.id) if field == SortField.id: return (item.id or "",) @@ -762,15 +711,12 @@ def _sort_key(item: AgenticGroundTruthEntry, field: SortField) -> tuple[Any, ... if field == SortField.has_answer: # In-memory sort: Primary by presence of non-empty answer, secondary by reviewed_at # (Cosmos ORDER BY uses c.reviewedAt placeholder - see _build_secure_sort_clause) - has_answer = 1 if item.answer and item.answer.strip() else 0 + has_answer = 1 if answer_text_from_item(item) else 0 reference_time = ( item.reviewed_at or item.updated_at or datetime(1970, 1, 1, tzinfo=timezone.utc) ) return (has_answer, reference_time, item.id) - if field == SortField.totalReferences: - return (item.totalReferences, item.id) - if field == SortField.tag_count: tag_count = len(item.tags) return (tag_count, item.id) @@ -789,8 +735,8 @@ def _item_matches_keyword(item: AgenticGroundTruthEntry, keyword: str) -> bool: """Check if item matches keyword search (case-insensitive substring match). 
Searches across: - - synth_question and edited_question fields - - answer field + - canonical question text (derived from history/plugin data) + - canonical answer text (derived from history/plugin data) - history[*].msg content (all turns) """ if not keyword: @@ -799,13 +745,13 @@ def _item_matches_keyword(item: AgenticGroundTruthEntry, keyword: str) -> bool: search_term = keyword.lower() # Search question fields - if item.synth_question and search_term in item.synth_question.lower(): - return True - if item.edited_question and search_term in item.edited_question.lower(): + question_text = question_text_from_item(item) + if question_text and search_term in question_text.lower(): return True # Search answer field - if item.answer and search_term in item.answer.lower(): + answer_text = answer_text_from_item(item) + if answer_text and search_term in answer_text.lower(): return True # Search history messages @@ -830,7 +776,6 @@ def _build_secure_sort_clause(self, sort_field: SortField, sort_direction: SortO SortField.updated_at: "c.updatedAt", SortField.reviewed_at: "c.reviewedAt", SortField.has_answer: "c.reviewedAt", # Placeholder - actual sort is in-memory - SortField.totalReferences: "c.totalReferences", } # Security: Safe direction mapping (no user input) @@ -860,9 +805,10 @@ async def list_gt_paginated( tags: list[str] | None = None, exclude_tags: list[str] | None = None, item_id: str | None = None, - ref_url: str | None = None, + plugin_filters: dict[str, str] | None = None, keyword: str | None = None, sort_by: SortField | None = None, + plugin_sort: str | None = None, sort_order: SortOrder | None = None, page: int = 1, limit: int = 25, @@ -889,9 +835,11 @@ async def list_gt_paginated( if ( normalized_tags or normalized_exclude_tags - or ref_url + or plugin_filters or keyword or sort_field == SortField.tag_count + or sort_field == SortField.has_answer + or plugin_sort is not None ): # Always use in-memory filtering path for these filters # (Cosmos emulator has limitations, and keyword search needs in-memory filtering regardless) @@ -901,9 +849,10 @@ async def list_gt_paginated( normalized_tags, normalized_exclude_tags, item_id, - ref_url, + plugin_filters, keyword, sort_by, + plugin_sort, sort_order, safe_page, safe_limit, @@ -916,9 +865,9 @@ async def list_gt_paginated( normalized_tags, normalized_exclude_tags, item_id, - ref_url, + None, include_tags=True, - include_ref_url=True, + include_ref_url=False, ) # Build ORDER BY clause @@ -982,9 +931,10 @@ async def _list_gt_paginated_with_emulator( tags: list[str], exclude_tags: list[str], item_id: str | None, - ref_url: str | None, + plugin_filters: dict[str, str] | None, keyword: str | None, sort_by: SortField | None, + plugin_sort: str | None, sort_order: SortOrder | None, page: int, limit: int, @@ -1008,7 +958,7 @@ async def _list_gt_paginated_with_emulator( tags, exclude_tags, item_id, - ref_url, + None, include_tags=False, # Disable SQL-level tag filtering - filter in-memory instead include_ref_url=False, # Disable ref_url filtering for emulator ) @@ -1071,39 +1021,23 @@ async def _list_gt_paginated_with_emulator( filtered_items_exclude.append(item) raw_items = filtered_items_exclude - # Filter by ref_url in-memory (EXISTS not supported by Cosmos DB emulator) - if ref_url: + # Filter by plugin-owned filters in-memory. 
+ if plugin_filters: start = time.time() - filtered_items_ref: list[AgenticGroundTruthEntry] = [] - total_refs_checked = 0 + filtered_items_plugin: list[AgenticGroundTruthEntry] = [] for item in raw_items: - # Check item-level refs - has_match = any(ref_url in ref.url for ref in item.refs) - total_refs_checked += len(item.refs) - - # Check history-level refs if no match yet - if not has_match and item.history: - for turn in item.history: - turn_refs = getattr(turn, "refs", None) - if turn_refs: - total_refs_checked += len(turn_refs) - if any(ref_url in ref.url for ref in turn_refs): - has_match = True - break - if has_match: - filtered_items_ref.append(item) + if self._plugin_pack_registry.matches_query_filters(item, plugin_filters): + filtered_items_plugin.append(item) elapsed = time.time() - start self._logger.info( - "repo.ref_url_filter.performance" + "repo.plugin_filter.performance: " f"items_checked: {len(raw_items)}, " - f"items_matched: {len(filtered_items_ref)}, " - f"refs_checked: {total_refs_checked}, " + f"items_matched: {len(filtered_items_plugin)}, " f"elapsed_ms: {elapsed * 1000}, " - f"ref_url_length: {len(ref_url)}, " ) - raw_items = filtered_items_ref + raw_items = filtered_items_plugin # Filter by keyword in-memory (case-insensitive substring match) if keyword: @@ -1126,7 +1060,15 @@ # Sort in-memory (required since ORDER BY conflicts with ARRAY_CONTAINS in Cosmos DB) reverse_sort = sort_direction == SortOrder.desc - raw_items.sort(key=lambda item: self._sort_key(item, sort_field), reverse=reverse_sort) + raw_items.sort( + key=lambda item: self._sort_key( + item, + sort_field, + plugin_sort=plugin_sort, + plugin_pack_registry=self._plugin_pack_registry, + ), + reverse=reverse_sort, + ) total = len(raw_items) total_pages = math.ceil(total / limit) if total > 0 else 0 diff --git a/backend/app/adapters/repos/memory_repo.py b/backend/app/adapters/repos/memory_repo.py index 84d278a..79972dc 100644 --- a/backend/app/adapters/repos/memory_repo.py +++ b/backend/app/adapters/repos/memory_repo.py @@ -5,6 +5,7 @@ from typing import Iterable from uuid import UUID +from app.domain.conversation_fields import answer_text_from_item, question_text_from_item from app.domain.enums import GroundTruthStatus, SortField, SortOrder from app.domain.models import ( AgenticGroundTruthEntry, @@ -15,6 +16,8 @@ PaginationMetadata, Stats, ) +from app.plugins.base import PluginPackRegistry +from app.plugins.pack_registry import get_default_pack_registry ZERO_UUID = UUID("00000000-0000-0000-0000-000000000000") @@ -25,12 +28,14 @@ def __init__( *, items: list[AgenticGroundTruthEntry] | None = None, curation_instructions: list[DatasetCurationInstructions] | None = None, + plugin_pack_registry: PluginPackRegistry | None = None, ) -> None: self.items: dict[str, AgenticGroundTruthEntry] = {} self._locations: dict[tuple[str, UUID, str], str] = {} self._assignment_docs: dict[tuple[str, str], AssignmentDocument] = {} self._curation: dict[str, DatasetCurationInstructions] = {} self._etag_version = 0 + self._plugin_pack_registry = plugin_pack_registry or get_default_pack_registry() for item in items or []: self._store_initial_item(item) @@ -45,7 +50,9 @@ def _next_etag(self) -> str: return f"memory-etag-{self._etag_version}" def _clone_item(self, item: AgenticGroundTruthEntry) -> AgenticGroundTruthEntry: - return AgenticGroundTruthEntry.model_validate(item.model_dump(by_alias=True)) + return AgenticGroundTruthEntry.model_validate( + item.model_dump(by_alias=True, 
exclude={"tags"}) + ) def _clone_instruction(self, doc: DatasetCurationInstructions) -> DatasetCurationInstructions: return DatasetCurationInstructions.model_validate(doc.model_dump(by_alias=True)) @@ -104,28 +111,30 @@ def _matches_location( ) def _collect_urls(self, item: AgenticGroundTruthEntry) -> Iterable[str]: - for ref in item.refs: - yield ref.url - for turn in item.history or []: - for ref in getattr(turn, "refs", None) or []: - yield ref.url + for doc in self._plugin_pack_registry.collect_search_documents(item): + url = doc.get("url") + if isinstance(url, str) and url: + yield url def _collect_text(self, item: AgenticGroundTruthEntry) -> str: parts = [ item.id, item.datasetName, - item.synth_question or "", - item.edited_question or "", - item.answer or "", + question_text_from_item(item), + answer_text_from_item(item), item.comment or "", ] for turn in item.history or []: parts.append(turn.msg) - for ref in item.refs: - parts.extend([ref.title or "", ref.url, ref.content or "", ref.keyExcerpt or ""]) - for turn in item.history or []: - for ref in getattr(turn, "refs", None) or []: - parts.extend([ref.title or "", ref.url, ref.content or "", ref.keyExcerpt or ""]) + for doc in self._plugin_pack_registry.collect_search_documents(item): + parts.extend( + [ + str(doc.get("id") or ""), + str(doc.get("title") or ""), + str(doc.get("url") or ""), + str(doc.get("chunk") or ""), + ] + ) return " ".join(parts).lower() def _is_unassigned_candidate(self, item: AgenticGroundTruthEntry) -> bool: @@ -138,24 +147,27 @@ def _sort_items( self, items: list[AgenticGroundTruthEntry], sort_by: SortField | None, + plugin_sort: str | None, sort_order: SortOrder | None, ) -> list[AgenticGroundTruthEntry]: field = sort_by or SortField.reviewed_at reverse = (sort_order or SortOrder.desc) == SortOrder.desc def key(item: AgenticGroundTruthEntry): + if plugin_sort: + plugin_value = self._plugin_pack_registry.plugin_sort_value(item, plugin_sort) + return ( + plugin_value if plugin_value is not None else -1, + item.updated_at or datetime.min.replace(tzinfo=timezone.utc), + item.id, + ) if field == SortField.updated_at: return item.updated_at or datetime.min.replace(tzinfo=timezone.utc) if field == SortField.id: return item.id if field == SortField.has_answer: return ( - 1 if (item.answer or "").strip() else 0, - item.updated_at or datetime.min.replace(tzinfo=timezone.utc), - ) - if field == SortField.totalReferences: - return ( - item.totalReferences, + 1 if answer_text_from_item(item) else 0, item.updated_at or datetime.min.replace(tzinfo=timezone.utc), ) if field == SortField.tag_count: @@ -194,7 +206,7 @@ async def list_gt_by_dataset( items = [item for item in items if item.status == status] return [ self._clone_item(item) - for item in self._sort_items(items, SortField.updated_at, SortOrder.desc) + for item in self._sort_items(items, SortField.updated_at, None, SortOrder.desc) ] async def list_all_gt( @@ -205,7 +217,7 @@ async def list_all_gt( items = [item for item in items if item.status == status] return [ self._clone_item(item) - for item in self._sort_items(items, SortField.updated_at, SortOrder.desc) + for item in self._sort_items(items, SortField.updated_at, None, SortOrder.desc) ] async def list_gt_paginated( @@ -215,9 +227,10 @@ async def list_gt_paginated( tags: list[str] | None = None, exclude_tags: list[str] | None = None, item_id: str | None = None, - ref_url: str | None = None, + plugin_filters: dict[str, str] | None = None, keyword: str | None = None, sort_by: SortField | None = None, + 
plugin_sort: str | None = None, sort_order: SortOrder | None = None, page: int = 1, limit: int = 25, @@ -235,15 +248,17 @@ async def list_gt_paginated( filtered = [item for item in filtered if not banned.intersection(set(item.tags))] if item_id: filtered = [item for item in filtered if item_id in item.id] - if ref_url: + if plugin_filters: filtered = [ - item for item in filtered if any(ref_url in url for url in self._collect_urls(item)) + item + for item in filtered + if self._plugin_pack_registry.matches_query_filters(item, plugin_filters) ] if keyword: lowered = keyword.lower() filtered = [item for item in filtered if lowered in self._collect_text(item)] - sorted_items = self._sort_items(filtered, sort_by, sort_order) + sorted_items = self._sort_items(filtered, sort_by, plugin_sort, sort_order) total = len(sorted_items) start = (page - 1) * limit end = start + limit @@ -325,7 +340,7 @@ async def list_unassigned(self, limit: int) -> list[AgenticGroundTruthEntry]: items = [item for item in self.items.values() if self._is_unassigned_candidate(item)] return [ self._clone_item(item) - for item in self._sort_items(items, SortField.updated_at, SortOrder.desc)[:limit] + for item in self._sort_items(items, SortField.updated_at, None, SortOrder.desc)[:limit] ] async def sample_unassigned( @@ -346,7 +361,7 @@ async def query_unassigned_by_dataset_prefix( ] return [ self._clone_item(item) - for item in self._sort_items(items, SortField.updated_at, SortOrder.desc)[:take] + for item in self._sort_items(items, SortField.updated_at, None, SortOrder.desc)[:take] ] async def query_unassigned_global( @@ -360,7 +375,7 @@ async def query_unassigned_global( ] return [ self._clone_item(item) - for item in self._sort_items(items, SortField.updated_at, SortOrder.desc)[:take] + for item in self._sort_items(items, SortField.updated_at, None, SortOrder.desc)[:take] ] async def assign_to(self, item_id: str, user_id: str) -> bool: @@ -399,7 +414,7 @@ async def list_assigned(self, user_id: str) -> list[AgenticGroundTruthEntry]: ] return [ self._clone_item(item) - for item in self._sort_items(items, SortField.updated_at, SortOrder.desc) + for item in self._sort_items(items, SortField.updated_at, None, SortOrder.desc) ] async def upsert_assignment_doc( diff --git a/backend/app/adapters/search/demo_search.py b/backend/app/adapters/search/demo_search.py index 046cfda..14030db 100644 --- a/backend/app/adapters/search/demo_search.py +++ b/backend/app/adapters/search/demo_search.py @@ -1,11 +1,18 @@ from __future__ import annotations from app.domain.models import AgenticGroundTruthEntry +from app.plugins.base import PluginPackRegistry +from app.plugins.pack_registry import get_default_pack_registry class DemoSearchAdapter: - def __init__(self, items: list[AgenticGroundTruthEntry]) -> None: + def __init__( + self, + items: list[AgenticGroundTruthEntry], + plugin_pack_registry: PluginPackRegistry | None = None, + ) -> None: self._items = items + self._plugin_pack_registry = plugin_pack_registry or get_default_pack_registry() async def query(self, q: str, top: int = 5) -> list[dict[str, object]]: query = q.strip().lower() @@ -15,30 +22,32 @@ async def query(self, q: str, top: int = 5) -> list[dict[str, object]]: matches: list[dict[str, object]] = [] seen_urls: set[str] = set() for item in self._items: - refs = list(item.refs) - for turn in item.history or []: - refs.extend(getattr(turn, "refs", None) or []) - for ref in refs: + for ref in self._plugin_pack_registry.collect_search_documents(item): + doc_id = ref.get("id") + url 
= ref.get("url") + if not isinstance(url, str) or not url: + continue haystack = " ".join( [ - ref.url, - ref.title or "", - ref.content or "", - ref.keyExcerpt or "", + str(doc_id or ""), + url, + str(ref.get("title") or ""), + str(ref.get("chunk") or ""), item.datasetName, item.id, ] ).lower() if query not in haystack: continue - if ref.url in seen_urls: + if url in seen_urls: continue - seen_urls.add(ref.url) + seen_urls.add(url) matches.append( { - "url": ref.url, - "title": ref.title, - "chunk": ref.content or ref.keyExcerpt or f"Reference for {item.id}", + "id": doc_id, + "url": url, + "title": ref.get("title"), + "chunk": ref.get("chunk") or f"Reference for {item.id}", } ) if len(matches) >= top: diff --git a/backend/app/api/v1/assignments.py b/backend/app/api/v1/assignments.py index f5382cd..05714b0 100644 --- a/backend/app/api/v1/assignments.py +++ b/backend/app/api/v1/assignments.py @@ -27,7 +27,6 @@ ETagRequiredError, apply_shared_update, persist_shared_update, - read_legacy_compat_update, ) from app.services.validation_service import ( ApprovalValidationError, @@ -137,7 +136,6 @@ async def update_item( original_assigned_to = it.assignedTo provided_fields: Set[str] = set(payload.model_fields_set) - payload_extras = payload.model_extra or {} try: mutation = apply_shared_update( it, @@ -157,7 +155,6 @@ async def update_item( status=payload.status, approve=bool(payload.approve), actor_user_id=user.user_id, - legacy_update=read_legacy_compat_update(payload_extras), clear_assignment_on_statuses={ GroundTruthStatus.approved, GroundTruthStatus.deleted, diff --git a/backend/app/api/v1/ground_truths.py b/backend/app/api/v1/ground_truths.py index 32f08cd..6ce4b9f 100644 --- a/backend/app/api/v1/ground_truths.py +++ b/backend/app/api/v1/ground_truths.py @@ -22,7 +22,6 @@ ExpectedTools, FeedbackEntry, GroundTruthListResponse, - HistoryItem, PluginPayload, ToolCallRecord, BulkImportError, @@ -39,7 +38,6 @@ ETagRequiredError, apply_shared_update, persist_shared_update, - read_legacy_compat_update, ) from app.services.validation_service import ( ApprovalValidationError, @@ -140,17 +138,6 @@ class GroundTruthUpdateRequest(BaseModel): etag: str | None = Field(default=None, alias="etag") -def _coerce_history_for_internal_use(item: AgenticGroundTruthEntry) -> None: - if not item.history: - return - item.history = [ - entry - if isinstance(entry, HistoryItem) - else HistoryItem.model_validate(entry.model_dump(by_alias=True)) - for entry in item.history - ] - - @router.post("", response_model=ImportBulkResponse) async def import_bulk( items: list[AgenticGroundTruthEntry], @@ -274,7 +261,6 @@ async def import_bulk( # Fetch registry once for performance (avoids O(n) singleton lookups) registry = get_default_registry() for it in gt_items: - _coerce_history_for_internal_use(it) apply_computed_tags(it, registry) result = await container.repo.import_bulk_gt(gt_items, buckets=buckets) @@ -442,16 +428,24 @@ async def list_all_ground_truths( alias="itemId", description="Search for items by ID (case-sensitive partial match)", ), - ref_url: str | None = Query( + plugin_filter: list[str] | None = Query( default=None, - alias="refUrl", - description="Search for items by reference URL (case-sensitive partial match)", + alias="pluginFilter", + description=( + "Plugin-namespaced filters in key=value form (repeat query param). 
" + "Example: pluginFilter=rag-compat:refUrl=https://example.com" + ), ), keyword: str | None = Query( default=None, description="Search for items by keyword (case-insensitive text search across questions, answers, and history)", ), sort_by: SortField = Query(default=SortField.reviewed_at.value, alias="sortBy"), + plugin_sort: str | None = Query( + default=None, + alias="pluginSort", + description="Plugin-namespaced sort key, e.g. rag-compat:totalReferences", + ), sort_order: SortOrder = Query(default=SortOrder.desc.value, alias="sortOrder"), page: int = Query(default=1), limit: int = Query(default=25), @@ -479,17 +473,51 @@ async def list_all_ground_truths( else: item_id_search = item_id - # Reference URL search validation - ref_url_search = None - if ref_url is not None: - ref_url = ref_url.strip() - if not ref_url: - # Empty after trim - treat as if parameter not provided - ref_url = None - elif len(ref_url) > 500: - raise HTTPException(status_code=400, detail="refUrl must be 500 characters or less") - else: - ref_url_search = ref_url + plugin_filters: dict[str, str] | None = None + if plugin_filter: + parsed: dict[str, str] = {} + for raw_filter in plugin_filter: + candidate = raw_filter.strip() + if not candidate: + continue + key, sep, value = candidate.partition("=") + if not sep: + raise HTTPException( + status_code=400, + detail="pluginFilter entries must use key=value format", + ) + key = key.strip() + value = value.strip() + if not key: + raise HTTPException( + status_code=400, + detail="pluginFilter entries must include a non-empty key", + ) + if ":" not in key: + raise HTTPException( + status_code=400, + detail="pluginFilter key must be namespaced (pack:key)", + ) + if not value: + continue + if len(value) > 500: + raise HTTPException( + status_code=400, + detail="pluginFilter value must be 500 characters or less", + ) + parsed[key] = value + plugin_filters = parsed or None + + plugin_sort_key = None + if plugin_sort is not None: + plugin_sort = plugin_sort.strip() + if plugin_sort: + if ":" not in plugin_sort: + raise HTTPException( + status_code=400, + detail="pluginSort must be namespaced (pack:key)", + ) + plugin_sort_key = plugin_sort # Keyword search validation keyword_search = None @@ -557,9 +585,10 @@ async def list_all_ground_truths( tags=tag_list, exclude_tags=exclude_tag_list, item_id=item_id_search, - ref_url=ref_url_search, + plugin_filters=plugin_filters, keyword=keyword_search, sort_by=sort_by, + plugin_sort=plugin_sort_key, sort_order=sort_order, page=page, limit=limit, @@ -639,7 +668,6 @@ async def update_ground_truth( manual_tags=payload.manual_tags, status=payload.status, actor_user_id=user.user_id, - legacy_update=read_legacy_compat_update(payload_extras), ) except ValidationError as e: raise HTTPException(status_code=400, detail=e.message) diff --git a/backend/app/container.py b/backend/app/container.py index 1054d48..e39af47 100644 --- a/backend/app/container.py +++ b/backend/app/container.py @@ -143,6 +143,14 @@ def _build_snapshot_service(self, repo: GroundTruthRepo) -> SnapshotService: plugin_export_transforms=self.plugin_pack_registry.collect_export_transforms(), ) + def _validate_plugin_packs_startup(self) -> None: + logger.info("Running plugin-pack startup validation...") + self.plugin_pack_registry.validate_all() + logger.info( + "Plugin-pack validation passed. Registered packs: %s", + self.plugin_pack_registry.names(), + ) + def init_cosmos_repo(self, db_name: str | None = None) -> None: """Create a Cosmos repo instance and wire services. 
@@ -180,6 +188,7 @@ def init_cosmos_repo(self, db_name: str | None = None) -> None: connection_verify=settings.COSMOS_CONNECTION_VERIFY, test_mode=settings.COSMOS_TEST_MODE, credential=credential, + plugin_pack_registry=self.plugin_pack_registry, ) logger.info( "Using CosmosGroundTruthRepo (endpoint=%s, db=%s, container=%s)", @@ -227,6 +236,7 @@ def init_memory_repo(self, *, enable_demo_data: bool = False) -> None: self.repo = InMemoryGroundTruthRepo( items=demo_items, curation_instructions=demo_instructions, + plugin_pack_registry=self.plugin_pack_registry, ) self.assignment_service = AssignmentService(self.repo) self.snapshot_service = self._build_snapshot_service(self.repo) @@ -235,13 +245,18 @@ def init_memory_repo(self, *, enable_demo_data: bool = False) -> None: self.tag_registry_service = TagRegistryService(self.tags_repo) self.tag_definitions_repo = cast(Any, None) self.search_service = ( - SearchService(DemoSearchAdapter(demo_items)) if enable_demo_data else SearchService() + SearchService( + DemoSearchAdapter(demo_items, plugin_pack_registry=self.plugin_pack_registry) + ) + if enable_demo_data + else SearchService() ) logger.info( "Using InMemoryGroundTruthRepo (demo_mode=%s, items=%s)", enable_demo_data, len(demo_items), ) + self._validate_plugin_packs_startup() async def startup_cosmos(self, db_name: str | None = None) -> None: """Initialize and validate Cosmos repos and services. @@ -283,12 +298,7 @@ async def startup_cosmos(self, db_name: str | None = None) -> None: # Step 4: Run plugin-pack startup validation so misconfigured packs # fail here with an actionable error rather than silently at runtime. - logger.info("Running plugin-pack startup validation...") - self.plugin_pack_registry.validate_all() - logger.info( - "Plugin-pack validation passed. 
Registered packs: %s", - self.plugin_pack_registry.names(), - ) + self._validate_plugin_packs_startup() def init_search(self) -> None: """Configure search adapter if Azure Search settings are present.""" diff --git a/backend/app/demo_seed.py b/backend/app/demo_seed.py index 258509a..4a1ffd3 100644 --- a/backend/app/demo_seed.py +++ b/backend/app/demo_seed.py @@ -11,8 +11,6 @@ AgenticGroundTruthEntry, DatasetCurationInstructions, ExpectedTools, - HistoryEntry, - HistoryItem, Reference, ToolExpectation, ) @@ -513,21 +511,14 @@ def _tool_call( ] -def _hydrate_history_with_refs(item: AgenticGroundTruthEntry, refs: list[Reference]) -> None: - if not item.history: - return +def _set_rag_compat_refs(item: AgenticGroundTruthEntry, refs: list[Reference]) -> None: + from app.plugins.pack_registry import get_default_pack_registry, get_required_pack - enriched_history: list[HistoryEntry] = [] - last_turn_index = len(item.history) - 1 - for index, turn in enumerate(item.history): - enriched_history.append( - HistoryItem( - role=turn.role, - msg=turn.msg, - refs=refs if index == last_turn_index and turn.role != "user" else None, - ) - ) - item.history = enriched_history + pack = get_required_pack("rag-compat", get_default_pack_registry()) + replace_references = getattr(pack, "replace_references", None) + if not callable(replace_references): + raise TypeError("Registered 'rag-compat' pack does not expose replace_references") + replace_references(item, refs) def _expected_tools(tool_names: list[str]) -> ExpectedTools: @@ -558,7 +549,9 @@ def _build_demo_item( created_by="demo-seed", ) adapted = adapter.adapt_payload({"trace_count": 1, "traces": [trace]})[0] - item = AgenticGroundTruthEntry.model_validate(adapted.model_dump(by_alias=True)) + item = AgenticGroundTruthEntry.model_validate( + adapted.model_dump(by_alias=True, exclude={"tags"}) + ) item.id = item_id item.scenario_id = scenario_id @@ -566,8 +559,7 @@ def _build_demo_item( item.manual_tags = sorted(set(item.manual_tags + manual_tags)) item.metadata = {**item.metadata, "source": "demo-seed"} item.trace_ids = {**(item.trace_ids or {}), "demoItemId": item_id} - item.refs = refs - _hydrate_history_with_refs(item, refs) + _set_rag_compat_refs(item, refs) item.expected_tools = _expected_tools(required_tools) if assigned: diff --git a/backend/app/domain/conversation_fields.py b/backend/app/domain/conversation_fields.py new file mode 100644 index 0000000..6bdf1c3 --- /dev/null +++ b/backend/app/domain/conversation_fields.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from app.domain.models import AgenticGroundTruthEntry + + +def _normalize_role(role: str) -> str: + return role.strip().lower() + + +def is_user_role(role: str) -> bool: + return _normalize_role(role) == "user" + + +def is_non_user_role(role: str) -> bool: + return not is_user_role(role) + + +def question_text_from_item(item: AgenticGroundTruthEntry) -> str: + for turn in reversed(item.history or []): + if is_user_role(turn.role) and turn.msg.strip(): + return turn.msg.strip() + return "" + + +def answer_text_from_item(item: AgenticGroundTruthEntry) -> str: + for turn in reversed(item.history or []): + if is_non_user_role(turn.role) and turn.msg.strip(): + return turn.msg.strip() + return "" diff --git a/backend/app/domain/enums.py b/backend/app/domain/enums.py index 4b8c9d0..a41ba1b 100644 --- a/backend/app/domain/enums.py +++ b/backend/app/domain/enums.py @@ -13,7 +13,6 @@ class SortField(str, Enum): updated_at = "updatedAt" id = "id" has_answer = "hasAnswer" - totalReferences 
= "totalReferences" tag_count = "tagCount" diff --git a/backend/app/domain/models.py b/backend/app/domain/models.py index edab26f..246da56 100644 --- a/backend/app/domain/models.py +++ b/backend/app/domain/models.py @@ -1,7 +1,7 @@ from __future__ import annotations from datetime import datetime, timezone -from typing import Any, ClassVar, Optional, Literal, cast +from typing import Any, ClassVar, Optional, Literal from uuid import UUID from pydantic import BaseModel, Field, ConfigDict, computed_field, field_validator, model_validator @@ -9,12 +9,6 @@ from app.domain.enums import GroundTruthStatus from app.domain.validators import GroundTruthItemTagValidators -LEGACY_HOST_FIELD_DELETE_GATES = ( - "stored-data audit completed", - "caller audit completed", - "import/export verification completed", -) - class Reference(BaseModel): """Legacy RAG reference object retained for compatibility helpers and tests.""" @@ -51,9 +45,8 @@ def validate_non_empty_text(cls, value: str) -> str: class HistoryItem(HistoryEntry): - """Legacy RAG-compatible history item retained for internal compatibility.""" + """Canonical history item used by generic core flows.""" - refs: Optional[list[Reference]] = None expected_behavior: Optional[list[str]] = Field(default=None, alias="expectedBehavior") model_config = ConfigDict(populate_by_name=True, extra="forbid") @@ -258,83 +251,12 @@ class AgenticGroundTruthEntry(GroundTruthItemTagValidators, BaseModel): _RAG_COMPAT_PLUGIN: ClassVar[str] = "rag-compat" - # --- Legacy compatibility layer --- - # The model_validator, computed_fields, and property accessors below exist because - # stored Cosmos DB documents may still carry top-level RAG fields (synthQuestion, - # editedQuestion, answer, refs, etc.). They transparently relocate those fields into - # plugins["rag-compat"] on read and re-expose them for internal code that still - # accesses .synth_question, .answer, .refs, .totalReferences. - # - # Hard-delete only after all LEGACY_HOST_FIELD_DELETE_GATES are satisfied. Until then, - # these accessors are migration projections, not long-term host ownership. 
- - @model_validator(mode="before") - @classmethod - def translate_legacy_payload_for_core_model(cls, value: object) -> object: - if cls is not AgenticGroundTruthEntry: - return value - from app.plugins.packs.rag_compat import normalize_legacy_payload_for_core_model - - return normalize_legacy_payload_for_core_model(value, plugin_name=cls._RAG_COMPAT_PLUGIN) - - @model_validator(mode="after") - def restore_history_annotations(self) -> "AgenticGroundTruthEntry": - history_annotations = self._rag_compat_data().get("historyAnnotations") - if not isinstance(history_annotations, list) or not self.history: - return self - - merged_history: list[HistoryEntry] = [] - changed = False - for index, entry in enumerate(self.history): - annotation = history_annotations[index] if index < len(history_annotations) else None - if not isinstance(annotation, dict) or not annotation: - merged_history.append(entry) - continue - - entry_payload = entry.model_dump(by_alias=True) - if "refs" in annotation: - entry_payload["refs"] = annotation["refs"] - changed = True - if "expectedBehavior" in annotation: - entry_payload["expectedBehavior"] = annotation["expectedBehavior"] - changed = True - merged_history.append(HistoryItem.model_validate(entry_payload)) - - if changed: - self.history = merged_history - return self - @computed_field @property def tags(self) -> list[str]: merged = set(self.manual_tags or []) | set(self.computed_tags or []) return sorted(merged) - @computed_field(alias="synthQuestion") - @property - def compat_synth_question(self) -> str | None: - return self.synth_question - - @computed_field(alias="editedQuestion") - @property - def compat_edited_question(self) -> str | None: - return self.edited_question - - @computed_field(alias="answer") - @property - def compat_answer(self) -> str | None: - return self.answer - - @computed_field(alias="refs") - @property - def compat_refs(self) -> list[Reference]: - return self.refs - - @computed_field(alias="totalReferences") - @property - def compat_total_references(self) -> int: - return self.totalReferences - def set_plugin(self, slot: str, data: dict[str, Any], *, version: str = "1.0") -> None: self.plugins[slot] = PluginPayload(kind=slot, version=version, data=data) @@ -345,149 +267,6 @@ def get_plugin_data(self, slot: str) -> dict[str, Any] | None: def export_json_schema(self) -> dict[str, Any]: return self.model_json_schema() - def _rag_compat_data(self) -> dict[str, Any]: - plugin = self.plugins.get(self._RAG_COMPAT_PLUGIN) - if plugin is None: - return {} - return plugin.data - - def _set_rag_compat_value(self, key: str, value: Any) -> None: - plugin = self.plugins.get(self._RAG_COMPAT_PLUGIN) - if plugin is None: - plugin = PluginPayload(kind=self._RAG_COMPAT_PLUGIN, version="1.0", data={}) - self.plugins[self._RAG_COMPAT_PLUGIN] = plugin - if value is None: - plugin.data.pop(key, None) - else: - plugin.data[key] = value - - def _find_history_message(self, role: str, *, reverse: bool = False) -> str | None: - history = self.history or [] - history_iterable = reversed(history) if reverse else history - for turn in history_iterable: - if turn.role == role and turn.msg: - return turn.msg - return None - - def _find_last_agent_message(self) -> str | None: - """Return the last non-user history message (any agent role).""" - for turn in reversed(self.history or []): - if turn.role != "user" and turn.msg: - return turn.msg - return None - - @property - def synth_question(self) -> str | None: - if "synth_question" in self.__dict__: - return cast(str | 
None, self.__dict__.get("synth_question")) - compat = self._rag_compat_data() - return cast(str | None, compat.get("synthQuestion")) or self._find_history_message("user") - - @synth_question.setter - def synth_question(self, value: str | None) -> None: - if "synth_question" in getattr(type(self), "model_fields", {}): - self.__dict__["synth_question"] = value - return - self._set_rag_compat_value("synthQuestion", value) - - @property - def edited_question(self) -> str | None: - if "edited_question" in self.__dict__: - return cast(str | None, self.__dict__.get("edited_question")) - compat = self._rag_compat_data() - return cast(str | None, compat.get("editedQuestion")) or self.synth_question - - @edited_question.setter - def edited_question(self, value: str | None) -> None: - if "edited_question" in getattr(type(self), "model_fields", {}): - self.__dict__["edited_question"] = value - return - self._set_rag_compat_value("editedQuestion", value) - - @property - def answer(self) -> str | None: - if "answer" in self.__dict__: - return cast(str | None, self.__dict__.get("answer")) - compat = self._rag_compat_data() - return cast(str | None, compat.get("answer")) or self._find_last_agent_message() - - @answer.setter - def answer(self, value: str | None) -> None: - if "answer" in getattr(type(self), "model_fields", {}): - self.__dict__["answer"] = value - return - self._set_rag_compat_value("answer", value) - - @property - def refs(self) -> list[Reference]: - direct_value = self.__dict__.get("refs") - if isinstance(direct_value, list): - return [ - ref if isinstance(ref, Reference) else Reference.model_validate(ref) - for ref in direct_value - ] - from app.plugins.packs.rag_compat import compat_refs_from_payload - - return cast( - list[Reference], - compat_refs_from_payload( - { - "plugins": self.plugins, - "toolCalls": self.tool_calls, - "history": self.history, - }, - plugin_name=self._RAG_COMPAT_PLUGIN, - ), - ) - - @refs.setter - def refs(self, value: list[Reference] | list[dict[str, Any]] | None) -> None: - if "refs" in getattr(type(self), "model_fields", {}): - self.__dict__["refs"] = list(value or []) - return - # Handle both Reference objects and dict representations - serialized = [] - for ref in value or []: - if isinstance(ref, Reference): - serialized.append(ref.model_dump(by_alias=True)) - elif isinstance(ref, dict): - # Validate and convert dict to ensure it's a valid reference - validated_ref = Reference.model_validate(ref) - serialized.append(validated_ref.model_dump(by_alias=True)) - else: - serialized.append(ref) - self._set_rag_compat_value("refs", serialized) - - @property - def totalReferences(self) -> int: - direct_value = self.__dict__.get("totalReferences") - if isinstance(direct_value, int): - return direct_value - from app.plugins.packs.rag_compat import compat_total_references_from_payload - - return compat_total_references_from_payload( - { - "plugins": self.plugins, - "toolCalls": self.tool_calls, - "history": self.history, - }, - plugin_name=self._RAG_COMPAT_PLUGIN, - ) - - @totalReferences.setter - def totalReferences(self, value: int | None) -> None: - if "totalReferences" in getattr(type(self), "model_fields", {}): - self.__dict__["totalReferences"] = 0 if value is None else int(value) - return - self._set_rag_compat_value("totalReferences", None if value is None else int(value)) - - # NOTE: Informational RAG-era accessors (contextUsedForGeneration, contextSource, - # modelUsedForGeneration, semanticClusterNumber, weight, samplingBucket, questionLength) - # removed 
in Phase 7 legacy retirement. No callers accessed them via - # AgenticGroundTruthEntry uses computed properties for legacy field access. - # Read paths extract these values from history and plugin data. Write paths - # normalize incoming payloads into canonical multi-turn structures. - class PaginationMetadata(BaseModel): model_config = ConfigDict(populate_by_name=True) diff --git a/backend/app/plugins/base.py b/backend/app/plugins/base.py index 008f6f4..884e160 100644 --- a/backend/app/plugins/base.py +++ b/backend/app/plugins/base.py @@ -101,7 +101,7 @@ def tag_key(self) -> str: return "length:long" def compute(self, doc: AgenticGroundTruthEntry) -> str | None: - content = doc.answer or "" + content = "\n".join(turn.msg for turn in (doc.history or [])) return self.tag_key if len(content) > 10000 else None Example (dynamic tag): @@ -130,8 +130,7 @@ def compute(self, doc: AgenticGroundTruthEntry) -> str | None: Args: doc: The AgenticGroundTruthEntry to evaluate. - Contains fields like 'answer', 'history', 'refs', etc. - Legacy fields like synthQuestion, editedQuestion are accessed via computed properties. + Contains canonical fields like 'history', 'plugins', 'tool_calls', etc. Returns: The tag string if applicable, None otherwise. @@ -326,7 +325,7 @@ def collect_approval_errors( self, item: AgenticGroundTruthEntry ) -> list[str]: errors: list[str] = [] - if not item.refs: + if not item.history: errors.append("strict-ref: at least one reference is required") return errors """ @@ -438,6 +437,45 @@ def get_export_transforms(self) -> list[ExportTransform]: """ return [] + def matches_query_filter( + self, item: AgenticGroundTruthEntry, filter_key: str, filter_value: str + ) -> bool | None: + """Evaluate a plugin-namespaced query filter for an item. + + Args: + item: Item being evaluated. + filter_key: Pack-local filter key (namespace removed by host). + filter_value: Filter value from the request. + + Returns: + True/False when this pack handles the key, or None when unsupported. + """ + return None + + def get_sort_value(self, item: AgenticGroundTruthEntry, sort_key: str) -> Any | None: + """Return a plugin-owned sort value for a namespaced sort key. + + Args: + item: Item being sorted. + sort_key: Pack-local sort key (namespace removed by host). + + Returns: + Sort value when handled, or None when unsupported. + """ + return None + + def get_search_documents(self, item: AgenticGroundTruthEntry) -> list[dict[str, Any]]: + """Return plugin-owned search candidate docs for a single item. + + Each candidate should include at least ``url`` when applicable and may + include ``id``, ``title``, and ``chunk``. + """ + return [] + + def get_primary_reference_url(self, item: AgenticGroundTruthEntry) -> str | None: + """Return a primary reference URL for diagnostics/error reporting.""" + return None + class PluginPackRegistry: """Registry for plugin packs with startup validation. 
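To make the four new PluginPack hooks concrete before the registry plumbing below, here is a schematic pack. The hook names and signatures are taken from the diff; the pack name, its payload layout, and any other members a concrete pack must provide are assumptions:

    from typing import Any

    from app.domain.models import AgenticGroundTruthEntry
    from app.plugins.base import PluginPack


    class ExampleRefsPack(PluginPack):  # other required members omitted for brevity
        """Hypothetical pack owning plugins["example-refs"].data["references"]."""

        def _refs(self, item: AgenticGroundTruthEntry) -> list[dict[str, Any]]:
            data = item.get_plugin_data("example-refs") or {}
            return [ref for ref in data.get("references") or [] if isinstance(ref, dict)]

        def matches_query_filter(
            self, item: AgenticGroundTruthEntry, filter_key: str, filter_value: str
        ) -> bool | None:
            # filter_key arrives with the "example-refs:" namespace already stripped.
            if filter_key == "refUrl":
                return any(filter_value in str(ref.get("url") or "") for ref in self._refs(item))
            return None  # unsupported key; the host then treats the filter as unmatched

        def get_sort_value(self, item: AgenticGroundTruthEntry, sort_key: str) -> Any | None:
            return len(self._refs(item)) if sort_key == "totalReferences" else None

        def get_search_documents(self, item: AgenticGroundTruthEntry) -> list[dict[str, Any]]:
            return [
                {"id": ref.get("id"), "url": ref["url"], "title": ref.get("title"),
                 "chunk": ref.get("content")}
                for ref in self._refs(item)
                if isinstance(ref.get("url"), str) and ref.get("url")
            ]

        def get_primary_reference_url(self, item: AgenticGroundTruthEntry) -> str | None:
            refs = self._refs(item)
            return str(refs[0]["url"]) if refs and refs[0].get("url") else None

A request filter of example-refs:refUrl=example.com reaches matches_query_filter as plain refUrl via the _split_namespaced_key dispatch defined in the registry methods that follow.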
@@ -625,6 +663,59 @@ def __len__(self) -> int: """Return the number of registered packs.""" return len(self._packs) + @staticmethod + def _split_namespaced_key(namespaced_key: str) -> tuple[str, str] | None: + pack_name, sep, pack_key = namespaced_key.partition(":") + if not sep or not pack_name.strip() or not pack_key.strip(): + return None + return pack_name.strip(), pack_key.strip() + + def matches_query_filters( + self, item: AgenticGroundTruthEntry, filters: Mapping[str, str] | None + ) -> bool: + """Return True when an item satisfies all plugin-namespaced filters.""" + if not filters: + return True + + for namespaced_key, value in filters.items(): + split = self._split_namespaced_key(namespaced_key) + if split is None: + return False + pack_name, pack_key = split + pack = self.get(pack_name) + if pack is None: + return False + result = pack.matches_query_filter(item, pack_key, value) + if result is None or result is False: + return False + return True + + def plugin_sort_value(self, item: AgenticGroundTruthEntry, namespaced_sort_key: str) -> Any: + """Resolve a plugin-namespaced sort key for an item.""" + split = self._split_namespaced_key(namespaced_sort_key) + if split is None: + return None + pack_name, pack_key = split + pack = self.get(pack_name) + if pack is None: + return None + return pack.get_sort_value(item, pack_key) + + def collect_search_documents(self, item: AgenticGroundTruthEntry) -> list[dict[str, Any]]: + """Collect plugin-owned search candidate documents for an item.""" + docs: list[dict[str, Any]] = [] + for pack in self._packs.values(): + docs.extend(pack.get_search_documents(item)) + return docs + + def primary_reference_url(self, item: AgenticGroundTruthEntry) -> str | None: + """Return the first available plugin-owned primary reference URL.""" + for pack in self._packs.values(): + candidate = pack.get_primary_reference_url(item) + if candidate: + return candidate + return None + # --------------------------------------------------------------------------- # Trace adapter plugin system diff --git a/backend/app/plugins/computed_tags/no_answer.py b/backend/app/plugins/computed_tags/no_answer.py index 5fdd3ea..a470af1 100644 --- a/backend/app/plugins/computed_tags/no_answer.py +++ b/backend/app/plugins/computed_tags/no_answer.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING +from app.domain.conversation_fields import answer_text_from_item from app.plugins.base import ComputedTagPlugin if TYPE_CHECKING: @@ -30,6 +31,7 @@ def tag_key(self) -> str: return "answer:no_answer" def compute(self, doc: AgenticGroundTruthEntry) -> str | None: - if doc.answer and doc.answer.strip().casefold() == "no_answer": + answer_text = answer_text_from_item(doc) + if answer_text and answer_text.casefold() == "no_answer": return self.tag_key return None diff --git a/backend/app/plugins/computed_tags/question_length.py b/backend/app/plugins/computed_tags/question_length.py index 7b09359..5788696 100644 --- a/backend/app/plugins/computed_tags/question_length.py +++ b/backend/app/plugins/computed_tags/question_length.py @@ -13,6 +13,7 @@ from typing import TYPE_CHECKING +from app.domain.conversation_fields import question_text_from_item from app.plugins.base import ComputedTagPlugin if TYPE_CHECKING: @@ -28,8 +29,8 @@ def _get_question_word_count(doc: AgenticGroundTruthEntry) -> int: """Get the word count for the document's question. - Uses the computed property accessor which returns editedQuestion if available, - otherwise synthQuestion. 
Uses .split() to count words as specified in requirements. + Uses canonical question derivation from history. + Uses .split() to count words as specified in requirements. Args: doc: The AgenticGroundTruthEntry to evaluate. @@ -37,7 +38,7 @@ def _get_question_word_count(doc: AgenticGroundTruthEntry) -> int: Returns: The number of words in the question. """ - question = doc.edited_question or doc.synth_question or "" + question = question_text_from_item(doc) return len(question.split()) diff --git a/backend/app/plugins/computed_tags/reference_type.py b/backend/app/plugins/computed_tags/reference_type.py index c22cdf6..aee0ea8 100644 --- a/backend/app/plugins/computed_tags/reference_type.py +++ b/backend/app/plugins/computed_tags/reference_type.py @@ -16,6 +16,7 @@ from typing import TYPE_CHECKING from app.plugins.base import ComputedTagPlugin +from app.plugins.pack_registry import get_default_pack_registry if TYPE_CHECKING: from app.domain.models import AgenticGroundTruthEntry, Reference @@ -61,17 +62,15 @@ def _get_all_references(doc: AgenticGroundTruthEntry) -> list[Reference]: Returns: A list of all Reference objects from the document. """ - from app.domain.models import HistoryItem - - refs: list[Reference] = list(doc.refs or []) - - # Also gather refs from history turns - # HistoryItem (subclass of HistoryEntry) has refs field - if doc.history: - for turn in doc.history: - if isinstance(turn, HistoryItem) and turn.refs: - refs.extend(turn.refs) - + from app.domain.models import Reference + + docs = get_default_pack_registry().collect_search_documents(doc) + refs: list[Reference] = [] + for candidate in docs: + url = candidate.get("url") + if not isinstance(url, str) or not url: + continue + refs.append(Reference(url=url)) return refs diff --git a/backend/app/plugins/computed_tags/retrieval_behavior.py b/backend/app/plugins/computed_tags/retrieval_behavior.py index e2a62e1..83f3baa 100644 --- a/backend/app/plugins/computed_tags/retrieval_behavior.py +++ b/backend/app/plugins/computed_tags/retrieval_behavior.py @@ -13,6 +13,7 @@ from typing import TYPE_CHECKING from app.plugins.base import ComputedTagPlugin +from app.plugins.pack_registry import get_default_pack_registry if TYPE_CHECKING: from app.domain.models import AgenticGroundTruthEntry @@ -21,8 +22,7 @@ def _get_total_reference_count(doc: AgenticGroundTruthEntry) -> int: """Get the total count of references from a document. - Uses the totalReferences computed field which counts refs at item level - and across all history turns. + Uses canonical reference derivation from history/plugin payloads. Args: doc: The AgenticGroundTruthEntry to evaluate. @@ -30,7 +30,8 @@ def _get_total_reference_count(doc: AgenticGroundTruthEntry) -> int: Returns: The total number of references. """ - return doc.totalReferences + count = get_default_pack_registry().plugin_sort_value(doc, "rag-compat:totalReferences") + return int(count) if isinstance(count, int) else 0 class RetrievalBehaviorNoRefsPlugin(ComputedTagPlugin): diff --git a/backend/app/plugins/packs/rag_compat.py b/backend/app/plugins/packs/rag_compat.py index fd712bd..039d4f5 100644 --- a/backend/app/plugins/packs/rag_compat.py +++ b/backend/app/plugins/packs/rag_compat.py @@ -1,20 +1,10 @@ """RAG compatibility pack. -This pack owns retrieval-specific behavior on the generic agentic host: -- Validates its own plugin-kind constant at startup so mismatches are detected - before any data is processed. 
-- Projects per-item RAG state from ``plugins["rag-compat"].data`` via the - compat-accessor helpers already present on AgenticGroundTruthEntry. -- Provides the canonical ``rag_compat_data``, ``refs_from_item``, - ``attach_reference``, and ``detach_reference`` helpers so reference - manipulation stays in one owned location rather than being inlined across - multiple services. -- Contributes approval validation hooks that enforce RAG-specific invariants on - top of the generic core checks. - -Retrieval search remains available through the standard ``/v1/search`` endpoint -(backed by SearchService), which handles the generic query path independently. -Reference selection and attachment are owned by this pack. +This pack owns the remaining RAG-specific compatibility surface on the generic +agentic host. The only plugin-owned payload retained here is normalized +``references`` data. Legacy RAG fields are translated into generic history or +flattened into references during import, and new writes only persist +``plugins[\"rag-compat\"].data.references``. """ from __future__ import annotations @@ -22,45 +12,26 @@ import logging from typing import TYPE_CHECKING, Any -from app.plugins.base import ExplorerFieldDefinition, ExportTransform, PluginPack +from app.plugins.base import ExplorerFieldDefinition, ExportTransform, ImportTransform, PluginPack if TYPE_CHECKING: from app.domain.models import AgenticGroundTruthEntry, Reference logger = logging.getLogger(__name__) -# The plugin-kind key stored inside AgenticGroundTruthEntry.plugins. -# This MUST match AgenticGroundTruthEntry._RAG_COMPAT_PLUGIN. -# validate_registration() enforces this at startup. _RAG_COMPAT_KIND: str = "rag-compat" - -_LEGACY_PLUGIN_FIELDS: tuple[str, ...] = ( +_PLUGIN_REFERENCES_KEY = "references" +_LEGACY_REFS_KEY = "refs" +_LEGACY_KEYS_TO_DROP = ( + _LEGACY_REFS_KEY, + "retrievals", + "historyAnnotations", + "totalReferences", "synthQuestion", "editedQuestion", "answer", - "refs", - "contextUsedForGeneration", - "contextSource", - "modelUsedForGeneration", - "semanticClusterNumber", - "weight", - "samplingBucket", - "questionLength", - "totalReferences", ) -_LEGACY_PLUGIN_FIELD_ALIASES: dict[str, str] = { - "synth_question": "synthQuestion", - "edited_question": "editedQuestion", - "context_used_for_generation": "contextUsedForGeneration", - "context_source": "contextSource", - "model_used_for_generation": "modelUsedForGeneration", - "semantic_cluster_number": "semanticClusterNumber", - "sampling_bucket": "samplingBucket", - "question_length": "questionLength", - "total_references": "totalReferences", -} - def _coerce_reference_list(raw_refs: Any) -> list[Any]: if not isinstance(raw_refs, list): @@ -73,22 +44,66 @@ def _coerce_reference_list(raw_refs: Any) -> list[Any]: ] -def _history_message(history: Any, role: str, *, reverse: bool = False) -> str | None: +def _extract_history_refs(history: Any) -> list[Any]: if not isinstance(history, list): - return None - iterator = reversed(history) if reverse else history - for turn in iterator: - if hasattr(turn, "role") and hasattr(turn, "msg"): - current_role = str(getattr(turn, "role", "")).strip().lower() - current_msg = str(getattr(turn, "msg", "")).strip() - elif isinstance(turn, dict): - current_role = str(turn.get("role", "")).strip().lower() - current_msg = str(turn.get("msg") or turn.get("content") or "").strip() - else: + return [] + + refs: list[Any] = [] + for turn in history: + if hasattr(turn, "refs"): + refs.extend(_coerce_reference_list(getattr(turn, "refs", None))) + 
continue + if isinstance(turn, dict): + refs.extend(_coerce_reference_list(turn.get(_LEGACY_REFS_KEY))) + return refs + + +def _extract_retrieval_refs(payload: dict[str, Any], compat: dict[str, Any]) -> list[Any]: + retrievals = compat.get("retrievals") + if not isinstance(retrievals, dict): + return [] + + from app.domain.models import Reference + + tool_calls = payload.get("toolCalls") or payload.get("tool_calls") or [] + step_by_tool_call_id: dict[str, int | None] = {} + if isinstance(tool_calls, list): + for tool_call in tool_calls: + if hasattr(tool_call, "id"): + tool_call_id = getattr(tool_call, "id", "") + step_number = getattr(tool_call, "step_number", None) + elif isinstance(tool_call, dict): + tool_call_id = str(tool_call.get("id") or "") + step_number = tool_call.get("stepNumber", tool_call.get("step_number")) + else: + continue + if tool_call_id: + step_by_tool_call_id[tool_call_id] = ( + step_number if isinstance(step_number, int) else None + ) + + flattened: list[Reference] = [] + for tool_call_id, bucket in retrievals.items(): + if not isinstance(bucket, dict): + continue + candidates = bucket.get("candidates") + if not isinstance(candidates, list): continue - if current_role == role and current_msg: - return current_msg - return None + for candidate in candidates: + if not isinstance(candidate, dict): + continue + candidate_tool_call_id = candidate.get("toolCallId") or tool_call_id or None + flattened.append( + Reference( + url=str(candidate.get("url") or ""), + title=candidate.get("title"), + content=candidate.get("chunk"), + messageIndex=step_by_tool_call_id.get(str(candidate_tool_call_id)) + if candidate_tool_call_id + else None, + ) + ) + return flattened def rag_compat_data_from_payload( @@ -107,86 +122,74 @@ def rag_compat_data_from_payload( return {} -def normalize_legacy_payload_for_core_model( - value: object, *, plugin_name: str = _RAG_COMPAT_KIND -) -> object: - if not isinstance(value, dict): - return value +def compat_refs_from_payload( + payload: dict[str, Any], *, plugin_name: str = _RAG_COMPAT_KIND +) -> list[Any]: + compat = rag_compat_data_from_payload(payload, plugin_name=plugin_name) + if _PLUGIN_REFERENCES_KEY in compat: + return _coerce_reference_list(compat.get(_PLUGIN_REFERENCES_KEY)) + + compat_refs = _coerce_reference_list(compat.get(_LEGACY_REFS_KEY)) + if compat_refs: + return compat_refs + + retrieval_refs = _extract_retrieval_refs(payload, compat) + if retrieval_refs: + return retrieval_refs + + return _extract_history_refs(payload.get("history")) + + +def normalize_legacy_payload_for_core_model( + value: dict[str, Any], *, plugin_name: str = _RAG_COMPAT_KIND +) -> dict[str, Any]: data = dict(value) data.pop("tags", None) - legacy_payload: dict[str, Any] = {} - for alias, canonical in _LEGACY_PLUGIN_FIELD_ALIASES.items(): - if alias not in data: - continue - alias_value = data.pop(alias) - if canonical not in data: - data[canonical] = alias_value - - for field_name in _LEGACY_PLUGIN_FIELDS: - if field_name in data: - legacy_payload[field_name] = data.pop(field_name) - - if "refs" in legacy_payload: - legacy_payload["refs"] = _coerce_reference_list(legacy_payload["refs"]) - - history_value = data.get("history") - if isinstance(history_value, list): - normalized_history: list[dict[str, Any]] = [] - history_annotations: list[dict[str, Any]] = [] - saw_history_annotations = False - for raw_entry in history_value: + raw_history = data.get("history") + normalized_history: list[dict[str, Any]] | None = None + history_refs: list[Any] = [] + if 
isinstance(raw_history, list): + normalized_history = [] + for raw_entry in raw_history: if hasattr(raw_entry, "model_dump"): entry_dict = raw_entry.model_dump(by_alias=True, exclude_none=True) elif isinstance(raw_entry, dict): entry_dict = dict(raw_entry) else: - normalized_history.append(raw_entry) - history_annotations.append({}) continue - annotation: dict[str, Any] = {} - if "refs" in entry_dict: - annotation["refs"] = _coerce_reference_list(entry_dict.pop("refs")) - saw_history_annotations = True - expected_behavior = entry_dict.pop( - "expectedBehavior", entry_dict.pop("expected_behavior", None) - ) - if expected_behavior is not None: - annotation["expectedBehavior"] = expected_behavior - saw_history_annotations = True - + history_refs.extend(_coerce_reference_list(entry_dict.pop(_LEGACY_REFS_KEY, None))) message = entry_dict.get("msg") - if message is None and "content" in entry_dict: - message = entry_dict.pop("content") + if message is None and isinstance(entry_dict.get("content"), str): + message = entry_dict.get("content") normalized_history.append( { - "role": entry_dict.get("role", ""), - "msg": message or "", + "role": str(entry_dict.get("role") or ""), + "msg": str(message or ""), } ) - history_annotations.append(annotation) - data["history"] = normalized_history - if saw_history_annotations: - legacy_payload["historyAnnotations"] = history_annotations - elif history_value is None and ( - legacy_payload.get("editedQuestion") - or legacy_payload.get("synthQuestion") - or legacy_payload.get("answer") + + synth_question = data.pop("synthQuestion", None) + edited_question = data.pop("editedQuestion", None) + answer = data.pop("answer", None) + if normalized_history is None and any( + isinstance(v, str) and v.strip() for v in (edited_question, synth_question, answer) ): generated_history: list[dict[str, Any]] = [] - question_text = legacy_payload.get("editedQuestion") or legacy_payload.get("synthQuestion") - if question_text: - generated_history.append({"role": "user", "msg": question_text}) - if legacy_payload.get("answer"): - generated_history.append({"role": "assistant", "msg": legacy_payload["answer"]}) + question_text = ( + edited_question + if isinstance(edited_question, str) and edited_question.strip() + else synth_question + ) + if isinstance(question_text, str) and question_text.strip(): + generated_history.append({"role": "user", "msg": question_text.strip()}) + if isinstance(answer, str) and answer.strip(): + generated_history.append({"role": "assistant", "msg": answer.strip()}) data["history"] = generated_history - if not legacy_payload: - return data - plugins_payload = dict(data.get("plugins") or {}) existing_plugin = plugins_payload.get(plugin_name) if hasattr(existing_plugin, "model_dump"): @@ -195,149 +198,73 @@ def normalize_legacy_payload_for_core_model( plugin_dict = dict(existing_plugin) else: plugin_dict = {"kind": plugin_name, "version": "1.0", "data": {}} + plugin_data_raw = plugin_dict.get("data") plugin_data = dict(plugin_data_raw) if isinstance(plugin_data_raw, dict) else {} - plugin_data.update(legacy_payload) - plugin_dict["kind"] = plugin_dict.get("kind") or plugin_name - plugin_dict["version"] = plugin_dict.get("version") or "1.0" - plugin_dict["data"] = plugin_data - plugins_payload[plugin_name] = plugin_dict - data["plugins"] = plugins_payload - return data - - -def compat_refs_from_payload( - payload: dict[str, Any], *, plugin_name: str = _RAG_COMPAT_KIND -) -> list[Any]: - compat = rag_compat_data_from_payload(payload, plugin_name=plugin_name) 
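
The history-synthesis branch above turns a flat legacy record into a two-turn conversation when no history list is present, preferring `editedQuestion` over `synthQuestion` and skipping blank strings. A simplified sketch of just that rule (the surrounding function also normalizes existing history entries and plugin payloads):

```python
def synthesize_history(payload: dict) -> list[dict]:
    """Build a user/assistant turn pair from legacy flat question/answer fields."""
    edited = payload.get("editedQuestion")
    synth = payload.get("synthQuestion")
    answer = payload.get("answer")
    # editedQuestion wins only when it is a non-blank string
    question = edited if isinstance(edited, str) and edited.strip() else synth
    history: list[dict] = []
    if isinstance(question, str) and question.strip():
        history.append({"role": "user", "msg": question.strip()})
    if isinstance(answer, str) and answer.strip():
        history.append({"role": "assistant", "msg": answer.strip()})
    return history

legacy = {"synthQuestion": "What is HPK?", "answer": "Hierarchical partition keys."}
assert synthesize_history(legacy) == [
    {"role": "user", "msg": "What is HPK?"},
    {"role": "assistant", "msg": "Hierarchical partition keys."},
]
```
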
- refs = _coerce_reference_list(compat.get("refs")) - if refs: - return refs - - retrievals = compat.get("retrievals") - if not isinstance(retrievals, dict): - return [] - - from app.domain.models import Reference - - tool_calls = payload.get("toolCalls") or payload.get("tool_calls") or [] - step_by_tool_call_id: dict[str, int | None] = {} - if isinstance(tool_calls, list): - for tool_call in tool_calls: - if hasattr(tool_call, "id"): - tool_call_id = getattr(tool_call, "id", "") - step_number = getattr(tool_call, "step_number", None) - elif isinstance(tool_call, dict): - tool_call_id = str(tool_call.get("id") or "") - step_number = tool_call.get("stepNumber", tool_call.get("step_number")) - else: - continue - if tool_call_id: - step_by_tool_call_id[tool_call_id] = ( - step_number if isinstance(step_number, int) else None - ) - - flattened: list[Reference] = [] - for tool_call_id, bucket in retrievals.items(): - if not isinstance(bucket, dict): - continue - candidates = bucket.get("candidates") - if not isinstance(candidates, list): - continue - for candidate in candidates: - if not isinstance(candidate, dict): - continue - candidate_tool_call_id = candidate.get("toolCallId") or ( - tool_call_id if tool_call_id != RagCompatPack._UNASSOCIATED_KEY else None - ) - flattened.append( - Reference( - url=str(candidate.get("url") or ""), - title=candidate.get("title"), - content=candidate.get("chunk"), - messageIndex=step_by_tool_call_id.get(str(candidate_tool_call_id)) - if candidate_tool_call_id - else None, - ) + has_canonical_references = _PLUGIN_REFERENCES_KEY in plugin_data + if has_canonical_references: + references = _coerce_reference_list(plugin_data.get(_PLUGIN_REFERENCES_KEY)) + else: + references = _coerce_reference_list(plugin_data.get(_LEGACY_REFS_KEY)) + if not references: + top_level_refs = data.pop(_LEGACY_REFS_KEY, None) + references = _coerce_reference_list(top_level_refs) + if not references: + references = _extract_retrieval_refs( + {"plugins": plugins_payload, "toolCalls": data.get("toolCalls")}, plugin_data ) - return flattened + if not references: + references = history_refs + for legacy_key in _LEGACY_KEYS_TO_DROP: + plugin_data.pop(legacy_key, None) -def compat_total_references_from_payload( - payload: dict[str, Any], *, plugin_name: str = _RAG_COMPAT_KIND -) -> int: - compat = rag_compat_data_from_payload(payload, plugin_name=plugin_name) - explicit_total = compat.get("totalReferences") - if isinstance(explicit_total, int): - return explicit_total + if has_canonical_references: + plugin_data[_PLUGIN_REFERENCES_KEY] = [ + ref.model_dump(by_alias=True, exclude_none=True) if hasattr(ref, "model_dump") else ref + for ref in references + ] + elif references: + plugin_data[_PLUGIN_REFERENCES_KEY] = [ + ref.model_dump(by_alias=True, exclude_none=True) if hasattr(ref, "model_dump") else ref + for ref in references + ] + + if plugin_data: + plugin_dict["kind"] = plugin_dict.get("kind") or plugin_name + plugin_dict["version"] = plugin_dict.get("version") or "1.0" + plugin_dict["data"] = plugin_data + plugins_payload[plugin_name] = plugin_dict + data["plugins"] = plugins_payload + elif plugin_name in plugins_payload: + plugins_payload.pop(plugin_name, None) + data["plugins"] = plugins_payload - history_count = 0 - history_annotations = compat.get("historyAnnotations") - if isinstance(history_annotations, list): - for annotation in history_annotations: - if isinstance(annotation, dict) and isinstance(annotation.get("refs"), list): - history_count += len(annotation["refs"]) - if 
history_count: - return history_count - return len(compat_refs_from_payload(payload, plugin_name=plugin_name)) + return data def apply_export_projection( doc: dict[str, Any], *, plugin_name: str = _RAG_COMPAT_KIND ) -> dict[str, Any]: projected = dict(doc) - compat = rag_compat_data_from_payload(projected, plugin_name=plugin_name) - if not compat: - return projected - refs = compat_refs_from_payload(projected, plugin_name=plugin_name) - projected["refs"] = [ref.model_dump(by_alias=True, exclude_none=True) for ref in refs] - projected["totalReferences"] = len(refs) - - if projected.get("synthQuestion") is None: - projected["synthQuestion"] = compat.get("synthQuestion") or _history_message( - projected.get("history"), "user" - ) - if projected.get("editedQuestion") is None: - projected["editedQuestion"] = compat.get("editedQuestion") or projected.get("synthQuestion") - if projected.get("answer") is None: - projected["answer"] = compat.get("answer") or _history_message( - projected.get("history"), "assistant", reverse=True - ) - + if refs: + projected[_PLUGIN_REFERENCES_KEY] = [ + ref.model_dump(by_alias=True, exclude_none=True) for ref in refs + ] + projected["totalReferences"] = len(refs) + else: + projected.pop(_PLUGIN_REFERENCES_KEY, None) + projected["totalReferences"] = 0 return projected class RagCompatPack(PluginPack): - """RAG compatibility pack. - - Owns retrieval-specific behavior behind the generic plugin-pack contract. - Registered at startup via PluginPackRegistry so misconfiguration raises - a clear startup error instead of silently producing wrong data. - - Design notes: - - The ``rag-compat`` plugin payload is written by - AgenticGroundTruthEntry.translate_legacy_payload_for_core_model during - ingest of legacy RAG-shaped documents. - - Core approval checks (history, tool-call consistency) run before pack - hooks. The pack adds RAG-specific approval gates that cannot be expressed - generically. - - The pack does NOT add new top-level fields to the host model; all RAG - state is accessed via plugins["rag-compat"].data. - - Reference attachment and detachment are owned by this pack; the generic - SearchService only owns the query path. - """ - @property def name(self) -> str: return _RAG_COMPAT_KIND def validate_registration(self) -> None: - """Validate that the rag-compat kind constant matches the host model. - - Fails startup if someone renames the plugin key in - AgenticGroundTruthEntry without updating this pack (or vice-versa). - """ from app.domain.models import AgenticGroundTruthEntry expected = AgenticGroundTruthEntry._RAG_COMPAT_PLUGIN @@ -350,35 +277,16 @@ def validate_registration(self) -> None: logger.debug("rag_compat_pack.validate_registration.ok | kind=%s", _RAG_COMPAT_KIND) def collect_approval_errors(self, item: AgenticGroundTruthEntry) -> list[str]: - """Return RAG-specific approval errors for an item. - - Items that have no RAG compat data receive no additional errors. - """ - compat = self.rag_compat_data(item) - if not compat: - return [] - # RAG items: future validation hooks go here. - # e.g. per-retrieval-call selection completeness could be enforced once - # FR-029/FR-030 retrieval tool-call per-call state is implemented. return [] def collect_approval_waivers( self, item: AgenticGroundTruthEntry, core_errors: list[str] ) -> list[str]: - """Waive core errors that do not apply to RAG retrieval-only items. 
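
The reworked `apply_export_projection` is purely derived: exports never trust a stored `totalReferences`, they recompute it from the projected reference list. A behaviorally equivalent sketch with plain dicts, where the `refs` parameter stands in for the output of `compat_refs_from_payload`:

```python
def project_references(doc: dict, refs: list[dict]) -> dict:
    """Overwrite 'references'/'totalReferences' from the derived ref list."""
    projected = dict(doc)
    if refs:
        projected["references"] = refs
        projected["totalReferences"] = len(refs)
    else:
        projected.pop("references", None)
        projected["totalReferences"] = 0
    return projected

doc = {"id": "gt-1", "references": [{"url": "stale"}], "totalReferences": 7}
empty = project_references(doc, [])
assert empty["totalReferences"] == 0 and "references" not in empty
assert project_references(doc, [{"url": "a"}])["totalReferences"] == 1
```

Recomputing on export means a stale persisted count (the thing the deleted backfill script below existed to repair) can no longer leak into downstream consumers.
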
- - When an item has ``totalReferences > 0`` (indicating it is a - retrieval-based item), the following core checks are waived: - - "history must include at least one assistant message" — retrieval-only - items may not produce an assistant reply. - - "expectedTools.required must include at least one tool…" — retrieval - items may use reference attachment instead of classified tool calls. - """ if self.reference_count(item) == 0: return [] waivers: list[str] = [] - assistant_error = "history must include at least one assistant message" + assistant_error = "history must include at least one agent message" if assistant_error in core_errors: waivers.append(assistant_error) @@ -391,17 +299,10 @@ def collect_approval_waivers( return waivers - # ------------------------------------------------------------------ - # Accessor helpers — owned by this pack so callers don't embed the - # plugin-kind string literal elsewhere. - # ------------------------------------------------------------------ - def rag_compat_data(self, item: AgenticGroundTruthEntry) -> dict[str, Any]: - """Return the raw rag-compat plugin data dict for an item, or {}.""" return item.get_plugin_data(_RAG_COMPAT_KIND) or {} def refs_from_item(self, item: AgenticGroundTruthEntry) -> list[Any]: - """Return the references list projected from the rag-compat payload.""" return compat_refs_from_payload( { "plugins": item.plugins, @@ -412,41 +313,32 @@ def refs_from_item(self, item: AgenticGroundTruthEntry) -> list[Any]: def reference_count(self, item: AgenticGroundTruthEntry) -> int: refs = self.refs_from_item(item) + compat = self.rag_compat_data(item) + if _PLUGIN_REFERENCES_KEY in compat: + return len(refs) if refs: return len(refs) - - compat = self.rag_compat_data(item) explicit_total = compat.get("totalReferences") return explicit_total if isinstance(explicit_total, int) and explicit_total > 0 else 0 def replace_references( self, item: AgenticGroundTruthEntry, refs: list[Reference] ) -> AgenticGroundTruthEntry: - serialized = [ref.model_dump(by_alias=True, exclude_none=True) for ref in refs] - item._set_rag_compat_value("refs", serialized) - item._set_rag_compat_value("retrievals", None) - # Clear cached totalReferences so it will be recomputed from refs/historyAnnotations - if "totalReferences" in item.__dict__: - del item.__dict__["totalReferences"] - item._set_rag_compat_value("totalReferences", None) # Remove from plugin storage too + compat = dict(self.rag_compat_data(item)) + for legacy_key in _LEGACY_KEYS_TO_DROP: + compat.pop(legacy_key, None) + if refs: + compat[_PLUGIN_REFERENCES_KEY] = [ + ref.model_dump(by_alias=True, exclude_none=True) for ref in refs + ] + else: + compat.pop(_PLUGIN_REFERENCES_KEY, None) + item.set_plugin(_RAG_COMPAT_KIND, compat) return item def attach_reference( self, item: AgenticGroundTruthEntry, ref: Reference ) -> AgenticGroundTruthEntry: - """Attach a reference to an item via the rag-compat plugin payload. - - This is a RAG-compat concern; the generic core does not manage refs. - The ``refs`` setter on AgenticGroundTruthEntry writes to - ``plugins["rag-compat"].data`` automatically. - - Args: - item: The ground-truth item to modify in-place. - ref: The reference to attach. - - Returns: - The same item (mutated in-place) for convenience. 
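
`replace_references` above is now the single write path for references, and every write drops the legacy keys before persisting only `references`. A dict-level round-trip sketch; the real code routes the cleaned payload through `item.set_plugin(...)` rather than returning it:

```python
LEGACY_KEYS = ("refs", "retrievals", "historyAnnotations", "totalReferences",
               "synthQuestion", "editedQuestion", "answer")

def replace_references(compat: dict, refs: list[dict]) -> dict:
    """Return plugin data with legacy keys dropped and refs as the only payload."""
    cleaned = {k: v for k, v in compat.items() if k not in LEGACY_KEYS}
    if refs:
        cleaned["references"] = refs
    else:
        cleaned.pop("references", None)
    return cleaned

compat = {"refs": [{"url": "https://example.com/a"}], "totalReferences": 1}
updated = replace_references(compat, [{"url": "https://example.com/b"}])
assert updated == {"references": [{"url": "https://example.com/b"}]}
# detaching the last reference clears the key entirely
assert replace_references(updated, []) == {}
```

Because attach and detach are both expressed as "read, modify, replace", the legacy keys cannot survive any mutation, which is what makes the migration converge.
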
- """ current = list(self.refs_from_item(item)) current.append(ref) return self.replace_references(item, current) @@ -454,107 +346,9 @@ def attach_reference( def detach_reference( self, item: AgenticGroundTruthEntry, ref_url: str ) -> AgenticGroundTruthEntry: - """Detach a reference from an item by URL, using the rag-compat payload. - - This is a RAG-compat concern; the generic core does not manage refs. - - Args: - item: The ground-truth item to modify in-place. - ref_url: The URL of the reference to remove. - - Returns: - The same item (mutated in-place) for convenience. - """ remaining = [r for r in self.refs_from_item(item) if getattr(r, "url", None) != ref_url] return self.replace_references(item, remaining) - # ------------------------------------------------------------------ - # Per-tool-call retrieval state (Phase 6 — retrieval normalization) - # - # New items store references per retrieval tool call inside - # ``plugins["rag-compat"].data.retrievals``. - # Read path: per-call state first, then fall back to top-level refs. - # Write path: always to per-call state. - # ------------------------------------------------------------------ - - _UNASSOCIATED_KEY: str = "_unassociated" - - def get_retrievals(self, item: AgenticGroundTruthEntry) -> dict[str, Any]: - """Return the full retrievals dict or {} when absent.""" - compat = self.rag_compat_data(item) - retrievals = compat.get("retrievals") - return dict(retrievals) if isinstance(retrievals, dict) else {} - - def get_retrieval_candidates( - self, item: AgenticGroundTruthEntry, tool_call_id: str - ) -> list[dict[str, Any]]: - """Return candidate list for one tool call, or [].""" - retrievals = self.get_retrievals(item) - bucket = retrievals.get(tool_call_id) - if isinstance(bucket, dict): - cands = bucket.get("candidates") - return list(cands) if isinstance(cands, list) else [] - return [] - - def set_retrieval_candidates( - self, - item: AgenticGroundTruthEntry, - tool_call_id: str, - candidates: list[dict[str, Any]], - ) -> None: - """Set candidates for a single tool call (write-through to plugin data).""" - compat = self.rag_compat_data(item) - retrievals = dict(compat.get("retrievals") or {}) - retrievals[tool_call_id] = {"candidates": candidates} - item._set_rag_compat_value("retrievals", retrievals) - - def set_retrievals( - self, - item: AgenticGroundTruthEntry, - retrievals: dict[str, Any], - ) -> None: - """Replace the entire retrievals dict.""" - item._set_rag_compat_value("retrievals", retrievals) - - def has_per_call_state(self, item: AgenticGroundTruthEntry) -> bool: - """Return True when per-call retrieval state exists.""" - compat = self.rag_compat_data(item) - retrievals = compat.get("retrievals") - return isinstance(retrievals, dict) and len(retrievals) > 0 - - def get_all_candidates_flat(self, item: AgenticGroundTruthEntry) -> list[dict[str, Any]]: - """Flatten all per-call candidates into a single list. - - Read path: returns per-call candidates when present. Falls back - to converting top-level refs into candidate dicts for backward compat. 
- """ - if self.has_per_call_state(item): - result: list[dict[str, Any]] = [] - for tool_call_id, bucket in self.get_retrievals(item).items(): - if not isinstance(bucket, dict): - continue - cands = bucket.get("candidates") - if isinstance(cands, list): - for c in cands: - entry = dict(c) if isinstance(c, dict) else {} - if "toolCallId" not in entry: - entry["toolCallId"] = tool_call_id - result.append(entry) - return result - - # Backward compat: convert top-level refs to candidate shape - refs = item.refs - return [ - { - "url": getattr(r, "url", ""), - "title": getattr(r, "title", None), - "chunk": getattr(r, "content", None), - "relevance": None, - "toolCallId": None, - } - for r in refs - ] - def get_explorer_fields(self) -> list[ExplorerFieldDefinition]: return [ ExplorerFieldDefinition( @@ -563,65 +357,56 @@ def get_explorer_fields(self) -> list[ExplorerFieldDefinition]: field_type="number", sortable=True, filterable=True, - ), - ExplorerFieldDefinition( - key="rag-compat:perCallRetrievals", - label="Per-Call Retrievals", - field_type="boolean", - filterable=True, - ), + ) + ] + + def get_import_transforms(self) -> list[ImportTransform]: + return [ + ImportTransform( + name="rag-compat:normalize-legacy-payload", + description="Normalize legacy RAG fields into generic history and rag-compat references", + transform=normalize_legacy_payload_for_core_model, + ) ] def get_export_transforms(self) -> list[ExportTransform]: return [ ExportTransform( - name="rag-compat:project-legacy-export-fields", - description="Project rag-compat retrieval/reference fields into export payloads", + name="rag-compat:project-references", + description="Project rag-compat references into export payloads", transform=apply_export_projection, ) ] - def migrate_refs_to_per_call(self, item: AgenticGroundTruthEntry) -> bool: - """Migrate top-level refs into per-call state (idempotent). - - Associates refs with retrieval tool calls by matching - ``messageIndex`` to tool-call step ordering when possible. - Refs that cannot be matched go into the ``_unassociated`` bucket. + def matches_query_filter( + self, item: AgenticGroundTruthEntry, filter_key: str, filter_value: str + ) -> bool | None: + if filter_key != "refUrl": + return None + refs = self.refs_from_item(item) + return any(filter_value in (getattr(ref, "url", "") or "") for ref in refs) - Returns True if migration produced changes. 
- """ - if self.has_per_call_state(item): - return False + def get_sort_value(self, item: AgenticGroundTruthEntry, sort_key: str) -> Any | None: + if sort_key != "totalReferences": + return None + return self.reference_count(item) - refs = item.refs - if not refs: - return False - - # Build a map from step/messageIndex to tool call id - tool_calls = item.tool_calls or [] - step_to_tc: dict[int | None, str] = {} - for tc in tool_calls: - if tc.step_number is not None: - step_to_tc[tc.step_number] = tc.id - - retrievals: dict[str, dict[str, list[dict[str, Any]]]] = {} - for ref in refs: - mi = getattr(ref, "messageIndex", None) - tc_id = step_to_tc.get(mi) if mi is not None else None - key = tc_id or self._UNASSOCIATED_KEY - - if key not in retrievals: - retrievals[key] = {"candidates": []} - retrievals[key]["candidates"].append( + def get_search_documents(self, item: AgenticGroundTruthEntry) -> list[dict[str, Any]]: + docs: list[dict[str, Any]] = [] + for idx, ref in enumerate(self.refs_from_item(item)): + docs.append( { - "url": getattr(ref, "url", ""), + "id": f"{item.id}:ref:{idx}", + "url": getattr(ref, "url", None), "title": getattr(ref, "title", None), - "chunk": getattr(ref, "content", None), - "relevance": None, - "rawPayload": None, - "toolCallId": key if key != self._UNASSOCIATED_KEY else None, + "chunk": getattr(ref, "content", None) or getattr(ref, "keyExcerpt", None), } ) + return docs - self.set_retrievals(item, retrievals) - return True + def get_primary_reference_url(self, item: AgenticGroundTruthEntry) -> str | None: + refs = self.refs_from_item(item) + if not refs: + return None + first_url = getattr(refs[0], "url", None) + return first_url if isinstance(first_url, str) and first_url else None diff --git a/backend/app/services/assignment_service.py b/backend/app/services/assignment_service.py index 28d2022..7d83b52 100644 --- a/backend/app/services/assignment_service.py +++ b/backend/app/services/assignment_service.py @@ -2,7 +2,7 @@ import re from app.adapters.repos.base import GroundTruthRepo -from app.domain.models import AgenticGroundTruthEntry, AssignmentDocument, HistoryItem +from app.domain.models import AgenticGroundTruthEntry, AssignmentDocument from app.plugins import get_default_registry from app.core.errors import AssignmentConflictError from app.core.config import get_sampling_allocation @@ -606,7 +606,7 @@ async def duplicate_item( Rules: - Keep datasetName and bucket identical to the original - Generate a new id (uuid4 string) - - Copy synthQuestion, editedQuestion, answer, refs, tags, comment, history and provenance fields + - Copy tags, comment, history, plugin references, and provenance fields - Ensure the `rephrase:{original.id}` tag is present exactly once - Set status=draft; clear reviewed_at and updatedBy - Assign to requesting user (assignedTo, assignedAt) @@ -619,13 +619,9 @@ async def duplicate_item( new_tags.append(rephrase_tag) now = datetime.now(timezone.utc) - new_item = AgenticGroundTruthEntry.model_validate(original.model_dump(by_alias=True)) - new_item.history = [ - entry - if isinstance(entry, HistoryItem) - else HistoryItem.model_validate(entry.model_dump(by_alias=True)) - for entry in (new_item.history or []) - ] + new_item = AgenticGroundTruthEntry.model_validate( + original.model_dump(by_alias=True, exclude_computed_fields=True) + ) new_item.id = randomname.get_name() new_item.status = GroundTruthStatus.draft new_item.manual_tags = new_tags diff --git a/backend/app/services/duplicate_detection_service.py 
b/backend/app/services/duplicate_detection_service.py index 439533a..81a4b29 100644 --- a/backend/app/services/duplicate_detection_service.py +++ b/backend/app/services/duplicate_detection_service.py @@ -5,8 +5,8 @@ Detection strategy: - Normalize whitespace and casing for comparison -- Compare editedQuestion or synthQuestion (whichever is present) -- Compare answer content +- Compare canonical question text +- Compare canonical answer text - Only check against approved items (drafts can have temporary duplicates) """ @@ -18,6 +18,7 @@ from pydantic import BaseModel, Field, ConfigDict +from app.domain.conversation_fields import answer_text_from_item, question_text_from_item from app.domain.models import AgenticGroundTruthEntry from app.domain.enums import GroundTruthStatus @@ -53,8 +54,8 @@ def _normalize_text(text: str | None) -> str: def _get_question_text(item: AgenticGroundTruthEntry) -> str: - """Get the effective question text (edited or synth).""" - return item.edited_question or item.synth_question or "" + """Get the effective question text from conversation history.""" + return question_text_from_item(item) def _serialize_generic_value(value: object) -> str: @@ -132,8 +133,8 @@ def _items_are_duplicates( # Check for exact question match when both items expose question text if draft_question and approved_question and draft_question == approved_question: # Also check answer for stronger signal - draft_answer = _normalize_text(draft.answer) - approved_answer = _normalize_text(approved.answer) + draft_answer = _normalize_text(answer_text_from_item(draft)) + approved_answer = _normalize_text(answer_text_from_item(approved)) if draft_answer and approved_answer and draft_answer == approved_answer: return (True, "exact question and answer match") diff --git a/backend/app/services/ground_truth_update_service.py b/backend/app/services/ground_truth_update_service.py index 8a2ca69..420184e 100644 --- a/backend/app/services/ground_truth_update_service.py +++ b/backend/app/services/ground_truth_update_service.py @@ -13,17 +13,12 @@ HistoryEntry, HistoryItem, PluginPayload, - Reference, ToolCallRecord, ) -from app.plugins.pack_registry import get_rag_compat_pack from app.services.tagging_service import apply_computed_tags from app.services.validation_service import ValidationError, validate_item_for_approval -MISSING = object() - - class ETagRequiredError(Exception): """Raised when an update request omits optimistic-concurrency state.""" @@ -32,47 +27,11 @@ class ETagMismatchError(Exception): """Raised when the provided ETag no longer matches persisted state.""" -@dataclass(slots=True) -class LegacyCompatUpdate: - edited_question: str | None | object = MISSING - answer: str | None | object = MISSING - refs: list[Reference] | object = MISSING - - @dataclass(slots=True) class UpdateMutationResult: should_delete_assignment: bool = False -def read_legacy_compat_update(extras: dict[str, Any]) -> LegacyCompatUpdate: - update = LegacyCompatUpdate() - - if "editedQuestion" in extras or "edited_question" in extras: - update.edited_question = cast( - str | None, extras.get("editedQuestion", extras.get("edited_question")) - ) - - if "answer" in extras: - answer_value = extras["answer"] - if answer_value is not None and not isinstance(answer_value, str): - raise ValidationError("", "answer", "answer must be a string or null") - update.answer = cast(str | None, answer_value) - - if "refs" in extras: - refs_payload = extras["refs"] - if refs_payload is None: - update.refs = [] - elif isinstance(refs_payload, 
list): - update.refs = [ - ref if isinstance(ref, Reference) else Reference.model_validate(ref) - for ref in refs_payload - ] - else: - raise ValidationError("", "refs", "refs must be a list or null") - - return update - - def _parse_status(value: GroundTruthStatus | str | None) -> GroundTruthStatus: if value is None: raise ValidationError( @@ -100,16 +59,6 @@ def parse_history_entries(entries: Sequence[Any]) -> list[HistoryItem]: if not message: raise ValidationError("", "history", "history entries must include a non-empty msg") - refs_data = extras.get("refs") - refs_list = None - if refs_data is not None: - if not isinstance(refs_data, list): - raise ValidationError("", "history", "history refs must be a list") - refs_list = [ - ref if isinstance(ref, Reference) else Reference.model_validate(ref) - for ref in refs_data - ] - expected_behavior = extras.get("expectedBehavior", extras.get("expected_behavior")) if expected_behavior is not None and not isinstance(expected_behavior, list): raise ValidationError( @@ -122,7 +71,6 @@ def parse_history_entries(entries: Sequence[Any]) -> list[HistoryItem]: HistoryItem( role=getattr(entry, "role"), msg=message, - refs=refs_list, expected_behavior=expected_behavior, ) ) @@ -148,7 +96,6 @@ def apply_shared_update( status: GroundTruthStatus | str | None = None, approve: bool = False, actor_user_id: str, - legacy_update: LegacyCompatUpdate | None = None, clear_assignment_on_statuses: set[GroundTruthStatus] | None = None, ) -> UpdateMutationResult: now = datetime.now(timezone.utc) @@ -161,11 +108,9 @@ def apply_shared_update( if "history" in provided_fields: if history_entries is None: item.history = [] - item.totalReferences = 0 else: # HistoryItem is a subclass of HistoryEntry, so this is safe item.history = cast(list[HistoryEntry], parse_history_entries(history_entries)) - item.totalReferences = 0 if "context_entries" in provided_fields: item.context_entries = context_entries or [] @@ -194,17 +139,6 @@ def apply_shared_update( if "manual_tags" in provided_fields: item.manual_tags = manual_tags or [] - if legacy_update is not None: - if legacy_update.edited_question is not MISSING: - item.edited_question = cast(str | None, legacy_update.edited_question) - if legacy_update.answer is not MISSING: - item.answer = cast(str | None, legacy_update.answer) - if legacy_update.refs is not MISSING: - rag_compat_pack = get_rag_compat_pack() - rag_compat_pack.replace_references( - item, list(cast(list[Reference], legacy_update.refs)) - ) - if approve: item.status = GroundTruthStatus.approved item.reviewed_at = now diff --git a/backend/app/services/pii_service.py b/backend/app/services/pii_service.py index 7a0555c..cdebc7d 100644 --- a/backend/app/services/pii_service.py +++ b/backend/app/services/pii_service.py @@ -15,6 +15,7 @@ from pydantic import BaseModel, Field +from app.domain.conversation_fields import answer_text_from_item, question_text_from_item from app.domain.models import AgenticGroundTruthEntry @@ -23,7 +24,7 @@ class PIIWarning(BaseModel): item_id: str = Field(description="Item identifier") field: str = Field( - description="Field name where PII was detected (e.g., 'synthQuestion', 'history[2].msg')" + description="Field name where the PII was detected (e.g., 'history.question', 'history[2].msg')" ) pattern_type: str = Field(description="Type of PII detected ('email' or 'phone')") snippet: str = Field(description="Masked context snippet showing the detected PII") @@ -163,9 +164,8 @@ def scan_item_for_pii(item: AgenticGroundTruthEntry) -> 
list[PIIWarning]: """Scan a ground truth item for PII in all relevant fields. Phase 1 scans: - - synth_question - - edited_question - - answer + - canonical question text derived from history + - canonical answer text derived from history - comment - history[].msg @@ -195,15 +195,14 @@ def scan_nested_value(value: Any, field_name: str) -> None: for idx, nested in enumerate(value): scan_nested_value(nested, f"{field_name}[{idx}]") - # Scan primary text fields - if item.synth_question: - warnings.extend(scan_text_for_pii(item.synth_question, "synthQuestion", item_id)) + # Scan canonical conversation-derived text fields + question_text = question_text_from_item(item) + if question_text: + warnings.extend(scan_text_for_pii(question_text, "history.question", item_id)) - if item.edited_question: - warnings.extend(scan_text_for_pii(item.edited_question, "editedQuestion", item_id)) - - if item.answer: - warnings.extend(scan_text_for_pii(item.answer, "answer", item_id)) + answer_text = answer_text_from_item(item) + if answer_text: + warnings.extend(scan_text_for_pii(answer_text, "history.answer", item_id)) if item.comment: warnings.extend(scan_text_for_pii(item.comment, "comment", item_id)) diff --git a/backend/app/services/validation_service.py b/backend/app/services/validation_service.py index 4cc05f7..5d6b133 100644 --- a/backend/app/services/validation_service.py +++ b/backend/app/services/validation_service.py @@ -5,6 +5,12 @@ import asyncio import logging +from app.domain.conversation_fields import ( + answer_text_from_item, + is_non_user_role, + is_user_role, + question_text_from_item, +) from app.domain.models import AgenticGroundTruthEntry, BulkImportError, HistoryEntry from app.services.tagging_service import validate_tags_with_cache @@ -56,20 +62,20 @@ def __init__(self, errors: list[str]): def _normalized_history(item: AgenticGroundTruthEntry) -> list[HistoryEntry]: history = list(item.history or []) - question_text = item.edited_question or item.synth_question + question_text = question_text_from_item(item) + answer_text = answer_text_from_item(item) if history: - roles = {entry.role.strip().lower() for entry in history} - if "user" not in roles and question_text: + if not any(is_user_role(entry.role) for entry in history) and question_text: history.insert(0, HistoryEntry(role="user", msg=question_text)) - if "assistant" not in roles and item.answer: - history.append(HistoryEntry(role="assistant", msg=item.answer)) + if not any(is_non_user_role(entry.role) for entry in history) and answer_text: + history.append(HistoryEntry(role="agent", msg=answer_text)) return history synthesized: list[HistoryEntry] = [] if question_text: synthesized.append(HistoryEntry(role="user", msg=question_text)) - if item.answer: - synthesized.append(HistoryEntry(role="assistant", msg=item.answer)) + if answer_text: + synthesized.append(HistoryEntry(role="agent", msg=answer_text)) return synthesized @@ -87,14 +93,12 @@ def collect_approval_validation_errors(item: AgenticGroundTruthEntry) -> list[st if not history: errors.append("history must contain at least one conversation message") else: - user_messages = [entry for entry in history if entry.role.strip().lower() == "user"] - assistant_messages = [ - entry for entry in history if entry.role.strip().lower() == "assistant" - ] + user_messages = [entry for entry in history if is_user_role(entry.role)] + assistant_messages = [entry for entry in history if is_non_user_role(entry.role)] if not user_messages: errors.append("history must include at least one user 
message") if not assistant_messages: - errors.append("history must include at least one assistant message") + errors.append("history must include at least one agent message") tool_call_names = {tool.name for tool in item.tool_calls if tool.name} required_tools = [tool.name for tool in item.expected_tools.required if tool.name] diff --git a/backend/scripts/README.md b/backend/scripts/README.md index d622e8b..3eed914 100644 --- a/backend/scripts/README.md +++ b/backend/scripts/README.md @@ -1,56 +1,3 @@ # Scripts This folder contains helper scripts used during development and data ops. - -## KB CSV import workflow - -Use these two scripts to prepare and import a KB CSV into the Ground Truth Curator API. - -1) Clean the CSV (drops Japanese descriptions and rows with non-empty "Added question?"): - -```bash -uv run python scripts/clean_kb_csv.py \ - --input 'scripts/AI_Generated_Questions_500_v1_0820-1545_dataset(Sheet1).csv' \ - --output /tmp/kb_cleaned.csv -``` - -2) Import the cleaned CSV (prefix CS to `article`, build KB article URLs, POST in batches): - -```bash -uv run python scripts/import_kb_csv.py \ - --input /tmp/kb_cleaned.csv \ - --base-url http://localhost:8000 \ - --api-prefix /v1 \ - --dataset kb \ - --kb-base-url https://example.com \ - --approve \ - --batch-size 200 -``` - -Authentication: -- Bearer token header: - -```bash -uv run python scripts/import_kb_csv.py --input /tmp/kb_cleaned.csv \ - --base-url http://localhost:8000 --api-prefix /v1 --dataset kb --kb-base-url https://example.com --approve \ - --bearer-token '' -``` - -- Dev convenience header (used when AUTH_MODE=dev): - -```bash -uv run python scripts/import_kb_csv.py --input /tmp/kb_cleaned.csv \ - --base-url http://localhost:8000 --api-prefix /v1 --dataset kb --kb-base-url https://example.com --approve \ - --user-id importer -``` - -Dry-run (no POSTs; preview first 3 payloads): - -```bash -uv run python scripts/import_kb_csv.py --input /tmp/kb_cleaned.csv --dry-run -``` - -Notes: -- Cleaning detects Japanese using a Unicode-range heuristic (Hiragana/Katakana/Kanji) in `description` and removes such rows; also removes rows where "Added question?" is non-empty. -- Import normalizes the `article` field to start with `CS` and constructs references like `https://example.com/support/article/CS32540` using `--kb-base-url`. It uses `generated_question` (fallback: `description`) as the synthetic question and posts to `/v1/ground-truths` in batches. -- API errors are printed per batch, plus a final deduplicated summary. diff --git a/backend/scripts/backfill_total_references.py b/backend/scripts/backfill_total_references.py deleted file mode 100644 index 2108f10..0000000 --- a/backend/scripts/backfill_total_references.py +++ /dev/null @@ -1,365 +0,0 @@ -#!/usr/bin/env python3 -""" -Backfill script to update existing Cosmos DB documents with totalReferences field. - -This script: -1. Queries for documents missing the totalReferences field -2. Calculates totalReferences for each document -3. Updates documents in batches to avoid memory issues -4. Provides progress reporting and error handling -5. 
Can be run safely multiple times (idempotent) - -USAGE (Local Development): - python scripts/backfill_total_references.py [--batch-size 100] [--dry-run] - -USAGE (Azure Container App): - # Connect to the running container app instance - az containerapp exec --name --resource-group --command "/bin/bash" - - # Inside the container, run: - cd /app - python scripts/backfill_total_references.py --batch-size 50 - - # For dry-run validation first: - python scripts/backfill_total_references.py --dry-run - - # Monitor logs: - az containerapp logs show --name --resource-group --follow - -AZURE CONTAINER APP CONSIDERATIONS: - - Use smaller batch sizes (50-100) to avoid timeouts - - Monitor memory usage during execution - - Ensure the container has sufficient CPU/memory allocation - - Set appropriate environment variables for Cosmos DB connection - - Consider running during off-peak hours to minimize impact -""" - -import argparse -import asyncio -import logging -import sys -from datetime import datetime, timezone -from pathlib import Path -from typing import Any, Dict -from azure.cosmos.exceptions import CosmosHttpResponseError - -# Add the backend directory to Python path so we can import app modules -backend_dir = Path(__file__).parent.parent -sys.path.insert(0, str(backend_dir)) - -from app.container import container -from app.adapters.repos.cosmos_repo import CosmosGroundTruthRepo - - -logger = logging.getLogger(__name__) - - -def compute_total_references_from_doc(doc: Dict[str, Any]) -> int: - """Calculate total reference count from raw document data. - - Args: - doc: Raw document from Cosmos DB - - Returns: - Total reference count - """ - # Count refs in all history turns - history = doc.get("history", []) or [] - history_refs = 0 - - for turn in history: - if isinstance(turn, dict): - refs = turn.get("refs", []) or [] - history_refs += len(refs) - - # If no turn refs, return item-level refs count - if history_refs == 0: - refs = doc.get("refs", []) or [] - return len(refs) - - return history_refs - - -async def get_documents_missing_total_references(batch_size: int = 100) -> list[Dict[str, Any]]: - """Query for documents that don't have totalReferences field. - - Args: - batch_size: Maximum number of documents to return - - Returns: - List of documents missing totalReferences field - """ - # Initialize the Cosmos repository if not already done - if container.repo is None: - container.init_cosmos_repo() - - repo = container.repo - if isinstance(repo, CosmosGroundTruthRepo): - await repo._ensure_initialized() - - # Query for documents without totalReferences field - query = """ - SELECT * FROM c - WHERE c.docType = 'ground-truth-item' - AND NOT IS_DEFINED(c.totalReferences) - """ - - container_client = repo._gt_container - if not container_client: - raise ValueError("Cosmos container not initialized") - else: - raise ValueError("This script only works with CosmosGroundTruthRepo") - query_iterator = container_client.query_items( - query=query, enable_scan_in_query=True, max_item_count=batch_size - ) - - documents = [] - try: - async for item in query_iterator: - documents.append(item) - if len(documents) >= batch_size: - break - except Exception as e: - logger.error(f"Error querying documents: {e}") - raise - - return documents - - -async def update_document_with_total_references( - doc: Dict[str, Any], dry_run: bool = False, max_retries: int = 3 -) -> bool: - """Update a single document with totalReferences field. 
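
With references now canonical in plugin data, the count this script used to backfill is derived at read time (see `reference_count` and the `rag-compat:totalReferences` sort hook above), which is why the script can be deleted outright rather than rewritten. The derivation, reduced to a sketch:

```python
def total_references(plugin_data: dict) -> int:
    """Derive the count from the canonical references list; no stored field."""
    references = plugin_data.get("references")
    return len(references) if isinstance(references, list) else 0

assert total_references({"references": [{"url": "a"}, {"url": "b"}]}) == 2
assert total_references({}) == 0
```
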
- - Args: - doc: Document to update - dry_run: If True, don't actually update the document - - Returns: - True if update was successful, False otherwise - """ - for attempt in range(max_retries): - try: - # Calculate totalReferences - total_refs = compute_total_references_from_doc(doc) - - if dry_run: - logger.info( - f"DRY RUN: Would update document {doc.get('id')} with totalReferences={total_refs}" - ) - return True - - # Add totalReferences to document - doc["totalReferences"] = total_refs - doc["updatedAt"] = datetime.now(timezone.utc).isoformat() - - # Update in Cosmos DB - repo = container.repo - if isinstance(repo, CosmosGroundTruthRepo): - container_client = repo._gt_container - if not container_client: - raise ValueError("Cosmos container not initialized") - else: - raise ValueError("This script only works with CosmosGroundTruthRepo") - - # Use replace_item to update the document - await container_client.replace_item(item=doc["id"], body=doc) - - logger.info(f"Updated document {doc.get('id')} with totalReferences={total_refs}") - return True - - except CosmosHttpResponseError as e: - if e.status_code == 429: # Rate limited - wait_time = 2**attempt - logger.warning(f"Rate limited, retrying in {wait_time}s") - await asyncio.sleep(wait_time) - continue - elif e.status_code == 412: # Precondition failed (etag mismatch) - logger.warning(f"Document {doc['id']} was updated by another process") - return False - else: - raise - except Exception as e: - if attempt == max_retries - 1: - logger.error(f"Final attempt failed for {doc['id']}: {e}") - return False - logger.warning(f"Attempt {attempt + 1} failed, retrying: {e}") - - return False - - -async def update_documents_batch( - documents: list[Dict[str, Any]], batch_size: int = 10, max_ru_per_second: int = 400 -) -> Dict[str, int]: - """Optimized batch processing with rate limiting.""" - - stats = {"processed": 0, "updated": 0, "errors": 0, "skipped": 0} - - # Process in smaller batches to control RU consumption - for i in range(0, len(documents), batch_size): - batch = documents[i : i + batch_size] - - # Execute batch operations concurrently with semaphore - semaphore = asyncio.Semaphore(5) # Limit concurrent operations - - async def process_document(doc): - async with semaphore: - return await update_document_with_total_references(doc) - - # Process batch concurrently - batch_tasks = [process_document(doc) for doc in batch] - results = await asyncio.gather(*batch_tasks, return_exceptions=True) - - # Update statistics - for result in results: - if isinstance(result, Exception): - stats["errors"] += 1 - elif result: - stats["updated"] += 1 - stats["processed"] += 1 - - # Rate limiting: pause between batches - await asyncio.sleep(0.1) # 100ms pause - - # Log progress - logger.info(f"Processed batch {i // batch_size + 1}, Progress: {stats}") - - return stats - - -async def backfill_total_references_batch( - batch_size: int = 100, dry_run: bool = False -) -> Dict[str, int]: - """Process a batch of documents and update them with totalReferences. 
- - Args: - batch_size: Number of documents to process in this batch - dry_run: If True, don't actually update documents - - Returns: - Dictionary with processing statistics - """ - stats = {"processed": 0, "updated": 0, "errors": 0, "skipped": 0} - - try: - # Get documents missing totalReferences - documents = await get_documents_missing_total_references(batch_size) - - if not documents: - logger.info("No documents found missing totalReferences field") - return stats - - logger.info(f"Found {len(documents)} documents to update") - - batch_stats = await update_documents_batch(documents, batch_size=10) - - # Update the main stats with batch results - for key in ["processed", "updated", "errors", "skipped"]: - stats[key] = batch_stats[key] - - return stats - - except Exception as e: - logger.error(f"Error in batch processing: {e}") - stats["errors"] += 1 - return stats - - -async def run_full_migration( - batch_size: int = 100, max_batches: int | None = None, dry_run: bool = False -) -> None: - """Run the complete migration process. - - Args: - batch_size: Number of documents to process per batch - max_batches: Maximum number of batches to process (None = unlimited) - dry_run: If True, don't actually update documents - """ - logger.info("Starting totalReferences backfill migration") - logger.info(f"Batch size: {batch_size}, Max batches: {max_batches}, Dry run: {dry_run}") - - total_stats = {"processed": 0, "updated": 0, "errors": 0, "skipped": 0, "batches": 0} - - batch_count = 0 - - while True: - batch_count += 1 - - if max_batches and batch_count > max_batches: - logger.info(f"Reached maximum batch limit of {max_batches}") - break - - logger.info(f"Processing batch {batch_count}...") - - # Process batch - batch_stats = await backfill_total_references_batch(batch_size, dry_run) - - # Update totals - for key in ["processed", "updated", "errors", "skipped"]: - total_stats[key] += batch_stats[key] - total_stats["batches"] = batch_count - - # Log batch results - logger.info(f"Batch {batch_count} complete: {batch_stats}") - - # If no documents were processed, we're done - if batch_stats["processed"] == 0: - logger.info("No more documents to process") - break - - # In dry-run mode, stop after first batch to avoid infinite loop - # (since we're not actually updating documents, the query will keep finding the same ones) - if dry_run: - logger.info("Dry-run mode: stopping after first batch to prevent infinite loop") - break - - # Final summary - logger.info("Migration complete!") - logger.info(f"Total statistics: {total_stats}") - - -async def main() -> None: - """Main function to handle command line arguments and execute migration.""" - parser = argparse.ArgumentParser( - description="Backfill totalReferences field in Cosmos DB documents", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__, - ) - - parser.add_argument( - "--batch-size", - type=int, - default=100, - help="Number of documents to process per batch (default: 100)", - ) - - parser.add_argument( - "--max-batches", - type=int, - default=None, - help="Maximum number of batches to process (default: unlimited)", - ) - - parser.add_argument( - "--dry-run", action="store_true", help="Preview changes without actually updating documents" - ) - - parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging") - - args = parser.parse_args() - - # Configure logging - log_level = logging.DEBUG if args.verbose else logging.INFO - logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s 
- %(message)s") - - try: - await run_full_migration( - batch_size=args.batch_size, max_batches=args.max_batches, dry_run=args.dry_run - ) - except Exception as e: - logger.error(f"Migration failed: {e}") - raise - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/backend/scripts/cosmos_export_import.py b/backend/scripts/cosmos_export_import.py deleted file mode 100644 index 1be1277..0000000 --- a/backend/scripts/cosmos_export_import.py +++ /dev/null @@ -1,511 +0,0 @@ -#!/usr/bin/env python3 -""" -Cosmos DB Export/Import Script -============================== - -This script exports documents from a source Azure Cosmos DB container and imports them -into a target container with hierarchical partition keys (HPK). It's designed for -migrating data between Cosmos DB instances (e.g., from cloud to local emulator). - -FEATURES: -- Exports data in paginated JSONL format for memory efficiency -- Supports hierarchical partition keys (/datasetName, /bucket) -- Concurrent/bulk import with retry logic for 429 throttling -- Dry-run mode for validation without writing -- Flexible missing partition key policies - -USAGE: -1. Configure environment variables in '.env' file (use sample_cosmos_export_import.env as template) -2. Run: python cosmos_export_import.py - -CONFIGURATION: -All settings are loaded from '.env' file: -- Source/target Cosmos DB connection strings and credentials -- Partition key paths (default: ["/datasetName", "/bucket"]) -- Batch sizes, concurrency settings -- DRY_RUN mode for validation only - - Note with DRY_RUN=true, export to jsonl files will still occur, - hence allowing an export to file without importing into another instance -- Missing partition key handling policy - -EXAMPLES: -# Export from cloud to local emulator (dry-run first) -DRY_RUN=true python cosmos_export_import.py - -# Actual migration -DRY_RUN=false python cosmos_export_import.py - -OUTPUT: -- Creates ./cosmos_export/ directory with paginated JSONL files -- Each page contains up to EXPORT_PAGE_SIZE documents -- Import processes files in batches of IMPORT_batch_SIZE - -ERROR HANDLING: -- Automatic retry with exponential backoff for 429 (throttling) -- Configurable missing partition key policies: error/skip/default -- Detailed logging of progress and errors - -NOTE: Ensure target container has sufficient RU/s to avoid throttling during import. 
-""" - -import os -import json -from pathlib import Path -import time -from typing import List, Dict, Any, Optional, Tuple -from dotenv import load_dotenv -from azure.cosmos import CosmosClient, PartitionKey, exceptions -from azure.cosmos.exceptions import CosmosHttpResponseError -from concurrent.futures import ThreadPoolExecutor, as_completed -from azure.identity import DefaultAzureCredential - -# ------------- Configuration ------------- - - -def require_env(name: str) -> str: - val = os.getenv(name) - if not val: # catches None and empty string - raise RuntimeError(f"Missing required environment variable: {name}") - return val - - -DOTENV_PATH = Path(".env") -load_dotenv(dotenv_path=DOTENV_PATH) - -SRC_ACCOUNT_URI = require_env("SRC_ACCOUNT_URI") -SRC_DATABASE = require_env("SRC_DATABASE") -SRC_CONTAINER = require_env("SRC_CONTAINER") - -DST_ACCOUNT_URI = require_env("DST_ACCOUNT_URI") -DST_DATABASE = require_env("DST_DATABASE") -DST_CONTAINER = require_env("DST_CONTAINER") - - -def is_dst_cosmos_emulator_in_use() -> bool: - """Detect if Cosmos DB emulator is in use based on endpoint URL.""" - return "localhost" in DST_ACCOUNT_URI or "127.0.0.1" in DST_ACCOUNT_URI - - -if is_dst_cosmos_emulator_in_use(): - DST_EMULATOR_KEY = require_env("DST_EMULATOR_KEY") - -# HPK paths -DST_PARTITION_KEY_PATHS_RAW = require_env("DST_PARTITION_KEY_PATHS") -try: - DST_PARTITION_KEY_PATHS = json.loads(DST_PARTITION_KEY_PATHS_RAW) - if not isinstance(DST_PARTITION_KEY_PATHS, list): - raise ValueError("DST_PARTITION_KEY_PATHS must be a JSON list") -except (json.JSONDecodeError, ValueError) as e: - raise RuntimeError( - f"Invalid DST_PARTITION_KEY_PATHS format: {e}. Expected JSON list like: ['/datasetName', '/bucket']" - ) - -OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./cosmos_export") -EXPORT_PAGE_SIZE = int(os.getenv("EXPORT_PAGE_SIZE", "500")) -IMPORT_BATCH_SIZE = int(os.getenv("IMPORT_BATCH_SIZE", "200")) -BULK_MODE = os.getenv("BULK_MODE", "true").lower() == "true" -DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true" - -MISSING_PK_POLICY = os.getenv("MISSING_PK_POLICY", "error").lower() -DEFAULT_PK_VALUES_RAW = os.getenv("DEFAULT_PK_VALUES", '["UNKNOWN","DEFAULT_BUCKET"]') - -# Retry/backoff -MAX_RETRY_ATTEMPTS = 10 -RETRY_BACKOFF_BASE = 0.5 # seconds - -CONCURRENCY = int(os.getenv("CONCURRENCY", "32")) # number of parallel upserts - - -# ------------- Helpers ------------- - - -def ensure_dir(path: str): - if not os.path.exists(path): - os.makedirs(path) - - -def exponential_backoff(attempt: int) -> float: - return min(RETRY_BACKOFF_BASE * (2**attempt), 30.0) - - -def log(msg: str): - print(f"[cosmos-migrate] {msg}") - - -def transform_document(doc: Dict[str, Any]) -> Dict[str, Any]: - """ - Remove Cosmos system props; keep content unchanged. - Add transformations here if needed later. - """ - new_doc = dict(doc) - for sys_field in ["_rid", "_ts", "_self", "_etag", "_attachments"]: - new_doc.pop(sys_field, None) - return new_doc - - -def get_value_by_path(doc: Dict[str, Any], path: str) -> Any: - """ - Extract value from document following a path like "/bucket". - """ - parts = path.strip("/").split("/") - cur = doc - for p in parts: - if not isinstance(cur, dict) or p not in cur: - return None - cur = cur[p] - return cur - - -def compute_hpk_values(doc: Dict[str, Any], paths: List[str]) -> Tuple[List[Any], List[int]]: - """ - Returns (values_list, missing_indices) for hierarchical partition key paths. 
- """ - values = [] - missing = [] - for i, path in enumerate(paths): - val = get_value_by_path(doc, path) - if val is None: - values.append(None) - missing.append(i) - else: - values.append(val) - return values, missing - - -def parse_default_pk_values(raw: str, count: int) -> List[Any]: - try: - vals = json.loads(raw) - except Exception: - vals = [] - if len(vals) < count: - vals = vals + [None] * (count - len(vals)) - elif len(vals) > count: - vals = vals[:count] - return vals - - -DEFAULT_PK_VALUES = parse_default_pk_values(DEFAULT_PK_VALUES_RAW, len(DST_PARTITION_KEY_PATHS)) - - -def resolve_missing_hpk(values: List[Any], missing_indices: List[int]) -> Optional[List[Any]]: - """ - Apply MISSING_PK_POLICY to fill or handle missing HPK components. - """ - if not missing_indices: - return values - - if MISSING_PK_POLICY == "error": - return None - elif MISSING_PK_POLICY == "skip": - return None - elif MISSING_PK_POLICY == "default": - for i in missing_indices: - default_val = DEFAULT_PK_VALUES[i] - if default_val is None: - return None - values[i] = default_val - return values - else: - return None - - -def upsert_with_retry(container, doc): - attempts = 0 - while True: - try: - # Non-bulk path: let SDK infer HPK from doc fields - container.upsert_item(doc) - return True - except CosmosHttpResponseError as e: - # 429 (throttled) -> backoff, then retry - if getattr(e, "status_code", None) == 429 and attempts < MAX_RETRY_ATTEMPTS: - attempts += 1 - delay = min(RETRY_BACKOFF_BASE * (2**attempts), 30.0) - log( - f"Throttled (429) on id={doc.get('id')}. Backing off {delay:.1f}s (attempt {attempts}/{MAX_RETRY_ATTEMPTS})" - ) - time.sleep(delay) - continue - else: - # Bubble up any non-retryable errors - raise - - -# ------------- Export ------------- - - -def export_cosmos_container_to_jsonl( - client: CosmosClient, - database_name: str, - container_name: str, - output_dir: str, - page_size: int = 500, -) -> str: - """ - Export all documents from a Cosmos DB container to paginated JSONL files. - Uses by_page() for robust continuation handling. - """ - log(f"Exporting from {SRC_ACCOUNT_URI}:{database_name}/{container_name} ...") - ensure_dir(output_dir) - - database = client.get_database_client(database_name) - container = database.get_container_client(container_name) - - query = "SELECT * FROM c" - page_index = 1 - total_docs = 0 - - try: - iterator = container.query_items( - query=query, - enable_cross_partition_query=True, - max_item_count=page_size, - ).by_page() - except CosmosHttpResponseError as e: - log(f"Query initialization error: {e}") - raise - - for page in iterator: - docs = list(page) - if not docs: - break - page_file = os.path.join(output_dir, f"{container_name}_page_{page_index}.jsonl") - with open(page_file, "w", encoding="utf-8") as f: - for doc in docs: - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - total_docs += len(docs) - log(f"Wrote {len(docs)} docs to {page_file}") - page_index += 1 - - log(f"Export complete: {total_docs} documents across {page_index - 1} file(s).") - return output_dir - - -# ------------- Import ------------- - - -def maybe_create_target_container( - client: CosmosClient, - database_name: str, - container_name: str, - partition_key_paths: List[str], - throughput: Optional[int] = None, -): - """ - Create database & container if they do not exist, with hierarchical PK definition. 
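-
-    Example (illustrative): partition_key_paths=["/datasetName", "/bucket"]
-    creates the container with PartitionKey(path=["/datasetName", "/bucket"],
-    kind="MultiHash"), i.e. a two-level hierarchical partition key.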
- """ - log(f"Ensuring target DB '{database_name}' and container '{container_name}' exist ...") - db_client = client.create_database_if_not_exists(id=database_name) - - try: - db_client.create_container_if_not_exists( - id=container_name, - partition_key=PartitionKey(path=partition_key_paths, kind="MultiHash"), - offer_throughput=throughput, - ) - log("Target container is ready.") - except exceptions.CosmosResourceExistsError: - log("Target container already exists.") - except CosmosHttpResponseError as e: - log(f"Failed to create container: {e}") - raise - - -def read_jsonl_files(folder: str, prefix: str) -> List[str]: - files = [] - for name in sorted(os.listdir(folder)): - if name.startswith(prefix) and name.endswith(".jsonl"): - files.append(os.path.join(folder, name)) - return files - - -def summarize_missing_hpk_components( - source_folder: str, prefix: str, partition_key_paths: List[str] -) -> None: - """ - DRY_RUN validator: counts and reports missing HPK components without writing. - """ - files = read_jsonl_files(source_folder, prefix) - if not files: - log(f"No JSONL files found in {source_folder} with prefix '{prefix}'") - return - - total = 0 - missing_counts = [0] * len(partition_key_paths) - - for file in files: - with open(file, "r", encoding="utf-8") as f: - for line in f: - doc = json.loads(line) - _, missing = compute_hpk_values(doc, partition_key_paths) - total += 1 - for i in missing: - missing_counts[i] += 1 - - log(f"Validation summary: checked {total} docs.") - for i, path in enumerate(partition_key_paths): - log(f" Path {path}: missing in {missing_counts[i]} docs") - - -def import_jsonl_to_cosmos( - client: CosmosClient, - database_name: str, - container_name: str, - source_folder: str, - source_prefix: Optional[str], - partition_key_paths: List[str], - batch_size: int = 200, - bulk_mode: bool = True, -): - """ - Import JSONL files into target Cosmos container with hierarchical partition keys. 
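-
-    Documents are buffered up to `batch_size`; each full batch is either
-    counted and logged (DRY_RUN=true) or written via write_batch(), which
-    upserts concurrently when bulk_mode is enabled.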
-    """
-    db = client.get_database_client(database_name)
-    container = db.get_container_client(container_name)
-
-    prefix = source_prefix or container_name
-    files = read_jsonl_files(source_folder, prefix)
-    if not files:
-        log(f"No JSONL files found in {source_folder} with prefix '{prefix}'")
-        return
-
-    total_written = 0
-    total_skipped = 0
-
-    for file in files:
-        log(f"Importing from {file} to {DST_ACCOUNT_URI}:{database_name}/{container_name} ...")
-        batch: List[Tuple[Dict[str, Any], List[Any]]] = []
-
-        with open(file, "r", encoding="utf-8") as f:
-            for line in f:
-                doc = json.loads(line)
-                doc = transform_document(doc)
-
-                hpk_values, missing = compute_hpk_values(doc, partition_key_paths)
-                if missing:
-                    resolved = resolve_missing_hpk(hpk_values, missing)
-                    if resolved is None:
-                        total_skipped += 1
-                        log(
-                            f"Skipped doc id={doc.get('id')} due to missing HPK components at indices {missing}"
-                        )
-                        continue
-                    else:
-                        hpk_values = resolved
-
-                batch.append((doc, hpk_values))
-
-                if len(batch) >= batch_size:
-                    if DRY_RUN:
-                        total_written += len(batch)  # pretend write
-                        log(f"[DRY_RUN] Would write {len(batch)} docs")
-                    else:
-                        written = write_batch(container, batch, bulk_mode=bulk_mode)
-                        total_written += written
-                    batch = []
-
-        if batch:
-            if DRY_RUN:
-                total_written += len(batch)
-                log(f"[DRY_RUN] Would write {len(batch)} docs")
-            else:
-                written = write_batch(container, batch, bulk_mode=bulk_mode)
-                total_written += written
-
-    log(
-        f"Import complete: {total_written} documents {'validated' if DRY_RUN else 'written'}, {total_skipped} skipped (policy={MISSING_PK_POLICY})."
-    )
-
-
-def write_batch(
-    container, docs_with_pk: List[Tuple[Dict[str, Any], List[Any]]], bulk_mode: bool = True
-) -> int:
-    """
-    Write a batch of documents with retry on 429.
-    For HPK, partition key is a list in the same order as paths.
-    If bulk_mode is True, use concurrent upserts (no explicit partition_key kwarg).
-    """
-    # The SDK has no container.bulk API, so bulk_mode is emulated with concurrent upserts.
-    if bulk_mode:
-        log(f"Using concurrent upserts: CONCURRENCY={CONCURRENCY}, batch_size={len(docs_with_pk)}")
-        total_success = 0
-        # Kick off upserts in parallel
-        with ThreadPoolExecutor(max_workers=CONCURRENCY) as tp:
-            futures = [tp.submit(upsert_with_retry, container, d) for (d, _pk_list) in docs_with_pk]
-            for fut in as_completed(futures):
-                try:
-                    if fut.result():
-                        total_success += 1
-                except CosmosHttpResponseError as e:
-                    log(f"Upsert failed: {e}")
-                    # If desired, you can collect failed docs and retry sequentially here.
-                    # For now, we just log and continue to the next future.
-                    continue
-        log(f"Concurrent upserts wrote {total_success}/{len(docs_with_pk)} docs")
-        return total_success
-
-    # Fallback: sequential upserts (non-bulk)
-    success = 0
-    for d, _pk_list in docs_with_pk:
-        upsert_with_retry(container, d)
-        success += 1
-    log(f"Sequential upserts wrote {success}/{len(docs_with_pk)} docs")
-    return success
-
-
-# ------------- Main orchestration -------------
-
-
-def main():
-    # Build one DefaultAzureCredential and reuse it.
-    # For user-assigned managed identity, set the credential's managed identity
-    # client id from the AZURE_CLIENT_ID environment variable.
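-    # (azure-identity reads AZURE_CLIENT_ID from the environment on its own,
-    # so no explicit managed_identity_client_id argument is required here.)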
- aad_credential = DefaultAzureCredential() - - # Source client (AAD) - src_client = CosmosClient(SRC_ACCOUNT_URI, credential=aad_credential, logging_enable=True) - - # Target client (emulator or AAD) - if is_dst_cosmos_emulator_in_use(): - log("Using Cosmos Emulator for target client") - dst_client = CosmosClient(DST_ACCOUNT_URI, credential=DST_EMULATOR_KEY, logging_enable=True) - else: - dst_client = CosmosClient(DST_ACCOUNT_URI, credential=aad_credential, logging_enable=True) - - # 1) Export - export_cosmos_container_to_jsonl( - client=src_client, - database_name=SRC_DATABASE, - container_name=SRC_CONTAINER, - output_dir=OUTPUT_DIR, - page_size=EXPORT_PAGE_SIZE, - ) - - # 2) Ensure target container exists (HPK-aware) - maybe_create_target_container( - client=dst_client, - database_name=DST_DATABASE, - container_name=DST_CONTAINER, - partition_key_paths=DST_PARTITION_KEY_PATHS, - throughput=None, # set higher RU/s temporarily if you see 429s - ) - - # 3) Optional: summarize_missing_hpk_components (no writes) - if DRY_RUN: - summarize_missing_hpk_components(OUTPUT_DIR, SRC_CONTAINER, DST_PARTITION_KEY_PATHS) - - # 4) Import with HPK mapping - import_jsonl_to_cosmos( - client=dst_client, - database_name=DST_DATABASE, - container_name=DST_CONTAINER, - source_folder=OUTPUT_DIR, - source_prefix=SRC_CONTAINER, # files are named using source container - partition_key_paths=DST_PARTITION_KEY_PATHS, - batch_size=IMPORT_BATCH_SIZE, - bulk_mode=BULK_MODE, - ) - - -if __name__ == "__main__": - main() diff --git a/backend/scripts/init_seed_data.py b/backend/scripts/init_seed_data.py index e5043ad..1faca3e 100644 --- a/backend/scripts/init_seed_data.py +++ b/backend/scripts/init_seed_data.py @@ -24,7 +24,7 @@ def _build_item(dataset: str, idx: int) -> Any: - from app.domain.models import GroundTruthItem, Reference + from app.domain.models import AgenticGroundTruthEntry, Reference from app.domain.enums import GroundTruthStatus # Vary some fields for realism while keeping validation simple @@ -45,15 +45,23 @@ def _build_item(dataset: str, idx: int) -> Any: "id": f"{dataset}-q{idx:04d}", "datasetName": dataset, "status": GroundTruthStatus.draft.value, - "synthQuestion": f"What is item {idx} about in dataset '{dataset}'?", - "refs": [ - Reference(url=f"https://example.com/{dataset}/{idx}").model_dump( - mode="json", by_alias=True - ) - ], - "tags": tags, + "history": [{"role": "user", "msg": f"What is item {idx} about in dataset '{dataset}'?"}], + "plugins": { + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": { + "references": [ + Reference(url=f"https://example.com/{dataset}/{idx}").model_dump( + mode="json", by_alias=True + ) + ] + }, + } + }, + "manualTags": tags, } - return GroundTruthItem.model_validate(data) + return AgenticGroundTruthEntry.model_validate(data) def _default_registry_tags() -> list[str]: diff --git a/backend/scripts/update_greetings_answer.py b/backend/scripts/update_greetings_answer.py deleted file mode 100644 index 49612b8..0000000 --- a/backend/scripts/update_greetings_answer.py +++ /dev/null @@ -1,344 +0,0 @@ -#!/usr/bin/env python3 -""" -Cosmos DB Greetings Answer Update Script -========================================= - -This script updates the answer field from NO_ANSWER to GREETING for all items -in the ground_truth container that have a tag set to "intent:greetings". 
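-
-Example transition (illustrative):
-  before: {"id": "gt-1", "manualTags": ["intent:greetings"], "answer": "NO_ANSWER"}
-  after:  {"id": "gt-1", "manualTags": ["intent:greetings"], "answer": "GREETING"}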
- -FEATURES: -- Connects to Azure Cosmos DB using DefaultAzureCredential -- Supports local Cosmos DB emulator for dev environment -- Uses Cosmos DB patch operations for Azure (cost-efficient, lower RU consumption) -- Uses upsert operations for emulator (patch not supported on emulator) -- Concurrent batch processing for efficient bulk updates -- Dry-run mode to preview changes without modifying data -- Automatic retry with exponential backoff for 429 throttling -- Progress reporting and statistics - -USAGE: -1. Configure environment variables in '.env' file -2. Run: python update_greetings_answer.py - -CONFIGURATION: -Environment variables (set in .env): -- COSMOS_ACCOUNT_URI: Cosmos DB endpoint URL -- COSMOS_DATABASE: Database name -- COSMOS_CONTAINER: Container name (default: ground_truth) -- COSMOS_EMULATOR_KEY: Required if using local emulator (localhost/127.0.0.1) -- DRY_RUN: Set to "true" to preview without modifying (default: false) -- CONCURRENCY: Number of parallel updates (default: 32) -- BATCH_SIZE: Items per progress update (default: 100) - -EXAMPLES: -# Preview changes (dry-run) -DRY_RUN=true python update_greetings_answer.py - -# Execute actual update -DRY_RUN=false python update_greetings_answer.py - -COST OPTIMIZATION: -- Uses patch operations for Azure instead of upsert (typically 50% less RUs) -- Automatically falls back to upsert for emulator (patch not supported) -- Query selects only required fields (id, datasetName, bucket, answer) -- Concurrent processing maximizes throughput without extra cost -""" - -import os -import time -from pathlib import Path -from typing import Any -from dotenv import load_dotenv -from azure.cosmos import CosmosClient -from azure.cosmos.exceptions import CosmosHttpResponseError -from azure.identity import DefaultAzureCredential -from concurrent.futures import ThreadPoolExecutor, as_completed - - -# ------------- Configuration ------------- - - -def require_env(name: str) -> str: - val = os.getenv(name) - if not val: # catches None and empty string - raise RuntimeError(f"Missing required environment variable: {name}") - return val - - -DOTENV_PATH = Path(".env") -load_dotenv(dotenv_path=DOTENV_PATH) - -COSMOS_ACCOUNT_URI = require_env("COSMOS_ACCOUNT_URI") -COSMOS_DATABASE = require_env("COSMOS_DATABASE") -COSMOS_CONTAINER = os.getenv("COSMOS_CONTAINER", "ground_truth") - -DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true" - -# Retry/backoff settings -MAX_RETRY_ATTEMPTS = 10 -RETRY_BACKOFF_BASE = 0.5 # seconds - -# Concurrency settings -CONCURRENCY = int(os.getenv("CONCURRENCY", "32")) # parallel updates -BATCH_SIZE = int(os.getenv("BATCH_SIZE", "100")) # items per progress update - - -# ------------- Helpers ------------- - - -def is_cosmos_emulator_in_use() -> bool: - """Detect if Cosmos DB emulator is in use based on endpoint URL.""" - return "localhost" in COSMOS_ACCOUNT_URI or "127.0.0.1" in COSMOS_ACCOUNT_URI - - -def log(msg: str): - """Log message with prefix.""" - print(f"[update-greetings] {msg}") - - -def extract_partition_key(item: dict[str, Any]) -> list[Any]: - """ - Extract hierarchical partition key values from item. - Assumes HPK structure: [datasetName, bucket] - """ - dataset_name = item.get("datasetName") - bucket = item.get("bucket") - return [dataset_name, bucket] - - -def patch_item_with_retry(container, item_id: str, partition_key: list[Any]) -> bool: - """ - Patch a single item's answer field with retry logic for 429 throttling. - Uses patch operation for cost efficiency (lower RU consumption than upsert). 
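-    Retry delays grow as min(RETRY_BACKOFF_BASE * 2**attempts, 30.0),
-    i.e. 1s, 2s, 4s, ... capped at 30s per attempt.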
- Returns True if successful. - """ - attempts = 0 - while True: - try: - # Patch operation - only updates the 'answer' field - container.patch_item( - item=item_id, - partition_key=partition_key, - patch_operations=[{"op": "replace", "path": "/answer", "value": "GREETING"}], - ) - return True - except CosmosHttpResponseError as e: - # 429 (throttled) -> backoff, then retry - if getattr(e, "status_code", None) == 429 and attempts < MAX_RETRY_ATTEMPTS: - attempts += 1 - delay = min(RETRY_BACKOFF_BASE * (2**attempts), 30.0) - log( - f"Throttled (429) on id={item_id}. " - f"Backing off {delay:.1f}s (attempt {attempts}/{MAX_RETRY_ATTEMPTS})" - ) - time.sleep(delay) - continue - else: - # Bubble up any non-retryable errors - log(f"Failed to patch item id={item_id}: {e}") - raise - - -def upsert_item_with_retry(container, item: dict[str, Any]) -> bool: - """ - Upsert a single item with retry logic for 429 throttling. - Used for emulator since patch operations are not supported. - Returns True if successful. - """ - attempts = 0 - while True: - try: - container.upsert_item(item) - return True - except CosmosHttpResponseError as e: - # 429 (throttled) -> backoff, then retry - if getattr(e, "status_code", None) == 429 and attempts < MAX_RETRY_ATTEMPTS: - attempts += 1 - delay = min(RETRY_BACKOFF_BASE * (2**attempts), 30.0) - log( - f"Throttled (429) on id={item.get('id')}. " - f"Backing off {delay:.1f}s (attempt {attempts}/{MAX_RETRY_ATTEMPTS})" - ) - time.sleep(delay) - continue - else: - # Bubble up any non-retryable errors - log(f"Failed to upsert item id={item.get('id')}: {e}") - raise - - -def patch_batch_concurrent( - container, - items: list[dict[str, Any]], - concurrency: int, - use_emulator: bool = False, -) -> int: - """ - Update a batch of items concurrently using ThreadPoolExecutor. - Uses patch operations for Azure, upsert for emulator. - Returns count of successfully updated items. - """ - if not items: - return 0 - - operation = "Upserting" if use_emulator else "Patching" - log(f"{operation} {len(items)} items with concurrency={concurrency}") - items_updated = 0 - - with ThreadPoolExecutor(max_workers=concurrency) as executor: - if use_emulator: - # Emulator: use upsert (patch not supported) - # Prepare items with updated answer field - for item in items: - item["answer"] = "GREETING" - - future_to_item = { - executor.submit(upsert_item_with_retry, container, item): item for item in items - } - else: - # Azure: use patch for cost efficiency - future_to_item = { - executor.submit( - patch_item_with_retry, container, item["id"], extract_partition_key(item) - ): item - for item in items - } - - # Collect results as they complete - for future in as_completed(future_to_item): - item = future_to_item[future] - try: - if future.result(): - items_updated += 1 - if items_updated % BATCH_SIZE == 0: - log(f"Progress: {items_updated}/{len(items)} items updated") - except Exception as e: - log(f"Failed to update item id={item.get('id')}: {e}") - continue - - return items_updated - - -def update_greetings_answer( - client: CosmosClient, - database_name: str, - container_name: str, - dry_run: bool = False, - use_emulator: bool = False, -) -> tuple[int, int]: - """ - Update answer field from NO_ANSWER to GREETING for items with intent:greetings tag. - Uses patch operations for Azure or upsert for emulator. 
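-    The emulator path re-upserts whole documents, so its query selects all
-    fields; the Azure path patches in place and selects only id, datasetName,
-    bucket and answer to minimize RU consumption.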
- - Returns: - Tuple of (items_matched, items_updated) - """ - operation = "upsert" if use_emulator else "patch" - log( - f"Connecting to {COSMOS_ACCOUNT_URI}:{database_name}/{container_name} (using {operation} operations)" - ) - - database = client.get_database_client(database_name) - container = database.get_container_client(container_name) - - # Query for items with intent:greetings tag and answer = NO_ANSWER - # For emulator: select all fields since we need full document for upsert - # For Azure: select only necessary fields to minimize RU consumption - if use_emulator: - query = """ - SELECT * FROM c - WHERE ARRAY_CONTAINS(c.manualTags, "intent:greetings") - AND c.answer = "NO_ANSWER" - """ - else: - query = """ - SELECT c.id, c.datasetName, c.bucket, c.answer FROM c - WHERE ARRAY_CONTAINS(c.manualTags, "intent:greetings") - AND c.answer = "NO_ANSWER" - """ - - log("Querying for items with 'intent:greetings' tag and answer='NO_ANSWER'...") - - try: - items = list( - container.query_items( - query=query, - enable_cross_partition_query=True, - ) - ) - except CosmosHttpResponseError as e: - log(f"Query error: {e}") - raise - - items_matched = len(items) - log(f"Found {items_matched} items matching criteria") - - if items_matched == 0: - return 0, 0 - - if dry_run: - operation = "upserted" if use_emulator else "patched" - log(f"[DRY RUN] Items that would be {operation}:") - for item in items: - log(f" - id: {item.get('id')}, answer: {item.get('answer')} -> GREETING") - log(f"[DRY RUN] Total items that would be {operation}: {items_matched}") - return items_matched, 0 - - # Execute concurrent batch update (patch for Azure, upsert for emulator) - items_updated = patch_batch_concurrent(container, items, CONCURRENCY, use_emulator) - - operation = "upserted" if use_emulator else "patched" - log(f"Update complete: {items_updated}/{items_matched} items {operation} successfully") - return items_matched, items_updated - - -# ------------- Main ------------- - - -def main(): - """Main entry point.""" - mode = "DRY RUN" if DRY_RUN else "LIVE" - use_emulator = is_cosmos_emulator_in_use() - log(f"Starting update script in {mode} mode") - - # Build credential - aad_credential = DefaultAzureCredential() - - # Create Cosmos client - if use_emulator: - log("Using Cosmos Emulator (will use upsert operations - patch not supported)") - emulator_key = require_env("COSMOS_EMULATOR_KEY") - client = CosmosClient(COSMOS_ACCOUNT_URI, credential=emulator_key, logging_enable=True) - else: - log("Using Azure Cosmos DB with DefaultAzureCredential (will use patch operations)") - client = CosmosClient(COSMOS_ACCOUNT_URI, credential=aad_credential, logging_enable=True) - - # Execute update - items_matched, items_updated = update_greetings_answer( - client=client, - database_name=COSMOS_DATABASE, - container_name=COSMOS_CONTAINER, - dry_run=DRY_RUN, - use_emulator=use_emulator, - ) - - # Summary - operation = "upserted" if use_emulator else "patched" - log("=" * 60) - log("Summary:") - log(f" Mode: {mode}") - log(f" Environment: {'Emulator' if use_emulator else 'Azure'}") - log(f" Operation: {'upsert' if use_emulator else 'patch'}") - log(f" Items matched: {items_matched}") - if DRY_RUN: - log(f" Items that would be {operation}: {items_matched}") - else: - log(f" Items {operation}: {items_updated}") - if items_matched > 0: - success_rate = (items_updated / items_matched) * 100 - log(f" Success rate: {success_rate:.1f}%") - log("=" * 60) - - -if __name__ == "__main__": - main() diff --git 
a/backend/scripts/update_greetings_answer_sample.env b/backend/scripts/update_greetings_answer_sample.env deleted file mode 100644 index 897080b..0000000 --- a/backend/scripts/update_greetings_answer_sample.env +++ /dev/null @@ -1,72 +0,0 @@ -# Sample Environment Configuration for update_greetings_answer.py -# Copy this file to .env and update with your actual values - -# ============================================================================ -# COSMOS DB CONNECTION -# ============================================================================ - -# Azure Cosmos DB account URI -# For Azure: https://your-account.documents.azure.com:443/ -# For local emulator: https://localhost:8081/ -COSMOS_ACCOUNT_URI=https://localhost:8081/ - -# Database name -COSMOS_DATABASE=your-database-name - -# Container name (defaults to "ground_truth" if not specified) -COSMOS_CONTAINER=ground_truth - - -# ============================================================================ -# COSMOS DB AUTHENTICATION -# ============================================================================ - -# Cosmos DB Emulator Key (only required for local emulator) -# Standard emulator key (localhost/127.0.0.1 detection is automatic) -COSMOS_EMULATOR_KEY=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw== - -# For Azure Cosmos DB: Uses DefaultAzureCredential automatically -# Ensure you're logged in via: az login -# Or set up managed identity / service principal environment variables - - -# ============================================================================ -# SCRIPT BEHAVIOR -# ============================================================================ - -# Dry-run mode: Set to "true" to preview changes without modifying data -# Set to "false" to execute actual updates -DRY_RUN=true - - -# ============================================================================ -# PERFORMANCE TUNING -# ============================================================================ - -# Number of concurrent patch operations (adjust based on your RU/s provisioning) -# Higher values = faster updates but may cause throttling if RUs are limited -# Recommended: 16-32 for 400-1000 RU/s, 64+ for higher RU/s -CONCURRENCY=32 - -# Items per progress log update -BATCH_SIZE=100 - - -# ============================================================================ -# EXAMPLE CONFIGURATIONS -# ============================================================================ - -# --- Local Development (Cosmos Emulator) --- -# COSMOS_ACCOUNT_URI=https://localhost:8081/ -# COSMOS_DATABASE=gtc-dev -# COSMOS_CONTAINER=ground_truth -# COSMOS_EMULATOR_KEY=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw== -# DRY_RUN=true -# CONCURRENCY=16 - -# --- Azure Production --- -# COSMOS_ACCOUNT_URI=https://your-prod-account.documents.azure.com:443/ -# COSMOS_DATABASE=gtc-prod -# COSMOS_CONTAINER=ground_truth -# DRY_RUN=false -# CONCURRENCY=64 diff --git a/backend/tests/integration/test_assignments_assign_single_cosmos.py b/backend/tests/integration/test_assignments_assign_single_cosmos.py index 8b40e3f..d62189b 100644 --- a/backend/tests/integration/test_assignments_assign_single_cosmos.py +++ b/backend/tests/integration/test_assignments_assign_single_cosmos.py @@ -17,7 +17,9 @@ def make_item( "id": item_id, "datasetName": dataset, "bucket": bucket_id, - "synthQuestion": "What is the meaning of life?", + "history": [ + {"role": "user", "msg": "What is the meaning of life?"}, + ], "status": 
status, } if assigned_to: diff --git a/backend/tests/integration/test_assignments_cosmos.py b/backend/tests/integration/test_assignments_cosmos.py index 25dde5d..317b626 100644 --- a/backend/tests/integration/test_assignments_cosmos.py +++ b/backend/tests/integration/test_assignments_cosmos.py @@ -1,11 +1,11 @@ -from httpx import AsyncClient -from pydantic import TypeAdapter -import pytest import uuid +from typing import Any, cast + +import pytest +from httpx import AsyncClient -from app.domain.models import AgenticGroundTruthEntry -from app.container import container from app.adapters.repos.cosmos_repo import CosmosGroundTruthRepo +from app.container import container def make_item(dataset: str) -> dict: @@ -14,8 +14,9 @@ def make_item(dataset: str) -> dict: "datasetName": dataset, # Use NIL UUID for explicit bucket in tests "bucket": str(uuid.UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": "Q?", - "samplingBucket": 0, + "history": [ + {"role": "user", "msg": "Q?"}, + ], "assignedTo": None, } @@ -56,13 +57,18 @@ async def test_assigned_ground_truths_update_and_approve(async_client: AsyncClie r = await async_client.post("/v1/assignments/self-serve", json=body, headers=user_headers) assert r.status_code == 200 data: dict = r.json() - # mypy: data.get returns Optional[Any]; use default [] to ensure list type - adocs = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python(data.get("assigned") or []) + adocs = cast(list[dict[str, Any]], data.get("assigned") or []) assert adocs and len(adocs) >= 1 - gt_id = adocs[0].id + gt_id = cast(str, adocs[0]["id"]) + etag = cast(str | None, adocs[0].get("_etag")) + assert etag # SME approves via assignments PUT - payload = {"approve": True, "answer": "ans", "etag": adocs[0].etag} + payload = { + "approve": True, + "etag": etag, + "history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "ans"}], + } r = await async_client.put( f"/v1/assignments/{dataset}/{bucket}/{gt_id}", json=payload, headers=user_headers ) @@ -103,14 +109,14 @@ async def assigned_ground_truth(async_client: AsyncClient, user_headers): ) assert r.status_code == 200 data = r.json() - adocs = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python(data.get("assigned") or []) + adocs = cast(list[dict[str, Any]], data.get("assigned") or []) assert adocs and len(adocs) >= 1 gt = adocs[0] # Verify assignment document exists repo = container.repo assert isinstance(repo, CosmosGroundTruthRepo) - assignment = await repo.get_assignment_by_gt(TEST_USER_ID, gt.id) + assignment = await repo.get_assignment_by_gt(TEST_USER_ID, cast(str, gt["id"])) assert assignment is not None, "Assignment document should exist after self-serve" yield { @@ -135,16 +141,22 @@ async def test_approve_deletes_assignment_document( user_id = assigned_ground_truth["user_id"] # SME approves via assignments PUT - payload = {"approve": True, "answer": "ans", "etag": gt.etag} + etag = cast(str | None, gt.get("_etag")) + assert etag + payload = { + "approve": True, + "etag": etag, + "history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "ans"}], + } r = await async_client.put( - f"/v1/assignments/{dataset}/{bucket}/{gt.id}", json=payload, headers=user_headers + f"/v1/assignments/{dataset}/{bucket}/{gt['id']}", json=payload, headers=user_headers ) assert r.status_code == 200 res: dict = r.json() assert res.get("status") == "approved" # Verify assignment document is deleted after approval - assignment_after = await repo.get_assignment_by_gt(user_id, gt.id) + assignment_after = 
await repo.get_assignment_by_gt(user_id, cast(str, gt["id"])) assert assignment_after is None, "Assignment document should be deleted after approval" @@ -160,14 +172,16 @@ async def test_delete_deletes_assignment_document( user_id = assigned_ground_truth["user_id"] # SME soft-deletes via assignments PUT with status=deleted - payload = {"status": "deleted", "etag": gt.etag} + etag = cast(str | None, gt.get("_etag")) + assert etag + payload = {"status": "deleted", "etag": etag} r = await async_client.put( - f"/v1/assignments/{dataset}/{bucket}/{gt.id}", json=payload, headers=user_headers + f"/v1/assignments/{dataset}/{bucket}/{gt['id']}", json=payload, headers=user_headers ) assert r.status_code == 200 res: dict = r.json() assert res.get("status") == "deleted" # Verify assignment document is deleted after soft-delete - assignment_after = await repo.get_assignment_by_gt(user_id, gt.id) + assignment_after = await repo.get_assignment_by_gt(user_id, cast(str, gt["id"])) assert assignment_after is None, "Assignment document should be deleted after soft-delete" diff --git a/backend/tests/integration/test_assignments_duplicate_cosmos.py b/backend/tests/integration/test_assignments_duplicate_cosmos.py index a56129c..2d0a743 100644 --- a/backend/tests/integration/test_assignments_duplicate_cosmos.py +++ b/backend/tests/integration/test_assignments_duplicate_cosmos.py @@ -14,10 +14,10 @@ def make_item(dataset: str, *, assigned_to: str | None = None) -> dict[str, Any] "datasetName": dataset, # Use NIL UUID for explicit bucket to keep PK simple in tests "bucket": str(uuid.UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": "Q?", - "samplingBucket": 0, + "history": [ + {"role": "user", "msg": "Q?"}, + ], "assignedTo": assigned_to, - "refs": [], "manualTags": ["source:synthetic"], } diff --git a/backend/tests/integration/test_assignments_edited_question_persist_cosmos.py b/backend/tests/integration/test_assignments_edited_question_persist_cosmos.py index e339fe9..17e2014 100644 --- a/backend/tests/integration/test_assignments_edited_question_persist_cosmos.py +++ b/backend/tests/integration/test_assignments_edited_question_persist_cosmos.py @@ -13,9 +13,9 @@ def make_item(dataset: str, item_id: str) -> dict[str, Any]: "datasetName": dataset, # Fixed bucket UUID for deterministic PK "bucket": str(UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": "Original synth question?", - "answer": None, - "refs": [], + "history": [ + {"role": "user", "msg": "Original synth question?"}, + ], "manualTags": [ "source:synthetic", "split:train", @@ -31,18 +31,7 @@ def make_item(dataset: str, item_id: str) -> dict[str, Any]: async def test_assignments_put_persists_edited_question_camel_case( async_client: AsyncClient, user_headers: dict[str, str] ): - """Compat-migration coverage for the temporary editedQuestion alias path. - - This test stays only while assignments updates still project legacy camelCase - question fields across the compatibility boundary. Delete it with the alias - retirement work in the hard-delete phase. - - **Phase 5 Audit (2026-03-12)**: MIGRATION TEST - INFORMATIONAL - This test validates that editedQuestion persists correctly through Cosmos - round-trips. The test is marked as temporary and should be deleted when - Phase 6 removes legacy field support. Not a delete blocker, but documents - current persistence contract. 
- """ + """Assignments PUT persists an updated user question via canonical history.""" dataset = f"editedq-{uuid4().hex[:6]}" item_id = "gt-1" item = make_item(dataset, item_id) @@ -65,16 +54,17 @@ async def test_assignments_put_persists_edited_question_camel_case( bucket = cast(str, row["bucket"]) etag = cast(str, row.get("_etag")) - # Update via assignments PUT using camelCase editedQuestion + # Update via assignments PUT using canonical history new_question = "How do I reset my password (rephrased)?" r = await async_client.put( f"/v1/assignments/{dataset}/{bucket}/{item_id}", headers={**user_headers, "If-Match": etag}, - json={"editedQuestion": new_question}, + json={"history": [{"role": "user", "msg": new_question}]}, ) assert r.status_code == 200, r.text body = cast(dict[str, Any], r.json()) - assert body.get("editedQuestion") == new_question + history = body.get("history") or [] + assert history and history[0].get("msg") == new_question # Fetch item directly and assert persistence r = await async_client.get( @@ -82,11 +72,13 @@ async def test_assignments_put_persists_edited_question_camel_case( ) assert r.status_code == 200, r.text fetched = cast(dict[str, Any], r.json()) - assert fetched.get("editedQuestion") == new_question + fetched_history = fetched.get("history") or [] + assert fetched_history and fetched_history[0].get("msg") == new_question # List my assignments and ensure enriched view carries updated question r = await async_client.get("/v1/assignments/my", headers=user_headers) assert r.status_code == 200, r.text my_items = cast(list[dict[str, Any]], r.json()) mine = next(x for x in my_items if x.get("id") == item_id) - assert mine.get("editedQuestion") == new_question + mine_history = mine.get("history") or [] + assert mine_history and mine_history[0].get("msg") == new_question diff --git a/backend/tests/integration/test_assignments_flow_cosmos.py b/backend/tests/integration/test_assignments_flow_cosmos.py index 785a2c2..b2e9b4b 100644 --- a/backend/tests/integration/test_assignments_flow_cosmos.py +++ b/backend/tests/integration/test_assignments_flow_cosmos.py @@ -3,12 +3,9 @@ from typing import Any, cast from uuid import uuid4 -from pydantic.type_adapter import TypeAdapter import pytest from httpx import AsyncClient -from app.domain.models import AgenticGroundTruthEntry - def make_item(dataset: str) -> dict[str, Any]: return { @@ -16,10 +13,9 @@ def make_item(dataset: str) -> dict[str, Any]: "datasetName": dataset, "bucket": "00000000-0000-0000-0000-000000000000", "status": "draft", - "samplingBucket": 0, - "synthQuestion": "Q?", - "answer": None, - "refs": [], + "history": [ + {"role": "user", "msg": "Q?"}, + ], "manualTags": ["source:synthetic", "topic:general"], } @@ -41,24 +37,26 @@ async def test_self_serve_list_and_approve(async_client: AsyncClient, user_heade assert r.status_code == 200 resp = cast(dict[str, Any], r.json()) assert resp.get("assignedCount") == 2 - assigned = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python( - resp.get("assigned") or [] - ) + assigned = cast(list[dict[str, Any]], resp.get("assigned") or []) assert len(assigned) == 2 # List my assignments r = await async_client.get("/v1/assignments/my", headers=user_headers) assert r.status_code == 200 - docs = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python(r.json()) + docs = cast(list[dict[str, Any]], r.json()) assert len(docs) == 2 # Approve first via assignments PUT - gt_id = docs[0].id - etag = docs[0].etag + gt_id = docs[0]["id"] + etag = docs[0]["_etag"] r = await 
async_client.put( f"/v1/assignments/{dataset}/{bucket}/{gt_id}", headers=user_headers, - json={"approve": True, "answer": "ans", "etag": etag}, + json={ + "approve": True, + "etag": etag, + "history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "ans"}], + }, ) assert r.status_code == 200 res = cast(dict[str, Any], r.json()) @@ -162,6 +160,7 @@ async def test_exclusive_tag_error_prevents_persistence( actual_bucket = assigned[0]["bucket"] etag = assigned[0]["_etag"] original_tags = assigned[0]["manualTags"] + original_history = assigned[0]["history"] # Attempt invalid update with exclusive tag conflict r = await async_client.put( @@ -172,7 +171,6 @@ async def test_exclusive_tag_error_prevents_persistence( "difficulty:easy", "difficulty:hard", ], # Both difficulty tags - conflict! - "answer": "This should not be saved", "etag": etag, }, ) @@ -189,8 +187,8 @@ async def test_exclusive_tag_error_prevents_persistence( # Tags should still be the original ones assert item_after["manualTags"] == original_tags - # Answer should still be None (not the rejected value) - assert item_after["answer"] is None + # History should also remain unchanged after the rejected update. + assert item_after["history"] == original_history @pytest.mark.anyio diff --git a/backend/tests/integration/test_assignments_retry_exclusion.py b/backend/tests/integration/test_assignments_retry_exclusion.py index 5232ae9..fa851e6 100644 --- a/backend/tests/integration/test_assignments_retry_exclusion.py +++ b/backend/tests/integration/test_assignments_retry_exclusion.py @@ -12,10 +12,8 @@ from uuid import UUID, uuid4 from httpx import AsyncClient -from pydantic.type_adapter import TypeAdapter import pytest -from app.domain.models import AgenticGroundTruthEntry from app.adapters.repos.cosmos_repo import CosmosGroundTruthRepo @@ -26,10 +24,10 @@ def make_unassigned_item(dataset: str, item_id: str | None = None) -> dict[str, "datasetName": dataset, "bucket": str(UUID("00000000-0000-0000-0000-000000000000")), "status": "draft", - "samplingBucket": 0, - "synthQuestion": f"Question about {uuid4().hex[:4]}?", + "history": [ + {"role": "user", "msg": f"Question about {uuid4().hex[:4]}?"}, + ], "assignedTo": None, - "refs": [], "manualTags": ["source:synthetic", "split:test"], } @@ -110,16 +108,14 @@ async def test_skipped_items_excluded_from_user_resampling( "/v1/assignments/self-serve", json={"limit": 2}, headers=user_headers ) assert r.status_code == 200 - first_batch = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python( - r.json().get("assigned") or [] - ) + first_batch = r.json().get("assigned") or [] assert len(first_batch) == 2 # Skip one item skipped_item = first_batch[0] r = await async_client.put( - f"/v1/ground-truths/{dataset}/{skipped_item.bucket}/{skipped_item.id}", - json={"status": "skipped", "etag": skipped_item.etag}, + f"/v1/ground-truths/{dataset}/{skipped_item['bucket']}/{skipped_item['id']}", + json={"status": "skipped", "etag": skipped_item["_etag"]}, headers=user_headers, ) assert r.status_code == 200 @@ -129,19 +125,17 @@ async def test_skipped_items_excluded_from_user_resampling( "/v1/assignments/self-serve", json={"limit": 3}, headers=user_headers ) assert r.status_code == 200 - second_batch = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python( - r.json().get("assigned") or [] - ) + second_batch = r.json().get("assigned") or [] assert len(second_batch) == 3 - second_batch_ids = {item.id for item in second_batch} + second_batch_ids = {item["id"] for item in second_batch} 
non_skipped_item = first_batch[1] # Core assertions: skipped item not returned, non-skipped item is returned - assert skipped_item.id not in second_batch_ids, "Bug: Skipped item was resampled" - assert non_skipped_item.id in second_batch_ids, "Non-skipped item should be included" + assert skipped_item["id"] not in second_batch_ids, "Bug: Skipped item was resampled" + assert non_skipped_item["id"] in second_batch_ids, "Non-skipped item should be included" # Should have 2 new items (not from first batch) - first_batch_ids = {item.id for item in first_batch} + first_batch_ids = {item["id"] for item in first_batch} new_items = second_batch_ids - first_batch_ids assert len(new_items) == 2 diff --git a/backend/tests/integration/test_assignments_skipped_reassign_cosmos.py b/backend/tests/integration/test_assignments_skipped_reassign_cosmos.py index a19faf9..57f6c91 100644 --- a/backend/tests/integration/test_assignments_skipped_reassign_cosmos.py +++ b/backend/tests/integration/test_assignments_skipped_reassign_cosmos.py @@ -5,11 +5,8 @@ from datetime import datetime, timezone from httpx import AsyncClient -from pydantic.type_adapter import TypeAdapter import pytest -from app.domain.models import AgenticGroundTruthEntry - def make_skipped_item(dataset: str, assigned_to: str) -> dict[str, Any]: return { @@ -18,12 +15,12 @@ def make_skipped_item(dataset: str, assigned_to: str) -> dict[str, Any]: # Use NIL UUID for explicit bucket to keep PK simple in tests "bucket": str(UUID("00000000-0000-0000-0000-000000000000")), "status": "skipped", - "samplingBucket": 0, - "synthQuestion": "Q?", + "history": [ + {"role": "user", "msg": "Q?"}, + ], # Simulate a prior assignment to another SME "assignedTo": assigned_to, "assignedAt": datetime.now(timezone.utc).isoformat(), - "refs": [], "manualTags": ["source:synthetic", "split:validation"], } @@ -48,23 +45,21 @@ async def test_self_serve_reassigns_skipped_and_lists_in_my( payload = cast(dict[str, Any], r.json()) assert payload.get("assignedCount") == 1 - assigned_items = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python( - payload.get("assigned") or [] - ) + assigned_items = cast(list[dict[str, Any]], payload.get("assigned") or []) assert len(assigned_items) == 1 gt = assigned_items[0] # After assignment, item should be assigned to current user and status should be draft # In integration tests, the effective user id comes from Easy Auth principal (tester@example.com) expected_user = "tester@example.com" - assert gt.assignedTo == expected_user - assert gt.status.value == "draft" + assert gt["assignedTo"] == expected_user + assert gt["status"] == "draft" # /my should list the item now (since it filters by assignedTo == user and status == draft) r = await async_client.get("/v1/assignments/my", headers=user_headers) assert r.status_code == 200 - my_items = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python(r.json()) + my_items = cast(list[dict[str, Any]], r.json()) assert len(my_items) == 1 - assert my_items[0].id == gt.id - assert my_items[0].assignedTo == expected_user - assert my_items[0].status.value == "draft" + assert my_items[0]["id"] == gt["id"] + assert my_items[0]["assignedTo"] == expected_user + assert my_items[0]["status"] == "draft" diff --git a/backend/tests/integration/test_bucket_assignment_cosmos.py b/backend/tests/integration/test_bucket_assignment_cosmos.py index def1be9..a176c92 100644 --- a/backend/tests/integration/test_bucket_assignment_cosmos.py +++ b/backend/tests/integration/test_bucket_assignment_cosmos.py @@ -7,7 +7,9 @@ 
def make_item(dataset: str, with_bucket: bool = False, bucket: str | None = None item = { "id": str(uuid.uuid4()), "datasetName": dataset, - "synthQuestion": "Q?", + "history": [ + {"role": "user", "msg": "Q?"}, + ], } if with_bucket: item["bucket"] = bucket or str(uuid.uuid4()) diff --git a/backend/tests/integration/test_bulk_import_validation.py b/backend/tests/integration/test_bulk_import_validation.py index 3f08522..9999173 100644 --- a/backend/tests/integration/test_bulk_import_validation.py +++ b/backend/tests/integration/test_bulk_import_validation.py @@ -15,13 +15,17 @@ async def test_bulk_import_with_valid_items_passes( { "id": "", "datasetName": "test-dataset", - "synthQuestion": "What is the capital of France?", - "refs": [{"url": "https://example.com", "content": "Paris info"}], + "history": [ + {"role": "user", "msg": "What is the capital of France?"}, + {"role": "assistant", "msg": "Paris."}, + ], }, { "id": "", "datasetName": "test-dataset", - "synthQuestion": "How does gravity work?", + "history": [ + {"role": "user", "msg": "How does gravity work?"}, + ], }, ] @@ -53,18 +57,23 @@ async def test_bulk_import_filters_invalid_items( { "id": "valid-1", "datasetName": "test-dataset", - "synthQuestion": "This is a valid question that meets length requirements?", + "history": [ + {"role": "user", "msg": "This is a valid question that meets length requirements?"}, + ], }, { "id": "valid-2", "datasetName": "test-dataset", - "synthQuestion": "Another valid question that is long enough?", + "history": [ + {"role": "user", "msg": "Another valid question that is long enough?"}, + ], }, { - "id": "invalid-url", + "id": "invalid-history", "datasetName": "test-dataset", - "synthQuestion": "Question with bad reference URL?", - "refs": [{"url": ""}], + "history": [ + {"role": "user", "msg": ""}, + ], }, ] @@ -78,5 +87,5 @@ async def test_bulk_import_filters_invalid_items( data = response.json() # Check that errors mention validation issues - error_text = data["detail"][0]["msg"] - assert "Reference URL cannot be empty" in error_text or "invalid-url" in error_text + details = data.get("detail") or [] + assert any("history fields cannot be empty" in err.get("msg", "") for err in details) diff --git a/backend/tests/integration/test_datasets_api.py b/backend/tests/integration/test_datasets_api.py index 3110c00..9550134 100644 --- a/backend/tests/integration/test_datasets_api.py +++ b/backend/tests/integration/test_datasets_api.py @@ -9,7 +9,9 @@ def make_gt_item(dataset: str, *, bucket: str | None = None) -> dict: "id": str(uuid.uuid4()), "datasetName": dataset, "bucket": bucket or str(uuid.UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": f"Question for {dataset}?", + "history": [ + {"role": "user", "msg": f"Question for {dataset}?"}, + ], "docType": "ground-truth-item", } diff --git a/backend/tests/integration/test_etag_and_refs_cosmos.py b/backend/tests/integration/test_etag_and_refs_cosmos.py index ca29ecd..5985fec 100644 --- a/backend/tests/integration/test_etag_and_refs_cosmos.py +++ b/backend/tests/integration/test_etag_and_refs_cosmos.py @@ -8,8 +8,9 @@ def make_item(dataset: str) -> dict: "id": str(uuid.uuid4()), "datasetName": dataset, "bucket": str(uuid.UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": "Q?", - "samplingBucket": 0, + "history": [ + {"role": "user", "msg": "Q?"}, + ], "assignedTo": None, } @@ -32,7 +33,9 @@ async def test_sme_update_requires_etag_and_includes_updated_etag( # Try SME update without ETag -> 412 r = await async_client.put( - 
f"/v1/assignments/{ds}/{bucket}/{item['id']}", json={"answer": "A1"}, headers=user_headers + f"/v1/assignments/{ds}/{bucket}/{item['id']}", + json={"history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "A1"}]}, + headers=user_headers, ) assert r.status_code == 412 @@ -43,11 +46,14 @@ async def test_sme_update_requires_etag_and_includes_updated_etag( headers = dict(user_headers) headers.update({"If-Match": etag}) r = await async_client.put( - f"/v1/assignments/{ds}/{bucket}/{item['id']}", json={"answer": "A2"}, headers=headers + f"/v1/assignments/{ds}/{bucket}/{item['id']}", + json={"history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "A2"}]}, + headers=headers, ) assert r.status_code == 200 body = r.json() - assert body.get("answer") == "A2" + history = body.get("history") or [] + assert any(turn.get("role") == "assistant" and turn.get("msg") == "A2" for turn in history) assert body.get("_etag") and isinstance(body["_etag"], str) @@ -73,7 +79,9 @@ async def test_sme_etag_mismatch_returns_412(async_client: AsyncClient, user_hea headers = dict(user_headers) headers.update({"If-Match": etag1}) r = await async_client.put( - f"/v1/assignments/{ds}/{bucket}/{item['id']}", json={"answer": "v1"}, headers=headers + f"/v1/assignments/{ds}/{bucket}/{item['id']}", + json={"history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "v1"}]}, + headers=headers, ) assert r.status_code == 200 new_etag = r.json().get("_etag") @@ -83,14 +91,16 @@ async def test_sme_etag_mismatch_returns_412(async_client: AsyncClient, user_hea headers_stale = dict(user_headers) headers_stale.update({"If-Match": etag1}) r = await async_client.put( - f"/v1/assignments/{ds}/{bucket}/{item['id']}", json={"answer": "v2"}, headers=headers_stale + f"/v1/assignments/{ds}/{bucket}/{item['id']}", + json={"history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "v2"}]}, + headers=headers_stale, ) assert r.status_code == 412 @pytest.mark.anyio -async def test_curator_put_refs_with_etag(async_client: AsyncClient, user_headers): - ds = "test-curator-refs" +async def test_curator_put_plugins_with_etag(async_client: AsyncClient, user_headers): + ds = "test-curator-plugins" item = make_item(ds) bucket = item["bucket"] r = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) @@ -103,15 +113,28 @@ async def test_curator_put_refs_with_etag(async_client: AsyncClient, user_header headers = dict(user_headers) headers.update({"If-Match": etag}) - refs = [ - {"url": "https://example.com/a", "content": "alpha"}, - {"url": "https://example.com/b", "keyExcerpt": "beta"}, - ] - payload = {"refs": refs, "answer": "Ans"} + plugin_data = { + "score": 0.87, + "notes": ["source-a", "source-b"], + } + payload = { + "history": [ + {"role": "user", "msg": "Q?"}, + {"role": "assistant", "msg": "Ans"}, + ], + "plugins": { + "test-pack": { + "kind": "test-pack", + "version": "1.0", + "data": plugin_data, + } + }, + } r = await async_client.put( f"/v1/ground-truths/{ds}/{bucket}/{item['id']}", json=payload, headers=headers ) assert r.status_code == 200 body = r.json() - assert body.get("refs") and isinstance(body["refs"], list) and len(body["refs"]) == 2 + persisted_data = body.get("plugins", {}).get("test-pack", {}).get("data") + assert persisted_data == plugin_data assert body.get("_etag") and isinstance(body["_etag"], str) diff --git a/backend/tests/integration/test_ground_truths_cosmos.py b/backend/tests/integration/test_ground_truths_cosmos.py index f3ac5c6..c1b75e9 
100644 --- a/backend/tests/integration/test_ground_truths_cosmos.py +++ b/backend/tests/integration/test_ground_truths_cosmos.py @@ -11,7 +11,9 @@ def make_item(dataset: str) -> dict: "datasetName": dataset, # Use NIL UUID for explicit bucket in tests "bucket": str(uuid.UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": "What is the capital of France?", + "history": [ + {"role": "user", "msg": "What is the capital of France?"}, + ], } @@ -59,13 +61,22 @@ async def test_update_with_etag(async_client: AsyncClient, user_headers): # update with If-Match header headers = dict(user_headers) headers.update({"If-Match": etag}) - payload = {"answer": "Paris", "status": "approved"} + payload = { + "history": [ + {"role": "user", "msg": "What is the capital of France?"}, + {"role": "assistant", "msg": "Paris"}, + ], + "status": "approved", + } r = await async_client.put( f"/v1/ground-truths/{dataset}/{bucket}/{item['id']}", json=payload, headers=headers ) assert r.status_code == 200 res = r.json() - assert res["answer"] == "Paris" + assert any( + turn.get("role") == "assistant" and turn.get("msg") == "Paris" + for turn in (res.get("history") or []) + ) assert res["status"] == GroundTruthStatus.approved.value @@ -142,7 +153,13 @@ async def test_snapshot_and_stats(async_client: AsyncClient, user_headers): bucket = data[0]["bucket"] headers = dict(user_headers) headers.update({"If-Match": etag}) - payload = {"answer": "Paris", "status": "approved"} + payload = { + "history": [ + {"role": "user", "msg": "What is the capital of France?"}, + {"role": "assistant", "msg": "Paris"}, + ], + "status": "approved", + } r = await async_client.put( f"/v1/ground-truths/{dataset}/{bucket}/{item['id']}", json=payload, headers=headers ) @@ -165,8 +182,9 @@ async def test_snapshot_and_stats(async_client: AsyncClient, user_headers): async def test_import_with_approve_flag(async_client: AsyncClient, user_headers): dataset = "test-approve-on-import" - # Item WITHOUT history: approval validation should reject it + # Item WITHOUT assistant response: approval validation should reject it invalid_item = make_item(dataset) + invalid_item["history"] = [{"role": "user", "msg": "What is the capital of France?"}] r = await async_client.post( "/v1/ground-truths?approve=true", json=[invalid_item], headers=user_headers ) diff --git a/backend/tests/integration/test_ground_truths_delete_restore_etag_cosmos.py b/backend/tests/integration/test_ground_truths_delete_restore_etag_cosmos.py index 58ef555..dd63660 100644 --- a/backend/tests/integration/test_ground_truths_delete_restore_etag_cosmos.py +++ b/backend/tests/integration/test_ground_truths_delete_restore_etag_cosmos.py @@ -13,9 +13,10 @@ def make_item(dataset: str, item_id: str) -> dict[str, Any]: "datasetName": dataset, # Use NIL UUID so tests don't depend on bucket assignment logic "bucket": str(UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": "How do I reset my password?", - "answer": "Use the reset link", - "refs": [], + "history": [ + {"role": "user", "msg": "How do I reset my password?"}, + {"role": "assistant", "msg": "Use the reset link"}, + ], "manualTags": [ "source:synthetic", "split:validation", diff --git a/backend/tests/integration/test_ground_truths_etag_errors_cosmos.py b/backend/tests/integration/test_ground_truths_etag_errors_cosmos.py index 58411f5..d2f02a1 100644 --- a/backend/tests/integration/test_ground_truths_etag_errors_cosmos.py +++ b/backend/tests/integration/test_ground_truths_etag_errors_cosmos.py @@ -12,9 +12,10 @@ def 
make_item(dataset: str, item_id: str) -> dict[str, Any]: "id": item_id, "datasetName": dataset, "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "How do I reset my password?", - "answer": "Use the reset link", - "refs": [], + "history": [ + {"role": "user", "msg": "How do I reset my password?"}, + {"role": "assistant", "msg": "Use the reset link"}, + ], "manualTags": [ "source:synthetic", "split:validation", diff --git a/backend/tests/integration/test_ground_truths_explorer.py b/backend/tests/integration/test_ground_truths_explorer.py index d843f10..90003f3 100644 --- a/backend/tests/integration/test_ground_truths_explorer.py +++ b/backend/tests/integration/test_ground_truths_explorer.py @@ -29,9 +29,10 @@ def build_item( "datasetName": dataset, "bucket": str(uuid4()), "status": status, - "synthQuestion": f"Question {idx}", - "answer": answer, - "refs": [], + "history": [ + {"role": "user", "msg": f"Question {idx}"}, + *([{"role": "assistant", "msg": answer}] if answer else []), + ], "manualTags": tags or ["source:sme"], "reviewedAt": reviewed.isoformat(), "updatedAt": updated.isoformat(), diff --git a/backend/tests/integration/test_ground_truths_get_and_filters_cosmos.py b/backend/tests/integration/test_ground_truths_get_and_filters_cosmos.py index 3e9712f..9d63728 100644 --- a/backend/tests/integration/test_ground_truths_get_and_filters_cosmos.py +++ b/backend/tests/integration/test_ground_truths_get_and_filters_cosmos.py @@ -2,22 +2,20 @@ from typing import Any, Optional, cast -from pydantic import TypeAdapter import pytest from uuid import uuid4 from httpx import AsyncClient -from app.domain.models import AgenticGroundTruthEntry - def make_item(dataset: str, *, gid: Optional[str] = None) -> dict[str, Any]: return { "id": gid or f"gt-{uuid4().hex[:8]}", "datasetName": dataset, "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "What is the capital of France?", - "answer": "Paris", - "refs": [], + "history": [ + {"role": "user", "msg": "What is the capital of France?"}, + {"role": "assistant", "msg": "Paris"}, + ], "manualTags": [ "source:synthetic", "split:validation", @@ -50,9 +48,9 @@ async def test_get_item_200_and_404(async_client: AsyncClient, user_headers: dic f"/v1/ground-truths/{dataset}/{bucket}/gt-200", headers=user_headers ) assert res.status_code == 200 - gt_item = TypeAdapter(AgenticGroundTruthEntry).validate_python(res.json()) - assert gt_item.id == "gt-200" - assert gt_item.etag + gt_item = cast(dict[str, Any], res.json()) + assert gt_item.get("id") == "gt-200" + assert gt_item.get("_etag") # 404 for missing res = await async_client.get( diff --git a/backend/tests/integration/test_ground_truths_id_search.py b/backend/tests/integration/test_ground_truths_id_search.py index efd089a..23c78da 100644 --- a/backend/tests/integration/test_ground_truths_id_search.py +++ b/backend/tests/integration/test_ground_truths_id_search.py @@ -8,8 +8,6 @@ from httpx import AsyncClient from uuid import uuid4 -from app.domain.models import GroundTruthListResponse - def make_item(dataset: str, gid: str | None = None) -> dict: """Helper to create a minimal ground truth item for testing.""" @@ -17,7 +15,9 @@ def make_item(dataset: str, gid: str | None = None) -> dict: "id": gid or f"test-{uuid4().hex[:8]}", "datasetName": dataset, "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", + "history": [ + {"role": "user", "msg": "Test question?"}, + ], } @@ -39,9 +39,9 @@ async def test_list_ground_truths_search_by_id_exact_match( 
"/v1/ground-truths", params={"itemId": item_id}, headers=user_headers ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].id == item_id + response_data = res.json() + assert len(response_data["items"]) == 1 + assert response_data["items"][0]["id"] == item_id @pytest.mark.anyio @@ -66,9 +66,9 @@ async def test_list_ground_truths_search_by_id_partial_match( "/v1/ground-truths", params={"itemId": unique}, headers=user_headers ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 - found_ids = {item.id for item in response_data.items} + response_data = res.json() + assert len(response_data["items"]) == 2 + found_ids = {item["id"] for item in response_data["items"]} assert f"{unique}-suffix1" in found_ids assert f"{unique}-end" in found_ids @@ -90,9 +90,9 @@ async def test_list_ground_truths_search_by_id_with_whitespace_trimming( "/v1/ground-truths", params={"itemId": f" {item_id} "}, headers=user_headers ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].id == item_id + response_data = res.json() + assert len(response_data["items"]) == 1 + assert response_data["items"][0]["id"] == item_id @pytest.mark.anyio @@ -112,9 +112,9 @@ async def test_list_ground_truths_search_by_id_whitespace_only_returns_all( "/v1/ground-truths", params={"itemId": " ", "dataset": dataset}, headers=user_headers ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) + response_data = res.json() # Should return all items from dataset (whitespace-only treated as omitted) - assert len(response_data.items) >= 3 + assert len(response_data["items"]) >= 3 @pytest.mark.anyio @@ -141,10 +141,10 @@ async def test_list_ground_truths_search_by_id_combined_with_other_filters( headers=user_headers, ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) + response_data = res.json() # Should find the 2 items with matching ID in this dataset - assert len(response_data.items) == 2 - assert all(unique in item.id for item in response_data.items) + assert len(response_data["items"]) == 2 + assert all(unique in item["id"] for item in response_data["items"]) @pytest.mark.anyio @@ -165,8 +165,8 @@ async def test_list_ground_truths_search_by_id_empty_results( headers=user_headers, ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 0 + response_data = res.json() + assert len(response_data["items"]) == 0 @pytest.mark.anyio @@ -189,10 +189,10 @@ async def test_list_ground_truths_search_by_id_pagination( headers=user_headers, ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 - assert response_data.pagination.has_next is True - assert response_data.pagination.total == 5 + response_data = res.json() + assert len(response_data["items"]) == 2 + assert response_data["pagination"]["hasNext"] is True + assert response_data["pagination"]["total"] == 5 @pytest.mark.anyio @@ -225,5 +225,5 @@ async def test_list_ground_truths_search_by_id_no_param_returns_all( "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers ) assert res.status_code == 200 - 
response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) >= 3 + response_data = res.json() + assert len(response_data["items"]) >= 3 diff --git a/backend/tests/integration/test_ground_truths_import_conflicts_cosmos.py b/backend/tests/integration/test_ground_truths_import_conflicts_cosmos.py index 9a5b643..c04b5f9 100644 --- a/backend/tests/integration/test_ground_truths_import_conflicts_cosmos.py +++ b/backend/tests/integration/test_ground_truths_import_conflicts_cosmos.py @@ -12,9 +12,10 @@ def make_item(dataset: str, item_id: str) -> dict[str, Any]: "id": item_id, "datasetName": dataset, "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "What is the capital of France?", - "answer": "Paris", - "refs": [], + "history": [ + {"role": "user", "msg": "What is the capital of France?"}, + {"role": "assistant", "msg": "Paris"}, + ], "manualTags": [ "source:synthetic", "split:validation", diff --git a/backend/tests/integration/test_ground_truths_reference_count.py b/backend/tests/integration/test_ground_truths_reference_count.py deleted file mode 100644 index c21317d..0000000 --- a/backend/tests/integration/test_ground_truths_reference_count.py +++ /dev/null @@ -1,260 +0,0 @@ -"""Integration tests for totalReferences computed field on GroundTruthItem.""" - -import pytest -from httpx import AsyncClient -from uuid import uuid4 - -from app.domain.models import GroundTruthListResponse - - -@pytest.mark.anyio -async def test_ground_truth_item_includes_total_references_field( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Verify totalReferences field exists in response.""" - dataset = f"ref-count-exists-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert hasattr(response_data.items[0], "totalReferences") - assert response_data.items[0].totalReferences == 0 # No refs yet - - -@pytest.mark.anyio -async def test_total_references_counts_item_level_refs_only( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with 3 item-level refs, no history → totalReferences=3.""" - dataset = f"ref-count-item-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://example.com/1"}, - {"url": "https://example.com/2"}, - {"url": "https://example.com/3"}, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].totalReferences == 3 - - -@pytest.mark.anyio -async def test_total_references_counts_history_level_refs_only( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with no item-level 
refs, 2 history turns with refs → correct count.""" - dataset = f"ref-count-history-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "history": [ - { - "role": "user", - "msg": "First question", - "refs": None, - }, - { - "role": "assistant", - "msg": "First answer", - "refs": [ - {"url": "https://example.com/turn1-ref1"}, - {"url": "https://example.com/turn1-ref2"}, - ], - }, - { - "role": "user", - "msg": "Follow-up question", - "refs": None, - }, - { - "role": "assistant", - "msg": "Follow-up answer", - "refs": [ - {"url": "https://example.com/turn2-ref1"}, - ], - }, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - # 2 refs from first assistant turn + 1 ref from second assistant turn = 3 total - assert response_data.items[0].totalReferences == 3 - - -@pytest.mark.anyio -async def test_total_references_counts_both_levels( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with item-level refs + history turn refs → only history turn refs counted.""" - dataset = f"ref-count-both-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://example.com/item-ref1"}, - {"url": "https://example.com/item-ref2"}, - ], - "history": [ - { - "role": "user", - "msg": "Question", - "refs": None, - }, - { - "role": "assistant", - "msg": "Answer", - "refs": [ - {"url": "https://example.com/history-ref1"}, - {"url": "https://example.com/history-ref2"}, - {"url": "https://example.com/history-ref3"}, - ], - }, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - # 2 item-level refs , 3 history refs ignore the item-level refs = 3 total - assert response_data.items[0].totalReferences == 3 - - -@pytest.mark.anyio -async def test_total_references_zero_when_no_refs( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with empty refs and no history → totalReferences=0.""" - dataset = f"ref-count-zero-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].totalReferences == 0 - - -@pytest.mark.anyio -async def test_total_references_multiple_items_independent( - async_client: AsyncClient, user_headers: 
dict[str, str] -): - """Multiple items each have correct independent counts.""" - dataset = f"ref-count-multi-{uuid4().hex[:6]}" - - items = [ - { - "id": f"item1-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Question 1?", - "refs": [{"url": "https://example.com/1"}], - }, - { - "id": f"item2-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Question 2?", - "refs": [ - {"url": "https://example.com/2a"}, - {"url": "https://example.com/2b"}, - ], - "history": [ - {"role": "user", "msg": "Follow up"}, - { - "role": "assistant", - "msg": "Answer", - "refs": [{"url": "https://example.com/2c"}], - }, - ], - }, - { - "id": f"item3-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Question 3?", - # No refs - }, - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 3 - - # Find items by checking synthQuestion to verify independent counts - items_by_question = {item.synth_question: item for item in response_data.items} - - assert items_by_question["Question 1?"].totalReferences == 1 # 1 item-level ref - assert ( - items_by_question["Question 2?"].totalReferences == 1 - ) # 2 item-level , 1 history ref then count only history = 1 - assert items_by_question["Question 3?"].totalReferences == 0 # No refs diff --git a/backend/tests/integration/test_ground_truths_reference_search.py b/backend/tests/integration/test_ground_truths_reference_search.py deleted file mode 100644 index 7a6176f..0000000 --- a/backend/tests/integration/test_ground_truths_reference_search.py +++ /dev/null @@ -1,492 +0,0 @@ -"""Integration tests for reference URL search on GET /v1/ground-truths endpoint.""" - -import pytest -from httpx import AsyncClient -from uuid import uuid4 - -from app.domain.models import GroundTruthListResponse - - -@pytest.mark.anyio -async def test_ref_url_search_matches_item_level_refs( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with ref url containing search term → returns item.""" - dataset = f"ref-search-item-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://example.com/page1"}, - {"url": "https://docs.example.com/guide"}, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - # Search for "page1" should find the item - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset, "refUrl": "page1"}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].id == item["id"] - - -@pytest.mark.anyio -async def test_ref_url_search_matches_history_level_refs( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with history turn refs containing search term → returns item.""" - dataset = f"ref-search-history-{uuid4().hex[:6]}" - - item = { - "id": 
f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "history": [ - { - "role": "user", - "msg": "User question", - }, - { - "role": "assistant", - "msg": "Assistant response", - "refs": [ - {"url": "https://docs.example.com/article/123"}, - {"url": "https://support.example.com/kb/456"}, - ], - }, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - # Search for "article" should find the item - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset, "refUrl": "article"}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].id == item["id"] - - -@pytest.mark.anyio -async def test_ref_url_search_matches_both_levels( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with item-level and history-level refs → search matches either.""" - dataset = f"ref-search-both-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://foo.com/bar"}, - ], - "history": [ - { - "role": "assistant", - "msg": "Response", - "refs": [ - {"url": "https://baz.com/bar"}, - ], - }, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - # Search for "bar" should find the item (matches both levels) - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset, "refUrl": "bar"}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].id == item["id"] - - -@pytest.mark.anyio -async def test_ref_url_search_case_sensitive( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Reference URL search is case-sensitive (Cosmos CONTAINS behavior).""" - dataset = f"ref-search-case-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://Example.COM/Page"}, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - # Search for lowercase "example.com" should NOT find the item (case-sensitive) - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "example.com"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 0 - - # Search for exact case "Example.COM" should find it - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "Example.COM"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - - -@pytest.mark.anyio -async def test_ref_url_search_partial_match( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Reference URL search supports partial matching.""" - dataset = f"ref-search-partial-{uuid4().hex[:6]}" - - item 
= { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://docs.example.com/guide/introduction"}, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - # Search for domain portion - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "docs.example"}, - headers=user_headers, - ) - assert res.status_code == 200 - assert len(GroundTruthListResponse.model_validate(res.json()).items) == 1 - - # Search for path portion - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset, "refUrl": "/guide"}, headers=user_headers - ) - assert res.status_code == 200 - assert len(GroundTruthListResponse.model_validate(res.json()).items) == 1 - - # Search for non-matching substring - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "nonexistent"}, - headers=user_headers, - ) - assert res.status_code == 200 - assert len(GroundTruthListResponse.model_validate(res.json()).items) == 0 - - -@pytest.mark.anyio -async def test_ref_url_search_no_matches(async_client: AsyncClient, user_headers: dict[str, str]): - """Search with no matching refs returns empty list.""" - dataset = f"ref-search-nomatch-{uuid4().hex[:6]}" - - items = [ - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Question 1", - "refs": [{"url": "https://foo.com/1"}], - }, - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Question 2", - "refs": [{"url": "https://bar.com/2"}], - }, - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Search for non-existent URL - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "nonexistent-url"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 0 - - -@pytest.mark.anyio -async def test_ref_url_search_multiple_refs_per_item( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with multiple refs, only one matches → search finds item.""" - dataset = f"ref-search-multi-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://foo.com/1"}, - {"url": "https://bar.com/2"}, - {"url": "https://baz.com/3"}, - {"url": "https://example.com/matching-url"}, - {"url": "https://qux.com/5"}, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - # Search for the matching URL - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "matching-url"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].id == item["id"] - - -@pytest.mark.anyio -async def test_ref_url_search_combined_with_other_filters( - async_client: AsyncClient, user_headers: dict[str, str] -): - """refUrl filter works 
together with dataset and status filters.""" - dataset1 = f"ref-search-combined1-{uuid4().hex[:6]}" - dataset2 = f"ref-search-combined2-{uuid4().hex[:6]}" - - items = [ - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset1, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q1", - "status": "draft", - "refs": [{"url": "https://example.com/doc"}], - }, - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset1, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q2", - "status": "approved", - "refs": [{"url": "https://example.com/doc"}], - }, - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset2, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q3", - "status": "draft", - "refs": [{"url": "https://example.com/doc"}], - }, - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Filter by dataset + refUrl → should get 2 items from dataset1 - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset1, "refUrl": "example.com"}, - headers=user_headers, - ) - assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 - - # Filter by dataset + status + refUrl → should get 1 item (approved in dataset1) - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset1, "status": "approved", "refUrl": "example.com"}, - headers=user_headers, - ) - assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].status.value == "approved" - - -@pytest.mark.anyio -async def test_ref_url_search_with_pagination( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Reference URL search works correctly with pagination.""" - dataset = f"ref-search-page-{uuid4().hex[:6]}" - - # Create 15 items with matching refs - items = [ - { - "id": f"test-{i:03d}-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": f"Question {i}", - "refs": [{"url": f"https://example.com/doc/{i}"}], - } - for i in range(15) - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Get first page with limit=10 - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "example.com", "page": 1, "limit": 10}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 10 - assert response_data.pagination.total == 15 - assert response_data.pagination.page == 1 - assert response_data.pagination.has_next is True - assert response_data.pagination.has_prev is False - - # Get second page - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "example.com", "page": 2, "limit": 10}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 5 - assert response_data.pagination.total == 15 - assert response_data.pagination.page == 2 - assert response_data.pagination.has_next is False - assert response_data.pagination.has_prev is True - - -@pytest.mark.anyio -async def test_ref_url_search_empty_string_ignored( - async_client: 
AsyncClient, user_headers: dict[str, str] -): - """Empty or whitespace refUrl behaves like no filter.""" - dataset = f"ref-search-empty-{uuid4().hex[:6]}" - - items = [ - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q1", - "refs": [{"url": "https://foo.com"}], - }, - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q2", - "refs": [{"url": "https://bar.com"}], - }, - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Empty string - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset, "refUrl": ""}, headers=user_headers - ) - assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 # Returns all items - - # Whitespace only - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset, "refUrl": " "}, headers=user_headers - ) - assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 # Returns all items - - -@pytest.mark.anyio -async def test_ref_url_search_omitted_parameter( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Request without refUrl parameter returns all items.""" - dataset = f"ref-search-omit-{uuid4().hex[:6]}" - - items = [ - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q1", - "refs": [{"url": "https://foo.com"}], - }, - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q2", - "refs": [{"url": "https://bar.com"}], - }, - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Omit refUrl parameter entirely - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 # Returns all items - - -@pytest.mark.anyio -async def test_ref_url_search_too_long_returns_400( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Test that refUrl longer than 500 characters returns 400 error.""" - long_url = "https://example.com/" + "x" * 500 # >500 characters total - - res = await async_client.get( - "/v1/ground-truths", params={"refUrl": long_url}, headers=user_headers - ) - assert res.status_code == 400 - assert "500 characters" in res.json()["detail"] diff --git a/backend/tests/integration/test_ground_truths_sort_total_references.py b/backend/tests/integration/test_ground_truths_sort_total_references.py deleted file mode 100644 index 513186c..0000000 --- a/backend/tests/integration/test_ground_truths_sort_total_references.py +++ /dev/null @@ -1,442 +0,0 @@ -"""Integration tests for sorting by totalReferences field (SA-369). - -Tests the database-level sorting by totalReferences, verifying that: -1. The sortBy=totalReferences parameter is accepted by the API -2. Sorting works correctly in ascending and descending order -3. Sorting handles edge cases (items with 0 refs, history vs item-level refs) -4. 
Pagination works correctly with totalReferences sorting -""" - -import pytest -from httpx import AsyncClient -from uuid import uuid4 - -from app.domain.models import GroundTruthListResponse - - -def make_item_with_refs( - dataset: str, - item_id: str, - *, - item_refs_count: int = 0, - history_refs_counts: list[int] | None = None, -) -> dict: - """Create a test item with specified reference counts. - - Args: - dataset: Dataset name - item_id: Item ID - item_refs_count: Number of item-level refs - history_refs_counts: List of ref counts per history turn (None = no history) - - Returns: - Item dict for API submission - """ - item: dict = { - "id": item_id, - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": f"Question for {item_id}?", - } - - # Add item-level refs - if item_refs_count > 0: - item["refs"] = [ - {"url": f"https://example.com/{item_id}/item-ref-{i}"} for i in range(item_refs_count) - ] - - # Add history with refs if specified - if history_refs_counts: - history = [] - for turn_idx, ref_count in enumerate(history_refs_counts): - # User turn (no refs) - history.append( - { - "role": "user", - "msg": f"Turn {turn_idx} question", - } - ) - # Assistant turn with refs - turn: dict = { - "role": "assistant", - "msg": f"Turn {turn_idx} answer", - } - if ref_count > 0: - turn["refs"] = [ - {"url": f"https://example.com/{item_id}/turn{turn_idx}-ref-{i}"} - for i in range(ref_count) - ] - history.append(turn) - item["history"] = history - - return item - - -@pytest.mark.anyio -async def test_sort_by_total_references_descending( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Sort by totalReferences DESC returns items with most refs first.""" - dataset = f"sort-refs-desc-{uuid4().hex[:6]}" - - items = [ - make_item_with_refs(dataset, "item-0-refs", item_refs_count=0), - make_item_with_refs(dataset, "item-3-refs", item_refs_count=3), - make_item_with_refs(dataset, "item-1-ref", item_refs_count=1), - make_item_with_refs(dataset, "item-5-refs", item_refs_count=5), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "desc"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 4 - - # Verify descending order (most refs first) - ref_counts = [item.totalReferences for item in response_data.items] - assert ref_counts == [5, 3, 1, 0], f"Expected descending order, got {ref_counts}" - - -@pytest.mark.anyio -async def test_sort_by_total_references_ascending( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Sort by totalReferences ASC returns items with fewest refs first.""" - dataset = f"sort-refs-asc-{uuid4().hex[:6]}" - - items = [ - make_item_with_refs(dataset, "item-3-refs", item_refs_count=3), - make_item_with_refs(dataset, "item-0-refs", item_refs_count=0), - make_item_with_refs(dataset, "item-5-refs", item_refs_count=5), - make_item_with_refs(dataset, "item-1-ref", item_refs_count=1), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "asc"}, - headers=user_headers, - ) - assert res.status_code == 200 - 
- response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 4 - - # Verify ascending order (fewest refs first) - ref_counts = [item.totalReferences for item in response_data.items] - assert ref_counts == [0, 1, 3, 5], f"Expected ascending order, got {ref_counts}" - - -@pytest.mark.anyio -async def test_sort_by_total_references_with_history_refs( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Sort works correctly when refs come from history turns.""" - dataset = f"sort-refs-history-{uuid4().hex[:6]}" - - items = [ - # Item with only item-level refs (2 refs) - make_item_with_refs(dataset, "item-level-2", item_refs_count=2), - # Item with only history refs (3 refs across turns) - make_item_with_refs(dataset, "history-3", history_refs_counts=[1, 2]), - # Item with no refs (0 refs) - make_item_with_refs(dataset, "no-refs", item_refs_count=0), - # Item with history refs overriding item refs (history: 4 refs) - make_item_with_refs( - dataset, "history-4-override", item_refs_count=10, history_refs_counts=[2, 2] - ), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "desc"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 4 - - # Expected order: history-4-override (4), history-3 (3), item-level-2 (2), no-refs (0) - item_ids = [item.id for item in response_data.items] - ref_counts = [item.totalReferences for item in response_data.items] - - assert ref_counts == [4, 3, 2, 0], f"Expected [4, 3, 2, 0], got {ref_counts}" - assert item_ids[0] == "history-4-override" - assert item_ids[1] == "history-3" - assert item_ids[2] == "item-level-2" - assert item_ids[3] == "no-refs" - - -@pytest.mark.anyio -async def test_sort_by_total_references_stable_pagination( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Pagination is stable when sorting by totalReferences.""" - dataset = f"sort-refs-pagination-{uuid4().hex[:6]}" - - # Create 6 items: 2 with 3 refs, 2 with 1 ref, 2 with 0 refs - items = [ - make_item_with_refs(dataset, "item-3a", item_refs_count=3), - make_item_with_refs(dataset, "item-3b", item_refs_count=3), - make_item_with_refs(dataset, "item-1a", item_refs_count=1), - make_item_with_refs(dataset, "item-1b", item_refs_count=1), - make_item_with_refs(dataset, "item-0a", item_refs_count=0), - make_item_with_refs(dataset, "item-0b", item_refs_count=0), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Get all items on page 1 (limit 3) - res = await async_client.get( - "/v1/ground-truths", - params={ - "dataset": dataset, - "sortBy": "totalReferences", - "sortOrder": "desc", - "page": 1, - "limit": 3, - }, - headers=user_headers, - ) - assert res.status_code == 200 - page1 = GroundTruthListResponse.model_validate(res.json()) - - # Get page 2 - res = await async_client.get( - "/v1/ground-truths", - params={ - "dataset": dataset, - "sortBy": "totalReferences", - "sortOrder": "desc", - "page": 2, - "limit": 3, - }, - headers=user_headers, - ) - assert res.status_code == 200 - page2 = GroundTruthListResponse.model_validate(res.json()) - - assert len(page1.items) == 3 - assert len(page2.items) == 3 - - # Combine pages and verify 
no duplicates - all_ids = [item.id for item in page1.items] + [item.id for item in page2.items] - assert len(set(all_ids)) == 6, "All 6 items should appear exactly once across pages" - - # Verify page 1 has higher ref counts than page 2 - page1_refs = [item.totalReferences for item in page1.items] - page2_refs = [item.totalReferences for item in page2.items] - assert min(page1_refs) >= max(page2_refs), ( - f"Page 1 refs {page1_refs} should be >= page 2 refs {page2_refs}" - ) - - -@pytest.mark.anyio -async def test_sort_by_total_references_with_status_filter( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Sort by totalReferences works with status filter.""" - dataset = f"sort-refs-status-{uuid4().hex[:6]}" - - items = [ - make_item_with_refs(dataset, "draft-2", item_refs_count=2), - make_item_with_refs(dataset, "draft-5", item_refs_count=5), - make_item_with_refs(dataset, "approved-3", item_refs_count=3), - ] - - # Create all as draft - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Approve one item (approved-3) - res = await async_client.get(f"/v1/ground-truths/{dataset}", headers=user_headers) - assert res.status_code == 200 - all_items = res.json() - approved_item = next(i for i in all_items if i["id"] == "approved-3") - - res = await async_client.put( - f"/v1/ground-truths/{dataset}/{approved_item['bucket']}/approved-3", - headers={**user_headers, "If-Match": approved_item["_etag"]}, - json={"status": "approved"}, - ) - assert res.status_code == 200 - - # Filter by draft status and sort by totalReferences - res = await async_client.get( - "/v1/ground-truths", - params={ - "dataset": dataset, - "status": "draft", - "sortBy": "totalReferences", - "sortOrder": "desc", - }, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 - - # Only draft items (draft-5, draft-2) should be returned, sorted by refs - ref_counts = [item.totalReferences for item in response_data.items] - assert ref_counts == [5, 2], f"Expected [5, 2], got {ref_counts}" - - -@pytest.mark.anyio -async def test_sort_by_total_references_all_zero( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Sort works when all items have 0 refs (stable by ID).""" - dataset = f"sort-refs-zeros-{uuid4().hex[:6]}" - - items = [ - make_item_with_refs(dataset, "item-c", item_refs_count=0), - make_item_with_refs(dataset, "item-a", item_refs_count=0), - make_item_with_refs(dataset, "item-b", item_refs_count=0), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "asc"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 3 - - # All have 0 refs - should be stable sorted by id ASC - ref_counts = [item.totalReferences for item in response_data.items] - assert ref_counts == [0, 0, 0] - - # Secondary sort by ID should apply - ids = [item.id for item in response_data.items] - assert ids == sorted(ids), f"Expected IDs sorted alphabetically as secondary sort, got {ids}" - - -@pytest.mark.anyio -async def test_sort_by_total_references_large_counts( - async_client: AsyncClient, user_headers: dict[str, str] -): - 
"""Sort handles items with many refs correctly.""" - dataset = f"sort-refs-large-{uuid4().hex[:6]}" - - items = [ - make_item_with_refs(dataset, "item-10", item_refs_count=10), - make_item_with_refs(dataset, "item-50", item_refs_count=50), - make_item_with_refs(dataset, "item-25", item_refs_count=25), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "desc"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 3 - - ref_counts = [item.totalReferences for item in response_data.items] - assert ref_counts == [50, 25, 10], f"Expected [50, 25, 10], got {ref_counts}" - - -@pytest.mark.anyio -async def test_sort_by_total_references_after_update( - async_client: AsyncClient, user_headers: dict[str, str] -): - """totalReferences is recalculated on update and sort reflects changes.""" - dataset = f"sort-refs-update-{uuid4().hex[:6]}" - - items = [ - make_item_with_refs(dataset, "item-to-update", item_refs_count=1), - make_item_with_refs(dataset, "item-static", item_refs_count=3), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Initial sort - item-static should be first (3 refs vs 1 ref) - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "desc"}, - headers=user_headers, - ) - assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert response_data.items[0].id == "item-static" - - # Get the item to update - res = await async_client.get(f"/v1/ground-truths/{dataset}", headers=user_headers) - assert res.status_code == 200 - all_items = res.json() - item_to_update = next(i for i in all_items if i["id"] == "item-to-update") - - # Update item-to-update to have 5 refs (more than item-static's 3) - res = await async_client.put( - f"/v1/ground-truths/{dataset}/{item_to_update['bucket']}/item-to-update", - headers={**user_headers, "If-Match": item_to_update["_etag"]}, - json={ - "refs": [ - {"url": "https://example.com/new-ref-1"}, - {"url": "https://example.com/new-ref-2"}, - {"url": "https://example.com/new-ref-3"}, - {"url": "https://example.com/new-ref-4"}, - {"url": "https://example.com/new-ref-5"}, - ] - }, - ) - assert res.status_code == 200 - - # After update - item-to-update should now be first (5 refs vs 3 refs) - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "desc"}, - headers=user_headers, - ) - assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert response_data.items[0].id == "item-to-update" - assert response_data.items[0].totalReferences == 5 - - -@pytest.mark.anyio -async def test_invalid_sort_field_returns_422( - async_client: AsyncClient, user_headers: dict[str, str] -): - """API returns 400 for invalid sortBy value.""" - res = await async_client.get( - "/v1/ground-truths", - params={"sortBy": "invalidField"}, - headers=user_headers, - ) - assert res.status_code == 422 - - data = res.json() - detail = data.get("detail") - - assert any( - ("sortby" in " ".join(map(str, err.get("loc", []))).lower()) - or ("sortby" in err.get("msg", "").lower()) - 
for err in detail
-    ), f"Expected 'sortBy' in validation detail, got: {detail}"
diff --git a/backend/tests/integration/test_recompute_tags.py b/backend/tests/integration/test_recompute_tags.py
index a263a51..ffb6365 100644
--- a/backend/tests/integration/test_recompute_tags.py
+++ b/backend/tests/integration/test_recompute_tags.py
@@ -12,7 +12,9 @@ def make_item(dataset: str, status: str = "draft") -> dict:
         "id": str(uuid.uuid4()),
         "datasetName": dataset,
         "bucket": str(uuid.UUID("00000000-0000-0000-0000-000000000000")),
-        "synthQuestion": "What is the capital of France?",
+        "history": [
+            {"role": "user", "msg": "What is the capital of France?"},
+        ],
         "status": status,
     }
 
diff --git a/backend/tests/integration/test_sample_unassigned_allocation.py b/backend/tests/integration/test_sample_unassigned_allocation.py
index cdca54f..bdc9099 100644
--- a/backend/tests/integration/test_sample_unassigned_allocation.py
+++ b/backend/tests/integration/test_sample_unassigned_allocation.py
@@ -13,10 +13,9 @@ def make_item(dataset: str) -> dict[str, Any]:
         "datasetName": dataset,
         "bucket": "00000000-0000-0000-0000-000000000000",
         "status": "draft",
-        "samplingBucket": 0,
-        "synthQuestion": "Q?",
-        "answer": None,
-        "refs": [],
+        "history": [
+            {"role": "user", "msg": "Q?"},
+        ],
         "manualTags": ["source:synthetic", "split:validation"],
     }
 
diff --git a/backend/tests/integration/test_snapshot_artifacts_cosmos.py b/backend/tests/integration/test_snapshot_artifacts_cosmos.py
index 79de2c0..91643a8 100644
--- a/backend/tests/integration/test_snapshot_artifacts_cosmos.py
+++ b/backend/tests/integration/test_snapshot_artifacts_cosmos.py
@@ -24,9 +24,10 @@ def make_item(dataset: str, item_id: str) -> dict[str, Any]:
         "id": item_id,
         "datasetName": dataset,
         "bucket": "00000000-0000-0000-0000-000000000000",
-        "synthQuestion": "Q?",
-        "answer": "A",
-        "refs": [],
+        "history": [
+            {"role": "user", "msg": "Q?"},
+            {"role": "assistant", "msg": "A"},
+        ],
         "manualTags": ["source:synthetic", "topic:general"],
     }
 
diff --git a/backend/tests/test_helpers.py b/backend/tests/test_helpers.py
index d725ab7..244eed6 100644
--- a/backend/tests/test_helpers.py
+++ b/backend/tests/test_helpers.py
@@ -1,7 +1,8 @@
 """Test helpers for creating AgenticGroundTruthEntry fixtures.
 
-After Phase 6: canonical state is history[]; question/answer/refs are derived
-from history or stored in plugins["rag-compat"].
+After Phase 6: canonical state is history[]; question/answer are derived from
+history, and plugin-owned reference compatibility data lives in
+plugins["rag-compat"].data.references.
 """
 
 from __future__ import annotations
@@ -34,12 +35,12 @@ def make_test_entry(
         id: Item ID (default: "test-item")
         dataset_name: Dataset name (default: "test-dataset")
         status: Item status (default: draft)
-        history: Explicit history array. If None and synth_question/answer provided,
-            a simple Q&A history will be auto-generated.
-        synth_question: Question text (stored in rag-compat plugin)
-        edited_question: Edited question text (stored in rag-compat plugin)
-        answer: Answer text (stored in rag-compat plugin)
-        refs: References (stored in rag-compat plugin)
+        history: Explicit history array. If None and question/answer inputs are provided,
+            a simple Q&A history will be auto-generated.
+        synth_question: Fallback question text used when edited_question is absent
+        edited_question: Preferred question text for generated history
+        answer: Answer text used for generated history
+        refs: References stored in rag-compat plugin data
         manual_tags: Manual tags list
         comment: Item comment
         reviewed_at: Review timestamp
@@ -92,25 +93,23 @@ def make_test_entry(
     if history is not None:
         # Use explicit history
         payload["history"] = history
-    elif synth_question or answer:
+    elif edited_question or synth_question or answer:
         # Auto-generate simple Q&A history from legacy-style params
         auto_history: list[dict[str, Any]] = []
-        if synth_question:
-            auto_history.append({"role": "user", "msg": synth_question})
+        question = edited_question or synth_question
+        if question:
+            auto_history.append({"role": "user", "msg": question})
         if answer:
             auto_history.append({"role": "assistant", "msg": answer})
         payload["history"] = auto_history
 
-    # Build rag-compat plugin data if any legacy fields are provided
+    # Build rag-compat plugin data when references are provided
    rag_compat_data: dict[str, Any] = {}
-    if synth_question is not None:
-        rag_compat_data["synthQuestion"] = synth_question
-    if edited_question is not None:
-        rag_compat_data["editedQuestion"] = edited_question
-    if answer is not None:
-        rag_compat_data["answer"] = answer
     if refs is not None:
-        rag_compat_data["refs"] = refs
+        rag_compat_data["references"] = [
+            ref.model_dump(by_alias=True, exclude_none=True) if hasattr(ref, "model_dump") else ref
+            for ref in refs
+        ]
 
     if rag_compat_data:
         payload["plugins"] = {
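The docstring above describes the Phase 6 shape in prose; the sketch below makes it concrete. It is a standalone illustration under assumed shapes, not the repo's implementation: the entry dict mirrors the fixtures in this diff, and the two helpers are hypothetical stand-ins for question_text_from_item / answer_text_from_item in app.domain.conversation_fields (last user turn wins for the question, last non-user turn for the answer).

from typing import Any

# Illustrative Phase 6 layout (assumed, not the repo's model classes):
# history[] holds the conversation; reference compatibility data lives
# under plugins["rag-compat"].data.references.
entry: dict[str, Any] = {
    "id": "example-item",
    "datasetName": "example-dataset",
    "history": [
        {"role": "user", "msg": "How do I reset my password?"},
        {"role": "assistant", "msg": "Use the reset link"},
    ],
    "plugins": {
        "rag-compat": {
            "data": {
                "references": [
                    # messageIndex ties the reference to the history turn that produced it
                    {"url": "https://example.com/kb/reset", "messageIndex": 1},
                ]
            }
        }
    },
}


def question_text(item: dict[str, Any]) -> str | None:
    """Illustrative: the question is the last user turn in history."""
    user_msgs = [
        t["msg"] for t in item.get("history", [])
        if t.get("role", "").strip().lower() == "user"
    ]
    return user_msgs[-1] if user_msgs else None


def answer_text(item: dict[str, Any]) -> str | None:
    """Illustrative: the answer is the last non-user turn (assistant, planner, ...)."""
    other_msgs = [
        t["msg"] for t in item.get("history", [])
        if t.get("role", "").strip().lower() != "user"
    ]
    return other_msgs[-1] if other_msgs else None


assert question_text(entry) == "How do I reset my password?"
assert answer_text(entry) == "Use the reset link"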
whitespace.""" plugin = NoAnswerPlugin() - item = AgenticGroundTruthEntry( - id="test", datasetName="test", synthQuestion="Q", answer=" NO_ANSWER " + item = make_test_entry( + id="test", dataset_name="test", synth_question="Q", answer=" NO_ANSWER " ) assert plugin.compute(item) == "answer:no_answer" def test_no_answer_with_newlines(self): """Should return tag when answer is NO_ANSWER with newlines.""" plugin = NoAnswerPlugin() - item = AgenticGroundTruthEntry( - id="test", datasetName="test", synthQuestion="Q", answer="\nNO_ANSWER\n" + item = make_test_entry( + id="test", dataset_name="test", synth_question="Q", answer="\nNO_ANSWER\n" ) assert plugin.compute(item) == "answer:no_answer" def test_regular_answer_returns_none(self): """Should return None for regular answers.""" plugin = NoAnswerPlugin() - item = AgenticGroundTruthEntry( - id="test", datasetName="test", synthQuestion="Q", answer="A valid answer" + item = make_test_entry( + id="test", dataset_name="test", synth_question="Q", answer="A valid answer" ) assert plugin.compute(item) is None def test_none_answer_returns_none(self): """Should return None when answer is None.""" plugin = NoAnswerPlugin() - item = AgenticGroundTruthEntry( - id="test", datasetName="test", synthQuestion="Q", answer=None - ) + item = make_test_entry(id="test", dataset_name="test", synth_question="Q", answer=None) assert plugin.compute(item) is None def test_empty_answer_returns_none(self): """Should return None when answer is empty string.""" plugin = NoAnswerPlugin() - item = AgenticGroundTruthEntry(id="test", datasetName="test", synthQuestion="Q", answer="") + item = make_test_entry(id="test", dataset_name="test", synth_question="Q", answer="") assert plugin.compute(item) is None diff --git a/backend/tests/unit/plugins/test_plugin_question_length.py b/backend/tests/unit/plugins/test_plugin_question_length.py index d740fb9..a2b997f 100644 --- a/backend/tests/unit/plugins/test_plugin_question_length.py +++ b/backend/tests/unit/plugins/test_plugin_question_length.py @@ -4,12 +4,12 @@ import pytest -from app.domain.models import AgenticGroundTruthEntry from app.plugins.computed_tags.question_length import ( QuestionLengthLongPlugin, QuestionLengthMediumPlugin, QuestionLengthShortPlugin, ) +from tests.test_helpers import make_test_entry class TestQuestionLengthPlugins: @@ -31,11 +31,7 @@ def test_mutually_exclusive_classification( ): """Each document gets exactly one length tag.""" question = " ".join([f"word{i}" for i in range(word_count)]) - item = AgenticGroundTruthEntry( - id="test-id", - datasetName="test-dataset", - synthQuestion=question, - ) + item = make_test_entry(id="test-id", dataset_name="test-dataset", synth_question=question) short_plugin = QuestionLengthShortPlugin() medium_plugin = QuestionLengthMediumPlugin() @@ -55,11 +51,11 @@ def test_mutually_exclusive_classification( def test_edited_question_takes_precedence(self): """editedQuestion is used over synthQuestion when present.""" - item = AgenticGroundTruthEntry( + item = make_test_entry( id="test-id", - datasetName="test-dataset", - synthQuestion="short", # 1 word - editedQuestion=" ".join([f"word{i}" for i in range(35)]), # 35 words -> long + dataset_name="test-dataset", + synth_question="short", # 1 word + edited_question=" ".join([f"word{i}" for i in range(35)]), # 35 words -> long ) assert QuestionLengthLongPlugin().compute(item) == "question_length:long" diff --git a/backend/tests/unit/plugins/test_plugin_reference_type.py b/backend/tests/unit/plugins/test_plugin_reference_type.py 
index 8d7dfd8..dc954fd 100644
--- a/backend/tests/unit/plugins/test_plugin_reference_type.py
+++ b/backend/tests/unit/plugins/test_plugin_reference_type.py
@@ -4,13 +4,14 @@
 
 import pytest
 
-from app.domain.models import AgenticGroundTruthEntry, Reference
+from app.domain.models import Reference
 from app.plugins.computed_tags.reference_type import (
     ReferenceTypeArticlePlugin,
     ReferenceTypeHelpcenterPlugin,
     _is_article_url,
     _is_helpcenter_url,
 )
+from tests.test_helpers import make_test_entry
 
 
 class TestUrlPatternDetection:
@@ -48,20 +49,18 @@ class TestReferenceTypePlugins:
 
     def test_no_refs_gets_no_tags(self):
         """Item with no refs should get neither tag."""
-        item = AgenticGroundTruthEntry(
-            id="test-no-refs",
-            datasetName="test-dataset",
-            synthQuestion="Question",
+        item = make_test_entry(
+            id="test-no-refs", dataset_name="test-dataset", synth_question="Question"
         )
 
         assert ReferenceTypeArticlePlugin().compute(item) is None
         assert ReferenceTypeHelpcenterPlugin().compute(item) is None
 
     def test_item_can_have_both_tags(self):
         """Item with both reference types should get both tags."""
-        item = AgenticGroundTruthEntry(
+        item = make_test_entry(
             id="test-both",
-            datasetName="test-dataset",
-            synthQuestion="Question",
+            dataset_name="test-dataset",
+            synth_question="Question",
             refs=[
                 Reference(url="https://docs.example.com/support/article/CS431120"),
                 Reference(url="https://support.example.com/help/product/page.html"),
@@ -72,10 +71,10 @@ def test_item_can_have_both_tags(self):
 
     def test_type_field_is_ignored(self):
         """Only URL matters, not the type field on Reference."""
-        item = AgenticGroundTruthEntry(
+        item = make_test_entry(
             id="test-type-ignored",
-            datasetName="test-dataset",
-            synthQuestion="Question",
+            dataset_name="test-dataset",
+            synth_question="Question",
             refs=[Reference(url="https://example.com/page", type="article")],
         )
         # URL doesn't match article pattern, so no tag even though type="article"
diff --git a/backend/tests/unit/plugins/test_plugin_retrieval_behavior.py b/backend/tests/unit/plugins/test_plugin_retrieval_behavior.py
index 3439b15..f94f847 100644
--- a/backend/tests/unit/plugins/test_plugin_retrieval_behavior.py
+++ b/backend/tests/unit/plugins/test_plugin_retrieval_behavior.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-from app.domain.models import AgenticGroundTruthEntry, Reference, HistoryItem
+from app.domain.models import Reference
 from app.domain.enums import HistoryItemRole
 from app.plugins.computed_tags.retrieval_behavior import (
     RetrievalBehaviorNoRefsPlugin,
@@ -12,6 +12,7 @@
     RetrievalBehaviorTwoRefsPlugin,
     RetrievalBehaviorRichPlugin,
 )
+from tests.test_helpers import make_test_entry
 
 
 class TestRetrievalBehaviorPlugins:
@@ -30,10 +31,10 @@ class TestRetrievalBehaviorPlugins:
     )
     def test_mutually_exclusive_classification(self, num_refs, expected_tag):
         """Each document gets exactly one retrieval behavior tag."""
-        item = AgenticGroundTruthEntry(
+        item = make_test_entry(
             id=f"test-{num_refs}-refs",
-            datasetName="test-dataset",
-            synthQuestion="Question",
+            dataset_name="test-dataset",
+            synth_question="Question",
             refs=[Reference(url=f"https://example.com/doc{i}") for i in range(num_refs)],
         )
 
@@ -51,27 +52,21 @@ def test_mutually_exclusive_classification(self, num_refs, expected_tag):
         assert non_none[0] == expected_tag
 
     def test_refs_in_history_are_counted(self):
-        """References in history turns are included in the count."""
-        item = AgenticGroundTruthEntry(
+        """Canonical plugin references with turn ownership are included in the count."""
+        item = make_test_entry(
id="test-history-refs", - datasetName="test-dataset", - synthQuestion="Follow up question", + dataset_name="test-dataset", + synth_question="Follow up question", history=[ - HistoryItem(role=HistoryItemRole.user, msg="First question"), - HistoryItem( - role=HistoryItemRole.assistant, - msg="First answer", - refs=[ - Reference(url="https://example.com/doc1"), - Reference(url="https://example.com/doc2"), - ], - ), - HistoryItem(role=HistoryItemRole.user, msg="Second question"), - HistoryItem( - role=HistoryItemRole.assistant, - msg="Second answer", - refs=[Reference(url="https://example.com/doc3")], - ), + {"role": HistoryItemRole.user, "msg": "First question"}, + {"role": HistoryItemRole.assistant, "msg": "First answer"}, + {"role": HistoryItemRole.user, "msg": "Second question"}, + {"role": HistoryItemRole.assistant, "msg": "Second answer"}, + ], + refs=[ + Reference(url="https://example.com/doc1", messageIndex=1), + Reference(url="https://example.com/doc2", messageIndex=1), + Reference(url="https://example.com/doc3", messageIndex=3), ], ) # 3 refs total in history -> rich diff --git a/backend/tests/unit/plugins/test_plugin_turns.py b/backend/tests/unit/plugins/test_plugin_turns.py index c09d372..e2b8833 100644 --- a/backend/tests/unit/plugins/test_plugin_turns.py +++ b/backend/tests/unit/plugins/test_plugin_turns.py @@ -4,9 +4,10 @@ import pytest -from app.domain.models import AgenticGroundTruthEntry, HistoryItem +from app.domain.models import HistoryItem from app.domain.enums import HistoryItemRole from app.plugins.computed_tags.turns import MultiTurnPlugin, SingleTurnPlugin +from tests.test_helpers import make_test_entry class TestTurnsPlugins: @@ -36,10 +37,10 @@ def test_mutually_exclusive_classification(self, history_len, expected_single, e else None ) - item = AgenticGroundTruthEntry( + item = make_test_entry( id="test-id", - datasetName="test-dataset", - synthQuestion="Question", + dataset_name="test-dataset", + synth_question="Question", history=history, ) diff --git a/backend/tests/unit/test_assignments_skip_persist.py b/backend/tests/unit/test_assignments_skip_persist.py index 3294aa4..ffa582e 100644 --- a/backend/tests/unit/test_assignments_skip_persist.py +++ b/backend/tests/unit/test_assignments_skip_persist.py @@ -129,7 +129,7 @@ async def test_status_skipped_keeps_assignment(async_client, user_headers): id=item_id, datasetName=dataset, bucket=bucket, - synthQuestion="Q?", + history=[{"role": "user", "msg": "Q?"}], status=GroundTruthStatus.draft, assignedTo=user_headers["X-User-Id"], assignedAt=assigned_at, diff --git a/backend/tests/unit/test_bulk_import_tag_validation.py b/backend/tests/unit/test_bulk_import_tag_validation.py index a716976..3699b50 100644 --- a/backend/tests/unit/test_bulk_import_tag_validation.py +++ b/backend/tests/unit/test_bulk_import_tag_validation.py @@ -1,7 +1,8 @@ import pytest from unittest.mock import AsyncMock, MagicMock, patch -from app.domain.models import AgenticGroundTruthEntry, BulkImportResult +from app.domain.models import BulkImportResult from app.core.auth import UserContext +from tests.test_helpers import make_test_entry @pytest.fixture @@ -31,11 +32,11 @@ async def test_bulk_import_validates_tags(mock_container, mock_user): ) items = [ - AgenticGroundTruthEntry( + make_test_entry( id="test-1", - datasetName="test", - synthQuestion="What is Q?", - manualTags=["source:synthetic"], + dataset_name="test", + synth_question="What is Q?", + manual_tags=["source:synthetic"], ) ] @@ -65,8 +66,11 @@ async def 
test_bulk_import_rejects_invalid_tags(mock_container, mock_user): ) items = [ - AgenticGroundTruthEntry( - id="test-1", datasetName="test", synthQuestion="What is Q?", manualTags=["invalid:tag"] + make_test_entry( + id="test-1", + dataset_name="test", + synth_question="What is Q?", + manual_tags=["invalid:tag"], ) ] @@ -102,17 +106,17 @@ async def test_bulk_import_mixed_valid_invalid_tags(mock_container, mock_user): ) items = [ - AgenticGroundTruthEntry( + make_test_entry( id="test-1", - datasetName="test", - synthQuestion="Q1?", - manualTags=["source:synthetic"], # valid + dataset_name="test", + synth_question="Q1?", + manual_tags=["source:synthetic"], # valid ), - AgenticGroundTruthEntry( + make_test_entry( id="test-2", - datasetName="test", - synthQuestion="Q2?", - manualTags=["invalid:tag"], # invalid + dataset_name="test", + synth_question="Q2?", + manual_tags=["invalid:tag"], # invalid ), ] @@ -149,11 +153,11 @@ async def test_bulk_import_no_tags(mock_container, mock_user): ) items = [ - AgenticGroundTruthEntry( + make_test_entry( id="test-1", - datasetName="test", - synthQuestion="What is Q?", - manualTags=[], # no tags + dataset_name="test", + synth_question="What is Q?", + manual_tags=[], # no tags ) ] @@ -183,11 +187,11 @@ async def test_bulk_import_tag_validation_single_registry_fetch(mock_container, ) items = [ - AgenticGroundTruthEntry( + make_test_entry( id=f"test-{i}", - datasetName="test", - synthQuestion=f"Q{i}?", - manualTags=["source:synthetic"], + dataset_name="test", + synth_question=f"Q{i}?", + manual_tags=["source:synthetic"], ) for i in range(10) ] diff --git a/backend/tests/unit/test_computed_tags_plugins.py b/backend/tests/unit/test_computed_tags_plugins.py index 630dcb0..9bfecd7 100644 --- a/backend/tests/unit/test_computed_tags_plugins.py +++ b/backend/tests/unit/test_computed_tags_plugins.py @@ -26,7 +26,11 @@ class TestTagPluginRegistry: def test_empty_registry_returns_empty_tags(self): """An empty registry should return no tags.""" registry = TagPluginRegistry() - item = AgenticGroundTruthEntry(id="test", datasetName="test", synthQuestion="Q") + item = AgenticGroundTruthEntry( + id="test", + datasetName="test", + history=[{"role": "user", "msg": "Q"}], + ) assert registry.compute_all(item) == [] assert registry.get_all_keys() == set() @@ -207,7 +211,7 @@ def test_computed_and_manual_tags_merge(self): item = AgenticGroundTruthEntry( id="merge-test", datasetName="test-dataset", - synthQuestion="Test question", + history=[{"role": "user", "msg": "Test question"}], manualTags=["source:manual", "priority:high"], computedTags=["turns:singleturn"], ) diff --git a/backend/tests/unit/test_conversation_fields.py b/backend/tests/unit/test_conversation_fields.py new file mode 100644 index 0000000..39d2c5e --- /dev/null +++ b/backend/tests/unit/test_conversation_fields.py @@ -0,0 +1,36 @@ +from app.domain.conversation_fields import ( + answer_text_from_item, + is_non_user_role, + is_user_role, + question_text_from_item, +) +from app.domain.models import AgenticGroundTruthEntry + + +def test_role_helpers_use_strict_user_semantics(): + assert is_user_role("user") + assert is_user_role(" User ") + assert not is_user_role("assistant") + assert not is_user_role("planner") + + assert not is_non_user_role("user") + assert is_non_user_role("assistant") + assert is_non_user_role("planner") + + +def test_question_and_answer_derivation_follow_user_vs_non_user_contract(): + item = AgenticGroundTruthEntry.model_validate( + { + "id": "item-role-derivation", + "datasetName": "demo", + 
"history": [ + {"role": "user", "msg": "Initial question"}, + {"role": "planner", "msg": "Draft answer"}, + {"role": "user", "msg": "Follow-up"}, + {"role": "assistant", "msg": "Final answer"}, + ], + } + ) + + assert question_text_from_item(item) == "Follow-up" + assert answer_text_from_item(item) == "Final answer" diff --git a/backend/tests/unit/test_cosmos_repo.py b/backend/tests/unit/test_cosmos_repo.py index 0647bf7..23c2f12 100644 --- a/backend/tests/unit/test_cosmos_repo.py +++ b/backend/tests/unit/test_cosmos_repo.py @@ -5,7 +5,13 @@ import pytest # type: ignore[import-not-found] -from app.adapters.repos.cosmos_repo import CosmosGroundTruthRepo, SELECT_CLAUSE_C +from app.adapters.repos.cosmos_repo import ( + CosmosGroundTruthRepo, + SELECT_CLAUSE_C, + _normalize_unicode_for_cosmos, + _restore_unicode_from_cosmos, +) +from app.plugins.pack_registry import get_rag_compat_pack from app.domain.enums import GroundTruthStatus, SortField, SortOrder from app.domain.models import AgenticGroundTruthEntry from tests.test_helpers import make_test_entry @@ -104,6 +110,66 @@ def test_resolve_sort_with_overrides(repo: CosmosGroundTruthRepo) -> None: assert direction is SortOrder.asc +def test_emulator_unicode_normalization_encodes_canonical_reference_content(monkeypatch) -> None: + monkeypatch.setattr( + "app.adapters.repos.cosmos_repo.settings.COSMOS_DISABLE_UNICODE_ESCAPE", True + ) + + original_content = r"Snippet with invalid escape \q and unicode \u2603" + payload = { + "plugins": { + "rag-compat": { + "data": { + "references": [ + { + "url": "https://example.com/canonical", + "content": original_content, + } + ] + } + } + } + } + + normalized = _normalize_unicode_for_cosmos(payload) + ref = normalized["plugins"]["rag-compat"]["data"]["references"][0] + + assert ref.get("_contentEncoded") is True + assert ref["content"] != original_content + + restored = _restore_unicode_from_cosmos(normalized) + restored_ref = restored["plugins"]["rag-compat"]["data"]["references"][0] + assert restored_ref["content"] == original_content + assert "_contentEncoded" not in restored_ref + + +def test_emulator_unicode_normalization_does_not_base64_encode_legacy_refs(monkeypatch) -> None: + monkeypatch.setattr( + "app.adapters.repos.cosmos_repo.settings.COSMOS_DISABLE_UNICODE_ESCAPE", True + ) + + original_content = r"Legacy snippet with invalid escape \q" + payload = { + "history": [ + { + "role": "assistant", + "msg": "Answer", + "refs": [{"url": "https://example.com/legacy", "content": original_content}], + } + ] + } + + normalized = _normalize_unicode_for_cosmos(payload) + ref = normalized["history"][0]["refs"][0] + assert "_contentEncoded" not in ref + assert ref["content"] != original_content + + restored = _restore_unicode_from_cosmos(normalized) + restored_ref = restored["history"][0]["refs"][0] + assert restored_ref["content"] == original_content + assert "_contentEncoded" not in restored_ref + + def test_sort_key_has_answer(repo: CosmosGroundTruthRepo) -> None: example = make_test_entry( id="item", @@ -134,28 +200,17 @@ def test_select_clause_includes_generic_phase_one_fields() -> None: # ============================================================================= -# Tests for totalReferences auto-computation (domain model validator) +# Reference-count semantics via rag-compat pack helpers # ============================================================================= class TestComputeTotalReferences: - """Unit tests for AgenticGroundTruthEntry.totalReferences computation. 
- - The property calculates total references with the following logic: - - If history has refs, count only history refs (history takes priority) - - If history has no refs, count plugin-stored refs as fallback - - **Phase 5 Audit (2026-03-12)**: ACTIVE COMPUTATION LOGIC - BLOCKING - The totalReferences field has active property logic that computes - values from history and plugin refs. This is not just compatibility - testing - it's core functionality that is used by: - - Model validation on all item saves - - Sort/filter operations that check reference counts - - UI displays of reference totals - - Cannot delete totalReferences until this computation is either: - - Moved to a computed property on AgenticGroundTruthEntry, OR - - Replaced by direct history ref counting in callers + """Unit tests for rag-compat reference_count behavior. + + These tests exercise reference counting through + ``get_rag_compat_pack().reference_count(item)`` using compatibility + payload shapes seeded in fixtures. The host model no longer owns + ``totalReferences`` behavior. """ def _make_item( @@ -164,56 +219,60 @@ def _make_item( history: list[dict] | None = None, ) -> AgenticGroundTruthEntry: """Helper to create an AgenticGroundTruthEntry with specified refs and history.""" + normalized_history = history + if history is not None: + normalized_history = [] + for turn in history: + turn_copy = dict(turn) + if turn_copy.get("refs") in (None, []): + turn_copy.pop("refs", None) + normalized_history.append(turn_copy) return make_test_entry( id="test-item", dataset_name="test-dataset", synth_question="Test question?", refs=refs, - history=history, + history=normalized_history, ) # ------------------------------------------------------------------------- - # History refs take priority over item refs + # Compat-reference counting with conversation history present # ------------------------------------------------------------------------- def test_history_refs_take_priority_over_item_refs(self) -> None: - """When history has refs, only history refs are counted (item refs ignored).""" + """Compat refs are counted even when conversation history is present.""" item = self._make_item( refs=[{"url": "https://item-ref-1.com"}, {"url": "https://item-ref-2.com"}], history=[ {"role": "user", "msg": "Hello"}, - {"role": "assistant", "msg": "Hi", "refs": [{"url": "https://history-ref.com"}]}, + {"role": "assistant", "msg": "Hi"}, ], ) - # totalReferences is auto-computed by model_validator - # Should count only history refs (1), not item refs (2) - assert item.totalReferences == 1 + assert get_rag_compat_pack().reference_count(item) == 2 def test_history_refs_from_multiple_turns(self) -> None: - """Refs from all history turns are summed.""" + """Compat refs are counted correctly with multi-turn history present.""" item = self._make_item( - refs=[{"url": "https://ignored.com"}], + refs=[ + {"url": "https://ref1.com"}, + {"url": "https://ref2.com"}, + {"url": "https://ref3.com"}, + ], history=[ {"role": "user", "msg": "Q1"}, - { - "role": "assistant", - "msg": "A1", - "refs": [{"url": "https://ref1.com"}, {"url": "https://ref2.com"}], - }, + {"role": "assistant", "msg": "A1"}, {"role": "user", "msg": "Q2"}, - {"role": "assistant", "msg": "A2", "refs": [{"url": "https://ref3.com"}]}, + {"role": "assistant", "msg": "A2"}, ], ) - # totalReferences is auto-computed by model_validator - # Should count all history refs: 2 + 1 = 3 - assert item.totalReferences == 3 + assert get_rag_compat_pack().reference_count(item) == 3 # 
------------------------------------------------------------------------- - # Item refs used when no history refs exist + # Compat-reference fallback when history contributes no refs # ------------------------------------------------------------------------- def test_item_refs_fallback_when_no_history(self) -> None: - """Item refs are counted when there is no history.""" + """Plugin-owned compat refs are counted when there is no history.""" item = self._make_item( refs=[ {"url": "https://ref1.com"}, @@ -222,18 +281,18 @@ def test_item_refs_fallback_when_no_history(self) -> None: ], history=None, ) - assert item.totalReferences == 3 + assert get_rag_compat_pack().reference_count(item) == 3 def test_item_refs_fallback_when_history_empty(self) -> None: - """Item refs are counted when history is an empty list.""" + """Plugin-owned compat refs are counted when history is an empty list.""" item = self._make_item( refs=[{"url": "https://ref1.com"}, {"url": "https://ref2.com"}], history=[], ) - assert item.totalReferences == 2 + assert get_rag_compat_pack().reference_count(item) == 2 def test_item_refs_fallback_when_history_has_no_refs(self) -> None: - """Item refs are counted when history exists but contains no refs.""" + """Plugin-owned compat refs are counted when history exists but contains no refs.""" item = self._make_item( refs=[{"url": "https://item-ref.com"}], history=[ @@ -241,11 +300,11 @@ def test_item_refs_fallback_when_history_has_no_refs(self) -> None: {"role": "assistant", "msg": "Hi"}, # No refs ], ) - # History has 0 refs, so item refs (1) should be used - assert item.totalReferences == 1 + # History contributes 0 compat refs, so top-level compat refs (1) are used + assert get_rag_compat_pack().reference_count(item) == 1 def test_item_refs_fallback_when_history_refs_are_empty_lists(self) -> None: - """Item refs are counted when history refs are empty lists.""" + """Plugin-owned compat refs are counted when history refs are empty lists.""" item = self._make_item( refs=[{"url": "https://item-ref.com"}], history=[ @@ -253,27 +312,27 @@ def test_item_refs_fallback_when_history_refs_are_empty_lists(self) -> None: {"role": "assistant", "msg": "Hi", "refs": []}, # Empty refs list ], ) - # History refs total is 0, so item refs (1) should be used - assert item.totalReferences == 1 + # History compat refs total is 0, so top-level compat refs (1) are used + assert get_rag_compat_pack().reference_count(item) == 1 # ------------------------------------------------------------------------- - # Handle empty/null refs and history + # Handle empty/null compat refs and history # ------------------------------------------------------------------------- def test_zero_when_no_refs_anywhere(self) -> None: """Returns 0 when there are no refs at any level.""" item = self._make_item(refs=None, history=None) - assert item.totalReferences == 0 + assert get_rag_compat_pack().reference_count(item) == 0 def test_zero_when_empty_refs_and_no_history(self) -> None: """Returns 0 when refs is empty list and no history.""" item = self._make_item(refs=[], history=None) - assert item.totalReferences == 0 + assert get_rag_compat_pack().reference_count(item) == 0 def test_zero_when_empty_refs_and_empty_history(self) -> None: """Returns 0 when refs is empty and history is empty list.""" item = self._make_item(refs=[], history=[]) - assert item.totalReferences == 0 + assert get_rag_compat_pack().reference_count(item) == 0 def test_handles_none_refs_in_history_turn(self) -> None: """Handles history turns where refs is 
explicitly None.""" @@ -284,61 +343,56 @@ def test_handles_none_refs_in_history_turn(self) -> None: {"role": "assistant", "msg": "Hi", "refs": None}, # Explicitly None ], ) - # History refs is 0, fallback to item refs - assert item.totalReferences == 1 + # History compat refs is 0, so top-level compat refs are used + assert get_rag_compat_pack().reference_count(item) == 1 # ------------------------------------------------------------------------- # Complex scenarios with partial data # ------------------------------------------------------------------------- def test_mixed_history_some_turns_with_refs_some_without(self) -> None: - """History with mix of turns with and without refs.""" + """Compat refs are counted with mixed multi-turn history.""" item = self._make_item( - refs=[{"url": "https://ignored.com"}], + refs=[ + {"url": "https://ref1.com"}, + {"url": "https://ref2.com"}, + {"url": "https://ref3.com"}, + ], history=[ {"role": "user", "msg": "Q1"}, {"role": "assistant", "msg": "A1"}, # No refs {"role": "user", "msg": "Q2"}, - {"role": "assistant", "msg": "A2", "refs": [{"url": "https://ref1.com"}]}, + {"role": "assistant", "msg": "A2"}, {"role": "user", "msg": "Q3"}, - {"role": "assistant", "msg": "A3", "refs": None}, # Explicitly None + {"role": "assistant", "msg": "A3"}, {"role": "user", "msg": "Q4"}, - { - "role": "assistant", - "msg": "A4", - "refs": [{"url": "https://ref2.com"}, {"url": "https://ref3.com"}], - }, + {"role": "assistant", "msg": "A4"}, ], ) - # History refs: 0 + 1 + 0 + 2 = 3 - assert item.totalReferences == 3 + assert get_rag_compat_pack().reference_count(item) == 3 def test_user_turns_with_refs_are_counted(self) -> None: - """Refs on user turns are also counted (not just assistant turns).""" + """Compat refs are counted regardless of turn roles.""" item = self._make_item( - refs=[{"url": "https://ignored.com"}], + refs=[{"url": "https://user-ref.com"}, {"url": "https://assistant-ref.com"}], history=[ - {"role": "user", "msg": "Here's a doc", "refs": [{"url": "https://user-ref.com"}]}, - { - "role": "assistant", - "msg": "Thanks", - "refs": [{"url": "https://assistant-ref.com"}], - }, + {"role": "user", "msg": "Here's a doc"}, + {"role": "assistant", "msg": "Thanks"}, ], ) - # Both user and assistant refs are counted: 1 + 1 = 2 - assert item.totalReferences == 2 + assert get_rag_compat_pack().reference_count(item) == 2 def test_many_refs_in_single_turn(self) -> None: - """Handles turns with many references.""" + """Handles many compatibility refs.""" many_refs = [{"url": f"https://ref{i}.com"} for i in range(10)] item = self._make_item( + refs=many_refs, history=[ {"role": "user", "msg": "Q"}, - {"role": "assistant", "msg": "A", "refs": many_refs}, + {"role": "assistant", "msg": "A"}, ], ) - assert item.totalReferences == 10 + assert get_rag_compat_pack().reference_count(item) == 10 def test_item_only_no_history_field_at_all(self) -> None: """Item created without history field entirely.""" @@ -359,41 +413,30 @@ def test_item_only_no_history_field_at_all(self) -> None: }, } ) - assert item.totalReferences == 1 + assert get_rag_compat_pack().reference_count(item) == 1 def test_complex_real_world_scenario(self) -> None: """Realistic multi-turn conversation with various ref patterns.""" item = self._make_item( - # Item-level refs (should be ignored if history has any refs) - refs=[{"url": "https://old-ref.com"}], + refs=[ + {"url": "https://kb.example.com/article1"}, + {"url": "https://docs.example.com/troubleshooting"}, + {"url": "https://kb.example.com/article2"}, 
+ ], history=[ # Turn 1: User asks question {"role": "user", "msg": "How do I fix error X?"}, - # Turn 2: Assistant responds with 2 refs - { - "role": "assistant", - "msg": "You can try these solutions...", - "refs": [ - {"url": "https://kb.example.com/article1"}, - {"url": "https://docs.example.com/troubleshooting"}, - ], - }, + {"role": "assistant", "msg": "You can try these solutions..."}, # Turn 3: User follow-up {"role": "user", "msg": "That didn't work, any other ideas?"}, - # Turn 4: Assistant with 1 more ref - { - "role": "assistant", - "msg": "Let's try this instead...", - "refs": [{"url": "https://kb.example.com/article2"}], - }, + {"role": "assistant", "msg": "Let's try this instead..."}, # Turn 5: User confirms {"role": "user", "msg": "That worked, thanks!"}, # Turn 6: Assistant closes (no refs needed) {"role": "assistant", "msg": "Glad I could help!"}, ], ) - # History refs: 2 + 1 = 3 (item-level ref is ignored) - assert item.totalReferences == 3 + assert get_rag_compat_pack().reference_count(item) == 3 # --------------------------------------------------------------------------- diff --git a/backend/tests/unit/test_demo_mode_memory_api.py b/backend/tests/unit/test_demo_mode_memory_api.py index abc0fa9..74202cb 100644 --- a/backend/tests/unit/test_demo_mode_memory_api.py +++ b/backend/tests/unit/test_demo_mode_memory_api.py @@ -37,6 +37,11 @@ async def test_demo_mode_seeds_memory_backend_for_api_usage() -> None: settings.DEMO_USER_ID = "anonymous" container.repo = None + container.assignment_service = None + container.search_service = None + container.snapshot_service = None + container.curation_service = None + container.init_memory_repo(enable_demo_data=True) app = create_app() diff --git a/backend/tests/unit/test_groundtruthitem_tags_validation.py b/backend/tests/unit/test_groundtruthitem_tags_validation.py index a80b3ec..923427c 100644 --- a/backend/tests/unit/test_groundtruthitem_tags_validation.py +++ b/backend/tests/unit/test_groundtruthitem_tags_validation.py @@ -3,7 +3,11 @@ from app.domain.models import AgenticGroundTruthEntry -BASE = dict(id="id1", datasetName="ds", synthQuestion="What is this product?") +BASE = dict( + id="id1", + datasetName="ds", + history=[{"role": "user", "msg": "What is this product?"}], +) def make_item(**overrides): diff --git a/backend/tests/unit/test_history_with_refs.py b/backend/tests/unit/test_history_with_refs.py index 49f3df5..8880ff6 100644 --- a/backend/tests/unit/test_history_with_refs.py +++ b/backend/tests/unit/test_history_with_refs.py @@ -1,35 +1,29 @@ -""" -Unit tests for HistoryItem with refs field. -Validates that history items can store references alongside agent messages. -""" +"""Unit tests for canonical HistoryItem validation semantics.""" + +import pytest +from pydantic import ValidationError from app.domain.models import HistoryItem, Reference from app.domain.enums import HistoryItemRole, ExpectedBehavior -def test_history_item_with_refs(): - """Test that HistoryItem can include refs.""" +def test_history_item_rejects_refs(): + """HistoryItem rejects legacy refs; refs are plugin-owned canonical data.""" refs = [ Reference(url="https://example.com/doc1", content="Content 1"), Reference(url="https://example.com/doc2", content="Content 2", bonus=True), ] - history_item = HistoryItem( - role=HistoryItemRole.assistant, - msg="Here is the answer based on the documentation.", - refs=refs, - ) - - assert history_item.role == HistoryItemRole.assistant - assert history_item.msg == "Here is the answer based on the documentation." 
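+    # Editorial sketch, not part of the original change: the Reference
+    # objects built above remain valid standalone models; only embedding
+    # them in a HistoryItem is rejected below.
+    assert refs[1].bonus is True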
- assert history_item.refs is not None - assert len(history_item.refs) == 2 - assert history_item.refs[0].url == "https://example.com/doc1" - assert history_item.refs[1].bonus is True + with pytest.raises(ValidationError): + HistoryItem( + role=HistoryItemRole.assistant, + msg="Here is the answer based on the documentation.", + refs=refs, + ) def test_history_item_without_refs(): - """Test that refs is optional in HistoryItem.""" + """HistoryItem remains valid with canonical role/msg content only.""" history_item = HistoryItem( role=HistoryItemRole.user, msg="What is the answer?", @@ -37,33 +31,23 @@ def test_history_item_without_refs(): assert history_item.role == HistoryItemRole.user assert history_item.msg == "What is the answer?" - assert history_item.refs is None + assert "refs" not in history_item.model_dump() def test_history_item_serialization(): - """Test that HistoryItem serializes correctly with refs.""" - refs = [ - Reference(url="https://example.com/doc1", content="Content 1"), - ] - - history_item = HistoryItem( - role=HistoryItemRole.assistant, - msg="Answer text", - refs=refs, - ) + """HistoryItem serialization excludes legacy refs field.""" + history_item = HistoryItem(role=HistoryItemRole.assistant, msg="Answer text") # Serialize to dict data = history_item.model_dump() assert data["role"] == "assistant" assert data["msg"] == "Answer text" - assert data["refs"] is not None - assert len(data["refs"]) == 1 - assert data["refs"][0]["url"] == "https://example.com/doc1" + assert "refs" not in data -def test_history_item_deserialization(): - """Test that HistoryItem can be created from dict with refs.""" +def test_history_item_deserialization_rejects_refs(): + """HistoryItem rejects dict payloads containing legacy refs.""" data = { "role": "assistant", "msg": "Answer text", @@ -73,33 +57,24 @@ def test_history_item_deserialization(): ], } - history_item = HistoryItem(**data) - - assert history_item.role == HistoryItemRole.assistant - assert history_item.msg == "Answer text" - assert history_item.refs is not None - assert len(history_item.refs) == 2 - assert history_item.refs[0].url == "https://example.com/doc1" - assert history_item.refs[1].bonus is True + with pytest.raises(ValidationError): + HistoryItem(**data) -def test_user_history_item_typically_no_refs(): - """Test that user messages typically don't have refs (but could).""" - # User message without refs (typical) +def test_user_history_item_rejects_refs(): + """User history items also reject legacy refs.""" user_item = HistoryItem( role=HistoryItemRole.user, msg="What is this product?", ) - assert user_item.refs is None - - # User message with refs (uncommon but allowed) - user_item_with_refs = HistoryItem( - role=HistoryItemRole.user, - msg="Based on this document, what is this product?", - refs=[Reference(url="https://example.com/doc1")], - ) - assert user_item_with_refs.refs is not None - assert len(user_item_with_refs.refs) == 1 + assert "refs" not in user_item.model_dump() + + with pytest.raises(ValidationError): + HistoryItem( + role=HistoryItemRole.user, + msg="Based on this document, what is this product?", + refs=[Reference(url="https://example.com/doc1")], + ) def test_history_item_with_expected_behavior(): diff --git a/backend/tests/unit/test_phase1_rework.py b/backend/tests/unit/test_phase1_rework.py index 9810eef..e2eaae9 100644 --- a/backend/tests/unit/test_phase1_rework.py +++ b/backend/tests/unit/test_phase1_rework.py @@ -24,6 +24,7 @@ BulkImportResult, HistoryEntry, ) +from app.plugins.pack_registry 
import get_rag_compat_pack from app.domain.enums import GroundTruthStatus @@ -210,11 +211,11 @@ async def test_bulk_import_approve_enforces_plugin_pack_approval_hooks(self): class TestAssignmentHistoryReset: - """Test IV-002: Assignment route history edits reset totalReferences.""" + """Test IV-002: Assignment route history edits preserve compat totalReferences.""" @pytest.mark.asyncio async def test_assignment_update_history_resets_total_references(self): - """When history is updated via assignment route, totalReferences should be reset to 0.""" + """When history updates, compat plugin totalReferences remains unchanged.""" from app.core.auth import UserContext from app.container import container from app.api.v1.assignments import update_item @@ -235,7 +236,13 @@ async def test_assignment_update_history_resets_total_references(self): HistoryEntry(role="user", msg="Old question"), HistoryEntry(role="assistant", msg="Old answer"), ], - totalReferences=5, # Stale value + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"totalReferences": 5}, + } + }, _etag="test-etag", ) @@ -277,16 +284,16 @@ async def mock_upsert(item): if_match=None, ) - # Verify totalReferences was reset to 0 + # Verify compat totalReferences was preserved on the plugin payload assert saved_item is not None - assert saved_item.totalReferences == 0 + assert get_rag_compat_pack().reference_count(saved_item) == 5 finally: container.repo = original_repo @pytest.mark.asyncio async def test_assignment_clear_history_resets_total_references(self): - """When history is cleared via assignment route, totalReferences should be reset to 0.""" + """When history is cleared, compat plugin totalReferences remains unchanged.""" from app.core.auth import UserContext from app.container import container from app.api.v1.assignments import update_item @@ -307,7 +314,13 @@ async def test_assignment_clear_history_resets_total_references(self): HistoryEntry(role="user", msg="Question"), HistoryEntry(role="assistant", msg="Answer"), ], - totalReferences=3, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"totalReferences": 3}, + } + }, _etag="test-etag", ) @@ -344,9 +357,9 @@ async def mock_upsert(item): if_match=None, ) - # Verify totalReferences was reset to 0 + # Verify compat totalReferences was preserved on the plugin payload assert saved_item is not None - assert saved_item.totalReferences == 0 + assert get_rag_compat_pack().reference_count(saved_item) == 3 finally: container.repo = original_repo diff --git a/backend/tests/unit/test_pii_detection.py b/backend/tests/unit/test_pii_detection.py index 58673b8..430e3d3 100644 --- a/backend/tests/unit/test_pii_detection.py +++ b/backend/tests/unit/test_pii_detection.py @@ -165,7 +165,7 @@ class TestGroundTruthItemScanning: """Tests for scanning GroundTruthItem fields.""" def test_scans_synth_question(self): - """Should detect PII in synthQuestion field.""" + """Should detect PII in canonical question field.""" item = make_test_entry( id="test-1", dataset_name="test-dataset", @@ -174,12 +174,12 @@ def test_scans_synth_question(self): warnings = scan_item_for_pii(item) # Should find PII in multiple representations (history, plugin data, computed fields) assert len(warnings) >= 1 - # Check that at least one warning is for synthQuestion - assert any(w.field == "synthQuestion" for w in warnings) + # Check that at least one warning is for canonical question text + assert any(w.field == "history.question" for w in warnings) assert 
any("email" in w.pattern_type for w in warnings) def test_scans_edited_question(self): - """Should detect PII in editedQuestion field.""" + """Should detect PII in edited question via canonical question field.""" item = make_test_entry( id="test-1", dataset_name="test-dataset", @@ -187,13 +187,12 @@ def test_scans_edited_question(self): edited_question="Contact support@company.org for assistance", ) warnings = scan_item_for_pii(item) - # Should find PII in multiple representations assert len(warnings) >= 1 - assert any(w.field == "editedQuestion" for w in warnings) + assert any(w.field == "history.question" for w in warnings) assert any("email" in w.pattern_type for w in warnings) def test_scans_answer(self): - """Should detect PII in answer field.""" + """Should detect PII in canonical answer field.""" item = make_test_entry( id="test-1", dataset_name="test-dataset", @@ -203,7 +202,7 @@ def test_scans_answer(self): warnings = scan_item_for_pii(item) # Should find PII in multiple representations assert len(warnings) >= 1 - assert any(w.field == "answer" for w in warnings) + assert any(w.field == "history.answer" for w in warnings) assert any("phone" in w.pattern_type for w in warnings) def test_scans_comment(self): @@ -231,9 +230,10 @@ def test_scans_history_messages(self): ], ) warnings = scan_item_for_pii(item) - assert len(warnings) == 2 + assert len(warnings) >= 2 # Check field names include index fields = {w.field for w in warnings} + assert "history.question" in fields assert "history[0].msg" in fields assert "history[2].msg" in fields @@ -360,14 +360,14 @@ def test_warning_model_serialization(self): """PIIWarning should serialize correctly.""" warning = PIIWarning( item_id="test-1", - field="synthQuestion", + field="history.question", pattern_type="email", snippet="...[u***@e***e.com]...", position=10, ) data = warning.model_dump() assert data["item_id"] == "test-1" - assert data["field"] == "synthQuestion" + assert data["field"] == "history.question" assert data["pattern_type"] == "email" assert data["snippet"] == "...[u***@e***e.com]..." 
assert data["position"] == 10 diff --git a/backend/tests/unit/test_rag_compat_approval.py b/backend/tests/unit/test_rag_compat_approval.py index 951f505..bc73cce 100644 --- a/backend/tests/unit/test_rag_compat_approval.py +++ b/backend/tests/unit/test_rag_compat_approval.py @@ -24,7 +24,7 @@ def _make_item(**overrides) -> AgenticGroundTruthEntry: defaults = { "id": "rag-test-1", "datasetName": "demo", - "synthQuestion": "What is X?", + "history": [{"role": "user", "msg": "What is X?"}], } defaults.update(overrides) return AgenticGroundTruthEntry.model_validate(defaults) @@ -39,10 +39,16 @@ def test_core_requires_assistant_message_even_with_refs(): """After waiver removal, core always generates the assistant error.""" item = _make_item( history=[{"role": "user", "msg": "hello"}], - totalReferences=5, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, ) errors = collect_approval_validation_errors(item) - assert "history must include at least one assistant message" in errors + assert "history must include at least one agent message" in errors def test_core_no_error_when_assistant_present(): @@ -65,18 +71,26 @@ def test_rag_pack_waives_assistant_error_when_refs_present(): pack = RagCompatPack() item = _make_item( history=[{"role": "user", "msg": "hello"}], - totalReferences=3, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, ) core_errors = collect_approval_validation_errors(item) waivers = pack.collect_approval_waivers(item, core_errors) - assert "history must include at least one assistant message" in waivers + assert "history must include at least one agent message" in waivers def test_rag_pack_no_waiver_when_refs_zero(): pack = RagCompatPack() item = _make_item( history=[{"role": "user", "msg": "hello"}], - totalReferences=0, + plugins={ + "rag-compat": {"kind": "rag-compat", "version": "1.0", "data": {"references": []}} + }, ) core_errors = collect_approval_validation_errors(item) waivers = pack.collect_approval_waivers(item, core_errors) @@ -88,7 +102,13 @@ def test_rag_pack_does_not_waive_user_message_error(): pack = RagCompatPack() item = _make_item( history=[{"role": "assistant", "msg": "answer"}], - totalReferences=5, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, ) core_errors = collect_approval_validation_errors(item) waivers = pack.collect_approval_waivers(item, core_errors) @@ -109,7 +129,13 @@ def test_rag_pack_waives_required_tools_error_when_refs_present(): {"role": "assistant", "msg": "world"}, ], toolCalls=[{"name": "search"}], - totalReferences=3, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, ) core_errors = collect_approval_validation_errors(item) waivers = pack.collect_approval_waivers(item, core_errors) @@ -124,7 +150,9 @@ def test_rag_pack_no_required_tools_waiver_when_refs_zero(): {"role": "assistant", "msg": "world"}, ], toolCalls=[{"name": "search"}], - totalReferences=0, + plugins={ + "rag-compat": {"kind": "rag-compat", "version": "1.0", "data": {"references": []}} + }, ) core_errors = collect_approval_validation_errors(item) waivers = pack.collect_approval_waivers(item, core_errors) @@ -145,13 +173,19 @@ def test_registry_filters_waived_errors(): item = _make_item( 
history=[{"role": "user", "msg": "hello"}], - totalReferences=3, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, ) core_errors = collect_approval_validation_errors(item) - assert "history must include at least one assistant message" in core_errors + assert "history must include at least one agent message" in core_errors filtered = registry.filter_core_errors(item, core_errors) - assert "history must include at least one assistant message" not in filtered + assert "history must include at least one agent message" not in filtered def test_registry_preserves_non_waived_errors(): @@ -161,7 +195,16 @@ def test_registry_preserves_non_waived_errors(): registry.register(RagCompatPack()) # Item with no history, no question, no answer → "no conversation message" error - item = _make_item(synthQuestion="", totalReferences=5) + item = _make_item( + history=[], + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, + ) core_errors = collect_approval_validation_errors(item) filtered = registry.filter_core_errors(item, core_errors) # "history must contain at least one conversation message" is NOT waived diff --git a/backend/tests/unit/test_rag_compat_pack.py b/backend/tests/unit/test_rag_compat_pack.py index 84259cc..22c2d8e 100644 --- a/backend/tests/unit/test_rag_compat_pack.py +++ b/backend/tests/unit/test_rag_compat_pack.py @@ -1,36 +1,20 @@ -"""Unit tests for RagCompatPack plugin contracts and migration helpers. - -Core-generic behavior stays covered elsewhere. This file focuses on: -- runtime-backed pack registration and registry presence -- stable helper contracts for retrieval/reference ownership -- compat-migration helpers that still project legacy payloads while the shim exists -""" +"""Unit tests for RagCompatPack plugin contracts and reference ownership.""" from __future__ import annotations import pytest from app.domain.models import AgenticGroundTruthEntry, Reference +from app.plugins.pack_registry import get_default_pack_registry, reset_default_pack_registry from app.plugins.packs.rag_compat import RagCompatPack, _RAG_COMPAT_KIND -from app.plugins.pack_registry import ( - get_default_pack_registry, - reset_default_pack_registry, -) - - -# --------------------------------------------------------------------------- -# validate_registration -# --------------------------------------------------------------------------- def test_validate_registration_passes(): - """RagCompatPack registers successfully when constants are in sync.""" pack = RagCompatPack() - pack.validate_registration() # should not raise + pack.validate_registration() def test_validate_registration_name_matches_host_model_constant(): - """The pack name must equal AgenticGroundTruthEntry._RAG_COMPAT_PLUGIN.""" from app.domain.models import AgenticGroundTruthEntry pack = RagCompatPack() @@ -38,26 +22,18 @@ def test_validate_registration_name_matches_host_model_constant(): def test_validate_registration_kind_constant_correct(): - """_RAG_COMPAT_KIND must match the host model constant.""" from app.domain.models import AgenticGroundTruthEntry assert _RAG_COMPAT_KIND == AgenticGroundTruthEntry._RAG_COMPAT_PLUGIN def test_validate_registration_fails_on_constant_mismatch(monkeypatch: pytest.MonkeyPatch): - """validate_registration() must raise ValueError if constants diverge.""" pack = RagCompatPack() - # Simulate a rename of the host-model constant 
monkeypatch.setattr(AgenticGroundTruthEntry, "_RAG_COMPAT_PLUGIN", "rag-v2") with pytest.raises(ValueError, match="does not match"): pack.validate_registration() -# --------------------------------------------------------------------------- -# Plugin-contract: approval hooks -# --------------------------------------------------------------------------- - - def _generic_item() -> AgenticGroundTruthEntry: return AgenticGroundTruthEntry( id="gen-001", @@ -74,74 +50,62 @@ def _rag_item() -> AgenticGroundTruthEntry: { "id": "rag-001", "datasetName": "rag-dataset", - "synthQuestion": "What is retrieval?", - "answer": "Retrieval is finding relevant docs.", - "refs": [{"url": "https://example.com/doc"}], + "history": [ + {"role": "user", "msg": "What is retrieval?"}, + {"role": "assistant", "msg": "Retrieval is finding relevant docs."}, + ], + "plugins": { + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": { + "references": [{"url": "https://example.com/doc"}], + }, + } + }, } ) -def test_collect_approval_errors_generic_item_empty(): +def test_collect_approval_errors_are_empty(): pack = RagCompatPack() - item = _generic_item() - assert pack.collect_approval_errors(item) == [] - - -def test_collect_approval_errors_rag_item_empty(): - """RAG items currently produce no additional pack-level errors.""" - pack = RagCompatPack() - item = _rag_item() - assert pack.collect_approval_errors(item) == [] - - -# --------------------------------------------------------------------------- -# Plugin-contract: helper accessors -# --------------------------------------------------------------------------- + assert pack.collect_approval_errors(_generic_item()) == [] + assert pack.collect_approval_errors(_rag_item()) == [] def test_rag_compat_data_empty_for_generic_item(): pack = RagCompatPack() - item = _generic_item() - assert pack.rag_compat_data(item) == {} - - -def test_rag_compat_data_populated_for_rag_item(): - pack = RagCompatPack() - item = _rag_item() - data = pack.rag_compat_data(item) - # The model_validator moves synthQuestion, answer, refs into rag-compat plugin data - assert data # non-empty + assert pack.rag_compat_data(_generic_item()) == {} -def test_rag_compat_data_contains_synth_question(): +def test_rag_compat_data_contains_only_references_for_owned_payload(): pack = RagCompatPack() - item = _rag_item() - data = pack.rag_compat_data(item) - assert "synthQuestion" in data - assert data["synthQuestion"] == "What is retrieval?" 
- - -# --------------------------------------------------------------------------- -# refs_from_item accessor -# --------------------------------------------------------------------------- + data = pack.rag_compat_data(_rag_item()) + assert list(data.keys()) == ["references"] def test_refs_from_item_empty_for_generic_item(): pack = RagCompatPack() - item = _generic_item() - assert pack.refs_from_item(item) == [] + assert pack.refs_from_item(_generic_item()) == [] -def test_refs_from_item_populated_for_rag_item(): +def test_refs_from_item_reads_owned_references(): pack = RagCompatPack() - item = _rag_item() - refs = pack.refs_from_item(item) + refs = pack.refs_from_item(_rag_item()) assert len(refs) == 1 assert isinstance(refs[0], Reference) assert refs[0].url == "https://example.com/doc" -def test_refs_from_item_flattens_per_call_retrieval_state(): +def test_get_search_documents_includes_stable_id(): + pack = RagCompatPack() + docs = pack.get_search_documents(_rag_item()) + assert len(docs) == 1 + assert docs[0]["id"] == "rag-001:ref:0" + assert docs[0]["url"] == "https://example.com/doc" + + +def test_refs_from_item_reads_legacy_retrieval_payloads(): pack = RagCompatPack() item = AgenticGroundTruthEntry.model_validate( { @@ -176,9 +140,44 @@ def test_refs_from_item_flattens_per_call_retrieval_state(): assert refs[0].messageIndex == 2 -# --------------------------------------------------------------------------- -# Plugin-contract: reference ownership helpers -# --------------------------------------------------------------------------- +def test_refs_from_item_respects_explicit_empty_canonical_references(): + pack = RagCompatPack() + item = AgenticGroundTruthEntry.model_validate( + { + "id": "rag-002b", + "datasetName": "rag-dataset", + "history": [ + {"role": "user", "msg": "Question"}, + ], + "toolCalls": [{"id": "tc-1", "name": "search", "callType": "tool", "stepNumber": 3}], + "plugins": { + "rag-compat": { + "kind": "rag-compat", + "data": { + "references": [], + "retrievals": { + "tc-1": { + "candidates": [ + { + "url": "https://example.com/stale", + "title": "Stale", + "chunk": "stale retrieval snippet", + } + ] + } + }, + }, + } + }, + } + ) + + refs = pack.refs_from_item(item) + assert refs == [] + assert pack.reference_count(item) == 0 + + core_errors = ["history must include at least one agent message"] + assert pack.collect_approval_waivers(item, core_errors) == [] def test_attach_reference_adds_to_rag_item(): @@ -187,43 +186,28 @@ def test_attach_reference_adds_to_rag_item(): initial_count = len(pack.refs_from_item(item)) new_ref = Reference(url="https://newdoc.example.com/page") result = pack.attach_reference(item, new_ref) - assert result is item # mutated in-place + assert result is item assert len(pack.refs_from_item(item)) == initial_count + 1 - urls = [r.url for r in pack.refs_from_item(item)] - assert "https://newdoc.example.com/page" in urls -def test_attach_reference_works_on_generic_item(): +def test_attach_reference_writes_owned_references_key(): pack = RagCompatPack() item = _generic_item() - new_ref = Reference(url="https://docs.example.com/a") - pack.attach_reference(item, new_ref) - # The ref is written to rag-compat plugin payload via the setter - refs = pack.refs_from_item(item) - assert len(refs) == 1 - assert refs[0].url == "https://docs.example.com/a" + pack.attach_reference(item, Reference(url="https://docs.example.com/a")) + assert item.plugins["rag-compat"].data == { + "references": [{"url": "https://docs.example.com/a", "bonus": False}] + } def 
test_detach_reference_removes_by_url(): pack = RagCompatPack() item = _rag_item() - target_url = "https://example.com/doc" - assert any(r.url == target_url for r in pack.refs_from_item(item)) - - result = pack.detach_reference(item, target_url) + result = pack.detach_reference(item, "https://example.com/doc") assert result is item - assert not any(r.url == target_url for r in pack.refs_from_item(item)) - - -def test_detach_reference_nonexistent_url_is_noop(): - pack = RagCompatPack() - item = _rag_item() - before = len(pack.refs_from_item(item)) - pack.detach_reference(item, "https://nonexistent.example.com") - assert len(pack.refs_from_item(item)) == before + assert pack.refs_from_item(item) == [] -def test_replace_references_clears_per_call_retrieval_state(): +def test_replace_references_clears_legacy_fields(): pack = RagCompatPack() item = AgenticGroundTruthEntry.model_validate( { @@ -233,7 +217,12 @@ def test_replace_references_clears_per_call_retrieval_state(): "rag-compat": { "kind": "rag-compat", "data": { - "retrievals": {"tc-1": {"candidates": [{"url": "https://example.com/old"}]}} + "refs": [{"url": "https://example.com/old"}], + "retrievals": { + "tc-1": {"candidates": [{"url": "https://example.com/legacy"}]} + }, + "totalReferences": 2, + "synthQuestion": "legacy", }, } }, @@ -242,50 +231,87 @@ def test_replace_references_clears_per_call_retrieval_state(): pack.replace_references(item, [Reference(url="https://example.com/new")]) - assert pack.has_per_call_state(item) is False - refs = pack.refs_from_item(item) - assert len(refs) == 1 - assert refs[0].url == "https://example.com/new" + assert item.plugins["rag-compat"].data == { + "references": [{"url": "https://example.com/new", "bonus": False}] + } -def test_export_transform_projects_retrieval_candidates_to_refs(): +def test_import_transform_normalizes_legacy_fields_to_history_and_references(): pack = RagCompatPack() - transform = pack.get_export_transforms()[0].transform + transform = pack.get_import_transforms()[0].transform - projected = transform( + normalized = transform( { - "id": "rag-004", + "id": "legacy-001", "datasetName": "rag-dataset", - "toolCalls": [{"id": "tc-1", "stepNumber": 1}], + "editedQuestion": "What is retrieval?", + "answer": "Retrieval finds relevant docs.", + "refs": [{"url": "https://example.com/doc"}], + } + ) + + assert normalized["history"] == [ + {"role": "user", "msg": "What is retrieval?"}, + {"role": "assistant", "msg": "Retrieval finds relevant docs."}, + ] + assert normalized["plugins"]["rag-compat"]["data"] == { + "references": [{"url": "https://example.com/doc", "bonus": False}] + } + + +def test_import_transform_preserves_explicit_empty_canonical_references(): + pack = RagCompatPack() + transform = pack.get_import_transforms()[0].transform + + normalized = transform( + { + "id": "legacy-002", + "datasetName": "rag-dataset", + "history": [{"role": "user", "msg": "Question only"}], "plugins": { "rag-compat": { "kind": "rag-compat", + "version": "1.0", "data": { + "references": [], "retrievals": { "tc-1": { "candidates": [ { - "url": "https://example.com/exported", - "title": "Exported", - "chunk": "retrieved chunk", + "url": "https://example.com/stale", + "chunk": "stale retrieval snippet", } ] } - } + }, }, } }, } ) - assert projected["totalReferences"] == 1 - assert projected["refs"][0]["url"] == "https://example.com/exported" - assert projected["refs"][0]["messageIndex"] == 1 + assert normalized["plugins"]["rag-compat"]["data"] == {"references": []} + +def 
test_export_transform_projects_references_and_count(): + pack = RagCompatPack() + transform = pack.get_export_transforms()[0].transform -# --------------------------------------------------------------------------- -# Runtime-backed registry seam -# --------------------------------------------------------------------------- + projected = transform( + { + "id": "rag-004", + "datasetName": "rag-dataset", + "plugins": { + "rag-compat": { + "kind": "rag-compat", + "data": {"references": [{"url": "https://example.com/exported"}]}, + } + }, + } + ) + + assert projected["totalReferences"] == 1 + assert projected["references"][0]["url"] == "https://example.com/exported" def test_default_pack_registry_contains_rag_compat(): @@ -301,7 +327,7 @@ def test_default_pack_registry_validates_without_error(): reset_default_pack_registry() try: registry = get_default_pack_registry() - registry.validate_all() # should not raise + registry.validate_all() finally: reset_default_pack_registry() diff --git a/backend/tests/unit/test_retrieval_per_call.py b/backend/tests/unit/test_retrieval_per_call.py index b410d6e..8891c57 100644 --- a/backend/tests/unit/test_retrieval_per_call.py +++ b/backend/tests/unit/test_retrieval_per_call.py @@ -1,183 +1,68 @@ -"""Tests for RagCompatPack per-tool-call retrieval state (Phase 6).""" +"""Unit tests for legacy retrieval compatibility normalization.""" from __future__ import annotations - -from app.domain.models import AgenticGroundTruthEntry +from app.domain.models import AgenticGroundTruthEntry, Reference from app.plugins.packs.rag_compat import RagCompatPack -def _make_item(**overrides) -> AgenticGroundTruthEntry: - """Create a minimal item with default fields.""" - base = { - "id": "test-item", - "datasetName": "ds", - "history": [ - {"role": "user", "msg": "hi"}, - {"role": "assistant", "msg": "hello"}, - ], - } - base.update(overrides) - return AgenticGroundTruthEntry.model_validate(base) - - -def _make_item_with_refs(**overrides) -> AgenticGroundTruthEntry: - """Create an item with top-level refs (legacy pattern).""" - return _make_item( - refs=[ - {"url": "https://a.com", "title": "A", "content": "chunk-a"}, - {"url": "https://b.com", "title": "B", "content": "chunk-b"}, - ], - **overrides, - ) - - -def _make_item_with_tool_calls(**overrides) -> AgenticGroundTruthEntry: - """Create an item with tool calls and top-level refs.""" - return _make_item( - refs=[ - {"url": "https://a.com", "title": "A", "content": "chunk-a", "messageIndex": 1}, - {"url": "https://b.com", "title": "B", "content": "chunk-b"}, - ], - toolCalls=[ - {"id": "tc-1", "name": "search", "callType": "tool", "stepNumber": 1}, - {"id": "tc-2", "name": "lookup", "callType": "tool", "stepNumber": 2}, - ], - **overrides, - ) - - -class TestPerCallRetrievalState: - """Per-tool-call retrieval management on RagCompatPack.""" - - def test_get_retrievals_empty_item(self): - pack = RagCompatPack() - item = _make_item() - assert pack.get_retrievals(item) == {} - - def test_set_and_get_retrieval_candidates(self): - pack = RagCompatPack() - item = _make_item() - candidates = [ - {"url": "https://a.com", "title": "A", "chunk": "text-a"}, - ] - pack.set_retrieval_candidates(item, "tc-1", candidates) - assert pack.get_retrieval_candidates(item, "tc-1") == candidates - - def test_get_retrieval_candidates_missing_tool_call(self): - pack = RagCompatPack() - item = _make_item() - assert pack.get_retrieval_candidates(item, "nonexistent") == [] - - def test_set_retrievals_replaces_all(self): +class 
TestLegacyRetrievalCompatibility: + def test_refs_from_item_flattens_legacy_retrieval_candidates(self): pack = RagCompatPack() - item = _make_item() - pack.set_retrieval_candidates(item, "tc-1", [{"url": "https://a.com"}]) - pack.set_retrievals( - item, + item = AgenticGroundTruthEntry.model_validate( { - "tc-2": {"candidates": [{"url": "https://b.com"}]}, - }, - ) - assert pack.get_retrieval_candidates(item, "tc-1") == [] - assert len(pack.get_retrieval_candidates(item, "tc-2")) == 1 - - def test_has_per_call_state_false_when_empty(self): - pack = RagCompatPack() - item = _make_item() - assert pack.has_per_call_state(item) is False - - def test_has_per_call_state_true_after_set(self): - pack = RagCompatPack() - item = _make_item() - pack.set_retrieval_candidates(item, "tc-1", [{"url": "https://a.com"}]) - assert pack.has_per_call_state(item) is True - - def test_get_all_candidates_flat_from_per_call(self): - pack = RagCompatPack() - item = _make_item() - pack.set_retrieval_candidates( - item, - "tc-1", - [ - {"url": "https://a.com", "title": "A"}, - ], + "id": "test-item", + "datasetName": "ds", + "history": [ + {"role": "user", "msg": "hi"}, + {"role": "assistant", "msg": "hello"}, + ], + "toolCalls": [ + {"id": "tc-1", "name": "search", "callType": "tool", "stepNumber": 1} + ], + "plugins": { + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": { + "retrievals": { + "tc-1": { + "candidates": [ + {"url": "https://a.com", "title": "A", "chunk": "chunk-a"}, + {"url": "https://b.com", "title": "B", "chunk": "chunk-b"}, + ] + } + } + }, + } + }, + } ) - pack.set_retrieval_candidates( - item, - "tc-2", - [ - {"url": "https://b.com", "title": "B"}, - ], - ) - flat = pack.get_all_candidates_flat(item) - assert len(flat) == 2 - urls = {c["url"] for c in flat} - assert urls == {"https://a.com", "https://b.com"} - def test_get_all_candidates_flat_falls_back_to_top_level_refs(self): - pack = RagCompatPack() - item = _make_item_with_refs() - flat = pack.get_all_candidates_flat(item) - assert len(flat) == 2 - assert flat[0]["url"] == "https://a.com" - assert flat[0]["chunk"] == "chunk-a" + refs = pack.refs_from_item(item) + assert [ref.url for ref in refs] == ["https://a.com", "https://b.com"] + assert [ref.messageIndex for ref in refs] == [1, 1] - def test_get_all_candidates_flat_includes_tool_call_id(self): + def test_replace_references_rewrites_payload_to_owned_references(self): pack = RagCompatPack() - item = _make_item() - pack.set_retrieval_candidates( - item, - "tc-1", - [ - {"url": "https://a.com"}, - ], + item = AgenticGroundTruthEntry.model_validate( + { + "id": "test-item", + "datasetName": "ds", + "plugins": { + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": { + "retrievals": {"tc-1": {"candidates": [{"url": "https://legacy.com"}]}} + }, + } + }, + } ) - flat = pack.get_all_candidates_flat(item) - assert flat[0]["toolCallId"] == "tc-1" - - -class TestMigrateRefsToPerCall: - """Tests for migrate_refs_to_per_call helper.""" - - def test_migrate_no_refs_returns_false(self): - pack = RagCompatPack() - item = _make_item() - assert pack.migrate_refs_to_per_call(item) is False - def test_migrate_already_migrated_returns_false(self): - pack = RagCompatPack() - item = _make_item() - pack.set_retrieval_candidates(item, "tc-1", [{"url": "https://a.com"}]) - # Even with refs present, per-call state exists → skip migration - assert pack.migrate_refs_to_per_call(item) is False + pack.replace_references(item, [Reference(url="https://normalized.com")]) - def 
test_migrate_top_level_refs_to_unassociated(self): - pack = RagCompatPack() - item = _make_item_with_refs() - assert pack.migrate_refs_to_per_call(item) is True - # All refs go to _unassociated since no tool calls - cands = pack.get_retrieval_candidates(item, "_unassociated") - assert len(cands) == 2 - assert cands[0]["url"] == "https://a.com" - - def test_migrate_refs_matched_to_tool_calls_by_step(self): - pack = RagCompatPack() - item = _make_item_with_tool_calls() - assert pack.migrate_refs_to_per_call(item) is True - - # Ref with messageIndex=1 matches tc-1 (stepNumber=1) - tc1_cands = pack.get_retrieval_candidates(item, "tc-1") - assert len(tc1_cands) == 1 - assert tc1_cands[0]["url"] == "https://a.com" - - # Ref without messageIndex goes to _unassociated - unassociated = pack.get_retrieval_candidates(item, "_unassociated") - assert len(unassociated) == 1 - assert unassociated[0]["url"] == "https://b.com" - - def test_migrate_idempotent(self): - pack = RagCompatPack() - item = _make_item_with_refs() - assert pack.migrate_refs_to_per_call(item) is True - assert pack.migrate_refs_to_per_call(item) is False + assert item.plugins["rag-compat"].data == { + "references": [{"url": "https://normalized.com", "bonus": False}] + } diff --git a/backend/tests/unit/test_snapshot_service.py b/backend/tests/unit/test_snapshot_service.py index 9a8cfcf..52d2c5c 100644 --- a/backend/tests/unit/test_snapshot_service.py +++ b/backend/tests/unit/test_snapshot_service.py @@ -100,9 +100,10 @@ def _make_item(id: str, dataset: str, status: GroundTruthStatus) -> AgenticGroun datasetName=dataset, bucket=None, status=status, - synthQuestion="Q?", - answer="A", - refs=[], + history=[ + {"role": "user", "msg": "Q?"}, + {"role": "assistant", "msg": "A"}, + ], manualTags=[], computedTags=[], ) diff --git a/backend/tests/unit/test_trace_export_adapter.py b/backend/tests/unit/test_trace_export_adapter.py index 2f0f8ba..3b193b0 100644 --- a/backend/tests/unit/test_trace_export_adapter.py +++ b/backend/tests/unit/test_trace_export_adapter.py @@ -49,9 +49,11 @@ def test_trace_export_adapter_maps_trace_into_agentic_ground_truth() -> None: assert item.id == "trace-trace-123" assert item.datasetName == "customer-feedback" assert item.scenario_id == "trace-export:trace-123" - assert item.synth_question == "CX IS USING TOO MUCH DATA AND WANTS TO KNOW WHY" - assert item.answer is not None - assert "Root Cause" in item.answer + assert item.history[0].role == "user" + assert item.history[0].msg == "CX IS USING TOO MUCH DATA AND WANTS TO KNOW WHY" + assert item.history[1].role == "orchestrator-agent" + assert "cellular data" in item.history[1].msg + assert any("Root Cause" in turn.msg for turn in item.history) assert item.comment == "CUSTOMER WAS ON CELLULAR DATA INSTEAD OF WIFI" assert item.trace_ids == { "traceId": "trace-123", diff --git a/backend/tests/unit/test_validation_required_tools.py b/backend/tests/unit/test_validation_required_tools.py index babde3e..1efabee 100644 --- a/backend/tests/unit/test_validation_required_tools.py +++ b/backend/tests/unit/test_validation_required_tools.py @@ -97,7 +97,13 @@ def test_rag_pack_waives_required_tools_for_retrieval_items(): item = _make_item( toolCalls=[{"name": "search"}], - totalReferences=3, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, ) core_errors = collect_approval_validation_errors(item) assert REQUIRED_TOOLS_ERROR in core_errors @@ -113,7 +119,13 @@ def 
test_rag_pack_does_not_waive_required_tools_without_refs(): item = _make_item( toolCalls=[{"name": "search"}], - totalReferences=0, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": []}, + } + }, ) core_errors = collect_approval_validation_errors(item) filtered = registry.filter_core_errors(item, core_errors) diff --git a/backend/tests/unit/test_validation_service.py b/backend/tests/unit/test_validation_service.py index 942eafd..c18d5f5 100644 --- a/backend/tests/unit/test_validation_service.py +++ b/backend/tests/unit/test_validation_service.py @@ -12,14 +12,63 @@ def test_approval_validation_accepts_legacy_question_answer_payload(): { "id": "item-1", "datasetName": "demo", - "synthQuestion": "What is Ground Truth Curator?", - "answer": "It is a curation application.", + "history": [ + {"role": "user", "msg": "What is Ground Truth Curator?"}, + {"role": "assistant", "msg": "It is a curation application."}, + ], } ) assert collect_approval_validation_errors(item) == [] +def test_approval_validation_accepts_agent_answer_role(): + item = AgenticGroundTruthEntry.model_validate( + { + "id": "item-agent", + "datasetName": "demo", + "history": [ + {"role": "user", "msg": "What is Ground Truth Curator?"}, + {"role": "agent", "msg": "It is a curation application."}, + ], + } + ) + + assert collect_approval_validation_errors(item) == [] + + +def test_approval_validation_accepts_custom_non_user_answer_role(): + item = AgenticGroundTruthEntry.model_validate( + { + "id": "item-planner", + "datasetName": "demo", + "history": [ + {"role": "user", "msg": "Plan the rollout."}, + {"role": "planner", "msg": "Step 1: scope. Step 2: validate."}, + ], + } + ) + + assert collect_approval_validation_errors(item) == [] + + +def test_approval_validation_rejects_all_user_history(): + item = AgenticGroundTruthEntry.model_validate( + { + "id": "item-all-user", + "datasetName": "demo", + "history": [ + {"role": "user", "msg": "Question one"}, + {"role": "user", "msg": "Question two"}, + ], + } + ) + + assert collect_approval_validation_errors(item) == [ + "history must include at least one agent message" + ] + + def test_approval_validation_requires_required_tool_when_tool_calls_exist(): item = AgenticGroundTruthEntry( id="item-2", diff --git a/frontend/src/adapters/apiMapper.ts b/frontend/src/adapters/apiMapper.ts index f285c90..40e985c 100644 --- a/frontend/src/adapters/apiMapper.ts +++ b/frontend/src/adapters/apiMapper.ts @@ -3,61 +3,52 @@ import { createConversationTurn, ensureConversationTurnIdentity, type GroundTruthItem, - getItemReferences, - getLastAgentTurn, - getLastUserTurn, type PluginPayload, - type Reference, type ToolCallRecord, withDerivedLegacyFields, } from "../models/groundTruth"; -import { urlToTitle } from "../models/utils"; - -const _RAG_COMPAT_KEY = "rag-compat"; -const _UNASSOCIATED_KEY = "_unassociated"; - -type RetrievalBucket = { - candidates: Array<{ - url: string; - title?: string; - chunk?: string; - relevance?: string; - toolCallId?: string; - messageIndex?: number; - turnId?: string; - keyParagraph?: string; - bonus?: boolean; - }>; -}; -type RetrievalsMap = Record; +import { sanitizeCompatPluginForPatch } from "./ragCompatBoundary"; + +const _REMOVED_COMPAT_PATCH_KEYS = [ + "synthQuestion", + "editedQuestion", + "answer", + "refs", + "totalReferences", + "retrievals", + "historyAnnotations", + "contextUsedForGeneration", + "contextSource", + "modelUsedForGeneration", + "semanticClusterNumber", + "weight", + "samplingBucket", + 
"questionLength", +]; type ConversationTurn = NonNullable[number]; +export type ApiReference = { + url: string; + title?: string | null; + content?: string | null; + keyExcerpt?: string | null; + type?: string | null; + bonus?: boolean; + messageIndex?: number | null; +}; export type ApiHistoryEntry = components["schemas"]["HistoryEntry"] & { - refs?: components["schemas"]["Reference"][]; + refs?: ApiReference[]; expectedBehavior?: string[]; turnId?: string; stepId?: string; }; -export type ApiGroundTruth = - components["schemas"]["AgenticGroundTruthEntry-Output"] & { - synthQuestion?: string | null; - editedQuestion?: string | null; - answer?: string | null; - refs?: components["schemas"]["Reference"][]; - totalReferences?: number; - tags?: string[]; - comment?: string | null; - } & Omit< - components["schemas"]["AgenticGroundTruthEntry-Output"], - "history" - > & { - history?: ApiHistoryEntry[]; - }; -export type ApiReference = components["schemas"]["Reference"]; - -type StoredTurnIdentity = { - turnId?: string; - stepId?: string; +export type ApiGroundTruth = Omit< + components["schemas"]["AgenticGroundTruthEntry-Output"], + "history" +> & { + tags?: string[]; + comment?: string | null; + history?: ApiHistoryEntry[]; }; function hasOwnField(value: object, field: PropertyKey): boolean { @@ -77,17 +68,6 @@ function normalizeToolCalls( })); } -function getStoredTurnIdentities( - plugins: Record, -): StoredTurnIdentity[] { - const turnIdentity = ( - plugins[_RAG_COMPAT_KEY]?.data as Record - )?.turnIdentity; - return Array.isArray(turnIdentity) - ? (turnIdentity as StoredTurnIdentity[]) - : []; -} - export function groundTruthFromApi( api: ApiGroundTruth, providerId = "api", @@ -96,130 +76,26 @@ export function groundTruthFromApi( api.plugins && Object.keys(api.plugins).length ? (api.plugins as Record) : {}; - const storedTurnIdentity = getStoredTurnIdentities(plugins); let history: GroundTruthItem["history"]; - const legacyRefs: Reference[] = []; - let refIndex = 0; - if (api.history && api.history.length > 0) { + if (Array.isArray(api.history)) { history = new Array(api.history.length); for (let idx = 0; idx < api.history.length; idx++) { const h = api.history[idx]; - // Preserve free-form roles; map "assistant" to "agent" for backward compat. - const role = h.role === "assistant" ? "agent" : h.role; - const identity = storedTurnIdentity[idx]; history[idx] = createConversationTurn({ - role, + role: h.role, content: h.msg, - turnId: h.turnId || identity?.turnId, - stepId: h.stepId || identity?.stepId, + turnId: h.turnId, + stepId: h.stepId, expectedBehavior: h.expectedBehavior && h.expectedBehavior.length > 0 ? (h.expectedBehavior as ConversationTurn["expectedBehavior"]) : undefined, }); - - if (h.refs && h.refs.length > 0) { - for (const r of h.refs) { - legacyRefs.push({ - id: `ref_${refIndex++}`, - title: r.title || (r.url ? urlToTitle(r.url) : undefined), - url: r.url, - snippet: r.content ?? undefined, - keyParagraph: r.keyExcerpt ?? 
undefined, - visitedAt: null, - bonus: r.bonus === true, - messageIndex: idx, - turnId: history[idx]?.turnId, - }); - } - } - } - } else { - // Legacy single-turn item: create initial history from synthQuestion/editedQuestion - const initialQuestion = api.editedQuestion || api.synthQuestion || ""; - if (initialQuestion) { - history = [ - createConversationTurn({ - role: "user", - content: initialQuestion, - turnId: storedTurnIdentity[0]?.turnId, - stepId: storedTurnIdentity[0]?.stepId, - }), - createConversationTurn({ - role: "agent", - content: api.answer || "", - turnId: storedTurnIdentity[1]?.turnId, - stepId: storedTurnIdentity[1]?.stepId, - }), - ]; - } - } - - // Process top-level refs (backward compatibility) - if (api.refs && api.refs.length > 0) { - const wasLegacyConversion = !api.history || api.history.length === 0; - const messageIndex = wasLegacyConversion ? 1 : undefined; - const turnId = - typeof messageIndex === "number" - ? history?.[messageIndex]?.turnId - : undefined; - - for (const r of api.refs) { - legacyRefs.push({ - id: `ref_${refIndex++}`, - title: r.title || (r.url ? urlToTitle(r.url) : undefined), - url: r.url, - snippet: r.content ?? undefined, - keyParagraph: r.keyExcerpt ?? undefined, - visitedAt: null, - bonus: r.bonus === true, - messageIndex, - turnId, - }); } } - // Read per-call retrieval state from plugin data if it already exists - const existingRetrievals = ( - plugins[_RAG_COMPAT_KEY]?.data as Record | undefined - )?.retrievals; - const hasPerCallState = - existingRetrievals && - typeof existingRetrievals === "object" && - !Array.isArray(existingRetrievals) && - Object.keys(existingRetrievals as Record).length > 0; - - // When no per-call state exists but legacy refs were extracted, migrate them - if (!hasPerCallState && legacyRefs.length > 0) { - const retrievals: RetrievalsMap = {}; - for (const ref of legacyRefs) { - const key = ref.toolCallId || _UNASSOCIATED_KEY; - if (!retrievals[key]) { - retrievals[key] = { candidates: [] }; - } - retrievals[key].candidates.push({ - url: ref.url, - title: ref.title, - chunk: ref.snippet, - relevance: undefined, - toolCallId: ref.toolCallId, - messageIndex: ref.turnId ? undefined : ref.messageIndex, - turnId: ref.turnId, - keyParagraph: ref.keyParagraph, - bonus: ref.bonus, - }); - } - - const existingPlugin = plugins[_RAG_COMPAT_KEY]; - plugins[_RAG_COMPAT_KEY] = { - kind: _RAG_COMPAT_KEY, - version: existingPlugin?.version || "1.0", - data: { ...(existingPlugin?.data || {}), retrievals }, - }; - } - const deleted = api.status === "deleted"; return withDerivedLegacyFields({ @@ -235,7 +111,6 @@ export function groundTruthFromApi( manualTags: api.manualTags || [], computedTags: api.computedTags || [], reviewedAt: api.reviewedAt ?? 
null, - totalReferences: api.totalReferences, // Generic schema fields — passed through from the API scenarioId: api.scenarioId || undefined, contextEntries: @@ -265,87 +140,25 @@ export function groundTruthFromApi( export function groundTruthToPatch(args: { item: GroundTruthItem; - originalApi?: ApiGroundTruth; }): Partial { - const { originalApi } = args; const item = withDerivedLegacyFields(args.item); const history = ensureConversationTurnIdentity(item.history); - // Extract references from per-call plugin state - const references = getItemReferences(item); - - const hadLegacyTopLevelRefs = - !!originalApi && - (!originalApi.history || originalApi.history.length === 0) && - (originalApi.refs?.length || 0) > 0; - - let topLevelRefs: ApiReference[] = []; - if (hadLegacyTopLevelRefs) { - const legacyAgentTurnId = history[1]?.turnId; - topLevelRefs = references - .filter( - (r) => - r.turnId === legacyAgentTurnId || - r.messageIndex === 1 || - r.messageIndex === undefined, - ) - .map((r) => ({ - url: r.url, - title: r.title || undefined, - keyExcerpt: r.keyParagraph || undefined, - content: r.snippet || undefined, - bonus: !!r.bonus, - })); - } else { - topLevelRefs = references - .filter((r) => r.messageIndex === undefined) - .map((r) => ({ - url: r.url, - title: r.title || undefined, - keyExcerpt: r.keyParagraph || undefined, - content: r.snippet || undefined, - bonus: !!r.bonus, - })); - } - const body: Partial = { status: (item.deleted ? "deleted" : item.status) as components["schemas"]["GroundTruthStatus"], - answer: getLastAgentTurn(item), - editedQuestion: getLastUserTurn(item), - refs: topLevelRefs, manualTags: item.manualTags || [], }; if (history.length > 0) { - body.history = history.map((turn, idx) => { - let turnRefs: ApiReference[] | undefined; - if (turn.role !== "user") { - const refsForTurn = references.filter( - (r) => r.turnId === turn.turnId || r.messageIndex === idx, - ); - if (refsForTurn.length > 0) { - turnRefs = refsForTurn.map((r) => ({ - url: r.url, - title: r.title || undefined, - content: r.snippet || undefined, - keyExcerpt: r.keyParagraph || undefined, - bonus: !!r.bonus, - })); - } - } - - // Map "agent" back to "assistant" for backward compat; preserve other free-form roles. - const apiRole = turn.role === "agent" ? "assistant" : turn.role; - + body.history = history.map((turn) => { return { - role: apiRole, + role: turn.role, msg: turn.content, turnId: turn.turnId, stepId: turn.stepId, expectedBehavior: turn.expectedBehavior || undefined, - ...(turnRefs ? { refs: turnRefs } : {}), }; }); } @@ -373,22 +186,19 @@ export function groundTruthToPatch(args: { if (item.metadata && Object.keys(item.metadata).length) { (body as Record).metadata = item.metadata; } - const plugins = { ...(item.plugins || {}) }; - const existingCompat = plugins[_RAG_COMPAT_KEY]; - if (history.length > 0) { - plugins[_RAG_COMPAT_KEY] = { - kind: _RAG_COMPAT_KEY, - version: existingCompat?.version || "1.0", - data: { - ...(existingCompat?.data || {}), - turnIdentity: history.map((turn) => ({ - turnId: turn.turnId, - stepId: turn.stepId, - })), - }, - }; - } - if (Object.keys(plugins).length) { + const plugins = sanitizeCompatPluginForPatch({ + plugins: item.plugins, + removedKeys: _REMOVED_COMPAT_PATCH_KEYS, + historyTurnIds: history.map((turn) => turn.turnId), + indexByTurnId: new Map( + history + .map((turn, index) => + turn.turnId ? 
([turn.turnId, index] as const) : null,
+        )
+        .filter((entry): entry is readonly [string, number] => entry !== null),
+    ),
+  });
+  if (plugins && Object.keys(plugins).length) {
     (body as Record<string, unknown>).plugins = plugins;
   }
   if (item.traceIds) {
diff --git a/frontend/src/adapters/apiProvider.ts b/frontend/src/adapters/apiProvider.ts
index 466d7de..32098fe 100644
--- a/frontend/src/adapters/apiProvider.ts
+++ b/frontend/src/adapters/apiProvider.ts
@@ -85,7 +85,7 @@ export class ApiProvider implements Provider {
       const fresh = await getGroundTruthRaw(dataset, bucket, item.id);
       updatedApi = fresh;
     } else {
-      const patch = groundTruthToPatch({ item, originalApi: e.api });
+      const patch = groundTruthToPatch({ item });
       const doUpdate = async (nextEtag?: string | null) =>
         updateAssignedGroundTruth(
           dataset,
diff --git a/frontend/src/adapters/ragCompatBoundary.ts b/frontend/src/adapters/ragCompatBoundary.ts
new file mode 100644
index 0000000..4c678f7
--- /dev/null
+++ b/frontend/src/adapters/ragCompatBoundary.ts
@@ -0,0 +1,103 @@
+import {
+  type CompatPluginsMap,
+  collectCanonicalReferencesFromCompatPlugins,
+  getCompatReferencesFromData,
+  getCompatRetrievalsFromData,
+  retrievalsToCanonicalReferences,
+  writeCompatPluginEnvelope,
+} from "../models/ragCompatPayload";
+
+const _RAG_COMPAT_KEY = "rag-compat";
+
+type ReferenceLike = {
+  id: string;
+  title?: string;
+  url: string;
+  snippet?: string;
+  visitedAt?: string | null;
+  keyParagraph?: string;
+  bonus?: boolean;
+  messageIndex?: number;
+  turnId?: string;
+  toolCallId?: string;
+};
+
+export function collectReferencesFromCompat(args: {
+  plugins: CompatPluginsMap | undefined;
+  historyTurnIds: Array<string | undefined>;
+  indexByTurnId: Map<string, number>;
+}): ReferenceLike[] {
+  const { plugins, historyTurnIds, indexByTurnId } = args;
+  return collectCanonicalReferencesFromCompatPlugins({
+    plugins,
+    historyTurnIds,
+    indexByTurnId,
+  });
+}
+
+export function withCompatReferences(args: {
+  plugins: CompatPluginsMap | undefined;
+  refs: ReferenceLike[];
+}): CompatPluginsMap {
+  const { plugins, refs } = args;
+  return writeCompatPluginEnvelope({ plugins, refs });
+}
+
+export function sanitizeCompatData(
+  data: unknown,
+  removedKeys: readonly string[],
+  historyTurnIds: Array<string | undefined> = [],
+  indexByTurnId: Map<string, number> = new Map(),
+): Record<string, unknown> {
+  if (!data || typeof data !== "object" || Array.isArray(data)) {
+    return {};
+  }
+  const sanitized = { ...(data as Record<string, unknown>) };
+  const canonicalRefs = getCompatReferencesFromData(sanitized);
+  if (!canonicalRefs) {
+    const retrievals = getCompatRetrievalsFromData(sanitized);
+    if (retrievals) {
+      const materialized = retrievalsToCanonicalReferences({
+        retrievals,
+        historyTurnIds,
+        indexByTurnId,
+      });
+      if (materialized.length > 0) {
+        sanitized.references = materialized;
+      }
+    }
+  }
+  for (const key of removedKeys) {
+    delete sanitized[key];
+  }
+  delete sanitized.retrievals;
+  return sanitized;
+}
+
+export function sanitizeCompatPluginForPatch(args: {
+  plugins: CompatPluginsMap | undefined;
+  removedKeys: readonly string[];
+  historyTurnIds?: Array<string | undefined>;
+  indexByTurnId?: Map<string, number>;
+}): CompatPluginsMap | undefined {
+  const { plugins, removedKeys, historyTurnIds, indexByTurnId } = args;
+  if (!plugins) {
+    return undefined;
+  }
+  const nextPlugins = { ...plugins };
+  const existingCompat = nextPlugins[_RAG_COMPAT_KEY];
+  if (!existingCompat) {
+    return nextPlugins;
+  }
+  nextPlugins[_RAG_COMPAT_KEY] = {
+    kind: _RAG_COMPAT_KEY,
+    version: existingCompat.version || "1.0",
+    data: sanitizeCompatData(
+      existingCompat.data,
+      removedKeys,
+      historyTurnIds,
+      indexByTurnId,
+    ),
+  };
+  return nextPlugins;
+}
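
A minimal usage sketch for the boundary helper above. The plugin payload is hypothetical; sanitizeCompatPluginForPatch, its argument names, and the resulting envelope shape come from ragCompatBoundary.ts as added in this diff:

import { sanitizeCompatPluginForPatch } from "./ragCompatBoundary";

// A legacy rag-compat envelope still carrying a per-call retrievals map
// plus one of the removed top-level projection keys ("answer").
const plugins = sanitizeCompatPluginForPatch({
  plugins: {
    "rag-compat": {
      kind: "rag-compat",
      version: "1.0",
      data: {
        answer: "legacy projection",
        retrievals: { "tc-1": { candidates: [{ url: "https://a.example" }] } },
      },
    },
  },
  removedKeys: ["answer"],
  historyTurnIds: ["turn-0", "turn-1"],
  indexByTurnId: new Map([
    ["turn-0", 0],
    ["turn-1", 1],
  ]),
});
// plugins["rag-compat"].data now holds
//   { references: [{ url: "https://a.example", toolCallId: "tc-1", bonus: false, visitedAt: null, ... }] }
// — the retrievals map was materialized into canonical references and the
// removed key deleted, so the PATCH body never re-sends legacy state.
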
diff --git a/frontend/src/api/generated.ts b/frontend/src/api/generated.ts
index e2a83f0..2efc7da 100644
--- a/frontend/src/api/generated.ts
+++ b/frontend/src/api/generated.ts
@@ -689,16 +689,6 @@ export interface components {
     };
     /** Tags */
     readonly tags: string[];
-    /** Synthquestion */
-    readonly synthQuestion: string | null;
-    /** Editedquestion */
-    readonly editedQuestion: string | null;
-    /** Answer */
-    readonly answer: string | null;
-    /** Refs */
-    readonly refs: components["schemas"]["Reference"][];
-    /** Totalreferences */
-    readonly totalReferences: number;
   };
   /**
    * AssignItemRequest
@@ -1056,7 +1046,7 @@ export interface components {
     item_id: string;
     /**
      * Field
-     * @description Field name where PII was detected (e.g., 'synthQuestion', 'history[2].msg')
+     * @description Field name where PII was detected (e.g., 'history.question', 'history[2].msg')
      */
     field: string;
     /**
@@ -1163,35 +1153,6 @@
      */
     duration_ms: number;
   };
-  /**
-   * Reference
-   * @description Legacy RAG reference object retained for compatibility helpers and tests.
-   */
-  Reference: {
-    /**
-     * Url
-     * @description Reference URL (required, non-empty)
-     */
-    url: string;
-    /**
-     * Title
-     * @description Human-readable title for the reference
-     */
-    title?: string | null;
-    /** Content */
-    content?: string | null;
-    /** Keyexcerpt */
-    keyExcerpt?: string | null;
-    /** Type */
-    type?: string | null;
-    /**
-     * Bonus
-     * @default false
-     */
-    bonus: boolean;
-    /** Messageindex */
-    messageIndex?: number | null;
-  };
   /** RemoveTagsRequest */
   RemoveTagsRequest: {
     /** Tags */
@@ -1219,7 +1180,7 @@
    * SortField
    * @enum {string}
    */
-  SortField: "reviewedAt" | "updatedAt" | "id" | "hasAnswer" | "totalReferences" | "tagCount";
+  SortField: "reviewedAt" | "updatedAt" | "id" | "hasAnswer" | "tagCount";
   /**
    * SortOrder
    * @enum {string}
    */
@@ -1450,11 +1411,13 @@ export interface operations {
       excludeTags?: string | null;
       /** @description Search for items by ID (case-sensitive partial match) */
       itemId?: string | null;
-      /** @description Search for items by reference URL (case-sensitive partial match) */
-      refUrl?: string | null;
+      /** @description Plugin-namespaced filters in key=value form (repeat query param). Example: pluginFilter=rag-compat:refUrl=https://example.com */
+      pluginFilter?: string[] | null;
       /** @description Search for items by keyword (case-insensitive text search across questions, answers, and history) */
       keyword?: string | null;
       sortBy?: components["schemas"]["SortField"];
+      /** @description Plugin-namespaced sort key, e.g. rag-compat:totalReferences */
+      pluginSort?: string | null;
       sortOrder?: components["schemas"]["SortOrder"];
       page?: number;
       limit?: number;
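
For orientation, a sketch of the query shape a client sends with the two new plugin-namespaced parameters (parameter names from the operation types above; the concrete values are illustrative — QuestionsExplorer below builds exactly this pattern):

const query = {
  // Repeatable key=value filters, namespaced by plugin kind:
  pluginFilter: ["rag-compat:refUrl=https://example.com"],
  // Plugin-namespaced sort key replaces the removed core "totalReferences" SortField:
  pluginSort: "rag-compat:totalReferences",
  sortOrder: "desc" as const,
  page: 1,
  limit: 25,
};
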
diff --git a/frontend/src/api/openapi.json b/frontend/src/api/openapi.json
index 6135025..b4cb427 100644
--- a/frontend/src/api/openapi.json
+++ b/frontend/src/api/openapi.json
@@ -230,22 +230,25 @@
         "description": "Search for items by ID (case-sensitive partial match)"
       },
       {
-        "name": "refUrl",
+        "name": "pluginFilter",
         "in": "query",
         "required": false,
         "schema": {
           "anyOf": [
             {
-              "type": "string"
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
             },
             {
               "type": "null"
             }
           ],
-          "description": "Search for items by reference URL (case-sensitive partial match)",
-          "title": "Refurl"
+          "description": "Plugin-namespaced filters in key=value form (repeat query param). Example: pluginFilter=rag-compat:refUrl=https://example.com",
+          "title": "Pluginfilter"
         },
-        "description": "Search for items by reference URL (case-sensitive partial match)"
+        "description": "Plugin-namespaced filters in key=value form (repeat query param). Example: pluginFilter=rag-compat:refUrl=https://example.com"
       },
       {
         "name": "keyword",
@@ -274,6 +277,24 @@
           "default": "reviewedAt"
         }
       },
+      {
+        "name": "pluginSort",
+        "in": "query",
+        "required": false,
+        "schema": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "description": "Plugin-namespaced sort key, e.g. rag-compat:totalReferences",
+          "title": "Pluginsort"
+        },
+        "description": "Plugin-namespaced sort key, e.g. rag-compat:totalReferences"
+      },
       {
         "name": "sortOrder",
         "in": "query",
@@ -1983,69 +2004,11 @@
           "type": "array",
           "title": "Tags",
           "readOnly": true
-        },
-        "synthQuestion": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Synthquestion",
-          "readOnly": true
-        },
-        "editedQuestion": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Editedquestion",
-          "readOnly": true
-        },
-        "answer": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Answer",
-          "readOnly": true
-        },
-        "refs": {
-          "items": {
-            "$ref": "#/components/schemas/Reference"
-          },
-          "type": "array",
-          "title": "Refs",
-          "readOnly": true
-        },
-        "totalReferences": {
-          "type": "integer",
-          "title": "Totalreferences",
-          "readOnly": true
         }
       },
       "additionalProperties": false,
      "type": "object",
-      "required": [
-        "id",
-        "datasetName",
-        "tags",
-        "synthQuestion",
-        "editedQuestion",
-        "answer",
-        "refs",
-        "totalReferences"
-      ],
+      "required": ["id", "datasetName", "tags"],
      "title": "AgenticGroundTruthEntry",
      "description": "Generic agentic-first host model.\n\nThe core contract intentionally exposes only the generic schema in OpenAPI. Legacy\nRAG-shaped payloads are translated into this shape when validating this base class so\nexisting data can be carried forward without remaining top-level contract fields."
    },
@@ -2899,7 +2862,7 @@
      "field": {
        "type": "string",
        "title": "Field",
-        "description": "Field name where PII was detected (e.g., 'synthQuestion', 'history[2].msg')"
+        "description": "Field name where PII was detected (e.g., 'history.question', 'history[2].msg')"
      },
      "pattern_type": {
        "type": "string",
@@ -3041,80 +3004,6 @@
      "title": "RecomputeTagsResponse",
      "description": "Response for bulk computed tag recomputation."
    },
-    "Reference": {
-      "properties": {
-        "url": {
-          "type": "string",
-          "title": "Url",
-          "description": "Reference URL (required, non-empty)"
-        },
-        "title": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Title",
-          "description": "Human-readable title for the reference"
-        },
-        "content": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Content"
-        },
-        "keyExcerpt": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Keyexcerpt"
-        },
-        "type": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Type"
-        },
-        "bonus": {
-          "type": "boolean",
-          "title": "Bonus",
-          "default": false
-        },
-        "messageIndex": {
-          "anyOf": [
-            {
-              "type": "integer"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Messageindex"
-        }
-      },
-      "type": "object",
-      "required": ["url"],
-      "title": "Reference",
-      "description": "Legacy RAG reference object retained for compatibility helpers and tests."
- }, "RemoveTagsRequest": { "properties": { "tags": { @@ -3206,14 +3095,7 @@ }, "SortField": { "type": "string", - "enum": [ - "reviewedAt", - "updatedAt", - "id", - "hasAnswer", - "totalReferences", - "tagCount" - ], + "enum": ["reviewedAt", "updatedAt", "id", "hasAnswer", "tagCount"], "title": "SortField" }, "SortOrder": { diff --git a/frontend/src/components/app/QuestionsExplorer.example.tsx b/frontend/src/components/app/QuestionsExplorer.example.tsx index 0ca7862..ceab33a 100644 --- a/frontend/src/components/app/QuestionsExplorer.example.tsx +++ b/frontend/src/components/app/QuestionsExplorer.example.tsx @@ -9,608 +9,82 @@ import QuestionsExplorer, { type QuestionsExplorerItem, } from "./QuestionsExplorer"; -// Sample data - now with 50 items to demonstrate pagination and dataset filtering -const sampleItems: QuestionsExplorerItem[] = [ - { - id: "gt-001", - question: "What is the capital of France?", - answer: "Paris", - status: "approved", - providerId: "json", - views: 150, - reuses: 12, - datasetName: "geography", - tags: ["beginner", "popular"], - reviewedAt: "2025-09-15T10:30:00Z", - }, - { - id: "gt-002", - question: "How does photosynthesis work?", - answer: "Photosynthesis converts light energy into chemical energy...", - status: "draft", - providerId: "json", - views: 45, - reuses: 3, - datasetName: "biology", - tags: ["science", "beginner"], - reviewedAt: "2025-09-20T14:20:00Z", - }, - { - id: "gt-003", - question: "What is quantum computing?", - answer: "Quantum computing uses quantum-mechanical phenomena...", - status: "approved", - providerId: "json", - views: 230, - reuses: 28, - datasetName: "technology", - tags: ["advanced", "technical", "popular"], - reviewedAt: "2025-09-10T08:15:00Z", - }, - { - id: "gt-004", - question: "Explain machine learning basics", - answer: "Machine learning is a subset of artificial intelligence...", - status: "deleted", - providerId: "json", - views: 89, - reuses: 7, - datasetName: "technology", - tags: ["technical", "AI"], - reviewedAt: "2025-08-25T16:45:00Z", - }, - { - id: "gt-005", - question: "What are the benefits of exercise?", - answer: "Regular exercise improves cardiovascular health...", - status: "approved", - providerId: "json", - views: 320, - reuses: 41, - datasetName: "health", - tags: ["beginner", "wellness"], - reviewedAt: "2025-09-22T11:00:00Z", - }, - { - id: "gt-006", - question: "How do neural networks work?", - answer: - "Neural networks are computing systems inspired by biological neural networks...", - status: "draft", - providerId: "json", - views: 198, - reuses: 15, - datasetName: "technology", - tags: ["advanced", "technical", "AI"], - reviewedAt: "2025-09-18T09:30:00Z", - }, - { - id: "gt-007", - question: "What is blockchain technology?", - answer: "Blockchain is a distributed ledger technology...", - status: "approved", - providerId: "json", - views: 412, - reuses: 52, - datasetName: "technology", - tags: ["popular", "technical"], - reviewedAt: "2025-09-12T14:22:00Z", - }, - { - id: "gt-008", - question: "Explain the water cycle", - answer: "The water cycle describes the continuous movement of water...", - status: "approved", - providerId: "json", - views: 276, - reuses: 34, - datasetName: "science", - tags: ["beginner", "science"], - reviewedAt: "2025-09-05T10:10:00Z", - }, - { - id: "gt-009", - question: "What causes climate change?", - answer: "Climate change is primarily caused by greenhouse gas emissions...", - status: "draft", - providerId: "json", - views: 523, - reuses: 67, - datasetName: "science", - 
tags: ["science", "popular", "environmental"], - reviewedAt: "2025-09-28T15:45:00Z", - }, - { - id: "gt-010", - question: "How does DNA replication work?", - answer: "DNA replication is the process of copying DNA molecules...", - status: "approved", - providerId: "json", - views: 145, - reuses: 18, - datasetName: "biology", - tags: ["science", "advanced"], - reviewedAt: "2025-09-08T13:20:00Z", - }, - { - id: "gt-011", - question: "What is the theory of relativity?", - answer: - "Einstein's theory of relativity revolutionized our understanding of space and time...", - status: "approved", - providerId: "json", - views: 387, - reuses: 45, - datasetName: "physics", - tags: ["advanced", "science", "popular"], - reviewedAt: "2025-09-14T07:30:00Z", - }, - { - id: "gt-012", - question: "How do vaccines work?", - answer: - "Vaccines work by training the immune system to recognize pathogens...", - status: "draft", - providerId: "json", - views: 612, - reuses: 78, - datasetName: "health", - reviewedAt: "2025-09-25T16:15:00Z", - }, - { - id: "gt-013", - question: "What is cloud computing?", - answer: "Cloud computing delivers computing services over the internet...", - status: "deleted", - providerId: "json", - views: 298, - reuses: 39, - datasetName: "technology", - reviewedAt: "2025-08-30T12:00:00Z", - }, - { - id: "gt-014", - question: "Explain Newton's laws of motion", - answer: - "Newton's three laws describe the relationship between objects and forces...", - status: "approved", - providerId: "json", - views: 165, - reuses: 21, - datasetName: "physics", - reviewedAt: "2025-09-03T09:45:00Z", - }, - { - id: "gt-015", - question: "What is artificial intelligence?", - answer: "AI is the simulation of human intelligence by machines...", - status: "draft", - providerId: "json", - views: 734, - reuses: 91, - datasetName: "technology", - reviewedAt: "2025-09-29T18:00:00Z", - }, - { - id: "gt-016", - question: "How do black holes form?", - answer: - "Black holes form when massive stars collapse at the end of their life cycle...", - status: "approved", - providerId: "json", - views: 445, - reuses: 56, - datasetName: "physics", - reviewedAt: "2025-09-11T11:20:00Z", - }, - { - id: "gt-017", - question: "What is cryptocurrency?", - answer: "Cryptocurrency is a digital currency secured by cryptography...", - status: "draft", - providerId: "json", - views: 521, - reuses: 64, - datasetName: "technology", - reviewedAt: "2025-09-27T10:30:00Z", - }, - { - id: "gt-018", - question: "Explain the concept of entropy", - answer: "Entropy is a measure of disorder or randomness in a system...", - status: "approved", - providerId: "json", - views: 187, - reuses: 23, - datasetName: "physics", - reviewedAt: "2025-09-06T14:50:00Z", - }, - { - id: "gt-019", - question: "What are stem cells?", - answer: - "Stem cells are undifferentiated cells capable of developing into various cell types...", - status: "draft", - providerId: "json", - views: 354, - reuses: 42, - datasetName: "biology", - reviewedAt: "2025-09-19T08:15:00Z", - }, - { - id: "gt-020", - question: "How does the internet work?", - answer: "The internet is a global network of interconnected computers...", - status: "approved", - providerId: "json", - views: 289, - reuses: 37, - datasetName: "technology", - reviewedAt: "2025-09-13T16:40:00Z", - }, - { - id: "gt-021", - question: "What is natural selection?", - answer: - "Natural selection is the process by which organisms better adapted survive...", - status: "approved", - providerId: "json", - views: 423, - reuses: 
54, - datasetName: "biology", - reviewedAt: "2025-09-16T12:30:00Z", - }, - { - id: "gt-022", - question: "Explain quantum entanglement", - answer: - "Quantum entanglement is a phenomenon where particles remain connected...", - status: "draft", - providerId: "json", - views: 267, - reuses: 31, - datasetName: "physics", - reviewedAt: "2025-09-21T09:00:00Z", - }, - { - id: "gt-023", - question: "What causes earthquakes?", - answer: - "Earthquakes occur when energy is released from tectonic plate movements...", - status: "deleted", - providerId: "json", - views: 198, - reuses: 25, - datasetName: "science", - reviewedAt: "2025-08-28T15:20:00Z", - }, - { - id: "gt-024", - question: "How do solar panels work?", - answer: - "Solar panels convert sunlight into electricity using photovoltaic cells...", - status: "approved", - providerId: "json", - views: 512, - reuses: 66, - datasetName: "technology", - reviewedAt: "2025-09-17T13:45:00Z", - }, - { - id: "gt-025", - question: "What is gene editing?", - answer: "Gene editing allows scientists to modify DNA sequences...", - status: "draft", - providerId: "json", - views: 389, - reuses: 48, - datasetName: "biology", - reviewedAt: "2025-09-24T11:10:00Z", - }, - { - id: "gt-026", - question: "Explain the greenhouse effect", - answer: - "The greenhouse effect is the warming of Earth's surface and atmosphere...", - status: "approved", - providerId: "json", - views: 456, - reuses: 58, - datasetName: "science", - reviewedAt: "2025-09-09T10:25:00Z", - }, - { - id: "gt-027", - question: "What is machine vision?", - answer: - "Machine vision enables computers to interpret visual information...", - status: "draft", - providerId: "json", - views: 312, - reuses: 38, - datasetName: "technology", - reviewedAt: "2025-09-23T14:00:00Z", - }, - { - id: "gt-028", - question: "How does GPS work?", - answer: "GPS uses satellites to determine precise location on Earth...", - status: "approved", - providerId: "json", - views: 234, - reuses: 29, - datasetName: "technology", - reviewedAt: "2025-09-07T08:40:00Z", - }, - { - id: "gt-029", - question: "What are exoplanets?", - answer: - "Exoplanets are planets that orbit stars outside our solar system...", - status: "approved", - providerId: "json", - views: 378, - reuses: 47, - datasetName: "physics", - reviewedAt: "2025-09-14T16:55:00Z", - }, - { - id: "gt-030", - question: "Explain nuclear fusion", - answer: "Nuclear fusion is the process that powers the sun...", - status: "draft", - providerId: "json", - views: 521, - reuses: 63, - datasetName: "physics", - reviewedAt: "2025-09-26T09:30:00Z", - }, - { - id: "gt-031", - question: "What is nanotechnology?", - answer: - "Nanotechnology involves manipulating matter at the atomic scale...", - status: "approved", - providerId: "json", - views: 267, - reuses: 33, - datasetName: "technology", - reviewedAt: "2025-09-04T12:15:00Z", - }, - { - id: "gt-032", - question: "How do antibiotics work?", - answer: "Antibiotics kill or inhibit the growth of bacteria...", - status: "draft", - providerId: "json", - views: 445, - reuses: 55, - datasetName: "health", - reviewedAt: "2025-09-20T17:30:00Z", - }, - { - id: "gt-033", - question: "What is dark matter?", - answer: - "Dark matter is an invisible form of matter that makes up most of the universe...", - status: "approved", - providerId: "json", - views: 598, - reuses: 72, - datasetName: "physics", - reviewedAt: "2025-09-15T11:45:00Z", - }, - { - id: "gt-034", - question: "Explain the Big Bang theory", - answer: "The Big Bang theory describes 
the origin of the universe...", - status: "approved", - providerId: "json", - views: 487, - reuses: 61, - datasetName: "physics", - reviewedAt: "2025-09-12T09:20:00Z", - }, - { - id: "gt-035", - question: "What is cybersecurity?", - answer: "Cybersecurity protects computer systems from digital attacks...", - status: "draft", - providerId: "json", - views: 623, - reuses: 79, - datasetName: "technology", - reviewedAt: "2025-09-28T13:50:00Z", - }, - { - id: "gt-036", - question: "How does the human brain work?", - answer: "The brain processes information through billions of neurons...", - status: "approved", - providerId: "json", - views: 712, - reuses: 88, - datasetName: "biology", - reviewedAt: "2025-09-19T15:10:00Z", - }, - { - id: "gt-037", - question: "What is 5G technology?", - answer: "5G is the fifth generation of cellular network technology...", - status: "draft", - providerId: "json", - views: 356, - reuses: 44, - datasetName: "technology", - reviewedAt: "2025-09-25T10:05:00Z", - }, - { - id: "gt-038", - question: "Explain plate tectonics", - answer: - "Plate tectonics describes the movement of Earth's lithospheric plates...", - status: "approved", - providerId: "json", - views: 298, - reuses: 36, - datasetName: "science", - reviewedAt: "2025-09-08T14:25:00Z", - }, - { - id: "gt-039", - question: "What is renewable energy?", - answer: "Renewable energy comes from naturally replenishing sources...", - status: "approved", - providerId: "json", - views: 534, - reuses: 68, - datasetName: "science", - reviewedAt: "2025-09-16T16:40:00Z", - }, - { - id: "gt-040", - question: "How do batteries work?", - answer: "Batteries convert chemical energy into electrical energy...", - status: "draft", - providerId: "json", - views: 412, - reuses: 51, - datasetName: "technology", - reviewedAt: "2025-09-22T11:55:00Z", - }, - { - id: "gt-041", - question: "What is the immune system?", - answer: "The immune system defends the body against harmful pathogens...", - status: "approved", - providerId: "json", - views: 467, - reuses: 59, - datasetName: "health", - reviewedAt: "2025-09-11T13:20:00Z", - }, - { - id: "gt-042", - question: "Explain deep learning", - answer: "Deep learning uses neural networks with multiple layers...", - status: "draft", - providerId: "json", - views: 589, - reuses: 73, - datasetName: "technology", - reviewedAt: "2025-09-27T15:35:00Z", - }, - { - id: "gt-043", - question: "What is bioengineering?", - answer: - "Bioengineering applies engineering principles to biological systems...", - status: "approved", - providerId: "json", - views: 321, - reuses: 40, - datasetName: "biology", - reviewedAt: "2025-09-10T10:50:00Z", - }, - { - id: "gt-044", - question: "How do superconductors work?", - answer: - "Superconductors conduct electricity with zero resistance at low temperatures...", - status: "draft", - providerId: "json", - views: 245, - reuses: 30, - datasetName: "physics", - reviewedAt: "2025-09-18T12:05:00Z", - }, - { - id: "gt-045", - question: "What is augmented reality?", - answer: "AR overlays digital information onto the real world...", - status: "approved", - providerId: "json", - views: 678, - reuses: 84, - datasetName: "technology", - reviewedAt: "2025-09-24T14:15:00Z", - }, - { - id: "gt-046", - question: "Explain the carbon cycle", - answer: "The carbon cycle describes how carbon moves through ecosystems...", - status: "approved", - providerId: "json", - views: 334, - reuses: 42, - datasetName: "science", - reviewedAt: "2025-09-13T09:30:00Z", - }, - { - id: "gt-047", - 
question: "What is quantum computing used for?", - answer: - "Quantum computers solve complex problems beyond classical computers...", - status: "draft", - providerId: "json", - views: 501, - reuses: 62, - datasetName: "technology", - reviewedAt: "2025-09-29T11:45:00Z", - }, - { - id: "gt-048", - question: "How does protein synthesis work?", - answer: - "Protein synthesis involves transcription and translation of genetic code...", - status: "approved", - providerId: "json", - views: 287, - reuses: 35, - datasetName: "biology", - reviewedAt: "2025-09-06T16:00:00Z", - }, - { - id: "gt-049", - question: "What is edge computing?", - answer: "Edge computing processes data closer to where it's generated...", - status: "draft", - providerId: "json", - views: 423, - reuses: 53, - datasetName: "technology", - reviewedAt: "2025-09-21T13:10:00Z", - }, - { - id: "gt-050", - question: "Explain the Doppler effect", - answer: - "The Doppler effect is the change in frequency due to relative motion...", - status: "approved", - providerId: "json", - views: 198, - reuses: 24, - datasetName: "physics", - reviewedAt: "2025-09-05T11:25:00Z", - }, +const questionPrompts = [ + "What is the capital of France?", + "How does photosynthesis work?", + "What is quantum computing?", + "Explain machine learning basics", + "What are the benefits of exercise?", + "How do neural networks work?", + "What is blockchain technology?", + "Explain the water cycle", + "What causes climate change?", + "How does DNA replication work?", +]; + +const answerSnippets = [ + "Paris is the capital city of France.", + "Photosynthesis converts light energy into chemical energy.", + "Quantum computing uses superposition and entanglement.", + "Machine learning learns patterns from data.", + "Exercise improves cardiovascular and mental health.", + "Neural networks stack layers of weighted transformations.", + "Blockchain is a tamper-evident distributed ledger.", + "The water cycle moves water through evaporation and precipitation.", + "Climate change is driven largely by greenhouse gas emissions.", + "DNA replication copies genetic material before cell division.", ]; +const datasets = ["technology", "science", "biology", "physics", "health"]; +const statuses: QuestionsExplorerItem["status"][] = [ + "approved", + "draft", + "approved", + "deleted", + "approved", +]; + +// Sample data with canonical history turns (no top-level answer convenience fields). +const sampleItems: QuestionsExplorerItem[] = Array.from( + { length: 50 }, + (_, i) => { + const prompt = questionPrompts[i % questionPrompts.length]; + const answer = answerSnippets[i % answerSnippets.length]; + const id = `gt-${String(i + 1).padStart(3, "0")}`; + + return { + id, + providerId: "json", + status: statuses[i % statuses.length], + deleted: statuses[i % statuses.length] === "deleted", + history: [ + { role: "user", content: prompt }, + { role: "agent", content: answer }, + ], + tags: i % 3 === 0 ? 
["popular", "beginner"] : ["technical"], + manualTags: [], + computedTags: [], + datasetName: datasets[i % datasets.length], + reviewedAt: `2025-09-${String((i % 28) + 1).padStart(2, "0")}T10:30:00Z`, + views: 100 + i * 7, + reuses: 5 + (i % 40), + }; + }, +); + export default function QuestionsExplorerExample() { const handleAssign = (item: QuestionsExplorerItem) => { console.log(`Assign ground truth: ${item.id}`); - // Implementation: Open assignment modal or navigate to assignment flow alert(`Assign functionality for ${item.id} would be triggered here`); }; const handleInspect = (item: QuestionsExplorerItem) => { console.log(`Inspect ground truth: ${item.id}`); - // Implementation: Open detail view or navigate to editor alert(`Inspect functionality for ${item.id} would be triggered here`); }; const handleDelete = (item: QuestionsExplorerItem) => { console.log(`Delete ground truth: ${item.id}`); - // Implementation: Confirm and perform soft delete const confirmed = window.confirm( `Are you sure you want to delete ${item.id}?`, ); diff --git a/frontend/src/components/app/QuestionsExplorer.tsx b/frontend/src/components/app/QuestionsExplorer.tsx index a188e4b..0d3e2e8 100644 --- a/frontend/src/components/app/QuestionsExplorer.tsx +++ b/frontend/src/components/app/QuestionsExplorer.tsx @@ -2,7 +2,11 @@ import { Lock } from "lucide-react"; import { useEffect, useId, useMemo, useRef, useState } from "react"; import useTags from "../../hooks/useTags"; import type { GroundTruthItem } from "../../models/groundTruth"; -import { getLastAgentTurn, getQueuePreview } from "../../models/groundTruth"; +import { + getItemReferences, + getLastAgentTurn, + getQueuePreview, +} from "../../models/groundTruth"; import { cn } from "../../models/utils"; import { getExplorerExtensions } from "../../registry/ExplorerExtensions"; import { fetchAvailableDatasets } from "../../services/datasets"; @@ -243,13 +247,16 @@ export default function QuestionsExplorer({ // Build API parameters from applied filters // Note: toolCallCount is a client-side sort only (not passed to API) const sortByParam = + appliedFilter.sortColumn === "tagCount" + ? "tagCount" + : appliedFilter.sortColumn === "refs" || + appliedFilter.sortColumn === "toolCallCount" + ? null // plugin/client-side sort; do not pass as core sortBy + : appliedFilter.sortColumn; + const pluginSortParam = appliedFilter.sortColumn === "refs" - ? "totalReferences" - : appliedFilter.sortColumn === "tagCount" - ? "tagCount" - : appliedFilter.sortColumn === "toolCallCount" - ? null // client-side sort; do not pass to backend - : appliedFilter.sortColumn; + ? "rag-compat:totalReferences" + : undefined; // Ensure page is at least 1 const safePage = Math.max(1, currentPage); @@ -267,10 +274,16 @@ export default function QuestionsExplorer({ ? appliedFilter.tags.exclude : undefined, itemId: appliedFilter.itemId || undefined, - refUrl: appliedFilter.refUrl || undefined, + pluginFilter: appliedFilter.refUrl + ? [`rag-compat:refUrl=${appliedFilter.refUrl}`] + : undefined, keyword: appliedFilter.keyword || undefined, - sortBy: sortByParam, - sortOrder: sortByParam ? appliedFilter.sortDirection : undefined, + sortBy: sortByParam ?? undefined, + pluginSort: pluginSortParam, + sortOrder: + sortByParam || pluginSortParam + ? appliedFilter.sortDirection + : undefined, page: safePage, limit: itemsPerPage, }; @@ -1232,7 +1245,7 @@ export default function QuestionsExplorer({ {/* Refs */} - {item.totalReferences ?? 
0} + {getItemReferences(item).length} {/* Tag Count */} diff --git a/frontend/src/dev/self-tests.ts b/frontend/src/dev/self-tests.ts index 7659582..2c392eb 100644 --- a/frontend/src/dev/self-tests.ts +++ b/frontend/src/dev/self-tests.ts @@ -16,7 +16,7 @@ export function runSelfTests() { const item: GroundTruthItem = { id: "T", question: "q", - answer: "a", + history: [{ role: "agent", content: "a" }], status: "draft", providerId: "json", }; diff --git a/frontend/src/models/demoData.ts b/frontend/src/models/demoData.ts index 07d7513..779c8a0 100644 --- a/frontend/src/models/demoData.ts +++ b/frontend/src/models/demoData.ts @@ -7,8 +7,6 @@ export const DEMO_JSON: GroundTruthItem[] = [ id: "demo-data-overage", providerId: "json", question: "CX IS USING TOO MUCH DATA AND WANTS TO KNOW WHY", - answer: - "The RCA shows the line exceeded the 50 GB plan cap after extended streaming and hotspot activity stayed on cellular data instead of Wi-Fi.", history: [ { role: "user", @@ -151,8 +149,6 @@ export const DEMO_JSON: GroundTruthItem[] = [ providerId: "json", question: "CUSTOMER WAS CHARGED ROAMING FEES EVEN THOUGH THEY BOUGHT A PASS", - answer: - "The travel pass activated after the first charged roaming session, so the early usage billed at standard rates and later usage correctly switched to the pass.", history: [ { role: "user", @@ -266,8 +262,6 @@ export const DEMO_JSON: GroundTruthItem[] = [ providerId: "json", question: "CUSTOMER THINKS THERE WAS AN OUTAGE WHEN DATA SLOWED DOWN AT A STADIUM", - answer: - "The slowdown was caused by short-lived cell congestion during a high-density event, not by a persistent account or device problem.", history: [ { role: "user", @@ -350,8 +344,6 @@ export const DEMO_JSON: GroundTruthItem[] = [ providerId: "json", question: "What is our refund policy for services interrupted during a network outage?", - answer: - "Customers affected by confirmed outages lasting more than 4 hours are eligible for a pro-rated service credit. Credits are applied automatically within 2 billing cycles for outages flagged in the NOC system.", history: [ { role: "user", diff --git a/frontend/src/models/groundTruth.ts b/frontend/src/models/groundTruth.ts index 3592650..5db5876 100644 --- a/frontend/src/models/groundTruth.ts +++ b/frontend/src/models/groundTruth.ts @@ -1,3 +1,10 @@ +import { + collectCanonicalReferencesFromCompatData, + getCompatData, + getCompatRetrievalsFromData, + writeCompatPluginEnvelope, +} from "./ragCompatPayload"; + // Domain models and constants for Ground Truth items // --------------------------------------------------------------------------- @@ -66,51 +73,8 @@ export type RetrievalCandidate = { toolCallId?: string; }; -// --------------------------------------------------------------------------- -// Per-call retrieval helpers (Phase 6 — retrieval normalization) -// -// References are stored in plugins["rag-compat"].data.retrievals per tool -// call. The helpers below provide flat Reference[] access for UI -// components that still consume the legacy Reference shape. -// --------------------------------------------------------------------------- - -const _RAG_COMPAT_KEY = "rag-compat"; -const _UNASSOCIATED_KEY = "_unassociated"; - -/** Per-call retrieval bucket as stored in plugin data. 
*/
-type RetrievalBucket = {
-  candidates: Array<{
-    url: string;
-    title?: string;
-    chunk?: string;
-    rawPayload?: Record<string, unknown>;
-    relevance?: string;
-    toolCallId?: string | null;
-    messageIndex?: number;
-    turnId?: string;
-    keyParagraph?: string;
-    bonus?: boolean;
-    visitedAt?: string | null;
-  }>;
-};
-
-/** Typed shorthand for the retrievals dict inside rag-compat plugin data. */
-type RetrievalsMap = Record<string, RetrievalBucket>;
-
-/**
- * Read the per-call retrievals map from plugin data.
- * Returns `undefined` when no per-call state exists.
- */
-export function getRetrievalsMap(
-  item: Pick<GroundTruthItem, "plugins">,
-): RetrievalsMap | undefined {
-  const data = item.plugins?.[_RAG_COMPAT_KEY]?.data;
-  if (!data) return undefined;
-  const r = data.retrievals;
-  if (r && typeof r === "object" && !Array.isArray(r)) {
-    return r as RetrievalsMap;
-  }
-  return undefined;
+export function getRetrievalsMap(item: Pick<GroundTruthItem, "plugins">) {
+  return getCompatRetrievalsFromData(getCompatData(item.plugins));
 }

 /**
@@ -121,41 +85,15 @@ export function getRetrievalsMap(
  * exists (caller should provide legacy references separately if needed).
  */
 export function getItemReferences(item: GroundTruthItem): Reference[] {
-  const retrievals = getRetrievalsMap(item);
-  if (!retrievals) return [];
   const history = ensureConversationTurnIdentity(item.history);
   const indexByTurnId = getTurnIndexById(history);
+  const historyTurnIds = history.map((turn) => turn.turnId);

-  const refs: Reference[] = [];
-  let refIndex = 0;
-  for (const [toolCallId, bucket] of Object.entries(retrievals)) {
-    if (!bucket?.candidates) continue;
-    for (const c of bucket.candidates) {
-      const storedTurnId = c.turnId;
-      const resolvedMessageIndex =
-        storedTurnId && indexByTurnId.has(storedTurnId)
-          ? indexByTurnId.get(storedTurnId)
-          : c.messageIndex;
-      const resolvedTurnId =
-        storedTurnId ||
-        (typeof resolvedMessageIndex === "number"
-          ? history[resolvedMessageIndex]?.turnId
-          : undefined);
-      refs.push({
-        id: `ref_${refIndex++}`,
-        title: c.title,
-        url: c.url,
-        snippet: c.chunk,
-        visitedAt: c.visitedAt ?? null,
-        keyParagraph: c.keyParagraph,
-        bonus: c.bonus ?? false,
-        messageIndex: resolvedMessageIndex,
-        turnId: resolvedTurnId,
-        toolCallId: toolCallId !== _UNASSOCIATED_KEY ? toolCallId : undefined,
-      });
-    }
-  }
-  return refs;
+  return collectCanonicalReferencesFromCompatData({
+    data: getCompatData(item.plugins),
+    historyTurnIds,
+    indexByTurnId,
+  });
 }

 /**
@@ -167,35 +105,10 @@ export function withUpdatedReferences(
   item: GroundTruthItem,
   refs: Reference[],
 ): GroundTruthItem {
-  const retrievals: RetrievalsMap = {};
-  for (const ref of refs) {
-    const key = ref.toolCallId || _UNASSOCIATED_KEY;
-    if (!retrievals[key]) {
-      retrievals[key] = { candidates: [] };
-    }
-    retrievals[key].candidates.push({
-      url: ref.url,
-      title: ref.title,
-      chunk: ref.snippet,
-      relevance: undefined,
-      toolCallId: ref.toolCallId || undefined,
-      messageIndex: ref.turnId ? 
undefined : ref.messageIndex, - turnId: ref.turnId, - keyParagraph: ref.keyParagraph, - bonus: ref.bonus, - visitedAt: ref.visitedAt, - }); - } - - const plugins = { ...(item.plugins || {}) }; - const existing = plugins[_RAG_COMPAT_KEY]; - plugins[_RAG_COMPAT_KEY] = { - kind: _RAG_COMPAT_KEY, - version: existing?.version || "1.0", - data: { ...(existing?.data || {}), retrievals }, + return { + ...item, + plugins: writeCompatPluginEnvelope({ plugins: item.plugins, refs }), }; - - return { ...item, plugins }; } // --------------------------------------------------------------------------- @@ -214,8 +127,8 @@ export type ConversationTurn = { turnId?: string; /** Stable workflow-step identity when a turn maps to a durable step. */ stepId?: string; - /** Free-form role string. "user" marks the human turn; any other value is a non-user (agent/assistant) turn. - * Common values: "user", "agent", "assistant", "output-agent", "orchestrator-agent". */ + /** Free-form role string. "user" marks the human turn; all non-user roles + * represent non-user/answer content (e.g. "agent", "assistant", "planner"). */ role: string; content: string; /** Expected behavior(s) for this turn in the conversation (agent turns only, legacy/compat) */ @@ -340,8 +253,6 @@ export type GroundTruthItem = { bucket?: string; /** Legacy compatibility projection derived from history when absent. */ question?: string; - /** Legacy compatibility projection derived from history when absent. */ - answer?: string; /** ISO date string of the last review, when provided by the API. */ reviewedAt?: string | null; /** @@ -349,8 +260,6 @@ export type GroundTruthItem = { * Rendered in a collapsible pane above the Question/Answer editors. */ curationInstructions?: string; - /** Backend-computed total count of references (item-level + all turn-level). */ - totalReferences?: number; /** ETag for optimistic concurrency control */ _etag?: string; }; @@ -369,6 +278,18 @@ export function getLegacyHostDeleteGates(): LegacyHostDeleteGate[] { return [...LEGACY_HOST_DELETE_GATES]; } +function normalizeRole(role: string): string { + return role.trim().toLowerCase(); +} + +export function isUserRole(role: string): boolean { + return normalizeRole(role) === "user"; +} + +export function isNonUserRole(role: string): boolean { + return !isUserRole(role); +} + export function createConversationTurn(args: { role: string; content: string; @@ -408,7 +329,7 @@ export function getLastUserTurn(item: GroundTruthItem): string { } // Find the last user turn for (let i = history.length - 1; i >= 0; i--) { - if (history[i].role === "user") { + if (isUserRole(history[i].role)) { return history[i].content; } } @@ -417,19 +338,19 @@ export function getLastUserTurn(item: GroundTruthItem): string { /** * Returns the last agent message from history. - * "Agent" is any turn whose role is not "user" (supports free-form roles). + * Non-user turns are treated as answer content. */ export function getLastAgentTurn(item: GroundTruthItem): string { if (!Array.isArray(item.history)) { - return item.answer || ""; + return ""; } const history = ensureConversationTurnIdentity(item.history); if (history.length === 0) { return ""; } - // Find the last non-user turn (any agent/assistant/orchestrator role) + // Find the last non-user turn. 
for (let i = history.length - 1; i >= 0; i--) {
-    if (history[i].role !== "user") {
+    if (isNonUserRole(history[i].role)) {
       return history[i].content;
     }
   }
@@ -458,8 +379,8 @@ export function getQueuePreview(item: GroundTruthItem): string {
   if (!Array.isArray(item.history)) {
     return item.question || "(no message)";
   }
-  const first = ensureConversationTurnIdentity(item.history).find(
-    (t) => t.role === "user",
+  const first = ensureConversationTurnIdentity(item.history).find((t) =>
+    isUserRole(t.role),
   );
   return first?.content || "(no message)";
 }
@@ -477,7 +398,6 @@
   return {
     ...derivedItem,
     question: getLastUserTurn(derivedItem),
-    answer: getLastAgentTurn(derivedItem),
   };
 }
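
A small behavioral sketch of the role helpers introduced above; the expected values follow from normalizeRole trimming and lower-casing before comparison:

import { isNonUserRole, isUserRole } from "./groundTruth";

isUserRole("user");        // true
isUserRole("  User  ");    // true — roles are trimmed and lower-cased first
isNonUserRole("agent");    // true
isNonUserRole("planner");  // true — any custom non-user role counts as an answer turn
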
diff --git a/frontend/src/models/ragCompatPayload.ts b/frontend/src/models/ragCompatPayload.ts
new file mode 100644
index 0000000..5340a4f
--- /dev/null
+++ b/frontend/src/models/ragCompatPayload.ts
@@ -0,0 +1,292 @@
+const _RAG_COMPAT_KEY = "rag-compat";
+const _UNASSOCIATED_KEY = "_unassociated";
+
+export type CompatPluginPayload = {
+  kind: string;
+  version: string;
+  data?: Record<string, unknown>;
+};
+
+export type CompatPluginsMap = Record<string, CompatPluginPayload>;
+
+export type CompatReferencePayload = {
+  url: string;
+  title?: string;
+  content?: string;
+  keyExcerpt?: string;
+  bonus?: boolean;
+  messageIndex?: number;
+  turnId?: string;
+  toolCallId?: string;
+  visitedAt?: string | null;
+};
+
+export type CanonicalReferencePayload = {
+  id: string;
+  url: string;
+  title?: string;
+  snippet?: string;
+  visitedAt?: string | null;
+  keyParagraph?: string;
+  bonus?: boolean;
+  messageIndex?: number;
+  turnId?: string;
+  toolCallId?: string;
+};
+
+export type RetrievalCandidatePayload = {
+  url: string;
+  title?: string;
+  chunk?: string;
+  relevance?: string;
+  toolCallId?: string | null;
+  messageIndex?: number;
+  turnId?: string;
+  keyParagraph?: string;
+  bonus?: boolean;
+  visitedAt?: string | null;
+};
+
+export type RetrievalBucketPayload = {
+  candidates: RetrievalCandidatePayload[];
+};
+
+export type RetrievalsMap = Record<string, RetrievalBucketPayload>;
+
+function asObjectRecord(value: unknown): Record<string, unknown> | undefined {
+  if (!value || typeof value !== "object" || Array.isArray(value)) {
+    return undefined;
+  }
+  return value as Record<string, unknown>;
+}
+
+export function getCompatData(
+  plugins: CompatPluginsMap | undefined,
+): Record<string, unknown> | undefined {
+  return asObjectRecord(plugins?.[_RAG_COMPAT_KEY]?.data);
+}
+
+export function getCompatReferencesFromData(
+  data: Record<string, unknown> | undefined,
+): CompatReferencePayload[] | undefined {
+  const references = data?.references;
+  if (!Array.isArray(references)) {
+    return undefined;
+  }
+  return references as CompatReferencePayload[];
+}
+
+export function getCompatReferencesFromPlugins(
+  plugins: CompatPluginsMap | undefined,
+): CompatReferencePayload[] | undefined {
+  return getCompatReferencesFromData(getCompatData(plugins));
+}
+
+export function getCompatRetrievalsFromData(
+  data: Record<string, unknown> | undefined,
+): RetrievalsMap | undefined {
+  const retrievals = data?.retrievals;
+  if (
+    retrievals &&
+    typeof retrievals === "object" &&
+    !Array.isArray(retrievals)
+  ) {
+    return retrievals as RetrievalsMap;
+  }
+  return undefined;
+}
+
+export function getCompatRetrievalsFromPlugins(
+  plugins: CompatPluginsMap | undefined,
+): RetrievalsMap | undefined {
+  return getCompatRetrievalsFromData(getCompatData(plugins));
+}
+
+export function retrievalsToCanonicalReferences(args: {
+  retrievals: RetrievalsMap;
+  historyTurnIds: Array<string | undefined>;
+  indexByTurnId: Map<string, number>;
+}): CompatReferencePayload[] {
+  const { retrievals, historyTurnIds, indexByTurnId } = args;
+  const refs: CompatReferencePayload[] = [];
+  for (const [toolCallId, bucket] of Object.entries(retrievals)) {
+    if (!bucket?.candidates) continue;
+    for (const candidate of bucket.candidates) {
+      if (!candidate?.url) continue;
+      const storedTurnId = candidate.turnId;
+      const resolvedMessageIndex =
+        storedTurnId && indexByTurnId.has(storedTurnId)
+          ? indexByTurnId.get(storedTurnId)
+          : candidate.messageIndex;
+      const resolvedTurnId =
+        storedTurnId ||
+        (typeof resolvedMessageIndex === "number"
+          ? historyTurnIds[resolvedMessageIndex]
+          : undefined);
+      refs.push({
+        url: candidate.url,
+        title: candidate.title,
+        content: candidate.chunk,
+        keyExcerpt: candidate.keyParagraph,
+        bonus: candidate.bonus ?? false,
+        messageIndex: resolvedTurnId ? undefined : resolvedMessageIndex,
+        turnId: resolvedTurnId,
+        toolCallId:
+          toolCallId !== _UNASSOCIATED_KEY
+            ? toolCallId
+            : candidate.toolCallId || undefined,
+        visitedAt: candidate.visitedAt ?? null,
+      });
+    }
+  }
+  return refs;
+}
+
+export function compatReferencesToCanonicalPayload(args: {
+  references: CompatReferencePayload[];
+  historyTurnIds: Array<string | undefined>;
+  indexByTurnId: Map<string, number>;
+}): CanonicalReferencePayload[] {
+  const { references, historyTurnIds, indexByTurnId } = args;
+  return references
+    .filter((ref): ref is CompatReferencePayload => !!ref?.url)
+    .map((ref, index) => {
+      const resolvedMessageIndex =
+        ref.turnId && indexByTurnId.has(ref.turnId)
+          ? indexByTurnId.get(ref.turnId)
+          : ref.messageIndex;
+      const resolvedTurnId =
+        ref.turnId ||
+        (typeof resolvedMessageIndex === "number"
+          ? historyTurnIds[resolvedMessageIndex]
+          : undefined);
+      return {
+        id: `ref_${index}`,
+        title: ref.title,
+        url: ref.url,
+        snippet: ref.content,
+        visitedAt: ref.visitedAt ?? null,
+        keyParagraph: ref.keyExcerpt,
+        bonus: ref.bonus ?? false,
+        messageIndex: resolvedMessageIndex,
+        turnId: resolvedTurnId,
+        toolCallId: ref.toolCallId,
+      };
+    });
+}
+
+export function collectCanonicalReferencesFromCompatData(args: {
+  data: Record<string, unknown> | undefined;
+  historyTurnIds: Array<string | undefined>;
+  indexByTurnId: Map<string, number>;
+}): CanonicalReferencePayload[] {
+  const { data, historyTurnIds, indexByTurnId } = args;
+  const canonicalRefs = getCompatReferencesFromData(data);
+  if (canonicalRefs) {
+    return compatReferencesToCanonicalPayload({
+      references: canonicalRefs,
+      historyTurnIds,
+      indexByTurnId,
+    });
+  }
+
+  const retrievals = getCompatRetrievalsFromData(data);
+  if (!retrievals) {
+    return [];
+  }
+  return retrievalsToCanonicalReferences({
+    retrievals,
+    historyTurnIds,
+    indexByTurnId,
+  }).map((ref, index) => ({
+    id: `ref_${index}`,
+    title: ref.title,
+    url: ref.url,
+    snippet: ref.content,
+    visitedAt: ref.visitedAt ?? null,
+    keyParagraph: ref.keyExcerpt,
+    bonus: ref.bonus ?? false,
+    messageIndex: ref.turnId ? undefined : ref.messageIndex,
+    turnId: ref.turnId,
+    toolCallId: ref.toolCallId,
+  }));
+}
+
+export function collectCanonicalReferencesFromCompatPlugins(args: {
+  plugins: CompatPluginsMap | undefined;
+  historyTurnIds: Array<string | undefined>;
+  indexByTurnId: Map<string, number>;
+}): CanonicalReferencePayload[] {
+  const { plugins, historyTurnIds, indexByTurnId } = args;
+  return collectCanonicalReferencesFromCompatData({
+    data: getCompatData(plugins),
+    historyTurnIds,
+    indexByTurnId,
+  });
+}
+
+export function serializeCanonicalReferences(
+  refs: Array<
+    Pick<
+      CanonicalReferencePayload,
+      | "url"
+      | "title"
+      | "snippet"
+      | "keyParagraph"
+      | "bonus"
+      | "messageIndex"
+      | "turnId"
+      | "toolCallId"
+      | "visitedAt"
+    >
+  >,
+): CompatReferencePayload[] {
+  return refs.map((ref) => ({
+    url: ref.url,
+    title: ref.title,
+    content: ref.snippet,
+    keyExcerpt: ref.keyParagraph,
+    bonus: ref.bonus ?? false,
+    messageIndex: ref.turnId ? undefined : ref.messageIndex,
+    turnId: ref.turnId,
+    toolCallId: ref.toolCallId,
+    visitedAt: ref.visitedAt ?? null,
+  }));
+}
+
+export function writeCompatPluginEnvelope(args: {
+  plugins: CompatPluginsMap | undefined;
+  refs: Array<
+    Pick<
+      CanonicalReferencePayload,
+      | "url"
+      | "title"
+      | "snippet"
+      | "keyParagraph"
+      | "bonus"
+      | "messageIndex"
+      | "turnId"
+      | "toolCallId"
+      | "visitedAt"
+    >
+  >;
+}): CompatPluginsMap {
+  const { plugins, refs } = args;
+  const references = serializeCanonicalReferences(refs);
+  const nextPlugins = { ...(plugins || {}) };
+  const existingCompat = nextPlugins[_RAG_COMPAT_KEY];
+  const existingData =
+    existingCompat?.data &&
+    typeof existingCompat.data === "object" &&
+    !Array.isArray(existingCompat.data)
+      ? existingCompat.data
+      : {};
+  const { retrievals: _deprecatedRetrievals, ...restData } = existingData;
+
+  nextPlugins[_RAG_COMPAT_KEY] = {
+    kind: _RAG_COMPAT_KEY,
+    version: existingCompat?.version || "1.0",
+    data: { ...restData, references },
+  };
+  return nextPlugins;
+}
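Taken together, the read and write halves of this module round-trip the `rag-compat` envelope: readers accept either canonical `data.references` or the older `data.retrievals` buckets, and the writer always emits `data.references` while scrubbing the deprecated key. A sketch under those assumptions; the import path, sample URL, and turn ids are illustrative:

```ts
import {
  collectCanonicalReferencesFromCompatPlugins,
  writeCompatPluginEnvelope,
  type CompatPluginsMap,
} from "./frontend/src/models/ragCompatPayload";

// Hypothetical stored envelope that still carries a deprecated
// `retrievals` map instead of canonical `references`.
const plugins: CompatPluginsMap = {
  "rag-compat": {
    kind: "rag-compat",
    version: "1.0",
    data: {
      retrievals: {
        tc1: {
          candidates: [{ url: "https://example.com/doc", messageIndex: 1 }],
        },
      },
    },
  },
};

// Reading resolves retrieval candidates into canonical reference payloads.
const refs = collectCanonicalReferencesFromCompatPlugins({
  plugins,
  historyTurnIds: ["turn-user", "turn-agent"],
  indexByTurnId: new Map([
    ["turn-user", 0],
    ["turn-agent", 1],
  ]),
});

// Writing emits `data.references` and drops the deprecated `retrievals` key.
const next = writeCompatPluginEnvelope({ plugins, refs });
console.assert(next["rag-compat"].data?.references !== undefined);
console.assert(next["rag-compat"].data?.retrievals === undefined);
```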
diff --git a/frontend/src/services/groundTruths.ts b/frontend/src/services/groundTruths.ts
index f99a443..3bbe703 100644
--- a/frontend/src/services/groundTruths.ts
+++ b/frontend/src/services/groundTruths.ts
@@ -1,8 +1,4 @@
-import type {
-  ApiGroundTruth,
-  ApiHistoryEntry,
-  ApiReference,
-} from "../adapters/apiMapper";
+import type { ApiGroundTruth, ApiHistoryEntry } from "../adapters/apiMapper";
 import { groundTruthFromApi } from "../adapters/apiMapper";
 import { client } from "../api/client";
 import type { components, operations } from "../api/generated";
@@ -14,11 +10,6 @@ type GroundTruthItemOut = Omit<
   components["schemas"]["AgenticGroundTruthEntry-Output"],
   "history"
 > & {
-  synthQuestion?: string | null;
-  editedQuestion?: string | null;
-  answer?: string | null;
-  refs?: ApiReference[];
-  totalReferences?: number;
   tags?: string[];
   comment?: string | null;
   history?: ApiHistoryEntry[];
@@ -46,9 +37,10 @@ interface ListAllGroundTruthsParams {
   tags?: string[];
   excludeTags?: string[];
   itemId?: string | null;
-  refUrl?: string | null;
+  pluginFilter?: string[];
   keyword?: string | null;
   sortBy?: string | null;
+  pluginSort?: string | null;
   sortOrder?: "asc" | "desc" | null;
   page?: number;
   limit?: number;
@@ -71,10 +63,11 @@ export async function listAllGroundTruths(
   if (params.excludeTags?.length)
     query.excludeTags = params.excludeTags.join(",");
   if (params.itemId) query.itemId = params.itemId;
-  if (params.refUrl) query.refUrl = params.refUrl;
+  if (params.pluginFilter?.length) query.pluginFilter = params.pluginFilter;
   if (params.keyword) query.keyword = params.keyword;
   if (params.sortBy)
     query.sortBy = params.sortBy as components["schemas"]["SortField"];
+  if (params.pluginSort) query.pluginSort = params.pluginSort;
   if (params.sortOrder) query.sortOrder = params.sortOrder;
   if (typeof params.page === "number") query.page = params.page;
   if (typeof params.limit === "number") query.limit = params.limit;
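The dedicated `refUrl` query parameter is superseded by the generic `pluginFilter` list, with `pluginSort` as the matching sort knob. A hedged sketch of how a caller might drive the new parameters; the `"<pack>:<field>=<value>"` filter string matches the `rag-compat:refUrl=...` expectation in the QuestionsExplorer test further below, while the `pluginSort` value shown is an assumed shape that this diff does not confirm:

```ts
import { listAllGroundTruths } from "./frontend/src/services/groundTruths";

// Illustrative helper: encode a reference-URL filter the way the UI test
// below expects ("rag-compat:refUrl=https://example.com/ref").
function buildRefUrlFilter(refUrl: string): string {
  return `rag-compat:refUrl=${refUrl}`;
}

async function findByReferenceUrl(refUrl: string) {
  return listAllGroundTruths({
    pluginFilter: [buildRefUrlFilter(refUrl)],
    // Assumed sort-key shape; the diff only shows the value is a string.
    pluginSort: "rag-compat:refUrl",
    sortOrder: "asc",
    page: 1,
    limit: 25,
  });
}
```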
diff --git a/frontend/tests/unit/adapters/apiMapper.test.ts b/frontend/tests/unit/adapters/apiMapper.test.ts
index 7731f6b..0b6412d 100644
--- a/frontend/tests/unit/adapters/apiMapper.test.ts
+++ b/frontend/tests/unit/adapters/apiMapper.test.ts
@@ -11,11 +11,7 @@ function makeApiItem(overrides: Partial<ApiGroundTruth> = {}): ApiGroundTruth {
   return {
     id: "gt-1",
     status: "draft",
-    answer: "Test answer",
-    synthQuestion: "Synth question",
-    editedQuestion: "Edited question",
     history: undefined,
-    refs: [],
     tags: [],
     manualTags: [],
     computedTags: [],
@@ -27,14 +23,28 @@ function makeApiItem(overrides: Partial<ApiGroundTruth> = {}): ApiGroundTruth {
   } as ApiGroundTruth;
 }
 
+function withCompatData(
+  data: Record<string, unknown>,
+): Pick<ApiGroundTruth, "plugins"> {
+  return {
+    plugins: {
+      "rag-compat": {
+        kind: "rag-compat",
+        version: "1.0",
+        data,
+      },
+    },
+  };
+}
+
 describe("groundTruthFromApi", () => {
   describe("role mapping", () => {
-    it("maps history role 'assistant' to 'agent'", () => {
+    it("preserves history role values from the API payload", () => {
       const api = makeApiItem({
         history: [{ role: "assistant", msg: "Hello from assistant" }],
       });
       const result = groundTruthFromApi(api);
-      expect(result.history?.[0].role).toBe("agent");
+      expect(result.history?.[0].role).toBe("assistant");
       expect(result.history?.[0].content).toBe("Hello from assistant");
     });
 
@@ -46,6 +56,19 @@
       expect(result.history?.[0].role).toBe("user");
       expect(result.history?.[0].content).toBe("Hello from user");
     });
+
+    it("derives compatibility question from the latest user turn", () => {
+      const api = makeApiItem({
+        history: [
+          { role: "user", msg: "Initial question" },
+          { role: "planner", msg: "Planner output" },
+          { role: "user", msg: "Follow-up question" },
+          { role: "assistant", msg: "Final answer" },
+        ],
+      });
+      const result = groundTruthFromApi(api);
+      expect(result.question).toBe("Follow-up question");
+    });
   });
 
   describe("expectedBehavior handling", () => {
@@ -84,7 +107,32 @@
   });
 
   describe("reference mapping", () => {
-    it("assigns turn refs to correct messageIndex", () => {
+    it("reads canonical rag-compat data.references", () => {
+      const api = makeApiItem({
+        history: [{ role: "assistant", msg: "A" }],
+        ...withCompatData({
+          references: [
+            {
+              url: "https://canonical.ref/1",
+              title: "Canonical Ref",
+              content: "Canonical snippet",
+              keyExcerpt: "Canonical key excerpt",
+              bonus: true,
+              messageIndex: 0,
+            },
+          ],
+        }),
+      });
+      const result = groundTruthFromApi(api);
+      const [ref] = getItemReferences(result);
+      expect(ref.url).toBe("https://canonical.ref/1");
+      expect(ref.title).toBe("Canonical Ref");
+      expect(ref.snippet).toBe("Canonical snippet");
+      expect(ref.keyParagraph).toBe("Canonical key excerpt");
+      expect(ref.bonus).toBe(true);
+    });
+
+    it("ignores retired turn-level refs from history payloads", () => {
       const api = makeApiItem({
         history: [
           { role: "user", msg: "Question" },
@@ -108,39 +156,28 @@
       });
       const result = groundTruthFromApi(api);
 
-      // Refs from history[1] should have messageIndex 1
-      const allRefs = getItemReferences(result);
-      const refsAt1 = allRefs.filter((r) => r.messageIndex === 1);
-      expect(refsAt1).toHaveLength(2);
-      expect(refsAt1.map((r) => r.url)).toEqual([
-        "https://ref1.com",
-        "https://ref2.com",
-      ]);
-
-      // Refs from history[3] should have messageIndex 3
-      const refsAt3 = allRefs.filter((r) => r.messageIndex === 3);
-      expect(refsAt3).toHaveLength(1);
-      expect(refsAt3[0].url).toBe("https://ref3.com");
+      expect(getItemReferences(result)).toEqual([]);
     });
 
-    it("maps ref fields correctly", () => {
+    it("maps canonical plugin reference fields correctly", () => {
       const api = makeApiItem({
-        history: [
-          { role: "user", msg: "Q" },
-          {
-            role: "assistant",
-            msg: "A",
-            refs: [
-              {
-                url: "https://example.com",
-                title: "Example Title",
-                content: "Snippet content",
-                keyExcerpt: "Key paragraph",
-                bonus: true,
-              },
-            ],
+        plugins: {
+          "rag-compat": {
+            kind: "rag-compat",
+            version: "1.0",
+            data: {
+              references: [
+                {
+                  url: "https://example.com",
+                  title: "Example Title",
+                  content: "Snippet content",
+                  keyExcerpt: "Key paragraph",
+                  bonus: true,
+                },
+              ],
+            },
           },
-        ],
+        },
       });
       const result = groundTruthFromApi(api);
       const ref = getItemReferences(result)[0];
@@ -155,85 +192,118 @@
     });
   });
 
-  describe("legacy single-turn conversion", () => {
-    it("creates 2-turn history from editedQuestion and answer", () => {
+  describe("retired single-turn compat behavior", () => {
+    it("does not synthesize history from editedQuestion and answer", () => {
       const api = makeApiItem({
-        editedQuestion: "What is X?",
-        answer: "X is Y",
         history: undefined,
+        ...withCompatData({
+          editedQuestion: "What is X?",
+          answer: "X is Y",
+        }),
       });
       const result = groundTruthFromApi(api);
-      expect(result.history).toHaveLength(2);
-      expect(result.history?.[0]).toMatchObject({
-        role: "user",
-        content: "What is X?",
-      });
-      expect(result.history?.[1]).toMatchObject({
-        role: "agent",
-        content: "X is Y",
-      });
+      expect(result.history).toBeUndefined();
     });
 
-    it("falls back to synthQuestion when editedQuestion is empty", () => {
+    it("does not fall back to synthQuestion when editedQuestion is empty", () => {
       const api = makeApiItem({
-        synthQuestion: "Synth question?",
-        editedQuestion: "",
-        answer: "Answer",
         history: undefined,
+        ...withCompatData({
+          synthQuestion: "Synth question?",
+          editedQuestion: "",
+          answer: "Answer",
+        }),
       });
       const result = groundTruthFromApi(api);
-      expect(result.history?.[0].content).toBe("Synth question?");
+      expect(result.history).toBeUndefined();
     });
 
-    it("assigns legacy top-level refs to messageIndex 1", () => {
+    it("does not import legacy top-level refs", () => {
       const api = makeApiItem({
-        editedQuestion: "Question",
-        answer: "Answer",
-        refs: [
-          {
-            url: "https://legacy.ref",
-            content: "Legacy content",
-            bonus: false,
-          },
-        ],
         history: undefined,
+        ...withCompatData({
+          editedQuestion: "Question",
+          answer: "Answer",
+          refs: [
+            {
+              url: "https://legacy.ref",
+              content: "Legacy content",
+              bonus: false,
+            },
+          ],
+        }),
       });
       const result = groundTruthFromApi(api);
-      expect(getItemReferences(result)).toHaveLength(1);
-      expect(getItemReferences(result)[0].messageIndex).toBe(1);
+      expect(getItemReferences(result)).toEqual([]);
     });
 
-    it("creates empty agent turn when answer is empty", () => {
+    it("does not create synthetic turns when answer is empty", () => {
       const api = makeApiItem({
-        editedQuestion: "Question without answer",
-        answer: "",
         history: undefined,
+        ...withCompatData({
+          editedQuestion: "Question without answer",
+          answer: "",
+        }),
+      });
+      const result = groundTruthFromApi(api);
+
+      expect(result.history).toBeUndefined();
+    });
+
+    it("treats explicit API empty history as authoritative over compat question/answer", () => {
+      const api = makeApiItem({
+        history: [],
+        ...withCompatData({
+          editedQuestion: "Compat question",
+          answer: "Compat answer",
+        }),
       });
       const result = groundTruthFromApi(api);
-      expect(result.history).toHaveLength(2);
-      expect(result.history?.[1].content).toBe("");
+      expect(result.history).toEqual([]);
     });
   });
 
   describe("multi-turn item top-level refs", () => {
-    it("assigns top-level refs to undefined messageIndex for true multi-turn", () => {
+    it("does not import compat refs for true multi-turn", () => {
       const api = makeApiItem({
         history: [
           { role: "user", msg: "Q" },
           { role: "assistant", msg: "A" },
         ],
-        refs: [
-          { url: "https://global.ref", content: "Global ref", bonus: false },
-        ],
+        ...withCompatData({
+          refs: [
+            {
+              url: "https://global.ref",
+              content: "Global ref",
+              bonus: false,
+            },
+          ],
+        }),
+      });
+      const result = groundTruthFromApi(api);
+
+      expect(getItemReferences(result)).toEqual([]);
+    });
+
+    it("treats explicit empty canonical references as authoritative", () => {
+      const api = makeApiItem({
+        history: [{ role: "assistant", msg: "A" }],
+        ...withCompatData({
+          references: [],
+          retrievals: {
+            _unassociated: {
+              candidates: [{ url: "https://stale.ref", messageIndex: 0 }],
+            },
+          },
+        }),
       });
       const result = groundTruthFromApi(api);
-      expect(getItemReferences(result)).toHaveLength(1);
-      expect(getItemReferences(result)[0].messageIndex).toBeUndefined();
+      expect(getItemReferences(result)).toEqual([]);
     });
   });
 
@@ -348,7 +418,7 @@
       id: "gt-1",
       providerId: "api",
       question: "Test question",
-      answer: "Test answer",
+      history: [{ role: "agent", content: "Test answer" }],
      status: "draft",
      deleted: false,
      tags: [],
@@ -358,7 +428,7 @@
   }
 
   describe("role mapping", () => {
-    it("maps UI role 'agent' to API role 'assistant'", () => {
+    it("preserves UI role values when serializing patch payloads", () => {
       const item = makeDomainItem({
         history: [
           { role: "user", content: "Q" },
@@ -368,12 +438,160 @@
       const patch = groundTruthToPatch({ item });
 
       expect(patch.history?.[0].role).toBe("user");
-      expect(patch.history?.[1].role).toBe("assistant");
+      expect(patch.history?.[1].role).toBe("agent");
     });
   });
 
   describe("reference handling", () => {
-    it("includes refs only on agent turns in history", () => {
+    it("does not create rag-compat plugin when history exists without compat payload", () => {
+      const item = makeDomainItem({
+        history: [
+          { role: "user", content: "Q" },
+          { role: "agent", content: "A" },
+        ],
+        plugins: {
+          other: {
+            kind: "other",
+            version: "1.0",
+            data: { keep: true },
+          },
+        },
+      });
+      const patch = groundTruthToPatch({ item });
+      const patchPlugins = (patch as Record<string, unknown>).plugins as
+        | Record<string, { data?: Record<string, unknown> }>
+        | undefined;
+      expect(patchPlugins?.other?.data).toEqual({ keep: true });
+      expect(patchPlugins?.["rag-compat"]).toBeUndefined();
+    });
+
+    it("round-trips canonical rag-compat data.references through patch generation", () => {
+      const item = makeDomainItem({
+        history: [
+          { role: "user", content: "Q" },
+          { role: "agent", content: "A" },
+        ],
+        plugins: {
+          "rag-compat": {
+            kind: "rag-compat",
+            version: "1.0",
+            data: {
+              references: [
+                {
+                  url: "https://canonical.roundtrip/ref",
+                  title: "Round Trip Ref",
+                  content: "Round trip snippet",
+                  keyExcerpt: "Round trip excerpt",
+                  messageIndex: 1,
+                },
+              ],
+            },
+          },
+        },
+      });
+
+      const patch = groundTruthToPatch({ item });
+      expect(patch.history?.[1].refs).toBeUndefined();
+      const patchPlugins = (patch as Record<string, unknown>).plugins as
+        | Record<string, { data?: Record<string, unknown> }>
+        | undefined;
+      expect(patchPlugins?.["rag-compat"]?.data?.references).toEqual([
+        expect.objectContaining({
+          url: "https://canonical.roundtrip/ref",
+          title: "Round Trip Ref",
+        }),
+      ]);
+    });
+
+    it("materializes retrieval-only rag-compat payloads into canonical references during save patch", () => {
+      const fromApi = groundTruthFromApi(
+        makeApiItem({
+          history: [
+            { role: "user", msg: "Q", turnId: "turn-user" },
+            { role: "assistant", msg: "A", turnId: "turn-answer" },
+          ],
+          ...withCompatData({
+            retrievals: {
+              tc1: {
+                candidates: [
+                  {
+                    url: "https://retrieval.only/ref",
+                    title: "Retrieval Only Ref",
+                    chunk: "retrieval snippet",
+                    messageIndex: 1,
+                  },
+                ],
+              },
+            },
+          }),
+        }),
+      );
+
+      const patch = groundTruthToPatch({ item: fromApi });
+      const patchPlugins = (patch as Record<string, unknown>).plugins as
+        | Record<string, { data?: Record<string, unknown> }>
+        | undefined;
+
+      expect(patchPlugins?.["rag-compat"]?.data?.references).toEqual([
+        expect.objectContaining({
+          url: "https://retrieval.only/ref",
+          title: "Retrieval Only Ref",
+          toolCallId: "tc1",
+          turnId: "turn-answer",
+        }),
+      ]);
+      expect(patchPlugins?.["rag-compat"]?.data?.retrievals).toBeUndefined();
+    });
+
+    it("scrubs removed legacy compat keys and deprecated compat retrievals", () => {
+      const item = makeDomainItem({
+        history: [
+          { role: "user", content: "Q" },
+          { role: "agent", content: "A" },
+        ],
+        plugins: {
+          "rag-compat": {
+            kind: "rag-compat",
+            version: "1.0",
+            data: {
+              synthQuestion: "legacy question",
+              editedQuestion: "legacy edited",
+              answer: "legacy answer",
+              refs: [{ url: "https://legacy.ref" }],
+              totalReferences: 99,
+              historyAnnotations: [{ note: "legacy" }],
+              references: [{ url: "https://canonical.ref" }],
+              retrievals: {
+                _unassociated: {
+                  candidates: [{ url: "https://retrieval.ref" }],
+                },
+              },
+            },
+          },
+        },
+      });
+
+      const patch = groundTruthToPatch({ item });
+      const patchPlugins = (patch as Record<string, unknown>).plugins as
+        | Record<string, { data?: Record<string, unknown> }>
+        | undefined;
+      const compatData = patchPlugins?.["rag-compat"]?.data;
+
+      expect(compatData).toBeDefined();
+      expect(compatData?.synthQuestion).toBeUndefined();
+      expect(compatData?.editedQuestion).toBeUndefined();
+      expect(compatData?.answer).toBeUndefined();
+      expect(compatData?.refs).toBeUndefined();
+      expect(compatData?.totalReferences).toBeUndefined();
+      expect(compatData?.historyAnnotations).toBeUndefined();
+      expect(compatData?.references).toEqual([
+        expect.objectContaining({ url: "https://canonical.ref" }),
+      ]);
+      expect(compatData?.retrievals).toBeUndefined();
+      expect(compatData?.turnIdentity).toBeUndefined();
+    });
+
+    it("does not emit retired refs on user or agent turns in history", () => {
       const item = makeDomainItem({
         history: [
"Q" }, @@ -433,19 +645,12 @@ describe("groundTruthToPatch", () => { }, }, }); - const patch = groundTruthToPatch({ item, originalApi }); + const patch = groundTruthToPatch({ item }); - // Top-level refs should include refs with messageIndex 1 - expect(patch.refs).toHaveLength(2); - expect(patch.refs?.map((r) => r.url)).toContain("https://legacy.ref"); - expect(patch.refs?.map((r) => r.url)).toContain("https://new.ref"); + expect(patch.history?.[1]?.refs).toBeUndefined(); }); - it("preserves top-level refs when legacy items use empty history arrays", () => { - const originalApi = makeApiItem({ - history: [], - refs: [{ url: "https://legacy-empty.ref", bonus: false }], - }); + it("does not emit refs when item history exists", () => { const item = makeDomainItem({ history: [ { role: "user", content: "Q" }, @@ -468,27 +673,12 @@ describe("groundTruthToPatch", () => { }, }, }); - const patch = groundTruthToPatch({ item, originalApi }); + const patch = groundTruthToPatch({ item }); - expect(patch.refs).toHaveLength(2); - expect(patch.refs?.map((r) => r.url)).toContain( - "https://legacy-empty.ref", - ); - expect(patch.refs?.map((r) => r.url)).toContain("https://new-empty.ref"); + expect(patch.history?.[1]?.refs).toBeUndefined(); }); - it("omits top-level refs for true multi-turn items", () => { - const originalApi = makeApiItem({ - history: [ - { role: "user", msg: "Q" }, - { - role: "assistant", - msg: "A", - refs: [{ url: "https://turn.ref", bonus: false }], - }, - ], - refs: [], - }); + it("does not serialize refs into assistant history entries", () => { const item = makeDomainItem({ history: [ { role: "user", content: "Q" }, @@ -508,16 +698,12 @@ describe("groundTruthToPatch", () => { }, }, }); - const patch = groundTruthToPatch({ item, originalApi }); - - // Top-level refs should be empty for true multi-turn - expect(patch.refs).toHaveLength(0); + const patch = groundTruthToPatch({ item }); - // Refs should be in history - expect(patch.history?.[1].refs).toHaveLength(1); + expect(patch.history?.[1].refs).toBeUndefined(); }); - it("maps ref fields correctly in patch", () => { + it("omits ref fields from patch history entries", () => { const item = makeDomainItem({ history: [ { role: "user", content: "Q" }, @@ -547,13 +733,7 @@ describe("groundTruthToPatch", () => { }, }); const patch = groundTruthToPatch({ item }); - const ref = patch.history?.[1].refs?.[0]; - - expect(ref?.url).toBe("https://example.com"); - expect(ref?.title).toBe("Title"); - expect(ref?.content).toBe("Snippet"); - expect(ref?.keyExcerpt).toBe("Key"); - expect(ref?.bonus).toBe(true); + expect(patch.history?.[1].refs).toBeUndefined(); }); }); @@ -630,14 +810,22 @@ describe("groundTruthToPatch", () => { }); describe("basic field mapping", () => { - it("includes answer and editedQuestion", () => { + it("serializes canonical history content without role remapping", () => { const item = makeDomainItem({ - question: "My question", - answer: "My answer", + history: [ + { role: "user", content: "My question" }, + { role: "agent", content: "My answer" }, + ], }); const patch = groundTruthToPatch({ item }); - expect(patch.answer).toBe("My answer"); - expect(patch.editedQuestion).toBe("My question"); + expect(patch.history?.[0]).toMatchObject({ + role: "user", + msg: "My question", + }); + expect(patch.history?.[1]).toMatchObject({ + role: "agent", + msg: "My answer", + }); }); it("includes manualTags", () => { diff --git a/frontend/tests/unit/adapters/apiProvider-etag.test.ts b/frontend/tests/unit/adapters/apiProvider-etag.test.ts 
diff --git a/frontend/tests/unit/adapters/apiProvider-etag.test.ts b/frontend/tests/unit/adapters/apiProvider-etag.test.ts
index 4377739..9af17ed 100644
--- a/frontend/tests/unit/adapters/apiProvider-etag.test.ts
+++ b/frontend/tests/unit/adapters/apiProvider-etag.test.ts
@@ -1,14 +1,12 @@
 import { beforeEach, describe, expect, it, vi } from "vitest";
 import { ApiProvider } from "../../../src/adapters/apiProvider";
 import type { components } from "../../../src/api/generated";
-import type { GroundTruthItem } from "../../../src/models/groundTruth";
+import {
+  type GroundTruthItem,
+  getLastAgentTurn,
+} from "../../../src/models/groundTruth";
 
 type ApiItem = components["schemas"]["AgenticGroundTruthEntry-Output"] & {
-  synthQuestion?: string | null;
-  editedQuestion?: string | null;
-  answer?: string | null;
-  refs?: components["schemas"]["Reference"][];
-  totalReferences?: number;
   tags?: string[];
   comment?: string | null;
 };
@@ -40,11 +38,7 @@ function makeApiItem(overrides: Partial<ApiItem> = {}): ApiItem {
   return {
     id: "gt-1",
     status: "draft",
-    answer: "Original answer",
-    synthQuestion: "Synth question",
-    editedQuestion: "Edited question",
     history: [],
-    refs: [],
     tags: [],
     comment: null,
     datasetName: "dataset-1",
@@ -67,7 +61,7 @@ describe("ApiProvider ETag 412 retry behavior", () => {
     const freshItem = makeApiItem({ _etag: "etag-fresh" });
     const updatedItem = makeApiItem({
       _etag: "etag-after-update",
-      answer: "Updated answer",
+      history: [{ role: "assistant", msg: "Updated answer" }],
     });
 
     mockGetMyAssignments.mockResolvedValue([originalItem]);
@@ -85,7 +79,7 @@
 
     const domainItem: GroundTruthItem = {
       ...items[0],
-      answer: "Updated answer",
+      history: [{ role: "agent", content: "Updated answer" }],
     };
     const result = await provider.save(domainItem);
 
@@ -108,7 +102,7 @@
     expect(mockUpdateAssignedGroundTruth.mock.calls[1][4]).toBe("etag-fresh");
 
     // Result should be the updated item
-    expect(result.answer).toBe("Updated answer");
+    expect(getLastAgentTurn(result)).toBe("Updated answer");
   });
 
   it("updates cache with fresh ETag after 412 retry", async () => {
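The first test pins down the optimistic-concurrency contract: the save path sends the cached `_etag`, and on a 412 it re-fetches the assignment and retries once with the fresh tag. A minimal sketch of that loop, with `updateWithEtag`, `fetchFresh`, and the `status` error field as hypothetical stand-ins for the provider's internals:

```ts
// Hypothetical sketch of the 412-retry loop exercised by the test above.
async function saveWithRetry<T extends { _etag?: string }>(
  item: T,
  updateWithEtag: (item: T, etag: string | undefined) => Promise<T>,
  fetchFresh: () => Promise<T>,
): Promise<T> {
  try {
    // First attempt uses the cached ETag (an If-Match precondition server-side).
    return await updateWithEtag(item, item._etag);
  } catch (err) {
    if ((err as { status?: number }).status !== 412) throw err;
    // Precondition failed: another writer got there first. Re-fetch to pick
    // up the fresh ETag, then retry exactly once.
    const fresh = await fetchFresh();
    return updateWithEtag(item, fresh._etag);
  }
}
```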
diff --git a/frontend/tests/unit/components/app/CurateLayout.integration.test.tsx b/frontend/tests/unit/components/app/CurateLayout.integration.test.tsx
index b61d784..65ecb5b 100644
--- a/frontend/tests/unit/components/app/CurateLayout.integration.test.tsx
+++ b/frontend/tests/unit/components/app/CurateLayout.integration.test.tsx
@@ -12,7 +12,6 @@ function MiniCurateApp() {
     {
       id: "1",
       question: "Q-1",
-      answer: "",
       history: [{ role: "user", content: "Q-1" }],
       status: "draft",
       providerId: "json",
@@ -21,7 +20,6 @@
     {
       id: "2",
       question: "Q-2",
-      answer: "",
       history: [{ role: "user", content: "Q-2" }],
       status: "draft",
       providerId: "json",
diff --git a/frontend/tests/unit/components/app/QuestionsExplorer.test.tsx b/frontend/tests/unit/components/app/QuestionsExplorer.test.tsx
index 2b292a7..97badc4 100644
--- a/frontend/tests/unit/components/app/QuestionsExplorer.test.tsx
+++ b/frontend/tests/unit/components/app/QuestionsExplorer.test.tsx
@@ -47,14 +47,21 @@ vi.mock("../../../../src/services/datasets", () => ({
 
 const createMockItem = (
   overrides: Partial<QuestionsExplorerItem> = {},
-): QuestionsExplorerItem => ({
-  id: "item-1",
-  question: "Test Question",
-  answer: "Test Answer",
-  status: "draft",
-  providerId: "test",
-  ...overrides,
-});
+): QuestionsExplorerItem => {
+  const question = overrides.question ?? "Test Question";
+  const history = overrides.history ?? [
+    { role: "user", content: question },
+    { role: "agent", content: "Test Answer" },
+  ];
+  return {
+    id: "item-1",
+    question,
+    history,
+    status: "draft",
+    providerId: "test",
+    ...overrides,
+  };
+};
 
 describe("QuestionsExplorer", () => {
   const mockOnAssign = vi.fn();
@@ -109,14 +116,14 @@
     async (
       params: {
         itemId?: string;
-        refUrl?: string;
+        pluginFilter?: string[];
         keyword?: string;
         page?: number;
       } = {},
     ) => {
       const hasTextFilter =
         Boolean(params.itemId) ||
-        Boolean(params.refUrl) ||
+        Boolean(params.pluginFilter?.length) ||
         Boolean(params.keyword);
       const page = typeof params.page === "number" ? params.page : 1;
       const totalPages = hasTextFilter ? 1 : 3;
@@ -405,7 +412,9 @@
       name: "reference URL",
       label: "Reference URL:",
       value: "https://example.com/ref",
-      expectedFilter: { refUrl: "https://example.com/ref" },
+      expectedFilter: {
+        pluginFilter: ["rag-compat:refUrl=https://example.com/ref"],
+      },
     },
     {
       name: "keyword",
diff --git a/frontend/tests/unit/components/app/pages/CuratePane.test.tsx b/frontend/tests/unit/components/app/pages/CuratePane.test.tsx
index 9498d62..1075bfe 100644
--- a/frontend/tests/unit/components/app/pages/CuratePane.test.tsx
+++ b/frontend/tests/unit/components/app/pages/CuratePane.test.tsx
@@ -5,7 +5,7 @@ import type { GroundTruthItem } from "../../../../../src/models/groundTruth";
 const item: GroundTruthItem = {
   id: "1",
   question: "What is this software?",
-  answer: "",
+  history: [{ role: "agent", content: "" }],
   status: "draft",
   providerId: "json",
   tags: [],
diff --git a/frontend/tests/unit/components/app/pages/QuestionsList.test.tsx b/frontend/tests/unit/components/app/pages/QuestionsList.test.tsx
index 36d7247..3e3440e 100644
--- a/frontend/tests/unit/components/app/pages/QuestionsList.test.tsx
+++ b/frontend/tests/unit/components/app/pages/QuestionsList.test.tsx
@@ -5,7 +5,7 @@ const mkItem = (id: string, deleted = false): GroundTruthItem => ({
   id,
   question: `Q-${id}`,
-  answer: "",
+  history: [{ role: "agent", content: "" }],
   tags: [],
   status: "draft",
   providerId: "json",
diff --git a/frontend/tests/unit/components/app/pages/ReferencesSection.test.tsx b/frontend/tests/unit/components/app/pages/ReferencesSection.test.tsx
index 87a07dd..52b5778 100644
--- a/frontend/tests/unit/components/app/pages/ReferencesSection.test.tsx
+++ b/frontend/tests/unit/components/app/pages/ReferencesSection.test.tsx
@@ -85,7 +85,7 @@ const makeItem = (
 ): GroundTruthItem => ({
   id: "i1",
   question: "Q",
-  answer: "A",
+  history: [{ role: "agent", content: "A" }],
   status: "draft",
   providerId: "test",
   ...overrides,
diff --git a/frontend/tests/unit/hooks/useGroundTruth-deleteTurn.test.tsx b/frontend/tests/unit/hooks/useGroundTruth-deleteTurn.test.tsx
index 50f7545..a999b72 100644
--- a/frontend/tests/unit/hooks/useGroundTruth-deleteTurn.test.tsx
+++ b/frontend/tests/unit/hooks/useGroundTruth-deleteTurn.test.tsx
@@ -1,6 +1,10 @@
 import { act, renderHook, waitFor } from "@testing-library/react";
 import type { ConversationTurn } from "../../../src/models/groundTruth";
-import { getItemReferences } from "../../../src/models/groundTruth";
+import {
+  getItemReferences,
+  getLastAgentTurn,
+  getLastUserTurn,
+} from "../../../src/models/groundTruth";
 
 vi.mock("../../../src/config/demo", () => ({
   default: true,
@@ -181,18 +185,18 @@
       },
     ];
     await seedHistory(result, history);
-    expect(result.current.current?.question).toBe("Second question");
-    expect(result.current.current?.answer).toBe("Second answer");
+    expect(getLastUserTurn(result.current.current!)).toBe("Second question");
+    expect(getLastAgentTurn(result.current.current!)).toBe("Second answer");
     await act(async () => {
       result.current.deleteTurn(3);
     });
-    expect(result.current.current?.question).toBe("Second question");
-    expect(result.current.current?.answer).toBe("First answer");
+    expect(getLastUserTurn(result.current.current!)).toBe("Second question");
+    expect(getLastAgentTurn(result.current.current!)).toBe("First answer");
     await act(async () => {
       result.current.deleteTurn(2);
     });
-    expect(result.current.current?.question).toBe("First question");
-    expect(result.current.current?.answer).toBe("First answer");
+    expect(getLastUserTurn(result.current.current!)).toBe("First question");
+    expect(getLastAgentTurn(result.current.current!)).toBe("First answer");
   });
 
   it("handles empty or out-of-range deletions without breaking canonical state", async () => {
@@ -208,7 +212,7 @@
       result.current.deleteTurn(0);
     });
     expect(result.current.current?.history).toHaveLength(0);
-    expect(result.current.current?.question).toBe("");
-    expect(result.current.current?.answer).toBe("");
+    expect(getLastUserTurn(result.current.current!)).toBe("");
+    expect(getLastAgentTurn(result.current.current!)).toBe("");
   });
 });
diff --git a/frontend/tests/unit/hooks/useGroundTruth-multiturn.test.tsx b/frontend/tests/unit/hooks/useGroundTruth-multiturn.test.tsx
index 41b00bd..2c5452b 100644
--- a/frontend/tests/unit/hooks/useGroundTruth-multiturn.test.tsx
+++ b/frontend/tests/unit/hooks/useGroundTruth-multiturn.test.tsx
@@ -1,6 +1,10 @@
 import { act, renderHook, waitFor } from "@testing-library/react";
 import type { ConversationTurn } from "../../../src/models/groundTruth";
-import { getItemReferences } from "../../../src/models/groundTruth";
+import {
+  getItemReferences,
+  getLastAgentTurn,
+  getLastUserTurn,
+} from "../../../src/models/groundTruth";
 
 vi.mock("../../../src/config/demo", () => ({
   default: true,
@@ -54,8 +58,8 @@
     expect(
       result.current.current?.history?.every((turn) => !!turn.turnId),
     ).toBe(true);
-    expect(result.current.current?.question).toBe("New question");
-    expect(result.current.current?.answer).toBe("Fresh answer");
+    expect(getLastUserTurn(result.current.current!)).toBe("New question");
+    expect(getLastAgentTurn(result.current.current!)).toBe("Fresh answer");
   });
 
   it("addTurn appends to history and keeps question/answer in sync", async () => {
@@ -67,7 +71,7 @@
     expect(result.current.current?.history?.length).toBe(
       initialHistoryLength + 1,
     );
-    expect(result.current.current?.question).toBe("Follow-up question");
+    expect(getLastUserTurn(result.current.current!)).toBe("Follow-up question");
     await act(async () => {
       result.current.addTurn("agent", "Agent reply");
     });
@@ -75,7 +79,7 @@
       role: "agent",
       content: "Agent reply",
     });
-    expect(result.current.current?.answer).toBe("Agent reply");
+    expect(getLastAgentTurn(result.current.current!)).toBe("Agent reply");
   });
 
   it("stateSignature ignores visitedAt mutations for hasUnsaved", async () => {
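With the `answer` projection gone, callers that previously compared `current?.question` / `current?.answer` now derive both values from history, and `withDerivedLegacyFields` keeps only `question`. A small illustration of the pattern the model tests below rely on; the item literal and import path are samples, not taken from the suite:

```ts
import {
  getLastAgentTurn,
  withDerivedLegacyFields,
} from "./frontend/src/models/groundTruth";

// Abbreviated sample item: `question` is derived, `answer` is not.
const derived = withDerivedLegacyFields({
  id: "gt-1",
  providerId: "demo",
  status: "draft",
  history: [
    { role: "user", content: "First question" },
    { role: "agent", content: "First answer" },
    { role: "user", content: "Follow-up" },
    { role: "assistant", content: "Updated answer" },
  ],
});

console.assert(derived.question === "Follow-up");
// There is no derived.answer anymore; callers ask the history directly.
console.assert(getLastAgentTurn(derived) === "Updated answer");
```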
diff --git a/frontend/tests/unit/models/groundTruth.multiturn.test.ts b/frontend/tests/unit/models/groundTruth.multiturn.test.ts
index 15546f0..2c714a3 100644
--- a/frontend/tests/unit/models/groundTruth.multiturn.test.ts
+++ b/frontend/tests/unit/models/groundTruth.multiturn.test.ts
@@ -7,6 +7,7 @@ import {
   getLastUserTurn,
   getTurnCount,
   isMultiTurn,
+  withDerivedLegacyFields,
 } from "../../../src/models/groundTruth";
 
 describe("groundTruth multi-turn helpers", () => {
@@ -16,7 +17,7 @@
     id: "item-1",
     providerId: "demo",
     question: "fallback question",
-    answer: "fallback answer",
+    history: [{ role: "agent", content: "fallback answer" }],
     status: "draft",
     ...overrides,
   });
@@ -58,6 +59,16 @@
       expect(getLastAgentTurn(item)).toBe("");
     });
 
+    it("treats custom non-user roles as answer turns", () => {
+      const item = makeItem({
+        history: [
+          { role: "user", content: "User" },
+          { role: "planner", content: "Intermediate planner output" },
+        ],
+      });
+      expect(getLastAgentTurn(item)).toBe("Intermediate planner output");
+    });
+
     it("returns latest matching turn content", () => {
       const item = makeItem({
         history: [
@@ -70,6 +81,29 @@
       expect(getLastUserTurn(item)).toBe("Follow-up");
       expect(getLastAgentTurn(item)).toBe("Updated answer");
     });
+
+    it("derives compatibility question from the latest user turn for cross-layer parity", () => {
+      const item = makeItem({
+        history: [
+          { role: "user", content: "Initial question" },
+          { role: "planner", content: "Interim planning output" },
+          { role: "user", content: "Follow-up question" },
+          { role: "assistant", content: "Final answer" },
+        ],
+      });
+      expect(withDerivedLegacyFields(item).question).toBe("Follow-up question");
+    });
+
+    it("returns the last non-user turn regardless of role label", () => {
+      const item = makeItem({
+        history: [
+          { role: "user", content: "Question" },
+          { role: "assistant", content: "Assistant output" },
+          { role: "planner", content: "Planner output" },
+        ],
+      });
+      expect(getLastAgentTurn(item)).toBe("Planner output");
+    });
   });
 
   describe("conversation metadata helpers", () => {
diff --git a/frontend/tests/unit/models/gtHelpers.expectedBehavior.test.ts b/frontend/tests/unit/models/gtHelpers.expectedBehavior.test.ts
index 12bb707..2c20da5 100644
--- a/frontend/tests/unit/models/gtHelpers.expectedBehavior.test.ts
+++ b/frontend/tests/unit/models/gtHelpers.expectedBehavior.test.ts
@@ -23,7 +23,6 @@ describe("canApproveMultiTurn - Expected Behavior Validation", () => {
   id: "test-1",
   providerId: "test",
   question: "Test question",
-  answer: "Test answer",
   status: "draft",
   expectedTools: { required: [{ name: "search" }] },
   toolCalls: [{ id: "tc1", name: "search", callType: "tool" }],
@@ -311,7 +310,7 @@ describe("canApproveMultiTurn - expectedTools gating", () => {
   id: "test-et",
   providerId: "test",
   question: "Test question",
-  answer: "Test answer",
+  history: [{ role: "agent", content: "Test answer" }],
   status: "draft",
 };
 const validHistory: ConversationTurn[] = [
"../../../src/models/provider"; describe("JsonProvider duplicate", () => { @@ -16,7 +19,7 @@ describe("JsonProvider duplicate", () => { expect(created.id.startsWith("temp-")).toBe(true); // Core fields copied expect(created.question).toBe(original.question); - expect(created.answer).toBe(original.answer); + expect(getLastAgentTurn(created)).toBe(getLastAgentTurn(original)); expect(getItemReferences(created).length).toBe( getItemReferences(original).length, ); diff --git a/frontend/tests/unit/provider/provider.multiturn.test.ts b/frontend/tests/unit/provider/provider.multiturn.test.ts index 27687d4..5e808bd 100644 --- a/frontend/tests/unit/provider/provider.multiturn.test.ts +++ b/frontend/tests/unit/provider/provider.multiturn.test.ts @@ -1,4 +1,5 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { ApiReference } from "../../../src/adapters/apiMapper"; import { ApiProvider } from "../../../src/adapters/apiProvider"; import type { components } from "../../../src/api/generated"; import type { @@ -34,18 +35,15 @@ vi.mock("../../../src/services/groundTruths", () => ({ })); type ApiHistoryEntry = components["schemas"]["HistoryEntry"] & { - refs?: components["schemas"]["Reference"][]; + refs?: ApiReference[]; expectedBehavior?: string[]; + turnId?: string; + stepId?: string; }; type ApiItem = Omit< components["schemas"]["AgenticGroundTruthEntry-Output"], "history" > & { - synthQuestion?: string | null; - editedQuestion?: string | null; - answer?: string | null; - refs?: components["schemas"]["Reference"][]; - totalReferences?: number; tags?: string[]; comment?: string | null; history?: ApiHistoryEntry[]; @@ -57,11 +55,7 @@ function makeApiItem(overrides: Partial = {}): ApiItem { return { id: "gt-1", status: "draft", - answer: "Original answer", - synthQuestion: "Synth question", - editedQuestion: "Edited question", history: [], - refs: [], tags: [], comment: null, datasetName: "dataset-1", @@ -71,6 +65,20 @@ function makeApiItem(overrides: Partial = {}): ApiItem { } as ApiItem; } +function withCompatData( + data: Record, +): Pick { + return { + plugins: { + "rag-compat": { + kind: "rag-compat", + version: "1.0", + data, + }, + }, + }; +} + beforeEach(() => { mockGetMyAssignments.mockReset(); mockUpdateAssignedGroundTruth.mockReset(); @@ -94,14 +102,14 @@ describe("ApiProvider mapping", () => { expect(history).toHaveLength(2); expect(history[0]).toMatchObject({ role: "user", content: "How do I?" 
diff --git a/frontend/tests/unit/provider/provider.multiturn.test.ts b/frontend/tests/unit/provider/provider.multiturn.test.ts
index 27687d4..5e808bd 100644
--- a/frontend/tests/unit/provider/provider.multiturn.test.ts
+++ b/frontend/tests/unit/provider/provider.multiturn.test.ts
@@ -1,4 +1,5 @@
 import { beforeEach, describe, expect, it, vi } from "vitest";
+import type { ApiReference } from "../../../src/adapters/apiMapper";
 import { ApiProvider } from "../../../src/adapters/apiProvider";
 import type { components } from "../../../src/api/generated";
 import type {
@@ -34,18 +35,15 @@ vi.mock("../../../src/services/groundTruths", () => ({
 }));
 
 type ApiHistoryEntry = components["schemas"]["HistoryEntry"] & {
-  refs?: components["schemas"]["Reference"][];
+  refs?: ApiReference[];
   expectedBehavior?: string[];
+  turnId?: string;
+  stepId?: string;
 };
 
 type ApiItem = Omit<
   components["schemas"]["AgenticGroundTruthEntry-Output"],
   "history"
 > & {
-  synthQuestion?: string | null;
-  editedQuestion?: string | null;
-  answer?: string | null;
-  refs?: components["schemas"]["Reference"][];
-  totalReferences?: number;
   tags?: string[];
   comment?: string | null;
   history?: ApiHistoryEntry[];
@@ -57,11 +55,7 @@ function makeApiItem(overrides: Partial<ApiItem> = {}): ApiItem {
   return {
     id: "gt-1",
     status: "draft",
-    answer: "Original answer",
-    synthQuestion: "Synth question",
-    editedQuestion: "Edited question",
     history: [],
-    refs: [],
     tags: [],
     comment: null,
     datasetName: "dataset-1",
@@ -71,6 +65,20 @@
   } as ApiItem;
 }
 
+function withCompatData(
+  data: Record<string, unknown>,
+): Pick<ApiItem, "plugins"> {
+  return {
+    plugins: {
+      "rag-compat": {
+        kind: "rag-compat",
+        version: "1.0",
+        data,
+      },
+    },
+  };
+}
+
 beforeEach(() => {
   mockGetMyAssignments.mockReset();
   mockUpdateAssignedGroundTruth.mockReset();
@@ -94,14 +102,14 @@ describe("ApiProvider mapping", () => {
     expect(history).toHaveLength(2);
     expect(history[0]).toMatchObject({ role: "user", content: "How do I?" });
     expect(history[1]).toMatchObject({
-      role: "agent",
+      role: "assistant",
       content: "Use the regenerate command.",
     });
     expect(history[0]?.turnId).toBeTruthy();
     expect(history[1]?.turnId).toBeTruthy();
   });
 
-  it("maps per-turn refs onto the owning non-user turn", async () => {
+  it("ignores retired history refs payloads on read", async () => {
     const apiItem = makeApiItem({
       history: [
         { role: "user", msg: "Q" },
@@ -122,77 +130,53 @@
     mockGetMyAssignments.mockResolvedValue([apiItem]);
     const provider = new ApiProvider();
     const { items } = await provider.list();
-    const turn = items[0].history?.[1];
-    const [ref] = getItemReferences(items[0]);
-    expect(ref).toMatchObject({
-      url: "https://turn.ref",
-      bonus: true,
-      messageIndex: 1,
-      turnId: turn?.turnId,
-    });
+    expect(getItemReferences(items[0])).toEqual([]);
   });
 });
 
-describe("compat-migration read projections", () => {
-  it("projects legacy single-turn payloads into stable user and agent turns", async () => {
+describe("retired compat read behavior", () => {
+  it("does not synthesize history from retired compat question/answer fields", async () => {
     const apiItem = makeApiItem({
-      synthQuestion: "What is X?",
-      editedQuestion: "What is X exactly?",
-      answer: "X is Y",
-      tags: ["important", "technical"],
       history: undefined,
+      ...withCompatData({
+        synthQuestion: "What is X?",
+        editedQuestion: "What is X exactly?",
+        answer: "X is Y",
+      }),
     });
     mockGetMyAssignments.mockResolvedValue([apiItem]);
     const provider = new ApiProvider();
     const { items } = await provider.list();
-    const history = items[0].history ?? [];
-    expect(history).toHaveLength(2);
-    expect(history[0]).toMatchObject({
-      role: "user",
-      content: "What is X exactly?",
-    });
-    expect(history[1]).toMatchObject({
-      role: "agent",
-      content: "X is Y",
-    });
+    expect(items[0].history).toBeUndefined();
   });
 
-  it("anchors legacy top-level refs to the synthesized agent turn even without an answer", async () => {
+  it("does not import retired compat refs into canonical references", async () => {
     const apiItem = makeApiItem({
-      editedQuestion: "How do I configure authentication for my app?",
-      answer: "",
-      refs: [
-        {
-          url: "https://docs.example.com/auth",
-          content: "Authentication documentation content",
-          keyExcerpt: "Use OAuth 2.0 for authentication",
-          bonus: false,
-        },
-      ],
       history: undefined,
+      ...withCompatData({
+        editedQuestion: "How do I configure authentication for my app?",
+        answer: "",
+        refs: [
+          {
+            url: "https://docs.example.com/auth",
+            content: "Authentication documentation content",
+            keyExcerpt: "Use OAuth 2.0 for authentication",
+            bonus: false,
+          },
+        ],
+      }),
    });
    mockGetMyAssignments.mockResolvedValue([apiItem]);
    const provider = new ApiProvider();
    const { items } = await provider.list();
-    const history = items[0].history ?? [];
-    const [ref] = getItemReferences(items[0]);
-    expect(history).toHaveLength(2);
-    expect(history[0]?.content).toBe(
-      "How do I configure authentication for my app?",
-    );
-    expect(history[1]).toMatchObject({ role: "agent", content: "" });
-    expect(ref).toMatchObject({
-      url: "https://docs.example.com/auth",
-      messageIndex: 1,
-      turnId: history[1]?.turnId,
-    });
+    expect(getItemReferences(items[0])).toEqual([]);
   });
 });
 
 describe("ApiProvider serialization", () => {
   describe("core-generic multi-turn writes", () => {
-    it("serializes history roles and keeps refs scoped to non-user turns", async () => {
+    it("serializes history roles and omits retired history refs", async () => {
       const apiItem = makeApiItem({
         history: [
           { role: "user", msg: "Original Q" },
@@ -207,10 +191,6 @@
             ...apiItem,
             id,
             history: (patch.history as ApiItem["history"]) ?? apiItem.history,
-            refs: (patch.refs as ApiItem["refs"]) ?? apiItem.refs,
-            answer: (patch.answer as string) ?? apiItem.answer,
-            editedQuestion:
-              (patch.editedQuestion as string) ?? apiItem.editedQuestion,
             status: (patch.status as ApiItem["status"]) ?? apiItem.status,
           } as ApiItem;
         },
@@ -230,11 +210,6 @@
           url: "https://turn",
           turnId: "turn-agent-updated",
         },
-        {
-          id: "user-ref",
-          url: "https://user",
-          turnId: "turn-user-updated",
-        },
       ];
       const updated: GroundTruthItem = withUpdatedReferences(
         { ...domain, history },
@@ -245,73 +220,29 @@
       const patch = capturedPatch as Patch;
       const patchHistory = patch.history as ApiItem["history"];
       expect(patchHistory?.[0]?.role).toBe("user");
-      expect(patchHistory?.[1]?.role).toBe("assistant");
+      expect(patchHistory?.[1]?.role).toBe("agent");
       expect(patchHistory?.[0]?.refs).toBeUndefined();
-      expect(patchHistory?.[1]?.refs).toHaveLength(1);
-      expect(patchHistory?.[1]?.refs?.[0]?.url).toBe("https://turn");
-    });
-
-    it("keeps true multi-turn refs out of top-level compatibility fields", async () => {
-      const apiItem = makeApiItem({
-        history: [
-          { role: "user", msg: "Question" },
-          {
-            role: "assistant",
-            msg: "Answer",
-            refs: [
-              {
-                url: "https://turn.ref",
-                content: "Turn content",
-                bonus: false,
-              },
-            ],
-          },
-        ],
-        refs: [],
-      });
-      let capturedPatch: Patch | undefined;
-      mockUpdateAssignedGroundTruth.mockImplementation(
-        async (
-          _dataset: string,
-          _bucket: string,
-          _id: string,
-          patch: Patch,
-        ) => {
-          capturedPatch = patch;
-          return apiItem;
-        },
-      );
-      mockGetMyAssignments.mockResolvedValue([apiItem]);
-      const provider = new ApiProvider();
-      const { items } = await provider.list();
-      await provider.save(items[0]);
-      const patch = capturedPatch as Patch;
-      const patchHistory = patch.history as ApiItem["history"];
-      expect(patch.refs).toHaveLength(0);
-      expect(patchHistory?.[1]?.refs).toHaveLength(1);
-      expect(patchHistory?.[1]?.refs?.[0]?.url).toBe("https://turn.ref");
+      expect(patchHistory?.[1]?.refs).toBeUndefined();
+      expect((patch as Record<string, unknown>).refs).toBeUndefined();
     });
   });
 
-  describe("compat-migration write projections", () => {
-    it("preserves legacy top-level refs when saving a synthesized single-turn item", async () => {
+  describe("canonical write projections", () => {
+    it("persists canonical plugin references without history refs emission", async () => {
       const apiItem = makeApiItem({
-        synthQuestion: "What is X?",
-        answer: "X is Y",
-        refs: [
-          {
-            url: "https://legacy.ref/doc1",
-            content: "Legacy content",
-            keyExcerpt: "Key paragraph",
-            bonus: false,
-          },
-          {
-            url: "https://legacy.ref/doc2",
-            content: "Bonus content",
-            bonus: true,
-          },
+        history: [
+          { role: "user", msg: "Q", turnId: "t-user" },
+          { role: "assistant", msg: "A", turnId: "t-agent" },
         ],
-        history: undefined,
+        ...withCompatData({
+          references: [
+            {
+              url: "https://canonical.ref/doc1",
+              content: "Canonical content",
+              messageIndex: 1,
+            },
+          ],
+        }),
       });
       let capturedPatch: Patch | undefined;
       mockUpdateAssignedGroundTruth.mockImplementation(
@@ -320,7 +251,7 @@
           return {
             ...apiItem,
             id,
-            refs: (patch.refs as ApiItem["refs"]) ?? apiItem.refs,
+            plugins: (patch.plugins as ApiItem["plugins"]) ?? apiItem.plugins,
             status: (patch.status as ApiItem["status"]) ?? apiItem.status,
           } as ApiItem;
         },
@@ -328,11 +259,11 @@
       mockGetMyAssignments.mockResolvedValue([apiItem]);
       const provider = new ApiProvider();
       const { items } = await provider.list();
-      const legacyRefs = getItemReferences(items[0]);
+      const existingRefs = getItemReferences(items[0]);
       const updated: GroundTruthItem = withUpdatedReferences(
         items[0],
-        legacyRefs.map((ref) =>
-          ref.url === "https://legacy.ref/doc1"
+        existingRefs.map((ref) =>
+          ref.url === "https://canonical.ref/doc1"
             ? { ...ref, bonus: true, keyParagraph: "Updated key" }
             : ref,
         ),
@@ -340,17 +271,12 @@
       await provider.save(updated);
       const patch = capturedPatch as Patch;
       const patchHistory = patch.history as ApiItem["history"];
-      expect(patch.refs).toHaveLength(2);
-      expect(patch.refs?.[0]).toMatchObject({
-        url: "https://legacy.ref/doc1",
-        bonus: true,
-        keyExcerpt: "Updated key",
-      });
-      expect(patch.refs?.[1]).toMatchObject({
-        url: "https://legacy.ref/doc2",
-        bonus: true,
-      });
-      expect(patchHistory?.[1]?.refs).toHaveLength(2);
+      expect((patch as Record<string, unknown>).refs).toBeUndefined();
+      expect(patchHistory?.[1]?.refs).toBeUndefined();
+      expect(
+        (patch.plugins?.["rag-compat"]?.data as { references?: unknown })
+          .references,
+      ).toBeDefined();
     });
   });
 });
diff --git a/frontend/tests/unit/registry/RegistryRenderer.test.tsx b/frontend/tests/unit/registry/RegistryRenderer.test.tsx
index 1a01f2b..d73f685 100644
--- a/frontend/tests/unit/registry/RegistryRenderer.test.tsx
+++ b/frontend/tests/unit/registry/RegistryRenderer.test.tsx
@@ -25,7 +25,7 @@ function renderExtension(toolCall: ToolCallRecord) {
     item: {
       id: "item-1",
       question: "q",
-      answer: "",
+      history: [{ role: "agent", content: "" }],
       status: "draft",
       providerId: "json",
       tags: [],
diff --git a/frontend/tests/unit/services/groundTruths-mapping.test.ts b/frontend/tests/unit/services/groundTruths-mapping.test.ts
index 389a38e..4c68cc0 100644
--- a/frontend/tests/unit/services/groundTruths-mapping.test.ts
+++ b/frontend/tests/unit/services/groundTruths-mapping.test.ts
@@ -1,5 +1,8 @@
 import { describe, expect, it } from "vitest";
-import type { ApiGroundTruth } from "../../../src/adapters/apiMapper";
+import type {
+  ApiGroundTruth,
+  ApiReference,
+} from "../../../src/adapters/apiMapper";
 import { groundTruthFromApi } from "../../../src/adapters/apiMapper";
 import type { components } from "../../../src/api/generated";
 import { getItemReferences } from "../../../src/models/groundTruth";
@@ -9,15 +12,10 @@ type ApiItem = Omit<
   components["schemas"]["AgenticGroundTruthEntry-Output"],
   "history"
 > & {
-  synthQuestion?: string | null;
-  editedQuestion?: string | null;
-  answer?: string | null;
-  refs?: components["schemas"]["Reference"][];
-  totalReferences?: number;
   tags?: string[];
   comment?: string | null;
   history?: (components["schemas"]["HistoryEntry"] & {
-    refs?: components["schemas"]["Reference"][];
+    refs?: ApiReference[];
     expectedBehavior?: string[];
   })[];
 };
@@ -26,11 +24,7 @@ function makeApiItem(overrides: Partial<ApiItem> = {}): ApiItem {
   return {
     id: "gt-1",
     status: "draft",
-    answer: "",
-    synthQuestion: "",
-    editedQuestion: "",
     history: undefined,
-    refs: [],
     tags: [],
     comment: null,
     datasetName: "dataset-1",
@@ -40,9 +34,23 @@
   } as ApiItem;
 }
 
+function withCompatData(
+  data: Record<string, unknown>,
+): Pick<ApiItem, "plugins"> {
+  return {
+    plugins: {
+      "rag-compat": {
+        kind: "rag-compat",
+        version: "1.0",
+        data,
+      },
+    },
+  };
+}
+
 describe("mapGroundTruthFromApi", () => {
   describe("core-generic mapping", () => {
-    it("converts assistant role to agent and keeps stable turn ids", () => {
+    it("preserves assistant role values and keeps stable turn ids", () => {
       const apiItem = makeApiItem({
         history: [
           { role: "user", msg: "Question" },
@@ -55,14 +63,14 @@
         content: "Question",
       });
       expect(result.history?.[1]).toMatchObject({
-        role: "agent",
+        role: "assistant",
         content: "Answer",
       });
       expect(result.history?.[0].turnId).toBeTruthy();
       expect(result.history?.[1].turnId).toBeTruthy();
     });
 
-    it("preserves per-turn refs when canonical history already exists", () => {
+    it("ignores retired per-turn history refs", () => {
       const apiItem = makeApiItem({
         history: [
           { role: "user", msg: "Q1" },
@@ -80,72 +88,53 @@
         ],
       });
       const result = mapGroundTruthFromApi(apiItem);
-      const [ref] = getItemReferences(result);
-      expect(ref).toMatchObject({
-        url: "https://turn-ref.com",
-        messageIndex: 1,
-        turnId: result.history?.[1]?.turnId,
-      });
+      expect(getItemReferences(result)).toEqual([]);
     });
   });
 
-  describe("compat-migration read mapping", () => {
-    it("creates synthesized user and agent turns from legacy single-turn fields", () => {
+  describe("retired compat read mapping", () => {
+    it("does not synthesize history from retired single-turn fields", () => {
       const apiItem = makeApiItem({
-        synthQuestion: "Synth",
-        editedQuestion: "Edited",
-        answer: "A",
         history: undefined,
+        ...withCompatData({
+          synthQuestion: "Synth",
+          editedQuestion: "Edited",
+          answer: "A",
+        }),
       });
       const result = mapGroundTruthFromApi(apiItem);
-      expect(result.history).toHaveLength(2);
-      expect(result.history?.[0]).toMatchObject({
-        role: "user",
-        content: "Edited",
-      });
-      expect(result.history?.[1]).toMatchObject({
-        role: "agent",
-        content: "A",
-      });
+      expect(result.history).toBeUndefined();
     });
 
-    it("anchors legacy top-level refs to the synthesized agent turn when answer is empty", () => {
+    it("does not import retired compat refs", () => {
       const apiItem = makeApiItem({
-        editedQuestion: "How do I configure authentication for my app?",
-        answer: "",
-        refs: [
-          {
-            url: "https://docs.example.com/auth",
-            content: "Authentication documentation content",
-            keyExcerpt: "Use OAuth 2.0 for authentication",
-            bonus: false,
-          },
-        ],
         history: undefined,
+        ...withCompatData({
+          editedQuestion: "How do I configure authentication for my app?",
+          answer: "",
+          refs: [
+            {
+              url: "https://docs.example.com/auth",
+              content: "Authentication documentation content",
+              keyExcerpt: "Use OAuth 2.0 for authentication",
+              bonus: false,
+            },
+          ],
+        }),
       });
       const result = mapGroundTruthFromApi(apiItem);
-      const [ref] = getItemReferences(result);
-      expect(result.history).toHaveLength(2);
-      expect(result.history?.[1]).toMatchObject({ role: "agent", content: "" });
-      expect(ref).toMatchObject({
-        url: "https://docs.example.com/auth",
-        messageIndex: 1,
-        turnId: result.history?.[1]?.turnId,
-      });
+      expect(getItemReferences(result)).toEqual([]);
     });
   });
 
   describe("providerId", () => {
     it("defaults to 'api' when not provided", () => {
-      const result = mapGroundTruthFromApi(makeApiItem({ synthQuestion: "Q" }));
+      const result = mapGroundTruthFromApi(makeApiItem());
       expect(result.providerId).toBe("api");
     });
 
     it("uses provided providerId", () => {
-      const result = mapGroundTruthFromApi(
-        makeApiItem({ synthQuestion: "Q" }),
-        "custom-provider",
-      );
+      const result = mapGroundTruthFromApi(makeApiItem(), "custom-provider");
       expect(result.providerId).toBe("custom-provider");
     });
   });
@@ -206,11 +195,7 @@ describe("mapper parity: groundTruthFromApi and mapGroundTruthFromApi", () => {
     return {
       id: "parity-1",
       status: "draft",
-      answer: "Parity answer",
-      synthQuestion: "Synth parity Q",
-      editedQuestion: "Edited parity Q",
       history: undefined,
-      refs: [],
       tags: ["t1"],
       manualTags: ["m1"],
       computedTags: ["c1"],
@@ -219,11 +204,20 @@
       bucket: "bkt" as ApiGroundTruth["bucket"],
       _etag: "etag-parity",
       reviewedAt: "2024-01-01T00:00:00Z",
+      plugins: {
+        "rag-compat": {
+          kind: "rag-compat",
+          version: "1.0",
+          data: {
+            references: [],
+          },
+        },
+      },
       ...overrides,
     } as ApiGroundTruth;
   }
 
-  it("produces identical output for a legacy single-turn payload", () => {
+  it("produces identical output for a canonical payload", () => {
     const payload = makeSharedPayload();
     const fromProvider = groundTruthFromApi(payload);
     const fromService = mapGroundTruthFromApi(payload);
@@ -232,11 +226,8 @@
     );
   });
 
-  it("produces identical output for a multi-turn payload with per-turn refs", () => {
+  it("produces identical output for a multi-turn payload with retired per-turn refs", () => {
     const payload = makeSharedPayload({
-      editedQuestion: "",
-      synthQuestion: "",
-      answer: "",
       history: [
         { role: "user", msg: "First question" },
         {
@@ -257,7 +248,7 @@
     expect(normalizeTurnIdentity(fromProvider)).toEqual(
       normalizeTurnIdentity(fromService),
     );
-    expect(getItemReferences(fromProvider)).toHaveLength(2);
+    expect(getItemReferences(fromProvider)).toHaveLength(0);
   });
 
   it("preserves reviewedAt through both paths identically", () => {