diff --git a/.gitignore b/.gitignore index 5e5bc30..3ad1bcc 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ env/ # Test / coverage .pytest_cache/ +backend/pytest-unit-results.xml coverage.xml htmlcov/ diff --git a/backend/app/adapters/repos/base.py b/backend/app/adapters/repos/base.py index 7d4739b..2dc89ad 100644 --- a/backend/app/adapters/repos/base.py +++ b/backend/app/adapters/repos/base.py @@ -32,9 +32,10 @@ async def list_gt_paginated( tags: list[str] | None = None, exclude_tags: list[str] | None = None, item_id: str | None = None, - ref_url: str | None = None, + plugin_filters: dict[str, str] | None = None, keyword: str | None = None, sort_by: SortField | None = None, + plugin_sort: str | None = None, sort_order: SortOrder | None = None, page: int = 1, limit: int = 25, diff --git a/backend/app/adapters/repos/cosmos_repo.py b/backend/app/adapters/repos/cosmos_repo.py index 2dfe5b4..5a394c7 100644 --- a/backend/app/adapters/repos/cosmos_repo.py +++ b/backend/app/adapters/repos/cosmos_repo.py @@ -21,6 +21,7 @@ from azure.cosmos.exceptions import CosmosHttpResponseError, CosmosResourceNotFoundError from app.adapters.repos.base import GroundTruthRepo +from app.domain.conversation_fields import answer_text_from_item, question_text_from_item from app.domain.models import ( AgenticGroundTruthEntry, Stats, @@ -32,6 +33,8 @@ ) from app.domain.enums import GroundTruthStatus, SortField, SortOrder from app.core.config import get_sampling_allocation +from app.plugins.base import PluginPackRegistry +from app.plugins.pack_registry import get_default_pack_registry _SMART_PUNCT_REPLACEMENTS: dict[str, str] = { @@ -51,19 +54,16 @@ **{ord(ch): " " for ch in (chr(i) for i in range(32)) if ch not in ("\n", "\r", "\t")}, ord("\u007f"): " ", } - # Cosmos DB SELECT clause for AgenticGroundTruthEntry fields used in several functions # list_gt_paginated, _list_gt_paginated_with_emulator, list_gt_by_dataset -# Note: legacy fields like synthQuestion, editedQuestion are still selected for compatibility -# during migration, but the model will access them via computed properties SELECT_CLAUSE_C = ( "SELECT c.id, c.datasetName, c.bucket, c.status, c.docType, c.schemaVersion, " - "c.synthQuestion, c.editedQuestion, c.answer, c.refs, c.tags, c.manualTags, c.computedTags, c.comment, c.plugins, " + "c.tags, c.manualTags, c.computedTags, c.comment, c.plugins, " "c.scenarioId, c.history, c.contextEntries, c.traceIds, c.toolCalls, c.expectedTools, " "c.feedback, c.metadata, c.createdBy, c.createdAt, c.tracePayload, " "c.contextUsedForGeneration, c.contextSource, c.modelUsedForGeneration, " "c.semanticClusterNumber, c.weight, c.samplingBucket, c.questionLength, " - "c.assignedTo, c.assignedAt, c.totalReferences, c.updatedAt, c.updatedBy, c.reviewedAt, c._etag " + "c.assignedTo, c.assignedAt, c.updatedAt, c.updatedBy, c.reviewedAt, c._etag " ) @@ -147,7 +147,7 @@ def _sanitize_string_for_cosmos(value: str) -> str: def _normalize_unicode_for_cosmos(obj: Any) -> Any: """ Recursively sanitize strings to work around Cosmos emulator Unicode bugs. - Also Base64-encodes 'content' fields in 'refs' arrays as a workaround. + Also Base64-encodes 'content' fields in reference arrays as a workaround. 
""" if not settings.COSMOS_DISABLE_UNICODE_ESCAPE: @@ -158,11 +158,11 @@ def _normalize_unicode_for_cosmos(obj: Any) -> Any: if isinstance(obj, dict): normalized = {} for k, v in obj.items(): - # Special handling for 'refs' array - encode content fields - if k == "refs" and isinstance(v, list): - # First normalize the refs + # Special handling for canonical reference arrays - encode content fields + if k == "references" and isinstance(v, list): + # First normalize the reference entries normalized_refs = [_normalize_unicode_for_cosmos(item) for item in v] - # Then Base64-encode content fields in refs + # Then Base64-encode content fields in references normalized[k] = _base64_encode_refs_content(normalized_refs) else: normalized[k] = _normalize_unicode_for_cosmos(v) @@ -175,7 +175,7 @@ def _normalize_unicode_for_cosmos(obj: Any) -> Any: def _restore_unicode_from_cosmos(obj: Any) -> Any: """ Reverse emulator-only sanitization markers after fetching documents. - Also Base64-decodes 'content' fields in 'refs' arrays. + Also Base64-decodes 'content' fields in reference arrays. """ if not settings.COSMOS_DISABLE_UNICODE_ESCAPE: @@ -188,8 +188,8 @@ def _restore_unicode_from_cosmos(obj: Any) -> Any: if isinstance(obj, dict): restored = {} for k, v in obj.items(): - # Special handling for 'refs' array - decode content fields - if k == "refs" and isinstance(v, list): + # Special handling for canonical reference arrays - decode content fields + if k == "references" and isinstance(v, list): # First decode Base64-encoded content fields decoded_refs = _base64_decode_refs_content(v) # Then restore backslash sentinels @@ -221,6 +221,7 @@ def __init__( connection_verify: bool | str | None = None, test_mode: bool = False, credential: Any | None = None, + plugin_pack_registry: PluginPackRegistry | None = None, ): # Defer CosmosClient creation to _init so the underlying aiohttp session binds # to the event loop of the running app (avoids cross-loop RuntimeError in tests). @@ -236,6 +237,7 @@ def __init__( self._db: DatabaseProxy | None = None self._gt_container: ContainerProxy | None = None self._assignments_container: ContainerProxy | None = None + self._plugin_pack_registry = plugin_pack_registry or get_default_pack_registry() # Track the event loop on which the aiohttp client/session was created to # guard against cross-loop usage during tests. 
self._loop: asyncio.AbstractEventLoop | None = None # set in _init on first use @@ -394,10 +396,6 @@ def _to_doc(self, item: AgenticGroundTruthEntry) -> dict[str, Any]: # Dump in JSON mode so datetimes/enums are serialized to strings d = item.model_dump(mode="json", by_alias=True) - # Ensure totalReferences is computed and persisted for sorting/querying - # Use the property getter which handles both explicit values and plugin storage - d["totalReferences"] = item.totalReferences - if d.get("bucket") is not None: d["bucket"] = str(d["bucket"]) # store UUID as string # Ensure updatedAt present as ISO string @@ -406,58 +404,23 @@ def _to_doc(self, item: AgenticGroundTruthEntry) -> dict[str, Any]: return d - @staticmethod - def _from_doc(doc: dict[str, Any]) -> AgenticGroundTruthEntry: + def _from_doc(self, doc: dict[str, Any]) -> AgenticGroundTruthEntry: # Normalize doc before validation normalized_doc = ( _restore_unicode_from_cosmos(doc) if settings.COSMOS_DISABLE_UNICODE_ESCAPE else doc ) - from app.plugins.packs.rag_compat import _LEGACY_PLUGIN_FIELDS - - allowed_keys = ( - {field_name for field_name in AgenticGroundTruthEntry.model_fields} - | { - field.alias - for field in AgenticGroundTruthEntry.model_fields.values() - if field.alias is not None - } - | { - # Include computed_fields that need to be preserved from Cosmos documents - "totalReferences" # Computed and persisted for sorting/querying - } - | set(_LEGACY_PLUGIN_FIELDS) - ) - normalized_doc = { - key: value for key, value in normalized_doc.items() if key in allowed_keys + allowed_keys = {field_name for field_name in AgenticGroundTruthEntry.model_fields} | { + field.alias + for field in AgenticGroundTruthEntry.model_fields.values() + if field.alias is not None } + transformed_doc: dict[str, Any] = dict(normalized_doc) + for transform in self._plugin_pack_registry.collect_import_transforms(): + transformed_doc = transform.transform(transformed_doc) - plugins = normalized_doc.get("plugins") - rag_plugin = plugins.get("rag-compat") if isinstance(plugins, dict) else None - rag_data = rag_plugin.get("data") if isinstance(rag_plugin, dict) else None - history_annotations = ( - rag_data.get("historyAnnotations") if isinstance(rag_data, dict) else None - ) - history = normalized_doc.get("history") - if isinstance(history, list) and isinstance(history_annotations, list): - merged_history: list[Any] = [] - for index, entry in enumerate(history): - if isinstance(entry, dict): - entry_dict = dict(entry) - annotation = ( - history_annotations[index] if index < len(history_annotations) else None - ) - if isinstance(annotation, dict): - if "refs" in annotation and "refs" not in entry_dict: - entry_dict["refs"] = annotation["refs"] - if ( - "expectedBehavior" in annotation - and "expectedBehavior" not in entry_dict - ): - entry_dict["expectedBehavior"] = annotation["expectedBehavior"] - merged_history.append(entry_dict) - else: - merged_history.append(entry) - normalized_doc["history"] = merged_history + normalized_doc = { + key: value for key, value in transformed_doc.items() if key in allowed_keys + } # Convert None to [] for history field (legacy data compatibility) if normalized_doc.get("history") is None: @@ -466,12 +429,6 @@ def _from_doc(doc: dict[str, Any]) -> AgenticGroundTruthEntry: # Pydantic will parse aliases automatically item = AgenticGroundTruthEntry.model_validate(normalized_doc) - # IMPORTANT: totalReferences is a @computed_field, so Pydantic won't deserialize it - # from the document. 
We need to manually set it in __dict__ so the property getter - can find it. This preserves the value we computed and persisted in _to_doc. - if "totalReferences" in normalized_doc: - item.__dict__["totalReferences"] = normalized_doc["totalReferences"] - return item async def _ensure_initialized(self) -> None: @@ -595,12 +552,7 @@ async def import_bulk_gt( status = getattr(e, "status_code", None) if status == 409: # Duplicate; report but continue others - article_num = ( - doc.get("refs", [{}])[0].get("url", "unknown") - if doc.get("refs") - else "unknown" - ) - message = f"exists (article: {article_num}, id: {doc.get('id', 'unknown')})" + message = f"exists (id: {doc.get('id', 'unknown')})" errors.append(message) persistence_errors.append( BulkImportPersistenceError( ) ) else: - article_num = ( - doc.get("refs", [{}])[0].get("url", "unknown") - if doc.get("refs") - else "unknown" - ) message = ( - f"create_failed (article: {article_num}, id: {doc.get('id', 'unknown')}): " + f"create_failed (id: {doc.get('id', 'unknown')}): " f"{getattr(e, 'message', str(e))}" ) errors.append(message) @@ -708,16 +655,6 @@ def _build_query_filter( ) params.append({"name": pname, "value": tag}) - # Ref URL filtering only if not using the Cosmos Emulator as it does not support EXISTS - # include_ref_url set to True when Comsomus Emulator is not used - if include_ref_url and ref_url: - clauses.append( - "(EXISTS(SELECT VALUE r FROM r IN c.refs WHERE CONTAINS(r.url, @refUrl)) " - "OR EXISTS(SELECT VALUE h FROM h IN c.history " - "WHERE EXISTS(SELECT VALUE r FROM r IN h.refs WHERE CONTAINS(r.url, @refUrl))))" - ) - params.append({"name": "@refUrl", "value": ref_url}) - where_clause = " WHERE " + " AND ".join(clauses) if clauses else "" return where_clause, params @@ -747,7 +684,17 @@ def _resolve_sort( return field, direction @staticmethod - def _sort_key(item: AgenticGroundTruthEntry, field: SortField) -> tuple[Any, ...]: + def _sort_key( + item: AgenticGroundTruthEntry, + field: SortField, + plugin_sort: str | None = None, + plugin_pack_registry: PluginPackRegistry | None = None, + ) -> tuple[Any, ...]: + if plugin_sort: + if plugin_pack_registry is None: + return (-1, item.id) + plugin_value = plugin_pack_registry.plugin_sort_value(item, plugin_sort) + return (plugin_value if plugin_value is not None else -1, item.id) if field == SortField.id: return (item.id or "",) @@ -762,15 +711,12 @@ def _sort_key(item: AgenticGroundTruthEntry, field: SortField) -> tuple[Any, ... if field == SortField.has_answer: # In-memory sort: Primary by presence of non-empty answer, secondary by reviewed_at # (Cosmos ORDER BY uses c.reviewedAt placeholder - see _build_secure_sort_clause) - has_answer = 1 if item.answer and item.answer.strip() else 0 + has_answer = 1 if answer_text_from_item(item) else 0 reference_time = ( item.reviewed_at or item.updated_at or datetime(1970, 1, 1, tzinfo=timezone.utc) ) return (has_answer, reference_time, item.id) - if field == SortField.totalReferences: - return (item.totalReferences, item.id) - if field == SortField.tag_count: tag_count = len(item.tags) return (tag_count, item.id) @@ -789,8 +735,8 @@ def _item_matches_keyword(item: AgenticGroundTruthEntry, keyword: str) -> bool: """Check if item matches keyword search (case-insensitive substring match). 
Searches across: - - synth_question and edited_question fields - - answer field + - canonical question text (derived from history/plugin data) + - canonical answer text (derived from history/plugin data) - history[*].msg content (all turns) """ if not keyword: @@ -799,13 +745,13 @@ def _item_matches_keyword(item: AgenticGroundTruthEntry, keyword: str) -> bool: search_term = keyword.lower() # Search question fields - if item.synth_question and search_term in item.synth_question.lower(): - return True - if item.edited_question and search_term in item.edited_question.lower(): + question_text = question_text_from_item(item) + if question_text and search_term in question_text.lower(): return True # Search answer field - if item.answer and search_term in item.answer.lower(): + answer_text = answer_text_from_item(item) + if answer_text and search_term in answer_text.lower(): return True # Search history messages @@ -830,7 +776,6 @@ def _build_secure_sort_clause(self, sort_field: SortField, sort_direction: SortO SortField.updated_at: "c.updatedAt", SortField.reviewed_at: "c.reviewedAt", SortField.has_answer: "c.reviewedAt", # Placeholder - actual sort is in-memory - SortField.totalReferences: "c.totalReferences", } # Security: Safe direction mapping (no user input) @@ -860,9 +805,10 @@ async def list_gt_paginated( tags: list[str] | None = None, exclude_tags: list[str] | None = None, item_id: str | None = None, - ref_url: str | None = None, + plugin_filters: dict[str, str] | None = None, keyword: str | None = None, sort_by: SortField | None = None, + plugin_sort: str | None = None, sort_order: SortOrder | None = None, page: int = 1, limit: int = 25, @@ -889,9 +835,11 @@ async def list_gt_paginated( if ( normalized_tags or normalized_exclude_tags - or ref_url + or plugin_filters or keyword or sort_field == SortField.tag_count + or sort_field == SortField.has_answer + or plugin_sort is not None ): # Always use in-memory filtering path for these filters # (Cosmos emulator has limitations, and keyword search needs in-memory filtering regardless) @@ -901,9 +849,10 @@ async def list_gt_paginated( normalized_tags, normalized_exclude_tags, item_id, - ref_url, + plugin_filters, keyword, sort_by, + plugin_sort, sort_order, safe_page, safe_limit, @@ -916,9 +865,9 @@ async def list_gt_paginated( normalized_tags, normalized_exclude_tags, item_id, - ref_url, + None, include_tags=True, - include_ref_url=True, + include_ref_url=False, ) # Build ORDER BY clause @@ -982,9 +931,10 @@ async def _list_gt_paginated_with_emulator( tags: list[str], exclude_tags: list[str], item_id: str | None, - ref_url: str | None, + plugin_filters: dict[str, str] | None, keyword: str | None, sort_by: SortField | None, + plugin_sort: str | None, sort_order: SortOrder | None, page: int, limit: int, @@ -1008,7 +958,7 @@ async def _list_gt_paginated_with_emulator( tags, exclude_tags, item_id, - ref_url, + None, include_tags=False, # Disable SQL-level tag filtering - filter in-memory instead include_ref_url=False, # Disable ref_url filtering for emulator ) @@ -1071,39 +1021,23 @@ async def _list_gt_paginated_with_emulator( filtered_items_exclude.append(item) raw_items = filtered_items_exclude - # Filter by ref_url in-memory (EXISTS not supported by Cosmos DB emulator) - if ref_url: + # Filter by plugin-owned filters in-memory. 
+ if plugin_filters: start = time.time() - filtered_items_ref: list[AgenticGroundTruthEntry] = [] - total_refs_checked = 0 + filtered_items_plugin: list[AgenticGroundTruthEntry] = [] for item in raw_items: - # Check item-level refs - has_match = any(ref_url in ref.url for ref in item.refs) - total_refs_checked += len(item.refs) - - # Check history-level refs if no match yet - if not has_match and item.history: - for turn in item.history: - turn_refs = getattr(turn, "refs", None) - if turn_refs: - total_refs_checked += len(turn_refs) - if any(ref_url in ref.url for ref in turn_refs): - has_match = True - break - if has_match: - filtered_items_ref.append(item) + if self._plugin_pack_registry.matches_query_filters(item, plugin_filters): + filtered_items_plugin.append(item) elapsed = time.time() - start self._logger.info( - "repo.ref_url_filter.performance" + "repo.plugin_filter.performance: " f"items_checked: {len(raw_items)}, " - f"items_matched: {len(filtered_items_ref)}, " - f"refs_checked: {total_refs_checked}, " + f"items_matched: {len(filtered_items_plugin)}, " f"elapsed_ms: {elapsed * 1000}, " - f"ref_url_length: {len(ref_url)}, " ) - raw_items = filtered_items_ref + raw_items = filtered_items_plugin # Filter by keyword in-memory (case-insensitive substring match) if keyword: @@ -1126,7 +1060,15 @@ # Sort in-memory (required since ORDER BY conflicts with ARRAY_CONTAINS in Cosmos DB) reverse_sort = sort_direction == SortOrder.desc - raw_items.sort(key=lambda item: self._sort_key(item, sort_field), reverse=reverse_sort) + raw_items.sort( + key=lambda item: self._sort_key( + item, + sort_field, + plugin_sort=plugin_sort, + plugin_pack_registry=self._plugin_pack_registry, + ), + reverse=reverse_sort, + ) total = len(raw_items) total_pages = math.ceil(total / limit) if total > 0 else 0 diff --git a/backend/app/adapters/repos/memory_repo.py b/backend/app/adapters/repos/memory_repo.py index 84d278a..79972dc 100644 --- a/backend/app/adapters/repos/memory_repo.py +++ b/backend/app/adapters/repos/memory_repo.py @@ -5,6 +5,7 @@ from typing import Iterable from uuid import UUID +from app.domain.conversation_fields import answer_text_from_item, question_text_from_item from app.domain.enums import GroundTruthStatus, SortField, SortOrder from app.domain.models import ( AgenticGroundTruthEntry, @@ -15,6 +16,8 @@ PaginationMetadata, Stats, ) +from app.plugins.base import PluginPackRegistry +from app.plugins.pack_registry import get_default_pack_registry ZERO_UUID = UUID("00000000-0000-0000-0000-000000000000") @@ -25,12 +28,14 @@ def __init__( *, items: list[AgenticGroundTruthEntry] | None = None, curation_instructions: list[DatasetCurationInstructions] | None = None, + plugin_pack_registry: PluginPackRegistry | None = None, ) -> None: self.items: dict[str, AgenticGroundTruthEntry] = {} self._locations: dict[tuple[str, UUID, str], str] = {} self._assignment_docs: dict[tuple[str, str], AssignmentDocument] = {} self._curation: dict[str, DatasetCurationInstructions] = {} self._etag_version = 0 + self._plugin_pack_registry = plugin_pack_registry or get_default_pack_registry() for item in items or []: self._store_initial_item(item) @@ -45,7 +50,9 @@ def _next_etag(self) -> str: return f"memory-etag-{self._etag_version}" def _clone_item(self, item: AgenticGroundTruthEntry) -> AgenticGroundTruthEntry: - return AgenticGroundTruthEntry.model_validate(item.model_dump(by_alias=True)) + return AgenticGroundTruthEntry.model_validate( + item.model_dump(by_alias=True, 
exclude={"tags"}) + ) def _clone_instruction(self, doc: DatasetCurationInstructions) -> DatasetCurationInstructions: return DatasetCurationInstructions.model_validate(doc.model_dump(by_alias=True)) @@ -104,28 +111,30 @@ def _matches_location( ) def _collect_urls(self, item: AgenticGroundTruthEntry) -> Iterable[str]: - for ref in item.refs: - yield ref.url - for turn in item.history or []: - for ref in getattr(turn, "refs", None) or []: - yield ref.url + for doc in self._plugin_pack_registry.collect_search_documents(item): + url = doc.get("url") + if isinstance(url, str) and url: + yield url def _collect_text(self, item: AgenticGroundTruthEntry) -> str: parts = [ item.id, item.datasetName, - item.synth_question or "", - item.edited_question or "", - item.answer or "", + question_text_from_item(item), + answer_text_from_item(item), item.comment or "", ] for turn in item.history or []: parts.append(turn.msg) - for ref in item.refs: - parts.extend([ref.title or "", ref.url, ref.content or "", ref.keyExcerpt or ""]) - for turn in item.history or []: - for ref in getattr(turn, "refs", None) or []: - parts.extend([ref.title or "", ref.url, ref.content or "", ref.keyExcerpt or ""]) + for doc in self._plugin_pack_registry.collect_search_documents(item): + parts.extend( + [ + str(doc.get("id") or ""), + str(doc.get("title") or ""), + str(doc.get("url") or ""), + str(doc.get("chunk") or ""), + ] + ) return " ".join(parts).lower() def _is_unassigned_candidate(self, item: AgenticGroundTruthEntry) -> bool: @@ -138,24 +147,27 @@ def _sort_items( self, items: list[AgenticGroundTruthEntry], sort_by: SortField | None, + plugin_sort: str | None, sort_order: SortOrder | None, ) -> list[AgenticGroundTruthEntry]: field = sort_by or SortField.reviewed_at reverse = (sort_order or SortOrder.desc) == SortOrder.desc def key(item: AgenticGroundTruthEntry): + if plugin_sort: + plugin_value = self._plugin_pack_registry.plugin_sort_value(item, plugin_sort) + return ( + plugin_value if plugin_value is not None else -1, + item.updated_at or datetime.min.replace(tzinfo=timezone.utc), + item.id, + ) if field == SortField.updated_at: return item.updated_at or datetime.min.replace(tzinfo=timezone.utc) if field == SortField.id: return item.id if field == SortField.has_answer: return ( - 1 if (item.answer or "").strip() else 0, - item.updated_at or datetime.min.replace(tzinfo=timezone.utc), - ) - if field == SortField.totalReferences: - return ( - item.totalReferences, + 1 if answer_text_from_item(item) else 0, item.updated_at or datetime.min.replace(tzinfo=timezone.utc), ) if field == SortField.tag_count: @@ -194,7 +206,7 @@ async def list_gt_by_dataset( items = [item for item in items if item.status == status] return [ self._clone_item(item) - for item in self._sort_items(items, SortField.updated_at, SortOrder.desc) + for item in self._sort_items(items, SortField.updated_at, None, SortOrder.desc) ] async def list_all_gt( @@ -205,7 +217,7 @@ async def list_all_gt( items = [item for item in items if item.status == status] return [ self._clone_item(item) - for item in self._sort_items(items, SortField.updated_at, SortOrder.desc) + for item in self._sort_items(items, SortField.updated_at, None, SortOrder.desc) ] async def list_gt_paginated( @@ -215,9 +227,10 @@ async def list_gt_paginated( tags: list[str] | None = None, exclude_tags: list[str] | None = None, item_id: str | None = None, - ref_url: str | None = None, + plugin_filters: dict[str, str] | None = None, keyword: str | None = None, sort_by: SortField | None = None, + 
plugin_sort: str | None = None, sort_order: SortOrder | None = None, page: int = 1, limit: int = 25, @@ -235,15 +248,17 @@ async def list_gt_paginated( filtered = [item for item in filtered if not banned.intersection(set(item.tags))] if item_id: filtered = [item for item in filtered if item_id in item.id] - if ref_url: + if plugin_filters: filtered = [ - item for item in filtered if any(ref_url in url for url in self._collect_urls(item)) + item + for item in filtered + if self._plugin_pack_registry.matches_query_filters(item, plugin_filters) ] if keyword: lowered = keyword.lower() filtered = [item for item in filtered if lowered in self._collect_text(item)] - sorted_items = self._sort_items(filtered, sort_by, sort_order) + sorted_items = self._sort_items(filtered, sort_by, plugin_sort, sort_order) total = len(sorted_items) start = (page - 1) * limit end = start + limit @@ -325,7 +340,7 @@ async def list_unassigned(self, limit: int) -> list[AgenticGroundTruthEntry]: items = [item for item in self.items.values() if self._is_unassigned_candidate(item)] return [ self._clone_item(item) - for item in self._sort_items(items, SortField.updated_at, SortOrder.desc)[:limit] + for item in self._sort_items(items, SortField.updated_at, None, SortOrder.desc)[:limit] ] async def sample_unassigned( @@ -346,7 +361,7 @@ async def query_unassigned_by_dataset_prefix( ] return [ self._clone_item(item) - for item in self._sort_items(items, SortField.updated_at, SortOrder.desc)[:take] + for item in self._sort_items(items, SortField.updated_at, None, SortOrder.desc)[:take] ] async def query_unassigned_global( @@ -360,7 +375,7 @@ async def query_unassigned_global( ] return [ self._clone_item(item) - for item in self._sort_items(items, SortField.updated_at, SortOrder.desc)[:take] + for item in self._sort_items(items, SortField.updated_at, None, SortOrder.desc)[:take] ] async def assign_to(self, item_id: str, user_id: str) -> bool: @@ -399,7 +414,7 @@ async def list_assigned(self, user_id: str) -> list[AgenticGroundTruthEntry]: ] return [ self._clone_item(item) - for item in self._sort_items(items, SortField.updated_at, SortOrder.desc) + for item in self._sort_items(items, SortField.updated_at, None, SortOrder.desc) ] async def upsert_assignment_doc( diff --git a/backend/app/adapters/search/demo_search.py b/backend/app/adapters/search/demo_search.py index 046cfda..14030db 100644 --- a/backend/app/adapters/search/demo_search.py +++ b/backend/app/adapters/search/demo_search.py @@ -1,11 +1,18 @@ from __future__ import annotations from app.domain.models import AgenticGroundTruthEntry +from app.plugins.base import PluginPackRegistry +from app.plugins.pack_registry import get_default_pack_registry class DemoSearchAdapter: - def __init__(self, items: list[AgenticGroundTruthEntry]) -> None: + def __init__( + self, + items: list[AgenticGroundTruthEntry], + plugin_pack_registry: PluginPackRegistry | None = None, + ) -> None: self._items = items + self._plugin_pack_registry = plugin_pack_registry or get_default_pack_registry() async def query(self, q: str, top: int = 5) -> list[dict[str, object]]: query = q.strip().lower() @@ -15,30 +22,32 @@ async def query(self, q: str, top: int = 5) -> list[dict[str, object]]: matches: list[dict[str, object]] = [] seen_urls: set[str] = set() for item in self._items: - refs = list(item.refs) - for turn in item.history or []: - refs.extend(getattr(turn, "refs", None) or []) - for ref in refs: + for ref in self._plugin_pack_registry.collect_search_documents(item): + doc_id = ref.get("id") + url 
= ref.get("url") + if not isinstance(url, str) or not url: + continue haystack = " ".join( [ - ref.url, - ref.title or "", - ref.content or "", - ref.keyExcerpt or "", + str(doc_id or ""), + url, + str(ref.get("title") or ""), + str(ref.get("chunk") or ""), item.datasetName, item.id, ] ).lower() if query not in haystack: continue - if ref.url in seen_urls: + if url in seen_urls: continue - seen_urls.add(ref.url) + seen_urls.add(url) matches.append( { - "url": ref.url, - "title": ref.title, - "chunk": ref.content or ref.keyExcerpt or f"Reference for {item.id}", + "id": doc_id, + "url": url, + "title": ref.get("title"), + "chunk": ref.get("chunk") or f"Reference for {item.id}", } ) if len(matches) >= top: diff --git a/backend/app/api/v1/assignments.py b/backend/app/api/v1/assignments.py index f5382cd..05714b0 100644 --- a/backend/app/api/v1/assignments.py +++ b/backend/app/api/v1/assignments.py @@ -27,7 +27,6 @@ ETagRequiredError, apply_shared_update, persist_shared_update, - read_legacy_compat_update, ) from app.services.validation_service import ( ApprovalValidationError, @@ -137,7 +136,6 @@ async def update_item( original_assigned_to = it.assignedTo provided_fields: Set[str] = set(payload.model_fields_set) - payload_extras = payload.model_extra or {} try: mutation = apply_shared_update( it, @@ -157,7 +155,6 @@ async def update_item( status=payload.status, approve=bool(payload.approve), actor_user_id=user.user_id, - legacy_update=read_legacy_compat_update(payload_extras), clear_assignment_on_statuses={ GroundTruthStatus.approved, GroundTruthStatus.deleted, diff --git a/backend/app/api/v1/ground_truths.py b/backend/app/api/v1/ground_truths.py index 32f08cd..6ce4b9f 100644 --- a/backend/app/api/v1/ground_truths.py +++ b/backend/app/api/v1/ground_truths.py @@ -22,7 +22,6 @@ ExpectedTools, FeedbackEntry, GroundTruthListResponse, - HistoryItem, PluginPayload, ToolCallRecord, BulkImportError, @@ -39,7 +38,6 @@ ETagRequiredError, apply_shared_update, persist_shared_update, - read_legacy_compat_update, ) from app.services.validation_service import ( ApprovalValidationError, @@ -140,17 +138,6 @@ class GroundTruthUpdateRequest(BaseModel): etag: str | None = Field(default=None, alias="etag") -def _coerce_history_for_internal_use(item: AgenticGroundTruthEntry) -> None: - if not item.history: - return - item.history = [ - entry - if isinstance(entry, HistoryItem) - else HistoryItem.model_validate(entry.model_dump(by_alias=True)) - for entry in item.history - ] - - @router.post("", response_model=ImportBulkResponse) async def import_bulk( items: list[AgenticGroundTruthEntry], @@ -274,7 +261,6 @@ async def import_bulk( # Fetch registry once for performance (avoids O(n) singleton lookups) registry = get_default_registry() for it in gt_items: - _coerce_history_for_internal_use(it) apply_computed_tags(it, registry) result = await container.repo.import_bulk_gt(gt_items, buckets=buckets) @@ -442,16 +428,24 @@ async def list_all_ground_truths( alias="itemId", description="Search for items by ID (case-sensitive partial match)", ), - ref_url: str | None = Query( + plugin_filter: list[str] | None = Query( default=None, - alias="refUrl", - description="Search for items by reference URL (case-sensitive partial match)", + alias="pluginFilter", + description=( + "Plugin-namespaced filters in key=value form (repeat query param). 
" + "Example: pluginFilter=rag-compat:refUrl=https://example.com" + ), ), keyword: str | None = Query( default=None, description="Search for items by keyword (case-insensitive text search across questions, answers, and history)", ), sort_by: SortField = Query(default=SortField.reviewed_at.value, alias="sortBy"), + plugin_sort: str | None = Query( + default=None, + alias="pluginSort", + description="Plugin-namespaced sort key, e.g. rag-compat:totalReferences", + ), sort_order: SortOrder = Query(default=SortOrder.desc.value, alias="sortOrder"), page: int = Query(default=1), limit: int = Query(default=25), @@ -479,17 +473,51 @@ async def list_all_ground_truths( else: item_id_search = item_id - # Reference URL search validation - ref_url_search = None - if ref_url is not None: - ref_url = ref_url.strip() - if not ref_url: - # Empty after trim - treat as if parameter not provided - ref_url = None - elif len(ref_url) > 500: - raise HTTPException(status_code=400, detail="refUrl must be 500 characters or less") - else: - ref_url_search = ref_url + plugin_filters: dict[str, str] | None = None + if plugin_filter: + parsed: dict[str, str] = {} + for raw_filter in plugin_filter: + candidate = raw_filter.strip() + if not candidate: + continue + key, sep, value = candidate.partition("=") + if not sep: + raise HTTPException( + status_code=400, + detail="pluginFilter entries must use key=value format", + ) + key = key.strip() + value = value.strip() + if not key: + raise HTTPException( + status_code=400, + detail="pluginFilter entries must include a non-empty key", + ) + if ":" not in key: + raise HTTPException( + status_code=400, + detail="pluginFilter key must be namespaced (pack:key)", + ) + if not value: + continue + if len(value) > 500: + raise HTTPException( + status_code=400, + detail="pluginFilter value must be 500 characters or less", + ) + parsed[key] = value + plugin_filters = parsed or None + + plugin_sort_key = None + if plugin_sort is not None: + plugin_sort = plugin_sort.strip() + if plugin_sort: + if ":" not in plugin_sort: + raise HTTPException( + status_code=400, + detail="pluginSort must be namespaced (pack:key)", + ) + plugin_sort_key = plugin_sort # Keyword search validation keyword_search = None @@ -557,9 +585,10 @@ async def list_all_ground_truths( tags=tag_list, exclude_tags=exclude_tag_list, item_id=item_id_search, - ref_url=ref_url_search, + plugin_filters=plugin_filters, keyword=keyword_search, sort_by=sort_by, + plugin_sort=plugin_sort_key, sort_order=sort_order, page=page, limit=limit, @@ -639,7 +668,6 @@ async def update_ground_truth( manual_tags=payload.manual_tags, status=payload.status, actor_user_id=user.user_id, - legacy_update=read_legacy_compat_update(payload_extras), ) except ValidationError as e: raise HTTPException(status_code=400, detail=e.message) diff --git a/backend/app/container.py b/backend/app/container.py index 1054d48..e39af47 100644 --- a/backend/app/container.py +++ b/backend/app/container.py @@ -143,6 +143,14 @@ def _build_snapshot_service(self, repo: GroundTruthRepo) -> SnapshotService: plugin_export_transforms=self.plugin_pack_registry.collect_export_transforms(), ) + def _validate_plugin_packs_startup(self) -> None: + logger.info("Running plugin-pack startup validation...") + self.plugin_pack_registry.validate_all() + logger.info( + "Plugin-pack validation passed. Registered packs: %s", + self.plugin_pack_registry.names(), + ) + def init_cosmos_repo(self, db_name: str | None = None) -> None: """Create a Cosmos repo instance and wire services. 
@@ -180,6 +188,7 @@ def init_cosmos_repo(self, db_name: str | None = None) -> None: connection_verify=settings.COSMOS_CONNECTION_VERIFY, test_mode=settings.COSMOS_TEST_MODE, credential=credential, + plugin_pack_registry=self.plugin_pack_registry, ) logger.info( "Using CosmosGroundTruthRepo (endpoint=%s, db=%s, container=%s)", @@ -227,6 +236,7 @@ def init_memory_repo(self, *, enable_demo_data: bool = False) -> None: self.repo = InMemoryGroundTruthRepo( items=demo_items, curation_instructions=demo_instructions, + plugin_pack_registry=self.plugin_pack_registry, ) self.assignment_service = AssignmentService(self.repo) self.snapshot_service = self._build_snapshot_service(self.repo) @@ -235,13 +245,18 @@ def init_memory_repo(self, *, enable_demo_data: bool = False) -> None: self.tag_registry_service = TagRegistryService(self.tags_repo) self.tag_definitions_repo = cast(Any, None) self.search_service = ( - SearchService(DemoSearchAdapter(demo_items)) if enable_demo_data else SearchService() + SearchService( + DemoSearchAdapter(demo_items, plugin_pack_registry=self.plugin_pack_registry) + ) + if enable_demo_data + else SearchService() ) logger.info( "Using InMemoryGroundTruthRepo (demo_mode=%s, items=%s)", enable_demo_data, len(demo_items), ) + self._validate_plugin_packs_startup() async def startup_cosmos(self, db_name: str | None = None) -> None: """Initialize and validate Cosmos repos and services. @@ -283,12 +298,7 @@ async def startup_cosmos(self, db_name: str | None = None) -> None: # Step 4: Run plugin-pack startup validation so misconfigured packs # fail here with an actionable error rather than silently at runtime. - logger.info("Running plugin-pack startup validation...") - self.plugin_pack_registry.validate_all() - logger.info( - "Plugin-pack validation passed. 
Registered packs: %s", - self.plugin_pack_registry.names(), - ) + self._validate_plugin_packs_startup() def init_search(self) -> None: """Configure search adapter if Azure Search settings are present.""" diff --git a/backend/app/demo_seed.py b/backend/app/demo_seed.py index 258509a..4a1ffd3 100644 --- a/backend/app/demo_seed.py +++ b/backend/app/demo_seed.py @@ -11,8 +11,6 @@ AgenticGroundTruthEntry, DatasetCurationInstructions, ExpectedTools, - HistoryEntry, - HistoryItem, Reference, ToolExpectation, ) @@ -513,21 +511,14 @@ def _tool_call( ] -def _hydrate_history_with_refs(item: AgenticGroundTruthEntry, refs: list[Reference]) -> None: - if not item.history: - return +def _set_rag_compat_refs(item: AgenticGroundTruthEntry, refs: list[Reference]) -> None: + from app.plugins.pack_registry import get_default_pack_registry, get_required_pack - enriched_history: list[HistoryEntry] = [] - last_turn_index = len(item.history) - 1 - for index, turn in enumerate(item.history): - enriched_history.append( - HistoryItem( - role=turn.role, - msg=turn.msg, - refs=refs if index == last_turn_index and turn.role != "user" else None, - ) - ) - item.history = enriched_history + pack = get_required_pack("rag-compat", get_default_pack_registry()) + replace_references = getattr(pack, "replace_references", None) + if not callable(replace_references): + raise TypeError("Registered 'rag-compat' pack does not expose replace_references") + replace_references(item, refs) def _expected_tools(tool_names: list[str]) -> ExpectedTools: @@ -558,7 +549,9 @@ def _build_demo_item( created_by="demo-seed", ) adapted = adapter.adapt_payload({"trace_count": 1, "traces": [trace]})[0] - item = AgenticGroundTruthEntry.model_validate(adapted.model_dump(by_alias=True)) + item = AgenticGroundTruthEntry.model_validate( + adapted.model_dump(by_alias=True, exclude={"tags"}) + ) item.id = item_id item.scenario_id = scenario_id @@ -566,8 +559,7 @@ def _build_demo_item( item.manual_tags = sorted(set(item.manual_tags + manual_tags)) item.metadata = {**item.metadata, "source": "demo-seed"} item.trace_ids = {**(item.trace_ids or {}), "demoItemId": item_id} - item.refs = refs - _hydrate_history_with_refs(item, refs) + _set_rag_compat_refs(item, refs) item.expected_tools = _expected_tools(required_tools) if assigned: diff --git a/backend/app/domain/conversation_fields.py b/backend/app/domain/conversation_fields.py new file mode 100644 index 0000000..6bdf1c3 --- /dev/null +++ b/backend/app/domain/conversation_fields.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from app.domain.models import AgenticGroundTruthEntry + + +def _normalize_role(role: str) -> str: + return role.strip().lower() + + +def is_user_role(role: str) -> bool: + return _normalize_role(role) == "user" + + +def is_non_user_role(role: str) -> bool: + return not is_user_role(role) + + +def question_text_from_item(item: AgenticGroundTruthEntry) -> str: + for turn in reversed(item.history or []): + if is_user_role(turn.role) and turn.msg.strip(): + return turn.msg.strip() + return "" + + +def answer_text_from_item(item: AgenticGroundTruthEntry) -> str: + for turn in reversed(item.history or []): + if is_non_user_role(turn.role) and turn.msg.strip(): + return turn.msg.strip() + return "" diff --git a/backend/app/domain/enums.py b/backend/app/domain/enums.py index 4b8c9d0..a41ba1b 100644 --- a/backend/app/domain/enums.py +++ b/backend/app/domain/enums.py @@ -13,7 +13,6 @@ class SortField(str, Enum): updated_at = "updatedAt" id = "id" has_answer = "hasAnswer" - totalReferences 
= "totalReferences" tag_count = "tagCount" diff --git a/backend/app/domain/models.py b/backend/app/domain/models.py index edab26f..246da56 100644 --- a/backend/app/domain/models.py +++ b/backend/app/domain/models.py @@ -1,7 +1,7 @@ from __future__ import annotations from datetime import datetime, timezone -from typing import Any, ClassVar, Optional, Literal, cast +from typing import Any, ClassVar, Optional, Literal from uuid import UUID from pydantic import BaseModel, Field, ConfigDict, computed_field, field_validator, model_validator @@ -9,12 +9,6 @@ from app.domain.enums import GroundTruthStatus from app.domain.validators import GroundTruthItemTagValidators -LEGACY_HOST_FIELD_DELETE_GATES = ( - "stored-data audit completed", - "caller audit completed", - "import/export verification completed", -) - class Reference(BaseModel): """Legacy RAG reference object retained for compatibility helpers and tests.""" @@ -51,9 +45,8 @@ def validate_non_empty_text(cls, value: str) -> str: class HistoryItem(HistoryEntry): - """Legacy RAG-compatible history item retained for internal compatibility.""" + """Canonical history item used by generic core flows.""" - refs: Optional[list[Reference]] = None expected_behavior: Optional[list[str]] = Field(default=None, alias="expectedBehavior") model_config = ConfigDict(populate_by_name=True, extra="forbid") @@ -258,83 +251,12 @@ class AgenticGroundTruthEntry(GroundTruthItemTagValidators, BaseModel): _RAG_COMPAT_PLUGIN: ClassVar[str] = "rag-compat" - # --- Legacy compatibility layer --- - # The model_validator, computed_fields, and property accessors below exist because - # stored Cosmos DB documents may still carry top-level RAG fields (synthQuestion, - # editedQuestion, answer, refs, etc.). They transparently relocate those fields into - # plugins["rag-compat"] on read and re-expose them for internal code that still - # accesses .synth_question, .answer, .refs, .totalReferences. - # - # Hard-delete only after all LEGACY_HOST_FIELD_DELETE_GATES are satisfied. Until then, - # these accessors are migration projections, not long-term host ownership. 
- - @model_validator(mode="before") - @classmethod - def translate_legacy_payload_for_core_model(cls, value: object) -> object: - if cls is not AgenticGroundTruthEntry: - return value - from app.plugins.packs.rag_compat import normalize_legacy_payload_for_core_model - - return normalize_legacy_payload_for_core_model(value, plugin_name=cls._RAG_COMPAT_PLUGIN) - - @model_validator(mode="after") - def restore_history_annotations(self) -> "AgenticGroundTruthEntry": - history_annotations = self._rag_compat_data().get("historyAnnotations") - if not isinstance(history_annotations, list) or not self.history: - return self - - merged_history: list[HistoryEntry] = [] - changed = False - for index, entry in enumerate(self.history): - annotation = history_annotations[index] if index < len(history_annotations) else None - if not isinstance(annotation, dict) or not annotation: - merged_history.append(entry) - continue - - entry_payload = entry.model_dump(by_alias=True) - if "refs" in annotation: - entry_payload["refs"] = annotation["refs"] - changed = True - if "expectedBehavior" in annotation: - entry_payload["expectedBehavior"] = annotation["expectedBehavior"] - changed = True - merged_history.append(HistoryItem.model_validate(entry_payload)) - - if changed: - self.history = merged_history - return self - @computed_field @property def tags(self) -> list[str]: merged = set(self.manual_tags or []) | set(self.computed_tags or []) return sorted(merged) - @computed_field(alias="synthQuestion") - @property - def compat_synth_question(self) -> str | None: - return self.synth_question - - @computed_field(alias="editedQuestion") - @property - def compat_edited_question(self) -> str | None: - return self.edited_question - - @computed_field(alias="answer") - @property - def compat_answer(self) -> str | None: - return self.answer - - @computed_field(alias="refs") - @property - def compat_refs(self) -> list[Reference]: - return self.refs - - @computed_field(alias="totalReferences") - @property - def compat_total_references(self) -> int: - return self.totalReferences - def set_plugin(self, slot: str, data: dict[str, Any], *, version: str = "1.0") -> None: self.plugins[slot] = PluginPayload(kind=slot, version=version, data=data) @@ -345,149 +267,6 @@ def get_plugin_data(self, slot: str) -> dict[str, Any] | None: def export_json_schema(self) -> dict[str, Any]: return self.model_json_schema() - def _rag_compat_data(self) -> dict[str, Any]: - plugin = self.plugins.get(self._RAG_COMPAT_PLUGIN) - if plugin is None: - return {} - return plugin.data - - def _set_rag_compat_value(self, key: str, value: Any) -> None: - plugin = self.plugins.get(self._RAG_COMPAT_PLUGIN) - if plugin is None: - plugin = PluginPayload(kind=self._RAG_COMPAT_PLUGIN, version="1.0", data={}) - self.plugins[self._RAG_COMPAT_PLUGIN] = plugin - if value is None: - plugin.data.pop(key, None) - else: - plugin.data[key] = value - - def _find_history_message(self, role: str, *, reverse: bool = False) -> str | None: - history = self.history or [] - history_iterable = reversed(history) if reverse else history - for turn in history_iterable: - if turn.role == role and turn.msg: - return turn.msg - return None - - def _find_last_agent_message(self) -> str | None: - """Return the last non-user history message (any agent role).""" - for turn in reversed(self.history or []): - if turn.role != "user" and turn.msg: - return turn.msg - return None - - @property - def synth_question(self) -> str | None: - if "synth_question" in self.__dict__: - return cast(str | 
None, self.__dict__.get("synth_question")) - compat = self._rag_compat_data() - return cast(str | None, compat.get("synthQuestion")) or self._find_history_message("user") - - @synth_question.setter - def synth_question(self, value: str | None) -> None: - if "synth_question" in getattr(type(self), "model_fields", {}): - self.__dict__["synth_question"] = value - return - self._set_rag_compat_value("synthQuestion", value) - - @property - def edited_question(self) -> str | None: - if "edited_question" in self.__dict__: - return cast(str | None, self.__dict__.get("edited_question")) - compat = self._rag_compat_data() - return cast(str | None, compat.get("editedQuestion")) or self.synth_question - - @edited_question.setter - def edited_question(self, value: str | None) -> None: - if "edited_question" in getattr(type(self), "model_fields", {}): - self.__dict__["edited_question"] = value - return - self._set_rag_compat_value("editedQuestion", value) - - @property - def answer(self) -> str | None: - if "answer" in self.__dict__: - return cast(str | None, self.__dict__.get("answer")) - compat = self._rag_compat_data() - return cast(str | None, compat.get("answer")) or self._find_last_agent_message() - - @answer.setter - def answer(self, value: str | None) -> None: - if "answer" in getattr(type(self), "model_fields", {}): - self.__dict__["answer"] = value - return - self._set_rag_compat_value("answer", value) - - @property - def refs(self) -> list[Reference]: - direct_value = self.__dict__.get("refs") - if isinstance(direct_value, list): - return [ - ref if isinstance(ref, Reference) else Reference.model_validate(ref) - for ref in direct_value - ] - from app.plugins.packs.rag_compat import compat_refs_from_payload - - return cast( - list[Reference], - compat_refs_from_payload( - { - "plugins": self.plugins, - "toolCalls": self.tool_calls, - "history": self.history, - }, - plugin_name=self._RAG_COMPAT_PLUGIN, - ), - ) - - @refs.setter - def refs(self, value: list[Reference] | list[dict[str, Any]] | None) -> None: - if "refs" in getattr(type(self), "model_fields", {}): - self.__dict__["refs"] = list(value or []) - return - # Handle both Reference objects and dict representations - serialized = [] - for ref in value or []: - if isinstance(ref, Reference): - serialized.append(ref.model_dump(by_alias=True)) - elif isinstance(ref, dict): - # Validate and convert dict to ensure it's a valid reference - validated_ref = Reference.model_validate(ref) - serialized.append(validated_ref.model_dump(by_alias=True)) - else: - serialized.append(ref) - self._set_rag_compat_value("refs", serialized) - - @property - def totalReferences(self) -> int: - direct_value = self.__dict__.get("totalReferences") - if isinstance(direct_value, int): - return direct_value - from app.plugins.packs.rag_compat import compat_total_references_from_payload - - return compat_total_references_from_payload( - { - "plugins": self.plugins, - "toolCalls": self.tool_calls, - "history": self.history, - }, - plugin_name=self._RAG_COMPAT_PLUGIN, - ) - - @totalReferences.setter - def totalReferences(self, value: int | None) -> None: - if "totalReferences" in getattr(type(self), "model_fields", {}): - self.__dict__["totalReferences"] = 0 if value is None else int(value) - return - self._set_rag_compat_value("totalReferences", None if value is None else int(value)) - - # NOTE: Informational RAG-era accessors (contextUsedForGeneration, contextSource, - # modelUsedForGeneration, semanticClusterNumber, weight, samplingBucket, questionLength) - # removed 
in Phase 7 legacy retirement. No callers accessed them via - # AgenticGroundTruthEntry uses computed properties for legacy field access. - # Read paths extract these values from history and plugin data. Write paths - # normalize incoming payloads into canonical multi-turn structures. - class PaginationMetadata(BaseModel): model_config = ConfigDict(populate_by_name=True) diff --git a/backend/app/plugins/base.py b/backend/app/plugins/base.py index 008f6f4..884e160 100644 --- a/backend/app/plugins/base.py +++ b/backend/app/plugins/base.py @@ -101,7 +101,7 @@ def tag_key(self) -> str: return "length:long" def compute(self, doc: AgenticGroundTruthEntry) -> str | None: - content = doc.answer or "" + content = "\n".join(turn.msg for turn in (doc.history or [])) return self.tag_key if len(content) > 10000 else None Example (dynamic tag): @@ -130,8 +130,7 @@ def compute(self, doc: AgenticGroundTruthEntry) -> str | None: Args: doc: The AgenticGroundTruthEntry to evaluate. - Contains fields like 'answer', 'history', 'refs', etc. - Legacy fields like synthQuestion, editedQuestion are accessed via computed properties. + Contains canonical fields like 'history', 'plugins', 'tool_calls', etc. Returns: The tag string if applicable, None otherwise. @@ -326,7 +325,7 @@ def collect_approval_errors( self, item: AgenticGroundTruthEntry ) -> list[str]: errors: list[str] = [] - if not item.refs: + if not item.history: errors.append("strict-ref: at least one reference is required") return errors """ @@ -438,6 +437,45 @@ def get_export_transforms(self) -> list[ExportTransform]: """ return [] + def matches_query_filter( + self, item: AgenticGroundTruthEntry, filter_key: str, filter_value: str + ) -> bool | None: + """Evaluate a plugin-namespaced query filter for an item. + + Args: + item: Item being evaluated. + filter_key: Pack-local filter key (namespace removed by host). + filter_value: Filter value from the request. + + Returns: + True/False when this pack handles the key, or None when unsupported. + """ + return None + + def get_sort_value(self, item: AgenticGroundTruthEntry, sort_key: str) -> Any | None: + """Return a plugin-owned sort value for a namespaced sort key. + + Args: + item: Item being sorted. + sort_key: Pack-local sort key (namespace removed by host). + + Returns: + Sort value when handled, or None when unsupported. + """ + return None + + def get_search_documents(self, item: AgenticGroundTruthEntry) -> list[dict[str, Any]]: + """Return plugin-owned search candidate docs for a single item. + + Each candidate should include at least ``url`` when applicable and may + include ``id``, ``title``, and ``chunk``. + """ + return [] + + def get_primary_reference_url(self, item: AgenticGroundTruthEntry) -> str | None: + """Return a primary reference URL for diagnostics/error reporting.""" + return None + class PluginPackRegistry: """Registry for plugin packs with startup validation. 
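To make the four new PluginPack hooks concrete before the registry plumbing below, here is a schematic pack. The hook names and signatures are taken from the diff; the pack name, its payload layout, and any other members a concrete pack must provide are assumptions:

    from typing import Any

    from app.domain.models import AgenticGroundTruthEntry
    from app.plugins.base import PluginPack


    class ExampleRefsPack(PluginPack):  # other required members omitted for brevity
        """Hypothetical pack owning plugins["example-refs"].data["references"]."""

        def _refs(self, item: AgenticGroundTruthEntry) -> list[dict[str, Any]]:
            data = item.get_plugin_data("example-refs") or {}
            return [ref for ref in data.get("references") or [] if isinstance(ref, dict)]

        def matches_query_filter(
            self, item: AgenticGroundTruthEntry, filter_key: str, filter_value: str
        ) -> bool | None:
            # filter_key arrives with the "example-refs:" namespace already stripped.
            if filter_key == "refUrl":
                return any(filter_value in str(ref.get("url") or "") for ref in self._refs(item))
            return None  # unsupported key; the host then treats the filter as unmatched

        def get_sort_value(self, item: AgenticGroundTruthEntry, sort_key: str) -> Any | None:
            return len(self._refs(item)) if sort_key == "totalReferences" else None

        def get_search_documents(self, item: AgenticGroundTruthEntry) -> list[dict[str, Any]]:
            return [
                {"id": ref.get("id"), "url": ref["url"], "title": ref.get("title"),
                 "chunk": ref.get("content")}
                for ref in self._refs(item)
                if isinstance(ref.get("url"), str) and ref.get("url")
            ]

        def get_primary_reference_url(self, item: AgenticGroundTruthEntry) -> str | None:
            refs = self._refs(item)
            return str(refs[0]["url"]) if refs and refs[0].get("url") else None

A request filter of example-refs:refUrl=example.com reaches matches_query_filter as plain refUrl via the _split_namespaced_key dispatch defined in the registry methods that follow.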
@@ -625,6 +663,59 @@ def __len__(self) -> int: """Return the number of registered packs.""" return len(self._packs) + @staticmethod + def _split_namespaced_key(namespaced_key: str) -> tuple[str, str] | None: + pack_name, sep, pack_key = namespaced_key.partition(":") + if not sep or not pack_name.strip() or not pack_key.strip(): + return None + return pack_name.strip(), pack_key.strip() + + def matches_query_filters( + self, item: AgenticGroundTruthEntry, filters: Mapping[str, str] | None + ) -> bool: + """Return True when an item satisfies all plugin-namespaced filters.""" + if not filters: + return True + + for namespaced_key, value in filters.items(): + split = self._split_namespaced_key(namespaced_key) + if split is None: + return False + pack_name, pack_key = split + pack = self.get(pack_name) + if pack is None: + return False + result = pack.matches_query_filter(item, pack_key, value) + if result is None or result is False: + return False + return True + + def plugin_sort_value(self, item: AgenticGroundTruthEntry, namespaced_sort_key: str) -> Any: + """Resolve a plugin-namespaced sort key for an item.""" + split = self._split_namespaced_key(namespaced_sort_key) + if split is None: + return None + pack_name, pack_key = split + pack = self.get(pack_name) + if pack is None: + return None + return pack.get_sort_value(item, pack_key) + + def collect_search_documents(self, item: AgenticGroundTruthEntry) -> list[dict[str, Any]]: + """Collect plugin-owned search candidate documents for an item.""" + docs: list[dict[str, Any]] = [] + for pack in self._packs.values(): + docs.extend(pack.get_search_documents(item)) + return docs + + def primary_reference_url(self, item: AgenticGroundTruthEntry) -> str | None: + """Return the first available plugin-owned primary reference URL.""" + for pack in self._packs.values(): + candidate = pack.get_primary_reference_url(item) + if candidate: + return candidate + return None + # --------------------------------------------------------------------------- # Trace adapter plugin system diff --git a/backend/app/plugins/computed_tags/no_answer.py b/backend/app/plugins/computed_tags/no_answer.py index 5fdd3ea..a470af1 100644 --- a/backend/app/plugins/computed_tags/no_answer.py +++ b/backend/app/plugins/computed_tags/no_answer.py @@ -8,6 +8,7 @@ from typing import TYPE_CHECKING +from app.domain.conversation_fields import answer_text_from_item from app.plugins.base import ComputedTagPlugin if TYPE_CHECKING: @@ -30,6 +31,7 @@ def tag_key(self) -> str: return "answer:no_answer" def compute(self, doc: AgenticGroundTruthEntry) -> str | None: - if doc.answer and doc.answer.strip().casefold() == "no_answer": + answer_text = answer_text_from_item(doc) + if answer_text and answer_text.casefold() == "no_answer": return self.tag_key return None diff --git a/backend/app/plugins/computed_tags/question_length.py b/backend/app/plugins/computed_tags/question_length.py index 7b09359..5788696 100644 --- a/backend/app/plugins/computed_tags/question_length.py +++ b/backend/app/plugins/computed_tags/question_length.py @@ -13,6 +13,7 @@ from typing import TYPE_CHECKING +from app.domain.conversation_fields import question_text_from_item from app.plugins.base import ComputedTagPlugin if TYPE_CHECKING: @@ -28,8 +29,8 @@ def _get_question_word_count(doc: AgenticGroundTruthEntry) -> int: """Get the word count for the document's question. - Uses the computed property accessor which returns editedQuestion if available, - otherwise synthQuestion. 
Uses .split() to count words as specified in requirements. + Uses canonical question derivation from history. + Uses .split() to count words as specified in requirements. Args: doc: The AgenticGroundTruthEntry to evaluate. @@ -37,7 +38,7 @@ def _get_question_word_count(doc: AgenticGroundTruthEntry) -> int: Returns: The number of words in the question. """ - question = doc.edited_question or doc.synth_question or "" + question = question_text_from_item(doc) return len(question.split()) diff --git a/backend/app/plugins/computed_tags/reference_type.py b/backend/app/plugins/computed_tags/reference_type.py index c22cdf6..aee0ea8 100644 --- a/backend/app/plugins/computed_tags/reference_type.py +++ b/backend/app/plugins/computed_tags/reference_type.py @@ -16,6 +16,7 @@ from typing import TYPE_CHECKING from app.plugins.base import ComputedTagPlugin +from app.plugins.pack_registry import get_default_pack_registry if TYPE_CHECKING: from app.domain.models import AgenticGroundTruthEntry, Reference @@ -61,17 +62,15 @@ def _get_all_references(doc: AgenticGroundTruthEntry) -> list[Reference]: Returns: A list of all Reference objects from the document. """ - from app.domain.models import HistoryItem - - refs: list[Reference] = list(doc.refs or []) - - # Also gather refs from history turns - # HistoryItem (subclass of HistoryEntry) has refs field - if doc.history: - for turn in doc.history: - if isinstance(turn, HistoryItem) and turn.refs: - refs.extend(turn.refs) - + from app.domain.models import Reference + + docs = get_default_pack_registry().collect_search_documents(doc) + refs: list[Reference] = [] + for candidate in docs: + url = candidate.get("url") + if not isinstance(url, str) or not url: + continue + refs.append(Reference(url=url)) return refs diff --git a/backend/app/plugins/computed_tags/retrieval_behavior.py b/backend/app/plugins/computed_tags/retrieval_behavior.py index e2a62e1..83f3baa 100644 --- a/backend/app/plugins/computed_tags/retrieval_behavior.py +++ b/backend/app/plugins/computed_tags/retrieval_behavior.py @@ -13,6 +13,7 @@ from typing import TYPE_CHECKING from app.plugins.base import ComputedTagPlugin +from app.plugins.pack_registry import get_default_pack_registry if TYPE_CHECKING: from app.domain.models import AgenticGroundTruthEntry @@ -21,8 +22,7 @@ def _get_total_reference_count(doc: AgenticGroundTruthEntry) -> int: """Get the total count of references from a document. - Uses the totalReferences computed field which counts refs at item level - and across all history turns. + Uses canonical reference derivation from history/plugin payloads. Args: doc: The AgenticGroundTruthEntry to evaluate. @@ -30,7 +30,8 @@ def _get_total_reference_count(doc: AgenticGroundTruthEntry) -> int: Returns: The total number of references. """ - return doc.totalReferences + count = get_default_pack_registry().plugin_sort_value(doc, "rag-compat:totalReferences") + return int(count) if isinstance(count, int) else 0 class RetrievalBehaviorNoRefsPlugin(ComputedTagPlugin): diff --git a/backend/app/plugins/packs/rag_compat.py b/backend/app/plugins/packs/rag_compat.py index fd712bd..039d4f5 100644 --- a/backend/app/plugins/packs/rag_compat.py +++ b/backend/app/plugins/packs/rag_compat.py @@ -1,20 +1,10 @@ """RAG compatibility pack. -This pack owns retrieval-specific behavior on the generic agentic host: -- Validates its own plugin-kind constant at startup so mismatches are detected - before any data is processed. 
-- Projects per-item RAG state from ``plugins["rag-compat"].data`` via the - compat-accessor helpers already present on AgenticGroundTruthEntry. -- Provides the canonical ``rag_compat_data``, ``refs_from_item``, - ``attach_reference``, and ``detach_reference`` helpers so reference - manipulation stays in one owned location rather than being inlined across - multiple services. -- Contributes approval validation hooks that enforce RAG-specific invariants on - top of the generic core checks. - -Retrieval search remains available through the standard ``/v1/search`` endpoint -(backed by SearchService), which handles the generic query path independently. -Reference selection and attachment are owned by this pack. +This pack owns the remaining RAG-specific compatibility surface on the generic +agentic host. The only plugin-owned payload retained here is normalized +``references`` data. Legacy RAG fields are translated into generic history or +flattened into references during import, and new writes only persist +``plugins[\"rag-compat\"].data.references``. """ from __future__ import annotations @@ -22,45 +12,26 @@ import logging from typing import TYPE_CHECKING, Any -from app.plugins.base import ExplorerFieldDefinition, ExportTransform, PluginPack +from app.plugins.base import ExplorerFieldDefinition, ExportTransform, ImportTransform, PluginPack if TYPE_CHECKING: from app.domain.models import AgenticGroundTruthEntry, Reference logger = logging.getLogger(__name__) -# The plugin-kind key stored inside AgenticGroundTruthEntry.plugins. -# This MUST match AgenticGroundTruthEntry._RAG_COMPAT_PLUGIN. -# validate_registration() enforces this at startup. _RAG_COMPAT_KIND: str = "rag-compat" - -_LEGACY_PLUGIN_FIELDS: tuple[str, ...] = ( +_PLUGIN_REFERENCES_KEY = "references" +_LEGACY_REFS_KEY = "refs" +_LEGACY_KEYS_TO_DROP = ( + _LEGACY_REFS_KEY, + "retrievals", + "historyAnnotations", + "totalReferences", "synthQuestion", "editedQuestion", "answer", - "refs", - "contextUsedForGeneration", - "contextSource", - "modelUsedForGeneration", - "semanticClusterNumber", - "weight", - "samplingBucket", - "questionLength", - "totalReferences", ) -_LEGACY_PLUGIN_FIELD_ALIASES: dict[str, str] = { - "synth_question": "synthQuestion", - "edited_question": "editedQuestion", - "context_used_for_generation": "contextUsedForGeneration", - "context_source": "contextSource", - "model_used_for_generation": "modelUsedForGeneration", - "semantic_cluster_number": "semanticClusterNumber", - "sampling_bucket": "samplingBucket", - "question_length": "questionLength", - "total_references": "totalReferences", -} - def _coerce_reference_list(raw_refs: Any) -> list[Any]: if not isinstance(raw_refs, list): @@ -73,22 +44,66 @@ def _coerce_reference_list(raw_refs: Any) -> list[Any]: ] -def _history_message(history: Any, role: str, *, reverse: bool = False) -> str | None: +def _extract_history_refs(history: Any) -> list[Any]: if not isinstance(history, list): - return None - iterator = reversed(history) if reverse else history - for turn in iterator: - if hasattr(turn, "role") and hasattr(turn, "msg"): - current_role = str(getattr(turn, "role", "")).strip().lower() - current_msg = str(getattr(turn, "msg", "")).strip() - elif isinstance(turn, dict): - current_role = str(turn.get("role", "")).strip().lower() - current_msg = str(turn.get("msg") or turn.get("content") or "").strip() - else: + return [] + + refs: list[Any] = [] + for turn in history: + if hasattr(turn, "refs"): + refs.extend(_coerce_reference_list(getattr(turn, "refs", None))) + 
continue + if isinstance(turn, dict): + refs.extend(_coerce_reference_list(turn.get(_LEGACY_REFS_KEY))) + return refs + + +def _extract_retrieval_refs(payload: dict[str, Any], compat: dict[str, Any]) -> list[Any]: + retrievals = compat.get("retrievals") + if not isinstance(retrievals, dict): + return [] + + from app.domain.models import Reference + + tool_calls = payload.get("toolCalls") or payload.get("tool_calls") or [] + step_by_tool_call_id: dict[str, int | None] = {} + if isinstance(tool_calls, list): + for tool_call in tool_calls: + if hasattr(tool_call, "id"): + tool_call_id = getattr(tool_call, "id", "") + step_number = getattr(tool_call, "step_number", None) + elif isinstance(tool_call, dict): + tool_call_id = str(tool_call.get("id") or "") + step_number = tool_call.get("stepNumber", tool_call.get("step_number")) + else: + continue + if tool_call_id: + step_by_tool_call_id[tool_call_id] = ( + step_number if isinstance(step_number, int) else None + ) + + flattened: list[Reference] = [] + for tool_call_id, bucket in retrievals.items(): + if not isinstance(bucket, dict): + continue + candidates = bucket.get("candidates") + if not isinstance(candidates, list): continue - if current_role == role and current_msg: - return current_msg - return None + for candidate in candidates: + if not isinstance(candidate, dict): + continue + candidate_tool_call_id = candidate.get("toolCallId") or tool_call_id or None + flattened.append( + Reference( + url=str(candidate.get("url") or ""), + title=candidate.get("title"), + content=candidate.get("chunk"), + messageIndex=step_by_tool_call_id.get(str(candidate_tool_call_id)) + if candidate_tool_call_id + else None, + ) + ) + return flattened def rag_compat_data_from_payload( @@ -107,86 +122,74 @@ def rag_compat_data_from_payload( return {} -def normalize_legacy_payload_for_core_model( - value: object, *, plugin_name: str = _RAG_COMPAT_KIND -) -> object: - if not isinstance(value, dict): - return value +def compat_refs_from_payload( + payload: dict[str, Any], *, plugin_name: str = _RAG_COMPAT_KIND +) -> list[Any]: + compat = rag_compat_data_from_payload(payload, plugin_name=plugin_name) + if _PLUGIN_REFERENCES_KEY in compat: + return _coerce_reference_list(compat.get(_PLUGIN_REFERENCES_KEY)) + + compat_refs = _coerce_reference_list(compat.get(_LEGACY_REFS_KEY)) + if compat_refs: + return compat_refs + + retrieval_refs = _extract_retrieval_refs(payload, compat) + if retrieval_refs: + return retrieval_refs + + return _extract_history_refs(payload.get("history")) + + +def normalize_legacy_payload_for_core_model( + value: dict[str, Any], *, plugin_name: str = _RAG_COMPAT_KIND +) -> dict[str, Any]: data = dict(value) data.pop("tags", None) - legacy_payload: dict[str, Any] = {} - for alias, canonical in _LEGACY_PLUGIN_FIELD_ALIASES.items(): - if alias not in data: - continue - alias_value = data.pop(alias) - if canonical not in data: - data[canonical] = alias_value - - for field_name in _LEGACY_PLUGIN_FIELDS: - if field_name in data: - legacy_payload[field_name] = data.pop(field_name) - - if "refs" in legacy_payload: - legacy_payload["refs"] = _coerce_reference_list(legacy_payload["refs"]) - - history_value = data.get("history") - if isinstance(history_value, list): - normalized_history: list[dict[str, Any]] = [] - history_annotations: list[dict[str, Any]] = [] - saw_history_annotations = False - for raw_entry in history_value: + raw_history = data.get("history") + normalized_history: list[dict[str, Any]] | None = None + history_refs: list[Any] = [] + if 
isinstance(raw_history, list): + normalized_history = [] + for raw_entry in raw_history: if hasattr(raw_entry, "model_dump"): entry_dict = raw_entry.model_dump(by_alias=True, exclude_none=True) elif isinstance(raw_entry, dict): entry_dict = dict(raw_entry) else: - normalized_history.append(raw_entry) - history_annotations.append({}) continue - annotation: dict[str, Any] = {} - if "refs" in entry_dict: - annotation["refs"] = _coerce_reference_list(entry_dict.pop("refs")) - saw_history_annotations = True - expected_behavior = entry_dict.pop( - "expectedBehavior", entry_dict.pop("expected_behavior", None) - ) - if expected_behavior is not None: - annotation["expectedBehavior"] = expected_behavior - saw_history_annotations = True - + history_refs.extend(_coerce_reference_list(entry_dict.pop(_LEGACY_REFS_KEY, None))) message = entry_dict.get("msg") - if message is None and "content" in entry_dict: - message = entry_dict.pop("content") + if message is None and isinstance(entry_dict.get("content"), str): + message = entry_dict.get("content") normalized_history.append( { - "role": entry_dict.get("role", ""), - "msg": message or "", + "role": str(entry_dict.get("role") or ""), + "msg": str(message or ""), } ) - history_annotations.append(annotation) - data["history"] = normalized_history - if saw_history_annotations: - legacy_payload["historyAnnotations"] = history_annotations - elif history_value is None and ( - legacy_payload.get("editedQuestion") - or legacy_payload.get("synthQuestion") - or legacy_payload.get("answer") + + synth_question = data.pop("synthQuestion", None) + edited_question = data.pop("editedQuestion", None) + answer = data.pop("answer", None) + if normalized_history is None and any( + isinstance(v, str) and v.strip() for v in (edited_question, synth_question, answer) ): generated_history: list[dict[str, Any]] = [] - question_text = legacy_payload.get("editedQuestion") or legacy_payload.get("synthQuestion") - if question_text: - generated_history.append({"role": "user", "msg": question_text}) - if legacy_payload.get("answer"): - generated_history.append({"role": "assistant", "msg": legacy_payload["answer"]}) + question_text = ( + edited_question + if isinstance(edited_question, str) and edited_question.strip() + else synth_question + ) + if isinstance(question_text, str) and question_text.strip(): + generated_history.append({"role": "user", "msg": question_text.strip()}) + if isinstance(answer, str) and answer.strip(): + generated_history.append({"role": "assistant", "msg": answer.strip()}) data["history"] = generated_history - if not legacy_payload: - return data - plugins_payload = dict(data.get("plugins") or {}) existing_plugin = plugins_payload.get(plugin_name) if hasattr(existing_plugin, "model_dump"): @@ -195,149 +198,73 @@ def normalize_legacy_payload_for_core_model( plugin_dict = dict(existing_plugin) else: plugin_dict = {"kind": plugin_name, "version": "1.0", "data": {}} + plugin_data_raw = plugin_dict.get("data") plugin_data = dict(plugin_data_raw) if isinstance(plugin_data_raw, dict) else {} - plugin_data.update(legacy_payload) - plugin_dict["kind"] = plugin_dict.get("kind") or plugin_name - plugin_dict["version"] = plugin_dict.get("version") or "1.0" - plugin_dict["data"] = plugin_data - plugins_payload[plugin_name] = plugin_dict - data["plugins"] = plugins_payload - return data - - -def compat_refs_from_payload( - payload: dict[str, Any], *, plugin_name: str = _RAG_COMPAT_KIND -) -> list[Any]: - compat = rag_compat_data_from_payload(payload, plugin_name=plugin_name) 
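
The history-synthesis branch above turns a flat legacy record into a two-turn conversation when no history list is present, preferring `editedQuestion` over `synthQuestion` and skipping blank strings. A simplified sketch of just that rule (the surrounding function also normalizes existing history entries and plugin payloads):

```python
def synthesize_history(payload: dict) -> list[dict]:
    """Build a user/assistant turn pair from legacy flat question/answer fields."""
    edited = payload.get("editedQuestion")
    synth = payload.get("synthQuestion")
    answer = payload.get("answer")
    # editedQuestion wins only when it is a non-blank string
    question = edited if isinstance(edited, str) and edited.strip() else synth
    history: list[dict] = []
    if isinstance(question, str) and question.strip():
        history.append({"role": "user", "msg": question.strip()})
    if isinstance(answer, str) and answer.strip():
        history.append({"role": "assistant", "msg": answer.strip()})
    return history

legacy = {"synthQuestion": "What is HPK?", "answer": "Hierarchical partition keys."}
assert synthesize_history(legacy) == [
    {"role": "user", "msg": "What is HPK?"},
    {"role": "assistant", "msg": "Hierarchical partition keys."},
]
```
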
- refs = _coerce_reference_list(compat.get("refs")) - if refs: - return refs - - retrievals = compat.get("retrievals") - if not isinstance(retrievals, dict): - return [] - - from app.domain.models import Reference - - tool_calls = payload.get("toolCalls") or payload.get("tool_calls") or [] - step_by_tool_call_id: dict[str, int | None] = {} - if isinstance(tool_calls, list): - for tool_call in tool_calls: - if hasattr(tool_call, "id"): - tool_call_id = getattr(tool_call, "id", "") - step_number = getattr(tool_call, "step_number", None) - elif isinstance(tool_call, dict): - tool_call_id = str(tool_call.get("id") or "") - step_number = tool_call.get("stepNumber", tool_call.get("step_number")) - else: - continue - if tool_call_id: - step_by_tool_call_id[tool_call_id] = ( - step_number if isinstance(step_number, int) else None - ) - - flattened: list[Reference] = [] - for tool_call_id, bucket in retrievals.items(): - if not isinstance(bucket, dict): - continue - candidates = bucket.get("candidates") - if not isinstance(candidates, list): - continue - for candidate in candidates: - if not isinstance(candidate, dict): - continue - candidate_tool_call_id = candidate.get("toolCallId") or ( - tool_call_id if tool_call_id != RagCompatPack._UNASSOCIATED_KEY else None - ) - flattened.append( - Reference( - url=str(candidate.get("url") or ""), - title=candidate.get("title"), - content=candidate.get("chunk"), - messageIndex=step_by_tool_call_id.get(str(candidate_tool_call_id)) - if candidate_tool_call_id - else None, - ) + has_canonical_references = _PLUGIN_REFERENCES_KEY in plugin_data + if has_canonical_references: + references = _coerce_reference_list(plugin_data.get(_PLUGIN_REFERENCES_KEY)) + else: + references = _coerce_reference_list(plugin_data.get(_LEGACY_REFS_KEY)) + if not references: + top_level_refs = data.pop(_LEGACY_REFS_KEY, None) + references = _coerce_reference_list(top_level_refs) + if not references: + references = _extract_retrieval_refs( + {"plugins": plugins_payload, "toolCalls": data.get("toolCalls")}, plugin_data ) - return flattened + if not references: + references = history_refs + for legacy_key in _LEGACY_KEYS_TO_DROP: + plugin_data.pop(legacy_key, None) -def compat_total_references_from_payload( - payload: dict[str, Any], *, plugin_name: str = _RAG_COMPAT_KIND -) -> int: - compat = rag_compat_data_from_payload(payload, plugin_name=plugin_name) - explicit_total = compat.get("totalReferences") - if isinstance(explicit_total, int): - return explicit_total + if has_canonical_references: + plugin_data[_PLUGIN_REFERENCES_KEY] = [ + ref.model_dump(by_alias=True, exclude_none=True) if hasattr(ref, "model_dump") else ref + for ref in references + ] + elif references: + plugin_data[_PLUGIN_REFERENCES_KEY] = [ + ref.model_dump(by_alias=True, exclude_none=True) if hasattr(ref, "model_dump") else ref + for ref in references + ] + + if plugin_data: + plugin_dict["kind"] = plugin_dict.get("kind") or plugin_name + plugin_dict["version"] = plugin_dict.get("version") or "1.0" + plugin_dict["data"] = plugin_data + plugins_payload[plugin_name] = plugin_dict + data["plugins"] = plugins_payload + elif plugin_name in plugins_payload: + plugins_payload.pop(plugin_name, None) + data["plugins"] = plugins_payload - history_count = 0 - history_annotations = compat.get("historyAnnotations") - if isinstance(history_annotations, list): - for annotation in history_annotations: - if isinstance(annotation, dict) and isinstance(annotation.get("refs"), list): - history_count += len(annotation["refs"]) - if 
history_count: - return history_count - return len(compat_refs_from_payload(payload, plugin_name=plugin_name)) + return data def apply_export_projection( doc: dict[str, Any], *, plugin_name: str = _RAG_COMPAT_KIND ) -> dict[str, Any]: projected = dict(doc) - compat = rag_compat_data_from_payload(projected, plugin_name=plugin_name) - if not compat: - return projected - refs = compat_refs_from_payload(projected, plugin_name=plugin_name) - projected["refs"] = [ref.model_dump(by_alias=True, exclude_none=True) for ref in refs] - projected["totalReferences"] = len(refs) - - if projected.get("synthQuestion") is None: - projected["synthQuestion"] = compat.get("synthQuestion") or _history_message( - projected.get("history"), "user" - ) - if projected.get("editedQuestion") is None: - projected["editedQuestion"] = compat.get("editedQuestion") or projected.get("synthQuestion") - if projected.get("answer") is None: - projected["answer"] = compat.get("answer") or _history_message( - projected.get("history"), "assistant", reverse=True - ) - + if refs: + projected[_PLUGIN_REFERENCES_KEY] = [ + ref.model_dump(by_alias=True, exclude_none=True) for ref in refs + ] + projected["totalReferences"] = len(refs) + else: + projected.pop(_PLUGIN_REFERENCES_KEY, None) + projected["totalReferences"] = 0 return projected class RagCompatPack(PluginPack): - """RAG compatibility pack. - - Owns retrieval-specific behavior behind the generic plugin-pack contract. - Registered at startup via PluginPackRegistry so misconfiguration raises - a clear startup error instead of silently producing wrong data. - - Design notes: - - The ``rag-compat`` plugin payload is written by - AgenticGroundTruthEntry.translate_legacy_payload_for_core_model during - ingest of legacy RAG-shaped documents. - - Core approval checks (history, tool-call consistency) run before pack - hooks. The pack adds RAG-specific approval gates that cannot be expressed - generically. - - The pack does NOT add new top-level fields to the host model; all RAG - state is accessed via plugins["rag-compat"].data. - - Reference attachment and detachment are owned by this pack; the generic - SearchService only owns the query path. - """ - @property def name(self) -> str: return _RAG_COMPAT_KIND def validate_registration(self) -> None: - """Validate that the rag-compat kind constant matches the host model. - - Fails startup if someone renames the plugin key in - AgenticGroundTruthEntry without updating this pack (or vice-versa). - """ from app.domain.models import AgenticGroundTruthEntry expected = AgenticGroundTruthEntry._RAG_COMPAT_PLUGIN @@ -350,35 +277,16 @@ def validate_registration(self) -> None: logger.debug("rag_compat_pack.validate_registration.ok | kind=%s", _RAG_COMPAT_KIND) def collect_approval_errors(self, item: AgenticGroundTruthEntry) -> list[str]: - """Return RAG-specific approval errors for an item. - - Items that have no RAG compat data receive no additional errors. - """ - compat = self.rag_compat_data(item) - if not compat: - return [] - # RAG items: future validation hooks go here. - # e.g. per-retrieval-call selection completeness could be enforced once - # FR-029/FR-030 retrieval tool-call per-call state is implemented. return [] def collect_approval_waivers( self, item: AgenticGroundTruthEntry, core_errors: list[str] ) -> list[str]: - """Waive core errors that do not apply to RAG retrieval-only items. 
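
The reworked `apply_export_projection` is purely derived: exports never trust a stored `totalReferences`, they recompute it from the projected reference list. A behaviorally equivalent sketch with plain dicts, where the `refs` parameter stands in for the output of `compat_refs_from_payload`:

```python
def project_references(doc: dict, refs: list[dict]) -> dict:
    """Overwrite 'references'/'totalReferences' from the derived ref list."""
    projected = dict(doc)
    if refs:
        projected["references"] = refs
        projected["totalReferences"] = len(refs)
    else:
        projected.pop("references", None)
        projected["totalReferences"] = 0
    return projected

doc = {"id": "gt-1", "references": [{"url": "stale"}], "totalReferences": 7}
empty = project_references(doc, [])
assert empty["totalReferences"] == 0 and "references" not in empty
assert project_references(doc, [{"url": "a"}])["totalReferences"] == 1
```

Recomputing on export means a stale persisted count (the thing the deleted backfill script below existed to repair) can no longer leak into downstream consumers.
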
- - When an item has ``totalReferences > 0`` (indicating it is a - retrieval-based item), the following core checks are waived: - - "history must include at least one assistant message" — retrieval-only - items may not produce an assistant reply. - - "expectedTools.required must include at least one tool…" — retrieval - items may use reference attachment instead of classified tool calls. - """ if self.reference_count(item) == 0: return [] waivers: list[str] = [] - assistant_error = "history must include at least one assistant message" + assistant_error = "history must include at least one agent message" if assistant_error in core_errors: waivers.append(assistant_error) @@ -391,17 +299,10 @@ def collect_approval_waivers( return waivers - # ------------------------------------------------------------------ - # Accessor helpers — owned by this pack so callers don't embed the - # plugin-kind string literal elsewhere. - # ------------------------------------------------------------------ - def rag_compat_data(self, item: AgenticGroundTruthEntry) -> dict[str, Any]: - """Return the raw rag-compat plugin data dict for an item, or {}.""" return item.get_plugin_data(_RAG_COMPAT_KIND) or {} def refs_from_item(self, item: AgenticGroundTruthEntry) -> list[Any]: - """Return the references list projected from the rag-compat payload.""" return compat_refs_from_payload( { "plugins": item.plugins, @@ -412,41 +313,32 @@ def refs_from_item(self, item: AgenticGroundTruthEntry) -> list[Any]: def reference_count(self, item: AgenticGroundTruthEntry) -> int: refs = self.refs_from_item(item) + compat = self.rag_compat_data(item) + if _PLUGIN_REFERENCES_KEY in compat: + return len(refs) if refs: return len(refs) - - compat = self.rag_compat_data(item) explicit_total = compat.get("totalReferences") return explicit_total if isinstance(explicit_total, int) and explicit_total > 0 else 0 def replace_references( self, item: AgenticGroundTruthEntry, refs: list[Reference] ) -> AgenticGroundTruthEntry: - serialized = [ref.model_dump(by_alias=True, exclude_none=True) for ref in refs] - item._set_rag_compat_value("refs", serialized) - item._set_rag_compat_value("retrievals", None) - # Clear cached totalReferences so it will be recomputed from refs/historyAnnotations - if "totalReferences" in item.__dict__: - del item.__dict__["totalReferences"] - item._set_rag_compat_value("totalReferences", None) # Remove from plugin storage too + compat = dict(self.rag_compat_data(item)) + for legacy_key in _LEGACY_KEYS_TO_DROP: + compat.pop(legacy_key, None) + if refs: + compat[_PLUGIN_REFERENCES_KEY] = [ + ref.model_dump(by_alias=True, exclude_none=True) for ref in refs + ] + else: + compat.pop(_PLUGIN_REFERENCES_KEY, None) + item.set_plugin(_RAG_COMPAT_KIND, compat) return item def attach_reference( self, item: AgenticGroundTruthEntry, ref: Reference ) -> AgenticGroundTruthEntry: - """Attach a reference to an item via the rag-compat plugin payload. - - This is a RAG-compat concern; the generic core does not manage refs. - The ``refs`` setter on AgenticGroundTruthEntry writes to - ``plugins["rag-compat"].data`` automatically. - - Args: - item: The ground-truth item to modify in-place. - ref: The reference to attach. - - Returns: - The same item (mutated in-place) for convenience. 
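
`replace_references` above is now the single write path for references, and every write drops the legacy keys before persisting only `references`. A dict-level round-trip sketch; the real code routes the cleaned payload through `item.set_plugin(...)` rather than returning it:

```python
LEGACY_KEYS = ("refs", "retrievals", "historyAnnotations", "totalReferences",
               "synthQuestion", "editedQuestion", "answer")

def replace_references(compat: dict, refs: list[dict]) -> dict:
    """Return plugin data with legacy keys dropped and refs as the only payload."""
    cleaned = {k: v for k, v in compat.items() if k not in LEGACY_KEYS}
    if refs:
        cleaned["references"] = refs
    else:
        cleaned.pop("references", None)
    return cleaned

compat = {"refs": [{"url": "https://example.com/a"}], "totalReferences": 1}
updated = replace_references(compat, [{"url": "https://example.com/b"}])
assert updated == {"references": [{"url": "https://example.com/b"}]}
# detaching the last reference clears the key entirely
assert replace_references(updated, []) == {}
```

Because attach and detach are both expressed as "read, modify, replace", the legacy keys cannot survive any mutation, which is what makes the migration converge.
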
- """ current = list(self.refs_from_item(item)) current.append(ref) return self.replace_references(item, current) @@ -454,107 +346,9 @@ def attach_reference( def detach_reference( self, item: AgenticGroundTruthEntry, ref_url: str ) -> AgenticGroundTruthEntry: - """Detach a reference from an item by URL, using the rag-compat payload. - - This is a RAG-compat concern; the generic core does not manage refs. - - Args: - item: The ground-truth item to modify in-place. - ref_url: The URL of the reference to remove. - - Returns: - The same item (mutated in-place) for convenience. - """ remaining = [r for r in self.refs_from_item(item) if getattr(r, "url", None) != ref_url] return self.replace_references(item, remaining) - # ------------------------------------------------------------------ - # Per-tool-call retrieval state (Phase 6 — retrieval normalization) - # - # New items store references per retrieval tool call inside - # ``plugins["rag-compat"].data.retrievals``. - # Read path: per-call state first, then fall back to top-level refs. - # Write path: always to per-call state. - # ------------------------------------------------------------------ - - _UNASSOCIATED_KEY: str = "_unassociated" - - def get_retrievals(self, item: AgenticGroundTruthEntry) -> dict[str, Any]: - """Return the full retrievals dict or {} when absent.""" - compat = self.rag_compat_data(item) - retrievals = compat.get("retrievals") - return dict(retrievals) if isinstance(retrievals, dict) else {} - - def get_retrieval_candidates( - self, item: AgenticGroundTruthEntry, tool_call_id: str - ) -> list[dict[str, Any]]: - """Return candidate list for one tool call, or [].""" - retrievals = self.get_retrievals(item) - bucket = retrievals.get(tool_call_id) - if isinstance(bucket, dict): - cands = bucket.get("candidates") - return list(cands) if isinstance(cands, list) else [] - return [] - - def set_retrieval_candidates( - self, - item: AgenticGroundTruthEntry, - tool_call_id: str, - candidates: list[dict[str, Any]], - ) -> None: - """Set candidates for a single tool call (write-through to plugin data).""" - compat = self.rag_compat_data(item) - retrievals = dict(compat.get("retrievals") or {}) - retrievals[tool_call_id] = {"candidates": candidates} - item._set_rag_compat_value("retrievals", retrievals) - - def set_retrievals( - self, - item: AgenticGroundTruthEntry, - retrievals: dict[str, Any], - ) -> None: - """Replace the entire retrievals dict.""" - item._set_rag_compat_value("retrievals", retrievals) - - def has_per_call_state(self, item: AgenticGroundTruthEntry) -> bool: - """Return True when per-call retrieval state exists.""" - compat = self.rag_compat_data(item) - retrievals = compat.get("retrievals") - return isinstance(retrievals, dict) and len(retrievals) > 0 - - def get_all_candidates_flat(self, item: AgenticGroundTruthEntry) -> list[dict[str, Any]]: - """Flatten all per-call candidates into a single list. - - Read path: returns per-call candidates when present. Falls back - to converting top-level refs into candidate dicts for backward compat. 
- """ - if self.has_per_call_state(item): - result: list[dict[str, Any]] = [] - for tool_call_id, bucket in self.get_retrievals(item).items(): - if not isinstance(bucket, dict): - continue - cands = bucket.get("candidates") - if isinstance(cands, list): - for c in cands: - entry = dict(c) if isinstance(c, dict) else {} - if "toolCallId" not in entry: - entry["toolCallId"] = tool_call_id - result.append(entry) - return result - - # Backward compat: convert top-level refs to candidate shape - refs = item.refs - return [ - { - "url": getattr(r, "url", ""), - "title": getattr(r, "title", None), - "chunk": getattr(r, "content", None), - "relevance": None, - "toolCallId": None, - } - for r in refs - ] - def get_explorer_fields(self) -> list[ExplorerFieldDefinition]: return [ ExplorerFieldDefinition( @@ -563,65 +357,56 @@ def get_explorer_fields(self) -> list[ExplorerFieldDefinition]: field_type="number", sortable=True, filterable=True, - ), - ExplorerFieldDefinition( - key="rag-compat:perCallRetrievals", - label="Per-Call Retrievals", - field_type="boolean", - filterable=True, - ), + ) + ] + + def get_import_transforms(self) -> list[ImportTransform]: + return [ + ImportTransform( + name="rag-compat:normalize-legacy-payload", + description="Normalize legacy RAG fields into generic history and rag-compat references", + transform=normalize_legacy_payload_for_core_model, + ) ] def get_export_transforms(self) -> list[ExportTransform]: return [ ExportTransform( - name="rag-compat:project-legacy-export-fields", - description="Project rag-compat retrieval/reference fields into export payloads", + name="rag-compat:project-references", + description="Project rag-compat references into export payloads", transform=apply_export_projection, ) ] - def migrate_refs_to_per_call(self, item: AgenticGroundTruthEntry) -> bool: - """Migrate top-level refs into per-call state (idempotent). - - Associates refs with retrieval tool calls by matching - ``messageIndex`` to tool-call step ordering when possible. - Refs that cannot be matched go into the ``_unassociated`` bucket. + def matches_query_filter( + self, item: AgenticGroundTruthEntry, filter_key: str, filter_value: str + ) -> bool | None: + if filter_key != "refUrl": + return None + refs = self.refs_from_item(item) + return any(filter_value in (getattr(ref, "url", "") or "") for ref in refs) - Returns True if migration produced changes. 
- """ - if self.has_per_call_state(item): - return False + def get_sort_value(self, item: AgenticGroundTruthEntry, sort_key: str) -> Any | None: + if sort_key != "totalReferences": + return None + return self.reference_count(item) - refs = item.refs - if not refs: - return False - - # Build a map from step/messageIndex to tool call id - tool_calls = item.tool_calls or [] - step_to_tc: dict[int | None, str] = {} - for tc in tool_calls: - if tc.step_number is not None: - step_to_tc[tc.step_number] = tc.id - - retrievals: dict[str, dict[str, list[dict[str, Any]]]] = {} - for ref in refs: - mi = getattr(ref, "messageIndex", None) - tc_id = step_to_tc.get(mi) if mi is not None else None - key = tc_id or self._UNASSOCIATED_KEY - - if key not in retrievals: - retrievals[key] = {"candidates": []} - retrievals[key]["candidates"].append( + def get_search_documents(self, item: AgenticGroundTruthEntry) -> list[dict[str, Any]]: + docs: list[dict[str, Any]] = [] + for idx, ref in enumerate(self.refs_from_item(item)): + docs.append( { - "url": getattr(ref, "url", ""), + "id": f"{item.id}:ref:{idx}", + "url": getattr(ref, "url", None), "title": getattr(ref, "title", None), - "chunk": getattr(ref, "content", None), - "relevance": None, - "rawPayload": None, - "toolCallId": key if key != self._UNASSOCIATED_KEY else None, + "chunk": getattr(ref, "content", None) or getattr(ref, "keyExcerpt", None), } ) + return docs - self.set_retrievals(item, retrievals) - return True + def get_primary_reference_url(self, item: AgenticGroundTruthEntry) -> str | None: + refs = self.refs_from_item(item) + if not refs: + return None + first_url = getattr(refs[0], "url", None) + return first_url if isinstance(first_url, str) and first_url else None diff --git a/backend/app/services/assignment_service.py b/backend/app/services/assignment_service.py index 28d2022..7d83b52 100644 --- a/backend/app/services/assignment_service.py +++ b/backend/app/services/assignment_service.py @@ -2,7 +2,7 @@ import re from app.adapters.repos.base import GroundTruthRepo -from app.domain.models import AgenticGroundTruthEntry, AssignmentDocument, HistoryItem +from app.domain.models import AgenticGroundTruthEntry, AssignmentDocument from app.plugins import get_default_registry from app.core.errors import AssignmentConflictError from app.core.config import get_sampling_allocation @@ -606,7 +606,7 @@ async def duplicate_item( Rules: - Keep datasetName and bucket identical to the original - Generate a new id (uuid4 string) - - Copy synthQuestion, editedQuestion, answer, refs, tags, comment, history and provenance fields + - Copy tags, comment, history, plugin references, and provenance fields - Ensure the `rephrase:{original.id}` tag is present exactly once - Set status=draft; clear reviewed_at and updatedBy - Assign to requesting user (assignedTo, assignedAt) @@ -619,13 +619,9 @@ async def duplicate_item( new_tags.append(rephrase_tag) now = datetime.now(timezone.utc) - new_item = AgenticGroundTruthEntry.model_validate(original.model_dump(by_alias=True)) - new_item.history = [ - entry - if isinstance(entry, HistoryItem) - else HistoryItem.model_validate(entry.model_dump(by_alias=True)) - for entry in (new_item.history or []) - ] + new_item = AgenticGroundTruthEntry.model_validate( + original.model_dump(by_alias=True, exclude_computed_fields=True) + ) new_item.id = randomname.get_name() new_item.status = GroundTruthStatus.draft new_item.manual_tags = new_tags diff --git a/backend/app/services/duplicate_detection_service.py 
b/backend/app/services/duplicate_detection_service.py index 439533a..81a4b29 100644 --- a/backend/app/services/duplicate_detection_service.py +++ b/backend/app/services/duplicate_detection_service.py @@ -5,8 +5,8 @@ Detection strategy: - Normalize whitespace and casing for comparison -- Compare editedQuestion or synthQuestion (whichever is present) -- Compare answer content +- Compare canonical question text +- Compare canonical answer text - Only check against approved items (drafts can have temporary duplicates) """ @@ -18,6 +18,7 @@ from pydantic import BaseModel, Field, ConfigDict +from app.domain.conversation_fields import answer_text_from_item, question_text_from_item from app.domain.models import AgenticGroundTruthEntry from app.domain.enums import GroundTruthStatus @@ -53,8 +54,8 @@ def _normalize_text(text: str | None) -> str: def _get_question_text(item: AgenticGroundTruthEntry) -> str: - """Get the effective question text (edited or synth).""" - return item.edited_question or item.synth_question or "" + """Get the effective question text from conversation history.""" + return question_text_from_item(item) def _serialize_generic_value(value: object) -> str: @@ -132,8 +133,8 @@ def _items_are_duplicates( # Check for exact question match when both items expose question text if draft_question and approved_question and draft_question == approved_question: # Also check answer for stronger signal - draft_answer = _normalize_text(draft.answer) - approved_answer = _normalize_text(approved.answer) + draft_answer = _normalize_text(answer_text_from_item(draft)) + approved_answer = _normalize_text(answer_text_from_item(approved)) if draft_answer and approved_answer and draft_answer == approved_answer: return (True, "exact question and answer match") diff --git a/backend/app/services/ground_truth_update_service.py b/backend/app/services/ground_truth_update_service.py index 8a2ca69..420184e 100644 --- a/backend/app/services/ground_truth_update_service.py +++ b/backend/app/services/ground_truth_update_service.py @@ -13,17 +13,12 @@ HistoryEntry, HistoryItem, PluginPayload, - Reference, ToolCallRecord, ) -from app.plugins.pack_registry import get_rag_compat_pack from app.services.tagging_service import apply_computed_tags from app.services.validation_service import ValidationError, validate_item_for_approval -MISSING = object() - - class ETagRequiredError(Exception): """Raised when an update request omits optimistic-concurrency state.""" @@ -32,47 +27,11 @@ class ETagMismatchError(Exception): """Raised when the provided ETag no longer matches persisted state.""" -@dataclass(slots=True) -class LegacyCompatUpdate: - edited_question: str | None | object = MISSING - answer: str | None | object = MISSING - refs: list[Reference] | object = MISSING - - @dataclass(slots=True) class UpdateMutationResult: should_delete_assignment: bool = False -def read_legacy_compat_update(extras: dict[str, Any]) -> LegacyCompatUpdate: - update = LegacyCompatUpdate() - - if "editedQuestion" in extras or "edited_question" in extras: - update.edited_question = cast( - str | None, extras.get("editedQuestion", extras.get("edited_question")) - ) - - if "answer" in extras: - answer_value = extras["answer"] - if answer_value is not None and not isinstance(answer_value, str): - raise ValidationError("", "answer", "answer must be a string or null") - update.answer = cast(str | None, answer_value) - - if "refs" in extras: - refs_payload = extras["refs"] - if refs_payload is None: - update.refs = [] - elif isinstance(refs_payload, 
list): - update.refs = [ - ref if isinstance(ref, Reference) else Reference.model_validate(ref) - for ref in refs_payload - ] - else: - raise ValidationError("", "refs", "refs must be a list or null") - - return update - - def _parse_status(value: GroundTruthStatus | str | None) -> GroundTruthStatus: if value is None: raise ValidationError( @@ -100,16 +59,6 @@ def parse_history_entries(entries: Sequence[Any]) -> list[HistoryItem]: if not message: raise ValidationError("", "history", "history entries must include a non-empty msg") - refs_data = extras.get("refs") - refs_list = None - if refs_data is not None: - if not isinstance(refs_data, list): - raise ValidationError("", "history", "history refs must be a list") - refs_list = [ - ref if isinstance(ref, Reference) else Reference.model_validate(ref) - for ref in refs_data - ] - expected_behavior = extras.get("expectedBehavior", extras.get("expected_behavior")) if expected_behavior is not None and not isinstance(expected_behavior, list): raise ValidationError( @@ -122,7 +71,6 @@ def parse_history_entries(entries: Sequence[Any]) -> list[HistoryItem]: HistoryItem( role=getattr(entry, "role"), msg=message, - refs=refs_list, expected_behavior=expected_behavior, ) ) @@ -148,7 +96,6 @@ def apply_shared_update( status: GroundTruthStatus | str | None = None, approve: bool = False, actor_user_id: str, - legacy_update: LegacyCompatUpdate | None = None, clear_assignment_on_statuses: set[GroundTruthStatus] | None = None, ) -> UpdateMutationResult: now = datetime.now(timezone.utc) @@ -161,11 +108,9 @@ def apply_shared_update( if "history" in provided_fields: if history_entries is None: item.history = [] - item.totalReferences = 0 else: # HistoryItem is a subclass of HistoryEntry, so this is safe item.history = cast(list[HistoryEntry], parse_history_entries(history_entries)) - item.totalReferences = 0 if "context_entries" in provided_fields: item.context_entries = context_entries or [] @@ -194,17 +139,6 @@ def apply_shared_update( if "manual_tags" in provided_fields: item.manual_tags = manual_tags or [] - if legacy_update is not None: - if legacy_update.edited_question is not MISSING: - item.edited_question = cast(str | None, legacy_update.edited_question) - if legacy_update.answer is not MISSING: - item.answer = cast(str | None, legacy_update.answer) - if legacy_update.refs is not MISSING: - rag_compat_pack = get_rag_compat_pack() - rag_compat_pack.replace_references( - item, list(cast(list[Reference], legacy_update.refs)) - ) - if approve: item.status = GroundTruthStatus.approved item.reviewed_at = now diff --git a/backend/app/services/pii_service.py b/backend/app/services/pii_service.py index 7a0555c..cdebc7d 100644 --- a/backend/app/services/pii_service.py +++ b/backend/app/services/pii_service.py @@ -15,6 +15,7 @@ from pydantic import BaseModel, Field +from app.domain.conversation_fields import answer_text_from_item, question_text_from_item from app.domain.models import AgenticGroundTruthEntry @@ -23,7 +24,7 @@ class PIIWarning(BaseModel): item_id: str = Field(description="Item identifier") field: str = Field( - description="Field name where PII was detected (e.g., 'synthQuestion', 'history[2].msg')" + description="Field name where the PII was detected (e.g., 'history.question', 'history[2].msg')" ) pattern_type: str = Field(description="Type of PII detected ('email' or 'phone')") snippet: str = Field(description="Masked context snippet showing the detected PII") @@ -163,9 +164,8 @@ def scan_item_for_pii(item: AgenticGroundTruthEntry) -> 
list[PIIWarning]: """Scan a ground truth item for PII in all relevant fields. Phase 1 scans: - - synth_question - - edited_question - - answer + - canonical question text derived from history + - canonical answer text derived from history - comment - history[].msg @@ -195,15 +195,14 @@ def scan_nested_value(value: Any, field_name: str) -> None: for idx, nested in enumerate(value): scan_nested_value(nested, f"{field_name}[{idx}]") - # Scan primary text fields - if item.synth_question: - warnings.extend(scan_text_for_pii(item.synth_question, "synthQuestion", item_id)) + # Scan canonical conversation-derived text fields + question_text = question_text_from_item(item) + if question_text: + warnings.extend(scan_text_for_pii(question_text, "history.question", item_id)) - if item.edited_question: - warnings.extend(scan_text_for_pii(item.edited_question, "editedQuestion", item_id)) - - if item.answer: - warnings.extend(scan_text_for_pii(item.answer, "answer", item_id)) + answer_text = answer_text_from_item(item) + if answer_text: + warnings.extend(scan_text_for_pii(answer_text, "history.answer", item_id)) if item.comment: warnings.extend(scan_text_for_pii(item.comment, "comment", item_id)) diff --git a/backend/app/services/validation_service.py b/backend/app/services/validation_service.py index 4cc05f7..5d6b133 100644 --- a/backend/app/services/validation_service.py +++ b/backend/app/services/validation_service.py @@ -5,6 +5,12 @@ import asyncio import logging +from app.domain.conversation_fields import ( + answer_text_from_item, + is_non_user_role, + is_user_role, + question_text_from_item, +) from app.domain.models import AgenticGroundTruthEntry, BulkImportError, HistoryEntry from app.services.tagging_service import validate_tags_with_cache @@ -56,20 +62,20 @@ def __init__(self, errors: list[str]): def _normalized_history(item: AgenticGroundTruthEntry) -> list[HistoryEntry]: history = list(item.history or []) - question_text = item.edited_question or item.synth_question + question_text = question_text_from_item(item) + answer_text = answer_text_from_item(item) if history: - roles = {entry.role.strip().lower() for entry in history} - if "user" not in roles and question_text: + if not any(is_user_role(entry.role) for entry in history) and question_text: history.insert(0, HistoryEntry(role="user", msg=question_text)) - if "assistant" not in roles and item.answer: - history.append(HistoryEntry(role="assistant", msg=item.answer)) + if not any(is_non_user_role(entry.role) for entry in history) and answer_text: + history.append(HistoryEntry(role="agent", msg=answer_text)) return history synthesized: list[HistoryEntry] = [] if question_text: synthesized.append(HistoryEntry(role="user", msg=question_text)) - if item.answer: - synthesized.append(HistoryEntry(role="assistant", msg=item.answer)) + if answer_text: + synthesized.append(HistoryEntry(role="agent", msg=answer_text)) return synthesized @@ -87,14 +93,12 @@ def collect_approval_validation_errors(item: AgenticGroundTruthEntry) -> list[st if not history: errors.append("history must contain at least one conversation message") else: - user_messages = [entry for entry in history if entry.role.strip().lower() == "user"] - assistant_messages = [ - entry for entry in history if entry.role.strip().lower() == "assistant" - ] + user_messages = [entry for entry in history if is_user_role(entry.role)] + assistant_messages = [entry for entry in history if is_non_user_role(entry.role)] if not user_messages: errors.append("history must include at least one user 
message") if not assistant_messages: - errors.append("history must include at least one assistant message") + errors.append("history must include at least one agent message") tool_call_names = {tool.name for tool in item.tool_calls if tool.name} required_tools = [tool.name for tool in item.expected_tools.required if tool.name] diff --git a/backend/scripts/README.md b/backend/scripts/README.md index d622e8b..3eed914 100644 --- a/backend/scripts/README.md +++ b/backend/scripts/README.md @@ -1,56 +1,3 @@ # Scripts This folder contains helper scripts used during development and data ops. - -## KB CSV import workflow - -Use these two scripts to prepare and import a KB CSV into the Ground Truth Curator API. - -1) Clean the CSV (drops Japanese descriptions and rows with non-empty "Added question?"): - -```bash -uv run python scripts/clean_kb_csv.py \ - --input 'scripts/AI_Generated_Questions_500_v1_0820-1545_dataset(Sheet1).csv' \ - --output /tmp/kb_cleaned.csv -``` - -2) Import the cleaned CSV (prefix CS to `article`, build KB article URLs, POST in batches): - -```bash -uv run python scripts/import_kb_csv.py \ - --input /tmp/kb_cleaned.csv \ - --base-url http://localhost:8000 \ - --api-prefix /v1 \ - --dataset kb \ - --kb-base-url https://example.com \ - --approve \ - --batch-size 200 -``` - -Authentication: -- Bearer token header: - -```bash -uv run python scripts/import_kb_csv.py --input /tmp/kb_cleaned.csv \ - --base-url http://localhost:8000 --api-prefix /v1 --dataset kb --kb-base-url https://example.com --approve \ - --bearer-token '' -``` - -- Dev convenience header (used when AUTH_MODE=dev): - -```bash -uv run python scripts/import_kb_csv.py --input /tmp/kb_cleaned.csv \ - --base-url http://localhost:8000 --api-prefix /v1 --dataset kb --kb-base-url https://example.com --approve \ - --user-id importer -``` - -Dry-run (no POSTs; preview first 3 payloads): - -```bash -uv run python scripts/import_kb_csv.py --input /tmp/kb_cleaned.csv --dry-run -``` - -Notes: -- Cleaning detects Japanese using a Unicode-range heuristic (Hiragana/Katakana/Kanji) in `description` and removes such rows; also removes rows where "Added question?" is non-empty. -- Import normalizes the `article` field to start with `CS` and constructs references like `https://example.com/support/article/CS32540` using `--kb-base-url`. It uses `generated_question` (fallback: `description`) as the synthetic question and posts to `/v1/ground-truths` in batches. -- API errors are printed per batch, plus a final deduplicated summary. diff --git a/backend/scripts/backfill_total_references.py b/backend/scripts/backfill_total_references.py deleted file mode 100644 index 2108f10..0000000 --- a/backend/scripts/backfill_total_references.py +++ /dev/null @@ -1,365 +0,0 @@ -#!/usr/bin/env python3 -""" -Backfill script to update existing Cosmos DB documents with totalReferences field. - -This script: -1. Queries for documents missing the totalReferences field -2. Calculates totalReferences for each document -3. Updates documents in batches to avoid memory issues -4. Provides progress reporting and error handling -5. 
Can be run safely multiple times (idempotent) - -USAGE (Local Development): - python scripts/backfill_total_references.py [--batch-size 100] [--dry-run] - -USAGE (Azure Container App): - # Connect to the running container app instance - az containerapp exec --name --resource-group --command "/bin/bash" - - # Inside the container, run: - cd /app - python scripts/backfill_total_references.py --batch-size 50 - - # For dry-run validation first: - python scripts/backfill_total_references.py --dry-run - - # Monitor logs: - az containerapp logs show --name --resource-group --follow - -AZURE CONTAINER APP CONSIDERATIONS: - - Use smaller batch sizes (50-100) to avoid timeouts - - Monitor memory usage during execution - - Ensure the container has sufficient CPU/memory allocation - - Set appropriate environment variables for Cosmos DB connection - - Consider running during off-peak hours to minimize impact -""" - -import argparse -import asyncio -import logging -import sys -from datetime import datetime, timezone -from pathlib import Path -from typing import Any, Dict -from azure.cosmos.exceptions import CosmosHttpResponseError - -# Add the backend directory to Python path so we can import app modules -backend_dir = Path(__file__).parent.parent -sys.path.insert(0, str(backend_dir)) - -from app.container import container -from app.adapters.repos.cosmos_repo import CosmosGroundTruthRepo - - -logger = logging.getLogger(__name__) - - -def compute_total_references_from_doc(doc: Dict[str, Any]) -> int: - """Calculate total reference count from raw document data. - - Args: - doc: Raw document from Cosmos DB - - Returns: - Total reference count - """ - # Count refs in all history turns - history = doc.get("history", []) or [] - history_refs = 0 - - for turn in history: - if isinstance(turn, dict): - refs = turn.get("refs", []) or [] - history_refs += len(refs) - - # If no turn refs, return item-level refs count - if history_refs == 0: - refs = doc.get("refs", []) or [] - return len(refs) - - return history_refs - - -async def get_documents_missing_total_references(batch_size: int = 100) -> list[Dict[str, Any]]: - """Query for documents that don't have totalReferences field. - - Args: - batch_size: Maximum number of documents to return - - Returns: - List of documents missing totalReferences field - """ - # Initialize the Cosmos repository if not already done - if container.repo is None: - container.init_cosmos_repo() - - repo = container.repo - if isinstance(repo, CosmosGroundTruthRepo): - await repo._ensure_initialized() - - # Query for documents without totalReferences field - query = """ - SELECT * FROM c - WHERE c.docType = 'ground-truth-item' - AND NOT IS_DEFINED(c.totalReferences) - """ - - container_client = repo._gt_container - if not container_client: - raise ValueError("Cosmos container not initialized") - else: - raise ValueError("This script only works with CosmosGroundTruthRepo") - query_iterator = container_client.query_items( - query=query, enable_scan_in_query=True, max_item_count=batch_size - ) - - documents = [] - try: - async for item in query_iterator: - documents.append(item) - if len(documents) >= batch_size: - break - except Exception as e: - logger.error(f"Error querying documents: {e}") - raise - - return documents - - -async def update_document_with_total_references( - doc: Dict[str, Any], dry_run: bool = False, max_retries: int = 3 -) -> bool: - """Update a single document with totalReferences field. 
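
With references now canonical in plugin data, the count this script used to backfill is derived at read time (see `reference_count` and the `rag-compat:totalReferences` sort hook above), which is why the script can be deleted outright rather than rewritten. The derivation, reduced to a sketch:

```python
def total_references(plugin_data: dict) -> int:
    """Derive the count from the canonical references list; no stored field."""
    references = plugin_data.get("references")
    return len(references) if isinstance(references, list) else 0

assert total_references({"references": [{"url": "a"}, {"url": "b"}]}) == 2
assert total_references({}) == 0
```
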
- - Args: - doc: Document to update - dry_run: If True, don't actually update the document - - Returns: - True if update was successful, False otherwise - """ - for attempt in range(max_retries): - try: - # Calculate totalReferences - total_refs = compute_total_references_from_doc(doc) - - if dry_run: - logger.info( - f"DRY RUN: Would update document {doc.get('id')} with totalReferences={total_refs}" - ) - return True - - # Add totalReferences to document - doc["totalReferences"] = total_refs - doc["updatedAt"] = datetime.now(timezone.utc).isoformat() - - # Update in Cosmos DB - repo = container.repo - if isinstance(repo, CosmosGroundTruthRepo): - container_client = repo._gt_container - if not container_client: - raise ValueError("Cosmos container not initialized") - else: - raise ValueError("This script only works with CosmosGroundTruthRepo") - - # Use replace_item to update the document - await container_client.replace_item(item=doc["id"], body=doc) - - logger.info(f"Updated document {doc.get('id')} with totalReferences={total_refs}") - return True - - except CosmosHttpResponseError as e: - if e.status_code == 429: # Rate limited - wait_time = 2**attempt - logger.warning(f"Rate limited, retrying in {wait_time}s") - await asyncio.sleep(wait_time) - continue - elif e.status_code == 412: # Precondition failed (etag mismatch) - logger.warning(f"Document {doc['id']} was updated by another process") - return False - else: - raise - except Exception as e: - if attempt == max_retries - 1: - logger.error(f"Final attempt failed for {doc['id']}: {e}") - return False - logger.warning(f"Attempt {attempt + 1} failed, retrying: {e}") - - return False - - -async def update_documents_batch( - documents: list[Dict[str, Any]], batch_size: int = 10, max_ru_per_second: int = 400 -) -> Dict[str, int]: - """Optimized batch processing with rate limiting.""" - - stats = {"processed": 0, "updated": 0, "errors": 0, "skipped": 0} - - # Process in smaller batches to control RU consumption - for i in range(0, len(documents), batch_size): - batch = documents[i : i + batch_size] - - # Execute batch operations concurrently with semaphore - semaphore = asyncio.Semaphore(5) # Limit concurrent operations - - async def process_document(doc): - async with semaphore: - return await update_document_with_total_references(doc) - - # Process batch concurrently - batch_tasks = [process_document(doc) for doc in batch] - results = await asyncio.gather(*batch_tasks, return_exceptions=True) - - # Update statistics - for result in results: - if isinstance(result, Exception): - stats["errors"] += 1 - elif result: - stats["updated"] += 1 - stats["processed"] += 1 - - # Rate limiting: pause between batches - await asyncio.sleep(0.1) # 100ms pause - - # Log progress - logger.info(f"Processed batch {i // batch_size + 1}, Progress: {stats}") - - return stats - - -async def backfill_total_references_batch( - batch_size: int = 100, dry_run: bool = False -) -> Dict[str, int]: - """Process a batch of documents and update them with totalReferences. 
- - Args: - batch_size: Number of documents to process in this batch - dry_run: If True, don't actually update documents - - Returns: - Dictionary with processing statistics - """ - stats = {"processed": 0, "updated": 0, "errors": 0, "skipped": 0} - - try: - # Get documents missing totalReferences - documents = await get_documents_missing_total_references(batch_size) - - if not documents: - logger.info("No documents found missing totalReferences field") - return stats - - logger.info(f"Found {len(documents)} documents to update") - - batch_stats = await update_documents_batch(documents, batch_size=10) - - # Update the main stats with batch results - for key in ["processed", "updated", "errors", "skipped"]: - stats[key] = batch_stats[key] - - return stats - - except Exception as e: - logger.error(f"Error in batch processing: {e}") - stats["errors"] += 1 - return stats - - -async def run_full_migration( - batch_size: int = 100, max_batches: int | None = None, dry_run: bool = False -) -> None: - """Run the complete migration process. - - Args: - batch_size: Number of documents to process per batch - max_batches: Maximum number of batches to process (None = unlimited) - dry_run: If True, don't actually update documents - """ - logger.info("Starting totalReferences backfill migration") - logger.info(f"Batch size: {batch_size}, Max batches: {max_batches}, Dry run: {dry_run}") - - total_stats = {"processed": 0, "updated": 0, "errors": 0, "skipped": 0, "batches": 0} - - batch_count = 0 - - while True: - batch_count += 1 - - if max_batches and batch_count > max_batches: - logger.info(f"Reached maximum batch limit of {max_batches}") - break - - logger.info(f"Processing batch {batch_count}...") - - # Process batch - batch_stats = await backfill_total_references_batch(batch_size, dry_run) - - # Update totals - for key in ["processed", "updated", "errors", "skipped"]: - total_stats[key] += batch_stats[key] - total_stats["batches"] = batch_count - - # Log batch results - logger.info(f"Batch {batch_count} complete: {batch_stats}") - - # If no documents were processed, we're done - if batch_stats["processed"] == 0: - logger.info("No more documents to process") - break - - # In dry-run mode, stop after first batch to avoid infinite loop - # (since we're not actually updating documents, the query will keep finding the same ones) - if dry_run: - logger.info("Dry-run mode: stopping after first batch to prevent infinite loop") - break - - # Final summary - logger.info("Migration complete!") - logger.info(f"Total statistics: {total_stats}") - - -async def main() -> None: - """Main function to handle command line arguments and execute migration.""" - parser = argparse.ArgumentParser( - description="Backfill totalReferences field in Cosmos DB documents", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__, - ) - - parser.add_argument( - "--batch-size", - type=int, - default=100, - help="Number of documents to process per batch (default: 100)", - ) - - parser.add_argument( - "--max-batches", - type=int, - default=None, - help="Maximum number of batches to process (default: unlimited)", - ) - - parser.add_argument( - "--dry-run", action="store_true", help="Preview changes without actually updating documents" - ) - - parser.add_argument("--verbose", "-v", action="store_true", help="Enable verbose logging") - - args = parser.parse_args() - - # Configure logging - log_level = logging.DEBUG if args.verbose else logging.INFO - logging.basicConfig(level=log_level, format="%(asctime)s - %(levelname)s 
- %(message)s") - - try: - await run_full_migration( - batch_size=args.batch_size, max_batches=args.max_batches, dry_run=args.dry_run - ) - except Exception as e: - logger.error(f"Migration failed: {e}") - raise - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/backend/scripts/cosmos_export_import.py b/backend/scripts/cosmos_export_import.py deleted file mode 100644 index 1be1277..0000000 --- a/backend/scripts/cosmos_export_import.py +++ /dev/null @@ -1,511 +0,0 @@ -#!/usr/bin/env python3 -""" -Cosmos DB Export/Import Script -============================== - -This script exports documents from a source Azure Cosmos DB container and imports them -into a target container with hierarchical partition keys (HPK). It's designed for -migrating data between Cosmos DB instances (e.g., from cloud to local emulator). - -FEATURES: -- Exports data in paginated JSONL format for memory efficiency -- Supports hierarchical partition keys (/datasetName, /bucket) -- Concurrent/bulk import with retry logic for 429 throttling -- Dry-run mode for validation without writing -- Flexible missing partition key policies - -USAGE: -1. Configure environment variables in '.env' file (use sample_cosmos_export_import.env as template) -2. Run: python cosmos_export_import.py - -CONFIGURATION: -All settings are loaded from '.env' file: -- Source/target Cosmos DB connection strings and credentials -- Partition key paths (default: ["/datasetName", "/bucket"]) -- Batch sizes, concurrency settings -- DRY_RUN mode for validation only - - Note with DRY_RUN=true, export to jsonl files will still occur, - hence allowing an export to file without importing into another instance -- Missing partition key handling policy - -EXAMPLES: -# Export from cloud to local emulator (dry-run first) -DRY_RUN=true python cosmos_export_import.py - -# Actual migration -DRY_RUN=false python cosmos_export_import.py - -OUTPUT: -- Creates ./cosmos_export/ directory with paginated JSONL files -- Each page contains up to EXPORT_PAGE_SIZE documents -- Import processes files in batches of IMPORT_batch_SIZE - -ERROR HANDLING: -- Automatic retry with exponential backoff for 429 (throttling) -- Configurable missing partition key policies: error/skip/default -- Detailed logging of progress and errors - -NOTE: Ensure target container has sufficient RU/s to avoid throttling during import. 
-""" - -import os -import json -from pathlib import Path -import time -from typing import List, Dict, Any, Optional, Tuple -from dotenv import load_dotenv -from azure.cosmos import CosmosClient, PartitionKey, exceptions -from azure.cosmos.exceptions import CosmosHttpResponseError -from concurrent.futures import ThreadPoolExecutor, as_completed -from azure.identity import DefaultAzureCredential - -# ------------- Configuration ------------- - - -def require_env(name: str) -> str: - val = os.getenv(name) - if not val: # catches None and empty string - raise RuntimeError(f"Missing required environment variable: {name}") - return val - - -DOTENV_PATH = Path(".env") -load_dotenv(dotenv_path=DOTENV_PATH) - -SRC_ACCOUNT_URI = require_env("SRC_ACCOUNT_URI") -SRC_DATABASE = require_env("SRC_DATABASE") -SRC_CONTAINER = require_env("SRC_CONTAINER") - -DST_ACCOUNT_URI = require_env("DST_ACCOUNT_URI") -DST_DATABASE = require_env("DST_DATABASE") -DST_CONTAINER = require_env("DST_CONTAINER") - - -def is_dst_cosmos_emulator_in_use() -> bool: - """Detect if Cosmos DB emulator is in use based on endpoint URL.""" - return "localhost" in DST_ACCOUNT_URI or "127.0.0.1" in DST_ACCOUNT_URI - - -if is_dst_cosmos_emulator_in_use(): - DST_EMULATOR_KEY = require_env("DST_EMULATOR_KEY") - -# HPK paths -DST_PARTITION_KEY_PATHS_RAW = require_env("DST_PARTITION_KEY_PATHS") -try: - DST_PARTITION_KEY_PATHS = json.loads(DST_PARTITION_KEY_PATHS_RAW) - if not isinstance(DST_PARTITION_KEY_PATHS, list): - raise ValueError("DST_PARTITION_KEY_PATHS must be a JSON list") -except (json.JSONDecodeError, ValueError) as e: - raise RuntimeError( - f"Invalid DST_PARTITION_KEY_PATHS format: {e}. Expected JSON list like: ['/datasetName', '/bucket']" - ) - -OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./cosmos_export") -EXPORT_PAGE_SIZE = int(os.getenv("EXPORT_PAGE_SIZE", "500")) -IMPORT_BATCH_SIZE = int(os.getenv("IMPORT_BATCH_SIZE", "200")) -BULK_MODE = os.getenv("BULK_MODE", "true").lower() == "true" -DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true" - -MISSING_PK_POLICY = os.getenv("MISSING_PK_POLICY", "error").lower() -DEFAULT_PK_VALUES_RAW = os.getenv("DEFAULT_PK_VALUES", '["UNKNOWN","DEFAULT_BUCKET"]') - -# Retry/backoff -MAX_RETRY_ATTEMPTS = 10 -RETRY_BACKOFF_BASE = 0.5 # seconds - -CONCURRENCY = int(os.getenv("CONCURRENCY", "32")) # number of parallel upserts - - -# ------------- Helpers ------------- - - -def ensure_dir(path: str): - if not os.path.exists(path): - os.makedirs(path) - - -def exponential_backoff(attempt: int) -> float: - return min(RETRY_BACKOFF_BASE * (2**attempt), 30.0) - - -def log(msg: str): - print(f"[cosmos-migrate] {msg}") - - -def transform_document(doc: Dict[str, Any]) -> Dict[str, Any]: - """ - Remove Cosmos system props; keep content unchanged. - Add transformations here if needed later. - """ - new_doc = dict(doc) - for sys_field in ["_rid", "_ts", "_self", "_etag", "_attachments"]: - new_doc.pop(sys_field, None) - return new_doc - - -def get_value_by_path(doc: Dict[str, Any], path: str) -> Any: - """ - Extract value from document following a path like "/bucket". - """ - parts = path.strip("/").split("/") - cur = doc - for p in parts: - if not isinstance(cur, dict) or p not in cur: - return None - cur = cur[p] - return cur - - -def compute_hpk_values(doc: Dict[str, Any], paths: List[str]) -> Tuple[List[Any], List[int]]: - """ - Returns (values_list, missing_indices) for hierarchical partition key paths. 
- """ - values = [] - missing = [] - for i, path in enumerate(paths): - val = get_value_by_path(doc, path) - if val is None: - values.append(None) - missing.append(i) - else: - values.append(val) - return values, missing - - -def parse_default_pk_values(raw: str, count: int) -> List[Any]: - try: - vals = json.loads(raw) - except Exception: - vals = [] - if len(vals) < count: - vals = vals + [None] * (count - len(vals)) - elif len(vals) > count: - vals = vals[:count] - return vals - - -DEFAULT_PK_VALUES = parse_default_pk_values(DEFAULT_PK_VALUES_RAW, len(DST_PARTITION_KEY_PATHS)) - - -def resolve_missing_hpk(values: List[Any], missing_indices: List[int]) -> Optional[List[Any]]: - """ - Apply MISSING_PK_POLICY to fill or handle missing HPK components. - """ - if not missing_indices: - return values - - if MISSING_PK_POLICY == "error": - return None - elif MISSING_PK_POLICY == "skip": - return None - elif MISSING_PK_POLICY == "default": - for i in missing_indices: - default_val = DEFAULT_PK_VALUES[i] - if default_val is None: - return None - values[i] = default_val - return values - else: - return None - - -def upsert_with_retry(container, doc): - attempts = 0 - while True: - try: - # Non-bulk path: let SDK infer HPK from doc fields - container.upsert_item(doc) - return True - except CosmosHttpResponseError as e: - # 429 (throttled) -> backoff, then retry - if getattr(e, "status_code", None) == 429 and attempts < MAX_RETRY_ATTEMPTS: - attempts += 1 - delay = min(RETRY_BACKOFF_BASE * (2**attempts), 30.0) - log( - f"Throttled (429) on id={doc.get('id')}. Backing off {delay:.1f}s (attempt {attempts}/{MAX_RETRY_ATTEMPTS})" - ) - time.sleep(delay) - continue - else: - # Bubble up any non-retryable errors - raise - - -# ------------- Export ------------- - - -def export_cosmos_container_to_jsonl( - client: CosmosClient, - database_name: str, - container_name: str, - output_dir: str, - page_size: int = 500, -) -> str: - """ - Export all documents from a Cosmos DB container to paginated JSONL files. - Uses by_page() for robust continuation handling. - """ - log(f"Exporting from {SRC_ACCOUNT_URI}:{database_name}/{container_name} ...") - ensure_dir(output_dir) - - database = client.get_database_client(database_name) - container = database.get_container_client(container_name) - - query = "SELECT * FROM c" - page_index = 1 - total_docs = 0 - - try: - iterator = container.query_items( - query=query, - enable_cross_partition_query=True, - max_item_count=page_size, - ).by_page() - except CosmosHttpResponseError as e: - log(f"Query initialization error: {e}") - raise - - for page in iterator: - docs = list(page) - if not docs: - break - page_file = os.path.join(output_dir, f"{container_name}_page_{page_index}.jsonl") - with open(page_file, "w", encoding="utf-8") as f: - for doc in docs: - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - total_docs += len(docs) - log(f"Wrote {len(docs)} docs to {page_file}") - page_index += 1 - - log(f"Export complete: {total_docs} documents across {page_index - 1} file(s).") - return output_dir - - -# ------------- Import ------------- - - -def maybe_create_target_container( - client: CosmosClient, - database_name: str, - container_name: str, - partition_key_paths: List[str], - throughput: Optional[int] = None, -): - """ - Create database & container if they do not exist, with hierarchical PK definition. 
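-
-    Example (illustrative): partition_key_paths=["/datasetName", "/bucket"]
-    creates the container with PartitionKey(path=["/datasetName", "/bucket"],
-    kind="MultiHash"), i.e. a two-level hierarchical partition key.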
- """ - log(f"Ensuring target DB '{database_name}' and container '{container_name}' exist ...") - db_client = client.create_database_if_not_exists(id=database_name) - - try: - db_client.create_container_if_not_exists( - id=container_name, - partition_key=PartitionKey(path=partition_key_paths, kind="MultiHash"), - offer_throughput=throughput, - ) - log("Target container is ready.") - except exceptions.CosmosResourceExistsError: - log("Target container already exists.") - except CosmosHttpResponseError as e: - log(f"Failed to create container: {e}") - raise - - -def read_jsonl_files(folder: str, prefix: str) -> List[str]: - files = [] - for name in sorted(os.listdir(folder)): - if name.startswith(prefix) and name.endswith(".jsonl"): - files.append(os.path.join(folder, name)) - return files - - -def summarize_missing_hpk_components( - source_folder: str, prefix: str, partition_key_paths: List[str] -) -> None: - """ - DRY_RUN validator: counts and reports missing HPK components without writing. - """ - files = read_jsonl_files(source_folder, prefix) - if not files: - log(f"No JSONL files found in {source_folder} with prefix '{prefix}'") - return - - total = 0 - missing_counts = [0] * len(partition_key_paths) - - for file in files: - with open(file, "r", encoding="utf-8") as f: - for line in f: - doc = json.loads(line) - _, missing = compute_hpk_values(doc, partition_key_paths) - total += 1 - for i in missing: - missing_counts[i] += 1 - - log(f"Validation summary: checked {total} docs.") - for i, path in enumerate(partition_key_paths): - log(f" Path {path}: missing in {missing_counts[i]} docs") - - -def import_jsonl_to_cosmos( - client: CosmosClient, - database_name: str, - container_name: str, - source_folder: str, - source_prefix: Optional[str], - partition_key_paths: List[str], - batch_size: int = 200, - bulk_mode: bool = True, -): - """ - Import JSONL files into target Cosmos container with hierarchical partition keys. 
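-
-    Documents are buffered up to `batch_size`; each full batch is either
-    counted and logged (DRY_RUN=true) or written via write_batch(), which
-    upserts concurrently when bulk_mode is enabled.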
-    """
-    db = client.get_database_client(database_name)
-    container = db.get_container_client(container_name)
-
-    prefix = source_prefix or container_name
-    files = read_jsonl_files(source_folder, prefix)
-    if not files:
-        log(f"No JSONL files found in {source_folder} with prefix '{prefix}'")
-        return
-
-    total_written = 0
-    total_skipped = 0
-
-    for file in files:
-        log(f"Importing from {file} to {DST_ACCOUNT_URI}:{database_name}/{container_name} ...")
-        batch: List[Tuple[Dict[str, Any], List[Any]]] = []
-
-        with open(file, "r", encoding="utf-8") as f:
-            for line in f:
-                doc = json.loads(line)
-                doc = transform_document(doc)
-
-                hpk_values, missing = compute_hpk_values(doc, partition_key_paths)
-                if missing:
-                    resolved = resolve_missing_hpk(hpk_values, missing)
-                    if resolved is None:
-                        total_skipped += 1
-                        log(
-                            f"Skipped doc id={doc.get('id')} due to missing HPK components at indices {missing}"
-                        )
-                        continue
-                    else:
-                        hpk_values = resolved
-
-                batch.append((doc, hpk_values))
-
-                if len(batch) >= batch_size:
-                    if DRY_RUN:
-                        total_written += len(batch)  # pretend write
-                        log(f"[DRY_RUN] Would write {len(batch)} docs")
-                    else:
-                        written = write_batch(container, batch, bulk_mode=bulk_mode)
-                        total_written += written
-                    batch = []
-
-        if batch:
-            if DRY_RUN:
-                total_written += len(batch)
-                log(f"[DRY_RUN] Would write {len(batch)} docs")
-            else:
-                written = write_batch(container, batch, bulk_mode=bulk_mode)
-                total_written += written
-
-    log(
-        f"Import complete: {total_written} documents {'validated' if DRY_RUN else 'written'}, {total_skipped} skipped (policy={MISSING_PK_POLICY})."
-    )
-
-
-def write_batch(
-    container, docs_with_pk: List[Tuple[Dict[str, Any], List[Any]]], bulk_mode: bool = True
-) -> int:
-    """
-    Write a batch of documents with retry on 429.
-    For HPK, partition key is a list in the same order as paths.
-    If bulk_mode is True, use concurrent upserts (no explicit partition_key kwarg).
-    """
-    # The SDK has no container.bulk API, so bulk_mode is emulated with concurrent upserts.
-    if bulk_mode:
-        log(f"Using concurrent upserts: CONCURRENCY={CONCURRENCY}, batch_size={len(docs_with_pk)}")
-        total_success = 0
-        # Kick off upserts in parallel
-        with ThreadPoolExecutor(max_workers=CONCURRENCY) as tp:
-            futures = [tp.submit(upsert_with_retry, container, d) for (d, _pk_list) in docs_with_pk]
-            for fut in as_completed(futures):
-                try:
-                    if fut.result():
-                        total_success += 1
-                except CosmosHttpResponseError as e:
-                    log(f"Upsert failed: {e}")
-                    # If desired, you can collect failed docs and retry sequentially here.
-                    # For now, we just log and continue to the next future.
-                    continue
-        log(f"Concurrent upserts wrote {total_success}/{len(docs_with_pk)} docs")
-        return total_success
-
-    # Fallback: sequential upserts (non-bulk)
-    success = 0
-    for d, _pk_list in docs_with_pk:
-        upsert_with_retry(container, d)
-        success += 1
-    log(f"Sequential upserts wrote {success}/{len(docs_with_pk)} docs")
-    return success
-
-
-# ------------- Main orchestration -------------
-
-
-def main():
-    # Build one DefaultAzureCredential and reuse it.
-    # For user-assigned managed identity, set the credential's managed identity
-    # client id from the AZURE_CLIENT_ID environment variable.
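-    # (azure-identity reads AZURE_CLIENT_ID from the environment on its own,
-    # so no explicit managed_identity_client_id argument is required here.)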
- aad_credential = DefaultAzureCredential() - - # Source client (AAD) - src_client = CosmosClient(SRC_ACCOUNT_URI, credential=aad_credential, logging_enable=True) - - # Target client (emulator or AAD) - if is_dst_cosmos_emulator_in_use(): - log("Using Cosmos Emulator for target client") - dst_client = CosmosClient(DST_ACCOUNT_URI, credential=DST_EMULATOR_KEY, logging_enable=True) - else: - dst_client = CosmosClient(DST_ACCOUNT_URI, credential=aad_credential, logging_enable=True) - - # 1) Export - export_cosmos_container_to_jsonl( - client=src_client, - database_name=SRC_DATABASE, - container_name=SRC_CONTAINER, - output_dir=OUTPUT_DIR, - page_size=EXPORT_PAGE_SIZE, - ) - - # 2) Ensure target container exists (HPK-aware) - maybe_create_target_container( - client=dst_client, - database_name=DST_DATABASE, - container_name=DST_CONTAINER, - partition_key_paths=DST_PARTITION_KEY_PATHS, - throughput=None, # set higher RU/s temporarily if you see 429s - ) - - # 3) Optional: summarize_missing_hpk_components (no writes) - if DRY_RUN: - summarize_missing_hpk_components(OUTPUT_DIR, SRC_CONTAINER, DST_PARTITION_KEY_PATHS) - - # 4) Import with HPK mapping - import_jsonl_to_cosmos( - client=dst_client, - database_name=DST_DATABASE, - container_name=DST_CONTAINER, - source_folder=OUTPUT_DIR, - source_prefix=SRC_CONTAINER, # files are named using source container - partition_key_paths=DST_PARTITION_KEY_PATHS, - batch_size=IMPORT_BATCH_SIZE, - bulk_mode=BULK_MODE, - ) - - -if __name__ == "__main__": - main() diff --git a/backend/scripts/init_seed_data.py b/backend/scripts/init_seed_data.py index e5043ad..1faca3e 100644 --- a/backend/scripts/init_seed_data.py +++ b/backend/scripts/init_seed_data.py @@ -24,7 +24,7 @@ def _build_item(dataset: str, idx: int) -> Any: - from app.domain.models import GroundTruthItem, Reference + from app.domain.models import AgenticGroundTruthEntry, Reference from app.domain.enums import GroundTruthStatus # Vary some fields for realism while keeping validation simple @@ -45,15 +45,23 @@ def _build_item(dataset: str, idx: int) -> Any: "id": f"{dataset}-q{idx:04d}", "datasetName": dataset, "status": GroundTruthStatus.draft.value, - "synthQuestion": f"What is item {idx} about in dataset '{dataset}'?", - "refs": [ - Reference(url=f"https://example.com/{dataset}/{idx}").model_dump( - mode="json", by_alias=True - ) - ], - "tags": tags, + "history": [{"role": "user", "msg": f"What is item {idx} about in dataset '{dataset}'?"}], + "plugins": { + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": { + "references": [ + Reference(url=f"https://example.com/{dataset}/{idx}").model_dump( + mode="json", by_alias=True + ) + ] + }, + } + }, + "manualTags": tags, } - return GroundTruthItem.model_validate(data) + return AgenticGroundTruthEntry.model_validate(data) def _default_registry_tags() -> list[str]: diff --git a/backend/scripts/update_greetings_answer.py b/backend/scripts/update_greetings_answer.py deleted file mode 100644 index 49612b8..0000000 --- a/backend/scripts/update_greetings_answer.py +++ /dev/null @@ -1,344 +0,0 @@ -#!/usr/bin/env python3 -""" -Cosmos DB Greetings Answer Update Script -========================================= - -This script updates the answer field from NO_ANSWER to GREETING for all items -in the ground_truth container that have a tag set to "intent:greetings". 
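-
-Example transition (illustrative):
-  before: {"id": "gt-1", "manualTags": ["intent:greetings"], "answer": "NO_ANSWER"}
-  after:  {"id": "gt-1", "manualTags": ["intent:greetings"], "answer": "GREETING"}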
- -FEATURES: -- Connects to Azure Cosmos DB using DefaultAzureCredential -- Supports local Cosmos DB emulator for dev environment -- Uses Cosmos DB patch operations for Azure (cost-efficient, lower RU consumption) -- Uses upsert operations for emulator (patch not supported on emulator) -- Concurrent batch processing for efficient bulk updates -- Dry-run mode to preview changes without modifying data -- Automatic retry with exponential backoff for 429 throttling -- Progress reporting and statistics - -USAGE: -1. Configure environment variables in '.env' file -2. Run: python update_greetings_answer.py - -CONFIGURATION: -Environment variables (set in .env): -- COSMOS_ACCOUNT_URI: Cosmos DB endpoint URL -- COSMOS_DATABASE: Database name -- COSMOS_CONTAINER: Container name (default: ground_truth) -- COSMOS_EMULATOR_KEY: Required if using local emulator (localhost/127.0.0.1) -- DRY_RUN: Set to "true" to preview without modifying (default: false) -- CONCURRENCY: Number of parallel updates (default: 32) -- BATCH_SIZE: Items per progress update (default: 100) - -EXAMPLES: -# Preview changes (dry-run) -DRY_RUN=true python update_greetings_answer.py - -# Execute actual update -DRY_RUN=false python update_greetings_answer.py - -COST OPTIMIZATION: -- Uses patch operations for Azure instead of upsert (typically 50% less RUs) -- Automatically falls back to upsert for emulator (patch not supported) -- Query selects only required fields (id, datasetName, bucket, answer) -- Concurrent processing maximizes throughput without extra cost -""" - -import os -import time -from pathlib import Path -from typing import Any -from dotenv import load_dotenv -from azure.cosmos import CosmosClient -from azure.cosmos.exceptions import CosmosHttpResponseError -from azure.identity import DefaultAzureCredential -from concurrent.futures import ThreadPoolExecutor, as_completed - - -# ------------- Configuration ------------- - - -def require_env(name: str) -> str: - val = os.getenv(name) - if not val: # catches None and empty string - raise RuntimeError(f"Missing required environment variable: {name}") - return val - - -DOTENV_PATH = Path(".env") -load_dotenv(dotenv_path=DOTENV_PATH) - -COSMOS_ACCOUNT_URI = require_env("COSMOS_ACCOUNT_URI") -COSMOS_DATABASE = require_env("COSMOS_DATABASE") -COSMOS_CONTAINER = os.getenv("COSMOS_CONTAINER", "ground_truth") - -DRY_RUN = os.getenv("DRY_RUN", "false").lower() == "true" - -# Retry/backoff settings -MAX_RETRY_ATTEMPTS = 10 -RETRY_BACKOFF_BASE = 0.5 # seconds - -# Concurrency settings -CONCURRENCY = int(os.getenv("CONCURRENCY", "32")) # parallel updates -BATCH_SIZE = int(os.getenv("BATCH_SIZE", "100")) # items per progress update - - -# ------------- Helpers ------------- - - -def is_cosmos_emulator_in_use() -> bool: - """Detect if Cosmos DB emulator is in use based on endpoint URL.""" - return "localhost" in COSMOS_ACCOUNT_URI or "127.0.0.1" in COSMOS_ACCOUNT_URI - - -def log(msg: str): - """Log message with prefix.""" - print(f"[update-greetings] {msg}") - - -def extract_partition_key(item: dict[str, Any]) -> list[Any]: - """ - Extract hierarchical partition key values from item. - Assumes HPK structure: [datasetName, bucket] - """ - dataset_name = item.get("datasetName") - bucket = item.get("bucket") - return [dataset_name, bucket] - - -def patch_item_with_retry(container, item_id: str, partition_key: list[Any]) -> bool: - """ - Patch a single item's answer field with retry logic for 429 throttling. - Uses patch operation for cost efficiency (lower RU consumption than upsert). 
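-    Retry delays grow as min(RETRY_BACKOFF_BASE * 2**attempts, 30.0),
-    i.e. 1s, 2s, 4s, ... capped at 30s per attempt.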
- Returns True if successful. - """ - attempts = 0 - while True: - try: - # Patch operation - only updates the 'answer' field - container.patch_item( - item=item_id, - partition_key=partition_key, - patch_operations=[{"op": "replace", "path": "/answer", "value": "GREETING"}], - ) - return True - except CosmosHttpResponseError as e: - # 429 (throttled) -> backoff, then retry - if getattr(e, "status_code", None) == 429 and attempts < MAX_RETRY_ATTEMPTS: - attempts += 1 - delay = min(RETRY_BACKOFF_BASE * (2**attempts), 30.0) - log( - f"Throttled (429) on id={item_id}. " - f"Backing off {delay:.1f}s (attempt {attempts}/{MAX_RETRY_ATTEMPTS})" - ) - time.sleep(delay) - continue - else: - # Bubble up any non-retryable errors - log(f"Failed to patch item id={item_id}: {e}") - raise - - -def upsert_item_with_retry(container, item: dict[str, Any]) -> bool: - """ - Upsert a single item with retry logic for 429 throttling. - Used for emulator since patch operations are not supported. - Returns True if successful. - """ - attempts = 0 - while True: - try: - container.upsert_item(item) - return True - except CosmosHttpResponseError as e: - # 429 (throttled) -> backoff, then retry - if getattr(e, "status_code", None) == 429 and attempts < MAX_RETRY_ATTEMPTS: - attempts += 1 - delay = min(RETRY_BACKOFF_BASE * (2**attempts), 30.0) - log( - f"Throttled (429) on id={item.get('id')}. " - f"Backing off {delay:.1f}s (attempt {attempts}/{MAX_RETRY_ATTEMPTS})" - ) - time.sleep(delay) - continue - else: - # Bubble up any non-retryable errors - log(f"Failed to upsert item id={item.get('id')}: {e}") - raise - - -def patch_batch_concurrent( - container, - items: list[dict[str, Any]], - concurrency: int, - use_emulator: bool = False, -) -> int: - """ - Update a batch of items concurrently using ThreadPoolExecutor. - Uses patch operations for Azure, upsert for emulator. - Returns count of successfully updated items. - """ - if not items: - return 0 - - operation = "Upserting" if use_emulator else "Patching" - log(f"{operation} {len(items)} items with concurrency={concurrency}") - items_updated = 0 - - with ThreadPoolExecutor(max_workers=concurrency) as executor: - if use_emulator: - # Emulator: use upsert (patch not supported) - # Prepare items with updated answer field - for item in items: - item["answer"] = "GREETING" - - future_to_item = { - executor.submit(upsert_item_with_retry, container, item): item for item in items - } - else: - # Azure: use patch for cost efficiency - future_to_item = { - executor.submit( - patch_item_with_retry, container, item["id"], extract_partition_key(item) - ): item - for item in items - } - - # Collect results as they complete - for future in as_completed(future_to_item): - item = future_to_item[future] - try: - if future.result(): - items_updated += 1 - if items_updated % BATCH_SIZE == 0: - log(f"Progress: {items_updated}/{len(items)} items updated") - except Exception as e: - log(f"Failed to update item id={item.get('id')}: {e}") - continue - - return items_updated - - -def update_greetings_answer( - client: CosmosClient, - database_name: str, - container_name: str, - dry_run: bool = False, - use_emulator: bool = False, -) -> tuple[int, int]: - """ - Update answer field from NO_ANSWER to GREETING for items with intent:greetings tag. - Uses patch operations for Azure or upsert for emulator. 
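-    The emulator path re-upserts whole documents, so its query selects all
-    fields; the Azure path patches in place and selects only id, datasetName,
-    bucket and answer to minimize RU consumption.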
- - Returns: - Tuple of (items_matched, items_updated) - """ - operation = "upsert" if use_emulator else "patch" - log( - f"Connecting to {COSMOS_ACCOUNT_URI}:{database_name}/{container_name} (using {operation} operations)" - ) - - database = client.get_database_client(database_name) - container = database.get_container_client(container_name) - - # Query for items with intent:greetings tag and answer = NO_ANSWER - # For emulator: select all fields since we need full document for upsert - # For Azure: select only necessary fields to minimize RU consumption - if use_emulator: - query = """ - SELECT * FROM c - WHERE ARRAY_CONTAINS(c.manualTags, "intent:greetings") - AND c.answer = "NO_ANSWER" - """ - else: - query = """ - SELECT c.id, c.datasetName, c.bucket, c.answer FROM c - WHERE ARRAY_CONTAINS(c.manualTags, "intent:greetings") - AND c.answer = "NO_ANSWER" - """ - - log("Querying for items with 'intent:greetings' tag and answer='NO_ANSWER'...") - - try: - items = list( - container.query_items( - query=query, - enable_cross_partition_query=True, - ) - ) - except CosmosHttpResponseError as e: - log(f"Query error: {e}") - raise - - items_matched = len(items) - log(f"Found {items_matched} items matching criteria") - - if items_matched == 0: - return 0, 0 - - if dry_run: - operation = "upserted" if use_emulator else "patched" - log(f"[DRY RUN] Items that would be {operation}:") - for item in items: - log(f" - id: {item.get('id')}, answer: {item.get('answer')} -> GREETING") - log(f"[DRY RUN] Total items that would be {operation}: {items_matched}") - return items_matched, 0 - - # Execute concurrent batch update (patch for Azure, upsert for emulator) - items_updated = patch_batch_concurrent(container, items, CONCURRENCY, use_emulator) - - operation = "upserted" if use_emulator else "patched" - log(f"Update complete: {items_updated}/{items_matched} items {operation} successfully") - return items_matched, items_updated - - -# ------------- Main ------------- - - -def main(): - """Main entry point.""" - mode = "DRY RUN" if DRY_RUN else "LIVE" - use_emulator = is_cosmos_emulator_in_use() - log(f"Starting update script in {mode} mode") - - # Build credential - aad_credential = DefaultAzureCredential() - - # Create Cosmos client - if use_emulator: - log("Using Cosmos Emulator (will use upsert operations - patch not supported)") - emulator_key = require_env("COSMOS_EMULATOR_KEY") - client = CosmosClient(COSMOS_ACCOUNT_URI, credential=emulator_key, logging_enable=True) - else: - log("Using Azure Cosmos DB with DefaultAzureCredential (will use patch operations)") - client = CosmosClient(COSMOS_ACCOUNT_URI, credential=aad_credential, logging_enable=True) - - # Execute update - items_matched, items_updated = update_greetings_answer( - client=client, - database_name=COSMOS_DATABASE, - container_name=COSMOS_CONTAINER, - dry_run=DRY_RUN, - use_emulator=use_emulator, - ) - - # Summary - operation = "upserted" if use_emulator else "patched" - log("=" * 60) - log("Summary:") - log(f" Mode: {mode}") - log(f" Environment: {'Emulator' if use_emulator else 'Azure'}") - log(f" Operation: {'upsert' if use_emulator else 'patch'}") - log(f" Items matched: {items_matched}") - if DRY_RUN: - log(f" Items that would be {operation}: {items_matched}") - else: - log(f" Items {operation}: {items_updated}") - if items_matched > 0: - success_rate = (items_updated / items_matched) * 100 - log(f" Success rate: {success_rate:.1f}%") - log("=" * 60) - - -if __name__ == "__main__": - main() diff --git 
a/backend/scripts/update_greetings_answer_sample.env b/backend/scripts/update_greetings_answer_sample.env deleted file mode 100644 index 897080b..0000000 --- a/backend/scripts/update_greetings_answer_sample.env +++ /dev/null @@ -1,72 +0,0 @@ -# Sample Environment Configuration for update_greetings_answer.py -# Copy this file to .env and update with your actual values - -# ============================================================================ -# COSMOS DB CONNECTION -# ============================================================================ - -# Azure Cosmos DB account URI -# For Azure: https://your-account.documents.azure.com:443/ -# For local emulator: https://localhost:8081/ -COSMOS_ACCOUNT_URI=https://localhost:8081/ - -# Database name -COSMOS_DATABASE=your-database-name - -# Container name (defaults to "ground_truth" if not specified) -COSMOS_CONTAINER=ground_truth - - -# ============================================================================ -# COSMOS DB AUTHENTICATION -# ============================================================================ - -# Cosmos DB Emulator Key (only required for local emulator) -# Standard emulator key (localhost/127.0.0.1 detection is automatic) -COSMOS_EMULATOR_KEY=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw== - -# For Azure Cosmos DB: Uses DefaultAzureCredential automatically -# Ensure you're logged in via: az login -# Or set up managed identity / service principal environment variables - - -# ============================================================================ -# SCRIPT BEHAVIOR -# ============================================================================ - -# Dry-run mode: Set to "true" to preview changes without modifying data -# Set to "false" to execute actual updates -DRY_RUN=true - - -# ============================================================================ -# PERFORMANCE TUNING -# ============================================================================ - -# Number of concurrent patch operations (adjust based on your RU/s provisioning) -# Higher values = faster updates but may cause throttling if RUs are limited -# Recommended: 16-32 for 400-1000 RU/s, 64+ for higher RU/s -CONCURRENCY=32 - -# Items per progress log update -BATCH_SIZE=100 - - -# ============================================================================ -# EXAMPLE CONFIGURATIONS -# ============================================================================ - -# --- Local Development (Cosmos Emulator) --- -# COSMOS_ACCOUNT_URI=https://localhost:8081/ -# COSMOS_DATABASE=gtc-dev -# COSMOS_CONTAINER=ground_truth -# COSMOS_EMULATOR_KEY=C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw== -# DRY_RUN=true -# CONCURRENCY=16 - -# --- Azure Production --- -# COSMOS_ACCOUNT_URI=https://your-prod-account.documents.azure.com:443/ -# COSMOS_DATABASE=gtc-prod -# COSMOS_CONTAINER=ground_truth -# DRY_RUN=false -# CONCURRENCY=64 diff --git a/backend/tests/integration/test_assignments_assign_single_cosmos.py b/backend/tests/integration/test_assignments_assign_single_cosmos.py index 8b40e3f..d62189b 100644 --- a/backend/tests/integration/test_assignments_assign_single_cosmos.py +++ b/backend/tests/integration/test_assignments_assign_single_cosmos.py @@ -17,7 +17,9 @@ def make_item( "id": item_id, "datasetName": dataset, "bucket": bucket_id, - "synthQuestion": "What is the meaning of life?", + "history": [ + {"role": "user", "msg": "What is the meaning of life?"}, + ], "status": 
status, } if assigned_to: diff --git a/backend/tests/integration/test_assignments_cosmos.py b/backend/tests/integration/test_assignments_cosmos.py index 25dde5d..317b626 100644 --- a/backend/tests/integration/test_assignments_cosmos.py +++ b/backend/tests/integration/test_assignments_cosmos.py @@ -1,11 +1,11 @@ -from httpx import AsyncClient -from pydantic import TypeAdapter -import pytest import uuid +from typing import Any, cast + +import pytest +from httpx import AsyncClient -from app.domain.models import AgenticGroundTruthEntry -from app.container import container from app.adapters.repos.cosmos_repo import CosmosGroundTruthRepo +from app.container import container def make_item(dataset: str) -> dict: @@ -14,8 +14,9 @@ def make_item(dataset: str) -> dict: "datasetName": dataset, # Use NIL UUID for explicit bucket in tests "bucket": str(uuid.UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": "Q?", - "samplingBucket": 0, + "history": [ + {"role": "user", "msg": "Q?"}, + ], "assignedTo": None, } @@ -56,13 +57,18 @@ async def test_assigned_ground_truths_update_and_approve(async_client: AsyncClie r = await async_client.post("/v1/assignments/self-serve", json=body, headers=user_headers) assert r.status_code == 200 data: dict = r.json() - # mypy: data.get returns Optional[Any]; use default [] to ensure list type - adocs = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python(data.get("assigned") or []) + adocs = cast(list[dict[str, Any]], data.get("assigned") or []) assert adocs and len(adocs) >= 1 - gt_id = adocs[0].id + gt_id = cast(str, adocs[0]["id"]) + etag = cast(str | None, adocs[0].get("_etag")) + assert etag # SME approves via assignments PUT - payload = {"approve": True, "answer": "ans", "etag": adocs[0].etag} + payload = { + "approve": True, + "etag": etag, + "history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "ans"}], + } r = await async_client.put( f"/v1/assignments/{dataset}/{bucket}/{gt_id}", json=payload, headers=user_headers ) @@ -103,14 +109,14 @@ async def assigned_ground_truth(async_client: AsyncClient, user_headers): ) assert r.status_code == 200 data = r.json() - adocs = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python(data.get("assigned") or []) + adocs = cast(list[dict[str, Any]], data.get("assigned") or []) assert adocs and len(adocs) >= 1 gt = adocs[0] # Verify assignment document exists repo = container.repo assert isinstance(repo, CosmosGroundTruthRepo) - assignment = await repo.get_assignment_by_gt(TEST_USER_ID, gt.id) + assignment = await repo.get_assignment_by_gt(TEST_USER_ID, cast(str, gt["id"])) assert assignment is not None, "Assignment document should exist after self-serve" yield { @@ -135,16 +141,22 @@ async def test_approve_deletes_assignment_document( user_id = assigned_ground_truth["user_id"] # SME approves via assignments PUT - payload = {"approve": True, "answer": "ans", "etag": gt.etag} + etag = cast(str | None, gt.get("_etag")) + assert etag + payload = { + "approve": True, + "etag": etag, + "history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "ans"}], + } r = await async_client.put( - f"/v1/assignments/{dataset}/{bucket}/{gt.id}", json=payload, headers=user_headers + f"/v1/assignments/{dataset}/{bucket}/{gt['id']}", json=payload, headers=user_headers ) assert r.status_code == 200 res: dict = r.json() assert res.get("status") == "approved" # Verify assignment document is deleted after approval - assignment_after = await repo.get_assignment_by_gt(user_id, gt.id) + assignment_after = 
await repo.get_assignment_by_gt(user_id, cast(str, gt["id"])) assert assignment_after is None, "Assignment document should be deleted after approval" @@ -160,14 +172,16 @@ async def test_delete_deletes_assignment_document( user_id = assigned_ground_truth["user_id"] # SME soft-deletes via assignments PUT with status=deleted - payload = {"status": "deleted", "etag": gt.etag} + etag = cast(str | None, gt.get("_etag")) + assert etag + payload = {"status": "deleted", "etag": etag} r = await async_client.put( - f"/v1/assignments/{dataset}/{bucket}/{gt.id}", json=payload, headers=user_headers + f"/v1/assignments/{dataset}/{bucket}/{gt['id']}", json=payload, headers=user_headers ) assert r.status_code == 200 res: dict = r.json() assert res.get("status") == "deleted" # Verify assignment document is deleted after soft-delete - assignment_after = await repo.get_assignment_by_gt(user_id, gt.id) + assignment_after = await repo.get_assignment_by_gt(user_id, cast(str, gt["id"])) assert assignment_after is None, "Assignment document should be deleted after soft-delete" diff --git a/backend/tests/integration/test_assignments_duplicate_cosmos.py b/backend/tests/integration/test_assignments_duplicate_cosmos.py index a56129c..2d0a743 100644 --- a/backend/tests/integration/test_assignments_duplicate_cosmos.py +++ b/backend/tests/integration/test_assignments_duplicate_cosmos.py @@ -14,10 +14,10 @@ def make_item(dataset: str, *, assigned_to: str | None = None) -> dict[str, Any] "datasetName": dataset, # Use NIL UUID for explicit bucket to keep PK simple in tests "bucket": str(uuid.UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": "Q?", - "samplingBucket": 0, + "history": [ + {"role": "user", "msg": "Q?"}, + ], "assignedTo": assigned_to, - "refs": [], "manualTags": ["source:synthetic"], } diff --git a/backend/tests/integration/test_assignments_edited_question_persist_cosmos.py b/backend/tests/integration/test_assignments_edited_question_persist_cosmos.py index e339fe9..17e2014 100644 --- a/backend/tests/integration/test_assignments_edited_question_persist_cosmos.py +++ b/backend/tests/integration/test_assignments_edited_question_persist_cosmos.py @@ -13,9 +13,9 @@ def make_item(dataset: str, item_id: str) -> dict[str, Any]: "datasetName": dataset, # Fixed bucket UUID for deterministic PK "bucket": str(UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": "Original synth question?", - "answer": None, - "refs": [], + "history": [ + {"role": "user", "msg": "Original synth question?"}, + ], "manualTags": [ "source:synthetic", "split:train", @@ -31,18 +31,7 @@ def make_item(dataset: str, item_id: str) -> dict[str, Any]: async def test_assignments_put_persists_edited_question_camel_case( async_client: AsyncClient, user_headers: dict[str, str] ): - """Compat-migration coverage for the temporary editedQuestion alias path. - - This test stays only while assignments updates still project legacy camelCase - question fields across the compatibility boundary. Delete it with the alias - retirement work in the hard-delete phase. - - **Phase 5 Audit (2026-03-12)**: MIGRATION TEST - INFORMATIONAL - This test validates that editedQuestion persists correctly through Cosmos - round-trips. The test is marked as temporary and should be deleted when - Phase 6 removes legacy field support. Not a delete blocker, but documents - current persistence contract. 
- """ + """Assignments PUT persists an updated user question via canonical history.""" dataset = f"editedq-{uuid4().hex[:6]}" item_id = "gt-1" item = make_item(dataset, item_id) @@ -65,16 +54,17 @@ async def test_assignments_put_persists_edited_question_camel_case( bucket = cast(str, row["bucket"]) etag = cast(str, row.get("_etag")) - # Update via assignments PUT using camelCase editedQuestion + # Update via assignments PUT using canonical history new_question = "How do I reset my password (rephrased)?" r = await async_client.put( f"/v1/assignments/{dataset}/{bucket}/{item_id}", headers={**user_headers, "If-Match": etag}, - json={"editedQuestion": new_question}, + json={"history": [{"role": "user", "msg": new_question}]}, ) assert r.status_code == 200, r.text body = cast(dict[str, Any], r.json()) - assert body.get("editedQuestion") == new_question + history = body.get("history") or [] + assert history and history[0].get("msg") == new_question # Fetch item directly and assert persistence r = await async_client.get( @@ -82,11 +72,13 @@ async def test_assignments_put_persists_edited_question_camel_case( ) assert r.status_code == 200, r.text fetched = cast(dict[str, Any], r.json()) - assert fetched.get("editedQuestion") == new_question + fetched_history = fetched.get("history") or [] + assert fetched_history and fetched_history[0].get("msg") == new_question # List my assignments and ensure enriched view carries updated question r = await async_client.get("/v1/assignments/my", headers=user_headers) assert r.status_code == 200, r.text my_items = cast(list[dict[str, Any]], r.json()) mine = next(x for x in my_items if x.get("id") == item_id) - assert mine.get("editedQuestion") == new_question + mine_history = mine.get("history") or [] + assert mine_history and mine_history[0].get("msg") == new_question diff --git a/backend/tests/integration/test_assignments_flow_cosmos.py b/backend/tests/integration/test_assignments_flow_cosmos.py index 785a2c2..b2e9b4b 100644 --- a/backend/tests/integration/test_assignments_flow_cosmos.py +++ b/backend/tests/integration/test_assignments_flow_cosmos.py @@ -3,12 +3,9 @@ from typing import Any, cast from uuid import uuid4 -from pydantic.type_adapter import TypeAdapter import pytest from httpx import AsyncClient -from app.domain.models import AgenticGroundTruthEntry - def make_item(dataset: str) -> dict[str, Any]: return { @@ -16,10 +13,9 @@ def make_item(dataset: str) -> dict[str, Any]: "datasetName": dataset, "bucket": "00000000-0000-0000-0000-000000000000", "status": "draft", - "samplingBucket": 0, - "synthQuestion": "Q?", - "answer": None, - "refs": [], + "history": [ + {"role": "user", "msg": "Q?"}, + ], "manualTags": ["source:synthetic", "topic:general"], } @@ -41,24 +37,26 @@ async def test_self_serve_list_and_approve(async_client: AsyncClient, user_heade assert r.status_code == 200 resp = cast(dict[str, Any], r.json()) assert resp.get("assignedCount") == 2 - assigned = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python( - resp.get("assigned") or [] - ) + assigned = cast(list[dict[str, Any]], resp.get("assigned") or []) assert len(assigned) == 2 # List my assignments r = await async_client.get("/v1/assignments/my", headers=user_headers) assert r.status_code == 200 - docs = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python(r.json()) + docs = cast(list[dict[str, Any]], r.json()) assert len(docs) == 2 # Approve first via assignments PUT - gt_id = docs[0].id - etag = docs[0].etag + gt_id = docs[0]["id"] + etag = docs[0]["_etag"] r = await 
async_client.put( f"/v1/assignments/{dataset}/{bucket}/{gt_id}", headers=user_headers, - json={"approve": True, "answer": "ans", "etag": etag}, + json={ + "approve": True, + "etag": etag, + "history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "ans"}], + }, ) assert r.status_code == 200 res = cast(dict[str, Any], r.json()) @@ -162,6 +160,7 @@ async def test_exclusive_tag_error_prevents_persistence( actual_bucket = assigned[0]["bucket"] etag = assigned[0]["_etag"] original_tags = assigned[0]["manualTags"] + original_history = assigned[0]["history"] # Attempt invalid update with exclusive tag conflict r = await async_client.put( @@ -172,7 +171,6 @@ async def test_exclusive_tag_error_prevents_persistence( "difficulty:easy", "difficulty:hard", ], # Both difficulty tags - conflict! - "answer": "This should not be saved", "etag": etag, }, ) @@ -189,8 +187,8 @@ async def test_exclusive_tag_error_prevents_persistence( # Tags should still be the original ones assert item_after["manualTags"] == original_tags - # Answer should still be None (not the rejected value) - assert item_after["answer"] is None + # History should also remain unchanged after the rejected update. + assert item_after["history"] == original_history @pytest.mark.anyio diff --git a/backend/tests/integration/test_assignments_retry_exclusion.py b/backend/tests/integration/test_assignments_retry_exclusion.py index 5232ae9..fa851e6 100644 --- a/backend/tests/integration/test_assignments_retry_exclusion.py +++ b/backend/tests/integration/test_assignments_retry_exclusion.py @@ -12,10 +12,8 @@ from uuid import UUID, uuid4 from httpx import AsyncClient -from pydantic.type_adapter import TypeAdapter import pytest -from app.domain.models import AgenticGroundTruthEntry from app.adapters.repos.cosmos_repo import CosmosGroundTruthRepo @@ -26,10 +24,10 @@ def make_unassigned_item(dataset: str, item_id: str | None = None) -> dict[str, "datasetName": dataset, "bucket": str(UUID("00000000-0000-0000-0000-000000000000")), "status": "draft", - "samplingBucket": 0, - "synthQuestion": f"Question about {uuid4().hex[:4]}?", + "history": [ + {"role": "user", "msg": f"Question about {uuid4().hex[:4]}?"}, + ], "assignedTo": None, - "refs": [], "manualTags": ["source:synthetic", "split:test"], } @@ -110,16 +108,14 @@ async def test_skipped_items_excluded_from_user_resampling( "/v1/assignments/self-serve", json={"limit": 2}, headers=user_headers ) assert r.status_code == 200 - first_batch = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python( - r.json().get("assigned") or [] - ) + first_batch = r.json().get("assigned") or [] assert len(first_batch) == 2 # Skip one item skipped_item = first_batch[0] r = await async_client.put( - f"/v1/ground-truths/{dataset}/{skipped_item.bucket}/{skipped_item.id}", - json={"status": "skipped", "etag": skipped_item.etag}, + f"/v1/ground-truths/{dataset}/{skipped_item['bucket']}/{skipped_item['id']}", + json={"status": "skipped", "etag": skipped_item["_etag"]}, headers=user_headers, ) assert r.status_code == 200 @@ -129,19 +125,17 @@ async def test_skipped_items_excluded_from_user_resampling( "/v1/assignments/self-serve", json={"limit": 3}, headers=user_headers ) assert r.status_code == 200 - second_batch = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python( - r.json().get("assigned") or [] - ) + second_batch = r.json().get("assigned") or [] assert len(second_batch) == 3 - second_batch_ids = {item.id for item in second_batch} + second_batch_ids = {item["id"] for item in second_batch} 
non_skipped_item = first_batch[1] # Core assertions: skipped item not returned, non-skipped item is returned - assert skipped_item.id not in second_batch_ids, "Bug: Skipped item was resampled" - assert non_skipped_item.id in second_batch_ids, "Non-skipped item should be included" + assert skipped_item["id"] not in second_batch_ids, "Bug: Skipped item was resampled" + assert non_skipped_item["id"] in second_batch_ids, "Non-skipped item should be included" # Should have 2 new items (not from first batch) - first_batch_ids = {item.id for item in first_batch} + first_batch_ids = {item["id"] for item in first_batch} new_items = second_batch_ids - first_batch_ids assert len(new_items) == 2 diff --git a/backend/tests/integration/test_assignments_skipped_reassign_cosmos.py b/backend/tests/integration/test_assignments_skipped_reassign_cosmos.py index a19faf9..57f6c91 100644 --- a/backend/tests/integration/test_assignments_skipped_reassign_cosmos.py +++ b/backend/tests/integration/test_assignments_skipped_reassign_cosmos.py @@ -5,11 +5,8 @@ from datetime import datetime, timezone from httpx import AsyncClient -from pydantic.type_adapter import TypeAdapter import pytest -from app.domain.models import AgenticGroundTruthEntry - def make_skipped_item(dataset: str, assigned_to: str) -> dict[str, Any]: return { @@ -18,12 +15,12 @@ def make_skipped_item(dataset: str, assigned_to: str) -> dict[str, Any]: # Use NIL UUID for explicit bucket to keep PK simple in tests "bucket": str(UUID("00000000-0000-0000-0000-000000000000")), "status": "skipped", - "samplingBucket": 0, - "synthQuestion": "Q?", + "history": [ + {"role": "user", "msg": "Q?"}, + ], # Simulate a prior assignment to another SME "assignedTo": assigned_to, "assignedAt": datetime.now(timezone.utc).isoformat(), - "refs": [], "manualTags": ["source:synthetic", "split:validation"], } @@ -48,23 +45,21 @@ async def test_self_serve_reassigns_skipped_and_lists_in_my( payload = cast(dict[str, Any], r.json()) assert payload.get("assignedCount") == 1 - assigned_items = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python( - payload.get("assigned") or [] - ) + assigned_items = cast(list[dict[str, Any]], payload.get("assigned") or []) assert len(assigned_items) == 1 gt = assigned_items[0] # After assignment, item should be assigned to current user and status should be draft # In integration tests, the effective user id comes from Easy Auth principal (tester@example.com) expected_user = "tester@example.com" - assert gt.assignedTo == expected_user - assert gt.status.value == "draft" + assert gt["assignedTo"] == expected_user + assert gt["status"] == "draft" # /my should list the item now (since it filters by assignedTo == user and status == draft) r = await async_client.get("/v1/assignments/my", headers=user_headers) assert r.status_code == 200 - my_items = TypeAdapter(list[AgenticGroundTruthEntry]).validate_python(r.json()) + my_items = cast(list[dict[str, Any]], r.json()) assert len(my_items) == 1 - assert my_items[0].id == gt.id - assert my_items[0].assignedTo == expected_user - assert my_items[0].status.value == "draft" + assert my_items[0]["id"] == gt["id"] + assert my_items[0]["assignedTo"] == expected_user + assert my_items[0]["status"] == "draft" diff --git a/backend/tests/integration/test_bucket_assignment_cosmos.py b/backend/tests/integration/test_bucket_assignment_cosmos.py index def1be9..a176c92 100644 --- a/backend/tests/integration/test_bucket_assignment_cosmos.py +++ b/backend/tests/integration/test_bucket_assignment_cosmos.py @@ -7,7 +7,9 @@ 
def make_item(dataset: str, with_bucket: bool = False, bucket: str | None = None item = { "id": str(uuid.uuid4()), "datasetName": dataset, - "synthQuestion": "Q?", + "history": [ + {"role": "user", "msg": "Q?"}, + ], } if with_bucket: item["bucket"] = bucket or str(uuid.uuid4()) diff --git a/backend/tests/integration/test_bulk_import_validation.py b/backend/tests/integration/test_bulk_import_validation.py index 3f08522..9999173 100644 --- a/backend/tests/integration/test_bulk_import_validation.py +++ b/backend/tests/integration/test_bulk_import_validation.py @@ -15,13 +15,17 @@ async def test_bulk_import_with_valid_items_passes( { "id": "", "datasetName": "test-dataset", - "synthQuestion": "What is the capital of France?", - "refs": [{"url": "https://example.com", "content": "Paris info"}], + "history": [ + {"role": "user", "msg": "What is the capital of France?"}, + {"role": "assistant", "msg": "Paris."}, + ], }, { "id": "", "datasetName": "test-dataset", - "synthQuestion": "How does gravity work?", + "history": [ + {"role": "user", "msg": "How does gravity work?"}, + ], }, ] @@ -53,18 +57,23 @@ async def test_bulk_import_filters_invalid_items( { "id": "valid-1", "datasetName": "test-dataset", - "synthQuestion": "This is a valid question that meets length requirements?", + "history": [ + {"role": "user", "msg": "This is a valid question that meets length requirements?"}, + ], }, { "id": "valid-2", "datasetName": "test-dataset", - "synthQuestion": "Another valid question that is long enough?", + "history": [ + {"role": "user", "msg": "Another valid question that is long enough?"}, + ], }, { - "id": "invalid-url", + "id": "invalid-history", "datasetName": "test-dataset", - "synthQuestion": "Question with bad reference URL?", - "refs": [{"url": ""}], + "history": [ + {"role": "user", "msg": ""}, + ], }, ] @@ -78,5 +87,5 @@ async def test_bulk_import_filters_invalid_items( data = response.json() # Check that errors mention validation issues - error_text = data["detail"][0]["msg"] - assert "Reference URL cannot be empty" in error_text or "invalid-url" in error_text + details = data.get("detail") or [] + assert any("history fields cannot be empty" in err.get("msg", "") for err in details) diff --git a/backend/tests/integration/test_datasets_api.py b/backend/tests/integration/test_datasets_api.py index 3110c00..9550134 100644 --- a/backend/tests/integration/test_datasets_api.py +++ b/backend/tests/integration/test_datasets_api.py @@ -9,7 +9,9 @@ def make_gt_item(dataset: str, *, bucket: str | None = None) -> dict: "id": str(uuid.uuid4()), "datasetName": dataset, "bucket": bucket or str(uuid.UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": f"Question for {dataset}?", + "history": [ + {"role": "user", "msg": f"Question for {dataset}?"}, + ], "docType": "ground-truth-item", } diff --git a/backend/tests/integration/test_etag_and_refs_cosmos.py b/backend/tests/integration/test_etag_and_refs_cosmos.py index ca29ecd..5985fec 100644 --- a/backend/tests/integration/test_etag_and_refs_cosmos.py +++ b/backend/tests/integration/test_etag_and_refs_cosmos.py @@ -8,8 +8,9 @@ def make_item(dataset: str) -> dict: "id": str(uuid.uuid4()), "datasetName": dataset, "bucket": str(uuid.UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": "Q?", - "samplingBucket": 0, + "history": [ + {"role": "user", "msg": "Q?"}, + ], "assignedTo": None, } @@ -32,7 +33,9 @@ async def test_sme_update_requires_etag_and_includes_updated_etag( # Try SME update without ETag -> 412 r = await async_client.put( - 
f"/v1/assignments/{ds}/{bucket}/{item['id']}", json={"answer": "A1"}, headers=user_headers + f"/v1/assignments/{ds}/{bucket}/{item['id']}", + json={"history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "A1"}]}, + headers=user_headers, ) assert r.status_code == 412 @@ -43,11 +46,14 @@ async def test_sme_update_requires_etag_and_includes_updated_etag( headers = dict(user_headers) headers.update({"If-Match": etag}) r = await async_client.put( - f"/v1/assignments/{ds}/{bucket}/{item['id']}", json={"answer": "A2"}, headers=headers + f"/v1/assignments/{ds}/{bucket}/{item['id']}", + json={"history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "A2"}]}, + headers=headers, ) assert r.status_code == 200 body = r.json() - assert body.get("answer") == "A2" + history = body.get("history") or [] + assert any(turn.get("role") == "assistant" and turn.get("msg") == "A2" for turn in history) assert body.get("_etag") and isinstance(body["_etag"], str) @@ -73,7 +79,9 @@ async def test_sme_etag_mismatch_returns_412(async_client: AsyncClient, user_hea headers = dict(user_headers) headers.update({"If-Match": etag1}) r = await async_client.put( - f"/v1/assignments/{ds}/{bucket}/{item['id']}", json={"answer": "v1"}, headers=headers + f"/v1/assignments/{ds}/{bucket}/{item['id']}", + json={"history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "v1"}]}, + headers=headers, ) assert r.status_code == 200 new_etag = r.json().get("_etag") @@ -83,14 +91,16 @@ async def test_sme_etag_mismatch_returns_412(async_client: AsyncClient, user_hea headers_stale = dict(user_headers) headers_stale.update({"If-Match": etag1}) r = await async_client.put( - f"/v1/assignments/{ds}/{bucket}/{item['id']}", json={"answer": "v2"}, headers=headers_stale + f"/v1/assignments/{ds}/{bucket}/{item['id']}", + json={"history": [{"role": "user", "msg": "Q?"}, {"role": "assistant", "msg": "v2"}]}, + headers=headers_stale, ) assert r.status_code == 412 @pytest.mark.anyio -async def test_curator_put_refs_with_etag(async_client: AsyncClient, user_headers): - ds = "test-curator-refs" +async def test_curator_put_plugins_with_etag(async_client: AsyncClient, user_headers): + ds = "test-curator-plugins" item = make_item(ds) bucket = item["bucket"] r = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) @@ -103,15 +113,28 @@ async def test_curator_put_refs_with_etag(async_client: AsyncClient, user_header headers = dict(user_headers) headers.update({"If-Match": etag}) - refs = [ - {"url": "https://example.com/a", "content": "alpha"}, - {"url": "https://example.com/b", "keyExcerpt": "beta"}, - ] - payload = {"refs": refs, "answer": "Ans"} + plugin_data = { + "score": 0.87, + "notes": ["source-a", "source-b"], + } + payload = { + "history": [ + {"role": "user", "msg": "Q?"}, + {"role": "assistant", "msg": "Ans"}, + ], + "plugins": { + "test-pack": { + "kind": "test-pack", + "version": "1.0", + "data": plugin_data, + } + }, + } r = await async_client.put( f"/v1/ground-truths/{ds}/{bucket}/{item['id']}", json=payload, headers=headers ) assert r.status_code == 200 body = r.json() - assert body.get("refs") and isinstance(body["refs"], list) and len(body["refs"]) == 2 + persisted_data = body.get("plugins", {}).get("test-pack", {}).get("data") + assert persisted_data == plugin_data assert body.get("_etag") and isinstance(body["_etag"], str) diff --git a/backend/tests/integration/test_ground_truths_cosmos.py b/backend/tests/integration/test_ground_truths_cosmos.py index f3ac5c6..c1b75e9 
100644 --- a/backend/tests/integration/test_ground_truths_cosmos.py +++ b/backend/tests/integration/test_ground_truths_cosmos.py @@ -11,7 +11,9 @@ def make_item(dataset: str) -> dict: "datasetName": dataset, # Use NIL UUID for explicit bucket in tests "bucket": str(uuid.UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": "What is the capital of France?", + "history": [ + {"role": "user", "msg": "What is the capital of France?"}, + ], } @@ -59,13 +61,22 @@ async def test_update_with_etag(async_client: AsyncClient, user_headers): # update with If-Match header headers = dict(user_headers) headers.update({"If-Match": etag}) - payload = {"answer": "Paris", "status": "approved"} + payload = { + "history": [ + {"role": "user", "msg": "What is the capital of France?"}, + {"role": "assistant", "msg": "Paris"}, + ], + "status": "approved", + } r = await async_client.put( f"/v1/ground-truths/{dataset}/{bucket}/{item['id']}", json=payload, headers=headers ) assert r.status_code == 200 res = r.json() - assert res["answer"] == "Paris" + assert any( + turn.get("role") == "assistant" and turn.get("msg") == "Paris" + for turn in (res.get("history") or []) + ) assert res["status"] == GroundTruthStatus.approved.value @@ -142,7 +153,13 @@ async def test_snapshot_and_stats(async_client: AsyncClient, user_headers): bucket = data[0]["bucket"] headers = dict(user_headers) headers.update({"If-Match": etag}) - payload = {"answer": "Paris", "status": "approved"} + payload = { + "history": [ + {"role": "user", "msg": "What is the capital of France?"}, + {"role": "assistant", "msg": "Paris"}, + ], + "status": "approved", + } r = await async_client.put( f"/v1/ground-truths/{dataset}/{bucket}/{item['id']}", json=payload, headers=headers ) @@ -165,8 +182,9 @@ async def test_snapshot_and_stats(async_client: AsyncClient, user_headers): async def test_import_with_approve_flag(async_client: AsyncClient, user_headers): dataset = "test-approve-on-import" - # Item WITHOUT history: approval validation should reject it + # Item WITHOUT assistant response: approval validation should reject it invalid_item = make_item(dataset) + invalid_item["history"] = [{"role": "user", "msg": "What is the capital of France?"}] r = await async_client.post( "/v1/ground-truths?approve=true", json=[invalid_item], headers=user_headers ) diff --git a/backend/tests/integration/test_ground_truths_delete_restore_etag_cosmos.py b/backend/tests/integration/test_ground_truths_delete_restore_etag_cosmos.py index 58ef555..dd63660 100644 --- a/backend/tests/integration/test_ground_truths_delete_restore_etag_cosmos.py +++ b/backend/tests/integration/test_ground_truths_delete_restore_etag_cosmos.py @@ -13,9 +13,10 @@ def make_item(dataset: str, item_id: str) -> dict[str, Any]: "datasetName": dataset, # Use NIL UUID so tests don't depend on bucket assignment logic "bucket": str(UUID("00000000-0000-0000-0000-000000000000")), - "synthQuestion": "How do I reset my password?", - "answer": "Use the reset link", - "refs": [], + "history": [ + {"role": "user", "msg": "How do I reset my password?"}, + {"role": "assistant", "msg": "Use the reset link"}, + ], "manualTags": [ "source:synthetic", "split:validation", diff --git a/backend/tests/integration/test_ground_truths_etag_errors_cosmos.py b/backend/tests/integration/test_ground_truths_etag_errors_cosmos.py index 58411f5..d2f02a1 100644 --- a/backend/tests/integration/test_ground_truths_etag_errors_cosmos.py +++ b/backend/tests/integration/test_ground_truths_etag_errors_cosmos.py @@ -12,9 +12,10 @@ def 
make_item(dataset: str, item_id: str) -> dict[str, Any]: "id": item_id, "datasetName": dataset, "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "How do I reset my password?", - "answer": "Use the reset link", - "refs": [], + "history": [ + {"role": "user", "msg": "How do I reset my password?"}, + {"role": "assistant", "msg": "Use the reset link"}, + ], "manualTags": [ "source:synthetic", "split:validation", diff --git a/backend/tests/integration/test_ground_truths_explorer.py b/backend/tests/integration/test_ground_truths_explorer.py index d843f10..90003f3 100644 --- a/backend/tests/integration/test_ground_truths_explorer.py +++ b/backend/tests/integration/test_ground_truths_explorer.py @@ -29,9 +29,10 @@ def build_item( "datasetName": dataset, "bucket": str(uuid4()), "status": status, - "synthQuestion": f"Question {idx}", - "answer": answer, - "refs": [], + "history": [ + {"role": "user", "msg": f"Question {idx}"}, + *([{"role": "assistant", "msg": answer}] if answer else []), + ], "manualTags": tags or ["source:sme"], "reviewedAt": reviewed.isoformat(), "updatedAt": updated.isoformat(), diff --git a/backend/tests/integration/test_ground_truths_get_and_filters_cosmos.py b/backend/tests/integration/test_ground_truths_get_and_filters_cosmos.py index 3e9712f..9d63728 100644 --- a/backend/tests/integration/test_ground_truths_get_and_filters_cosmos.py +++ b/backend/tests/integration/test_ground_truths_get_and_filters_cosmos.py @@ -2,22 +2,20 @@ from typing import Any, Optional, cast -from pydantic import TypeAdapter import pytest from uuid import uuid4 from httpx import AsyncClient -from app.domain.models import AgenticGroundTruthEntry - def make_item(dataset: str, *, gid: Optional[str] = None) -> dict[str, Any]: return { "id": gid or f"gt-{uuid4().hex[:8]}", "datasetName": dataset, "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "What is the capital of France?", - "answer": "Paris", - "refs": [], + "history": [ + {"role": "user", "msg": "What is the capital of France?"}, + {"role": "assistant", "msg": "Paris"}, + ], "manualTags": [ "source:synthetic", "split:validation", @@ -50,9 +48,9 @@ async def test_get_item_200_and_404(async_client: AsyncClient, user_headers: dic f"/v1/ground-truths/{dataset}/{bucket}/gt-200", headers=user_headers ) assert res.status_code == 200 - gt_item = TypeAdapter(AgenticGroundTruthEntry).validate_python(res.json()) - assert gt_item.id == "gt-200" - assert gt_item.etag + gt_item = cast(dict[str, Any], res.json()) + assert gt_item.get("id") == "gt-200" + assert gt_item.get("_etag") # 404 for missing res = await async_client.get( diff --git a/backend/tests/integration/test_ground_truths_id_search.py b/backend/tests/integration/test_ground_truths_id_search.py index efd089a..23c78da 100644 --- a/backend/tests/integration/test_ground_truths_id_search.py +++ b/backend/tests/integration/test_ground_truths_id_search.py @@ -8,8 +8,6 @@ from httpx import AsyncClient from uuid import uuid4 -from app.domain.models import GroundTruthListResponse - def make_item(dataset: str, gid: str | None = None) -> dict: """Helper to create a minimal ground truth item for testing.""" @@ -17,7 +15,9 @@ def make_item(dataset: str, gid: str | None = None) -> dict: "id": gid or f"test-{uuid4().hex[:8]}", "datasetName": dataset, "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", + "history": [ + {"role": "user", "msg": "Test question?"}, + ], } @@ -39,9 +39,9 @@ async def test_list_ground_truths_search_by_id_exact_match( 
"/v1/ground-truths", params={"itemId": item_id}, headers=user_headers ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].id == item_id + response_data = res.json() + assert len(response_data["items"]) == 1 + assert response_data["items"][0]["id"] == item_id @pytest.mark.anyio @@ -66,9 +66,9 @@ async def test_list_ground_truths_search_by_id_partial_match( "/v1/ground-truths", params={"itemId": unique}, headers=user_headers ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 - found_ids = {item.id for item in response_data.items} + response_data = res.json() + assert len(response_data["items"]) == 2 + found_ids = {item["id"] for item in response_data["items"]} assert f"{unique}-suffix1" in found_ids assert f"{unique}-end" in found_ids @@ -90,9 +90,9 @@ async def test_list_ground_truths_search_by_id_with_whitespace_trimming( "/v1/ground-truths", params={"itemId": f" {item_id} "}, headers=user_headers ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].id == item_id + response_data = res.json() + assert len(response_data["items"]) == 1 + assert response_data["items"][0]["id"] == item_id @pytest.mark.anyio @@ -112,9 +112,9 @@ async def test_list_ground_truths_search_by_id_whitespace_only_returns_all( "/v1/ground-truths", params={"itemId": " ", "dataset": dataset}, headers=user_headers ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) + response_data = res.json() # Should return all items from dataset (whitespace-only treated as omitted) - assert len(response_data.items) >= 3 + assert len(response_data["items"]) >= 3 @pytest.mark.anyio @@ -141,10 +141,10 @@ async def test_list_ground_truths_search_by_id_combined_with_other_filters( headers=user_headers, ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) + response_data = res.json() # Should find the 2 items with matching ID in this dataset - assert len(response_data.items) == 2 - assert all(unique in item.id for item in response_data.items) + assert len(response_data["items"]) == 2 + assert all(unique in item["id"] for item in response_data["items"]) @pytest.mark.anyio @@ -165,8 +165,8 @@ async def test_list_ground_truths_search_by_id_empty_results( headers=user_headers, ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 0 + response_data = res.json() + assert len(response_data["items"]) == 0 @pytest.mark.anyio @@ -189,10 +189,10 @@ async def test_list_ground_truths_search_by_id_pagination( headers=user_headers, ) assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 - assert response_data.pagination.has_next is True - assert response_data.pagination.total == 5 + response_data = res.json() + assert len(response_data["items"]) == 2 + assert response_data["pagination"]["hasNext"] is True + assert response_data["pagination"]["total"] == 5 @pytest.mark.anyio @@ -225,5 +225,5 @@ async def test_list_ground_truths_search_by_id_no_param_returns_all( "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers ) assert res.status_code == 200 - 
response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) >= 3 + response_data = res.json() + assert len(response_data["items"]) >= 3 diff --git a/backend/tests/integration/test_ground_truths_import_conflicts_cosmos.py b/backend/tests/integration/test_ground_truths_import_conflicts_cosmos.py index 9a5b643..c04b5f9 100644 --- a/backend/tests/integration/test_ground_truths_import_conflicts_cosmos.py +++ b/backend/tests/integration/test_ground_truths_import_conflicts_cosmos.py @@ -12,9 +12,10 @@ def make_item(dataset: str, item_id: str) -> dict[str, Any]: "id": item_id, "datasetName": dataset, "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "What is the capital of France?", - "answer": "Paris", - "refs": [], + "history": [ + {"role": "user", "msg": "What is the capital of France?"}, + {"role": "assistant", "msg": "Paris"}, + ], "manualTags": [ "source:synthetic", "split:validation", diff --git a/backend/tests/integration/test_ground_truths_reference_count.py b/backend/tests/integration/test_ground_truths_reference_count.py deleted file mode 100644 index c21317d..0000000 --- a/backend/tests/integration/test_ground_truths_reference_count.py +++ /dev/null @@ -1,260 +0,0 @@ -"""Integration tests for totalReferences computed field on GroundTruthItem.""" - -import pytest -from httpx import AsyncClient -from uuid import uuid4 - -from app.domain.models import GroundTruthListResponse - - -@pytest.mark.anyio -async def test_ground_truth_item_includes_total_references_field( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Verify totalReferences field exists in response.""" - dataset = f"ref-count-exists-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert hasattr(response_data.items[0], "totalReferences") - assert response_data.items[0].totalReferences == 0 # No refs yet - - -@pytest.mark.anyio -async def test_total_references_counts_item_level_refs_only( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with 3 item-level refs, no history → totalReferences=3.""" - dataset = f"ref-count-item-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://example.com/1"}, - {"url": "https://example.com/2"}, - {"url": "https://example.com/3"}, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].totalReferences == 3 - - -@pytest.mark.anyio -async def test_total_references_counts_history_level_refs_only( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with no item-level 
refs, 2 history turns with refs → correct count.""" - dataset = f"ref-count-history-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "history": [ - { - "role": "user", - "msg": "First question", - "refs": None, - }, - { - "role": "assistant", - "msg": "First answer", - "refs": [ - {"url": "https://example.com/turn1-ref1"}, - {"url": "https://example.com/turn1-ref2"}, - ], - }, - { - "role": "user", - "msg": "Follow-up question", - "refs": None, - }, - { - "role": "assistant", - "msg": "Follow-up answer", - "refs": [ - {"url": "https://example.com/turn2-ref1"}, - ], - }, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - # 2 refs from first assistant turn + 1 ref from second assistant turn = 3 total - assert response_data.items[0].totalReferences == 3 - - -@pytest.mark.anyio -async def test_total_references_counts_both_levels( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with item-level refs + history turn refs → only history turn refs counted.""" - dataset = f"ref-count-both-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://example.com/item-ref1"}, - {"url": "https://example.com/item-ref2"}, - ], - "history": [ - { - "role": "user", - "msg": "Question", - "refs": None, - }, - { - "role": "assistant", - "msg": "Answer", - "refs": [ - {"url": "https://example.com/history-ref1"}, - {"url": "https://example.com/history-ref2"}, - {"url": "https://example.com/history-ref3"}, - ], - }, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - # 2 item-level refs , 3 history refs ignore the item-level refs = 3 total - assert response_data.items[0].totalReferences == 3 - - -@pytest.mark.anyio -async def test_total_references_zero_when_no_refs( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with empty refs and no history → totalReferences=0.""" - dataset = f"ref-count-zero-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].totalReferences == 0 - - -@pytest.mark.anyio -async def test_total_references_multiple_items_independent( - async_client: AsyncClient, user_headers: 
dict[str, str] -): - """Multiple items each have correct independent counts.""" - dataset = f"ref-count-multi-{uuid4().hex[:6]}" - - items = [ - { - "id": f"item1-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Question 1?", - "refs": [{"url": "https://example.com/1"}], - }, - { - "id": f"item2-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Question 2?", - "refs": [ - {"url": "https://example.com/2a"}, - {"url": "https://example.com/2b"}, - ], - "history": [ - {"role": "user", "msg": "Follow up"}, - { - "role": "assistant", - "msg": "Answer", - "refs": [{"url": "https://example.com/2c"}], - }, - ], - }, - { - "id": f"item3-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Question 3?", - # No refs - }, - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 3 - - # Find items by checking synthQuestion to verify independent counts - items_by_question = {item.synth_question: item for item in response_data.items} - - assert items_by_question["Question 1?"].totalReferences == 1 # 1 item-level ref - assert ( - items_by_question["Question 2?"].totalReferences == 1 - ) # 2 item-level , 1 history ref then count only history = 1 - assert items_by_question["Question 3?"].totalReferences == 0 # No refs diff --git a/backend/tests/integration/test_ground_truths_reference_search.py b/backend/tests/integration/test_ground_truths_reference_search.py deleted file mode 100644 index 7a6176f..0000000 --- a/backend/tests/integration/test_ground_truths_reference_search.py +++ /dev/null @@ -1,492 +0,0 @@ -"""Integration tests for reference URL search on GET /v1/ground-truths endpoint.""" - -import pytest -from httpx import AsyncClient -from uuid import uuid4 - -from app.domain.models import GroundTruthListResponse - - -@pytest.mark.anyio -async def test_ref_url_search_matches_item_level_refs( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with ref url containing search term → returns item.""" - dataset = f"ref-search-item-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://example.com/page1"}, - {"url": "https://docs.example.com/guide"}, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - # Search for "page1" should find the item - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset, "refUrl": "page1"}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].id == item["id"] - - -@pytest.mark.anyio -async def test_ref_url_search_matches_history_level_refs( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with history turn refs containing search term → returns item.""" - dataset = f"ref-search-history-{uuid4().hex[:6]}" - - item = { - "id": 
f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "history": [ - { - "role": "user", - "msg": "User question", - }, - { - "role": "assistant", - "msg": "Assistant response", - "refs": [ - {"url": "https://docs.example.com/article/123"}, - {"url": "https://support.example.com/kb/456"}, - ], - }, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - # Search for "article" should find the item - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset, "refUrl": "article"}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].id == item["id"] - - -@pytest.mark.anyio -async def test_ref_url_search_matches_both_levels( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with item-level and history-level refs → search matches either.""" - dataset = f"ref-search-both-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://foo.com/bar"}, - ], - "history": [ - { - "role": "assistant", - "msg": "Response", - "refs": [ - {"url": "https://baz.com/bar"}, - ], - }, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - # Search for "bar" should find the item (matches both levels) - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset, "refUrl": "bar"}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].id == item["id"] - - -@pytest.mark.anyio -async def test_ref_url_search_case_sensitive( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Reference URL search is case-sensitive (Cosmos CONTAINS behavior).""" - dataset = f"ref-search-case-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://Example.COM/Page"}, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - # Search for lowercase "example.com" should NOT find the item (case-sensitive) - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "example.com"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 0 - - # Search for exact case "Example.COM" should find it - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "Example.COM"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - - -@pytest.mark.anyio -async def test_ref_url_search_partial_match( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Reference URL search supports partial matching.""" - dataset = f"ref-search-partial-{uuid4().hex[:6]}" - - item 
= { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://docs.example.com/guide/introduction"}, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - # Search for domain portion - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "docs.example"}, - headers=user_headers, - ) - assert res.status_code == 200 - assert len(GroundTruthListResponse.model_validate(res.json()).items) == 1 - - # Search for path portion - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset, "refUrl": "/guide"}, headers=user_headers - ) - assert res.status_code == 200 - assert len(GroundTruthListResponse.model_validate(res.json()).items) == 1 - - # Search for non-matching substring - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "nonexistent"}, - headers=user_headers, - ) - assert res.status_code == 200 - assert len(GroundTruthListResponse.model_validate(res.json()).items) == 0 - - -@pytest.mark.anyio -async def test_ref_url_search_no_matches(async_client: AsyncClient, user_headers: dict[str, str]): - """Search with no matching refs returns empty list.""" - dataset = f"ref-search-nomatch-{uuid4().hex[:6]}" - - items = [ - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Question 1", - "refs": [{"url": "https://foo.com/1"}], - }, - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Question 2", - "refs": [{"url": "https://bar.com/2"}], - }, - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Search for non-existent URL - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "nonexistent-url"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 0 - - -@pytest.mark.anyio -async def test_ref_url_search_multiple_refs_per_item( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Item with multiple refs, only one matches → search finds item.""" - dataset = f"ref-search-multi-{uuid4().hex[:6]}" - - item = { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Test question?", - "refs": [ - {"url": "https://foo.com/1"}, - {"url": "https://bar.com/2"}, - {"url": "https://baz.com/3"}, - {"url": "https://example.com/matching-url"}, - {"url": "https://qux.com/5"}, - ], - } - - res = await async_client.post("/v1/ground-truths", json=[item], headers=user_headers) - assert res.status_code == 200 - - # Search for the matching URL - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "matching-url"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].id == item["id"] - - -@pytest.mark.anyio -async def test_ref_url_search_combined_with_other_filters( - async_client: AsyncClient, user_headers: dict[str, str] -): - """refUrl filter works 
together with dataset and status filters.""" - dataset1 = f"ref-search-combined1-{uuid4().hex[:6]}" - dataset2 = f"ref-search-combined2-{uuid4().hex[:6]}" - - items = [ - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset1, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q1", - "status": "draft", - "refs": [{"url": "https://example.com/doc"}], - }, - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset1, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q2", - "status": "approved", - "refs": [{"url": "https://example.com/doc"}], - }, - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset2, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q3", - "status": "draft", - "refs": [{"url": "https://example.com/doc"}], - }, - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Filter by dataset + refUrl → should get 2 items from dataset1 - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset1, "refUrl": "example.com"}, - headers=user_headers, - ) - assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 - - # Filter by dataset + status + refUrl → should get 1 item (approved in dataset1) - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset1, "status": "approved", "refUrl": "example.com"}, - headers=user_headers, - ) - assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 1 - assert response_data.items[0].status.value == "approved" - - -@pytest.mark.anyio -async def test_ref_url_search_with_pagination( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Reference URL search works correctly with pagination.""" - dataset = f"ref-search-page-{uuid4().hex[:6]}" - - # Create 15 items with matching refs - items = [ - { - "id": f"test-{i:03d}-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": f"Question {i}", - "refs": [{"url": f"https://example.com/doc/{i}"}], - } - for i in range(15) - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Get first page with limit=10 - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "example.com", "page": 1, "limit": 10}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 10 - assert response_data.pagination.total == 15 - assert response_data.pagination.page == 1 - assert response_data.pagination.has_next is True - assert response_data.pagination.has_prev is False - - # Get second page - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "refUrl": "example.com", "page": 2, "limit": 10}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 5 - assert response_data.pagination.total == 15 - assert response_data.pagination.page == 2 - assert response_data.pagination.has_next is False - assert response_data.pagination.has_prev is True - - -@pytest.mark.anyio -async def test_ref_url_search_empty_string_ignored( - async_client: 
AsyncClient, user_headers: dict[str, str] -): - """Empty or whitespace refUrl behaves like no filter.""" - dataset = f"ref-search-empty-{uuid4().hex[:6]}" - - items = [ - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q1", - "refs": [{"url": "https://foo.com"}], - }, - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q2", - "refs": [{"url": "https://bar.com"}], - }, - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Empty string - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset, "refUrl": ""}, headers=user_headers - ) - assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 # Returns all items - - # Whitespace only - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset, "refUrl": " "}, headers=user_headers - ) - assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 # Returns all items - - -@pytest.mark.anyio -async def test_ref_url_search_omitted_parameter( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Request without refUrl parameter returns all items.""" - dataset = f"ref-search-omit-{uuid4().hex[:6]}" - - items = [ - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q1", - "refs": [{"url": "https://foo.com"}], - }, - { - "id": f"test-{uuid4().hex[:8]}", - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": "Q2", - "refs": [{"url": "https://bar.com"}], - }, - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Omit refUrl parameter entirely - res = await async_client.get( - "/v1/ground-truths", params={"dataset": dataset}, headers=user_headers - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 # Returns all items - - -@pytest.mark.anyio -async def test_ref_url_search_too_long_returns_400( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Test that refUrl longer than 500 characters returns 400 error.""" - long_url = "https://example.com/" + "x" * 500 # >500 characters total - - res = await async_client.get( - "/v1/ground-truths", params={"refUrl": long_url}, headers=user_headers - ) - assert res.status_code == 400 - assert "500 characters" in res.json()["detail"] diff --git a/backend/tests/integration/test_ground_truths_sort_total_references.py b/backend/tests/integration/test_ground_truths_sort_total_references.py deleted file mode 100644 index 513186c..0000000 --- a/backend/tests/integration/test_ground_truths_sort_total_references.py +++ /dev/null @@ -1,442 +0,0 @@ -"""Integration tests for sorting by totalReferences field (SA-369). - -Tests the database-level sorting by totalReferences, verifying that: -1. The sortBy=totalReferences parameter is accepted by the API -2. Sorting works correctly in ascending and descending order -3. Sorting handles edge cases (items with 0 refs, history vs item-level refs) -4. 
Pagination works correctly with totalReferences sorting -""" - -import pytest -from httpx import AsyncClient -from uuid import uuid4 - -from app.domain.models import GroundTruthListResponse - - -def make_item_with_refs( - dataset: str, - item_id: str, - *, - item_refs_count: int = 0, - history_refs_counts: list[int] | None = None, -) -> dict: - """Create a test item with specified reference counts. - - Args: - dataset: Dataset name - item_id: Item ID - item_refs_count: Number of item-level refs - history_refs_counts: List of ref counts per history turn (None = no history) - - Returns: - Item dict for API submission - """ - item: dict = { - "id": item_id, - "datasetName": dataset, - "bucket": "00000000-0000-0000-0000-000000000000", - "synthQuestion": f"Question for {item_id}?", - } - - # Add item-level refs - if item_refs_count > 0: - item["refs"] = [ - {"url": f"https://example.com/{item_id}/item-ref-{i}"} for i in range(item_refs_count) - ] - - # Add history with refs if specified - if history_refs_counts: - history = [] - for turn_idx, ref_count in enumerate(history_refs_counts): - # User turn (no refs) - history.append( - { - "role": "user", - "msg": f"Turn {turn_idx} question", - } - ) - # Assistant turn with refs - turn: dict = { - "role": "assistant", - "msg": f"Turn {turn_idx} answer", - } - if ref_count > 0: - turn["refs"] = [ - {"url": f"https://example.com/{item_id}/turn{turn_idx}-ref-{i}"} - for i in range(ref_count) - ] - history.append(turn) - item["history"] = history - - return item - - -@pytest.mark.anyio -async def test_sort_by_total_references_descending( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Sort by totalReferences DESC returns items with most refs first.""" - dataset = f"sort-refs-desc-{uuid4().hex[:6]}" - - items = [ - make_item_with_refs(dataset, "item-0-refs", item_refs_count=0), - make_item_with_refs(dataset, "item-3-refs", item_refs_count=3), - make_item_with_refs(dataset, "item-1-ref", item_refs_count=1), - make_item_with_refs(dataset, "item-5-refs", item_refs_count=5), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "desc"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 4 - - # Verify descending order (most refs first) - ref_counts = [item.totalReferences for item in response_data.items] - assert ref_counts == [5, 3, 1, 0], f"Expected descending order, got {ref_counts}" - - -@pytest.mark.anyio -async def test_sort_by_total_references_ascending( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Sort by totalReferences ASC returns items with fewest refs first.""" - dataset = f"sort-refs-asc-{uuid4().hex[:6]}" - - items = [ - make_item_with_refs(dataset, "item-3-refs", item_refs_count=3), - make_item_with_refs(dataset, "item-0-refs", item_refs_count=0), - make_item_with_refs(dataset, "item-5-refs", item_refs_count=5), - make_item_with_refs(dataset, "item-1-ref", item_refs_count=1), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "asc"}, - headers=user_headers, - ) - assert res.status_code == 200 - 
- response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 4 - - # Verify ascending order (fewest refs first) - ref_counts = [item.totalReferences for item in response_data.items] - assert ref_counts == [0, 1, 3, 5], f"Expected ascending order, got {ref_counts}" - - -@pytest.mark.anyio -async def test_sort_by_total_references_with_history_refs( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Sort works correctly when refs come from history turns.""" - dataset = f"sort-refs-history-{uuid4().hex[:6]}" - - items = [ - # Item with only item-level refs (2 refs) - make_item_with_refs(dataset, "item-level-2", item_refs_count=2), - # Item with only history refs (3 refs across turns) - make_item_with_refs(dataset, "history-3", history_refs_counts=[1, 2]), - # Item with no refs (0 refs) - make_item_with_refs(dataset, "no-refs", item_refs_count=0), - # Item with history refs overriding item refs (history: 4 refs) - make_item_with_refs( - dataset, "history-4-override", item_refs_count=10, history_refs_counts=[2, 2] - ), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "desc"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 4 - - # Expected order: history-4-override (4), history-3 (3), item-level-2 (2), no-refs (0) - item_ids = [item.id for item in response_data.items] - ref_counts = [item.totalReferences for item in response_data.items] - - assert ref_counts == [4, 3, 2, 0], f"Expected [4, 3, 2, 0], got {ref_counts}" - assert item_ids[0] == "history-4-override" - assert item_ids[1] == "history-3" - assert item_ids[2] == "item-level-2" - assert item_ids[3] == "no-refs" - - -@pytest.mark.anyio -async def test_sort_by_total_references_stable_pagination( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Pagination is stable when sorting by totalReferences.""" - dataset = f"sort-refs-pagination-{uuid4().hex[:6]}" - - # Create 6 items: 2 with 3 refs, 2 with 1 ref, 2 with 0 refs - items = [ - make_item_with_refs(dataset, "item-3a", item_refs_count=3), - make_item_with_refs(dataset, "item-3b", item_refs_count=3), - make_item_with_refs(dataset, "item-1a", item_refs_count=1), - make_item_with_refs(dataset, "item-1b", item_refs_count=1), - make_item_with_refs(dataset, "item-0a", item_refs_count=0), - make_item_with_refs(dataset, "item-0b", item_refs_count=0), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Get all items on page 1 (limit 3) - res = await async_client.get( - "/v1/ground-truths", - params={ - "dataset": dataset, - "sortBy": "totalReferences", - "sortOrder": "desc", - "page": 1, - "limit": 3, - }, - headers=user_headers, - ) - assert res.status_code == 200 - page1 = GroundTruthListResponse.model_validate(res.json()) - - # Get page 2 - res = await async_client.get( - "/v1/ground-truths", - params={ - "dataset": dataset, - "sortBy": "totalReferences", - "sortOrder": "desc", - "page": 2, - "limit": 3, - }, - headers=user_headers, - ) - assert res.status_code == 200 - page2 = GroundTruthListResponse.model_validate(res.json()) - - assert len(page1.items) == 3 - assert len(page2.items) == 3 - - # Combine pages and verify 
no duplicates - all_ids = [item.id for item in page1.items] + [item.id for item in page2.items] - assert len(set(all_ids)) == 6, "All 6 items should appear exactly once across pages" - - # Verify page 1 has higher ref counts than page 2 - page1_refs = [item.totalReferences for item in page1.items] - page2_refs = [item.totalReferences for item in page2.items] - assert min(page1_refs) >= max(page2_refs), ( - f"Page 1 refs {page1_refs} should be >= page 2 refs {page2_refs}" - ) - - -@pytest.mark.anyio -async def test_sort_by_total_references_with_status_filter( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Sort by totalReferences works with status filter.""" - dataset = f"sort-refs-status-{uuid4().hex[:6]}" - - items = [ - make_item_with_refs(dataset, "draft-2", item_refs_count=2), - make_item_with_refs(dataset, "draft-5", item_refs_count=5), - make_item_with_refs(dataset, "approved-3", item_refs_count=3), - ] - - # Create all as draft - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Approve one item (approved-3) - res = await async_client.get(f"/v1/ground-truths/{dataset}", headers=user_headers) - assert res.status_code == 200 - all_items = res.json() - approved_item = next(i for i in all_items if i["id"] == "approved-3") - - res = await async_client.put( - f"/v1/ground-truths/{dataset}/{approved_item['bucket']}/approved-3", - headers={**user_headers, "If-Match": approved_item["_etag"]}, - json={"status": "approved"}, - ) - assert res.status_code == 200 - - # Filter by draft status and sort by totalReferences - res = await async_client.get( - "/v1/ground-truths", - params={ - "dataset": dataset, - "status": "draft", - "sortBy": "totalReferences", - "sortOrder": "desc", - }, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 2 - - # Only draft items (draft-5, draft-2) should be returned, sorted by refs - ref_counts = [item.totalReferences for item in response_data.items] - assert ref_counts == [5, 2], f"Expected [5, 2], got {ref_counts}" - - -@pytest.mark.anyio -async def test_sort_by_total_references_all_zero( - async_client: AsyncClient, user_headers: dict[str, str] -): - """Sort works when all items have 0 refs (stable by ID).""" - dataset = f"sort-refs-zeros-{uuid4().hex[:6]}" - - items = [ - make_item_with_refs(dataset, "item-c", item_refs_count=0), - make_item_with_refs(dataset, "item-a", item_refs_count=0), - make_item_with_refs(dataset, "item-b", item_refs_count=0), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "asc"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 3 - - # All have 0 refs - should be stable sorted by id ASC - ref_counts = [item.totalReferences for item in response_data.items] - assert ref_counts == [0, 0, 0] - - # Secondary sort by ID should apply - ids = [item.id for item in response_data.items] - assert ids == sorted(ids), f"Expected IDs sorted alphabetically as secondary sort, got {ids}" - - -@pytest.mark.anyio -async def test_sort_by_total_references_large_counts( - async_client: AsyncClient, user_headers: dict[str, str] -): - 
"""Sort handles items with many refs correctly.""" - dataset = f"sort-refs-large-{uuid4().hex[:6]}" - - items = [ - make_item_with_refs(dataset, "item-10", item_refs_count=10), - make_item_with_refs(dataset, "item-50", item_refs_count=50), - make_item_with_refs(dataset, "item-25", item_refs_count=25), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "desc"}, - headers=user_headers, - ) - assert res.status_code == 200 - - response_data = GroundTruthListResponse.model_validate(res.json()) - assert len(response_data.items) == 3 - - ref_counts = [item.totalReferences for item in response_data.items] - assert ref_counts == [50, 25, 10], f"Expected [50, 25, 10], got {ref_counts}" - - -@pytest.mark.anyio -async def test_sort_by_total_references_after_update( - async_client: AsyncClient, user_headers: dict[str, str] -): - """totalReferences is recalculated on update and sort reflects changes.""" - dataset = f"sort-refs-update-{uuid4().hex[:6]}" - - items = [ - make_item_with_refs(dataset, "item-to-update", item_refs_count=1), - make_item_with_refs(dataset, "item-static", item_refs_count=3), - ] - - res = await async_client.post("/v1/ground-truths", json=items, headers=user_headers) - assert res.status_code == 200 - - # Initial sort - item-static should be first (3 refs vs 1 ref) - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "desc"}, - headers=user_headers, - ) - assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert response_data.items[0].id == "item-static" - - # Get the item to update - res = await async_client.get(f"/v1/ground-truths/{dataset}", headers=user_headers) - assert res.status_code == 200 - all_items = res.json() - item_to_update = next(i for i in all_items if i["id"] == "item-to-update") - - # Update item-to-update to have 5 refs (more than item-static's 3) - res = await async_client.put( - f"/v1/ground-truths/{dataset}/{item_to_update['bucket']}/item-to-update", - headers={**user_headers, "If-Match": item_to_update["_etag"]}, - json={ - "refs": [ - {"url": "https://example.com/new-ref-1"}, - {"url": "https://example.com/new-ref-2"}, - {"url": "https://example.com/new-ref-3"}, - {"url": "https://example.com/new-ref-4"}, - {"url": "https://example.com/new-ref-5"}, - ] - }, - ) - assert res.status_code == 200 - - # After update - item-to-update should now be first (5 refs vs 3 refs) - res = await async_client.get( - "/v1/ground-truths", - params={"dataset": dataset, "sortBy": "totalReferences", "sortOrder": "desc"}, - headers=user_headers, - ) - assert res.status_code == 200 - response_data = GroundTruthListResponse.model_validate(res.json()) - assert response_data.items[0].id == "item-to-update" - assert response_data.items[0].totalReferences == 5 - - -@pytest.mark.anyio -async def test_invalid_sort_field_returns_422( - async_client: AsyncClient, user_headers: dict[str, str] -): - """API returns 400 for invalid sortBy value.""" - res = await async_client.get( - "/v1/ground-truths", - params={"sortBy": "invalidField"}, - headers=user_headers, - ) - assert res.status_code == 422 - - data = res.json() - detail = data.get("detail") - - assert any( - ("sortby" in " ".join(map(str, err.get("loc", []))).lower()) - or ("sortby" in err.get("msg", "").lower()) - 
for err in detail
-    ), f"Expected 'sortBy' in validation detail, got: {detail}"
diff --git a/backend/tests/integration/test_recompute_tags.py b/backend/tests/integration/test_recompute_tags.py
index a263a51..ffb6365 100644
--- a/backend/tests/integration/test_recompute_tags.py
+++ b/backend/tests/integration/test_recompute_tags.py
@@ -12,7 +12,9 @@ def make_item(dataset: str, status: str = "draft") -> dict:
         "id": str(uuid.uuid4()),
         "datasetName": dataset,
         "bucket": str(uuid.UUID("00000000-0000-0000-0000-000000000000")),
-        "synthQuestion": "What is the capital of France?",
+        "history": [
+            {"role": "user", "msg": "What is the capital of France?"},
+        ],
         "status": status,
     }
 
diff --git a/backend/tests/integration/test_sample_unassigned_allocation.py b/backend/tests/integration/test_sample_unassigned_allocation.py
index cdca54f..bdc9099 100644
--- a/backend/tests/integration/test_sample_unassigned_allocation.py
+++ b/backend/tests/integration/test_sample_unassigned_allocation.py
@@ -13,10 +13,9 @@ def make_item(dataset: str) -> dict[str, Any]:
         "datasetName": dataset,
         "bucket": "00000000-0000-0000-0000-000000000000",
         "status": "draft",
-        "samplingBucket": 0,
-        "synthQuestion": "Q?",
-        "answer": None,
-        "refs": [],
+        "history": [
+            {"role": "user", "msg": "Q?"},
+        ],
         "manualTags": ["source:synthetic", "split:validation"],
     }
 
diff --git a/backend/tests/integration/test_snapshot_artifacts_cosmos.py b/backend/tests/integration/test_snapshot_artifacts_cosmos.py
index 79de2c0..91643a8 100644
--- a/backend/tests/integration/test_snapshot_artifacts_cosmos.py
+++ b/backend/tests/integration/test_snapshot_artifacts_cosmos.py
@@ -24,9 +24,10 @@ def make_item(dataset: str, item_id: str) -> dict[str, Any]:
         "id": item_id,
         "datasetName": dataset,
         "bucket": "00000000-0000-0000-0000-000000000000",
-        "synthQuestion": "Q?",
-        "answer": "A",
-        "refs": [],
+        "history": [
+            {"role": "user", "msg": "Q?"},
+            {"role": "assistant", "msg": "A"},
+        ],
         "manualTags": ["source:synthetic", "topic:general"],
     }
 
diff --git a/backend/tests/test_helpers.py b/backend/tests/test_helpers.py
index d725ab7..244eed6 100644
--- a/backend/tests/test_helpers.py
+++ b/backend/tests/test_helpers.py
@@ -1,7 +1,8 @@
 """Test helpers for creating AgenticGroundTruthEntry fixtures.
 
-After Phase 6: canonical state is history[]; question/answer/refs are derived
-from history or stored in plugins["rag-compat"].
+After Phase 6: canonical state is history[]; question/answer are derived from
+history, and plugin-owned reference compatibility data lives in
+plugins["rag-compat"].data.references.
 """
 
 from __future__ import annotations
@@ -34,12 +35,12 @@ def make_test_entry(
         id: Item ID (default: "test-item")
         dataset_name: Dataset name (default: "test-dataset")
         status: Item status (default: draft)
-        history: Explicit history array. If None and synth_question/answer provided,
-            a simple Q&A history will be auto-generated.
-        synth_question: Question text (stored in rag-compat plugin)
-        edited_question: Edited question text (stored in rag-compat plugin)
-        answer: Answer text (stored in rag-compat plugin)
-        refs: References (stored in rag-compat plugin)
+        history: Explicit history array. If None and question/answer inputs are provided,
+            a simple Q&A history will be auto-generated.
+        synth_question: Fallback question text used when edited_question is absent
+        edited_question: Preferred question text for generated history
+        answer: Answer text used for generated history
+        refs: References stored in rag-compat plugin data
         manual_tags: Manual tags list
         comment: Item comment
         reviewed_at: Review timestamp
@@ -92,25 +93,23 @@ def make_test_entry(
     if history is not None:
         # Use explicit history
         payload["history"] = history
-    elif synth_question or answer:
+    elif edited_question or synth_question or answer:
         # Auto-generate simple Q&A history from legacy-style params
         auto_history: list[dict[str, Any]] = []
-        if synth_question:
-            auto_history.append({"role": "user", "msg": synth_question})
+        question = edited_question or synth_question
+        if question:
+            auto_history.append({"role": "user", "msg": question})
         if answer:
             auto_history.append({"role": "assistant", "msg": answer})
         payload["history"] = auto_history
 
-    # Build rag-compat plugin data if any legacy fields are provided
+    # Build rag-compat plugin data when references are provided
    rag_compat_data: dict[str, Any] = {}
-    if synth_question is not None:
-        rag_compat_data["synthQuestion"] = synth_question
-    if edited_question is not None:
-        rag_compat_data["editedQuestion"] = edited_question
-    if answer is not None:
-        rag_compat_data["answer"] = answer
     if refs is not None:
-        rag_compat_data["refs"] = refs
+        rag_compat_data["references"] = [
+            ref.model_dump(by_alias=True, exclude_none=True) if hasattr(ref, "model_dump") else ref
+            for ref in refs
+        ]
 
     if rag_compat_data:
         payload["plugins"] = {
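The docstring above describes the Phase 6 shape in prose; the sketch below makes it concrete. It is a standalone illustration under assumed shapes, not the repo's implementation: the entry dict mirrors the fixtures in this diff, and the two helpers are hypothetical stand-ins for question_text_from_item / answer_text_from_item in app.domain.conversation_fields (last user turn wins for the question, last non-user turn for the answer).

from typing import Any

# Illustrative Phase 6 layout (assumed, not the repo's model classes):
# history[] holds the conversation; reference compatibility data lives
# under plugins["rag-compat"].data.references.
entry: dict[str, Any] = {
    "id": "example-item",
    "datasetName": "example-dataset",
    "history": [
        {"role": "user", "msg": "How do I reset my password?"},
        {"role": "assistant", "msg": "Use the reset link"},
    ],
    "plugins": {
        "rag-compat": {
            "data": {
                "references": [
                    # messageIndex ties the reference to the history turn that produced it
                    {"url": "https://example.com/kb/reset", "messageIndex": 1},
                ]
            }
        }
    },
}


def question_text(item: dict[str, Any]) -> str | None:
    """Illustrative: the question is the last user turn in history."""
    user_msgs = [
        t["msg"] for t in item.get("history", [])
        if t.get("role", "").strip().lower() == "user"
    ]
    return user_msgs[-1] if user_msgs else None


def answer_text(item: dict[str, Any]) -> str | None:
    """Illustrative: the answer is the last non-user turn (assistant, planner, ...)."""
    other_msgs = [
        t["msg"] for t in item.get("history", [])
        if t.get("role", "").strip().lower() != "user"
    ]
    return other_msgs[-1] if other_msgs else None


assert question_text(entry) == "How do I reset my password?"
assert answer_text(entry) == "Use the reset link"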
whitespace.""" plugin = NoAnswerPlugin() - item = AgenticGroundTruthEntry( - id="test", datasetName="test", synthQuestion="Q", answer=" NO_ANSWER " + item = make_test_entry( + id="test", dataset_name="test", synth_question="Q", answer=" NO_ANSWER " ) assert plugin.compute(item) == "answer:no_answer" def test_no_answer_with_newlines(self): """Should return tag when answer is NO_ANSWER with newlines.""" plugin = NoAnswerPlugin() - item = AgenticGroundTruthEntry( - id="test", datasetName="test", synthQuestion="Q", answer="\nNO_ANSWER\n" + item = make_test_entry( + id="test", dataset_name="test", synth_question="Q", answer="\nNO_ANSWER\n" ) assert plugin.compute(item) == "answer:no_answer" def test_regular_answer_returns_none(self): """Should return None for regular answers.""" plugin = NoAnswerPlugin() - item = AgenticGroundTruthEntry( - id="test", datasetName="test", synthQuestion="Q", answer="A valid answer" + item = make_test_entry( + id="test", dataset_name="test", synth_question="Q", answer="A valid answer" ) assert plugin.compute(item) is None def test_none_answer_returns_none(self): """Should return None when answer is None.""" plugin = NoAnswerPlugin() - item = AgenticGroundTruthEntry( - id="test", datasetName="test", synthQuestion="Q", answer=None - ) + item = make_test_entry(id="test", dataset_name="test", synth_question="Q", answer=None) assert plugin.compute(item) is None def test_empty_answer_returns_none(self): """Should return None when answer is empty string.""" plugin = NoAnswerPlugin() - item = AgenticGroundTruthEntry(id="test", datasetName="test", synthQuestion="Q", answer="") + item = make_test_entry(id="test", dataset_name="test", synth_question="Q", answer="") assert plugin.compute(item) is None diff --git a/backend/tests/unit/plugins/test_plugin_question_length.py b/backend/tests/unit/plugins/test_plugin_question_length.py index d740fb9..a2b997f 100644 --- a/backend/tests/unit/plugins/test_plugin_question_length.py +++ b/backend/tests/unit/plugins/test_plugin_question_length.py @@ -4,12 +4,12 @@ import pytest -from app.domain.models import AgenticGroundTruthEntry from app.plugins.computed_tags.question_length import ( QuestionLengthLongPlugin, QuestionLengthMediumPlugin, QuestionLengthShortPlugin, ) +from tests.test_helpers import make_test_entry class TestQuestionLengthPlugins: @@ -31,11 +31,7 @@ def test_mutually_exclusive_classification( ): """Each document gets exactly one length tag.""" question = " ".join([f"word{i}" for i in range(word_count)]) - item = AgenticGroundTruthEntry( - id="test-id", - datasetName="test-dataset", - synthQuestion=question, - ) + item = make_test_entry(id="test-id", dataset_name="test-dataset", synth_question=question) short_plugin = QuestionLengthShortPlugin() medium_plugin = QuestionLengthMediumPlugin() @@ -55,11 +51,11 @@ def test_mutually_exclusive_classification( def test_edited_question_takes_precedence(self): """editedQuestion is used over synthQuestion when present.""" - item = AgenticGroundTruthEntry( + item = make_test_entry( id="test-id", - datasetName="test-dataset", - synthQuestion="short", # 1 word - editedQuestion=" ".join([f"word{i}" for i in range(35)]), # 35 words -> long + dataset_name="test-dataset", + synth_question="short", # 1 word + edited_question=" ".join([f"word{i}" for i in range(35)]), # 35 words -> long ) assert QuestionLengthLongPlugin().compute(item) == "question_length:long" diff --git a/backend/tests/unit/plugins/test_plugin_reference_type.py b/backend/tests/unit/plugins/test_plugin_reference_type.py 
index 8d7dfd8..dc954fd 100644
--- a/backend/tests/unit/plugins/test_plugin_reference_type.py
+++ b/backend/tests/unit/plugins/test_plugin_reference_type.py
@@ -4,13 +4,14 @@
 
 import pytest
 
-from app.domain.models import AgenticGroundTruthEntry, Reference
+from app.domain.models import Reference
 from app.plugins.computed_tags.reference_type import (
     ReferenceTypeArticlePlugin,
     ReferenceTypeHelpcenterPlugin,
     _is_article_url,
     _is_helpcenter_url,
 )
+from tests.test_helpers import make_test_entry
 
 
 class TestUrlPatternDetection:
@@ -48,20 +49,18 @@ class TestReferenceTypePlugins:
 
     def test_no_refs_gets_no_tags(self):
         """Item with no refs should get neither tag."""
-        item = AgenticGroundTruthEntry(
-            id="test-no-refs",
-            datasetName="test-dataset",
-            synthQuestion="Question",
+        item = make_test_entry(
+            id="test-no-refs", dataset_name="test-dataset", synth_question="Question"
         )
 
         assert ReferenceTypeArticlePlugin().compute(item) is None
         assert ReferenceTypeHelpcenterPlugin().compute(item) is None
 
     def test_item_can_have_both_tags(self):
         """Item with both reference types should get both tags."""
-        item = AgenticGroundTruthEntry(
+        item = make_test_entry(
             id="test-both",
-            datasetName="test-dataset",
-            synthQuestion="Question",
+            dataset_name="test-dataset",
+            synth_question="Question",
             refs=[
                 Reference(url="https://docs.example.com/support/article/CS431120"),
                 Reference(url="https://support.example.com/help/product/page.html"),
@@ -72,10 +71,10 @@ def test_item_can_have_both_tags(self):
 
     def test_type_field_is_ignored(self):
         """Only URL matters, not the type field on Reference."""
-        item = AgenticGroundTruthEntry(
+        item = make_test_entry(
             id="test-type-ignored",
-            datasetName="test-dataset",
-            synthQuestion="Question",
+            dataset_name="test-dataset",
+            synth_question="Question",
             refs=[Reference(url="https://example.com/page", type="article")],
         )
         # URL doesn't match article pattern, so no tag even though type="article"
diff --git a/backend/tests/unit/plugins/test_plugin_retrieval_behavior.py b/backend/tests/unit/plugins/test_plugin_retrieval_behavior.py
index 3439b15..f94f847 100644
--- a/backend/tests/unit/plugins/test_plugin_retrieval_behavior.py
+++ b/backend/tests/unit/plugins/test_plugin_retrieval_behavior.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-from app.domain.models import AgenticGroundTruthEntry, Reference, HistoryItem
+from app.domain.models import Reference
 from app.domain.enums import HistoryItemRole
 from app.plugins.computed_tags.retrieval_behavior import (
     RetrievalBehaviorNoRefsPlugin,
@@ -12,6 +12,7 @@
     RetrievalBehaviorTwoRefsPlugin,
     RetrievalBehaviorRichPlugin,
 )
+from tests.test_helpers import make_test_entry
 
 
 class TestRetrievalBehaviorPlugins:
@@ -30,10 +31,10 @@ class TestRetrievalBehaviorPlugins:
     )
     def test_mutually_exclusive_classification(self, num_refs, expected_tag):
         """Each document gets exactly one retrieval behavior tag."""
-        item = AgenticGroundTruthEntry(
+        item = make_test_entry(
             id=f"test-{num_refs}-refs",
-            datasetName="test-dataset",
-            synthQuestion="Question",
+            dataset_name="test-dataset",
+            synth_question="Question",
             refs=[Reference(url=f"https://example.com/doc{i}") for i in range(num_refs)],
         )
 
@@ -51,27 +52,21 @@ def test_mutually_exclusive_classification(self, num_refs, expected_tag):
         assert non_none[0] == expected_tag
 
     def test_refs_in_history_are_counted(self):
-        """References in history turns are included in the count."""
-        item = AgenticGroundTruthEntry(
+        """Canonical plugin references with turn ownership are included in the count."""
+        item = make_test_entry(
id="test-history-refs", - datasetName="test-dataset", - synthQuestion="Follow up question", + dataset_name="test-dataset", + synth_question="Follow up question", history=[ - HistoryItem(role=HistoryItemRole.user, msg="First question"), - HistoryItem( - role=HistoryItemRole.assistant, - msg="First answer", - refs=[ - Reference(url="https://example.com/doc1"), - Reference(url="https://example.com/doc2"), - ], - ), - HistoryItem(role=HistoryItemRole.user, msg="Second question"), - HistoryItem( - role=HistoryItemRole.assistant, - msg="Second answer", - refs=[Reference(url="https://example.com/doc3")], - ), + {"role": HistoryItemRole.user, "msg": "First question"}, + {"role": HistoryItemRole.assistant, "msg": "First answer"}, + {"role": HistoryItemRole.user, "msg": "Second question"}, + {"role": HistoryItemRole.assistant, "msg": "Second answer"}, + ], + refs=[ + Reference(url="https://example.com/doc1", messageIndex=1), + Reference(url="https://example.com/doc2", messageIndex=1), + Reference(url="https://example.com/doc3", messageIndex=3), ], ) # 3 refs total in history -> rich diff --git a/backend/tests/unit/plugins/test_plugin_turns.py b/backend/tests/unit/plugins/test_plugin_turns.py index c09d372..e2b8833 100644 --- a/backend/tests/unit/plugins/test_plugin_turns.py +++ b/backend/tests/unit/plugins/test_plugin_turns.py @@ -4,9 +4,10 @@ import pytest -from app.domain.models import AgenticGroundTruthEntry, HistoryItem +from app.domain.models import HistoryItem from app.domain.enums import HistoryItemRole from app.plugins.computed_tags.turns import MultiTurnPlugin, SingleTurnPlugin +from tests.test_helpers import make_test_entry class TestTurnsPlugins: @@ -36,10 +37,10 @@ def test_mutually_exclusive_classification(self, history_len, expected_single, e else None ) - item = AgenticGroundTruthEntry( + item = make_test_entry( id="test-id", - datasetName="test-dataset", - synthQuestion="Question", + dataset_name="test-dataset", + synth_question="Question", history=history, ) diff --git a/backend/tests/unit/test_assignments_skip_persist.py b/backend/tests/unit/test_assignments_skip_persist.py index 3294aa4..ffa582e 100644 --- a/backend/tests/unit/test_assignments_skip_persist.py +++ b/backend/tests/unit/test_assignments_skip_persist.py @@ -129,7 +129,7 @@ async def test_status_skipped_keeps_assignment(async_client, user_headers): id=item_id, datasetName=dataset, bucket=bucket, - synthQuestion="Q?", + history=[{"role": "user", "msg": "Q?"}], status=GroundTruthStatus.draft, assignedTo=user_headers["X-User-Id"], assignedAt=assigned_at, diff --git a/backend/tests/unit/test_bulk_import_tag_validation.py b/backend/tests/unit/test_bulk_import_tag_validation.py index a716976..3699b50 100644 --- a/backend/tests/unit/test_bulk_import_tag_validation.py +++ b/backend/tests/unit/test_bulk_import_tag_validation.py @@ -1,7 +1,8 @@ import pytest from unittest.mock import AsyncMock, MagicMock, patch -from app.domain.models import AgenticGroundTruthEntry, BulkImportResult +from app.domain.models import BulkImportResult from app.core.auth import UserContext +from tests.test_helpers import make_test_entry @pytest.fixture @@ -31,11 +32,11 @@ async def test_bulk_import_validates_tags(mock_container, mock_user): ) items = [ - AgenticGroundTruthEntry( + make_test_entry( id="test-1", - datasetName="test", - synthQuestion="What is Q?", - manualTags=["source:synthetic"], + dataset_name="test", + synth_question="What is Q?", + manual_tags=["source:synthetic"], ) ] @@ -65,8 +66,11 @@ async def 
test_bulk_import_rejects_invalid_tags(mock_container, mock_user): ) items = [ - AgenticGroundTruthEntry( - id="test-1", datasetName="test", synthQuestion="What is Q?", manualTags=["invalid:tag"] + make_test_entry( + id="test-1", + dataset_name="test", + synth_question="What is Q?", + manual_tags=["invalid:tag"], ) ] @@ -102,17 +106,17 @@ async def test_bulk_import_mixed_valid_invalid_tags(mock_container, mock_user): ) items = [ - AgenticGroundTruthEntry( + make_test_entry( id="test-1", - datasetName="test", - synthQuestion="Q1?", - manualTags=["source:synthetic"], # valid + dataset_name="test", + synth_question="Q1?", + manual_tags=["source:synthetic"], # valid ), - AgenticGroundTruthEntry( + make_test_entry( id="test-2", - datasetName="test", - synthQuestion="Q2?", - manualTags=["invalid:tag"], # invalid + dataset_name="test", + synth_question="Q2?", + manual_tags=["invalid:tag"], # invalid ), ] @@ -149,11 +153,11 @@ async def test_bulk_import_no_tags(mock_container, mock_user): ) items = [ - AgenticGroundTruthEntry( + make_test_entry( id="test-1", - datasetName="test", - synthQuestion="What is Q?", - manualTags=[], # no tags + dataset_name="test", + synth_question="What is Q?", + manual_tags=[], # no tags ) ] @@ -183,11 +187,11 @@ async def test_bulk_import_tag_validation_single_registry_fetch(mock_container, ) items = [ - AgenticGroundTruthEntry( + make_test_entry( id=f"test-{i}", - datasetName="test", - synthQuestion=f"Q{i}?", - manualTags=["source:synthetic"], + dataset_name="test", + synth_question=f"Q{i}?", + manual_tags=["source:synthetic"], ) for i in range(10) ] diff --git a/backend/tests/unit/test_computed_tags_plugins.py b/backend/tests/unit/test_computed_tags_plugins.py index 630dcb0..9bfecd7 100644 --- a/backend/tests/unit/test_computed_tags_plugins.py +++ b/backend/tests/unit/test_computed_tags_plugins.py @@ -26,7 +26,11 @@ class TestTagPluginRegistry: def test_empty_registry_returns_empty_tags(self): """An empty registry should return no tags.""" registry = TagPluginRegistry() - item = AgenticGroundTruthEntry(id="test", datasetName="test", synthQuestion="Q") + item = AgenticGroundTruthEntry( + id="test", + datasetName="test", + history=[{"role": "user", "msg": "Q"}], + ) assert registry.compute_all(item) == [] assert registry.get_all_keys() == set() @@ -207,7 +211,7 @@ def test_computed_and_manual_tags_merge(self): item = AgenticGroundTruthEntry( id="merge-test", datasetName="test-dataset", - synthQuestion="Test question", + history=[{"role": "user", "msg": "Test question"}], manualTags=["source:manual", "priority:high"], computedTags=["turns:singleturn"], ) diff --git a/backend/tests/unit/test_conversation_fields.py b/backend/tests/unit/test_conversation_fields.py new file mode 100644 index 0000000..39d2c5e --- /dev/null +++ b/backend/tests/unit/test_conversation_fields.py @@ -0,0 +1,36 @@ +from app.domain.conversation_fields import ( + answer_text_from_item, + is_non_user_role, + is_user_role, + question_text_from_item, +) +from app.domain.models import AgenticGroundTruthEntry + + +def test_role_helpers_use_strict_user_semantics(): + assert is_user_role("user") + assert is_user_role(" User ") + assert not is_user_role("assistant") + assert not is_user_role("planner") + + assert not is_non_user_role("user") + assert is_non_user_role("assistant") + assert is_non_user_role("planner") + + +def test_question_and_answer_derivation_follow_user_vs_non_user_contract(): + item = AgenticGroundTruthEntry.model_validate( + { + "id": "item-role-derivation", + "datasetName": "demo", + 
"history": [ + {"role": "user", "msg": "Initial question"}, + {"role": "planner", "msg": "Draft answer"}, + {"role": "user", "msg": "Follow-up"}, + {"role": "assistant", "msg": "Final answer"}, + ], + } + ) + + assert question_text_from_item(item) == "Follow-up" + assert answer_text_from_item(item) == "Final answer" diff --git a/backend/tests/unit/test_cosmos_repo.py b/backend/tests/unit/test_cosmos_repo.py index 0647bf7..23c2f12 100644 --- a/backend/tests/unit/test_cosmos_repo.py +++ b/backend/tests/unit/test_cosmos_repo.py @@ -5,7 +5,13 @@ import pytest # type: ignore[import-not-found] -from app.adapters.repos.cosmos_repo import CosmosGroundTruthRepo, SELECT_CLAUSE_C +from app.adapters.repos.cosmos_repo import ( + CosmosGroundTruthRepo, + SELECT_CLAUSE_C, + _normalize_unicode_for_cosmos, + _restore_unicode_from_cosmos, +) +from app.plugins.pack_registry import get_rag_compat_pack from app.domain.enums import GroundTruthStatus, SortField, SortOrder from app.domain.models import AgenticGroundTruthEntry from tests.test_helpers import make_test_entry @@ -104,6 +110,66 @@ def test_resolve_sort_with_overrides(repo: CosmosGroundTruthRepo) -> None: assert direction is SortOrder.asc +def test_emulator_unicode_normalization_encodes_canonical_reference_content(monkeypatch) -> None: + monkeypatch.setattr( + "app.adapters.repos.cosmos_repo.settings.COSMOS_DISABLE_UNICODE_ESCAPE", True + ) + + original_content = r"Snippet with invalid escape \q and unicode \u2603" + payload = { + "plugins": { + "rag-compat": { + "data": { + "references": [ + { + "url": "https://example.com/canonical", + "content": original_content, + } + ] + } + } + } + } + + normalized = _normalize_unicode_for_cosmos(payload) + ref = normalized["plugins"]["rag-compat"]["data"]["references"][0] + + assert ref.get("_contentEncoded") is True + assert ref["content"] != original_content + + restored = _restore_unicode_from_cosmos(normalized) + restored_ref = restored["plugins"]["rag-compat"]["data"]["references"][0] + assert restored_ref["content"] == original_content + assert "_contentEncoded" not in restored_ref + + +def test_emulator_unicode_normalization_does_not_base64_encode_legacy_refs(monkeypatch) -> None: + monkeypatch.setattr( + "app.adapters.repos.cosmos_repo.settings.COSMOS_DISABLE_UNICODE_ESCAPE", True + ) + + original_content = r"Legacy snippet with invalid escape \q" + payload = { + "history": [ + { + "role": "assistant", + "msg": "Answer", + "refs": [{"url": "https://example.com/legacy", "content": original_content}], + } + ] + } + + normalized = _normalize_unicode_for_cosmos(payload) + ref = normalized["history"][0]["refs"][0] + assert "_contentEncoded" not in ref + assert ref["content"] != original_content + + restored = _restore_unicode_from_cosmos(normalized) + restored_ref = restored["history"][0]["refs"][0] + assert restored_ref["content"] == original_content + assert "_contentEncoded" not in restored_ref + + def test_sort_key_has_answer(repo: CosmosGroundTruthRepo) -> None: example = make_test_entry( id="item", @@ -134,28 +200,17 @@ def test_select_clause_includes_generic_phase_one_fields() -> None: # ============================================================================= -# Tests for totalReferences auto-computation (domain model validator) +# Reference-count semantics via rag-compat pack helpers # ============================================================================= class TestComputeTotalReferences: - """Unit tests for AgenticGroundTruthEntry.totalReferences computation. 
- - The property calculates total references with the following logic: - - If history has refs, count only history refs (history takes priority) - - If history has no refs, count plugin-stored refs as fallback - - **Phase 5 Audit (2026-03-12)**: ACTIVE COMPUTATION LOGIC - BLOCKING - The totalReferences field has active property logic that computes - values from history and plugin refs. This is not just compatibility - testing - it's core functionality that is used by: - - Model validation on all item saves - - Sort/filter operations that check reference counts - - UI displays of reference totals - - Cannot delete totalReferences until this computation is either: - - Moved to a computed property on AgenticGroundTruthEntry, OR - - Replaced by direct history ref counting in callers + """Unit tests for rag-compat reference_count behavior. + + These tests exercise reference counting through + ``get_rag_compat_pack().reference_count(item)`` using compatibility + payload shapes seeded in fixtures. The host model no longer owns + ``totalReferences`` behavior. """ def _make_item( @@ -164,56 +219,60 @@ def _make_item( history: list[dict] | None = None, ) -> AgenticGroundTruthEntry: """Helper to create an AgenticGroundTruthEntry with specified refs and history.""" + normalized_history = history + if history is not None: + normalized_history = [] + for turn in history: + turn_copy = dict(turn) + if turn_copy.get("refs") in (None, []): + turn_copy.pop("refs", None) + normalized_history.append(turn_copy) return make_test_entry( id="test-item", dataset_name="test-dataset", synth_question="Test question?", refs=refs, - history=history, + history=normalized_history, ) # ------------------------------------------------------------------------- - # History refs take priority over item refs + # Compat-reference counting with conversation history present # ------------------------------------------------------------------------- def test_history_refs_take_priority_over_item_refs(self) -> None: - """When history has refs, only history refs are counted (item refs ignored).""" + """Compat refs are counted even when conversation history is present.""" item = self._make_item( refs=[{"url": "https://item-ref-1.com"}, {"url": "https://item-ref-2.com"}], history=[ {"role": "user", "msg": "Hello"}, - {"role": "assistant", "msg": "Hi", "refs": [{"url": "https://history-ref.com"}]}, + {"role": "assistant", "msg": "Hi"}, ], ) - # totalReferences is auto-computed by model_validator - # Should count only history refs (1), not item refs (2) - assert item.totalReferences == 1 + assert get_rag_compat_pack().reference_count(item) == 2 def test_history_refs_from_multiple_turns(self) -> None: - """Refs from all history turns are summed.""" + """Compat refs are counted correctly with multi-turn history present.""" item = self._make_item( - refs=[{"url": "https://ignored.com"}], + refs=[ + {"url": "https://ref1.com"}, + {"url": "https://ref2.com"}, + {"url": "https://ref3.com"}, + ], history=[ {"role": "user", "msg": "Q1"}, - { - "role": "assistant", - "msg": "A1", - "refs": [{"url": "https://ref1.com"}, {"url": "https://ref2.com"}], - }, + {"role": "assistant", "msg": "A1"}, {"role": "user", "msg": "Q2"}, - {"role": "assistant", "msg": "A2", "refs": [{"url": "https://ref3.com"}]}, + {"role": "assistant", "msg": "A2"}, ], ) - # totalReferences is auto-computed by model_validator - # Should count all history refs: 2 + 1 = 3 - assert item.totalReferences == 3 + assert get_rag_compat_pack().reference_count(item) == 3 # 
------------------------------------------------------------------------- - # Item refs used when no history refs exist + # Compat-reference fallback when history contributes no refs # ------------------------------------------------------------------------- def test_item_refs_fallback_when_no_history(self) -> None: - """Item refs are counted when there is no history.""" + """Plugin-owned compat refs are counted when there is no history.""" item = self._make_item( refs=[ {"url": "https://ref1.com"}, @@ -222,18 +281,18 @@ def test_item_refs_fallback_when_no_history(self) -> None: ], history=None, ) - assert item.totalReferences == 3 + assert get_rag_compat_pack().reference_count(item) == 3 def test_item_refs_fallback_when_history_empty(self) -> None: - """Item refs are counted when history is an empty list.""" + """Plugin-owned compat refs are counted when history is an empty list.""" item = self._make_item( refs=[{"url": "https://ref1.com"}, {"url": "https://ref2.com"}], history=[], ) - assert item.totalReferences == 2 + assert get_rag_compat_pack().reference_count(item) == 2 def test_item_refs_fallback_when_history_has_no_refs(self) -> None: - """Item refs are counted when history exists but contains no refs.""" + """Plugin-owned compat refs are counted when history exists but contains no refs.""" item = self._make_item( refs=[{"url": "https://item-ref.com"}], history=[ @@ -241,11 +300,11 @@ def test_item_refs_fallback_when_history_has_no_refs(self) -> None: {"role": "assistant", "msg": "Hi"}, # No refs ], ) - # History has 0 refs, so item refs (1) should be used - assert item.totalReferences == 1 + # History contributes 0 compat refs, so top-level compat refs (1) are used + assert get_rag_compat_pack().reference_count(item) == 1 def test_item_refs_fallback_when_history_refs_are_empty_lists(self) -> None: - """Item refs are counted when history refs are empty lists.""" + """Plugin-owned compat refs are counted when history refs are empty lists.""" item = self._make_item( refs=[{"url": "https://item-ref.com"}], history=[ @@ -253,27 +312,27 @@ def test_item_refs_fallback_when_history_refs_are_empty_lists(self) -> None: {"role": "assistant", "msg": "Hi", "refs": []}, # Empty refs list ], ) - # History refs total is 0, so item refs (1) should be used - assert item.totalReferences == 1 + # History compat refs total is 0, so top-level compat refs (1) are used + assert get_rag_compat_pack().reference_count(item) == 1 # ------------------------------------------------------------------------- - # Handle empty/null refs and history + # Handle empty/null compat refs and history # ------------------------------------------------------------------------- def test_zero_when_no_refs_anywhere(self) -> None: """Returns 0 when there are no refs at any level.""" item = self._make_item(refs=None, history=None) - assert item.totalReferences == 0 + assert get_rag_compat_pack().reference_count(item) == 0 def test_zero_when_empty_refs_and_no_history(self) -> None: """Returns 0 when refs is empty list and no history.""" item = self._make_item(refs=[], history=None) - assert item.totalReferences == 0 + assert get_rag_compat_pack().reference_count(item) == 0 def test_zero_when_empty_refs_and_empty_history(self) -> None: """Returns 0 when refs is empty and history is empty list.""" item = self._make_item(refs=[], history=[]) - assert item.totalReferences == 0 + assert get_rag_compat_pack().reference_count(item) == 0 def test_handles_none_refs_in_history_turn(self) -> None: """Handles history turns where refs is 
explicitly None.""" @@ -284,61 +343,56 @@ def test_handles_none_refs_in_history_turn(self) -> None: {"role": "assistant", "msg": "Hi", "refs": None}, # Explicitly None ], ) - # History refs is 0, fallback to item refs - assert item.totalReferences == 1 + # History compat refs is 0, so top-level compat refs are used + assert get_rag_compat_pack().reference_count(item) == 1 # ------------------------------------------------------------------------- # Complex scenarios with partial data # ------------------------------------------------------------------------- def test_mixed_history_some_turns_with_refs_some_without(self) -> None: - """History with mix of turns with and without refs.""" + """Compat refs are counted with mixed multi-turn history.""" item = self._make_item( - refs=[{"url": "https://ignored.com"}], + refs=[ + {"url": "https://ref1.com"}, + {"url": "https://ref2.com"}, + {"url": "https://ref3.com"}, + ], history=[ {"role": "user", "msg": "Q1"}, {"role": "assistant", "msg": "A1"}, # No refs {"role": "user", "msg": "Q2"}, - {"role": "assistant", "msg": "A2", "refs": [{"url": "https://ref1.com"}]}, + {"role": "assistant", "msg": "A2"}, {"role": "user", "msg": "Q3"}, - {"role": "assistant", "msg": "A3", "refs": None}, # Explicitly None + {"role": "assistant", "msg": "A3"}, {"role": "user", "msg": "Q4"}, - { - "role": "assistant", - "msg": "A4", - "refs": [{"url": "https://ref2.com"}, {"url": "https://ref3.com"}], - }, + {"role": "assistant", "msg": "A4"}, ], ) - # History refs: 0 + 1 + 0 + 2 = 3 - assert item.totalReferences == 3 + assert get_rag_compat_pack().reference_count(item) == 3 def test_user_turns_with_refs_are_counted(self) -> None: - """Refs on user turns are also counted (not just assistant turns).""" + """Compat refs are counted regardless of turn roles.""" item = self._make_item( - refs=[{"url": "https://ignored.com"}], + refs=[{"url": "https://user-ref.com"}, {"url": "https://assistant-ref.com"}], history=[ - {"role": "user", "msg": "Here's a doc", "refs": [{"url": "https://user-ref.com"}]}, - { - "role": "assistant", - "msg": "Thanks", - "refs": [{"url": "https://assistant-ref.com"}], - }, + {"role": "user", "msg": "Here's a doc"}, + {"role": "assistant", "msg": "Thanks"}, ], ) - # Both user and assistant refs are counted: 1 + 1 = 2 - assert item.totalReferences == 2 + assert get_rag_compat_pack().reference_count(item) == 2 def test_many_refs_in_single_turn(self) -> None: - """Handles turns with many references.""" + """Handles many compatibility refs.""" many_refs = [{"url": f"https://ref{i}.com"} for i in range(10)] item = self._make_item( + refs=many_refs, history=[ {"role": "user", "msg": "Q"}, - {"role": "assistant", "msg": "A", "refs": many_refs}, + {"role": "assistant", "msg": "A"}, ], ) - assert item.totalReferences == 10 + assert get_rag_compat_pack().reference_count(item) == 10 def test_item_only_no_history_field_at_all(self) -> None: """Item created without history field entirely.""" @@ -359,41 +413,30 @@ def test_item_only_no_history_field_at_all(self) -> None: }, } ) - assert item.totalReferences == 1 + assert get_rag_compat_pack().reference_count(item) == 1 def test_complex_real_world_scenario(self) -> None: """Realistic multi-turn conversation with various ref patterns.""" item = self._make_item( - # Item-level refs (should be ignored if history has any refs) - refs=[{"url": "https://old-ref.com"}], + refs=[ + {"url": "https://kb.example.com/article1"}, + {"url": "https://docs.example.com/troubleshooting"}, + {"url": "https://kb.example.com/article2"}, 
+ ], history=[ # Turn 1: User asks question {"role": "user", "msg": "How do I fix error X?"}, - # Turn 2: Assistant responds with 2 refs - { - "role": "assistant", - "msg": "You can try these solutions...", - "refs": [ - {"url": "https://kb.example.com/article1"}, - {"url": "https://docs.example.com/troubleshooting"}, - ], - }, + {"role": "assistant", "msg": "You can try these solutions..."}, # Turn 3: User follow-up {"role": "user", "msg": "That didn't work, any other ideas?"}, - # Turn 4: Assistant with 1 more ref - { - "role": "assistant", - "msg": "Let's try this instead...", - "refs": [{"url": "https://kb.example.com/article2"}], - }, + {"role": "assistant", "msg": "Let's try this instead..."}, # Turn 5: User confirms {"role": "user", "msg": "That worked, thanks!"}, # Turn 6: Assistant closes (no refs needed) {"role": "assistant", "msg": "Glad I could help!"}, ], ) - # History refs: 2 + 1 = 3 (item-level ref is ignored) - assert item.totalReferences == 3 + assert get_rag_compat_pack().reference_count(item) == 3 # --------------------------------------------------------------------------- diff --git a/backend/tests/unit/test_demo_mode_memory_api.py b/backend/tests/unit/test_demo_mode_memory_api.py index abc0fa9..74202cb 100644 --- a/backend/tests/unit/test_demo_mode_memory_api.py +++ b/backend/tests/unit/test_demo_mode_memory_api.py @@ -37,6 +37,11 @@ async def test_demo_mode_seeds_memory_backend_for_api_usage() -> None: settings.DEMO_USER_ID = "anonymous" container.repo = None + container.assignment_service = None + container.search_service = None + container.snapshot_service = None + container.curation_service = None + container.init_memory_repo(enable_demo_data=True) app = create_app() diff --git a/backend/tests/unit/test_groundtruthitem_tags_validation.py b/backend/tests/unit/test_groundtruthitem_tags_validation.py index a80b3ec..923427c 100644 --- a/backend/tests/unit/test_groundtruthitem_tags_validation.py +++ b/backend/tests/unit/test_groundtruthitem_tags_validation.py @@ -3,7 +3,11 @@ from app.domain.models import AgenticGroundTruthEntry -BASE = dict(id="id1", datasetName="ds", synthQuestion="What is this product?") +BASE = dict( + id="id1", + datasetName="ds", + history=[{"role": "user", "msg": "What is this product?"}], +) def make_item(**overrides): diff --git a/backend/tests/unit/test_history_with_refs.py b/backend/tests/unit/test_history_with_refs.py index 49f3df5..8880ff6 100644 --- a/backend/tests/unit/test_history_with_refs.py +++ b/backend/tests/unit/test_history_with_refs.py @@ -1,35 +1,29 @@ -""" -Unit tests for HistoryItem with refs field. -Validates that history items can store references alongside agent messages. -""" +"""Unit tests for canonical HistoryItem validation semantics.""" + +import pytest +from pydantic import ValidationError from app.domain.models import HistoryItem, Reference from app.domain.enums import HistoryItemRole, ExpectedBehavior -def test_history_item_with_refs(): - """Test that HistoryItem can include refs.""" +def test_history_item_rejects_refs(): + """HistoryItem rejects legacy refs; refs are plugin-owned canonical data.""" refs = [ Reference(url="https://example.com/doc1", content="Content 1"), Reference(url="https://example.com/doc2", content="Content 2", bonus=True), ] - history_item = HistoryItem( - role=HistoryItemRole.assistant, - msg="Here is the answer based on the documentation.", - refs=refs, - ) - - assert history_item.role == HistoryItemRole.assistant - assert history_item.msg == "Here is the answer based on the documentation." 
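+    # Editorial sketch, not part of the original change: the Reference
+    # objects built above remain valid standalone models; only embedding
+    # them in a HistoryItem is rejected below.
+    assert refs[1].bonus is True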
- assert history_item.refs is not None - assert len(history_item.refs) == 2 - assert history_item.refs[0].url == "https://example.com/doc1" - assert history_item.refs[1].bonus is True + with pytest.raises(ValidationError): + HistoryItem( + role=HistoryItemRole.assistant, + msg="Here is the answer based on the documentation.", + refs=refs, + ) def test_history_item_without_refs(): - """Test that refs is optional in HistoryItem.""" + """HistoryItem remains valid with canonical role/msg content only.""" history_item = HistoryItem( role=HistoryItemRole.user, msg="What is the answer?", @@ -37,33 +31,23 @@ def test_history_item_without_refs(): assert history_item.role == HistoryItemRole.user assert history_item.msg == "What is the answer?" - assert history_item.refs is None + assert "refs" not in history_item.model_dump() def test_history_item_serialization(): - """Test that HistoryItem serializes correctly with refs.""" - refs = [ - Reference(url="https://example.com/doc1", content="Content 1"), - ] - - history_item = HistoryItem( - role=HistoryItemRole.assistant, - msg="Answer text", - refs=refs, - ) + """HistoryItem serialization excludes legacy refs field.""" + history_item = HistoryItem(role=HistoryItemRole.assistant, msg="Answer text") # Serialize to dict data = history_item.model_dump() assert data["role"] == "assistant" assert data["msg"] == "Answer text" - assert data["refs"] is not None - assert len(data["refs"]) == 1 - assert data["refs"][0]["url"] == "https://example.com/doc1" + assert "refs" not in data -def test_history_item_deserialization(): - """Test that HistoryItem can be created from dict with refs.""" +def test_history_item_deserialization_rejects_refs(): + """HistoryItem rejects dict payloads containing legacy refs.""" data = { "role": "assistant", "msg": "Answer text", @@ -73,33 +57,24 @@ def test_history_item_deserialization(): ], } - history_item = HistoryItem(**data) - - assert history_item.role == HistoryItemRole.assistant - assert history_item.msg == "Answer text" - assert history_item.refs is not None - assert len(history_item.refs) == 2 - assert history_item.refs[0].url == "https://example.com/doc1" - assert history_item.refs[1].bonus is True + with pytest.raises(ValidationError): + HistoryItem(**data) -def test_user_history_item_typically_no_refs(): - """Test that user messages typically don't have refs (but could).""" - # User message without refs (typical) +def test_user_history_item_rejects_refs(): + """User history items also reject legacy refs.""" user_item = HistoryItem( role=HistoryItemRole.user, msg="What is this product?", ) - assert user_item.refs is None - - # User message with refs (uncommon but allowed) - user_item_with_refs = HistoryItem( - role=HistoryItemRole.user, - msg="Based on this document, what is this product?", - refs=[Reference(url="https://example.com/doc1")], - ) - assert user_item_with_refs.refs is not None - assert len(user_item_with_refs.refs) == 1 + assert "refs" not in user_item.model_dump() + + with pytest.raises(ValidationError): + HistoryItem( + role=HistoryItemRole.user, + msg="Based on this document, what is this product?", + refs=[Reference(url="https://example.com/doc1")], + ) def test_history_item_with_expected_behavior(): diff --git a/backend/tests/unit/test_phase1_rework.py b/backend/tests/unit/test_phase1_rework.py index 9810eef..e2eaae9 100644 --- a/backend/tests/unit/test_phase1_rework.py +++ b/backend/tests/unit/test_phase1_rework.py @@ -24,6 +24,7 @@ BulkImportResult, HistoryEntry, ) +from app.plugins.pack_registry 
import get_rag_compat_pack from app.domain.enums import GroundTruthStatus @@ -210,11 +211,11 @@ async def test_bulk_import_approve_enforces_plugin_pack_approval_hooks(self): class TestAssignmentHistoryReset: - """Test IV-002: Assignment route history edits reset totalReferences.""" + """Test IV-002: Assignment route history edits preserve compat totalReferences.""" @pytest.mark.asyncio async def test_assignment_update_history_resets_total_references(self): - """When history is updated via assignment route, totalReferences should be reset to 0.""" + """When history updates, compat plugin totalReferences remains unchanged.""" from app.core.auth import UserContext from app.container import container from app.api.v1.assignments import update_item @@ -235,7 +236,13 @@ async def test_assignment_update_history_resets_total_references(self): HistoryEntry(role="user", msg="Old question"), HistoryEntry(role="assistant", msg="Old answer"), ], - totalReferences=5, # Stale value + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"totalReferences": 5}, + } + }, _etag="test-etag", ) @@ -277,16 +284,16 @@ async def mock_upsert(item): if_match=None, ) - # Verify totalReferences was reset to 0 + # Verify compat totalReferences was preserved on the plugin payload assert saved_item is not None - assert saved_item.totalReferences == 0 + assert get_rag_compat_pack().reference_count(saved_item) == 5 finally: container.repo = original_repo @pytest.mark.asyncio async def test_assignment_clear_history_resets_total_references(self): - """When history is cleared via assignment route, totalReferences should be reset to 0.""" + """When history is cleared, compat plugin totalReferences remains unchanged.""" from app.core.auth import UserContext from app.container import container from app.api.v1.assignments import update_item @@ -307,7 +314,13 @@ async def test_assignment_clear_history_resets_total_references(self): HistoryEntry(role="user", msg="Question"), HistoryEntry(role="assistant", msg="Answer"), ], - totalReferences=3, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"totalReferences": 3}, + } + }, _etag="test-etag", ) @@ -344,9 +357,9 @@ async def mock_upsert(item): if_match=None, ) - # Verify totalReferences was reset to 0 + # Verify compat totalReferences was preserved on the plugin payload assert saved_item is not None - assert saved_item.totalReferences == 0 + assert get_rag_compat_pack().reference_count(saved_item) == 3 finally: container.repo = original_repo diff --git a/backend/tests/unit/test_pii_detection.py b/backend/tests/unit/test_pii_detection.py index 58673b8..430e3d3 100644 --- a/backend/tests/unit/test_pii_detection.py +++ b/backend/tests/unit/test_pii_detection.py @@ -165,7 +165,7 @@ class TestGroundTruthItemScanning: """Tests for scanning GroundTruthItem fields.""" def test_scans_synth_question(self): - """Should detect PII in synthQuestion field.""" + """Should detect PII in canonical question field.""" item = make_test_entry( id="test-1", dataset_name="test-dataset", @@ -174,12 +174,12 @@ def test_scans_synth_question(self): warnings = scan_item_for_pii(item) # Should find PII in multiple representations (history, plugin data, computed fields) assert len(warnings) >= 1 - # Check that at least one warning is for synthQuestion - assert any(w.field == "synthQuestion" for w in warnings) + # Check that at least one warning is for canonical question text + assert any(w.field == "history.question" for w in warnings) assert 
any("email" in w.pattern_type for w in warnings) def test_scans_edited_question(self): - """Should detect PII in editedQuestion field.""" + """Should detect PII in edited question via canonical question field.""" item = make_test_entry( id="test-1", dataset_name="test-dataset", @@ -187,13 +187,12 @@ def test_scans_edited_question(self): edited_question="Contact support@company.org for assistance", ) warnings = scan_item_for_pii(item) - # Should find PII in multiple representations assert len(warnings) >= 1 - assert any(w.field == "editedQuestion" for w in warnings) + assert any(w.field == "history.question" for w in warnings) assert any("email" in w.pattern_type for w in warnings) def test_scans_answer(self): - """Should detect PII in answer field.""" + """Should detect PII in canonical answer field.""" item = make_test_entry( id="test-1", dataset_name="test-dataset", @@ -203,7 +202,7 @@ def test_scans_answer(self): warnings = scan_item_for_pii(item) # Should find PII in multiple representations assert len(warnings) >= 1 - assert any(w.field == "answer" for w in warnings) + assert any(w.field == "history.answer" for w in warnings) assert any("phone" in w.pattern_type for w in warnings) def test_scans_comment(self): @@ -231,9 +230,10 @@ def test_scans_history_messages(self): ], ) warnings = scan_item_for_pii(item) - assert len(warnings) == 2 + assert len(warnings) >= 2 # Check field names include index fields = {w.field for w in warnings} + assert "history.question" in fields assert "history[0].msg" in fields assert "history[2].msg" in fields @@ -360,14 +360,14 @@ def test_warning_model_serialization(self): """PIIWarning should serialize correctly.""" warning = PIIWarning( item_id="test-1", - field="synthQuestion", + field="history.question", pattern_type="email", snippet="...[u***@e***e.com]...", position=10, ) data = warning.model_dump() assert data["item_id"] == "test-1" - assert data["field"] == "synthQuestion" + assert data["field"] == "history.question" assert data["pattern_type"] == "email" assert data["snippet"] == "...[u***@e***e.com]..." 
assert data["position"] == 10 diff --git a/backend/tests/unit/test_rag_compat_approval.py b/backend/tests/unit/test_rag_compat_approval.py index 951f505..bc73cce 100644 --- a/backend/tests/unit/test_rag_compat_approval.py +++ b/backend/tests/unit/test_rag_compat_approval.py @@ -24,7 +24,7 @@ def _make_item(**overrides) -> AgenticGroundTruthEntry: defaults = { "id": "rag-test-1", "datasetName": "demo", - "synthQuestion": "What is X?", + "history": [{"role": "user", "msg": "What is X?"}], } defaults.update(overrides) return AgenticGroundTruthEntry.model_validate(defaults) @@ -39,10 +39,16 @@ def test_core_requires_assistant_message_even_with_refs(): """After waiver removal, core always generates the assistant error.""" item = _make_item( history=[{"role": "user", "msg": "hello"}], - totalReferences=5, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, ) errors = collect_approval_validation_errors(item) - assert "history must include at least one assistant message" in errors + assert "history must include at least one agent message" in errors def test_core_no_error_when_assistant_present(): @@ -65,18 +71,26 @@ def test_rag_pack_waives_assistant_error_when_refs_present(): pack = RagCompatPack() item = _make_item( history=[{"role": "user", "msg": "hello"}], - totalReferences=3, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, ) core_errors = collect_approval_validation_errors(item) waivers = pack.collect_approval_waivers(item, core_errors) - assert "history must include at least one assistant message" in waivers + assert "history must include at least one agent message" in waivers def test_rag_pack_no_waiver_when_refs_zero(): pack = RagCompatPack() item = _make_item( history=[{"role": "user", "msg": "hello"}], - totalReferences=0, + plugins={ + "rag-compat": {"kind": "rag-compat", "version": "1.0", "data": {"references": []}} + }, ) core_errors = collect_approval_validation_errors(item) waivers = pack.collect_approval_waivers(item, core_errors) @@ -88,7 +102,13 @@ def test_rag_pack_does_not_waive_user_message_error(): pack = RagCompatPack() item = _make_item( history=[{"role": "assistant", "msg": "answer"}], - totalReferences=5, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, ) core_errors = collect_approval_validation_errors(item) waivers = pack.collect_approval_waivers(item, core_errors) @@ -109,7 +129,13 @@ def test_rag_pack_waives_required_tools_error_when_refs_present(): {"role": "assistant", "msg": "world"}, ], toolCalls=[{"name": "search"}], - totalReferences=3, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, ) core_errors = collect_approval_validation_errors(item) waivers = pack.collect_approval_waivers(item, core_errors) @@ -124,7 +150,9 @@ def test_rag_pack_no_required_tools_waiver_when_refs_zero(): {"role": "assistant", "msg": "world"}, ], toolCalls=[{"name": "search"}], - totalReferences=0, + plugins={ + "rag-compat": {"kind": "rag-compat", "version": "1.0", "data": {"references": []}} + }, ) core_errors = collect_approval_validation_errors(item) waivers = pack.collect_approval_waivers(item, core_errors) @@ -145,13 +173,19 @@ def test_registry_filters_waived_errors(): item = _make_item( 
history=[{"role": "user", "msg": "hello"}], - totalReferences=3, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, ) core_errors = collect_approval_validation_errors(item) - assert "history must include at least one assistant message" in core_errors + assert "history must include at least one agent message" in core_errors filtered = registry.filter_core_errors(item, core_errors) - assert "history must include at least one assistant message" not in filtered + assert "history must include at least one agent message" not in filtered def test_registry_preserves_non_waived_errors(): @@ -161,7 +195,16 @@ def test_registry_preserves_non_waived_errors(): registry.register(RagCompatPack()) # Item with no history, no question, no answer → "no conversation message" error - item = _make_item(synthQuestion="", totalReferences=5) + item = _make_item( + history=[], + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, + ) core_errors = collect_approval_validation_errors(item) filtered = registry.filter_core_errors(item, core_errors) # "history must contain at least one conversation message" is NOT waived diff --git a/backend/tests/unit/test_rag_compat_pack.py b/backend/tests/unit/test_rag_compat_pack.py index 84259cc..22c2d8e 100644 --- a/backend/tests/unit/test_rag_compat_pack.py +++ b/backend/tests/unit/test_rag_compat_pack.py @@ -1,36 +1,20 @@ -"""Unit tests for RagCompatPack plugin contracts and migration helpers. - -Core-generic behavior stays covered elsewhere. This file focuses on: -- runtime-backed pack registration and registry presence -- stable helper contracts for retrieval/reference ownership -- compat-migration helpers that still project legacy payloads while the shim exists -""" +"""Unit tests for RagCompatPack plugin contracts and reference ownership.""" from __future__ import annotations import pytest from app.domain.models import AgenticGroundTruthEntry, Reference +from app.plugins.pack_registry import get_default_pack_registry, reset_default_pack_registry from app.plugins.packs.rag_compat import RagCompatPack, _RAG_COMPAT_KIND -from app.plugins.pack_registry import ( - get_default_pack_registry, - reset_default_pack_registry, -) - - -# --------------------------------------------------------------------------- -# validate_registration -# --------------------------------------------------------------------------- def test_validate_registration_passes(): - """RagCompatPack registers successfully when constants are in sync.""" pack = RagCompatPack() - pack.validate_registration() # should not raise + pack.validate_registration() def test_validate_registration_name_matches_host_model_constant(): - """The pack name must equal AgenticGroundTruthEntry._RAG_COMPAT_PLUGIN.""" from app.domain.models import AgenticGroundTruthEntry pack = RagCompatPack() @@ -38,26 +22,18 @@ def test_validate_registration_name_matches_host_model_constant(): def test_validate_registration_kind_constant_correct(): - """_RAG_COMPAT_KIND must match the host model constant.""" from app.domain.models import AgenticGroundTruthEntry assert _RAG_COMPAT_KIND == AgenticGroundTruthEntry._RAG_COMPAT_PLUGIN def test_validate_registration_fails_on_constant_mismatch(monkeypatch: pytest.MonkeyPatch): - """validate_registration() must raise ValueError if constants diverge.""" pack = RagCompatPack() - # Simulate a rename of the host-model constant 
monkeypatch.setattr(AgenticGroundTruthEntry, "_RAG_COMPAT_PLUGIN", "rag-v2") with pytest.raises(ValueError, match="does not match"): pack.validate_registration() -# --------------------------------------------------------------------------- -# Plugin-contract: approval hooks -# --------------------------------------------------------------------------- - - def _generic_item() -> AgenticGroundTruthEntry: return AgenticGroundTruthEntry( id="gen-001", @@ -74,74 +50,62 @@ def _rag_item() -> AgenticGroundTruthEntry: { "id": "rag-001", "datasetName": "rag-dataset", - "synthQuestion": "What is retrieval?", - "answer": "Retrieval is finding relevant docs.", - "refs": [{"url": "https://example.com/doc"}], + "history": [ + {"role": "user", "msg": "What is retrieval?"}, + {"role": "assistant", "msg": "Retrieval is finding relevant docs."}, + ], + "plugins": { + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": { + "references": [{"url": "https://example.com/doc"}], + }, + } + }, } ) -def test_collect_approval_errors_generic_item_empty(): +def test_collect_approval_errors_are_empty(): pack = RagCompatPack() - item = _generic_item() - assert pack.collect_approval_errors(item) == [] - - -def test_collect_approval_errors_rag_item_empty(): - """RAG items currently produce no additional pack-level errors.""" - pack = RagCompatPack() - item = _rag_item() - assert pack.collect_approval_errors(item) == [] - - -# --------------------------------------------------------------------------- -# Plugin-contract: helper accessors -# --------------------------------------------------------------------------- + assert pack.collect_approval_errors(_generic_item()) == [] + assert pack.collect_approval_errors(_rag_item()) == [] def test_rag_compat_data_empty_for_generic_item(): pack = RagCompatPack() - item = _generic_item() - assert pack.rag_compat_data(item) == {} - - -def test_rag_compat_data_populated_for_rag_item(): - pack = RagCompatPack() - item = _rag_item() - data = pack.rag_compat_data(item) - # The model_validator moves synthQuestion, answer, refs into rag-compat plugin data - assert data # non-empty + assert pack.rag_compat_data(_generic_item()) == {} -def test_rag_compat_data_contains_synth_question(): +def test_rag_compat_data_contains_only_references_for_owned_payload(): pack = RagCompatPack() - item = _rag_item() - data = pack.rag_compat_data(item) - assert "synthQuestion" in data - assert data["synthQuestion"] == "What is retrieval?" 
- - -# --------------------------------------------------------------------------- -# refs_from_item accessor -# --------------------------------------------------------------------------- + data = pack.rag_compat_data(_rag_item()) + assert list(data.keys()) == ["references"] def test_refs_from_item_empty_for_generic_item(): pack = RagCompatPack() - item = _generic_item() - assert pack.refs_from_item(item) == [] + assert pack.refs_from_item(_generic_item()) == [] -def test_refs_from_item_populated_for_rag_item(): +def test_refs_from_item_reads_owned_references(): pack = RagCompatPack() - item = _rag_item() - refs = pack.refs_from_item(item) + refs = pack.refs_from_item(_rag_item()) assert len(refs) == 1 assert isinstance(refs[0], Reference) assert refs[0].url == "https://example.com/doc" -def test_refs_from_item_flattens_per_call_retrieval_state(): +def test_get_search_documents_includes_stable_id(): + pack = RagCompatPack() + docs = pack.get_search_documents(_rag_item()) + assert len(docs) == 1 + assert docs[0]["id"] == "rag-001:ref:0" + assert docs[0]["url"] == "https://example.com/doc" + + +def test_refs_from_item_reads_legacy_retrieval_payloads(): pack = RagCompatPack() item = AgenticGroundTruthEntry.model_validate( { @@ -176,9 +140,44 @@ def test_refs_from_item_flattens_per_call_retrieval_state(): assert refs[0].messageIndex == 2 -# --------------------------------------------------------------------------- -# Plugin-contract: reference ownership helpers -# --------------------------------------------------------------------------- +def test_refs_from_item_respects_explicit_empty_canonical_references(): + pack = RagCompatPack() + item = AgenticGroundTruthEntry.model_validate( + { + "id": "rag-002b", + "datasetName": "rag-dataset", + "history": [ + {"role": "user", "msg": "Question"}, + ], + "toolCalls": [{"id": "tc-1", "name": "search", "callType": "tool", "stepNumber": 3}], + "plugins": { + "rag-compat": { + "kind": "rag-compat", + "data": { + "references": [], + "retrievals": { + "tc-1": { + "candidates": [ + { + "url": "https://example.com/stale", + "title": "Stale", + "chunk": "stale retrieval snippet", + } + ] + } + }, + }, + } + }, + } + ) + + refs = pack.refs_from_item(item) + assert refs == [] + assert pack.reference_count(item) == 0 + + core_errors = ["history must include at least one agent message"] + assert pack.collect_approval_waivers(item, core_errors) == [] def test_attach_reference_adds_to_rag_item(): @@ -187,43 +186,28 @@ def test_attach_reference_adds_to_rag_item(): initial_count = len(pack.refs_from_item(item)) new_ref = Reference(url="https://newdoc.example.com/page") result = pack.attach_reference(item, new_ref) - assert result is item # mutated in-place + assert result is item assert len(pack.refs_from_item(item)) == initial_count + 1 - urls = [r.url for r in pack.refs_from_item(item)] - assert "https://newdoc.example.com/page" in urls -def test_attach_reference_works_on_generic_item(): +def test_attach_reference_writes_owned_references_key(): pack = RagCompatPack() item = _generic_item() - new_ref = Reference(url="https://docs.example.com/a") - pack.attach_reference(item, new_ref) - # The ref is written to rag-compat plugin payload via the setter - refs = pack.refs_from_item(item) - assert len(refs) == 1 - assert refs[0].url == "https://docs.example.com/a" + pack.attach_reference(item, Reference(url="https://docs.example.com/a")) + assert item.plugins["rag-compat"].data == { + "references": [{"url": "https://docs.example.com/a", "bonus": False}] + } def 
test_detach_reference_removes_by_url(): pack = RagCompatPack() item = _rag_item() - target_url = "https://example.com/doc" - assert any(r.url == target_url for r in pack.refs_from_item(item)) - - result = pack.detach_reference(item, target_url) + result = pack.detach_reference(item, "https://example.com/doc") assert result is item - assert not any(r.url == target_url for r in pack.refs_from_item(item)) - - -def test_detach_reference_nonexistent_url_is_noop(): - pack = RagCompatPack() - item = _rag_item() - before = len(pack.refs_from_item(item)) - pack.detach_reference(item, "https://nonexistent.example.com") - assert len(pack.refs_from_item(item)) == before + assert pack.refs_from_item(item) == [] -def test_replace_references_clears_per_call_retrieval_state(): +def test_replace_references_clears_legacy_fields(): pack = RagCompatPack() item = AgenticGroundTruthEntry.model_validate( { @@ -233,7 +217,12 @@ def test_replace_references_clears_per_call_retrieval_state(): "rag-compat": { "kind": "rag-compat", "data": { - "retrievals": {"tc-1": {"candidates": [{"url": "https://example.com/old"}]}} + "refs": [{"url": "https://example.com/old"}], + "retrievals": { + "tc-1": {"candidates": [{"url": "https://example.com/legacy"}]} + }, + "totalReferences": 2, + "synthQuestion": "legacy", }, } }, @@ -242,50 +231,87 @@ def test_replace_references_clears_per_call_retrieval_state(): pack.replace_references(item, [Reference(url="https://example.com/new")]) - assert pack.has_per_call_state(item) is False - refs = pack.refs_from_item(item) - assert len(refs) == 1 - assert refs[0].url == "https://example.com/new" + assert item.plugins["rag-compat"].data == { + "references": [{"url": "https://example.com/new", "bonus": False}] + } -def test_export_transform_projects_retrieval_candidates_to_refs(): +def test_import_transform_normalizes_legacy_fields_to_history_and_references(): pack = RagCompatPack() - transform = pack.get_export_transforms()[0].transform + transform = pack.get_import_transforms()[0].transform - projected = transform( + normalized = transform( { - "id": "rag-004", + "id": "legacy-001", "datasetName": "rag-dataset", - "toolCalls": [{"id": "tc-1", "stepNumber": 1}], + "editedQuestion": "What is retrieval?", + "answer": "Retrieval finds relevant docs.", + "refs": [{"url": "https://example.com/doc"}], + } + ) + + assert normalized["history"] == [ + {"role": "user", "msg": "What is retrieval?"}, + {"role": "assistant", "msg": "Retrieval finds relevant docs."}, + ] + assert normalized["plugins"]["rag-compat"]["data"] == { + "references": [{"url": "https://example.com/doc", "bonus": False}] + } + + +def test_import_transform_preserves_explicit_empty_canonical_references(): + pack = RagCompatPack() + transform = pack.get_import_transforms()[0].transform + + normalized = transform( + { + "id": "legacy-002", + "datasetName": "rag-dataset", + "history": [{"role": "user", "msg": "Question only"}], "plugins": { "rag-compat": { "kind": "rag-compat", + "version": "1.0", "data": { + "references": [], "retrievals": { "tc-1": { "candidates": [ { - "url": "https://example.com/exported", - "title": "Exported", - "chunk": "retrieved chunk", + "url": "https://example.com/stale", + "chunk": "stale retrieval snippet", } ] } - } + }, }, } }, } ) - assert projected["totalReferences"] == 1 - assert projected["refs"][0]["url"] == "https://example.com/exported" - assert projected["refs"][0]["messageIndex"] == 1 + assert normalized["plugins"]["rag-compat"]["data"] == {"references": []} + +def 
test_export_transform_projects_references_and_count(): + pack = RagCompatPack() + transform = pack.get_export_transforms()[0].transform -# --------------------------------------------------------------------------- -# Runtime-backed registry seam -# --------------------------------------------------------------------------- + projected = transform( + { + "id": "rag-004", + "datasetName": "rag-dataset", + "plugins": { + "rag-compat": { + "kind": "rag-compat", + "data": {"references": [{"url": "https://example.com/exported"}]}, + } + }, + } + ) + + assert projected["totalReferences"] == 1 + assert projected["references"][0]["url"] == "https://example.com/exported" def test_default_pack_registry_contains_rag_compat(): @@ -301,7 +327,7 @@ def test_default_pack_registry_validates_without_error(): reset_default_pack_registry() try: registry = get_default_pack_registry() - registry.validate_all() # should not raise + registry.validate_all() finally: reset_default_pack_registry() diff --git a/backend/tests/unit/test_retrieval_per_call.py b/backend/tests/unit/test_retrieval_per_call.py index b410d6e..8891c57 100644 --- a/backend/tests/unit/test_retrieval_per_call.py +++ b/backend/tests/unit/test_retrieval_per_call.py @@ -1,183 +1,68 @@ -"""Tests for RagCompatPack per-tool-call retrieval state (Phase 6).""" +"""Unit tests for legacy retrieval compatibility normalization.""" from __future__ import annotations - -from app.domain.models import AgenticGroundTruthEntry +from app.domain.models import AgenticGroundTruthEntry, Reference from app.plugins.packs.rag_compat import RagCompatPack -def _make_item(**overrides) -> AgenticGroundTruthEntry: - """Create a minimal item with default fields.""" - base = { - "id": "test-item", - "datasetName": "ds", - "history": [ - {"role": "user", "msg": "hi"}, - {"role": "assistant", "msg": "hello"}, - ], - } - base.update(overrides) - return AgenticGroundTruthEntry.model_validate(base) - - -def _make_item_with_refs(**overrides) -> AgenticGroundTruthEntry: - """Create an item with top-level refs (legacy pattern).""" - return _make_item( - refs=[ - {"url": "https://a.com", "title": "A", "content": "chunk-a"}, - {"url": "https://b.com", "title": "B", "content": "chunk-b"}, - ], - **overrides, - ) - - -def _make_item_with_tool_calls(**overrides) -> AgenticGroundTruthEntry: - """Create an item with tool calls and top-level refs.""" - return _make_item( - refs=[ - {"url": "https://a.com", "title": "A", "content": "chunk-a", "messageIndex": 1}, - {"url": "https://b.com", "title": "B", "content": "chunk-b"}, - ], - toolCalls=[ - {"id": "tc-1", "name": "search", "callType": "tool", "stepNumber": 1}, - {"id": "tc-2", "name": "lookup", "callType": "tool", "stepNumber": 2}, - ], - **overrides, - ) - - -class TestPerCallRetrievalState: - """Per-tool-call retrieval management on RagCompatPack.""" - - def test_get_retrievals_empty_item(self): - pack = RagCompatPack() - item = _make_item() - assert pack.get_retrievals(item) == {} - - def test_set_and_get_retrieval_candidates(self): - pack = RagCompatPack() - item = _make_item() - candidates = [ - {"url": "https://a.com", "title": "A", "chunk": "text-a"}, - ] - pack.set_retrieval_candidates(item, "tc-1", candidates) - assert pack.get_retrieval_candidates(item, "tc-1") == candidates - - def test_get_retrieval_candidates_missing_tool_call(self): - pack = RagCompatPack() - item = _make_item() - assert pack.get_retrieval_candidates(item, "nonexistent") == [] - - def test_set_retrievals_replaces_all(self): +class 
TestLegacyRetrievalCompatibility: + def test_refs_from_item_flattens_legacy_retrieval_candidates(self): pack = RagCompatPack() - item = _make_item() - pack.set_retrieval_candidates(item, "tc-1", [{"url": "https://a.com"}]) - pack.set_retrievals( - item, + item = AgenticGroundTruthEntry.model_validate( { - "tc-2": {"candidates": [{"url": "https://b.com"}]}, - }, - ) - assert pack.get_retrieval_candidates(item, "tc-1") == [] - assert len(pack.get_retrieval_candidates(item, "tc-2")) == 1 - - def test_has_per_call_state_false_when_empty(self): - pack = RagCompatPack() - item = _make_item() - assert pack.has_per_call_state(item) is False - - def test_has_per_call_state_true_after_set(self): - pack = RagCompatPack() - item = _make_item() - pack.set_retrieval_candidates(item, "tc-1", [{"url": "https://a.com"}]) - assert pack.has_per_call_state(item) is True - - def test_get_all_candidates_flat_from_per_call(self): - pack = RagCompatPack() - item = _make_item() - pack.set_retrieval_candidates( - item, - "tc-1", - [ - {"url": "https://a.com", "title": "A"}, - ], + "id": "test-item", + "datasetName": "ds", + "history": [ + {"role": "user", "msg": "hi"}, + {"role": "assistant", "msg": "hello"}, + ], + "toolCalls": [ + {"id": "tc-1", "name": "search", "callType": "tool", "stepNumber": 1} + ], + "plugins": { + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": { + "retrievals": { + "tc-1": { + "candidates": [ + {"url": "https://a.com", "title": "A", "chunk": "chunk-a"}, + {"url": "https://b.com", "title": "B", "chunk": "chunk-b"}, + ] + } + } + }, + } + }, + } ) - pack.set_retrieval_candidates( - item, - "tc-2", - [ - {"url": "https://b.com", "title": "B"}, - ], - ) - flat = pack.get_all_candidates_flat(item) - assert len(flat) == 2 - urls = {c["url"] for c in flat} - assert urls == {"https://a.com", "https://b.com"} - def test_get_all_candidates_flat_falls_back_to_top_level_refs(self): - pack = RagCompatPack() - item = _make_item_with_refs() - flat = pack.get_all_candidates_flat(item) - assert len(flat) == 2 - assert flat[0]["url"] == "https://a.com" - assert flat[0]["chunk"] == "chunk-a" + refs = pack.refs_from_item(item) + assert [ref.url for ref in refs] == ["https://a.com", "https://b.com"] + assert [ref.messageIndex for ref in refs] == [1, 1] - def test_get_all_candidates_flat_includes_tool_call_id(self): + def test_replace_references_rewrites_payload_to_owned_references(self): pack = RagCompatPack() - item = _make_item() - pack.set_retrieval_candidates( - item, - "tc-1", - [ - {"url": "https://a.com"}, - ], + item = AgenticGroundTruthEntry.model_validate( + { + "id": "test-item", + "datasetName": "ds", + "plugins": { + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": { + "retrievals": {"tc-1": {"candidates": [{"url": "https://legacy.com"}]}} + }, + } + }, + } ) - flat = pack.get_all_candidates_flat(item) - assert flat[0]["toolCallId"] == "tc-1" - - -class TestMigrateRefsToPerCall: - """Tests for migrate_refs_to_per_call helper.""" - - def test_migrate_no_refs_returns_false(self): - pack = RagCompatPack() - item = _make_item() - assert pack.migrate_refs_to_per_call(item) is False - def test_migrate_already_migrated_returns_false(self): - pack = RagCompatPack() - item = _make_item() - pack.set_retrieval_candidates(item, "tc-1", [{"url": "https://a.com"}]) - # Even with refs present, per-call state exists → skip migration - assert pack.migrate_refs_to_per_call(item) is False + pack.replace_references(item, [Reference(url="https://normalized.com")]) - def 
test_migrate_top_level_refs_to_unassociated(self): - pack = RagCompatPack() - item = _make_item_with_refs() - assert pack.migrate_refs_to_per_call(item) is True - # All refs go to _unassociated since no tool calls - cands = pack.get_retrieval_candidates(item, "_unassociated") - assert len(cands) == 2 - assert cands[0]["url"] == "https://a.com" - - def test_migrate_refs_matched_to_tool_calls_by_step(self): - pack = RagCompatPack() - item = _make_item_with_tool_calls() - assert pack.migrate_refs_to_per_call(item) is True - - # Ref with messageIndex=1 matches tc-1 (stepNumber=1) - tc1_cands = pack.get_retrieval_candidates(item, "tc-1") - assert len(tc1_cands) == 1 - assert tc1_cands[0]["url"] == "https://a.com" - - # Ref without messageIndex goes to _unassociated - unassociated = pack.get_retrieval_candidates(item, "_unassociated") - assert len(unassociated) == 1 - assert unassociated[0]["url"] == "https://b.com" - - def test_migrate_idempotent(self): - pack = RagCompatPack() - item = _make_item_with_refs() - assert pack.migrate_refs_to_per_call(item) is True - assert pack.migrate_refs_to_per_call(item) is False + assert item.plugins["rag-compat"].data == { + "references": [{"url": "https://normalized.com", "bonus": False}] + } diff --git a/backend/tests/unit/test_snapshot_service.py b/backend/tests/unit/test_snapshot_service.py index 9a8cfcf..52d2c5c 100644 --- a/backend/tests/unit/test_snapshot_service.py +++ b/backend/tests/unit/test_snapshot_service.py @@ -100,9 +100,10 @@ def _make_item(id: str, dataset: str, status: GroundTruthStatus) -> AgenticGroun datasetName=dataset, bucket=None, status=status, - synthQuestion="Q?", - answer="A", - refs=[], + history=[ + {"role": "user", "msg": "Q?"}, + {"role": "assistant", "msg": "A"}, + ], manualTags=[], computedTags=[], ) diff --git a/backend/tests/unit/test_trace_export_adapter.py b/backend/tests/unit/test_trace_export_adapter.py index 2f0f8ba..3b193b0 100644 --- a/backend/tests/unit/test_trace_export_adapter.py +++ b/backend/tests/unit/test_trace_export_adapter.py @@ -49,9 +49,11 @@ def test_trace_export_adapter_maps_trace_into_agentic_ground_truth() -> None: assert item.id == "trace-trace-123" assert item.datasetName == "customer-feedback" assert item.scenario_id == "trace-export:trace-123" - assert item.synth_question == "CX IS USING TOO MUCH DATA AND WANTS TO KNOW WHY" - assert item.answer is not None - assert "Root Cause" in item.answer + assert item.history[0].role == "user" + assert item.history[0].msg == "CX IS USING TOO MUCH DATA AND WANTS TO KNOW WHY" + assert item.history[1].role == "orchestrator-agent" + assert "cellular data" in item.history[1].msg + assert any("Root Cause" in turn.msg for turn in item.history) assert item.comment == "CUSTOMER WAS ON CELLULAR DATA INSTEAD OF WIFI" assert item.trace_ids == { "traceId": "trace-123", diff --git a/backend/tests/unit/test_validation_required_tools.py b/backend/tests/unit/test_validation_required_tools.py index babde3e..1efabee 100644 --- a/backend/tests/unit/test_validation_required_tools.py +++ b/backend/tests/unit/test_validation_required_tools.py @@ -97,7 +97,13 @@ def test_rag_pack_waives_required_tools_for_retrieval_items(): item = _make_item( toolCalls=[{"name": "search"}], - totalReferences=3, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": [{"url": "https://example.com/ref"}]}, + } + }, ) core_errors = collect_approval_validation_errors(item) assert REQUIRED_TOOLS_ERROR in core_errors @@ -113,7 +119,13 @@ def 
test_rag_pack_does_not_waive_required_tools_without_refs(): item = _make_item( toolCalls=[{"name": "search"}], - totalReferences=0, + plugins={ + "rag-compat": { + "kind": "rag-compat", + "version": "1.0", + "data": {"references": []}, + } + }, ) core_errors = collect_approval_validation_errors(item) filtered = registry.filter_core_errors(item, core_errors) diff --git a/backend/tests/unit/test_validation_service.py b/backend/tests/unit/test_validation_service.py index 942eafd..c18d5f5 100644 --- a/backend/tests/unit/test_validation_service.py +++ b/backend/tests/unit/test_validation_service.py @@ -12,14 +12,63 @@ def test_approval_validation_accepts_legacy_question_answer_payload(): { "id": "item-1", "datasetName": "demo", - "synthQuestion": "What is Ground Truth Curator?", - "answer": "It is a curation application.", + "history": [ + {"role": "user", "msg": "What is Ground Truth Curator?"}, + {"role": "assistant", "msg": "It is a curation application."}, + ], } ) assert collect_approval_validation_errors(item) == [] +def test_approval_validation_accepts_agent_answer_role(): + item = AgenticGroundTruthEntry.model_validate( + { + "id": "item-agent", + "datasetName": "demo", + "history": [ + {"role": "user", "msg": "What is Ground Truth Curator?"}, + {"role": "agent", "msg": "It is a curation application."}, + ], + } + ) + + assert collect_approval_validation_errors(item) == [] + + +def test_approval_validation_accepts_custom_non_user_answer_role(): + item = AgenticGroundTruthEntry.model_validate( + { + "id": "item-planner", + "datasetName": "demo", + "history": [ + {"role": "user", "msg": "Plan the rollout."}, + {"role": "planner", "msg": "Step 1: scope. Step 2: validate."}, + ], + } + ) + + assert collect_approval_validation_errors(item) == [] + + +def test_approval_validation_rejects_all_user_history(): + item = AgenticGroundTruthEntry.model_validate( + { + "id": "item-all-user", + "datasetName": "demo", + "history": [ + {"role": "user", "msg": "Question one"}, + {"role": "user", "msg": "Question two"}, + ], + } + ) + + assert collect_approval_validation_errors(item) == [ + "history must include at least one agent message" + ] + + def test_approval_validation_requires_required_tool_when_tool_calls_exist(): item = AgenticGroundTruthEntry( id="item-2", diff --git a/frontend/src/adapters/apiMapper.ts b/frontend/src/adapters/apiMapper.ts index f285c90..40e985c 100644 --- a/frontend/src/adapters/apiMapper.ts +++ b/frontend/src/adapters/apiMapper.ts @@ -3,61 +3,52 @@ import { createConversationTurn, ensureConversationTurnIdentity, type GroundTruthItem, - getItemReferences, - getLastAgentTurn, - getLastUserTurn, type PluginPayload, - type Reference, type ToolCallRecord, withDerivedLegacyFields, } from "../models/groundTruth"; -import { urlToTitle } from "../models/utils"; - -const _RAG_COMPAT_KEY = "rag-compat"; -const _UNASSOCIATED_KEY = "_unassociated"; - -type RetrievalBucket = { - candidates: Array<{ - url: string; - title?: string; - chunk?: string; - relevance?: string; - toolCallId?: string; - messageIndex?: number; - turnId?: string; - keyParagraph?: string; - bonus?: boolean; - }>; -}; -type RetrievalsMap = Record; +import { sanitizeCompatPluginForPatch } from "./ragCompatBoundary"; + +const _REMOVED_COMPAT_PATCH_KEYS = [ + "synthQuestion", + "editedQuestion", + "answer", + "refs", + "totalReferences", + "retrievals", + "historyAnnotations", + "contextUsedForGeneration", + "contextSource", + "modelUsedForGeneration", + "semanticClusterNumber", + "weight", + "samplingBucket", + 
"questionLength", +]; type ConversationTurn = NonNullable[number]; +export type ApiReference = { + url: string; + title?: string | null; + content?: string | null; + keyExcerpt?: string | null; + type?: string | null; + bonus?: boolean; + messageIndex?: number | null; +}; export type ApiHistoryEntry = components["schemas"]["HistoryEntry"] & { - refs?: components["schemas"]["Reference"][]; + refs?: ApiReference[]; expectedBehavior?: string[]; turnId?: string; stepId?: string; }; -export type ApiGroundTruth = - components["schemas"]["AgenticGroundTruthEntry-Output"] & { - synthQuestion?: string | null; - editedQuestion?: string | null; - answer?: string | null; - refs?: components["schemas"]["Reference"][]; - totalReferences?: number; - tags?: string[]; - comment?: string | null; - } & Omit< - components["schemas"]["AgenticGroundTruthEntry-Output"], - "history" - > & { - history?: ApiHistoryEntry[]; - }; -export type ApiReference = components["schemas"]["Reference"]; - -type StoredTurnIdentity = { - turnId?: string; - stepId?: string; +export type ApiGroundTruth = Omit< + components["schemas"]["AgenticGroundTruthEntry-Output"], + "history" +> & { + tags?: string[]; + comment?: string | null; + history?: ApiHistoryEntry[]; }; function hasOwnField(value: object, field: PropertyKey): boolean { @@ -77,17 +68,6 @@ function normalizeToolCalls( })); } -function getStoredTurnIdentities( - plugins: Record, -): StoredTurnIdentity[] { - const turnIdentity = ( - plugins[_RAG_COMPAT_KEY]?.data as Record - )?.turnIdentity; - return Array.isArray(turnIdentity) - ? (turnIdentity as StoredTurnIdentity[]) - : []; -} - export function groundTruthFromApi( api: ApiGroundTruth, providerId = "api", @@ -96,130 +76,26 @@ export function groundTruthFromApi( api.plugins && Object.keys(api.plugins).length ? (api.plugins as Record) : {}; - const storedTurnIdentity = getStoredTurnIdentities(plugins); let history: GroundTruthItem["history"]; - const legacyRefs: Reference[] = []; - let refIndex = 0; - if (api.history && api.history.length > 0) { + if (Array.isArray(api.history)) { history = new Array(api.history.length); for (let idx = 0; idx < api.history.length; idx++) { const h = api.history[idx]; - // Preserve free-form roles; map "assistant" to "agent" for backward compat. - const role = h.role === "assistant" ? "agent" : h.role; - const identity = storedTurnIdentity[idx]; history[idx] = createConversationTurn({ - role, + role: h.role, content: h.msg, - turnId: h.turnId || identity?.turnId, - stepId: h.stepId || identity?.stepId, + turnId: h.turnId, + stepId: h.stepId, expectedBehavior: h.expectedBehavior && h.expectedBehavior.length > 0 ? (h.expectedBehavior as ConversationTurn["expectedBehavior"]) : undefined, }); - - if (h.refs && h.refs.length > 0) { - for (const r of h.refs) { - legacyRefs.push({ - id: `ref_${refIndex++}`, - title: r.title || (r.url ? urlToTitle(r.url) : undefined), - url: r.url, - snippet: r.content ?? undefined, - keyParagraph: r.keyExcerpt ?? 
undefined, - visitedAt: null, - bonus: r.bonus === true, - messageIndex: idx, - turnId: history[idx]?.turnId, - }); - } - } - } - } else { - // Legacy single-turn item: create initial history from synthQuestion/editedQuestion - const initialQuestion = api.editedQuestion || api.synthQuestion || ""; - if (initialQuestion) { - history = [ - createConversationTurn({ - role: "user", - content: initialQuestion, - turnId: storedTurnIdentity[0]?.turnId, - stepId: storedTurnIdentity[0]?.stepId, - }), - createConversationTurn({ - role: "agent", - content: api.answer || "", - turnId: storedTurnIdentity[1]?.turnId, - stepId: storedTurnIdentity[1]?.stepId, - }), - ]; - } - } - - // Process top-level refs (backward compatibility) - if (api.refs && api.refs.length > 0) { - const wasLegacyConversion = !api.history || api.history.length === 0; - const messageIndex = wasLegacyConversion ? 1 : undefined; - const turnId = - typeof messageIndex === "number" - ? history?.[messageIndex]?.turnId - : undefined; - - for (const r of api.refs) { - legacyRefs.push({ - id: `ref_${refIndex++}`, - title: r.title || (r.url ? urlToTitle(r.url) : undefined), - url: r.url, - snippet: r.content ?? undefined, - keyParagraph: r.keyExcerpt ?? undefined, - visitedAt: null, - bonus: r.bonus === true, - messageIndex, - turnId, - }); } } - // Read per-call retrieval state from plugin data if it already exists - const existingRetrievals = ( - plugins[_RAG_COMPAT_KEY]?.data as Record | undefined - )?.retrievals; - const hasPerCallState = - existingRetrievals && - typeof existingRetrievals === "object" && - !Array.isArray(existingRetrievals) && - Object.keys(existingRetrievals as Record).length > 0; - - // When no per-call state exists but legacy refs were extracted, migrate them - if (!hasPerCallState && legacyRefs.length > 0) { - const retrievals: RetrievalsMap = {}; - for (const ref of legacyRefs) { - const key = ref.toolCallId || _UNASSOCIATED_KEY; - if (!retrievals[key]) { - retrievals[key] = { candidates: [] }; - } - retrievals[key].candidates.push({ - url: ref.url, - title: ref.title, - chunk: ref.snippet, - relevance: undefined, - toolCallId: ref.toolCallId, - messageIndex: ref.turnId ? undefined : ref.messageIndex, - turnId: ref.turnId, - keyParagraph: ref.keyParagraph, - bonus: ref.bonus, - }); - } - - const existingPlugin = plugins[_RAG_COMPAT_KEY]; - plugins[_RAG_COMPAT_KEY] = { - kind: _RAG_COMPAT_KEY, - version: existingPlugin?.version || "1.0", - data: { ...(existingPlugin?.data || {}), retrievals }, - }; - } - const deleted = api.status === "deleted"; return withDerivedLegacyFields({ @@ -235,7 +111,6 @@ export function groundTruthFromApi( manualTags: api.manualTags || [], computedTags: api.computedTags || [], reviewedAt: api.reviewedAt ?? 
null, - totalReferences: api.totalReferences, // Generic schema fields — passed through from the API scenarioId: api.scenarioId || undefined, contextEntries: @@ -265,87 +140,25 @@ export function groundTruthFromApi( export function groundTruthToPatch(args: { item: GroundTruthItem; - originalApi?: ApiGroundTruth; }): Partial { - const { originalApi } = args; const item = withDerivedLegacyFields(args.item); const history = ensureConversationTurnIdentity(item.history); - // Extract references from per-call plugin state - const references = getItemReferences(item); - - const hadLegacyTopLevelRefs = - !!originalApi && - (!originalApi.history || originalApi.history.length === 0) && - (originalApi.refs?.length || 0) > 0; - - let topLevelRefs: ApiReference[] = []; - if (hadLegacyTopLevelRefs) { - const legacyAgentTurnId = history[1]?.turnId; - topLevelRefs = references - .filter( - (r) => - r.turnId === legacyAgentTurnId || - r.messageIndex === 1 || - r.messageIndex === undefined, - ) - .map((r) => ({ - url: r.url, - title: r.title || undefined, - keyExcerpt: r.keyParagraph || undefined, - content: r.snippet || undefined, - bonus: !!r.bonus, - })); - } else { - topLevelRefs = references - .filter((r) => r.messageIndex === undefined) - .map((r) => ({ - url: r.url, - title: r.title || undefined, - keyExcerpt: r.keyParagraph || undefined, - content: r.snippet || undefined, - bonus: !!r.bonus, - })); - } - const body: Partial = { status: (item.deleted ? "deleted" : item.status) as components["schemas"]["GroundTruthStatus"], - answer: getLastAgentTurn(item), - editedQuestion: getLastUserTurn(item), - refs: topLevelRefs, manualTags: item.manualTags || [], }; if (history.length > 0) { - body.history = history.map((turn, idx) => { - let turnRefs: ApiReference[] | undefined; - if (turn.role !== "user") { - const refsForTurn = references.filter( - (r) => r.turnId === turn.turnId || r.messageIndex === idx, - ); - if (refsForTurn.length > 0) { - turnRefs = refsForTurn.map((r) => ({ - url: r.url, - title: r.title || undefined, - content: r.snippet || undefined, - keyExcerpt: r.keyParagraph || undefined, - bonus: !!r.bonus, - })); - } - } - - // Map "agent" back to "assistant" for backward compat; preserve other free-form roles. - const apiRole = turn.role === "agent" ? "assistant" : turn.role; - + body.history = history.map((turn) => { return { - role: apiRole, + role: turn.role, msg: turn.content, turnId: turn.turnId, stepId: turn.stepId, expectedBehavior: turn.expectedBehavior || undefined, - ...(turnRefs ? { refs: turnRefs } : {}), }; }); } @@ -373,22 +186,19 @@ export function groundTruthToPatch(args: { if (item.metadata && Object.keys(item.metadata).length) { (body as Record).metadata = item.metadata; } - const plugins = { ...(item.plugins || {}) }; - const existingCompat = plugins[_RAG_COMPAT_KEY]; - if (history.length > 0) { - plugins[_RAG_COMPAT_KEY] = { - kind: _RAG_COMPAT_KEY, - version: existingCompat?.version || "1.0", - data: { - ...(existingCompat?.data || {}), - turnIdentity: history.map((turn) => ({ - turnId: turn.turnId, - stepId: turn.stepId, - })), - }, - }; - } - if (Object.keys(plugins).length) { + const plugins = sanitizeCompatPluginForPatch({ + plugins: item.plugins, + removedKeys: _REMOVED_COMPAT_PATCH_KEYS, + historyTurnIds: history.map((turn) => turn.turnId), + indexByTurnId: new Map( + history + .map((turn, index) => + turn.turnId ? 
([turn.turnId, index] as const) : null,
+        )
+        .filter((entry): entry is readonly [string, number] => entry !== null),
+    ),
+  });
+  if (plugins && Object.keys(plugins).length) {
     (body as Record<string, unknown>).plugins = plugins;
   }
   if (item.traceIds) {
diff --git a/frontend/src/adapters/apiProvider.ts b/frontend/src/adapters/apiProvider.ts
index 466d7de..32098fe 100644
--- a/frontend/src/adapters/apiProvider.ts
+++ b/frontend/src/adapters/apiProvider.ts
@@ -85,7 +85,7 @@ export class ApiProvider implements Provider {
       const fresh = await getGroundTruthRaw(dataset, bucket, item.id);
       updatedApi = fresh;
     } else {
-      const patch = groundTruthToPatch({ item, originalApi: e.api });
+      const patch = groundTruthToPatch({ item });
       const doUpdate = async (nextEtag?: string | null) =>
         updateAssignedGroundTruth(
           dataset,
diff --git a/frontend/src/adapters/ragCompatBoundary.ts b/frontend/src/adapters/ragCompatBoundary.ts
new file mode 100644
index 0000000..4c678f7
--- /dev/null
+++ b/frontend/src/adapters/ragCompatBoundary.ts
@@ -0,0 +1,103 @@
+import {
+  type CompatPluginsMap,
+  collectCanonicalReferencesFromCompatPlugins,
+  getCompatReferencesFromData,
+  getCompatRetrievalsFromData,
+  retrievalsToCanonicalReferences,
+  writeCompatPluginEnvelope,
+} from "../models/ragCompatPayload";
+
+const _RAG_COMPAT_KEY = "rag-compat";
+
+type ReferenceLike = {
+  id: string;
+  title?: string;
+  url: string;
+  snippet?: string;
+  visitedAt?: string | null;
+  keyParagraph?: string;
+  bonus?: boolean;
+  messageIndex?: number;
+  turnId?: string;
+  toolCallId?: string;
+};
+
+export function collectReferencesFromCompat(args: {
+  plugins: CompatPluginsMap | undefined;
+  historyTurnIds: Array<string | undefined>;
+  indexByTurnId: Map<string, number>;
+}): ReferenceLike[] {
+  const { plugins, historyTurnIds, indexByTurnId } = args;
+  return collectCanonicalReferencesFromCompatPlugins({
+    plugins,
+    historyTurnIds,
+    indexByTurnId,
+  });
+}
+
+export function withCompatReferences(args: {
+  plugins: CompatPluginsMap | undefined;
+  refs: ReferenceLike[];
+}): CompatPluginsMap {
+  const { plugins, refs } = args;
+  return writeCompatPluginEnvelope({ plugins, refs });
+}
+
+export function sanitizeCompatData(
+  data: unknown,
+  removedKeys: readonly string[],
+  historyTurnIds: Array<string | undefined> = [],
+  indexByTurnId: Map<string, number> = new Map(),
+): Record<string, unknown> {
+  if (!data || typeof data !== "object" || Array.isArray(data)) {
+    return {};
+  }
+  const sanitized = { ...(data as Record<string, unknown>) };
+  const canonicalRefs = getCompatReferencesFromData(sanitized);
+  if (!canonicalRefs) {
+    const retrievals = getCompatRetrievalsFromData(sanitized);
+    if (retrievals) {
+      const materialized = retrievalsToCanonicalReferences({
+        retrievals,
+        historyTurnIds,
+        indexByTurnId,
+      });
+      if (materialized.length > 0) {
+        sanitized.references = materialized;
+      }
+    }
+  }
+  for (const key of removedKeys) {
+    delete sanitized[key];
+  }
+  delete sanitized.retrievals;
+  return sanitized;
+}
+
+export function sanitizeCompatPluginForPatch(args: {
+  plugins: CompatPluginsMap | undefined;
+  removedKeys: readonly string[];
+  historyTurnIds?: Array<string | undefined>;
+  indexByTurnId?: Map<string, number>;
+}): CompatPluginsMap | undefined {
+  const { plugins, removedKeys, historyTurnIds, indexByTurnId } = args;
+  if (!plugins) {
+    return undefined;
+  }
+  const nextPlugins = { ...plugins };
+  const existingCompat = nextPlugins[_RAG_COMPAT_KEY];
+  if (!existingCompat) {
+    return nextPlugins;
+  }
+  nextPlugins[_RAG_COMPAT_KEY] = {
+    kind: _RAG_COMPAT_KEY,
+    version: existingCompat.version || "1.0",
+    data: sanitizeCompatData(
+      existingCompat.data,
+      removedKeys,
+      historyTurnIds,
+      indexByTurnId,
+    ),
+  };
+  return nextPlugins;
+}
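
A minimal usage sketch for the boundary helper above. The plugin payload is hypothetical; sanitizeCompatPluginForPatch, its argument names, and the resulting envelope shape come from ragCompatBoundary.ts as added in this diff:

import { sanitizeCompatPluginForPatch } from "./ragCompatBoundary";

// A legacy rag-compat envelope still carrying a per-call retrievals map
// plus one of the removed top-level projection keys ("answer").
const plugins = sanitizeCompatPluginForPatch({
  plugins: {
    "rag-compat": {
      kind: "rag-compat",
      version: "1.0",
      data: {
        answer: "legacy projection",
        retrievals: { "tc-1": { candidates: [{ url: "https://a.example" }] } },
      },
    },
  },
  removedKeys: ["answer"],
  historyTurnIds: ["turn-0", "turn-1"],
  indexByTurnId: new Map([
    ["turn-0", 0],
    ["turn-1", 1],
  ]),
});
// plugins["rag-compat"].data now holds
//   { references: [{ url: "https://a.example", toolCallId: "tc-1", bonus: false, visitedAt: null, ... }] }
// — the retrievals map was materialized into canonical references and the
// removed key deleted, so the PATCH body never re-sends legacy state.
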
diff --git a/frontend/src/api/generated.ts b/frontend/src/api/generated.ts
index e2a83f0..2efc7da 100644
--- a/frontend/src/api/generated.ts
+++ b/frontend/src/api/generated.ts
@@ -689,16 +689,6 @@ export interface components {
     };
     /** Tags */
     readonly tags: string[];
-    /** Synthquestion */
-    readonly synthQuestion: string | null;
-    /** Editedquestion */
-    readonly editedQuestion: string | null;
-    /** Answer */
-    readonly answer: string | null;
-    /** Refs */
-    readonly refs: components["schemas"]["Reference"][];
-    /** Totalreferences */
-    readonly totalReferences: number;
   };
   /**
    * AssignItemRequest
@@ -1056,7 +1046,7 @@ export interface components {
     item_id: string;
     /**
      * Field
-     * @description Field name where PII was detected (e.g., 'synthQuestion', 'history[2].msg')
+     * @description Field name where PII was detected (e.g., 'history.question', 'history[2].msg')
      */
     field: string;
     /**
@@ -1163,35 +1153,6 @@
      */
     duration_ms: number;
   };
-  /**
-   * Reference
-   * @description Legacy RAG reference object retained for compatibility helpers and tests.
-   */
-  Reference: {
-    /**
-     * Url
-     * @description Reference URL (required, non-empty)
-     */
-    url: string;
-    /**
-     * Title
-     * @description Human-readable title for the reference
-     */
-    title?: string | null;
-    /** Content */
-    content?: string | null;
-    /** Keyexcerpt */
-    keyExcerpt?: string | null;
-    /** Type */
-    type?: string | null;
-    /**
-     * Bonus
-     * @default false
-     */
-    bonus: boolean;
-    /** Messageindex */
-    messageIndex?: number | null;
-  };
   /** RemoveTagsRequest */
   RemoveTagsRequest: {
     /** Tags */
@@ -1219,7 +1180,7 @@
    * SortField
    * @enum {string}
    */
-  SortField: "reviewedAt" | "updatedAt" | "id" | "hasAnswer" | "totalReferences" | "tagCount";
+  SortField: "reviewedAt" | "updatedAt" | "id" | "hasAnswer" | "tagCount";
   /**
    * SortOrder
    * @enum {string}
    */
@@ -1450,11 +1411,13 @@ export interface operations {
       excludeTags?: string | null;
       /** @description Search for items by ID (case-sensitive partial match) */
       itemId?: string | null;
-      /** @description Search for items by reference URL (case-sensitive partial match) */
-      refUrl?: string | null;
+      /** @description Plugin-namespaced filters in key=value form (repeat query param). Example: pluginFilter=rag-compat:refUrl=https://example.com */
+      pluginFilter?: string[] | null;
       /** @description Search for items by keyword (case-insensitive text search across questions, answers, and history) */
       keyword?: string | null;
       sortBy?: components["schemas"]["SortField"];
+      /** @description Plugin-namespaced sort key, e.g. rag-compat:totalReferences */
+      pluginSort?: string | null;
       sortOrder?: components["schemas"]["SortOrder"];
       page?: number;
       limit?: number;
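
For orientation, a sketch of the query shape a client sends with the two new plugin-namespaced parameters (parameter names from the operation types above; the concrete values are illustrative — QuestionsExplorer below builds exactly this pattern):

const query = {
  // Repeatable key=value filters, namespaced by plugin kind:
  pluginFilter: ["rag-compat:refUrl=https://example.com"],
  // Plugin-namespaced sort key replaces the removed core "totalReferences" SortField:
  pluginSort: "rag-compat:totalReferences",
  sortOrder: "desc" as const,
  page: 1,
  limit: 25,
};
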
diff --git a/frontend/src/api/openapi.json b/frontend/src/api/openapi.json
index 6135025..b4cb427 100644
--- a/frontend/src/api/openapi.json
+++ b/frontend/src/api/openapi.json
@@ -230,22 +230,25 @@
         "description": "Search for items by ID (case-sensitive partial match)"
       },
       {
-        "name": "refUrl",
+        "name": "pluginFilter",
         "in": "query",
         "required": false,
         "schema": {
           "anyOf": [
             {
-              "type": "string"
+              "type": "array",
+              "items": {
+                "type": "string"
+              }
             },
             {
               "type": "null"
             }
           ],
-          "description": "Search for items by reference URL (case-sensitive partial match)",
-          "title": "Refurl"
+          "description": "Plugin-namespaced filters in key=value form (repeat query param). Example: pluginFilter=rag-compat:refUrl=https://example.com",
+          "title": "Pluginfilter"
         },
-        "description": "Search for items by reference URL (case-sensitive partial match)"
+        "description": "Plugin-namespaced filters in key=value form (repeat query param). Example: pluginFilter=rag-compat:refUrl=https://example.com"
       },
       {
         "name": "keyword",
@@ -274,6 +277,24 @@
           "default": "reviewedAt"
         }
       },
+      {
+        "name": "pluginSort",
+        "in": "query",
+        "required": false,
+        "schema": {
+          "anyOf": [
+            {
+              "type": "string"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "description": "Plugin-namespaced sort key, e.g. rag-compat:totalReferences",
+          "title": "Pluginsort"
+        },
+        "description": "Plugin-namespaced sort key, e.g. rag-compat:totalReferences"
+      },
       {
         "name": "sortOrder",
         "in": "query",
@@ -1983,69 +2004,11 @@
           "type": "array",
           "title": "Tags",
           "readOnly": true
-        },
-        "synthQuestion": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Synthquestion",
-          "readOnly": true
-        },
-        "editedQuestion": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Editedquestion",
-          "readOnly": true
-        },
-        "answer": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Answer",
-          "readOnly": true
-        },
-        "refs": {
-          "items": {
-            "$ref": "#/components/schemas/Reference"
-          },
-          "type": "array",
-          "title": "Refs",
-          "readOnly": true
-        },
-        "totalReferences": {
-          "type": "integer",
-          "title": "Totalreferences",
-          "readOnly": true
         }
       },
       "additionalProperties": false,
      "type": "object",
-      "required": [
-        "id",
-        "datasetName",
-        "tags",
-        "synthQuestion",
-        "editedQuestion",
-        "answer",
-        "refs",
-        "totalReferences"
-      ],
+      "required": ["id", "datasetName", "tags"],
      "title": "AgenticGroundTruthEntry",
      "description": "Generic agentic-first host model.\n\nThe core contract intentionally exposes only the generic schema in OpenAPI. Legacy\nRAG-shaped payloads are translated into this shape when validating this base class so\nexisting data can be carried forward without remaining top-level contract fields."
    },
@@ -2899,7 +2862,7 @@
      "field": {
        "type": "string",
        "title": "Field",
-        "description": "Field name where PII was detected (e.g., 'synthQuestion', 'history[2].msg')"
+        "description": "Field name where PII was detected (e.g., 'history.question', 'history[2].msg')"
      },
      "pattern_type": {
        "type": "string",
@@ -3041,80 +3004,6 @@
      "title": "RecomputeTagsResponse",
      "description": "Response for bulk computed tag recomputation."
    },
-    "Reference": {
-      "properties": {
-        "url": {
-          "type": "string",
-          "title": "Url",
-          "description": "Reference URL (required, non-empty)"
-        },
-        "title": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Title",
-          "description": "Human-readable title for the reference"
-        },
-        "content": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Content"
-        },
-        "keyExcerpt": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Keyexcerpt"
-        },
-        "type": {
-          "anyOf": [
-            {
-              "type": "string"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Type"
-        },
-        "bonus": {
-          "type": "boolean",
-          "title": "Bonus",
-          "default": false
-        },
-        "messageIndex": {
-          "anyOf": [
-            {
-              "type": "integer"
-            },
-            {
-              "type": "null"
-            }
-          ],
-          "title": "Messageindex"
-        }
-      },
-      "type": "object",
-      "required": ["url"],
-      "title": "Reference",
-      "description": "Legacy RAG reference object retained for compatibility helpers and tests."
- }, "RemoveTagsRequest": { "properties": { "tags": { @@ -3206,14 +3095,7 @@ }, "SortField": { "type": "string", - "enum": [ - "reviewedAt", - "updatedAt", - "id", - "hasAnswer", - "totalReferences", - "tagCount" - ], + "enum": ["reviewedAt", "updatedAt", "id", "hasAnswer", "tagCount"], "title": "SortField" }, "SortOrder": { diff --git a/frontend/src/components/app/QuestionsExplorer.example.tsx b/frontend/src/components/app/QuestionsExplorer.example.tsx index 0ca7862..ceab33a 100644 --- a/frontend/src/components/app/QuestionsExplorer.example.tsx +++ b/frontend/src/components/app/QuestionsExplorer.example.tsx @@ -9,608 +9,82 @@ import QuestionsExplorer, { type QuestionsExplorerItem, } from "./QuestionsExplorer"; -// Sample data - now with 50 items to demonstrate pagination and dataset filtering -const sampleItems: QuestionsExplorerItem[] = [ - { - id: "gt-001", - question: "What is the capital of France?", - answer: "Paris", - status: "approved", - providerId: "json", - views: 150, - reuses: 12, - datasetName: "geography", - tags: ["beginner", "popular"], - reviewedAt: "2025-09-15T10:30:00Z", - }, - { - id: "gt-002", - question: "How does photosynthesis work?", - answer: "Photosynthesis converts light energy into chemical energy...", - status: "draft", - providerId: "json", - views: 45, - reuses: 3, - datasetName: "biology", - tags: ["science", "beginner"], - reviewedAt: "2025-09-20T14:20:00Z", - }, - { - id: "gt-003", - question: "What is quantum computing?", - answer: "Quantum computing uses quantum-mechanical phenomena...", - status: "approved", - providerId: "json", - views: 230, - reuses: 28, - datasetName: "technology", - tags: ["advanced", "technical", "popular"], - reviewedAt: "2025-09-10T08:15:00Z", - }, - { - id: "gt-004", - question: "Explain machine learning basics", - answer: "Machine learning is a subset of artificial intelligence...", - status: "deleted", - providerId: "json", - views: 89, - reuses: 7, - datasetName: "technology", - tags: ["technical", "AI"], - reviewedAt: "2025-08-25T16:45:00Z", - }, - { - id: "gt-005", - question: "What are the benefits of exercise?", - answer: "Regular exercise improves cardiovascular health...", - status: "approved", - providerId: "json", - views: 320, - reuses: 41, - datasetName: "health", - tags: ["beginner", "wellness"], - reviewedAt: "2025-09-22T11:00:00Z", - }, - { - id: "gt-006", - question: "How do neural networks work?", - answer: - "Neural networks are computing systems inspired by biological neural networks...", - status: "draft", - providerId: "json", - views: 198, - reuses: 15, - datasetName: "technology", - tags: ["advanced", "technical", "AI"], - reviewedAt: "2025-09-18T09:30:00Z", - }, - { - id: "gt-007", - question: "What is blockchain technology?", - answer: "Blockchain is a distributed ledger technology...", - status: "approved", - providerId: "json", - views: 412, - reuses: 52, - datasetName: "technology", - tags: ["popular", "technical"], - reviewedAt: "2025-09-12T14:22:00Z", - }, - { - id: "gt-008", - question: "Explain the water cycle", - answer: "The water cycle describes the continuous movement of water...", - status: "approved", - providerId: "json", - views: 276, - reuses: 34, - datasetName: "science", - tags: ["beginner", "science"], - reviewedAt: "2025-09-05T10:10:00Z", - }, - { - id: "gt-009", - question: "What causes climate change?", - answer: "Climate change is primarily caused by greenhouse gas emissions...", - status: "draft", - providerId: "json", - views: 523, - reuses: 67, - datasetName: "science", - 
tags: ["science", "popular", "environmental"], - reviewedAt: "2025-09-28T15:45:00Z", - }, - { - id: "gt-010", - question: "How does DNA replication work?", - answer: "DNA replication is the process of copying DNA molecules...", - status: "approved", - providerId: "json", - views: 145, - reuses: 18, - datasetName: "biology", - tags: ["science", "advanced"], - reviewedAt: "2025-09-08T13:20:00Z", - }, - { - id: "gt-011", - question: "What is the theory of relativity?", - answer: - "Einstein's theory of relativity revolutionized our understanding of space and time...", - status: "approved", - providerId: "json", - views: 387, - reuses: 45, - datasetName: "physics", - tags: ["advanced", "science", "popular"], - reviewedAt: "2025-09-14T07:30:00Z", - }, - { - id: "gt-012", - question: "How do vaccines work?", - answer: - "Vaccines work by training the immune system to recognize pathogens...", - status: "draft", - providerId: "json", - views: 612, - reuses: 78, - datasetName: "health", - reviewedAt: "2025-09-25T16:15:00Z", - }, - { - id: "gt-013", - question: "What is cloud computing?", - answer: "Cloud computing delivers computing services over the internet...", - status: "deleted", - providerId: "json", - views: 298, - reuses: 39, - datasetName: "technology", - reviewedAt: "2025-08-30T12:00:00Z", - }, - { - id: "gt-014", - question: "Explain Newton's laws of motion", - answer: - "Newton's three laws describe the relationship between objects and forces...", - status: "approved", - providerId: "json", - views: 165, - reuses: 21, - datasetName: "physics", - reviewedAt: "2025-09-03T09:45:00Z", - }, - { - id: "gt-015", - question: "What is artificial intelligence?", - answer: "AI is the simulation of human intelligence by machines...", - status: "draft", - providerId: "json", - views: 734, - reuses: 91, - datasetName: "technology", - reviewedAt: "2025-09-29T18:00:00Z", - }, - { - id: "gt-016", - question: "How do black holes form?", - answer: - "Black holes form when massive stars collapse at the end of their life cycle...", - status: "approved", - providerId: "json", - views: 445, - reuses: 56, - datasetName: "physics", - reviewedAt: "2025-09-11T11:20:00Z", - }, - { - id: "gt-017", - question: "What is cryptocurrency?", - answer: "Cryptocurrency is a digital currency secured by cryptography...", - status: "draft", - providerId: "json", - views: 521, - reuses: 64, - datasetName: "technology", - reviewedAt: "2025-09-27T10:30:00Z", - }, - { - id: "gt-018", - question: "Explain the concept of entropy", - answer: "Entropy is a measure of disorder or randomness in a system...", - status: "approved", - providerId: "json", - views: 187, - reuses: 23, - datasetName: "physics", - reviewedAt: "2025-09-06T14:50:00Z", - }, - { - id: "gt-019", - question: "What are stem cells?", - answer: - "Stem cells are undifferentiated cells capable of developing into various cell types...", - status: "draft", - providerId: "json", - views: 354, - reuses: 42, - datasetName: "biology", - reviewedAt: "2025-09-19T08:15:00Z", - }, - { - id: "gt-020", - question: "How does the internet work?", - answer: "The internet is a global network of interconnected computers...", - status: "approved", - providerId: "json", - views: 289, - reuses: 37, - datasetName: "technology", - reviewedAt: "2025-09-13T16:40:00Z", - }, - { - id: "gt-021", - question: "What is natural selection?", - answer: - "Natural selection is the process by which organisms better adapted survive...", - status: "approved", - providerId: "json", - views: 423, - reuses: 
54, - datasetName: "biology", - reviewedAt: "2025-09-16T12:30:00Z", - }, - { - id: "gt-022", - question: "Explain quantum entanglement", - answer: - "Quantum entanglement is a phenomenon where particles remain connected...", - status: "draft", - providerId: "json", - views: 267, - reuses: 31, - datasetName: "physics", - reviewedAt: "2025-09-21T09:00:00Z", - }, - { - id: "gt-023", - question: "What causes earthquakes?", - answer: - "Earthquakes occur when energy is released from tectonic plate movements...", - status: "deleted", - providerId: "json", - views: 198, - reuses: 25, - datasetName: "science", - reviewedAt: "2025-08-28T15:20:00Z", - }, - { - id: "gt-024", - question: "How do solar panels work?", - answer: - "Solar panels convert sunlight into electricity using photovoltaic cells...", - status: "approved", - providerId: "json", - views: 512, - reuses: 66, - datasetName: "technology", - reviewedAt: "2025-09-17T13:45:00Z", - }, - { - id: "gt-025", - question: "What is gene editing?", - answer: "Gene editing allows scientists to modify DNA sequences...", - status: "draft", - providerId: "json", - views: 389, - reuses: 48, - datasetName: "biology", - reviewedAt: "2025-09-24T11:10:00Z", - }, - { - id: "gt-026", - question: "Explain the greenhouse effect", - answer: - "The greenhouse effect is the warming of Earth's surface and atmosphere...", - status: "approved", - providerId: "json", - views: 456, - reuses: 58, - datasetName: "science", - reviewedAt: "2025-09-09T10:25:00Z", - }, - { - id: "gt-027", - question: "What is machine vision?", - answer: - "Machine vision enables computers to interpret visual information...", - status: "draft", - providerId: "json", - views: 312, - reuses: 38, - datasetName: "technology", - reviewedAt: "2025-09-23T14:00:00Z", - }, - { - id: "gt-028", - question: "How does GPS work?", - answer: "GPS uses satellites to determine precise location on Earth...", - status: "approved", - providerId: "json", - views: 234, - reuses: 29, - datasetName: "technology", - reviewedAt: "2025-09-07T08:40:00Z", - }, - { - id: "gt-029", - question: "What are exoplanets?", - answer: - "Exoplanets are planets that orbit stars outside our solar system...", - status: "approved", - providerId: "json", - views: 378, - reuses: 47, - datasetName: "physics", - reviewedAt: "2025-09-14T16:55:00Z", - }, - { - id: "gt-030", - question: "Explain nuclear fusion", - answer: "Nuclear fusion is the process that powers the sun...", - status: "draft", - providerId: "json", - views: 521, - reuses: 63, - datasetName: "physics", - reviewedAt: "2025-09-26T09:30:00Z", - }, - { - id: "gt-031", - question: "What is nanotechnology?", - answer: - "Nanotechnology involves manipulating matter at the atomic scale...", - status: "approved", - providerId: "json", - views: 267, - reuses: 33, - datasetName: "technology", - reviewedAt: "2025-09-04T12:15:00Z", - }, - { - id: "gt-032", - question: "How do antibiotics work?", - answer: "Antibiotics kill or inhibit the growth of bacteria...", - status: "draft", - providerId: "json", - views: 445, - reuses: 55, - datasetName: "health", - reviewedAt: "2025-09-20T17:30:00Z", - }, - { - id: "gt-033", - question: "What is dark matter?", - answer: - "Dark matter is an invisible form of matter that makes up most of the universe...", - status: "approved", - providerId: "json", - views: 598, - reuses: 72, - datasetName: "physics", - reviewedAt: "2025-09-15T11:45:00Z", - }, - { - id: "gt-034", - question: "Explain the Big Bang theory", - answer: "The Big Bang theory describes 
the origin of the universe...", - status: "approved", - providerId: "json", - views: 487, - reuses: 61, - datasetName: "physics", - reviewedAt: "2025-09-12T09:20:00Z", - }, - { - id: "gt-035", - question: "What is cybersecurity?", - answer: "Cybersecurity protects computer systems from digital attacks...", - status: "draft", - providerId: "json", - views: 623, - reuses: 79, - datasetName: "technology", - reviewedAt: "2025-09-28T13:50:00Z", - }, - { - id: "gt-036", - question: "How does the human brain work?", - answer: "The brain processes information through billions of neurons...", - status: "approved", - providerId: "json", - views: 712, - reuses: 88, - datasetName: "biology", - reviewedAt: "2025-09-19T15:10:00Z", - }, - { - id: "gt-037", - question: "What is 5G technology?", - answer: "5G is the fifth generation of cellular network technology...", - status: "draft", - providerId: "json", - views: 356, - reuses: 44, - datasetName: "technology", - reviewedAt: "2025-09-25T10:05:00Z", - }, - { - id: "gt-038", - question: "Explain plate tectonics", - answer: - "Plate tectonics describes the movement of Earth's lithospheric plates...", - status: "approved", - providerId: "json", - views: 298, - reuses: 36, - datasetName: "science", - reviewedAt: "2025-09-08T14:25:00Z", - }, - { - id: "gt-039", - question: "What is renewable energy?", - answer: "Renewable energy comes from naturally replenishing sources...", - status: "approved", - providerId: "json", - views: 534, - reuses: 68, - datasetName: "science", - reviewedAt: "2025-09-16T16:40:00Z", - }, - { - id: "gt-040", - question: "How do batteries work?", - answer: "Batteries convert chemical energy into electrical energy...", - status: "draft", - providerId: "json", - views: 412, - reuses: 51, - datasetName: "technology", - reviewedAt: "2025-09-22T11:55:00Z", - }, - { - id: "gt-041", - question: "What is the immune system?", - answer: "The immune system defends the body against harmful pathogens...", - status: "approved", - providerId: "json", - views: 467, - reuses: 59, - datasetName: "health", - reviewedAt: "2025-09-11T13:20:00Z", - }, - { - id: "gt-042", - question: "Explain deep learning", - answer: "Deep learning uses neural networks with multiple layers...", - status: "draft", - providerId: "json", - views: 589, - reuses: 73, - datasetName: "technology", - reviewedAt: "2025-09-27T15:35:00Z", - }, - { - id: "gt-043", - question: "What is bioengineering?", - answer: - "Bioengineering applies engineering principles to biological systems...", - status: "approved", - providerId: "json", - views: 321, - reuses: 40, - datasetName: "biology", - reviewedAt: "2025-09-10T10:50:00Z", - }, - { - id: "gt-044", - question: "How do superconductors work?", - answer: - "Superconductors conduct electricity with zero resistance at low temperatures...", - status: "draft", - providerId: "json", - views: 245, - reuses: 30, - datasetName: "physics", - reviewedAt: "2025-09-18T12:05:00Z", - }, - { - id: "gt-045", - question: "What is augmented reality?", - answer: "AR overlays digital information onto the real world...", - status: "approved", - providerId: "json", - views: 678, - reuses: 84, - datasetName: "technology", - reviewedAt: "2025-09-24T14:15:00Z", - }, - { - id: "gt-046", - question: "Explain the carbon cycle", - answer: "The carbon cycle describes how carbon moves through ecosystems...", - status: "approved", - providerId: "json", - views: 334, - reuses: 42, - datasetName: "science", - reviewedAt: "2025-09-13T09:30:00Z", - }, - { - id: "gt-047", - 
question: "What is quantum computing used for?", - answer: - "Quantum computers solve complex problems beyond classical computers...", - status: "draft", - providerId: "json", - views: 501, - reuses: 62, - datasetName: "technology", - reviewedAt: "2025-09-29T11:45:00Z", - }, - { - id: "gt-048", - question: "How does protein synthesis work?", - answer: - "Protein synthesis involves transcription and translation of genetic code...", - status: "approved", - providerId: "json", - views: 287, - reuses: 35, - datasetName: "biology", - reviewedAt: "2025-09-06T16:00:00Z", - }, - { - id: "gt-049", - question: "What is edge computing?", - answer: "Edge computing processes data closer to where it's generated...", - status: "draft", - providerId: "json", - views: 423, - reuses: 53, - datasetName: "technology", - reviewedAt: "2025-09-21T13:10:00Z", - }, - { - id: "gt-050", - question: "Explain the Doppler effect", - answer: - "The Doppler effect is the change in frequency due to relative motion...", - status: "approved", - providerId: "json", - views: 198, - reuses: 24, - datasetName: "physics", - reviewedAt: "2025-09-05T11:25:00Z", - }, +const questionPrompts = [ + "What is the capital of France?", + "How does photosynthesis work?", + "What is quantum computing?", + "Explain machine learning basics", + "What are the benefits of exercise?", + "How do neural networks work?", + "What is blockchain technology?", + "Explain the water cycle", + "What causes climate change?", + "How does DNA replication work?", +]; + +const answerSnippets = [ + "Paris is the capital city of France.", + "Photosynthesis converts light energy into chemical energy.", + "Quantum computing uses superposition and entanglement.", + "Machine learning learns patterns from data.", + "Exercise improves cardiovascular and mental health.", + "Neural networks stack layers of weighted transformations.", + "Blockchain is a tamper-evident distributed ledger.", + "The water cycle moves water through evaporation and precipitation.", + "Climate change is driven largely by greenhouse gas emissions.", + "DNA replication copies genetic material before cell division.", ]; +const datasets = ["technology", "science", "biology", "physics", "health"]; +const statuses: QuestionsExplorerItem["status"][] = [ + "approved", + "draft", + "approved", + "deleted", + "approved", +]; + +// Sample data with canonical history turns (no top-level answer convenience fields). +const sampleItems: QuestionsExplorerItem[] = Array.from( + { length: 50 }, + (_, i) => { + const prompt = questionPrompts[i % questionPrompts.length]; + const answer = answerSnippets[i % answerSnippets.length]; + const id = `gt-${String(i + 1).padStart(3, "0")}`; + + return { + id, + providerId: "json", + status: statuses[i % statuses.length], + deleted: statuses[i % statuses.length] === "deleted", + history: [ + { role: "user", content: prompt }, + { role: "agent", content: answer }, + ], + tags: i % 3 === 0 ? 
["popular", "beginner"] : ["technical"], + manualTags: [], + computedTags: [], + datasetName: datasets[i % datasets.length], + reviewedAt: `2025-09-${String((i % 28) + 1).padStart(2, "0")}T10:30:00Z`, + views: 100 + i * 7, + reuses: 5 + (i % 40), + }; + }, +); + export default function QuestionsExplorerExample() { const handleAssign = (item: QuestionsExplorerItem) => { console.log(`Assign ground truth: ${item.id}`); - // Implementation: Open assignment modal or navigate to assignment flow alert(`Assign functionality for ${item.id} would be triggered here`); }; const handleInspect = (item: QuestionsExplorerItem) => { console.log(`Inspect ground truth: ${item.id}`); - // Implementation: Open detail view or navigate to editor alert(`Inspect functionality for ${item.id} would be triggered here`); }; const handleDelete = (item: QuestionsExplorerItem) => { console.log(`Delete ground truth: ${item.id}`); - // Implementation: Confirm and perform soft delete const confirmed = window.confirm( `Are you sure you want to delete ${item.id}?`, ); diff --git a/frontend/src/components/app/QuestionsExplorer.tsx b/frontend/src/components/app/QuestionsExplorer.tsx index a188e4b..0d3e2e8 100644 --- a/frontend/src/components/app/QuestionsExplorer.tsx +++ b/frontend/src/components/app/QuestionsExplorer.tsx @@ -2,7 +2,11 @@ import { Lock } from "lucide-react"; import { useEffect, useId, useMemo, useRef, useState } from "react"; import useTags from "../../hooks/useTags"; import type { GroundTruthItem } from "../../models/groundTruth"; -import { getLastAgentTurn, getQueuePreview } from "../../models/groundTruth"; +import { + getItemReferences, + getLastAgentTurn, + getQueuePreview, +} from "../../models/groundTruth"; import { cn } from "../../models/utils"; import { getExplorerExtensions } from "../../registry/ExplorerExtensions"; import { fetchAvailableDatasets } from "../../services/datasets"; @@ -243,13 +247,16 @@ export default function QuestionsExplorer({ // Build API parameters from applied filters // Note: toolCallCount is a client-side sort only (not passed to API) const sortByParam = + appliedFilter.sortColumn === "tagCount" + ? "tagCount" + : appliedFilter.sortColumn === "refs" || + appliedFilter.sortColumn === "toolCallCount" + ? null // plugin/client-side sort; do not pass as core sortBy + : appliedFilter.sortColumn; + const pluginSortParam = appliedFilter.sortColumn === "refs" - ? "totalReferences" - : appliedFilter.sortColumn === "tagCount" - ? "tagCount" - : appliedFilter.sortColumn === "toolCallCount" - ? null // client-side sort; do not pass to backend - : appliedFilter.sortColumn; + ? "rag-compat:totalReferences" + : undefined; // Ensure page is at least 1 const safePage = Math.max(1, currentPage); @@ -267,10 +274,16 @@ export default function QuestionsExplorer({ ? appliedFilter.tags.exclude : undefined, itemId: appliedFilter.itemId || undefined, - refUrl: appliedFilter.refUrl || undefined, + pluginFilter: appliedFilter.refUrl + ? [`rag-compat:refUrl=${appliedFilter.refUrl}`] + : undefined, keyword: appliedFilter.keyword || undefined, - sortBy: sortByParam, - sortOrder: sortByParam ? appliedFilter.sortDirection : undefined, + sortBy: sortByParam ?? undefined, + pluginSort: pluginSortParam, + sortOrder: + sortByParam || pluginSortParam + ? appliedFilter.sortDirection + : undefined, page: safePage, limit: itemsPerPage, }; @@ -1232,7 +1245,7 @@ export default function QuestionsExplorer({ {/* Refs */} - {item.totalReferences ?? 
0} + {getItemReferences(item).length} {/* Tag Count */} diff --git a/frontend/src/dev/self-tests.ts b/frontend/src/dev/self-tests.ts index 7659582..2c392eb 100644 --- a/frontend/src/dev/self-tests.ts +++ b/frontend/src/dev/self-tests.ts @@ -16,7 +16,7 @@ export function runSelfTests() { const item: GroundTruthItem = { id: "T", question: "q", - answer: "a", + history: [{ role: "agent", content: "a" }], status: "draft", providerId: "json", }; diff --git a/frontend/src/models/demoData.ts b/frontend/src/models/demoData.ts index 07d7513..779c8a0 100644 --- a/frontend/src/models/demoData.ts +++ b/frontend/src/models/demoData.ts @@ -7,8 +7,6 @@ export const DEMO_JSON: GroundTruthItem[] = [ id: "demo-data-overage", providerId: "json", question: "CX IS USING TOO MUCH DATA AND WANTS TO KNOW WHY", - answer: - "The RCA shows the line exceeded the 50 GB plan cap after extended streaming and hotspot activity stayed on cellular data instead of Wi-Fi.", history: [ { role: "user", @@ -151,8 +149,6 @@ export const DEMO_JSON: GroundTruthItem[] = [ providerId: "json", question: "CUSTOMER WAS CHARGED ROAMING FEES EVEN THOUGH THEY BOUGHT A PASS", - answer: - "The travel pass activated after the first charged roaming session, so the early usage billed at standard rates and later usage correctly switched to the pass.", history: [ { role: "user", @@ -266,8 +262,6 @@ export const DEMO_JSON: GroundTruthItem[] = [ providerId: "json", question: "CUSTOMER THINKS THERE WAS AN OUTAGE WHEN DATA SLOWED DOWN AT A STADIUM", - answer: - "The slowdown was caused by short-lived cell congestion during a high-density event, not by a persistent account or device problem.", history: [ { role: "user", @@ -350,8 +344,6 @@ export const DEMO_JSON: GroundTruthItem[] = [ providerId: "json", question: "What is our refund policy for services interrupted during a network outage?", - answer: - "Customers affected by confirmed outages lasting more than 4 hours are eligible for a pro-rated service credit. Credits are applied automatically within 2 billing cycles for outages flagged in the NOC system.", history: [ { role: "user", diff --git a/frontend/src/models/groundTruth.ts b/frontend/src/models/groundTruth.ts index 3592650..5db5876 100644 --- a/frontend/src/models/groundTruth.ts +++ b/frontend/src/models/groundTruth.ts @@ -1,3 +1,10 @@ +import { + collectCanonicalReferencesFromCompatData, + getCompatData, + getCompatRetrievalsFromData, + writeCompatPluginEnvelope, +} from "./ragCompatPayload"; + // Domain models and constants for Ground Truth items // --------------------------------------------------------------------------- @@ -66,51 +73,8 @@ export type RetrievalCandidate = { toolCallId?: string; }; -// --------------------------------------------------------------------------- -// Per-call retrieval helpers (Phase 6 — retrieval normalization) -// -// References are stored in plugins["rag-compat"].data.retrievals per tool -// call. The helpers below provide flat Reference[] access for UI -// components that still consume the legacy Reference shape. -// --------------------------------------------------------------------------- - -const _RAG_COMPAT_KEY = "rag-compat"; -const _UNASSOCIATED_KEY = "_unassociated"; - -/** Per-call retrieval bucket as stored in plugin data. 
*/
-type RetrievalBucket = {
-  candidates: Array<{
-    url: string;
-    title?: string;
-    chunk?: string;
-    rawPayload?: Record<string, unknown>;
-    relevance?: string;
-    toolCallId?: string | null;
-    messageIndex?: number;
-    turnId?: string;
-    keyParagraph?: string;
-    bonus?: boolean;
-    visitedAt?: string | null;
-  }>;
-};
-
-/** Typed shorthand for the retrievals dict inside rag-compat plugin data. */
-type RetrievalsMap = Record<string, RetrievalBucket>;
-
-/**
- * Read the per-call retrievals map from plugin data.
- * Returns `undefined` when no per-call state exists.
- */
-export function getRetrievalsMap(
-  item: Pick<GroundTruthItem, "plugins">,
-): RetrievalsMap | undefined {
-  const data = item.plugins?.[_RAG_COMPAT_KEY]?.data;
-  if (!data) return undefined;
-  const r = data.retrievals;
-  if (r && typeof r === "object" && !Array.isArray(r)) {
-    return r as RetrievalsMap;
-  }
-  return undefined;
+export function getRetrievalsMap(item: Pick<GroundTruthItem, "plugins">) {
+  return getCompatRetrievalsFromData(getCompatData(item.plugins));
 }

 /**
@@ -121,41 +85,15 @@ export function getRetrievalsMap(
  * exists (caller should provide legacy references separately if needed).
  */
 export function getItemReferences(item: GroundTruthItem): Reference[] {
-  const retrievals = getRetrievalsMap(item);
-  if (!retrievals) return [];
   const history = ensureConversationTurnIdentity(item.history);
   const indexByTurnId = getTurnIndexById(history);
+  const historyTurnIds = history.map((turn) => turn.turnId);

-  const refs: Reference[] = [];
-  let refIndex = 0;
-  for (const [toolCallId, bucket] of Object.entries(retrievals)) {
-    if (!bucket?.candidates) continue;
-    for (const c of bucket.candidates) {
-      const storedTurnId = c.turnId;
-      const resolvedMessageIndex =
-        storedTurnId && indexByTurnId.has(storedTurnId)
-          ? indexByTurnId.get(storedTurnId)
-          : c.messageIndex;
-      const resolvedTurnId =
-        storedTurnId ||
-        (typeof resolvedMessageIndex === "number"
-          ? history[resolvedMessageIndex]?.turnId
-          : undefined);
-      refs.push({
-        id: `ref_${refIndex++}`,
-        title: c.title,
-        url: c.url,
-        snippet: c.chunk,
-        visitedAt: c.visitedAt ?? null,
-        keyParagraph: c.keyParagraph,
-        bonus: c.bonus ?? false,
-        messageIndex: resolvedMessageIndex,
-        turnId: resolvedTurnId,
-        toolCallId: toolCallId !== _UNASSOCIATED_KEY ? toolCallId : undefined,
-      });
-    }
-  }
-  return refs;
+  return collectCanonicalReferencesFromCompatData({
+    data: getCompatData(item.plugins),
+    historyTurnIds,
+    indexByTurnId,
+  });
 }

 /**
@@ -167,35 +105,10 @@ export function withUpdatedReferences(
   item: GroundTruthItem,
   refs: Reference[],
 ): GroundTruthItem {
-  const retrievals: RetrievalsMap = {};
-  for (const ref of refs) {
-    const key = ref.toolCallId || _UNASSOCIATED_KEY;
-    if (!retrievals[key]) {
-      retrievals[key] = { candidates: [] };
-    }
-    retrievals[key].candidates.push({
-      url: ref.url,
-      title: ref.title,
-      chunk: ref.snippet,
-      relevance: undefined,
-      toolCallId: ref.toolCallId || undefined,
-      messageIndex: ref.turnId ? 
undefined : ref.messageIndex, - turnId: ref.turnId, - keyParagraph: ref.keyParagraph, - bonus: ref.bonus, - visitedAt: ref.visitedAt, - }); - } - - const plugins = { ...(item.plugins || {}) }; - const existing = plugins[_RAG_COMPAT_KEY]; - plugins[_RAG_COMPAT_KEY] = { - kind: _RAG_COMPAT_KEY, - version: existing?.version || "1.0", - data: { ...(existing?.data || {}), retrievals }, + return { + ...item, + plugins: writeCompatPluginEnvelope({ plugins: item.plugins, refs }), }; - - return { ...item, plugins }; } // --------------------------------------------------------------------------- @@ -214,8 +127,8 @@ export type ConversationTurn = { turnId?: string; /** Stable workflow-step identity when a turn maps to a durable step. */ stepId?: string; - /** Free-form role string. "user" marks the human turn; any other value is a non-user (agent/assistant) turn. - * Common values: "user", "agent", "assistant", "output-agent", "orchestrator-agent". */ + /** Free-form role string. "user" marks the human turn; all non-user roles + * represent non-user/answer content (e.g. "agent", "assistant", "planner"). */ role: string; content: string; /** Expected behavior(s) for this turn in the conversation (agent turns only, legacy/compat) */ @@ -340,8 +253,6 @@ export type GroundTruthItem = { bucket?: string; /** Legacy compatibility projection derived from history when absent. */ question?: string; - /** Legacy compatibility projection derived from history when absent. */ - answer?: string; /** ISO date string of the last review, when provided by the API. */ reviewedAt?: string | null; /** @@ -349,8 +260,6 @@ export type GroundTruthItem = { * Rendered in a collapsible pane above the Question/Answer editors. */ curationInstructions?: string; - /** Backend-computed total count of references (item-level + all turn-level). */ - totalReferences?: number; /** ETag for optimistic concurrency control */ _etag?: string; }; @@ -369,6 +278,18 @@ export function getLegacyHostDeleteGates(): LegacyHostDeleteGate[] { return [...LEGACY_HOST_DELETE_GATES]; } +function normalizeRole(role: string): string { + return role.trim().toLowerCase(); +} + +export function isUserRole(role: string): boolean { + return normalizeRole(role) === "user"; +} + +export function isNonUserRole(role: string): boolean { + return !isUserRole(role); +} + export function createConversationTurn(args: { role: string; content: string; @@ -408,7 +329,7 @@ export function getLastUserTurn(item: GroundTruthItem): string { } // Find the last user turn for (let i = history.length - 1; i >= 0; i--) { - if (history[i].role === "user") { + if (isUserRole(history[i].role)) { return history[i].content; } } @@ -417,19 +338,19 @@ export function getLastUserTurn(item: GroundTruthItem): string { /** * Returns the last agent message from history. - * "Agent" is any turn whose role is not "user" (supports free-form roles). + * Non-user turns are treated as answer content. */ export function getLastAgentTurn(item: GroundTruthItem): string { if (!Array.isArray(item.history)) { - return item.answer || ""; + return ""; } const history = ensureConversationTurnIdentity(item.history); if (history.length === 0) { return ""; } - // Find the last non-user turn (any agent/assistant/orchestrator role) + // Find the last non-user turn. 
for (let i = history.length - 1; i >= 0; i--) {
-    if (history[i].role !== "user") {
+    if (isNonUserRole(history[i].role)) {
       return history[i].content;
     }
   }
@@ -458,8 +379,8 @@ export function getQueuePreview(item: GroundTruthItem): string {
   if (!Array.isArray(item.history)) {
     return item.question || "(no message)";
   }
-  const first = ensureConversationTurnIdentity(item.history).find(
-    (t) => t.role === "user",
+  const first = ensureConversationTurnIdentity(item.history).find((t) =>
+    isUserRole(t.role),
   );
   return first?.content || "(no message)";
 }
@@ -477,7 +398,6 @@
   return {
     ...derivedItem,
     question: getLastUserTurn(derivedItem),
-    answer: getLastAgentTurn(derivedItem),
   };
 }
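
A small behavioral sketch of the role helpers introduced above; the expected values follow from normalizeRole trimming and lower-casing before comparison:

import { isNonUserRole, isUserRole } from "./groundTruth";

isUserRole("user");        // true
isUserRole("  User  ");    // true — roles are trimmed and lower-cased first
isNonUserRole("agent");    // true
isNonUserRole("planner");  // true — any custom non-user role counts as an answer turn
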
diff --git a/frontend/src/models/ragCompatPayload.ts b/frontend/src/models/ragCompatPayload.ts
new file mode 100644
index 0000000..5340a4f
--- /dev/null
+++ b/frontend/src/models/ragCompatPayload.ts
@@ -0,0 +1,292 @@
+const _RAG_COMPAT_KEY = "rag-compat";
+const _UNASSOCIATED_KEY = "_unassociated";
+
+export type CompatPluginPayload = {
+  kind: string;
+  version: string;
+  data?: Record<string, unknown>;
+};
+
+export type CompatPluginsMap = Record<string, CompatPluginPayload>;
+
+export type CompatReferencePayload = {
+  url: string;
+  title?: string;
+  content?: string;
+  keyExcerpt?: string;
+  bonus?: boolean;
+  messageIndex?: number;
+  turnId?: string;
+  toolCallId?: string;
+  visitedAt?: string | null;
+};
+
+export type CanonicalReferencePayload = {
+  id: string;
+  url: string;
+  title?: string;
+  snippet?: string;
+  visitedAt?: string | null;
+  keyParagraph?: string;
+  bonus?: boolean;
+  messageIndex?: number;
+  turnId?: string;
+  toolCallId?: string;
+};
+
+export type RetrievalCandidatePayload = {
+  url: string;
+  title?: string;
+  chunk?: string;
+  relevance?: string;
+  toolCallId?: string | null;
+  messageIndex?: number;
+  turnId?: string;
+  keyParagraph?: string;
+  bonus?: boolean;
+  visitedAt?: string | null;
+};
+
+export type RetrievalBucketPayload = {
+  candidates: RetrievalCandidatePayload[];
+};
+
+export type RetrievalsMap = Record<string, RetrievalBucketPayload>;
+
+function asObjectRecord(value: unknown): Record<string, unknown> | undefined {
+  if (!value || typeof value !== "object" || Array.isArray(value)) {
+    return undefined;
+  }
+  return value as Record<string, unknown>;
+}
+
+export function getCompatData(
+  plugins: CompatPluginsMap | undefined,
+): Record<string, unknown> | undefined {
+  return asObjectRecord(plugins?.[_RAG_COMPAT_KEY]?.data);
+}
+
+export function getCompatReferencesFromData(
+  data: Record<string, unknown> | undefined,
+): CompatReferencePayload[] | undefined {
+  const references = data?.references;
+  if (!Array.isArray(references)) {
+    return undefined;
+  }
+  return references as CompatReferencePayload[];
+}
+
+export function getCompatReferencesFromPlugins(
+  plugins: CompatPluginsMap | undefined,
+): CompatReferencePayload[] | undefined {
+  return getCompatReferencesFromData(getCompatData(plugins));
+}
+
+export function getCompatRetrievalsFromData(
+  data: Record<string, unknown> | undefined,
+): RetrievalsMap | undefined {
+  const retrievals = data?.retrievals;
+  if (
+    retrievals &&
+    typeof retrievals === "object" &&
+    !Array.isArray(retrievals)
+  ) {
+    return retrievals as RetrievalsMap;
+  }
+  return undefined;
+}
+
+export function getCompatRetrievalsFromPlugins(
+  plugins: CompatPluginsMap | undefined,
+): RetrievalsMap | undefined {
+  return getCompatRetrievalsFromData(getCompatData(plugins));
+}
+
+export function retrievalsToCanonicalReferences(args: {
+  retrievals: RetrievalsMap;
+  historyTurnIds: Array<string | undefined>;
+  indexByTurnId: Map<string, number>;
+}): CompatReferencePayload[] {
+  const { retrievals, historyTurnIds, indexByTurnId } = args;
+  const refs: CompatReferencePayload[] = [];
+  for (const [toolCallId, bucket] of Object.entries(retrievals)) {
+    if (!bucket?.candidates) continue;
+    for (const candidate of bucket.candidates) {
+      if (!candidate?.url) continue;
+      const storedTurnId = candidate.turnId;
+      const resolvedMessageIndex =
+        storedTurnId && indexByTurnId.has(storedTurnId)
+          ? indexByTurnId.get(storedTurnId)
+          : candidate.messageIndex;
+      const resolvedTurnId =
+        storedTurnId ||
+        (typeof resolvedMessageIndex === "number"
+          ? historyTurnIds[resolvedMessageIndex]
+          : undefined);
+      refs.push({
+        url: candidate.url,
+        title: candidate.title,
+        content: candidate.chunk,
+        keyExcerpt: candidate.keyParagraph,
+        bonus: candidate.bonus ?? false,
+        messageIndex: resolvedTurnId ? undefined : resolvedMessageIndex,
+        turnId: resolvedTurnId,
+        toolCallId:
+          toolCallId !== _UNASSOCIATED_KEY
+            ? toolCallId
+            : candidate.toolCallId || undefined,
+        visitedAt: candidate.visitedAt ?? null,
+      });
+    }
+  }
+  return refs;
+}
+
+export function compatReferencesToCanonicalPayload(args: {
+  references: CompatReferencePayload[];
+  historyTurnIds: Array<string | undefined>;
+  indexByTurnId: Map<string, number>;
+}): CanonicalReferencePayload[] {
+  const { references, historyTurnIds, indexByTurnId } = args;
+  return references
+    .filter((ref): ref is CompatReferencePayload => !!ref?.url)
+    .map((ref, index) => {
+      const resolvedMessageIndex =
+        ref.turnId && indexByTurnId.has(ref.turnId)
+          ? indexByTurnId.get(ref.turnId)
+          : ref.messageIndex;
+      const resolvedTurnId =
+        ref.turnId ||
+        (typeof resolvedMessageIndex === "number"
+          ? historyTurnIds[resolvedMessageIndex]
+          : undefined);
+      return {
+        id: `ref_${index}`,
+        title: ref.title,
+        url: ref.url,
+        snippet: ref.content,
+        visitedAt: ref.visitedAt ?? null,
+        keyParagraph: ref.keyExcerpt,
+        bonus: ref.bonus ?? false,
+        messageIndex: resolvedMessageIndex,
+        turnId: resolvedTurnId,
+        toolCallId: ref.toolCallId,
+      };
+    });
+}
+
+export function collectCanonicalReferencesFromCompatData(args: {
+  data: Record<string, unknown> | undefined;
+  historyTurnIds: Array<string | undefined>;
+  indexByTurnId: Map<string, number>;
+}): CanonicalReferencePayload[] {
+  const { data, historyTurnIds, indexByTurnId } = args;
+  const canonicalRefs = getCompatReferencesFromData(data);
+  if (canonicalRefs) {
+    return compatReferencesToCanonicalPayload({
+      references: canonicalRefs,
+      historyTurnIds,
+      indexByTurnId,
+    });
+  }
+
+  const retrievals = getCompatRetrievalsFromData(data);
+  if (!retrievals) {
+    return [];
+  }
+  return retrievalsToCanonicalReferences({
+    retrievals,
+    historyTurnIds,
+    indexByTurnId,
+  }).map((ref, index) => ({
+    id: `ref_${index}`,
+    title: ref.title,
+    url: ref.url,
+    snippet: ref.content,
+    visitedAt: ref.visitedAt ?? null,
+    keyParagraph: ref.keyExcerpt,
+    bonus: ref.bonus ?? false,
+    messageIndex: ref.turnId ? undefined : ref.messageIndex,
+    turnId: ref.turnId,
+    toolCallId: ref.toolCallId,
+  }));
+}
+
+export function collectCanonicalReferencesFromCompatPlugins(args: {
+  plugins: CompatPluginsMap | undefined;
+  historyTurnIds: Array<string | undefined>;
+  indexByTurnId: Map<string, number>;
+}): CanonicalReferencePayload[] {
+  const { plugins, historyTurnIds, indexByTurnId } = args;
+  return collectCanonicalReferencesFromCompatData({
+    data: getCompatData(plugins),
+    historyTurnIds,
+    indexByTurnId,
+  });
+}
+
+export function serializeCanonicalReferences(
+  refs: Array<
+    Pick<
+      CanonicalReferencePayload,
+      | "url"
+      | "title"
+      | "snippet"
+      | "keyParagraph"
+      | "bonus"
+      | "messageIndex"
+      | "turnId"
+      | "toolCallId"
+      | "visitedAt"
+    >
+  >,
+): CompatReferencePayload[] {
+  return refs.map((ref) => ({
+    url: ref.url,
+    title: ref.title,
+    content: ref.snippet,
+    keyExcerpt: ref.keyParagraph,
+    bonus: ref.bonus ?? false,
+    messageIndex: ref.turnId ? undefined : ref.messageIndex,
+    turnId: ref.turnId,
+    toolCallId: ref.toolCallId,
+    visitedAt: ref.visitedAt ?? null,
+  }));
+}
+
+export function writeCompatPluginEnvelope(args: {
+  plugins: CompatPluginsMap | undefined;
+  refs: Array<
+    Pick<
+      CanonicalReferencePayload,
+      | "url"
+      | "title"
+      | "snippet"
+      | "keyParagraph"
+      | "bonus"
+      | "messageIndex"
+      | "turnId"
+      | "toolCallId"
+      | "visitedAt"
+    >
+  >;
+}): CompatPluginsMap {
+  const { plugins, refs } = args;
+  const references = serializeCanonicalReferences(refs);
+  const nextPlugins = { ...(plugins || {}) };
+  const existingCompat = nextPlugins[_RAG_COMPAT_KEY];
+  const existingData =
+    existingCompat?.data &&
+    typeof existingCompat.data === "object" &&
+    !Array.isArray(existingCompat.data)
+      ? existingCompat.data
+      : {};
+  const { retrievals: _deprecatedRetrievals, ...restData } = existingData;
+
+  nextPlugins[_RAG_COMPAT_KEY] = {
+    kind: _RAG_COMPAT_KEY,
+    version: existingCompat?.version || "1.0",
+    data: { ...restData, references },
+  };
+  return nextPlugins;
+}
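Taken together, the read and write halves of this module round-trip the `rag-compat` envelope: readers accept either canonical `data.references` or the older `data.retrievals` buckets, and the writer always emits `data.references` while scrubbing the deprecated key. A sketch under those assumptions; the import path, sample URL, and turn ids are illustrative:

```ts
import {
  collectCanonicalReferencesFromCompatPlugins,
  writeCompatPluginEnvelope,
  type CompatPluginsMap,
} from "./frontend/src/models/ragCompatPayload";

// Hypothetical stored envelope that still carries a deprecated
// `retrievals` map instead of canonical `references`.
const plugins: CompatPluginsMap = {
  "rag-compat": {
    kind: "rag-compat",
    version: "1.0",
    data: {
      retrievals: {
        tc1: {
          candidates: [{ url: "https://example.com/doc", messageIndex: 1 }],
        },
      },
    },
  },
};

// Reading resolves retrieval candidates into canonical reference payloads.
const refs = collectCanonicalReferencesFromCompatPlugins({
  plugins,
  historyTurnIds: ["turn-user", "turn-agent"],
  indexByTurnId: new Map([
    ["turn-user", 0],
    ["turn-agent", 1],
  ]),
});

// Writing emits `data.references` and drops the deprecated `retrievals` key.
const next = writeCompatPluginEnvelope({ plugins, refs });
console.assert(next["rag-compat"].data?.references !== undefined);
console.assert(next["rag-compat"].data?.retrievals === undefined);
```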
diff --git a/frontend/src/services/groundTruths.ts b/frontend/src/services/groundTruths.ts
index f99a443..3bbe703 100644
--- a/frontend/src/services/groundTruths.ts
+++ b/frontend/src/services/groundTruths.ts
@@ -1,8 +1,4 @@
-import type {
-  ApiGroundTruth,
-  ApiHistoryEntry,
-  ApiReference,
-} from "../adapters/apiMapper";
+import type { ApiGroundTruth, ApiHistoryEntry } from "../adapters/apiMapper";
 import { groundTruthFromApi } from "../adapters/apiMapper";
 import { client } from "../api/client";
 import type { components, operations } from "../api/generated";
@@ -14,11 +10,6 @@ type GroundTruthItemOut = Omit<
   components["schemas"]["AgenticGroundTruthEntry-Output"],
   "history"
 > & {
-  synthQuestion?: string | null;
-  editedQuestion?: string | null;
-  answer?: string | null;
-  refs?: ApiReference[];
-  totalReferences?: number;
   tags?: string[];
   comment?: string | null;
   history?: ApiHistoryEntry[];
@@ -46,9 +37,10 @@ interface ListAllGroundTruthsParams {
   tags?: string[];
   excludeTags?: string[];
   itemId?: string | null;
-  refUrl?: string | null;
+  pluginFilter?: string[];
   keyword?: string | null;
   sortBy?: string | null;
+  pluginSort?: string | null;
   sortOrder?: "asc" | "desc" | null;
   page?: number;
   limit?: number;
@@ -71,10 +63,11 @@ export async function listAllGroundTruths(
   if (params.excludeTags?.length)
     query.excludeTags = params.excludeTags.join(",");
   if (params.itemId) query.itemId = params.itemId;
-  if (params.refUrl) query.refUrl = params.refUrl;
+  if (params.pluginFilter?.length) query.pluginFilter = params.pluginFilter;
   if (params.keyword) query.keyword = params.keyword;
   if (params.sortBy)
     query.sortBy = params.sortBy as components["schemas"]["SortField"];
+  if (params.pluginSort) query.pluginSort = params.pluginSort;
   if (params.sortOrder) query.sortOrder = params.sortOrder;
   if (typeof params.page === "number") query.page = params.page;
   if (typeof params.limit === "number") query.limit = params.limit;
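The dedicated `refUrl` query parameter is superseded by the generic `pluginFilter` list, with `pluginSort` as the matching sort knob. A hedged sketch of how a caller might drive the new parameters; the `"<pack>:<field>=<value>"` filter string matches the `rag-compat:refUrl=...` expectation in the QuestionsExplorer test further below, while the `pluginSort` value shown is an assumed shape that this diff does not confirm:

```ts
import { listAllGroundTruths } from "./frontend/src/services/groundTruths";

// Illustrative helper: encode a reference-URL filter the way the UI test
// below expects ("rag-compat:refUrl=https://example.com/ref").
function buildRefUrlFilter(refUrl: string): string {
  return `rag-compat:refUrl=${refUrl}`;
}

async function findByReferenceUrl(refUrl: string) {
  return listAllGroundTruths({
    pluginFilter: [buildRefUrlFilter(refUrl)],
    // Assumed sort-key shape; the diff only shows the value is a string.
    pluginSort: "rag-compat:refUrl",
    sortOrder: "asc",
    page: 1,
    limit: 25,
  });
}
```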
diff --git a/frontend/tests/unit/adapters/apiMapper.test.ts b/frontend/tests/unit/adapters/apiMapper.test.ts
index 7731f6b..0b6412d 100644
--- a/frontend/tests/unit/adapters/apiMapper.test.ts
+++ b/frontend/tests/unit/adapters/apiMapper.test.ts
@@ -11,11 +11,7 @@ function makeApiItem(overrides: Partial<ApiGroundTruth> = {}): ApiGroundTruth {
   return {
     id: "gt-1",
     status: "draft",
-    answer: "Test answer",
-    synthQuestion: "Synth question",
-    editedQuestion: "Edited question",
     history: undefined,
-    refs: [],
     tags: [],
     manualTags: [],
     computedTags: [],
@@ -27,14 +23,28 @@ function makeApiItem(overrides: Partial<ApiGroundTruth> = {}): ApiGroundTruth {
   } as ApiGroundTruth;
 }
 
+function withCompatData(
+  data: Record<string, unknown>,
+): Pick<ApiGroundTruth, "plugins"> {
+  return {
+    plugins: {
+      "rag-compat": {
+        kind: "rag-compat",
+        version: "1.0",
+        data,
+      },
+    },
+  };
+}
+
 describe("groundTruthFromApi", () => {
   describe("role mapping", () => {
-    it("maps history role 'assistant' to 'agent'", () => {
+    it("preserves history role values from the API payload", () => {
       const api = makeApiItem({
         history: [{ role: "assistant", msg: "Hello from assistant" }],
       });
       const result = groundTruthFromApi(api);
-      expect(result.history?.[0].role).toBe("agent");
+      expect(result.history?.[0].role).toBe("assistant");
       expect(result.history?.[0].content).toBe("Hello from assistant");
     });
 
@@ -46,6 +56,19 @@
       expect(result.history?.[0].role).toBe("user");
       expect(result.history?.[0].content).toBe("Hello from user");
     });
+
+    it("derives compatibility question from the latest user turn", () => {
+      const api = makeApiItem({
+        history: [
+          { role: "user", msg: "Initial question" },
+          { role: "planner", msg: "Planner output" },
+          { role: "user", msg: "Follow-up question" },
+          { role: "assistant", msg: "Final answer" },
+        ],
+      });
+      const result = groundTruthFromApi(api);
+      expect(result.question).toBe("Follow-up question");
+    });
   });
 
   describe("expectedBehavior handling", () => {
@@ -84,7 +107,32 @@
   });
 
   describe("reference mapping", () => {
-    it("assigns turn refs to correct messageIndex", () => {
+    it("reads canonical rag-compat data.references", () => {
+      const api = makeApiItem({
+        history: [{ role: "assistant", msg: "A" }],
+        ...withCompatData({
+          references: [
+            {
+              url: "https://canonical.ref/1",
+              title: "Canonical Ref",
+              content: "Canonical snippet",
+              keyExcerpt: "Canonical key excerpt",
+              bonus: true,
+              messageIndex: 0,
+            },
+          ],
+        }),
+      });
+      const result = groundTruthFromApi(api);
+      const [ref] = getItemReferences(result);
+      expect(ref.url).toBe("https://canonical.ref/1");
+      expect(ref.title).toBe("Canonical Ref");
+      expect(ref.snippet).toBe("Canonical snippet");
+      expect(ref.keyParagraph).toBe("Canonical key excerpt");
+      expect(ref.bonus).toBe(true);
+    });
+
+    it("ignores retired turn-level refs from history payloads", () => {
       const api = makeApiItem({
         history: [
           { role: "user", msg: "Question" },
@@ -108,39 +156,28 @@
       });
       const result = groundTruthFromApi(api);
 
-      // Refs from history[1] should have messageIndex 1
-      const allRefs = getItemReferences(result);
-      const refsAt1 = allRefs.filter((r) => r.messageIndex === 1);
-      expect(refsAt1).toHaveLength(2);
-      expect(refsAt1.map((r) => r.url)).toEqual([
-        "https://ref1.com",
-        "https://ref2.com",
-      ]);
-
-      // Refs from history[3] should have messageIndex 3
-      const refsAt3 = allRefs.filter((r) => r.messageIndex === 3);
-      expect(refsAt3).toHaveLength(1);
-      expect(refsAt3[0].url).toBe("https://ref3.com");
+      expect(getItemReferences(result)).toEqual([]);
     });
 
-    it("maps ref fields correctly", () => {
+    it("maps canonical plugin reference fields correctly", () => {
       const api = makeApiItem({
-        history: [
-          { role: "user", msg: "Q" },
-          {
-            role: "assistant",
-            msg: "A",
-            refs: [
-              {
-                url: "https://example.com",
-                title: "Example Title",
-                content: "Snippet content",
-                keyExcerpt: "Key paragraph",
-                bonus: true,
-              },
-            ],
+        plugins: {
+          "rag-compat": {
+            kind: "rag-compat",
+            version: "1.0",
+            data: {
+              references: [
+                {
+                  url: "https://example.com",
+                  title: "Example Title",
+                  content: "Snippet content",
+                  keyExcerpt: "Key paragraph",
+                  bonus: true,
+                },
+              ],
+            },
           },
-        ],
+        },
       });
       const result = groundTruthFromApi(api);
       const ref = getItemReferences(result)[0];
@@ -155,85 +192,118 @@
     });
   });
 
-  describe("legacy single-turn conversion", () => {
-    it("creates 2-turn history from editedQuestion and answer", () => {
+  describe("retired single-turn compat behavior", () => {
+    it("does not synthesize history from editedQuestion and answer", () => {
       const api = makeApiItem({
-        editedQuestion: "What is X?",
-        answer: "X is Y",
         history: undefined,
+        ...withCompatData({
+          editedQuestion: "What is X?",
+          answer: "X is Y",
+        }),
       });
       const result = groundTruthFromApi(api);
-      expect(result.history).toHaveLength(2);
-      expect(result.history?.[0]).toMatchObject({
-        role: "user",
-        content: "What is X?",
-      });
-      expect(result.history?.[1]).toMatchObject({
-        role: "agent",
-        content: "X is Y",
-      });
+      expect(result.history).toBeUndefined();
     });
 
-    it("falls back to synthQuestion when editedQuestion is empty", () => {
+    it("does not fall back to synthQuestion when editedQuestion is empty", () => {
       const api = makeApiItem({
-        synthQuestion: "Synth question?",
-        editedQuestion: "",
-        answer: "Answer",
         history: undefined,
+        ...withCompatData({
+          synthQuestion: "Synth question?",
+          editedQuestion: "",
+          answer: "Answer",
+        }),
       });
       const result = groundTruthFromApi(api);
-      expect(result.history?.[0].content).toBe("Synth question?");
+      expect(result.history).toBeUndefined();
     });
 
-    it("assigns legacy top-level refs to messageIndex 1", () => {
+    it("does not import legacy top-level refs", () => {
       const api = makeApiItem({
-        editedQuestion: "Question",
-        answer: "Answer",
-        refs: [
-          {
-            url: "https://legacy.ref",
-            content: "Legacy content",
-            bonus: false,
-          },
-        ],
         history: undefined,
+        ...withCompatData({
+          editedQuestion: "Question",
+          answer: "Answer",
+          refs: [
+            {
+              url: "https://legacy.ref",
+              content: "Legacy content",
+              bonus: false,
+            },
+          ],
+        }),
       });
       const result = groundTruthFromApi(api);
-      expect(getItemReferences(result)).toHaveLength(1);
-      expect(getItemReferences(result)[0].messageIndex).toBe(1);
+      expect(getItemReferences(result)).toEqual([]);
     });
 
-    it("creates empty agent turn when answer is empty", () => {
+    it("does not create synthetic turns when answer is empty", () => {
       const api = makeApiItem({
-        editedQuestion: "Question without answer",
-        answer: "",
         history: undefined,
+        ...withCompatData({
+          editedQuestion: "Question without answer",
+          answer: "",
+        }),
+      });
+      const result = groundTruthFromApi(api);
+
+      expect(result.history).toBeUndefined();
+    });
+
+    it("treats explicit API empty history as authoritative over compat question/answer", () => {
+      const api = makeApiItem({
+        history: [],
+        ...withCompatData({
+          editedQuestion: "Compat question",
+          answer: "Compat answer",
+        }),
       });
       const result = groundTruthFromApi(api);
-      expect(result.history).toHaveLength(2);
-      expect(result.history?.[1].content).toBe("");
+      expect(result.history).toEqual([]);
     });
   });
 
   describe("multi-turn item top-level refs", () => {
-    it("assigns top-level refs to undefined messageIndex for true multi-turn", () => {
+    it("does not import compat refs for true multi-turn", () => {
       const api = makeApiItem({
         history: [
           { role: "user", msg: "Q" },
           { role: "assistant", msg: "A" },
         ],
-        refs: [
-          { url: "https://global.ref", content: "Global ref", bonus: false },
-        ],
+        ...withCompatData({
+          refs: [
+            {
+              url: "https://global.ref",
+              content: "Global ref",
+              bonus: false,
+            },
+          ],
+        }),
+      });
+      const result = groundTruthFromApi(api);
+
+      expect(getItemReferences(result)).toEqual([]);
+    });
+
+    it("treats explicit empty canonical references as authoritative", () => {
+      const api = makeApiItem({
+        history: [{ role: "assistant", msg: "A" }],
+        ...withCompatData({
+          references: [],
+          retrievals: {
+            _unassociated: {
+              candidates: [{ url: "https://stale.ref", messageIndex: 0 }],
+            },
+          },
+        }),
       });
       const result = groundTruthFromApi(api);
-      expect(getItemReferences(result)).toHaveLength(1);
-      expect(getItemReferences(result)[0].messageIndex).toBeUndefined();
+      expect(getItemReferences(result)).toEqual([]);
     });
   });
 
@@ -348,7 +418,7 @@
       id: "gt-1",
       providerId: "api",
       question: "Test question",
-      answer: "Test answer",
+      history: [{ role: "agent", content: "Test answer" }],
      status: "draft",
      deleted: false,
      tags: [],
@@ -358,7 +428,7 @@
   }
 
   describe("role mapping", () => {
-    it("maps UI role 'agent' to API role 'assistant'", () => {
+    it("preserves UI role values when serializing patch payloads", () => {
       const item = makeDomainItem({
         history: [
           { role: "user", content: "Q" },
@@ -368,12 +438,160 @@
       const patch = groundTruthToPatch({ item });
 
       expect(patch.history?.[0].role).toBe("user");
-      expect(patch.history?.[1].role).toBe("assistant");
+      expect(patch.history?.[1].role).toBe("agent");
     });
   });
 
   describe("reference handling", () => {
-    it("includes refs only on agent turns in history", () => {
+    it("does not create rag-compat plugin when history exists without compat payload", () => {
+      const item = makeDomainItem({
+        history: [
+          { role: "user", content: "Q" },
+          { role: "agent", content: "A" },
+        ],
+        plugins: {
+          other: {
+            kind: "other",
+            version: "1.0",
+            data: { keep: true },
+          },
+        },
+      });
+      const patch = groundTruthToPatch({ item });
+      const patchPlugins = (patch as Record<string, unknown>).plugins as
+        | Record<string, { data?: Record<string, unknown> }>
+        | undefined;
+      expect(patchPlugins?.other?.data).toEqual({ keep: true });
+      expect(patchPlugins?.["rag-compat"]).toBeUndefined();
+    });
+
+    it("round-trips canonical rag-compat data.references through patch generation", () => {
+      const item = makeDomainItem({
+        history: [
+          { role: "user", content: "Q" },
+          { role: "agent", content: "A" },
+        ],
+        plugins: {
+          "rag-compat": {
+            kind: "rag-compat",
+            version: "1.0",
+            data: {
+              references: [
+                {
+                  url: "https://canonical.roundtrip/ref",
+                  title: "Round Trip Ref",
+                  content: "Round trip snippet",
+                  keyExcerpt: "Round trip excerpt",
+                  messageIndex: 1,
+                },
+              ],
+            },
+          },
+        },
+      });
+
+      const patch = groundTruthToPatch({ item });
+      expect(patch.history?.[1].refs).toBeUndefined();
+      const patchPlugins = (patch as Record<string, unknown>).plugins as
+        | Record<string, { data?: Record<string, unknown> }>
+        | undefined;
+      expect(patchPlugins?.["rag-compat"]?.data?.references).toEqual([
+        expect.objectContaining({
+          url: "https://canonical.roundtrip/ref",
+          title: "Round Trip Ref",
+        }),
+      ]);
+    });
+
+    it("materializes retrieval-only rag-compat payloads into canonical references during save patch", () => {
+      const fromApi = groundTruthFromApi(
+        makeApiItem({
+          history: [
+            { role: "user", msg: "Q", turnId: "turn-user" },
+            { role: "assistant", msg: "A", turnId: "turn-answer" },
+          ],
+          ...withCompatData({
+            retrievals: {
+              tc1: {
+                candidates: [
+                  {
+                    url: "https://retrieval.only/ref",
+                    title: "Retrieval Only Ref",
+                    chunk: "retrieval snippet",
+                    messageIndex: 1,
+                  },
+                ],
+              },
+            },
+          }),
+        }),
+      );
+
+      const patch = groundTruthToPatch({ item: fromApi });
+      const patchPlugins = (patch as Record<string, unknown>).plugins as
+        | Record<string, { data?: Record<string, unknown> }>
+        | undefined;
+
+      expect(patchPlugins?.["rag-compat"]?.data?.references).toEqual([
+        expect.objectContaining({
+          url: "https://retrieval.only/ref",
+          title: "Retrieval Only Ref",
+          toolCallId: "tc1",
+          turnId: "turn-answer",
+        }),
+      ]);
+      expect(patchPlugins?.["rag-compat"]?.data?.retrievals).toBeUndefined();
+    });
+
+    it("scrubs removed legacy compat keys and deprecated compat retrievals", () => {
+      const item = makeDomainItem({
+        history: [
+          { role: "user", content: "Q" },
+          { role: "agent", content: "A" },
+        ],
+        plugins: {
+          "rag-compat": {
+            kind: "rag-compat",
+            version: "1.0",
+            data: {
+              synthQuestion: "legacy question",
+              editedQuestion: "legacy edited",
+              answer: "legacy answer",
+              refs: [{ url: "https://legacy.ref" }],
+              totalReferences: 99,
+              historyAnnotations: [{ note: "legacy" }],
+              references: [{ url: "https://canonical.ref" }],
+              retrievals: {
+                _unassociated: {
+                  candidates: [{ url: "https://retrieval.ref" }],
+                },
+              },
+            },
+          },
+        },
+      });
+
+      const patch = groundTruthToPatch({ item });
+      const patchPlugins = (patch as Record<string, unknown>).plugins as
+        | Record<string, { data?: Record<string, unknown> }>
+        | undefined;
+      const compatData = patchPlugins?.["rag-compat"]?.data;
+
+      expect(compatData).toBeDefined();
+      expect(compatData?.synthQuestion).toBeUndefined();
+      expect(compatData?.editedQuestion).toBeUndefined();
+      expect(compatData?.answer).toBeUndefined();
+      expect(compatData?.refs).toBeUndefined();
+      expect(compatData?.totalReferences).toBeUndefined();
+      expect(compatData?.historyAnnotations).toBeUndefined();
+      expect(compatData?.references).toEqual([
+        expect.objectContaining({ url: "https://canonical.ref" }),
+      ]);
+      expect(compatData?.retrievals).toBeUndefined();
+      expect(compatData?.turnIdentity).toBeUndefined();
+    });
+
+    it("does not emit retired refs on user or agent turns in history", () => {
       const item = makeDomainItem({
         history: [
"Q" }, @@ -433,19 +645,12 @@ describe("groundTruthToPatch", () => { }, }, }); - const patch = groundTruthToPatch({ item, originalApi }); + const patch = groundTruthToPatch({ item }); - // Top-level refs should include refs with messageIndex 1 - expect(patch.refs).toHaveLength(2); - expect(patch.refs?.map((r) => r.url)).toContain("https://legacy.ref"); - expect(patch.refs?.map((r) => r.url)).toContain("https://new.ref"); + expect(patch.history?.[1]?.refs).toBeUndefined(); }); - it("preserves top-level refs when legacy items use empty history arrays", () => { - const originalApi = makeApiItem({ - history: [], - refs: [{ url: "https://legacy-empty.ref", bonus: false }], - }); + it("does not emit refs when item history exists", () => { const item = makeDomainItem({ history: [ { role: "user", content: "Q" }, @@ -468,27 +673,12 @@ describe("groundTruthToPatch", () => { }, }, }); - const patch = groundTruthToPatch({ item, originalApi }); + const patch = groundTruthToPatch({ item }); - expect(patch.refs).toHaveLength(2); - expect(patch.refs?.map((r) => r.url)).toContain( - "https://legacy-empty.ref", - ); - expect(patch.refs?.map((r) => r.url)).toContain("https://new-empty.ref"); + expect(patch.history?.[1]?.refs).toBeUndefined(); }); - it("omits top-level refs for true multi-turn items", () => { - const originalApi = makeApiItem({ - history: [ - { role: "user", msg: "Q" }, - { - role: "assistant", - msg: "A", - refs: [{ url: "https://turn.ref", bonus: false }], - }, - ], - refs: [], - }); + it("does not serialize refs into assistant history entries", () => { const item = makeDomainItem({ history: [ { role: "user", content: "Q" }, @@ -508,16 +698,12 @@ describe("groundTruthToPatch", () => { }, }, }); - const patch = groundTruthToPatch({ item, originalApi }); - - // Top-level refs should be empty for true multi-turn - expect(patch.refs).toHaveLength(0); + const patch = groundTruthToPatch({ item }); - // Refs should be in history - expect(patch.history?.[1].refs).toHaveLength(1); + expect(patch.history?.[1].refs).toBeUndefined(); }); - it("maps ref fields correctly in patch", () => { + it("omits ref fields from patch history entries", () => { const item = makeDomainItem({ history: [ { role: "user", content: "Q" }, @@ -547,13 +733,7 @@ describe("groundTruthToPatch", () => { }, }); const patch = groundTruthToPatch({ item }); - const ref = patch.history?.[1].refs?.[0]; - - expect(ref?.url).toBe("https://example.com"); - expect(ref?.title).toBe("Title"); - expect(ref?.content).toBe("Snippet"); - expect(ref?.keyExcerpt).toBe("Key"); - expect(ref?.bonus).toBe(true); + expect(patch.history?.[1].refs).toBeUndefined(); }); }); @@ -630,14 +810,22 @@ describe("groundTruthToPatch", () => { }); describe("basic field mapping", () => { - it("includes answer and editedQuestion", () => { + it("serializes canonical history content without role remapping", () => { const item = makeDomainItem({ - question: "My question", - answer: "My answer", + history: [ + { role: "user", content: "My question" }, + { role: "agent", content: "My answer" }, + ], }); const patch = groundTruthToPatch({ item }); - expect(patch.answer).toBe("My answer"); - expect(patch.editedQuestion).toBe("My question"); + expect(patch.history?.[0]).toMatchObject({ + role: "user", + msg: "My question", + }); + expect(patch.history?.[1]).toMatchObject({ + role: "agent", + msg: "My answer", + }); }); it("includes manualTags", () => { diff --git a/frontend/tests/unit/adapters/apiProvider-etag.test.ts b/frontend/tests/unit/adapters/apiProvider-etag.test.ts 
diff --git a/frontend/tests/unit/adapters/apiProvider-etag.test.ts b/frontend/tests/unit/adapters/apiProvider-etag.test.ts
index 4377739..9af17ed 100644
--- a/frontend/tests/unit/adapters/apiProvider-etag.test.ts
+++ b/frontend/tests/unit/adapters/apiProvider-etag.test.ts
@@ -1,14 +1,12 @@
 import { beforeEach, describe, expect, it, vi } from "vitest";
 import { ApiProvider } from "../../../src/adapters/apiProvider";
 import type { components } from "../../../src/api/generated";
-import type { GroundTruthItem } from "../../../src/models/groundTruth";
+import {
+  type GroundTruthItem,
+  getLastAgentTurn,
+} from "../../../src/models/groundTruth";
 
 type ApiItem = components["schemas"]["AgenticGroundTruthEntry-Output"] & {
-  synthQuestion?: string | null;
-  editedQuestion?: string | null;
-  answer?: string | null;
-  refs?: components["schemas"]["Reference"][];
-  totalReferences?: number;
   tags?: string[];
   comment?: string | null;
 };
@@ -40,11 +38,7 @@ function makeApiItem(overrides: Partial<ApiItem> = {}): ApiItem {
   return {
     id: "gt-1",
     status: "draft",
-    answer: "Original answer",
-    synthQuestion: "Synth question",
-    editedQuestion: "Edited question",
     history: [],
-    refs: [],
     tags: [],
     comment: null,
     datasetName: "dataset-1",
@@ -67,7 +61,7 @@ describe("ApiProvider ETag 412 retry behavior", () => {
     const freshItem = makeApiItem({ _etag: "etag-fresh" });
     const updatedItem = makeApiItem({
       _etag: "etag-after-update",
-      answer: "Updated answer",
+      history: [{ role: "assistant", msg: "Updated answer" }],
     });
 
     mockGetMyAssignments.mockResolvedValue([originalItem]);
@@ -85,7 +79,7 @@
 
     const domainItem: GroundTruthItem = {
       ...items[0],
-      answer: "Updated answer",
+      history: [{ role: "agent", content: "Updated answer" }],
     };
     const result = await provider.save(domainItem);
 
@@ -108,7 +102,7 @@
     expect(mockUpdateAssignedGroundTruth.mock.calls[1][4]).toBe("etag-fresh");
 
     // Result should be the updated item
-    expect(result.answer).toBe("Updated answer");
+    expect(getLastAgentTurn(result)).toBe("Updated answer");
   });
 
   it("updates cache with fresh ETag after 412 retry", async () => {
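The first test pins down the optimistic-concurrency contract: the save path sends the cached `_etag`, and on a 412 it re-fetches the assignment and retries once with the fresh tag. A minimal sketch of that loop, with `updateWithEtag`, `fetchFresh`, and the `status` error field as hypothetical stand-ins for the provider's internals:

```ts
// Hypothetical sketch of the 412-retry loop exercised by the test above.
async function saveWithRetry<T extends { _etag?: string }>(
  item: T,
  updateWithEtag: (item: T, etag: string | undefined) => Promise<T>,
  fetchFresh: () => Promise<T>,
): Promise<T> {
  try {
    // First attempt uses the cached ETag (an If-Match precondition server-side).
    return await updateWithEtag(item, item._etag);
  } catch (err) {
    if ((err as { status?: number }).status !== 412) throw err;
    // Precondition failed: another writer got there first. Re-fetch to pick
    // up the fresh ETag, then retry exactly once.
    const fresh = await fetchFresh();
    return updateWithEtag(item, fresh._etag);
  }
}
```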
diff --git a/frontend/tests/unit/components/app/CurateLayout.integration.test.tsx b/frontend/tests/unit/components/app/CurateLayout.integration.test.tsx
index b61d784..65ecb5b 100644
--- a/frontend/tests/unit/components/app/CurateLayout.integration.test.tsx
+++ b/frontend/tests/unit/components/app/CurateLayout.integration.test.tsx
@@ -12,7 +12,6 @@ function MiniCurateApp() {
     {
       id: "1",
       question: "Q-1",
-      answer: "",
       history: [{ role: "user", content: "Q-1" }],
       status: "draft",
       providerId: "json",
@@ -21,7 +20,6 @@
     {
       id: "2",
       question: "Q-2",
-      answer: "",
       history: [{ role: "user", content: "Q-2" }],
       status: "draft",
       providerId: "json",
diff --git a/frontend/tests/unit/components/app/QuestionsExplorer.test.tsx b/frontend/tests/unit/components/app/QuestionsExplorer.test.tsx
index 2b292a7..97badc4 100644
--- a/frontend/tests/unit/components/app/QuestionsExplorer.test.tsx
+++ b/frontend/tests/unit/components/app/QuestionsExplorer.test.tsx
@@ -47,14 +47,21 @@ vi.mock("../../../../src/services/datasets", () => ({
 
 const createMockItem = (
   overrides: Partial<QuestionsExplorerItem> = {},
-): QuestionsExplorerItem => ({
-  id: "item-1",
-  question: "Test Question",
-  answer: "Test Answer",
-  status: "draft",
-  providerId: "test",
-  ...overrides,
-});
+): QuestionsExplorerItem => {
+  const question = overrides.question ?? "Test Question";
+  const history = overrides.history ?? [
+    { role: "user", content: question },
+    { role: "agent", content: "Test Answer" },
+  ];
+  return {
+    id: "item-1",
+    question,
+    history,
+    status: "draft",
+    providerId: "test",
+    ...overrides,
+  };
+};
 
 describe("QuestionsExplorer", () => {
   const mockOnAssign = vi.fn();
@@ -109,14 +116,14 @@
     async (
       params: {
         itemId?: string;
-        refUrl?: string;
+        pluginFilter?: string[];
         keyword?: string;
         page?: number;
       } = {},
     ) => {
       const hasTextFilter =
         Boolean(params.itemId) ||
-        Boolean(params.refUrl) ||
+        Boolean(params.pluginFilter?.length) ||
         Boolean(params.keyword);
       const page = typeof params.page === "number" ? params.page : 1;
       const totalPages = hasTextFilter ? 1 : 3;
@@ -405,7 +412,9 @@
       name: "reference URL",
       label: "Reference URL:",
       value: "https://example.com/ref",
-      expectedFilter: { refUrl: "https://example.com/ref" },
+      expectedFilter: {
+        pluginFilter: ["rag-compat:refUrl=https://example.com/ref"],
+      },
     },
     {
       name: "keyword",
diff --git a/frontend/tests/unit/components/app/pages/CuratePane.test.tsx b/frontend/tests/unit/components/app/pages/CuratePane.test.tsx
index 9498d62..1075bfe 100644
--- a/frontend/tests/unit/components/app/pages/CuratePane.test.tsx
+++ b/frontend/tests/unit/components/app/pages/CuratePane.test.tsx
@@ -5,7 +5,7 @@ import type { GroundTruthItem } from "../../../../../src/models/groundTruth";
 const item: GroundTruthItem = {
   id: "1",
   question: "What is this software?",
-  answer: "",
+  history: [{ role: "agent", content: "" }],
   status: "draft",
   providerId: "json",
   tags: [],
diff --git a/frontend/tests/unit/components/app/pages/QuestionsList.test.tsx b/frontend/tests/unit/components/app/pages/QuestionsList.test.tsx
index 36d7247..3e3440e 100644
--- a/frontend/tests/unit/components/app/pages/QuestionsList.test.tsx
+++ b/frontend/tests/unit/components/app/pages/QuestionsList.test.tsx
@@ -5,7 +5,7 @@ const mkItem = (id: string, deleted = false): GroundTruthItem => ({
   id,
   question: `Q-${id}`,
-  answer: "",
+  history: [{ role: "agent", content: "" }],
   tags: [],
   status: "draft",
   providerId: "json",
diff --git a/frontend/tests/unit/components/app/pages/ReferencesSection.test.tsx b/frontend/tests/unit/components/app/pages/ReferencesSection.test.tsx
index 87a07dd..52b5778 100644
--- a/frontend/tests/unit/components/app/pages/ReferencesSection.test.tsx
+++ b/frontend/tests/unit/components/app/pages/ReferencesSection.test.tsx
@@ -85,7 +85,7 @@ const makeItem = (
 ): GroundTruthItem => ({
   id: "i1",
   question: "Q",
-  answer: "A",
+  history: [{ role: "agent", content: "A" }],
   status: "draft",
   providerId: "test",
   ...overrides,
diff --git a/frontend/tests/unit/hooks/useGroundTruth-deleteTurn.test.tsx b/frontend/tests/unit/hooks/useGroundTruth-deleteTurn.test.tsx
index 50f7545..a999b72 100644
--- a/frontend/tests/unit/hooks/useGroundTruth-deleteTurn.test.tsx
+++ b/frontend/tests/unit/hooks/useGroundTruth-deleteTurn.test.tsx
@@ -1,6 +1,10 @@
 import { act, renderHook, waitFor } from "@testing-library/react";
 import type { ConversationTurn } from "../../../src/models/groundTruth";
-import { getItemReferences } from "../../../src/models/groundTruth";
+import {
+  getItemReferences,
+  getLastAgentTurn,
+  getLastUserTurn,
+} from "../../../src/models/groundTruth";
 
 vi.mock("../../../src/config/demo", () => ({
   default: true,
@@ -181,18 +185,18 @@
       },
     ];
     await seedHistory(result, history);
-    expect(result.current.current?.question).toBe("Second question");
-    expect(result.current.current?.answer).toBe("Second answer");
+    expect(getLastUserTurn(result.current.current!)).toBe("Second question");
+    expect(getLastAgentTurn(result.current.current!)).toBe("Second answer");
     await act(async () => {
       result.current.deleteTurn(3);
     });
-    expect(result.current.current?.question).toBe("Second question");
-    expect(result.current.current?.answer).toBe("First answer");
+    expect(getLastUserTurn(result.current.current!)).toBe("Second question");
+    expect(getLastAgentTurn(result.current.current!)).toBe("First answer");
     await act(async () => {
       result.current.deleteTurn(2);
     });
-    expect(result.current.current?.question).toBe("First question");
-    expect(result.current.current?.answer).toBe("First answer");
+    expect(getLastUserTurn(result.current.current!)).toBe("First question");
+    expect(getLastAgentTurn(result.current.current!)).toBe("First answer");
   });
 
   it("handles empty or out-of-range deletions without breaking canonical state", async () => {
@@ -208,7 +212,7 @@
       result.current.deleteTurn(0);
     });
     expect(result.current.current?.history).toHaveLength(0);
-    expect(result.current.current?.question).toBe("");
-    expect(result.current.current?.answer).toBe("");
+    expect(getLastUserTurn(result.current.current!)).toBe("");
+    expect(getLastAgentTurn(result.current.current!)).toBe("");
   });
 });
diff --git a/frontend/tests/unit/hooks/useGroundTruth-multiturn.test.tsx b/frontend/tests/unit/hooks/useGroundTruth-multiturn.test.tsx
index 41b00bd..2c5452b 100644
--- a/frontend/tests/unit/hooks/useGroundTruth-multiturn.test.tsx
+++ b/frontend/tests/unit/hooks/useGroundTruth-multiturn.test.tsx
@@ -1,6 +1,10 @@
 import { act, renderHook, waitFor } from "@testing-library/react";
 import type { ConversationTurn } from "../../../src/models/groundTruth";
-import { getItemReferences } from "../../../src/models/groundTruth";
+import {
+  getItemReferences,
+  getLastAgentTurn,
+  getLastUserTurn,
+} from "../../../src/models/groundTruth";
 
 vi.mock("../../../src/config/demo", () => ({
   default: true,
@@ -54,8 +58,8 @@
     expect(
       result.current.current?.history?.every((turn) => !!turn.turnId),
     ).toBe(true);
-    expect(result.current.current?.question).toBe("New question");
-    expect(result.current.current?.answer).toBe("Fresh answer");
+    expect(getLastUserTurn(result.current.current!)).toBe("New question");
+    expect(getLastAgentTurn(result.current.current!)).toBe("Fresh answer");
   });
 
   it("addTurn appends to history and keeps question/answer in sync", async () => {
@@ -67,7 +71,7 @@
     expect(result.current.current?.history?.length).toBe(
       initialHistoryLength + 1,
     );
-    expect(result.current.current?.question).toBe("Follow-up question");
+    expect(getLastUserTurn(result.current.current!)).toBe("Follow-up question");
     await act(async () => {
       result.current.addTurn("agent", "Agent reply");
     });
@@ -75,7 +79,7 @@
       role: "agent",
       content: "Agent reply",
     });
-    expect(result.current.current?.answer).toBe("Agent reply");
+    expect(getLastAgentTurn(result.current.current!)).toBe("Agent reply");
   });
 
   it("stateSignature ignores visitedAt mutations for hasUnsaved", async () => {
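With the `answer` projection gone, callers that previously compared `current?.question` / `current?.answer` now derive both values from history, and `withDerivedLegacyFields` keeps only `question`. A small illustration of the pattern the model tests below rely on; the item literal and import path are samples, not taken from the suite:

```ts
import {
  getLastAgentTurn,
  withDerivedLegacyFields,
} from "./frontend/src/models/groundTruth";

// Abbreviated sample item: `question` is derived, `answer` is not.
const derived = withDerivedLegacyFields({
  id: "gt-1",
  providerId: "demo",
  status: "draft",
  history: [
    { role: "user", content: "First question" },
    { role: "agent", content: "First answer" },
    { role: "user", content: "Follow-up" },
    { role: "assistant", content: "Updated answer" },
  ],
});

console.assert(derived.question === "Follow-up");
// There is no derived.answer anymore; callers ask the history directly.
console.assert(getLastAgentTurn(derived) === "Updated answer");
```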
diff --git a/frontend/tests/unit/models/groundTruth.multiturn.test.ts b/frontend/tests/unit/models/groundTruth.multiturn.test.ts
index 15546f0..2c714a3 100644
--- a/frontend/tests/unit/models/groundTruth.multiturn.test.ts
+++ b/frontend/tests/unit/models/groundTruth.multiturn.test.ts
@@ -7,6 +7,7 @@ import {
   getLastUserTurn,
   getTurnCount,
   isMultiTurn,
+  withDerivedLegacyFields,
 } from "../../../src/models/groundTruth";
 
 describe("groundTruth multi-turn helpers", () => {
@@ -16,7 +17,7 @@
     id: "item-1",
     providerId: "demo",
     question: "fallback question",
-    answer: "fallback answer",
+    history: [{ role: "agent", content: "fallback answer" }],
     status: "draft",
     ...overrides,
   });
@@ -58,6 +59,16 @@
       expect(getLastAgentTurn(item)).toBe("");
     });
 
+    it("treats custom non-user roles as answer turns", () => {
+      const item = makeItem({
+        history: [
+          { role: "user", content: "User" },
+          { role: "planner", content: "Intermediate planner output" },
+        ],
+      });
+      expect(getLastAgentTurn(item)).toBe("Intermediate planner output");
+    });
+
     it("returns latest matching turn content", () => {
       const item = makeItem({
         history: [
@@ -70,6 +81,29 @@
       expect(getLastUserTurn(item)).toBe("Follow-up");
       expect(getLastAgentTurn(item)).toBe("Updated answer");
     });
+
+    it("derives compatibility question from the latest user turn for cross-layer parity", () => {
+      const item = makeItem({
+        history: [
+          { role: "user", content: "Initial question" },
+          { role: "planner", content: "Interim planning output" },
+          { role: "user", content: "Follow-up question" },
+          { role: "assistant", content: "Final answer" },
+        ],
+      });
+      expect(withDerivedLegacyFields(item).question).toBe("Follow-up question");
+    });
+
+    it("returns the last non-user turn regardless of role label", () => {
+      const item = makeItem({
+        history: [
+          { role: "user", content: "Question" },
+          { role: "assistant", content: "Assistant output" },
+          { role: "planner", content: "Planner output" },
+        ],
+      });
+      expect(getLastAgentTurn(item)).toBe("Planner output");
+    });
   });
 
   describe("conversation metadata helpers", () => {
diff --git a/frontend/tests/unit/models/gtHelpers.expectedBehavior.test.ts b/frontend/tests/unit/models/gtHelpers.expectedBehavior.test.ts
index 12bb707..2c20da5 100644
--- a/frontend/tests/unit/models/gtHelpers.expectedBehavior.test.ts
+++ b/frontend/tests/unit/models/gtHelpers.expectedBehavior.test.ts
@@ -23,7 +23,6 @@ describe("canApproveMultiTurn - Expected Behavior Validation", () => {
   id: "test-1",
   providerId: "test",
   question: "Test question",
-  answer: "Test answer",
   status: "draft",
   expectedTools: { required: [{ name: "search" }] },
   toolCalls: [{ id: "tc1", name: "search", callType: "tool" }],
@@ -311,7 +310,7 @@ describe("canApproveMultiTurn - expectedTools gating", () => {
   id: "test-et",
   providerId: "test",
   question: "Test question",
-  answer: "Test answer",
+  history: [{ role: "agent", content: "Test answer" }],
   status: "draft",
 };
 const validHistory: ConversationTurn[] = [
"../../../src/models/provider"; describe("JsonProvider duplicate", () => { @@ -16,7 +19,7 @@ describe("JsonProvider duplicate", () => { expect(created.id.startsWith("temp-")).toBe(true); // Core fields copied expect(created.question).toBe(original.question); - expect(created.answer).toBe(original.answer); + expect(getLastAgentTurn(created)).toBe(getLastAgentTurn(original)); expect(getItemReferences(created).length).toBe( getItemReferences(original).length, ); diff --git a/frontend/tests/unit/provider/provider.multiturn.test.ts b/frontend/tests/unit/provider/provider.multiturn.test.ts index 27687d4..5e808bd 100644 --- a/frontend/tests/unit/provider/provider.multiturn.test.ts +++ b/frontend/tests/unit/provider/provider.multiturn.test.ts @@ -1,4 +1,5 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { ApiReference } from "../../../src/adapters/apiMapper"; import { ApiProvider } from "../../../src/adapters/apiProvider"; import type { components } from "../../../src/api/generated"; import type { @@ -34,18 +35,15 @@ vi.mock("../../../src/services/groundTruths", () => ({ })); type ApiHistoryEntry = components["schemas"]["HistoryEntry"] & { - refs?: components["schemas"]["Reference"][]; + refs?: ApiReference[]; expectedBehavior?: string[]; + turnId?: string; + stepId?: string; }; type ApiItem = Omit< components["schemas"]["AgenticGroundTruthEntry-Output"], "history" > & { - synthQuestion?: string | null; - editedQuestion?: string | null; - answer?: string | null; - refs?: components["schemas"]["Reference"][]; - totalReferences?: number; tags?: string[]; comment?: string | null; history?: ApiHistoryEntry[]; @@ -57,11 +55,7 @@ function makeApiItem(overrides: Partial = {}): ApiItem { return { id: "gt-1", status: "draft", - answer: "Original answer", - synthQuestion: "Synth question", - editedQuestion: "Edited question", history: [], - refs: [], tags: [], comment: null, datasetName: "dataset-1", @@ -71,6 +65,20 @@ function makeApiItem(overrides: Partial = {}): ApiItem { } as ApiItem; } +function withCompatData( + data: Record, +): Pick { + return { + plugins: { + "rag-compat": { + kind: "rag-compat", + version: "1.0", + data, + }, + }, + }; +} + beforeEach(() => { mockGetMyAssignments.mockReset(); mockUpdateAssignedGroundTruth.mockReset(); @@ -94,14 +102,14 @@ describe("ApiProvider mapping", () => { expect(history).toHaveLength(2); expect(history[0]).toMatchObject({ role: "user", content: "How do I?" 
diff --git a/frontend/tests/unit/provider/provider.multiturn.test.ts b/frontend/tests/unit/provider/provider.multiturn.test.ts
index 27687d4..5e808bd 100644
--- a/frontend/tests/unit/provider/provider.multiturn.test.ts
+++ b/frontend/tests/unit/provider/provider.multiturn.test.ts
@@ -1,4 +1,5 @@
 import { beforeEach, describe, expect, it, vi } from "vitest";
+import type { ApiReference } from "../../../src/adapters/apiMapper";
 import { ApiProvider } from "../../../src/adapters/apiProvider";
 import type { components } from "../../../src/api/generated";
 import type {
@@ -34,18 +35,15 @@ vi.mock("../../../src/services/groundTruths", () => ({
 }));
 
 type ApiHistoryEntry = components["schemas"]["HistoryEntry"] & {
-  refs?: components["schemas"]["Reference"][];
+  refs?: ApiReference[];
   expectedBehavior?: string[];
+  turnId?: string;
+  stepId?: string;
 };
 
 type ApiItem = Omit<
   components["schemas"]["AgenticGroundTruthEntry-Output"],
   "history"
 > & {
-  synthQuestion?: string | null;
-  editedQuestion?: string | null;
-  answer?: string | null;
-  refs?: components["schemas"]["Reference"][];
-  totalReferences?: number;
   tags?: string[];
   comment?: string | null;
   history?: ApiHistoryEntry[];
@@ -57,11 +55,7 @@ function makeApiItem(overrides: Partial<ApiItem> = {}): ApiItem {
   return {
     id: "gt-1",
     status: "draft",
-    answer: "Original answer",
-    synthQuestion: "Synth question",
-    editedQuestion: "Edited question",
     history: [],
-    refs: [],
     tags: [],
     comment: null,
     datasetName: "dataset-1",
@@ -71,6 +65,20 @@
   } as ApiItem;
 }
 
+function withCompatData(
+  data: Record<string, unknown>,
+): Pick<ApiItem, "plugins"> {
+  return {
+    plugins: {
+      "rag-compat": {
+        kind: "rag-compat",
+        version: "1.0",
+        data,
+      },
+    },
+  };
+}
+
 beforeEach(() => {
   mockGetMyAssignments.mockReset();
   mockUpdateAssignedGroundTruth.mockReset();
@@ -94,14 +102,14 @@ describe("ApiProvider mapping", () => {
     expect(history).toHaveLength(2);
     expect(history[0]).toMatchObject({ role: "user", content: "How do I?" });
     expect(history[1]).toMatchObject({
-      role: "agent",
+      role: "assistant",
       content: "Use the regenerate command.",
     });
     expect(history[0]?.turnId).toBeTruthy();
     expect(history[1]?.turnId).toBeTruthy();
   });
 
-  it("maps per-turn refs onto the owning non-user turn", async () => {
+  it("ignores retired history refs payloads on read", async () => {
     const apiItem = makeApiItem({
       history: [
         { role: "user", msg: "Q" },
@@ -122,77 +130,53 @@
     mockGetMyAssignments.mockResolvedValue([apiItem]);
     const provider = new ApiProvider();
     const { items } = await provider.list();
-    const turn = items[0].history?.[1];
-    const [ref] = getItemReferences(items[0]);
-    expect(ref).toMatchObject({
-      url: "https://turn.ref",
-      bonus: true,
-      messageIndex: 1,
-      turnId: turn?.turnId,
-    });
+    expect(getItemReferences(items[0])).toEqual([]);
   });
 });
 
-describe("compat-migration read projections", () => {
-  it("projects legacy single-turn payloads into stable user and agent turns", async () => {
+describe("retired compat read behavior", () => {
+  it("does not synthesize history from retired compat question/answer fields", async () => {
     const apiItem = makeApiItem({
-      synthQuestion: "What is X?",
-      editedQuestion: "What is X exactly?",
-      answer: "X is Y",
-      tags: ["important", "technical"],
       history: undefined,
+      ...withCompatData({
+        synthQuestion: "What is X?",
+        editedQuestion: "What is X exactly?",
+        answer: "X is Y",
+      }),
     });
     mockGetMyAssignments.mockResolvedValue([apiItem]);
     const provider = new ApiProvider();
     const { items } = await provider.list();
-    const history = items[0].history ?? [];
-    expect(history).toHaveLength(2);
-    expect(history[0]).toMatchObject({
-      role: "user",
-      content: "What is X exactly?",
-    });
-    expect(history[1]).toMatchObject({
-      role: "agent",
-      content: "X is Y",
-    });
+    expect(items[0].history).toBeUndefined();
   });
 
-  it("anchors legacy top-level refs to the synthesized agent turn even without an answer", async () => {
+  it("does not import retired compat refs into canonical references", async () => {
     const apiItem = makeApiItem({
-      editedQuestion: "How do I configure authentication for my app?",
-      answer: "",
-      refs: [
-        {
-          url: "https://docs.example.com/auth",
-          content: "Authentication documentation content",
-          keyExcerpt: "Use OAuth 2.0 for authentication",
-          bonus: false,
-        },
-      ],
       history: undefined,
+      ...withCompatData({
+        editedQuestion: "How do I configure authentication for my app?",
+        answer: "",
+        refs: [
+          {
+            url: "https://docs.example.com/auth",
+            content: "Authentication documentation content",
+            keyExcerpt: "Use OAuth 2.0 for authentication",
+            bonus: false,
+          },
+        ],
+      }),
    });
    mockGetMyAssignments.mockResolvedValue([apiItem]);
    const provider = new ApiProvider();
    const { items } = await provider.list();
-    const history = items[0].history ?? [];
-    const [ref] = getItemReferences(items[0]);
-    expect(history).toHaveLength(2);
-    expect(history[0]?.content).toBe(
-      "How do I configure authentication for my app?",
-    );
-    expect(history[1]).toMatchObject({ role: "agent", content: "" });
-    expect(ref).toMatchObject({
-      url: "https://docs.example.com/auth",
-      messageIndex: 1,
-      turnId: history[1]?.turnId,
-    });
+    expect(getItemReferences(items[0])).toEqual([]);
   });
 });
 
 describe("ApiProvider serialization", () => {
   describe("core-generic multi-turn writes", () => {
-    it("serializes history roles and keeps refs scoped to non-user turns", async () => {
+    it("serializes history roles and omits retired history refs", async () => {
       const apiItem = makeApiItem({
         history: [
           { role: "user", msg: "Original Q" },
@@ -207,10 +191,6 @@
             ...apiItem,
             id,
             history: (patch.history as ApiItem["history"]) ?? apiItem.history,
-            refs: (patch.refs as ApiItem["refs"]) ?? apiItem.refs,
-            answer: (patch.answer as string) ?? apiItem.answer,
-            editedQuestion:
-              (patch.editedQuestion as string) ?? apiItem.editedQuestion,
             status: (patch.status as ApiItem["status"]) ?? apiItem.status,
           } as ApiItem;
         },
@@ -230,11 +210,6 @@
           url: "https://turn",
           turnId: "turn-agent-updated",
         },
-        {
-          id: "user-ref",
-          url: "https://user",
-          turnId: "turn-user-updated",
-        },
       ];
       const updated: GroundTruthItem = withUpdatedReferences(
         { ...domain, history },
@@ -245,73 +220,29 @@
       const patch = capturedPatch as Patch;
       const patchHistory = patch.history as ApiItem["history"];
       expect(patchHistory?.[0]?.role).toBe("user");
-      expect(patchHistory?.[1]?.role).toBe("assistant");
+      expect(patchHistory?.[1]?.role).toBe("agent");
       expect(patchHistory?.[0]?.refs).toBeUndefined();
-      expect(patchHistory?.[1]?.refs).toHaveLength(1);
-      expect(patchHistory?.[1]?.refs?.[0]?.url).toBe("https://turn");
-    });
-
-    it("keeps true multi-turn refs out of top-level compatibility fields", async () => {
-      const apiItem = makeApiItem({
-        history: [
-          { role: "user", msg: "Question" },
-          {
-            role: "assistant",
-            msg: "Answer",
-            refs: [
-              {
-                url: "https://turn.ref",
-                content: "Turn content",
-                bonus: false,
-              },
-            ],
-          },
-        ],
-        refs: [],
-      });
-      let capturedPatch: Patch | undefined;
-      mockUpdateAssignedGroundTruth.mockImplementation(
-        async (
-          _dataset: string,
-          _bucket: string,
-          _id: string,
-          patch: Patch,
-        ) => {
-          capturedPatch = patch;
-          return apiItem;
-        },
-      );
-      mockGetMyAssignments.mockResolvedValue([apiItem]);
-      const provider = new ApiProvider();
-      const { items } = await provider.list();
-      await provider.save(items[0]);
-      const patch = capturedPatch as Patch;
-      const patchHistory = patch.history as ApiItem["history"];
-      expect(patch.refs).toHaveLength(0);
-      expect(patchHistory?.[1]?.refs).toHaveLength(1);
-      expect(patchHistory?.[1]?.refs?.[0]?.url).toBe("https://turn.ref");
+      expect(patchHistory?.[1]?.refs).toBeUndefined();
+      expect((patch as Record<string, unknown>).refs).toBeUndefined();
     });
   });
 
-  describe("compat-migration write projections", () => {
-    it("preserves legacy top-level refs when saving a synthesized single-turn item", async () => {
+  describe("canonical write projections", () => {
+    it("persists canonical plugin references without history refs emission", async () => {
       const apiItem = makeApiItem({
-        synthQuestion: "What is X?",
-        answer: "X is Y",
-        refs: [
-          {
-            url: "https://legacy.ref/doc1",
-            content: "Legacy content",
-            keyExcerpt: "Key paragraph",
-            bonus: false,
-          },
-          {
-            url: "https://legacy.ref/doc2",
-            content: "Bonus content",
-            bonus: true,
-          },
+        history: [
+          { role: "user", msg: "Q", turnId: "t-user" },
+          { role: "assistant", msg: "A", turnId: "t-agent" },
         ],
-        history: undefined,
+        ...withCompatData({
+          references: [
+            {
+              url: "https://canonical.ref/doc1",
+              content: "Canonical content",
+              messageIndex: 1,
+            },
+          ],
+        }),
       });
       let capturedPatch: Patch | undefined;
       mockUpdateAssignedGroundTruth.mockImplementation(
@@ -320,7 +251,7 @@
           return {
             ...apiItem,
             id,
-            refs: (patch.refs as ApiItem["refs"]) ?? apiItem.refs,
+            plugins: (patch.plugins as ApiItem["plugins"]) ?? apiItem.plugins,
             status: (patch.status as ApiItem["status"]) ?? apiItem.status,
           } as ApiItem;
         },
@@ -328,11 +259,11 @@
       mockGetMyAssignments.mockResolvedValue([apiItem]);
       const provider = new ApiProvider();
       const { items } = await provider.list();
-      const legacyRefs = getItemReferences(items[0]);
+      const existingRefs = getItemReferences(items[0]);
       const updated: GroundTruthItem = withUpdatedReferences(
         items[0],
-        legacyRefs.map((ref) =>
-          ref.url === "https://legacy.ref/doc1"
+        existingRefs.map((ref) =>
+          ref.url === "https://canonical.ref/doc1"
             ? { ...ref, bonus: true, keyParagraph: "Updated key" }
             : ref,
         ),
@@ -340,17 +271,12 @@
       await provider.save(updated);
       const patch = capturedPatch as Patch;
       const patchHistory = patch.history as ApiItem["history"];
-      expect(patch.refs).toHaveLength(2);
-      expect(patch.refs?.[0]).toMatchObject({
-        url: "https://legacy.ref/doc1",
-        bonus: true,
-        keyExcerpt: "Updated key",
-      });
-      expect(patch.refs?.[1]).toMatchObject({
-        url: "https://legacy.ref/doc2",
-        bonus: true,
-      });
-      expect(patchHistory?.[1]?.refs).toHaveLength(2);
+      expect((patch as Record<string, unknown>).refs).toBeUndefined();
+      expect(patchHistory?.[1]?.refs).toBeUndefined();
+      expect(
+        (patch.plugins?.["rag-compat"]?.data as { references?: unknown })
+          .references,
+      ).toBeDefined();
     });
   });
 });
diff --git a/frontend/tests/unit/registry/RegistryRenderer.test.tsx b/frontend/tests/unit/registry/RegistryRenderer.test.tsx
index 1a01f2b..d73f685 100644
--- a/frontend/tests/unit/registry/RegistryRenderer.test.tsx
+++ b/frontend/tests/unit/registry/RegistryRenderer.test.tsx
@@ -25,7 +25,7 @@ function renderExtension(toolCall: ToolCallRecord) {
     item: {
       id: "item-1",
       question: "q",
-      answer: "",
+      history: [{ role: "agent", content: "" }],
       status: "draft",
       providerId: "json",
       tags: [],
diff --git a/frontend/tests/unit/services/groundTruths-mapping.test.ts b/frontend/tests/unit/services/groundTruths-mapping.test.ts
index 389a38e..4c68cc0 100644
--- a/frontend/tests/unit/services/groundTruths-mapping.test.ts
+++ b/frontend/tests/unit/services/groundTruths-mapping.test.ts
@@ -1,5 +1,8 @@
 import { describe, expect, it } from "vitest";
-import type { ApiGroundTruth } from "../../../src/adapters/apiMapper";
+import type {
+  ApiGroundTruth,
+  ApiReference,
+} from "../../../src/adapters/apiMapper";
 import { groundTruthFromApi } from "../../../src/adapters/apiMapper";
 import type { components } from "../../../src/api/generated";
 import { getItemReferences } from "../../../src/models/groundTruth";
@@ -9,15 +12,10 @@ type ApiItem = Omit<
   components["schemas"]["AgenticGroundTruthEntry-Output"],
   "history"
 > & {
-  synthQuestion?: string | null;
-  editedQuestion?: string | null;
-  answer?: string | null;
-  refs?: components["schemas"]["Reference"][];
-  totalReferences?: number;
   tags?: string[];
   comment?: string | null;
   history?: (components["schemas"]["HistoryEntry"] & {
-    refs?: components["schemas"]["Reference"][];
+    refs?: ApiReference[];
     expectedBehavior?: string[];
   })[];
 };
@@ -26,11 +24,7 @@ function makeApiItem(overrides: Partial<ApiItem> = {}): ApiItem {
   return {
     id: "gt-1",
     status: "draft",
-    answer: "",
-    synthQuestion: "",
-    editedQuestion: "",
     history: undefined,
-    refs: [],
     tags: [],
     comment: null,
     datasetName: "dataset-1",
@@ -40,9 +34,23 @@
   } as ApiItem;
 }
 
+function withCompatData(
+  data: Record<string, unknown>,
+): Pick<ApiItem, "plugins"> {
+  return {
+    plugins: {
+      "rag-compat": {
+        kind: "rag-compat",
+        version: "1.0",
+        data,
+      },
+    },
+  };
+}
+
 describe("mapGroundTruthFromApi", () => {
   describe("core-generic mapping", () => {
-    it("converts assistant role to agent and keeps stable turn ids", () => {
+    it("preserves assistant role values and keeps stable turn ids", () => {
       const apiItem = makeApiItem({
         history: [
           { role: "user", msg: "Question" },
@@ -55,14 +63,14 @@
         content: "Question",
       });
       expect(result.history?.[1]).toMatchObject({
-        role: "agent",
+        role: "assistant",
         content: "Answer",
       });
       expect(result.history?.[0].turnId).toBeTruthy();
       expect(result.history?.[1].turnId).toBeTruthy();
     });
 
-    it("preserves per-turn refs when canonical history already exists", () => {
+    it("ignores retired per-turn history refs", () => {
       const apiItem = makeApiItem({
         history: [
           { role: "user", msg: "Q1" },
@@ -80,72 +88,53 @@
         ],
       });
       const result = mapGroundTruthFromApi(apiItem);
-      const [ref] = getItemReferences(result);
-      expect(ref).toMatchObject({
-        url: "https://turn-ref.com",
-        messageIndex: 1,
-        turnId: result.history?.[1]?.turnId,
-      });
+      expect(getItemReferences(result)).toEqual([]);
     });
   });
 
-  describe("compat-migration read mapping", () => {
-    it("creates synthesized user and agent turns from legacy single-turn fields", () => {
+  describe("retired compat read mapping", () => {
+    it("does not synthesize history from retired single-turn fields", () => {
       const apiItem = makeApiItem({
-        synthQuestion: "Synth",
-        editedQuestion: "Edited",
-        answer: "A",
         history: undefined,
+        ...withCompatData({
+          synthQuestion: "Synth",
+          editedQuestion: "Edited",
+          answer: "A",
+        }),
       });
       const result = mapGroundTruthFromApi(apiItem);
-      expect(result.history).toHaveLength(2);
-      expect(result.history?.[0]).toMatchObject({
-        role: "user",
-        content: "Edited",
-      });
-      expect(result.history?.[1]).toMatchObject({
-        role: "agent",
-        content: "A",
-      });
+      expect(result.history).toBeUndefined();
     });
 
-    it("anchors legacy top-level refs to the synthesized agent turn when answer is empty", () => {
+    it("does not import retired compat refs", () => {
       const apiItem = makeApiItem({
-        editedQuestion: "How do I configure authentication for my app?",
-        answer: "",
-        refs: [
-          {
-            url: "https://docs.example.com/auth",
-            content: "Authentication documentation content",
-            keyExcerpt: "Use OAuth 2.0 for authentication",
-            bonus: false,
-          },
-        ],
         history: undefined,
+        ...withCompatData({
+          editedQuestion: "How do I configure authentication for my app?",
+          answer: "",
+          refs: [
+            {
+              url: "https://docs.example.com/auth",
+              content: "Authentication documentation content",
+              keyExcerpt: "Use OAuth 2.0 for authentication",
+              bonus: false,
+            },
+          ],
+        }),
       });
       const result = mapGroundTruthFromApi(apiItem);
-      const [ref] = getItemReferences(result);
-      expect(result.history).toHaveLength(2);
-      expect(result.history?.[1]).toMatchObject({ role: "agent", content: "" });
-      expect(ref).toMatchObject({
-        url: "https://docs.example.com/auth",
-        messageIndex: 1,
-        turnId: result.history?.[1]?.turnId,
-      });
+      expect(getItemReferences(result)).toEqual([]);
     });
   });
 
   describe("providerId", () => {
     it("defaults to 'api' when not provided", () => {
-      const result = mapGroundTruthFromApi(makeApiItem({ synthQuestion: "Q" }));
+      const result = mapGroundTruthFromApi(makeApiItem());
       expect(result.providerId).toBe("api");
     });
 
     it("uses provided providerId", () => {
-      const result = mapGroundTruthFromApi(
-        makeApiItem({ synthQuestion: "Q" }),
-        "custom-provider",
-      );
+      const result = mapGroundTruthFromApi(makeApiItem(), "custom-provider");
       expect(result.providerId).toBe("custom-provider");
     });
   });
@@ -206,11 +195,7 @@ describe("mapper parity: groundTruthFromApi and mapGroundTruthFromApi", () => {
     return {
       id: "parity-1",
       status: "draft",
-      answer: "Parity answer",
-      synthQuestion: "Synth parity Q",
-      editedQuestion: "Edited parity Q",
       history: undefined,
-      refs: [],
       tags: ["t1"],
       manualTags: ["m1"],
       computedTags: ["c1"],
@@ -219,11 +204,20 @@
       bucket: "bkt" as ApiGroundTruth["bucket"],
       _etag: "etag-parity",
       reviewedAt: "2024-01-01T00:00:00Z",
+      plugins: {
+        "rag-compat": {
+          kind: "rag-compat",
+          version: "1.0",
+          data: {
+            references: [],
+          },
+        },
+      },
       ...overrides,
     } as ApiGroundTruth;
   }
 
-  it("produces identical output for a legacy single-turn payload", () => {
+  it("produces identical output for a canonical payload", () => {
     const payload = makeSharedPayload();
     const fromProvider = groundTruthFromApi(payload);
     const fromService = mapGroundTruthFromApi(payload);
@@ -232,11 +226,8 @@
     );
   });
 
-  it("produces identical output for a multi-turn payload with per-turn refs", () => {
+  it("produces identical output for a multi-turn payload with retired per-turn refs", () => {
     const payload = makeSharedPayload({
-      editedQuestion: "",
-      synthQuestion: "",
-      answer: "",
       history: [
         { role: "user", msg: "First question" },
         {
@@ -257,7 +248,7 @@
     expect(normalizeTurnIdentity(fromProvider)).toEqual(
       normalizeTurnIdentity(fromService),
     );
-    expect(getItemReferences(fromProvider)).toHaveLength(2);
+    expect(getItemReferences(fromProvider)).toHaveLength(0);
   });
 
   it("preserves reviewedAt through both paths identically", () => {