From e5a3fdbf46193f92d39721673fd4ca4d10c149ae Mon Sep 17 00:00:00 2001 From: Ryan Alyn Porter Date: Wed, 29 Apr 2026 18:20:03 -0400 Subject: [PATCH 1/6] Fix optimizer score versions default featured flag --- MCP/tools/score/score_update_guidelines_test.py | 6 +++--- MCP/tools/score/scores.py | 2 +- MCP/tools/score/scores_test.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/MCP/tools/score/score_update_guidelines_test.py b/MCP/tools/score/score_update_guidelines_test.py index 399333a14..fbb17093a 100644 --- a/MCP/tools/score/score_update_guidelines_test.py +++ b/MCP/tools/score/score_update_guidelines_test.py @@ -45,7 +45,7 @@ def test_guidelines_only_update_none_handling(self): 'configuration': current_version_data['configuration'], # Preserved existing code 'guidelines': update_params['guidelines'], # New guidelines 'note': update_params['version_note'], - 'isFeatured': "true", + 'isFeatured': "false", 'parentVersionId': 'version-current' } @@ -241,7 +241,7 @@ def simulate_version_input_creation(score_id, code_content, guidelines, note, pa 'scoreId': score_id, 'configuration': (code_content or '').strip(), # Fixed to handle None 'note': note or 'Updated via MCP score update tool', - 'isFeatured': "true" + 'isFeatured': "false" } # Add guidelines if provided (fixed logic) @@ -357,7 +357,7 @@ def simulate_complete_workflow(): 'scoreId': score_data['id'], 'configuration': (code or '').strip(), 'note': update_params['version_note'] or 'Updated via MCP', - 'isFeatured': "true" + 'isFeatured': "false" } if guidelines: diff --git a/MCP/tools/score/scores.py b/MCP/tools/score/scores.py index b1851d1a4..876edd20b 100644 --- a/MCP/tools/score/scores.py +++ b/MCP/tools/score/scores.py @@ -2682,7 +2682,7 @@ async def _create_version_from_code_with_parent( 'scoreId': score.id, 'configuration': (code_content or '').strip(), 'note': note or 'Updated via MCP score update tool', - 'isFeatured': "true" # Mark as featured by default + 'isFeatured': "false" } # Add guidelines if provided diff --git a/MCP/tools/score/scores_test.py b/MCP/tools/score/scores_test.py index e485a0926..7339be77b 100644 --- a/MCP/tools/score/scores_test.py +++ b/MCP/tools/score/scores_test.py @@ -1670,14 +1670,14 @@ def test_create_version_with_parent_version_input(self): 'configuration': code_content.strip(), 'guidelines': guidelines.strip(), 'note': note, - 'isFeatured': "true", + 'isFeatured': "false", 'parentVersionId': parent_version_id # Should set parent relationship } # Verify parent relationship is established assert expected_version_input['parentVersionId'] == parent_version_id assert expected_version_input['scoreId'] == score_id - assert expected_version_input['isFeatured'] == "true" + assert expected_version_input['isFeatured'] == "false" def test_create_version_with_parent_error_handling(self): """Test error handling in _create_version_from_code_with_parent""" From eaa2da9dfb1baf51f0c62a265a9d4dc7304b26dc Mon Sep 17 00:00:00 2001 From: Ryan Alyn Porter Date: Wed, 29 Apr 2026 18:29:26 -0400 Subject: [PATCH 2/6] Add score rubric consistency preflight --- MCP/tools/evaluation/evaluations.py | 26 ++- dashboard/components/EvaluationTask.tsx | 26 +++ plexus/cli/evaluation/evaluations.py | 86 +++++-- plexus/cli/item/items.py | 61 ++++- ...est_feedback_alignment_optimizer_config.py | 8 + .../feedback_alignment_optimizer.yaml | 1 + plexus/score_rubric_consistency.py | 213 ++++++++++++++++++ plexus/score_rubric_consistency_test.py | 93 ++++++++ 8 files changed, 495 insertions(+), 19 deletions(-) create mode 100644 plexus/score_rubric_consistency.py create mode 100644 plexus/score_rubric_consistency_test.py diff --git a/MCP/tools/evaluation/evaluations.py b/MCP/tools/evaluation/evaluations.py index 4aef629f1..d8cf242c6 100644 --- a/MCP/tools/evaluation/evaluations.py +++ b/MCP/tools/evaluation/evaluations.py @@ -345,6 +345,7 @@ async def plexus_evaluation_run( use_score_associated_dataset: bool = False, batch: Optional[List[Dict[str, Any]]] = None, notes: Optional[str] = None, + score_rubric_consistency_check: bool = False, ) -> str: """ Run an evaluation using the same code path as CLI. @@ -391,6 +392,9 @@ async def plexus_evaluation_run( the evaluation's parameters JSON under the "notes" key. Useful for recording context like "Baseline: deterministic accuracy dataset" or "Iteration 3: Added example for transfer-request edge case". + - score_rubric_consistency_check: Feedback evaluations only. When True, run a + preflight check that compares the evaluated ScoreVersion code against its + own rubric and store the paragraph on Evaluation.parameters. Returns: - JSON string with evaluation results including evaluation_id, metrics, and dashboard URL. @@ -434,19 +438,27 @@ def _apply_notes_to_evaluation(evaluation_id: str, notes_text: str) -> None: batch_list = list(batch) logger.info(f"Batch evaluation: dispatching {len(batch_list)} evaluations in parallel") for i, item in enumerate(batch_list): - logger.info(f" Batch item {i+1}: type={item.get('evaluation_type')}, score={item.get('score_name')}, wait={item.get('wait')}") + logger.info( + f" Batch item {i + 1}: type={item.get('evaluation_type')}, " + f"score={item.get('score_name')}, wait={item.get('wait')}" + ) tasks = [plexus_evaluation_run(**item) for item in batch_list] raw_results = await _asyncio.gather(*tasks, return_exceptions=True) output = [] for i, r in enumerate(raw_results): if isinstance(r, Exception): - logger.error(f"Batch item {i+1} raised exception: {type(r).__name__}: {r}", exc_info=r) + logger.error( + f"Batch item {i + 1} raised exception: {type(r).__name__}: {r}", + exc_info=r, + ) output.append({"error": f"{type(r).__name__}: {r}"}) else: try: output.append(json.loads(r)) except Exception as parse_exc: - logger.error(f"Batch item {i+1} result parse error: {parse_exc}, raw={str(r)[:500]}") + logger.error( + f"Batch item {i + 1} result parse error: {parse_exc}, raw={str(r)[:500]}" + ) output.append({"error": "Could not parse result", "raw": str(r)}) logger.info(f"Batch evaluation complete: {len(output)} results") # Post-batch notes application: apply notes to each eval sequentially @@ -459,7 +471,9 @@ def _apply_notes_to_evaluation(evaluation_id: str, notes_text: str) -> None: if eval_id: _apply_notes_to_evaluation(eval_id, item_notes) else: - logger.warning(f"Batch item {i+1}: no eval_id in result, cannot apply notes") + logger.warning( + f"Batch item {i + 1}: no eval_id in result, cannot apply notes" + ) return json.dumps(output) if not scorecard_name: @@ -576,6 +590,8 @@ def _spawn_feedback( cmd += ["--max-category-summary-items", str(max_category_items)] if runner_task_id: cmd += ["--task-id", runner_task_id] + if score_rubric_consistency_check: + cmd += ["--score-rubric-consistency-check"] subprocess.Popen( cmd, stdout=subprocess.DEVNULL, @@ -699,6 +715,8 @@ def _spawn_feedback( fb_args.extend(['--sample-seed', str(sample_seed)]) if notes: fb_args.extend(['--notes', notes]) + if score_rubric_consistency_check: + fb_args.append('--score-rubric-consistency-check') # When a specific version is requested, yaml mode must be disabled. effective_yaml = yaml and not resolved_version if effective_yaml: diff --git a/dashboard/components/EvaluationTask.tsx b/dashboard/components/EvaluationTask.tsx index 1f0160ddc..2e942c29a 100644 --- a/dashboard/components/EvaluationTask.tsx +++ b/dashboard/components/EvaluationTask.tsx @@ -1067,6 +1067,15 @@ const DetailContent = React.memo(({ const rootCauseTopics = rootCauseData?.topics ?? null const misclassificationAnalysis = rootCauseData?.misclassification_analysis ?? null + const scoreRubricConsistencyCheck = useMemo(() => { + try { + const params = parseJsonDeep(data.parameters) as Record | null + const check = params?.score_rubric_consistency_check + return (check && typeof check === 'object') ? check as Record : null + } catch { + return null + } + }, [data.parameters]) const rcaCoverage = useMemo(() => { try { const params = parseJsonDeep(data.parameters) as Record | null @@ -1531,6 +1540,23 @@ const DetailContent = React.memo(({ )} + {scoreRubricConsistencyCheck && ( + + + + Score/rubric consistency + {typeof scoreRubricConsistencyCheck.status === 'string' + ? `: ${scoreRubricConsistencyCheck.status}` + : ''} + + + {typeof scoreRubricConsistencyCheck.paragraph === 'string' + ? scoreRubricConsistencyCheck.paragraph + : 'No consistency summary was generated.'} + + + )} + {/* Score-Configuration RCA */} {(rootCauseData && ( (rootCauseTopics && rootCauseTopics.length > 0) || diff --git a/plexus/cli/evaluation/evaluations.py b/plexus/cli/evaluation/evaluations.py index b05212a79..939b4663e 100644 --- a/plexus/cli/evaluation/evaluations.py +++ b/plexus/cli/evaluation/evaluations.py @@ -69,6 +69,10 @@ def truncate_dict_strings(d, max_length=100): run_feedback_evaluation_orchestrated, ) from plexus.utils.feedback_selection import select_feedback_items +from plexus.score_rubric_consistency import ( + ScoreRubricConsistencyService, + merge_consistency_result_into_parameters, +) from plexus.utils import truncate_dict_strings_inner @@ -85,7 +89,7 @@ def log_scorecard_configurations(scorecard_instance, context=""): version = score_config.get('version', 'Not specified') champion_version = score_config.get('championVersionId', 'Not specified') - logging.info(f"Score #{i+1}: {score_name}") + logging.info(f"Score #{i + 1}: {score_name}") logging.info(f" ID: {score_id}") logging.info(f" Version field: {version}") logging.info(f" Champion version: {champion_version}") @@ -2756,9 +2760,9 @@ async def _run_accuracy(): scorecard_id_resolved = getattr(scorecard_instance, 'id', None) logging.info(f"Resolved from attributes: name='{scorecard_name_resolved}', key='{scorecard_key_resolved}', id='{scorecard_id_resolved}' (type: {type(scorecard_id_resolved)})") else: - scorecard_name_resolved = scorecard # Fallback to initial identifier - scorecard_key_resolved = scorecard # Fallback to initial identifier - scorecard_id_resolved = scorecard # Fallback to initial identifier + scorecard_name_resolved = scorecard # Fallback to initial identifier + scorecard_key_resolved = scorecard # Fallback to initial identifier + scorecard_id_resolved = scorecard # Fallback to initial identifier logging.info(f"Using fallback: name='{scorecard_name_resolved}', key='{scorecard_key_resolved}', id='{scorecard_id_resolved}' (type: {type(scorecard_id_resolved)})") # Check if any cloud dataset options are provided @@ -3205,9 +3209,9 @@ async def _run_accuracy(): raise # Display final results summary - logging.info(f"\n{'='*60}") + logging.info(f"\n{'=' * 60}") logging.info("EVALUATION RESULTS") - logging.info('='*60) + logging.info('=' * 60) logging.info(f"Sample Size: {len(labeled_samples_data)}") # Safely extract metrics (handle None case) @@ -3236,7 +3240,7 @@ async def _run_accuracy(): if detailed_summary: logging.info(detailed_summary) - logging.info('='*60) + logging.info('=' * 60) # Complete task lifecycle in tracker/API task record. if tracker: @@ -3538,8 +3542,8 @@ def check_dict_serializability(d, path=""): if isinstance(item, dict): non_serializable_paths.extend(check_dict_serializability(item, item_path)) elif isinstance(item, list): - # Handle nested lists if necessary, though less common in typical results - pass # Or add recursive list check if needed + # Handle nested lists if necessary, though less common in typical results + pass # Or add recursive list check if needed elif not is_json_serializable(item): logging.warning(f"Non-serializable list item found at path '{item_path}': type={type(item)}") non_serializable_paths.append(item_path) @@ -3600,8 +3604,8 @@ def score_text_wrapper(scorecard_instance, text, score_name, scorecard_name=None score_result_name = None if hasattr(score_result_obj, 'parameters') and hasattr(score_result_obj.parameters, 'name'): score_result_name = score_result_obj.parameters.name - elif hasattr(score_result_obj, 'name'): # Fallback if name is directly on the object - score_result_name = score_result_obj.name + elif hasattr(score_result_obj, 'name'): # Fallback if name is directly on the object + score_result_name = score_result_obj.name if score_result_name == score_name: # Ensure the returned object is a Score.Result instance or similar @@ -3623,8 +3627,8 @@ def score_text_wrapper(scorecard_instance, text, score_name, scorecard_name=None # return result[first_key] # else: # return {"error": "Empty result dictionary", "value": "ERROR"} - elif hasattr(result, 'value'): # Handle case where a single Result object is returned directly - return result + elif hasattr(result, 'value'): # Handle case where a single Result object is returned directly + return result else: # Handle unexpected result types logging.warning(f"Unexpected result type from score_entire_text: {type(result)}") @@ -4136,6 +4140,12 @@ def last(account_key: str, type: Optional[str]): @click.option('--yaml', 'use_yaml', is_flag=True, help='Load scorecard from local YAML files instead of the API') @click.option('--task-id', default=None, type=str, help='Task ID for progress tracking') @click.option('--notes', default=None, type=str, help='Freeform notes explaining why this evaluation is being run. Stored in evaluation parameters.') +@click.option( + '--score-rubric-consistency-check', + is_flag=True, + default=False, + help='Before feedback-backed predictions, compare the evaluated ScoreVersion code against its rubric and store the result.', +) def feedback( scorecard: str, score: str, @@ -4150,6 +4160,7 @@ def feedback( use_yaml: bool, task_id: Optional[str], notes: Optional[str] = None, + score_rubric_consistency_check: bool = False, ): """ Evaluate feedback alignment by analyzing feedback items over a time period for a specific score. @@ -4482,6 +4493,7 @@ def feedback( "sample_seed": sample_seed, "max_category_summary_items": max_category_summary_items, "mode": "accuracy_with_feedback_dataset", + "score_rubric_consistency_check_requested": bool(score_rubric_consistency_check), **({"notes": notes} if notes else {}), "metadata": { **({"baseline": baseline} if baseline else {}), @@ -4496,6 +4508,54 @@ def feedback( console.print(f"\nCreated evaluation record: {evaluation_id}") console.print(f"Dashboard URL: https://app.plexusanalytics.com/evaluations/{evaluation_id}") + + if score_rubric_consistency_check: + console.print("\n[bold]Checking score code against rubric...[/bold]") + try: + consistency_result = ScoreRubricConsistencyService().generate_from_api( + client=client, + scorecard_identifier=scorecard, + score_identifier=score_name_for_dataset, + score_id=score_id, + score_version_id=version, + ) + merged_parameters = merge_consistency_result_into_parameters( + evaluation_record.parameters, + consistency_result, + ) + evaluation_record.update(parameters=json.dumps(merged_parameters)) + console.print( + f"[dim]Score/rubric consistency: {consistency_result.status}[/dim]" + ) + except Exception as consistency_error: + logging.warning( + "Score/rubric consistency check failed for evaluation %s: %s", + evaluation_id, + consistency_error, + exc_info=True, + ) + merged_parameters = {} + try: + merged_parameters = ( + json.loads(evaluation_record.parameters) + if isinstance(evaluation_record.parameters, str) + and evaluation_record.parameters + else (evaluation_record.parameters or {}) + ) + except Exception: + merged_parameters = {} + merged_parameters["score_rubric_consistency_check"] = { + "scorecard_identifier": scorecard, + "score_identifier": score_name_for_dataset, + "score_version_id": version, + "status": "unavailable", + "paragraph": ( + "The score/rubric consistency check failed before predictions ran." + ), + "error": str(consistency_error), + "checked_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + } + evaluation_record.update(parameters=json.dumps(merged_parameters)) # Run accuracy evaluation with the modified scorecard console.print("\n[bold]Running accuracy evaluation with FeedbackItems dataset...[/bold]") diff --git a/plexus/cli/item/items.py b/plexus/cli/item/items.py index ad8ecccb3..714049e79 100644 --- a/plexus/cli/item/items.py +++ b/plexus/cli/item/items.py @@ -13,6 +13,11 @@ from plexus.cli.shared.console import console from plexus.cli.report.utils import resolve_account_id_for_command import json +from plexus.cli.shared.identifier_resolution import ( + resolve_score_identifier, + resolve_scorecard_identifier, +) +from plexus.score_rubric_consistency import ScoreRubricConsistencyService def format_datetime(dt: Optional[datetime]) -> str: """Format datetime with proper handling of None values""" @@ -471,6 +476,54 @@ def create(account: Optional[str], evaluation_id: Optional[str], text: Optional[ import traceback print(traceback.format_exc()) + +@items.command(name="contradictions") +@click.option("--scorecard", "scorecard_identifier", required=True) +@click.option("--score", "score_identifier", required=True) +@click.option("--version", "score_version_id", required=True) +@click.option("--item", "item_identifier", default=None, help="Optional item id or identifier for spot-check context.") +@click.option("--format", "output_format", type=click.Choice(["markdown", "json"]), default="markdown") +def contradictions( + scorecard_identifier: str, + score_identifier: str, + score_version_id: str, + item_identifier: Optional[str], + output_format: str, +): + """Check whether one ScoreVersion's code is consistent with its rubric.""" + client = create_client() + account_id = resolve_account_id_for_command(client, None) + scorecard_id = resolve_scorecard_identifier(client, scorecard_identifier) + if not scorecard_id: + raise click.ClickException(f"Could not resolve scorecard: {scorecard_identifier}") + score_id = resolve_score_identifier(client, scorecard_id, score_identifier) + if not score_id: + raise click.ClickException( + f"Could not resolve score '{score_identifier}' in scorecard '{scorecard_identifier}'" + ) + + item_text = "" + if item_identifier: + item = find_item_by_any_identifier(client, item_identifier, account_id) + if not item: + raise click.ClickException(f"Could not resolve item: {item_identifier}") + item_text = item.text or "" + + result = ScoreRubricConsistencyService().generate_from_api( + client=client, + scorecard_identifier=scorecard_identifier, + score_identifier=score_identifier, + score_id=score_id, + score_version_id=score_version_id, + item_text=item_text, + ) + payload = result.to_parameters_payload() + if output_format == "json": + console.print_json(json.dumps(payload)) + else: + console.print(f"[bold]Status:[/bold] {result.status}") + console.print(result.paragraph) + @items.command() @click.option('--account', help='Account key or ID (optional, uses default from environment if not provided)') @click.option('--evaluation-id', help='Filter by evaluation ID') @@ -878,7 +931,10 @@ def upsert(account: Optional[str], json_file: Optional[str], data: Optional[str] batch_end = min(batch_start + batch_size, len(items_data)) batch = items_data[batch_start:batch_end] - console.print(f"\n[bold]Processing batch {batch_start//batch_size + 1} ({batch_start + 1}-{batch_end} of {len(items_data)})[/bold]") + console.print( + f"\n[bold]Processing batch {batch_start // batch_size + 1} " + f"({batch_start + 1}-{batch_end} of {len(items_data)})[/bold]" + ) for i, item_data in enumerate(batch, batch_start + 1): try: @@ -975,6 +1031,7 @@ def item(): item.add_command(list) item.add_command(last) item.add_command(info) +item.add_command(contradictions) item.add_command(update) item.add_command(upsert) -item.add_command(delete) \ No newline at end of file +item.add_command(delete) diff --git a/plexus/cli/procedure/test_feedback_alignment_optimizer_config.py b/plexus/cli/procedure/test_feedback_alignment_optimizer_config.py index 1c530a6ec..c4435b4a9 100644 --- a/plexus/cli/procedure/test_feedback_alignment_optimizer_config.py +++ b/plexus/cli/procedure/test_feedback_alignment_optimizer_config.py @@ -321,6 +321,14 @@ def test_optimizer_yaml_runs_contradictions_directly_without_background_dispatch assert 'cache_key = "FeedbackContradictions (expanded): " .. scorecard_name .. " / " .. score_name' in code +def test_optimizer_baseline_feedback_runs_score_rubric_consistency_check(): + config = _load_optimizer_config() + code = config["code"] + + assert "score_rubric_consistency_check = true" in code + assert 'evaluation_type = "feedback"' in code + + def test_optimizer_yaml_treats_cycle_errors_as_terminal_and_does_not_extend_iteration_cap(): config = _load_optimizer_config() code = config["code"] diff --git a/plexus/procedures/feedback_alignment_optimizer.yaml b/plexus/procedures/feedback_alignment_optimizer.yaml index 974e61ca6..3b5b03f19 100644 --- a/plexus/procedures/feedback_alignment_optimizer.yaml +++ b/plexus/procedures/feedback_alignment_optimizer.yaml @@ -3819,6 +3819,7 @@ code: | max_feedback_items = params.max_samples or 100, sampling_mode = "newest", wait = true, + score_rubric_consistency_check = true, notes = "Baseline: recent feedback alignment dataset for " .. score_name, }) end diff --git a/plexus/score_rubric_consistency.py b/plexus/score_rubric_consistency.py new file mode 100644 index 000000000..ff0266a27 --- /dev/null +++ b/plexus/score_rubric_consistency.py @@ -0,0 +1,213 @@ +"""Score-version rubric consistency checks. + +This module owns the lightweight preflight that asks whether the score code for a +specific ScoreVersion appears consistent with that same version's rubric text. +The result is designed to be persisted on Evaluation.parameters and displayed as +operator context before RCA. +""" + +from __future__ import annotations + +import json +import os +import re +from dataclasses import asdict, dataclass +from datetime import datetime, timezone +from typing import Any, Callable, Dict, Optional + + +@dataclass(frozen=True) +class ScoreRubricConsistencyRequest: + scorecard_identifier: str + score_identifier: str + score_version_id: str + rubric_text: str + score_code: str + item_text: str = "" + + +@dataclass(frozen=True) +class ScoreRubricConsistencyResult: + scorecard_identifier: str + score_identifier: str + score_version_id: str + status: str + paragraph: str + checked_at: str + model: str + diagnostics: Dict[str, Any] + + def to_parameters_payload(self) -> Dict[str, Any]: + return asdict(self) + + +class ScoreRubricConsistencyService: + """Generate a concise score-code vs rubric consistency assessment.""" + + DEFAULT_MODEL = "gpt-5-mini" + VALID_STATUSES = {"consistent", "potential_conflict", "inconclusive"} + + def __init__( + self, + *, + invoke_model: Optional[Callable[[str, str], str]] = None, + model: str = DEFAULT_MODEL, + ): + self._invoke_model = invoke_model or self._invoke_openai + self._model = model + + def generate(self, request: ScoreRubricConsistencyRequest) -> ScoreRubricConsistencyResult: + prompt = self._build_prompt(request) + raw_text = self._invoke_model(prompt, self._model) + try: + parsed = self._parse_response(raw_text) + except json.JSONDecodeError: + repair_prompt = ( + f"{prompt}\n\nYour prior response was not valid JSON:\n" + f"{_truncate(raw_text or '(empty response)', 1000)}\n\n" + "Return ONLY valid JSON with exactly these keys: status, paragraph." + ) + raw_text = self._invoke_model(repair_prompt, self._model) + parsed = self._parse_response(raw_text) + status = str(parsed.get("status") or "inconclusive").strip() + if status not in self.VALID_STATUSES: + status = "inconclusive" + paragraph = _compact_paragraph(str(parsed.get("paragraph") or "")) + if not paragraph: + paragraph = "The consistency check did not produce a usable assessment." + status = "inconclusive" + return ScoreRubricConsistencyResult( + scorecard_identifier=request.scorecard_identifier, + score_identifier=request.score_identifier, + score_version_id=request.score_version_id, + status=status, + paragraph=paragraph, + checked_at=datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + model=self._model, + diagnostics={ + "rubric_characters": len(request.rubric_text or ""), + "score_code_characters": len(request.score_code or ""), + "item_context_characters": len(request.item_text or ""), + }, + ) + + def generate_from_api( + self, + *, + client: Any, + scorecard_identifier: str, + score_identifier: str, + score_id: str, + score_version_id: str, + item_text: str = "", + ) -> ScoreRubricConsistencyResult: + version = fetch_score_version_for_consistency(client, score_version_id) + return self.generate( + ScoreRubricConsistencyRequest( + scorecard_identifier=scorecard_identifier, + score_identifier=score_identifier, + score_version_id=score_version_id, + rubric_text=version.get("guidelines") or "", + score_code=version.get("configuration") or "", + item_text=item_text or "", + ) + ) + + def _build_prompt(self, request: ScoreRubricConsistencyRequest) -> str: + item_section = "" + if request.item_text: + item_section = ( + "\nOptional item context for a spot-check:\n" + f"{_truncate(request.item_text, 4000)}\n" + ) + return ( + "You are checking one Plexus ScoreVersion before evaluation.\n" + "Compare the score code/prompt against the rubric text stored on the same ScoreVersion.\n" + "Identify only meaningful policy mismatches that could affect evaluation results. " + "Do not critique style, formatting, implementation architecture, or missing tests.\n\n" + "Return ONLY JSON with exactly these keys:\n" + ' "status": one of "consistent", "potential_conflict", "inconclusive"\n' + ' "paragraph": one short paragraph, 2-4 sentences, no headings or bullets\n\n' + f"Scorecard: {request.scorecard_identifier}\n" + f"Score: {request.score_identifier}\n" + f"ScoreVersion: {request.score_version_id}\n\n" + f"Rubric text:\n{_truncate(request.rubric_text, 12000)}\n\n" + f"Score code/configuration:\n{_truncate(request.score_code, 16000)}\n" + f"{item_section}" + ) + + def _parse_response(self, text: str) -> Dict[str, Any]: + cleaned = (text or "").strip() + if "```" in cleaned: + match = re.search(r"```(?:json)?\s*([\s\S]*?)```", cleaned) + if match: + cleaned = match.group(1).strip() + obj_match = re.search(r"\{[\s\S]*\}", cleaned) + if obj_match: + cleaned = obj_match.group(0) + return json.loads(cleaned) + + def _invoke_openai(self, prompt: str, model: str) -> str: + from dotenv import load_dotenv + from openai import OpenAI + + load_dotenv(override=False) + client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) + response = client.responses.create( + model=model, + reasoning={"effort": "low"}, + input=[{"role": "user", "content": prompt}], + max_output_tokens=2000, + ) + return (response.output_text or "").strip() + + +def fetch_score_version_for_consistency(client: Any, score_version_id: str) -> Dict[str, Any]: + query = """ + query GetScoreVersionForRubricConsistency($id: ID!) { + getScoreVersion(id: $id) { + id + configuration + guidelines + note + score { + id + name + } + } + } + """ + result = client.execute(query, {"id": score_version_id}) + version = (result or {}).get("getScoreVersion") + if not version: + raise ValueError(f"ScoreVersion not found: {score_version_id}") + return version + + +def merge_consistency_result_into_parameters( + parameters: Any, + result: ScoreRubricConsistencyResult, +) -> Dict[str, Any]: + if isinstance(parameters, str): + try: + merged = json.loads(parameters) if parameters else {} + except Exception: + merged = {} + elif isinstance(parameters, dict): + merged = dict(parameters) + else: + merged = {} + merged["score_rubric_consistency_check"] = result.to_parameters_payload() + return merged + + +def _truncate(value: str, limit: int) -> str: + value = value or "" + if len(value) <= limit: + return value + return value[:limit] + "\n...[truncated]" + + +def _compact_paragraph(value: str) -> str: + value = re.sub(r"\s+", " ", value or "").strip() + return value[:1200] diff --git a/plexus/score_rubric_consistency_test.py b/plexus/score_rubric_consistency_test.py new file mode 100644 index 000000000..e8c6301e9 --- /dev/null +++ b/plexus/score_rubric_consistency_test.py @@ -0,0 +1,93 @@ +import json + +from plexus.score_rubric_consistency import ( + ScoreRubricConsistencyRequest, + ScoreRubricConsistencyService, + merge_consistency_result_into_parameters, +) + + +def test_score_rubric_consistency_service_returns_compact_payload(): + def invoke(prompt: str, model: str) -> str: + assert "Score code/configuration" in prompt + assert model == "test-model" + return json.dumps( + { + "status": "potential_conflict", + "paragraph": ( + "The rubric says two missing dosages should fail, but the prompt allows " + "two missing current medications. This may make the score more permissive " + "than the rubric during evaluation." + ), + } + ) + + result = ScoreRubricConsistencyService( + invoke_model=invoke, + model="test-model", + ).generate( + ScoreRubricConsistencyRequest( + scorecard_identifier="Scorecard", + score_identifier="Medication Review: Dosage", + score_version_id="version-1", + rubric_text="Fail when two or more current meds lack dosage.", + score_code="Pass when no more than two meds lack dosage.", + ) + ) + + assert result.status == "potential_conflict" + assert result.score_version_id == "version-1" + assert "more permissive than the rubric" in result.paragraph + assert result.diagnostics["rubric_characters"] > 0 + + +def test_merge_consistency_result_into_parameters_preserves_existing_fields(): + service = ScoreRubricConsistencyService( + invoke_model=lambda _prompt, _model: json.dumps( + {"status": "consistent", "paragraph": "The score and rubric match."} + ) + ) + result = service.generate( + ScoreRubricConsistencyRequest( + scorecard_identifier="Scorecard", + score_identifier="Score", + score_version_id="version-1", + rubric_text="Rubric", + score_code="Code", + ) + ) + + merged = merge_consistency_result_into_parameters( + json.dumps({"days": 90}), + result, + ) + + assert merged["days"] == 90 + assert merged["score_rubric_consistency_check"]["status"] == "consistent" + assert merged["score_rubric_consistency_check"]["score_version_id"] == "version-1" + + +def test_score_rubric_consistency_retries_invalid_json_once(): + calls = [] + + def invoke(prompt: str, _model: str) -> str: + calls.append(prompt) + if len(calls) == 1: + return "" + return json.dumps( + {"status": "consistent", "paragraph": "The score code follows the rubric."} + ) + + result = ScoreRubricConsistencyService(invoke_model=invoke).generate( + ScoreRubricConsistencyRequest( + scorecard_identifier="Scorecard", + score_identifier="Score", + score_version_id="version-1", + rubric_text="Rubric", + score_code="Code", + ) + ) + + assert result.status == "consistent" + assert len(calls) == 2 + assert "prior response was not valid JSON" in calls[1] From f501e80697fc9dda5963fe015a7ec166580f8878 Mon Sep 17 00:00:00 2001 From: Ryan Alyn Porter Date: Wed, 29 Apr 2026 19:48:51 -0400 Subject: [PATCH 3/6] Move rubric consistency command to score CLI --- plexus/cli/item/items.py | 53 ------------------- plexus/cli/score/scores.py | 63 ++++++++++++++++++++++ plexus/cli/score/scores_test.py | 94 ++++++++++++++++++++++++++++++++- 3 files changed, 155 insertions(+), 55 deletions(-) diff --git a/plexus/cli/item/items.py b/plexus/cli/item/items.py index 714049e79..8bde6902a 100644 --- a/plexus/cli/item/items.py +++ b/plexus/cli/item/items.py @@ -13,11 +13,6 @@ from plexus.cli.shared.console import console from plexus.cli.report.utils import resolve_account_id_for_command import json -from plexus.cli.shared.identifier_resolution import ( - resolve_score_identifier, - resolve_scorecard_identifier, -) -from plexus.score_rubric_consistency import ScoreRubricConsistencyService def format_datetime(dt: Optional[datetime]) -> str: """Format datetime with proper handling of None values""" @@ -477,53 +472,6 @@ def create(account: Optional[str], evaluation_id: Optional[str], text: Optional[ print(traceback.format_exc()) -@items.command(name="contradictions") -@click.option("--scorecard", "scorecard_identifier", required=True) -@click.option("--score", "score_identifier", required=True) -@click.option("--version", "score_version_id", required=True) -@click.option("--item", "item_identifier", default=None, help="Optional item id or identifier for spot-check context.") -@click.option("--format", "output_format", type=click.Choice(["markdown", "json"]), default="markdown") -def contradictions( - scorecard_identifier: str, - score_identifier: str, - score_version_id: str, - item_identifier: Optional[str], - output_format: str, -): - """Check whether one ScoreVersion's code is consistent with its rubric.""" - client = create_client() - account_id = resolve_account_id_for_command(client, None) - scorecard_id = resolve_scorecard_identifier(client, scorecard_identifier) - if not scorecard_id: - raise click.ClickException(f"Could not resolve scorecard: {scorecard_identifier}") - score_id = resolve_score_identifier(client, scorecard_id, score_identifier) - if not score_id: - raise click.ClickException( - f"Could not resolve score '{score_identifier}' in scorecard '{scorecard_identifier}'" - ) - - item_text = "" - if item_identifier: - item = find_item_by_any_identifier(client, item_identifier, account_id) - if not item: - raise click.ClickException(f"Could not resolve item: {item_identifier}") - item_text = item.text or "" - - result = ScoreRubricConsistencyService().generate_from_api( - client=client, - scorecard_identifier=scorecard_identifier, - score_identifier=score_identifier, - score_id=score_id, - score_version_id=score_version_id, - item_text=item_text, - ) - payload = result.to_parameters_payload() - if output_format == "json": - console.print_json(json.dumps(payload)) - else: - console.print(f"[bold]Status:[/bold] {result.status}") - console.print(result.paragraph) - @items.command() @click.option('--account', help='Account key or ID (optional, uses default from environment if not provided)') @click.option('--evaluation-id', help='Filter by evaluation ID') @@ -1031,7 +979,6 @@ def item(): item.add_command(list) item.add_command(last) item.add_command(info) -item.add_command(contradictions) item.add_command(update) item.add_command(upsert) item.add_command(delete) diff --git a/plexus/cli/score/scores.py b/plexus/cli/score/scores.py index 03e954620..8faecf50a 100644 --- a/plexus/cli/score/scores.py +++ b/plexus/cli/score/scores.py @@ -13,6 +13,7 @@ from rich.table import Table from rich.panel import Panel from plexus.cli.shared.console import console +from plexus.cli.report.utils import resolve_account_id_for_command from plexus.dashboard.api.client import PlexusDashboardClient from plexus.cli.shared.file_editor import FileEditor from typing import Optional @@ -40,6 +41,7 @@ clear_resolver_caches ) from plexus.cli.shared.score_config_fetching import fetch_and_cache_single_score +from plexus.score_rubric_consistency import ScoreRubricConsistencyService # Define the main command groups that will be exported @click.group() @@ -2097,6 +2099,67 @@ def score_evaluations(scorecard: str, score: str, version_id: Optional[str], sor scores.add_command(score_evaluations) +@score.command(name="contradictions") +@click.option("--scorecard", "scorecard_identifier", required=True) +@click.option("--score", "score_identifier", required=True) +@click.option("--version", "score_version_id", required=True) +@click.option( + "--item", + "item_identifier", + default=None, + help=( + "Optional item id or identifier to include as example context; " + "the check is still score-version-level." + ), +) +@click.option("--format", "output_format", type=click.Choice(["markdown", "json"]), default="markdown") +def contradictions( + scorecard_identifier: str, + score_identifier: str, + score_version_id: str, + item_identifier: Optional[str], + output_format: str, +): + """Check whether one ScoreVersion's code is consistent with its rubric.""" + client = create_client() + scorecard_id = memoized_resolve_scorecard_identifier(client, scorecard_identifier) + if not scorecard_id: + raise click.ClickException(f"Could not resolve scorecard: {scorecard_identifier}") + score_id = memoized_resolve_score_identifier(client, scorecard_id, score_identifier) + if not score_id: + raise click.ClickException( + f"Could not resolve score '{score_identifier}' in scorecard '{scorecard_identifier}'" + ) + + item_text = "" + if item_identifier: + from plexus.cli.item.items import find_item_by_any_identifier + + account_id = resolve_account_id_for_command(client, None) + item = find_item_by_any_identifier(client, item_identifier, account_id) + if not item: + raise click.ClickException(f"Could not resolve item: {item_identifier}") + item_text = item.text or "" + + result = ScoreRubricConsistencyService().generate_from_api( + client=client, + scorecard_identifier=scorecard_identifier, + score_identifier=score_identifier, + score_id=score_id, + score_version_id=score_version_id, + item_text=item_text, + ) + payload = result.to_parameters_payload() + if output_format == "json": + console.print_json(json.dumps(payload)) + else: + console.print(f"[bold]Status:[/bold] {result.status}") + console.print(result.paragraph) + + +scores.add_command(contradictions) + + @score.command(name="promotion-packet") @click.option('--scorecard', '-s', required=True, help='Scorecard identifier (name, key, or ID)') @click.option('--score', '-c', required=True, help='Score identifier (name, key, or ID)') diff --git a/plexus/cli/score/scores_test.py b/plexus/cli/score/scores_test.py index bf9d76ea5..b33c4f88c 100644 --- a/plexus/cli/score/scores_test.py +++ b/plexus/cli/score/scores_test.py @@ -1,6 +1,8 @@ import pytest +from types import SimpleNamespace from unittest.mock import Mock, patch -from plexus.cli.score.scores import optimize +from click.testing import CliRunner +from plexus.cli.score.scores import optimize, scores from plexus.cli.shared.file_editor import FileEditor @pytest.fixture @@ -202,4 +204,92 @@ def test_cli_create_missing_path(mock_file_editor): assert tool_result_content == "Error: Missing parameters or file not found (file_path missing)" assert file_edited is False - mock_file_editor.create.assert_called_once_with("", "New content\n") \ No newline at end of file + mock_file_editor.create.assert_called_once_with("", "New content\n") + + +def test_score_contradictions_runs_score_rubric_consistency_check(): + runner = CliRunner() + payload = { + "status": "potential_conflict", + "paragraph": "The prompt is more lenient than the rubric.", + } + result_obj = Mock() + result_obj.to_parameters_payload.return_value = payload + + with patch("plexus.cli.score.scores.create_client", return_value=Mock()) as create_client, \ + patch("plexus.cli.score.scores.memoized_resolve_scorecard_identifier", return_value="scorecard-1"), \ + patch("plexus.cli.score.scores.memoized_resolve_score_identifier", return_value="score-1"), \ + patch("plexus.cli.score.scores.ScoreRubricConsistencyService") as service_class: + service_class.return_value.generate_from_api.return_value = result_obj + + result = runner.invoke( + scores, + [ + "contradictions", + "--scorecard", + "Scorecard", + "--score", + "Score", + "--version", + "version-1", + "--format", + "json", + ], + ) + + assert result.exit_code == 0 + assert "potential_conflict" in result.output + service_class.return_value.generate_from_api.assert_called_once_with( + client=create_client.return_value, + scorecard_identifier="Scorecard", + score_identifier="Score", + score_id="score-1", + score_version_id="version-1", + item_text="", + ) + + +def test_score_contradictions_can_include_optional_item_context(): + runner = CliRunner() + result_obj = Mock() + result_obj.to_parameters_payload.return_value = { + "status": "consistent", + "paragraph": "The prompt follows the rubric.", + } + + with patch("plexus.cli.score.scores.create_client", return_value=Mock()), \ + patch("plexus.cli.score.scores.memoized_resolve_scorecard_identifier", return_value="scorecard-1"), \ + patch("plexus.cli.score.scores.memoized_resolve_score_identifier", return_value="score-1"), \ + patch("plexus.cli.score.scores.resolve_account_id_for_command", return_value="account-1"), \ + patch("plexus.cli.item.items.find_item_by_any_identifier", return_value=SimpleNamespace(text="item text")), \ + patch("plexus.cli.score.scores.ScoreRubricConsistencyService") as service_class: + service_class.return_value.generate_from_api.return_value = result_obj + + result = runner.invoke( + scores, + [ + "contradictions", + "--scorecard", + "Scorecard", + "--score", + "Score", + "--version", + "version-1", + "--item", + "item-1", + ], + ) + + assert result.exit_code == 0 + assert "Status:" in result.output + service_class.return_value.generate_from_api.assert_called_once() + assert service_class.return_value.generate_from_api.call_args.kwargs["item_text"] == "item text" + + +def test_item_contradictions_is_not_registered(): + from plexus.cli.item.items import item + + result = CliRunner().invoke(item, ["contradictions"]) + + assert result.exit_code != 0 + assert "No such command 'contradictions'" in result.output From 34ac8561027aa579b747dd9e03ae18dddf9af1d2 Mon Sep 17 00:00:00 2001 From: Ryan Alyn Porter Date: Wed, 29 Apr 2026 19:57:59 -0400 Subject: [PATCH 4/6] Fix evaluation RCA item filtering UI --- dashboard/components/EvaluationTask.tsx | 32 ++++++++++----- .../components/EvaluationTaskScoreResults.tsx | 40 +++++++++++++++++-- dashboard/components/ui/task-status.tsx | 7 +++- 3 files changed, 64 insertions(+), 15 deletions(-) diff --git a/dashboard/components/EvaluationTask.tsx b/dashboard/components/EvaluationTask.tsx index 2e942c29a..7fc487052 100644 --- a/dashboard/components/EvaluationTask.tsx +++ b/dashboard/components/EvaluationTask.tsx @@ -942,9 +942,12 @@ const DetailContent = React.memo(({ } const selectFirstFilteredScoreResult = (itemIds: string[]) => { - const firstItemId = itemIds.find(Boolean) + const normalizedItemIds = itemIds + .map(id => String(id).trim()) + .filter(Boolean) + const firstItemId = normalizedItemIds.find(Boolean) if (!firstItemId) return - const matching = parsedScoreResults.find(result => result.itemId === firstItemId) + const matching = parsedScoreResults.find(result => String(result.itemId ?? '').trim() === firstItemId) if (matching) { onSelectScoreResult?.(matching.id) } @@ -976,25 +979,35 @@ const DetailContent = React.memo(({ ) const itemIds: string[] = [] + const fallbackFeedbackItemIds: string[] = [] let missingCount = 0 filteredClassifications.forEach(classification => { - if (!classification.item_id) { + const normalizedItemId = classification.item_id ? String(classification.item_id).trim() : null + const normalizedFeedbackItemId = classification.feedback_item_id ? String(classification.feedback_item_id).trim() : null + + if (!normalizedItemId && !normalizedFeedbackItemId) { missingCount += 1 return } - itemIds.push(classification.item_id) + if (normalizedItemId) { + itemIds.push(normalizedItemId) + } else if (normalizedFeedbackItemId) { + fallbackFeedbackItemIds.push(normalizedFeedbackItemId) + } }) + const selectedIds = itemIds.length > 0 ? itemIds : fallbackFeedbackItemIds + setSelectedTopicItemIds(null) setSelectedTopicLabel(null) setSelectedCategoryKey(categoryKey) setSelectedCategoryLabel(categoryLabel) - setSelectedCategoryItemIds(Array.from(new Set(itemIds))) + setSelectedCategoryItemIds(Array.from(new Set(selectedIds))) setCategoryMissingItemIdCount(missingCount) setSelectedPredictedActual({ predicted: null, actual: null }) - selectFirstFilteredScoreResult(itemIds) + selectFirstFilteredScoreResult(selectedIds) } const clearCategoryFilter = () => { @@ -1685,12 +1698,13 @@ const DetailContent = React.memo(({ const summary = misclassificationCategoryBreakdown.categorySummaries?.[row.key] const summaryText = summary?.category_summary_text const patterns = Array.isArray(summary?.top_patterns) ? summary?.top_patterns : [] - const itemCount = summary?.item_count ?? 0 const categoryClassifications = (misclassificationCategoryBreakdown.itemClassifications ?? []) .filter(classification => classification.primary_category === row.key) + const itemCount = summary?.item_count ?? categoryClassifications.length ?? 0 const itemsWithMissingId = categoryClassifications .filter(classification => !classification.item_id) .length + if (itemCount <= 0) return null return (
@@ -1701,7 +1715,7 @@ const DetailContent = React.memo(({ {itemCount} item(s)
- {summaryText || 'No items in this category for this run.'} + {summaryText || 'Summary unavailable for this category.'}
{patterns.length > 0 && (
@@ -2735,7 +2749,6 @@ ${categoryLines}${mechanicalLines}
{variant !== 'detail' && evaluationNotes && (
-
Note

{children}

, @@ -2912,6 +2925,7 @@ ${categoryLines}${mechanicalLines} )} {evaluationNotes && (
+
Note

{children}

, diff --git a/dashboard/components/EvaluationTaskScoreResults.tsx b/dashboard/components/EvaluationTaskScoreResults.tsx index 0885b6add..48f644d9f 100644 --- a/dashboard/components/EvaluationTaskScoreResults.tsx +++ b/dashboard/components/EvaluationTaskScoreResults.tsx @@ -41,6 +41,33 @@ export function EvaluationTaskScoreResults({ navigationControls, isLoading = false }: EvaluationTaskScoreResultsProps) { + const toNormalized = (value: unknown): string | null => { + if (value === null || value === undefined) return null + const normalized = String(value).trim() + return normalized.length > 0 ? normalized : null + } + + const getResultFilterKeys = (result: ScoreResultData): string[] => { + const keys = new Set() + const itemId = toNormalized(result.itemId) + if (itemId) keys.add(itemId) + + const metadataItemId = toNormalized((result as any)?.metadata?.item_id) + if (metadataItemId) keys.add(metadataItemId) + + const feedbackItemId = toNormalized((result as any)?.feedbackItem?.id) + if (feedbackItemId) keys.add(feedbackItemId) + + if (Array.isArray(result.itemIdentifiers)) { + result.itemIdentifiers.forEach((identifier: any) => { + const value = toNormalized(identifier?.value) + if (value) keys.add(value) + }) + } + + return Array.from(keys) + } + console.log('EvaluationTaskScoreResults render:', { resultCount: results.length, firstResult: results[0], @@ -104,6 +131,10 @@ export function EvaluationTaskScoreResults({ } }); + const normalizedSelectedItemIds = selectedItemIds + ? new Set(selectedItemIds.map(toNormalized).filter((id): id is string => id !== null)) + : null + const filtered = results.filter(result => { if (filters.showCorrect !== null && result.metadata.correct !== filters.showCorrect) { return false @@ -117,9 +148,10 @@ export function EvaluationTaskScoreResults({ return false } - if (selectedItemIds && selectedItemIds.length > 0 && - !selectedItemIds.includes(result.itemId ?? '')) { - return false + if (normalizedSelectedItemIds && normalizedSelectedItemIds.size > 0) { + const resultKeys = getResultFilterKeys(result) + const hasMatch = resultKeys.some(key => normalizedSelectedItemIds.has(key)) + if (!hasMatch) return false } return true @@ -264,4 +296,4 @@ export function EvaluationTaskScoreResults({
) -} \ No newline at end of file +} diff --git a/dashboard/components/ui/task-status.tsx b/dashboard/components/ui/task-status.tsx index 0a5686dbd..bb99251b5 100644 --- a/dashboard/components/ui/task-status.tsx +++ b/dashboard/components/ui/task-status.tsx @@ -385,7 +385,10 @@ export const TaskStatus = React.memo(({ return (
-
+
{command && commandDisplay !== 'hide' && (
{command && commandDisplay !== 'hide' && (
Date: Wed, 29 Apr 2026 19:59:48 -0400 Subject: [PATCH 5/6] Expand evaluation category filter linkage --- dashboard/components/EvaluationTask.tsx | 78 +++++++++++++++---- .../components/EvaluationTaskScoreResults.tsx | 33 ++------ .../EvaluationTask.category-filter.test.tsx | 73 +++++++++++++++++ 3 files changed, 141 insertions(+), 43 deletions(-) diff --git a/dashboard/components/EvaluationTask.tsx b/dashboard/components/EvaluationTask.tsx index 7fc487052..6fa03b974 100644 --- a/dashboard/components/EvaluationTask.tsx +++ b/dashboard/components/EvaluationTask.tsx @@ -45,6 +45,35 @@ const parseJsonDeep = (value: unknown): unknown => { return current } +const toNormalizedId = (value: unknown): string | null => { + if (value === null || value === undefined) return null + const normalized = String(value).trim() + return normalized.length > 0 ? normalized : null +} + +const getScoreResultFilterKeys = (result: ScoreResultData): string[] => { + const keys = new Set() + const directId = toNormalizedId(result.id) + if (directId) keys.add(directId) + const itemId = toNormalizedId(result.itemId) + if (itemId) keys.add(itemId) + const metadataItemId = toNormalizedId((result as any)?.metadata?.item_id) + if (metadataItemId) keys.add(metadataItemId) + const feedbackItemId = toNormalizedId((result as any)?.feedbackItem?.id) + if (feedbackItemId) keys.add(feedbackItemId) + const metadataFeedbackItemId = toNormalizedId((result as any)?.metadata?.feedback_item_id) + if (metadataFeedbackItemId) keys.add(metadataFeedbackItemId) + + if (Array.isArray(result.itemIdentifiers)) { + result.itemIdentifiers.forEach((identifier: any) => { + const value = toNormalizedId(identifier?.value) + if (value) keys.add(value) + }) + } + + return Array.from(keys) +} + export interface EvaluationMetric { name: string value: number @@ -217,6 +246,7 @@ type MisclassificationCategorySummary = { category_summary_text?: string top_patterns?: Array<{ pattern?: string; count?: number }> representative_evidence?: Array<{ + score_result_id?: string feedback_item_id?: string item_id?: string source?: string @@ -269,6 +299,7 @@ type MisclassificationAnalysis = { item_classifications_all?: Array<{ topic_id?: number | string topic_label?: string + score_result_id?: string feedback_item_id?: string item_id?: string timestamp?: string @@ -941,13 +972,16 @@ const DetailContent = React.memo(({ onSelectScoreResult?.(result.id) } - const selectFirstFilteredScoreResult = (itemIds: string[]) => { - const normalizedItemIds = itemIds - .map(id => String(id).trim()) - .filter(Boolean) - const firstItemId = normalizedItemIds.find(Boolean) - if (!firstItemId) return - const matching = parsedScoreResults.find(result => String(result.itemId ?? '').trim() === firstItemId) + const selectFirstFilteredScoreResult = (filterIds: string[]) => { + const normalizedFilterIds = new Set( + filterIds + .map(id => toNormalizedId(id)) + .filter((id): id is string => id !== null) + ) + if (normalizedFilterIds.size === 0) return + const matching = parsedScoreResults.find(result => + getScoreResultFilterKeys(result).some(key => normalizedFilterIds.has(key)) + ) if (matching) { onSelectScoreResult?.(matching.id) } @@ -978,27 +1012,37 @@ const DetailContent = React.memo(({ classification => classification.primary_category === categoryKey ) + const scoreResultIds: string[] = [] const itemIds: string[] = [] const fallbackFeedbackItemIds: string[] = [] let missingCount = 0 filteredClassifications.forEach(classification => { - const normalizedItemId = classification.item_id ? String(classification.item_id).trim() : null - const normalizedFeedbackItemId = classification.feedback_item_id ? String(classification.feedback_item_id).trim() : null + const normalizedScoreResultId = toNormalizedId(classification.score_result_id) + const normalizedItemId = toNormalizedId(classification.item_id) + const normalizedFeedbackItemId = toNormalizedId(classification.feedback_item_id) - if (!normalizedItemId && !normalizedFeedbackItemId) { + if (!normalizedScoreResultId && !normalizedItemId && !normalizedFeedbackItemId) { missingCount += 1 return } + if (normalizedScoreResultId) { + scoreResultIds.push(normalizedScoreResultId) + } if (normalizedItemId) { itemIds.push(normalizedItemId) - } else if (normalizedFeedbackItemId) { + } + if (normalizedFeedbackItemId) { fallbackFeedbackItemIds.push(normalizedFeedbackItemId) } }) - const selectedIds = itemIds.length > 0 ? itemIds : fallbackFeedbackItemIds + const selectedIds = scoreResultIds.length > 0 + ? scoreResultIds + : itemIds.length > 0 + ? itemIds + : fallbackFeedbackItemIds setSelectedTopicItemIds(null) setSelectedTopicLabel(null) @@ -1702,7 +1746,11 @@ const DetailContent = React.memo(({ .filter(classification => classification.primary_category === row.key) const itemCount = summary?.item_count ?? categoryClassifications.length ?? 0 const itemsWithMissingId = categoryClassifications - .filter(classification => !classification.item_id) + .filter(classification => ( + !toNormalizedId(classification.item_id) + && !toNormalizedId(classification.feedback_item_id) + && !toNormalizedId(classification.score_result_id) + )) .length if (itemCount <= 0) return null return ( @@ -1741,14 +1789,14 @@ const DetailContent = React.memo(({ {selectedCategoryKey === row.key && categoryMissingItemIdCount > 0 && ( - {categoryMissingItemIdCount} item(s) missing item_id not shown + {categoryMissingItemIdCount} item(s) missing linkage ids not shown )}
)} {itemsWithMissingId > 0 && selectedCategoryKey !== row.key && (
- {itemsWithMissingId} item(s) in this category are missing item_id and cannot appear in score results. + {itemsWithMissingId} item(s) in this category are missing linkage ids and cannot appear in score results.
)}
diff --git a/dashboard/components/EvaluationTaskScoreResults.tsx b/dashboard/components/EvaluationTaskScoreResults.tsx index 48f644d9f..d21af5e5a 100644 --- a/dashboard/components/EvaluationTaskScoreResults.tsx +++ b/dashboard/components/EvaluationTaskScoreResults.tsx @@ -49,6 +49,8 @@ export function EvaluationTaskScoreResults({ const getResultFilterKeys = (result: ScoreResultData): string[] => { const keys = new Set() + const resultId = toNormalized(result.id) + if (resultId) keys.add(resultId) const itemId = toNormalized(result.itemId) if (itemId) keys.add(itemId) @@ -57,6 +59,8 @@ export function EvaluationTaskScoreResults({ const feedbackItemId = toNormalized((result as any)?.feedbackItem?.id) if (feedbackItemId) keys.add(feedbackItemId) + const metadataFeedbackItemId = toNormalized((result as any)?.metadata?.feedback_item_id) + if (metadataFeedbackItemId) keys.add(metadataFeedbackItemId) if (Array.isArray(result.itemIdentifiers)) { result.itemIdentifiers.forEach((identifier: any) => { @@ -68,17 +72,6 @@ export function EvaluationTaskScoreResults({ return Array.from(keys) } - console.log('EvaluationTaskScoreResults render:', { - resultCount: results.length, - firstResult: results[0], - lastResult: results[results.length - 1], - accuracy, - selectedPredictedValue, - selectedActualValue, - hasSelectedResult: !!selectedScoreResult, - selectedScoreResultId: selectedScoreResult?.id - }); - const [filters, setFilters] = useState({ showCorrect: null, predictedValue: null, @@ -122,15 +115,6 @@ export function EvaluationTaskScoreResults({ }, [results]) const filteredResults = useMemo(() => { - console.log('Filtering score results:', { - totalResults: results.length, - filters: { - showCorrect: filters.showCorrect, - predictedValue: filters.predictedValue, - actualValue: filters.actualValue - } - }); - const normalizedSelectedItemIds = selectedItemIds ? new Set(selectedItemIds.map(toNormalized).filter((id): id is string => id !== null)) : null @@ -148,7 +132,7 @@ export function EvaluationTaskScoreResults({ return false } - if (normalizedSelectedItemIds && normalizedSelectedItemIds.size > 0) { + if (normalizedSelectedItemIds) { const resultKeys = getResultFilterKeys(result) const hasMatch = resultKeys.some(key => normalizedSelectedItemIds.has(key)) if (!hasMatch) return false @@ -157,13 +141,6 @@ export function EvaluationTaskScoreResults({ return true }); - console.log('Filtered results:', { - inputCount: results.length, - filteredCount: filtered.length, - firstFiltered: filtered[0], - lastFiltered: filtered[filtered.length - 1] - }); - return filtered; }, [results, filters, selectedItemIds]); diff --git a/dashboard/components/__tests__/EvaluationTask.category-filter.test.tsx b/dashboard/components/__tests__/EvaluationTask.category-filter.test.tsx index a3c17aa53..d7d69cb73 100644 --- a/dashboard/components/__tests__/EvaluationTask.category-filter.test.tsx +++ b/dashboard/components/__tests__/EvaluationTask.category-filter.test.tsx @@ -128,6 +128,61 @@ const makeTask = () => { } as any } +const makeTaskWithScoreResultIdOnly = () => { + const task = makeTask() + task.data.parameters = JSON.stringify({ + root_cause: { + misclassification_analysis: { + category_totals: { + information_gap: 1, + }, + item_classifications_all: [ + { + score_result_id: 'sr-1', + primary_category: 'information_gap', + confidence: 'medium', + rationale_full: 'Matched only by score result id.', + }, + ], + category_summaries: { + information_gap: { + category_summary_text: 'Score result id only linkage.', + item_count: 1, + }, + }, + }, + }, + }) + return task +} + +const makeTaskWithMissingCategoryLinkage = () => { + const task = makeTask() + task.data.parameters = JSON.stringify({ + root_cause: { + misclassification_analysis: { + category_totals: { + information_gap: 1, + }, + item_classifications_all: [ + { + primary_category: 'information_gap', + confidence: 'medium', + rationale_full: 'No linkage ids on this row.', + }, + ], + category_summaries: { + information_gap: { + category_summary_text: 'No linkage ids available.', + item_count: 1, + }, + }, + }, + }, + }) + return task +} + describe('EvaluationTask category summary drill-down', () => { test('applies category filter and auto-selects first matching score result', async () => { const onSelectScoreResult = jest.fn() @@ -147,6 +202,24 @@ describe('EvaluationTask category summary drill-down', () => { }) }) + test('filters by score_result_id linkage when item_id is unavailable', async () => { + const onSelectScoreResult = jest.fn() + render() + + fireEvent.click(screen.getByRole('button', { name: /View items \(1\)/i })) + + expect(screen.getByTestId('selected-item-ids')).toHaveTextContent('["sr-1"]') + expect(onSelectScoreResult).toHaveBeenCalledWith('sr-1') + }) + + test('applies empty category filter when linkage ids are missing', async () => { + render() + + fireEvent.click(screen.getByRole('button', { name: /View items \(1\)/i })) + + expect(screen.getByTestId('selected-item-ids')).toHaveTextContent('[]') + }) + test('renders score version and procedure related-resource cards in detail view', async () => { const { container } = render() From 15dfe3ee1200b8159f927999fbfbdc8409eb6e75 Mon Sep 17 00:00:00 2001 From: Ryan Alyn Porter Date: Wed, 29 Apr 2026 20:11:50 -0400 Subject: [PATCH 6/6] Fix evaluation category View items filtering --- dashboard/components/EvaluationTask.tsx | 57 +++++++++++-------- .../EvaluationTask.category-filter.test.tsx | 2 +- 2 files changed, 34 insertions(+), 25 deletions(-) diff --git a/dashboard/components/EvaluationTask.tsx b/dashboard/components/EvaluationTask.tsx index 4db10e74c..af96ea203 100644 --- a/dashboard/components/EvaluationTask.tsx +++ b/dashboard/components/EvaluationTask.tsx @@ -74,6 +74,19 @@ const getScoreResultFilterKeys = (result: ScoreResultData): string[] => { return Array.from(keys) } +const collectClassificationLinkageIds = (classification: { + score_result_id?: string + item_id?: string + feedback_item_id?: string +}): string[] => { + const ids = [ + toNormalizedId(classification.score_result_id), + toNormalizedId(classification.item_id), + toNormalizedId(classification.feedback_item_id), + ].filter((id): id is string => id !== null) + return ids +} + export interface EvaluationMetric { name: string value: number @@ -1012,46 +1025,42 @@ const DetailContent = React.memo(({ classification => classification.primary_category === categoryKey ) - const scoreResultIds: string[] = [] - const itemIds: string[] = [] - const fallbackFeedbackItemIds: string[] = [] + const linkageIds: string[] = [] let missingCount = 0 filteredClassifications.forEach(classification => { - const normalizedScoreResultId = toNormalizedId(classification.score_result_id) - const normalizedItemId = toNormalizedId(classification.item_id) - const normalizedFeedbackItemId = toNormalizedId(classification.feedback_item_id) - - if (!normalizedScoreResultId && !normalizedItemId && !normalizedFeedbackItemId) { + const classificationIds = collectClassificationLinkageIds(classification) + if (classificationIds.length === 0) { missingCount += 1 return } + linkageIds.push(...classificationIds) + }) - if (normalizedScoreResultId) { - scoreResultIds.push(normalizedScoreResultId) - } - if (normalizedItemId) { - itemIds.push(normalizedItemId) - } - if (normalizedFeedbackItemId) { - fallbackFeedbackItemIds.push(normalizedFeedbackItemId) - } + const summaryEvidence = misclassificationCategoryBreakdown.categorySummaries?.[categoryKey]?.representative_evidence ?? [] + summaryEvidence.forEach(evidence => { + linkageIds.push( + ...[ + toNormalizedId(evidence.score_result_id), + toNormalizedId(evidence.item_id), + toNormalizedId(evidence.feedback_item_id), + ].filter((id): id is string => id !== null) + ) }) - const selectedIds = scoreResultIds.length > 0 - ? scoreResultIds - : itemIds.length > 0 - ? itemIds - : fallbackFeedbackItemIds + const normalizedLinkageIds = new Set(linkageIds.map(id => toNormalizedId(id)).filter((id): id is string => id !== null)) + const selectedScoreResultIds = parsedScoreResults + .filter(result => getScoreResultFilterKeys(result).some(key => normalizedLinkageIds.has(key))) + .map(result => String(result.id).trim()) setSelectedTopicItemIds(null) setSelectedTopicLabel(null) setSelectedCategoryKey(categoryKey) setSelectedCategoryLabel(categoryLabel) - setSelectedCategoryItemIds(Array.from(new Set(selectedIds))) + setSelectedCategoryItemIds(Array.from(new Set(selectedScoreResultIds))) setCategoryMissingItemIdCount(missingCount) setSelectedPredictedActual({ predicted: null, actual: null }) - selectFirstFilteredScoreResult(selectedIds) + selectFirstFilteredScoreResult(selectedScoreResultIds) } const clearCategoryFilter = () => { diff --git a/dashboard/components/__tests__/EvaluationTask.category-filter.test.tsx b/dashboard/components/__tests__/EvaluationTask.category-filter.test.tsx index d7d69cb73..c4963a9e6 100644 --- a/dashboard/components/__tests__/EvaluationTask.category-filter.test.tsx +++ b/dashboard/components/__tests__/EvaluationTask.category-filter.test.tsx @@ -191,7 +191,7 @@ describe('EvaluationTask category summary drill-down', () => { fireEvent.click(screen.getByRole('button', { name: /View items \(1\)/i })) expect(screen.getByText('Filtered by category: Information gap')).toBeInTheDocument() - expect(screen.getByTestId('selected-item-ids')).toHaveTextContent('["item-1"]') + expect(screen.getByTestId('selected-item-ids')).toHaveTextContent('["sr-1"]') expect(onSelectScoreResult).toHaveBeenCalledWith('sr-1') fireEvent.click(screen.getByRole('button', { name: /Clear category filter/i }))