diff --git a/MCP/tools/evaluation/evaluations.py b/MCP/tools/evaluation/evaluations.py
index 4aef629f1..d8cf242c6 100644
--- a/MCP/tools/evaluation/evaluations.py
+++ b/MCP/tools/evaluation/evaluations.py
@@ -345,6 +345,7 @@ async def plexus_evaluation_run(
     use_score_associated_dataset: bool = False,
     batch: Optional[List[Dict[str, Any]]] = None,
     notes: Optional[str] = None,
+    score_rubric_consistency_check: bool = False,
 ) -> str:
     """
     Run an evaluation using the same code path as CLI.
@@ -391,6 +392,9 @@ async def plexus_evaluation_run(
       the evaluation's parameters JSON under the "notes" key. Useful for recording
       context like "Baseline: deterministic accuracy dataset" or "Iteration 3:
       Added example for transfer-request edge case".
+    - score_rubric_consistency_check: Feedback evaluations only. When True, run a
+      preflight check that compares the evaluated ScoreVersion code against its
+      own rubric and store the paragraph on Evaluation.parameters.

     Returns:
     - JSON string with evaluation results including evaluation_id, metrics, and dashboard URL.
@@ -434,19 +438,27 @@ def _apply_notes_to_evaluation(evaluation_id: str, notes_text: str) -> None:
         batch_list = list(batch)
         logger.info(f"Batch evaluation: dispatching {len(batch_list)} evaluations in parallel")
         for i, item in enumerate(batch_list):
-            logger.info(f"  Batch item {i+1}: type={item.get('evaluation_type')}, score={item.get('score_name')}, wait={item.get('wait')}")
+            logger.info(
+                f"  Batch item {i + 1}: type={item.get('evaluation_type')}, "
+                f"score={item.get('score_name')}, wait={item.get('wait')}"
+            )
         tasks = [plexus_evaluation_run(**item) for item in batch_list]
         raw_results = await _asyncio.gather(*tasks, return_exceptions=True)
         output = []
         for i, r in enumerate(raw_results):
             if isinstance(r, Exception):
-                logger.error(f"Batch item {i+1} raised exception: {type(r).__name__}: {r}", exc_info=r)
+                logger.error(
+                    f"Batch item {i + 1} raised exception: {type(r).__name__}: {r}",
+                    exc_info=r,
+                )
                 output.append({"error": f"{type(r).__name__}: {r}"})
             else:
                 try:
                     output.append(json.loads(r))
                 except Exception as parse_exc:
-                    logger.error(f"Batch item {i+1} result parse error: {parse_exc}, raw={str(r)[:500]}")
+                    logger.error(
+                        f"Batch item {i + 1} result parse error: {parse_exc}, raw={str(r)[:500]}"
+                    )
                     output.append({"error": "Could not parse result", "raw": str(r)})
         logger.info(f"Batch evaluation complete: {len(output)} results")
         # Post-batch notes application: apply notes to each eval sequentially
@@ -459,7 +471,9 @@ def _apply_notes_to_evaluation(evaluation_id: str, notes_text: str) -> None:
                 if eval_id:
                     _apply_notes_to_evaluation(eval_id, item_notes)
                 else:
-                    logger.warning(f"Batch item {i+1}: no eval_id in result, cannot apply notes")
+                    logger.warning(
+                        f"Batch item {i + 1}: no eval_id in result, cannot apply notes"
+                    )
         return json.dumps(output)

     if not scorecard_name:
@@ -576,6 +590,8 @@ def _spawn_feedback(
             cmd += ["--max-category-summary-items", str(max_category_items)]
         if runner_task_id:
             cmd += ["--task-id", runner_task_id]
+        if score_rubric_consistency_check:
+            cmd += ["--score-rubric-consistency-check"]
         subprocess.Popen(
             cmd,
             stdout=subprocess.DEVNULL,
@@ -699,6 +715,8 @@ def _spawn_feedback(
             fb_args.extend(['--sample-seed', str(sample_seed)])
         if notes:
             fb_args.extend(['--notes', notes])
+        if score_rubric_consistency_check:
+            fb_args.append('--score-rubric-consistency-check')

         # When a specific version is requested, yaml mode must be disabled.
         effective_yaml = yaml and not resolved_version
         if effective_yaml:
diff --git a/MCP/tools/score/score_update_guidelines_test.py b/MCP/tools/score/score_update_guidelines_test.py
index 399333a14..fbb17093a 100644
--- a/MCP/tools/score/score_update_guidelines_test.py
+++ b/MCP/tools/score/score_update_guidelines_test.py
@@ -45,7 +45,7 @@ def test_guidelines_only_update_none_handling(self):
             'configuration': current_version_data['configuration'],  # Preserved existing code
             'guidelines': update_params['guidelines'],  # New guidelines
             'note': update_params['version_note'],
-            'isFeatured': "true",
+            'isFeatured': "false",
             'parentVersionId': 'version-current'
         }

@@ -241,7 +241,7 @@ def simulate_version_input_creation(score_id, code_content, guidelines, note, pa
         'scoreId': score_id,
         'configuration': (code_content or '').strip(),  # Fixed to handle None
         'note': note or 'Updated via MCP score update tool',
-        'isFeatured': "true"
+        'isFeatured': "false"
     }

     # Add guidelines if provided (fixed logic)
@@ -357,7 +357,7 @@ def simulate_complete_workflow():
         'scoreId': score_data['id'],
         'configuration': (code or '').strip(),
         'note': update_params['version_note'] or 'Updated via MCP',
-        'isFeatured': "true"
+        'isFeatured': "false"
     }

     if guidelines:
diff --git a/MCP/tools/score/scores.py b/MCP/tools/score/scores.py
index b1851d1a4..876edd20b 100644
--- a/MCP/tools/score/scores.py
+++ b/MCP/tools/score/scores.py
@@ -2682,7 +2682,7 @@ async def _create_version_from_code_with_parent(
         'scoreId': score.id,
         'configuration': (code_content or '').strip(),
         'note': note or 'Updated via MCP score update tool',
-        'isFeatured': "true"  # Mark as featured by default
+        'isFeatured': "false"
     }

     # Add guidelines if provided
diff --git a/MCP/tools/score/scores_test.py b/MCP/tools/score/scores_test.py
index e485a0926..7339be77b 100644
--- a/MCP/tools/score/scores_test.py
+++ b/MCP/tools/score/scores_test.py
@@ -1670,14 +1670,14 @@ def test_create_version_with_parent_version_input(self):
             'configuration': code_content.strip(),
             'guidelines': guidelines.strip(),
             'note': note,
-            'isFeatured': "true",
+            'isFeatured': "false",
             'parentVersionId': parent_version_id  # Should set parent relationship
         }

         # Verify parent relationship is established
         assert expected_version_input['parentVersionId'] == parent_version_id
         assert expected_version_input['scoreId'] == score_id
-        assert expected_version_input['isFeatured'] == "true"
+        assert expected_version_input['isFeatured'] == "false"

     def test_create_version_with_parent_error_handling(self):
         """Test error handling in _create_version_from_code_with_parent"""
diff --git a/dashboard/components/EvaluationTask.tsx b/dashboard/components/EvaluationTask.tsx
index be66a4565..af96ea203 100644
--- a/dashboard/components/EvaluationTask.tsx
+++ b/dashboard/components/EvaluationTask.tsx
@@ -45,6 +45,48 @@ const parseJsonDeep = (value: unknown): unknown => {
   return current
 }

+const toNormalizedId = (value: unknown): string | null => {
+  if (value === null || value === undefined) return null
+  const normalized = String(value).trim()
+  return normalized.length > 0 ? normalized : null
+}
+
+const getScoreResultFilterKeys = (result: ScoreResultData): string[] => {
+  const keys = new Set<string>()
+  const directId = toNormalizedId(result.id)
+  if (directId) keys.add(directId)
+  const itemId = toNormalizedId(result.itemId)
+  if (itemId) keys.add(itemId)
+  const metadataItemId = toNormalizedId((result as any)?.metadata?.item_id)
+  if (metadataItemId) keys.add(metadataItemId)
+  const feedbackItemId = toNormalizedId((result as any)?.feedbackItem?.id)
+  if (feedbackItemId) keys.add(feedbackItemId)
+  const metadataFeedbackItemId = toNormalizedId((result as any)?.metadata?.feedback_item_id)
+  if (metadataFeedbackItemId) keys.add(metadataFeedbackItemId)
+
+  if (Array.isArray(result.itemIdentifiers)) {
+    result.itemIdentifiers.forEach((identifier: any) => {
+      const value = toNormalizedId(identifier?.value)
+      if (value) keys.add(value)
+    })
+  }
+
+  return Array.from(keys)
+}
+
+const collectClassificationLinkageIds = (classification: {
+  score_result_id?: string
+  item_id?: string
+  feedback_item_id?: string
+}): string[] => {
+  const ids = [
+    toNormalizedId(classification.score_result_id),
+    toNormalizedId(classification.item_id),
+    toNormalizedId(classification.feedback_item_id),
+  ].filter((id): id is string => id !== null)
+  return ids
+}
+
 export interface EvaluationMetric {
   name: string
   value: number
@@ -217,6 +259,7 @@ type MisclassificationCategorySummary = {
   category_summary_text?: string
   top_patterns?: Array<{ pattern?: string; count?: number }>
   representative_evidence?: Array<{
+    score_result_id?: string
     feedback_item_id?: string
     item_id?: string
     source?: string
@@ -269,6 +312,7 @@ type MisclassificationAnalysis = {
   item_classifications_all?: Array<{
     topic_id?: number | string
     topic_label?: string
+    score_result_id?: string
     feedback_item_id?: string
     item_id?: string
     timestamp?: string
@@ -941,10 +985,16 @@ const DetailContent = React.memo(({
     onSelectScoreResult?.(result.id)
   }

-  const selectFirstFilteredScoreResult = (itemIds: string[]) => {
-    const firstItemId = itemIds.find(Boolean)
-    if (!firstItemId) return
-    const matching = parsedScoreResults.find(result => result.itemId === firstItemId)
+  const selectFirstFilteredScoreResult = (filterIds: string[]) => {
+    const normalizedFilterIds = new Set(
+      filterIds
+        .map(id => toNormalizedId(id))
+        .filter((id): id is string => id !== null)
+    )
+    if (normalizedFilterIds.size === 0) return
+    const matching = parsedScoreResults.find(result =>
+      getScoreResultFilterKeys(result).some(key => normalizedFilterIds.has(key))
+    )
     if (matching) {
       onSelectScoreResult?.(matching.id)
     }
@@ -975,26 +1025,42 @@ const DetailContent = React.memo(({
       classification => classification.primary_category === categoryKey
     )

-    const itemIds: string[] = []
+    const linkageIds: string[] = []
     let missingCount = 0
     filteredClassifications.forEach(classification => {
-      if (!classification.item_id) {
+      const classificationIds = collectClassificationLinkageIds(classification)
+      if (classificationIds.length === 0) {
         missingCount += 1
         return
       }
+      linkageIds.push(...classificationIds)
+    })

-      itemIds.push(classification.item_id)
+    const summaryEvidence = misclassificationCategoryBreakdown.categorySummaries?.[categoryKey]?.representative_evidence ?? []
+    summaryEvidence.forEach(evidence => {
+      linkageIds.push(
+        ...[
+          toNormalizedId(evidence.score_result_id),
+          toNormalizedId(evidence.item_id),
+          toNormalizedId(evidence.feedback_item_id),
+        ].filter((id): id is string => id !== null)
+      )
     })

+    const normalizedLinkageIds = new Set(linkageIds.map(id => toNormalizedId(id)).filter((id): id is string => id !== null))
+    const selectedScoreResultIds = parsedScoreResults
+      .filter(result => getScoreResultFilterKeys(result).some(key => normalizedLinkageIds.has(key)))
+      .map(result => String(result.id).trim())
+
     setSelectedTopicItemIds(null)
     setSelectedTopicLabel(null)
     setSelectedCategoryKey(categoryKey)
     setSelectedCategoryLabel(categoryLabel)
-    setSelectedCategoryItemIds(Array.from(new Set(itemIds)))
+    setSelectedCategoryItemIds(Array.from(new Set(selectedScoreResultIds)))
     setCategoryMissingItemIdCount(missingCount)
     setSelectedPredictedActual({ predicted: null, actual: null })
-    selectFirstFilteredScoreResult(itemIds)
+    selectFirstFilteredScoreResult(selectedScoreResultIds)
   }

   const clearCategoryFilter = () => {
@@ -1067,6 +1133,15 @@ const DetailContent = React.memo(({
   const rootCauseTopics = rootCauseData?.topics ?? null
   const misclassificationAnalysis = rootCauseData?.misclassification_analysis ?? null

+  const scoreRubricConsistencyCheck = useMemo(() => {
+    try {
+      const params = parseJsonDeep(data.parameters) as Record<string, unknown> | null
+      const check = params?.score_rubric_consistency_check
+      return (check && typeof check === 'object') ? check as Record<string, unknown> : null
+    } catch {
+      return null
+    }
+  }, [data.parameters])
   const rcaCoverage = useMemo(() => {
     try {
       const params = parseJsonDeep(data.parameters) as Record<string, unknown> | null
@@ -1531,6 +1606,23 @@ const DetailContent = React.memo(({
         )}
+        {scoreRubricConsistencyCheck && (
+
+            Score/rubric consistency
+            {typeof scoreRubricConsistencyCheck.status === 'string'
+              ? `: ${scoreRubricConsistencyCheck.status}`
+              : ''}
+
+            {typeof scoreRubricConsistencyCheck.paragraph === 'string'
+              ? scoreRubricConsistencyCheck.paragraph
+              : 'No consistency summary was generated.'}
+
+        )}
+
         {/* Score-Configuration RCA */}
         {(rootCauseData && (
           (rootCauseTopics && rootCauseTopics.length > 0) ||
@@ -1659,12 +1751,17 @@ const DetailContent = React.memo(({
                 const summary = misclassificationCategoryBreakdown.categorySummaries?.[row.key]
                 const summaryText = summary?.category_summary_text
                 const patterns = Array.isArray(summary?.top_patterns) ? summary?.top_patterns : []
-                const itemCount = summary?.item_count ?? 0
                 const categoryClassifications = (misclassificationCategoryBreakdown.itemClassifications ?? [])
                   .filter(classification => classification.primary_category === row.key)
+                const itemCount = summary?.item_count ?? categoryClassifications.length ?? 0
                 const itemsWithMissingId = categoryClassifications
-                  .filter(classification => !classification.item_id)
+                  .filter(classification => (
+                    !toNormalizedId(classification.item_id)
+                    && !toNormalizedId(classification.feedback_item_id)
+                    && !toNormalizedId(classification.score_result_id)
+                  ))
                   .length
+                if (itemCount <= 0) return null
                 return (
@@ -1675,7 +1772,7 @@ const DetailContent = React.memo(({
                       {itemCount} item(s)
-                      {summaryText || 'No items in this category for this run.'}
+                      {summaryText || 'Summary unavailable for this category.'}
{patterns.length > 0 && (
@@ -1701,14 +1798,14 @@ const DetailContent = React.memo(({
                   {selectedCategoryKey === row.key && categoryMissingItemIdCount > 0 && (
-                      {categoryMissingItemIdCount} item(s) missing item_id not shown
+                      {categoryMissingItemIdCount} item(s) missing linkage ids not shown
                   )}
                   )}
                   {itemsWithMissingId > 0 && selectedCategoryKey !== row.key && (
-                      {itemsWithMissingId} item(s) in this category are missing item_id and cannot appear in score results.
+                      {itemsWithMissingId} item(s) in this category are missing linkage ids and cannot appear in score results.
)}
@@ -2709,7 +2806,6 @@ ${categoryLines}${mechanicalLines}
           {variant !== 'detail' && evaluationNotes && (
-
Note

{children}

            ,
@@ -2886,6 +2982,7 @@ ${categoryLines}${mechanicalLines}
           )}
           {evaluationNotes && (
+
Note

{children}

            ,
diff --git a/dashboard/components/EvaluationTaskScoreResults.tsx b/dashboard/components/EvaluationTaskScoreResults.tsx
index 0885b6add..d21af5e5a 100644
--- a/dashboard/components/EvaluationTaskScoreResults.tsx
+++ b/dashboard/components/EvaluationTaskScoreResults.tsx
@@ -41,16 +41,36 @@ export function EvaluationTaskScoreResults({
   navigationControls,
   isLoading = false
 }: EvaluationTaskScoreResultsProps) {
-  console.log('EvaluationTaskScoreResults render:', {
-    resultCount: results.length,
-    firstResult: results[0],
-    lastResult: results[results.length - 1],
-    accuracy,
-    selectedPredictedValue,
-    selectedActualValue,
-    hasSelectedResult: !!selectedScoreResult,
-    selectedScoreResultId: selectedScoreResult?.id
-  });
+  const toNormalized = (value: unknown): string | null => {
+    if (value === null || value === undefined) return null
+    const normalized = String(value).trim()
+    return normalized.length > 0 ? normalized : null
+  }
+
+  const getResultFilterKeys = (result: ScoreResultData): string[] => {
+    const keys = new Set<string>()
+    const resultId = toNormalized(result.id)
+    if (resultId) keys.add(resultId)
+    const itemId = toNormalized(result.itemId)
+    if (itemId) keys.add(itemId)
+
+    const metadataItemId = toNormalized((result as any)?.metadata?.item_id)
+    if (metadataItemId) keys.add(metadataItemId)
+
+    const feedbackItemId = toNormalized((result as any)?.feedbackItem?.id)
+    if (feedbackItemId) keys.add(feedbackItemId)
+    const metadataFeedbackItemId = toNormalized((result as any)?.metadata?.feedback_item_id)
+    if (metadataFeedbackItemId) keys.add(metadataFeedbackItemId)
+
+    if (Array.isArray(result.itemIdentifiers)) {
+      result.itemIdentifiers.forEach((identifier: any) => {
+        const value = toNormalized(identifier?.value)
+        if (value) keys.add(value)
+      })
+    }
+
+    return Array.from(keys)
+  }

   const [filters, setFilters] = useState({
     showCorrect: null,
@@ -95,14 +115,9 @@ export function EvaluationTaskScoreResults({
   }, [results])

   const filteredResults = useMemo(() => {
-    console.log('Filtering score results:', {
-      totalResults: results.length,
-      filters: {
-        showCorrect: filters.showCorrect,
-        predictedValue: filters.predictedValue,
-        actualValue: filters.actualValue
-      }
-    });
+    const normalizedSelectedItemIds = selectedItemIds
+      ? new Set(selectedItemIds.map(toNormalized).filter((id): id is string => id !== null))
+      : null

     const filtered = results.filter(result => {
       if (filters.showCorrect !== null && result.metadata.correct !== filters.showCorrect) {
         return false
       }
         return false
       }

-      if (selectedItemIds && selectedItemIds.length > 0 &&
-          !selectedItemIds.includes(result.itemId ?? '')) {
-        return false
+      if (normalizedSelectedItemIds) {
+        const resultKeys = getResultFilterKeys(result)
+        const hasMatch = resultKeys.some(key => normalizedSelectedItemIds.has(key))
+        if (!hasMatch) return false
       }

       return true
     });

-    console.log('Filtered results:', {
-      inputCount: results.length,
-      filteredCount: filtered.length,
-      firstFiltered: filtered[0],
-      lastFiltered: filtered[filtered.length - 1]
-    });
-
     return filtered;
   }, [results, filters, selectedItemIds]);

@@ -264,4 +273,4 @@ export function EvaluationTaskScoreResults({
   )
-}
\ No newline at end of file
+}
diff --git a/dashboard/components/__tests__/EvaluationTask.category-filter.test.tsx b/dashboard/components/__tests__/EvaluationTask.category-filter.test.tsx
index a3c17aa53..c4963a9e6 100644
--- a/dashboard/components/__tests__/EvaluationTask.category-filter.test.tsx
+++ b/dashboard/components/__tests__/EvaluationTask.category-filter.test.tsx
@@ -128,6 +128,61 @@ const makeTask = () => {
   } as any
 }

+const makeTaskWithScoreResultIdOnly = () => {
+  const task = makeTask()
+  task.data.parameters = JSON.stringify({
+    root_cause: {
+      misclassification_analysis: {
+        category_totals: {
+          information_gap: 1,
+        },
+        item_classifications_all: [
+          {
+            score_result_id: 'sr-1',
+            primary_category: 'information_gap',
+            confidence: 'medium',
+            rationale_full: 'Matched only by score result id.',
+          },
+        ],
+        category_summaries: {
+          information_gap: {
+            category_summary_text: 'Score result id only linkage.',
+            item_count: 1,
+          },
+        },
+      },
+    },
+  })
+  return task
+}
+
+const makeTaskWithMissingCategoryLinkage = () => {
+  const task = makeTask()
+  task.data.parameters = JSON.stringify({
+    root_cause: {
+      misclassification_analysis: {
+        category_totals: {
+          information_gap: 1,
+        },
+        item_classifications_all: [
+          {
+            primary_category: 'information_gap',
+            confidence: 'medium',
+            rationale_full: 'No linkage ids on this row.',
+          },
+        ],
+        category_summaries: {
+          information_gap: {
+            category_summary_text: 'No linkage ids available.',
+            item_count: 1,
+          },
+        },
+      },
+    },
+  })
+  return task
+}
+
 describe('EvaluationTask category summary drill-down', () => {
   test('applies category filter and auto-selects first matching score result', async () => {
     const onSelectScoreResult = jest.fn()
     render()

     fireEvent.click(screen.getByRole('button', { name: /View items \(1\)/i }))

     expect(screen.getByText('Filtered by category: Information gap')).toBeInTheDocument()
-    expect(screen.getByTestId('selected-item-ids')).toHaveTextContent('["item-1"]')
+    expect(screen.getByTestId('selected-item-ids')).toHaveTextContent('["sr-1"]')
     expect(onSelectScoreResult).toHaveBeenCalledWith('sr-1')

     fireEvent.click(screen.getByRole('button', { name: /Clear category filter/i }))
@@ -147,6 +202,24 @@
     })
   })

+  test('filters by score_result_id linkage when item_id is unavailable', async () => {
+    const onSelectScoreResult = jest.fn()
+    render()
+
+    fireEvent.click(screen.getByRole('button', { name: /View items \(1\)/i }))
+
+    expect(screen.getByTestId('selected-item-ids')).toHaveTextContent('["sr-1"]')
+    expect(onSelectScoreResult).toHaveBeenCalledWith('sr-1')
+  })
+
+  test('applies empty category filter when linkage ids are missing', async () => {
+    render()
+
+    fireEvent.click(screen.getByRole('button', { name: /View items \(1\)/i }))
+
+    expect(screen.getByTestId('selected-item-ids')).toHaveTextContent('[]')
+  })
+
   test('renders score version and procedure related-resource cards in detail view', async () => {
     const { container } = render()
diff --git a/dashboard/components/ui/task-status.tsx b/dashboard/components/ui/task-status.tsx
index 0a5686dbd..bb99251b5 100644
--- a/dashboard/components/ui/task-status.tsx
+++ b/dashboard/components/ui/task-status.tsx
@@ -385,7 +385,10 @@ export const TaskStatus = React.memo(({
   return (
-
+
{command && commandDisplay !== 'hide' && (
{command && commandDisplay !== 'hide' && (
diff --git a/plexus/score_rubric_consistency.py b/plexus/score_rubric_consistency.py
new file mode 100644
--- /dev/null
+++ b/plexus/score_rubric_consistency.py
+    def to_parameters_payload(self) -> Dict[str, Any]:
+        return asdict(self)
+
+
+class ScoreRubricConsistencyService:
+    """Generate a concise score-code vs rubric consistency assessment."""
+
+    DEFAULT_MODEL = "gpt-5-mini"
+    VALID_STATUSES = {"consistent", "potential_conflict", "inconclusive"}
+
+    def __init__(
+        self,
+        *,
+        invoke_model: Optional[Callable[[str, str], str]] = None,
+        model: str = DEFAULT_MODEL,
+    ):
+        self._invoke_model = invoke_model or self._invoke_openai
+        self._model = model
+
+    def generate(self, request: ScoreRubricConsistencyRequest) -> ScoreRubricConsistencyResult:
+        prompt = self._build_prompt(request)
+        raw_text = self._invoke_model(prompt, self._model)
+        try:
+            parsed = self._parse_response(raw_text)
+        except json.JSONDecodeError:
+            repair_prompt = (
+                f"{prompt}\n\nYour prior response was not valid JSON:\n"
+                f"{_truncate(raw_text or '(empty response)', 1000)}\n\n"
+                "Return ONLY valid JSON with exactly these keys: status, paragraph."
+            )
+            raw_text = self._invoke_model(repair_prompt, self._model)
+            parsed = self._parse_response(raw_text)
+        status = str(parsed.get("status") or "inconclusive").strip()
+        if status not in self.VALID_STATUSES:
+            status = "inconclusive"
+        paragraph = _compact_paragraph(str(parsed.get("paragraph") or ""))
+        if not paragraph:
+            paragraph = "The consistency check did not produce a usable assessment."
+            status = "inconclusive"
+        return ScoreRubricConsistencyResult(
+            scorecard_identifier=request.scorecard_identifier,
+            score_identifier=request.score_identifier,
+            score_version_id=request.score_version_id,
+            status=status,
+            paragraph=paragraph,
+            checked_at=datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+            model=self._model,
+            diagnostics={
+                "rubric_characters": len(request.rubric_text or ""),
+                "score_code_characters": len(request.score_code or ""),
+                "item_context_characters": len(request.item_text or ""),
+            },
+        )
+
+    def generate_from_api(
+        self,
+        *,
+        client: Any,
+        scorecard_identifier: str,
+        score_identifier: str,
+        score_id: str,
+        score_version_id: str,
+        item_text: str = "",
+    ) -> ScoreRubricConsistencyResult:
+        version = fetch_score_version_for_consistency(client, score_version_id)
+        return self.generate(
+            ScoreRubricConsistencyRequest(
+                scorecard_identifier=scorecard_identifier,
+                score_identifier=score_identifier,
+                score_version_id=score_version_id,
+                rubric_text=version.get("guidelines") or "",
+                score_code=version.get("configuration") or "",
+                item_text=item_text or "",
+            )
+        )
+
+    def _build_prompt(self, request: ScoreRubricConsistencyRequest) -> str:
+        item_section = ""
+        if request.item_text:
+            item_section = (
+                "\nOptional item context for a spot-check:\n"
+                f"{_truncate(request.item_text, 4000)}\n"
+            )
+        return (
+            "You are checking one Plexus ScoreVersion before evaluation.\n"
+            "Compare the score code/prompt against the rubric text stored on the same ScoreVersion.\n"
+            "Identify only meaningful policy mismatches that could affect evaluation results. "
+            "Do not critique style, formatting, implementation architecture, or missing tests.\n\n"
+            "Return ONLY JSON with exactly these keys:\n"
+            '  "status": one of "consistent", "potential_conflict", "inconclusive"\n'
+            '  "paragraph": one short paragraph, 2-4 sentences, no headings or bullets\n\n'
+            f"Scorecard: {request.scorecard_identifier}\n"
+            f"Score: {request.score_identifier}\n"
+            f"ScoreVersion: {request.score_version_id}\n\n"
+            f"Rubric text:\n{_truncate(request.rubric_text, 12000)}\n\n"
+            f"Score code/configuration:\n{_truncate(request.score_code, 16000)}\n"
+            f"{item_section}"
+        )
+
+    def _parse_response(self, text: str) -> Dict[str, Any]:
+        cleaned = (text or "").strip()
+        if "```" in cleaned:
+            match = re.search(r"```(?:json)?\s*([\s\S]*?)```", cleaned)
+            if match:
+                cleaned = match.group(1).strip()
+        obj_match = re.search(r"\{[\s\S]*\}", cleaned)
+        if obj_match:
+            cleaned = obj_match.group(0)
+        return json.loads(cleaned)
+
+    def _invoke_openai(self, prompt: str, model: str) -> str:
+        from dotenv import load_dotenv
+        from openai import OpenAI
+
+        load_dotenv(override=False)
+        client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
+        response = client.responses.create(
+            model=model,
+            reasoning={"effort": "low"},
+            input=[{"role": "user", "content": prompt}],
+            max_output_tokens=2000,
+        )
+        return (response.output_text or "").strip()
+
+
+def fetch_score_version_for_consistency(client: Any, score_version_id: str) -> Dict[str, Any]:
+    query = """
+    query GetScoreVersionForRubricConsistency($id: ID!) {
+        getScoreVersion(id: $id) {
+            id
+            configuration
+            guidelines
+            note
+            score {
+                id
+                name
+            }
+        }
+    }
+    """
+    result = client.execute(query, {"id": score_version_id})
+    version = (result or {}).get("getScoreVersion")
+    if not version:
+        raise ValueError(f"ScoreVersion not found: {score_version_id}")
+    return version
+
+
+def merge_consistency_result_into_parameters(
+    parameters: Any,
+    result: ScoreRubricConsistencyResult,
+) -> Dict[str, Any]:
+    if isinstance(parameters, str):
+        try:
+            merged = json.loads(parameters) if parameters else {}
+        except Exception:
+            merged = {}
+    elif isinstance(parameters, dict):
+        merged = dict(parameters)
+    else:
+        merged = {}
+    merged["score_rubric_consistency_check"] = result.to_parameters_payload()
+    return merged
+
+
+def _truncate(value: str, limit: int) -> str:
+    value = value or ""
+    if len(value) <= limit:
+        return value
+    return value[:limit] + "\n...[truncated]"
+
+
+def _compact_paragraph(value: str) -> str:
+    value = re.sub(r"\s+", " ", value or "").strip()
+    return value[:1200]
diff --git a/plexus/score_rubric_consistency_test.py b/plexus/score_rubric_consistency_test.py
new file mode 100644
index 000000000..e8c6301e9
--- /dev/null
+++ b/plexus/score_rubric_consistency_test.py
@@ -0,0 +1,93 @@
+import json
+
+from plexus.score_rubric_consistency import (
+    ScoreRubricConsistencyRequest,
+    ScoreRubricConsistencyService,
+    merge_consistency_result_into_parameters,
+)
+
+
+def test_score_rubric_consistency_service_returns_compact_payload():
+    def invoke(prompt: str, model: str) -> str:
+        assert "Score code/configuration" in prompt
+        assert model == "test-model"
+        return json.dumps(
+            {
+                "status": "potential_conflict",
+                "paragraph": (
+                    "The rubric says two missing dosages should fail, but the prompt allows "
+                    "two missing current medications. This may make the score more permissive "
+                    "than the rubric during evaluation."
+                ),
+            }
+        )
+
+    result = ScoreRubricConsistencyService(
+        invoke_model=invoke,
+        model="test-model",
+    ).generate(
+        ScoreRubricConsistencyRequest(
+            scorecard_identifier="Scorecard",
+            score_identifier="Medication Review: Dosage",
+            score_version_id="version-1",
+            rubric_text="Fail when two or more current meds lack dosage.",
+            score_code="Pass when no more than two meds lack dosage.",
+        )
+    )
+
+    assert result.status == "potential_conflict"
+    assert result.score_version_id == "version-1"
+    assert "more permissive than the rubric" in result.paragraph
+    assert result.diagnostics["rubric_characters"] > 0
+
+
+def test_merge_consistency_result_into_parameters_preserves_existing_fields():
+    service = ScoreRubricConsistencyService(
+        invoke_model=lambda _prompt, _model: json.dumps(
+            {"status": "consistent", "paragraph": "The score and rubric match."}
+        )
+    )
+    result = service.generate(
+        ScoreRubricConsistencyRequest(
+            scorecard_identifier="Scorecard",
+            score_identifier="Score",
+            score_version_id="version-1",
+            rubric_text="Rubric",
+            score_code="Code",
+        )
+    )
+
+    merged = merge_consistency_result_into_parameters(
+        json.dumps({"days": 90}),
+        result,
+    )
+
+    assert merged["days"] == 90
+    assert merged["score_rubric_consistency_check"]["status"] == "consistent"
+    assert merged["score_rubric_consistency_check"]["score_version_id"] == "version-1"
+
+
+def test_score_rubric_consistency_retries_invalid_json_once():
+    calls = []
+
+    def invoke(prompt: str, _model: str) -> str:
+        calls.append(prompt)
+        if len(calls) == 1:
+            return ""
+        return json.dumps(
+            {"status": "consistent", "paragraph": "The score code follows the rubric."}
+        )
+
+    result = ScoreRubricConsistencyService(invoke_model=invoke).generate(
+        ScoreRubricConsistencyRequest(
+            scorecard_identifier="Scorecard",
+            score_identifier="Score",
+            score_version_id="version-1",
+            rubric_text="Rubric",
+            score_code="Code",
+        )
+    )
+
+    assert result.status == "consistent"
+    assert len(calls) == 2
+    assert "prior response was not valid JSON" in calls[1]
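
Usage sketch (not part of the diff): how the new preflight service composes end to end, wiring ScoreRubricConsistencyService, ScoreRubricConsistencyRequest, and merge_consistency_result_into_parameters together the same way the new tests do. The stub_invoke callable and every identifier value below are illustrative stand-ins, not part of the change; in production the service would instead call its built-in OpenAI-backed _invoke_openai.

import json

from plexus.score_rubric_consistency import (
    ScoreRubricConsistencyRequest,
    ScoreRubricConsistencyService,
    merge_consistency_result_into_parameters,
)


def stub_invoke(prompt: str, model: str) -> str:
    # Illustrative stand-in for the model call; returns the status/paragraph
    # JSON shape that _build_prompt asks the model to emit.
    return json.dumps(
        {"status": "consistent", "paragraph": "The score code follows the rubric."}
    )


service = ScoreRubricConsistencyService(invoke_model=stub_invoke, model="stub-model")
result = service.generate(
    ScoreRubricConsistencyRequest(
        scorecard_identifier="Example Scorecard",  # illustrative identifiers only
        score_identifier="Example Score",
        score_version_id="version-123",
        rubric_text="Fail when two or more current meds lack dosage.",
        score_code="Pass when no more than two meds lack dosage.",
    )
)

# Merge the assessment onto an Evaluation.parameters JSON string, as the
# feedback runner would after --score-rubric-consistency-check is passed through.
merged = merge_consistency_result_into_parameters(json.dumps({"notes": "Baseline"}), result)
print(merged["score_rubric_consistency_check"]["status"])  # e.g. "consistent"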