26 changes: 22 additions & 4 deletions MCP/tools/evaluation/evaluations.py
@@ -345,6 +345,7 @@ async def plexus_evaluation_run(
use_score_associated_dataset: bool = False,
batch: Optional[List[Dict[str, Any]]] = None,
notes: Optional[str] = None,
score_rubric_consistency_check: bool = False,
) -> str:
"""
Run an evaluation using the same code path as CLI.
@@ -391,6 +392,9 @@ async def plexus_evaluation_run(
the evaluation's parameters JSON under the "notes" key. Useful for recording
context like "Baseline: deterministic accuracy dataset" or
"Iteration 3: Added example for transfer-request edge case".
- score_rubric_consistency_check: Feedback evaluations only. When True, run a
preflight check that compares the evaluated ScoreVersion code against its
own rubric and store the resulting paragraph on Evaluation.parameters.

Returns:
- JSON string with evaluation results including evaluation_id, metrics, and dashboard URL.
@@ -434,19 +438,27 @@ def _apply_notes_to_evaluation(evaluation_id: str, notes_text: str) -> None:
batch_list = list(batch)
logger.info(f"Batch evaluation: dispatching {len(batch_list)} evaluations in parallel")
for i, item in enumerate(batch_list):
logger.info(f" Batch item {i+1}: type={item.get('evaluation_type')}, score={item.get('score_name')}, wait={item.get('wait')}")
logger.info(
f" Batch item {i + 1}: type={item.get('evaluation_type')}, "
f"score={item.get('score_name')}, wait={item.get('wait')}"
)
tasks = [plexus_evaluation_run(**item) for item in batch_list]
raw_results = await _asyncio.gather(*tasks, return_exceptions=True)
output = []
for i, r in enumerate(raw_results):
if isinstance(r, Exception):
logger.error(f"Batch item {i+1} raised exception: {type(r).__name__}: {r}", exc_info=r)
logger.error(
f"Batch item {i + 1} raised exception: {type(r).__name__}: {r}",
exc_info=r,
)
output.append({"error": f"{type(r).__name__}: {r}"})
else:
try:
output.append(json.loads(r))
except Exception as parse_exc:
logger.error(f"Batch item {i+1} result parse error: {parse_exc}, raw={str(r)[:500]}")
logger.error(
f"Batch item {i + 1} result parse error: {parse_exc}, raw={str(r)[:500]}"
)
output.append({"error": "Could not parse result", "raw": str(r)})
logger.info(f"Batch evaluation complete: {len(output)} results")
# Post-batch notes application: apply notes to each eval sequentially
@@ -459,7 +471,9 @@ def _apply_notes_to_evaluation(evaluation_id: str, notes_text: str) -> None:
if eval_id:
_apply_notes_to_evaluation(eval_id, item_notes)
else:
logger.warning(f"Batch item {i+1}: no eval_id in result, cannot apply notes")
logger.warning(
f"Batch item {i + 1}: no eval_id in result, cannot apply notes"
)
return json.dumps(output)

if not scorecard_name:
@@ -576,6 +590,8 @@ def _spawn_feedback(
cmd += ["--max-category-summary-items", str(max_category_items)]
if runner_task_id:
cmd += ["--task-id", runner_task_id]
if score_rubric_consistency_check:
cmd += ["--score-rubric-consistency-check"]
subprocess.Popen(
cmd,
stdout=subprocess.DEVNULL,
@@ -699,6 +715,8 @@ def _spawn_feedback(
fb_args.extend(['--sample-seed', str(sample_seed)])
if notes:
fb_args.extend(['--notes', notes])
if score_rubric_consistency_check:
fb_args.append('--score-rubric-consistency-check')
# When a specific version is requested, yaml mode must be disabled.
effective_yaml = yaml and not resolved_version
if effective_yaml:
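
The new flag flows from the MCP tool signature straight through to the spawned CLI process, so a caller only needs to set one keyword argument. Below is a minimal usage sketch, assuming `plexus_evaluation_run` from this file is in scope and accepts an `evaluation_type` keyword (the batch items above pass `evaluation_type`, `score_name`, and `wait`); the scorecard and score names are placeholders, not values from this PR.

```python
import asyncio

# Usage sketch only: scorecard/score names are illustrative, and evaluation_type
# is assumed from the batch-item keys used earlier in this file.
async def run_feedback_eval_with_rubric_check() -> str:
    return await plexus_evaluation_run(
        evaluation_type="feedback",           # the rubric check applies to feedback evaluations only
        scorecard_name="Example Scorecard",   # placeholder
        score_name="Example Score",           # placeholder
        notes="Baseline: deterministic accuracy dataset",
        score_rubric_consistency_check=True,  # appends --score-rubric-consistency-check to the spawned CLI command
    )

if __name__ == "__main__":
    print(asyncio.run(run_feedback_eval_with_rubric_check()))
```

When the flag is set, the feedback runner receives `--score-rubric-consistency-check` and the resulting summary is stored on `Evaluation.parameters`, which is where the dashboard changes below pick it up.
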
6 changes: 3 additions & 3 deletions MCP/tools/score/score_update_guidelines_test.py
@@ -45,7 +45,7 @@ def test_guidelines_only_update_none_handling(self):
'configuration': current_version_data['configuration'], # Preserved existing code
'guidelines': update_params['guidelines'], # New guidelines
'note': update_params['version_note'],
'isFeatured': "true",
'isFeatured': "false",
'parentVersionId': 'version-current'
}

@@ -241,7 +241,7 @@ def simulate_version_input_creation(score_id, code_content, guidelines, note, pa
'scoreId': score_id,
'configuration': (code_content or '').strip(), # Fixed to handle None
'note': note or 'Updated via MCP score update tool',
'isFeatured': "true"
'isFeatured': "false"
}

# Add guidelines if provided (fixed logic)
@@ -357,7 +357,7 @@ def simulate_complete_workflow():
'scoreId': score_data['id'],
'configuration': (code or '').strip(),
'note': update_params['version_note'] or 'Updated via MCP',
'isFeatured': "true"
'isFeatured': "false"
}

if guidelines:
2 changes: 1 addition & 1 deletion MCP/tools/score/scores.py
@@ -2682,7 +2682,7 @@ async def _create_version_from_code_with_parent(
'scoreId': score.id,
'configuration': (code_content or '').strip(),
'note': note or 'Updated via MCP score update tool',
'isFeatured': "true" # Mark as featured by default
'isFeatured': "false"
}

# Add guidelines if provided
4 changes: 2 additions & 2 deletions MCP/tools/score/scores_test.py
@@ -1670,14 +1670,14 @@ def test_create_version_with_parent_version_input(self):
'configuration': code_content.strip(),
'guidelines': guidelines.strip(),
'note': note,
'isFeatured': "true",
'isFeatured': "false",
'parentVersionId': parent_version_id # Should set parent relationship
}

# Verify parent relationship is established
assert expected_version_input['parentVersionId'] == parent_version_id
assert expected_version_input['scoreId'] == score_id
assert expected_version_input['isFeatured'] == "true"
assert expected_version_input['isFeatured'] == "false"

def test_create_version_with_parent_error_handling(self):
"""Test error handling in _create_version_from_code_with_parent"""
127 changes: 112 additions & 15 deletions dashboard/components/EvaluationTask.tsx
@@ -45,6 +45,48 @@ const parseJsonDeep = (value: unknown): unknown => {
return current
}

const toNormalizedId = (value: unknown): string | null => {
if (value === null || value === undefined) return null
const normalized = String(value).trim()
return normalized.length > 0 ? normalized : null
}

const getScoreResultFilterKeys = (result: ScoreResultData): string[] => {
const keys = new Set<string>()
const directId = toNormalizedId(result.id)
if (directId) keys.add(directId)
const itemId = toNormalizedId(result.itemId)
if (itemId) keys.add(itemId)
const metadataItemId = toNormalizedId((result as any)?.metadata?.item_id)
if (metadataItemId) keys.add(metadataItemId)
const feedbackItemId = toNormalizedId((result as any)?.feedbackItem?.id)
if (feedbackItemId) keys.add(feedbackItemId)
const metadataFeedbackItemId = toNormalizedId((result as any)?.metadata?.feedback_item_id)
if (metadataFeedbackItemId) keys.add(metadataFeedbackItemId)

if (Array.isArray(result.itemIdentifiers)) {
result.itemIdentifiers.forEach((identifier: any) => {
const value = toNormalizedId(identifier?.value)
if (value) keys.add(value)
})
}

return Array.from(keys)
}

const collectClassificationLinkageIds = (classification: {
score_result_id?: string
item_id?: string
feedback_item_id?: string
}): string[] => {
const ids = [
toNormalizedId(classification.score_result_id),
toNormalizedId(classification.item_id),
toNormalizedId(classification.feedback_item_id),
].filter((id): id is string => id !== null)
return ids
}

export interface EvaluationMetric {
name: string
value: number
@@ -217,6 +259,7 @@ type MisclassificationCategorySummary = {
category_summary_text?: string
top_patterns?: Array<{ pattern?: string; count?: number }>
representative_evidence?: Array<{
score_result_id?: string
feedback_item_id?: string
item_id?: string
source?: string
@@ -269,6 +312,7 @@ type MisclassificationAnalysis = {
item_classifications_all?: Array<{
topic_id?: number | string
topic_label?: string
score_result_id?: string
feedback_item_id?: string
item_id?: string
timestamp?: string
@@ -941,10 +985,16 @@ const DetailContent = React.memo(({
onSelectScoreResult?.(result.id)
}

const selectFirstFilteredScoreResult = (itemIds: string[]) => {
const firstItemId = itemIds.find(Boolean)
if (!firstItemId) return
const matching = parsedScoreResults.find(result => result.itemId === firstItemId)
const selectFirstFilteredScoreResult = (filterIds: string[]) => {
const normalizedFilterIds = new Set(
filterIds
.map(id => toNormalizedId(id))
.filter((id): id is string => id !== null)
)
if (normalizedFilterIds.size === 0) return
const matching = parsedScoreResults.find(result =>
getScoreResultFilterKeys(result).some(key => normalizedFilterIds.has(key))
)
if (matching) {
onSelectScoreResult?.(matching.id)
}
@@ -975,26 +1025,42 @@ const DetailContent = React.memo(({
classification => classification.primary_category === categoryKey
)

const itemIds: string[] = []
const linkageIds: string[] = []
let missingCount = 0

filteredClassifications.forEach(classification => {
if (!classification.item_id) {
const classificationIds = collectClassificationLinkageIds(classification)
if (classificationIds.length === 0) {
missingCount += 1
return
}
linkageIds.push(...classificationIds)
})

itemIds.push(classification.item_id)
const summaryEvidence = misclassificationCategoryBreakdown.categorySummaries?.[categoryKey]?.representative_evidence ?? []
summaryEvidence.forEach(evidence => {
linkageIds.push(
...[
toNormalizedId(evidence.score_result_id),
toNormalizedId(evidence.item_id),
toNormalizedId(evidence.feedback_item_id),
].filter((id): id is string => id !== null)
)
})

const normalizedLinkageIds = new Set(linkageIds.map(id => toNormalizedId(id)).filter((id): id is string => id !== null))
const selectedScoreResultIds = parsedScoreResults
.filter(result => getScoreResultFilterKeys(result).some(key => normalizedLinkageIds.has(key)))
.map(result => String(result.id).trim())

setSelectedTopicItemIds(null)
setSelectedTopicLabel(null)
setSelectedCategoryKey(categoryKey)
setSelectedCategoryLabel(categoryLabel)
setSelectedCategoryItemIds(Array.from(new Set(itemIds)))
setSelectedCategoryItemIds(Array.from(new Set(selectedScoreResultIds)))
setCategoryMissingItemIdCount(missingCount)
setSelectedPredictedActual({ predicted: null, actual: null })
selectFirstFilteredScoreResult(itemIds)
selectFirstFilteredScoreResult(selectedScoreResultIds)
}

const clearCategoryFilter = () => {
@@ -1067,6 +1133,15 @@ const DetailContent = React.memo(({

const rootCauseTopics = rootCauseData?.topics ?? null
const misclassificationAnalysis = rootCauseData?.misclassification_analysis ?? null
const scoreRubricConsistencyCheck = useMemo(() => {
try {
const params = parseJsonDeep(data.parameters) as Record<string, unknown> | null
const check = params?.score_rubric_consistency_check
return (check && typeof check === 'object') ? check as Record<string, unknown> : null
} catch {
return null
}
}, [data.parameters])
const rcaCoverage = useMemo(() => {
try {
const params = parseJsonDeep(data.parameters) as Record<string, unknown> | null
@@ -1531,6 +1606,23 @@ const DetailContent = React.memo(({
</div>
)}

{scoreRubricConsistencyCheck && (
<Alert className="mt-4">
<AlertTriangle className="h-4 w-4" />
<AlertTitle className="text-sm">
Score/rubric consistency
{typeof scoreRubricConsistencyCheck.status === 'string'
? `: ${scoreRubricConsistencyCheck.status}`
: ''}
</AlertTitle>
<AlertDescription className="text-sm">
{typeof scoreRubricConsistencyCheck.paragraph === 'string'
? scoreRubricConsistencyCheck.paragraph
: 'No consistency summary was generated.'}
</AlertDescription>
</Alert>
)}

{/* Score-Configuration RCA */}
{(rootCauseData && (
(rootCauseTopics && rootCauseTopics.length > 0) ||
@@ -1659,12 +1751,17 @@ const DetailContent = React.memo(({
const summary = misclassificationCategoryBreakdown.categorySummaries?.[row.key]
const summaryText = summary?.category_summary_text
const patterns = Array.isArray(summary?.top_patterns) ? summary?.top_patterns : []
const itemCount = summary?.item_count ?? 0
const categoryClassifications = (misclassificationCategoryBreakdown.itemClassifications ?? [])
.filter(classification => classification.primary_category === row.key)
const itemCount = summary?.item_count ?? categoryClassifications.length ?? 0
const itemsWithMissingId = categoryClassifications
.filter(classification => !classification.item_id)
.filter(classification => (
!toNormalizedId(classification.item_id)
&& !toNormalizedId(classification.feedback_item_id)
&& !toNormalizedId(classification.score_result_id)
))
.length
if (itemCount <= 0) return null
return (
<div key={`category-summary-${row.key}`} className="rounded-md bg-muted/40 p-2 space-y-1.5">
<div className="flex items-center justify-between gap-2 mb-1">
@@ -1675,7 +1772,7 @@ const DetailContent = React.memo(({
<span className="text-xs text-muted-foreground shrink-0">{itemCount} item(s)</span>
</div>
<div className="text-xs text-foreground">
{summaryText || 'No items in this category for this run.'}
{summaryText || 'Summary unavailable for this category.'}
</div>
{patterns.length > 0 && (
<div className="mt-1 text-xs text-muted-foreground">
Expand All @@ -1701,14 +1798,14 @@ const DetailContent = React.memo(({
</Button>
{selectedCategoryKey === row.key && categoryMissingItemIdCount > 0 && (
<span className="text-[11px] text-muted-foreground">
{categoryMissingItemIdCount} item(s) missing item_id not shown
{categoryMissingItemIdCount} item(s) missing linkage ids not shown
</span>
)}
</div>
)}
{itemsWithMissingId > 0 && selectedCategoryKey !== row.key && (
<div className="text-[11px] text-muted-foreground">
{itemsWithMissingId} item(s) in this category are missing item_id and cannot appear in score results.
{itemsWithMissingId} item(s) in this category are missing linkage ids and cannot appear in score results.
</div>
)}
</div>
@@ -2709,7 +2806,6 @@
</div>
{variant !== 'detail' && evaluationNotes && (
<div className="mt-1">
<div className="mb-1 text-xs font-medium text-foreground">Note</div>
<div className="prose prose-sm max-w-none text-muted-foreground prose-p:text-muted-foreground prose-strong:text-foreground prose-headings:text-muted-foreground prose-li:text-muted-foreground prose-code:text-foreground prose-pre:text-foreground prose-pre:bg-muted">
<ReactMarkdown remarkPlugins={[remarkGfm, remarkBreaks]} components={{
p: ({children}) => <p className="mb-1 last:mb-0 text-sm">{children}</p>,
@@ -2886,6 +2982,7 @@
)}
{evaluationNotes && (
<div className="mt-1">
<div className="mb-1 text-xs font-medium text-foreground">Note</div>
<div className="prose prose-sm max-w-none text-muted-foreground prose-p:text-muted-foreground prose-strong:text-foreground prose-headings:text-muted-foreground prose-li:text-muted-foreground prose-code:text-foreground prose-pre:text-foreground prose-pre:bg-muted">
<ReactMarkdown remarkPlugins={[remarkGfm, remarkBreaks]} components={{
p: ({children}) => <p className="mb-1 last:mb-0 text-sm">{children}</p>,
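
On the dashboard side, `EvaluationTask.tsx` parses `Evaluation.parameters`, looks for a `score_rubric_consistency_check` object, and renders its `status` in the alert title and its `paragraph` in the alert body. A hedged sketch of what that parameters payload might look like, written as a Python dict since the Python tooling assembles it; only the key names come from the code in this PR, and every value is invented for illustration.

```python
# Illustrative shape of Evaluation.parameters as the dashboard reads it.
# Key names ("notes", "score_rubric_consistency_check", "status", "paragraph")
# appear in this PR; all values below are made up.
example_evaluation_parameters = {
    "notes": "Iteration 3: Added example for transfer-request edge case",
    "score_rubric_consistency_check": {
        # Rendered in the AlertTitle when it is a string.
        "status": "needs_review",
        # Rendered in the AlertDescription; the component falls back to
        # "No consistency summary was generated." when this is missing.
        "paragraph": (
            "The score code penalizes partial matches that the rubric explicitly "
            "allows; consider aligning the implementation with the rubric."
        ),
    },
}
```
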