From cd173e520669a104415d43d0b7a85a99128190c0 Mon Sep 17 00:00:00 2001
From: "Charles C. Figueiredo"
Date: Thu, 23 Apr 2026 23:09:29 -0400
Subject: [PATCH 1/2] Add context quality scoring

---
 ROADMAP.md                                    |   2 +-
 .../__tests__/context-quality-table.test.tsx  |  48 ++++
 .../__tests__/maturity-summary.test.tsx       |   1 +
 .../maturity/context-quality-table.tsx        | 117 ++++++++++
 frontend/src/pages/maturity.tsx               |   6 +-
 frontend/src/types/api.ts                     |  21 ++
 src/primer/common/schemas.py                  |  21 ++
 .../server/services/maturity_service.py       | 207 ++++++++++++++++--
 tests/test_maturity.py                        |  68 ++++++
 9 files changed, 476 insertions(+), 15 deletions(-)
 create mode 100644 frontend/src/components/maturity/__tests__/context-quality-table.test.tsx
 create mode 100644 frontend/src/components/maturity/context-quality-table.tsx

diff --git a/ROADMAP.md b/ROADMAP.md
index efe926c..7114565 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -193,7 +193,7 @@ Derived data pipelines, performance optimization, durable background jobs, enter
 - [x] [P1] High-performer agent stack analysis: which combinations of MCPs, skills, commands, and subagents differentiate top performers
 - [x] [P0] Per-tool success rate tracking with compound reliability computation (10 steps at 99% = 90.4% end-to-end)
 - [x] [P0] Harness configuration fingerprinting from session telemetry (tools, context files, permissions, customizations)
-- [ ] [P1] Context quality scoring: AGENTS.md freshness, token efficiency, guide/sensor coverage
+- [x] [P1] Context quality scoring: AGENTS.md freshness, token efficiency, guide/sensor coverage
 - [ ] [P1] Harness evolution timeline: before/after correlation of configuration changes with outcome changes
 - [ ] [P1] Harnessability scoring per project: documentation quality, typing strength, module boundaries
 - [ ] [P1] Paragon's 4-dimension evaluation: tool correctness, tool usage accuracy, task completion, task efficiency
diff --git a/frontend/src/components/maturity/__tests__/context-quality-table.test.tsx b/frontend/src/components/maturity/__tests__/context-quality-table.test.tsx
new file mode 100644
index 0000000..39f9ede
--- /dev/null
+++ b/frontend/src/components/maturity/__tests__/context-quality-table.test.tsx
@@ -0,0 +1,48 @@
+import { render, screen } from "@testing-library/react"
+
+import { ContextQualityTable } from "@/components/maturity/context-quality-table"
+
+describe("ContextQualityTable", () => {
+  it("renders an empty state", () => {
+    render(<ContextQualityTable rows={[]} />)
+
+    expect(screen.getByText("No context quality signals available yet.")).toBeInTheDocument()
+  })
+
+  it("renders context quality rows", () => {
+    render(
+      <ContextQualityTable
+        rows={[
+          {
+            repository: "acme/api",
+            session_count: 12,
+            context_quality_score: 82.8,
+            guide_coverage_score: 80,
+            guide_freshness_score: 100,
+            token_efficiency_score: 90,
+            sensor_coverage_score: 71,
+            cache_hit_rate: 0.42,
+            avg_input_tokens: 18500,
+            context_usage_coverage_pct: 50,
+            tool_coverage_pct: 100,
+            model_coverage_pct: 75,
+            facet_coverage_pct: 50,
+            has_claude_md: true,
+            has_agents_md: false,
+            readiness_checked_at: null,
+            top_gaps: ["Add AGENTS.md", "Complete outcome facets"],
+          },
+        ]}
+      />,
+    )
+
+    expect(screen.getByText("acme/api")).toBeInTheDocument()
+    expect(screen.getByText("12 sessions")).toBeInTheDocument()
+    expect(screen.getByText("82.8")).toBeInTheDocument()
+    expect(screen.getByText("100 fresh")).toBeInTheDocument()
+    expect(screen.getByText("42% cache, 18.5K avg input")).toBeInTheDocument()
+    expect(screen.getByText("50% context / 75% models")).toBeInTheDocument()
+    expect(screen.getByText("Add AGENTS.md")).toBeInTheDocument()
+    expect(screen.getByText("Complete outcome facets")).toBeInTheDocument()
+  })
+})
diff --git a/frontend/src/components/maturity/__tests__/maturity-summary.test.tsx b/frontend/src/components/maturity/__tests__/maturity-summary.test.tsx
index 6d3fe94..eb29338 100644
--- a/frontend/src/components/maturity/__tests__/maturity-summary.test.tsx
+++ b/frontend/src/components/maturity/__tests__/maturity-summary.test.tsx
@@ -28,6 +28,7 @@ describe("MaturitySummary", () => {
         agent_team_modes: [],
         customization_outcomes: [],
         project_readiness: [],
+        context_quality: [],
       }}
     />,
   )
diff --git a/frontend/src/components/maturity/context-quality-table.tsx b/frontend/src/components/maturity/context-quality-table.tsx
new file mode 100644
index 0000000..0a58584
--- /dev/null
+++ b/frontend/src/components/maturity/context-quality-table.tsx
@@ -0,0 +1,117 @@
+import { Badge } from "@/components/ui/badge"
+import { Card, CardContent, CardHeader } from "@/components/ui/card"
+import { formatMetric, formatPercent, formatTokens } from "@/lib/utils"
+import type { ContextQualityEntry } from "@/types/api"
+
+interface ContextQualityTableProps {
+  rows: ContextQualityEntry[]
+}
+
+function formatCoverage(value: number): string {
+  return `${value.toFixed(0)}%`
+}
+
+export function ContextQualityTable({ rows }: ContextQualityTableProps) {
+  if (rows.length === 0) {
+    return (
+      <Card>
+        <CardHeader>
+          <h3 className="font-semibold">Context Quality</h3>
+        </CardHeader>
+        <CardContent>
+          <p className="text-sm text-muted-foreground">
+            No context quality signals available yet.
+          </p>
+        </CardContent>
+      </Card>
+    )
+  }
+
+  return (
+    <Card>
+      <CardHeader>
+        <h3 className="font-semibold">Context Quality</h3>
+        <p className="text-sm text-muted-foreground">
+          Scores project context by guidance coverage, freshness, token efficiency, and telemetry
+          sensor coverage.
+        </p>
+      </CardHeader>
+      <CardContent>
+        <table className="w-full text-sm">
+          <thead>
+            <tr className="text-left text-muted-foreground">
+              <th>Repository</th>
+              <th>Score</th>
+              <th>Guides</th>
+              <th>Tokens</th>
+              <th>Sensors</th>
+              <th>Top Gaps</th>
+            </tr>
+          </thead>
+          <tbody>
+            {rows.map((row) => (
+              <tr key={row.repository} className="border-t">
+                <td>
+                  <div className="font-medium">{row.repository}</div>
+                  <div className="text-muted-foreground">{row.session_count} sessions</div>
+                </td>
+                <td>
+                  <div className="font-medium">{formatMetric(row.context_quality_score)}</div>
+                  <div className="text-muted-foreground">overall</div>
+                </td>
+                <td>
+                  <div>{formatMetric(row.guide_coverage_score)}</div>
+                  <div className="text-muted-foreground">
+                    {formatMetric(row.guide_freshness_score, 0)} fresh
+                  </div>
+                </td>
+                <td>
+                  <div>{formatMetric(row.token_efficiency_score)}</div>
+                  <div className="text-muted-foreground">
+                    {formatPercent(row.cache_hit_rate)} cache,{" "}
+                    {row.avg_input_tokens == null
+                      ? "-"
+                      : formatTokens(row.avg_input_tokens)}{" "}
+                    avg input
+                  </div>
+                </td>
+                <td>
+                  <div>{formatMetric(row.sensor_coverage_score)}</div>
+                  <div className="text-muted-foreground">
+                    {formatCoverage(row.context_usage_coverage_pct)} context /{" "}
+                    {formatCoverage(row.model_coverage_pct)} models
+                  </div>
+                </td>
+                <td>
+                  {row.top_gaps.length === 0 ? (
+                    <span className="text-muted-foreground">No major gaps</span>
+                  ) : (
+                    row.top_gaps.map((gap) => (
+                      <Badge key={gap} variant="outline">
+                        {gap}
+                      </Badge>
+                    ))
+                  )}
+                </td>
+              </tr>
+            ))}
+          </tbody>
+        </table>
+      </CardContent>
+    </Card>
+  )
+}
diff --git a/frontend/src/pages/maturity.tsx b/frontend/src/pages/maturity.tsx
index 64a9a4a..4b66807 100644
--- a/frontend/src/pages/maturity.tsx
+++ b/frontend/src/pages/maturity.tsx
@@ -18,6 +18,7 @@ import { TeamCustomizationLandscapeTable } from "@/components/maturity/team-cust
 import { LeverageScoreTable } from "@/components/maturity/leverage-score-table"
 import { LeverageTrendChart } from "@/components/maturity/leverage-trend-chart"
 import { EffectivenessScatter } from "@/components/maturity/effectiveness-scatter"
+import { ContextQualityTable } from "@/components/maturity/context-quality-table"
 import { ProjectReadinessTable } from "@/components/maturity/project-readiness-table"
 import { ToolAdoptionSummary } from "@/components/tools/tool-adoption-summary"
 import { ToolAdoptionChart } from "@/components/tools/tool-adoption-chart"
@@ -151,7 +152,10 @@ export function MaturityPage({ teamId, dateRange }: MaturityPageProps) {
        )}
       {data && activeTab === "projects" && (
-        <ProjectReadinessTable rows={data.project_readiness} />
+        <div className="space-y-6">
+          <ProjectReadinessTable rows={data.project_readiness} />
+          <ContextQualityTable rows={data.context_quality} />
+        </div>
       )}
     </div>
   )
 }
diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts
index 7670459..7712bcf 100644
--- a/frontend/src/types/api.ts
+++ b/frontend/src/types/api.ts
@@ -2035,6 +2035,26 @@ export interface ProjectReadinessEntry {
   session_count: number
 }
 
+export interface ContextQualityEntry {
+  repository: string
+  session_count: number
+  context_quality_score: number
+  guide_coverage_score: number
+  guide_freshness_score: number
+  token_efficiency_score: number
+  sensor_coverage_score: number
+  cache_hit_rate: number | null
+  avg_input_tokens: number | null
+  context_usage_coverage_pct: number
+  tool_coverage_pct: number
+  model_coverage_pct: number
+  facet_coverage_pct: number
+  has_claude_md: boolean
+  has_agents_md: boolean
+  readiness_checked_at: string | null
+  top_gaps: string[]
+}
+
 export interface MaturityAnalyticsResponse {
   tool_categories: ToolCategoryBreakdown
   engineer_profiles: EngineerLeverageProfile[]
@@ -2050,6 +2070,7 @@ export interface MaturityAnalyticsResponse {
   agent_team_modes: AgentTeamModeSummary[]
   customization_outcomes: CustomizationOutcomeAttribution[]
   project_readiness: ProjectReadinessEntry[]
+  context_quality: ContextQualityEntry[]
   sessions_analyzed: number
   avg_leverage_score: number
   avg_effectiveness_score: number | null
diff --git a/src/primer/common/schemas.py b/src/primer/common/schemas.py
index 1e3e847..ae48667 100644
--- a/src/primer/common/schemas.py
+++ b/src/primer/common/schemas.py
@@ -2454,6 +2454,26 @@ class ProjectReadinessEntry(BaseModel):
     session_count: int
 
 
+class ContextQualityEntry(BaseModel):
+    repository: str
+    session_count: int
+    context_quality_score: float
+    guide_coverage_score: float
+    guide_freshness_score: float
+    token_efficiency_score: float
+    sensor_coverage_score: float
+    cache_hit_rate: float | None = None
+    avg_input_tokens: float | None = None
+    context_usage_coverage_pct: float
+    tool_coverage_pct: float
+    model_coverage_pct: float
+    facet_coverage_pct: float
+    has_claude_md: bool
+    has_agents_md: bool
+    readiness_checked_at: datetime | None = None
+    top_gaps: list[str] = Field(default_factory=list)
+
+
 class MaturityAnalyticsResponse(BaseModel):
     tool_categories: ToolCategoryBreakdown
     engineer_profiles: list[EngineerLeverageProfile]
@@ -2469,6 +2489,7 @@ class MaturityAnalyticsResponse(BaseModel):
     agent_team_modes: list[AgentTeamModeSummary] = []
     customization_outcomes: list[CustomizationOutcomeAttribution] = []
     project_readiness: list[ProjectReadinessEntry]
+    context_quality: list[ContextQualityEntry] = Field(default_factory=list)
     sessions_analyzed: int
     avg_leverage_score: float
     avg_effectiveness_score: float | None = None
diff --git a/src/primer/server/services/maturity_service.py b/src/primer/server/services/maturity_service.py
index 661211c..96cf1ba 100644
--- a/src/primer/server/services/maturity_service.py
+++ b/src/primer/server/services/maturity_service.py
@@ -29,6 +29,7 @@
 from primer.common.schemas import (
     AgentSkillUsage,
     AgentTeamModeSummary,
+    ContextQualityEntry,
     CustomizationOutcomeAttribution,
     CustomizationStateFunnel,
     CustomizationUsage,
@@ -1490,40 +1491,219 @@ def _team_customization_label(
         ),
     )
 
-    # 6. Project readiness
+    # 6. 
Project readiness and context quality project_readiness: list[ProjectReadinessEntry] = [] + context_quality: list[ContextQualityEntry] = [] if sessions_analyzed > 0: - repo_session_counts = ( - db.query(SessionModel.repository_id, func.count(SessionModel.id)) + repo_session_rows = ( + db.query( + SessionModel.id, + SessionModel.repository_id, + SessionModel.input_tokens, + SessionModel.cache_read_tokens, + SessionModel.source_metadata, + ) .filter( SessionModel.id.in_(db.query(session_id_subq.c.id)), SessionModel.repository_id.isnot(None), ) - .group_by(SessionModel.repository_id) .all() ) - repo_counts = dict(repo_session_counts) - if repo_counts: + facet_session_ids = { + row.session_id + for row in ( + db.query(SessionFacets.session_id) + .filter(SessionFacets.session_id.in_(db.query(session_id_subq.c.id))) + .all() + ) + } + model_session_ids = {session_id for session_id, *_rest in model_rows} + repo_context_buckets: dict[str, dict] = {} + for ( + session_id, + repository_id, + input_tokens, + cache_read_tokens, + source_metadata, + ) in repo_session_rows: + if repository_id is None: + continue + bucket = repo_context_buckets.setdefault( + repository_id, + { + "sessions": set(), + "input_tokens": 0, + "cache_read_tokens": 0, + "tool_sessions": set(), + "model_sessions": set(), + "facet_sessions": set(), + "context_usage_sessions": set(), + }, + ) + bucket["sessions"].add(session_id) + bucket["input_tokens"] += input_tokens or 0 + bucket["cache_read_tokens"] += cache_read_tokens or 0 + if per_session.get(session_id): + bucket["tool_sessions"].add(session_id) + if session_id in model_session_ids: + bucket["model_sessions"].add(session_id) + if session_id in facet_session_ids: + bucket["facet_sessions"].add(session_id) + if _context_signal_count(source_metadata) > 0: + bucket["context_usage_sessions"].add(session_id) + + def _coverage_pct(count: int, total: int) -> float: + return round((count / total) * 100, 1) if total > 0 else 0.0 + + def _guide_freshness_score(checked_at: datetime | None) -> float: + if checked_at is None: + return 0.0 + now = datetime.now(tz=checked_at.tzinfo) if checked_at.tzinfo else datetime.now() + age_days = max((now - checked_at).days, 0) + if age_days <= 14: + return 100.0 + if age_days <= 30: + return 80.0 + if age_days <= 90: + return 50.0 + return 25.0 + + def _prompt_efficiency_score(avg_input_tokens: float | None) -> float: + if avg_input_tokens is None: + return 0.0 + if avg_input_tokens <= 20_000: + return 1.0 + if avg_input_tokens <= 50_000: + return 0.8 + if avg_input_tokens <= 100_000: + return 0.5 + return 0.25 + + if repo_context_buckets: repos = ( db.query(GitRepository) .filter( - GitRepository.id.in_(list(repo_counts.keys())), - GitRepository.ai_readiness_score.isnot(None), + GitRepository.id.in_(list(repo_context_buckets.keys())), ) .all() ) + repos_by_id = {repo.id: repo for repo in repos} for repo in repos: - project_readiness.append( - ProjectReadinessEntry( + if repo.ai_readiness_score is not None: + project_readiness.append( + ProjectReadinessEntry( + repository=repo.full_name, + has_claude_md=repo.has_claude_md or False, + has_agents_md=repo.has_agents_md or False, + has_claude_dir=repo.has_claude_dir or False, + ai_readiness_score=repo.ai_readiness_score or 0.0, + session_count=len(repo_context_buckets[repo.id]["sessions"]), + ) + ) + for repository_id, bucket in repo_context_buckets.items(): + repo = repos_by_id.get(repository_id) + if repo is None: + continue + session_count = len(bucket["sessions"]) + input_tokens = 
bucket["input_tokens"] + cache_read_tokens = bucket["cache_read_tokens"] + token_denominator = input_tokens + cache_read_tokens + cache_hit_rate = ( + round(cache_read_tokens / token_denominator, 3) + if token_denominator > 0 + else None + ) + avg_input_tokens = ( + round(input_tokens / session_count, 1) + if session_count > 0 and token_denominator > 0 + else None + ) + + guide_coverage_score = ( + repo.ai_readiness_score + if repo.ai_readiness_score is not None + else ( + (50.0 if repo.has_claude_md else 0.0) + + (20.0 if repo.has_agents_md else 0.0) + + (30.0 if repo.has_claude_dir else 0.0) + ) + ) + guide_freshness_score = _guide_freshness_score(repo.ai_readiness_checked_at) + context_usage_coverage_pct = _coverage_pct( + len(bucket["context_usage_sessions"]), + session_count, + ) + tool_coverage_pct = _coverage_pct(len(bucket["tool_sessions"]), session_count) + model_coverage_pct = _coverage_pct(len(bucket["model_sessions"]), session_count) + facet_coverage_pct = _coverage_pct(len(bucket["facet_sessions"]), session_count) + sensor_coverage_score = round( + ( + context_usage_coverage_pct + + tool_coverage_pct + + model_coverage_pct + + facet_coverage_pct + ) + / 4, + 1, + ) + cache_score = min((cache_hit_rate or 0.0) / 0.5, 1.0) + token_efficiency_score = round( + ((cache_score * 0.6) + (_prompt_efficiency_score(avg_input_tokens) * 0.4)) + * 100, + 1, + ) + context_quality_score = round( + (guide_coverage_score * 0.30) + + (guide_freshness_score * 0.15) + + (token_efficiency_score * 0.25) + + (sensor_coverage_score * 0.30), + 1, + ) + gaps: list[str] = [] + if not repo.has_claude_md: + gaps.append("Add CLAUDE.md") + if not repo.has_agents_md: + gaps.append("Add AGENTS.md") + if guide_freshness_score < 75: + gaps.append("Refresh guidance scan") + if cache_hit_rate is None: + gaps.append("Add token/cache telemetry") + elif cache_hit_rate < 0.25: + gaps.append("Improve cache reuse") + if avg_input_tokens is not None and avg_input_tokens > 50_000: + gaps.append("Trim prompt/context payloads") + if context_usage_coverage_pct < 50: + gaps.append("Increase context telemetry coverage") + if tool_coverage_pct < 90: + gaps.append("Complete tool telemetry") + if model_coverage_pct < 90: + gaps.append("Complete model telemetry") + if facet_coverage_pct < 90: + gaps.append("Complete outcome facets") + + context_quality.append( + ContextQualityEntry( repository=repo.full_name, + session_count=session_count, + context_quality_score=context_quality_score, + guide_coverage_score=round(guide_coverage_score, 1), + guide_freshness_score=guide_freshness_score, + token_efficiency_score=token_efficiency_score, + sensor_coverage_score=sensor_coverage_score, + cache_hit_rate=cache_hit_rate, + avg_input_tokens=avg_input_tokens, + context_usage_coverage_pct=context_usage_coverage_pct, + tool_coverage_pct=tool_coverage_pct, + model_coverage_pct=model_coverage_pct, + facet_coverage_pct=facet_coverage_pct, has_claude_md=repo.has_claude_md or False, has_agents_md=repo.has_agents_md or False, - has_claude_dir=repo.has_claude_dir or False, - ai_readiness_score=repo.ai_readiness_score or 0.0, - session_count=repo_counts.get(repo.id, 0), + readiness_checked_at=repo.ai_readiness_checked_at, + top_gaps=gaps[:4], ) ) project_readiness.sort(key=lambda p: p.ai_readiness_score, reverse=True) + context_quality.sort(key=lambda row: (row.context_quality_score, -row.session_count)) # Aggregate metrics avg_leverage = ( @@ -1560,6 +1740,7 @@ def _team_customization_label( agent_team_modes=agent_team_modes, 
customization_outcomes=customization_outcomes, project_readiness=project_readiness, + context_quality=context_quality, sessions_analyzed=sessions_analyzed, avg_leverage_score=round(avg_leverage, 1), avg_effectiveness_score=( diff --git a/tests/test_maturity.py b/tests/test_maturity.py index 9f3a4b7..a87d231 100644 --- a/tests/test_maturity.py +++ b/tests/test_maturity.py @@ -94,6 +94,7 @@ def test_maturity_empty(client, admin_headers): assert data["daily_leverage"] == [] assert data["agent_skill_breakdown"] == [] assert data["harness_configuration_fingerprints"] == [] + assert data["context_quality"] == [] def test_get_maturity_analytics_uses_cached_payload(monkeypatch, db_session): @@ -634,6 +635,73 @@ def test_maturity_builds_harness_configuration_fingerprints( assert fingerprint["avg_leverage_score"] > 0 +def test_maturity_builds_context_quality_scores( + client, admin_headers, seeded_maturity_data, db_session +): + now = datetime.now(tz=UTC) + s1 = seeded_maturity_data["s1"] + s2 = seeded_maturity_data["s2"] + repo = GitRepository( + full_name=f"acme/context-{uuid.uuid4().hex[:8]}", + has_claude_md=True, + has_agents_md=False, + has_claude_dir=True, + ai_readiness_score=80.0, + ai_readiness_checked_at=now - timedelta(days=7), + ) + db_session.add(repo) + db_session.flush() + + s1.repository_id = repo.id + s1.input_tokens = 1000 + s1.cache_read_tokens = 3000 + s1.source_metadata = { + "native_telemetry": { + "context_usage": {"reference_count": 2}, + } + } + s2.repository_id = repo.id + s2.input_tokens = 3000 + s2.cache_read_tokens = 1000 + db_session.add_all( + [ + SessionFacets(session_id=s1.id, outcome="success"), + ModelUsage( + session_id=s1.id, + model_name="claude-sonnet-4-5-20250929", + input_tokens=1000, + output_tokens=500, + cache_read_tokens=3000, + cache_creation_tokens=0, + ), + ] + ) + db_session.flush() + + response = client.get("/api/v1/analytics/maturity", headers=admin_headers) + + assert response.status_code == 200 + data = response.json() + quality = next(row for row in data["context_quality"] if row["repository"] == repo.full_name) + assert quality["session_count"] == 2 + assert quality["guide_coverage_score"] == 80.0 + assert quality["guide_freshness_score"] == 100.0 + assert quality["cache_hit_rate"] == 0.5 + assert quality["avg_input_tokens"] == 2000.0 + assert quality["token_efficiency_score"] == 100.0 + assert quality["context_usage_coverage_pct"] == 50.0 + assert quality["tool_coverage_pct"] == 100.0 + assert quality["model_coverage_pct"] == 50.0 + assert quality["facet_coverage_pct"] == 50.0 + assert quality["sensor_coverage_score"] == 62.5 + assert quality["context_quality_score"] == 82.8 + assert quality["top_gaps"] == [ + "Add AGENTS.md", + "Complete model telemetry", + "Complete outcome facets", + ] + + def test_maturity_builds_delegation_patterns( client, admin_headers, seeded_maturity_data, db_session ): From 20bf93b22fca05ba1b61ffbe7799acf1b60bc798 Mon Sep 17 00:00:00 2001 From: "Charles C. 
Figueiredo" Date: Fri, 24 Apr 2026 00:15:50 -0400 Subject: [PATCH 2/2] Fix context quality sort order --- .../server/services/maturity_service.py | 2 +- tests/test_maturity.py | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/src/primer/server/services/maturity_service.py b/src/primer/server/services/maturity_service.py index 96cf1ba..f19d9ee 100644 --- a/src/primer/server/services/maturity_service.py +++ b/src/primer/server/services/maturity_service.py @@ -1703,7 +1703,7 @@ def _prompt_efficiency_score(avg_input_tokens: float | None) -> float: ) ) project_readiness.sort(key=lambda p: p.ai_readiness_score, reverse=True) - context_quality.sort(key=lambda row: (row.context_quality_score, -row.session_count)) + context_quality.sort(key=lambda row: (-row.context_quality_score, -row.session_count)) # Aggregate metrics avg_leverage = ( diff --git a/tests/test_maturity.py b/tests/test_maturity.py index a87d231..25a28c0 100644 --- a/tests/test_maturity.py +++ b/tests/test_maturity.py @@ -652,6 +652,17 @@ def test_maturity_builds_context_quality_scores( db_session.add(repo) db_session.flush() + low_repo = GitRepository( + full_name=f"acme/context-low-{uuid.uuid4().hex[:8]}", + has_claude_md=False, + has_agents_md=False, + has_claude_dir=False, + ai_readiness_score=10.0, + ai_readiness_checked_at=now - timedelta(days=120), + ) + db_session.add(low_repo) + db_session.flush() + s1.repository_id = repo.id s1.input_tokens = 1000 s1.cache_read_tokens = 3000 @@ -663,8 +674,17 @@ def test_maturity_builds_context_quality_scores( s2.repository_id = repo.id s2.input_tokens = 3000 s2.cache_read_tokens = 1000 + low_session = Session( + id=str(uuid.uuid4()), + engineer_id=seeded_maturity_data["eng2"].id, + repository_id=low_repo.id, + started_at=now - timedelta(hours=6), + input_tokens=120_000, + cache_read_tokens=0, + ) db_session.add_all( [ + low_session, SessionFacets(session_id=s1.id, outcome="success"), ModelUsage( session_id=s1.id, @@ -682,6 +702,8 @@ def test_maturity_builds_context_quality_scores( assert response.status_code == 200 data = response.json() + assert data["context_quality"][0]["repository"] == repo.full_name + assert data["context_quality"][-1]["repository"] == low_repo.full_name quality = next(row for row in data["context_quality"] if row["repository"] == repo.full_name) assert quality["session_count"] == 2 assert quality["guide_coverage_score"] == 80.0
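
Reviewer sanity check: the 82.8 asserted in test_maturity_builds_context_quality_scores follows directly from the weights patch 1 adds to maturity_service.py — guide coverage 30%, guide freshness 15%, token efficiency 25%, sensor coverage 30%. A minimal standalone re-derivation; the helper names below are illustrative, and only the tier cutoffs and weights are copied from the hunk above:

    def guide_freshness_score(age_days: int) -> float:
        # Tiers from _guide_freshness_score: recent scans score 100, stale ones decay to 25.
        if age_days <= 14:
            return 100.0
        if age_days <= 30:
            return 80.0
        if age_days <= 90:
            return 50.0
        return 25.0

    def prompt_efficiency_score(avg_input_tokens: float) -> float:
        # Tiers from _prompt_efficiency_score: leaner average prompts score higher.
        if avg_input_tokens <= 20_000:
            return 1.0
        if avg_input_tokens <= 50_000:
            return 0.8
        if avg_input_tokens <= 100_000:
            return 0.5
        return 0.25

    # Fixture: two sessions with input tokens 1000 + 3000 and cache reads
    # 3000 + 1000; ai_readiness_score 80.0, guidance scanned 7 days ago.
    guide_coverage = 80.0
    freshness = guide_freshness_score(7)                          # 100.0
    cache_hit_rate = (3000 + 1000) / (1000 + 3000 + 3000 + 1000)  # 0.5
    cache_score = min(cache_hit_rate / 0.5, 1.0)                  # capped at 1.0
    avg_input = (1000 + 3000) / 2                                 # 2000.0
    token_efficiency = (cache_score * 0.6 + prompt_efficiency_score(avg_input) * 0.4) * 100
    sensor_coverage = (50.0 + 100.0 + 50.0 + 50.0) / 4            # context/tool/model/facet

    score = round(
        guide_coverage * 0.30          # 24.0
        + freshness * 0.15             # 15.0
        + token_efficiency * 0.25      # 25.0
        + sensor_coverage * 0.30,      # 18.75
        1,
    )
    assert score == 82.8

round(82.75, 1) resolves the halfway case upward under Python's round-half-even, which is why both the backend assertion and the frontend fixture pin 82.8 rather than 82.7.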