From 1f22d68e4aaecf4f13dfe31bebb7d89afc1da19f Mon Sep 17 00:00:00 2001 From: "Charles C. Figueiredo" Date: Thu, 23 Apr 2026 19:24:43 -0400 Subject: [PATCH 1/2] Add compound tool reliability scoring --- ROADMAP.md | 2 +- .../toolchain-reliability-table.test.tsx | 7 +++++++ .../maturity/toolchain-reliability-table.tsx | 16 ++++++++++++++++ frontend/src/types/api.ts | 3 +++ src/primer/common/schemas.py | 3 +++ .../server/services/maturity_service.py | 19 +++++++++++++++++++ tests/test_maturity.py | 19 +++++++++++++++++++ 7 files changed, 68 insertions(+), 1 deletion(-) diff --git a/ROADMAP.md b/ROADMAP.md index 96cf918a..ecf70c50 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -191,7 +191,7 @@ Derived data pipelines, performance optimization, durable background jobs, enter - [x] [P1] Outcome attribution for customizations: which MCPs, skills, commands, and subagents improve workflow, quality, cost, and friction outcomes - [x] [P1] Cross-team tooling landscape: overlap, reuse, and local best-of-breed tools - [x] [P1] High-performer agent stack analysis: which combinations of MCPs, skills, commands, and subagents differentiate top performers -- [ ] [P0] Per-tool success rate tracking with compound reliability computation (10 steps at 99% = 90.4% end-to-end) +- [x] [P0] Per-tool success rate tracking with compound reliability computation (10 steps at 99% = 90.4% end-to-end) - [ ] [P0] Harness configuration fingerprinting from session telemetry (tools, context files, permissions, customizations) - [ ] [P1] Context quality scoring: AGENTS.md freshness, token efficiency, guide/sensor coverage - [ ] [P1] Harness evolution timeline: before/after correlation of configuration changes with outcome changes diff --git a/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx b/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx index 9d605525..3f2bdfee 100644 --- a/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx +++ b/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx @@ -20,12 +20,15 @@ describe("ToolchainReliabilityTable", () => { source_classification: "marketplace", session_count: 4, engineer_count: 2, + total_call_count: 12, + avg_calls_per_session: 3, friction_session_count: 2, friction_session_rate: 0.5, failure_session_count: 1, failure_session_rate: 0.25, recovery_rate: 0.5, success_rate: 0.75, + compound_reliability_rate: 0.422, abandonment_rate: 0.25, avg_recovery_steps: 2.5, top_friction_types: ["tool_error", "timeout"], @@ -37,7 +40,11 @@ describe("ToolchainReliabilityTable", () => { expect(screen.getByText("github")).toBeInTheDocument() expect(screen.getByText("Mcp")).toBeInTheDocument() expect(screen.getByText("Marketplace")).toBeInTheDocument() + expect(screen.getByText("12")).toBeInTheDocument() + expect(screen.getByText("3.0 / session")).toBeInTheDocument() expect(screen.getAllByText("50%")).toHaveLength(2) + expect(screen.getByText("42%")).toBeInTheDocument() + expect(screen.getByText("chain estimate")).toBeInTheDocument() expect(screen.getByText("25%")).toBeInTheDocument() expect(screen.getByText("25% abandoned")).toBeInTheDocument() expect(screen.getByText("2.5 steps")).toBeInTheDocument() diff --git a/frontend/src/components/maturity/toolchain-reliability-table.tsx b/frontend/src/components/maturity/toolchain-reliability-table.tsx index 58215b31..f795fe4d 100644 --- a/frontend/src/components/maturity/toolchain-reliability-table.tsx +++ b/frontend/src/components/maturity/toolchain-reliability-table.tsx @@ -35,10 +35,12 @@ export function ToolchainReliabilityTable({ rows }: ToolchainReliabilityTablePro Surface Sessions + Calls Friction Failures Recovery Success + Compound Top Frictions @@ -70,6 +72,14 @@ export function ToolchainReliabilityTable({ rows }: ToolchainReliabilityTablePro

+ +
+

{row.total_call_count}

+

+ {row.avg_calls_per_session.toFixed(1)} / session +

+
+

{formatPercent(row.friction_session_rate)}

@@ -104,6 +114,12 @@ export function ToolchainReliabilityTable({ rows }: ToolchainReliabilityTablePro

+ +
+

{formatPercent(row.compound_reliability_rate)}

+

chain estimate

+
+
{row.top_friction_types.length === 0 ? ( diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts index 0bbe5144..b16d388b 100644 --- a/frontend/src/types/api.ts +++ b/frontend/src/types/api.ts @@ -1956,12 +1956,15 @@ export interface ToolchainReliabilityEntry { source_classification: CustomizationSourceClassification | null session_count: number engineer_count: number + total_call_count: number + avg_calls_per_session: number friction_session_count: number friction_session_rate: number | null failure_session_count: number failure_session_rate: number | null recovery_rate: number | null success_rate: number | null + compound_reliability_rate: number | null abandonment_rate: number | null avg_recovery_steps: number | null top_friction_types: string[] diff --git a/src/primer/common/schemas.py b/src/primer/common/schemas.py index 6e515f33..fd910a9b 100644 --- a/src/primer/common/schemas.py +++ b/src/primer/common/schemas.py @@ -2375,12 +2375,15 @@ class ToolchainReliabilityEntry(BaseModel): source_classification: CustomizationSourceClassification | None = None session_count: int engineer_count: int + total_call_count: int = 0 + avg_calls_per_session: float = 0.0 friction_session_count: int friction_session_rate: float | None = None failure_session_count: int failure_session_rate: float | None = None recovery_rate: float | None = None success_rate: float | None = None + compound_reliability_rate: float | None = None abandonment_rate: float | None = None avg_recovery_steps: float | None = None top_friction_types: list[str] = [] diff --git a/src/primer/server/services/maturity_service.py b/src/primer/server/services/maturity_service.py index 3e481d73..4f8c4399 100644 --- a/src/primer/server/services/maturity_service.py +++ b/src/primer/server/services/maturity_service.py @@ -707,6 +707,7 @@ def _ensure_reliability_bucket( "recovery_steps": [], "success_sessions": set(), "friction_type_counts": Counter(), + "total_call_count": 0, }, ) @@ -741,6 +742,7 @@ def _apply_session_reliability_metrics( continue bucket = _ensure_reliability_bucket("built_in_tool", tool_name, "built_in", "built_in") bucket["sessions"].add(sid) + bucket["total_call_count"] += call_count if sid in session_metrics: session_metric = session_metrics[sid] engineer_id_for_session = session_metric["engineer_id"] @@ -850,12 +852,22 @@ def _apply_session_reliability_metrics( ) reliability_bucket["sessions"].add(session_id) reliability_bucket["engineers"].add(engineer_id) + reliability_bucket["total_call_count"] += invocation_count or 0 _apply_session_reliability_metrics( reliability_bucket, session_id, session_metrics.get(session_id), ) + def _compound_reliability_rate(bucket: dict) -> float | None: + if not bucket["sessions"]: + return None + success_rate = len(bucket["success_sessions"]) / len(bucket["sessions"]) + avg_calls = bucket["total_call_count"] / len(bucket["sessions"]) + if avg_calls <= 0: + return None + return round(success_rate**avg_calls, 3) + customization_breakdown = [ CustomizationUsage( identifier=bucket["identifier"], @@ -933,6 +945,12 @@ def _apply_session_reliability_metrics( source_classification=bucket["source_classification"], session_count=len(bucket["sessions"]), engineer_count=len(bucket["engineers"]), + total_call_count=bucket["total_call_count"], + avg_calls_per_session=( + round(bucket["total_call_count"] / len(bucket["sessions"]), 1) + if bucket["sessions"] + else 0.0 + ), friction_session_count=len(bucket["friction_sessions"]), friction_session_rate=( round(len(bucket["friction_sessions"]) / len(bucket["sessions"]), 3) @@ -958,6 +976,7 @@ def _apply_session_reliability_metrics( if bucket["sessions"] else None ), + compound_reliability_rate=_compound_reliability_rate(bucket), abandonment_rate=( round(len(bucket["abandoned_sessions"]) / len(bucket["sessions"]), 3) if bucket["sessions"] diff --git a/tests/test_maturity.py b/tests/test_maturity.py index cb7701c8..6fab7786 100644 --- a/tests/test_maturity.py +++ b/tests/test_maturity.py @@ -476,6 +476,12 @@ def test_maturity_builds_toolchain_reliability_view( s1 = seeded_maturity_data["s1"] s2 = seeded_maturity_data["s2"] + db_session.add_all( + [ + ToolUsage(session_id=s1.id, tool_name="Bash", call_count=2), + ToolUsage(session_id=s2.id, tool_name="Bash", call_count=2), + ] + ) db_session.add_all( [ SessionFacets( @@ -530,18 +536,29 @@ def test_maturity_builds_toolchain_reliability_view( assert github["failure_session_rate"] == 1.0 assert github["recovery_rate"] == 1.0 assert github["success_rate"] == 1.0 + assert github["total_call_count"] == 3 + assert github["avg_calls_per_session"] == 3.0 + assert github["compound_reliability_rate"] == 1.0 assert github["abandonment_rate"] == 0.0 assert github["top_friction_types"] == ["tool_error", "timeout"] read_tool = reliability_rows[("built_in_tool", "Read")] assert read_tool["session_count"] == 2 assert read_tool["engineer_count"] == 2 + assert read_tool["total_call_count"] == 30 + assert read_tool["avg_calls_per_session"] == 15.0 assert read_tool["friction_session_rate"] == 1.0 assert read_tool["failure_session_rate"] == 0.5 assert read_tool["recovery_rate"] == 0.5 assert read_tool["success_rate"] == 0.5 + assert read_tool["compound_reliability_rate"] == 0.0 assert read_tool["abandonment_rate"] == 0.5 + bash_tool = reliability_rows[("built_in_tool", "Bash")] + assert bash_tool["success_rate"] == 0.5 + assert bash_tool["avg_calls_per_session"] == 2.0 + assert bash_tool["compound_reliability_rate"] == 0.25 + def test_maturity_builds_delegation_patterns( client, admin_headers, seeded_maturity_data, db_session @@ -654,6 +671,8 @@ def test_maturity_toolchain_reliability_deduplicates_duplicate_tool_rows( if row["surface_type"] == "built_in_tool" and row["identifier"] == "Read" ) assert read_tool["session_count"] == 2 + assert read_tool["total_call_count"] == 37 + assert read_tool["avg_calls_per_session"] == 18.5 assert read_tool["friction_session_count"] == 2 assert read_tool["avg_recovery_steps"] == 2.0 From 86141c6ab9350fdbb713feb817f0360a85b47dfe Mon Sep 17 00:00:00 2001 From: "Charles C. Figueiredo" Date: Thu, 23 Apr 2026 20:10:47 -0400 Subject: [PATCH 2/2] Fix compound reliability rate base --- .../maturity/__tests__/toolchain-reliability-table.test.tsx | 6 +++--- .../src/components/maturity/toolchain-reliability-table.tsx | 2 +- src/primer/server/services/maturity_service.py | 4 +++- tests/test_maturity.py | 4 ++-- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx b/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx index 3f2bdfee..5891ed22 100644 --- a/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx +++ b/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx @@ -28,7 +28,7 @@ describe("ToolchainReliabilityTable", () => { failure_session_rate: 0.25, recovery_rate: 0.5, success_rate: 0.75, - compound_reliability_rate: 0.422, + compound_reliability_rate: 0.904, abandonment_rate: 0.25, avg_recovery_steps: 2.5, top_friction_types: ["tool_error", "timeout"], @@ -43,8 +43,8 @@ describe("ToolchainReliabilityTable", () => { expect(screen.getByText("12")).toBeInTheDocument() expect(screen.getByText("3.0 / session")).toBeInTheDocument() expect(screen.getAllByText("50%")).toHaveLength(2) - expect(screen.getByText("42%")).toBeInTheDocument() - expect(screen.getByText("chain estimate")).toBeInTheDocument() + expect(screen.getByText("90%")).toBeInTheDocument() + expect(screen.getByText("10-step chain")).toBeInTheDocument() expect(screen.getByText("25%")).toBeInTheDocument() expect(screen.getByText("25% abandoned")).toBeInTheDocument() expect(screen.getByText("2.5 steps")).toBeInTheDocument() diff --git a/frontend/src/components/maturity/toolchain-reliability-table.tsx b/frontend/src/components/maturity/toolchain-reliability-table.tsx index f795fe4d..c185729a 100644 --- a/frontend/src/components/maturity/toolchain-reliability-table.tsx +++ b/frontend/src/components/maturity/toolchain-reliability-table.tsx @@ -117,7 +117,7 @@ export function ToolchainReliabilityTable({ rows }: ToolchainReliabilityTablePro

{formatPercent(row.compound_reliability_rate)}

-

chain estimate

+

10-step chain

diff --git a/src/primer/server/services/maturity_service.py b/src/primer/server/services/maturity_service.py index 4f8c4399..1349093a 100644 --- a/src/primer/server/services/maturity_service.py +++ b/src/primer/server/services/maturity_service.py @@ -57,6 +57,7 @@ from primer.server.services.effectiveness_service import build_effectiveness_score _RELIABILITY_FAILURE_FRICTION_TYPES = frozenset({"tool_error", "exec_error", "timeout"}) +_COMPOUND_RELIABILITY_CHAIN_LENGTH = 10 def get_maturity_analytics( @@ -866,7 +867,8 @@ def _compound_reliability_rate(bucket: dict) -> float | None: avg_calls = bucket["total_call_count"] / len(bucket["sessions"]) if avg_calls <= 0: return None - return round(success_rate**avg_calls, 3) + per_call_rate = success_rate ** (1.0 / avg_calls) + return round(per_call_rate**_COMPOUND_RELIABILITY_CHAIN_LENGTH, 3) customization_breakdown = [ CustomizationUsage( diff --git a/tests/test_maturity.py b/tests/test_maturity.py index 6fab7786..a3b0022e 100644 --- a/tests/test_maturity.py +++ b/tests/test_maturity.py @@ -551,13 +551,13 @@ def test_maturity_builds_toolchain_reliability_view( assert read_tool["failure_session_rate"] == 0.5 assert read_tool["recovery_rate"] == 0.5 assert read_tool["success_rate"] == 0.5 - assert read_tool["compound_reliability_rate"] == 0.0 + assert read_tool["compound_reliability_rate"] == 0.63 assert read_tool["abandonment_rate"] == 0.5 bash_tool = reliability_rows[("built_in_tool", "Bash")] assert bash_tool["success_rate"] == 0.5 assert bash_tool["avg_calls_per_session"] == 2.0 - assert bash_tool["compound_reliability_rate"] == 0.25 + assert bash_tool["compound_reliability_rate"] == 0.031 def test_maturity_builds_delegation_patterns(