From 1f22d68e4aaecf4f13dfe31bebb7d89afc1da19f Mon Sep 17 00:00:00 2001
From: "Charles C. Figueiredo"
Date: Thu, 23 Apr 2026 19:24:43 -0400
Subject: [PATCH 1/2] Add compound tool reliability scoring
---
ROADMAP.md | 2 +-
.../toolchain-reliability-table.test.tsx | 7 +++++++
.../maturity/toolchain-reliability-table.tsx | 16 ++++++++++++++++
frontend/src/types/api.ts | 3 +++
src/primer/common/schemas.py | 3 +++
.../server/services/maturity_service.py | 19 +++++++++++++++++++
tests/test_maturity.py | 19 +++++++++++++++++++
7 files changed, 68 insertions(+), 1 deletion(-)
diff --git a/ROADMAP.md b/ROADMAP.md
index 96cf918a..ecf70c50 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -191,7 +191,7 @@ Derived data pipelines, performance optimization, durable background jobs, enter
- [x] [P1] Outcome attribution for customizations: which MCPs, skills, commands, and subagents improve workflow, quality, cost, and friction outcomes
- [x] [P1] Cross-team tooling landscape: overlap, reuse, and local best-of-breed tools
- [x] [P1] High-performer agent stack analysis: which combinations of MCPs, skills, commands, and subagents differentiate top performers
-- [ ] [P0] Per-tool success rate tracking with compound reliability computation (10 steps at 99% = 90.4% end-to-end)
+- [x] [P0] Per-tool success rate tracking with compound reliability computation (10 steps at 99% = 90.4% end-to-end)
- [ ] [P0] Harness configuration fingerprinting from session telemetry (tools, context files, permissions, customizations)
- [ ] [P1] Context quality scoring: AGENTS.md freshness, token efficiency, guide/sensor coverage
- [ ] [P1] Harness evolution timeline: before/after correlation of configuration changes with outcome changes
diff --git a/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx b/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx
index 9d605525..3f2bdfee 100644
--- a/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx
+++ b/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx
@@ -20,12 +20,15 @@ describe("ToolchainReliabilityTable", () => {
source_classification: "marketplace",
session_count: 4,
engineer_count: 2,
+ total_call_count: 12,
+ avg_calls_per_session: 3,
friction_session_count: 2,
friction_session_rate: 0.5,
failure_session_count: 1,
failure_session_rate: 0.25,
recovery_rate: 0.5,
success_rate: 0.75,
+ compound_reliability_rate: 0.422,
abandonment_rate: 0.25,
avg_recovery_steps: 2.5,
top_friction_types: ["tool_error", "timeout"],
@@ -37,7 +40,11 @@ describe("ToolchainReliabilityTable", () => {
expect(screen.getByText("github")).toBeInTheDocument()
expect(screen.getByText("Mcp")).toBeInTheDocument()
expect(screen.getByText("Marketplace")).toBeInTheDocument()
+ expect(screen.getByText("12")).toBeInTheDocument()
+ expect(screen.getByText("3.0 / session")).toBeInTheDocument()
expect(screen.getAllByText("50%")).toHaveLength(2)
+ expect(screen.getByText("42%")).toBeInTheDocument()
+ expect(screen.getByText("chain estimate")).toBeInTheDocument()
expect(screen.getByText("25%")).toBeInTheDocument()
expect(screen.getByText("25% abandoned")).toBeInTheDocument()
expect(screen.getByText("2.5 steps")).toBeInTheDocument()
diff --git a/frontend/src/components/maturity/toolchain-reliability-table.tsx b/frontend/src/components/maturity/toolchain-reliability-table.tsx
index 58215b31..f795fe4d 100644
--- a/frontend/src/components/maturity/toolchain-reliability-table.tsx
+++ b/frontend/src/components/maturity/toolchain-reliability-table.tsx
@@ -35,10 +35,12 @@ export function ToolchainReliabilityTable({ rows }: ToolchainReliabilityTablePro
| Surface |
Sessions |
+ Calls |
Friction |
Failures |
Recovery |
Success |
+ Compound |
Top Frictions |
@@ -70,6 +72,14 @@ export function ToolchainReliabilityTable({ rows }: ToolchainReliabilityTablePro
+
+
+ {row.total_call_count}
+
+ {row.avg_calls_per_session.toFixed(1)} / session
+
+
+ |
{formatPercent(row.friction_session_rate)}
@@ -104,6 +114,12 @@ export function ToolchainReliabilityTable({ rows }: ToolchainReliabilityTablePro
|
+
+
+ {formatPercent(row.compound_reliability_rate)}
+ chain estimate
+
+ |
{row.top_friction_types.length === 0 ? (
diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts
index 0bbe5144..b16d388b 100644
--- a/frontend/src/types/api.ts
+++ b/frontend/src/types/api.ts
@@ -1956,12 +1956,15 @@ export interface ToolchainReliabilityEntry {
source_classification: CustomizationSourceClassification | null
session_count: number
engineer_count: number
+ total_call_count: number
+ avg_calls_per_session: number
friction_session_count: number
friction_session_rate: number | null
failure_session_count: number
failure_session_rate: number | null
recovery_rate: number | null
success_rate: number | null
+ compound_reliability_rate: number | null
abandonment_rate: number | null
avg_recovery_steps: number | null
top_friction_types: string[]
diff --git a/src/primer/common/schemas.py b/src/primer/common/schemas.py
index 6e515f33..fd910a9b 100644
--- a/src/primer/common/schemas.py
+++ b/src/primer/common/schemas.py
@@ -2375,12 +2375,15 @@ class ToolchainReliabilityEntry(BaseModel):
source_classification: CustomizationSourceClassification | None = None
session_count: int
engineer_count: int
+ total_call_count: int = 0
+ avg_calls_per_session: float = 0.0
friction_session_count: int
friction_session_rate: float | None = None
failure_session_count: int
failure_session_rate: float | None = None
recovery_rate: float | None = None
success_rate: float | None = None
+ compound_reliability_rate: float | None = None
abandonment_rate: float | None = None
avg_recovery_steps: float | None = None
top_friction_types: list[str] = []
diff --git a/src/primer/server/services/maturity_service.py b/src/primer/server/services/maturity_service.py
index 3e481d73..4f8c4399 100644
--- a/src/primer/server/services/maturity_service.py
+++ b/src/primer/server/services/maturity_service.py
@@ -707,6 +707,7 @@ def _ensure_reliability_bucket(
"recovery_steps": [],
"success_sessions": set(),
"friction_type_counts": Counter(),
+ "total_call_count": 0,
},
)
@@ -741,6 +742,7 @@ def _apply_session_reliability_metrics(
continue
bucket = _ensure_reliability_bucket("built_in_tool", tool_name, "built_in", "built_in")
bucket["sessions"].add(sid)
+ bucket["total_call_count"] += call_count
if sid in session_metrics:
session_metric = session_metrics[sid]
engineer_id_for_session = session_metric["engineer_id"]
@@ -850,12 +852,22 @@ def _apply_session_reliability_metrics(
)
reliability_bucket["sessions"].add(session_id)
reliability_bucket["engineers"].add(engineer_id)
+ reliability_bucket["total_call_count"] += invocation_count or 0
_apply_session_reliability_metrics(
reliability_bucket,
session_id,
session_metrics.get(session_id),
)
+ def _compound_reliability_rate(bucket: dict) -> float | None:
+ if not bucket["sessions"]:
+ return None
+ success_rate = len(bucket["success_sessions"]) / len(bucket["sessions"])
+ avg_calls = bucket["total_call_count"] / len(bucket["sessions"])
+ if avg_calls <= 0:
+ return None
+ return round(success_rate**avg_calls, 3)
+
customization_breakdown = [
CustomizationUsage(
identifier=bucket["identifier"],
@@ -933,6 +945,12 @@ def _apply_session_reliability_metrics(
source_classification=bucket["source_classification"],
session_count=len(bucket["sessions"]),
engineer_count=len(bucket["engineers"]),
+ total_call_count=bucket["total_call_count"],
+ avg_calls_per_session=(
+ round(bucket["total_call_count"] / len(bucket["sessions"]), 1)
+ if bucket["sessions"]
+ else 0.0
+ ),
friction_session_count=len(bucket["friction_sessions"]),
friction_session_rate=(
round(len(bucket["friction_sessions"]) / len(bucket["sessions"]), 3)
@@ -958,6 +976,7 @@ def _apply_session_reliability_metrics(
if bucket["sessions"]
else None
),
+ compound_reliability_rate=_compound_reliability_rate(bucket),
abandonment_rate=(
round(len(bucket["abandoned_sessions"]) / len(bucket["sessions"]), 3)
if bucket["sessions"]
diff --git a/tests/test_maturity.py b/tests/test_maturity.py
index cb7701c8..6fab7786 100644
--- a/tests/test_maturity.py
+++ b/tests/test_maturity.py
@@ -476,6 +476,12 @@ def test_maturity_builds_toolchain_reliability_view(
s1 = seeded_maturity_data["s1"]
s2 = seeded_maturity_data["s2"]
+ db_session.add_all(
+ [
+ ToolUsage(session_id=s1.id, tool_name="Bash", call_count=2),
+ ToolUsage(session_id=s2.id, tool_name="Bash", call_count=2),
+ ]
+ )
db_session.add_all(
[
SessionFacets(
@@ -530,18 +536,29 @@ def test_maturity_builds_toolchain_reliability_view(
assert github["failure_session_rate"] == 1.0
assert github["recovery_rate"] == 1.0
assert github["success_rate"] == 1.0
+ assert github["total_call_count"] == 3
+ assert github["avg_calls_per_session"] == 3.0
+ assert github["compound_reliability_rate"] == 1.0
assert github["abandonment_rate"] == 0.0
assert github["top_friction_types"] == ["tool_error", "timeout"]
read_tool = reliability_rows[("built_in_tool", "Read")]
assert read_tool["session_count"] == 2
assert read_tool["engineer_count"] == 2
+ assert read_tool["total_call_count"] == 30
+ assert read_tool["avg_calls_per_session"] == 15.0
assert read_tool["friction_session_rate"] == 1.0
assert read_tool["failure_session_rate"] == 0.5
assert read_tool["recovery_rate"] == 0.5
assert read_tool["success_rate"] == 0.5
+ assert read_tool["compound_reliability_rate"] == 0.0
assert read_tool["abandonment_rate"] == 0.5
+ bash_tool = reliability_rows[("built_in_tool", "Bash")]
+ assert bash_tool["success_rate"] == 0.5
+ assert bash_tool["avg_calls_per_session"] == 2.0
+ assert bash_tool["compound_reliability_rate"] == 0.25
+
def test_maturity_builds_delegation_patterns(
client, admin_headers, seeded_maturity_data, db_session
@@ -654,6 +671,8 @@ def test_maturity_toolchain_reliability_deduplicates_duplicate_tool_rows(
if row["surface_type"] == "built_in_tool" and row["identifier"] == "Read"
)
assert read_tool["session_count"] == 2
+ assert read_tool["total_call_count"] == 37
+ assert read_tool["avg_calls_per_session"] == 18.5
assert read_tool["friction_session_count"] == 2
assert read_tool["avg_recovery_steps"] == 2.0
From 86141c6ab9350fdbb713feb817f0360a85b47dfe Mon Sep 17 00:00:00 2001
From: "Charles C. Figueiredo"
Date: Thu, 23 Apr 2026 20:10:47 -0400
Subject: [PATCH 2/2] Fix compound reliability rate base
---
.../maturity/__tests__/toolchain-reliability-table.test.tsx | 6 +++---
.../src/components/maturity/toolchain-reliability-table.tsx | 2 +-
src/primer/server/services/maturity_service.py | 4 +++-
tests/test_maturity.py | 4 ++--
4 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx b/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx
index 3f2bdfee..5891ed22 100644
--- a/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx
+++ b/frontend/src/components/maturity/__tests__/toolchain-reliability-table.test.tsx
@@ -28,7 +28,7 @@ describe("ToolchainReliabilityTable", () => {
failure_session_rate: 0.25,
recovery_rate: 0.5,
success_rate: 0.75,
- compound_reliability_rate: 0.422,
+ compound_reliability_rate: 0.904,
abandonment_rate: 0.25,
avg_recovery_steps: 2.5,
top_friction_types: ["tool_error", "timeout"],
@@ -43,8 +43,8 @@ describe("ToolchainReliabilityTable", () => {
expect(screen.getByText("12")).toBeInTheDocument()
expect(screen.getByText("3.0 / session")).toBeInTheDocument()
expect(screen.getAllByText("50%")).toHaveLength(2)
- expect(screen.getByText("42%")).toBeInTheDocument()
- expect(screen.getByText("chain estimate")).toBeInTheDocument()
+ expect(screen.getByText("90%")).toBeInTheDocument()
+ expect(screen.getByText("10-step chain")).toBeInTheDocument()
expect(screen.getByText("25%")).toBeInTheDocument()
expect(screen.getByText("25% abandoned")).toBeInTheDocument()
expect(screen.getByText("2.5 steps")).toBeInTheDocument()
diff --git a/frontend/src/components/maturity/toolchain-reliability-table.tsx b/frontend/src/components/maturity/toolchain-reliability-table.tsx
index f795fe4d..c185729a 100644
--- a/frontend/src/components/maturity/toolchain-reliability-table.tsx
+++ b/frontend/src/components/maturity/toolchain-reliability-table.tsx
@@ -117,7 +117,7 @@ export function ToolchainReliabilityTable({ rows }: ToolchainReliabilityTablePro
{formatPercent(row.compound_reliability_rate)}
- chain estimate
+ 10-step chain
|
diff --git a/src/primer/server/services/maturity_service.py b/src/primer/server/services/maturity_service.py
index 4f8c4399..1349093a 100644
--- a/src/primer/server/services/maturity_service.py
+++ b/src/primer/server/services/maturity_service.py
@@ -57,6 +57,7 @@
from primer.server.services.effectiveness_service import build_effectiveness_score
_RELIABILITY_FAILURE_FRICTION_TYPES = frozenset({"tool_error", "exec_error", "timeout"})
+_COMPOUND_RELIABILITY_CHAIN_LENGTH = 10
def get_maturity_analytics(
@@ -866,7 +867,8 @@ def _compound_reliability_rate(bucket: dict) -> float | None:
avg_calls = bucket["total_call_count"] / len(bucket["sessions"])
if avg_calls <= 0:
return None
- return round(success_rate**avg_calls, 3)
+ per_call_rate = success_rate ** (1.0 / avg_calls)
+ return round(per_call_rate**_COMPOUND_RELIABILITY_CHAIN_LENGTH, 3)
customization_breakdown = [
CustomizationUsage(
diff --git a/tests/test_maturity.py b/tests/test_maturity.py
index 6fab7786..a3b0022e 100644
--- a/tests/test_maturity.py
+++ b/tests/test_maturity.py
@@ -551,13 +551,13 @@ def test_maturity_builds_toolchain_reliability_view(
assert read_tool["failure_session_rate"] == 0.5
assert read_tool["recovery_rate"] == 0.5
assert read_tool["success_rate"] == 0.5
- assert read_tool["compound_reliability_rate"] == 0.0
+ assert read_tool["compound_reliability_rate"] == 0.63
assert read_tool["abandonment_rate"] == 0.5
bash_tool = reliability_rows[("built_in_tool", "Bash")]
assert bash_tool["success_rate"] == 0.5
assert bash_tool["avg_calls_per_session"] == 2.0
- assert bash_tool["compound_reliability_rate"] == 0.25
+ assert bash_tool["compound_reliability_rate"] == 0.031
def test_maturity_builds_delegation_patterns(
| |