Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ Derived data pipelines, performance optimization, durable background jobs, enter
- [x] [P1] Outcome attribution for customizations: which MCPs, skills, commands, and subagents improve workflow, quality, cost, and friction outcomes
- [x] [P1] Cross-team tooling landscape: overlap, reuse, and local best-of-breed tools
- [x] [P1] High-performer agent stack analysis: which combinations of MCPs, skills, commands, and subagents differentiate top performers
- [ ] [P0] Per-tool success rate tracking with compound reliability computation (10 steps at 99% = 90.4% end-to-end)
- [x] [P0] Per-tool success rate tracking with compound reliability computation (10 steps at 99% = 90.4% end-to-end)
- [ ] [P0] Harness configuration fingerprinting from session telemetry (tools, context files, permissions, customizations)
- [ ] [P1] Context quality scoring: AGENTS.md freshness, token efficiency, guide/sensor coverage
- [ ] [P1] Harness evolution timeline: before/after correlation of configuration changes with outcome changes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,15 @@ describe("ToolchainReliabilityTable", () => {
source_classification: "marketplace",
session_count: 4,
engineer_count: 2,
total_call_count: 12,
avg_calls_per_session: 3,
friction_session_count: 2,
friction_session_rate: 0.5,
failure_session_count: 1,
failure_session_rate: 0.25,
recovery_rate: 0.5,
success_rate: 0.75,
compound_reliability_rate: 0.904,
abandonment_rate: 0.25,
avg_recovery_steps: 2.5,
top_friction_types: ["tool_error", "timeout"],
Expand All @@ -37,7 +40,11 @@ describe("ToolchainReliabilityTable", () => {
expect(screen.getByText("github")).toBeInTheDocument()
expect(screen.getByText("Mcp")).toBeInTheDocument()
expect(screen.getByText("Marketplace")).toBeInTheDocument()
expect(screen.getByText("12")).toBeInTheDocument()
expect(screen.getByText("3.0 / session")).toBeInTheDocument()
expect(screen.getAllByText("50%")).toHaveLength(2)
expect(screen.getByText("90%")).toBeInTheDocument()
expect(screen.getByText("10-step chain")).toBeInTheDocument()
expect(screen.getByText("25%")).toBeInTheDocument()
expect(screen.getByText("25% abandoned")).toBeInTheDocument()
expect(screen.getByText("2.5 steps")).toBeInTheDocument()
Expand Down
16 changes: 16 additions & 0 deletions frontend/src/components/maturity/toolchain-reliability-table.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@ export function ToolchainReliabilityTable({ rows }: ToolchainReliabilityTablePro
<tr className="border-b border-border text-left text-muted-foreground">
<th className="pb-2 font-medium">Surface</th>
<th className="pb-2 text-right font-medium">Sessions</th>
<th className="pb-2 text-right font-medium">Calls</th>
<th className="pb-2 text-right font-medium">Friction</th>
<th className="pb-2 text-right font-medium">Failures</th>
<th className="pb-2 text-right font-medium">Recovery</th>
<th className="pb-2 text-right font-medium">Success</th>
<th className="pb-2 text-right font-medium">Compound</th>
<th className="pb-2 font-medium">Top Frictions</th>
</tr>
</thead>
Expand Down Expand Up @@ -70,6 +72,14 @@ export function ToolchainReliabilityTable({ rows }: ToolchainReliabilityTablePro
</p>
</div>
</td>
<td className="py-2 text-right">
<div>
<p>{row.total_call_count}</p>
<p className="text-xs text-muted-foreground">
{row.avg_calls_per_session.toFixed(1)} / session
</p>
</div>
</td>
<td className="py-2 text-right">
<div>
<p>{formatPercent(row.friction_session_rate)}</p>
Expand Down Expand Up @@ -104,6 +114,12 @@ export function ToolchainReliabilityTable({ rows }: ToolchainReliabilityTablePro
</p>
</div>
</td>
<td className="py-2 text-right">
<div>
<p>{formatPercent(row.compound_reliability_rate)}</p>
<p className="text-xs text-muted-foreground">10-step chain</p>
</div>
</td>
<td className="py-2">
<div className="flex flex-wrap gap-2">
{row.top_friction_types.length === 0 ? (
Expand Down
3 changes: 3 additions & 0 deletions frontend/src/types/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1956,12 +1956,15 @@ export interface ToolchainReliabilityEntry {
source_classification: CustomizationSourceClassification | null
session_count: number
engineer_count: number
total_call_count: number
avg_calls_per_session: number
friction_session_count: number
friction_session_rate: number | null
failure_session_count: number
failure_session_rate: number | null
recovery_rate: number | null
success_rate: number | null
compound_reliability_rate: number | null
abandonment_rate: number | null
avg_recovery_steps: number | null
top_friction_types: string[]
Expand Down
3 changes: 3 additions & 0 deletions src/primer/common/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2375,12 +2375,15 @@ class ToolchainReliabilityEntry(BaseModel):
source_classification: CustomizationSourceClassification | None = None
session_count: int
engineer_count: int
total_call_count: int = 0
avg_calls_per_session: float = 0.0
friction_session_count: int
friction_session_rate: float | None = None
failure_session_count: int
failure_session_rate: float | None = None
recovery_rate: float | None = None
success_rate: float | None = None
compound_reliability_rate: float | None = None
abandonment_rate: float | None = None
avg_recovery_steps: float | None = None
top_friction_types: list[str] = []
Expand Down
21 changes: 21 additions & 0 deletions src/primer/server/services/maturity_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
from primer.server.services.effectiveness_service import build_effectiveness_score

_RELIABILITY_FAILURE_FRICTION_TYPES = frozenset({"tool_error", "exec_error", "timeout"})
_COMPOUND_RELIABILITY_CHAIN_LENGTH = 10


def get_maturity_analytics(
Expand Down Expand Up @@ -707,6 +708,7 @@ def _ensure_reliability_bucket(
"recovery_steps": [],
"success_sessions": set(),
"friction_type_counts": Counter(),
"total_call_count": 0,
},
)

Expand Down Expand Up @@ -741,6 +743,7 @@ def _apply_session_reliability_metrics(
continue
bucket = _ensure_reliability_bucket("built_in_tool", tool_name, "built_in", "built_in")
bucket["sessions"].add(sid)
bucket["total_call_count"] += call_count
if sid in session_metrics:
session_metric = session_metrics[sid]
engineer_id_for_session = session_metric["engineer_id"]
Expand Down Expand Up @@ -850,12 +853,23 @@ def _apply_session_reliability_metrics(
)
reliability_bucket["sessions"].add(session_id)
reliability_bucket["engineers"].add(engineer_id)
reliability_bucket["total_call_count"] += invocation_count or 0
_apply_session_reliability_metrics(
reliability_bucket,
session_id,
session_metrics.get(session_id),
)

def _compound_reliability_rate(bucket: dict) -> float | None:
    """Estimate how often a fixed-length chain of calls succeeds for a bucket.

    The fraction of the bucket's sessions found in ``success_sessions`` is
    treated as the outcome of an average session's worth of calls. Taking its
    ``avg_calls``-th root yields a per-call rate, which is then raised to
    ``_COMPOUND_RELIABILITY_CHAIN_LENGTH``.

    Args:
        bucket: Mapping that provides ``sessions`` and ``success_sessions``
            (sized collections) and a numeric ``total_call_count``.

    Returns:
        The chained rate rounded to three decimals, or ``None`` when the
        bucket has no sessions or its average calls per session is not
        positive.
    """
    session_total = len(bucket["sessions"])
    if session_total == 0:
        return None

    session_success_share = len(bucket["success_sessions"]) / session_total
    calls_per_session = bucket["total_call_count"] / session_total
    if calls_per_session <= 0:
        # No recorded calls: a per-call rate cannot be derived.
        return None

    # Root first, then compound, so the rounding input matches a per-call view.
    single_call_rate = pow(session_success_share, 1.0 / calls_per_session)
    chained_rate = pow(single_call_rate, _COMPOUND_RELIABILITY_CHAIN_LENGTH)
    return round(chained_rate, 3)

customization_breakdown = [
CustomizationUsage(
identifier=bucket["identifier"],
Expand Down Expand Up @@ -933,6 +947,12 @@ def _apply_session_reliability_metrics(
source_classification=bucket["source_classification"],
session_count=len(bucket["sessions"]),
engineer_count=len(bucket["engineers"]),
total_call_count=bucket["total_call_count"],
avg_calls_per_session=(
round(bucket["total_call_count"] / len(bucket["sessions"]), 1)
if bucket["sessions"]
else 0.0
),
friction_session_count=len(bucket["friction_sessions"]),
friction_session_rate=(
round(len(bucket["friction_sessions"]) / len(bucket["sessions"]), 3)
Expand All @@ -958,6 +978,7 @@ def _apply_session_reliability_metrics(
if bucket["sessions"]
else None
),
compound_reliability_rate=_compound_reliability_rate(bucket),
abandonment_rate=(
round(len(bucket["abandoned_sessions"]) / len(bucket["sessions"]), 3)
if bucket["sessions"]
Expand Down
19 changes: 19 additions & 0 deletions tests/test_maturity.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,12 @@ def test_maturity_builds_toolchain_reliability_view(
s1 = seeded_maturity_data["s1"]
s2 = seeded_maturity_data["s2"]

db_session.add_all(
[
ToolUsage(session_id=s1.id, tool_name="Bash", call_count=2),
ToolUsage(session_id=s2.id, tool_name="Bash", call_count=2),
]
)
db_session.add_all(
[
SessionFacets(
Expand Down Expand Up @@ -530,18 +536,29 @@ def test_maturity_builds_toolchain_reliability_view(
assert github["failure_session_rate"] == 1.0
assert github["recovery_rate"] == 1.0
assert github["success_rate"] == 1.0
assert github["total_call_count"] == 3
assert github["avg_calls_per_session"] == 3.0
assert github["compound_reliability_rate"] == 1.0
assert github["abandonment_rate"] == 0.0
assert github["top_friction_types"] == ["tool_error", "timeout"]

read_tool = reliability_rows[("built_in_tool", "Read")]
assert read_tool["session_count"] == 2
assert read_tool["engineer_count"] == 2
assert read_tool["total_call_count"] == 30
assert read_tool["avg_calls_per_session"] == 15.0
assert read_tool["friction_session_rate"] == 1.0
assert read_tool["failure_session_rate"] == 0.5
assert read_tool["recovery_rate"] == 0.5
assert read_tool["success_rate"] == 0.5
assert read_tool["compound_reliability_rate"] == 0.63
assert read_tool["abandonment_rate"] == 0.5

bash_tool = reliability_rows[("built_in_tool", "Bash")]
assert bash_tool["success_rate"] == 0.5
assert bash_tool["avg_calls_per_session"] == 2.0
assert bash_tool["compound_reliability_rate"] == 0.031


def test_maturity_builds_delegation_patterns(
client, admin_headers, seeded_maturity_data, db_session
Expand Down Expand Up @@ -654,6 +671,8 @@ def test_maturity_toolchain_reliability_deduplicates_duplicate_tool_rows(
if row["surface_type"] == "built_in_tool" and row["identifier"] == "Read"
)
assert read_tool["session_count"] == 2
assert read_tool["total_call_count"] == 37
assert read_tool["avg_calls_per_session"] == 18.5
assert read_tool["friction_session_count"] == 2
assert read_tool["avg_recovery_steps"] == 2.0

Expand Down