diff --git a/apps/web/src/api/lib/uncertainty/calibration.ts b/apps/web/src/api/lib/uncertainty/calibration.ts new file mode 100644 index 0000000..015bc4b --- /dev/null +++ b/apps/web/src/api/lib/uncertainty/calibration.ts @@ -0,0 +1,124 @@ +import { eq, sql } from "drizzle-orm"; +import { uuidv7 } from "uuidv7"; +import type { Database } from "../../../db/client"; +import { + uncertaintyCalibrationSnapshot, + uncertaintyPrediction, +} from "../../../db/schema/uncertainty"; +import { BUCKET_COUNT, bucketBounds, bucketIndex, brierScore } from "./cohort"; + +export type CohortRow = { + cohortKey: string; + claimedConfidence: number; + outcomeCorrectness: number | null; + state: "emitted" | "witnessed" | "orphaned" | "retired"; +}; + +export type BucketSnapshot = { + cohortKey: string; + bucketLower: number; + bucketUpper: number; + claimedConfidence: number; + actualCorrectness: number; + predictionCount: number; + orphanCount: number; + brierScore: number; +}; + +/** + * Splits witnessed predictions in a cohort into 10 buckets of 0.1 width on + * claimed_confidence. Per bucket: mean claimed, mean correctness, count, and + * Brier score. Orphan count is per-cohort (not per-bucket) and attached to + * every emitted snapshot row so the dashboard can show the orphan share. + * + * Buckets with zero witnessed predictions are dropped -- a bucket only exists + * once at least one prediction has fully resolved. 
+ */ +export function bucketCohort(rows: readonly CohortRow[]): BucketSnapshot[] { + if (rows.length === 0) return []; + const cohortKey = rows[0].cohortKey; + + const witnessed = rows.filter( + (r): r is CohortRow & { outcomeCorrectness: number } => + r.state === "witnessed" && r.outcomeCorrectness !== null, + ); + const orphanCount = rows.filter((r) => r.state === "orphaned").length; + + const buckets: { claimed: number[]; correctness: number[] }[] = Array.from( + { length: BUCKET_COUNT }, + () => ({ claimed: [], correctness: [] }), + ); + + for (const row of witnessed) { + const idx = bucketIndex(row.claimedConfidence); + buckets[idx].claimed.push(row.claimedConfidence); + buckets[idx].correctness.push(row.outcomeCorrectness); + } + + const out: BucketSnapshot[] = []; + for (let i = 0; i < BUCKET_COUNT; i++) { + const b = buckets[i]; + if (b.claimed.length === 0) continue; + const { lower, upper } = bucketBounds(i); + const meanClaimed = b.claimed.reduce((a, c) => a + c, 0) / b.claimed.length; + const meanCorrectness = b.correctness.reduce((a, c) => a + c, 0) / b.correctness.length; + out.push({ + cohortKey, + bucketLower: lower, + bucketUpper: upper, + claimedConfidence: meanClaimed, + actualCorrectness: meanCorrectness, + predictionCount: b.claimed.length, + orphanCount, + brierScore: brierScore(b.claimed, b.correctness), + }); + } + return out; +} + +/** + * Recompute snapshots for every active cohort. "Active" means the cohort has + * at least one resolved (witnessed or orphaned) prediction. 
+ */ +export async function rollCalibration( + db: Database, + now: Date = new Date(), +): Promise<{ cohorts: number; rows: number }> { + const cohortKeys = await db + .select({ cohortKey: uncertaintyPrediction.cohortKey }) + .from(uncertaintyPrediction) + .where(sql`${uncertaintyPrediction.state} IN ('witnessed', 'orphaned')`) + .groupBy(uncertaintyPrediction.cohortKey); + + let totalRows = 0; + for (const { cohortKey } of cohortKeys) { + const rows = (await db + .select({ + cohortKey: uncertaintyPrediction.cohortKey, + claimedConfidence: uncertaintyPrediction.claimedConfidence, + outcomeCorrectness: uncertaintyPrediction.outcomeCorrectness, + state: uncertaintyPrediction.state, + }) + .from(uncertaintyPrediction) + .where(eq(uncertaintyPrediction.cohortKey, cohortKey))) as CohortRow[]; + + const snapshots = bucketCohort(rows); + + await db + .delete(uncertaintyCalibrationSnapshot) + .where(eq(uncertaintyCalibrationSnapshot.cohortKey, cohortKey)); + + if (snapshots.length > 0) { + await db.insert(uncertaintyCalibrationSnapshot).values( + snapshots.map((s) => ({ + id: uuidv7(), + ...s, + computedAt: now, + })), + ); + totalRows += snapshots.length; + } + } + + return { cohorts: cohortKeys.length, rows: totalRows }; +} diff --git a/apps/web/src/api/routes/uncertainty.ts b/apps/web/src/api/routes/uncertainty.ts index a5ad7d1..bdbd164 100644 --- a/apps/web/src/api/routes/uncertainty.ts +++ b/apps/web/src/api/routes/uncertainty.ts @@ -7,6 +7,7 @@ import { createDb } from "../../db/client"; import { uncertaintyCalibrationSnapshot, uncertaintyPrediction } from "../../db/schema/uncertainty"; import { outcomeLabelSchema, surfaceSchema } from "../../db/schema/uncertainty.zod"; import { cohortKey, DAY_MS, DEFAULT_ORPHAN_TTL_DAYS } from "../lib/uncertainty/cohort"; +import { rollCalibration } from "../lib/uncertainty/calibration"; import { sweepOrphans } from "../lib/uncertainty/orphan-sweep"; import { requireAuth } from "../middleware/auth"; import type { HonoEnv } from 
"../types"; @@ -142,14 +143,23 @@ const uncertaintyRoutes = new Hono() return c.json({ surfaces: rows }); }); -const internalRoutes = new Hono().post("/orphan-sweep", async (c) => { - if (!authorizeCron({ req: { header: (n) => c.req.header(n) }, env: c.env })) { - return c.json({ error: "Unauthorized" }, 401); - } - const db = createDb(c.env.DB); - const result = await sweepOrphans(db); - return c.json(result); -}); +const internalRoutes = new Hono() + .post("/orphan-sweep", async (c) => { + if (!authorizeCron({ req: { header: (n) => c.req.header(n) }, env: c.env })) { + return c.json({ error: "Unauthorized" }, 401); + } + const db = createDb(c.env.DB); + const result = await sweepOrphans(db); + return c.json(result); + }) + .post("/calibration-roll", async (c) => { + if (!authorizeCron({ req: { header: (n) => c.req.header(n) }, env: c.env })) { + return c.json({ error: "Unauthorized" }, 401); + } + const db = createDb(c.env.DB); + const result = await rollCalibration(db); + return c.json(result); + }); uncertaintyRoutes.route("/internal", internalRoutes); diff --git a/apps/web/tests/api/lib/uncertainty/calibration.test.ts b/apps/web/tests/api/lib/uncertainty/calibration.test.ts new file mode 100644 index 0000000..610db69 --- /dev/null +++ b/apps/web/tests/api/lib/uncertainty/calibration.test.ts @@ -0,0 +1,97 @@ +import { describe, expect, it } from "vitest"; +import { bucketCohort, type CohortRow } from "../../../../src/api/lib/uncertainty/calibration"; + +const COHORT = "rafters.color|claude-sonnet-4-7|2026-04-01|0.8"; + +function row(partial: Partial<CohortRow>): CohortRow { + return { + cohortKey: COHORT, + claimedConfidence: 0.85, + outcomeCorrectness: 1, + state: "witnessed", + ...partial, + }; +} + +describe("bucketCohort", () => { + it("returns no buckets for an empty cohort", () => { + expect(bucketCohort([])).toEqual([]); + }); + + it("drops buckets with zero witnessed predictions", () => { + // Single witnessed prediction at 0.85 -> bucket 8 only + const out = 
bucketCohort([row({ claimedConfidence: 0.85, outcomeCorrectness: 1 })]); + expect(out).toHaveLength(1); + expect(out[0].bucketLower).toBeCloseTo(0.8, 5); + expect(out[0].bucketUpper).toBeCloseTo(0.9, 5); + }); + + it("means claimed and correctness within a bucket", () => { + const out = bucketCohort([ + row({ claimedConfidence: 0.82, outcomeCorrectness: 1 }), + row({ claimedConfidence: 0.88, outcomeCorrectness: 0.5 }), + ]); + expect(out).toHaveLength(1); + expect(out[0].claimedConfidence).toBeCloseTo(0.85, 5); + expect(out[0].actualCorrectness).toBeCloseTo(0.75, 5); + expect(out[0].predictionCount).toBe(2); + }); + + it("computes Brier as mean squared error within the bucket", () => { + const out = bucketCohort([ + row({ claimedConfidence: 0.8, outcomeCorrectness: 1 }), // (0.8-1)^2 = 0.04 + row({ claimedConfidence: 0.85, outcomeCorrectness: 0 }), // (0.85-0)^2 = 0.7225 + ]); + expect(out[0].brierScore).toBeCloseTo((0.04 + 0.7225) / 2, 5); + }); + + it("ignores witnessed rows with null correctness", () => { + const out = bucketCohort([ + row({ claimedConfidence: 0.85, outcomeCorrectness: null, state: "witnessed" }), + row({ claimedConfidence: 0.85, outcomeCorrectness: 1 }), + ]); + expect(out).toHaveLength(1); + expect(out[0].predictionCount).toBe(1); + }); + + it("ignores emitted-but-not-witnessed rows but does not double-count", () => { + const out = bucketCohort([ + row({ claimedConfidence: 0.85, outcomeCorrectness: null, state: "emitted" }), + row({ claimedConfidence: 0.85, outcomeCorrectness: 1 }), + ]); + expect(out).toHaveLength(1); + expect(out[0].predictionCount).toBe(1); + }); + + it("attaches per-cohort orphan_count to every emitted bucket row", () => { + const out = bucketCohort([ + row({ claimedConfidence: 0.25, outcomeCorrectness: 1 }), + row({ claimedConfidence: 0.85, outcomeCorrectness: 0.5 }), + row({ claimedConfidence: 0.85, outcomeCorrectness: null, state: "orphaned" }), + row({ claimedConfidence: 0.85, outcomeCorrectness: null, state: 
"orphaned" }), + ]); + expect(out).toHaveLength(2); + for (const snapshot of out) { + expect(snapshot.orphanCount).toBe(2); + } + }); + + it("places confidence=1.0 in the top bucket", () => { + const out = bucketCohort([row({ claimedConfidence: 1, outcomeCorrectness: 1 })]); + expect(out).toHaveLength(1); + expect(out[0].bucketLower).toBeCloseTo(0.9, 5); + }); + + it("returns buckets sorted by lower bound", () => { + const out = bucketCohort([ + row({ claimedConfidence: 0.95, outcomeCorrectness: 1 }), + row({ claimedConfidence: 0.15, outcomeCorrectness: 0 }), + row({ claimedConfidence: 0.55, outcomeCorrectness: 0.5 }), + ]); + expect(out.map((s) => s.bucketLower)).toEqual([ + expect.closeTo(0.1, 5), + expect.closeTo(0.5, 5), + expect.closeTo(0.9, 5), + ]); + }); +}); diff --git a/apps/web/wrangler.jsonc b/apps/web/wrangler.jsonc index 3432819..ccf12ea 100644 --- a/apps/web/wrangler.jsonc +++ b/apps/web/wrangler.jsonc @@ -47,11 +47,12 @@ "vars": { "CRON_SECRET": "", }, - // Hourly orphan sweep. The Astro adapter does not expose a scheduled() - // entrypoint, so the trigger calls POST /api/uncertainty/internal/orphan-sweep - // via an external scheduler (or a future companion worker) using the - // CRON_SECRET bearer token. + // Hourly orphan sweep + nightly calibration roll. The Astro adapter does + // not expose a scheduled() entrypoint, so triggers call the internal + // endpoints (/api/uncertainty/internal/orphan-sweep at :00 every hour, + // /api/uncertainty/internal/calibration-roll at 03:15 UTC daily) via an + // external scheduler using the CRON_SECRET bearer token. "triggers": { - "crons": ["0 * * * *"], + "crons": ["0 * * * *", "15 3 * * *"], }, }