diff --git a/apps/web/src/api/app.ts b/apps/web/src/api/app.ts
index d1ef009..987ed32 100644
--- a/apps/web/src/api/app.ts
+++ b/apps/web/src/api/app.ts
@@ -7,6 +7,7 @@ import { loadSession } from "./middleware/auth";
 import { authRoutes } from "./routes/auth";
 import { colorRoutes } from "./routes/color";
 import { ctrlRoutes } from "./routes/ctrl/index";
+import { uncertaintyRoutes } from "./routes/uncertainty";
 import type { HonoEnv } from "./types";
 
 const app = new Hono()
@@ -15,7 +16,8 @@ const app = new Hono()
   .route("/auth", authRoutes)
   .use("/*", loadSession)
   .route("/color", colorRoutes)
-  .route("/ctrl", ctrlRoutes);
+  .route("/ctrl", ctrlRoutes)
+  .route("/uncertainty", uncertaintyRoutes);
 
 export type AppType = typeof app;
 export default app;
diff --git a/apps/web/src/api/lib/uncertainty/cohort.ts b/apps/web/src/api/lib/uncertainty/cohort.ts
new file mode 100644
index 0000000..770f2a6
--- /dev/null
+++ b/apps/web/src/api/lib/uncertainty/cohort.ts
@@ -0,0 +1,67 @@
+/** Milliseconds in one day. */
+export const DAY_MS = 86_400_000;
+/** Orphan window, in days, used when an emit does not supply its own. */
+export const DEFAULT_ORPHAN_TTL_DAYS = 30;
+/** Width of one confidence bucket. */
+export const BUCKET_WIDTH = 0.1;
+/** Number of confidence buckets. */
+export const BUCKET_COUNT = 10;
+
+/**
+ * Zero-based index of the bucket that holds `confidence`.
+ *
+ * Values at or above 1 map to the top bucket and negative values to the bottom one.
+ */
+export function bucketIndex(confidence: number): number {
+  if (confidence >= 1) return BUCKET_COUNT - 1;
+  if (confidence < 0) return 0;
+  return Math.floor(confidence * BUCKET_COUNT);
+}
+
+/**
+ * Lower bound of the bucket that holds `confidence`.
+ *
+ * Derived from `bucketIndex` so both helpers always agree on the bucket.
+ */
+export function bucketLower(confidence: number): number {
+  return bucketIndex(confidence) / BUCKET_COUNT;
+}
+
+/**
+ * Cohort identifier: surface, model, model version and the bucket lower bound
+ * (one decimal place) joined with `|`.
+ */
+export function cohortKey(
+  surface: string,
+  model: string,
+  modelVersion: string,
+  confidence: number,
+): string {
+  return `${surface}|${model}|${modelVersion}|${bucketLower(confidence).toFixed(1)}`;
+}
+
+/**
+ * Mean squared difference between claimed confidence and observed correctness.
+ *
+ * Returns 0 when the inputs are empty or their lengths differ.
+ */
+export function brierScore(claimed: readonly number[], correctness: readonly number[]): number {
+  if (claimed.length === 0 || claimed.length !== correctness.length) return 0;
+  let sum = 0;
+  for (let i = 0; i < claimed.length; i++) {
+    const diff = claimed[i] - correctness[i];
+    sum += diff * diff;
+  }
+  return sum / claimed.length;
+}
+
+/**
+ * Bounds of the bucket at `index`, to be read as the half-open range [lower, upper).
+ *
+ * Both bounds are computed as a division by BUCKET_COUNT, the same arithmetic
+ * `bucketLower` uses, so `bucketBounds(bucketIndex(x)).lower === bucketLower(x)`.
+ * Multiplying by BUCKET_WIDTH instead drifts (3 * 0.1 is 0.30000000000000004).
+ */
+export function bucketBounds(index: number): { lower: number; upper: number } {
+  return { lower: index / BUCKET_COUNT, upper: (index + 1) / BUCKET_COUNT };
+}
diff --git a/apps/web/src/api/routes/uncertainty.ts b/apps/web/src/api/routes/uncertainty.ts
new file mode 100644
index 0000000..eb4319d
--- /dev/null
+++ b/apps/web/src/api/routes/uncertainty.ts
@@ -0,0 +1,157 @@
+import { zValidator } from "@hono/zod-validator";
+import { and, count, eq, sql } from "drizzle-orm";
+import { Hono } from "hono";
+import { uuidv7 } from "uuidv7";
+import { z } from "zod";
+import { createDb } from "../../db/client";
+import { uncertaintyCalibrationSnapshot, uncertaintyPrediction } from "../../db/schema/uncertainty";
+import { outcomeLabelSchema, surfaceSchema } from "../../db/schema/uncertainty.zod";
+import { cohortKey, DAY_MS, DEFAULT_ORPHAN_TTL_DAYS } from "../lib/uncertainty/cohort";
+import { requireAuth } from "../middleware/auth";
+import type { HonoEnv } from "../types";
+
+const emitBodySchema = z.object({
+  surface: surfaceSchema,
+  feature_key: z.string().min(1).max(128),
+  input_fingerprint: z.string().min(1).max(128),
+  model: z.string().min(1).max(64),
+  model_version: z.string().min(1).max(64),
+  claimed_confidence: z.number().min(0).max(1),
+  prediction_payload: z.unknown(),
+  orphan_ttl_days: z.number().int().min(1).max(365).optional(),
+});
+
+const witnessBodySchema = z.object({
+  outcome_label: outcomeLabelSchema,
+  outcome_correctness: z.number().min(0).max(1),
+  outcome_payload: z.unknown().optional(),
+});
+
+const calibrationQuerySchema = z.object({
+  surface: surfaceSchema,
+  model: z.string().min(1).max(64),
+  model_version: z.string().min(1).max(64),
+});
+
+/** Escapes `\`, `%` and `_` so `value` matches literally in a LIKE pattern using ESCAPE '\'. */
+function escapeLike(value: string): string {
+  return value.replace(/[\\%_]/g, "\\$&");
+}
+
+const uncertaintyRoutes = new Hono<HonoEnv>()
+  .use("*", requireAuth)
+  // Emit: record a prediction and the time after which it counts as orphaned.
+  .post("/predictions", zValidator("json", emitBodySchema), async (c) => {
+    const body = c.req.valid("json");
+    const db = createDb(c.env.DB);
+
+    const id = uuidv7();
+    const now = Date.now();
+    const ttlDays = body.orphan_ttl_days ?? DEFAULT_ORPHAN_TTL_DAYS;
+    const orphanAfter = now + ttlDays * DAY_MS;
+
+    await db.insert(uncertaintyPrediction).values({
+      id,
+      surface: body.surface,
+      featureKey: body.feature_key,
+      inputFingerprint: body.input_fingerprint,
+      model: body.model,
+      modelVersion: body.model_version,
+      claimedConfidence: body.claimed_confidence,
+      predictionPayload: body.prediction_payload,
+      state: "emitted",
+      createdAt: new Date(now),
+      orphanAfter: new Date(orphanAfter),
+      cohortKey: cohortKey(body.surface, body.model, body.model_version, body.claimed_confidence),
+    });
+
+    return c.json({ id, orphan_after: orphanAfter }, 201);
+  })
+  // Witness: attach an outcome to a prediction that is still in the "emitted" state.
+  .put(
+    "/predictions/:id/witness",
+    zValidator("param", z.object({ id: z.string().min(1) })),
+    zValidator("json", witnessBodySchema),
+    async (c) => {
+      const { id } = c.req.valid("param");
+      const body = c.req.valid("json");
+      const db = createDb(c.env.DB);
+
+      const existing = await db
+        .select({ state: uncertaintyPrediction.state })
+        .from(uncertaintyPrediction)
+        .where(eq(uncertaintyPrediction.id, id))
+        .limit(1);
+
+      const current = existing[0];
+      if (!current) {
+        return c.json({ error: "Prediction not found" }, 404);
+      }
+
+      if (current.state === "witnessed") {
+        return c.json({ error: "Prediction already witnessed" }, 409);
+      }
+
+      // The UPDATE below only matches "emitted" rows, so any other state must be
+      // refused here rather than acknowledged as a successful witness.
+      if (current.state !== "emitted") {
+        return c.json({ error: `Prediction is ${current.state} and cannot be witnessed` }, 409);
+      }
+
+      const updated = await db
+        .update(uncertaintyPrediction)
+        .set({
+          state: "witnessed",
+          outcomeLabel: body.outcome_label,
+          outcomeCorrectness: body.outcome_correctness,
+          outcomePayload: body.outcome_payload ?? null,
+          witnessedAt: new Date(),
+        })
+        .where(and(eq(uncertaintyPrediction.id, id), eq(uncertaintyPrediction.state, "emitted")))
+        .returning({ id: uncertaintyPrediction.id });
+
+      // No row updated means the state changed between the read and the write.
+      if (updated.length === 0) {
+        return c.json({ error: "Prediction is no longer awaiting a witness" }, 409);
+      }
+
+      return c.json({ id, state: "witnessed" as const });
+    },
+  )
+  // Calibration: snapshot rows for one surface/model/version, ordered by bucket lower bound.
+  .get("/calibration", zValidator("query", calibrationQuerySchema), async (c) => {
+    const q = c.req.valid("query");
+    const db = createDb(c.env.DB);
+
+    // model and model_version are free-form, so `%` and `_` in them must not act as wildcards.
+    const cohortPrefix = `${q.surface}|${q.model}|${q.model_version}|`;
+    const pattern = `${escapeLike(cohortPrefix)}%`;
+    const rows = await db
+      .select()
+      .from(uncertaintyCalibrationSnapshot)
+      .where(sql`${uncertaintyCalibrationSnapshot.cohortKey} LIKE ${pattern} ESCAPE '\\'`)
+      .orderBy(uncertaintyCalibrationSnapshot.bucketLower);
+
+    return c.json({ buckets: rows });
+  })
+  // Orphans: per-surface counts of predictions by lifecycle state.
+  .get("/orphans", async (c) => {
+    const db = createDb(c.env.DB);
+
+    const rows = await db
+      .select({
+        surface: uncertaintyPrediction.surface,
+        orphan_count: count(sql`CASE WHEN ${uncertaintyPrediction.state} = 'orphaned' THEN 1 END`),
+        emitted_count: count(sql`CASE WHEN ${uncertaintyPrediction.state} = 'emitted' THEN 1 END`),
+        witnessed_count: count(
+          sql`CASE WHEN ${uncertaintyPrediction.state} = 'witnessed' THEN 1 END`,
+        ),
+        total: count(),
+      })
+      .from(uncertaintyPrediction)
+      .groupBy(uncertaintyPrediction.surface);
+
+    return c.json({ surfaces: rows });
+  });
+
+export { uncertaintyRoutes };
diff --git a/apps/web/src/db/client.ts b/apps/web/src/db/client.ts
index 7c8600c..dfdf102 100644
--- a/apps/web/src/db/client.ts
+++ b/apps/web/src/db/client.ts
@@ -1,8 +1,9 @@
 import { type DrizzleD1Database, drizzle } from "drizzle-orm/d1";
 import * as authSchema from "./schema/auth";
 import * as auditSchema from "./schema/audit";
+import * as uncertaintySchema from "./schema/uncertainty";
 
-const schema = { ...authSchema, ...auditSchema };
+const schema = { ...authSchema, ...auditSchema, ...uncertaintySchema };
 
 export type Database = DrizzleD1Database;
 
diff --git a/apps/web/src/pages/api/[...slug].ts b/apps/web/src/pages/api/[...slug].ts
index f473498..e8e93ef 100644
--- a/apps/web/src/pages/api/[...slug].ts
+++ b/apps/web/src/pages/api/[...slug].ts
@@ -4,6 +4,7 @@ import { Hono } from "hono";
 import { createAuth } from "../../api/auth";
 import { colorRoutes } from "../../api/routes/color";
 import { ctrlRoutes } from "../../api/routes/ctrl/index";
+import { uncertaintyRoutes } from "../../api/routes/uncertainty";
 import { loadSession } from "../../api/middleware/auth";
 import { requestLogger } from "../../lib/logging/middleware";
 import type { HonoEnv } from "../../api/types";
@@ -26,6 +27,7 @@ app.use("/*", loadSession);
 
 app.route("/color", colorRoutes);
 app.route("/ctrl", ctrlRoutes);
+app.route("/uncertainty", uncertaintyRoutes);
 
 const handle: APIRoute = (context) => app.fetch(context.request, env);
 
diff --git a/apps/web/tests/api/routes/uncertainty.test.ts b/apps/web/tests/api/routes/uncertainty.test.ts
new file mode 100644
index 0000000..d090cec
--- /dev/null
+++ b/apps/web/tests/api/routes/uncertainty.test.ts
@@ -0,0 +1,149 @@
+import { describe, expect, it } from "vitest";
+import {
+  brierScore,
+  bucketBounds,
+  bucketIndex,
+  bucketLower,
+  cohortKey,
+} from "../../../src/api/lib/uncertainty/cohort";
+import {
+  insertUncertaintyPredictionSchema,
+  outcomeLabelSchema,
+  predictionStateSchema,
+} from "../../../src/db/schema/uncertainty.zod";
+
+describe("bucketLower", () => {
+  it("floors confidence to 0.1 buckets", () => {
+    expect(bucketLower(0.0)).toBe(0);
+    expect(bucketLower(0.34)).toBeCloseTo(0.3, 5);
+    expect(bucketLower(0.5)).toBeCloseTo(0.5, 5);
+    expect(bucketLower(0.99)).toBeCloseTo(0.9, 5);
+  });
+
+  it("clamps a perfect-confidence call into the top bucket", () => {
+    expect(bucketLower(1)).toBe(0.9);
+  });
+
+  it("clamps negative confidence to the bottom bucket", () => {
+    expect(bucketLower(-0.2)).toBe(0);
+  });
+});
+
+describe("cohortKey", () => {
+  it("composes surface, model, version, and bucket lower bound", () => {
+    expect(cohortKey("rafters.color", "claude-sonnet-4-7", "2026-04-01", 0.82)).toBe(
+      "rafters.color|claude-sonnet-4-7|2026-04-01|0.8",
+    );
+  });
+
+  it("collapses confidences in the same bucket to the same cohort", () => {
+    const a = cohortKey("eavesdrop.classify", "kimi-k2", "v1", 0.71);
+    const b = cohortKey("eavesdrop.classify", "kimi-k2", "v1", 0.79);
+    expect(a).toBe(b);
+  });
+
+  it("separates different surfaces into different cohorts", () => {
+    const a = cohortKey("rafters.color", "m", "v", 0.5);
+    const b = cohortKey("mail.deliverability", "m", "v", 0.5);
+    expect(a).not.toBe(b);
+  });
+});
+
+describe("bucketIndex + bucketBounds", () => {
+  it("indexes 0..9 across [0, 1)", () => {
+    expect(bucketIndex(0)).toBe(0);
+    expect(bucketIndex(0.5)).toBe(5);
+    expect(bucketIndex(0.99)).toBe(9);
+  });
+
+  it("clamps perfect confidence into the top bucket", () => {
+    expect(bucketIndex(1)).toBe(9);
+  });
+
+  it("returns matching [lower, upper) bounds", () => {
+    expect(bucketBounds(0)).toEqual({ lower: 0, upper: 0.1 });
+    const seventh = bucketBounds(7);
+    expect(seventh.lower).toBeCloseTo(0.7, 5);
+    expect(seventh.upper).toBeCloseTo(0.8, 5);
+  });
+
+  it("agrees with bucketLower on each bucket's lower bound", () => {
+    for (let i = 0; i < 10; i++) {
+      const sample = (i + 0.5) / 10;
+      expect(bucketBounds(bucketIndex(sample)).lower).toBe(bucketLower(sample));
+    }
+  });
+});
+
+describe("brierScore", () => {
+  it("returns 0 for an empty cohort", () => {
+    expect(brierScore([], [])).toBe(0);
+  });
+
+  it("scores a perfectly calibrated cohort at 0", () => {
+    expect(brierScore([1, 0, 1], [1, 0, 1])).toBe(0);
+  });
+
+  it("scores a worst-case cohort at 1", () => {
+    expect(brierScore([1, 0], [0, 1])).toBe(1);
+  });
+
+  it("computes mean squared error", () => {
+    // claimed = 0.8, actual = 0.5 -> diff^2 = 0.09 (single-row mean)
+    expect(brierScore([0.8], [0.5])).toBeCloseTo(0.09, 5);
+  });
+});
+
+describe("insertUncertaintyPredictionSchema", () => {
+  it("accepts a well-formed emit", () => {
+    const result = insertUncertaintyPredictionSchema.safeParse({
+      surface: "rafters.color",
+      featureKey: "oklch.lowChromaHighLightness",
+      inputFingerprint: "abc123",
+      model: "claude-sonnet-4-7",
+      modelVersion: "2026-04-01",
+      claimedConfidence: 0.82,
+      predictionPayload: { name: "parchment" },
+      state: "emitted",
+      orphanAfter: new Date(),
+      cohortKey: "rafters.color|claude-sonnet-4-7|2026-04-01|0.8",
+    });
+    expect(result.success).toBe(true);
+  });
+
+  it("rejects out-of-range confidence", () => {
+    const result = insertUncertaintyPredictionSchema.safeParse({
+      surface: "rafters.color",
+      featureKey: "x",
+      inputFingerprint: "y",
+      model: "m",
+      modelVersion: "v",
+      claimedConfidence: 1.5,
+      predictionPayload: {},
+      state: "emitted",
+      orphanAfter: new Date(),
+      cohortKey: "k",
+    });
+    expect(result.success).toBe(false);
+  });
+});
+
+describe("predictionStateSchema", () => {
+  it("accepts the four lifecycle states", () => {
+    for (const s of ["emitted", "witnessed", "orphaned", "retired"] as const) {
+      expect(predictionStateSchema.safeParse(s).success).toBe(true);
+    }
+  });
+
+  it("rejects an unknown state", () => {
+    expect(predictionStateSchema.safeParse("calibrated").success).toBe(false);
+  });
+});
+
+describe("outcomeLabelSchema", () => {
+  it("accepts the per-surface outcome labels", () => {
+    for (const l of ["accepted", "rejected", "edited", "ignored", "custom"] as const) {
+      expect(outcomeLabelSchema.safeParse(l).success).toBe(true);
+    }
+  });
+});