Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion apps/web/src/api/app.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { loadSession } from "./middleware/auth";
import { authRoutes } from "./routes/auth";
import { colorRoutes } from "./routes/color";
import { ctrlRoutes } from "./routes/ctrl/index";
import { uncertaintyRoutes } from "./routes/uncertainty";
import type { HonoEnv } from "./types";

const app = new Hono<HonoEnv>()
Expand All @@ -15,7 +16,8 @@ const app = new Hono<HonoEnv>()
.route("/auth", authRoutes)
.use("/*", loadSession)
.route("/color", colorRoutes)
.route("/ctrl", ctrlRoutes);
.route("/ctrl", ctrlRoutes)
.route("/uncertainty", uncertaintyRoutes);

export type AppType = typeof app;
export default app;
41 changes: 41 additions & 0 deletions apps/web/src/api/lib/uncertainty/cohort.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/** Number of confidence buckets; `bucketIndex` returns values in `0..BUCKET_COUNT - 1`. */
export const BUCKET_COUNT = 10;

/** Width of a single confidence bucket. */
export const BUCKET_WIDTH = 0.1;

/** Milliseconds in one day (24 h * 60 min * 60 s * 1000 ms). */
export const DAY_MS = 24 * 60 * 60 * 1000;

/** Orphan TTL, in days, applied when a caller does not supply its own. */
export const DEFAULT_ORPHAN_TTL_DAYS = 30;

/**
 * Lower bound of the 0.1-wide bucket that contains `confidence`.
 *
 * Values at or above 1 are clamped into the top bucket (0.9) and negative
 * values into the bottom bucket (0). NaN is not special-cased and propagates.
 *
 * @param confidence Claimed confidence, nominally in [0, 1].
 * @returns The bucket's lower bound, one of 0, 0.1, ..., 0.9.
 */
export function bucketLower(confidence: number): number {
  const topBucketLower = 0.9;
  if (confidence >= 1) {
    return topBucketLower;
  }
  // Work in whole tenths so the final division yields the nearest double
  // to the intended decimal (e.g. 3 / 10 === 0.3).
  const tenths = confidence < 0 ? 0 : Math.floor(confidence * 10);
  return tenths / 10;
}

/**
 * Builds the cohort identifier for a prediction: the surface, model, model
 * version and confidence-bucket lower bound joined with `|`.
 *
 * The bucket is rendered with exactly one decimal place, so every confidence
 * that falls in the same bucket yields the same key.
 *
 * @param surface Surface the prediction was emitted from.
 * @param model Model name.
 * @param modelVersion Model version.
 * @param confidence Claimed confidence; bucketed via `bucketLower`.
 * @returns A key of the form `surface|model|modelVersion|0.X`.
 */
export function cohortKey(
  surface: string,
  model: string,
  modelVersion: string,
  confidence: number,
): string {
  const bucketLabel = bucketLower(confidence).toFixed(1);
  return [surface, model, modelVersion, bucketLabel].join("|");
}

/**
 * Mean squared difference between claimed confidences and observed
 * correctness values (a Brier score).
 *
 * NOTE: returns 0 both for an empty cohort and when the two arrays differ in
 * length, so a 0 result on its own does not prove perfect calibration.
 *
 * @param claimed Claimed confidences, one per prediction.
 * @param correctness Observed correctness values, index-aligned with `claimed`.
 * @returns The mean of `(claimed[i] - correctness[i])^2`, or 0 as described above.
 */
export function brierScore(claimed: readonly number[], correctness: readonly number[]): number {
  const sampleCount = claimed.length;
  if (sampleCount === 0 || sampleCount !== correctness.length) {
    return 0;
  }

  let squaredErrorTotal = 0;
  for (const [position, claim] of claimed.entries()) {
    const error = claim - correctness[position];
    squaredErrorTotal += error * error;
  }
  return squaredErrorTotal / sampleCount;
}

/**
 * Zero-based index of the confidence bucket that contains `confidence`.
 *
 * Values at or above 1 are clamped to the last bucket and negative values to
 * the first. NaN is not special-cased and propagates.
 *
 * @param confidence Claimed confidence, nominally in [0, 1].
 * @returns An index in `0..BUCKET_COUNT - 1`.
 */
export function bucketIndex(confidence: number): number {
  const lastIndex = BUCKET_COUNT - 1;
  if (confidence >= 1) {
    return lastIndex;
  }
  return confidence < 0 ? 0 : Math.floor(confidence * BUCKET_COUNT);
}

/**
 * Half-open `[lower, upper)` bounds of the bucket at `index`.
 *
 * Bounds are computed by dividing integers by `BUCKET_COUNT` rather than by
 * multiplying/adding `BUCKET_WIDTH`. Repeated 0.1 arithmetic drifts in binary
 * floating point (`6 * 0.1 === 0.6000000000000001` while `0.5 + 0.1 === 0.6`),
 * which made adjacent buckets disagree on their shared edge and made
 * `lower` differ from what `bucketLower` returns for the same bucket.
 * Dividing integers yields the nearest double to each decimal bound, so
 * `bucketBounds(i).upper === bucketBounds(i + 1).lower` always holds.
 *
 * @param index Zero-based bucket index, as produced by `bucketIndex`.
 * @returns The bucket's lower (inclusive) and upper (exclusive) bounds.
 */
export function bucketBounds(index: number): { lower: number; upper: number } {
  return {
    lower: index / BUCKET_COUNT,
    upper: (index + 1) / BUCKET_COUNT,
  };
}
133 changes: 133 additions & 0 deletions apps/web/src/api/routes/uncertainty.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import { zValidator } from "@hono/zod-validator";
import { and, count, eq, like, sql } from "drizzle-orm";
import { Hono } from "hono";
import { uuidv7 } from "uuidv7";
import { z } from "zod";
import { createDb } from "../../db/client";
import { uncertaintyCalibrationSnapshot, uncertaintyPrediction } from "../../db/schema/uncertainty";
import { outcomeLabelSchema, surfaceSchema } from "../../db/schema/uncertainty.zod";
import { cohortKey, DAY_MS, DEFAULT_ORPHAN_TTL_DAYS } from "../lib/uncertainty/cohort";
import { requireAuth } from "../middleware/auth";
import type { HonoEnv } from "../types";

// Field shapes shared by the request schemas below.
const keyText = z.string().min(1).max(128);
const modelText = z.string().min(1).max(64);
const unitInterval = z.number().min(0).max(1);

/** JSON body accepted by `POST /predictions`. */
const emitBodySchema = z.object({
  surface: surfaceSchema,
  feature_key: keyText,
  input_fingerprint: keyText,
  model: modelText,
  model_version: modelText,
  claimed_confidence: unitInterval,
  prediction_payload: z.unknown(),
  // Optional per-prediction override of the default orphan TTL (whole days, 1..365).
  orphan_ttl_days: z.number().int().min(1).max(365).optional(),
});

/** JSON body accepted by `PUT /predictions/:id/witness`. */
const witnessBodySchema = z.object({
  outcome_label: outcomeLabelSchema,
  outcome_correctness: unitInterval,
  outcome_payload: z.unknown().optional(),
});

/** Query string accepted by `GET /calibration`. */
const calibrationQuerySchema = z.object({
  surface: surfaceSchema,
  model: modelText,
  model_version: modelText,
});

/**
 * Escapes the SQL LIKE metacharacters (`%`, `_`) and the escape character
 * itself so caller-supplied text matches literally inside a LIKE pattern.
 * The resulting pattern must be used together with `ESCAPE '\'`.
 */
function escapeLikeLiteral(value: string): string {
  return value.replace(/[\\%_]/g, "\\$&");
}

/**
 * Uncertainty routes: emit a prediction, witness its outcome, read
 * calibration snapshots for a cohort, and read per-surface state counts.
 * Every route is registered behind `requireAuth`.
 */
const uncertaintyRoutes = new Hono<HonoEnv>()
  .use("*", requireAuth)
  // POST /predictions: record a new prediction in the "emitted" state.
  .post("/predictions", zValidator("json", emitBodySchema), async (c) => {
    const body = c.req.valid("json");
    const db = createDb(c.env.DB);

    const id = uuidv7();
    const now = Date.now();
    const ttlDays = body.orphan_ttl_days ?? DEFAULT_ORPHAN_TTL_DAYS;
    // Epoch-millisecond deadline derived from the (possibly defaulted) TTL.
    const orphanAfter = now + ttlDays * DAY_MS;

    await db.insert(uncertaintyPrediction).values({
      id,
      surface: body.surface,
      featureKey: body.feature_key,
      inputFingerprint: body.input_fingerprint,
      model: body.model,
      modelVersion: body.model_version,
      claimedConfidence: body.claimed_confidence,
      predictionPayload: body.prediction_payload,
      state: "emitted",
      createdAt: new Date(now),
      orphanAfter: new Date(orphanAfter),
      cohortKey: cohortKey(body.surface, body.model, body.model_version, body.claimed_confidence),
    });

    return c.json({ id, orphan_after: orphanAfter }, 201);
  })
  // PUT /predictions/:id/witness: attach an outcome to an emitted prediction.
  .put(
    "/predictions/:id/witness",
    zValidator("param", z.object({ id: z.string().min(1) })),
    zValidator("json", witnessBodySchema),
    async (c) => {
      const { id } = c.req.valid("param");
      const body = c.req.valid("json");
      const db = createDb(c.env.DB);

      const existing = await db
        .select({ state: uncertaintyPrediction.state })
        .from(uncertaintyPrediction)
        .where(eq(uncertaintyPrediction.id, id))
        .limit(1);

      if (existing.length === 0) {
        return c.json({ error: "Prediction not found" }, 404);
      }

      const currentState = existing[0].state;
      if (currentState === "witnessed") {
        return c.json({ error: "Prediction already witnessed" }, 409);
      }
      // The update below only touches rows still in "emitted", so any other
      // state must be rejected here rather than reported as a success.
      if (currentState !== "emitted") {
        return c.json(
          { error: `Prediction is ${currentState} and can no longer be witnessed` },
          409,
        );
      }

      const updated = await db
        .update(uncertaintyPrediction)
        .set({
          state: "witnessed",
          outcomeLabel: body.outcome_label,
          outcomeCorrectness: body.outcome_correctness,
          outcomePayload: body.outcome_payload ?? null,
          witnessedAt: new Date(),
        })
        .where(and(eq(uncertaintyPrediction.id, id), eq(uncertaintyPrediction.state, "emitted")))
        .returning({ id: uncertaintyPrediction.id });

      // Zero rows means the state changed between the read and the write
      // (e.g. a concurrent witness); do not claim this request recorded it.
      if (updated.length === 0) {
        return c.json({ error: "Prediction is no longer awaiting a witness" }, 409);
      }

      return c.json({ id, state: "witnessed" as const });
    },
  )
  // GET /calibration: snapshot rows for one surface/model/version cohort,
  // ordered by bucket lower bound.
  .get("/calibration", zValidator("query", calibrationQuerySchema), async (c) => {
    const q = c.req.valid("query");
    const db = createDb(c.env.DB);

    const cohortPrefix = `${q.surface}|${q.model}|${q.model_version}|`;
    // Escape LIKE wildcards: an unescaped `_` or `%` in a model name or
    // version would otherwise match other cohorts' keys.
    const cohortPattern = `${escapeLikeLiteral(cohortPrefix)}%`;
    const rows = await db
      .select()
      .from(uncertaintyCalibrationSnapshot)
      .where(sql`${like(uncertaintyCalibrationSnapshot.cohortKey, cohortPattern)} ESCAPE '\\'`)
      .orderBy(uncertaintyCalibrationSnapshot.bucketLower);

    return c.json({ buckets: rows });
  })
  // GET /orphans: per-surface counts of predictions by state.
  .get("/orphans", async (c) => {
    const db = createDb(c.env.DB);

    const rows = await db
      .select({
        surface: uncertaintyPrediction.surface,
        // COUNT ignores NULLs, so each CASE counts only rows in that state.
        orphan_count: count(sql`CASE WHEN ${uncertaintyPrediction.state} = 'orphaned' THEN 1 END`),
        emitted_count: count(sql`CASE WHEN ${uncertaintyPrediction.state} = 'emitted' THEN 1 END`),
        witnessed_count: count(
          sql`CASE WHEN ${uncertaintyPrediction.state} = 'witnessed' THEN 1 END`,
        ),
        total: count(),
      })
      .from(uncertaintyPrediction)
      .groupBy(uncertaintyPrediction.surface);

    return c.json({ surfaces: rows });
  });

export { uncertaintyRoutes };
3 changes: 2 additions & 1 deletion apps/web/src/db/client.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import { type DrizzleD1Database, drizzle } from "drizzle-orm/d1";
import * as authSchema from "./schema/auth";
import * as auditSchema from "./schema/audit";
import * as uncertaintySchema from "./schema/uncertainty";

const schema = { ...authSchema, ...auditSchema };
const schema = { ...authSchema, ...auditSchema, ...uncertaintySchema };

export type Database = DrizzleD1Database<typeof schema>;

Expand Down
2 changes: 2 additions & 0 deletions apps/web/src/pages/api/[...slug].ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { Hono } from "hono";
import { createAuth } from "../../api/auth";
import { colorRoutes } from "../../api/routes/color";
import { ctrlRoutes } from "../../api/routes/ctrl/index";
import { uncertaintyRoutes } from "../../api/routes/uncertainty";
import { loadSession } from "../../api/middleware/auth";
import { requestLogger } from "../../lib/logging/middleware";
import type { HonoEnv } from "../../api/types";
Expand All @@ -26,6 +27,7 @@ app.use("/*", loadSession);

app.route("/color", colorRoutes);
app.route("/ctrl", ctrlRoutes);
app.route("/uncertainty", uncertaintyRoutes);

const handle: APIRoute = (context) => app.fetch(context.request, env);

Expand Down
142 changes: 142 additions & 0 deletions apps/web/tests/api/routes/uncertainty.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import { describe, expect, it } from "vitest";
import {
brierScore,
bucketBounds,
bucketIndex,
bucketLower,
cohortKey,
} from "../../../src/api/lib/uncertainty/cohort";
import {
insertUncertaintyPredictionSchema,
outcomeLabelSchema,
predictionStateSchema,
} from "../../../src/db/schema/uncertainty.zod";

// bucketLower: flooring into 0.1-wide buckets plus clamping at both ends.
describe("bucketLower", () => {
  it("floors confidence to 0.1 buckets", () => {
    expect(bucketLower(0.0)).toBe(0);

    // [input confidence, expected bucket lower bound]
    const flooredCases: ReadonlyArray<readonly [number, number]> = [
      [0.34, 0.3],
      [0.5, 0.5],
      [0.99, 0.9],
    ];
    for (const [confidence, expectedLower] of flooredCases) {
      expect(bucketLower(confidence)).toBeCloseTo(expectedLower, 5);
    }
  });

  it("clamps a perfect-confidence call into the top bucket", () => {
    const lower = bucketLower(1);
    expect(lower).toBe(0.9);
  });

  it("clamps negative confidence to the bottom bucket", () => {
    const lower = bucketLower(-0.2);
    expect(lower).toBe(0);
  });
});

// cohortKey: key format, bucket collapsing, and separation by surface.
describe("cohortKey", () => {
  it("composes surface, model, version, and bucket lower bound", () => {
    const key = cohortKey("rafters.color", "claude-sonnet-4-7", "2026-04-01", 0.82);
    expect(key).toBe("rafters.color|claude-sonnet-4-7|2026-04-01|0.8");
  });

  it("collapses confidences in the same bucket to the same cohort", () => {
    // 0.71 and 0.79 both fall in the 0.7 bucket.
    const [lowEnd, highEnd] = [0.71, 0.79].map((confidence) =>
      cohortKey("eavesdrop.classify", "kimi-k2", "v1", confidence),
    );
    expect(lowEnd).toBe(highEnd);
  });

  it("separates different surfaces into different cohorts", () => {
    const keyFor = (surface: string) => cohortKey(surface, "m", "v", 0.5);
    expect(keyFor("rafters.color")).not.toBe(keyFor("mail.deliverability"));
  });
});

// bucketIndex / bucketBounds: index mapping, top-bucket clamping, and bounds.
describe("bucketIndex + bucketBounds", () => {
  it("indexes 0..9 across [0, 1)", () => {
    // [input confidence, expected bucket index]
    const indexCases: ReadonlyArray<readonly [number, number]> = [
      [0, 0],
      [0.5, 5],
      [0.99, 9],
    ];
    for (const [confidence, expectedIndex] of indexCases) {
      expect(bucketIndex(confidence)).toBe(expectedIndex);
    }
  });

  it("clamps perfect confidence into the top bucket", () => {
    const index = bucketIndex(1);
    expect(index).toBe(9);
  });

  it("returns matching [lower, upper) bounds", () => {
    expect(bucketBounds(0)).toEqual({ lower: 0, upper: 0.1 });

    // Compared approximately: the bounds are floating-point values.
    const { lower, upper } = bucketBounds(7);
    expect(lower).toBeCloseTo(0.7, 5);
    expect(upper).toBeCloseTo(0.8, 5);
  });
});

// brierScore: empty input, best case, worst case, and a single-row mean.
describe("brierScore", () => {
  it("returns 0 for an empty cohort", () => {
    const score = brierScore([], []);
    expect(score).toBe(0);
  });

  it("scores a perfectly calibrated cohort at 0", () => {
    const outcomes = [1, 0, 1];
    expect(brierScore(outcomes, outcomes)).toBe(0);
  });

  it("scores a worst-case cohort at 1", () => {
    const claimed = [1, 0];
    const actual = [0, 1];
    expect(brierScore(claimed, actual)).toBe(1);
  });

  it("computes mean squared error", () => {
    // One row: (0.8 - 0.5)^2 = 0.09, and the mean of a single value is itself.
    const score = brierScore([0.8], [0.5]);
    expect(score).toBeCloseTo(0.09, 5);
  });
});

// insertUncertaintyPredictionSchema: one valid emit and one out-of-range confidence.
describe("insertUncertaintyPredictionSchema", () => {
  it("accepts a well-formed emit", () => {
    const wellFormed = {
      surface: "rafters.color",
      featureKey: "oklch.lowChromaHighLightness",
      inputFingerprint: "abc123",
      model: "claude-sonnet-4-7",
      modelVersion: "2026-04-01",
      claimedConfidence: 0.82,
      predictionPayload: { name: "parchment" },
      state: "emitted",
      orphanAfter: new Date(),
      cohortKey: "rafters.color|claude-sonnet-4-7|2026-04-01|0.8",
    };
    const { success } = insertUncertaintyPredictionSchema.safeParse(wellFormed);
    expect(success).toBe(true);
  });

  it("rejects out-of-range confidence", () => {
    // Identical in shape to a valid emit except claimedConfidence exceeds 1.
    const overconfident = {
      surface: "rafters.color",
      featureKey: "x",
      inputFingerprint: "y",
      model: "m",
      modelVersion: "v",
      claimedConfidence: 1.5,
      predictionPayload: {},
      state: "emitted",
      orphanAfter: new Date(),
      cohortKey: "k",
    };
    const { success } = insertUncertaintyPredictionSchema.safeParse(overconfident);
    expect(success).toBe(false);
  });
});

// predictionStateSchema: the four accepted states and one rejected value.
describe("predictionStateSchema", () => {
  it("accepts the four lifecycle states", () => {
    const lifecycleStates = ["emitted", "witnessed", "orphaned", "retired"] as const;
    lifecycleStates.forEach((state) => {
      const { success } = predictionStateSchema.safeParse(state);
      expect(success).toBe(true);
    });
  });

  it("rejects an unknown state", () => {
    const { success } = predictionStateSchema.safeParse("calibrated");
    expect(success).toBe(false);
  });
});

// outcomeLabelSchema: every listed outcome label parses successfully.
describe("outcomeLabelSchema", () => {
  it("accepts the per-surface outcome labels", () => {
    const outcomeLabels = ["accepted", "rejected", "edited", "ignored", "custom"] as const;
    outcomeLabels.forEach((label) => {
      const { success } = outcomeLabelSchema.safeParse(label);
      expect(success).toBe(true);
    });
  });
});
Loading