From 5904347d33f5466e3c5a8be770875c5b6879eac9 Mon Sep 17 00:00:00 2001 From: theu Date: Thu, 4 Dec 2025 22:11:09 -0300 Subject: [PATCH 01/10] remove openai key from env, fix packages --- package-lock.json | 91 +++++++++++++++++++++++++++-------------------- package.json | 1 + src/env.ts | 1 - 3 files changed, 54 insertions(+), 39 deletions(-) diff --git a/package-lock.json b/package-lock.json index 82fb721..bc3217e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1242,9 +1242,9 @@ } }, "node_modules/@modelcontextprotocol/sdk": { - "version": "1.21.1", - "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.21.1.tgz", - "integrity": "sha512-UyLFcJLDvUuZbGnaQqXFT32CpPpGj7VS19roLut6gkQVhb439xUzYWbsUvdI3ZPL+2hnFosuugtYWE0Mcs1rmQ==", + "version": "1.24.1", + "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.24.1.tgz", + "integrity": "sha512-YTg4v6bKSst8EJM8NXHC3nGm8kgHD08IbIBbognUeLAgGLVgLpYrgQswzLQd4OyTL4l614ejhqsDrV1//t02Qw==", "license": "MIT", "dependencies": { "ajv": "^8.17.1", @@ -1256,20 +1256,25 @@ "eventsource-parser": "^3.0.0", "express": "^5.0.1", "express-rate-limit": "^7.5.0", + "jose": "^6.1.1", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", - "zod": "^3.23.8", - "zod-to-json-schema": "^3.24.1" + "zod": "^3.25 || ^4.0", + "zod-to-json-schema": "^3.25.0" }, "engines": { "node": ">=18" }, "peerDependencies": { - "@cfworker/json-schema": "^4.1.1" + "@cfworker/json-schema": "^4.1.1", + "zod": "^3.25 || ^4.0" }, "peerDependenciesMeta": { "@cfworker/json-schema": { "optional": true + }, + "zod": { + "optional": false } } }, @@ -1303,23 +1308,27 @@ } }, "node_modules/@modelcontextprotocol/sdk/node_modules/body-parser": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.0.tgz", - "integrity": "sha512-02qvAaxv8tp7fBa/mw1ga98OGm+eCbqzJOKoRt70sLmfEEi+jyBYVTDGfCL/k06/4EMk/z01gCe7HoCH/f2LTg==", + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.1.tgz", + "integrity": "sha512-nfDwkulwiZYQIGwxdy0RUmowMhKcFVcYXUU7m4QlKYim1rUtg83xm2yjZ40QjDuc291AJjjeSc9b++AWHSgSHw==", "license": "MIT", "dependencies": { "bytes": "^3.1.2", "content-type": "^1.0.5", - "debug": "^4.4.0", + "debug": "^4.4.3", "http-errors": "^2.0.0", - "iconv-lite": "^0.6.3", + "iconv-lite": "^0.7.0", "on-finished": "^2.4.1", "qs": "^6.14.0", - "raw-body": "^3.0.0", - "type-is": "^2.0.0" + "raw-body": "^3.0.1", + "type-is": "^2.0.1" }, "engines": { "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/@modelcontextprotocol/sdk/node_modules/content-disposition": { @@ -1412,15 +1421,19 @@ } }, "node_modules/@modelcontextprotocol/sdk/node_modules/iconv-lite": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", - "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "version": "0.7.0", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.0.tgz", + "integrity": "sha512-cf6L2Ds3h57VVmkZe+Pn+5APsT7FpqJtEhhieDCvrE2MK5Qk9MyffgQyuxQTm6BChfeZNtcOLHp9IcWRVcIcBQ==", "license": "MIT", "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" }, "engines": { "node": ">=0.10.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/@modelcontextprotocol/sdk/node_modules/json-schema-traverse": { @@ -1510,22 +1523,6 @@ "node": ">= 0.10" } }, - 
"node_modules/@modelcontextprotocol/sdk/node_modules/raw-body/node_modules/iconv-lite": { - "version": "0.7.0", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.0.tgz", - "integrity": "sha512-cf6L2Ds3h57VVmkZe+Pn+5APsT7FpqJtEhhieDCvrE2MK5Qk9MyffgQyuxQTm6BChfeZNtcOLHp9IcWRVcIcBQ==", - "license": "MIT", - "dependencies": { - "safer-buffer": ">= 2.1.2 < 3.0.0" - }, - "engines": { - "node": ">=0.10.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/express" - } - }, "node_modules/@modelcontextprotocol/sdk/node_modules/send": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/send/-/send-1.2.0.tgz", @@ -3236,9 +3233,9 @@ } }, "node_modules/drizzle-kit": { - "version": "0.31.6", - "resolved": "https://registry.npmjs.org/drizzle-kit/-/drizzle-kit-0.31.6.tgz", - "integrity": "sha512-/B4e/4pwnx25QwD5xXgdpo1S+077a2VZdosXbItE/oNmUgQwZydGDz9qJYmnQl/b+5IX0rLfwRhrPnroGtrg8Q==", + "version": "0.31.7", + "resolved": "https://registry.npmjs.org/drizzle-kit/-/drizzle-kit-0.31.7.tgz", + "integrity": "sha512-hOzRGSdyKIU4FcTSFYGKdXEjFsncVwHZ43gY3WU5Bz9j5Iadp6Rh6hxLSQ1IWXpKLBKt/d5y1cpSPcV+FcoQ1A==", "dev": true, "license": "MIT", "dependencies": { @@ -4438,6 +4435,15 @@ "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", "license": "ISC" }, + "node_modules/jose": { + "version": "6.1.3", + "resolved": "https://registry.npmjs.org/jose/-/jose-6.1.3.tgz", + "integrity": "sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/panva" + } + }, "node_modules/js-tiktoken": { "version": "1.0.21", "resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.21.tgz", @@ -4449,9 +4455,9 @@ } }, "node_modules/js-yaml": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", - "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz", + "integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==", "dev": true, "license": "MIT", "dependencies": { @@ -6801,6 +6807,15 @@ "funding": { "url": "https://github.com/sponsors/colinhacks" } + }, + "node_modules/zod-to-json-schema": { + "version": "3.25.0", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.25.0.tgz", + "integrity": "sha512-HvWtU2UG41LALjajJrML6uQejQhNJx+JBO9IflpSja4R03iNWfKXrj6W2h7ljuLyc1nKS+9yDyL/9tD1U/yBnQ==", + "license": "ISC", + "peerDependencies": { + "zod": "^3.25 || ^4" + } } } } diff --git a/package.json b/package.json index a84d5fa..6195eca 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,7 @@ "clean": "rm -rf .turbo node_modules dist", "dev": "tsx ./src/index.ts", "test:scraper": "tsx ./src/test/scraper.pipeline.ts", + "test:enhancement": "tsx ./src/test/test-enhancement.ts", "mcp:playwright": "npx @playwright/mcp@latest --config ./playwright-mcp.config.json --headless --port 8931", "format": "prettier --check . --cache", "format-fix": "prettier --write . 
--cache", diff --git a/src/env.ts b/src/env.ts index 8997d60..3472d41 100644 --- a/src/env.ts +++ b/src/env.ts @@ -11,7 +11,6 @@ export const env = createEnv({ SEARCHAPI_API_KEY: z.string(), OPENROUTER_API_KEY: z.string(), SCRAPER_API: z.string().optional(), - OPENAI_API_KEY: z.string(), SCRAPERAPI_KEY: z.string().optional(), SCRAPER_BUDGET_CREDITS: z.coerce.number().positive().optional(), SCRAPERAPI_PROXY_SERVER: z.string().optional(), From 56b6665769ca823e0310e45f38fd2c948121e984 Mon Sep 17 00:00:00 2001 From: theu Date: Fri, 5 Dec 2025 14:46:34 -0300 Subject: [PATCH 02/10] add scraper and navigation enhancements --- src/enhancement/enhancer.ts | 340 ++++++++++++++++++++++++++++++++++++ src/validator.ts | 127 +++++++++++++- 2 files changed, 465 insertions(+), 2 deletions(-) create mode 100644 src/enhancement/enhancer.ts diff --git a/src/enhancement/enhancer.ts b/src/enhancement/enhancer.ts new file mode 100644 index 0000000..5ce5c75 --- /dev/null +++ b/src/enhancement/enhancer.ts @@ -0,0 +1,340 @@ +import { logWithContext } from "../logger.js"; +import type { SearchResult } from "../search/searchapi.js"; +import type { Judgment } from "../llm/result-judge.js"; +import { + fetchWithEscalation, + looksBlocked, +} from "../scraper/scraper.fetch.js"; +import { + stripHtml, + chunkText, + findDataDownloadLinks, +} from "../scraper/scraper.text.js"; +import { + summarizeRankedChunks, + type RankedForSource, +} from "../scraper/scraper.summarize.js"; +import { rankByEmbedding } from "../llm/embeddings.js"; +import { createBrowserEnv } from "../browser/playwright/browserEnv.js"; +import { runOccamBrowserLoop } from "../browser/playwright/occamBrowserLoop.js"; +import { type BrowserEnv } from "../browser/playwright/types.js"; + +export interface EnhancementDecision { + shouldEnhance: boolean; + method: "none" | "scraper" | "navigator"; + reason: string; +} + +export interface EnhancementResult { + answer: string; + citations: Array<{ url: string; quotes: string[] }>; + status: "answered" | "insufficient" | "ambiguous"; + method: "scraper" | "navigator"; + creditsSpent?: number; + stepsUsed?: number; +} + +export interface EnhancementConfig { + scraper: { + totalBudgetCredits: number; + perUrlBudgetCredits: number; + maxUrls: number; + }; + navigator: { + maxSteps: number; + timeoutMs: number; + }; +} + +/** + * Decide if and how to enhance search results using scraper or navigator + * Uses heuristics to minimize costs while maximizing evidence quality + */ +export function decideEnhancement( + predictionText: string, + searchResults: SearchResult[], + judgment: Judgment, +): EnhancementDecision { + // Rule 1: If we already have strong evidence, no enhancement needed + if (judgment.sufficient && judgment.score >= 8) { + return { + shouldEnhance: false, + method: "none", + reason: "Strong evidence already found in search results", + }; + } + + // Rule 2: If no results at all, try navigation (might need form interaction) + if (searchResults.length === 0) { + return { + shouldEnhance: true, + method: "navigator", + reason: "No search results - prediction may require interactive exploration", + }; + } + + // Rule 3: Check if results are paywalled/blocked + const blockedCount = searchResults.filter( + (r) => + r.excerpt.toLowerCase().includes("subscribe") || + r.excerpt.toLowerCase().includes("sign in to read") || + r.excerpt.toLowerCase().includes("login to continue") || + r.excerpt.toLowerCase().includes("create account"), + ).length; + + if (blockedCount > searchResults.length / 2) { + return { + 
shouldEnhance: true,
+      method: "scraper",
+      reason: "Many results appear paywalled - scraper may bypass",
+    };
+  }
+
+  // Rule 4: Check if prediction needs historical/tabular data
+  const needsHistoricalData =
+    /\b(on|by|before|after|in)\s+\d{4}\b/.test(predictionText) ||
+    /\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b/i.test(
+      predictionText,
+    ) ||
+    /(historical|past|previous|last year|yesterday|archive)/.test(
+      predictionText.toLowerCase(),
+    );
+
+  if (needsHistoricalData && !judgment.sufficient) {
+    return {
+      shouldEnhance: true,
+      method: "navigator",
+      reason:
+        "Prediction requires historical data - navigation can find archives/tables",
+    };
+  }
+
+  // Rule 5: Check if we're close but need better content (marginal evidence)
+  if (judgment.score >= 4 && judgment.score <= 7 && !judgment.sufficient) {
+    return {
+      shouldEnhance: true,
+      method: "scraper",
+      reason: "Marginal evidence found - deeper scraping may find specifics",
+    };
+  }
+
+  // Rule 6: If judgment suggests we need more context and score is low
+  if (!judgment.sufficient && judgment.score < 5) {
+    return {
+      shouldEnhance: true,
+      method: "scraper",
+      reason: "Insufficient evidence with low score - try deeper content extraction",
+    };
+  }
+
+  // Default: no enhancement
+  return {
+    shouldEnhance: false,
+    method: "none",
+    reason: "No clear benefit to enhancement given search results",
+  };
+}
+
+/**
+ * Enhance search results using scraper module
+ * Fetches full page content with JS rendering, extracts text, ranks by relevance
+ */
+export async function enhanceWithScraper(
+  predictionId: string,
+  predictionText: string,
+  urls: string[],
+  config: EnhancementConfig,
+): Promise<EnhancementResult | null> {
+  const budget = {
+    remaining: config.scraper.totalBudgetCredits,
+  };
+  let totalSpent = 0;
+
+  logWithContext(predictionId, "Starting scraper enhancement...");
+
+  const focus = `${predictionText} — exact numeric/date/time context`;
+  const allRanked: RankedForSource[] = [];
+
+  const urlsToFetch = urls.slice(0, config.scraper.maxUrls);
+
+  for (const url of urlsToFetch) {
+    if (budget.remaining <= 0) {
+      logWithContext(predictionId, "Scraper budget exhausted");
+      break;
+    }
+
+    try {
+      const perUrlBudget = {
+        remaining: Math.min(
+          config.scraper.perUrlBudgetCredits,
+          budget.remaining,
+        ),
+      };
+      const beforeBudget = perUrlBudget.remaining;
+
+      logWithContext(
+        predictionId,
+        `Fetching ${url} with budget ${perUrlBudget.remaining}`,
+      );
+
+      const result = await fetchWithEscalation(url, perUrlBudget, (html, meta) => {
+        if (meta.status === 200) return true;
+        const { blocked } = looksBlocked(meta.url, html);
+        return !blocked;
+      });
+
+      const spent = beforeBudget - perUrlBudget.remaining;
+      totalSpent += spent;
+      budget.remaining -= spent;
+
+      logWithContext(
+        predictionId,
+        `Fetched ${url}: ${result.html.length} chars, spent ${spent} credits`,
+      );
+
+      const text = stripHtml(result.html);
+      let chunks = chunkText(text, 1000).slice(0, 20);
+
+      // Try to find downloadable data links
+      const dataLinks = findDataDownloadLinks(result.html, url).slice(0, 2);
+      for (const dlink of dataLinks) {
+        try {
+          const dataResult = await fetchWithEscalation(dlink, perUrlBudget);
+          const dataText = stripHtml(dataResult.html);
+          const dataChunks = chunkText(dataText, 1200).slice(0, 3);
+          chunks = chunks.concat(dataChunks);
+        } catch (e) {
+          logWithContext(
+            predictionId,
+            `Failed to fetch data link ${dlink}: ${e instanceof Error ? e.message : String(e)}`,
+          );
+        }
+      }
+
+      // Recompute after data-link fetches so their credits are also counted
+      const dataSpent = beforeBudget - perUrlBudget.remaining - spent;
+      totalSpent += dataSpent;
+      budget.remaining -= dataSpent;
+
+      const ranked = await rankByEmbedding(focus, chunks);
+      allRanked.push({
+        url,
+        chunks: ranked.map((r) => ({ text: r.text, score: r.score })),
+      });
+
+      logWithContext(
+        predictionId,
+        `Ranked ${ranked.length} chunks for ${url}, top score: ${ranked[0]?.score.toFixed(3)}`,
+      );
+    } catch (e) {
+      const msg = e instanceof Error ? e.message : String(e);
+      logWithContext(predictionId, `Failed to fetch ${url}: ${msg}`);
+    }
+  }
+
+  if (allRanked.length === 0) {
+    logWithContext(predictionId, "No content extracted from any URL");
+    return null;
+  }
+
+  // Summarize all ranked chunks
+  try {
+    const now = new Date();
+    const anchorISO = now.toISOString().slice(0, 10);
+    const summary = await summarizeRankedChunks(
+      predictionText,
+      allRanked,
+      anchorISO,
+    );
+
+    logWithContext(
+      predictionId,
+      `Scraper enhancement: ${summary.status}, ${summary.citations.length} citations`,
+    );
+
+    return {
+      answer: summary.answer || "",
+      citations: summary.citations,
+      status: summary.status,
+      method: "scraper",
+      creditsSpent: totalSpent,
+    };
+  } catch (e) {
+    const msg = e instanceof Error ? e.message : String(e);
+    logWithContext(predictionId, `Scraper summarization failed: ${msg}`);
+    return null;
+  }
+}
+
+/**
+ * Enhance using browser navigation
+ * Launches browser and autonomously navigates to find evidence
+ */
+export async function enhanceWithNavigator(
+  predictionId: string,
+  predictionText: string,
+  startUrl: string,
+  config: EnhancementConfig,
+): Promise<EnhancementResult | null> {
+  logWithContext(predictionId, "Starting navigator enhancement...");
+
+  let env: BrowserEnv | null = null;
+
+  try {
+    env = await createBrowserEnv({
+      homeUrl: startUrl,
+      headless: true,
+      useScraperProxy: false,
+      debug: false,
+    });
+
+    const goal = `Find evidence to validate: "${predictionText}"`;
+
+    logWithContext(predictionId, `Navigator goal: ${goal}`);
+
+    const result = await runOccamBrowserLoop({
+      env,
+      goal,
+      initialUrl: startUrl,
+      maxSteps: config.navigator.maxSteps,
+      debug: false,
+    });
+
+    logWithContext(
+      predictionId,
+      `Navigator completed: ${result.steps} steps, action: ${result.lastAction?.action}`,
+    );
+
+    if (result.lastAction?.action === "stop") {
+      // Extract answer and try to build citations from final state
+      const answer = result.lastAction.answer || "No answer found";
+
+      // Extract text from final page to build pseudo-citations
+      const finalText = stripHtml(result.finalRawState.domHtml);
+      const chunks = chunkText(finalText, 800).slice(0, 5);
+
+      return {
+        answer,
+        citations: [
+          {
+            url: result.finalRawState.url,
+            quotes: chunks.slice(0, 2),
+          },
+        ],
+        status: answer !== "No answer found" ? "answered" : "insufficient",
+        method: "navigator",
+        stepsUsed: result.steps,
+      };
+    }
+
+    logWithContext(predictionId, "Navigator did not reach stop action");
+    return null;
+  } catch (e) {
+    const msg = e instanceof Error ?
e.message : String(e); + logWithContext(predictionId, `Navigator enhancement failed: ${msg}`); + return null; + } finally { + if (env) { + try { + await env.close(); + } catch (e) { + logWithContext(predictionId, "Failed to close browser"); + } + } + } +} diff --git a/src/validator.ts b/src/validator.ts index 252ef56..b49f5ef 100644 --- a/src/validator.ts +++ b/src/validator.ts @@ -15,6 +15,13 @@ import { QueryEnhancer, type PastAttempt } from "./llm/query-enhancer.js"; import { ResultJudge } from "./llm/result-judge.js"; import { truncateText, writeCostLog } from "./utils.js"; import { logWithContext, logErrorWithContext } from "./logger.js"; +import { + decideEnhancement, + enhanceWithScraper, + enhanceWithNavigator, + type EnhancementResult, + type EnhancementConfig, +} from "./enhancement/enhancer.js"; export const ValidationOutcome = z.enum([ "MaturedTrue", @@ -74,10 +81,17 @@ const VALIDATION_CONFIG = { TRUE_DEFINITIVE_MIN: 9, FALSE_DEFINITIVE_MAX: 2, }, + enhancement: { + SCRAPER_TOTAL_BUDGET_CREDITS: 50, + SCRAPER_PER_URL_BUDGET_CREDITS: 15, + SCRAPER_MAX_URLS: 3, + NAVIGATOR_MAX_STEPS: 10, + NAVIGATOR_TIMEOUT_MS: 120_000, // 2 minutes + }, } as const; export class Validator { - constructor(_db: DB) {} + constructor(_db: DB) { } /** * Check if a prediction should be validated before doing expensive operations @@ -461,6 +475,19 @@ export class Validator { const queryEnhancer = new QueryEnhancer(); const resultJudge = new ResultJudge(); + // Enhancement configuration + const enhancementConfig: EnhancementConfig = { + scraper: { + totalBudgetCredits: VALIDATION_CONFIG.enhancement.SCRAPER_TOTAL_BUDGET_CREDITS, + perUrlBudgetCredits: VALIDATION_CONFIG.enhancement.SCRAPER_PER_URL_BUDGET_CREDITS, + maxUrls: VALIDATION_CONFIG.enhancement.SCRAPER_MAX_URLS, + }, + navigator: { + maxSteps: VALIDATION_CONFIG.enhancement.NAVIGATOR_MAX_STEPS, + timeoutMs: VALIDATION_CONFIG.enhancement.NAVIGATOR_TIMEOUT_MS, + }, + }; + logWithContext(predictionId, "Starting hybrid validation"); logWithContext( predictionId, @@ -502,11 +529,41 @@ export class Validator { `Total results found: ${combinedResults.length}`, ); + // Early enhancement check: if no results, try navigator immediately if (combinedResults.length === 0) { + logWithContext( + predictionId, + "No search results - attempting navigator enhancement", + ); + const navResult = await enhanceWithNavigator( + predictionId, + predictionText, + "https://www.google.com", + enhancementConfig, + ); + + if (navResult && navResult.status === "answered") { + logWithContext( + predictionId, + "Navigator found evidence when search failed", + ); + return { + prediction_id: predictionId, + outcome: "MaturedTrue", + proof: `Navigator enhancement: ${navResult.answer}`, + sources: navResult.citations.map((c) => ({ + url: c.url, + title: "Navigator-discovered evidence", + pub_date: null, + excerpt: c.quotes[0] || "", + })), + }; + } + return { prediction_id: predictionId, outcome: "MissingContext", - proof: "No search results found", + proof: "No search results found and navigator enhancement failed", sources: [], }; } @@ -589,6 +646,72 @@ export class Validator { ); } + // Enhancement decision: try scraper or navigator if still insufficient + let enhancementResult: EnhancementResult | null = null; + const enhancementDecision = decideEnhancement( + predictionText, + combinedResults, + judgment, + ); + + logWithContext( + predictionId, + `Enhancement decision: ${enhancementDecision.method} - ${enhancementDecision.reason}`, + ); + + if 
(enhancementDecision.shouldEnhance) { + if (enhancementDecision.method === "scraper") { + const urls = combinedResults.slice(0, 3).map((r) => r.url); + enhancementResult = await enhanceWithScraper( + predictionId, + predictionText, + urls, + enhancementConfig, + ); + } else if (enhancementDecision.method === "navigator") { + const startUrl = + combinedResults[0]?.url || "https://www.google.com"; + enhancementResult = await enhanceWithNavigator( + predictionId, + predictionText, + startUrl, + enhancementConfig, + ); + } + + if (enhancementResult && enhancementResult.status === "answered") { + logWithContext( + predictionId, + `Enhancement succeeded via ${enhancementResult.method}`, + ); + const result = enhancementResult; // Type narrowing for closure + // Override judgment with enhancement result + judgment = { + ...judgment, + decision: "TRUE", + score: 8, + summary: result.answer, + evidence: result.citations + .map((c) => `${c.url}: ${c.quotes.slice(0, 1).join("; ")}`) + .join("\n"), + sufficient: true, + }; + // Add enhancement sources to results + const enhancementSources = result.citations.map((c) => ({ + url: c.url, + title: `Enhanced via ${result.method}`, + pub_date: null, + excerpt: c.quotes[0] || "", + })); + combinedResults = [...enhancementSources, ...combinedResults]; + } else { + logWithContext( + predictionId, + `Enhancement failed or returned insufficient data`, + ); + } + } + let outcome: ValidationResult["outcome"]; if (judgment.decision === "TRUE") { From 731826b8951844c8a9500a163205bb7ee2b5b850 Mon Sep 17 00:00:00 2001 From: theu Date: Fri, 5 Dec 2025 16:33:34 -0300 Subject: [PATCH 03/10] enhancement test script --- src/test/test-enhancement.ts | 147 +++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 src/test/test-enhancement.ts diff --git a/src/test/test-enhancement.ts b/src/test/test-enhancement.ts new file mode 100644 index 0000000..720bcab --- /dev/null +++ b/src/test/test-enhancement.ts @@ -0,0 +1,147 @@ +import "dotenv/config"; +import { + enhanceWithScraper, + enhanceWithNavigator, + type EnhancementConfig, +} from "../enhancement/enhancer.js"; + +/** + * Test enhancement methods directly + * Usage: + * npm run test:enhancement scraper "ENS moving to L2" url1 url2 url3 + * npm run test:enhancement navigator "Bitcoin price on Jan 1 2024" https://google.com + */ + +async function testEnhancement() { + const method = process.argv[2]; // "scraper" or "navigator" + const predictionText = process.argv[3]; + + if (!method || !predictionText) { + console.error("โŒ Usage:"); + console.error(' Scraper: npm run test:enhancement scraper "prediction text" url1 url2 url3'); + console.error(' Navigator: npm run test:enhancement navigator "prediction text" https://start-url.com'); + process.exit(1); + } + + const predictionId = "test-" + Date.now(); + + const config: EnhancementConfig = { + scraper: { + totalBudgetCredits: 50, + perUrlBudgetCredits: 15, + maxUrls: 3, + }, + navigator: { + maxSteps: 10, + timeoutMs: 120_000, + }, + }; + + console.log("=== Enhancement Test ===\n"); + console.log(`Method: ${method}`); + console.log(`Prediction: "${predictionText}"`); + console.log(); + + try { + if (method === "scraper") { + const urls = process.argv.slice(4); + if (urls.length === 0) { + console.error("โŒ No URLs provided for scraper test"); + console.error(" Provide at least one URL to scrape"); + process.exit(1); + } + + console.log(`URLs to scrape: ${urls.length}`); + urls.forEach((url, i) => console.log(` ${i + 1}. 
${url}`)); + console.log(); + + console.log("๐Ÿ”ง Starting scraper enhancement...\n"); + console.log("=".repeat(60)); + + const result = await enhanceWithScraper( + predictionId, + predictionText, + urls, + config, + ); + + console.log("=".repeat(60)); + console.log(); + + if (result) { + console.log("โœ… Scraper Enhancement Result:"); + console.log(` Status: ${result.status}`); + console.log(` Credits spent: ${result.creditsSpent || 0}`); + console.log(` Answer: ${result.answer.slice(0, 300)}${result.answer.length > 300 ? "..." : ""}`); + console.log(` Citations: ${result.citations.length}`); + if (result.citations.length > 0) { + console.log("\n Citations:"); + result.citations.forEach((c, i) => { + console.log(` ${i + 1}. ${c.url}`); + console.log(` Quotes: ${c.quotes.length}`); + c.quotes.slice(0, 2).forEach((q, j) => { + console.log(` ${j + 1}. "${q.slice(0, 100)}${q.length > 100 ? "..." : ""}"`); + }); + }); + } + } else { + console.log("โŒ Scraper enhancement failed"); + } + } else if (method === "navigator") { + const startUrl = process.argv[4] || "https://www.google.com"; + + console.log(`Start URL: ${startUrl}`); + console.log(); + + console.log("๐Ÿ”ง Starting navigator enhancement...\n"); + console.log("=".repeat(60)); + + const result = await enhanceWithNavigator( + predictionId, + predictionText, + startUrl, + config, + ); + + console.log("=".repeat(60)); + console.log(); + + if (result) { + console.log("โœ… Navigator Enhancement Result:"); + console.log(` Status: ${result.status}`); + console.log(` Steps used: ${result.stepsUsed || 0}`); + console.log(` Answer: ${result.answer.slice(0, 300)}${result.answer.length > 300 ? "..." : ""}`); + console.log(` Citations: ${result.citations.length}`); + if (result.citations.length > 0) { + console.log("\n Citations:"); + result.citations.forEach((c, i) => { + console.log(` ${i + 1}. ${c.url}`); + console.log(` Quotes: ${c.quotes.length}`); + c.quotes.slice(0, 2).forEach((q, j) => { + console.log(` ${j + 1}. "${q.slice(0, 100)}${q.length > 100 ? "..." 
: ""}"`); + }); + }); + } + } else { + console.log("โŒ Navigator enhancement failed"); + } + } else { + console.error(`โŒ Unknown method: ${method}`); + console.error(" Use 'scraper' or 'navigator'"); + process.exit(1); + } + } catch (error) { + console.error("\nโŒ Fatal error:", error); + if (error instanceof Error && error.stack) { + console.error(error.stack); + } + process.exit(1); + } + + process.exit(0); +} + +testEnhancement().catch((error) => { + console.error("โŒ Fatal error:", error); + process.exit(1); +}); From 299a0ca6a79a5ac83981efedbab6dbae3f181acf Mon Sep 17 00:00:00 2001 From: theu Date: Fri, 5 Dec 2025 16:46:19 -0300 Subject: [PATCH 04/10] bugfixes, add enhancement cost logging --- src/enhancement/enhancer.ts | 51 +++++++++++++++++++++++++++--------- src/test/test-enhancement.ts | 1 - src/utils.ts | 3 +++ src/validator.ts | 38 +++++++++++++++------------ 4 files changed, 62 insertions(+), 31 deletions(-) diff --git a/src/enhancement/enhancer.ts b/src/enhancement/enhancer.ts index 5ce5c75..1d545fc 100644 --- a/src/enhancement/enhancer.ts +++ b/src/enhancement/enhancer.ts @@ -1,10 +1,7 @@ import { logWithContext } from "../logger.js"; import type { SearchResult } from "../search/searchapi.js"; import type { Judgment } from "../llm/result-judge.js"; -import { - fetchWithEscalation, - looksBlocked, -} from "../scraper/scraper.fetch.js"; +import { fetchWithEscalation, looksBlocked } from "../scraper/scraper.fetch.js"; import { stripHtml, chunkText, @@ -19,6 +16,39 @@ import { createBrowserEnv } from "../browser/playwright/browserEnv.js"; import { runOccamBrowserLoop } from "../browser/playwright/occamBrowserLoop.js"; import { type BrowserEnv } from "../browser/playwright/types.js"; +/** + * Check if a search result excerpt suggests blocked/paywalled content. 
+ */ +function excerptLooksBlocked(excerpt: string): boolean { + const lower = excerpt.toLowerCase(); + + // Subscription/paywall indicators + const paywallKeywords = [ + "subscribe", + "sign in to read", + "login to continue", + "create account", + "members only", + "premium content", + "exclusive access", + "unlock this article", + "start your free trial", + "paid subscribers", + ]; + + // Bot-wall indicators that might appear in excerpts + const botWallKeywords = [ + "access denied", + "please enable javascript", + "checking your browser", + "just a moment", + "verify you are human", + ]; + + const allKeywords = [...paywallKeywords, ...botWallKeywords]; + return allKeywords.some((keyword) => lower.includes(keyword)); +} + export interface EnhancementDecision { shouldEnhance: boolean; method: "none" | "scraper" | "navigator"; @@ -42,7 +72,6 @@ export interface EnhancementConfig { }; navigator: { maxSteps: number; - timeoutMs: number; }; } @@ -73,20 +102,16 @@ export function decideEnhancement( }; } - // Rule 3: Check if results are paywalled/blocked - const blockedCount = searchResults.filter( - (r) => - r.excerpt.toLowerCase().includes("subscribe") || - r.excerpt.toLowerCase().includes("sign in to read") || - r.excerpt.toLowerCase().includes("login to continue") || - r.excerpt.toLowerCase().includes("create account"), + // Rule 3: Check if results are paywalled/blocked using consolidated detection + const blockedCount = searchResults.filter((r) => + excerptLooksBlocked(r.excerpt), ).length; if (blockedCount > searchResults.length / 2) { return { shouldEnhance: true, method: "scraper", - reason: "Many results appear paywalled - scraper may bypass", + reason: "Many results appear paywalled/blocked - scraper may bypass", }; } diff --git a/src/test/test-enhancement.ts b/src/test/test-enhancement.ts index 720bcab..74f77b1 100644 --- a/src/test/test-enhancement.ts +++ b/src/test/test-enhancement.ts @@ -33,7 +33,6 @@ async function testEnhancement() { }, navigator: { maxSteps: 10, - timeoutMs: 120_000, }, }; diff --git a/src/utils.ts b/src/utils.ts index 7967832..e089a19 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -34,6 +34,9 @@ export interface CostLogEntry { totalOutputTokens: number; outcome: string; timestamp: string; + enhancementMethod?: "scraper" | "navigator" | "none" | undefined; + scraperCreditsSpent?: number | undefined; + navigatorStepsUsed?: number | undefined; } /** diff --git a/src/validator.ts b/src/validator.ts index b49f5ef..7bc544d 100644 --- a/src/validator.ts +++ b/src/validator.ts @@ -86,7 +86,6 @@ const VALIDATION_CONFIG = { SCRAPER_PER_URL_BUDGET_CREDITS: 15, SCRAPER_MAX_URLS: 3, NAVIGATOR_MAX_STEPS: 10, - NAVIGATOR_TIMEOUT_MS: 120_000, // 2 minutes }, } as const; @@ -484,7 +483,6 @@ export class Validator { }, navigator: { maxSteps: VALIDATION_CONFIG.enhancement.NAVIGATOR_MAX_STEPS, - timeoutMs: VALIDATION_CONFIG.enhancement.NAVIGATOR_TIMEOUT_MS, }, }; @@ -684,26 +682,29 @@ export class Validator { predictionId, `Enhancement succeeded via ${enhancementResult.method}`, ); - const result = enhancementResult; // Type narrowing for closure - // Override judgment with enhancement result - judgment = { - ...judgment, - decision: "TRUE", - score: 8, - summary: result.answer, - evidence: result.citations - .map((c) => `${c.url}: ${c.quotes.slice(0, 1).join("; ")}`) - .join("\n"), - sufficient: true, - }; - // Add enhancement sources to results - const enhancementSources = result.citations.map((c) => ({ + // Add enhancement sources to results for re-evaluation + const 
method = enhancementResult.method;
+        const enhancementSources = enhancementResult.citations.map((c) => ({
           url: c.url,
-          title: `Enhanced via ${result.method}`,
+          title: `Enhanced via ${method}`,
           pub_date: null,
           excerpt: c.quotes[0] || "",
         }));
         combinedResults = [...enhancementSources, ...combinedResults];
+
+        // Re-evaluate with enhanced evidence using ResultJudge
+        logWithContext(
+          predictionId,
+          `Re-evaluating with ${enhancementSources.length} enhanced sources`,
+        );
+        judgment = await resultJudge.evaluate(predictionText, combinedResults);
+        totalResultJudgeInputTokens += judgment.inputTokens;
+        totalResultJudgeOutputTokens += judgment.outputTokens;
+
+        logWithContext(
+          predictionId,
+          `Post-enhancement judgment: ${judgment.decision} (score: ${judgment.score})`,
+        );
       } else {
         logWithContext(
           predictionId,
@@ -762,6 +763,9 @@
         totalOutputTokens,
         outcome,
         timestamp: new Date().toISOString(),
+        enhancementMethod: enhancementDecision.method,
+        scraperCreditsSpent: enhancementResult?.creditsSpent,
+        navigatorStepsUsed: enhancementResult?.stepsUsed,
       });

       const sources =

From 3ab858504ac569549b9e8710b0fd96c76fc85158 Mon Sep 17 00:00:00 2001
From: theu
Date: Mon, 8 Dec 2025 17:43:09 -0300
Subject: [PATCH 05/10] integrate LLM picker to select promising URLs

---
 src/enhancement/enhancer.ts  | 40 +++++++++++++++++++++++++++++++++---
 src/test/test-enhancement.ts |  9 +++++++-
 src/validator.ts             |  4 ++--
 3 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/src/enhancement/enhancer.ts b/src/enhancement/enhancer.ts
index 1d545fc..89533db 100644
--- a/src/enhancement/enhancer.ts
+++ b/src/enhancement/enhancer.ts
@@ -15,6 +15,8 @@ import { rankByEmbedding } from "../llm/embeddings.js";
 import { createBrowserEnv } from "../browser/playwright/browserEnv.js";
 import { runOccamBrowserLoop } from "../browser/playwright/occamBrowserLoop.js";
 import { type BrowserEnv } from "../browser/playwright/types.js";
+import { gatePicksWithLLM } from "../scraper/scraper.llm-gate.js";
+import type { SerpItem } from "../scraper/scraper.schemas.js";

 /**
  * Check if a search result excerpt suggests blocked/paywalled content.
@@ -163,11 +165,12 @@
 /**
  * Enhance search results using scraper module
  * Fetches full page content with JS rendering, extracts text, ranks by relevance
+ * Uses LLM gate picker to intelligently select best URLs to scrape
  */
 export async function enhanceWithScraper(
   predictionId: string,
   predictionText: string,
-  urls: string[],
+  searchResults: SearchResult[],
   config: EnhancementConfig,
 ): Promise<EnhancementResult | null> {
   const budget = {
     remaining: config.scraper.totalBudgetCredits,
   };
   let totalSpent = 0;

   logWithContext(predictionId, "Starting scraper enhancement...");

+  // Convert SearchResult[] to SerpItem[] format for gatePicksWithLLM
+  const serpItems: SerpItem[] = searchResults.map((r) => ({
+    title: r.title,
+    link: r.url,
+    snippet: r.excerpt,
+    date: r.pub_date,
+    domain: new URL(r.url).hostname,
+  }));
+
+  // Use LLM gate picker to intelligently select URLs
+  logWithContext(
+    predictionId,
+    `Using LLM gate picker to select from ${serpItems.length} results`,
+  );
+
+  let urlsToFetch: string[];
+  try {
+    urlsToFetch = await gatePicksWithLLM(
+      predictionText,
+      serpItems,
+      config.scraper.maxUrls,
+    );
+    logWithContext(
+      predictionId,
+      `LLM gate picker selected ${urlsToFetch.length} URLs: ${urlsToFetch.join(", ")}`,
+    );
+  } catch (e) {
+    const msg = e instanceof Error ?
e.message : String(e); + logWithContext(predictionId, `LLM gate picker failed: ${msg}, falling back to top ${config.scraper.maxUrls} URLs`); + // Fallback to naive top-N if LLM fails + urlsToFetch = searchResults.slice(0, config.scraper.maxUrls).map((r) => r.url); + } + const focus = `${predictionText} โ€” exact numeric/date/time context`; const allRanked: RankedForSource[] = []; - const urlsToFetch = urls.slice(0, config.scraper.maxUrls); - for (const url of urlsToFetch) { if (budget.remaining <= 0) { logWithContext(predictionId, "Scraper budget exhausted"); diff --git a/src/test/test-enhancement.ts b/src/test/test-enhancement.ts index 74f77b1..6ea5347 100644 --- a/src/test/test-enhancement.ts +++ b/src/test/test-enhancement.ts @@ -54,13 +54,20 @@ async function testEnhancement() { urls.forEach((url, i) => console.log(` ${i + 1}. ${url}`)); console.log(); + const searchResults = urls.map((url, i) => ({ + url, + title: `Test result ${i + 1}`, + excerpt: `Test excerpt for ${url}`, + pub_date: null, + })); + console.log("๐Ÿ”ง Starting scraper enhancement...\n"); console.log("=".repeat(60)); const result = await enhanceWithScraper( predictionId, predictionText, - urls, + searchResults, config, ); diff --git a/src/validator.ts b/src/validator.ts index 7bc544d..0acba5c 100644 --- a/src/validator.ts +++ b/src/validator.ts @@ -659,11 +659,11 @@ export class Validator { if (enhancementDecision.shouldEnhance) { if (enhancementDecision.method === "scraper") { - const urls = combinedResults.slice(0, 3).map((r) => r.url); + // Pass search results to enhancer for intelligent URL selection via LLM gate picker enhancementResult = await enhanceWithScraper( predictionId, predictionText, - urls, + combinedResults, enhancementConfig, ); } else if (enhancementDecision.method === "navigator") { From 28772e86a98f5874002454be090c803c5592b82e Mon Sep 17 00:00:00 2001 From: theu Date: Mon, 8 Dec 2025 17:54:36 -0300 Subject: [PATCH 06/10] full pipeline test --- package.json | 1 + src/test/test-validation.ts | 126 ++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 src/test/test-validation.ts diff --git a/package.json b/package.json index 6195eca..23192c3 100644 --- a/package.json +++ b/package.json @@ -13,6 +13,7 @@ "dev": "tsx ./src/index.ts", "test:scraper": "tsx ./src/test/scraper.pipeline.ts", "test:enhancement": "tsx ./src/test/test-enhancement.ts", + "test:validation": "tsx ./src/test/test-validation.ts", "mcp:playwright": "npx @playwright/mcp@latest --config ./playwright-mcp.config.json --headless --port 8931", "format": "prettier --check . --cache", "format-fix": "prettier --write . 
--cache", diff --git a/src/test/test-validation.ts b/src/test/test-validation.ts new file mode 100644 index 0000000..d909bc8 --- /dev/null +++ b/src/test/test-validation.ts @@ -0,0 +1,126 @@ +import "dotenv/config"; +import { randomUUID } from "node:crypto"; +import { Validator } from "../validator.js"; +import { createDb } from "../db/client.js"; +import type { PredictionToValidate } from "../validator.js"; + +/** + * Test the full validation pipeline on a custom prediction text + * Usage: + * npm run test:validation:custom "Bitcoin will reach $50k by end of 2024" + */ + +async function testCustomValidation() { + const predictionText = process.argv.slice(2).join(" "); + + if (!predictionText) { + console.error("โŒ Usage: npm run test:validation:custom \"prediction text\""); + console.error(' Example: npm run test:validation:custom "Bitcoin will reach $50k by end of 2024"'); + process.exit(1); + } + + console.log("=== Full Validation Pipeline Test (Custom) ===\n"); + console.log(`Prediction: "${predictionText}"`); + console.log(); + + const db = createDb(); + const validator = new Validator(db); + + try { + await db.transaction(async (tx) => { + // Create a mock prediction object + const testId = randomUUID(); + const mockPrediction: PredictionToValidate = { + parsedPrediction: { + id: testId, + predictionId: randomUUID(), + goal: [{ start: 0, end: predictionText.length }], + timeframe: null, + topicId: null, + predictionQuality: 80, + llmConfidence: "0.9", + briefRationale: "Test prediction", + vagueness: "0.3", + context: null, + filterAgentId: null, + createdAt: new Date(), + updatedAt: new Date(), + }, + parsedPredictionDetails: { + parsedPredictionId: testId, + predictionContext: predictionText, + timeframeStatus: "valid", + timeframeStartUtc: null, + timeframeEndUtc: new Date(), + timeframePrecision: null, + timeframeReasoning: null, + timeframeAssumptions: null, + timeframeConfidence: null, + filterValidationConfidence: "0.95", + filterValidationReasoning: null, + verdictConfidence: null, + verdictSources: null, + createdAt: new Date(), + updatedAt: new Date(), + }, + scrapedTweet: { + id: BigInt(0), + text: predictionText, + authorId: BigInt(0), + date: new Date(), + conversationId: null, + parentTweetId: null, + predictionId: null, + createdAt: new Date(), + updatedAt: new Date(), + }, + }; + + console.log("๐Ÿ”ง Running full validation pipeline...\n"); + console.log("=".repeat(80)); + console.log(); + + const validationResult = await validator.validatePrediction( + tx, + mockPrediction, + ); + + console.log(); + console.log("=".repeat(80)); + console.log(); + + // Display results + console.log("โœ… Validation Complete!\n"); + console.log(`Outcome: ${validationResult.outcome}`); + console.log(`\nProof (${validationResult.proof.length} chars):`); + console.log("โ”€".repeat(80)); + console.log(validationResult.proof); + console.log("โ”€".repeat(80)); + console.log(`\nSources: ${validationResult.sources.length}`); + if (validationResult.sources.length > 0) { + validationResult.sources.forEach((source, i) => { + console.log(`\n ${i + 1}. ${source.title}`); + console.log(` URL: ${source.url}`); + console.log(` Date: ${source.pub_date || "N/A"}`); + console.log(` Excerpt: "${source.excerpt.slice(0, 150)}${source.excerpt.length > 150 ? "..." 
: ""}"`);
+        });
+      }
+
+      return validationResult;
+    });
+
+    console.log("\n✅ Test completed successfully");
+    process.exit(0);
+  } catch (error) {
+    console.error("\n❌ Fatal error:", error);
+    if (error instanceof Error && error.stack) {
+      console.error(error.stack);
+    }
+    process.exit(1);
+  }
+}
+
+testCustomValidation().catch((error) => {
+  console.error("❌ Fatal error:", error);
+  process.exit(1);
+});

From f0a78b410e01f5979e9d523943b7f0bdf7ffaf02 Mon Sep 17 00:00:00 2001
From: theu
Date: Mon, 15 Dec 2025 18:43:02 -0300
Subject: [PATCH 07/10] add URL deduplication

---
 src/validator.ts | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/src/validator.ts b/src/validator.ts
index 0acba5c..0f801f5 100644
--- a/src/validator.ts
+++ b/src/validator.ts
@@ -521,10 +521,22 @@
     let totalResultJudgeOutputTokens = 0;
     let searchApiCalls = VALIDATION_CONFIG.search.INITIAL_QUERIES;

+    // Track visited URLs for deduplication across iterations
+    const visitedUrls = new Set<string>();
+
     let combinedResults = initialResultSets.flat();
+    // Deduplicate initial results
+    combinedResults = combinedResults.filter((result) => {
+      if (visitedUrls.has(result.url)) {
+        return false;
+      }
+      visitedUrls.add(result.url);
+      return true;
+    });
+
     logWithContext(
       predictionId,
-      `Total results found: ${combinedResults.length}`,
+      `Total results found: ${combinedResults.length} (after deduplication)`,
     );
@@ -628,10 +640,19 @@
       );
       searchApiCalls++;

-      combinedResults = [...combinedResults, ...refinedResults];
+      // Deduplicate refined results against already visited URLs
+      const newResults = refinedResults.filter((result) => {
+        if (visitedUrls.has(result.url)) {
+          return false;
+        }
+        visitedUrls.add(result.url);
+        return true;
+      });
+
+      combinedResults = [...combinedResults, ...newResults];
       logWithContext(
         predictionId,
-        `Additional results: ${refinedResults.length}, Total: ${combinedResults.length}`,
+        `Additional results: ${refinedResults.length} (${newResults.length} new), Total: ${combinedResults.length}`,
       );
@@ -684,12 +705,17 @@
         );
         // Add enhancement sources to results for re-evaluation
         const method = enhancementResult.method;
-        const enhancementSources = enhancementResult.citations.map((c) => ({
-          url: c.url,
-          title: `Enhanced via ${method}`,
-          pub_date: null,
-          excerpt: c.quotes[0] || "",
-        }));
+        const enhancementSources = enhancementResult.citations
+          .filter((c) => !visitedUrls.has(c.url)) // Deduplicate enhancement sources
+          .map((c) => {
+            visitedUrls.add(c.url);
+            return {
+              url: c.url,
+              title: `Enhanced via ${method}`,
+              pub_date: null,
+              excerpt: c.quotes[0] || "",
+            };
+          });
         combinedResults = [...enhancementSources, ...combinedResults];

From 9fc6c13432babf1e97a9f6feeb77184607a05a22 Mon Sep 17 00:00:00 2001
From: theu
Date: Mon, 15 Dec 2025 19:01:37 -0300
Subject: [PATCH 08/10] add query enhancement retries

---
 src/validator.ts | 53 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/src/validator.ts b/src/validator.ts
index 0f801f5..f63f38b 100644
--- a/src/validator.ts
+++ b/src/validator.ts
@@ -69,7 +69,7 @@ const VALIDATION_CONFIG = {
     INITIAL_QUERIES: 2,
     RESULTS_PER_QUERY: 10,
     MAX_TOTAL_RESULTS: 30,
-    
MAX_REFINEMENT_ITERATIONS: 1, + MAX_SERP_ITERATIONS: 3, }, quality: { FILTER_VALIDATION_CONFIDENCE_MIN: 0.85, @@ -594,14 +594,21 @@ export class Validator { `Judgment: ${judgment.decision} (score: ${judgment.score}), Sufficient: ${judgment.sufficient ? "yes" : "no"}`, ); - if ( + // Iterative SERP refinement loop + let serpIteration = 0; + const allQueries: string[] = [...initialQueryResult.queries]; + + while ( !judgment.sufficient && + serpIteration < VALIDATION_CONFIG.search.MAX_SERP_ITERATIONS && combinedResults.length < VALIDATION_CONFIG.search.MAX_TOTAL_RESULTS ) { + serpIteration++; logWithContext( predictionId, - "Step 4: Results insufficient, generating refined query...", + `Step 4.${serpIteration}: Results insufficient, generating refined query (iteration ${serpIteration}/${VALIDATION_CONFIG.search.MAX_SERP_ITERATIONS})...`, ); + if (judgment.nextQuerySuggestion) { logWithContext( predictionId, @@ -609,18 +616,19 @@ export class Validator { ); } - const pastAttempts: PastAttempt[] = initialQueryResult.queries.map( - (q) => { - const attempt: PastAttempt = { - query: q, - success: false, - }; - if (judgment.nextQuerySuggestion) { - attempt.reasoning = judgment.nextQuerySuggestion; - } - return attempt; - }, - ); + // Build past attempts from all queries so far + // Only attach the suggestion reasoning to the most recent query + const pastAttempts: PastAttempt[] = allQueries.map((q, idx) => { + const attempt: PastAttempt = { + query: q, + success: false, + }; + // Only add reasoning to the last query (current judgment feedback) + if (judgment.nextQuerySuggestion && idx === allQueries.length - 1) { + attempt.reasoning = judgment.nextQuerySuggestion; + } + return attempt; + }); const refinedQueryResult = await queryEnhancer.enhanceWithTokens( predictionText, @@ -629,6 +637,8 @@ export class Validator { totalQueryEnhancerInputTokens += refinedQueryResult.inputTokens; totalQueryEnhancerOutputTokens += refinedQueryResult.outputTokens; + allQueries.push(refinedQueryResult.query); + logWithContext( predictionId, `Refined query: "${refinedQueryResult.query}"`, @@ -652,16 +662,25 @@ export class Validator { combinedResults = [...combinedResults, ...newResults]; logWithContext( predictionId, - `Additional results: ${refinedResults.length} (${newResults.length} new), Total: ${combinedResults.length}`, + `Iteration ${serpIteration}: ${refinedResults.length} results (${newResults.length} new), Total: ${combinedResults.length}`, ); + // Re-evaluate with new results judgment = await resultJudge.evaluate(predictionText, combinedResults); totalResultJudgeInputTokens += judgment.inputTokens; totalResultJudgeOutputTokens += judgment.outputTokens; logWithContext( predictionId, - `Final judgment: ${judgment.decision} (score: ${judgment.score})`, + `Iteration ${serpIteration} judgment: ${judgment.decision} (score: ${judgment.score}), Sufficient: ${judgment.sufficient ? "yes" : "no"}`, + ); + } + + // Log final iteration summary + if (serpIteration > 0) { + logWithContext( + predictionId, + `Completed ${serpIteration} SERP iteration(s). 
Final: ${judgment.decision} (score: ${judgment.score})`,
+      );
+    }

     let outcome: ValidationResult["outcome"];

     if (judgment.decision === "TRUE") {

From 77c01effef942153cdd581ce6cb555af8a5ca343 Mon Sep 17 00:00:00 2001
From: theu
Date: Tue, 16 Dec 2025 21:14:21 -0300
Subject: [PATCH 09/10] add fixed e2e test

---
 .gitignore                               |   2 +
 package.json                             |   1 +
 src/test/e2e-benchmark.test.ts           | 321 +++++++++++++++++++++++
 test-fixtures/README.md                  | 122 +++++++++
 test-fixtures/benchmark-predictions.json |  56 ++++
 5 files changed, 502 insertions(+)
 create mode 100644 src/test/e2e-benchmark.test.ts
 create mode 100644 test-fixtures/README.md
 create mode 100644 test-fixtures/benchmark-predictions.json

diff --git a/.gitignore b/.gitignore
index c753693..4163853 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,5 @@ drizzle.config.ts

 reference/
 AgentOccam/
+
+test-results/
diff --git a/package.json b/package.json
index 23192c3..4045f0b 100644
--- a/package.json
+++ b/package.json
@@ -14,6 +14,7 @@
   "test:scraper": "tsx ./src/test/scraper.pipeline.ts",
   "test:enhancement": "tsx ./src/test/test-enhancement.ts",
   "test:validation": "tsx ./src/test/test-validation.ts",
+  "test:benchmark": "vitest run src/test/e2e-benchmark.test.ts",
   "mcp:playwright": "npx @playwright/mcp@latest --config ./playwright-mcp.config.json --headless --port 8931",
   "format": "prettier --check . --cache",
   "format-fix": "prettier --write . --cache",
diff --git a/src/test/e2e-benchmark.test.ts b/src/test/e2e-benchmark.test.ts
new file mode 100644
index 0000000..555d7ac
--- /dev/null
+++ b/src/test/e2e-benchmark.test.ts
@@ -0,0 +1,321 @@
+import "dotenv/config";
+import { describe, it, expect } from "vitest";
+import { readFile, writeFile, mkdir } from "node:fs/promises";
+import { createDb } from "../db/client.js";
+import { Validator } from "../validator.js";
+import {
+  parsedPrediction,
+  parsedPredictionDetails,
+  scrapedTweet,
+} from "../db/schema.js";
+import { eq } from "drizzle-orm";
+import type { PredictionToValidate } from "../validator.js";
+
+interface BenchmarkFixture {
+  description: string;
+  predictions: Array<{
+    id: string;
+    description: string;
+    expectedOutcome: string | null;
+  }>;
+}
+
+interface BenchmarkResult {
+  predictionId: string;
+  description: string;
+  outcome: string;
+  proofLength: number;
+  sourcesCount: number;
+  costs: {
+    searchApiCalls: number;
+    queryEnhancerInputTokens: number;
+    queryEnhancerOutputTokens: number;
+    resultJudgeInputTokens: number;
+    resultJudgeOutputTokens: number;
+    totalInputTokens: number;
+    totalOutputTokens: number;
+    scraperCreditsSpent?: number;
+    navigatorStepsUsed?: number;
+    enhancementMethod?: string;
+  };
+  durationMs: number;
+  timestamp: string;
+}
+
+interface BenchmarkRun {
+  runId: string;
+  timestamp: string;
+  predictions: BenchmarkResult[];
+  totals: {
+    totalSearchApiCalls: number;
+    totalInputTokens: number;
+    totalOutputTokens: number;
+    totalScraperCredits: number;
+    totalNavigatorSteps: number;
+    totalDurationMs: number;
+    averageDurationMs: number;
+  };
+}
+
+/**
+ * Load benchmark fixture
+ */
+async function loadFixture(): Promise<BenchmarkFixture> {
+  try {
+    const content = await readFile(
+      "test-fixtures/benchmark-predictions.json",
+      "utf-8",
+    );
+    return JSON.parse(content);
+  } catch (error) {
+    throw new Error(
+      'Failed to load test-fixtures/benchmark-predictions.json. Run "tsx src/test/fetch-benchmark-predictions.ts" first.',
+    );
+  }
+}
+
+/**
+ * Fetch prediction from database
+ */
+async function fetchPrediction(
+  db: ReturnType<typeof createDb>,
+  predictionId: string,
+): Promise<PredictionToValidate | null> {
+  const results = await db
+    .select({
+      parsedPrediction: parsedPrediction,
+      parsedPredictionDetails: parsedPredictionDetails,
+      scrapedTweet: scrapedTweet,
+    })
+    .from(parsedPrediction)
+    .innerJoin(
+      parsedPredictionDetails,
+      eq(parsedPrediction.id, parsedPredictionDetails.parsedPredictionId),
+    )
+    .innerJoin(
+      scrapedTweet,
+      eq(parsedPrediction.predictionId, scrapedTweet.predictionId),
+    )
+    .where(eq(parsedPrediction.id, predictionId))
+    .limit(1);
+
+  return results[0] || null;
+}
+
+/**
+ * Extract cost metrics from cost log file
+ */
+async function extractCostMetrics(
+  predictionId: string,
+): Promise<BenchmarkResult["costs"] | null> {
+  try {
+    const logContent = await readFile("costs.json", "utf-8");
+
+    if (!logContent || logContent.trim().length === 0) {
+      console.warn(`  ⚠️ costs.json is empty for ${predictionId}`);
+      return null;
+    }
+
+    const lines = logContent.trim().split("\n");
+
+    // Find the most recent entry for this prediction
+    for (let i = lines.length - 1; i >= 0; i--) {
+      const line = lines[i];
+      if (!line || line.trim().length === 0) continue;
+
+      try {
+        const entry = JSON.parse(line);
+        if (entry.prediction_id === predictionId) {
+          return {
+            searchApiCalls: entry.searchApiCalls || 0,
+            queryEnhancerInputTokens: entry.queryEnhancerInputTokens || 0,
+            queryEnhancerOutputTokens: entry.queryEnhancerOutputTokens || 0,
+            resultJudgeInputTokens: entry.resultJudgeInputTokens || 0,
+            resultJudgeOutputTokens: entry.resultJudgeOutputTokens || 0,
+            totalInputTokens: entry.totalInputTokens || 0,
+            totalOutputTokens: entry.totalOutputTokens || 0,
+            scraperCreditsSpent: entry.scraperCreditsSpent,
+            navigatorStepsUsed: entry.navigatorStepsUsed,
+            enhancementMethod: entry.enhancementMethod,
+          };
+        }
+      } catch (parseError) {
+        console.warn(`  ⚠️ Failed to parse cost log line: ${line.slice(0, 50)}...`);
+        continue;
+      }
+    }
+
+    console.warn(`  ⚠️ No cost entry found for prediction ${predictionId} in costs.json`);
+    return null;
+  } catch (error) {
+    const errorMsg = error instanceof Error ? error.message : String(error);
+    console.warn(`  ⚠️ Could not read costs.json: ${errorMsg}`);
+    if ((error as NodeJS.ErrnoException)?.code === 'ENOENT') {
+      console.warn(`  💡 File costs.json does not exist yet. It will be created on first validation.`);
+    }
+  }
+
+  return null;
+}
+
+/**
+ * Save benchmark results to file
+ */
+async function saveBenchmarkResults(results: BenchmarkRun): Promise<void> {
+  await mkdir("test-results", { recursive: true });
+
+  const filename = `test-results/benchmark-${results.runId}.json`;
+  await writeFile(filename, JSON.stringify(results, null, 2));
+
+  console.log(`\n📊 Benchmark results saved to ${filename}`);
+}
+
+describe("E2E Validation Benchmark", () => {
+  it("should validate fixed predictions and track costs", async () => {
+    const fixture = await loadFixture();
+
+    // Validate fixture is populated
+    const firstId = fixture.predictions[0]?.id;
+    expect(firstId).toBeDefined();
+    expect(firstId).not.toMatch(/REPLACE_WITH/);
+
+    const db = createDb();
+    const validator = new Validator(db);
+
+    const runId = new Date().toISOString().replace(/[:.]/g, "-");
+    const benchmarkResults: BenchmarkResult[] = [];
+
+    console.log(`\n🧪 Running benchmark on ${fixture.predictions.length} predictions...\n`);
+
+    for (const testCase of fixture.predictions) {
+      console.log(`Testing: ${testCase.id}`);
+      console.log(`  Description: ${testCase.description}\n`);
+
+      const startTime = Date.now();
+
+      const result = await db.transaction(async (tx) => {
+        const prediction = await fetchPrediction(db, testCase.id);
+
+        if (!prediction) {
+          throw new Error(`Prediction ${testCase.id} not found in database`);
+        }
+
+        return await validator.validatePrediction(tx, prediction);
+      });
+
+      const durationMs = Date.now() - startTime;
+
+      // Wait a bit for cost log to be written to disk
+      await new Promise((resolve) => setTimeout(resolve, 100));
+
+      // Assert on result shape
+      expect(result).toBeDefined();
+      expect(result).toHaveProperty("outcome");
+      expect(result).toHaveProperty("proof");
+      expect(result).toHaveProperty("sources");
+      expect(result.sources).toBeInstanceOf(Array);
+      expect(result.prediction_id).toBe(testCase.id);
+
+      // If expected outcome is specified, assert on it
+      if (testCase.expectedOutcome) {
+        expect(result.outcome).toBe(testCase.expectedOutcome);
+      }
+
+      // Extract cost metrics from cost log
+      const costs = await extractCostMetrics(testCase.id);
+
+      if (!costs) {
+        console.warn(`  ⚠️ Could not extract cost metrics for ${testCase.id}`);
+      }
+
+      const benchmarkResult: BenchmarkResult = {
+        predictionId: testCase.id,
+        description: testCase.description,
+        outcome: result.outcome,
+        proofLength: result.proof.length,
+        sourcesCount: result.sources.length,
+        costs: costs || {
+          searchApiCalls: 0,
+          queryEnhancerInputTokens: 0,
+          queryEnhancerOutputTokens: 0,
+          resultJudgeInputTokens: 0,
+          resultJudgeOutputTokens: 0,
+          totalInputTokens: 0,
+          totalOutputTokens: 0,
+        },
+        durationMs,
+        timestamp: new Date().toISOString(),
+      };
+
+      benchmarkResults.push(benchmarkResult);
+
+      console.log(`  ✅ Outcome: ${result.outcome}`);
+      console.log(`  📝 Proof: ${result.proof.length} chars`);
+      console.log(`  🔗 Sources: ${result.sources.length}`);
+      if (costs) {
+        console.log(`  💰 Costs:`);
+        console.log(`     - Search API calls: ${costs.searchApiCalls}`);
+        console.log(`     - Total tokens: ${costs.totalInputTokens + costs.totalOutputTokens}`);
+        if (costs.scraperCreditsSpent) {
+          console.log(`     - Scraper credits: ${costs.scraperCreditsSpent}`);
+        }
+        if (costs.navigatorStepsUsed) {
+          console.log(`     - Navigator steps: ${costs.navigatorStepsUsed}`);
+        }
+      }
+      console.log(`  ⏱️ Duration: ${durationMs}ms\n`);
+    }
+
+    // Calculate totals
+    const totals = {
+      totalSearchApiCalls:
benchmarkResults.reduce( + (sum, r) => sum + r.costs.searchApiCalls, + 0, + ), + totalInputTokens: benchmarkResults.reduce( + (sum, r) => sum + r.costs.totalInputTokens, + 0, + ), + totalOutputTokens: benchmarkResults.reduce( + (sum, r) => sum + r.costs.totalOutputTokens, + 0, + ), + totalScraperCredits: benchmarkResults.reduce( + (sum, r) => sum + (r.costs.scraperCreditsSpent || 0), + 0, + ), + totalNavigatorSteps: benchmarkResults.reduce( + (sum, r) => sum + (r.costs.navigatorStepsUsed || 0), + 0, + ), + totalDurationMs: benchmarkResults.reduce((sum, r) => sum + r.durationMs, 0), + averageDurationMs: Math.round( + benchmarkResults.reduce((sum, r) => sum + r.durationMs, 0) / + benchmarkResults.length, + ), + }; + + const run: BenchmarkRun = { + runId, + timestamp: new Date().toISOString(), + predictions: benchmarkResults, + totals, + }; + + await saveBenchmarkResults(run); + + console.log("๐Ÿ“Š Benchmark Summary:"); + console.log(` Total search API calls: ${totals.totalSearchApiCalls}`); + console.log(` Total input tokens: ${totals.totalInputTokens}`); + console.log(` Total output tokens: ${totals.totalOutputTokens}`); + console.log(` Total tokens: ${totals.totalInputTokens + totals.totalOutputTokens}`); + if (totals.totalScraperCredits > 0) { + console.log(` Total scraper credits: ${totals.totalScraperCredits}`); + } + if (totals.totalNavigatorSteps > 0) { + console.log(` Total navigator steps: ${totals.totalNavigatorSteps}`); + } + console.log(` Total duration: ${totals.totalDurationMs}ms`); + console.log(` Average duration: ${totals.averageDurationMs}ms`); + }, 600000); // 10 minute timeout for the full benchmark +}); diff --git a/test-fixtures/README.md b/test-fixtures/README.md new file mode 100644 index 0000000..fce1ea0 --- /dev/null +++ b/test-fixtures/README.md @@ -0,0 +1,122 @@ +# E2E Benchmark Testing + +This directory contains the test fixtures for end-to-end validation benchmarking. + +## Quick Start + +### 1. Fetch Benchmark Predictions from Database + +```bash +npm run test:benchmark:fetch [count] +``` + +This will: +- Query your database for `[count]` matured predictions (default: 5) +- Save them to `test-fixtures/benchmark-predictions.json` +- Display the prediction IDs and context previews + +Example: +```bash +npm run test:benchmark:fetch 3 +``` + +### 2. Run the Benchmark + +```bash +npm run test:benchmark +``` + +This will: +- Load predictions from `test-fixtures/benchmark-predictions.json` +- Run full validation pipeline on each prediction +- Track costs (API calls, tokens, scraper credits, etc.) +- Save results to `test-results/benchmark-{timestamp}.json` +- Run automated assertions on result shape + +## What Gets Tracked + +For each prediction, the benchmark tracks: + +**Validation Results:** +- Outcome (MaturedTrue, MaturedFalse, etc.) 
+- Proof length
+- Number of sources
+- Duration (ms)
+
+**Cost Metrics:**
+- Search API calls
+- Query enhancer tokens (input/output)
+- Result judge tokens (input/output)
+- Total tokens
+- Scraper credits spent (if enhancement used)
+- Navigator steps used (if enhancement used)
+- Enhancement method (scraper/navigator/none)
+
+**Aggregate Totals:**
+- Total costs across all predictions
+- Average duration per prediction
+
+## Comparing Runs
+
+Benchmark results are saved with timestamps in `test-results/`:
+
+```
+test-results/
+  benchmark-2025-01-15T10-30-00-000Z.json
+  benchmark-2025-01-15T14-45-00-000Z.json
+  benchmark-2025-01-16T09-00-00-000Z.json
+```
+
+You can compare files to detect:
+- Cost regressions (increased token usage)
+- Performance regressions (slower validation)
+- Behavior changes (different outcomes)
+
+## Expected Outcomes (Optional)
+
+You can add expected outcomes to `benchmark-predictions.json` for stricter assertions:
+
+```json
+{
+  "predictions": [
+    {
+      "id": "abc123",
+      "description": "Bitcoin prediction",
+      "expectedOutcome": "MaturedTrue" // ← Add this
+    }
+  ]
+}
+```
+
+If specified, the test will assert that the actual outcome matches the expected outcome.
+
+## Example Output
+
+```
+🧪 Running benchmark on 3 predictions...
+
+Testing: abc123
+  Description: Bitcoin will reach $50k by...
+
+  ✅ Outcome: MaturedTrue
+  📝 Proof: 342 chars
+  🔗 Sources: 2
+  💰 Costs:
+    - Search API calls: 3
+    - Total tokens: 4523
+    - Scraper credits: 25
+  ⏱️ Duration: 8234ms
+
+...
+
+📊 Benchmark Summary:
+  Total search API calls: 9
+  Total input tokens: 8521
+  Total output tokens: 4832
+  Total tokens: 13353
+  Total scraper credits: 50
+  Total duration: 24701ms
+  Average duration: 8234ms
+
+📊 Benchmark results saved to test-results/benchmark-2025-01-15T10-30-00-000Z.json
+```
diff --git a/test-fixtures/benchmark-predictions.json b/test-fixtures/benchmark-predictions.json
new file mode 100644
index 0000000..43d7c44
--- /dev/null
+++ b/test-fixtures/benchmark-predictions.json
@@ -0,0 +1,56 @@
+{
+  "description": "Fixed set of predictions for E2E benchmarking",
+  "lastUpdated": "2025-12-16T20:29:18.748Z",
+  "predictions": [
+    {
+      "id": "019a40b6-cbc6-721b-bd7d-05302e27a322",
+      "description": "The author is discussing a shift in business focus back to 'run the business' and states that this c",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a40b6-d1f8-7c49-b490-db1a1eb6a608",
+      "description": "The author predicts that other major industries will follow the tech industry in moving away from 'w",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a44e7-deb9-7bd6-966b-88e5abbae23e",
+      "description": "The first tweet shows a picture of a brickyard and states 'it is not realty —yet.' The second tweet,",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a44e8-0b5e-70b1-b960-ebe8f53ca79d",
+      "description": "The first tweet states 'The woke mind virus must disappear.' The second tweet, in response, simply s",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a44e8-0b5e-7142-a4a4-bdbc12b24ede",
+      "description": "The author is drawing a parallel between the rapid adoption of cars in the early 20th century and th",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a45b5-a5c0-76ce-8fe0-a64380233a75",
+      "description": "The author is discussing the need for other entities to develop Mars infrastructure that aligns with",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a45b5-a5c0-7787-8945-53c2a21971f4",
+      "description": "The author states 'All right, time to build that AGI cluster.' This implies an intention or a plan t",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a44e8-0b5e-70eb-ab09-0fe75ebe67f2",
+      "description": "The author is predicting that Grok 3 will represent a significant advancement.",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a58fd-1cdd-7ce7-b47c-f484af4bc226",
+      "description": "No context",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a6f43-ac5b-7e2c-b485-4e5dcea33727",
+      "description": "No context",
+      "expectedOutcome": null
+    }
+  ]
+}
\ No newline at end of file

From 271dd075aa2a5243bead56fa8828d5126e001586 Mon Sep 17 00:00:00 2001
From: theu
Date: Wed, 17 Dec 2025 19:44:18 -0300
Subject: [PATCH 10/10] delete outdated readme

---
 test-fixtures/README.md | 122 ----------------------------------------
 1 file changed, 122 deletions(-)
 delete mode 100644 test-fixtures/README.md

diff --git a/test-fixtures/README.md b/test-fixtures/README.md
deleted file mode 100644
index fce1ea0..0000000
--- a/test-fixtures/README.md
+++ /dev/null
@@ -1,122 +0,0 @@
-# E2E Benchmark Testing
-
-This directory contains the test fixtures for end-to-end validation benchmarking.
-
-## Quick Start
-
-### 1. Fetch Benchmark Predictions from Database
-
-```bash
-npm run test:benchmark:fetch [count]
-```
-
-This will:
-- Query your database for `[count]` matured predictions (default: 5)
-- Save them to `test-fixtures/benchmark-predictions.json`
-- Display the prediction IDs and context previews
-
-Example:
-```bash
-npm run test:benchmark:fetch 3
-```
-
-### 2. Run the Benchmark
-
-```bash
-npm run test:benchmark
-```
-
-This will:
-- Load predictions from `test-fixtures/benchmark-predictions.json`
-- Run the full validation pipeline on each prediction
-- Track costs (API calls, tokens, scraper credits, etc.)
-- Save results to `test-results/benchmark-{timestamp}.json`
-- Run automated assertions on result shape
-
-## What Gets Tracked
-
-For each prediction, the benchmark tracks:
-
-**Validation Results:**
-- Outcome (MaturedTrue, MaturedFalse, etc.)
-- Proof length
-- Number of sources
-- Duration (ms)
-
-**Cost Metrics:**
-- Search API calls
-- Query enhancer tokens (input/output)
-- Result judge tokens (input/output)
-- Total tokens
-- Scraper credits spent (if enhancement used)
-- Navigator steps used (if enhancement used)
-- Enhancement method (scraper/navigator/none)
-
-**Aggregate Totals:**
-- Total costs across all predictions
-- Average duration per prediction
-
-## Comparing Runs
-
-Benchmark results are saved with timestamps in `test-results/`:
-
-```
-test-results/
-  benchmark-2025-01-15T10-30-00-000Z.json
-  benchmark-2025-01-15T14-45-00-000Z.json
-  benchmark-2025-01-16T09-00-00-000Z.json
-```
-
-You can compare files to detect:
-- Cost regressions (increased token usage)
-- Performance regressions (slower validation)
-- Behavior changes (different outcomes)
-
-## Expected Outcomes (Optional)
-
-You can add expected outcomes to `benchmark-predictions.json` for stricter assertions:
-
-```json
-{
-  "predictions": [
-    {
-      "id": "abc123",
-      "description": "Bitcoin prediction",
-      "expectedOutcome": "MaturedTrue" // ← Add this
-    }
-  ]
-}
-```
-
-If specified, the test will assert that the actual outcome matches the expected outcome.
-
-## Example Output
-
-```
-🧪 Running benchmark on 3 predictions...
-
-Testing: abc123
-  Description: Bitcoin will reach $50k by...
-
-  ✅ Outcome: MaturedTrue
-  📝 Proof: 342 chars
-  🔗 Sources: 2
-  💰 Costs:
-    - Search API calls: 3
-    - Total tokens: 4523
-    - Scraper credits: 25
-  ⏱️ Duration: 8234ms
-
-...
-
-📊 Benchmark Summary:
-  Total search API calls: 9
-  Total input tokens: 8521
-  Total output tokens: 4832
-  Total tokens: 13353
-  Total scraper credits: 50
-  Total duration: 24701ms
-  Average duration: 8234ms
-
-📊 Benchmark results saved to test-results/benchmark-2025-01-15T10-30-00-000Z.json
-```
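
The "Comparing Runs" section of the README (added earlier in the series and removed again in PATCH 10/10) leaves the comparison of saved result files manual. A small script along the following lines could automate it. This is a minimal sketch, not part of the patch series: `compare-benchmarks.ts` is a hypothetical file name, the two input paths are placeholders for real runs, and the `Totals` interface simply mirrors the `totals` object assembled in the benchmark test above.

```ts
// compare-benchmarks.ts: hypothetical helper, not part of this patch series.
// Assumes two result files produced by the benchmark test; the paths below
// are placeholders to be replaced with real files from test-results/.
import { readFile } from "node:fs/promises";

// Mirrors the `totals` object built in the benchmark test.
interface Totals {
  totalSearchApiCalls: number;
  totalInputTokens: number;
  totalOutputTokens: number;
  totalScraperCredits: number;
  totalNavigatorSteps: number;
  totalDurationMs: number;
  averageDurationMs: number;
}

// Read one benchmark result file and return its aggregate totals.
async function loadTotals(path: string): Promise<Totals> {
  const run = JSON.parse(await readFile(path, "utf8"));
  return run.totals as Totals;
}

async function main(): Promise<void> {
  // Placeholder paths: substitute two real files from test-results/.
  const before = await loadTotals("test-results/benchmark-OLDER-RUN.json");
  const after = await loadTotals("test-results/benchmark-NEWER-RUN.json");

  // Print absolute values and relative change for every tracked total,
  // so cost and performance regressions stand out at a glance.
  for (const key of Object.keys(before) as (keyof Totals)[]) {
    const delta = after[key] - before[key];
    const pct = before[key] === 0 ? 0 : (delta / before[key]) * 100;
    const sign = delta >= 0 ? "+" : "";
    console.log(`${key}: ${before[key]} -> ${after[key]} (${sign}${pct.toFixed(1)}%)`);
  }
}

main().catch((error) => {
  console.error(error);
  process.exit(1);
});
```

After substituting two real run files, this could be run with `tsx compare-benchmarks.ts`, the same runner the package scripts already use.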