From 5904347d33f5466e3c5a8be770875c5b6879eac9 Mon Sep 17 00:00:00 2001 From: theu Date: Thu, 4 Dec 2025 22:11:09 -0300 Subject: [PATCH 01/10] remove openai key from env, fix packages --- package-lock.json | 91 +++++++++++++++++++++++++++-------------------- package.json | 1 + src/env.ts | 1 - 3 files changed, 54 insertions(+), 39 deletions(-) diff --git a/package-lock.json b/package-lock.json index 82fb721..bc3217e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1242,9 +1242,9 @@ } }, "node_modules/@modelcontextprotocol/sdk": { - "version": "1.21.1", - "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.21.1.tgz", - "integrity": "sha512-UyLFcJLDvUuZbGnaQqXFT32CpPpGj7VS19roLut6gkQVhb439xUzYWbsUvdI3ZPL+2hnFosuugtYWE0Mcs1rmQ==", + "version": "1.24.1", + "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.24.1.tgz", + "integrity": "sha512-YTg4v6bKSst8EJM8NXHC3nGm8kgHD08IbIBbognUeLAgGLVgLpYrgQswzLQd4OyTL4l614ejhqsDrV1//t02Qw==", "license": "MIT", "dependencies": { "ajv": "^8.17.1", @@ -1256,20 +1256,25 @@ "eventsource-parser": "^3.0.0", "express": "^5.0.1", "express-rate-limit": "^7.5.0", + "jose": "^6.1.1", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", - "zod": "^3.23.8", - "zod-to-json-schema": "^3.24.1" + "zod": "^3.25 || ^4.0", + "zod-to-json-schema": "^3.25.0" }, "engines": { "node": ">=18" }, "peerDependencies": { - "@cfworker/json-schema": "^4.1.1" + "@cfworker/json-schema": "^4.1.1", + "zod": "^3.25 || ^4.0" }, "peerDependenciesMeta": { "@cfworker/json-schema": { "optional": true + }, + "zod": { + "optional": false } } }, @@ -1303,23 +1308,27 @@ } }, "node_modules/@modelcontextprotocol/sdk/node_modules/body-parser": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.0.tgz", - "integrity": "sha512-02qvAaxv8tp7fBa/mw1ga98OGm+eCbqzJOKoRt70sLmfEEi+jyBYVTDGfCL/k06/4EMk/z01gCe7HoCH/f2LTg==", + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.1.tgz", + "integrity": "sha512-nfDwkulwiZYQIGwxdy0RUmowMhKcFVcYXUU7m4QlKYim1rUtg83xm2yjZ40QjDuc291AJjjeSc9b++AWHSgSHw==", "license": "MIT", "dependencies": { "bytes": "^3.1.2", "content-type": "^1.0.5", - "debug": "^4.4.0", + "debug": "^4.4.3", "http-errors": "^2.0.0", - "iconv-lite": "^0.6.3", + "iconv-lite": "^0.7.0", "on-finished": "^2.4.1", "qs": "^6.14.0", - "raw-body": "^3.0.0", - "type-is": "^2.0.0" + "raw-body": "^3.0.1", + "type-is": "^2.0.1" }, "engines": { "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/@modelcontextprotocol/sdk/node_modules/content-disposition": { @@ -1412,15 +1421,19 @@ } }, "node_modules/@modelcontextprotocol/sdk/node_modules/iconv-lite": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", - "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "version": "0.7.0", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.0.tgz", + "integrity": "sha512-cf6L2Ds3h57VVmkZe+Pn+5APsT7FpqJtEhhieDCvrE2MK5Qk9MyffgQyuxQTm6BChfeZNtcOLHp9IcWRVcIcBQ==", "license": "MIT", "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" }, "engines": { "node": ">=0.10.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, "node_modules/@modelcontextprotocol/sdk/node_modules/json-schema-traverse": { @@ -1510,22 +1523,6 @@ "node": ">= 0.10" } }, - 
"node_modules/@modelcontextprotocol/sdk/node_modules/raw-body/node_modules/iconv-lite": { - "version": "0.7.0", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.0.tgz", - "integrity": "sha512-cf6L2Ds3h57VVmkZe+Pn+5APsT7FpqJtEhhieDCvrE2MK5Qk9MyffgQyuxQTm6BChfeZNtcOLHp9IcWRVcIcBQ==", - "license": "MIT", - "dependencies": { - "safer-buffer": ">= 2.1.2 < 3.0.0" - }, - "engines": { - "node": ">=0.10.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/express" - } - }, "node_modules/@modelcontextprotocol/sdk/node_modules/send": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/send/-/send-1.2.0.tgz", @@ -3236,9 +3233,9 @@ } }, "node_modules/drizzle-kit": { - "version": "0.31.6", - "resolved": "https://registry.npmjs.org/drizzle-kit/-/drizzle-kit-0.31.6.tgz", - "integrity": "sha512-/B4e/4pwnx25QwD5xXgdpo1S+077a2VZdosXbItE/oNmUgQwZydGDz9qJYmnQl/b+5IX0rLfwRhrPnroGtrg8Q==", + "version": "0.31.7", + "resolved": "https://registry.npmjs.org/drizzle-kit/-/drizzle-kit-0.31.7.tgz", + "integrity": "sha512-hOzRGSdyKIU4FcTSFYGKdXEjFsncVwHZ43gY3WU5Bz9j5Iadp6Rh6hxLSQ1IWXpKLBKt/d5y1cpSPcV+FcoQ1A==", "dev": true, "license": "MIT", "dependencies": { @@ -4438,6 +4435,15 @@ "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==", "license": "ISC" }, + "node_modules/jose": { + "version": "6.1.3", + "resolved": "https://registry.npmjs.org/jose/-/jose-6.1.3.tgz", + "integrity": "sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/panva" + } + }, "node_modules/js-tiktoken": { "version": "1.0.21", "resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.21.tgz", @@ -4449,9 +4455,9 @@ } }, "node_modules/js-yaml": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", - "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", + "version": "4.1.1", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz", + "integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==", "dev": true, "license": "MIT", "dependencies": { @@ -6801,6 +6807,15 @@ "funding": { "url": "https://github.com/sponsors/colinhacks" } + }, + "node_modules/zod-to-json-schema": { + "version": "3.25.0", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.25.0.tgz", + "integrity": "sha512-HvWtU2UG41LALjajJrML6uQejQhNJx+JBO9IflpSja4R03iNWfKXrj6W2h7ljuLyc1nKS+9yDyL/9tD1U/yBnQ==", + "license": "ISC", + "peerDependencies": { + "zod": "^3.25 || ^4" + } } } } diff --git a/package.json b/package.json index a84d5fa..6195eca 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,7 @@ "clean": "rm -rf .turbo node_modules dist", "dev": "tsx ./src/index.ts", "test:scraper": "tsx ./src/test/scraper.pipeline.ts", + "test:enhancement": "tsx ./src/test/test-enhancement.ts", "mcp:playwright": "npx @playwright/mcp@latest --config ./playwright-mcp.config.json --headless --port 8931", "format": "prettier --check . --cache", "format-fix": "prettier --write . 
--cache", diff --git a/src/env.ts b/src/env.ts index 8997d60..3472d41 100644 --- a/src/env.ts +++ b/src/env.ts @@ -11,7 +11,6 @@ export const env = createEnv({ SEARCHAPI_API_KEY: z.string(), OPENROUTER_API_KEY: z.string(), SCRAPER_API: z.string().optional(), - OPENAI_API_KEY: z.string(), SCRAPERAPI_KEY: z.string().optional(), SCRAPER_BUDGET_CREDITS: z.coerce.number().positive().optional(), SCRAPERAPI_PROXY_SERVER: z.string().optional(), From 56b6665769ca823e0310e45f38fd2c948121e984 Mon Sep 17 00:00:00 2001 From: theu Date: Fri, 5 Dec 2025 14:46:34 -0300 Subject: [PATCH 02/10] add scraper and navigation enhancements --- src/enhancement/enhancer.ts | 340 ++++++++++++++++++++++++++++++++++++ src/validator.ts | 127 +++++++++++++- 2 files changed, 465 insertions(+), 2 deletions(-) create mode 100644 src/enhancement/enhancer.ts diff --git a/src/enhancement/enhancer.ts b/src/enhancement/enhancer.ts new file mode 100644 index 0000000..5ce5c75 --- /dev/null +++ b/src/enhancement/enhancer.ts @@ -0,0 +1,340 @@ +import { logWithContext } from "../logger.js"; +import type { SearchResult } from "../search/searchapi.js"; +import type { Judgment } from "../llm/result-judge.js"; +import { + fetchWithEscalation, + looksBlocked, +} from "../scraper/scraper.fetch.js"; +import { + stripHtml, + chunkText, + findDataDownloadLinks, +} from "../scraper/scraper.text.js"; +import { + summarizeRankedChunks, + type RankedForSource, +} from "../scraper/scraper.summarize.js"; +import { rankByEmbedding } from "../llm/embeddings.js"; +import { createBrowserEnv } from "../browser/playwright/browserEnv.js"; +import { runOccamBrowserLoop } from "../browser/playwright/occamBrowserLoop.js"; +import { type BrowserEnv } from "../browser/playwright/types.js"; + +export interface EnhancementDecision { + shouldEnhance: boolean; + method: "none" | "scraper" | "navigator"; + reason: string; +} + +export interface EnhancementResult { + answer: string; + citations: Array<{ url: string; quotes: string[] }>; + status: "answered" | "insufficient" | "ambiguous"; + method: "scraper" | "navigator"; + creditsSpent?: number; + stepsUsed?: number; +} + +export interface EnhancementConfig { + scraper: { + totalBudgetCredits: number; + perUrlBudgetCredits: number; + maxUrls: number; + }; + navigator: { + maxSteps: number; + timeoutMs: number; + }; +} + +/** + * Decide if and how to enhance search results using scraper or navigator + * Uses heuristics to minimize costs while maximizing evidence quality + */ +export function decideEnhancement( + predictionText: string, + searchResults: SearchResult[], + judgment: Judgment, +): EnhancementDecision { + // Rule 1: If we already have strong evidence, no enhancement needed + if (judgment.sufficient && judgment.score >= 8) { + return { + shouldEnhance: false, + method: "none", + reason: "Strong evidence already found in search results", + }; + } + + // Rule 2: If no results at all, try navigation (might need form interaction) + if (searchResults.length === 0) { + return { + shouldEnhance: true, + method: "navigator", + reason: "No search results - prediction may require interactive exploration", + }; + } + + // Rule 3: Check if results are paywalled/blocked + const blockedCount = searchResults.filter( + (r) => + r.excerpt.toLowerCase().includes("subscribe") || + r.excerpt.toLowerCase().includes("sign in to read") || + r.excerpt.toLowerCase().includes("login to continue") || + r.excerpt.toLowerCase().includes("create account"), + ).length; + + if (blockedCount > searchResults.length / 2) { + return { + 
shouldEnhance: true,
+      method: "scraper",
+      reason: "Many results appear paywalled - scraper may bypass",
+    };
+  }
+
+  // Rule 4: Check if prediction needs historical/tabular data
+  const needsHistoricalData =
+    /\b(on|by|before|after|in)\s+\d{4}\b/.test(predictionText) ||
+    /\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\b/i.test(
+      predictionText,
+    ) ||
+    /(historical|past|previous|last year|yesterday|archive)/.test(
+      predictionText.toLowerCase(),
+    );
+
+  if (needsHistoricalData && !judgment.sufficient) {
+    return {
+      shouldEnhance: true,
+      method: "navigator",
+      reason:
+        "Prediction requires historical data - navigation can find archives/tables",
+    };
+  }
+
+  // Rule 5: Check if we're close but need better content (marginal evidence)
+  if (judgment.score >= 4 && judgment.score <= 7 && !judgment.sufficient) {
+    return {
+      shouldEnhance: true,
+      method: "scraper",
+      reason: "Marginal evidence found - deeper scraping may find specifics",
+    };
+  }
+
+  // Rule 6: If judgment suggests we need more context and score is low
+  if (!judgment.sufficient && judgment.score < 5) {
+    return {
+      shouldEnhance: true,
+      method: "scraper",
+      reason: "Insufficient evidence with low score - try deeper content extraction",
+    };
+  }
+
+  // Default: no enhancement
+  return {
+    shouldEnhance: false,
+    method: "none",
+    reason: "No clear benefit to enhancement given search results",
+  };
+}
+
+/**
+ * Enhance search results using scraper module
+ * Fetches full page content with JS rendering, extracts text, ranks by relevance
+ */
+export async function enhanceWithScraper(
+  predictionId: string,
+  predictionText: string,
+  urls: string[],
+  config: EnhancementConfig,
+): Promise<EnhancementResult | null> {
+  const budget = {
+    remaining: config.scraper.totalBudgetCredits,
+  };
+  let totalSpent = 0;
+
+  logWithContext(predictionId, "Starting scraper enhancement...");
+
+  const focus = `${predictionText} — exact numeric/date/time context`;
+  const allRanked: RankedForSource[] = [];
+
+  const urlsToFetch = urls.slice(0, config.scraper.maxUrls);
+
+  for (const url of urlsToFetch) {
+    if (budget.remaining <= 0) {
+      logWithContext(predictionId, "Scraper budget exhausted");
+      break;
+    }
+
+    try {
+      const perUrlBudget = {
+        remaining: Math.min(
+          config.scraper.perUrlBudgetCredits,
+          budget.remaining,
+        ),
+      };
+      const beforeBudget = perUrlBudget.remaining;
+
+      logWithContext(
+        predictionId,
+        `Fetching ${url} with budget ${perUrlBudget.remaining}`,
+      );
+
+      const result = await fetchWithEscalation(url, perUrlBudget, (html, meta) => {
+        if (meta.status === 200) return true;
+        const { blocked } = looksBlocked(meta.url, html);
+        return !blocked;
+      });
+
+      const spent = beforeBudget - perUrlBudget.remaining;
+      totalSpent += spent;
+      budget.remaining -= spent;
+
+      logWithContext(
+        predictionId,
+        `Fetched ${url}: ${result.html.length} chars, spent ${spent} credits`,
+      );
+
+      const text = stripHtml(result.html);
+      let chunks = chunkText(text, 1000).slice(0, 20);
+
+      // Try to find downloadable data links
+      const dataLinks = findDataDownloadLinks(result.html, url).slice(0, 2);
+      for (const dlink of dataLinks) {
+        try {
+          const dataResult = await fetchWithEscalation(dlink, perUrlBudget);
+          const dataText = stripHtml(dataResult.html);
+          const dataChunks = chunkText(dataText, 1200).slice(0, 3);
+          chunks = chunks.concat(dataChunks);
+        } catch (e) {
+          logWithContext(
+            predictionId,
+            `Failed to fetch data link ${dlink}: ${e instanceof Error ? e.message : String(e)}`,
+          );
+        }
+      }
+
+      // Recompute after data-link fetches so their credits are also counted
+      const dataSpent = beforeBudget - perUrlBudget.remaining - spent;
+      totalSpent += dataSpent;
+      budget.remaining -= dataSpent;
+
+      const ranked = await rankByEmbedding(focus, chunks);
+      allRanked.push({
+        url,
+        chunks: ranked.map((r) => ({ text: r.text, score: r.score })),
+      });
+
+      logWithContext(
+        predictionId,
+        `Ranked ${ranked.length} chunks for ${url}, top score: ${ranked[0]?.score.toFixed(3)}`,
+      );
+    } catch (e) {
+      const msg = e instanceof Error ? e.message : String(e);
+      logWithContext(predictionId, `Failed to fetch ${url}: ${msg}`);
+    }
+  }
+
+  if (allRanked.length === 0) {
+    logWithContext(predictionId, "No content extracted from any URL");
+    return null;
+  }
+
+  // Summarize all ranked chunks
+  try {
+    const now = new Date();
+    const anchorISO = now.toISOString().slice(0, 10);
+    const summary = await summarizeRankedChunks(
+      predictionText,
+      allRanked,
+      anchorISO,
+    );
+
+    logWithContext(
+      predictionId,
+      `Scraper enhancement: ${summary.status}, ${summary.citations.length} citations`,
+    );
+
+    return {
+      answer: summary.answer || "",
+      citations: summary.citations,
+      status: summary.status,
+      method: "scraper",
+      creditsSpent: totalSpent,
+    };
+  } catch (e) {
+    const msg = e instanceof Error ? e.message : String(e);
+    logWithContext(predictionId, `Scraper summarization failed: ${msg}`);
+    return null;
+  }
+}
+
+/**
+ * Enhance using browser navigation
+ * Launches browser and autonomously navigates to find evidence
+ */
+export async function enhanceWithNavigator(
+  predictionId: string,
+  predictionText: string,
+  startUrl: string,
+  config: EnhancementConfig,
+): Promise<EnhancementResult | null> {
+  logWithContext(predictionId, "Starting navigator enhancement...");
+
+  let env: BrowserEnv | null = null;
+
+  try {
+    env = await createBrowserEnv({
+      homeUrl: startUrl,
+      headless: true,
+      useScraperProxy: false,
+      debug: false,
+    });
+
+    const goal = `Find evidence to validate: "${predictionText}"`;
+
+    logWithContext(predictionId, `Navigator goal: ${goal}`);
+
+    const result = await runOccamBrowserLoop({
+      env,
+      goal,
+      initialUrl: startUrl,
+      maxSteps: config.navigator.maxSteps,
+      debug: false,
+    });
+
+    logWithContext(
+      predictionId,
+      `Navigator completed: ${result.steps} steps, action: ${result.lastAction?.action}`,
+    );
+
+    if (result.lastAction?.action === "stop") {
+      // Extract answer and try to build citations from final state
+      const answer = result.lastAction.answer || "No answer found";
+
+      // Extract text from final page to build pseudo-citations
+      const finalText = stripHtml(result.finalRawState.domHtml);
+      const chunks = chunkText(finalText, 800).slice(0, 5);
+
+      return {
+        answer,
+        citations: [
+          {
+            url: result.finalRawState.url,
+            quotes: chunks.slice(0, 2),
+          },
+        ],
+        status: answer !== "No answer found" ? "answered" : "insufficient",
+        method: "navigator",
+        stepsUsed: result.steps,
+      };
+    }
+
+    logWithContext(predictionId, "Navigator did not reach stop action");
+    return null;
+  } catch (e) {
+    const msg = e instanceof Error ?
e.message : String(e); + logWithContext(predictionId, `Navigator enhancement failed: ${msg}`); + return null; + } finally { + if (env) { + try { + await env.close(); + } catch (e) { + logWithContext(predictionId, "Failed to close browser"); + } + } + } +} diff --git a/src/validator.ts b/src/validator.ts index 252ef56..b49f5ef 100644 --- a/src/validator.ts +++ b/src/validator.ts @@ -15,6 +15,13 @@ import { QueryEnhancer, type PastAttempt } from "./llm/query-enhancer.js"; import { ResultJudge } from "./llm/result-judge.js"; import { truncateText, writeCostLog } from "./utils.js"; import { logWithContext, logErrorWithContext } from "./logger.js"; +import { + decideEnhancement, + enhanceWithScraper, + enhanceWithNavigator, + type EnhancementResult, + type EnhancementConfig, +} from "./enhancement/enhancer.js"; export const ValidationOutcome = z.enum([ "MaturedTrue", @@ -74,10 +81,17 @@ const VALIDATION_CONFIG = { TRUE_DEFINITIVE_MIN: 9, FALSE_DEFINITIVE_MAX: 2, }, + enhancement: { + SCRAPER_TOTAL_BUDGET_CREDITS: 50, + SCRAPER_PER_URL_BUDGET_CREDITS: 15, + SCRAPER_MAX_URLS: 3, + NAVIGATOR_MAX_STEPS: 10, + NAVIGATOR_TIMEOUT_MS: 120_000, // 2 minutes + }, } as const; export class Validator { - constructor(_db: DB) {} + constructor(_db: DB) { } /** * Check if a prediction should be validated before doing expensive operations @@ -461,6 +475,19 @@ export class Validator { const queryEnhancer = new QueryEnhancer(); const resultJudge = new ResultJudge(); + // Enhancement configuration + const enhancementConfig: EnhancementConfig = { + scraper: { + totalBudgetCredits: VALIDATION_CONFIG.enhancement.SCRAPER_TOTAL_BUDGET_CREDITS, + perUrlBudgetCredits: VALIDATION_CONFIG.enhancement.SCRAPER_PER_URL_BUDGET_CREDITS, + maxUrls: VALIDATION_CONFIG.enhancement.SCRAPER_MAX_URLS, + }, + navigator: { + maxSteps: VALIDATION_CONFIG.enhancement.NAVIGATOR_MAX_STEPS, + timeoutMs: VALIDATION_CONFIG.enhancement.NAVIGATOR_TIMEOUT_MS, + }, + }; + logWithContext(predictionId, "Starting hybrid validation"); logWithContext( predictionId, @@ -502,11 +529,41 @@ export class Validator { `Total results found: ${combinedResults.length}`, ); + // Early enhancement check: if no results, try navigator immediately if (combinedResults.length === 0) { + logWithContext( + predictionId, + "No search results - attempting navigator enhancement", + ); + const navResult = await enhanceWithNavigator( + predictionId, + predictionText, + "https://www.google.com", + enhancementConfig, + ); + + if (navResult && navResult.status === "answered") { + logWithContext( + predictionId, + "Navigator found evidence when search failed", + ); + return { + prediction_id: predictionId, + outcome: "MaturedTrue", + proof: `Navigator enhancement: ${navResult.answer}`, + sources: navResult.citations.map((c) => ({ + url: c.url, + title: "Navigator-discovered evidence", + pub_date: null, + excerpt: c.quotes[0] || "", + })), + }; + } + return { prediction_id: predictionId, outcome: "MissingContext", - proof: "No search results found", + proof: "No search results found and navigator enhancement failed", sources: [], }; } @@ -589,6 +646,72 @@ export class Validator { ); } + // Enhancement decision: try scraper or navigator if still insufficient + let enhancementResult: EnhancementResult | null = null; + const enhancementDecision = decideEnhancement( + predictionText, + combinedResults, + judgment, + ); + + logWithContext( + predictionId, + `Enhancement decision: ${enhancementDecision.method} - ${enhancementDecision.reason}`, + ); + + if 
(enhancementDecision.shouldEnhance) { + if (enhancementDecision.method === "scraper") { + const urls = combinedResults.slice(0, 3).map((r) => r.url); + enhancementResult = await enhanceWithScraper( + predictionId, + predictionText, + urls, + enhancementConfig, + ); + } else if (enhancementDecision.method === "navigator") { + const startUrl = + combinedResults[0]?.url || "https://www.google.com"; + enhancementResult = await enhanceWithNavigator( + predictionId, + predictionText, + startUrl, + enhancementConfig, + ); + } + + if (enhancementResult && enhancementResult.status === "answered") { + logWithContext( + predictionId, + `Enhancement succeeded via ${enhancementResult.method}`, + ); + const result = enhancementResult; // Type narrowing for closure + // Override judgment with enhancement result + judgment = { + ...judgment, + decision: "TRUE", + score: 8, + summary: result.answer, + evidence: result.citations + .map((c) => `${c.url}: ${c.quotes.slice(0, 1).join("; ")}`) + .join("\n"), + sufficient: true, + }; + // Add enhancement sources to results + const enhancementSources = result.citations.map((c) => ({ + url: c.url, + title: `Enhanced via ${result.method}`, + pub_date: null, + excerpt: c.quotes[0] || "", + })); + combinedResults = [...enhancementSources, ...combinedResults]; + } else { + logWithContext( + predictionId, + `Enhancement failed or returned insufficient data`, + ); + } + } + let outcome: ValidationResult["outcome"]; if (judgment.decision === "TRUE") { From 731826b8951844c8a9500a163205bb7ee2b5b850 Mon Sep 17 00:00:00 2001 From: theu Date: Fri, 5 Dec 2025 16:33:34 -0300 Subject: [PATCH 03/10] enhancement test script --- src/test/test-enhancement.ts | 147 +++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 src/test/test-enhancement.ts diff --git a/src/test/test-enhancement.ts b/src/test/test-enhancement.ts new file mode 100644 index 0000000..720bcab --- /dev/null +++ b/src/test/test-enhancement.ts @@ -0,0 +1,147 @@ +import "dotenv/config"; +import { + enhanceWithScraper, + enhanceWithNavigator, + type EnhancementConfig, +} from "../enhancement/enhancer.js"; + +/** + * Test enhancement methods directly + * Usage: + * npm run test:enhancement scraper "ENS moving to L2" url1 url2 url3 + * npm run test:enhancement navigator "Bitcoin price on Jan 1 2024" https://google.com + */ + +async function testEnhancement() { + const method = process.argv[2]; // "scraper" or "navigator" + const predictionText = process.argv[3]; + + if (!method || !predictionText) { + console.error("โŒ Usage:"); + console.error(' Scraper: npm run test:enhancement scraper "prediction text" url1 url2 url3'); + console.error(' Navigator: npm run test:enhancement navigator "prediction text" https://start-url.com'); + process.exit(1); + } + + const predictionId = "test-" + Date.now(); + + const config: EnhancementConfig = { + scraper: { + totalBudgetCredits: 50, + perUrlBudgetCredits: 15, + maxUrls: 3, + }, + navigator: { + maxSteps: 10, + timeoutMs: 120_000, + }, + }; + + console.log("=== Enhancement Test ===\n"); + console.log(`Method: ${method}`); + console.log(`Prediction: "${predictionText}"`); + console.log(); + + try { + if (method === "scraper") { + const urls = process.argv.slice(4); + if (urls.length === 0) { + console.error("โŒ No URLs provided for scraper test"); + console.error(" Provide at least one URL to scrape"); + process.exit(1); + } + + console.log(`URLs to scrape: ${urls.length}`); + urls.forEach((url, i) => console.log(` ${i + 1}. 
${url}`)); + console.log(); + + console.log("๐Ÿ”ง Starting scraper enhancement...\n"); + console.log("=".repeat(60)); + + const result = await enhanceWithScraper( + predictionId, + predictionText, + urls, + config, + ); + + console.log("=".repeat(60)); + console.log(); + + if (result) { + console.log("โœ… Scraper Enhancement Result:"); + console.log(` Status: ${result.status}`); + console.log(` Credits spent: ${result.creditsSpent || 0}`); + console.log(` Answer: ${result.answer.slice(0, 300)}${result.answer.length > 300 ? "..." : ""}`); + console.log(` Citations: ${result.citations.length}`); + if (result.citations.length > 0) { + console.log("\n Citations:"); + result.citations.forEach((c, i) => { + console.log(` ${i + 1}. ${c.url}`); + console.log(` Quotes: ${c.quotes.length}`); + c.quotes.slice(0, 2).forEach((q, j) => { + console.log(` ${j + 1}. "${q.slice(0, 100)}${q.length > 100 ? "..." : ""}"`); + }); + }); + } + } else { + console.log("โŒ Scraper enhancement failed"); + } + } else if (method === "navigator") { + const startUrl = process.argv[4] || "https://www.google.com"; + + console.log(`Start URL: ${startUrl}`); + console.log(); + + console.log("๐Ÿ”ง Starting navigator enhancement...\n"); + console.log("=".repeat(60)); + + const result = await enhanceWithNavigator( + predictionId, + predictionText, + startUrl, + config, + ); + + console.log("=".repeat(60)); + console.log(); + + if (result) { + console.log("โœ… Navigator Enhancement Result:"); + console.log(` Status: ${result.status}`); + console.log(` Steps used: ${result.stepsUsed || 0}`); + console.log(` Answer: ${result.answer.slice(0, 300)}${result.answer.length > 300 ? "..." : ""}`); + console.log(` Citations: ${result.citations.length}`); + if (result.citations.length > 0) { + console.log("\n Citations:"); + result.citations.forEach((c, i) => { + console.log(` ${i + 1}. ${c.url}`); + console.log(` Quotes: ${c.quotes.length}`); + c.quotes.slice(0, 2).forEach((q, j) => { + console.log(` ${j + 1}. "${q.slice(0, 100)}${q.length > 100 ? "..." 
: ""}"`); + }); + }); + } + } else { + console.log("โŒ Navigator enhancement failed"); + } + } else { + console.error(`โŒ Unknown method: ${method}`); + console.error(" Use 'scraper' or 'navigator'"); + process.exit(1); + } + } catch (error) { + console.error("\nโŒ Fatal error:", error); + if (error instanceof Error && error.stack) { + console.error(error.stack); + } + process.exit(1); + } + + process.exit(0); +} + +testEnhancement().catch((error) => { + console.error("โŒ Fatal error:", error); + process.exit(1); +}); From 299a0ca6a79a5ac83981efedbab6dbae3f181acf Mon Sep 17 00:00:00 2001 From: theu Date: Fri, 5 Dec 2025 16:46:19 -0300 Subject: [PATCH 04/10] bugfixes, add enhancement cost logging --- src/enhancement/enhancer.ts | 51 +++++++++++++++++++++++++++--------- src/test/test-enhancement.ts | 1 - src/utils.ts | 3 +++ src/validator.ts | 38 +++++++++++++++------------ 4 files changed, 62 insertions(+), 31 deletions(-) diff --git a/src/enhancement/enhancer.ts b/src/enhancement/enhancer.ts index 5ce5c75..1d545fc 100644 --- a/src/enhancement/enhancer.ts +++ b/src/enhancement/enhancer.ts @@ -1,10 +1,7 @@ import { logWithContext } from "../logger.js"; import type { SearchResult } from "../search/searchapi.js"; import type { Judgment } from "../llm/result-judge.js"; -import { - fetchWithEscalation, - looksBlocked, -} from "../scraper/scraper.fetch.js"; +import { fetchWithEscalation, looksBlocked } from "../scraper/scraper.fetch.js"; import { stripHtml, chunkText, @@ -19,6 +16,39 @@ import { createBrowserEnv } from "../browser/playwright/browserEnv.js"; import { runOccamBrowserLoop } from "../browser/playwright/occamBrowserLoop.js"; import { type BrowserEnv } from "../browser/playwright/types.js"; +/** + * Check if a search result excerpt suggests blocked/paywalled content. 
+ */ +function excerptLooksBlocked(excerpt: string): boolean { + const lower = excerpt.toLowerCase(); + + // Subscription/paywall indicators + const paywallKeywords = [ + "subscribe", + "sign in to read", + "login to continue", + "create account", + "members only", + "premium content", + "exclusive access", + "unlock this article", + "start your free trial", + "paid subscribers", + ]; + + // Bot-wall indicators that might appear in excerpts + const botWallKeywords = [ + "access denied", + "please enable javascript", + "checking your browser", + "just a moment", + "verify you are human", + ]; + + const allKeywords = [...paywallKeywords, ...botWallKeywords]; + return allKeywords.some((keyword) => lower.includes(keyword)); +} + export interface EnhancementDecision { shouldEnhance: boolean; method: "none" | "scraper" | "navigator"; @@ -42,7 +72,6 @@ export interface EnhancementConfig { }; navigator: { maxSteps: number; - timeoutMs: number; }; } @@ -73,20 +102,16 @@ export function decideEnhancement( }; } - // Rule 3: Check if results are paywalled/blocked - const blockedCount = searchResults.filter( - (r) => - r.excerpt.toLowerCase().includes("subscribe") || - r.excerpt.toLowerCase().includes("sign in to read") || - r.excerpt.toLowerCase().includes("login to continue") || - r.excerpt.toLowerCase().includes("create account"), + // Rule 3: Check if results are paywalled/blocked using consolidated detection + const blockedCount = searchResults.filter((r) => + excerptLooksBlocked(r.excerpt), ).length; if (blockedCount > searchResults.length / 2) { return { shouldEnhance: true, method: "scraper", - reason: "Many results appear paywalled - scraper may bypass", + reason: "Many results appear paywalled/blocked - scraper may bypass", }; } diff --git a/src/test/test-enhancement.ts b/src/test/test-enhancement.ts index 720bcab..74f77b1 100644 --- a/src/test/test-enhancement.ts +++ b/src/test/test-enhancement.ts @@ -33,7 +33,6 @@ async function testEnhancement() { }, navigator: { maxSteps: 10, - timeoutMs: 120_000, }, }; diff --git a/src/utils.ts b/src/utils.ts index 7967832..e089a19 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -34,6 +34,9 @@ export interface CostLogEntry { totalOutputTokens: number; outcome: string; timestamp: string; + enhancementMethod?: "scraper" | "navigator" | "none" | undefined; + scraperCreditsSpent?: number | undefined; + navigatorStepsUsed?: number | undefined; } /** diff --git a/src/validator.ts b/src/validator.ts index b49f5ef..7bc544d 100644 --- a/src/validator.ts +++ b/src/validator.ts @@ -86,7 +86,6 @@ const VALIDATION_CONFIG = { SCRAPER_PER_URL_BUDGET_CREDITS: 15, SCRAPER_MAX_URLS: 3, NAVIGATOR_MAX_STEPS: 10, - NAVIGATOR_TIMEOUT_MS: 120_000, // 2 minutes }, } as const; @@ -484,7 +483,6 @@ export class Validator { }, navigator: { maxSteps: VALIDATION_CONFIG.enhancement.NAVIGATOR_MAX_STEPS, - timeoutMs: VALIDATION_CONFIG.enhancement.NAVIGATOR_TIMEOUT_MS, }, }; @@ -684,26 +682,29 @@ export class Validator { predictionId, `Enhancement succeeded via ${enhancementResult.method}`, ); - const result = enhancementResult; // Type narrowing for closure - // Override judgment with enhancement result - judgment = { - ...judgment, - decision: "TRUE", - score: 8, - summary: result.answer, - evidence: result.citations - .map((c) => `${c.url}: ${c.quotes.slice(0, 1).join("; ")}`) - .join("\n"), - sufficient: true, - }; - // Add enhancement sources to results - const enhancementSources = result.citations.map((c) => ({ + // Add enhancement sources to results for re-evaluation + const 
method = enhancementResult.method;
+        const enhancementSources = enhancementResult.citations.map((c) => ({
           url: c.url,
-          title: `Enhanced via ${result.method}`,
+          title: `Enhanced via ${method}`,
           pub_date: null,
           excerpt: c.quotes[0] || "",
         }));
         combinedResults = [...enhancementSources, ...combinedResults];
+
+        // Re-evaluate with enhanced evidence using ResultJudge
+        logWithContext(
+          predictionId,
+          `Re-evaluating with ${enhancementSources.length} enhanced sources`,
+        );
+        judgment = await resultJudge.evaluate(predictionText, combinedResults);
+        totalResultJudgeInputTokens += judgment.inputTokens;
+        totalResultJudgeOutputTokens += judgment.outputTokens;
+
+        logWithContext(
+          predictionId,
+          `Post-enhancement judgment: ${judgment.decision} (score: ${judgment.score})`,
+        );
       } else {
         logWithContext(
           predictionId,
@@ -762,6 +763,9 @@
         totalOutputTokens,
         outcome,
         timestamp: new Date().toISOString(),
+        enhancementMethod: enhancementDecision.method,
+        scraperCreditsSpent: enhancementResult?.creditsSpent,
+        navigatorStepsUsed: enhancementResult?.stepsUsed,
       });

       const sources =

From 3ab858504ac569549b9e8710b0fd96c76fc85158 Mon Sep 17 00:00:00 2001
From: theu
Date: Mon, 8 Dec 2025 17:43:09 -0300
Subject: [PATCH 05/10] integrate LLM picker to select promising URLs

---
 src/enhancement/enhancer.ts  | 40 +++++++++++++++++++++++++++++++++---
 src/test/test-enhancement.ts |  9 +++++++-
 src/validator.ts             |  4 ++--
 3 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/src/enhancement/enhancer.ts b/src/enhancement/enhancer.ts
index 1d545fc..89533db 100644
--- a/src/enhancement/enhancer.ts
+++ b/src/enhancement/enhancer.ts
@@ -15,6 +15,8 @@ import { rankByEmbedding } from "../llm/embeddings.js";
 import { createBrowserEnv } from "../browser/playwright/browserEnv.js";
 import { runOccamBrowserLoop } from "../browser/playwright/occamBrowserLoop.js";
 import { type BrowserEnv } from "../browser/playwright/types.js";
+import { gatePicksWithLLM } from "../scraper/scraper.llm-gate.js";
+import type { SerpItem } from "../scraper/scraper.schemas.js";

 /**
  * Check if a search result excerpt suggests blocked/paywalled content.
@@ -163,11 +165,12 @@
 /**
  * Enhance search results using scraper module
  * Fetches full page content with JS rendering, extracts text, ranks by relevance
+ * Uses LLM gate picker to intelligently select best URLs to scrape
  */
 export async function enhanceWithScraper(
   predictionId: string,
   predictionText: string,
-  urls: string[],
+  searchResults: SearchResult[],
   config: EnhancementConfig,
 ): Promise<EnhancementResult | null> {
   const budget = {
     remaining: config.scraper.totalBudgetCredits,
   };
   let totalSpent = 0;

   logWithContext(predictionId, "Starting scraper enhancement...");

+  // Convert SearchResult[] to SerpItem[] format for gatePicksWithLLM
+  const serpItems: SerpItem[] = searchResults.map((r) => ({
+    title: r.title,
+    link: r.url,
+    snippet: r.excerpt,
+    date: r.pub_date,
+    domain: new URL(r.url).hostname,
+  }));
+
+  // Use LLM gate picker to intelligently select URLs
+  logWithContext(
+    predictionId,
+    `Using LLM gate picker to select from ${serpItems.length} results`,
+  );
+
+  let urlsToFetch: string[];
+  try {
+    urlsToFetch = await gatePicksWithLLM(
+      predictionText,
+      serpItems,
+      config.scraper.maxUrls,
+    );
+    logWithContext(
+      predictionId,
+      `LLM gate picker selected ${urlsToFetch.length} URLs: ${urlsToFetch.join(", ")}`,
+    );
+  } catch (e) {
+    const msg = e instanceof Error ?
e.message : String(e); + logWithContext(predictionId, `LLM gate picker failed: ${msg}, falling back to top ${config.scraper.maxUrls} URLs`); + // Fallback to naive top-N if LLM fails + urlsToFetch = searchResults.slice(0, config.scraper.maxUrls).map((r) => r.url); + } + const focus = `${predictionText} โ€” exact numeric/date/time context`; const allRanked: RankedForSource[] = []; - const urlsToFetch = urls.slice(0, config.scraper.maxUrls); - for (const url of urlsToFetch) { if (budget.remaining <= 0) { logWithContext(predictionId, "Scraper budget exhausted"); diff --git a/src/test/test-enhancement.ts b/src/test/test-enhancement.ts index 74f77b1..6ea5347 100644 --- a/src/test/test-enhancement.ts +++ b/src/test/test-enhancement.ts @@ -54,13 +54,20 @@ async function testEnhancement() { urls.forEach((url, i) => console.log(` ${i + 1}. ${url}`)); console.log(); + const searchResults = urls.map((url, i) => ({ + url, + title: `Test result ${i + 1}`, + excerpt: `Test excerpt for ${url}`, + pub_date: null, + })); + console.log("๐Ÿ”ง Starting scraper enhancement...\n"); console.log("=".repeat(60)); const result = await enhanceWithScraper( predictionId, predictionText, - urls, + searchResults, config, ); diff --git a/src/validator.ts b/src/validator.ts index 7bc544d..0acba5c 100644 --- a/src/validator.ts +++ b/src/validator.ts @@ -659,11 +659,11 @@ export class Validator { if (enhancementDecision.shouldEnhance) { if (enhancementDecision.method === "scraper") { - const urls = combinedResults.slice(0, 3).map((r) => r.url); + // Pass search results to enhancer for intelligent URL selection via LLM gate picker enhancementResult = await enhanceWithScraper( predictionId, predictionText, - urls, + combinedResults, enhancementConfig, ); } else if (enhancementDecision.method === "navigator") { From 28772e86a98f5874002454be090c803c5592b82e Mon Sep 17 00:00:00 2001 From: theu Date: Mon, 8 Dec 2025 17:54:36 -0300 Subject: [PATCH 06/10] full pipeline test --- package.json | 1 + src/test/test-validation.ts | 126 ++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 src/test/test-validation.ts diff --git a/package.json b/package.json index 6195eca..23192c3 100644 --- a/package.json +++ b/package.json @@ -13,6 +13,7 @@ "dev": "tsx ./src/index.ts", "test:scraper": "tsx ./src/test/scraper.pipeline.ts", "test:enhancement": "tsx ./src/test/test-enhancement.ts", + "test:validation": "tsx ./src/test/test-validation.ts", "mcp:playwright": "npx @playwright/mcp@latest --config ./playwright-mcp.config.json --headless --port 8931", "format": "prettier --check . --cache", "format-fix": "prettier --write . 
--cache", diff --git a/src/test/test-validation.ts b/src/test/test-validation.ts new file mode 100644 index 0000000..d909bc8 --- /dev/null +++ b/src/test/test-validation.ts @@ -0,0 +1,126 @@ +import "dotenv/config"; +import { randomUUID } from "node:crypto"; +import { Validator } from "../validator.js"; +import { createDb } from "../db/client.js"; +import type { PredictionToValidate } from "../validator.js"; + +/** + * Test the full validation pipeline on a custom prediction text + * Usage: + * npm run test:validation:custom "Bitcoin will reach $50k by end of 2024" + */ + +async function testCustomValidation() { + const predictionText = process.argv.slice(2).join(" "); + + if (!predictionText) { + console.error("โŒ Usage: npm run test:validation:custom \"prediction text\""); + console.error(' Example: npm run test:validation:custom "Bitcoin will reach $50k by end of 2024"'); + process.exit(1); + } + + console.log("=== Full Validation Pipeline Test (Custom) ===\n"); + console.log(`Prediction: "${predictionText}"`); + console.log(); + + const db = createDb(); + const validator = new Validator(db); + + try { + await db.transaction(async (tx) => { + // Create a mock prediction object + const testId = randomUUID(); + const mockPrediction: PredictionToValidate = { + parsedPrediction: { + id: testId, + predictionId: randomUUID(), + goal: [{ start: 0, end: predictionText.length }], + timeframe: null, + topicId: null, + predictionQuality: 80, + llmConfidence: "0.9", + briefRationale: "Test prediction", + vagueness: "0.3", + context: null, + filterAgentId: null, + createdAt: new Date(), + updatedAt: new Date(), + }, + parsedPredictionDetails: { + parsedPredictionId: testId, + predictionContext: predictionText, + timeframeStatus: "valid", + timeframeStartUtc: null, + timeframeEndUtc: new Date(), + timeframePrecision: null, + timeframeReasoning: null, + timeframeAssumptions: null, + timeframeConfidence: null, + filterValidationConfidence: "0.95", + filterValidationReasoning: null, + verdictConfidence: null, + verdictSources: null, + createdAt: new Date(), + updatedAt: new Date(), + }, + scrapedTweet: { + id: BigInt(0), + text: predictionText, + authorId: BigInt(0), + date: new Date(), + conversationId: null, + parentTweetId: null, + predictionId: null, + createdAt: new Date(), + updatedAt: new Date(), + }, + }; + + console.log("๐Ÿ”ง Running full validation pipeline...\n"); + console.log("=".repeat(80)); + console.log(); + + const validationResult = await validator.validatePrediction( + tx, + mockPrediction, + ); + + console.log(); + console.log("=".repeat(80)); + console.log(); + + // Display results + console.log("โœ… Validation Complete!\n"); + console.log(`Outcome: ${validationResult.outcome}`); + console.log(`\nProof (${validationResult.proof.length} chars):`); + console.log("โ”€".repeat(80)); + console.log(validationResult.proof); + console.log("โ”€".repeat(80)); + console.log(`\nSources: ${validationResult.sources.length}`); + if (validationResult.sources.length > 0) { + validationResult.sources.forEach((source, i) => { + console.log(`\n ${i + 1}. ${source.title}`); + console.log(` URL: ${source.url}`); + console.log(` Date: ${source.pub_date || "N/A"}`); + console.log(` Excerpt: "${source.excerpt.slice(0, 150)}${source.excerpt.length > 150 ? "..." 
: ""}"`);
+        });
+      }
+
+      return validationResult;
+    });
+
+    console.log("\n✅ Test completed successfully");
+    process.exit(0);
+  } catch (error) {
+    console.error("\n❌ Fatal error:", error);
+    if (error instanceof Error && error.stack) {
+      console.error(error.stack);
+    }
+    process.exit(1);
+  }
+}
+
+testCustomValidation().catch((error) => {
+  console.error("❌ Fatal error:", error);
+  process.exit(1);
+});

From f0a78b410e01f5979e9d523943b7f0bdf7ffaf02 Mon Sep 17 00:00:00 2001
From: theu
Date: Mon, 15 Dec 2025 18:43:02 -0300
Subject: [PATCH 07/10] add URL deduplication

---
 src/validator.ts | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/src/validator.ts b/src/validator.ts
index 0acba5c..0f801f5 100644
--- a/src/validator.ts
+++ b/src/validator.ts
@@ -521,10 +521,22 @@
     let totalResultJudgeOutputTokens = 0;
     let searchApiCalls = VALIDATION_CONFIG.search.INITIAL_QUERIES;

+    // Track visited URLs for deduplication across iterations
+    const visitedUrls = new Set<string>();
+
     let combinedResults = initialResultSets.flat();
+    // Deduplicate initial results
+    combinedResults = combinedResults.filter((result) => {
+      if (visitedUrls.has(result.url)) {
+        return false;
+      }
+      visitedUrls.add(result.url);
+      return true;
+    });
+
     logWithContext(
       predictionId,
-      `Total results found: ${combinedResults.length}`,
+      `Total results found: ${combinedResults.length} (after deduplication)`,
     );
@@ -628,10 +640,19 @@
       );
       searchApiCalls++;

-      combinedResults = [...combinedResults, ...refinedResults];
+      // Deduplicate refined results against already visited URLs
+      const newResults = refinedResults.filter((result) => {
+        if (visitedUrls.has(result.url)) {
+          return false;
+        }
+        visitedUrls.add(result.url);
+        return true;
+      });
+
+      combinedResults = [...combinedResults, ...newResults];
       logWithContext(
         predictionId,
-        `Additional results: ${refinedResults.length}, Total: ${combinedResults.length}`,
+        `Additional results: ${refinedResults.length} (${newResults.length} new), Total: ${combinedResults.length}`,
       );
@@ -684,12 +705,17 @@
         );
         // Add enhancement sources to results for re-evaluation
         const method = enhancementResult.method;
-        const enhancementSources = enhancementResult.citations.map((c) => ({
-          url: c.url,
-          title: `Enhanced via ${method}`,
-          pub_date: null,
-          excerpt: c.quotes[0] || "",
-        }));
+        const enhancementSources = enhancementResult.citations
+          .filter((c) => !visitedUrls.has(c.url)) // Deduplicate enhancement sources
+          .map((c) => {
+            visitedUrls.add(c.url);
+            return {
+              url: c.url,
+              title: `Enhanced via ${method}`,
+              pub_date: null,
+              excerpt: c.quotes[0] || "",
+            };
+          });
         combinedResults = [...enhancementSources, ...combinedResults];

From 9fc6c13432babf1e97a9f6feeb77184607a05a22 Mon Sep 17 00:00:00 2001
From: theu
Date: Mon, 15 Dec 2025 19:01:37 -0300
Subject: [PATCH 08/10] add query enhancement retries

---
 src/validator.ts | 53 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/src/validator.ts b/src/validator.ts
index 0f801f5..f63f38b 100644
--- a/src/validator.ts
+++ b/src/validator.ts
@@ -69,7 +69,7 @@ const VALIDATION_CONFIG = {
     INITIAL_QUERIES: 2,
     RESULTS_PER_QUERY: 10,
     MAX_TOTAL_RESULTS: 30,
-    
MAX_REFINEMENT_ITERATIONS: 1, + MAX_SERP_ITERATIONS: 3, }, quality: { FILTER_VALIDATION_CONFIDENCE_MIN: 0.85, @@ -594,14 +594,21 @@ export class Validator { `Judgment: ${judgment.decision} (score: ${judgment.score}), Sufficient: ${judgment.sufficient ? "yes" : "no"}`, ); - if ( + // Iterative SERP refinement loop + let serpIteration = 0; + const allQueries: string[] = [...initialQueryResult.queries]; + + while ( !judgment.sufficient && + serpIteration < VALIDATION_CONFIG.search.MAX_SERP_ITERATIONS && combinedResults.length < VALIDATION_CONFIG.search.MAX_TOTAL_RESULTS ) { + serpIteration++; logWithContext( predictionId, - "Step 4: Results insufficient, generating refined query...", + `Step 4.${serpIteration}: Results insufficient, generating refined query (iteration ${serpIteration}/${VALIDATION_CONFIG.search.MAX_SERP_ITERATIONS})...`, ); + if (judgment.nextQuerySuggestion) { logWithContext( predictionId, @@ -609,18 +616,19 @@ export class Validator { ); } - const pastAttempts: PastAttempt[] = initialQueryResult.queries.map( - (q) => { - const attempt: PastAttempt = { - query: q, - success: false, - }; - if (judgment.nextQuerySuggestion) { - attempt.reasoning = judgment.nextQuerySuggestion; - } - return attempt; - }, - ); + // Build past attempts from all queries so far + // Only attach the suggestion reasoning to the most recent query + const pastAttempts: PastAttempt[] = allQueries.map((q, idx) => { + const attempt: PastAttempt = { + query: q, + success: false, + }; + // Only add reasoning to the last query (current judgment feedback) + if (judgment.nextQuerySuggestion && idx === allQueries.length - 1) { + attempt.reasoning = judgment.nextQuerySuggestion; + } + return attempt; + }); const refinedQueryResult = await queryEnhancer.enhanceWithTokens( predictionText, @@ -629,6 +637,8 @@ export class Validator { totalQueryEnhancerInputTokens += refinedQueryResult.inputTokens; totalQueryEnhancerOutputTokens += refinedQueryResult.outputTokens; + allQueries.push(refinedQueryResult.query); + logWithContext( predictionId, `Refined query: "${refinedQueryResult.query}"`, @@ -652,16 +662,25 @@ export class Validator { combinedResults = [...combinedResults, ...newResults]; logWithContext( predictionId, - `Additional results: ${refinedResults.length} (${newResults.length} new), Total: ${combinedResults.length}`, + `Iteration ${serpIteration}: ${refinedResults.length} results (${newResults.length} new), Total: ${combinedResults.length}`, ); + // Re-evaluate with new results judgment = await resultJudge.evaluate(predictionText, combinedResults); totalResultJudgeInputTokens += judgment.inputTokens; totalResultJudgeOutputTokens += judgment.outputTokens; logWithContext( predictionId, - `Final judgment: ${judgment.decision} (score: ${judgment.score})`, + `Iteration ${serpIteration} judgment: ${judgment.decision} (score: ${judgment.score}), Sufficient: ${judgment.sufficient ? "yes" : "no"}`, + ); + } + + // Log final iteration summary + if (serpIteration > 0) { + logWithContext( + predictionId, + `Completed ${serpIteration} SERP iteration(s). 
Final: ${judgment.decision} (score: ${judgment.score})`,
+      );
+    }

     let outcome: ValidationResult["outcome"];

     if (judgment.decision === "TRUE") {

From 77c01effef942153cdd581ce6cb555af8a5ca343 Mon Sep 17 00:00:00 2001
From: theu
Date: Tue, 16 Dec 2025 21:14:21 -0300
Subject: [PATCH 09/10] add fixed e2e test

---
 .gitignore                               |   2 +
 package.json                             |   1 +
 src/test/e2e-benchmark.test.ts           | 321 +++++++++++++++++++++++
 test-fixtures/README.md                  | 122 +++++++++
 test-fixtures/benchmark-predictions.json |  56 ++++
 5 files changed, 502 insertions(+)
 create mode 100644 src/test/e2e-benchmark.test.ts
 create mode 100644 test-fixtures/README.md
 create mode 100644 test-fixtures/benchmark-predictions.json

diff --git a/.gitignore b/.gitignore
index c753693..4163853 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,5 @@ drizzle.config.ts

 reference/
 AgentOccam/
+
+test-results/
diff --git a/package.json b/package.json
index 23192c3..4045f0b 100644
--- a/package.json
+++ b/package.json
@@ -14,6 +14,7 @@
   "test:scraper": "tsx ./src/test/scraper.pipeline.ts",
   "test:enhancement": "tsx ./src/test/test-enhancement.ts",
   "test:validation": "tsx ./src/test/test-validation.ts",
+  "test:benchmark": "vitest run src/test/e2e-benchmark.test.ts",
   "mcp:playwright": "npx @playwright/mcp@latest --config ./playwright-mcp.config.json --headless --port 8931",
   "format": "prettier --check . --cache",
   "format-fix": "prettier --write . --cache",
diff --git a/src/test/e2e-benchmark.test.ts b/src/test/e2e-benchmark.test.ts
new file mode 100644
index 0000000..555d7ac
--- /dev/null
+++ b/src/test/e2e-benchmark.test.ts
@@ -0,0 +1,321 @@
+import "dotenv/config";
+import { describe, it, expect } from "vitest";
+import { readFile, writeFile, mkdir } from "node:fs/promises";
+import { createDb } from "../db/client.js";
+import { Validator } from "../validator.js";
+import {
+  parsedPrediction,
+  parsedPredictionDetails,
+  scrapedTweet,
+} from "../db/schema.js";
+import { eq } from "drizzle-orm";
+import type { PredictionToValidate } from "../validator.js";
+
+interface BenchmarkFixture {
+  description: string;
+  predictions: Array<{
+    id: string;
+    description: string;
+    expectedOutcome: string | null;
+  }>;
+}
+
+interface BenchmarkResult {
+  predictionId: string;
+  description: string;
+  outcome: string;
+  proofLength: number;
+  sourcesCount: number;
+  costs: {
+    searchApiCalls: number;
+    queryEnhancerInputTokens: number;
+    queryEnhancerOutputTokens: number;
+    resultJudgeInputTokens: number;
+    resultJudgeOutputTokens: number;
+    totalInputTokens: number;
+    totalOutputTokens: number;
+    scraperCreditsSpent?: number;
+    navigatorStepsUsed?: number;
+    enhancementMethod?: string;
+  };
+  durationMs: number;
+  timestamp: string;
+}
+
+interface BenchmarkRun {
+  runId: string;
+  timestamp: string;
+  predictions: BenchmarkResult[];
+  totals: {
+    totalSearchApiCalls: number;
+    totalInputTokens: number;
+    totalOutputTokens: number;
+    totalScraperCredits: number;
+    totalNavigatorSteps: number;
+    totalDurationMs: number;
+    averageDurationMs: number;
+  };
+}
+
+/**
+ * Load benchmark fixture
+ */
+async function loadFixture(): Promise<BenchmarkFixture> {
+  try {
+    const content = await readFile(
+      "test-fixtures/benchmark-predictions.json",
+      "utf-8",
+    );
+    return JSON.parse(content);
+  } catch (error) {
+    throw new Error(
+      'Failed to load test-fixtures/benchmark-predictions.json. Run "tsx src/test/fetch-benchmark-predictions.ts" first.',
+    );
+  }
+}
+
+/**
+ * Fetch prediction from database
+ */
+async function fetchPrediction(
+  db: ReturnType<typeof createDb>,
+  predictionId: string,
+): Promise<PredictionToValidate | null> {
+  const results = await db
+    .select({
+      parsedPrediction: parsedPrediction,
+      parsedPredictionDetails: parsedPredictionDetails,
+      scrapedTweet: scrapedTweet,
+    })
+    .from(parsedPrediction)
+    .innerJoin(
+      parsedPredictionDetails,
+      eq(parsedPrediction.id, parsedPredictionDetails.parsedPredictionId),
+    )
+    .innerJoin(
+      scrapedTweet,
+      eq(parsedPrediction.predictionId, scrapedTweet.predictionId),
+    )
+    .where(eq(parsedPrediction.id, predictionId))
+    .limit(1);
+
+  return results[0] || null;
+}
+
+/**
+ * Extract cost metrics from cost log file
+ */
+async function extractCostMetrics(
+  predictionId: string,
+): Promise<BenchmarkResult["costs"] | null> {
+  try {
+    const logContent = await readFile("costs.json", "utf-8");
+
+    if (!logContent || logContent.trim().length === 0) {
+      console.warn(`  ⚠️ costs.json is empty for ${predictionId}`);
+      return null;
+    }
+
+    const lines = logContent.trim().split("\n");
+
+    // Find the most recent entry for this prediction
+    for (let i = lines.length - 1; i >= 0; i--) {
+      const line = lines[i];
+      if (!line || line.trim().length === 0) continue;
+
+      try {
+        const entry = JSON.parse(line);
+        if (entry.prediction_id === predictionId) {
+          return {
+            searchApiCalls: entry.searchApiCalls || 0,
+            queryEnhancerInputTokens: entry.queryEnhancerInputTokens || 0,
+            queryEnhancerOutputTokens: entry.queryEnhancerOutputTokens || 0,
+            resultJudgeInputTokens: entry.resultJudgeInputTokens || 0,
+            resultJudgeOutputTokens: entry.resultJudgeOutputTokens || 0,
+            totalInputTokens: entry.totalInputTokens || 0,
+            totalOutputTokens: entry.totalOutputTokens || 0,
+            scraperCreditsSpent: entry.scraperCreditsSpent,
+            navigatorStepsUsed: entry.navigatorStepsUsed,
+            enhancementMethod: entry.enhancementMethod,
+          };
+        }
+      } catch (parseError) {
+        console.warn(`  ⚠️ Failed to parse cost log line: ${line.slice(0, 50)}...`);
+        continue;
+      }
+    }
+
+    console.warn(`  ⚠️ No cost entry found for prediction ${predictionId} in costs.json`);
+    return null;
+  } catch (error) {
+    const errorMsg = error instanceof Error ? error.message : String(error);
+    console.warn(`  ⚠️ Could not read costs.json: ${errorMsg}`);
+    if ((error as NodeJS.ErrnoException)?.code === 'ENOENT') {
+      console.warn(`  💡 File costs.json does not exist yet. It will be created on first validation.`);
+    }
+  }
+
+  return null;
+}
+
+/**
+ * Save benchmark results to file
+ */
+async function saveBenchmarkResults(results: BenchmarkRun): Promise<void> {
+  await mkdir("test-results", { recursive: true });
+
+  const filename = `test-results/benchmark-${results.runId}.json`;
+  await writeFile(filename, JSON.stringify(results, null, 2));
+
+  console.log(`\n📊 Benchmark results saved to ${filename}`);
+}
+
+describe("E2E Validation Benchmark", () => {
+  it("should validate fixed predictions and track costs", async () => {
+    const fixture = await loadFixture();
+
+    // Validate fixture is populated
+    const firstId = fixture.predictions[0]?.id;
+    expect(firstId).toBeDefined();
+    expect(firstId).not.toMatch(/REPLACE_WITH/);
+
+    const db = createDb();
+    const validator = new Validator(db);
+
+    const runId = new Date().toISOString().replace(/[:.]/g, "-");
+    const benchmarkResults: BenchmarkResult[] = [];
+
+    console.log(`\n🧪 Running benchmark on ${fixture.predictions.length} predictions...\n`);
+
+    for (const testCase of fixture.predictions) {
+      console.log(`Testing: ${testCase.id}`);
+      console.log(`  Description: ${testCase.description}\n`);
+
+      const startTime = Date.now();
+
+      const result = await db.transaction(async (tx) => {
+        const prediction = await fetchPrediction(db, testCase.id);
+
+        if (!prediction) {
+          throw new Error(`Prediction ${testCase.id} not found in database`);
+        }
+
+        return await validator.validatePrediction(tx, prediction);
+      });
+
+      const durationMs = Date.now() - startTime;
+
+      // Wait a bit for cost log to be written to disk
+      await new Promise((resolve) => setTimeout(resolve, 100));
+
+      // Assert on result shape
+      expect(result).toBeDefined();
+      expect(result).toHaveProperty("outcome");
+      expect(result).toHaveProperty("proof");
+      expect(result).toHaveProperty("sources");
+      expect(result.sources).toBeInstanceOf(Array);
+      expect(result.prediction_id).toBe(testCase.id);
+
+      // If expected outcome is specified, assert on it
+      if (testCase.expectedOutcome) {
+        expect(result.outcome).toBe(testCase.expectedOutcome);
+      }
+
+      // Extract cost metrics from cost log
+      const costs = await extractCostMetrics(testCase.id);
+
+      if (!costs) {
+        console.warn(`  ⚠️ Could not extract cost metrics for ${testCase.id}`);
+      }
+
+      const benchmarkResult: BenchmarkResult = {
+        predictionId: testCase.id,
+        description: testCase.description,
+        outcome: result.outcome,
+        proofLength: result.proof.length,
+        sourcesCount: result.sources.length,
+        costs: costs || {
+          searchApiCalls: 0,
+          queryEnhancerInputTokens: 0,
+          queryEnhancerOutputTokens: 0,
+          resultJudgeInputTokens: 0,
+          resultJudgeOutputTokens: 0,
+          totalInputTokens: 0,
+          totalOutputTokens: 0,
+        },
+        durationMs,
+        timestamp: new Date().toISOString(),
+      };
+
+      benchmarkResults.push(benchmarkResult);
+
+      console.log(`  ✅ Outcome: ${result.outcome}`);
+      console.log(`  📝 Proof: ${result.proof.length} chars`);
+      console.log(`  🔗 Sources: ${result.sources.length}`);
+      if (costs) {
+        console.log(`  💰 Costs:`);
+        console.log(`     - Search API calls: ${costs.searchApiCalls}`);
+        console.log(`     - Total tokens: ${costs.totalInputTokens + costs.totalOutputTokens}`);
+        if (costs.scraperCreditsSpent) {
+          console.log(`     - Scraper credits: ${costs.scraperCreditsSpent}`);
+        }
+        if (costs.navigatorStepsUsed) {
+          console.log(`     - Navigator steps: ${costs.navigatorStepsUsed}`);
+        }
+      }
+      console.log(`  ⏱️ Duration: ${durationMs}ms\n`);
+    }
+
+    // Calculate totals
+    const totals = {
+      totalSearchApiCalls:
benchmarkResults.reduce( + (sum, r) => sum + r.costs.searchApiCalls, + 0, + ), + totalInputTokens: benchmarkResults.reduce( + (sum, r) => sum + r.costs.totalInputTokens, + 0, + ), + totalOutputTokens: benchmarkResults.reduce( + (sum, r) => sum + r.costs.totalOutputTokens, + 0, + ), + totalScraperCredits: benchmarkResults.reduce( + (sum, r) => sum + (r.costs.scraperCreditsSpent || 0), + 0, + ), + totalNavigatorSteps: benchmarkResults.reduce( + (sum, r) => sum + (r.costs.navigatorStepsUsed || 0), + 0, + ), + totalDurationMs: benchmarkResults.reduce((sum, r) => sum + r.durationMs, 0), + averageDurationMs: Math.round( + benchmarkResults.reduce((sum, r) => sum + r.durationMs, 0) / + benchmarkResults.length, + ), + }; + + const run: BenchmarkRun = { + runId, + timestamp: new Date().toISOString(), + predictions: benchmarkResults, + totals, + }; + + await saveBenchmarkResults(run); + + console.log("๐Ÿ“Š Benchmark Summary:"); + console.log(` Total search API calls: ${totals.totalSearchApiCalls}`); + console.log(` Total input tokens: ${totals.totalInputTokens}`); + console.log(` Total output tokens: ${totals.totalOutputTokens}`); + console.log(` Total tokens: ${totals.totalInputTokens + totals.totalOutputTokens}`); + if (totals.totalScraperCredits > 0) { + console.log(` Total scraper credits: ${totals.totalScraperCredits}`); + } + if (totals.totalNavigatorSteps > 0) { + console.log(` Total navigator steps: ${totals.totalNavigatorSteps}`); + } + console.log(` Total duration: ${totals.totalDurationMs}ms`); + console.log(` Average duration: ${totals.averageDurationMs}ms`); + }, 600000); // 10 minute timeout for the full benchmark +}); diff --git a/test-fixtures/README.md b/test-fixtures/README.md new file mode 100644 index 0000000..fce1ea0 --- /dev/null +++ b/test-fixtures/README.md @@ -0,0 +1,122 @@ +# E2E Benchmark Testing + +This directory contains the test fixtures for end-to-end validation benchmarking. + +## Quick Start + +### 1. Fetch Benchmark Predictions from Database + +```bash +npm run test:benchmark:fetch [count] +``` + +This will: +- Query your database for `[count]` matured predictions (default: 5) +- Save them to `test-fixtures/benchmark-predictions.json` +- Display the prediction IDs and context previews + +Example: +```bash +npm run test:benchmark:fetch 3 +``` + +### 2. Run the Benchmark + +```bash +npm run test:benchmark +``` + +This will: +- Load predictions from `test-fixtures/benchmark-predictions.json` +- Run full validation pipeline on each prediction +- Track costs (API calls, tokens, scraper credits, etc.) +- Save results to `test-results/benchmark-{timestamp}.json` +- Run automated assertions on result shape + +## What Gets Tracked + +For each prediction, the benchmark tracks: + +**Validation Results:** +- Outcome (MaturedTrue, MaturedFalse, etc.) 
+- Proof length
+- Number of sources
+- Duration (ms)
+
+**Cost Metrics:**
+- Search API calls
+- Query enhancer tokens (input/output)
+- Result judge tokens (input/output)
+- Total tokens
+- Scraper credits spent (if enhancement used)
+- Navigator steps used (if enhancement used)
+- Enhancement method (scraper/navigator/none)
+
+**Aggregate Totals:**
+- Total costs across all predictions
+- Average duration per prediction
+
+## Comparing Runs
+
+Benchmark results are saved with timestamps in `test-results/`:
+
+```
+test-results/
+  benchmark-2025-01-15T10-30-00-000Z.json
+  benchmark-2025-01-15T14-45-00-000Z.json
+  benchmark-2025-01-16T09-00-00-000Z.json
+```
+
+You can compare files to detect:
+- Cost regressions (increased token usage)
+- Performance regressions (slower validation)
+- Behavior changes (different outcomes)
+
+## Expected Outcomes (Optional)
+
+You can add expected outcomes to `benchmark-predictions.json` for stricter assertions:
+
+```json
+{
+  "predictions": [
+    {
+      "id": "abc123",
+      "description": "Bitcoin prediction",
+      "expectedOutcome": "MaturedTrue" // ← Add this
+    }
+  ]
+}
+```
+
+If specified, the test will assert that the actual outcome matches the expected outcome.
+
+## Example Output
+
+```
+🧪 Running benchmark on 3 predictions...
+
+Testing: abc123
+  Description: Bitcoin will reach $50k by...
+
+  ✅ Outcome: MaturedTrue
+  📝 Proof: 342 chars
+  🔗 Sources: 2
+  💰 Costs:
+    - Search API calls: 3
+    - Total tokens: 4523
+    - Scraper credits: 25
+  ⏱️ Duration: 8234ms
+
+...
+
+📊 Benchmark Summary:
+  Total search API calls: 9
+  Total input tokens: 8521
+  Total output tokens: 4832
+  Total tokens: 13353
+  Total scraper credits: 50
+  Total duration: 24701ms
+  Average duration: 8234ms
+
+📊 Benchmark results saved to test-results/benchmark-2025-01-15T10-30-00-000Z.json
+```
diff --git a/test-fixtures/benchmark-predictions.json b/test-fixtures/benchmark-predictions.json
new file mode 100644
index 0000000..43d7c44
--- /dev/null
+++ b/test-fixtures/benchmark-predictions.json
@@ -0,0 +1,56 @@
+{
+  "description": "Fixed set of predictions for E2E benchmarking",
+  "lastUpdated": "2025-12-16T20:29:18.748Z",
+  "predictions": [
+    {
+      "id": "019a40b6-cbc6-721b-bd7d-05302e27a322",
+      "description": "The author is discussing a shift in business focus back to 'run the business' and states that this c",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a40b6-d1f8-7c49-b490-db1a1eb6a608",
+      "description": "The author predicts that other major industries will follow the tech industry in moving away from 'w",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a44e7-deb9-7bd6-966b-88e5abbae23e",
+      "description": "The first tweet shows a picture of a brickyard and states 'it is not realty —yet.' The second tweet,",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a44e8-0b5e-70b1-b960-ebe8f53ca79d",
+      "description": "The first tweet states 'The woke mind virus must disappear.' The second tweet, in response, simply s",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a44e8-0b5e-7142-a4a4-bdbc12b24ede",
+      "description": "The author is drawing a parallel between the rapid adoption of cars in the early 20th century and th",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a45b5-a5c0-76ce-8fe0-a64380233a75",
+      "description": "The author is discussing the need for other entities to develop Mars infrastructure that aligns with",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a45b5-a5c0-7787-8945-53c2a21971f4",
+      "description": "The author states 'All right, time to build that AGI cluster.' This implies an intention or a plan t",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a44e8-0b5e-70eb-ab09-0fe75ebe67f2",
+      "description": "The author is predicting that Grok 3 will represent a significant advancement.",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a58fd-1cdd-7ce7-b47c-f484af4bc226",
+      "description": "No context",
+      "expectedOutcome": null
+    },
+    {
+      "id": "019a6f43-ac5b-7e2c-b485-4e5dcea33727",
+      "description": "No context",
+      "expectedOutcome": null
+    }
+  ]
+}
\ No newline at end of file

From 271dd075aa2a5243bead56fa8828d5126e001586 Mon Sep 17 00:00:00 2001
From: theu
Date: Wed, 17 Dec 2025 19:44:18 -0300
Subject: [PATCH 10/10] delete outdated readme

---
 test-fixtures/README.md | 122 ----------------------------------------
 1 file changed, 122 deletions(-)
 delete mode 100644 test-fixtures/README.md

diff --git a/test-fixtures/README.md b/test-fixtures/README.md
deleted file mode 100644
index fce1ea0..0000000
--- a/test-fixtures/README.md
+++ /dev/null
@@ -1,122 +0,0 @@
-# E2E Benchmark Testing
-
-This directory contains the test fixtures for end-to-end validation benchmarking.
-
-## Quick Start
-
-### 1. Fetch Benchmark Predictions from Database
-
-```bash
-npm run test:benchmark:fetch [count]
-```
-
-This will:
-- Query your database for `[count]` matured predictions (default: 5)
-- Save them to `test-fixtures/benchmark-predictions.json`
-- Display the prediction IDs and context previews
-
-Example:
-```bash
-npm run test:benchmark:fetch 3
-```
-
-### 2. Run the Benchmark
-
-```bash
-npm run test:benchmark
-```
-
-This will:
-- Load predictions from `test-fixtures/benchmark-predictions.json`
-- Run the full validation pipeline on each prediction
-- Track costs (API calls, tokens, scraper credits, etc.)
-- Save results to `test-results/benchmark-{timestamp}.json`
-- Run automated assertions on result shape
-
-## What Gets Tracked
-
-For each prediction, the benchmark tracks:
-
-**Validation Results:**
-- Outcome (MaturedTrue, MaturedFalse, etc.)
-- Proof length
-- Number of sources
-- Duration (ms)
-
-**Cost Metrics:**
-- Search API calls
-- Query enhancer tokens (input/output)
-- Result judge tokens (input/output)
-- Total tokens
-- Scraper credits spent (if enhancement used)
-- Navigator steps used (if enhancement used)
-- Enhancement method (scraper/navigator/none)
-
-**Aggregate Totals:**
-- Total costs across all predictions
-- Average duration per prediction
-
-## Comparing Runs
-
-Benchmark results are saved with timestamps in `test-results/`:
-
-```
-test-results/
-  benchmark-2025-01-15T10-30-00-000Z.json
-  benchmark-2025-01-15T14-45-00-000Z.json
-  benchmark-2025-01-16T09-00-00-000Z.json
-```
-
-You can compare files to detect:
-- Cost regressions (increased token usage)
-- Performance regressions (slower validation)
-- Behavior changes (different outcomes)
-
-## Expected Outcomes (Optional)
-
-You can add expected outcomes to `benchmark-predictions.json` for stricter assertions:
-
-```json
-{
-  "predictions": [
-    {
-      "id": "abc123",
-      "description": "Bitcoin prediction",
-      "expectedOutcome": "MaturedTrue" // ← Add this
-    }
-  ]
-}
-```
-
-If specified, the test will assert that the actual outcome matches the expected outcome.
-
-## Example Output
-
-```
-🧪 Running benchmark on 3 predictions...
-
-Testing: abc123
-  Description: Bitcoin will reach $50k by...
-
-  ✅ Outcome: MaturedTrue
-  📝 Proof: 342 chars
-  🔗 Sources: 2
-  💰 Costs:
-    - Search API calls: 3
-    - Total tokens: 4523
-    - Scraper credits: 25
-  ⏱️ Duration: 8234ms
-
-...
-
-📊 Benchmark Summary:
-  Total search API calls: 9
-  Total input tokens: 8521
-  Total output tokens: 4832
-  Total tokens: 13353
-  Total scraper credits: 50
-  Total duration: 24701ms
-  Average duration: 8234ms
-
-📊 Benchmark results saved to test-results/benchmark-2025-01-15T10-30-00-000Z.json
-```
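
The "Comparing Runs" section of the README (added earlier in the series and removed again in PATCH 10/10) leaves the comparison of saved result files manual. A small script along the following lines could automate it. This is a minimal sketch, not part of the patch series: `compare-benchmarks.ts` is a hypothetical file name, the two input paths are placeholders for real runs, and the `Totals` interface simply mirrors the `totals` object assembled in the benchmark test above.

```ts
// compare-benchmarks.ts: hypothetical helper, not part of this patch series.
// Assumes two result files produced by the benchmark test; the paths below
// are placeholders to be replaced with real files from test-results/.
import { readFile } from "node:fs/promises";

// Mirrors the `totals` object built in the benchmark test.
interface Totals {
  totalSearchApiCalls: number;
  totalInputTokens: number;
  totalOutputTokens: number;
  totalScraperCredits: number;
  totalNavigatorSteps: number;
  totalDurationMs: number;
  averageDurationMs: number;
}

// Read one benchmark result file and return its aggregate totals.
async function loadTotals(path: string): Promise<Totals> {
  const run = JSON.parse(await readFile(path, "utf8"));
  return run.totals as Totals;
}

async function main(): Promise<void> {
  // Placeholder paths: substitute two real files from test-results/.
  const before = await loadTotals("test-results/benchmark-OLDER-RUN.json");
  const after = await loadTotals("test-results/benchmark-NEWER-RUN.json");

  // Print absolute values and relative change for every tracked total,
  // so cost and performance regressions stand out at a glance.
  for (const key of Object.keys(before) as (keyof Totals)[]) {
    const delta = after[key] - before[key];
    const pct = before[key] === 0 ? 0 : (delta / before[key]) * 100;
    const sign = delta >= 0 ? "+" : "";
    console.log(`${key}: ${before[key]} -> ${after[key]} (${sign}${pct.toFixed(1)}%)`);
  }
}

main().catch((error) => {
  console.error(error);
  process.exit(1);
});
```

After substituting two real run files, this could be run with `tsx compare-benchmarks.ts`, the same runner the package scripts already use.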