From 61be92a8348c1a7cbc1a0e0843dbbfd56c112967 Mon Sep 17 00:00:00 2001
From: ozymandiashh <234437643+ozymandiashh@users.noreply.github.com>
Date: Mon, 4 May 2026 02:15:04 +0300
Subject: [PATCH] Stream-parse Codex session files to handle 250+ MB rollouts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Heavy Codex users hit MAX_SESSION_FILE_BYTES (128 MB) on long-running
sessions. The file is read in full via readSessionFile and then split on
'\n', so even bumping the cap eventually runs into V8's 512 MB string
limit (split doubles the high-water mark). readSessionLines is a
streaming generator that already exists in fs-utils for exactly this
case, but only readFirstLine was using it. Switch the Codex provider to
consume it and let the cap apply only when streaming would still be
unreasonable.

Changes:

- src/fs-utils.ts: introduce MAX_STREAM_SESSION_FILE_BYTES (2 GB) and
  apply it in readSessionLines instead of the full-read cap. Keep
  MAX_SESSION_FILE_BYTES for readSessionFile / readSessionFileSync
  consumers that materialize the whole file.
- src/providers/codex.ts: replace `readSessionFile -> split('\n')` with
  `for await (... of readSessionLines)`. Add a sawAnyLine guard so a
  failed or empty stream skips the cache write, preserving the previous
  early-return behavior.

Empirical impact on a real account with one 247 MB rollout: 7-day
totals went from 4,536 calls / €358.69 / 20.1M input tokens to 6,111
calls / €550.67 / 37.3M input tokens. The previously skipped session is
now included; no other behavior changes.

Refs #204
---
 src/fs-utils.ts        | 13 +++++++++++--
 src/providers/codex.ts | 21 ++++++++++++++++-----
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/src/fs-utils.ts b/src/fs-utils.ts
index 823a630a..d851ce4c 100644
--- a/src/fs-utils.ts
+++ b/src/fs-utils.ts
@@ -8,6 +8,13 @@ import { createInterface } from 'readline'
 export const MAX_SESSION_FILE_BYTES = 128 * 1024 * 1024
 export const STREAM_THRESHOLD_BYTES = 8 * 1024 * 1024
 
+// Line-by-line streaming has bounded memory (one line at a time) and is not
+// constrained by V8's string limit, so it can safely handle multi-GB session
+// files. The cap here is purely a sanity check against pathological inputs;
+// real Codex sessions for heavy users have been observed at 250+ MB and will
+// continue to grow as context windows expand.
+export const MAX_STREAM_SESSION_FILE_BYTES = 2 * 1024 * 1024 * 1024
+
 function verbose(): boolean {
   return process.env.CODEBURN_VERBOSE === '1'
 }
@@ -78,8 +85,10 @@ export async function* readSessionLines(filePath: string): AsyncGenerator<string
-  if (size > MAX_SESSION_FILE_BYTES) {
-    warn(`skipped oversize file ${filePath} (${size} bytes > cap ${MAX_SESSION_FILE_BYTES})`)
+  if (size > MAX_STREAM_SESSION_FILE_BYTES) {
+    warn(
+      `skipped oversize file ${filePath} (${size} bytes > stream cap ${MAX_STREAM_SESSION_FILE_BYTES})`,
+    )
     return
   }
diff --git a/src/providers/codex.ts b/src/providers/codex.ts
index 9500ee0c..83d81ebd 100644
--- a/src/providers/codex.ts
+++ b/src/providers/codex.ts
@@ -4,7 +4,7 @@ import { createInterface } from 'readline'
 import { basename, join } from 'path'
 import { homedir } from 'os'
 
-import { readSessionFile } from '../fs-utils.js'
+import { readSessionLines } from '../fs-utils.js'
 import { calculateCost } from '../models.js'
 import { readCachedCodexResults, writeCachedCodexResults, getCachedCodexProject, fingerprintFile } from '../codex-cache.js'
 import type { Provider, SessionSource, SessionParser, ParsedProviderCall } from './types.js'
@@ -201,9 +201,6 @@ function createParser(source: SessionSource, seenKeys: Set<string>): SessionPars
     const fp = await fingerprintFile(source.path)
     if (!fp) return
 
-    const content = await readSessionFile(source.path)
-    if (content === null) return
-    const lines = content.split('\n').filter(l => l.trim())
     let sessionModel: string | undefined
     let sessionId = ''
     let prevCumulativeTotal = 0
@@ -215,9 +212,18 @@ function createParser(source: SessionSource, seenKeys: Set<string>): SessionPars
     let pendingUserMessage = ''
     let pendingOutputChars = 0
     let estCounter = 0
+    let sawAnyLine = false
     const results: ParsedProviderCall[] = []
 
-    for (const line of lines) {
+    // Stream the session file line by line. Heavy Codex sessions can exceed
+    // 250 MB on disk; reading the entire file into a string would either hit
+    // the readSessionFile cap or push V8 toward its 512 MB string limit
+    // after split('\n'). readSessionLines streams via readline so memory
+    // stays bounded to the longest line.
+    for await (const rawLine of readSessionLines(source.path)) {
+      sawAnyLine = true
+      const line = rawLine.trim()
+      if (!line) continue
       let entry: CodexEntry
       try {
         entry = JSON.parse(line) as CodexEntry
@@ -391,6 +397,11 @@ function createParser(source: SessionSource, seenKeys: Set<string>): SessionPars
       }
     }
 
+    // If the stream yielded nothing, the file was unreadable, oversized, or
+    // empty. Skip the cache write so a transient failure can't pin an empty
+    // result set against a fingerprint that would otherwise be re-parsed.
+    if (!sawAnyLine) return
+
     await writeCachedCodexResults(source.path, source.project, results, fp)
 
     for (const call of results) {
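
Reviewer note: the change leans on the async-generator-over-readline
pattern behind readSessionLines. The sketch below is illustrative, not
this repo's implementation; linesOf and STREAM_CAP_BYTES are invented
names standing in for readSessionLines and MAX_STREAM_SESSION_FILE_BYTES.
It shows why memory stays bounded to the longest line and where the
byte-size sanity cap slots in.

import { createReadStream } from 'fs'
import { stat } from 'fs/promises'
import { createInterface } from 'readline'

// Invented stand-in for MAX_STREAM_SESSION_FILE_BYTES.
const STREAM_CAP_BYTES = 2 * 1024 * 1024 * 1024

// Invented stand-in for readSessionLines.
async function* linesOf(filePath: string): AsyncGenerator<string> {
  // Streaming copes with multi-GB files, so the cap is a sanity check
  // against pathological inputs, not a memory guard.
  const { size } = await stat(filePath)
  if (size > STREAM_CAP_BYTES) return

  const rl = createInterface({
    input: createReadStream(filePath, { encoding: 'utf8' }),
    crlfDelay: Infinity, // never split \r\n into two line events
  })
  try {
    // readline buffers only up to the next newline, so the high-water
    // mark is one line, far below V8's string limit.
    for await (const line of rl) yield line
  } finally {
    rl.close()
  }
}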
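
The sawAnyLine guard is easiest to see from the consumer side. Again a
sketch under the same invented names (parseSession stands in for the
provider's parse path):

async function parseSession(filePath: string): Promise<unknown[] | null> {
  const results: unknown[] = []
  let sawAnyLine = false
  for await (const raw of linesOf(filePath)) {
    sawAnyLine = true
    const line = raw.trim()
    if (!line) continue
    try {
      results.push(JSON.parse(line))
    } catch {
      continue // tolerate a partially written trailing line
    }
  }
  // Zero lines yielded means unreadable, oversized, or empty; returning
  // null here plays the role of skipping the cache write in the patch.
  return sawAnyLine ? results : null
}

Because the generator warns and returns early on oversize input instead
of throwing, "no lines yielded" is the only signal the caller gets; that
is why the guard, rather than a try/catch, protects the cache write.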