diff --git a/src/fs-utils.ts b/src/fs-utils.ts index 823a630..d851ce4 100644 --- a/src/fs-utils.ts +++ b/src/fs-utils.ts @@ -8,6 +8,13 @@ import { createInterface } from 'readline' export const MAX_SESSION_FILE_BYTES = 128 * 1024 * 1024 export const STREAM_THRESHOLD_BYTES = 8 * 1024 * 1024 +// Line-by-line streaming has bounded memory (one line at a time) and is not +// constrained by V8's string limit, so it can safely handle multi-GB session +// files. The cap here is purely a sanity check against pathological inputs; +// real Codex sessions for heavy users have been observed at 250+ MB and will +// continue to grow as context windows expand. +export const MAX_STREAM_SESSION_FILE_BYTES = 2 * 1024 * 1024 * 1024 + function verbose(): boolean { return process.env.CODEBURN_VERBOSE === '1' } @@ -78,8 +85,10 @@ export async function* readSessionLines(filePath: string): AsyncGenerator MAX_SESSION_FILE_BYTES) { - warn(`skipped oversize file ${filePath} (${size} bytes > cap ${MAX_SESSION_FILE_BYTES})`) + if (size > MAX_STREAM_SESSION_FILE_BYTES) { + warn( + `skipped oversize file ${filePath} (${size} bytes > stream cap ${MAX_STREAM_SESSION_FILE_BYTES})`, + ) return } diff --git a/src/providers/codex.ts b/src/providers/codex.ts index 9500ee0..83d81eb 100644 --- a/src/providers/codex.ts +++ b/src/providers/codex.ts @@ -4,7 +4,7 @@ import { createInterface } from 'readline' import { basename, join } from 'path' import { homedir } from 'os' -import { readSessionFile } from '../fs-utils.js' +import { readSessionLines } from '../fs-utils.js' import { calculateCost } from '../models.js' import { readCachedCodexResults, writeCachedCodexResults, getCachedCodexProject, fingerprintFile } from '../codex-cache.js' import type { Provider, SessionSource, SessionParser, ParsedProviderCall } from './types.js' @@ -201,9 +201,6 @@ function createParser(source: SessionSource, seenKeys: Set): SessionPars const fp = await fingerprintFile(source.path) if (!fp) return - const content = await readSessionFile(source.path) - if (content === null) return - const lines = content.split('\n').filter(l => l.trim()) let sessionModel: string | undefined let sessionId = '' let prevCumulativeTotal = 0 @@ -215,9 +212,18 @@ function createParser(source: SessionSource, seenKeys: Set): SessionPars let pendingUserMessage = '' let pendingOutputChars = 0 let estCounter = 0 + let sawAnyLine = false const results: ParsedProviderCall[] = [] - for (const line of lines) { + // Stream the session file line by line. Heavy Codex sessions can exceed + // 250 MB on disk; reading the entire file into a string would either hit + // the readSessionFile cap or push V8 toward its 512 MB string limit + // after split('\n'). readSessionLines streams via readline so memory + // stays bounded to the longest line. + for await (const rawLine of readSessionLines(source.path)) { + sawAnyLine = true + const line = rawLine.trim() + if (!line) continue let entry: CodexEntry try { entry = JSON.parse(line) as CodexEntry @@ -391,6 +397,11 @@ function createParser(source: SessionSource, seenKeys: Set): SessionPars } } + // If the stream yielded nothing the file was unreadable, oversized, or + // empty. Skip cache write so a transient failure can't pin an empty + // result set against a fingerprint that would otherwise be re-parsed. + if (!sawAnyLine) return + await writeCachedCodexResults(source.path, source.project, results, fp) for (const call of results) {