From 800c1062502cd587450192b0cdbf1bab40d1bc37 Mon Sep 17 00:00:00 2001
From: iamtoruk
Date: Sat, 2 May 2026 22:30:17 -0700
Subject: [PATCH] Fix streaming dedup: keep last occurrence of each message.id
 within session files

Claude Code writes the same message.id multiple times during streaming.
The first write has partial tokens (often 1) and no tool_use blocks. The
last write has authoritative token counts and all tool_use/MCP blocks.

Old behavior kept the first occurrence (keep-first), silently dropping
real output tokens (+6.3% undercount) and all MCP tool calls. New
behavior keeps the last occurrence's content but preserves the first
occurrence's timestamp for correct date bucketing.

Validated against 21,390 real session files: 40.5% had duplicate IDs,
output tokens were understated by up to 78% per session.
---
 src/parser.ts | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/parser.ts b/src/parser.ts
index fa6a345..97469d2 100644
--- a/src/parser.ts
+++ b/src/parser.ts
@@ -121,6 +121,30 @@ function parseApiCall(entry: JournalEntry): ParsedApiCall | null {
   }
 }
 
+function dedupeStreamingMessageIds(entries: JournalEntry[]): JournalEntry[] {
+  const firstIdxById = new Map<string, number>()
+  const lastIdxById = new Map<string, number>()
+  for (let i = 0; i < entries.length; i++) {
+    const id = getMessageId(entries[i]!)
+    if (!id) continue
+    if (!firstIdxById.has(id)) firstIdxById.set(id, i)
+    lastIdxById.set(id, i)
+  }
+  if (lastIdxById.size === 0) return entries
+  const result: JournalEntry[] = []
+  for (let i = 0; i < entries.length; i++) {
+    const id = getMessageId(entries[i]!)
+    if (id && lastIdxById.get(id) !== i) continue
+    if (id && firstIdxById.get(id) !== i) {
+      const firstTs = entries[firstIdxById.get(id)!]!.timestamp
+      result.push({ ...entries[i]!, timestamp: firstTs ?? entries[i]!.timestamp })
+      continue
+    }
+    result.push(entries[i]!)
+  }
+  return result
+}
+
 function groupIntoTurns(entries: JournalEntry[], seenMsgIds: Set<string>): ParsedTurn[] {
   const turns: ParsedTurn[] = []
   let currentUserMessage = ''
@@ -291,7 +315,8 @@ async function parseSessionFile(
   if (entries.length === 0) return null
 
   const sessionId = basename(filePath, '.jsonl')
-  let turns = groupIntoTurns(entries, seenMsgIds)
+  const dedupedEntries = dedupeStreamingMessageIds(entries)
+  let turns = groupIntoTurns(dedupedEntries, seenMsgIds)
   if (dateRange) {
     // Bucket a turn by the timestamp of its first assistant call (when the cost was
     // actually incurred). Filtering entries directly produced orphan assistant calls