Skip to content

Commit ad93e45

Browse files
authored
Fix telemetry event loss on build failures and server shutdown (#85867)
## Problem

Telemetry events were not being captured in three scenarios:

### 1. MCP Telemetry Lost on Dev Server Shutdown

Two telemetry instances were created during dev server startup. MCP events were recorded to one instance but shutdown flushed a different instance.

**Fix:** Reuse the existing telemetry instance from `traceGlobals`.

### 2. Process Exit Timeout Killing Async Telemetry

The CLI force-kills child processes after 100ms (`NEXT_EXIT_TIMEOUT_MS`). Async telemetry operations were interrupted before completion.

**Fix:** Use `flushDetached()` to write events to disk and spawn a detached process for async submission, allowing immediate parent exit.

### 3. Build Errors Calling `process.exit()` Before Telemetry Flush

`process.exit()` calls in error handlers (SWC failures, missing dependencies, route conflicts) bypassed finally blocks and killed processes before telemetry completed.

**Fix:** Throw errors instead of calling `process.exit()`, add finally blocks with `await telemetry.flush()` in critical paths.

## Changes

**Core:**

- `server/dev/next-dev-server.ts`: Reuse telemetry instance
- `server/lib/start-server.ts`: Use `flushDetached()` for shutdown
- `build/swc/index.ts`: Remove `process.exit()` calls
- `build/turbopack-build/impl.ts`: Add telemetry flush in worker finally block
- `build/index.ts`: Add telemetry flush in finally block, throw instead of exit
- `lib/verify-partytown-setup.ts`: Throw instead of exit

**Supporting:**

- `telemetry/storage.ts`: Generate unique event files per PID
- `telemetry/detached-flush.ts`: Accept optional eventsFile parameter
- `errors.json`: Update detached-flush error message

## Testing

All telemetry tests now pass:

- MCP telemetry on dev server shutdown
- SWC load failures in worker threads
- Build configuration errors

✅ Non-blocking shutdown
✅ Timeout-independent
✅ No race conditions
✅ Works in worker threads
1 parent 4b3ccc1 commit ad93e45

File tree

9 files changed

+124
-61
lines changed

9 files changed

+124
-61
lines changed

packages/next/errors.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -905,5 +905,7 @@
905905
"904": "The file \"%s\" must export a function, either as a default export or as a named \"%s\" export.",
906906
"905": "Page \"%s\" cannot use \\`export const unstable_prefetch = ...\\` without enabling \\`cacheComponents\\`.",
907907
"906": "Bindings not loaded yet, but they are being loaded, did you forget to await?",
908-
"907": "bindings not loaded yet. Either call `loadBindings` to wait for them to be available or ensure that `installBindings` has already been called."
908+
"907": "bindings not loaded yet. Either call `loadBindings` to wait for them to be available or ensure that `installBindings` has already been called.",
909+
"908": "Invalid flags should be run as node detached-flush dev ./path-to/project [eventsFile]",
910+
"909": "Failed to load SWC binary for %s/%s, see more info here: https://nextjs.org/docs/messages/failed-loading-swc"
909911
}

packages/next/src/build/index.ts

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,11 +1151,11 @@ export default async function build(
11511151
}
11521152

11531153
if (appDir && 'exportPathMap' in config) {
1154-
Log.error(
1154+
const errorMessage =
11551155
'The "exportPathMap" configuration cannot be used with the "app" directory. Please use generateStaticParams() instead.'
1156-
)
1156+
Log.error(errorMessage)
11571157
await telemetry.flush()
1158-
process.exit(1)
1158+
throw new Error(errorMessage)
11591159
}
11601160

11611161
const validFileMatcher = createValidFileMatcher(
@@ -1495,16 +1495,15 @@ export default async function build(
14951495
if (bundler !== Bundler.Turbopack) {
14961496
const numConflictingAppPaths = conflictingAppPagePaths.length
14971497
if (mappedAppPages && numConflictingAppPaths > 0) {
1498-
Log.error(
1499-
`Conflicting app and page file${
1500-
numConflictingAppPaths === 1 ? ' was' : 's were'
1501-
} found, please remove the conflicting files to continue:`
1502-
)
1498+
const errorMessage = `Conflicting app and page file${
1499+
numConflictingAppPaths === 1 ? ' was' : 's were'
1500+
} found, please remove the conflicting files to continue:`
1501+
Log.error(errorMessage)
15031502
for (const [pagePath, appPath] of conflictingAppPagePaths) {
15041503
Log.error(` "${pagePath}" - "${appPath}"`)
15051504
}
15061505
await telemetry.flush()
1507-
process.exit(1)
1506+
throw new Error(errorMessage)
15081507
}
15091508
}
15101509

@@ -1835,7 +1834,7 @@ export default async function build(
18351834
})
18361835
.catch((err) => {
18371836
console.error(err)
1838-
process.exit(1)
1837+
throw err
18391838
})
18401839
})
18411840
}
@@ -2702,7 +2701,7 @@ export default async function build(
27022701
outputFileTracingRoot,
27032702
}).catch((err) => {
27042703
console.error(err)
2705-
process.exit(1)
2704+
throw err
27062705
})
27072706
})
27082707
}
@@ -4304,6 +4303,12 @@ export default async function build(
43044303
// Ensure we wait for lockfile patching if present
43054304
await lockfilePatchPromise.cur
43064305

4306+
// Flush telemetry before finishing (waits for async operations like setTimeout in debug mode)
4307+
const telemetry: Telemetry | undefined = traceGlobals.get('telemetry')
4308+
if (telemetry) {
4309+
await telemetry.flush()
4310+
}
4311+
43074312
// Ensure all traces are flushed before finishing the command
43084313
await flushAllTraces()
43094314
teardownTraceSubscriber()

packages/next/src/build/swc/index.ts

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ export async function loadBindings(
224224
process.stderr._handle.setBlocking?.(true)
225225
}
226226

227-
pendingBindings = new Promise(async (resolve, _reject) => {
227+
pendingBindings = new Promise(async (resolve, reject) => {
228228
if (!lockfilePatchPromise.cur) {
229229
// always run lockfile check once so that it gets patched
230230
// even if it doesn't fail to load locally
@@ -294,7 +294,13 @@ export async function loadBindings(
294294
}
295295
}
296296

297-
logLoadFailure(attempts, true)
297+
await logLoadFailure(attempts, true)
298+
// Reject the promise to propagate the error (process.exit was removed to allow telemetry flush)
299+
reject(
300+
new Error(
301+
`Failed to load SWC binary for ${PlatformName}/${ArchName}, see more info here: https://nextjs.org/docs/messages/failed-loading-swc`
302+
)
303+
)
298304
})
299305
loadedBindings = await pendingBindings
300306
pendingBindings = undefined
@@ -380,13 +386,22 @@ function loadBindingsSync() {
380386
attempts = attempts.concat(a)
381387
}
382388

389+
// Fire-and-forget telemetry logging (loadBindingsSync must remain synchronous)
390+
// Worker error handler will await telemetry.flush() before exit
383391
logLoadFailure(attempts)
392+
384393
throw new Error('Failed to load bindings', { cause: attempts })
385394
}
386395

387396
let loggingLoadFailure = false
388397

389-
function logLoadFailure(attempts: any, triedWasm = false) {
398+
/**
399+
* Logs SWC load failure telemetry and error messages.
400+
*
401+
* Note: Does NOT call process.exit() - errors must propagate to caller's error handler
402+
* which will await telemetry.flush() before exit (critical for worker threads with async telemetry).
403+
*/
404+
async function logLoadFailure(attempts: any, triedWasm = false) {
390405
// make sure we only emit the event and log the failure once
391406
if (loggingLoadFailure) return
392407
loggingLoadFailure = true
@@ -396,17 +411,15 @@ function logLoadFailure(attempts: any, triedWasm = false) {
396411
}
397412

398413
// @ts-expect-error TODO: this event has a wrong type.
399-
eventSwcLoadFailure({
414+
await eventSwcLoadFailure({
400415
wasm: triedWasm ? 'failed' : undefined,
401416
nativeBindingsErrorCode: lastNativeBindingsLoadErrorCode,
402417
})
403-
.then(() => lockfilePatchPromise.cur || Promise.resolve())
404-
.finally(() => {
405-
Log.error(
406-
`Failed to load SWC binary for ${PlatformName}/${ArchName}, see more info here: https://nextjs.org/docs/messages/failed-loading-swc`
407-
)
408-
process.exit(1)
409-
})
418+
await (lockfilePatchPromise.cur || Promise.resolve())
419+
420+
Log.error(
421+
`Failed to load SWC binary for ${PlatformName}/${ArchName}, see more info here: https://nextjs.org/docs/messages/failed-loading-swc`
422+
)
410423
}
411424

412425
type RustifiedEnv = { name: string; value: string }[]

packages/next/src/build/turbopack-build/impl.ts

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -245,15 +245,20 @@ export async function workerMain(workerData: {
245245
// Install bindings early so we can access synchronously later
246246
await installBindings(config.experimental?.useWasmBinary)
247247

248-
const {
249-
shutdownPromise: resultShutdownPromise,
250-
buildTraceContext,
251-
duration,
252-
} = await turbopackBuild()
253-
shutdownPromise = resultShutdownPromise
254-
return {
255-
buildTraceContext,
256-
duration,
248+
try {
249+
const {
250+
shutdownPromise: resultShutdownPromise,
251+
buildTraceContext,
252+
duration,
253+
} = await turbopackBuild()
254+
shutdownPromise = resultShutdownPromise
255+
return {
256+
buildTraceContext,
257+
duration,
258+
}
259+
} finally {
260+
// Always flush telemetry before worker exits (waits for async operations like setTimeout in debug mode)
261+
await telemetry.flush()
257262
}
258263
}
259264

packages/next/src/lib/verify-partytown-setup.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,8 @@ export async function verifyPartytownSetup(
9191
// Don't show a stack trace when there is an error due to missing dependencies
9292
if (err instanceof FatalError) {
9393
console.error(err.message)
94-
process.exit(1)
94+
// Throw to allow finally blocks to run (e.g., telemetry flush)
95+
throw err
9596
}
9697
throw err
9798
}

packages/next/src/server/dev/next-dev-server.ts

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ import { pathHasPrefix } from '../../shared/lib/router/utils/path-has-prefix'
4040
import { removePathPrefix } from '../../shared/lib/router/utils/remove-path-prefix'
4141
import { Telemetry } from '../../telemetry/storage'
4242
import { type Span, setGlobal, trace } from '../../trace'
43+
import { traceGlobals } from '../../trace/shared'
4344
import { findPageFile } from '../lib/find-page-file'
4445
import { getFormattedNodeOptionsWithoutInspect } from '../lib/utils'
4546
import { withCoalescedInvoke } from '../../lib/coalesced-function'
@@ -289,7 +290,12 @@ export default class DevServer extends Server {
289290
setGlobal('distDir', this.distDir)
290291
setGlobal('phase', PHASE_DEVELOPMENT_SERVER)
291292

292-
const telemetry = new Telemetry({ distDir: this.distDir })
293+
// Use existing telemetry instance from traceGlobals instead of creating a new one.
294+
// Creating a new instance would overwrite the existing one, causing any telemetry
295+
// events recorded to the original instance to be lost during cleanup/flush.
296+
const existingTelemetry = traceGlobals.get('telemetry')
297+
const telemetry =
298+
existingTelemetry || new Telemetry({ distDir: this.distDir })
293299

294300
await super.prepareImpl()
295301
await this.matchers.reload()
@@ -303,7 +309,10 @@ export default class DevServer extends Server {
303309
// This is required by the tracing subsystem.
304310
setGlobal('appDir', this.appDir)
305311
setGlobal('pagesDir', this.pagesDir)
306-
setGlobal('telemetry', telemetry)
312+
// Only set telemetry if it wasn't already set
313+
if (!existingTelemetry) {
314+
setGlobal('telemetry', telemetry)
315+
}
307316

308317
process.on('unhandledRejection', (reason) => {
309318
if (isPostpone(reason)) {

packages/next/src/server/lib/start-server.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -417,7 +417,10 @@ export async function startServer(
417417
>
418418
| undefined
419419
if (telemetry) {
420-
await telemetry.flush()
420+
// Use flushDetached to avoid blocking process exit
421+
// Each process writes to a unique file (_events_${pid}.json)
422+
// to avoid race conditions with the parent process
423+
telemetry.flushDetached('dev', dir)
421424
}
422425
} catch (_) {
423426
// Ignore telemetry errors during cleanup

packages/next/src/telemetry/detached-flush.ts

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,27 @@ import { PHASE_DEVELOPMENT_SERVER } from '../shared/lib/constants'
99
// this process should be started with following arg order
1010
// 1. mode e.g. dev, export, start
1111
// 2. project dir
12+
// 3. events filename (optional, defaults to _events.json)
1213
;(async () => {
1314
const args = [...process.argv]
15+
const eventsFile = args.pop()
1416
let dir = args.pop()
1517
const mode = args.pop()
1618

1719
if (!dir || mode !== 'dev') {
1820
throw new Error(
19-
`Invalid flags should be run as node detached-flush dev ./path-to/project`
21+
`Invalid flags should be run as node detached-flush dev ./path-to/project [eventsFile]`
2022
)
2123
}
2224
dir = getProjectDir(dir)
2325

2426
const config = await loadConfig(PHASE_DEVELOPMENT_SERVER, dir)
2527
const distDir = path.join(dir, config.distDir || '.next')
26-
const eventsPath = path.join(distDir, '_events.json')
28+
// Support both old format (no eventsFile arg) and new format (with eventsFile arg)
29+
const eventsPath = path.join(
30+
distDir,
31+
eventsFile && !eventsFile.includes('/') ? eventsFile : '_events.json'
32+
)
2733

2834
let events: TelemetryEvent[]
2935
try {
@@ -40,7 +46,8 @@ import { PHASE_DEVELOPMENT_SERVER } from '../shared/lib/constants'
4046
await telemetry.record(events)
4147
await telemetry.flush()
4248

43-
// finished flushing events clean-up/exit
49+
// finished flushing events clean-up
4450
fs.unlinkSync(eventsPath)
45-
process.exit(0)
51+
// Don't call process.exit() here - let Node.js exit naturally after
52+
// all pending work completes (e.g., setTimeout in debug telemetry)
4653
})()

packages/next/src/telemetry/storage.ts

Lines changed: 39 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,9 @@ export class Telemetry {
216216
return prom
217217
}
218218

219-
flush = async () => Promise.all(this.queue).catch(() => null)
219+
flush = async () => {
220+
return Promise.all(this.queue).catch(() => null)
221+
}
220222

221223
// writes current events to disk and spawns separate
222224
// detached process to submit the records without blocking
@@ -232,9 +234,17 @@ export class Telemetry {
232234
// if we fail to abort ignore this event
233235
}
234236
})
237+
238+
if (allEvents.length === 0) {
239+
// No events to flush
240+
return
241+
}
242+
235243
fs.mkdirSync(this.distDir, { recursive: true })
244+
// Use unique filename per process to avoid race conditions between parent/child
245+
const eventsFile = `_events_${process.pid}.json`
236246
fs.writeFileSync(
237-
path.join(this.distDir, '_events.json'),
247+
path.join(this.distDir, eventsFile),
238248
JSON.stringify(allEvents)
239249
)
240250

@@ -249,16 +259,20 @@ export class Telemetry {
249259
? child_process.spawnSync
250260
: child_process.spawn
251261

252-
spawn(process.execPath, [require.resolve('./detached-flush'), mode, dir], {
253-
detached: !this.NEXT_TELEMETRY_DEBUG,
254-
windowsHide: true,
255-
shell: false,
256-
...(this.NEXT_TELEMETRY_DEBUG
257-
? {
258-
stdio: 'inherit',
259-
}
260-
: {}),
261-
})
262+
spawn(
263+
process.execPath,
264+
[require.resolve('./detached-flush'), mode, dir, eventsFile],
265+
{
266+
detached: !this.NEXT_TELEMETRY_DEBUG,
267+
windowsHide: true,
268+
shell: false,
269+
...(this.NEXT_TELEMETRY_DEBUG
270+
? {
271+
stdio: 'inherit',
272+
}
273+
: {}),
274+
}
275+
)
262276
}
263277

264278
private submitRecord = async (
@@ -276,15 +290,19 @@ export class Telemetry {
276290
}
277291

278292
if (this.NEXT_TELEMETRY_DEBUG) {
279-
// Print to standard error to simplify selecting the output
280-
events.forEach(({ eventName, payload }) =>
281-
console.error(
282-
`[telemetry] ` + JSON.stringify({ eventName, payload }, null, 2)
283-
)
284-
)
285-
// Do not send the telemetry data if debugging. Users may use this feature
286-
// to preview what data would be sent.
287-
return Promise.resolve()
293+
// Return a promise that resolves after logging to ensure the output
294+
// is captured before the process exits (e.g., during flushDetached)
295+
return new Promise((resolve) => {
296+
setTimeout(() => {
297+
// Print to standard error to simplify selecting the output
298+
events.forEach(({ eventName, payload }) =>
299+
console.error(
300+
`[telemetry] ` + JSON.stringify({ eventName, payload }, null, 2)
301+
)
302+
)
303+
resolve(undefined)
304+
}, 100)
305+
})
288306
}
289307

290308
// Skip recording telemetry if the feature is disabled

0 commit comments

Comments
 (0)