From 6280dbaaef2ac4a457833a907105e76a301f4de3 Mon Sep 17 00:00:00 2001 From: Trevin Chow Date: Thu, 2 Apr 2026 04:24:22 -0700 Subject: [PATCH] drivers: improve error message when executor process crashes When the executor process terminates unexpectedly (e.g. OOM killed), the task event shows a raw gRPC error like "rpc error: code = Unavailable desc = error reading from server: read tcp ...". This is confusing because operators see an internal RPC error instead of a clear indication that the executor crashed. This changes the error message to "executor: the executor process terminated unexpectedly" when the process state is nil (which indicates the executor died before reporting exit status). The original error is still included for debugging. When process state IS available, the original message is preserved. Applied consistently to all four task drivers: exec, rawexec, java, and qemu. Fixes #24220 --- drivers/exec/driver.go | 18 +++++++++++------- drivers/java/driver.go | 18 +++++++++++------- drivers/qemu/driver.go | 18 +++++++++++------- drivers/rawexec/driver.go | 18 +++++++++++------- 4 files changed, 44 insertions(+), 28 deletions(-) diff --git a/drivers/exec/driver.go b/drivers/exec/driver.go index 94c4ca5fba8..bf922a8f71c 100644 --- a/drivers/exec/driver.go +++ b/drivers/exec/driver.go @@ -592,14 +592,18 @@ func (d *Driver) handleWait(ctx context.Context, handle *taskHandle, ch chan *dr var result *drivers.ExitResult ps, err := handle.exec.Wait(ctx) if err != nil { - result = &drivers.ExitResult{ - Err: fmt.Errorf("executor: error waiting on process: %v", err), - } - // if process state is nil, we've probably been killed, so return a reasonable - // exit state to the handlers + // if process state is nil, the executor process likely crashed or was + // killed (e.g. OOM). Wrap the raw RPC error with a clearer message. if ps == nil { - result.ExitCode = -1 - result.OOMKilled = false + result = &drivers.ExitResult{ + Err: fmt.Errorf("executor: the executor process terminated unexpectedly: %v", err), + ExitCode: -1, + OOMKilled: false, + } + } else { + result = &drivers.ExitResult{ + Err: fmt.Errorf("executor: error waiting on process: %v", err), + } } } else { result = &drivers.ExitResult{ diff --git a/drivers/java/driver.go b/drivers/java/driver.go index bf7247de942..8af9f78e38f 100644 --- a/drivers/java/driver.go +++ b/drivers/java/driver.go @@ -601,14 +601,18 @@ func (d *Driver) handleWait(ctx context.Context, handle *taskHandle, ch chan *dr var result *drivers.ExitResult ps, err := handle.exec.Wait(ctx) if err != nil { - result = &drivers.ExitResult{ - Err: fmt.Errorf("executor: error waiting on process: %v", err), - } - // if process state is nil, we've probably been killed, so return a reasonable - // exit state to the handlers + // if process state is nil, the executor process likely crashed or was + // killed (e.g. OOM). Wrap the raw RPC error with a clearer message. if ps == nil { - result.ExitCode = -1 - result.OOMKilled = false + result = &drivers.ExitResult{ + Err: fmt.Errorf("executor: the executor process terminated unexpectedly: %v", err), + ExitCode: -1, + OOMKilled: false, + } + } else { + result = &drivers.ExitResult{ + Err: fmt.Errorf("executor: error waiting on process: %v", err), + } } } else { result = &drivers.ExitResult{ diff --git a/drivers/qemu/driver.go b/drivers/qemu/driver.go index d6c78a5fda2..150fd4ced99 100644 --- a/drivers/qemu/driver.go +++ b/drivers/qemu/driver.go @@ -831,14 +831,18 @@ func (d *Driver) handleWait(ctx context.Context, handle *taskHandle, ch chan *dr var result *drivers.ExitResult ps, err := handle.exec.Wait(ctx) if err != nil { - result = &drivers.ExitResult{ - Err: fmt.Errorf("executor: error waiting on process: %v", err), - } - // if process state is nil, we've probably been killed, so return a reasonable - // exit state to the handlers + // if process state is nil, the executor process likely crashed or was + // killed (e.g. OOM). Wrap the raw RPC error with a clearer message. if ps == nil { - result.ExitCode = -1 - result.OOMKilled = false + result = &drivers.ExitResult{ + Err: fmt.Errorf("executor: the executor process terminated unexpectedly: %v", err), + ExitCode: -1, + OOMKilled: false, + } + } else { + result = &drivers.ExitResult{ + Err: fmt.Errorf("executor: error waiting on process: %v", err), + } } } else { result = &drivers.ExitResult{ diff --git a/drivers/rawexec/driver.go b/drivers/rawexec/driver.go index 1af4a09e204..c66c07adacc 100644 --- a/drivers/rawexec/driver.go +++ b/drivers/rawexec/driver.go @@ -498,14 +498,18 @@ func (d *Driver) handleWait(ctx context.Context, handle *taskHandle, ch chan *dr var result *drivers.ExitResult ps, err := handle.exec.Wait(ctx) if err != nil { - result = &drivers.ExitResult{ - Err: fmt.Errorf("executor: error waiting on process: %v", err), - } - // if process state is nil, we've probably been killed, so return a reasonable - // exit state to the handlers + // if process state is nil, the executor process likely crashed or was + // killed (e.g. OOM). Wrap the raw RPC error with a clearer message. if ps == nil { - result.ExitCode = -1 - result.OOMKilled = false + result = &drivers.ExitResult{ + Err: fmt.Errorf("executor: the executor process terminated unexpectedly: %v", err), + ExitCode: -1, + OOMKilled: false, + } + } else { + result = &drivers.ExitResult{ + Err: fmt.Errorf("executor: error waiting on process: %v", err), + } } } else { result = &drivers.ExitResult{