From c170c5fc3006344e5242f6ecb8e144bcd9833a3c Mon Sep 17 00:00:00 2001 From: Josh Friend Date: Mon, 23 Mar 2026 14:17:23 -0400 Subject: [PATCH] perf: override GOMAXPROCS to use all available CPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Container environments often set GOMAXPROCS to a low value (e.g. 2) based on cgroup CPU limits. With only 2 OS threads allowed to execute Go code at once, the Go scheduler can't run the zstd decoder, tar reader, and file-write goroutines in parallel — they time-slice, and the tar reader (the single-threaded funnel feeding the pipeline) gets starved. Overriding to runtime.NumCPU() lets all goroutines actually run in parallel. Benchmarked on r8id.metal-48xlarge, this alone accounted for 3.7s of a 5.7s total improvement on a 334K-file 2.4 GB cache restore. --- cmd/cachew/main.go | 7 +++++++ cmd/cachewd/main.go | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/cmd/cachew/main.go b/cmd/cachew/main.go index 903bcb44..e779456b 100644 --- a/cmd/cachew/main.go +++ b/cmd/cachew/main.go @@ -40,6 +40,13 @@ type CLI struct { } func main() { + // Override GOMAXPROCS: container environments may set it to a low value + // (e.g. 2) based on cgroup CPU limits. With few OS threads available, the + // Go scheduler can't run the zstd decoder, tar reader, and file-write + // goroutines concurrently — they time-slice, and the pipeline stalls. + // Using all available CPUs lets all goroutines actually run in parallel. 
+ runtime.GOMAXPROCS(runtime.NumCPU()) + cli := CLI{} kctx := kong.Parse(&cli, kong.UsageOnError(), kong.HelpOptions{Compact: true}, kong.DefaultEnvars("CACHEW"), kong.Bind(&cli)) ctx := context.Background() diff --git a/cmd/cachewd/main.go b/cmd/cachewd/main.go index 26d8625f..59b840bf 100644 --- a/cmd/cachewd/main.go +++ b/cmd/cachewd/main.go @@ -8,6 +8,7 @@ import ( "net/http" _ "net/http/pprof" //nolint:gosec "os" + "runtime" "strings" "time" @@ -53,6 +54,13 @@ type CLI struct { } func main() { + // Override GOMAXPROCS: container environments may set it to a low value + // (e.g. 2) based on cgroup CPU limits. With few OS threads available, the + // Go scheduler can't run the zstd decoder, tar reader, and file-write + // goroutines concurrently — they time-slice, and the pipeline stalls. + // Using all available CPUs lets all goroutines actually run in parallel. + runtime.GOMAXPROCS(runtime.NumCPU()) + var cli CLI kctx := kong.Parse(&cli, kong.DefaultEnvars("CACHEW"))