From fe075a295c1ce88f3626e74980405f85a20fca9e Mon Sep 17 00:00:00 2001 From: Moses Narrow <36607567+0pcom@users.noreply.github.com> Date: Mon, 4 May 2026 02:47:41 +0000 Subject: [PATCH] fix tpd panic loop: cxo Finc-to-negative + httputil short-write discriminator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Production TPD was restarting every 30-40s (RestartCount 556 over 6h on the prod host) because two distinct panics tear down the process: 1. pkg/cxo/skyobject/cache.go (*Cache).Finc:1189,1216 panic: "Finc to negative for: " The filling-refcount went below zero — likely a duplicate Finc on a Filler.incs map, or an Inc/Finc mismatch across overlapping fillers. Hard process kill via panic. Filler.apply / Filler.reject already consume Finc's error return and just log; surface the inconsistency through that path instead. Clamp fc to 0, log the condition with key and the offending inc, and continue. Worst case is a leaked filling-item slot — orders of magnitude better than killing the service. 2. pkg/httputil/httputil.go WriteJSON:50 panic: "short write: i/o deadline reached" isIOError checks errors.Is(err, io.ErrShortWrite), but net/http's timeoutWriter returns its own error value with the same message string when a write deadline expires mid-response. errors.Is misses it (different sentinel), the fallback string match didn't include "short write", so getAllTransports' ~1MB JSON write to a slow client panics on every deadline hit. Added "short write", "i/o timeout", and "deadline exceeded" to the string-match fallback. New TestIsIOErrorShortWriteVariants pins all sentinel and string-match cases. Neither bug is caused by #2415/#2418/#2421; they were just made visible because deploys cycling at 30-40s aren't subtle. Together these stop the panic loop without changing any data semantics. --- pkg/cxo/skyobject/cache.go | 19 +++++++++++-- pkg/httputil/httputil.go | 16 ++++++++--- pkg/httputil/httputil_test.go | 50 +++++++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 pkg/httputil/httputil_test.go diff --git a/pkg/cxo/skyobject/cache.go b/pkg/cxo/skyobject/cache.go index c8f4437283..29f1034587 100644 --- a/pkg/cxo/skyobject/cache.go +++ b/pkg/cxo/skyobject/cache.go @@ -2,6 +2,7 @@ package skyobject import ( "fmt" + "log" "sort" "sync" "time" @@ -1186,7 +1187,18 @@ func (c *Cache) Finc( } if it.fc < 0 { - panic("Finc to negative for: " + key.Hex()[:7]) + // Refcount inconsistency: a filler decremented fc by + // more than the cache's recorded Inc total. Likely a + // duplicate Finc on the same filler.incs map, or an + // Inc/Finc mismatch across overlapping fillers. The + // historical behavior was to panic — that's a hard + // process kill that observably restarts TPD on prod + // every 30-40s. Clamp to zero and continue so the + // process stays alive; the worst case is a leaked + // filling-item slot, not data corruption. + log.Printf("[CXO] Finc apply clamped negative fc to 0 for %s (was %d, inc %d)", + key.Hex(), it.fc+inc, inc) + it.fc = 0 } // the fc is zero @@ -1213,7 +1225,10 @@ func (c *Cache) Finc( it.fc += inc // fc = fc - abs(inc) if it.fc < 0 { - panic("Finc to negative for: " + key.Hex()[:7]) + // Same recovery as the apply branch — see comment above. + log.Printf("[CXO] Finc reject clamped negative fc to 0 for %s (was %d, inc %d)", + key.Hex(), it.fc-inc, inc) + it.fc = 0 } // and if it.fc turns to be zero, then the diff --git a/pkg/httputil/httputil.go b/pkg/httputil/httputil.go index 701b20e9f2..992e876a10 100644 --- a/pkg/httputil/httputil.go +++ b/pkg/httputil/httputil.go @@ -76,11 +76,21 @@ func isIOError(err error) bool { if errors.As(err, &netErr) && netErr.Timeout() { return true } - // String-match fallback for syscall errors that have no clean Go sentinel - // (kernel-side errno wrappings that surface as descriptive strings). + // String-match fallback for errors that have no clean Go sentinel, + // or whose value differs from a sentinel that shares its message. + // + // "short write" specifically: http.timeoutWriter (from net/http's + // per-write deadline machinery) returns its own error when the + // response deadline expires mid-write — same text as io.ErrShortWrite, + // different identity, so errors.Is misses it. Production TPD was + // panicking every ~30s on large /all-transports responses to slow + // clients before this case was added. errStr := err.Error() if strings.Contains(errStr, "broken pipe") || - strings.Contains(errStr, "connection reset") { + strings.Contains(errStr, "connection reset") || + strings.Contains(errStr, "short write") || + strings.Contains(errStr, "i/o timeout") || + strings.Contains(errStr, "deadline exceeded") { return true } return false diff --git a/pkg/httputil/httputil_test.go b/pkg/httputil/httputil_test.go new file mode 100644 index 0000000000..6fac5828f0 --- /dev/null +++ b/pkg/httputil/httputil_test.go @@ -0,0 +1,50 @@ +package httputil + +import ( + "errors" + "fmt" + "io" + "net" + "os" + "testing" +) + +// TestIsIOErrorShortWriteVariants pins the fix for the production +// panic loop where http.timeoutWriter's "short write: i/o deadline +// reached" failed errors.Is(io.ErrShortWrite) (different sentinel) +// and slipped past the discriminator, causing WriteJSON to panic +// rather than return on a slow-client write timeout. +func TestIsIOErrorShortWriteVariants(t *testing.T) { + cases := []struct { + name string + err error + want bool + }{ + {"nil", nil, false}, + {"random unrelated", errors.New("malformed UTF-8"), false}, + + // sentinels: must keep working + {"io.ErrShortWrite sentinel", io.ErrShortWrite, true}, + {"io.ErrClosedPipe sentinel", io.ErrClosedPipe, true}, + {"net.ErrClosed sentinel", net.ErrClosed, true}, + {"os.ErrDeadlineExceeded sentinel", os.ErrDeadlineExceeded, true}, + {"wrapped io.ErrShortWrite", fmt.Errorf("foo: %w", io.ErrShortWrite), true}, + + // the production case: same message, different identity + {"http.timeoutWriter short-write text", errors.New("short write: i/o deadline reached"), true}, + {"i/o timeout text", errors.New("write tcp 10.0.0.1:443: i/o timeout"), true}, + {"deadline exceeded text", errors.New("context deadline exceeded"), true}, + + // existing string-match fallbacks still work + {"broken pipe text", errors.New("write tcp: broken pipe"), true}, + {"connection reset text", errors.New("read tcp: connection reset by peer"), true}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := isIOError(c.err) + if got != c.want { + t.Errorf("isIOError(%v) = %v, want %v", c.err, got, c.want) + } + }) + } +}