diff --git a/pkg/cxo/skyobject/cache.go b/pkg/cxo/skyobject/cache.go index c8f4437283..29f1034587 100644 --- a/pkg/cxo/skyobject/cache.go +++ b/pkg/cxo/skyobject/cache.go @@ -2,6 +2,7 @@ package skyobject import ( "fmt" + "log" "sort" "sync" "time" @@ -1186,7 +1187,18 @@ func (c *Cache) Finc( } if it.fc < 0 { - panic("Finc to negative for: " + key.Hex()[:7]) + // Refcount inconsistency: a filler decremented fc by + // more than the cache's recorded Inc total. Likely a + // duplicate Finc on the same filler.incs map, or an + // Inc/Finc mismatch across overlapping fillers. The + // historical behavior was to panic — that's a hard + // process kill that observably restarts TPD on prod + // every 30-40s. Clamp to zero and continue so the + // process stays alive; the worst case is a leaked + // filling-item slot, not data corruption. + log.Printf("[CXO] Finc apply clamped negative fc to 0 for %s (was %d, inc %d)", + key.Hex(), it.fc+inc, inc) + it.fc = 0 } // the fc is zero @@ -1213,7 +1225,10 @@ func (c *Cache) Finc( it.fc += inc // fc = fc - abs(inc) if it.fc < 0 { - panic("Finc to negative for: " + key.Hex()[:7]) + // Same recovery as the apply branch — see comment above. + log.Printf("[CXO] Finc reject clamped negative fc to 0 for %s (was %d, inc %d)", + key.Hex(), it.fc-inc, inc) + it.fc = 0 } // and if it.fc turns to be zero, then the diff --git a/pkg/httputil/httputil.go b/pkg/httputil/httputil.go index 701b20e9f2..992e876a10 100644 --- a/pkg/httputil/httputil.go +++ b/pkg/httputil/httputil.go @@ -76,11 +76,21 @@ func isIOError(err error) bool { if errors.As(err, &netErr) && netErr.Timeout() { return true } - // String-match fallback for syscall errors that have no clean Go sentinel - // (kernel-side errno wrappings that surface as descriptive strings). + // String-match fallback for errors that have no clean Go sentinel, + // or whose value differs from a sentinel that shares its message. + // + // "short write" specifically: http.timeoutWriter (from net/http's + // per-write deadline machinery) returns its own error when the + // response deadline expires mid-write — same text as io.ErrShortWrite, + // different identity, so errors.Is misses it. Production TPD was + // panicking every ~30s on large /all-transports responses to slow + // clients before this case was added. errStr := err.Error() if strings.Contains(errStr, "broken pipe") || - strings.Contains(errStr, "connection reset") { + strings.Contains(errStr, "connection reset") || + strings.Contains(errStr, "short write") || + strings.Contains(errStr, "i/o timeout") || + strings.Contains(errStr, "deadline exceeded") { return true } return false diff --git a/pkg/httputil/httputil_test.go b/pkg/httputil/httputil_test.go new file mode 100644 index 0000000000..6fac5828f0 --- /dev/null +++ b/pkg/httputil/httputil_test.go @@ -0,0 +1,50 @@ +package httputil + +import ( + "errors" + "fmt" + "io" + "net" + "os" + "testing" +) + +// TestIsIOErrorShortWriteVariants pins the fix for the production +// panic loop where http.timeoutWriter's "short write: i/o deadline +// reached" failed errors.Is(io.ErrShortWrite) (different sentinel) +// and slipped past the discriminator, causing WriteJSON to panic +// rather than return on a slow-client write timeout. +func TestIsIOErrorShortWriteVariants(t *testing.T) { + cases := []struct { + name string + err error + want bool + }{ + {"nil", nil, false}, + {"random unrelated", errors.New("malformed UTF-8"), false}, + + // sentinels: must keep working + {"io.ErrShortWrite sentinel", io.ErrShortWrite, true}, + {"io.ErrClosedPipe sentinel", io.ErrClosedPipe, true}, + {"net.ErrClosed sentinel", net.ErrClosed, true}, + {"os.ErrDeadlineExceeded sentinel", os.ErrDeadlineExceeded, true}, + {"wrapped io.ErrShortWrite", fmt.Errorf("foo: %w", io.ErrShortWrite), true}, + + // the production case: same message, different identity + {"http.timeoutWriter short-write text", errors.New("short write: i/o deadline reached"), true}, + {"i/o timeout text", errors.New("write tcp 10.0.0.1:443: i/o timeout"), true}, + {"deadline exceeded text", errors.New("context deadline exceeded"), true}, + + // existing string-match fallbacks still work + {"broken pipe text", errors.New("write tcp: broken pipe"), true}, + {"connection reset text", errors.New("read tcp: connection reset by peer"), true}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + got := isIOError(c.err) + if got != c.want { + t.Errorf("isIOError(%v) = %v, want %v", c.err, got, c.want) + } + }) + } +}