Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions pkg/cxo/skyobject/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package skyobject

import (
"fmt"
"log"
"sort"
"sync"
"time"
Expand Down Expand Up @@ -1186,7 +1187,18 @@ func (c *Cache) Finc(
}

if it.fc < 0 {
panic("Finc to negative for: " + key.Hex()[:7])
// Refcount inconsistency: a filler decremented fc by
// more than the cache's recorded Inc total. Likely a
// duplicate Finc on the same filler.incs map, or an
// Inc/Finc mismatch across overlapping fillers. The
// historical behavior was to panic — that's a hard
// process kill that observably restarts TPD on prod
// every 30-40s. Clamp to zero and continue so the
// process stays alive; the worst case is a leaked
// filling-item slot, not data corruption.
log.Printf("[CXO] Finc apply clamped negative fc to 0 for %s (was %d, inc %d)",
key.Hex(), it.fc+inc, inc)
it.fc = 0
}

// the fc is zero
Expand All @@ -1213,7 +1225,10 @@ func (c *Cache) Finc(
it.fc += inc // fc = fc - abs(inc)

if it.fc < 0 {
panic("Finc to negative for: " + key.Hex()[:7])
// Same recovery as the apply branch — see comment above.
log.Printf("[CXO] Finc reject clamped negative fc to 0 for %s (was %d, inc %d)",
key.Hex(), it.fc-inc, inc)
it.fc = 0
}

// and if it.fc turns to be zero, then the
Expand Down
16 changes: 13 additions & 3 deletions pkg/httputil/httputil.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,21 @@ func isIOError(err error) bool {
if errors.As(err, &netErr) && netErr.Timeout() {
return true
}
// String-match fallback for syscall errors that have no clean Go sentinel
// (kernel-side errno wrappings that surface as descriptive strings).
// String-match fallback for errors that have no clean Go sentinel,
// or whose value differs from a sentinel that shares its message.
//
// "short write" specifically: http.timeoutWriter (from net/http's
// per-write deadline machinery) returns its own error when the
// response deadline expires mid-write — same text as io.ErrShortWrite,
// different identity, so errors.Is misses it. Production TPD was
// panicking every ~30s on large /all-transports responses to slow
// clients before this case was added.
errStr := err.Error()
if strings.Contains(errStr, "broken pipe") ||
strings.Contains(errStr, "connection reset") {
strings.Contains(errStr, "connection reset") ||
strings.Contains(errStr, "short write") ||
strings.Contains(errStr, "i/o timeout") ||
strings.Contains(errStr, "deadline exceeded") {
return true
}
return false
Expand Down
50 changes: 50 additions & 0 deletions pkg/httputil/httputil_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package httputil

import (
"errors"
"fmt"
"io"
"net"
"os"
"testing"
)

// TestIsIOErrorShortWriteVariants pins the fix for the production
// panic loop where http.timeoutWriter's "short write: i/o deadline
// reached" failed errors.Is(io.ErrShortWrite) (different sentinel)
// and slipped past the discriminator, causing WriteJSON to panic
// rather than return on a slow-client write timeout.
func TestIsIOErrorShortWriteVariants(t *testing.T) {
cases := []struct {
name string
err error
want bool
}{
{"nil", nil, false},
{"random unrelated", errors.New("malformed UTF-8"), false},

// sentinels: must keep working
{"io.ErrShortWrite sentinel", io.ErrShortWrite, true},
{"io.ErrClosedPipe sentinel", io.ErrClosedPipe, true},
{"net.ErrClosed sentinel", net.ErrClosed, true},
{"os.ErrDeadlineExceeded sentinel", os.ErrDeadlineExceeded, true},
{"wrapped io.ErrShortWrite", fmt.Errorf("foo: %w", io.ErrShortWrite), true},

// the production case: same message, different identity
{"http.timeoutWriter short-write text", errors.New("short write: i/o deadline reached"), true},
{"i/o timeout text", errors.New("write tcp 10.0.0.1:443: i/o timeout"), true},
{"deadline exceeded text", errors.New("context deadline exceeded"), true},

// existing string-match fallbacks still work
{"broken pipe text", errors.New("write tcp: broken pipe"), true},
{"connection reset text", errors.New("read tcp: connection reset by peer"), true},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
got := isIOError(c.err)
if got != c.want {
t.Errorf("isIOError(%v) = %v, want %v", c.err, got, c.want)
}
})
}
}
Loading