From 3aa93bd5fe5edb755a9443841f2cd3822c0971c2 Mon Sep 17 00:00:00 2001 From: Moses Narrow <36607567+0pcom@users.noreply.github.com> Date: Mon, 4 May 2026 03:06:28 +0000 Subject: [PATCH] cxo/node: guard fillHead nil-deref races (closeFiller + handleDelConn) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #2422 silenced the two dominant TPD panics ('Finc to negative', 'short write: i/o deadline reached'). RestartCount went from ~556/6h to 2/10min. The remaining two panics are nil-derefs in pkg/cxo/node/head.go from torn-down fillHead state: 1. createFiller line 338: cr.c.String() panics when cr.c is nil. handleDelConn (line 517) sets f.p.c = nil when the source connection is removed. f.p.r remains non-zero, so the f.p == (connRoot{}) gate at handleFillingResult does NOT catch the nil-c case, and createFiller(f.p) runs with a nil source. Fix: nil-c short-circuit at the top of createFiller. The fill can't proceed without a source connection — log and return. 2. handleSuccess line 228 (and the parallel sites in handleRequest, handleRequestFailure, handleReceivedRoot): f.fc.PushBack on a nil *list.List. closeFiller (line 384) sets `f.rqo, f.fc, f.rq = nil, nil, nil`, so any in-flight Filler goroutine messages that drain after close land on a nil list. Fix: nil-guard each PushBack/PushFront site (4 sites total). Same recovery as #2422's panic→drop pattern: the closed filler's work is no longer relevant. These are the actual races; the right "real" fix would be to drain or signal-shutdown the channels before nilling state, but that's a larger structural change. Guards stop the process kill while preserving observability. --- pkg/cxo/node/head.go | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pkg/cxo/node/head.go b/pkg/cxo/node/head.go index 061ae6761c..c1c4a4e0d6 100644 --- a/pkg/cxo/node/head.go +++ b/pkg/cxo/node/head.go @@ -217,6 +217,13 @@ func (n *nodeHead) handle() { func (f *fillHead) handleRequest(key cipher.SHA256) { f.node().Debugln(FillPin, "[fill] handleRequest", key.Hex()[:7]) + // closeFiller niled f.rqo before this message drained — drop the + // stale request rather than nil-deref. See closeFiller for the + // race: in-flight Filler goroutines can land here after the + // fillHead's lists were torn down. + if f.rqo == nil { + return + } f.rqo.PushBack(key) f.triggerRequest() } @@ -225,6 +232,12 @@ func (f *fillHead) handleSuccess(c *Conn) { f.node().Debugln(FillPin, "[fill] handleSuccess", c.String()) f.requesting-- + if f.fc == nil { + // Same closeFiller race as handleRequest — production saw + // "panic: runtime error: invalid memory address" at the + // PushBack below until this guard landed. + return + } f.fc.PushBack(c) // push f.triggerRequest() } @@ -259,6 +272,9 @@ func (f *fillHead) handleRequestFailure(fr failedRequest) { } + if f.rqo == nil { + return // closeFiller race; see handleRequest + } f.rqo.PushFront(fr.key) // shift f.triggerRequest() @@ -279,6 +295,9 @@ func (f *fillHead) handleReceivedRoot(cr connRoot) { f.cs.addKnown(cr.c, cr.r.Seq) // add to known if cr.r.Seq == f.r.r.Seq { + if f.fc == nil { + return // closeFiller race + } f.fc.PushBack(cr.c) // add to filling connections f.triggerRequest() return @@ -335,6 +354,18 @@ func (f *fillHead) maxParallel() (mp int) { } func (f *fillHead) createFiller(cr connRoot) { + // handleDelConn nils out connRoot.c when the source connection is + // removed (line 517 in handleDelConn). The connRoot may still + // have a non-zero r, so the f.p == (connRoot{}) check at + // handleFillingResult does NOT catch the nil-c case. Without + // this guard, cr.c.String() below nil-derefs and panics the + // process. The fill simply can't proceed without a source + // connection — drop it. + if cr.c == nil { + f.node().Debugln(FillPin, "[fill] createFiller: source connection gone, skipping", + cr.r.Short()) + return + } f.node().Debugln(FillPin, "[fill] createFiller", cr.c.String(), cr.r.Short())