From f7341b5dcd46a2e8192cf4472e6bd1e2eb0841af Mon Sep 17 00:00:00 2001 From: alexsavio Date: Sun, 12 Apr 2026 01:55:06 +0200 Subject: [PATCH 1/6] [client] Replace WG interface monitor polling with netlink subscription on Linux MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The WireGuard interface monitor introduced in #4370 spawns a 2 s ticker that calls net.InterfaceByName(ifaceName) on every tick. On Linux, that function issues syscall.NetlinkRIB(RTM_GETLINK, ...) and dumps the entire kernel link table on every call, then linear-scans it for the matching name. On hosts with many veth interfaces (Docker, containerd, k8s) the per-call cost is dozens of KB and the ticker generates roughly 1 GB/day of allocation churn from this single source. On long-running clients the GC pressure plus span fragmentation manifests as a slow, monotonic RSS climb — observed at ~920 MB/day on a 4 GB Raspberry Pi running v0.68.1, eventually starving every other service on the host. This is one of the persistent leak sources behind #3678 (still open across 0.65–0.68.x). A pprof allocs profile from a freshly-restarted 0.68.1 daemon shows (*WGIfaceMonitor).Start → getInterfaceIndex → net.InterfaceByName → syscall.NetlinkRIB as the single largest allocator at 513 MB / 29% of all allocations within ~20 minutes of uptime. Fix: split the watcher by build tag. * `wg_iface_monitor_linux.go` subscribes to RTNLGRP_LINK via netlink.LinkSubscribe (already a transitive dependency via `vishvananda/netlink`) and reacts to RTM_DELLINK / RTM_NEWLINK events for the tracked interface index. Allocations between events drop to zero. The same pattern is already used by `client/internal/networkmonitor/check_change_linux.go` for route events, so the dependency, idiom, and review surface are familiar. * `wg_iface_monitor_other.go` keeps the original 2 s polling loop for darwin / windows / freebsd / android / ios. No behavior change on those platforms — they do not exhibit the NetlinkRIB cost. * `wg_iface_monitor.go` keeps the shared `WGIfaceMonitor` type, the early-return checks (mobile / netstack / empty name), and `getInterfaceIndex`, then dispatches to the platform-specific `watchInterface`. A small race window between the initial `getInterfaceIndex` call in `Start` and `LinkSubscribe` completing its handshake is closed by re-checking the index after subscribing. Windows is also reported as affected in #3678. The same event-driven treatment can be applied there using `NotifyIpInterfaceChange` from `iphlpapi`; left as a follow-up so this PR stays focused on the worst offender (Linux) with the smallest possible diff. Refs: #3678 --- client/internal/wg_iface_monitor.go | 31 ++------ client/internal/wg_iface_monitor_linux.go | 87 +++++++++++++++++++++++ client/internal/wg_iface_monitor_other.go | 51 +++++++++++++ 3 files changed, 143 insertions(+), 26 deletions(-) create mode 100644 client/internal/wg_iface_monitor_linux.go create mode 100644 client/internal/wg_iface_monitor_other.go diff --git a/client/internal/wg_iface_monitor.go b/client/internal/wg_iface_monitor.go index a870c114584..2a2fa23660b 100644 --- a/client/internal/wg_iface_monitor.go +++ b/client/internal/wg_iface_monitor.go @@ -6,7 +6,6 @@ import ( "fmt" "net" "runtime" - "time" log "github.com/sirupsen/logrus" @@ -28,6 +27,10 @@ func NewWGIfaceMonitor() *WGIfaceMonitor { // Start begins monitoring the WireGuard interface. // It relies on the provided context cancellation to stop. +// +// On Linux the watcher is event-driven (RTNLGRP_LINK netlink subscription) +// to avoid the allocation churn of repeatedly dumping the kernel link +// table; on other platforms it falls back to a low-frequency poll. func (m *WGIfaceMonitor) Start(ctx context.Context, ifaceName string) (shouldRestart bool, err error) { defer close(m.done) @@ -56,31 +59,7 @@ func (m *WGIfaceMonitor) Start(ctx context.Context, ifaceName string) (shouldRes log.Infof("Interface monitor: watching %s (index: %d)", ifaceName, expectedIndex) - ticker := time.NewTicker(2 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - log.Infof("Interface monitor: stopped for %s", ifaceName) - return false, fmt.Errorf("wg interface monitor stopped: %v", ctx.Err()) - case <-ticker.C: - currentIndex, err := getInterfaceIndex(ifaceName) - if err != nil { - // Interface was deleted - log.Infof("Interface monitor: %s deleted", ifaceName) - return true, fmt.Errorf("interface %s deleted: %w", ifaceName, err) - } - - // Check if interface index changed (interface was recreated) - if currentIndex != expectedIndex { - log.Infof("Interface monitor: %s recreated (index changed from %d to %d), restarting engine", - ifaceName, expectedIndex, currentIndex) - return true, nil - } - } - } - + return watchInterface(ctx, ifaceName, expectedIndex) } // getInterfaceIndex returns the index of a network interface by name. diff --git a/client/internal/wg_iface_monitor_linux.go b/client/internal/wg_iface_monitor_linux.go new file mode 100644 index 00000000000..a1c5c0d60cf --- /dev/null +++ b/client/internal/wg_iface_monitor_linux.go @@ -0,0 +1,87 @@ +//go:build linux + +package internal + +import ( + "context" + "fmt" + "syscall" + + log "github.com/sirupsen/logrus" + "github.com/vishvananda/netlink" +) + +// watchInterface uses an RTNLGRP_LINK netlink subscription to detect +// deletion or recreation of the WireGuard interface. +// +// The previous implementation polled net.InterfaceByName every 2 s, which +// on Linux issues syscall.NetlinkRIB(RTM_GETLINK, ...) and dumps the +// entire kernel link table on every call. On hosts with many veth +// interfaces (containers, bridges) the resulting allocation churn was on +// the order of ~1 GB/day from this single ticker, which on small ARM +// hosts manifested as a slow RSS climb (see netbirdio/netbird#3678). +// +// The event-driven version below allocates only when the kernel actually +// publishes a link event for the tracked interface — typically zero +// allocations between events. +func watchInterface(ctx context.Context, ifaceName string, expectedIndex int) (bool, error) { + done := make(chan struct{}) + defer close(done) + + // Buffer the channel to absorb event bursts (e.g. when many veth + // pairs are created/destroyed at once by container runtimes). + linkChan := make(chan netlink.LinkUpdate, 32) + if err := netlink.LinkSubscribe(linkChan, done); err != nil { + return false, fmt.Errorf("subscribe to link updates: %w", err) + } + + // Race window: the interface could have been deleted (or recreated) + // between the initial getInterfaceIndex() in Start and LinkSubscribe + // completing its handshake with the kernel. Re-check explicitly so we + // do not block forever waiting for an event that already fired. + if currentIndex, err := getInterfaceIndex(ifaceName); err != nil { + log.Infof("Interface monitor: %s deleted before subscription completed", ifaceName) + return true, fmt.Errorf("interface %s deleted: %w", ifaceName, err) + } else if currentIndex != expectedIndex { + log.Infof("Interface monitor: %s recreated (index changed from %d to %d) before subscription completed", + ifaceName, expectedIndex, currentIndex) + return true, nil + } + + for { + select { + case <-ctx.Done(): + log.Infof("Interface monitor: stopped for %s", ifaceName) + return false, fmt.Errorf("wg interface monitor stopped: %v", ctx.Err()) + + case update, ok := <-linkChan: + if !ok { + return false, fmt.Errorf("link subscription channel closed unexpectedly") + } + + eventIndex := int(update.Index) + eventType := update.Header.Type + eventName := "" + if attrs := update.Attrs(); attrs != nil { + eventName = attrs.Name + } + + switch eventType { + case syscall.RTM_DELLINK: + if eventIndex == expectedIndex { + log.Infof("Interface monitor: %s deleted", ifaceName) + return true, fmt.Errorf("interface %s deleted", ifaceName) + } + case syscall.RTM_NEWLINK: + // Recreation: a new link with our name appears at a + // different index. Same name + same index is just a + // flag/state change on the existing interface — ignore. + if eventName == ifaceName && eventIndex != expectedIndex { + log.Infof("Interface monitor: %s recreated (index changed from %d to %d), restarting engine", + ifaceName, expectedIndex, eventIndex) + return true, nil + } + } + } + } +} diff --git a/client/internal/wg_iface_monitor_other.go b/client/internal/wg_iface_monitor_other.go new file mode 100644 index 00000000000..d7213111014 --- /dev/null +++ b/client/internal/wg_iface_monitor_other.go @@ -0,0 +1,51 @@ +//go:build !linux + +package internal + +import ( + "context" + "fmt" + "time" + + log "github.com/sirupsen/logrus" +) + +// watchInterface polls net.InterfaceByName at a fixed interval to detect +// deletion or recreation of the WireGuard interface. +// +// This is the cross-platform fallback used on darwin, windows, freebsd, +// android, and ios. The Linux build (see wg_iface_monitor_linux.go) uses +// an event-driven RTNLGRP_LINK netlink subscription instead, because on +// Linux net.InterfaceByName issues syscall.NetlinkRIB(RTM_GETLINK, ...) +// which dumps the entire kernel link table on every call and produces +// significant allocation churn (netbirdio/netbird#3678). +// +// Windows is also reported in #3678 as affected by RSS climb. A future +// follow-up could implement an event-driven watcher there using +// NotifyIpInterfaceChange from iphlpapi. +func watchInterface(ctx context.Context, ifaceName string, expectedIndex int) (bool, error) { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + log.Infof("Interface monitor: stopped for %s", ifaceName) + return false, fmt.Errorf("wg interface monitor stopped: %v", ctx.Err()) + case <-ticker.C: + currentIndex, err := getInterfaceIndex(ifaceName) + if err != nil { + // Interface was deleted + log.Infof("Interface monitor: %s deleted", ifaceName) + return true, fmt.Errorf("interface %s deleted: %w", ifaceName, err) + } + + // Check if interface index changed (interface was recreated) + if currentIndex != expectedIndex { + log.Infof("Interface monitor: %s recreated (index changed from %d to %d), restarting engine", + ifaceName, expectedIndex, currentIndex) + return true, nil + } + } + } +} From 962cd4a60325e715027658999880f076513241cb Mon Sep 17 00:00:00 2001 From: alexsavio Date: Sun, 12 Apr 2026 02:07:20 +0200 Subject: [PATCH 2/6] fix(client): address PR review comments on WG interface monitor - linux: handle interface rename (RTM_NEWLINK with same index but different name). The previous polling implementation caught this implicitly via net.InterfaceByName returning "not found"; the event-driven version has to test it explicitly. - linux: return shouldRestart=true when the netlink subscription channel closes unexpectedly so the engine re-establishes monitoring instead of silently leaving the host unmonitored. Also log the closure at warn level. - other: tighten docstring to clarify that android/ios are compiled but never reached at runtime, since Start() exits early on mobile. --- client/internal/wg_iface_monitor_linux.go | 30 ++++++++++++++++++++--- client/internal/wg_iface_monitor_other.go | 15 ++++++++---- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/client/internal/wg_iface_monitor_linux.go b/client/internal/wg_iface_monitor_linux.go index a1c5c0d60cf..c3529344284 100644 --- a/client/internal/wg_iface_monitor_linux.go +++ b/client/internal/wg_iface_monitor_linux.go @@ -56,7 +56,12 @@ func watchInterface(ctx context.Context, ifaceName string, expectedIndex int) (b case update, ok := <-linkChan: if !ok { - return false, fmt.Errorf("link subscription channel closed unexpectedly") + // The vishvananda/netlink subscription goroutine closes + // the channel on receive errors. Signal the engine to + // restart so monitoring is re-established instead of + // silently ending. + log.Warnf("Interface monitor: link subscription channel closed unexpectedly for %s", ifaceName) + return true, fmt.Errorf("link subscription channel closed unexpectedly") } eventIndex := int(update.Index) @@ -73,14 +78,31 @@ func watchInterface(ctx context.Context, ifaceName string, expectedIndex int) (b return true, fmt.Errorf("interface %s deleted", ifaceName) } case syscall.RTM_NEWLINK: - // Recreation: a new link with our name appears at a - // different index. Same name + same index is just a - // flag/state change on the existing interface — ignore. + // Two cases trigger a restart: + // + // 1. Recreation: a link with our name appears at a + // different index (the old interface was deleted + // and a fresh one took its place). + // + // 2. Rename: a link still at our index now has a + // different name. The previous polling + // implementation caught this implicitly because + // net.InterfaceByName(ifaceName) would start + // failing; the event-driven version has to handle + // it explicitly. + // + // Same name + same index is just a flag/state change on + // the existing interface — ignore. if eventName == ifaceName && eventIndex != expectedIndex { log.Infof("Interface monitor: %s recreated (index changed from %d to %d), restarting engine", ifaceName, expectedIndex, eventIndex) return true, nil } + if eventIndex == expectedIndex && eventName != "" && eventName != ifaceName { + log.Infof("Interface monitor: %s renamed to %s (index %d), restarting engine", + ifaceName, eventName, expectedIndex) + return true, fmt.Errorf("interface %s renamed to %s", ifaceName, eventName) + } } } } diff --git a/client/internal/wg_iface_monitor_other.go b/client/internal/wg_iface_monitor_other.go index d7213111014..6e59f4c6518 100644 --- a/client/internal/wg_iface_monitor_other.go +++ b/client/internal/wg_iface_monitor_other.go @@ -13,11 +13,16 @@ import ( // watchInterface polls net.InterfaceByName at a fixed interval to detect // deletion or recreation of the WireGuard interface. // -// This is the cross-platform fallback used on darwin, windows, freebsd, -// android, and ios. The Linux build (see wg_iface_monitor_linux.go) uses -// an event-driven RTNLGRP_LINK netlink subscription instead, because on -// Linux net.InterfaceByName issues syscall.NetlinkRIB(RTM_GETLINK, ...) -// which dumps the entire kernel link table on every call and produces +// This is the fallback used on non-Linux desktop and server platforms +// (darwin, windows, freebsd). It is also compiled on android and ios so +// the package builds on every supported GOOS, but it is never reached +// at runtime there because Start() in wg_iface_monitor.go exits early +// on mobile platforms. +// +// The Linux build (see wg_iface_monitor_linux.go) uses an event-driven +// RTNLGRP_LINK netlink subscription instead, because on Linux +// net.InterfaceByName issues syscall.NetlinkRIB(RTM_GETLINK, ...) which +// dumps the entire kernel link table on every call and produces // significant allocation churn (netbirdio/netbird#3678). // // Windows is also reported in #3678 as affected by RSS climb. A future From 371b8ce06811371d957b57d6cb4be9dae1fe38df Mon Sep 17 00:00:00 2001 From: alexsavio Date: Sun, 12 Apr 2026 02:20:50 +0200 Subject: [PATCH 3/6] refactor(client): extract link-event inspection to satisfy SonarCloud The rename-detection branch added in the previous commit pushed watchInterface over SonarCloud's cognitive-complexity limit (21 vs. 20 allowed) and made the RTM_NEWLINK case clause 11 lines long (rule limit: 10). Pull the entire switch into a small inspectLinkEvent helper. The main loop is now a thin receive-and-dispatch and the helper has its own self-contained complexity. No behavior change. Refs: SonarCloud go:S3776, go:S1151 --- client/internal/wg_iface_monitor_linux.go | 82 ++++++++++++----------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/client/internal/wg_iface_monitor_linux.go b/client/internal/wg_iface_monitor_linux.go index c3529344284..cb0abbbd8de 100644 --- a/client/internal/wg_iface_monitor_linux.go +++ b/client/internal/wg_iface_monitor_linux.go @@ -63,47 +63,51 @@ func watchInterface(ctx context.Context, ifaceName string, expectedIndex int) (b log.Warnf("Interface monitor: link subscription channel closed unexpectedly for %s", ifaceName) return true, fmt.Errorf("link subscription channel closed unexpectedly") } - - eventIndex := int(update.Index) - eventType := update.Header.Type - eventName := "" - if attrs := update.Attrs(); attrs != nil { - eventName = attrs.Name + if restart, err := inspectLinkEvent(update, ifaceName, expectedIndex); restart { + return true, err } + } + } +} - switch eventType { - case syscall.RTM_DELLINK: - if eventIndex == expectedIndex { - log.Infof("Interface monitor: %s deleted", ifaceName) - return true, fmt.Errorf("interface %s deleted", ifaceName) - } - case syscall.RTM_NEWLINK: - // Two cases trigger a restart: - // - // 1. Recreation: a link with our name appears at a - // different index (the old interface was deleted - // and a fresh one took its place). - // - // 2. Rename: a link still at our index now has a - // different name. The previous polling - // implementation caught this implicitly because - // net.InterfaceByName(ifaceName) would start - // failing; the event-driven version has to handle - // it explicitly. - // - // Same name + same index is just a flag/state change on - // the existing interface — ignore. - if eventName == ifaceName && eventIndex != expectedIndex { - log.Infof("Interface monitor: %s recreated (index changed from %d to %d), restarting engine", - ifaceName, expectedIndex, eventIndex) - return true, nil - } - if eventIndex == expectedIndex && eventName != "" && eventName != ifaceName { - log.Infof("Interface monitor: %s renamed to %s (index %d), restarting engine", - ifaceName, eventName, expectedIndex) - return true, fmt.Errorf("interface %s renamed to %s", ifaceName, eventName) - } - } +// inspectLinkEvent classifies a single netlink link update against the +// tracked WireGuard interface. It returns (true, err) when the engine +// should restart monitoring; (false, nil) means the event is unrelated +// and the caller should keep waiting. +// +// The error component, when non-nil, describes the kernel-side reason +// (deletion or rename); the recreation case returns (true, nil) since +// no error condition is reported. +func inspectLinkEvent(update netlink.LinkUpdate, ifaceName string, expectedIndex int) (bool, error) { + eventIndex := int(update.Index) + eventName := "" + if attrs := update.Attrs(); attrs != nil { + eventName = attrs.Name + } + + switch update.Header.Type { + case syscall.RTM_DELLINK: + if eventIndex == expectedIndex { + log.Infof("Interface monitor: %s deleted", ifaceName) + return true, fmt.Errorf("interface %s deleted", ifaceName) + } + case syscall.RTM_NEWLINK: + // Recreation: a link with our name appears at a different index + // (the old interface was deleted and a fresh one took its place). + if eventName == ifaceName && eventIndex != expectedIndex { + log.Infof("Interface monitor: %s recreated (index changed from %d to %d), restarting engine", + ifaceName, expectedIndex, eventIndex) + return true, nil + } + // Rename: a link still at our index now has a different name. + // The previous polling implementation caught this implicitly + // because net.InterfaceByName(ifaceName) would start failing; + // the event-driven version has to handle it explicitly. + if eventIndex == expectedIndex && eventName != "" && eventName != ifaceName { + log.Infof("Interface monitor: %s renamed to %s (index %d), restarting engine", + ifaceName, eventName, expectedIndex) + return true, fmt.Errorf("interface %s renamed to %s", ifaceName, eventName) } } + return false, nil } From 7dad47076d61a0260e8fda4ffe28a918757535d0 Mon Sep 17 00:00:00 2001 From: alexsavio Date: Sun, 12 Apr 2026 02:26:26 +0200 Subject: [PATCH 4/6] refactor(client): split RTM_DELLINK and RTM_NEWLINK handlers The RTM_NEWLINK case clause in inspectLinkEvent was still 11 lines after the previous refactor (SonarCloud rule limit: 10). Pull both event handlers into their own helpers (inspectDelLink, inspectNewLink) so the case clauses become single-line dispatches. No behavior change. Refs: SonarCloud go:S1151 --- client/internal/wg_iface_monitor_linux.go | 58 +++++++++++++++-------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/client/internal/wg_iface_monitor_linux.go b/client/internal/wg_iface_monitor_linux.go index cb0abbbd8de..58a56d63dd4 100644 --- a/client/internal/wg_iface_monitor_linux.go +++ b/client/internal/wg_iface_monitor_linux.go @@ -87,27 +87,45 @@ func inspectLinkEvent(update netlink.LinkUpdate, ifaceName string, expectedIndex switch update.Header.Type { case syscall.RTM_DELLINK: - if eventIndex == expectedIndex { - log.Infof("Interface monitor: %s deleted", ifaceName) - return true, fmt.Errorf("interface %s deleted", ifaceName) - } + return inspectDelLink(eventIndex, ifaceName, expectedIndex) case syscall.RTM_NEWLINK: - // Recreation: a link with our name appears at a different index - // (the old interface was deleted and a fresh one took its place). - if eventName == ifaceName && eventIndex != expectedIndex { - log.Infof("Interface monitor: %s recreated (index changed from %d to %d), restarting engine", - ifaceName, expectedIndex, eventIndex) - return true, nil - } - // Rename: a link still at our index now has a different name. - // The previous polling implementation caught this implicitly - // because net.InterfaceByName(ifaceName) would start failing; - // the event-driven version has to handle it explicitly. - if eventIndex == expectedIndex && eventName != "" && eventName != ifaceName { - log.Infof("Interface monitor: %s renamed to %s (index %d), restarting engine", - ifaceName, eventName, expectedIndex) - return true, fmt.Errorf("interface %s renamed to %s", ifaceName, eventName) - } + return inspectNewLink(eventIndex, eventName, ifaceName, expectedIndex) + } + return false, nil +} + +// inspectDelLink reports a restart when an RTM_DELLINK arrives for the +// tracked interface index. +func inspectDelLink(eventIndex int, ifaceName string, expectedIndex int) (bool, error) { + if eventIndex != expectedIndex { + return false, nil + } + log.Infof("Interface monitor: %s deleted", ifaceName) + return true, fmt.Errorf("interface %s deleted", ifaceName) +} + +// inspectNewLink reports a restart when an RTM_NEWLINK either: +// +// 1. Introduces a link with our name at a different index (recreation +// after a delete), or +// +// 2. Reports a link still at our index but with a different name +// (in-place rename). The previous polling implementation caught +// this implicitly because net.InterfaceByName(ifaceName) would +// start failing; the event-driven version has to test it. +// +// Same name + same index is just a flag/state change on the existing +// interface and is ignored. +func inspectNewLink(eventIndex int, eventName, ifaceName string, expectedIndex int) (bool, error) { + if eventName == ifaceName && eventIndex != expectedIndex { + log.Infof("Interface monitor: %s recreated (index changed from %d to %d), restarting engine", + ifaceName, expectedIndex, eventIndex) + return true, nil + } + if eventIndex == expectedIndex && eventName != "" && eventName != ifaceName { + log.Infof("Interface monitor: %s renamed to %s (index %d), restarting engine", + ifaceName, eventName, expectedIndex) + return true, fmt.Errorf("interface %s renamed to %s", ifaceName, eventName) } return false, nil } From c67604393007160bfac5868ec0eb30247f098343 Mon Sep 17 00:00:00 2001 From: alexsavio Date: Sun, 12 Apr 2026 10:52:59 +0200 Subject: [PATCH 5/6] fix(client): wrap context error with %w for proper error chain Use %w instead of %v in fmt.Errorf so downstream code can inspect the wrapped error with errors.Is(err, context.Canceled) and errors.Is(err, context.DeadlineExceeded). Applied to both wg_iface_monitor_linux.go and wg_iface_monitor_other.go. --- client/internal/wg_iface_monitor_linux.go | 2 +- client/internal/wg_iface_monitor_other.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/client/internal/wg_iface_monitor_linux.go b/client/internal/wg_iface_monitor_linux.go index 58a56d63dd4..c2deecb6a7a 100644 --- a/client/internal/wg_iface_monitor_linux.go +++ b/client/internal/wg_iface_monitor_linux.go @@ -52,7 +52,7 @@ func watchInterface(ctx context.Context, ifaceName string, expectedIndex int) (b select { case <-ctx.Done(): log.Infof("Interface monitor: stopped for %s", ifaceName) - return false, fmt.Errorf("wg interface monitor stopped: %v", ctx.Err()) + return false, fmt.Errorf("wg interface monitor stopped: %w", ctx.Err()) case update, ok := <-linkChan: if !ok { diff --git a/client/internal/wg_iface_monitor_other.go b/client/internal/wg_iface_monitor_other.go index 6e59f4c6518..afebbf4df3a 100644 --- a/client/internal/wg_iface_monitor_other.go +++ b/client/internal/wg_iface_monitor_other.go @@ -36,7 +36,7 @@ func watchInterface(ctx context.Context, ifaceName string, expectedIndex int) (b select { case <-ctx.Done(): log.Infof("Interface monitor: stopped for %s", ifaceName) - return false, fmt.Errorf("wg interface monitor stopped: %v", ctx.Err()) + return false, fmt.Errorf("wg interface monitor stopped: %w", ctx.Err()) case <-ticker.C: currentIndex, err := getInterfaceIndex(ifaceName) if err != nil { From 7133da0e213c8bba2663d1fab73a191c3aaa532e Mon Sep 17 00:00:00 2001 From: alexsavio Date: Sun, 12 Apr 2026 12:59:26 +0200 Subject: [PATCH 6/6] fix(client): recover monitoring on LinkSubscribe failure Return shouldRestart=true when netlink.LinkSubscribe fails so the engine triggers a client restart and re-establishes interface monitoring, instead of silently disabling it for the process lifetime. --- client/internal/wg_iface_monitor_linux.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/client/internal/wg_iface_monitor_linux.go b/client/internal/wg_iface_monitor_linux.go index c2deecb6a7a..2662b99d6e1 100644 --- a/client/internal/wg_iface_monitor_linux.go +++ b/client/internal/wg_iface_monitor_linux.go @@ -32,7 +32,10 @@ func watchInterface(ctx context.Context, ifaceName string, expectedIndex int) (b // pairs are created/destroyed at once by container runtimes). linkChan := make(chan netlink.LinkUpdate, 32) if err := netlink.LinkSubscribe(linkChan, done); err != nil { - return false, fmt.Errorf("subscribe to link updates: %w", err) + // Return shouldRestart=true so the engine recovers monitoring + // via triggerClientRestart instead of silently losing it for + // the rest of the process lifetime. + return true, fmt.Errorf("subscribe to link updates: %w", err) } // Race window: the interface could have been deleted (or recreated)