diff --git a/client/internal/wg_iface_monitor.go b/client/internal/wg_iface_monitor.go index a870c114584..2a2fa23660b 100644 --- a/client/internal/wg_iface_monitor.go +++ b/client/internal/wg_iface_monitor.go @@ -6,7 +6,6 @@ import ( "fmt" "net" "runtime" - "time" log "github.com/sirupsen/logrus" @@ -28,6 +27,10 @@ func NewWGIfaceMonitor() *WGIfaceMonitor { // Start begins monitoring the WireGuard interface. // It relies on the provided context cancellation to stop. +// +// On Linux the watcher is event-driven (RTNLGRP_LINK netlink subscription) +// to avoid the allocation churn of repeatedly dumping the kernel link +// table; on other platforms it falls back to a low-frequency poll. func (m *WGIfaceMonitor) Start(ctx context.Context, ifaceName string) (shouldRestart bool, err error) { defer close(m.done) @@ -56,31 +59,7 @@ func (m *WGIfaceMonitor) Start(ctx context.Context, ifaceName string) (shouldRes log.Infof("Interface monitor: watching %s (index: %d)", ifaceName, expectedIndex) - ticker := time.NewTicker(2 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ctx.Done(): - log.Infof("Interface monitor: stopped for %s", ifaceName) - return false, fmt.Errorf("wg interface monitor stopped: %v", ctx.Err()) - case <-ticker.C: - currentIndex, err := getInterfaceIndex(ifaceName) - if err != nil { - // Interface was deleted - log.Infof("Interface monitor: %s deleted", ifaceName) - return true, fmt.Errorf("interface %s deleted: %w", ifaceName, err) - } - - // Check if interface index changed (interface was recreated) - if currentIndex != expectedIndex { - log.Infof("Interface monitor: %s recreated (index changed from %d to %d), restarting engine", - ifaceName, expectedIndex, currentIndex) - return true, nil - } - } - } - + return watchInterface(ctx, ifaceName, expectedIndex) } // getInterfaceIndex returns the index of a network interface by name. diff --git a/client/internal/wg_iface_monitor_linux.go b/client/internal/wg_iface_monitor_linux.go new file mode 100644 index 00000000000..2662b99d6e1 --- /dev/null +++ b/client/internal/wg_iface_monitor_linux.go @@ -0,0 +1,134 @@ +//go:build linux + +package internal + +import ( + "context" + "fmt" + "syscall" + + log "github.com/sirupsen/logrus" + "github.com/vishvananda/netlink" +) + +// watchInterface uses an RTNLGRP_LINK netlink subscription to detect +// deletion or recreation of the WireGuard interface. +// +// The previous implementation polled net.InterfaceByName every 2 s, which +// on Linux issues syscall.NetlinkRIB(RTM_GETLINK, ...) and dumps the +// entire kernel link table on every call. On hosts with many veth +// interfaces (containers, bridges) the resulting allocation churn was on +// the order of ~1 GB/day from this single ticker, which on small ARM +// hosts manifested as a slow RSS climb (see netbirdio/netbird#3678). +// +// The event-driven version below allocates only when the kernel actually +// publishes a link event for the tracked interface — typically zero +// allocations between events. +func watchInterface(ctx context.Context, ifaceName string, expectedIndex int) (bool, error) { + done := make(chan struct{}) + defer close(done) + + // Buffer the channel to absorb event bursts (e.g. when many veth + // pairs are created/destroyed at once by container runtimes). + linkChan := make(chan netlink.LinkUpdate, 32) + if err := netlink.LinkSubscribe(linkChan, done); err != nil { + // Return shouldRestart=true so the engine recovers monitoring + // via triggerClientRestart instead of silently losing it for + // the rest of the process lifetime. + return true, fmt.Errorf("subscribe to link updates: %w", err) + } + + // Race window: the interface could have been deleted (or recreated) + // between the initial getInterfaceIndex() in Start and LinkSubscribe + // completing its handshake with the kernel. Re-check explicitly so we + // do not block forever waiting for an event that already fired. + if currentIndex, err := getInterfaceIndex(ifaceName); err != nil { + log.Infof("Interface monitor: %s deleted before subscription completed", ifaceName) + return true, fmt.Errorf("interface %s deleted: %w", ifaceName, err) + } else if currentIndex != expectedIndex { + log.Infof("Interface monitor: %s recreated (index changed from %d to %d) before subscription completed", + ifaceName, expectedIndex, currentIndex) + return true, nil + } + + for { + select { + case <-ctx.Done(): + log.Infof("Interface monitor: stopped for %s", ifaceName) + return false, fmt.Errorf("wg interface monitor stopped: %w", ctx.Err()) + + case update, ok := <-linkChan: + if !ok { + // The vishvananda/netlink subscription goroutine closes + // the channel on receive errors. Signal the engine to + // restart so monitoring is re-established instead of + // silently ending. + log.Warnf("Interface monitor: link subscription channel closed unexpectedly for %s", ifaceName) + return true, fmt.Errorf("link subscription channel closed unexpectedly") + } + if restart, err := inspectLinkEvent(update, ifaceName, expectedIndex); restart { + return true, err + } + } + } +} + +// inspectLinkEvent classifies a single netlink link update against the +// tracked WireGuard interface. It returns (true, err) when the engine +// should restart monitoring; (false, nil) means the event is unrelated +// and the caller should keep waiting. +// +// The error component, when non-nil, describes the kernel-side reason +// (deletion or rename); the recreation case returns (true, nil) since +// no error condition is reported. +func inspectLinkEvent(update netlink.LinkUpdate, ifaceName string, expectedIndex int) (bool, error) { + eventIndex := int(update.Index) + eventName := "" + if attrs := update.Attrs(); attrs != nil { + eventName = attrs.Name + } + + switch update.Header.Type { + case syscall.RTM_DELLINK: + return inspectDelLink(eventIndex, ifaceName, expectedIndex) + case syscall.RTM_NEWLINK: + return inspectNewLink(eventIndex, eventName, ifaceName, expectedIndex) + } + return false, nil +} + +// inspectDelLink reports a restart when an RTM_DELLINK arrives for the +// tracked interface index. +func inspectDelLink(eventIndex int, ifaceName string, expectedIndex int) (bool, error) { + if eventIndex != expectedIndex { + return false, nil + } + log.Infof("Interface monitor: %s deleted", ifaceName) + return true, fmt.Errorf("interface %s deleted", ifaceName) +} + +// inspectNewLink reports a restart when an RTM_NEWLINK either: +// +// 1. Introduces a link with our name at a different index (recreation +// after a delete), or +// +// 2. Reports a link still at our index but with a different name +// (in-place rename). The previous polling implementation caught +// this implicitly because net.InterfaceByName(ifaceName) would +// start failing; the event-driven version has to test it. +// +// Same name + same index is just a flag/state change on the existing +// interface and is ignored. +func inspectNewLink(eventIndex int, eventName, ifaceName string, expectedIndex int) (bool, error) { + if eventName == ifaceName && eventIndex != expectedIndex { + log.Infof("Interface monitor: %s recreated (index changed from %d to %d), restarting engine", + ifaceName, expectedIndex, eventIndex) + return true, nil + } + if eventIndex == expectedIndex && eventName != "" && eventName != ifaceName { + log.Infof("Interface monitor: %s renamed to %s (index %d), restarting engine", + ifaceName, eventName, expectedIndex) + return true, fmt.Errorf("interface %s renamed to %s", ifaceName, eventName) + } + return false, nil +} diff --git a/client/internal/wg_iface_monitor_other.go b/client/internal/wg_iface_monitor_other.go new file mode 100644 index 00000000000..afebbf4df3a --- /dev/null +++ b/client/internal/wg_iface_monitor_other.go @@ -0,0 +1,56 @@ +//go:build !linux + +package internal + +import ( + "context" + "fmt" + "time" + + log "github.com/sirupsen/logrus" +) + +// watchInterface polls net.InterfaceByName at a fixed interval to detect +// deletion or recreation of the WireGuard interface. +// +// This is the fallback used on non-Linux desktop and server platforms +// (darwin, windows, freebsd). It is also compiled on android and ios so +// the package builds on every supported GOOS, but it is never reached +// at runtime there because Start() in wg_iface_monitor.go exits early +// on mobile platforms. +// +// The Linux build (see wg_iface_monitor_linux.go) uses an event-driven +// RTNLGRP_LINK netlink subscription instead, because on Linux +// net.InterfaceByName issues syscall.NetlinkRIB(RTM_GETLINK, ...) which +// dumps the entire kernel link table on every call and produces +// significant allocation churn (netbirdio/netbird#3678). +// +// Windows is also reported in #3678 as affected by RSS climb. A future +// follow-up could implement an event-driven watcher there using +// NotifyIpInterfaceChange from iphlpapi. +func watchInterface(ctx context.Context, ifaceName string, expectedIndex int) (bool, error) { + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + log.Infof("Interface monitor: stopped for %s", ifaceName) + return false, fmt.Errorf("wg interface monitor stopped: %w", ctx.Err()) + case <-ticker.C: + currentIndex, err := getInterfaceIndex(ifaceName) + if err != nil { + // Interface was deleted + log.Infof("Interface monitor: %s deleted", ifaceName) + return true, fmt.Errorf("interface %s deleted: %w", ifaceName, err) + } + + // Check if interface index changed (interface was recreated) + if currentIndex != expectedIndex { + log.Infof("Interface monitor: %s recreated (index changed from %d to %d), restarting engine", + ifaceName, expectedIndex, currentIndex) + return true, nil + } + } + } +}