From 46a5a7d1ebdef37f572b915b0ffc5715f80060bb Mon Sep 17 00:00:00 2001 From: fpenezic Date: Sat, 25 Apr 2026 10:52:41 +0200 Subject: [PATCH 1/5] fix: close stale relay connection on ErrConnAlreadyExists to recover tunnel after NAT IP change --- client/internal/peer/conn.go | 9 +++++++++ client/internal/peer/worker_ice.go | 18 ++++++++++++++++++ client/internal/peer/worker_relay.go | 12 +++++++++--- shared/relay/client/client.go | 21 +++++++++++++++++++++ shared/relay/client/manager.go | 12 ++++++++++++ 5 files changed, 69 insertions(+), 3 deletions(-) diff --git a/client/internal/peer/conn.go b/client/internal/peer/conn.go index 8d1585b3f99..50a439e1fa5 100644 --- a/client/internal/peer/conn.go +++ b/client/internal/peer/conn.go @@ -619,6 +619,15 @@ func (conn *Conn) onWGDisconnected() { case conntype.Relay: conn.workerRelay.CloseConn() conn.handleRelayDisconnectedLocked() + // When running over relay, workerICE is not closed so its session ID is + // never rotated. The next offer would carry the same session ID, causing + // the remote peer to skip ICE agent recreation (it already has an agent + // for that session) and reuse stale candidates — preventing recovery + // after a NAT IP change (e.g. PPPoE reconnect). Force a new session ID + // so the remote peer creates a fresh ICE agent with current candidates. + if conn.workerICE != nil { + conn.workerICE.ResetSessionID() + } case conntype.ICEP2P, conntype.ICETurn: conn.workerICE.Close() default: diff --git a/client/internal/peer/worker_ice.go b/client/internal/peer/worker_ice.go index 29bf5aaaa74..4e086fa4222 100644 --- a/client/internal/peer/worker_ice.go +++ b/client/internal/peer/worker_ice.go @@ -249,6 +249,24 @@ func (w *WorkerICE) SessionID() ICESessionID { return w.sessionID } +// ResetSessionID generates a new session ID and clears the remote session ID. +// This must be called when the WireGuard handshake times out while using a relay +// connection, so that the next ICE offer carries a fresh session ID and the remote +// peer recreates its ICE agent with up-to-date candidates instead of skipping the +// offer because the session ID matches the previous (failed) attempt. 
+func (w *WorkerICE) ResetSessionID() { + w.muxAgent.Lock() + defer w.muxAgent.Unlock() + + sessionID, err := NewICESessionID() + if err != nil { + w.log.Errorf("failed to create new session ID: %s", err) + return + } + w.sessionID = sessionID + w.remoteSessionID = "" +} + // will block until connection succeeded // but it won't release if ICE Agent went into Disconnected or Failed state, // so we have to cancel it with the provided context once agent detected a broken connection diff --git a/client/internal/peer/worker_relay.go b/client/internal/peer/worker_relay.go index 06309fbafc4..095d3f3d3de 100644 --- a/client/internal/peer/worker_relay.go +++ b/client/internal/peer/worker_relay.go @@ -64,11 +64,17 @@ func (w *WorkerRelay) OnNewOffer(remoteOfferAnswer *OfferAnswer) { relayedConn, err := w.relayManager.OpenConn(w.peerCtx, srv, w.config.Key) if err != nil { if errors.Is(err, relayClient.ErrConnAlreadyExists) { - w.log.Debugf("handled offer by reusing existing relay connection") + w.log.Infof("relay conn already exists, closing stale conn and retrying") + w.relayManager.CloseConnByPeerKey(w.config.Key) + relayedConn, err = w.relayManager.OpenConn(w.peerCtx, srv, w.config.Key) + if err != nil { + w.log.Errorf("failed to reopen connection via Relay after closing stale: %s", err) + return + } + } else { + w.log.Errorf("failed to open connection via Relay: %s", err) return } - w.log.Errorf("failed to open connection via Relay: %s", err) - return } w.relayLock.Lock() diff --git a/shared/relay/client/client.go b/shared/relay/client/client.go index b10b056173a..62bb06a4161 100644 --- a/shared/relay/client/client.go +++ b/shared/relay/client/client.go @@ -627,6 +627,27 @@ func (c *Client) closeConn(containerRef *connContainer, id messages.PeerID) erro return nil } +// CloseConnByPeerKey closes an existing relay connection for the given peer key, +// removing it from the internal connection map so a new one can be opened. +func (c *Client) CloseConnByPeerKey(peerKey string) { + peerID := messages.HashID(peerKey) + c.mu.Lock() + container, ok := c.conns[peerID] + if !ok { + c.mu.Unlock() + return + } + + c.log.Infof("force closing stale relay connection for peer: %s", peerID) + delete(c.conns, peerID) + c.mu.Unlock() + + if err := c.stateSubscription.UnsubscribeStateChange([]messages.PeerID{peerID}); err != nil { + c.log.Errorf("failed to unsubscribe from peer state change: %s", err) + } + container.close() +} + func (c *Client) close(gracefullyExit bool) error { c.readLoopMutex.Lock() defer c.readLoopMutex.Unlock() diff --git a/shared/relay/client/manager.go b/shared/relay/client/manager.go index 6220e7f6b06..16e85630c26 100644 --- a/shared/relay/client/manager.go +++ b/shared/relay/client/manager.go @@ -147,6 +147,18 @@ func (m *Manager) OpenConn(ctx context.Context, serverAddress, peerKey string) ( return netConn, err } +// CloseConnByPeerKey closes an existing relay connection for the given peer key +// so that a subsequent OpenConn can create a fresh one. +func (m *Manager) CloseConnByPeerKey(peerKey string) { + m.relayClientMu.RLock() + defer m.relayClientMu.RUnlock() + + if m.relayClient == nil { + return + } + m.relayClient.CloseConnByPeerKey(peerKey) +} + // Ready returns true if the home Relay client is connected to the relay server. 
func (m *Manager) Ready() bool { m.relayClientMu.RLock() From 0f1fc4f4beebaedc0ff5a17be62ca2411418e075 Mon Sep 17 00:00:00 2001 From: fpenezic Date: Sat, 25 Apr 2026 10:52:41 +0200 Subject: [PATCH 2/5] fix: route CloseConnByPeerKey to correct relay client for foreign servers --- client/internal/peer/worker_relay.go | 2 +- shared/relay/client/manager.go | 24 +++++++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/client/internal/peer/worker_relay.go b/client/internal/peer/worker_relay.go index 095d3f3d3de..01e18811a46 100644 --- a/client/internal/peer/worker_relay.go +++ b/client/internal/peer/worker_relay.go @@ -65,7 +65,7 @@ func (w *WorkerRelay) OnNewOffer(remoteOfferAnswer *OfferAnswer) { if err != nil { if errors.Is(err, relayClient.ErrConnAlreadyExists) { w.log.Infof("relay conn already exists, closing stale conn and retrying") - w.relayManager.CloseConnByPeerKey(w.config.Key) + w.relayManager.CloseConnByPeerKey(srv, w.config.Key) relayedConn, err = w.relayManager.OpenConn(w.peerCtx, srv, w.config.Key) if err != nil { w.log.Errorf("failed to reopen connection via Relay after closing stale: %s", err) diff --git a/shared/relay/client/manager.go b/shared/relay/client/manager.go index 16e85630c26..e8a5b2e0b56 100644 --- a/shared/relay/client/manager.go +++ b/shared/relay/client/manager.go @@ -148,15 +148,29 @@ func (m *Manager) OpenConn(ctx context.Context, serverAddress, peerKey string) ( } // CloseConnByPeerKey closes an existing relay connection for the given peer key -// so that a subsequent OpenConn can create a fresh one. -func (m *Manager) CloseConnByPeerKey(peerKey string) { +// on the relay client associated with serverAddress, so that a subsequent +// OpenConn can create a fresh one. +func (m *Manager) CloseConnByPeerKey(serverAddress, peerKey string) { m.relayClientMu.RLock() - defer m.relayClientMu.RUnlock() + homeClient := m.relayClient + m.relayClientMu.RUnlock() - if m.relayClient == nil { + if homeClient == nil { + return + } + + homeAddr, err := homeClient.ServerInstanceURL() + if err == nil && homeAddr == serverAddress { + homeClient.CloseConnByPeerKey(peerKey) return } - m.relayClient.CloseConnByPeerKey(peerKey) + + m.relayClientsMutex.RLock() + rt, ok := m.relayClients[serverAddress] + m.relayClientsMutex.RUnlock() + if ok && rt.relayClient != nil { + rt.relayClient.CloseConnByPeerKey(peerKey) + } } // Ready returns true if the home Relay client is connected to the relay server. From 159183b55d58bf78d704fbfcfa5564e9ddaef071 Mon Sep 17 00:00:00 2001 From: fpenezic Date: Sat, 25 Apr 2026 10:52:41 +0200 Subject: [PATCH 3/5] docs: explain Guard invariant for ErrConnAlreadyExists branch --- client/internal/peer/worker_relay.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/client/internal/peer/worker_relay.go b/client/internal/peer/worker_relay.go index 01e18811a46..9cdd5769688 100644 --- a/client/internal/peer/worker_relay.go +++ b/client/internal/peer/worker_relay.go @@ -63,6 +63,11 @@ func (w *WorkerRelay) OnNewOffer(remoteOfferAnswer *OfferAnswer) { relayedConn, err := w.relayManager.OpenConn(w.peerCtx, srv, w.config.Key) if err != nil { + // ErrConnAlreadyExists here means the conn map was not cleaned up after the + // previous session (e.g. remote NAT IP change with no relay-level close event). + // Guard only triggers OnNewOffer after detecting a disconnect, so the existing + // entry is stale by contract. Close it and open a fresh conn to match the + // remote peer's state. 
if errors.Is(err, relayClient.ErrConnAlreadyExists) { w.log.Infof("relay conn already exists, closing stale conn and retrying") w.relayManager.CloseConnByPeerKey(srv, w.config.Key) From 691331ade8564a9b922eb59fd912934af8cd8f33 Mon Sep 17 00:00:00 2001 From: fpenezic Date: Sat, 25 Apr 2026 10:53:21 +0200 Subject: [PATCH 4/5] fix: gate ErrConnAlreadyExists close-and-retry behind stale flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous fix unconditionally closed and reopened the existing relay conn whenever OnNewOffer hit ErrConnAlreadyExists. That caused an infinite tear-down/rebuild loop when the remote peer sent rapid successive offers (e.g. during reconnection): close → reopen → remote sees teardown → sends new offer → ErrConnAlreadyExists → close → loop. Introduce a relayConnStale atomic flag that is set only when an event indicates the existing entry is no longer backed by a live peer session: local WG handshake timeout, relay server close, or explicit CloseConn. OnNewOffer only tears down and reopens when the flag is set; otherwise it reuses the existing healthy connection. Signal sources for the flag: - conn.onWGDisconnected (Relay path) → MarkStale before CloseConn - WorkerRelay.CloseConn → marks stale on close - WorkerRelay.onRelayClientDisconnected → marks stale on server close --- client/internal/peer/conn.go | 4 +++ client/internal/peer/worker_relay.go | 37 +++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/client/internal/peer/conn.go b/client/internal/peer/conn.go index 50a439e1fa5..85d27ca15f7 100644 --- a/client/internal/peer/conn.go +++ b/client/internal/peer/conn.go @@ -617,6 +617,10 @@ func (conn *Conn) onWGDisconnected() { // Close the active connection based on current priority switch conn.currentConnPriority { case conntype.Relay: + // Mark the relay conn entry as stale so the next OnNewOffer closes + // and reopens it instead of reusing a dead pipe. MarkStale covers + // the case where CloseConn is a no-op (e.g. relayedConn already nil). + conn.workerRelay.MarkStale() conn.workerRelay.CloseConn() conn.handleRelayDisconnectedLocked() // When running over relay, workerICE is not closed so its session ID is diff --git a/client/internal/peer/worker_relay.go b/client/internal/peer/worker_relay.go index 9cdd5769688..852d523d010 100644 --- a/client/internal/peer/worker_relay.go +++ b/client/internal/peer/worker_relay.go @@ -30,6 +30,14 @@ type WorkerRelay struct { relayLock sync.Mutex relaySupportedOnRemotePeer atomic.Bool + + // relayConnStale is set to true when an event indicates that the current + // relay connection entry in the relay client's conns map is no longer + // backed by a live peer session (e.g. local WG handshake timeout, relay + // server close event, explicit CloseConn). When OnNewOffer observes + // ErrConnAlreadyExists, it only closes the stale entry if this flag is + // set; otherwise it bails out and reuses the existing healthy connection. + relayConnStale atomic.Bool } func NewWorkerRelay(ctx context.Context, log *log.Entry, ctrl bool, config ConnConfig, conn *Conn, relayManager *relayClient.Manager) *WorkerRelay { @@ -63,19 +71,25 @@ func (w *WorkerRelay) OnNewOffer(remoteOfferAnswer *OfferAnswer) { relayedConn, err := w.relayManager.OpenConn(w.peerCtx, srv, w.config.Key) if err != nil { - // ErrConnAlreadyExists here means the conn map was not cleaned up after the - // previous session (e.g. remote NAT IP change with no relay-level close event). 
- // Guard only triggers OnNewOffer after detecting a disconnect, so the existing - // entry is stale by contract. Close it and open a fresh conn to match the - // remote peer's state. if errors.Is(err, relayClient.ErrConnAlreadyExists) { - w.log.Infof("relay conn already exists, closing stale conn and retrying") + // Only tear down the existing conn if something previously marked + // it as stale (local WG handshake timeout, relay server close, or + // explicit CloseConn). Without that signal, the existing conn is + // assumed healthy and is reused — unconditional close on every + // colliding offer causes an infinite tear-down/rebuild loop when + // the remote peer sends rapid successive offers. + if !w.relayConnStale.Load() { + w.log.Debugf("relay conn already exists and is not marked stale, reusing") + return + } + w.log.Infof("relay conn already exists and is marked stale, closing and retrying") w.relayManager.CloseConnByPeerKey(srv, w.config.Key) relayedConn, err = w.relayManager.OpenConn(w.peerCtx, srv, w.config.Key) if err != nil { w.log.Errorf("failed to reopen connection via Relay after closing stale: %s", err) return } + w.relayConnStale.Store(false) } else { w.log.Errorf("failed to open connection via Relay: %s", err) return @@ -120,11 +134,21 @@ func (w *WorkerRelay) CloseConn() { return } + w.relayConnStale.Store(true) if err := w.relayedConn.Close(); err != nil { w.log.Warnf("failed to close relay connection: %v", err) } } +// MarkStale marks the relay connection entry as stale so that the next +// OnNewOffer call with ErrConnAlreadyExists will tear it down and open a +// fresh one. Callers signal staleness from outside the relay client path, +// e.g. when the local WG handshake watcher fires while the relay is the +// active transport. +func (w *WorkerRelay) MarkStale() { + w.relayConnStale.Store(true) +} + func (w *WorkerRelay) isRelaySupported(answer *OfferAnswer) bool { if !w.relayManager.HasRelayAddress() { return false @@ -140,5 +164,6 @@ func (w *WorkerRelay) preferredRelayServer(myRelayAddress, remoteRelayAddress st } func (w *WorkerRelay) onRelayClientDisconnected() { + w.relayConnStale.Store(true) go w.conn.onRelayDisconnected() } From c0b3e4ed3d59a0f8e10aeb4cb4fdf84a4acd1a14 Mon Sep 17 00:00:00 2001 From: fpenezic Date: Sat, 25 Apr 2026 10:56:32 +0200 Subject: [PATCH 5/5] fix: rotate ICE session ID on WG timeout in ICE mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When WireGuard handshake times out while running over ICE, onWGDisconnected calls workerICE.Close(). Close() sets w.agent=nil synchronously before pion's ICE library fires the asynchronous ConnectionStateClosed event. By the time onConnectionStateChange runs closeAgent(), the `w.agent == agent` guard fails (w.agent is already nil) and the session ID is not rotated. Without session ID rotation, the next ICE offer carries the same local session ID. The remote peer in OnNewOffer compares remoteSessionID against the incoming offer's sessionID, finds them equal, and skips agent recreation — reusing the existing agent and its stale candidates from the broken network state. Observed in production: 30s reconnect loop after a NAT/conntrack event with the same offer session ID logged ~20+ times in a row, WG handshake never recovering despite "ICE connection succeeded" on every cycle. Mirror the existing Relay-path behavior: explicitly rotate the session ID before closing the ICE worker so the remote peer always recreates its agent after a WG timeout on ICE. 
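
To make the skip concrete, below is a minimal, self-contained sketch of the
decision this patch targets. The type and field names (worker, haveAgent,
newSessionID) are illustrative stand-ins chosen for the example, not the
upstream WorkerICE implementation; only the compare-and-skip shape mirrors
the behavior described above.

package main

import "fmt"

type sessionID string

var n int

// newSessionID stands in for session ID generation: any value that differs
// from the previous one is enough to defeat the skip below.
func newSessionID() sessionID {
	n++
	return sessionID(fmt.Sprintf("sess-%d", n))
}

// worker is an illustrative stand-in for the answering peer's ICE worker.
type worker struct {
	remoteSessionID sessionID
	haveAgent       bool
}

// onNewOffer mirrors the skip-on-matching-session-ID behavior: an offer that
// repeats the last seen session ID reuses the existing agent and candidates.
func (w *worker) onNewOffer(offerSession sessionID) string {
	if w.haveAgent && w.remoteSessionID == offerSession {
		return "reuse existing agent (stale candidates)"
	}
	w.remoteSessionID = offerSession
	w.haveAgent = true
	return "recreate agent, gather fresh candidates"
}

func main() {
	local := newSessionID()
	remote := &worker{}

	fmt.Println(remote.onNewOffer(local)) // initial offer: recreate
	// WG handshake times out; without rotation the re-offer repeats the ID.
	fmt.Println(remote.onNewOffer(local)) // reuse: broken path never recovers
	// With this patch the local side rotates before re-offering.
	local = newSessionID()
	fmt.Println(remote.onNewOffer(local)) // recreate with current candidates
}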
--- client/internal/peer/conn.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/client/internal/peer/conn.go b/client/internal/peer/conn.go index 85d27ca15f7..82dc11492cc 100644 --- a/client/internal/peer/conn.go +++ b/client/internal/peer/conn.go @@ -633,6 +633,15 @@ func (conn *Conn) onWGDisconnected() { conn.workerICE.ResetSessionID() } case conntype.ICEP2P, conntype.ICETurn: + // WorkerICE.Close() sets agent=nil before pion's ICE library fires + // ConnectionStateClosed. By the time onConnectionStateChange runs + // closeAgent(), the w.agent==agent guard fails and the session ID + // is not rotated. Without rotation, the next offer carries the same + // session ID and the remote peer skips ICE agent recreation in + // OnNewOffer (sessionID match), reusing stale candidates from the + // previous network state. Rotate explicitly here so the remote peer + // always recreates its agent after a WG timeout on ICE. + conn.workerICE.ResetSessionID() conn.workerICE.Close() default: conn.Log.Debugf("No active connection to close on WG timeout")