From dc3c15b4c6e2fe4fcd67611d73d49e6832c42fec Mon Sep 17 00:00:00 2001 From: Patrick O'Doherty Date: Mon, 27 Oct 2025 13:18:13 -0700 Subject: [PATCH 01/25] control/controlclient: back out HW key attestation (#17664) Temporarily back out the TPM-based hw attestation code while we debug Windows exceptions. Updates tailscale/corp#31269 Signed-off-by: Patrick O'Doherty (cherry picked from commit a760cbe33f4bed64b63c6118808d02b2771ff785) --- control/controlclient/direct.go | 22 --------------- ipn/ipnlocal/hwattest.go | 48 --------------------------------- ipn/ipnlocal/local.go | 1 - ipn/ipnlocal/profiles.go | 10 ------- ipn/ipnlocal/profiles_test.go | 1 - ipn/prefs_test.go | 2 +- types/persist/persist.go | 18 ++----------- types/persist/persist_clone.go | 4 --- types/persist/persist_test.go | 2 +- types/persist/persist_view.go | 10 +++---- 10 files changed, 8 insertions(+), 110 deletions(-) delete mode 100644 ipn/ipnlocal/hwattest.go diff --git a/control/controlclient/direct.go b/control/controlclient/direct.go index 63a12b2495fd8..fe7cc235b05f8 100644 --- a/control/controlclient/direct.go +++ b/control/controlclient/direct.go @@ -7,8 +7,6 @@ import ( "bytes" "cmp" "context" - "crypto" - "crypto/sha256" "encoding/binary" "encoding/json" "errors" @@ -948,26 +946,6 @@ func (c *Direct) sendMapRequest(ctx context.Context, isStreaming bool, nu Netmap ConnectionHandleForTest: connectionHandleForTest, } - // If we have a hardware attestation key, sign the node key with it and send - // the key & signature in the map request. - if buildfeatures.HasTPM { - if k := persist.AsStruct().AttestationKey; k != nil && !k.IsZero() { - hwPub := key.HardwareAttestationPublicFromPlatformKey(k) - request.HardwareAttestationKey = hwPub - - t := c.clock.Now() - msg := fmt.Sprintf("%d|%s", t.Unix(), nodeKey.String()) - digest := sha256.Sum256([]byte(msg)) - sig, err := k.Sign(nil, digest[:], crypto.SHA256) - if err != nil { - c.logf("failed to sign node key with hardware attestation key: %v", err) - } else { - request.HardwareAttestationKeySignature = sig - request.HardwareAttestationKeySignatureTimestamp = t - } - } - } - var extraDebugFlags []string if buildfeatures.HasAdvertiseRoutes && hi != nil && c.netMon != nil && !c.skipIPForwardingCheck && ipForwardingBroken(hi.RoutableIPs, c.netMon.InterfaceState()) { diff --git a/ipn/ipnlocal/hwattest.go b/ipn/ipnlocal/hwattest.go deleted file mode 100644 index 2c93cad4c97ff..0000000000000 --- a/ipn/ipnlocal/hwattest.go +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) Tailscale Inc & AUTHORS -// SPDX-License-Identifier: BSD-3-Clause - -//go:build !ts_omit_tpm - -package ipnlocal - -import ( - "errors" - - "tailscale.com/feature" - "tailscale.com/types/key" - "tailscale.com/types/logger" - "tailscale.com/types/persist" -) - -func init() { - feature.HookGenerateAttestationKeyIfEmpty.Set(generateAttestationKeyIfEmpty) -} - -// generateAttestationKeyIfEmpty generates a new hardware attestation key if -// none exists. It returns true if a new key was generated and stored in -// p.AttestationKey. -func generateAttestationKeyIfEmpty(p *persist.Persist, logf logger.Logf) (bool, error) { - // attempt to generate a new hardware attestation key if none exists - var ak key.HardwareAttestationKey - if p != nil { - ak = p.AttestationKey - } - - if ak == nil || ak.IsZero() { - var err error - ak, err = key.NewHardwareAttestationKey() - if err != nil { - if !errors.Is(err, key.ErrUnsupported) { - logf("failed to create hardware attestation key: %v", err) - } - } else if ak != nil { - logf("using new hardware attestation key: %v", ak.Public()) - if p == nil { - p = &persist.Persist{} - } - p.AttestationKey = ak - return true, nil - } - } - return false, nil -} diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 36e4ad8a589e9..8cc74c41ebc60 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -1216,7 +1216,6 @@ func stripKeysFromPrefs(p ipn.PrefsView) ipn.PrefsView { p2.Persist.PrivateNodeKey = key.NodePrivate{} p2.Persist.OldPrivateNodeKey = key.NodePrivate{} p2.Persist.NetworkLockKey = key.NLPrivate{} - p2.Persist.AttestationKey = nil return p2.View() } diff --git a/ipn/ipnlocal/profiles.go b/ipn/ipnlocal/profiles.go index 9c217637890cc..3e80cdaa93d1f 100644 --- a/ipn/ipnlocal/profiles.go +++ b/ipn/ipnlocal/profiles.go @@ -19,9 +19,7 @@ import ( "tailscale.com/ipn" "tailscale.com/ipn/ipnext" "tailscale.com/tailcfg" - "tailscale.com/types/key" "tailscale.com/types/logger" - "tailscale.com/types/persist" "tailscale.com/util/clientmetric" "tailscale.com/util/eventbus" ) @@ -656,14 +654,6 @@ func (pm *profileManager) loadSavedPrefs(k ipn.StateKey) (ipn.PrefsView, error) return ipn.PrefsView{}, err } savedPrefs := ipn.NewPrefs() - - // if supported by the platform, create an empty hardware attestation key to use when deserializing - // to avoid type exceptions from json.Unmarshaling into an interface{}. - hw, _ := key.NewEmptyHardwareAttestationKey() - savedPrefs.Persist = &persist.Persist{ - AttestationKey: hw, - } - if err := ipn.PrefsFromBytes(bs, savedPrefs); err != nil { return ipn.PrefsView{}, fmt.Errorf("parsing saved prefs: %v", err) } diff --git a/ipn/ipnlocal/profiles_test.go b/ipn/ipnlocal/profiles_test.go index deeab2ade9b15..60c92ff8d3493 100644 --- a/ipn/ipnlocal/profiles_test.go +++ b/ipn/ipnlocal/profiles_test.go @@ -151,7 +151,6 @@ func TestProfileDupe(t *testing.T) { ID: tailcfg.UserID(user), LoginName: fmt.Sprintf("user%d@example.com", user), }, - AttestationKey: nil, } } user1Node1 := newPersist(1, 1) diff --git a/ipn/prefs_test.go b/ipn/prefs_test.go index 2336164096c14..3339a631ce827 100644 --- a/ipn/prefs_test.go +++ b/ipn/prefs_test.go @@ -501,7 +501,7 @@ func TestPrefsPretty(t *testing.T) { }, }, "linux", - `Prefs{ra=false dns=false want=false routes=[] nf=off update=off Persist{o=, n=[B1VKl] u="" ak=-}}`, + `Prefs{ra=false dns=false want=false routes=[] nf=off update=off Persist{o=, n=[B1VKl] u=""}}`, }, { Prefs{ diff --git a/types/persist/persist.go b/types/persist/persist.go index 4b62c79ddd186..d888a6afb6af5 100644 --- a/types/persist/persist.go +++ b/types/persist/persist.go @@ -26,7 +26,6 @@ type Persist struct { UserProfile tailcfg.UserProfile NetworkLockKey key.NLPrivate NodeID tailcfg.StableNodeID - AttestationKey key.HardwareAttestationKey `json:",omitempty"` // DisallowedTKAStateIDs stores the tka.State.StateID values which // this node will not operate network lock on. This is used to @@ -85,20 +84,11 @@ func (p *Persist) Equals(p2 *Persist) bool { return false } - var pub, p2Pub key.HardwareAttestationPublic - if p.AttestationKey != nil && !p.AttestationKey.IsZero() { - pub = key.HardwareAttestationPublicFromPlatformKey(p.AttestationKey) - } - if p2.AttestationKey != nil && !p2.AttestationKey.IsZero() { - p2Pub = key.HardwareAttestationPublicFromPlatformKey(p2.AttestationKey) - } - return p.PrivateNodeKey.Equal(p2.PrivateNodeKey) && p.OldPrivateNodeKey.Equal(p2.OldPrivateNodeKey) && p.UserProfile.Equal(&p2.UserProfile) && p.NetworkLockKey.Equal(p2.NetworkLockKey) && p.NodeID == p2.NodeID && - pub.Equal(p2Pub) && reflect.DeepEqual(nilIfEmpty(p.DisallowedTKAStateIDs), nilIfEmpty(p2.DisallowedTKAStateIDs)) } @@ -106,16 +96,12 @@ func (p *Persist) Pretty() string { var ( ok, nk key.NodePublic ) - akString := "-" if !p.OldPrivateNodeKey.IsZero() { ok = p.OldPrivateNodeKey.Public() } if !p.PrivateNodeKey.IsZero() { nk = p.PublicNodeKey() } - if p.AttestationKey != nil && !p.AttestationKey.IsZero() { - akString = fmt.Sprintf("%v", p.AttestationKey.Public()) - } - return fmt.Sprintf("Persist{o=%v, n=%v u=%#v ak=%s}", - ok.ShortString(), nk.ShortString(), p.UserProfile.LoginName, akString) + return fmt.Sprintf("Persist{o=%v, n=%v u=%#v}", + ok.ShortString(), nk.ShortString(), p.UserProfile.LoginName) } diff --git a/types/persist/persist_clone.go b/types/persist/persist_clone.go index 9dbe7e0f6fa6d..680419ff2f30b 100644 --- a/types/persist/persist_clone.go +++ b/types/persist/persist_clone.go @@ -19,9 +19,6 @@ func (src *Persist) Clone() *Persist { } dst := new(Persist) *dst = *src - if src.AttestationKey != nil { - dst.AttestationKey = src.AttestationKey.Clone() - } dst.DisallowedTKAStateIDs = append(src.DisallowedTKAStateIDs[:0:0], src.DisallowedTKAStateIDs...) return dst } @@ -34,6 +31,5 @@ var _PersistCloneNeedsRegeneration = Persist(struct { UserProfile tailcfg.UserProfile NetworkLockKey key.NLPrivate NodeID tailcfg.StableNodeID - AttestationKey key.HardwareAttestationKey DisallowedTKAStateIDs []string }{}) diff --git a/types/persist/persist_test.go b/types/persist/persist_test.go index 713114b74dcd5..dbf2a6d8c7662 100644 --- a/types/persist/persist_test.go +++ b/types/persist/persist_test.go @@ -21,7 +21,7 @@ func fieldsOf(t reflect.Type) (fields []string) { } func TestPersistEqual(t *testing.T) { - persistHandles := []string{"PrivateNodeKey", "OldPrivateNodeKey", "UserProfile", "NetworkLockKey", "NodeID", "AttestationKey", "DisallowedTKAStateIDs"} + persistHandles := []string{"PrivateNodeKey", "OldPrivateNodeKey", "UserProfile", "NetworkLockKey", "NodeID", "DisallowedTKAStateIDs"} if have := fieldsOf(reflect.TypeFor[Persist]()); !reflect.DeepEqual(have, persistHandles) { t.Errorf("Persist.Equal check might be out of sync\nfields: %q\nhandled: %q\n", have, persistHandles) diff --git a/types/persist/persist_view.go b/types/persist/persist_view.go index dbf8294ef5a7a..7d1507468fc65 100644 --- a/types/persist/persist_view.go +++ b/types/persist/persist_view.go @@ -89,11 +89,10 @@ func (v *PersistView) UnmarshalJSONFrom(dec *jsontext.Decoder) error { func (v PersistView) PrivateNodeKey() key.NodePrivate { return v.ж.PrivateNodeKey } // needed to request key rotation -func (v PersistView) OldPrivateNodeKey() key.NodePrivate { return v.ж.OldPrivateNodeKey } -func (v PersistView) UserProfile() tailcfg.UserProfile { return v.ж.UserProfile } -func (v PersistView) NetworkLockKey() key.NLPrivate { return v.ж.NetworkLockKey } -func (v PersistView) NodeID() tailcfg.StableNodeID { return v.ж.NodeID } -func (v PersistView) AttestationKey() tailcfg.StableNodeID { panic("unsupported") } +func (v PersistView) OldPrivateNodeKey() key.NodePrivate { return v.ж.OldPrivateNodeKey } +func (v PersistView) UserProfile() tailcfg.UserProfile { return v.ж.UserProfile } +func (v PersistView) NetworkLockKey() key.NLPrivate { return v.ж.NetworkLockKey } +func (v PersistView) NodeID() tailcfg.StableNodeID { return v.ж.NodeID } // DisallowedTKAStateIDs stores the tka.State.StateID values which // this node will not operate network lock on. This is used to @@ -111,6 +110,5 @@ var _PersistViewNeedsRegeneration = Persist(struct { UserProfile tailcfg.UserProfile NetworkLockKey key.NLPrivate NodeID tailcfg.StableNodeID - AttestationKey key.HardwareAttestationKey DisallowedTKAStateIDs []string }{}) From bad03eefa1b3ce097877e4337d5fa1ddc17f64f5 Mon Sep 17 00:00:00 2001 From: Max Coulombe Date: Mon, 27 Oct 2025 16:33:03 -0400 Subject: [PATCH 02/25] feature/identityfederation: strip query params on clientID (#17666) Updates #9192 Change-Id: I35c88df8a0242ecc19a23265d392ef78ac176b9d Signed-off-by: mcoulombe (cherry picked from commit 34e992f59db2feed0c5cd857d4829ea5ef5e0298) --- .../identityfederation/identityfederation.go | 19 +++++++++++-------- .../identityfederation_test.go | 10 +++++++++- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/feature/identityfederation/identityfederation.go b/feature/identityfederation/identityfederation.go index a4470fc27eaea..ab1b65f1217d1 100644 --- a/feature/identityfederation/identityfederation.go +++ b/feature/identityfederation/identityfederation.go @@ -42,12 +42,12 @@ func resolveAuthKey(ctx context.Context, baseURL, clientID, idToken string, tags baseURL = ipn.DefaultControlURL } - ephemeral, preauth, err := parseOptionalAttributes(clientID) + strippedID, ephemeral, preauth, err := parseOptionalAttributes(clientID) if err != nil { return "", fmt.Errorf("failed to parse optional config attributes: %w", err) } - accessToken, err := exchangeJWTForToken(ctx, baseURL, clientID, idToken) + accessToken, err := exchangeJWTForToken(ctx, baseURL, strippedID, idToken) if err != nil { return "", fmt.Errorf("failed to exchange JWT for access token: %w", err) } @@ -79,15 +79,15 @@ func resolveAuthKey(ctx context.Context, baseURL, clientID, idToken string, tags return authkey, nil } -func parseOptionalAttributes(clientID string) (ephemeral bool, preauthorized bool, err error) { - _, attrs, found := strings.Cut(clientID, "?") +func parseOptionalAttributes(clientID string) (strippedID string, ephemeral bool, preauthorized bool, err error) { + strippedID, attrs, found := strings.Cut(clientID, "?") if !found { - return true, false, nil + return clientID, true, false, nil } parsed, err := url.ParseQuery(attrs) if err != nil { - return false, false, fmt.Errorf("failed to parse optional config attributes: %w", err) + return "", false, false, fmt.Errorf("failed to parse optional config attributes: %w", err) } for k := range parsed { @@ -97,11 +97,14 @@ func parseOptionalAttributes(clientID string) (ephemeral bool, preauthorized boo case "preauthorized": preauthorized, err = strconv.ParseBool(parsed.Get(k)) default: - return false, false, fmt.Errorf("unknown optional config attribute %q", k) + return "", false, false, fmt.Errorf("unknown optional config attribute %q", k) } } + if err != nil { + return "", false, false, err + } - return ephemeral, preauthorized, err + return strippedID, ephemeral, preauthorized, nil } // exchangeJWTForToken exchanges a JWT for a Tailscale access token. diff --git a/feature/identityfederation/identityfederation_test.go b/feature/identityfederation/identityfederation_test.go index 7b75852a819a1..a673a42982706 100644 --- a/feature/identityfederation/identityfederation_test.go +++ b/feature/identityfederation/identityfederation_test.go @@ -87,6 +87,7 @@ func TestParseOptionalAttributes(t *testing.T) { tests := []struct { name string clientID string + wantClientID string wantEphemeral bool wantPreauth bool wantErr string @@ -94,6 +95,7 @@ func TestParseOptionalAttributes(t *testing.T) { { name: "default values", clientID: "client-123", + wantClientID: "client-123", wantEphemeral: true, wantPreauth: false, wantErr: "", @@ -101,6 +103,7 @@ func TestParseOptionalAttributes(t *testing.T) { { name: "custom values", clientID: "client-123?ephemeral=false&preauthorized=true", + wantClientID: "client-123", wantEphemeral: false, wantPreauth: true, wantErr: "", @@ -108,6 +111,7 @@ func TestParseOptionalAttributes(t *testing.T) { { name: "unknown attribute", clientID: "client-123?unknown=value", + wantClientID: "", wantEphemeral: false, wantPreauth: false, wantErr: `unknown optional config attribute "unknown"`, @@ -115,6 +119,7 @@ func TestParseOptionalAttributes(t *testing.T) { { name: "invalid value", clientID: "client-123?ephemeral=invalid", + wantClientID: "", wantEphemeral: false, wantPreauth: false, wantErr: `strconv.ParseBool: parsing "invalid": invalid syntax`, @@ -123,7 +128,7 @@ func TestParseOptionalAttributes(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - ephemeral, preauth, err := parseOptionalAttributes(tt.clientID) + strippedID, ephemeral, preauth, err := parseOptionalAttributes(tt.clientID) if tt.wantErr != "" { if err == nil { t.Errorf("parseOptionalAttributes() error = nil, want %q", tt.wantErr) @@ -138,6 +143,9 @@ func TestParseOptionalAttributes(t *testing.T) { return } } + if strippedID != tt.wantClientID { + t.Errorf("parseOptionalAttributes() strippedID = %v, want %v", strippedID, tt.wantClientID) + } if ephemeral != tt.wantEphemeral { t.Errorf("parseOptionalAttributes() ephemeral = %v, want %v", ephemeral, tt.wantEphemeral) } From 033adc398c0267a173863f028715c1267882f8d1 Mon Sep 17 00:00:00 2001 From: srwareham Date: Mon, 27 Oct 2025 15:20:57 -0700 Subject: [PATCH 03/25] cmd/tailscale/cli: move JetKVM scripts to /userdata/init.d for persistence (#17610) Updates #16524 Updates jetkvm/rv1106-system#34 Signed-off-by: srwareham (cherry picked from commit f4e2720821d4975de8a1964b9274db3f19da48d2) --- cmd/tailscale/cli/configure-jetkvm.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cmd/tailscale/cli/configure-jetkvm.go b/cmd/tailscale/cli/configure-jetkvm.go index a8e0a7cb542ef..c80bf673605cf 100644 --- a/cmd/tailscale/cli/configure-jetkvm.go +++ b/cmd/tailscale/cli/configure-jetkvm.go @@ -48,9 +48,12 @@ func runConfigureJetKVM(ctx context.Context, args []string) error { if runtime.GOOS != "linux" || distro.Get() != distro.JetKVM { return errors.New("only implemented on JetKVM") } - err := os.WriteFile("/etc/init.d/S22tailscale", bytes.TrimLeft([]byte(` + if err := os.MkdirAll("/userdata/init.d", 0755); err != nil { + return errors.New("unable to create /userdata/init.d") + } + err := os.WriteFile("/userdata/init.d/S22tailscale", bytes.TrimLeft([]byte(` #!/bin/sh -# /etc/init.d/S22tailscale +# /userdata/init.d/S22tailscale # Start/stop tailscaled case "$1" in From 53004dded1e574d5ccf86023dd19b57982c0a5a1 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Tue, 28 Oct 2025 08:34:34 -0700 Subject: [PATCH 04/25] wgengine/magicsock: fix js/wasm crash regression loading non-existent portmapper Thanks for the report, @Need-an-AwP! Fixes #17681 Updates #9394 Change-Id: I2e0b722ef9b460bd7e79499192d1a315504ca84c Signed-off-by: Brad Fitzpatrick (cherry picked from commit edb11e0e60ce702ebe62e7bfca345f167ac5efad) --- client/local/local.go | 13 +++++++++++++ client/tailscale/apitype/apitype.go | 10 ++++++++++ feature/feature.go | 6 ++++++ feature/portmapper/portmapper.go | 2 ++ ipn/localapi/debug.go | 10 ++++++++++ tstest/integration/integration_test.go | 22 ++++++++++++++++++++++ wgengine/magicsock/magicsock.go | 8 ++++++-- 7 files changed, 69 insertions(+), 2 deletions(-) diff --git a/client/local/local.go b/client/local/local.go index 582c7b8487957..2382a12252a20 100644 --- a/client/local/local.go +++ b/client/local/local.go @@ -596,6 +596,19 @@ func (lc *Client) DebugResultJSON(ctx context.Context, action string) (any, erro return x, nil } +// QueryOptionalFeatures queries the optional features supported by the Tailscale daemon. +func (lc *Client) QueryOptionalFeatures(ctx context.Context) (*apitype.OptionalFeatures, error) { + body, err := lc.send(ctx, "POST", "/localapi/v0/debug-optional-features", 200, nil) + if err != nil { + return nil, fmt.Errorf("error %w: %s", err, body) + } + var x apitype.OptionalFeatures + if err := json.Unmarshal(body, &x); err != nil { + return nil, err + } + return &x, nil +} + // SetDevStoreKeyValue set a statestore key/value. It's only meant for development. // The schema (including when keys are re-read) is not a stable interface. func (lc *Client) SetDevStoreKeyValue(ctx context.Context, key, value string) error { diff --git a/client/tailscale/apitype/apitype.go b/client/tailscale/apitype/apitype.go index 58cdcecc78d4f..6d239d082cd95 100644 --- a/client/tailscale/apitype/apitype.go +++ b/client/tailscale/apitype/apitype.go @@ -94,3 +94,13 @@ type DNSQueryResponse struct { // Resolvers is the list of resolvers that the forwarder deemed able to resolve the query. Resolvers []*dnstype.Resolver } + +// OptionalFeatures describes which optional features are enabled in the build. +type OptionalFeatures struct { + // Features is the map of optional feature names to whether they are + // enabled. + // + // Disabled features may be absent from the map. (That is, false values + // are not guaranteed to be present.) + Features map[string]bool +} diff --git a/feature/feature.go b/feature/feature.go index 0d383b398ab60..110b104daae00 100644 --- a/feature/feature.go +++ b/feature/feature.go @@ -13,6 +13,12 @@ var ErrUnavailable = errors.New("feature not included in this build") var in = map[string]bool{} +// Registered reports the set of registered features. +// +// The returned map should not be modified by the caller, +// not accessed concurrently with calls to Register. +func Registered() map[string]bool { return in } + // Register notes that the named feature is linked into the binary. func Register(name string) { if _, ok := in[name]; ok { diff --git a/feature/portmapper/portmapper.go b/feature/portmapper/portmapper.go index e7be00ad17d8c..d1b903cb69c20 100644 --- a/feature/portmapper/portmapper.go +++ b/feature/portmapper/portmapper.go @@ -6,6 +6,7 @@ package portmapper import ( + "tailscale.com/feature" "tailscale.com/net/netmon" "tailscale.com/net/portmapper" "tailscale.com/net/portmapper/portmappertype" @@ -14,6 +15,7 @@ import ( ) func init() { + feature.Register("portmapper") portmappertype.HookNewPortMapper.Set(newPortMapper) } diff --git a/ipn/localapi/debug.go b/ipn/localapi/debug.go index b3b919d31ede2..8aca7f0093f7d 100644 --- a/ipn/localapi/debug.go +++ b/ipn/localapi/debug.go @@ -19,6 +19,7 @@ import ( "sync" "time" + "tailscale.com/client/tailscale/apitype" "tailscale.com/feature" "tailscale.com/feature/buildfeatures" "tailscale.com/ipn" @@ -39,6 +40,7 @@ func init() { Register("debug-packet-filter-matches", (*Handler).serveDebugPacketFilterMatches) Register("debug-packet-filter-rules", (*Handler).serveDebugPacketFilterRules) Register("debug-peer-endpoint-changes", (*Handler).serveDebugPeerEndpointChanges) + Register("debug-optional-features", (*Handler).serveDebugOptionalFeatures) } func (h *Handler) serveDebugPeerEndpointChanges(w http.ResponseWriter, r *http.Request) { @@ -463,3 +465,11 @@ func (h *Handler) serveDebugLog(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusNoContent) } + +func (h *Handler) serveDebugOptionalFeatures(w http.ResponseWriter, r *http.Request) { + of := &apitype.OptionalFeatures{ + Features: feature.Registered(), + } + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(of) +} diff --git a/tstest/integration/integration_test.go b/tstest/integration/integration_test.go index 234bb8c6ec11a..64f49c7b80afd 100644 --- a/tstest/integration/integration_test.go +++ b/tstest/integration/integration_test.go @@ -175,6 +175,28 @@ func TestControlKnobs(t *testing.T) { } } +func TestExpectedFeaturesLinked(t *testing.T) { + tstest.Shard(t) + tstest.Parallel(t) + env := NewTestEnv(t) + n1 := NewTestNode(t, env) + + d1 := n1.StartDaemon() + n1.AwaitResponding() + lc := n1.LocalClient() + got, err := lc.QueryOptionalFeatures(t.Context()) + if err != nil { + t.Fatal(err) + } + if !got.Features["portmapper"] { + t.Errorf("optional feature portmapper unexpectedly not found: got %v", got.Features) + } + + d1.MustCleanShutdown(t) + + t.Logf("number of HTTP logcatcher requests: %v", env.LogCatcher.numRequests()) +} + func TestCollectPanic(t *testing.T) { flakytest.Mark(t, "https://github.com/tailscale/tailscale/issues/15865") tstest.Shard(t) diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index e3c2d478e9882..6584789017624 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -719,9 +719,13 @@ func NewConn(opts Options) (*Conn, error) { newPortMapper, ok := portmappertype.HookNewPortMapper.GetOk() if ok { c.portMapper = newPortMapper(portmapperLogf, opts.EventBus, opts.NetMon, disableUPnP, c.onlyTCP443.Load) - } else if !testenv.InTest() { - panic("unexpected: HookNewPortMapper not set") } + // If !ok, the HookNewPortMapper hook is not set (so feature/portmapper + // isn't linked), but the build tag to explicitly omit the portmapper + // isn't set either. This should only happen to js/wasm builds, where + // the portmapper is a no-op even if linked (but it's no longer linked, + // since the move to feature/portmapper), or if people are wiring up + // their own Tailscale build from pieces. } c.netMon = opts.NetMon From 2dd72f6ec271dc419e1727d5b9c30ada41f423e5 Mon Sep 17 00:00:00 2001 From: "M. J. Fromberger" Date: Tue, 28 Oct 2025 08:45:22 -0700 Subject: [PATCH 05/25] Revert "logtail: avoid racing eventbus subscriptions with Shutdown (#17639)" (#17684) This reverts commit 4346615d77a6de16854c6e78f9d49375d6424e6e. We averted the shutdown race, but will need to service the subscriber even when we are not waiting for a change so that we do not delay the bus as a whole. Updates #17638 Change-Id: I5488466ed83f5ad1141c95267f5ae54878a24657 Signed-off-by: M. J. Fromberger (cherry picked from commit db5815fb978db0873752618d4531ee2ac9f5f83d) --- logtail/logtail.go | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/logtail/logtail.go b/logtail/logtail.go index 52823fedf4309..675422890149c 100644 --- a/logtail/logtail.go +++ b/logtail/logtail.go @@ -124,7 +124,6 @@ func NewLogger(cfg Config, logf tslogger.Logf) *Logger { if cfg.Bus != nil { l.eventClient = cfg.Bus.Client("logtail.Logger") - l.changeDeltaSub = eventbus.Subscribe[netmon.ChangeDelta](l.eventClient) } l.SetSockstatsLabel(sockstats.LabelLogtailLogger) l.compressLogs = cfg.CompressLogs @@ -163,7 +162,6 @@ type Logger struct { httpDoCalls atomic.Int32 sockstatsLabel atomicSocktatsLabel eventClient *eventbus.Client - changeDeltaSub *eventbus.Subscriber[netmon.ChangeDelta] procID uint32 includeProcSequence bool @@ -429,23 +427,8 @@ func (l *Logger) internetUp() bool { func (l *Logger) awaitInternetUp(ctx context.Context) { if l.eventClient != nil { - for { - if l.internetUp() { - return - } - select { - case <-ctx.Done(): - return // give up - case <-l.changeDeltaSub.Done(): - return // give up (closing down) - case delta := <-l.changeDeltaSub.Events(): - if delta.New.AnyInterfaceUp() || l.internetUp() { - fmt.Fprintf(l.stderr, "logtail: internet back up\n") - return - } - fmt.Fprintf(l.stderr, "logtail: network changed, but is not up") - } - } + l.awaitInternetUpBus(ctx) + return } upc := make(chan bool, 1) defer l.netMonitor.RegisterChangeCallback(func(delta *netmon.ChangeDelta) { @@ -466,6 +449,24 @@ func (l *Logger) awaitInternetUp(ctx context.Context) { } } +func (l *Logger) awaitInternetUpBus(ctx context.Context) { + if l.internetUp() { + return + } + sub := eventbus.Subscribe[netmon.ChangeDelta](l.eventClient) + defer sub.Close() + select { + case delta := <-sub.Events(): + if delta.New.AnyInterfaceUp() { + fmt.Fprintf(l.stderr, "logtail: internet back up\n") + return + } + fmt.Fprintf(l.stderr, "logtail: network changed, but is not up") + case <-ctx.Done(): + return + } +} + // upload uploads body to the log server. // origlen indicates the pre-compression body length. // origlen of -1 indicates that the body is not compressed. From 68cba300e4903d87f3f315e451fc70e67c58c8e6 Mon Sep 17 00:00:00 2001 From: Nick Khyl Date: Tue, 28 Oct 2025 13:29:24 -0500 Subject: [PATCH 06/25] VERSION.txt: this is v1.90.4 Signed-off-by: Nick Khyl --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index 604e786f2b495..fc9befffdd888 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.90.3 +1.90.4 From 1a6c31538eb2c9e392f81ed9859df4581702e771 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Wed, 29 Oct 2025 13:02:29 -0700 Subject: [PATCH 07/25] sessionrecording: fix regression in recent http2 package change In 3f5c560fd45664813 I changed to use std net/http's HTTP/2 support, instead of pulling in x/net/http2. But I forgot to update DialTLSContext to DialContext, which meant it was falling back to using the std net.Dialer for its dials, instead of the passed-in one. The tests only passed because they were using localhost addresses, so the std net.Dialer worked. But in prod, where a tsnet Dialer would be needed, it didn't work, and would time out for 10 seconds before resorting to the old protocol. So this fixes the tests to use an isolated in-memory network to prevent that class of problem in the future. With the test change, the old code fails and the new code passes. Thanks to @jasonodonnell for debugging! Updates #17304 Updates 3f5c560fd45664813 Change-Id: I3602bafd07dc6548e2c62985af9ac0afb3a0e967 Signed-off-by: Brad Fitzpatrick (cherry picked from commit 89962546471472823f4fce7877ca7f906c07ecb0) --- sessionrecording/connect.go | 5 +---- sessionrecording/connect_test.go | 14 ++++++++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/sessionrecording/connect.go b/sessionrecording/connect.go index 8abf9dd7e9142..9d20b41f9b31a 100644 --- a/sessionrecording/connect.go +++ b/sessionrecording/connect.go @@ -405,10 +405,7 @@ func clientHTTP2(dialCtx context.Context, dial netx.DialFunc) *http.Client { return &http.Client{ Transport: &http.Transport{ Protocols: &p, - // Pretend like we're using TLS, but actually use the provided - // DialFunc underneath. This is necessary to convince the transport - // to actually dial. - DialTLSContext: func(ctx context.Context, network, addr string) (net.Conn, error) { + DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) { perAttemptCtx, cancel := context.WithTimeout(ctx, perDialAttemptTimeout) defer cancel() go func() { diff --git a/sessionrecording/connect_test.go b/sessionrecording/connect_test.go index cacf061d79b79..e834828f5a6cc 100644 --- a/sessionrecording/connect_test.go +++ b/sessionrecording/connect_test.go @@ -21,6 +21,7 @@ import ( "golang.org/x/net/http2" "golang.org/x/net/http2/h2c" + "tailscale.com/net/memnet" ) func TestConnectToRecorder(t *testing.T) { @@ -145,7 +146,14 @@ func TestConnectToRecorder(t *testing.T) { t.Run(tt.desc, func(t *testing.T) { mux, uploadHash := tt.setup(t) - srv := httptest.NewUnstartedServer(mux) + memNet := &memnet.Network{} + ln := memNet.NewLocalTCPListener() + + srv := &httptest.Server{ + Config: &http.Server{Handler: mux}, + Listener: ln, + } + if tt.http2 { // Wire up h2c-compatible HTTP/2 server. This is optional // because the v1 recorder didn't support HTTP/2 and we try to @@ -159,10 +167,8 @@ func TestConnectToRecorder(t *testing.T) { srv.Start() t.Cleanup(srv.Close) - d := new(net.Dialer) - ctx := context.Background() - w, _, errc, err := ConnectToRecorder(ctx, []netip.AddrPort{netip.MustParseAddrPort(srv.Listener.Addr().String())}, d.DialContext) + w, _, errc, err := ConnectToRecorder(ctx, []netip.AddrPort{netip.MustParseAddrPort(ln.Addr().String())}, memNet.Dial) if err != nil { t.Fatalf("ConnectToRecorder: %v", err) } From 300e6062bf15f3a0d901273a149c1919e4fd4190 Mon Sep 17 00:00:00 2001 From: Brad Fitzpatrick Date: Wed, 29 Oct 2025 13:21:23 -0700 Subject: [PATCH 08/25] cmd/k8s-operator/generate: skip tests if no network or Helm is down Updates helm/helm#31434 Change-Id: I5eb20e97ff543f883d5646c9324f50f54180851d Signed-off-by: Brad Fitzpatrick (cherry picked from commit d5a40c01ab5bc5e33ef2b0ec4bea3cbd38050f48) --- cmd/k8s-operator/generate/main.go | 2 +- cmd/k8s-operator/generate/main_test.go | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/cmd/k8s-operator/generate/main.go b/cmd/k8s-operator/generate/main.go index 6904f1df02ec0..5fd5d551b5e02 100644 --- a/cmd/k8s-operator/generate/main.go +++ b/cmd/k8s-operator/generate/main.go @@ -144,7 +144,7 @@ func generate(baseDir string) error { if _, err := file.Write([]byte(helmConditionalEnd)); err != nil { return fmt.Errorf("error writing helm if-statement end: %w", err) } - return nil + return file.Close() } for _, crd := range []struct { crdPath, templatePath string diff --git a/cmd/k8s-operator/generate/main_test.go b/cmd/k8s-operator/generate/main_test.go index c7956dcdbef8f..5ea7fec80971a 100644 --- a/cmd/k8s-operator/generate/main_test.go +++ b/cmd/k8s-operator/generate/main_test.go @@ -7,26 +7,50 @@ package main import ( "bytes" + "context" + "net" "os" "os/exec" "path/filepath" "strings" "testing" + "time" + + "tailscale.com/tstest/nettest" + "tailscale.com/util/cibuild" ) func Test_generate(t *testing.T) { + nettest.SkipIfNoNetwork(t) + + ctx, cancel := context.WithTimeout(t.Context(), 10*time.Second) + defer cancel() + if _, err := net.DefaultResolver.LookupIPAddr(ctx, "get.helm.sh"); err != nil { + // https://github.com/helm/helm/issues/31434 + t.Skipf("get.helm.sh seems down or unreachable; skipping test") + } + base, err := os.Getwd() base = filepath.Join(base, "../../../") if err != nil { t.Fatalf("error getting current working directory: %v", err) } defer cleanup(base) + + helmCLIPath := filepath.Join(base, "tool/helm") + if out, err := exec.Command(helmCLIPath, "version").CombinedOutput(); err != nil && cibuild.On() { + // It's not just DNS. Azure is generating bogus certs within GitHub Actions at least for + // helm. So try to run it and see if we can even fetch it. + // + // https://github.com/helm/helm/issues/31434 + t.Skipf("error fetching helm; skipping test in CI: %v, %s", err, out) + } + if err := generate(base); err != nil { t.Fatalf("CRD template generation: %v", err) } tempDir := t.TempDir() - helmCLIPath := filepath.Join(base, "tool/helm") helmChartTemplatesPath := filepath.Join(base, "cmd/k8s-operator/deploy/chart") helmPackageCmd := exec.Command(helmCLIPath, "package", helmChartTemplatesPath, "--destination", tempDir, "--version", "0.0.1") helmPackageCmd.Stderr = os.Stderr From 63242007ae0430ee99e6f63664d62d5237013142 Mon Sep 17 00:00:00 2001 From: Nick Khyl Date: Thu, 30 Oct 2025 12:38:25 -0500 Subject: [PATCH 09/25] VERSION.txt: this is v1.90.5 Signed-off-by: Nick Khyl --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index fc9befffdd888..b0397692828e0 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.90.4 +1.90.5 From faca4c08b70dd54b433c33c211a26fff9fc3d3d0 Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Tue, 21 Oct 2025 09:52:23 +0100 Subject: [PATCH 10/25] .github/workflows: pin the google/oss-fuzz GitHub Actions Updates https://github.com/tailscale/corp/issues/31017 Signed-off-by: Alex Chan (cherry picked from commit 3944809a118153b83aa0a606e515e20b6fe6190b) --- .github/workflows/test.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c3aa4f1bca1ff..b6d41e937c2db 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -613,7 +613,9 @@ jobs: steps: - name: build fuzzers id: build - uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master + # As of 21 October 2025, this repo doesn't tag releases, so this commit + # hash is just the tip of master. + uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@1242ccb5b6352601e73c00f189ac2ae397242264 # continue-on-error makes steps.build.conclusion be 'success' even if # steps.build.outcome is 'failure'. This means this step does not # contribute to the job's overall pass/fail evaluation. @@ -643,7 +645,9 @@ jobs: # report a failure because TS_FUZZ_CURRENTLY_BROKEN is set to the wrong # value. if: steps.build.outcome == 'success' - uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master + # As of 21 October 2025, this repo doesn't tag releases, so this commit + # hash is just the tip of master. + uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@1242ccb5b6352601e73c00f189ac2ae397242264 with: oss-fuzz-project-name: 'tailscale' fuzz-seconds: 150 From 6e2f2bb31aaaf78f730456795a935c7eec24ad9a Mon Sep 17 00:00:00 2001 From: "M. J. Fromberger" Date: Wed, 29 Oct 2025 08:37:19 -0700 Subject: [PATCH 11/25] ipn/ipnlocal: do not stall event processing for appc route updates (#17663) A follow-up to #17411. Put AppConnector events into a task queue, as they may take some time to process. Ensure that the queue is stopped at shutdown so that cleanup will remain orderly. Because events are delivered on a separate goroutine, slow processing of an event does not cause an immediate problem; however, a subscriber that blocks for a long time will push back on the bus as a whole. See https://godoc.org/tailscale.com/util/eventbus#hdr-Expected_subscriber_behavior for more discussion. Updates #17192 Updates #15160 Change-Id: Ib313cc68aec273daf2b1ad79538266c81ef063e3 Signed-off-by: M. J. Fromberger (cherry picked from commit 06b092388e4efb2226a264a03df14b778505278c) --- ipn/ipnlocal/local.go | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/ipn/ipnlocal/local.go b/ipn/ipnlocal/local.go index 8cc74c41ebc60..6c62a07269900 100644 --- a/ipn/ipnlocal/local.go +++ b/ipn/ipnlocal/local.go @@ -87,6 +87,7 @@ import ( "tailscale.com/util/clientmetric" "tailscale.com/util/dnsname" "tailscale.com/util/eventbus" + "tailscale.com/util/execqueue" "tailscale.com/util/goroutines" "tailscale.com/util/mak" "tailscale.com/util/osuser" @@ -187,6 +188,7 @@ type LocalBackend struct { statsLogf logger.Logf // for printing peers stats on change sys *tsd.System eventSubs eventbus.Monitor + appcTask execqueue.ExecQueue // handles updates from appc health *health.Tracker // always non-nil polc policyclient.Client // always non-nil @@ -642,12 +644,14 @@ func (b *LocalBackend) consumeEventbusTopics(ec *eventbus.Client) func(*eventbus // We need to find a way to ensure that changes to the backend state are applied // consistently in the presnce of profile changes, which currently may not happen in // a single atomic step. See: https://github.com/tailscale/tailscale/issues/17414 - if err := b.AdvertiseRoute(ru.Advertise...); err != nil { - b.logf("appc: failed to advertise routes: %v: %v", ru.Advertise, err) - } - if err := b.UnadvertiseRoute(ru.Unadvertise...); err != nil { - b.logf("appc: failed to unadvertise routes: %v: %v", ru.Unadvertise, err) - } + b.appcTask.Add(func() { + if err := b.AdvertiseRoute(ru.Advertise...); err != nil { + b.logf("appc: failed to advertise routes: %v: %v", ru.Advertise, err) + } + if err := b.UnadvertiseRoute(ru.Unadvertise...); err != nil { + b.logf("appc: failed to unadvertise routes: %v: %v", ru.Unadvertise, err) + } + }) case ri := <-storeRoutesSub.Events(): // Whether or not routes should be stored can change over time. shouldStoreRoutes := b.ControlKnobs().AppCStoreRoutes.Load() @@ -1113,6 +1117,7 @@ func (b *LocalBackend) Shutdown() { // they can deadlock with c.Shutdown(). // 2. LocalBackend.consumeEventbusTopics event handlers may not guard against // undesirable post/in-progress LocalBackend.Shutdown() behaviors. + b.appcTask.Shutdown() b.eventSubs.Close() b.em.close() From b6eabd403862f9ccc2fd11ecd330fe85592be583 Mon Sep 17 00:00:00 2001 From: "M. J. Fromberger" Date: Thu, 30 Oct 2025 14:40:57 -0700 Subject: [PATCH 12/25] util/eventbus: allow logging of slow subscribers (#17705) Add options to the eventbus.Bus to plumb in a logger. Route that logger in to the subscriber machinery, and trigger a log message to it when a subscriber fails to respond to its delivered events for 5s or more. The log message includes the package, filename, and line number of the call site that created the subscription. Add tests that verify this works. Updates #17680 Change-Id: I0546516476b1e13e6a9cf79f19db2fe55e56c698 Signed-off-by: M. J. Fromberger (cherry picked from commit 061e6266cf4e9c9a0f06b0d60d4d7840f6b7678d) --- flake.nix | 2 +- go.mod | 2 +- go.mod.sri | 2 +- go.sum | 4 +-- shell.nix | 2 +- util/eventbus/bus.go | 35 ++++++++++++++++-- util/eventbus/bus_test.go | 73 ++++++++++++++++++++++++++++++++++++++ util/eventbus/client.go | 7 ++-- util/eventbus/debug.go | 36 +++++++++++++++++++ util/eventbus/subscribe.go | 35 ++++++++++++++++-- 10 files changed, 185 insertions(+), 13 deletions(-) diff --git a/flake.nix b/flake.nix index 726757f7a76b7..e8b5420c55cef 100644 --- a/flake.nix +++ b/flake.nix @@ -151,5 +151,5 @@ }); }; } -# nix-direnv cache busting line: sha256-rV3C2Vi48FCifGt58OdEO4+Av0HRIs8sUJVvp/gEBLw= +# nix-direnv cache busting line: sha256-AUOjLomba75qfzb9Vxc0Sktyeces6hBSuOMgboWcDnE= diff --git a/go.mod b/go.mod index 3c281fa7a34bf..96df00f65044f 100644 --- a/go.mod +++ b/go.mod @@ -42,7 +42,7 @@ require ( github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da github.com/golang/snappy v0.0.4 github.com/golangci/golangci-lint v1.57.1 - github.com/google/go-cmp v0.6.0 + github.com/google/go-cmp v0.7.0 github.com/google/go-containerregistry v0.20.3 github.com/google/go-tpm v0.9.4 github.com/google/gopacket v1.1.19 diff --git a/go.mod.sri b/go.mod.sri index f94054422c6d7..790d851a19a94 100644 --- a/go.mod.sri +++ b/go.mod.sri @@ -1 +1 @@ -sha256-rV3C2Vi48FCifGt58OdEO4+Av0HRIs8sUJVvp/gEBLw= +sha256-AUOjLomba75qfzb9Vxc0Sktyeces6hBSuOMgboWcDnE= diff --git a/go.sum b/go.sum index bc386d1fdb37f..9864eab71aa9a 100644 --- a/go.sum +++ b/go.sum @@ -490,8 +490,8 @@ github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-containerregistry v0.20.3 h1:oNx7IdTI936V8CQRveCjaxOiegWwvM7kqkbXTpyiovI= github.com/google/go-containerregistry v0.20.3/go.mod h1:w00pIgBRDVUDFM6bq+Qx8lwNWK+cxgCuX1vd3PIBDNI= github.com/google/go-github/v66 v66.0.0 h1:ADJsaXj9UotwdgK8/iFZtv7MLc8E8WBl62WLd/D/9+M= diff --git a/shell.nix b/shell.nix index ec345998afe30..99fc7fa4de547 100644 --- a/shell.nix +++ b/shell.nix @@ -16,4 +16,4 @@ ) { src = ./.; }).shellNix -# nix-direnv cache busting line: sha256-rV3C2Vi48FCifGt58OdEO4+Av0HRIs8sUJVvp/gEBLw= +# nix-direnv cache busting line: sha256-AUOjLomba75qfzb9Vxc0Sktyeces6hBSuOMgboWcDnE= diff --git a/util/eventbus/bus.go b/util/eventbus/bus.go index d1507d8e67587..b1639136a5133 100644 --- a/util/eventbus/bus.go +++ b/util/eventbus/bus.go @@ -5,10 +5,12 @@ package eventbus import ( "context" + "log" "reflect" "slices" "sync" + "tailscale.com/types/logger" "tailscale.com/util/set" ) @@ -30,6 +32,7 @@ type Bus struct { write chan PublishedEvent snapshot chan chan []PublishedEvent routeDebug hook[RoutedEvent] + logf logger.Logf topicsMu sync.Mutex topics map[reflect.Type][]*subscribeState @@ -40,19 +43,42 @@ type Bus struct { clients set.Set[*Client] } -// New returns a new bus. Use [Publish] to make event publishers, -// and [Subscribe] and [SubscribeFunc] to make event subscribers. -func New() *Bus { +// New returns a new bus with default options. It is equivalent to +// calling [NewWithOptions] with zero [BusOptions]. +func New() *Bus { return NewWithOptions(BusOptions{}) } + +// NewWithOptions returns a new [Bus] with the specified [BusOptions]. +// Use [Bus.Client] to construct clients on the bus. +// Use [Publish] to make event publishers. +// Use [Subscribe] and [SubscribeFunc] to make event subscribers. +func NewWithOptions(opts BusOptions) *Bus { ret := &Bus{ write: make(chan PublishedEvent), snapshot: make(chan chan []PublishedEvent), topics: map[reflect.Type][]*subscribeState{}, clients: set.Set[*Client]{}, + logf: opts.logger(), } ret.router = runWorker(ret.pump) return ret } +// BusOptions are optional parameters for a [Bus]. A zero value is ready for +// use and provides defaults as described. +type BusOptions struct { + // Logf, if non-nil, is used for debug logs emitted by the bus and clients, + // publishers, and subscribers under its care. If it is nil, logs are sent + // to [log.Printf]. + Logf logger.Logf +} + +func (o BusOptions) logger() logger.Logf { + if o.Logf == nil { + return log.Printf + } + return o.Logf +} + // Client returns a new client with no subscriptions. Use [Subscribe] // to receive events, and [Publish] to emit events. // @@ -166,6 +192,9 @@ func (b *Bus) pump(ctx context.Context) { } } +// logger returns a [logger.Logf] to which logs related to bus activity should be written. +func (b *Bus) logger() logger.Logf { return b.logf } + func (b *Bus) dest(t reflect.Type) []*subscribeState { b.topicsMu.Lock() defer b.topicsMu.Unlock() diff --git a/util/eventbus/bus_test.go b/util/eventbus/bus_test.go index de292cf1adb5b..1e0cd8abf2cff 100644 --- a/util/eventbus/bus_test.go +++ b/util/eventbus/bus_test.go @@ -4,8 +4,11 @@ package eventbus_test import ( + "bytes" "errors" "fmt" + "log" + "regexp" "testing" "testing/synctest" "time" @@ -436,6 +439,76 @@ func TestMonitor(t *testing.T) { t.Run("Wait", testMon(t, func(c *eventbus.Client, m eventbus.Monitor) { c.Close(); m.Wait() })) } +func TestSlowSubs(t *testing.T) { + swapLogBuf := func(t *testing.T) *bytes.Buffer { + logBuf := new(bytes.Buffer) + save := log.Writer() + log.SetOutput(logBuf) + t.Cleanup(func() { log.SetOutput(save) }) + return logBuf + } + + t.Run("Subscriber", func(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + buf := swapLogBuf(t) + + b := eventbus.New() + defer b.Close() + + pc := b.Client("pub") + p := eventbus.Publish[EventA](pc) + + sc := b.Client("sub") + s := eventbus.Subscribe[EventA](sc) + + go func() { + time.Sleep(6 * time.Second) // trigger the slow check at 5s. + t.Logf("Subscriber accepted %v", <-s.Events()) + }() + + p.Publish(EventA{12345}) + + time.Sleep(7 * time.Second) // advance time... + synctest.Wait() // subscriber is done + + want := regexp.MustCompile(`^.* tailscale.com/util/eventbus_test bus_test.go:\d+: ` + + `subscriber for eventbus_test.EventA is slow.*`) + if got := buf.String(); !want.MatchString(got) { + t.Errorf("Wrong log output\ngot: %q\nwant: %s", got, want) + } + }) + }) + + t.Run("SubscriberFunc", func(t *testing.T) { + synctest.Test(t, func(t *testing.T) { + buf := swapLogBuf(t) + + b := eventbus.New() + defer b.Close() + + pc := b.Client("pub") + p := eventbus.Publish[EventB](pc) + + sc := b.Client("sub") + eventbus.SubscribeFunc[EventB](sc, func(e EventB) { + time.Sleep(6 * time.Second) // trigger the slow check at 5s. + t.Logf("SubscriberFunc processed %v", e) + }) + + p.Publish(EventB{67890}) + + time.Sleep(7 * time.Second) // advance time... + synctest.Wait() // subscriber is done + + want := regexp.MustCompile(`^.* tailscale.com/util/eventbus_test bus_test.go:\d+: ` + + `subscriber for eventbus_test.EventB is slow.*`) + if got := buf.String(); !want.MatchString(got) { + t.Errorf("Wrong log output\ngot: %q\nwant: %s", got, want) + } + }) + }) +} + func TestRegression(t *testing.T) { bus := eventbus.New() t.Cleanup(bus.Close) diff --git a/util/eventbus/client.go b/util/eventbus/client.go index 9e3f3ee76cc31..c119c67a939c2 100644 --- a/util/eventbus/client.go +++ b/util/eventbus/client.go @@ -7,6 +7,7 @@ import ( "reflect" "sync" + "tailscale.com/types/logger" "tailscale.com/util/set" ) @@ -29,6 +30,8 @@ type Client struct { func (c *Client) Name() string { return c.name } +func (c *Client) logger() logger.Logf { return c.bus.logger() } + // Close closes the client. It implicitly closes all publishers and // subscribers obtained from this client. func (c *Client) Close() { @@ -142,7 +145,7 @@ func Subscribe[T any](c *Client) *Subscriber[T] { } r := c.subscribeStateLocked() - s := newSubscriber[T](r) + s := newSubscriber[T](r, logfForCaller(c.logger())) r.addSubscriber(s) return s } @@ -165,7 +168,7 @@ func SubscribeFunc[T any](c *Client, f func(T)) *SubscriberFunc[T] { } r := c.subscribeStateLocked() - s := newSubscriberFunc[T](r, f) + s := newSubscriberFunc[T](r, f, logfForCaller(c.logger())) r.addSubscriber(s) return s } diff --git a/util/eventbus/debug.go b/util/eventbus/debug.go index 6d5463bece7b2..2f2c9589ad0e2 100644 --- a/util/eventbus/debug.go +++ b/util/eventbus/debug.go @@ -6,12 +6,22 @@ package eventbus import ( "cmp" "fmt" + "path/filepath" "reflect" + "runtime" "slices" + "strings" "sync" "sync/atomic" + "time" + + "tailscale.com/types/logger" ) +// slowSubscriberTimeout is a timeout after which a subscriber that does not +// accept a pending event will be flagged as being slow. +const slowSubscriberTimeout = 5 * time.Second + // A Debugger offers access to a bus's privileged introspection and // debugging facilities. // @@ -204,3 +214,29 @@ type DebugTopic struct { Publisher string Subscribers []string } + +// logfForCaller returns a [logger.Logf] that prefixes its output with the +// package, filename, and line number of the caller's caller. +// If logf == nil, it returns [logger.Discard]. +// If the caller location could not be determined, it returns logf unmodified. +func logfForCaller(logf logger.Logf) logger.Logf { + if logf == nil { + return logger.Discard + } + pc, fpath, line, _ := runtime.Caller(2) // +1 for my caller, +1 for theirs + if f := runtime.FuncForPC(pc); f != nil { + return logger.WithPrefix(logf, fmt.Sprintf("%s %s:%d: ", funcPackageName(f.Name()), filepath.Base(fpath), line)) + } + return logf +} + +func funcPackageName(funcName string) string { + ls := max(strings.LastIndex(funcName, "/"), 0) + for { + i := strings.LastIndex(funcName, ".") + if i <= ls { + return funcName + } + funcName = funcName[:i] + } +} diff --git a/util/eventbus/subscribe.go b/util/eventbus/subscribe.go index c35c7e7f05682..0b821b3f51586 100644 --- a/util/eventbus/subscribe.go +++ b/util/eventbus/subscribe.go @@ -8,6 +8,9 @@ import ( "fmt" "reflect" "sync" + "time" + + "tailscale.com/types/logger" ) type DeliveredEvent struct { @@ -182,12 +185,18 @@ type Subscriber[T any] struct { stop stopFlag read chan T unregister func() + logf logger.Logf + slow *time.Timer // used to detect slow subscriber service } -func newSubscriber[T any](r *subscribeState) *Subscriber[T] { +func newSubscriber[T any](r *subscribeState, logf logger.Logf) *Subscriber[T] { + slow := time.NewTimer(0) + slow.Stop() // reset in dispatch return &Subscriber[T]{ read: make(chan T), unregister: func() { r.deleteSubscriber(reflect.TypeFor[T]()) }, + logf: logf, + slow: slow, } } @@ -212,6 +221,11 @@ func (s *Subscriber[T]) monitor(debugEvent T) { func (s *Subscriber[T]) dispatch(ctx context.Context, vals *queue[DeliveredEvent], acceptCh func() chan DeliveredEvent, snapshot chan chan []DeliveredEvent) bool { t := vals.Peek().Event.(T) + + start := time.Now() + s.slow.Reset(slowSubscriberTimeout) + defer s.slow.Stop() + for { // Keep the cases in this select in sync with subscribeState.pump // above. The only difference should be that this select @@ -226,6 +240,9 @@ func (s *Subscriber[T]) dispatch(ctx context.Context, vals *queue[DeliveredEvent return false case ch := <-snapshot: ch <- vals.Snapshot() + case <-s.slow.C: + s.logf("subscriber for %T is slow (%v elapsed)", t, time.Since(start)) + s.slow.Reset(slowSubscriberTimeout) } } } @@ -260,12 +277,18 @@ type SubscriberFunc[T any] struct { stop stopFlag read func(T) unregister func() + logf logger.Logf + slow *time.Timer // used to detect slow subscriber service } -func newSubscriberFunc[T any](r *subscribeState, f func(T)) *SubscriberFunc[T] { +func newSubscriberFunc[T any](r *subscribeState, f func(T), logf logger.Logf) *SubscriberFunc[T] { + slow := time.NewTimer(0) + slow.Stop() // reset in dispatch return &SubscriberFunc[T]{ read: f, unregister: func() { r.deleteSubscriber(reflect.TypeFor[T]()) }, + logf: logf, + slow: slow, } } @@ -285,6 +308,11 @@ func (s *SubscriberFunc[T]) dispatch(ctx context.Context, vals *queue[DeliveredE t := vals.Peek().Event.(T) callDone := make(chan struct{}) go s.runCallback(t, callDone) + + start := time.Now() + s.slow.Reset(slowSubscriberTimeout) + defer s.slow.Stop() + // Keep the cases in this select in sync with subscribeState.pump // above. The only difference should be that this select // delivers a value by calling s.read. @@ -299,6 +327,9 @@ func (s *SubscriberFunc[T]) dispatch(ctx context.Context, vals *queue[DeliveredE return false case ch := <-snapshot: ch <- vals.Snapshot() + case <-s.slow.C: + s.logf("subscriber for %T is slow (%v elapsed)", t, time.Since(start)) + s.slow.Reset(slowSubscriberTimeout) } } } From 28f6c2dbfc5d3f7dde8d566a9214f6e0f55a9d17 Mon Sep 17 00:00:00 2001 From: Nick Khyl Date: Fri, 31 Oct 2025 16:18:03 -0500 Subject: [PATCH 13/25] VERSION.txt: this is v1.90.6 Signed-off-by: Nick Khyl --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index b0397692828e0..34ad0d880160b 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.90.5 +1.90.6 From e602907cf549830ef8ad5e5d031950725bfc42e7 Mon Sep 17 00:00:00 2001 From: Jordan Whited Date: Mon, 10 Nov 2025 20:07:33 -0800 Subject: [PATCH 14/25] wgengine/magicsock: validate endpoint.derpAddr in Conn.onUDPRelayAllocResp (#17828) Otherwise a zero value will panic in Conn.sendUDPStd. Updates #17827 Signed-off-by: Jordan Whited (cherry picked from commit 18806de400a29b035a9985f22d1390a50e38fcab) --- wgengine/magicsock/magicsock.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index 6584789017624..a32b46cc958a2 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -651,7 +651,9 @@ func (c *Conn) onUDPRelayAllocResp(allocResp UDPRelayAllocResp) { ep.mu.Lock() defer ep.mu.Unlock() derpAddr := ep.derpAddr - go c.sendDiscoMessage(epAddr{ap: derpAddr}, ep.publicKey, disco.key, allocResp.Message, discoVerboseLog) + if derpAddr.IsValid() { + go c.sendDiscoMessage(epAddr{ap: derpAddr}, ep.publicKey, disco.key, allocResp.Message, discoVerboseLog) + } } // Synchronize waits for all [eventbus] events published From 771a9d29ffa77b084bbd34bfc89f6c96ae43559a Mon Sep 17 00:00:00 2001 From: Jordan Whited Date: Mon, 10 Nov 2025 21:08:13 -0800 Subject: [PATCH 15/25] wgengine/magicsock: fix UDPRelayAllocReq/Resp deadlock (#17831) Updates #17830 Signed-off-by: Jordan Whited (cherry picked from commit 2ad2d4d409e6b5eac5dbecb59ce307eb3297587c) --- wgengine/magicsock/magicsock.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/wgengine/magicsock/magicsock.go b/wgengine/magicsock/magicsock.go index a32b46cc958a2..6ee14164d0a99 100644 --- a/wgengine/magicsock/magicsock.go +++ b/wgengine/magicsock/magicsock.go @@ -2444,7 +2444,10 @@ func (c *Conn) handleDiscoMessage(msg []byte, src epAddr, shouldBeRelayHandshake if !nodeHasCap(c.filt, c.peers.At(peerI), c.self, tailcfg.PeerCapabilityRelay) { return } - c.allocRelayEndpointPub.Publish(UDPRelayAllocReq{ + // [Conn.mu] must not be held while publishing, or [Conn.onUDPRelayAllocResp] + // can deadlock as the req sub and resp pub are the same goroutine. + // See #17830. + go c.allocRelayEndpointPub.Publish(UDPRelayAllocReq{ RxFromDiscoKey: sender, RxFromNodeKey: nodeKey, Message: req, From eb03b354f6698cbda9f8bc65964b7ab80e22ea1a Mon Sep 17 00:00:00 2001 From: Jordan Whited Date: Wed, 12 Nov 2025 15:47:01 -0800 Subject: [PATCH 16/25] net/udprelay: replace VNI pool with selection algorithm (#17868) This reduces memory usage when tailscaled is acting as a peer relay. Updates #17801 Signed-off-by: Jordan Whited (cherry picked from commit f4f9dd7f8c95bdcdc84de7de7c0de4fb591b73d0) --- net/udprelay/server.go | 45 +++++++++++++++++++++++++++---------- net/udprelay/server_test.go | 23 +++++++++++++++++++ 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/net/udprelay/server.go b/net/udprelay/server.go index 83831dd698164..815b301a168f4 100644 --- a/net/udprelay/server.go +++ b/net/udprelay/server.go @@ -77,11 +77,17 @@ type Server struct { addrPorts []netip.AddrPort // the ip:port pairs returned as candidate endpoints closed bool lamportID uint64 - vniPool []uint32 // the pool of available VNIs + nextVNI uint32 byVNI map[uint32]*serverEndpoint byDisco map[key.SortedPairOfDiscoPublic]*serverEndpoint } +const ( + minVNI = uint32(1) + maxVNI = uint32(1<<24 - 1) + totalPossibleVNI = maxVNI - minVNI + 1 +) + // serverEndpoint contains Server-internal [endpoint.ServerEndpoint] state. // serverEndpoint methods are not thread-safe. type serverEndpoint struct { @@ -281,15 +287,10 @@ func NewServer(logf logger.Logf, port int, overrideAddrs []netip.Addr) (s *Serve steadyStateLifetime: defaultSteadyStateLifetime, closeCh: make(chan struct{}), byDisco: make(map[key.SortedPairOfDiscoPublic]*serverEndpoint), + nextVNI: minVNI, byVNI: make(map[uint32]*serverEndpoint), } s.discoPublic = s.disco.Public() - // TODO: instead of allocating 10s of MBs for the full pool, allocate - // smaller chunks and increase as needed - s.vniPool = make([]uint32, 0, 1<<24-1) - for i := 1; i < 1<<24; i++ { - s.vniPool = append(s.vniPool, uint32(i)) - } // TODO(creachadair): Find a way to plumb this in during initialization. // As-written, messages published here will not be seen by other components @@ -557,7 +558,6 @@ func (s *Server) Close() error { defer s.mu.Unlock() clear(s.byVNI) clear(s.byDisco) - s.vniPool = nil s.closed = true s.bus.Close() }) @@ -579,7 +579,6 @@ func (s *Server) endpointGCLoop() { if v.isExpired(now, s.bindLifetime, s.steadyStateLifetime) { delete(s.byDisco, k) delete(s.byVNI, v.vni) - s.vniPool = append(s.vniPool, v.vni) } } } @@ -714,6 +713,27 @@ func (e ErrServerNotReady) Error() string { return fmt.Sprintf("server not ready, retry after %v", e.RetryAfter) } +// getNextVNILocked returns the next available VNI. It implements the +// "Traditional BSD Port Selection Algorithm" from RFC6056. This algorithm does +// not attempt to obfuscate the selection, i.e. the selection is predictable. +// For now, we favor simplicity and reducing VNI re-use over more complex +// ephemeral port (VNI) selection algorithms. +func (s *Server) getNextVNILocked() (uint32, error) { + for i := uint32(0); i < totalPossibleVNI; i++ { + vni := s.nextVNI + if vni == maxVNI { + s.nextVNI = minVNI + } else { + s.nextVNI++ + } + _, ok := s.byVNI[vni] + if !ok { + return vni, nil + } + } + return 0, errors.New("VNI pool exhausted") +} + // AllocateEndpoint allocates an [endpoint.ServerEndpoint] for the provided pair // of [key.DiscoPublic]'s. If an allocation already exists for discoA and discoB // it is returned without modification/reallocation. AllocateEndpoint returns @@ -762,8 +782,9 @@ func (s *Server) AllocateEndpoint(discoA, discoB key.DiscoPublic) (endpoint.Serv }, nil } - if len(s.vniPool) == 0 { - return endpoint.ServerEndpoint{}, errors.New("VNI pool exhausted") + vni, err := s.getNextVNILocked() + if err != nil { + return endpoint.ServerEndpoint{}, err } s.lamportID++ @@ -771,10 +792,10 @@ func (s *Server) AllocateEndpoint(discoA, discoB key.DiscoPublic) (endpoint.Serv discoPubKeys: pair, lamportID: s.lamportID, allocatedAt: time.Now(), + vni: vni, } e.discoSharedSecrets[0] = s.disco.Shared(e.discoPubKeys.Get()[0]) e.discoSharedSecrets[1] = s.disco.Shared(e.discoPubKeys.Get()[1]) - e.vni, s.vniPool = s.vniPool[0], s.vniPool[1:] s.byDisco[pair] = e s.byVNI[e.vni] = e diff --git a/net/udprelay/server_test.go b/net/udprelay/server_test.go index 8fc4a4f78cb47..bf7f0a9b5f1de 100644 --- a/net/udprelay/server_test.go +++ b/net/udprelay/server_test.go @@ -10,6 +10,7 @@ import ( "testing" "time" + qt "github.com/frankban/quicktest" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "go4.org/mem" @@ -319,3 +320,25 @@ func TestServer(t *testing.T) { }) } } + +func TestServer_getNextVNILocked(t *testing.T) { + t.Parallel() + c := qt.New(t) + s := &Server{ + nextVNI: minVNI, + byVNI: make(map[uint32]*serverEndpoint), + } + for i := uint64(0); i < uint64(totalPossibleVNI); i++ { + vni, err := s.getNextVNILocked() + if err != nil { // using quicktest here triples test time + t.Fatal(err) + } + s.byVNI[vni] = nil + } + c.Assert(s.nextVNI, qt.Equals, minVNI) + _, err := s.getNextVNILocked() + c.Assert(err, qt.IsNotNil) + delete(s.byVNI, minVNI) + _, err = s.getNextVNILocked() + c.Assert(err, qt.IsNil) +} From 0f421d3def15f10660ed06e9c29dca82e89b25cd Mon Sep 17 00:00:00 2001 From: Jordan Whited Date: Thu, 13 Nov 2025 20:57:48 -0800 Subject: [PATCH 17/25] feature/relayserver,ipn/ipnlocal,net/udprelay: plumb DERPMap (#17881) This commit replaces usage of local.Client in net/udprelay with DERPMap plumbing over the eventbus. This has been a longstanding TODO. This work was also accelerated by a memory leak in net/http when using local.Client over long periods of time. So, this commit also addresses said leak. Updates #17801 Signed-off-by: Jordan Whited (cherry picked from commit 9e4d1fd87fc3ab6cfa1b91c7a7c3ced53348fb02) --- feature/relayserver/relayserver.go | 220 ++++++++++++----------- feature/relayserver/relayserver_test.go | 222 +++++++++++++++++++----- ipn/ipnlocal/node_backend.go | 13 +- net/udprelay/server.go | 38 ++-- 4 files changed, 324 insertions(+), 169 deletions(-) diff --git a/feature/relayserver/relayserver.go b/feature/relayserver/relayserver.go index df2fb4cb7c165..2646a0cbfee6e 100644 --- a/feature/relayserver/relayserver.go +++ b/feature/relayserver/relayserver.go @@ -21,8 +21,10 @@ import ( "tailscale.com/ipn/ipnext" "tailscale.com/ipn/localapi" "tailscale.com/net/udprelay" + "tailscale.com/net/udprelay/endpoint" "tailscale.com/net/udprelay/status" "tailscale.com/tailcfg" + "tailscale.com/types/key" "tailscale.com/types/logger" "tailscale.com/types/ptr" "tailscale.com/util/eventbus" @@ -68,25 +70,41 @@ func servePeerRelayDebugSessions(h *localapi.Handler, w http.ResponseWriter, r * // extension. It is registered with [ipnext.RegisterExtension] if the package is // imported. func newExtension(logf logger.Logf, sb ipnext.SafeBackend) (ipnext.Extension, error) { - return &extension{ + e := &extension{ + newServerFn: func(logf logger.Logf, port int, overrideAddrs []netip.Addr) (relayServer, error) { + return udprelay.NewServer(logf, port, overrideAddrs) + }, logf: logger.WithPrefix(logf, featureName+": "), - bus: sb.Sys().Bus.Get(), - }, nil + } + e.ec = sb.Sys().Bus.Get().Client("relayserver.extension") + e.respPub = eventbus.Publish[magicsock.UDPRelayAllocResp](e.ec) + eventbus.SubscribeFunc(e.ec, e.onDERPMapView) + eventbus.SubscribeFunc(e.ec, e.onAllocReq) + return e, nil +} + +// relayServer is an interface for [udprelay.Server]. +type relayServer interface { + Close() error + AllocateEndpoint(discoA, discoB key.DiscoPublic) (endpoint.ServerEndpoint, error) + GetSessions() []status.ServerSession + SetDERPMapView(tailcfg.DERPMapView) } // extension is an [ipnext.Extension] managing the relay server on platforms // that import this package. type extension struct { - logf logger.Logf - bus *eventbus.Bus + newServerFn func(logf logger.Logf, port int, overrideAddrs []netip.Addr) (relayServer, error) // swappable for tests + logf logger.Logf + ec *eventbus.Client + respPub *eventbus.Publisher[magicsock.UDPRelayAllocResp] - mu sync.Mutex // guards the following fields - shutdown bool - - port *int // ipn.Prefs.RelayServerPort, nil if disabled - eventSubs *eventbus.Monitor // nil if not connected to eventbus - debugSessionsCh chan chan []status.ServerSession // non-nil if consumeEventbusTopics is running - hasNodeAttrDisableRelayServer bool // tailcfg.NodeAttrDisableRelayServer + mu sync.Mutex // guards the following fields + shutdown bool // true if Shutdown() has been called + rs relayServer // nil when disabled + port *int // ipn.Prefs.RelayServerPort, nil if disabled + derpMapView tailcfg.DERPMapView // latest seen over the eventbus + hasNodeAttrDisableRelayServer bool // [tailcfg.NodeAttrDisableRelayServer] } // Name implements [ipnext.Extension]. @@ -104,26 +122,83 @@ func (e *extension) Init(host ipnext.Host) error { return nil } -// handleBusLifetimeLocked handles the lifetime of consumeEventbusTopics. -func (e *extension) handleBusLifetimeLocked() { - busShouldBeRunning := !e.shutdown && e.port != nil && !e.hasNodeAttrDisableRelayServer - if !busShouldBeRunning { - e.disconnectFromBusLocked() +func (e *extension) onDERPMapView(view tailcfg.DERPMapView) { + e.mu.Lock() + defer e.mu.Unlock() + e.derpMapView = view + if e.rs != nil { + e.rs.SetDERPMapView(view) + } +} + +func (e *extension) onAllocReq(req magicsock.UDPRelayAllocReq) { + e.mu.Lock() + defer e.mu.Unlock() + if e.shutdown { + return + } + if e.rs == nil { + if !e.relayServerShouldBeRunningLocked() { + return + } + e.tryStartRelayServerLocked() + if e.rs == nil { + return + } + } + se, err := e.rs.AllocateEndpoint(req.Message.ClientDisco[0], req.Message.ClientDisco[1]) + if err != nil { + e.logf("error allocating endpoint: %v", err) + return + } + e.respPub.Publish(magicsock.UDPRelayAllocResp{ + ReqRxFromNodeKey: req.RxFromNodeKey, + ReqRxFromDiscoKey: req.RxFromDiscoKey, + Message: &disco.AllocateUDPRelayEndpointResponse{ + Generation: req.Message.Generation, + UDPRelayEndpoint: disco.UDPRelayEndpoint{ + ServerDisco: se.ServerDisco, + ClientDisco: se.ClientDisco, + LamportID: se.LamportID, + VNI: se.VNI, + BindLifetime: se.BindLifetime.Duration, + SteadyStateLifetime: se.SteadyStateLifetime.Duration, + AddrPorts: se.AddrPorts, + }, + }, + }) +} + +func (e *extension) tryStartRelayServerLocked() { + rs, err := e.newServerFn(e.logf, *e.port, overrideAddrs()) + if err != nil { + e.logf("error initializing server: %v", err) return - } else if e.eventSubs != nil { - return // already running } + e.rs = rs + e.rs.SetDERPMapView(e.derpMapView) +} - ec := e.bus.Client("relayserver.extension") - e.debugSessionsCh = make(chan chan []status.ServerSession) - e.eventSubs = ptr.To(ec.Monitor(e.consumeEventbusTopics(ec, *e.port))) +func (e *extension) relayServerShouldBeRunningLocked() bool { + return !e.shutdown && e.port != nil && !e.hasNodeAttrDisableRelayServer +} + +// handleRelayServerLifetimeLocked handles the lifetime of [e.rs]. +func (e *extension) handleRelayServerLifetimeLocked() { + if !e.relayServerShouldBeRunningLocked() { + e.stopRelayServerLocked() + return + } else if e.rs != nil { + return // already running + } + e.tryStartRelayServerLocked() } func (e *extension) selfNodeViewChanged(nodeView tailcfg.NodeView) { e.mu.Lock() defer e.mu.Unlock() e.hasNodeAttrDisableRelayServer = nodeView.HasCap(tailcfg.NodeAttrDisableRelayServer) - e.handleBusLifetimeLocked() + e.handleRelayServerLifetimeLocked() } func (e *extension) profileStateChanged(_ ipn.LoginProfileView, prefs ipn.PrefsView, sameNode bool) { @@ -133,13 +208,13 @@ func (e *extension) profileStateChanged(_ ipn.LoginProfileView, prefs ipn.PrefsV enableOrDisableServer := ok != (e.port != nil) portChanged := ok && e.port != nil && newPort != *e.port if enableOrDisableServer || portChanged || !sameNode { - e.disconnectFromBusLocked() + e.stopRelayServerLocked() e.port = nil if ok { e.port = ptr.To(newPort) } } - e.handleBusLifetimeLocked() + e.handleRelayServerLifetimeLocked() } // overrideAddrs returns TS_DEBUG_RELAY_SERVER_ADDRS as []netip.Addr, if set. It @@ -162,88 +237,20 @@ var overrideAddrs = sync.OnceValue(func() (ret []netip.Addr) { return }) -// consumeEventbusTopics serves endpoint allocation requests over the eventbus. -// It also serves [relayServer] debug information on a channel. -// consumeEventbusTopics must never acquire [extension.mu], which can be held -// by other goroutines while waiting to receive on [extension.eventSubs] or the -// inner [extension.debugSessionsCh] channel. -func (e *extension) consumeEventbusTopics(ec *eventbus.Client, port int) func(*eventbus.Client) { - reqSub := eventbus.Subscribe[magicsock.UDPRelayAllocReq](ec) - respPub := eventbus.Publish[magicsock.UDPRelayAllocResp](ec) - debugSessionsCh := e.debugSessionsCh - - return func(ec *eventbus.Client) { - rs, err := udprelay.NewServer(e.logf, port, overrideAddrs()) - if err != nil { - e.logf("error initializing server: %v", err) - } - - defer func() { - if rs != nil { - rs.Close() - } - }() - for { - select { - case <-ec.Done(): - return - case respCh := <-debugSessionsCh: - if rs == nil { - respCh <- nil - continue - } - sessions := rs.GetSessions() - respCh <- sessions - case req := <-reqSub.Events(): - if rs == nil { - // The server may have previously failed to initialize if - // the configured port was in use, try again. - rs, err = udprelay.NewServer(e.logf, port, overrideAddrs()) - if err != nil { - e.logf("error initializing server: %v", err) - continue - } - } - se, err := rs.AllocateEndpoint(req.Message.ClientDisco[0], req.Message.ClientDisco[1]) - if err != nil { - e.logf("error allocating endpoint: %v", err) - continue - } - respPub.Publish(magicsock.UDPRelayAllocResp{ - ReqRxFromNodeKey: req.RxFromNodeKey, - ReqRxFromDiscoKey: req.RxFromDiscoKey, - Message: &disco.AllocateUDPRelayEndpointResponse{ - Generation: req.Message.Generation, - UDPRelayEndpoint: disco.UDPRelayEndpoint{ - ServerDisco: se.ServerDisco, - ClientDisco: se.ClientDisco, - LamportID: se.LamportID, - VNI: se.VNI, - BindLifetime: se.BindLifetime.Duration, - SteadyStateLifetime: se.SteadyStateLifetime.Duration, - AddrPorts: se.AddrPorts, - }, - }, - }) - } - } - } -} - -func (e *extension) disconnectFromBusLocked() { - if e.eventSubs != nil { - e.eventSubs.Close() - e.eventSubs = nil - e.debugSessionsCh = nil +func (e *extension) stopRelayServerLocked() { + if e.rs != nil { + e.rs.Close() } + e.rs = nil } // Shutdown implements [ipnlocal.Extension]. func (e *extension) Shutdown() error { e.mu.Lock() defer e.mu.Unlock() - e.disconnectFromBusLocked() e.shutdown = true + e.ec.Close() + e.stopRelayServerLocked() return nil } @@ -253,23 +260,14 @@ func (e *extension) Shutdown() error { func (e *extension) serverStatus() status.ServerStatus { e.mu.Lock() defer e.mu.Unlock() - st := status.ServerStatus{ UDPPort: nil, Sessions: nil, } - if e.port == nil || e.eventSubs == nil { + if e.rs == nil { return st } st.UDPPort = ptr.To(*e.port) - - ch := make(chan []status.ServerSession) - select { - case e.debugSessionsCh <- ch: - resp := <-ch - st.Sessions = resp - return st - case <-e.eventSubs.Done(): - return st - } + st.Sessions = e.rs.GetSessions() + return st } diff --git a/feature/relayserver/relayserver_test.go b/feature/relayserver/relayserver_test.go index 65c503524c5de..2184b51759b61 100644 --- a/feature/relayserver/relayserver_test.go +++ b/feature/relayserver/relayserver_test.go @@ -4,13 +4,20 @@ package relayserver import ( + "errors" + "net/netip" + "reflect" "testing" "tailscale.com/ipn" + "tailscale.com/net/udprelay/endpoint" + "tailscale.com/net/udprelay/status" + "tailscale.com/tailcfg" "tailscale.com/tsd" + "tailscale.com/tstime" + "tailscale.com/types/key" "tailscale.com/types/logger" "tailscale.com/types/ptr" - "tailscale.com/util/eventbus" ) func Test_extension_profileStateChanged(t *testing.T) { @@ -19,29 +26,33 @@ func Test_extension_profileStateChanged(t *testing.T) { type fields struct { port *int + rs relayServer } type args struct { prefs ipn.PrefsView sameNode bool } tests := []struct { - name string - fields fields - args args - wantPort *int - wantBusRunning bool + name string + fields fields + args args + wantPort *int + wantRelayServerFieldNonNil bool + wantRelayServerFieldMutated bool }{ { - name: "no changes non-nil port", + name: "no changes non-nil port previously running", fields: fields{ port: ptr.To(1), + rs: mockRelayServerNotZeroVal(), }, args: args{ prefs: prefsWithPortOne.View(), sameNode: true, }, - wantPort: ptr.To(1), - wantBusRunning: true, + wantPort: ptr.To(1), + wantRelayServerFieldNonNil: true, + wantRelayServerFieldMutated: false, }, { name: "prefs port nil", @@ -52,8 +63,23 @@ func Test_extension_profileStateChanged(t *testing.T) { prefs: prefsWithNilPort.View(), sameNode: true, }, - wantPort: nil, - wantBusRunning: false, + wantPort: nil, + wantRelayServerFieldNonNil: false, + wantRelayServerFieldMutated: false, + }, + { + name: "prefs port nil previously running", + fields: fields{ + port: ptr.To(1), + rs: mockRelayServerNotZeroVal(), + }, + args: args{ + prefs: prefsWithNilPort.View(), + sameNode: true, + }, + wantPort: nil, + wantRelayServerFieldNonNil: false, + wantRelayServerFieldMutated: true, }, { name: "prefs port changed", @@ -64,8 +90,23 @@ func Test_extension_profileStateChanged(t *testing.T) { prefs: prefsWithPortOne.View(), sameNode: true, }, - wantPort: ptr.To(1), - wantBusRunning: true, + wantPort: ptr.To(1), + wantRelayServerFieldNonNil: true, + wantRelayServerFieldMutated: true, + }, + { + name: "prefs port changed previously running", + fields: fields{ + port: ptr.To(2), + rs: mockRelayServerNotZeroVal(), + }, + args: args{ + prefs: prefsWithPortOne.View(), + sameNode: true, + }, + wantPort: ptr.To(1), + wantRelayServerFieldNonNil: true, + wantRelayServerFieldMutated: true, }, { name: "sameNode false", @@ -76,8 +117,23 @@ func Test_extension_profileStateChanged(t *testing.T) { prefs: prefsWithPortOne.View(), sameNode: false, }, - wantPort: ptr.To(1), - wantBusRunning: true, + wantPort: ptr.To(1), + wantRelayServerFieldNonNil: true, + wantRelayServerFieldMutated: true, + }, + { + name: "sameNode false previously running", + fields: fields{ + port: ptr.To(1), + rs: mockRelayServerNotZeroVal(), + }, + args: args{ + prefs: prefsWithPortOne.View(), + sameNode: false, + }, + wantPort: ptr.To(1), + wantRelayServerFieldNonNil: true, + wantRelayServerFieldMutated: true, }, { name: "prefs port non-nil extension port nil", @@ -88,85 +144,165 @@ func Test_extension_profileStateChanged(t *testing.T) { prefs: prefsWithPortOne.View(), sameNode: false, }, - wantPort: ptr.To(1), - wantBusRunning: true, + wantPort: ptr.To(1), + wantRelayServerFieldNonNil: true, + wantRelayServerFieldMutated: true, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { sys := tsd.NewSystem() - bus := sys.Bus.Get() - e := &extension{ - logf: logger.Discard, - port: tt.fields.port, - bus: bus, + ipne, err := newExtension(logger.Discard, mockSafeBackend{sys}) + if err != nil { + t.Fatal(err) + } + e := ipne.(*extension) + e.newServerFn = func(logf logger.Logf, port int, overrideAddrs []netip.Addr) (relayServer, error) { + return &mockRelayServer{}, nil } - defer e.disconnectFromBusLocked() + e.port = tt.fields.port + e.rs = tt.fields.rs + defer e.Shutdown() e.profileStateChanged(ipn.LoginProfileView{}, tt.args.prefs, tt.args.sameNode) - if tt.wantBusRunning != (e.eventSubs != nil) { - t.Errorf("wantBusRunning: %v != (e.eventSubs != nil): %v", tt.wantBusRunning, e.eventSubs != nil) + if tt.wantRelayServerFieldNonNil != (e.rs != nil) { + t.Errorf("wantRelayServerFieldNonNil: %v != (e.rs != nil): %v", tt.wantRelayServerFieldNonNil, e.rs != nil) } if (tt.wantPort == nil) != (e.port == nil) { t.Errorf("(tt.wantPort == nil): %v != (e.port == nil): %v", tt.wantPort == nil, e.port == nil) } else if tt.wantPort != nil && *tt.wantPort != *e.port { t.Errorf("wantPort: %d != *e.port: %d", *tt.wantPort, *e.port) } + if tt.wantRelayServerFieldMutated != !reflect.DeepEqual(tt.fields.rs, e.rs) { + t.Errorf("wantRelayServerFieldMutated: %v != !reflect.DeepEqual(tt.fields.rs, e.rs): %v", tt.wantRelayServerFieldMutated, !reflect.DeepEqual(tt.fields.rs, e.rs)) + } }) } } -func Test_extension_handleBusLifetimeLocked(t *testing.T) { +func mockRelayServerNotZeroVal() *mockRelayServer { + return &mockRelayServer{true} +} + +type mockRelayServer struct { + set bool +} + +func (mockRelayServer) Close() error { return nil } +func (mockRelayServer) AllocateEndpoint(_, _ key.DiscoPublic) (endpoint.ServerEndpoint, error) { + return endpoint.ServerEndpoint{}, errors.New("not implemented") +} +func (mockRelayServer) GetSessions() []status.ServerSession { return nil } +func (mockRelayServer) SetDERPMapView(tailcfg.DERPMapView) { return } + +type mockSafeBackend struct { + sys *tsd.System +} + +func (m mockSafeBackend) Sys() *tsd.System { return m.sys } +func (mockSafeBackend) Clock() tstime.Clock { return nil } +func (mockSafeBackend) TailscaleVarRoot() string { return "" } + +func Test_extension_handleRelayServerLifetimeLocked(t *testing.T) { tests := []struct { name string shutdown bool port *int - eventSubs *eventbus.Monitor + rs relayServer hasNodeAttrDisableRelayServer bool - wantBusRunning bool + wantRelayServerFieldNonNil bool + wantRelayServerFieldMutated bool }{ { name: "want running", shutdown: false, port: ptr.To(1), hasNodeAttrDisableRelayServer: false, - wantBusRunning: true, + wantRelayServerFieldNonNil: true, + wantRelayServerFieldMutated: true, + }, + { + name: "want running previously running", + shutdown: false, + port: ptr.To(1), + rs: mockRelayServerNotZeroVal(), + hasNodeAttrDisableRelayServer: false, + wantRelayServerFieldNonNil: true, + wantRelayServerFieldMutated: false, }, { name: "shutdown true", shutdown: true, port: ptr.To(1), hasNodeAttrDisableRelayServer: false, - wantBusRunning: false, + wantRelayServerFieldNonNil: false, + wantRelayServerFieldMutated: false, + }, + { + name: "shutdown true previously running", + shutdown: true, + port: ptr.To(1), + rs: mockRelayServerNotZeroVal(), + hasNodeAttrDisableRelayServer: false, + wantRelayServerFieldNonNil: false, + wantRelayServerFieldMutated: true, }, { name: "port nil", shutdown: false, port: nil, hasNodeAttrDisableRelayServer: false, - wantBusRunning: false, + wantRelayServerFieldNonNil: false, + wantRelayServerFieldMutated: false, + }, + { + name: "port nil previously running", + shutdown: false, + port: nil, + rs: mockRelayServerNotZeroVal(), + hasNodeAttrDisableRelayServer: false, + wantRelayServerFieldNonNil: false, + wantRelayServerFieldMutated: true, }, { name: "hasNodeAttrDisableRelayServer true", shutdown: false, port: nil, hasNodeAttrDisableRelayServer: true, - wantBusRunning: false, + wantRelayServerFieldNonNil: false, + wantRelayServerFieldMutated: false, + }, + { + name: "hasNodeAttrDisableRelayServer true previously running", + shutdown: false, + port: nil, + rs: mockRelayServerNotZeroVal(), + hasNodeAttrDisableRelayServer: true, + wantRelayServerFieldNonNil: false, + wantRelayServerFieldMutated: true, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - e := &extension{ - logf: logger.Discard, - bus: eventbus.New(), - shutdown: tt.shutdown, - port: tt.port, - eventSubs: tt.eventSubs, - hasNodeAttrDisableRelayServer: tt.hasNodeAttrDisableRelayServer, + sys := tsd.NewSystem() + ipne, err := newExtension(logger.Discard, mockSafeBackend{sys}) + if err != nil { + t.Fatal(err) + } + e := ipne.(*extension) + e.newServerFn = func(logf logger.Logf, port int, overrideAddrs []netip.Addr) (relayServer, error) { + return &mockRelayServer{}, nil + } + e.shutdown = tt.shutdown + e.port = tt.port + e.rs = tt.rs + e.hasNodeAttrDisableRelayServer = tt.hasNodeAttrDisableRelayServer + e.handleRelayServerLifetimeLocked() + defer e.Shutdown() + if tt.wantRelayServerFieldNonNil != (e.rs != nil) { + t.Errorf("wantRelayServerFieldNonNil: %v != (e.rs != nil): %v", tt.wantRelayServerFieldNonNil, e.rs != nil) } - e.handleBusLifetimeLocked() - defer e.disconnectFromBusLocked() - if tt.wantBusRunning != (e.eventSubs != nil) { - t.Errorf("wantBusRunning: %v != (e.eventSubs != nil): %v", tt.wantBusRunning, e.eventSubs != nil) + if tt.wantRelayServerFieldMutated != !reflect.DeepEqual(tt.rs, e.rs) { + t.Errorf("wantRelayServerFieldMutated: %v != !reflect.DeepEqual(tt.rs, e.rs): %v", tt.wantRelayServerFieldMutated, !reflect.DeepEqual(tt.rs, e.rs)) } }) } diff --git a/ipn/ipnlocal/node_backend.go b/ipn/ipnlocal/node_backend.go index 3408d4cbb325d..dbe23e4d5245a 100644 --- a/ipn/ipnlocal/node_backend.go +++ b/ipn/ipnlocal/node_backend.go @@ -75,10 +75,11 @@ type nodeBackend struct { filterAtomic atomic.Pointer[filter.Filter] // initialized once and immutable - eventClient *eventbus.Client - filterPub *eventbus.Publisher[magicsock.FilterUpdate] - nodeViewsPub *eventbus.Publisher[magicsock.NodeViewsUpdate] - nodeMutsPub *eventbus.Publisher[magicsock.NodeMutationsUpdate] + eventClient *eventbus.Client + filterPub *eventbus.Publisher[magicsock.FilterUpdate] + nodeViewsPub *eventbus.Publisher[magicsock.NodeViewsUpdate] + nodeMutsPub *eventbus.Publisher[magicsock.NodeMutationsUpdate] + derpMapViewPub *eventbus.Publisher[tailcfg.DERPMapView] // TODO(nickkhyl): maybe use sync.RWMutex? mu sync.Mutex // protects the following fields @@ -121,6 +122,7 @@ func newNodeBackend(ctx context.Context, logf logger.Logf, bus *eventbus.Bus) *n nb.filterPub = eventbus.Publish[magicsock.FilterUpdate](nb.eventClient) nb.nodeViewsPub = eventbus.Publish[magicsock.NodeViewsUpdate](nb.eventClient) nb.nodeMutsPub = eventbus.Publish[magicsock.NodeMutationsUpdate](nb.eventClient) + nb.derpMapViewPub = eventbus.Publish[tailcfg.DERPMapView](nb.eventClient) nb.filterPub.Publish(magicsock.FilterUpdate{Filter: nb.filterAtomic.Load()}) return nb } @@ -435,6 +437,9 @@ func (nb *nodeBackend) SetNetMap(nm *netmap.NetworkMap) { if nm != nil { nv.SelfNode = nm.SelfNode nv.Peers = nm.Peers + nb.derpMapViewPub.Publish(nm.DERPMap.View()) + } else { + nb.derpMapViewPub.Publish(tailcfg.DERPMapView{}) } nb.nodeViewsPub.Publish(nv) } diff --git a/net/udprelay/server.go b/net/udprelay/server.go index 815b301a168f4..86dee18e12f67 100644 --- a/net/udprelay/server.go +++ b/net/udprelay/server.go @@ -21,7 +21,6 @@ import ( "go4.org/mem" "golang.org/x/net/ipv6" - "tailscale.com/client/local" "tailscale.com/disco" "tailscale.com/net/batching" "tailscale.com/net/netaddr" @@ -32,6 +31,7 @@ import ( "tailscale.com/net/stun" "tailscale.com/net/udprelay/endpoint" "tailscale.com/net/udprelay/status" + "tailscale.com/tailcfg" "tailscale.com/tstime" "tailscale.com/types/key" "tailscale.com/types/logger" @@ -72,7 +72,8 @@ type Server struct { closeCh chan struct{} netChecker *netcheck.Client - mu sync.Mutex // guards the following fields + mu sync.Mutex // guards the following fields + derpMap *tailcfg.DERPMap addrDiscoveryOnce bool // addrDiscovery completed once (successfully or unsuccessfully) addrPorts []netip.AddrPort // the ip:port pairs returned as candidate endpoints closed bool @@ -374,15 +375,12 @@ func (s *Server) addrDiscoveryLoop() { } } - // fetch DERPMap to feed to netcheck - derpMapCtx, derpMapCancel := context.WithTimeout(context.Background(), time.Second) - defer derpMapCancel() - localClient := &local.Client{} - // TODO(jwhited): We are in-process so use eventbus or similar. - // local.Client gets us going. - dm, err := localClient.CurrentDERPMap(derpMapCtx) - if err != nil { - return nil, err + dm := s.getDERPMap() + if dm == nil { + // We don't have a DERPMap which is required to dynamically + // discover external addresses, but we can return the endpoints we + // do have. + return addrPorts.Slice(), nil } // get addrPorts as visible from DERP @@ -849,3 +847,21 @@ func (s *Server) GetSessions() []status.ServerSession { } return sessions } + +// SetDERPMapView sets the [tailcfg.DERPMapView] to use for future netcheck +// reports. +func (s *Server) SetDERPMapView(view tailcfg.DERPMapView) { + s.mu.Lock() + defer s.mu.Unlock() + if !view.Valid() { + s.derpMap = nil + return + } + s.derpMap = view.AsStruct() +} + +func (s *Server) getDERPMap() *tailcfg.DERPMap { + s.mu.Lock() + defer s.mu.Unlock() + return s.derpMap +} From ea8eeeb2f709f582cb1a97a4bb6ea93ab93e9149 Mon Sep 17 00:00:00 2001 From: Jordan Whited Date: Fri, 14 Nov 2025 10:22:58 -0800 Subject: [PATCH 18/25] feature/relayserver: fix Shutdown() deadlock (#17898) Updates #17894 Signed-off-by: Jordan Whited (cherry picked from commit 0285e1d5fb2b06cd4003ab3a7c1037caa091a85e) --- feature/relayserver/relayserver.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/feature/relayserver/relayserver.go b/feature/relayserver/relayserver.go index 2646a0cbfee6e..868d5f61a2fa7 100644 --- a/feature/relayserver/relayserver.go +++ b/feature/relayserver/relayserver.go @@ -246,10 +246,13 @@ func (e *extension) stopRelayServerLocked() { // Shutdown implements [ipnlocal.Extension]. func (e *extension) Shutdown() error { + // [extension.mu] must not be held when closing the [eventbus.Client]. Close + // blocks until all [eventbus.SubscribeFunc]'s have returned, and the ones + // used in this package also acquire [extension.mu]. See #17894. + e.ec.Close() e.mu.Lock() defer e.mu.Unlock() e.shutdown = true - e.ec.Close() e.stopRelayServerLocked() return nil } From fa514c7280f286d456e09f7c1ccff141801fe321 Mon Sep 17 00:00:00 2001 From: Jonathan Nobels Date: Mon, 17 Nov 2025 15:40:46 -0500 Subject: [PATCH 19/25] net/netmon: do not abandon a subscriber when exiting early (#17899) (#17905) LinkChangeLogLimiter keeps a subscription to track rate limits for log messages. But when its context ended, it would exit the subscription loop, leaving the subscriber still alive. Ensure the subscriber gets cleaned up when the context ends, so we don't stall event processing. Updates tailscale/corp#34311 Change-Id: I82749e482e9a00dfc47f04afbc69dd0237537cb2 (cherry picked from commit ab4b990d51c41aff8e1ae7a08435dedfe621ce0d) Signed-off-by: M. J. Fromberger Co-authored-by: M. J. Fromberger --- net/netmon/loghelper.go | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/net/netmon/loghelper.go b/net/netmon/loghelper.go index 2e28e8cda7895..675762cd10b18 100644 --- a/net/netmon/loghelper.go +++ b/net/netmon/loghelper.go @@ -18,13 +18,13 @@ import ( // done. func LinkChangeLogLimiter(ctx context.Context, logf logger.Logf, nm *Monitor) logger.Logf { var formatSeen sync.Map // map[string]bool - nm.b.Monitor(nm.changeDeltaWatcher(nm.b, ctx, func(cd ChangeDelta) { + sub := eventbus.SubscribeFunc(nm.b, func(cd ChangeDelta) { // If we're in a major change or a time jump, clear the seen map. if cd.Major || cd.TimeJumped { formatSeen.Clear() } - })) - + }) + context.AfterFunc(ctx, sub.Close) return func(format string, args ...any) { // We only store 'true' in the map, so if it's present then it // means we've already logged this format string. @@ -42,19 +42,3 @@ func LinkChangeLogLimiter(ctx context.Context, logf logger.Logf, nm *Monitor) lo logf(format, args...) } } - -func (nm *Monitor) changeDeltaWatcher(ec *eventbus.Client, ctx context.Context, fn func(ChangeDelta)) func(*eventbus.Client) { - sub := eventbus.Subscribe[ChangeDelta](ec) - return func(ec *eventbus.Client) { - for { - select { - case <-ctx.Done(): - return - case <-sub.Done(): - return - case change := <-sub.Events(): - fn(change) - } - } - } -} From 6b64718bb9902fe97870af5514fcfc2a557ca0df Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Tue, 21 Oct 2025 11:07:33 +0100 Subject: [PATCH 20/25] tka: don't try to read AUMs which are partway through being written Fixes https://github.com/tailscale/tailscale/issues/17600 Signed-off-by: Alex Chan (cherry picked from commit 23359dc72706b7d9f32dcd428f22f5e4fdbfc4b7) --- tka/tailchonk.go | 10 +++++++++- tka/tailchonk_test.go | 44 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/tka/tailchonk.go b/tka/tailchonk.go index cb683c273d033..289b1179fe861 100644 --- a/tka/tailchonk.go +++ b/tka/tailchonk.go @@ -9,6 +9,7 @@ import ( "bytes" "errors" "fmt" + "log" "os" "path/filepath" "slices" @@ -403,9 +404,16 @@ func (c *FS) scanHashes(eachHashInfo func(*fsHashInfo)) error { return fmt.Errorf("reading prefix dir: %v", err) } for _, file := range files { + // Ignore files whose names aren't valid AUM hashes, which may be + // temporary files which are partway through being written, or other + // files added by the OS (like .DS_Store) which we can ignore. + // TODO(alexc): it might be useful to append a suffix like `.aum` to + // filenames, so we can more easily distinguish between AUMs and + // arbitrary other files. var h AUMHash if err := h.UnmarshalText([]byte(file.Name())); err != nil { - return fmt.Errorf("invalid aum file: %s: %w", file.Name(), err) + log.Printf("ignoring unexpected non-AUM: %s: %v", file.Name(), err) + continue } info, err := c.get(h) if err != nil { diff --git a/tka/tailchonk_test.go b/tka/tailchonk_test.go index 08686598033b8..1a6bad4592053 100644 --- a/tka/tailchonk_test.go +++ b/tka/tailchonk_test.go @@ -7,6 +7,7 @@ import ( "bytes" "os" "path/filepath" + "slices" "sync" "testing" "time" @@ -83,6 +84,49 @@ func TestTailchonkFS_CommitTime(t *testing.T) { } } +// If we were interrupted while writing a temporary file, AllAUMs() +// should ignore it when scanning the AUM directory. +func TestTailchonkFS_IgnoreTempFile(t *testing.T) { + base := t.TempDir() + chonk := must.Get(ChonkDir(base)) + parentHash := randHash(t, 1) + aum := AUM{MessageKind: AUMNoOp, PrevAUMHash: parentHash[:]} + must.Do(chonk.CommitVerifiedAUMs([]AUM{aum})) + + writeAUMFile := func(filename, contents string) { + t.Helper() + if err := os.MkdirAll(filepath.Join(base, filename[0:2]), os.ModePerm); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(base, filename[0:2], filename), []byte(contents), 0600); err != nil { + t.Fatal(err) + } + } + + // Check that calling AllAUMs() returns the single committed AUM + got, err := chonk.AllAUMs() + if err != nil { + t.Fatalf("AllAUMs() failed: %v", err) + } + want := []AUMHash{aum.Hash()} + if !slices.Equal(got, want) { + t.Fatalf("AllAUMs() is wrong: got %v, want %v", got, want) + } + + // Write some temporary files which are named like partially-committed AUMs, + // then check that AllAUMs() only returns the single committed AUM. + writeAUMFile("AUM1234.tmp", "incomplete AUM\n") + writeAUMFile("AUM1234.tmp_123", "second incomplete AUM\n") + + got, err = chonk.AllAUMs() + if err != nil { + t.Fatalf("AllAUMs() failed: %v", err) + } + if !slices.Equal(got, want) { + t.Fatalf("AllAUMs() is wrong: got %v, want %v", got, want) + } +} + func TestMarkActiveChain(t *testing.T) { type aumTemplate struct { AUM AUM From 43ab8b4b181422f92c6bf7d094eaedfed2aaf86c Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Wed, 29 Oct 2025 11:00:17 +0000 Subject: [PATCH 21/25] tka: rename a mutex to `mu` instead of single-letter `l` See http://go/no-ell Updates tailscale/corp#33846 Signed-off-by: Alex Chan Change-Id: I88ecd9db847e04237c1feab9dfcede5ca1050cc5 (cherry picked from commit fca66fb51af7d9d5d4ccf159dc45bf3f0c1462b3) --- tka/tailchonk.go | 26 +++++++++++++------------- tka/tailchonk_test.go | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tka/tailchonk.go b/tka/tailchonk.go index 289b1179fe861..3c50e5efab0d1 100644 --- a/tka/tailchonk.go +++ b/tka/tailchonk.go @@ -82,7 +82,7 @@ type CompactableChonk interface { // // Mem implements the Chonk interface. type Mem struct { - l sync.RWMutex + mu sync.RWMutex aums map[AUMHash]AUM parentIndex map[AUMHash][]AUMHash @@ -90,23 +90,23 @@ type Mem struct { } func (c *Mem) SetLastActiveAncestor(hash AUMHash) error { - c.l.Lock() - defer c.l.Unlock() + c.mu.Lock() + defer c.mu.Unlock() c.lastActiveAncestor = &hash return nil } func (c *Mem) LastActiveAncestor() (*AUMHash, error) { - c.l.RLock() - defer c.l.RUnlock() + c.mu.RLock() + defer c.mu.RUnlock() return c.lastActiveAncestor, nil } // Heads returns AUMs for which there are no children. In other // words, the latest AUM in all chains (the 'leaf'). func (c *Mem) Heads() ([]AUM, error) { - c.l.RLock() - defer c.l.RUnlock() + c.mu.RLock() + defer c.mu.RUnlock() out := make([]AUM, 0, 6) // An AUM is a 'head' if there are no nodes for which it is the parent. @@ -120,8 +120,8 @@ func (c *Mem) Heads() ([]AUM, error) { // AUM returns the AUM with the specified digest. func (c *Mem) AUM(hash AUMHash) (AUM, error) { - c.l.RLock() - defer c.l.RUnlock() + c.mu.RLock() + defer c.mu.RUnlock() aum, ok := c.aums[hash] if !ok { return AUM{}, os.ErrNotExist @@ -132,8 +132,8 @@ func (c *Mem) AUM(hash AUMHash) (AUM, error) { // ChildAUMs returns all AUMs with a specified previous // AUM hash. func (c *Mem) ChildAUMs(prevAUMHash AUMHash) ([]AUM, error) { - c.l.RLock() - defer c.l.RUnlock() + c.mu.RLock() + defer c.mu.RUnlock() out := make([]AUM, 0, 6) for _, entry := range c.parentIndex[prevAUMHash] { out = append(out, c.aums[entry]) @@ -147,8 +147,8 @@ func (c *Mem) ChildAUMs(prevAUMHash AUMHash) ([]AUM, error) { // as the rest of the TKA implementation assumes that only // verified AUMs are stored. func (c *Mem) CommitVerifiedAUMs(updates []AUM) error { - c.l.Lock() - defer c.l.Unlock() + c.mu.Lock() + defer c.mu.Unlock() if c.aums == nil { c.parentIndex = make(map[AUMHash][]AUMHash, 64) c.aums = make(map[AUMHash]AUM, 64) diff --git a/tka/tailchonk_test.go b/tka/tailchonk_test.go index 1a6bad4592053..7816d2dc158b5 100644 --- a/tka/tailchonk_test.go +++ b/tka/tailchonk_test.go @@ -496,7 +496,7 @@ func (c *compactingChonkFake) PurgeAUMs(hashes []AUMHash) error { // Avoid go vet complaining about copying a lock value func cloneMem(src, dst *Mem) { - dst.l = sync.RWMutex{} + dst.mu = sync.RWMutex{} dst.aums = src.aums dst.parentIndex = src.parentIndex dst.lastActiveAncestor = src.lastActiveAncestor From 37b63eff1c1a705d4268d017af354690117c7cf9 Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Wed, 29 Oct 2025 11:09:28 +0000 Subject: [PATCH 22/25] ipn/ipnlocal: use an in-memory TKA store if FS is unavailable This requires making the internals of LocalBackend a bit more generic, and implementing the `tka.CompactableChonk` interface for `tka.Mem`. Signed-off-by: Alex Chan Updates https://github.com/tailscale/corp/issues/33599 (cherry picked from commit 1723cb83ed95db76fa933348e8d9df7d9fcb960d) --- cmd/tailscale/cli/up.go | 1 + health/healthmsg/healthmsg.go | 11 ++-- ipn/ipnlocal/network-lock.go | 54 +++++++++++------- tka/tailchonk.go | 92 +++++++++++++++++++++++++++++- tka/tailchonk_test.go | 37 ++++++++++++ tstest/chonktest/tailchonk_test.go | 6 ++ 6 files changed, 174 insertions(+), 27 deletions(-) diff --git a/cmd/tailscale/cli/up.go b/cmd/tailscale/cli/up.go index 91a6b60878a93..61cade8de68d0 100644 --- a/cmd/tailscale/cli/up.go +++ b/cmd/tailscale/cli/up.go @@ -818,6 +818,7 @@ func upWorthyWarning(s string) bool { strings.Contains(s, healthmsg.WarnAcceptRoutesOff) || strings.Contains(s, healthmsg.LockedOut) || strings.Contains(s, healthmsg.WarnExitNodeUsage) || + strings.Contains(s, healthmsg.InMemoryTailnetLockState) || strings.Contains(strings.ToLower(s), "update available: ") } diff --git a/health/healthmsg/healthmsg.go b/health/healthmsg/healthmsg.go index 2384103738cf3..5ea1c736d8851 100644 --- a/health/healthmsg/healthmsg.go +++ b/health/healthmsg/healthmsg.go @@ -8,9 +8,10 @@ package healthmsg const ( - WarnAcceptRoutesOff = "Some peers are advertising routes but --accept-routes is false" - TailscaleSSHOnBut = "Tailscale SSH enabled, but " // + ... something from caller - LockedOut = "this node is locked out; it will not have connectivity until it is signed. For more info, see https://tailscale.com/s/locked-out" - WarnExitNodeUsage = "The following issues on your machine will likely make usage of exit nodes impossible" - DisableRPFilter = "Please set rp_filter=2 instead of rp_filter=1; see https://github.com/tailscale/tailscale/issues/3310" + WarnAcceptRoutesOff = "Some peers are advertising routes but --accept-routes is false" + TailscaleSSHOnBut = "Tailscale SSH enabled, but " // + ... something from caller + LockedOut = "this node is locked out; it will not have connectivity until it is signed. For more info, see https://tailscale.com/s/locked-out" + WarnExitNodeUsage = "The following issues on your machine will likely make usage of exit nodes impossible" + DisableRPFilter = "Please set rp_filter=2 instead of rp_filter=1; see https://github.com/tailscale/tailscale/issues/3310" + InMemoryTailnetLockState = "Tailnet Lock state is only being stored in-memory. Set --statedir to store state on disk, which is more secure. See https://tailscale.com/kb/1226/tailnet-lock#tailnet-lock-state" ) diff --git a/ipn/ipnlocal/network-lock.go b/ipn/ipnlocal/network-lock.go index 4990824453c47..c769e242d4405 100644 --- a/ipn/ipnlocal/network-lock.go +++ b/ipn/ipnlocal/network-lock.go @@ -23,6 +23,7 @@ import ( "slices" "time" + "tailscale.com/health" "tailscale.com/health/healthmsg" "tailscale.com/ipn" "tailscale.com/ipn/ipnstate" @@ -54,7 +55,7 @@ var ( type tkaState struct { profile ipn.ProfileID authority *tka.Authority - storage *tka.FS + storage tka.CompactableChonk filtered []ipnstate.TKAPeer } @@ -75,7 +76,7 @@ func (b *LocalBackend) initTKALocked() error { root := b.TailscaleVarRoot() if root == "" { b.tka = nil - b.logf("network-lock unavailable; no state directory") + b.logf("cannot fetch existing TKA state; no state directory for network-lock") return nil } @@ -90,6 +91,7 @@ func (b *LocalBackend) initTKALocked() error { if err != nil { return fmt.Errorf("initializing tka: %v", err) } + if err := authority.Compact(storage, tkaCompactionDefaults); err != nil { b.logf("tka compaction failed: %v", err) } @@ -105,6 +107,16 @@ func (b *LocalBackend) initTKALocked() error { return nil } +// noNetworkLockStateDirWarnable is a Warnable to warn the user that Tailnet Lock data +// (in particular, the list of AUMs in the TKA state) is being stored in memory and will +// be lost when tailscaled restarts. +var noNetworkLockStateDirWarnable = health.Register(&health.Warnable{ + Code: "no-tailnet-lock-state-dir", + Title: "No statedir for Tailnet Lock", + Severity: health.SeverityMedium, + Text: health.StaticMessage(healthmsg.InMemoryTailnetLockState), +}) + // tkaFilterNetmapLocked checks the signatures on each node key, dropping // nodes from the netmap whose signature does not verify. // @@ -442,7 +454,7 @@ func (b *LocalBackend) tkaSyncLocked(ourNodeKey key.NodePublic) error { // b.mu must be held & TKA must be initialized. func (b *LocalBackend) tkaApplyDisablementLocked(secret []byte) error { if b.tka.authority.ValidDisablement(secret) { - if err := os.RemoveAll(b.chonkPathLocked()); err != nil { + if err := b.tka.storage.RemoveAll(); err != nil { return err } b.tka = nil @@ -486,19 +498,21 @@ func (b *LocalBackend) tkaBootstrapFromGenesisLocked(g tkatype.MarshaledAUM, per } } - chonkDir := b.chonkPathLocked() - if err := os.Mkdir(filepath.Dir(chonkDir), 0755); err != nil && !os.IsExist(err) { - return fmt.Errorf("creating chonk root dir: %v", err) - } - if err := os.Mkdir(chonkDir, 0755); err != nil && !os.IsExist(err) { - return fmt.Errorf("mkdir: %v", err) - } - - chonk, err := tka.ChonkDir(chonkDir) - if err != nil { - return fmt.Errorf("chonk: %v", err) + root := b.TailscaleVarRoot() + var storage tka.CompactableChonk + if root == "" { + b.health.SetUnhealthy(noNetworkLockStateDirWarnable, nil) + b.logf("network-lock using in-memory storage; no state directory") + storage = &tka.Mem{} + } else { + chonkDir := b.chonkPathLocked() + chonk, err := tka.ChonkDir(chonkDir) + if err != nil { + return fmt.Errorf("chonk: %v", err) + } + storage = chonk } - authority, err := tka.Bootstrap(chonk, genesis) + authority, err := tka.Bootstrap(storage, genesis) if err != nil { return fmt.Errorf("tka bootstrap: %v", err) } @@ -506,7 +520,7 @@ func (b *LocalBackend) tkaBootstrapFromGenesisLocked(g tkatype.MarshaledAUM, per b.tka = &tkaState{ profile: b.pm.CurrentProfile().ID(), authority: authority, - storage: chonk, + storage: storage, } return nil } @@ -519,10 +533,6 @@ func (b *LocalBackend) CanSupportNetworkLock() error { return nil } - if b.TailscaleVarRoot() == "" { - return errors.New("network-lock is not supported in this configuration, try setting --statedir") - } - // There's a var root (aka --statedir), so if network lock gets // initialized we have somewhere to store our AUMs. That's all // we need. @@ -642,6 +652,7 @@ func tkaStateFromPeer(p tailcfg.NodeView) ipnstate.TKAPeer { // needing signatures is returned as a response. // The Finish RPC submits signatures for all these nodes, at which point // Control has everything it needs to atomically enable network lock. +// TODO(alexc): Only with persistent backend func (b *LocalBackend) NetworkLockInit(keys []tka.Key, disablementValues [][]byte, supportDisablement []byte) error { if err := b.CanSupportNetworkLock(); err != nil { return err @@ -762,7 +773,7 @@ func (b *LocalBackend) NetworkLockForceLocalDisable() error { return fmt.Errorf("saving prefs: %w", err) } - if err := os.RemoveAll(b.chonkPathLocked()); err != nil { + if err := b.tka.storage.RemoveAll(); err != nil { return fmt.Errorf("deleting TKA state: %w", err) } b.tka = nil @@ -771,6 +782,7 @@ func (b *LocalBackend) NetworkLockForceLocalDisable() error { // NetworkLockSign signs the given node-key and submits it to the control plane. // rotationPublic, if specified, must be an ed25519 public key. +// TODO(alexc): in-memory only func (b *LocalBackend) NetworkLockSign(nodeKey key.NodePublic, rotationPublic []byte) error { ourNodeKey, sig, err := func(nodeKey key.NodePublic, rotationPublic []byte) (key.NodePublic, tka.NodeKeySignature, error) { b.mu.Lock() diff --git a/tka/tailchonk.go b/tka/tailchonk.go index 3c50e5efab0d1..f20eb5920ad63 100644 --- a/tka/tailchonk.go +++ b/tka/tailchonk.go @@ -10,6 +10,7 @@ import ( "errors" "fmt" "log" + "maps" "os" "path/filepath" "slices" @@ -57,6 +58,10 @@ type Chonk interface { // as a hint to pick the correct chain in the event that the Chonk stores // multiple distinct chains. LastActiveAncestor() (*AUMHash, error) + + // RemoveAll permanently and completely clears the TKA state. This should + // be called when the user disables Tailnet Lock. + RemoveAll() error } // CompactableChonk implementation are extensions of Chonk, which are @@ -78,12 +83,21 @@ type CompactableChonk interface { } // Mem implements in-memory storage of TKA state, suitable for -// tests. +// tests or cases where filesystem storage is unavailable. // // Mem implements the Chonk interface. +// +// Mem is thread-safe. type Mem struct { mu sync.RWMutex aums map[AUMHash]AUM + commitTimes map[AUMHash]time.Time + + // parentIndex is a map of AUMs to the AUMs for which they are + // the parent. + // + // For example, if parent index is {1 -> {2, 3, 4}}, that means + // that AUMs 2, 3, 4 all have aum.PrevAUMHash = 1. parentIndex map[AUMHash][]AUMHash lastActiveAncestor *AUMHash @@ -152,12 +166,14 @@ func (c *Mem) CommitVerifiedAUMs(updates []AUM) error { if c.aums == nil { c.parentIndex = make(map[AUMHash][]AUMHash, 64) c.aums = make(map[AUMHash]AUM, 64) + c.commitTimes = make(map[AUMHash]time.Time, 64) } updateLoop: for _, aum := range updates { aumHash := aum.Hash() c.aums[aumHash] = aum + c.commitTimes[aumHash] = time.Now() parent, ok := aum.Parent() if ok { @@ -173,6 +189,71 @@ updateLoop: return nil } +// RemoveAll permanently and completely clears the TKA state. +func (c *Mem) RemoveAll() error { + c.mu.Lock() + defer c.mu.Unlock() + c.aums = nil + c.commitTimes = nil + c.parentIndex = nil + c.lastActiveAncestor = nil + return nil +} + +// AllAUMs returns all AUMs stored in the chonk. +func (c *Mem) AllAUMs() ([]AUMHash, error) { + c.mu.RLock() + defer c.mu.RUnlock() + + return slices.Collect(maps.Keys(c.aums)), nil +} + +// CommitTime returns the time at which the AUM was committed. +// +// If the AUM does not exist, then os.ErrNotExist is returned. +func (c *Mem) CommitTime(h AUMHash) (time.Time, error) { + c.mu.RLock() + defer c.mu.RUnlock() + + t, ok := c.commitTimes[h] + if ok { + return t, nil + } else { + return time.Time{}, os.ErrNotExist + } +} + +// PurgeAUMs marks the specified AUMs for deletion from storage. +func (c *Mem) PurgeAUMs(hashes []AUMHash) error { + c.mu.Lock() + defer c.mu.Unlock() + + for _, h := range hashes { + // Remove the deleted AUM from the list of its parents' children. + // + // However, we leave the list of this AUM's children in parentIndex, + // so we can find them later in ChildAUMs(). + if aum, ok := c.aums[h]; ok { + parent, hasParent := aum.Parent() + if hasParent { + c.parentIndex[parent] = slices.DeleteFunc( + c.parentIndex[parent], + func(other AUMHash) bool { return bytes.Equal(h[:], other[:]) }, + ) + if len(c.parentIndex[parent]) == 0 { + delete(c.parentIndex, parent) + } + } + } + + // Delete this AUM from the list of AUMs and commit times. + delete(c.aums, h) + delete(c.commitTimes, h) + } + + return nil +} + // FS implements filesystem storage of TKA state. // // FS implements the Chonk interface. @@ -184,6 +265,10 @@ type FS struct { // ChonkDir returns an implementation of Chonk which uses the // given directory to store TKA state. func ChonkDir(dir string) (*FS, error) { + if err := os.MkdirAll(dir, 0755); err != nil && !os.IsExist(err) { + return nil, fmt.Errorf("creating chonk root dir: %v", err) + } + stat, err := os.Stat(dir) if err != nil { return nil, err @@ -376,6 +461,11 @@ func (c *FS) Heads() ([]AUM, error) { return out, nil } +// RemoveAll permanently and completely clears the TKA state. +func (c *FS) RemoveAll() error { + return os.RemoveAll(c.base) +} + // AllAUMs returns all AUMs stored in the chonk. func (c *FS) AllAUMs() ([]AUMHash, error) { c.mu.RLock() diff --git a/tka/tailchonk_test.go b/tka/tailchonk_test.go index 7816d2dc158b5..70b7dc9a72fbb 100644 --- a/tka/tailchonk_test.go +++ b/tka/tailchonk_test.go @@ -127,6 +127,43 @@ func TestTailchonkFS_IgnoreTempFile(t *testing.T) { } } +// If we use a non-existent directory with filesystem Chonk storage, +// it's automatically created. +func TestTailchonkFS_CreateChonkDir(t *testing.T) { + base := filepath.Join(t.TempDir(), "a", "b", "c") + + chonk, err := ChonkDir(base) + if err != nil { + t.Fatalf("ChonkDir: %v", err) + } + + aum := AUM{MessageKind: AUMNoOp} + must.Do(chonk.CommitVerifiedAUMs([]AUM{aum})) + + got, err := chonk.AUM(aum.Hash()) + if err != nil { + t.Errorf("Chonk.AUM: %v", err) + } + if diff := cmp.Diff(got, aum); diff != "" { + t.Errorf("wrong AUM; (-got+want):%v", diff) + } + + if _, err := os.Stat(base); err != nil { + t.Errorf("os.Stat: %v", err) + } +} + +// You can't use a file as the root of your filesystem Chonk storage. +func TestTailchonkFS_CannotUseFile(t *testing.T) { + base := filepath.Join(t.TempDir(), "tka_storage.txt") + must.Do(os.WriteFile(base, []byte("this won't work"), 0644)) + + _, err := ChonkDir(base) + if err == nil { + t.Fatal("ChonkDir succeeded; expected an error") + } +} + func TestMarkActiveChain(t *testing.T) { type aumTemplate struct { AUM AUM diff --git a/tstest/chonktest/tailchonk_test.go b/tstest/chonktest/tailchonk_test.go index ce6b043248de1..6dfab798ed11f 100644 --- a/tstest/chonktest/tailchonk_test.go +++ b/tstest/chonktest/tailchonk_test.go @@ -39,6 +39,12 @@ func TestImplementsCompactableChonk(t *testing.T) { name string newChonk func(t *testing.T) tka.CompactableChonk }{ + { + name: "Mem", + newChonk: func(t *testing.T) tka.CompactableChonk { + return &tka.Mem{} + }, + }, { name: "FS", newChonk: func(t *testing.T) tka.CompactableChonk { From 90d3cb3c9573958386061cccefbf8708e870220a Mon Sep 17 00:00:00 2001 From: Nick Khyl Date: Tue, 18 Nov 2025 11:32:04 -0600 Subject: [PATCH 23/25] VERSION.txt: this is v1.90.7 Signed-off-by: Nick Khyl --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index 34ad0d880160b..4efb9c5aae36d 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.90.6 +1.90.7 From 6b0fbffd4f1c7790d97e09fd28c38960acb2b1e9 Mon Sep 17 00:00:00 2001 From: Alex Chan Date: Tue, 18 Nov 2025 09:44:12 +0000 Subject: [PATCH 24/25] tka: move RemoveAll() to CompactableChonk I added a RemoveAll() method on tka.Chonk in #17946, but it's only used in the node to purge local AUMs. We don't need it in the SQLite storage, which currently implements tka.Chonk, so move it to CompactableChonk instead. Also add some automated tests, as a safety net. Updates tailscale/corp#33599 Change-Id: I54de9ccf1d6a3d29b36a94eccb0ebd235acd4ebc Signed-off-by: Alex Chan (cherry picked from commit c17ba6412984a0b801d112fe1b399b1b8b2a3441) --- tka/tailchonk.go | 8 +++--- tstest/chonktest/chonktest.go | 47 +++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/tka/tailchonk.go b/tka/tailchonk.go index f20eb5920ad63..616abaf2b4190 100644 --- a/tka/tailchonk.go +++ b/tka/tailchonk.go @@ -58,10 +58,6 @@ type Chonk interface { // as a hint to pick the correct chain in the event that the Chonk stores // multiple distinct chains. LastActiveAncestor() (*AUMHash, error) - - // RemoveAll permanently and completely clears the TKA state. This should - // be called when the user disables Tailnet Lock. - RemoveAll() error } // CompactableChonk implementation are extensions of Chonk, which are @@ -80,6 +76,10 @@ type CompactableChonk interface { // PurgeAUMs permanently and irrevocably deletes the specified // AUMs from storage. PurgeAUMs(hashes []AUMHash) error + + // RemoveAll permanently and completely clears the TKA state. This should + // be called when the user disables Tailnet Lock. + RemoveAll() error } // Mem implements in-memory storage of TKA state, suitable for diff --git a/tstest/chonktest/chonktest.go b/tstest/chonktest/chonktest.go index bfe394b28fcaf..404f1ec47f16c 100644 --- a/tstest/chonktest/chonktest.go +++ b/tstest/chonktest/chonktest.go @@ -9,6 +9,7 @@ package chonktest import ( "bytes" "encoding/binary" + "errors" "math/rand" "os" "testing" @@ -253,4 +254,50 @@ func RunCompactableChonkTests(t *testing.T, newChonk func(t *testing.T) tka.Comp t.Fatalf("ChildAUMs() output differs (-want, +got):\n%s", diff) } }) + + t.Run("RemoveAll", func(t *testing.T) { + t.Parallel() + chonk := newChonk(t) + parentHash := randHash(t, 1) + data := []tka.AUM{ + { + MessageKind: tka.AUMRemoveKey, + KeyID: []byte{1, 2}, + PrevAUMHash: parentHash[:], + }, + { + MessageKind: tka.AUMRemoveKey, + KeyID: []byte{3, 4}, + PrevAUMHash: parentHash[:], + }, + } + + if err := chonk.CommitVerifiedAUMs(data); err != nil { + t.Fatalf("CommitVerifiedAUMs failed: %v", err) + } + + // Check we can retrieve the AUMs we just stored + for _, want := range data { + got, err := chonk.AUM(want.Hash()) + if err != nil { + t.Fatalf("could not get %s: %v", want.Hash(), err) + } + if diff := cmp.Diff(want, got); diff != "" { + t.Errorf("stored AUM %s differs (-want, +got):\n%s", want.Hash(), diff) + } + } + + // Call RemoveAll() to drop all the AUM state + if err := chonk.RemoveAll(); err != nil { + t.Fatalf("RemoveAll failed: %v", err) + } + + // Check we can no longer retrieve the previously-stored AUMs + for _, want := range data { + aum, err := chonk.AUM(want.Hash()) + if !errors.Is(err, os.ErrNotExist) { + t.Fatalf("expected os.ErrNotExist for %s, instead got aum=%v, err=%v", want.Hash(), aum, err) + } + } + }) } From ccf4f3c7ce53f977d7bffc80734a927964a0a890 Mon Sep 17 00:00:00 2001 From: Nick Khyl Date: Tue, 18 Nov 2025 12:31:30 -0600 Subject: [PATCH 25/25] VERSION.txt: this is v1.90.8 Signed-off-by: Nick Khyl --- VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION.txt b/VERSION.txt index 4efb9c5aae36d..237c0b66ad7cd 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.90.7 +1.90.8