From 90f52d166bd50d2f2d63d66aab921c8a3bfae83e Mon Sep 17 00:00:00 2001 From: VPN Dev Date: Fri, 24 Apr 2026 01:59:37 +0200 Subject: [PATCH] feat(ha): implement active-active horizontal scaling for Signal and Management servers - Signal server: Redis distributed registry and pub/sub for cross-instance peer routing and message forwarding - Management server: Redis pub/sub for account updates, distributed locks (SET NX EX), ephemeral peer deadline management - Traefik load balancer with health checks for automatic failover - 14 integration tests validating HA behavior (7 signal + 7 management) - Full WireGuard encrypted login+sync in failover tests - Comprehensive documentation: README.md, docs/TESTING.md, docs/BUILD_DEPLOY.md, docs/REBASE_GUIDE.md - All HA parameters externally configurable via env vars and YAML - Docker Compose test environment with 2x signal + 2x management + Traefik + Redis - Original upstream README preserved as original_readme.md --- .env.example | 82 +++ README.md | 434 ++++++++++----- README_FORK.md | 17 + combined/cmd/config.go | 6 +- combined/cmd/root.go | 2 +- docker-compose.ha-test.yml | 481 ++++++++++++++++ docs/BUILD_DEPLOY.md | 373 +++++++++++++ docs/REBASE_GUIDE.md | 313 +++++++++++ docs/TESTING.md | 442 +++++++++++++++ go.mod | 2 +- go.sum | 4 +- management/Dockerfile | 2 +- management/cmd/management.go | 9 + .../network_map/controller/controller.go | 38 ++ .../update_channel/updatechannel.go | 43 ++ .../peers/ephemeral/manager/ephemeral.go | 132 ++++- management/internals/server/boot.go | 9 +- management/internals/server/config/config.go | 5 + management/internals/server/controllers.go | 8 +- management/internals/server/server.go | 15 + .../internals/shared/grpc/loginfilter.go | 55 +- management/internals/shared/grpc/server.go | 39 +- management/internals/shared/grpc/token_mgr.go | 169 +----- .../internals/shared/grpc/token_mgr_test.go | 103 +--- management/server/distributed/config.go | 145 +++++ 
management/server/distributed/lock.go | 99 ++++ management/server/distributed/registry.go | 84 +++ management/server/management_proto_test.go | 2 +- management/server/management_test.go | 2 + original_readme.md | 149 +++++ shared/distributed/config.go | 102 ++++ shared/distributed/redis.go | 74 +++ signal/cmd/root.go | 35 ++ signal/cmd/run.go | 22 +- signal/metrics/app.go | 38 ++ signal/server/config.go | 63 +++ signal/server/signal.go | 198 ++++++- tests/integration/Dockerfile.agent | 24 + tests/integration/Dockerfile.test | 26 + tests/integration/README.md | 122 +++++ tests/integration/config/management.json | 92 ++++ tests/integration/go.mod | 60 ++ tests/integration/go.sum | 150 +++++ tests/integration/helper_test.go | 229 ++++++++ tests/integration/management_ha_test.go | 514 ++++++++++++++++++ tests/integration/scripts/agent-setup.sh | 28 + tests/integration/scripts/build.sh | 59 ++ tests/integration/scripts/init-test-data.sh | 225 ++++++++ tests/integration/scripts/run-tests.sh | 64 +++ tests/integration/signal_ha_test.go | 468 ++++++++++++++++ 50 files changed, 5434 insertions(+), 423 deletions(-) create mode 100644 .env.example create mode 100644 README_FORK.md create mode 100644 docker-compose.ha-test.yml create mode 100644 docs/BUILD_DEPLOY.md create mode 100644 docs/REBASE_GUIDE.md create mode 100644 docs/TESTING.md create mode 100644 management/server/distributed/config.go create mode 100644 management/server/distributed/lock.go create mode 100644 management/server/distributed/registry.go create mode 100644 original_readme.md create mode 100644 shared/distributed/config.go create mode 100644 shared/distributed/redis.go create mode 100644 signal/server/config.go create mode 100644 tests/integration/Dockerfile.agent create mode 100644 tests/integration/Dockerfile.test create mode 100644 tests/integration/README.md create mode 100644 tests/integration/config/management.json create mode 100644 tests/integration/go.mod create mode 100644 
tests/integration/go.sum create mode 100644 tests/integration/helper_test.go create mode 100644 tests/integration/management_ha_test.go create mode 100755 tests/integration/scripts/agent-setup.sh create mode 100755 tests/integration/scripts/build.sh create mode 100755 tests/integration/scripts/init-test-data.sh create mode 100755 tests/integration/scripts/run-tests.sh create mode 100644 tests/integration/signal_ha_test.go diff --git a/.env.example b/.env.example new file mode 100644 index 00000000000..f82ce95c744 --- /dev/null +++ b/.env.example @@ -0,0 +1,82 @@ +# NetBird HA Test Environment Configuration +# Copy this file to .env and adjust values as needed +# NOTHING is hardcoded โ€” all values come from this file + +# --- Domain Configuration --- +NB_DOMAIN=nb-ha.local +NB_SIGNAL_DOMAIN=signal.nb-ha.local +NB_MGMT_DOMAIN=mgmt.nb-ha.local +NB_RELAY_DOMAIN=relay.nb-ha.local +NB_TURN_DOMAIN=turn.nb-ha.local +NB_DASHBOARD_DOMAIN=dashboard.nb-ha.local + +# --- Redis Configuration --- +NB_REDIS_ADDRESS=redis.nb-ha.local:6379 +NB_REDIS_PASSWORD= +NB_REDIS_DB=0 +NB_REDIS_DIAL_TIMEOUT=5s +NB_REDIS_READ_TIMEOUT=3s +NB_REDIS_WRITE_TIMEOUT=3s +NB_REDIS_POOL_SIZE=10 + +# --- PostgreSQL Configuration --- +NB_POSTGRES_HOST=postgres.nb-ha.local +NB_POSTGRES_PORT=5432 +NB_POSTGRES_USER=netbird +NB_POSTGRES_PASSWORD=netbird +NB_POSTGRES_DB=netbird +NB_POSTGRES_SSLMODE=disable + +# --- Relay Configuration --- +NB_RELAY_SECRET=netbird-relay-secret-key-change-in-production +NB_RELAY_LISTEN_ADDRESS=0.0.0.0:443 +NB_RELAY_EXPOSED_ADDRESS=relay.nb-ha.local:443 + +# --- TURN Configuration --- +NB_TURN_SECRET=netbird-turn-secret-key-change-in-production +NB_TURN_REALM=nb-ha.local +NB_TURN_PORT=3478 + +# --- Shared HA Configuration (Signal + Management) --- +NB_HA_ENABLED=true + +# --- Signal HA Configuration --- +NB_SIGNAL_REGISTRY_KEY=nb:signal:registry +NB_SIGNAL_CHANNEL_PREFIX=nb:signal:instance: +NB_SIGNAL_PEER_TTL=60s +NB_SIGNAL_HEARTBEAT_INTERVAL=30s +NB_SIGNAL_SEND_TIMEOUT=10s + 
+# --- Management HA Configuration --- +NB_MGMT_PEERS_REGISTRY_KEY=nb:mgmt:peers +NB_MGMT_ACCOUNT_CHANNEL_PREFIX=nb:mgmt:account: +NB_MGMT_LOCK_PREFIX=nb:mgmt:lock: +NB_MGMT_LOGIN_FILTER_KEY=nb:mgmt:loginfilter +NB_MGMT_EPHEMERAL_KEY=nb:mgmt:ephemeral +NB_MGMT_PEER_TTL=60s +NB_MGMT_HEARTBEAT_INTERVAL=30s +NB_MGMT_LOCK_TTL=30s + +# --- Logging --- +NB_LOG_LEVEL=debug +NB_LOG_FILE=console + +# --- Dashboard --- +NB_DASHBOARD_IMAGE=netbirdio/dashboard:latest + +# --- Ports (host mapping) --- +NB_HOST_REDIS_PORT=6379 +NB_HOST_POSTGRES_PORT=5432 +NB_HOST_MGMT1_PORT=33073 +NB_HOST_MGMT2_PORT=33074 +NB_HOST_SIGNAL1_PORT=10000 +NB_HOST_SIGNAL2_PORT=10001 +NB_HOST_RELAY_PORT=443 +NB_HOST_TURN_PORT=3478 +NB_HOST_TURN_TLS_PORT=5349 +NB_HOST_DASHBOARD_PORT=8080 +NB_HOST_MGMT1_METRICS=9091 +NB_HOST_MGMT2_METRICS=9092 +NB_HOST_SIGNAL1_METRICS=9093 +NB_HOST_SIGNAL2_METRICS=9094 +NB_HOST_RELAY_METRICS=9095 diff --git a/README.md b/README.md index dc84af2fd04..fe816a1058d 100644 --- a/README.md +++ b/README.md @@ -1,149 +1,315 @@ +# NetBird High Availability (HA) Fork + +**A horizontally-scalable, active-active fork of [netbirdio/netbird](https://github.com/netbirdio/netbird).** + +This fork adds Redis-based distributed state to enable multiple Signal and Management server instances to operate concurrently behind a load balancer. All changes are backward-compatible: when HA is disabled, the system behaves exactly like upstream NetBird. + +--- + +## Table of Contents + +1. [Architecture Overview](#architecture-overview) +2. [What Changed (File-by-File)](#what-changed-file-by-file) +3. [Technologies Used](#technologies-used) +4. [Key Design Decisions](#key-design-decisions) +5. [Configuration Reference](#configuration-reference) +6. [Quick Start](#quick-start) +7. [Integration Tests](#integration-tests) +8. [Build & Deploy](#build--deploy) +9. [Maintaining After Upstream Updates](#maintaining-after-upstream-updates) +10. 
[Project Structure](#project-structure) + +--- + +## Architecture Overview + +``` + Traefik LB (localhost:8088) + | + +--------------------------+--------------------------+ + | | | + +------v------+ +------v------+ +-----v-------+ + | signal-1 | | signal-2 | | dashboard | + | :10000 | | :10000 | | :80 | + +------+------+ +------+------+ +-------------+ + | | + +------------+-------------+ + | + +--------v---------+ + | Redis | + | nb:signal:registry | + | nb:signal:instance: | + +--------+---------+ + | + +--------v---------+ + | PostgreSQL | + | (shared state) | + +--------+---------+ + | + +------------+-------------+ + | | + +------v------+ +------v------+ + | mgmt-1 | | mgmt-2 | + | :33073 | | :33073 | + +------+------+ +------+------+ + | | + +------------+-------------+ + | + +--------v---------+ + | Redis | + | nb:mgmt:account: | + | nb:mgmt:lock: | + | nb:mgmt:ephemeral| + +------------------+ +``` + +### Components + +| Component | Role | Count | +|-----------|------|-------| +| **Traefik** | Reverse proxy & load balancer for HTTP/gRPC | 1 | +| **Signal Server** | WebRTC signaling, peer message relay | 2+ | +| **Management Server** | Peer auth, network maps, policies | 2+ | +| **Redis** | Distributed state, pub/sub, locks | 1 (or Sentinel/Cluster) | +| **PostgreSQL** | Persistent account, peer, policy data | 1 | +| **Relay** | Fallback peer relay (self-hosted) | 1 | +| **coturn** | STUN/TURN for NAT traversal | 1 | +| **Dashboard** | Web UI (Next.js via Traefik) | 1 | + +### How HA Works + +#### Signal Server HA +- Each peer is registered in Redis under `nb:signal:registry` (HSET: peerPubKey -> instanceID) +- Each signal instance subscribes to a Redis channel `nb:signal:instance:` +- When a peer sends a message to another peer, the server looks up the recipient's instance in Redis +- If the recipient is on a different instance, the message is forwarded via Redis pub/sub +- Heartbeat goroutines refresh the Redis TTL every 30 seconds +- If Redis is 
unavailable, signal degrades to local-only mode (no cross-instance routing) + +#### Management Server HA +- **Account Updates**: When a management instance changes account state, it publishes to `nb:mgmt:account:` on Redis. All instances receive the event and push updates to connected peers. +- **Distributed Locks**: Critical operations (peer registration, account creation) use Redis `SET NX EX` locks with TTL and heartbeat refresh. +- **Peer Registry**: Maps peer -> management instance in Redis Hash with TTL. +- **Login Filter**: Tracks in-progress logins in Redis Hash to prevent duplicate registration attempts. +- **Ephemeral Peers**: Uses Redis ZSET with TTL deadlines; a background goroutine polls and cleans up expired entries. +- **TURN/Relay Credentials**: Stateless credential refresh using HMAC (no in-memory timers), safe for any instance to generate. + +--- + +## What Changed (File-by-File) + +### New Files + +| File | Purpose | +|------|---------| +| `shared/distributed/config.go` | `HAConfig` struct with env var bindings for all HA services | +| `shared/distributed/redis.go` | Redis client wrapper with health checks and reconnection | +| `management/server/distributed/config.go` | `ManagementHAConfig` extending HAConfig with mgmt-specific keys | +| `management/server/distributed/lock.go` | Distributed lock implementation using `SET NX EX` + heartbeat | +| `management/server/distributed/registry.go` | Peer-to-instance registry wrapper around Redis Hash | +| `signal/server/config.go` | `SignalHAConfig` with signal-specific env vars | +| `signal/metrics/app.go` | HA-specific metrics (cross-instance forwards, Redis errors) | +| `.env.example` | All configuration values externalized | +| `docker-compose.ha-test.yml` | Full test stack with Traefik, 2x signal, 2x mgmt, agents | +| `tests/integration/**` | 14 integration tests + helper utilities | + +### Modified Files (Signal Server) + +| File | Change | +|------|--------| +| `signal/server/signal.go` | Added 
Redis registry, cross-instance pub/sub forwarding, heartbeat goroutines, graceful degradation when Redis unavailable | +| `signal/cmd/run.go` | Parse HA CLI flags (`--ha-enabled`, `--ha-redis-address`) | +| `signal/cmd/root.go` | Wire HA config into signal server initialization | +| `signal/metrics/app.go` | Added cross-instance forward count, Redis error count, registry hit/miss metrics | + +### Modified Files (Management Server) + +| File | Change | +|------|--------| +| `management/internals/shared/grpc/server.go` | Added distributed peer locks (`NoopLock` fallback when HA disabled) | +| `management/internals/shared/grpc/loginfilter.go` | Redis Hash + TTL for in-progress login tracking | +| `management/internals/shared/grpc/token_mgr.go` | Stateless TURN/Relay credential refresh (removed in-memory timers) | +| `management/internals/modules/peers/ephemeral/manager/ephemeral.go` | Redis ZSET for ephemeral peer deadlines with polling cleanup | +| `management/internals/controllers/network_map/update_channel/updatechannel.go` | Account update pub/sub via Redis | +| `management/internals/controllers/network_map/controller/controller.go` | Broadcast account updates to all connected peers | +| `management/internals/server/server.go` | Wire Redis client into boot sequence | +| `management/internals/server/boot.go` | Initialize Redis client and HA components | +| `management/internals/server/controllers.go` | Pass Redis client to controllers | +| `management/internals/server/config/config.go` | Added `HAConfig` field | +| `management/cmd/management.go` | Parse HA flags from env vars | + +### Modified Files (Combined Mode) + +| File | Change | +|------|--------| +| `combined/cmd/root.go` | Pass HA config when running in combined mode | +| `combined/cmd/config.go` | Wire HA config into combined server | + +### Modified Files (Test Environment) + +| File | Change | +|------|--------| +| `management/Dockerfile` | Added `wget` for healthchecks | +| 
`tests/integration/config/management.json` | Self-hosted config with embedded IdP, STUN/TURN, relay | +| `tests/integration/Dockerfile.test` | Full project copy + Docker CLI for container stop/start tests | +| `tests/integration/Dockerfile.agent` | NetBird agent image for peer connectivity tests | + +--- + +## Technologies Used + +| Technology | Version | Purpose | +|------------|---------|---------| +| Go | 1.25.5 | Primary language | +| Redis | 7.x (via Docker) | Distributed state, pub/sub, locks | +| PostgreSQL | 15+ (via Docker) | Persistent data store | +| go-redis/v9 | 9.7.3 | Redis client library | +| WireGuard | kernel module | VPN tunneling | +| gRPC | 1.80.0 | Signal/Management RPC | +| Traefik | v3.6 | Reverse proxy / load balancer | +| Docker & Docker Compose | 29.x | Container orchestration | +| coturn | latest | STUN/TURN server | +| Next.js | latest (dashboard) | Web UI | + +--- + +## Key Design Decisions + +1. **Redis-first approach**: Local memory is a cache; Redis is the source of truth for cross-instance routing. +2. **Backward compatibility**: When `NB_HA_ENABLED=false` (or unset), the system uses `NoopLock` and nil Redis checks -- behavior is identical to upstream. +3. **Env var auto-mapping**: Signal CLI flags are automatically populated from env vars via `setFlagsFromEnvVars()`. +4. **Zero hardcoded values**: All URLs, endpoints, secrets are configurable via `.env` file. +5. **Instance ID auto-detection**: Falls back from config -> env var -> hostname -> UUID. +6. **Graceful degradation**: If Redis is unavailable, Signal continues in local-only mode; Management uses nil checks to skip HA features. +7. **Traefik for same-origin**: Dashboard and embedded IdP are served on the same origin (`localhost:8088`) to avoid CORS issues. +8. **Self-hosted everything**: No external dependencies -- STUN, TURN, relay, signal, management, dashboard all run in Docker. 
+ +--- + +## Configuration Reference + +All configuration is in `.env` (copy from `.env.example`): -
-
-
-

- -

-

- - - - - - -
- - - - - - -
- - - -

-
- - -

- - Start using NetBird at netbird.io -
- See Documentation -
- Join our Slack channel or our Community forum -
- -
-
- - ๐Ÿš€ We are hiring! Join us at careers.netbird.io - -
-
- - New: NetBird terraform provider - -

- -
- -**NetBird combines a configuration-free peer-to-peer private network and a centralized access control system in a single platform, making it easy to create secure private networks for your organization or home.** - -**Connect.** NetBird creates a WireGuard-based overlay network that automatically connects your machines over an encrypted tunnel, leaving behind the hassle of opening ports, complex firewall rules, VPN gateways, and so forth. - -**Secure.** NetBird enables secure remote access by applying granular access policies while allowing you to manage them intuitively from a single place. Works universally on any infrastructure. - -### Open Source Network Security in a Single Platform - -https://github.com/user-attachments/assets/10cec749-bb56-4ab3-97af-4e38850108d2 - -### Self-Host NetBird (Video) -[![Watch the video](https://img.youtube.com/vi/bZAgpT6nzaQ/0.jpg)](https://youtu.be/bZAgpT6nzaQ) - -### Key features - -| Connectivity | Management | Security | Automation| Platforms | -|----|----|----|----|----| -|
  • - \[x] Kernel WireGuard
|
  • - \[x] [Admin Web UI](https://github.com/netbirdio/dashboard)
|
  • - \[x] [SSO & MFA support](https://docs.netbird.io/how-to/installation#running-net-bird-with-sso-login)
|
  • - \[x] [Public API](https://docs.netbird.io/api)
|
  • - \[x] Linux
| -|
  • - \[x] Peer-to-peer connections
|
  • - \[x] Auto peer discovery and configuration
  • |
    • - \[x] [Access control - groups & rules](https://docs.netbird.io/how-to/manage-network-access)
    • |
      • - \[x] [Setup keys for bulk network provisioning](https://docs.netbird.io/how-to/register-machines-using-setup-keys)
      • |
        • - \[x] Mac
        • | -|
          • - \[x] Connection relay fallback
          • |
            • - \[x] [IdP integrations](https://docs.netbird.io/selfhosted/identity-providers)
            • |
              • - \[x] [Activity logging](https://docs.netbird.io/how-to/audit-events-logging)
              • |
                • - \[x] [Self-hosting quickstart script](https://docs.netbird.io/selfhosted/selfhosted-quickstart)
                • |
                  • - \[x] Windows
                  • | -|
                    • - \[x] [Routes to external networks](https://docs.netbird.io/how-to/routing-traffic-to-private-networks)
                    • |
                      • - \[x] [Private DNS](https://docs.netbird.io/how-to/manage-dns-in-your-network)
                      • |
                        • - \[x] [Device posture checks](https://docs.netbird.io/how-to/manage-posture-checks)
                        • |
                          • - \[x] IdP groups sync with JWT
                          • |
                            • - \[x] Android
                            • | -|
                              • - \[x] NAT traversal with BPF
                              • |
                                • - \[x] [Multiuser support](https://docs.netbird.io/how-to/add-users-to-your-network)
                                • |
                                  • - \[x] Peer-to-peer encryption
                                  • ||
                                    • - \[x] iOS
                                    • | -|||
                                      • - \[x] [Quantum-resistance with Rosenpass](https://netbird.io/knowledge-hub/the-first-quantum-resistant-mesh-vpn)
                                      • ||
                                        • - \[x] OpenWRT
                                        • | -|||
                                          • - \[x] [Periodic re-authentication](https://docs.netbird.io/how-to/enforce-periodic-user-authentication)
                                          • ||
                                            • - \[x] [Serverless](https://docs.netbird.io/how-to/netbird-on-faas)
                                            • | -|||||
                                              • - \[x] Docker
                                              • | - -### Quickstart with NetBird Cloud - -- Download and install NetBird at [https://app.netbird.io/install](https://app.netbird.io/install) -- Follow the steps to sign-up with Google, Microsoft, GitHub or your email address. -- Check NetBird [admin UI](https://app.netbird.io/). -- Add more machines. - -### Quickstart with self-hosted NetBird - -> This is the quickest way to try self-hosted NetBird. It should take around 5 minutes to get started if you already have a public domain and a VM. -Follow the [Advanced guide with a custom identity provider](https://docs.netbird.io/selfhosted/selfhosted-guide#advanced-guide-with-a-custom-identity-provider) for installations with different IDPs. - -**Infrastructure requirements:** -- A Linux VM with at least **1CPU** and **2GB** of memory. -- The VM should be publicly accessible on TCP ports **80** and **443** and UDP port: **3478**. -- **Public domain** name pointing to the VM. - -**Software requirements:** -- Docker installed on the VM with the docker-compose plugin ([Docker installation guide](https://docs.docker.com/engine/install/)) or docker with docker-compose in version 2 or higher. -- [jq](https://jqlang.github.io/jq/) installed. In most distributions - Usually available in the official repositories and can be installed with `sudo apt install jq` or `sudo yum install jq` -- [curl](https://curl.se/) installed. 
- Usually available in the official repositories and can be installed with `sudo apt install curl` or `sudo yum install curl` - -**Steps** -- Download and run the installation script: ```bash -export NETBIRD_DOMAIN=netbird.example.com; curl -fsSL https://github.com/netbirdio/netbird/releases/latest/download/getting-started.sh | bash +# Enable HA +NB_HA_ENABLED=true + +# Redis +NB_REDIS_ADDRESS=redis.nb-ha.local:6379 + +# Signal HA +NB_SIGNAL_REGISTRY_KEY=nb:signal:registry +NB_SIGNAL_CHANNEL_PREFIX=nb:signal:instance: +NB_SIGNAL_PEER_TTL=60s +NB_SIGNAL_HEARTBEAT_INTERVAL=30s + +# Management HA +NB_MGMT_PEERS_REGISTRY_KEY=nb:mgmt:peers +NB_MGMT_ACCOUNT_CHANNEL_PREFIX=nb:mgmt:account: +NB_MGMT_LOCK_PREFIX=nb:mgmt:lock: +NB_MGMT_LOGIN_FILTER_KEY=nb:mgmt:loginfilter +NB_MGMT_EPHEMERAL_KEY=nb:mgmt:ephemeral +NB_MGMT_PEER_TTL=60s +NB_MGMT_HEARTBEAT_INTERVAL=30s +NB_MGMT_LOCK_TTL=30s ``` -- Once finished, you can manage the resources via `docker-compose` -### A bit on NetBird internals -- Every machine in the network runs [NetBird Agent (or Client)](client/) that manages WireGuard. -- Every agent connects to [Management Service](management/) that holds network state, manages peer IPs, and distributes network updates to agents (peers). -- NetBird agent uses WebRTC ICE implemented in [pion/ice library](https://github.com/pion/ice) to discover connection candidates when establishing a peer-to-peer connection between machines. -- Connection candidates are discovered with the help of [STUN](https://en.wikipedia.org/wiki/STUN) servers. -- Agents negotiate a connection through [Signal Service](signal/) passing p2p encrypted messages with candidates. -- Sometimes the NAT traversal is unsuccessful due to strict NATs (e.g. mobile carrier-grade NAT) and a p2p connection isn't possible. 
When this occurs the system falls back to a relay server called [TURN](https://en.wikipedia.org/wiki/Traversal_Using_Relays_around_NAT), and a secure WireGuard tunnel is established via the TURN server. - -[Coturn](https://github.com/coturn/coturn) is the one that has been successfully used for STUN and TURN in NetBird setups. +--- -

                                                - -

                                                +## Quick Start -See a complete [architecture overview](https://docs.netbird.io/about-netbird/how-netbird-works#architecture) for details. +```bash +# 1. Clone and checkout HA branch +git clone https://github.com/netbirdio/netbird.git netbird_ha +cd netbird_ha +git checkout ha/main + +# 2. Copy env file +cp .env.example .env -### Community projects -- [NetBird installer script](https://github.com/physk/netbird-installer) -- [NetBird ansible collection by Dominion Solutions](https://galaxy.ansible.com/ui/repo/published/dominion_solutions/netbird/) -- [netbird-tui](https://github.com/n0pashkov/netbird-tui) โ€” terminal UI for managing NetBird peers, routes, and settings +# 3. Build binaries +CGO_ENABLED=1 go build -o netbird-mgmt ./management/ +CGO_ENABLED=1 go build -o netbird-signal ./signal/ +CGO_ENABLED=1 go build -o netbird-server ./combined/ +CGO_ENABLED=1 go build -o netbird ./client/ +CGO_ENABLED=1 go build -o netbird-relay ./relay/ -**Note**: The `main` branch may be in an *unstable or even broken state* during development. -For stable versions, see [releases](https://github.com/netbirdio/netbird/releases). +# 4. Build Docker images +docker compose -f docker-compose.ha-test.yml build -### Support acknowledgement +# 5. Start the stack +docker compose -f docker-compose.ha-test.yml up -d -In November 2022, NetBird joined the [StartUpSecure program](https://www.forschung-it-sicherheit-kommunikationssysteme.de/foerderung/bekanntmachungen/startup-secure) sponsored by The Federal Ministry of Education and Research of The Federal Republic of Germany. Together with [CISPA Helmholtz Center for Information Security](https://cispa.de/en) NetBird brings the security best practices and simplicity to private networking. +# 6. 
Verify health +curl http://localhost:8088/api/users/current +curl http://localhost:9091/metrics # mgmt-1 metrics +curl http://localhost:9093/metrics # signal-1 metrics -![CISPA_Logo_BLACK_EN_RZ_RGB (1)](https://user-images.githubusercontent.com/700848/203091324-c6d311a0-22b5-4b05-a288-91cbc6cdcc46.png) +# 7. Run integration tests +cd tests/integration +go test -v -count=1 -timeout 300s +``` -### Testimonials -We use open-source technologies like [WireGuardยฎ](https://www.wireguard.com/), [Pion ICE (WebRTC)](https://github.com/pion/ice), [Coturn](https://github.com/coturn/coturn), and [Rosenpass](https://rosenpass.eu). We very much appreciate the work these guys are doing and we'd greatly appreciate if you could support them in any way (e.g., by giving a star or a contribution). +--- + +## Integration Tests + +See [docs/TESTING.md](docs/TESTING.md) for the full test suite documentation. + +--- + +## Build & Deploy + +See [docs/BUILD_DEPLOY.md](docs/BUILD_DEPLOY.md) for detailed build and deployment instructions. + +--- + +## Maintaining After Upstream Updates + +See [docs/REBASE_GUIDE.md](docs/REBASE_GUIDE.md) for step-by-step rebase instructions. 
+ +--- + +## Project Structure + +``` +netbird_ha/ +├── .env.example # All configuration +├── docker-compose.ha-test.yml # Full test stack +├── README.md # This file +├── README_FORK.md # Fork summary +├── combined/ # Combined signal+mgmt binary +├── client/ # NetBird client (unchanged) +├── encryption/ # WireGuard encryption utils +├── idp/ # Identity provider (Dex embedded) +├── management/ # Management server +│ ├── cmd/management.go # HA flag parsing +│ ├── internals/ +│ │ ├── controllers/network_map/ # Account update broadcast +│ │ ├── modules/peers/ephemeral/ # Ephemeral peer cleanup +│ │ ├── server/ # Boot + config +│ │ └── shared/grpc/ # Locks, login filter, tokens +│ └── server/distributed/ # Lock, registry, config +├── relay/ # Relay server (unchanged) +├── shared/ +│ ├── distributed/ # HAConfig, Redis client +│ ├── management/proto/ # gRPC protobufs +│ └── signal/proto/ # gRPC protobufs +├── signal/ # Signal server +│ ├── cmd/ # HA flag parsing +│ ├── metrics/app.go # HA metrics +│ └── server/ # Registry, pub/sub +└── tests/integration/ # 14 integration tests + ├── config/management.json # Self-hosted mgmt config + ├── Dockerfile.agent # Test peer image + ├── Dockerfile.test # Test runner image + ├── helper_test.go # Redis, gRPC, Docker helpers + ├── management_ha_test.go # 7 management tests + └── signal_ha_test.go # 7 signal tests +``` 
+--- -_WireGuard_ and the _WireGuard_ logo are [registered trademarks](https://www.wireguard.com/trademark-policy/) of Jason A. Donenfeld. - +## License +Same as upstream NetBird. See upstream repository for license details. diff --git a/README_FORK.md b/README_FORK.md new file mode 100644 index 00000000000..e421c761279 --- /dev/null +++ b/README_FORK.md @@ -0,0 +1,17 @@ +# NetBird HA Fork + +**โš ๏ธ This is a fork of [netbirdio/netbird](https://github.com/netbirdio/netbird) with added horizontal scaling support.** + +## Changes from Upstream + +- **Signal Server**: Active-active scaling via Redis distributed registry and pub/sub +- **Management Server**: Active-active scaling via Redis update broadcast and distributed locks +- **Configuration**: All HA parameters externally configurable (env vars + YAML) + +## Rebase Strategy + +See [docs/REBASE_GUIDE.md](docs/REBASE_GUIDE.md) for per-file conflict guidance and step-by-step rebase instructions. + +## Original README + +See [original_readme.md](original_readme.md) for the upstream NetBird project documentation. 
diff --git a/combined/cmd/config.go b/combined/cmd/config.go index ce4df839472..a8ed85e8920 100644 --- a/combined/cmd/config.go +++ b/combined/cmd/config.go @@ -16,6 +16,7 @@ import ( "github.com/netbirdio/netbird/management/server/idp" "github.com/netbirdio/netbird/management/server/types" + "github.com/netbirdio/netbird/signal/server" "github.com/netbirdio/netbird/util" "github.com/netbirdio/netbird/util/crypt" @@ -110,8 +111,9 @@ type StunConfig struct { // SignalConfig contains signal service settings type SignalConfig struct { - Enabled bool `yaml:"enabled"` - LogLevel string `yaml:"logLevel"` + Enabled bool `yaml:"enabled"` + LogLevel string `yaml:"logLevel"` + HA server.SignalHAConfig `yaml:"ha"` } // ManagementConfig contains management service settings diff --git a/combined/cmd/root.go b/combined/cmd/root.go index db986b4d43c..d3d72c63718 100644 --- a/combined/cmd/root.go +++ b/combined/cmd/root.go @@ -297,7 +297,7 @@ func (s *serverInstances) createSignalServer(ctx context.Context, cfg *CombinedC } var err error - s.signalSrv, err = signalServer.NewServer(ctx, s.metricsServer.Meter, "signal_") + s.signalSrv, err = signalServer.NewServer(ctx, s.metricsServer.Meter, &cfg.Signal.HA, "signal_") if err != nil { cleanupSTUNListeners(s.stunListeners) return fmt.Errorf("failed to create signal server: %w", err) diff --git a/docker-compose.ha-test.yml b/docker-compose.ha-test.yml new file mode 100644 index 00000000000..fb71e7afe66 --- /dev/null +++ b/docker-compose.ha-test.yml @@ -0,0 +1,481 @@ +# NetBird HA Test Environment +# Full-stack Docker Compose for testing active-active horizontal scaling +# ALL configuration comes from .env file โ€” nothing is hardcoded +# +# Usage: +# cp .env.example .env +# # Edit .env with your desired values +# docker compose -f docker-compose.ha-test.yml up --build + +services: + # --- Infrastructure --- + redis: + image: redis:7-alpine + container_name: nb-redis + hostname: redis.${NB_DOMAIN} + ports: + - 
"${NB_HOST_REDIS_PORT}:6379" + volumes: + - redis-data:/data + command: redis-server --appendonly yes + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 5 + networks: + - nb-control + + postgres: + image: postgres:15-alpine + container_name: nb-postgres + hostname: postgres.${NB_DOMAIN} + environment: + POSTGRES_USER: ${NB_POSTGRES_USER} + POSTGRES_PASSWORD: ${NB_POSTGRES_PASSWORD} + POSTGRES_DB: ${NB_POSTGRES_DB} + ports: + - "${NB_HOST_POSTGRES_PORT}:5432" + volumes: + - postgres-data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U ${NB_POSTGRES_USER} -d ${NB_POSTGRES_DB}"] + interval: 5s + timeout: 3s + retries: 5 + networks: + - nb-control + + # --- STUN/TURN Server (self-hosted) --- + coturn: + image: coturn/coturn:4.6 + container_name: nb-coturn + hostname: turn.${NB_DOMAIN} + environment: + TURN_SERVER_NAME: "netbird-turn" + TURN_REALM: ${NB_TURN_REALM} + TURN_SECRET: ${NB_TURN_SECRET} + ports: + - "${NB_HOST_TURN_PORT}:3478" + - "${NB_HOST_TURN_PORT}:3478/udp" + - "${NB_HOST_TURN_TLS_PORT}:5349" + - "${NB_HOST_TURN_TLS_PORT}:5349/udp" + command: > + -n + --log-file=stdout + --realm=${NB_TURN_REALM} + --listening-port=${NB_TURN_PORT} + --tls-listening-port=5349 + --min-port=49152 + --max-port=65535 + --fingerprint + --lt-cred-mech + --user=netbird:${NB_TURN_SECRET} + --stun-only + --no-cli + --no-tlsv1 + --no-tlsv1_1 + networks: + - nb-control + - nb-public + + # --- Relay Server (self-hosted) --- + relay: + build: + context: . 
+ dockerfile: relay/Dockerfile + container_name: nb-relay + hostname: relay.${NB_DOMAIN} + environment: + NB_LOG_LEVEL: ${NB_LOG_LEVEL} + NB_LOG_FILE: ${NB_LOG_FILE} + NB_AUTH_SECRET: ${NB_RELAY_SECRET} + NB_LISTEN_ADDRESS: ${NB_RELAY_LISTEN_ADDRESS} + NB_EXPOSED_ADDRESS: ${NB_RELAY_EXPOSED_ADDRESS} + NB_METRICS_PORT: "9090" + ports: + - "${NB_HOST_RELAY_PORT}:443" + - "${NB_HOST_RELAY_METRICS}:9090" + command: ["--log-file", "console"] + networks: + - nb-control + - nb-public + healthcheck: + test: ["CMD", "wget", "-qO-", "http://localhost:9090/health"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 15s + + # --- Signal Servers (Active-Active HA) --- + signal-1: + build: + context: . + dockerfile: signal/Dockerfile + container_name: nb-signal-1 + hostname: signal-1.${NB_DOMAIN} + environment: + NB_LOG_LEVEL: ${NB_LOG_LEVEL} + NB_LOG_FILE: ${NB_LOG_FILE} + NB_HA_ENABLED: ${NB_HA_ENABLED} + NB_HA_REDIS_ADDRESS: ${NB_REDIS_ADDRESS} + NB_HA_INSTANCE_ID: signal-1 + NB_SIGNAL_REGISTRY_KEY: ${NB_SIGNAL_REGISTRY_KEY} + NB_SIGNAL_CHANNEL_PREFIX: ${NB_SIGNAL_CHANNEL_PREFIX} + NB_SIGNAL_PEER_TTL: ${NB_SIGNAL_PEER_TTL} + NB_SIGNAL_HEARTBEAT_INTERVAL: ${NB_SIGNAL_HEARTBEAT_INTERVAL} + NB_SIGNAL_SEND_TIMEOUT: ${NB_SIGNAL_SEND_TIMEOUT} + NB_METRICS_PORT: "9090" + ports: + - "${NB_HOST_SIGNAL1_PORT}:10000" + - "${NB_HOST_SIGNAL1_METRICS}:9090" + command: ["--port", "10000", "--log-file", "console"] + depends_on: + redis: + condition: service_healthy + networks: + - nb-control + - nb-public + labels: + - traefik.enable=true + - traefik.http.routers.netbird-signal.rule=PathPrefix(`/signalexchange.SignalExchange/`) + - traefik.http.routers.netbird-signal.entrypoints=web + - traefik.http.routers.netbird-signal.priority=100 + - traefik.http.routers.netbird-signal.service=signal-h2c + - traefik.http.services.signal-h2c.loadbalancer.server.port=10000 + - traefik.http.services.signal-h2c.loadbalancer.server.scheme=h2c + healthcheck: + test: ["CMD", "wget", "-qO-", 
"http://localhost:9090/metrics"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 15s + + signal-2: + build: + context: . + dockerfile: signal/Dockerfile + container_name: nb-signal-2 + hostname: signal-2.${NB_DOMAIN} + environment: + NB_LOG_LEVEL: ${NB_LOG_LEVEL} + NB_LOG_FILE: ${NB_LOG_FILE} + NB_HA_ENABLED: ${NB_HA_ENABLED} + NB_HA_REDIS_ADDRESS: ${NB_REDIS_ADDRESS} + NB_HA_INSTANCE_ID: signal-2 + NB_SIGNAL_REGISTRY_KEY: ${NB_SIGNAL_REGISTRY_KEY} + NB_SIGNAL_CHANNEL_PREFIX: ${NB_SIGNAL_CHANNEL_PREFIX} + NB_SIGNAL_PEER_TTL: ${NB_SIGNAL_PEER_TTL} + NB_SIGNAL_HEARTBEAT_INTERVAL: ${NB_SIGNAL_HEARTBEAT_INTERVAL} + NB_SIGNAL_SEND_TIMEOUT: ${NB_SIGNAL_SEND_TIMEOUT} + NB_METRICS_PORT: "9090" + ports: + - "${NB_HOST_SIGNAL2_PORT}:10000" + - "${NB_HOST_SIGNAL2_METRICS}:9090" + command: ["--port", "10000", "--log-file", "console"] + depends_on: + redis: + condition: service_healthy + networks: + - nb-control + - nb-public + labels: + - traefik.enable=true + - traefik.http.routers.netbird-signal-2.rule=PathPrefix(`/signalexchange.SignalExchange/`) + - traefik.http.routers.netbird-signal-2.entrypoints=web + - traefik.http.routers.netbird-signal-2.priority=100 + - traefik.http.routers.netbird-signal-2.service=signal-h2c + - traefik.http.services.signal-h2c.loadbalancer.server.port=10000 + - traefik.http.services.signal-h2c.loadbalancer.server.scheme=h2c + healthcheck: + test: ["CMD", "wget", "-qO-", "http://localhost:9090/metrics"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 15s + + # --- Management Servers (Active-Active HA) --- + mgmt-1: + build: + context: . 
+ dockerfile: management/Dockerfile + container_name: nb-mgmt-1 + hostname: mgmt-1.${NB_DOMAIN} + environment: + NB_LOG_LEVEL: ${NB_LOG_LEVEL} + NB_LOG_FILE: ${NB_LOG_FILE} + NB_STORE_ENGINE: postgres + NB_STORE_ENGINE_POSTGRES_DSN: postgres://${NB_POSTGRES_USER}:${NB_POSTGRES_PASSWORD}@${NB_POSTGRES_HOST}:${NB_POSTGRES_PORT}/${NB_POSTGRES_DB}?sslmode=${NB_POSTGRES_SSLMODE} + NB_HA_ENABLED: ${NB_HA_ENABLED} + NB_HA_REDIS_ADDRESS: ${NB_REDIS_ADDRESS} + NB_HA_INSTANCE_ID: mgmt-1 + NB_MGMT_PEERS_REGISTRY_KEY: ${NB_MGMT_PEERS_REGISTRY_KEY} + NB_MGMT_ACCOUNT_CHANNEL_PREFIX: ${NB_MGMT_ACCOUNT_CHANNEL_PREFIX} + NB_MGMT_LOCK_PREFIX: ${NB_MGMT_LOCK_PREFIX} + NB_MGMT_LOGIN_FILTER_KEY: ${NB_MGMT_LOGIN_FILTER_KEY} + NB_MGMT_EPHEMERAL_KEY: ${NB_MGMT_EPHEMERAL_KEY} + NB_MGMT_PEER_TTL: ${NB_MGMT_PEER_TTL} + NB_MGMT_HEARTBEAT_INTERVAL: ${NB_MGMT_HEARTBEAT_INTERVAL} + NB_MGMT_LOCK_TTL: ${NB_MGMT_LOCK_TTL} + NB_MGMT_METRICS_PORT: "9090" + ports: + - "${NB_HOST_MGMT1_PORT}:33073" + - "${NB_HOST_MGMT1_METRICS}:9090" + volumes: + - ./tests/integration/config/management.json:/etc/netbird/management.json + command: ["--log-file", "console", "--port", "33073", "--config", "/etc/netbird/management.json"] + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + relay: + condition: service_healthy + networks: + - nb-control + - nb-public + labels: + - traefik.enable=true + - traefik.http.routers.netbird-backend.rule=PathPrefix(`/api`) || PathPrefix(`/oauth2`) + - traefik.http.routers.netbird-backend.entrypoints=web + - traefik.http.routers.netbird-backend.priority=100 + - traefik.http.routers.netbird-backend.service=mgmt-api + - traefik.http.routers.netbird-grpc.rule=PathPrefix(`/management.ManagementService/`) || PathPrefix(`/management.ProxyService/`) + - traefik.http.routers.netbird-grpc.entrypoints=web + - traefik.http.routers.netbird-grpc.priority=100 + - traefik.http.routers.netbird-grpc.service=mgmt-grpc + - 
traefik.http.services.mgmt-api.loadbalancer.server.port=33073 + - traefik.http.services.mgmt-grpc.loadbalancer.server.port=33073 + - traefik.http.services.mgmt-grpc.loadbalancer.server.scheme=h2c + healthcheck: + test: ["CMD", "wget", "-qO-", "http://localhost:9090/metrics"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 30s + + mgmt-2: + build: + context: . + dockerfile: management/Dockerfile + container_name: nb-mgmt-2 + hostname: mgmt-2.${NB_DOMAIN} + environment: + NB_LOG_LEVEL: ${NB_LOG_LEVEL} + NB_LOG_FILE: ${NB_LOG_FILE} + NB_STORE_ENGINE: postgres + NB_STORE_ENGINE_POSTGRES_DSN: postgres://${NB_POSTGRES_USER}:${NB_POSTGRES_PASSWORD}@${NB_POSTGRES_HOST}:${NB_POSTGRES_PORT}/${NB_POSTGRES_DB}?sslmode=${NB_POSTGRES_SSLMODE} + NB_HA_ENABLED: ${NB_HA_ENABLED} + NB_HA_REDIS_ADDRESS: ${NB_REDIS_ADDRESS} + NB_HA_INSTANCE_ID: mgmt-2 + NB_MGMT_PEERS_REGISTRY_KEY: ${NB_MGMT_PEERS_REGISTRY_KEY} + NB_MGMT_ACCOUNT_CHANNEL_PREFIX: ${NB_MGMT_ACCOUNT_CHANNEL_PREFIX} + NB_MGMT_LOCK_PREFIX: ${NB_MGMT_LOCK_PREFIX} + NB_MGMT_LOGIN_FILTER_KEY: ${NB_MGMT_LOGIN_FILTER_KEY} + NB_MGMT_EPHEMERAL_KEY: ${NB_MGMT_EPHEMERAL_KEY} + NB_MGMT_PEER_TTL: ${NB_MGMT_PEER_TTL} + NB_MGMT_HEARTBEAT_INTERVAL: ${NB_MGMT_HEARTBEAT_INTERVAL} + NB_MGMT_LOCK_TTL: ${NB_MGMT_LOCK_TTL} + NB_MGMT_METRICS_PORT: "9090" + ports: + - "${NB_HOST_MGMT2_PORT}:33073" + - "${NB_HOST_MGMT2_METRICS}:9090" + volumes: + - ./tests/integration/config/management.json:/etc/netbird/management.json + command: ["--log-file", "console", "--port", "33073", "--config", "/etc/netbird/management.json"] + depends_on: + postgres: + condition: service_healthy + redis: + condition: service_healthy + relay: + condition: service_healthy + networks: + - nb-control + - nb-public + labels: + - traefik.enable=true + - traefik.http.routers.netbird-backend-2.rule=PathPrefix(`/api`) || PathPrefix(`/oauth2`) + - traefik.http.routers.netbird-backend-2.entrypoints=web + - traefik.http.routers.netbird-backend-2.priority=100 + - 
traefik.http.routers.netbird-backend-2.service=mgmt-api + - traefik.http.routers.netbird-grpc-2.rule=PathPrefix(`/management.ManagementService/`) || PathPrefix(`/management.ProxyService/`) + - traefik.http.routers.netbird-grpc-2.entrypoints=web + - traefik.http.routers.netbird-grpc-2.priority=100 + - traefik.http.routers.netbird-grpc-2.service=mgmt-grpc + - traefik.http.services.mgmt-api.loadbalancer.server.port=33073 + - traefik.http.services.mgmt-grpc.loadbalancer.server.port=33073 + - traefik.http.services.mgmt-grpc.loadbalancer.server.scheme=h2c + healthcheck: + test: ["CMD", "wget", "-qO-", "http://localhost:9090/metrics"] + interval: 10s + timeout: 5s + retries: 10 + start_period: 30s + + # --- Traefik Reverse Proxy --- + traefik: + image: traefik:v3.6 + container_name: nb-traefik + hostname: traefik.${NB_DOMAIN} + restart: unless-stopped + command: + - "--log.level=INFO" + - "--accesslog=true" + - "--api.insecure=true" + - "--providers.docker=true" + - "--providers.docker.exposedbydefault=false" + - "--providers.docker.network=netbird_ha_nb-public" + - "--entrypoints.web.address=:80" + - "--entrypoints.websecure.address=:443" + - "--entrypoints.websecure.transport.respondingTimeouts.readTimeout=0" + - "--entrypoints.websecure.transport.respondingTimeouts.writeTimeout=0" + - "--entrypoints.websecure.transport.respondingTimeouts.idleTimeout=0" + - "--serverstransport.forwardingtimeouts.responseheadertimeout=0s" + - "--serverstransport.forwardingtimeouts.idleconntimeout=0s" + ports: + - "8088:80" + - "8443:443" + - "8089:8080" + volumes: + - /var/run/docker.sock:/var/run/docker.sock:ro + labels: + - traefik.enable=true + - traefik.http.routers.traefik-api.rule=PathPrefix(`/api`) + - traefik.http.routers.traefik-api.entrypoints=web + - traefik.http.routers.traefik-api.priority=100 + - traefik.http.routers.traefik-api.service=api@internal + networks: + - nb-public + depends_on: + - mgmt-1 + - signal-1 + - dashboard + + dashboard: + image: 
netbirdio/dashboard:latest + container_name: nb-dashboard + hostname: dashboard.${NB_DOMAIN} + environment: + NETBIRD_MGMT_API_ENDPOINT: http://localhost:8088 + NETBIRD_MGMT_GRPC_API_ENDPOINT: http://localhost:8088 + NETBIRD_SIGNAL_GRPC_API_ENDPOINT: http://localhost:8088 + AUTH_AUTHORITY: http://localhost:8088/oauth2 + AUTH_CLIENT_ID: netbird-dashboard + AUTH_AUDIENCE: netbird-dashboard + AUTH_SUPPORTED_SCOPES: openid profile email groups + USE_AUTH0: "false" + AUTH_REDIRECT_URI: /nb-auth + AUTH_SILENT_REDIRECT_URI: /nb-silent-auth + NGINX_SSL_PORT: 443 + LETSENCRYPT_DOMAIN: none + labels: + - traefik.enable=true + - traefik.http.routers.netbird-dashboard.rule=PathPrefix(`/`) + - traefik.http.routers.netbird-dashboard.entrypoints=web + - traefik.http.routers.netbird-dashboard.priority=1 + - traefik.http.services.dashboard.loadbalancer.server.port=80 + # NOTE: duplicate 'traefik-api' router labels removed from this service. + # The api@internal router was already defined on the traefik service, and a + # PathPrefix(`/api`) router at priority 100 collides with netbird-backend. + # The Traefik API stays reachable via api.insecure on host port 8089. + networks: + - nb-public + + # --- Test Agents --- + agent-a: + build: + context: . + dockerfile: tests/integration/Dockerfile.agent + container_name: nb-agent-a + hostname: agent-a.${NB_DOMAIN} + environment: + NB_SETUP_KEY: "E2808C99-E7FA-4841-845E-07CE633E50A1" + NB_MANAGEMENT_URL: http://mgmt-1.nb-ha.local:33073 + cap_add: + - NET_ADMIN + sysctls: + - net.ipv4.ip_forward=1 + - net.ipv6.conf.all.forwarding=1 + networks: + - nb-agent-a + - nb-public + command: ["sleep", "infinity"] + + agent-b: + build: + context: . 
+ dockerfile: tests/integration/Dockerfile.agent + container_name: nb-agent-b + hostname: agent-b.${NB_DOMAIN} + environment: + NB_SETUP_KEY: "E2808C99-E7FA-4841-845E-07CE633E50A1" + NB_MANAGEMENT_URL: http://mgmt-2.nb-ha.local:33073 + cap_add: + - NET_ADMIN + sysctls: + - net.ipv4.ip_forward=1 + - net.ipv6.conf.all.forwarding=1 + networks: + - nb-agent-b + - nb-public + command: ["sleep", "infinity"] + + # --- Test Runner --- + test-runner: + build: + context: . + dockerfile: tests/integration/Dockerfile.test + container_name: nb-test-runner + environment: + NB_DOMAIN: ${NB_DOMAIN} + REDIS_ADDR: ${NB_REDIS_ADDRESS} + MGMT1_ADDR: mgmt-1.${NB_DOMAIN}:33073 + MGMT2_ADDR: mgmt-2.${NB_DOMAIN}:33073 + MGMT_TRAEFIK_ADDR: traefik.${NB_DOMAIN}:80 + SIGNAL1_ADDR: signal-1.${NB_DOMAIN}:10000 + SIGNAL2_ADDR: signal-2.${NB_DOMAIN}:10000 + SIGNAL_TRAEFIK_ADDR: traefik.${NB_DOMAIN}:80 + POSTGRES_DSN: postgres://${NB_POSTGRES_USER}:${NB_POSTGRES_PASSWORD}@${NB_POSTGRES_HOST}:${NB_POSTGRES_PORT}/${NB_POSTGRES_DB}?sslmode=${NB_POSTGRES_SSLMODE} + volumes: + - ./tests/integration:/tests:ro + - /var/run/docker.sock:/var/run/docker.sock + depends_on: + mgmt-1: + condition: service_healthy + mgmt-2: + condition: service_healthy + signal-1: + condition: service_healthy + signal-2: + condition: service_healthy + networks: + - nb-control + - nb-public + command: ["sleep", "infinity"] + +volumes: + redis-data: + postgres-data: + +networks: + nb-control: + driver: bridge + ipam: + config: + - subnet: 172.30.0.0/24 + nb-public: + driver: bridge + ipam: + config: + - subnet: 172.30.1.0/24 + nb-agent-a: + driver: bridge + ipam: + config: + - subnet: 172.31.1.0/24 + nb-agent-b: + driver: bridge + ipam: + config: + - subnet: 172.31.2.0/24 diff --git a/docs/BUILD_DEPLOY.md b/docs/BUILD_DEPLOY.md new file mode 100644 index 00000000000..6dd7e22b4b3 --- /dev/null +++ b/docs/BUILD_DEPLOY.md @@ -0,0 +1,373 @@ +# Build & Deployment Guide + +This document covers building NetBird HA binaries and 
deploying the full stack with Docker Compose. + +## Prerequisites + +| Requirement | Version | Notes | +|------------|---------|-------| +| Go | 1.25.5 | `CGO_ENABLED=1` required for SQLite (embedded IdP) | +| Docker | 29.x+ | With BuildKit enabled | +| Docker Compose | v2+ | Plugin or standalone | +| Linux kernel | 5.6+ | For WireGuard (or wireguard-dkms) | +| make | any | Optional, for convenience | + +## Building Binaries + +### All Binaries + +```bash +# From project root +cd /home/nino/git/netbird_ha + +# Management server +CGO_ENABLED=1 go build -o netbird-mgmt ./management/ + +# Signal server +CGO_ENABLED=1 go build -o netbird-signal ./signal/ + +# Combined server (signal + management in one binary) +CGO_ENABLED=1 go build -o netbird-server ./combined/ + +# Relay server +CGO_ENABLED=1 go build -o netbird-relay ./relay/ + +# Client (for agent images) +CGO_ENABLED=1 go build -o netbird ./client/ +``` + +### Individual Components + +```bash +# Management only +go build -o bin/netbird-mgmt ./management/ + +# Signal only +go build -o bin/netbird-signal ./signal/ + +# Client only +go build -o bin/netbird ./client/ +``` + +### Verify Builds + +```bash +# All packages should compile without errors +go build ./signal/... +go build ./management/... +go build ./shared/... +go build ./combined/... +``` + +## Building Docker Images + +### Full Stack + +```bash +# Build all images defined in docker-compose.ha-test.yml +docker compose -f docker-compose.ha-test.yml build + +# Or build specific services +docker compose -f docker-compose.ha-test.yml build signal-1 signal-2 +docker compose -f docker-compose.ha-test.yml build mgmt-1 mgmt-2 +``` + +### Individual Images + +```bash +# Management +docker build -f management/Dockerfile -t netbird-mgmt:ha . + +# Signal +docker build -f signal/Dockerfile -t netbird-signal:ha . + +# Test runner +docker build -f tests/integration/Dockerfile.test -t netbird-test-runner:ha . 
+ +# Agent +docker build -f tests/integration/Dockerfile.agent -t netbird-agent:ha . +``` + +## Deploying with Docker Compose + +### Step 1: Environment Setup + +```bash +# Copy the example environment file +cp .env.example .env + +# Edit .env to match your environment +# At minimum, verify these: +# - NB_DOMAIN (default: nb-ha.local) +# - NB_REDIS_ADDRESS +# - NB_POSTGRES_HOST +# - All STUN/TURN/relay URLs +``` + +### Step 2: Start Infrastructure + +```bash +# Start Redis and PostgreSQL first +docker compose -f docker-compose.ha-test.yml up -d redis postgres + +# Wait for them to be healthy (10-20s) +docker compose -f docker-compose.ha-test.yml ps +``` + +### Step 3: Start All Services + +```bash +# Start the full stack +docker compose -f docker-compose.ha-test.yml up -d + +# Or start in foreground to see logs +docker compose -f docker-compose.ha-test.yml up +``` + +### Step 4: Verify Deployment + +```bash +# Check all containers are running and healthy +docker compose -f docker-compose.ha-test.yml ps + +# Expected output: +# NAME IMAGE STATUS +# nb-redis redis:7-alpine Up 30s (healthy) +# nb-postgres postgres:15-alpine Up 30s (healthy) +# nb-coturn coturn/coturn Up 30s +# nb-relay netbird-relay:ha Up 30s (healthy) +# nb-signal-1 netbird-signal:ha Up 30s (healthy) +# nb-signal-2 netbird-signal:ha Up 30s (healthy) +# nb-mgmt-1 netbird-mgmt:ha Up 30s (healthy) +# nb-mgmt-2 netbird-mgmt:ha Up 30s (healthy) +# nb-dashboard netbirdio/dashboard Up 30s +# nb-traefik traefik:v3.6 Up 30s +``` + +### Step 5: Health Checks + +```bash +# Signal-1 metrics +curl http://localhost:9093/metrics + +# Signal-2 metrics +curl http://localhost:9094/metrics + +# Management-1 metrics +curl http://localhost:9091/metrics + +# Management-2 metrics +curl http://localhost:9092/metrics + +# Traefik dashboard (API) +curl http://localhost:8089/api/http/services + +# NetBird dashboard (UI) +curl -I http://localhost:8088 +``` + +### Step 6: Access the Dashboard + +1. 
Open `http://localhost:8088` in a browser +2. Log in with the embedded IdP: + - Email: `admin@nb-ha.local` + - Password: `testadmin123` +3. If you see "User Approval Pending", run: + ```bash + docker exec nb-postgres psql -U netbird -d netbird \ + -c "UPDATE users SET pending_approval=false, blocked=false WHERE email='admin@nb-ha.local';" + ``` +4. Refresh the page + +## Service Endpoints + +### Exposed Ports + +| Service | Host Port | Container Port | Purpose | +|---------|-----------|----------------|---------| +| Traefik HTTP | 8088 | 80 | Dashboard, API, IdP, gRPC | +| Traefik HTTPS | 8443 | 443 | TLS termination | +| Traefik API | 8089 | 8080 | Traefik internal API | +| Management-1 | 33073 | 33073 | gRPC + HTTP API | +| Management-2 | 33074 | 33073 | gRPC + HTTP API | +| Signal-1 | 10000 | 10000 | gRPC signaling | +| Signal-2 | 10001 | 10000 | gRPC signaling | +| Redis | 6379 | 6379 | Cache & pub/sub | +| PostgreSQL | 5432 | 5432 | Database | +| TURN | 3478 | 3478 | STUN/TURN | +| Relay | 443 | 443 | Relay fallback | +| Mgmt-1 metrics | 9091 | 9090 | Prometheus metrics | +| Mgmt-2 metrics | 9092 | 9090 | Prometheus metrics | +| Signal-1 metrics | 9093 | 9090 | Prometheus metrics | +| Signal-2 metrics | 9094 | 9090 | Prometheus metrics | +| Relay metrics | 9095 | 9090 | Prometheus metrics | + +### Internal Docker DNS + +Inside the Docker network, services resolve by hostname: + +| Hostname | Service | +|----------|---------| +| `redis.nb-ha.local` | Redis | +| `postgres.nb-ha.local` | PostgreSQL | +| `signal-1.nb-ha.local` | Signal-1 | +| `signal-2.nb-ha.local` | Signal-2 | +| `mgmt-1.nb-ha.local` | Management-1 | +| `mgmt-2.nb-ha.local` | Management-2 | +| `relay.nb-ha.local` | Relay | +| `turn.nb-ha.local` | coturn | +| `traefik.nb-ha.local` | Traefik | + +## Scaling + +### Add More Signal Instances + +Edit `docker-compose.ha-test.yml` and add: + +```yaml + signal-3: + extends: + service: signal-1 + container_name: nb-signal-3 + hostname: 
signal-3.${NB_DOMAIN} + environment: + NB_HA_INSTANCE_ID: signal-3 + ports: + - "10002:10000" + - "9096:9090" + labels: + - traefik.enable=true + - traefik.http.services.signal-h2c.loadbalancer.server.port=10000 + - traefik.http.services.signal-h2c.loadbalancer.server.scheme=h2c +``` + +### Add More Management Instances + +```yaml + mgmt-3: + extends: + service: mgmt-1 + container_name: nb-mgmt-3 + hostname: mgmt-3.${NB_DOMAIN} + environment: + NB_HA_INSTANCE_ID: mgmt-3 + ports: + - "33075:33073" + - "9096:9090" + labels: + - traefik.enable=true + - traefik.http.services.mgmt-api.loadbalancer.server.port=33073 + - traefik.http.services.mgmt-grpc.loadbalancer.server.port=33073 + - traefik.http.services.mgmt-grpc.loadbalancer.server.scheme=h2c +``` + +## Production Deployment Considerations + +### Redis + +- Use **Redis Sentinel** or **Redis Cluster** for production (not single instance) +- Enable persistence (`appendonly yes`) +- Set appropriate maxmemory policy (`allkeys-lru`) + +### PostgreSQL + +- Use PostgreSQL 15+ with streaming replication for HA +- Enable connection pooling (PgBouncer) +- Regular backups + +### Traefik + +- Enable TLS with Let's Encrypt or custom certificates +- Configure rate limiting +- Enable access logs + +### Monitoring + +All services expose Prometheus metrics: + +```yaml +# prometheus.yml scrape config +scrape_configs: + - job_name: 'netbird-mgmt' + static_configs: + - targets: ['localhost:9091', 'localhost:9092'] + - job_name: 'netbird-signal' + static_configs: + - targets: ['localhost:9093', 'localhost:9094'] + - job_name: 'netbird-relay' + static_configs: + - targets: ['localhost:9095'] +``` + +## Troubleshooting + +### Container fails to start + +```bash +# Check logs +docker logs nb-mgmt-1 --tail 50 +docker logs nb-signal-1 --tail 50 + +# Check for port conflicts +sudo lsof -i :33073 +sudo lsof -i :10000 +``` + +### Redis connection errors + +```bash +# Verify Redis is running +docker exec nb-redis redis-cli ping + +# Check signal 
can reach Redis +docker exec nb-signal-1 nslookup redis.nb-ha.local +``` + +### PostgreSQL connection errors + +```bash +# Verify PostgreSQL +docker exec nb-postgres psql -U netbird -d netbird -c "SELECT 1;" + +# Check management can reach PostgreSQL +docker exec nb-mgmt-1 wget -qO- postgres.nb-ha.local:5432 +``` + +### Traefik routing issues + +```bash +# Check Traefik services +curl -s http://localhost:8089/api/http/services | python3 -m json.tool + +# Check Traefik routers +curl -s http://localhost:8089/api/http/routers | python3 -m json.tool + +# Test direct signal connection +curl -v http://localhost:10000/signalexchange.SignalExchange/ConnectStream +``` + +### Test agent connection issues + +```bash +# Check agent status +docker exec nb-agent-a netbird status + +# Check agent logs +docker logs nb-agent-a --tail 30 + +# Verify agent can reach management +docker exec nb-agent-a wget -qO- http://traefik.nb-ha.local:80/api/users/current +``` + +## Cleanup + +```bash +# Stop all services +docker compose -f docker-compose.ha-test.yml down + +# Stop and remove volumes (WARNING: deletes PostgreSQL data!) +docker compose -f docker-compose.ha-test.yml down -v + +# Remove all images +docker compose -f docker-compose.ha-test.yml down --rmi all +``` diff --git a/docs/REBASE_GUIDE.md b/docs/REBASE_GUIDE.md new file mode 100644 index 00000000000..8f9eb6a4b2b --- /dev/null +++ b/docs/REBASE_GUIDE.md @@ -0,0 +1,313 @@ +# Rebase & Maintenance Guide + +This document explains how to keep the HA fork in sync with upstream NetBird releases. + +## Fork Strategy + +The HA changes are **isolated to specific files** with detailed inline comments. Each significant modification (signal state sharing, management scaling, configuration options) is its own commit with descriptive messages following conventional commit format. 
+ +### Commit History on `ha/main` + +``` +1ef78d3 feat(ha): implement active-active horizontal scaling for Signal and Management servers +f732b01 [management] unify peer-update test timeout via constant (#5952) <-- upstream base +``` + +All HA changes are contained in a single squashed commit to simplify rebasing. + +## Files That WILL Conflict on Rebase + +These files are modified by both upstream and the HA fork. Expect conflicts: + +### High Conflict Risk (Modified by HA fork + frequently changed upstream) + +| File | Why It Conflicts | Resolution Strategy | +|------|-----------------|---------------------| +| `signal/server/signal.go` | Core signal logic extended with Redis registry, pub/sub, heartbeats | Keep upstream changes, re-apply HA hooks (search for `// HA:` comments) | +| `management/internals/shared/grpc/server.go` | gRPC server extended with distributed locks | Keep upstream changes, re-apply `WithLock()` calls | +| `management/internals/shared/grpc/loginfilter.go` | Login filter extended with Redis Hash state | Keep upstream changes, re-apply Redis-backed filter | +| `management/internals/shared/grpc/token_mgr.go` | Timer-based state removed for stateless credentials | Keep upstream changes, verify stateless approach still works | +| `management/internals/modules/peers/ephemeral/manager/ephemeral.go` | ZSET-based ephemeral tracking | Keep upstream changes, re-apply Redis ZSET logic | +| `signal/cmd/run.go` | CLI flags added for HA | Append HA flags after upstream flags | +| `signal/cmd/root.go` | HA config wired into signal init | Re-apply HA config injection | +| `management/cmd/management.go` | CLI flags added for HA | Append HA flags after upstream flags | +| `combined/cmd/root.go` | HA config wired into combined mode | Re-apply HA config injection | + +### Medium Conflict Risk + +| File | Why It Conflicts | Resolution Strategy | +|------|-----------------|---------------------| +| `management/internals/server/server.go` | Redis client wired 
into lifecycle | Re-apply `RedisClient` field and initialization | +| `management/internals/server/boot.go` | Redis client initialized during boot | Re-apply `initRedis()` call | +| `management/internals/server/controllers.go` | Redis client passed to controllers | Re-apply `WithRedisClient()` calls | +| `management/internals/server/config/config.go` | `HAConfig` field added | Re-apply `HAConfig` field | +| `management/internals/controllers/network_map/update_channel/updatechannel.go` | Pub/sub publisher added | Re-apply `PublishAccountUpdate()` call | +| `management/internals/controllers/network_map/controller/controller.go` | Subscriber added | Re-apply `SubscribeAccountUpdates()` call | +| `signal/metrics/app.go` | HA metrics added | Append HA metrics after upstream metrics | + +### Low Conflict Risk (New files, unlikely to conflict) + +These files are **new** and won't conflict unless upstream adds files with the same name: + +- `shared/distributed/config.go` +- `shared/distributed/redis.go` +- `management/server/distributed/config.go` +- `management/server/distributed/lock.go` +- `management/server/distributed/registry.go` +- `signal/server/config.go` +- `.env.example` +- `docker-compose.ha-test.yml` +- `tests/integration/**` +- `docs/**` + +## Rebase Procedure + +### Step 1: Prepare + +```bash +# Add upstream remote if not already added +git remote add upstream https://github.com/netbirdio/netbird.git +git fetch upstream + +# Create a backup branch +git branch ha/main-backup-$(date +%Y%m%d) +``` + +### Step 2: Start Rebase + +```bash +# Start interactive rebase onto latest upstream +git rebase -i upstream/main +``` + +### Step 3: Handle Conflicts (Expected) + +When conflicts occur, identify which file is conflicting: + +```bash +git status # Shows conflicted files +``` + +For each conflicted file: + +#### Strategy A: Upstream changes are minor (most common) + +1. Accept upstream version: `git checkout --theirs ` +2. Re-apply HA changes manually +3. 
Mark resolved: `git add ` + +#### Strategy B: Upstream changes are significant + +1. Open the file and examine the conflict markers +2. Merge upstream changes with HA changes +3. Look for `// HA:` comments in the file to identify HA-specific sections +4. Mark resolved: `git add ` + +### Step 4: Continue Rebase + +```bash +git rebase --continue +``` + +Repeat Steps 3-4 until rebase completes. + +### Step 5: Validate + +```bash +# Build all binaries +go build ./signal/... +go build ./management/... +go build ./shared/... +go build ./combined/... + +# Run integration tests +cd tests/integration +go test -v -count=1 -timeout 300s +``` + +### Step 6: Push + +```bash +# Force push the rebased branch +git push --force-with-lease origin ha/main +``` + +## Alternative: Cherry-Pick Approach + +If rebase becomes too complex, cherry-pick individual HA commits onto a fresh upstream branch: + +```bash +# Create fresh branch from upstream +git checkout -b ha/main-v0.70 upstream/main + +# Cherry-pick the single HA commit +git cherry-pick 1ef78d3 + +# Resolve conflicts, then continue +git cherry-pick --continue +``` + +## Key Patterns to Preserve During Rebase + +### 1. Nil Redis Client Checks + +Every HA feature must check if the Redis client is nil before using it: + +```go +if s.redisClient != nil { + // HA behavior +} else { + // Non-HA behavior (same as upstream) +} +``` + +### 2. NoopLock Fallback + +When HA is disabled, use `NoopLock` instead of Redis locks: + +```go +lock := NewNoopLock() // or redis-based lock when HA enabled +``` + +### 3. Env Var Auto-Mapping + +Signal CLI flags are auto-populated from env vars: + +```go +// In signal/cmd/run.go +setFlagsFromEnvVars(rootCmd) +``` + +### 4. 
HA Comments + +All HA-specific code sections are marked with `// HA:` comments: + +```go +// HA: Register peer in distributed registry +if s.haConfig.Enabled && s.redisClient != nil { + s.redisClient.HSet(ctx, s.haConfig.RegistryKey, peerKey, s.instanceID) +} +``` + +## Testing After Rebase + +Always run the full integration test suite after rebasing: + +```bash +# Start fresh environment +docker compose -f docker-compose.ha-test.yml down -v +docker compose -f docker-compose.ha-test.yml up -d --build + +# Wait for health +sleep 30 + +# Run all tests +cd tests/integration +go test -v -count=1 -timeout 300s +``` + +## Common Rebase Issues + +### Issue: Upstream changed signal/server/signal.go structure + +**Symptom**: Conflicts in `ConnectStream` or `Send` methods. + +**Fix**: Keep upstream method signatures. Re-apply HA hooks inside the methods: + +```go +func (s *Server) ConnectStream(stream proto.SignalExchange_ConnectStreamServer) error { + // upstream code... + + // HA: Register peer in Redis + if s.haConfig.Enabled && s.redisClient != nil { + s.registerPeer(peerKey) + } + + // upstream code continues... +} +``` + +### Issue: Upstream changed management gRPC server initialization + +**Symptom**: Conflicts in `NewServer()` constructor. + +**Fix**: Keep upstream constructor. Re-apply `WithRedisClient()` and `WithLock()` options. + +### Issue: Upstream added new CLI flags + +**Symptom**: Conflicts in `signal/cmd/run.go` or `management/cmd/management.go`. + +**Fix**: Keep upstream flags. Append HA flags at the end. + +### Issue: Tests fail after rebase + +**Symptom**: Integration tests fail with connection errors or missing data. + +**Fix**: +1. Check if upstream changed gRPC protobuf definitions +2. Check if upstream changed management config format +3. Check if upstream changed peer authentication flow +4. 
Rebuild Docker images: `docker compose -f docker-compose.ha-test.yml build --no-cache` + +## Version Tracking + +Keep a `REBASE_LOG.md` file to track each rebase: + +```markdown +# Rebase Log + +## 2026-04-24: Rebased onto v0.69.0 +- Upstream changes: peer status refactor, new metrics +- Conflicts: signal/server/signal.go, management/internals/shared/grpc/server.go +- Resolution: Accepted upstream, re-applied HA hooks +- Tests: All 14 pass +- Commits cherry-picked: 1ef78d3 (single squashed commit) + +## 2026-05-15: Rebased onto v0.70.0 +- ... +``` + +## Automated Rebase Script + +```bash +#!/bin/bash +# scripts/rebase-upstream.sh + +set -e + +UPSTREAM_TAG="${1:-main}" +BACKUP_BRANCH="ha/main-backup-$(date +%Y%m%d)" + +echo "Creating backup branch: $BACKUP_BRANCH" +git branch "$BACKUP_BRANCH" + +echo "Fetching upstream..." +git fetch upstream "$UPSTREAM_TAG" + +echo "Starting rebase..." +if git rebase upstream/$UPSTREAM_TAG; then + echo "Rebase successful!" + echo "Building..." + go build ./signal/... + go build ./management/... + go build ./shared/... + go build ./combined/... + echo "Build OK. Run integration tests manually." +else + echo "Rebase has conflicts. 
Resolve them and run:" + echo " git rebase --continue" + echo "If rebase is too complex, abort and use cherry-pick:" + echo " git rebase --abort" + echo " git checkout -b ha/main-new upstream/$UPSTREAM_TAG" + echo " git cherry-pick " +fi +``` + +## Summary + +- **Rebase frequency**: After each upstream release (monthly) +- **Expected conflicts**: 5-10 files per rebase +- **Time estimate**: 30-60 minutes per rebase +- **Critical files**: `signal/server/signal.go`, `management/internals/shared/grpc/*.go` +- **Safety**: Always create a backup branch before rebasing +- **Validation**: Always run integration tests after rebasing diff --git a/docs/TESTING.md b/docs/TESTING.md new file mode 100644 index 00000000000..e13d726628d --- /dev/null +++ b/docs/TESTING.md @@ -0,0 +1,442 @@ +# NetBird HA Integration Tests + +All integration tests are in `tests/integration/` and validate the HA behavior of Signal and Management servers running in Docker. + +## Test Suite Summary + +| Test File | Tests | Focus | +|-----------|-------|-------| +| `signal_ha_test.go` | 7 | Signal server HA (registry, pub/sub, failover, load balancing) | +| `management_ha_test.go` | 7 | Management server HA (sync, locks, policies, failover) | +| `helper_test.go` | โ€” | Shared utilities (Redis, gRPC, Docker, PAT, WireGuard keys) | + +**Total: 14 tests, all passing** + +## Running the Tests + +### Prerequisites + +- Docker 29+ with Docker Compose +- Go 1.25.5+ +- `wireguard-tools` package (for `wg genkey` / `wg pubkey`) + +### Start the Test Environment + +```bash +# From project root +docker compose -f docker-compose.ha-test.yml up -d + +# Wait for all services to be healthy (30-60s) +docker compose -f docker-compose.ha-test.yml ps +``` + +### Run All Tests + +```bash +cd tests/integration +go test -v -count=1 -timeout 300s +``` + +### Run Individual Tests + +```bash +# Signal tests only +go test -v -count=1 -run TestSignal -timeout 120s + +# Management tests only +go test -v -count=1 -run 
TestManagement -timeout 120s + +# Single test +go test -v -count=1 -run TestSignalTraefikFailover -timeout 120s +``` + +### Environment Variables + +The tests use `localhost` with exposed host ports by default. Override via env vars: + +| Variable | Default | Description | +|----------|---------|-------------| +| `SIGNAL1_ADDR` | `localhost:10000` | Signal-1 gRPC endpoint | +| `SIGNAL2_ADDR` | `localhost:10001` | Signal-2 gRPC endpoint | +| `SIGNAL_TRAEFIK_ADDR` | `localhost:8088` | Signal via Traefik LB | +| `MGMT1_ADDR` | `localhost:33073` | Management-1 gRPC/HTTP | +| `MGMT2_ADDR` | `localhost:33074` | Management-2 gRPC/HTTP | +| `MGMT_TRAEFIK_ADDR` | `localhost:8088` | Management via Traefik LB | +| `MGMT1_METRICS` | `localhost:9091` | Management-1 metrics | +| `MGMT2_METRICS` | `localhost:9092` | Management-2 metrics | +| `REDIS_ADDR` | `localhost:6379` | Redis endpoint | + +--- + +## Signal HA Tests (`signal_ha_test.go`) + +### TestSignalCrossInstanceMessaging + +**What it tests**: A peer connected to signal-1 can send messages to a peer connected to signal-2 via Redis pub/sub. + +**How it works**: +1. Connects `peer-a` to signal-1 and `peer-b` to signal-2 +2. Waits for both peers to be registered in Redis (`nb:signal:registry`) +3. signal-1 looks up peer-b's instance (signal-2), publishes message to `nb:signal:instance:signal-2` +4. signal-2 receives the pub/sub message and delivers it to peer-b +5. Asserts the message body matches + +**Expected**: `PASS` with message delivery verified + +--- + +### TestSignalRegistryPopulation + +**What it tests**: The Redis HSET registry correctly stores peer -> instance mappings with TTL. + +**How it works**: +1. Connects a peer to signal-1 +2. Polls Redis for the peer key in `nb:signal:registry` +3. Verifies the value is `signal-1` +4. 
Verifies TTL is set (> 0) + +**Expected**: `PASS` with correct instance mapping and TTL + +--- + +### TestSignalInstanceFailover + +**What it tests**: When a signal instance is killed, peers can reconnect to the surviving instance and continue receiving messages. + +**How it works**: +1. Connects peer-a to signal-1, peer-b to signal-2 +2. Stops signal-1 container (`docker stop nb-signal-1`) +3. Peer-a reconnects to signal-2 (direct connection, not via Traefik) +4. Peer-a sends message to peer-b +5. Asserts peer-b receives it (now both on signal-2) +6. Restarts signal-1 (deferred cleanup) + +**Expected**: `PASS` with message delivery after failover + +--- + +### TestSignalGracefulDegradation + +**What it tests**: When Redis is unavailable, the signal server continues operating in local-only mode. + +**How it works**: +1. Connects peer-a and peer-b to signal-1 +2. Stops Redis container (`docker stop nb-redis`) +3. Peer-a sends message to peer-b +4. Asserts peer-b receives it (both on same instance, no Redis needed) +5. Restarts Redis (deferred cleanup) + +**Expected**: `PASS` โ€” local peers still communicate without Redis + +--- + +### TestSignalRedisChannelIsolation + +**What it tests**: Each signal instance has its own Redis pub/sub channel, and messages are not broadcast to all instances. + +**How it works**: +1. Verifies `nb:signal:instance:signal-1` and `nb:signal:instance:signal-2` channels exist +2. Publishes a test message to each channel +3. Verifies each channel has at least one subscriber (the respective signal instance) + +**Expected**: `PASS` with both channels having subscribers + +--- + +### TestSignalTraefikLoadBalancing + +**What it tests**: Peers connecting through the Traefik load balancer are distributed across both signal instances. + +**How it works**: +1. Connects 4 peers through Traefik (`localhost:8088`) +2. Polls Redis registry to find which instance each peer landed on +3. 
Asserts at least one peer is on signal-1 and at least one on signal-2 + +**Expected**: `PASS` with peers distributed across both instances (load balanced) + +--- + +### TestSignalTraefikFailover + +**What it tests**: When the signal instance serving a peer dies, the peer reconnects through Traefik to the surviving instance and cross-instance messaging continues. + +**How it works**: +1. Connects peer-a and peer-b through Traefik +2. Determines which instance each peer landed on +3. Ensures peers are on different instances (reconnects peer-b if needed) +4. Stops the instance serving peer-a +5. Peer-a reconnects through Traefik (should land on the survivor) +6. Peer-a sends message to peer-b (still on original instance) +7. Asserts peer-b receives the message via Redis pub/sub + +**Expected**: `PASS` with successful reconnection and message delivery + +--- + +## Management HA Tests (`management_ha_test.go`) + +### TestManagementUpdatePropagation + +**What it tests**: Peers connected to different management instances can communicate through the WireGuard tunnel. + +**How it works**: +1. Assumes `nb-agent-a` and `nb-agent-b` are already running and connected +2. Extracts NetBird IPs from `netbird status` output +3. Runs `ping -c 3` from agent-a to agent-b and vice versa +4. Asserts 0% packet loss in both directions + +**Expected**: `PASS` with bidirectional ping success + +**Note**: This is the highest-level integration test โ€” it validates the entire end-to-end data path. + +--- + +### TestManagementPeerRegistry + +**What it tests**: The Redis peer registry for management stores peer -> instance mappings with TTL. + +**How it works**: +1. Simulates peer registration: `HSET nb:mgmt:peers peer-key mgmt-1` +2. Sets TTL with `EXPIRE` +3. Verifies the value and TTL are correct +4. 
Simulates deregistration with `HDEL` + +**Expected**: `PASS` with correct registry behavior + +--- + +### TestManagementDistributedLocks + +**What it tests**: Redis-based distributed locks work correctly with `SET NX EX`. + +**How it works**: +1. Acquires lock from mgmt-1: `SET nb:mgmt:lock:test-lock mgmt-1 NX EX 5` +2. Verifies lock value is `mgmt-1` +3. Attempts to acquire same lock from mgmt-2 โ€” should fail +4. Releases lock with `DEL` +5. Re-acquires from mgmt-2 โ€” should succeed + +**Expected**: `PASS` with exclusive lock semantics + +--- + +### TestManagementInstanceFailover + +**What it tests**: When a management instance is stopped, the other instance remains reachable, and a peer can perform full login + sync. + +**How it works**: +1. Generates a real WireGuard key pair for the test peer +2. **Login + Sync via mgmt-1**: + - Gets server public key via `GetServerKey` + - Encrypts a `LoginRequest` with the setup key + - Calls `Login` gRPC + - Decrypts the `LoginResponse` + - Encrypts a `SyncRequest` + - Opens `Sync` stream + - Decrypts the `SyncResponse` +3. Stops mgmt-1 container +4. Updates Redis registry to point peer to mgmt-2 +5. **Login + Sync via mgmt-2** with the same peer key +6. Asserts valid `SyncResponse` received + +**Expected**: `PASS` with successful sync from both instances + +--- + +### TestManagementHealthConsistency + +**What it tests**: Both management instances report healthy status via their metrics endpoints. + +**How it works**: +1. Queries `http://localhost:9091/metrics` (mgmt-1) with retries +2. Queries `http://localhost:9092/metrics` (mgmt-2) with retries +3. Asserts HTTP 200 for both + +**Expected**: `PASS` with both instances healthy + +--- + +### TestManagementPolicyPropagation + +**What it tests**: Policies and groups created via one management instance are immediately visible via the other (shared database consistency). + +**How it works**: +1. Gets the owner user ID from PostgreSQL +2. 
Creates a Personal Access Token (PAT) in the database
+3. **Creates a group** via mgmt-1 REST API (`POST /api/groups`)
+4. **Lists groups** via mgmt-2 REST API (`GET /api/groups`)
+5. Asserts the new group ID appears in the mgmt-2 response
+6. **Creates a policy** via mgmt-1 REST API (`POST /api/policies`)
+7. **Lists policies** via mgmt-2 REST API (`GET /api/policies`)
+8. Asserts the new policy ID appears in the mgmt-2 response
+9. Cleans up (deletes policy and group)
+
+**Expected**: `PASS` with cross-instance visibility of groups and policies
+
+**Authentication**: Uses PAT (Personal Access Token) generated in the test, inserted directly into PostgreSQL. The PAT follows NetBird's format (`nbp_<secret><checksum>`) with proper base62 encoding.
+
+---
+
+### TestManagementFailoverWithSync
+
+**What it tests**: When a management instance fails, a peer can reconnect via Traefik to the surviving instance and continue receiving valid network map updates.
+
+**How it works**:
+1. Generates a real WireGuard key pair for the test peer
+2. **Full login + sync via mgmt-1**:
+   - Gets server key, encrypts login request, decrypts response
+   - Encrypts sync request, opens sync stream
+   - Decrypts sync response
+   - Asserts response contains valid `NetbirdConfig` with signal URI
+3. Stops mgmt-1 container
+4. **Full login + sync via Traefik** (routes to mgmt-2):
+   - Same encryption/decryption flow
+   - Asserts valid sync response after failover
+
+**Expected**: `PASS` with successful sync from mgmt-1 and from Traefik after failover
+
+---
+
+## Test Helpers (`helper_test.go`)
+
+### Redis Client
+
+```go
+newRedisClient(t) *redis.Client
+```
+
+Creates a Redis client using `redis.ParseURL` with the address from `REDIS_ADDR` env var (default: `localhost:6379`).
+
+### Signal gRPC Client
+
+```go
+signalClient(t, addr string) signalproto.SignalExchangeClient
+signalClientTraefik(t) signalproto.SignalExchangeClient
+```
+
+Connects to a signal server via gRPC with insecure credentials and 5-15s timeout. The Traefik variant connects through the load balancer.
+
+### Management gRPC Client
+
+```go
+mgmtGRPCClient(t, addr string) mgmtproto.ManagementServiceClient
+mgmtClientTraefik(t) mgmtproto.ManagementServiceClient
+```
+
+Connects to a management server via gRPC. The Traefik variant routes through the load balancer.
+
+### connectMgmtSync
+
+```go
+connectMgmtSync(t, client, peerKey string) mgmtproto.ManagementService_SyncClient
+```
+
+Opens a `Sync` stream with the peer's WireGuard public key in the gRPC metadata.
+
+### Docker Control
+
+```go
+dockerStop(t, container string)
+dockerStart(t, container string)
+```
+
+Stops/starts Docker containers by name. Used for failover tests.
+
+### Personal Access Token (PAT)
+
+```go
+createTestPAT(t, userID string) string
+```
+
+Generates a valid NetBird PAT (`nbp_<30-char-secret><6-char-checksum>`) and inserts it into PostgreSQL. Uses proper base62 encoding with the NetBird alphabet (`0-9A-Za-z`).
+
+```go
+getOwnerUserID(t) string
+```
+
+Queries PostgreSQL for the first owner user ID.
+
+### WireGuard Key Generation
+
+Tests that need valid peer keys use `wgtypes.GenerateKey()` from the `golang.zx2c4.com/wireguard/wgctrl/wgtypes` package.
+
+### mgmtHTTPClientWithToken
+
+```go
+mgmtHTTPClientWithToken(t, method, baseURL, path, body, token) *http.Response
+```
+
+Performs authenticated HTTP requests against the management REST API using a PAT in the `Authorization: Token <token>` header.
+ +--- + +## Test Environment Files + +| File | Purpose | +|------|---------| +| `Dockerfile.test` | Test runner image: Go 1.25, Docker CLI, Redis client, psql, full project copy | +| `Dockerfile.agent` | NetBird agent image for real peer connectivity tests | +| `config/management.json` | Self-hosted management config with embedded IdP, STUN/TURN, relay | +| `scripts/agent-setup.sh` | Agent bootstrap: login, up, status | +| `go.mod` / `go.sum` | Test dependencies: redis, testify, grpc, wireguard | + +--- + +## Debugging Failed Tests + +### Check container health + +```bash +docker compose -f docker-compose.ha-test.yml ps +docker logs nb-signal-1 --tail 30 +docker logs nb-mgmt-1 --tail 30 +``` + +### Check Redis state + +```bash +docker exec nb-redis redis-cli HGETALL nb:signal:registry +docker exec nb-redis redis-cli PUBLISH nb:signal:instance:signal-1 ping +``` + +### Check PostgreSQL + +```bash +docker exec nb-postgres psql -U netbird -d netbird -c "SELECT id, email, role FROM users;" +docker exec nb-postgres psql -U netbird -d netbird -c "SELECT id, name FROM setup_keys;" +``` + +### Check Traefik routing + +```bash +curl -s http://localhost:8089/api/http/services | python3 -m json.tool +curl -s http://localhost:8089/api/http/routers | python3 -m json.tool +``` + +### Run with verbose logging + +```bash +go test -v -count=1 -run TestSignalTraefikLoadBalancing -timeout 120s +``` + +--- + +## CI/CD Integration + +The tests are designed to run in CI with the Docker Compose stack: + +```yaml +# Example GitHub Actions workflow +- name: Start HA test environment + run: docker compose -f docker-compose.ha-test.yml up -d + +- name: Wait for services + run: | + for i in {1..30}; do + curl -sf http://localhost:9091/metrics && break + sleep 2 + done + +- name: Run integration tests + run: cd tests/integration && go test -v -count=1 -timeout 300s +``` diff --git a/go.mod b/go.mod index 1b5861a378e..a4b79add2fb 100644 --- a/go.mod +++ b/go.mod @@ -90,7 +90,7 @@ require ( 
github.com/prometheus/client_golang v1.23.2 github.com/quic-go/quic-go v0.55.0 github.com/redis/go-redis/v9 v9.7.3 - github.com/rs/xid v1.3.0 + github.com/rs/xid v1.6.0 github.com/shirou/gopsutil/v3 v3.24.4 github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966 github.com/songgao/water v0.0.0-20200317203138-2b4b6d7c09d8 diff --git a/go.sum b/go.sum index 3772946e1c4..3b46c117d9d 100644 --- a/go.sum +++ b/go.sum @@ -556,8 +556,8 @@ github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0t github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/rs/cors v1.8.0 h1:P2KMzcFwrPoSjkF1WLRPsp3UMLyql8L4v9hQpVeK5so= github.com/rs/cors v1.8.0/go.mod h1:EBwu+T5AvHOcXwvZIkQFjUN6s8Czyqw12GL/Y0tUyRM= -github.com/rs/xid v1.3.0 h1:6NjYksEUlhurdVehpc7S7dk6DAmcKv8V9gG0FsVN2U4= -github.com/rs/xid v1.3.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= +github.com/rs/xid v1.6.0 h1:fV591PaemRlL6JfRxGDEPl69wICngIQ3shQtzfy2gxU= +github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0= github.com/russellhaering/goxmldsig v1.6.0 h1:8fdWXEPh2k/NZNQBPFNoVfS3JmzS4ZprY/sAOpKQLks= github.com/russellhaering/goxmldsig v1.6.0/go.mod h1:TrnaquDcYxWXfJrOjeMBTX4mLBeYAqaHEyUeWPxZlBM= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= diff --git a/management/Dockerfile b/management/Dockerfile index 3b2df262395..b1e3f8c31e2 100644 --- a/management/Dockerfile +++ b/management/Dockerfile @@ -1,5 +1,5 @@ FROM ubuntu:24.04 -RUN apt update && apt install -y ca-certificates && rm -fr /var/cache/apt +RUN apt update && apt install -y ca-certificates wget && rm -fr /var/cache/apt ENTRYPOINT [ "/go/bin/netbird-mgmt","management"] CMD ["--log-file", "console"] COPY netbird-mgmt /go/bin/netbird-mgmt diff --git a/management/cmd/management.go b/management/cmd/management.go index 27d8055e77a..f6eebcf82a8 100644 --- a/management/cmd/management.go +++ 
b/management/cmd/management.go @@ -24,6 +24,7 @@ import ( "github.com/netbirdio/netbird/formatter/hook" "github.com/netbirdio/netbird/management/internals/server" nbconfig "github.com/netbirdio/netbird/management/internals/server/config" + mgmtdistributed "github.com/netbirdio/netbird/management/server/distributed" nbdomain "github.com/netbirdio/netbird/shared/management/domain" "github.com/netbirdio/netbird/util" "github.com/netbirdio/netbird/util/crypt" @@ -62,6 +63,14 @@ var ( return fmt.Errorf("failed reading provided config file: %s: %v", nbconfig.MgmtConfigPath, err) } + // Populate HA config from environment if not present in config file + if config.HA == nil { + haCfg := mgmtdistributed.LoadManagementHAConfigFromEnv() + if haCfg.Enabled { + config.HA = &haCfg.HAConfig + } + } + if cmd.Flag(idpSignKeyRefreshEnabledFlagName).Changed { config.HttpConfig.IdpSignKeyRefreshEnabled = idpSignKeyRefreshEnabled } diff --git a/management/internals/controllers/network_map/controller/controller.go b/management/internals/controllers/network_map/controller/controller.go index 4b414df6f30..1774507233c 100644 --- a/management/internals/controllers/network_map/controller/controller.go +++ b/management/internals/controllers/network_map/controller/controller.go @@ -12,6 +12,7 @@ import ( "sync/atomic" "time" + "github.com/redis/go-redis/v9" log "github.com/sirupsen/logrus" "golang.org/x/exp/maps" "golang.org/x/mod/semver" @@ -65,6 +66,10 @@ type Controller struct { expNewNetworkMapAIDs map[string]struct{} compactedNetworkMap bool + + // HA fields for cross-instance update propagation via Redis pub/sub + haRedisClient *redis.Client + accountChannelPrefix string } type bufferUpdate struct { @@ -126,6 +131,12 @@ func NewController(ctx context.Context, store store.Store, metrics telemetry.App } } +// SetRedisPublisher configures the Redis client and account channel prefix for HA cross-instance updates. 
+func (c *Controller) SetRedisPublisher(redisClient *redis.Client, accountChannelPrefix string) { + c.haRedisClient = redisClient + c.accountChannelPrefix = accountChannelPrefix +} + func (c *Controller) OnPeerConnected(ctx context.Context, accountID string, peerID string) (chan *network_map.UpdateMessage, error) { peer, err := c.repo.GetPeerByID(ctx, accountID, peerID) if err != nil { @@ -152,6 +163,12 @@ func (c *Controller) CountStreams() int { } func (c *Controller) sendUpdateAccountPeers(ctx context.Context, accountID string) error { + err := c.sendUpdateToLocalAccountPeers(ctx, accountID) + c.publishAccountUpdate(ctx, accountID) + return err +} + +func (c *Controller) sendUpdateToLocalAccountPeers(ctx context.Context, accountID string) error { log.WithContext(ctx).Tracef("updating peers for account %s from %s", accountID, util.GetCallerName()) var ( account *types.Account @@ -281,6 +298,27 @@ func (c *Controller) sendUpdateAccountPeers(ctx context.Context, accountID strin return nil } +func (c *Controller) publishAccountUpdate(ctx context.Context, accountID string) { + if c.haRedisClient == nil || c.accountChannelPrefix == "" { + return + } + + channel := fmt.Sprintf("%s%s", c.accountChannelPrefix, accountID) + if err := c.haRedisClient.Publish(ctx, channel, "account_updated").Err(); err != nil { + log.WithContext(ctx).Warnf("failed to publish account update to channel %s: %v", channel, err) + } +} + +// HandleRemoteAccountUpdate handles account updates received from Redis pub/sub by recalculating +// the network map and sending updates to all locally connected peers for the account. 
+func (c *Controller) HandleRemoteAccountUpdate(ctx context.Context, accountID string) error { + log.WithContext(ctx).Debugf("handling remote account update for account %s", accountID) + if err := c.RecalculateNetworkMapCache(ctx, accountID); err != nil { + return fmt.Errorf("recalculate network map cache for remote update: %v", err) + } + return c.sendUpdateToLocalAccountPeers(ctx, accountID) +} + func (c *Controller) bufferSendUpdateAccountPeers(ctx context.Context, accountID string) error { log.WithContext(ctx).Tracef("buffer sending update peers for account %s from %s", accountID, util.GetCallerName()) diff --git a/management/internals/controllers/network_map/update_channel/updatechannel.go b/management/internals/controllers/network_map/update_channel/updatechannel.go index 5f7db530068..eaf6432e9b8 100644 --- a/management/internals/controllers/network_map/update_channel/updatechannel.go +++ b/management/internals/controllers/network_map/update_channel/updatechannel.go @@ -2,9 +2,12 @@ package update_channel import ( "context" + "fmt" + "strings" "sync" "time" + "github.com/redis/go-redis/v9" log "github.com/sirupsen/logrus" "github.com/netbirdio/netbird/management/internals/controllers/network_map" @@ -178,3 +181,43 @@ func (p *PeersUpdateManager) CountStreams() int { defer p.channelsMux.RUnlock() return len(p.peerChannels) } + +// SubscribeToAccountUpdates subscribes to Redis pub/sub for account update channels. +// When a remote instance publishes an account update, the handler is invoked with the accountID. +// The subscription runs in a background goroutine and cancels when the provided context is done. 
+func (p *PeersUpdateManager) SubscribeToAccountUpdates(ctx context.Context, redisClient *redis.Client, accountChannelPrefix string, handler func(accountID string)) { + if redisClient == nil { + log.WithContext(ctx).Debug("redis client is nil, skipping account update subscription") + return + } + + pattern := fmt.Sprintf("%s*", accountChannelPrefix) + pubsub := redisClient.PSubscribe(ctx, pattern) + + go func() { + defer pubsub.Close() + + ch := pubsub.Channel() + for { + select { + case <-ctx.Done(): + log.WithContext(ctx).Debug("stopping account update subscription") + return + case msg, ok := <-ch: + if !ok { + log.WithContext(ctx).Debug("account update subscription channel closed") + return + } + + accountID := strings.TrimPrefix(msg.Channel, accountChannelPrefix) + if accountID == "" { + log.WithContext(ctx).Warnf("received account update on channel %s but could not extract account ID", msg.Channel) + continue + } + + log.WithContext(ctx).Debugf("received remote account update for account %s", accountID) + handler(accountID) + } + } + }() +} diff --git a/management/internals/modules/peers/ephemeral/manager/ephemeral.go b/management/internals/modules/peers/ephemeral/manager/ephemeral.go index 758f643d0a3..386349f5ca0 100644 --- a/management/internals/modules/peers/ephemeral/manager/ephemeral.go +++ b/management/internals/modules/peers/ephemeral/manager/ephemeral.go @@ -2,9 +2,12 @@ package manager import ( "context" + "fmt" + "strings" "sync" "time" + "github.com/redis/go-redis/v9" log "github.com/sirupsen/logrus" "github.com/netbirdio/netbird/management/internals/modules/peers" @@ -13,11 +16,14 @@ import ( nbpeer "github.com/netbirdio/netbird/management/server/peer" "github.com/netbirdio/netbird/management/server/store" + "github.com/netbirdio/netbird/shared/distributed" ) const ( // cleanupWindow is the time window to wait after nearest peer deadline to start the cleanup procedure. 
cleanupWindow = 1 * time.Minute + // redisPollInterval is the interval for polling Redis ZSET for expired ephemeral peers in HA mode. + redisPollInterval = 1 * time.Minute ) var ( @@ -47,6 +53,12 @@ type EphemeralManager struct { lifeTime time.Duration cleanupWindow time.Duration + + // HA mode fields + redisClient *distributed.Client + ephemeralKey string + cancel context.CancelFunc + wg sync.WaitGroup } // NewEphemeralManager instantiate new EphemeralManager @@ -60,6 +72,17 @@ func NewEphemeralManager(store store.Store, peersManager peers.Manager) *Ephemer } } +// WithRedis enables Redis-backed ephemeral peer tracking for HA mode. +func (e *EphemeralManager) WithRedis(client *distributed.Client, ephemeralKey string) *EphemeralManager { + e.redisClient = client + e.ephemeralKey = ephemeralKey + return e +} + +func (e *EphemeralManager) haEnabled() bool { + return e.redisClient != nil && e.ephemeralKey != "" +} + // LoadInitialPeers load from the database the ephemeral type of peers and schedule a cleanup procedure to the head // of the linked list (to the most deprecated peer). At the end of cleanup it schedules the next cleanup to the new // head. 
@@ -68,21 +91,35 @@ func (e *EphemeralManager) LoadInitialPeers(ctx context.Context) { defer e.peersLock.Unlock() e.loadEphemeralPeers(ctx) - if e.headPeer != nil { + if e.haEnabled() { + // Sync loaded peers to Redis ZSET + for p := e.headPeer; p != nil; p = p.next { + e.redisZAdd(ctx, p.id, p.accountID, p.deadline) + } + // Start background polling goroutine for Redis-backed cleanup + pollCtx, cancel := context.WithCancel(context.Background()) + e.cancel = cancel + e.wg.Add(1) + go e.redisPollLoop(pollCtx) + } else if e.headPeer != nil { e.timer = time.AfterFunc(e.lifeTime, func() { e.cleanup(ctx) }) } } -// Stop timer +// Stop timer and background goroutines func (e *EphemeralManager) Stop() { e.peersLock.Lock() - defer e.peersLock.Unlock() - if e.timer != nil { e.timer.Stop() } + e.peersLock.Unlock() + + if e.cancel != nil { + e.cancel() + e.wg.Wait() + } } // OnPeerConnected remove the peer from the linked list of ephemeral peers. Because it has been called when the peer @@ -99,6 +136,10 @@ func (e *EphemeralManager) OnPeerConnected(ctx context.Context, peer *nbpeer.Pee e.removePeer(peer.ID) + if e.haEnabled() { + e.redisZRem(ctx, peer.ID, peer.AccountID) + } + // stop the unnecessary timer if e.headPeer == nil && e.timer != nil { e.timer.Stop() @@ -122,8 +163,13 @@ func (e *EphemeralManager) OnPeerDisconnected(ctx context.Context, peer *nbpeer. 
return } - e.addPeer(peer.AccountID, peer.ID, e.newDeadLine()) - if e.timer == nil { + deadline := e.newDeadLine() + e.addPeer(peer.AccountID, peer.ID, deadline) + if e.haEnabled() { + e.redisZAdd(ctx, peer.ID, peer.AccountID, deadline) + } + + if !e.haEnabled() && e.timer == nil { delay := e.headPeer.deadline.Sub(timeNow()) + e.cleanupWindow if delay < 0 { delay = 0 @@ -248,3 +294,77 @@ func (e *EphemeralManager) isPeerOnList(id string) bool { func (e *EphemeralManager) newDeadLine() time.Time { return timeNow().Add(e.lifeTime) } + +func (e *EphemeralManager) redisZAdd(ctx context.Context, peerID, accountID string, deadline time.Time) { + member := peerID + ":" + accountID + err := e.redisClient.ZAdd(ctx, e.ephemeralKey, redis.Z{Score: float64(deadline.Unix()), Member: member}).Err() + if err != nil { + log.WithContext(ctx).Errorf("failed to ZADD ephemeral peer %s: %v", member, err) + } +} + +func (e *EphemeralManager) redisZRem(ctx context.Context, peerID, accountID string) { + member := peerID + ":" + accountID + err := e.redisClient.ZRem(ctx, e.ephemeralKey, member).Err() + if err != nil { + log.WithContext(ctx).Errorf("failed to ZREM ephemeral peer %s: %v", member, err) + } +} + +func (e *EphemeralManager) redisPollLoop(ctx context.Context) { + defer e.wg.Done() + ticker := time.NewTicker(redisPollInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + e.redisCleanup(ctx) + } + } +} + +func (e *EphemeralManager) redisCleanup(ctx context.Context) { + now := timeNow().Unix() + members, err := e.redisClient.ZRangeByScore(ctx, e.ephemeralKey, &redis.ZRangeBy{ + Min: "0", + Max: fmt.Sprintf("%d", now), + }).Result() + if err != nil { + log.WithContext(ctx).Errorf("failed to ZRANGEBYSCORE ephemeral peers: %v", err) + return + } + + if len(members) == 0 { + return + } + + e.peersLock.Lock() + peerIDsPerAccount := make(map[string][]string) + for _, member := range members { + parts := strings.SplitN(member, ":", 2) + if 
len(parts) != 2 { + log.WithContext(ctx).Warnf("invalid ephemeral peer member format: %s", member) + continue + } + peerID, accountID := parts[0], parts[1] + e.removePeer(peerID) + peerIDsPerAccount[accountID] = append(peerIDsPerAccount[accountID], peerID) + } + e.peersLock.Unlock() + + for _, member := range members { + if err := e.redisClient.ZRem(ctx, e.ephemeralKey, member).Err(); err != nil { + log.WithContext(ctx).Errorf("failed to ZREM ephemeral peer %s: %v", member, err) + } + } + + for accountID, peerIDs := range peerIDsPerAccount { + log.WithContext(ctx).Tracef("cleanup: deleting %d ephemeral peers for account %s", len(peerIDs), accountID) + if err := e.peersManager.DeletePeers(ctx, accountID, peerIDs, activity.SystemInitiator, true); err != nil { + log.WithContext(ctx).Errorf("failed to delete ephemeral peers: %s", err) + } + } +} diff --git a/management/internals/server/boot.go b/management/internals/server/boot.go index 2b40c0aad9c..32f5b16e30e 100644 --- a/management/internals/server/boot.go +++ b/management/internals/server/boot.go @@ -33,6 +33,7 @@ import ( "github.com/netbirdio/netbird/management/server/http/middleware" "github.com/netbirdio/netbird/management/server/store" "github.com/netbirdio/netbird/management/server/telemetry" + "github.com/netbirdio/netbird/shared/distributed" mgmtProto "github.com/netbirdio/netbird/shared/management/proto" "github.com/netbirdio/netbird/util/crypt" ) @@ -61,6 +62,12 @@ func (s *BaseServer) Metrics() telemetry.AppMetrics { }) } +func (s *BaseServer) RedisClient() *distributed.Client { + return Create(s, func() *distributed.Client { + return nil + }) +} + // CacheStore returns a shared cache store backed by Redis or in-memory depending on the environment. // All consumers should reuse this store to avoid creating multiple Redis connections. 
func (s *BaseServer) CacheStore() cachestore.StoreInterface { @@ -173,7 +180,7 @@ func (s *BaseServer) GRPCServer() *grpc.Server { } gRPCAPIHandler := grpc.NewServer(gRPCOpts...) - srv, err := nbgrpc.NewServer(s.Config, s.AccountManager(), s.SettingsManager(), s.JobManager(), s.SecretsManager(), s.Metrics(), s.AuthManager(), s.IntegratedValidator(), s.NetworkMapController(), s.OAuthConfigProvider()) + srv, err := nbgrpc.NewServer(s.Config, s.AccountManager(), s.SettingsManager(), s.JobManager(), s.SecretsManager(), s.Metrics(), s.AuthManager(), s.IntegratedValidator(), s.NetworkMapController(), s.OAuthConfigProvider(), nil, nil) if err != nil { log.Fatalf("failed to create management server: %v", err) } diff --git a/management/internals/server/config/config.go b/management/internals/server/config/config.go index fb9c842b740..39471df27f1 100644 --- a/management/internals/server/config/config.go +++ b/management/internals/server/config/config.go @@ -5,6 +5,7 @@ import ( "github.com/netbirdio/netbird/management/server/idp" "github.com/netbirdio/netbird/management/server/types" + "github.com/netbirdio/netbird/shared/distributed" "github.com/netbirdio/netbird/shared/management/client/common" "github.com/netbirdio/netbird/util" ) @@ -61,6 +62,10 @@ type Config struct { // EmbeddedIdP contains configuration for the embedded Dex OIDC provider. // When set, Dex will be embedded in the management server and serve requests at /oauth2/ EmbeddedIdP *idp.EmbeddedIdPConfig + + // HA contains optional high-availability configuration. + // When nil (the default) HA mode is disabled, preserving backward compatibility. 
// EphemeralManager returns the lazily-created ephemeral peer manager.
// The base manager operates on the local store only; when a Redis client was
// injected at startup (HA mode enabled), the manager is switched to the
// shared Redis deadline set so any HA instance may expire ephemeral peers
// registered by its siblings.
func (s *BaseServer) EphemeralManager() ephemeral.Manager {
	return Create(s, func() ephemeral.Manager {
		mgr := manager.NewEphemeralManager(s.Store(), s.PeersManager())
		// RedisClient() is nil unless Start() created and injected one.
		if redisClient := s.RedisClient(); redisClient != nil {
			haCfg := mgmtdistributed.LoadManagementHAConfigFromEnv()
			mgr.WithRedis(redisClient, haCfg.EphemeralKey)
		}
		return mgr
	})
}
// newLoginFilterWithRedis builds a login filter that, in addition to the
// local in-memory state, consults and populates the shared Redis hash under
// key, so peers bouncing between HA instances are filtered consistently.
// With a nil redis client it behaves exactly like newLoginFilter.
func newLoginFilterWithRedis(redis *distributed.Client, key string) *loginFilter {
	return &loginFilter{
		logged: make(map[string]*peerState),
		cfg:    initCfg(),
		redis:  redis,
		key:    key,
	}
}
// syncToRedis writes the peer's login state to the shared Redis hash so other
// HA instances can consult it in allowLogin. Best-effort: marshal and Redis
// command errors are silently dropped, which only weakens cross-instance
// filtering, never blocks a login.
//
// NOTE(review): peerState's fields (currentHash, sessionCounter, ...) appear
// to be unexported; encoding/json ignores unexported fields, so this would
// persist "{}" and the HGet/Unmarshal path in allowLogin would recover a
// zero-valued state — making the whole sync a no-op. Either export the fields
// with json tags or marshal a dedicated serializable struct. Confirm against
// the peerState declaration.
func (l *loginFilter) syncToRedis(wgPubKey string, state *peerState) {
	if l.redis == nil {
		return
	}
	// Short timeout: this runs on the login path and must not stall peers.
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	data, err := json.Marshal(state)
	if err != nil {
		return
	}
	l.redis.HSet(ctx, l.key, wgPubKey, string(data))
	// Keep the hash alive for twice the block duration so entries outlive
	// any active block window, then age out naturally.
	l.redis.Expire(ctx, l.key, 2*l.cfg.baseBlockDuration)
}
// NoopLock provides process-local locking for backward compatibility when HA
// mode is disabled. It hands out one mutex per resource — mirroring the
// previous sync.Map of per-peer mutexes — instead of a single global mutex,
// so operations on different peers do not serialize behind each other.
type NoopLock struct {
	// locks maps resource name -> *sync.Mutex. Entries are created lazily and
	// kept for the life of the process, matching the old peerLocks behavior.
	locks sync.Map
}

// Acquire locks the mutex associated with resource and returns a function
// that releases it. The ctx and ttl parameters are ignored: a local lock
// cannot be lost, so no expiry or cancellation applies, and the call never
// fails. The returned release function must be called exactly once.
func (l *NoopLock) Acquire(ctx context.Context, resource string, ttl time.Duration) (func(), error) {
	v, _ := l.locks.LoadOrStore(resource, &sync.Mutex{})
	mu := v.(*sync.Mutex)
	mu.Lock()
	return mu.Unlock, nil
}
authManager, appMetrics: appMetrics, + peerLocks: peerLocks, logBlockedPeers: logBlockedPeers, blockPeersWithSameConfig: blockPeersWithSameConfig, integratedPeerValidator: integratedPeerValidator, networkMapController: networkMapController, oAuthConfigProvider: oAuthConfigProvider, - loginFilter: newLoginFilter(), + loginFilter: lf, syncLim: syncLim, syncLimEnabled: syncLimEnabled, @@ -573,14 +598,16 @@ func (s *Server) acquirePeerLockByUID(ctx context.Context, uniqueID string) (unl log.WithContext(ctx).Tracef("acquiring peer lock for ID %s", uniqueID) start := time.Now() - value, _ := s.peerLocks.LoadOrStore(uniqueID, &sync.RWMutex{}) - mtx := value.(*sync.RWMutex) - mtx.Lock() + release, err := s.peerLocks.Acquire(ctx, uniqueID, 15*time.Second) + if err != nil { + log.WithContext(ctx).Warnf("failed to acquire peer lock for %s: %v", uniqueID, err) + return func() {} + } log.WithContext(ctx).Tracef("acquired peer lock for ID %s in %v", uniqueID, time.Since(start)) start = time.Now() unlock = func() { - mtx.Unlock() + release() log.WithContext(ctx).Tracef("released peer lock for ID %s in %v", uniqueID, time.Since(start)) } diff --git a/management/internals/shared/grpc/token_mgr.go b/management/internals/shared/grpc/token_mgr.go index 65e58ad4152..63105c20f68 100644 --- a/management/internals/shared/grpc/token_mgr.go +++ b/management/internals/shared/grpc/token_mgr.go @@ -6,18 +6,15 @@ import ( "crypto/sha256" "encoding/base64" "fmt" - "sync" "time" log "github.com/sirupsen/logrus" "golang.zx2c4.com/wireguard/wgctrl/wgtypes" - integrationsConfig "github.com/netbirdio/management-integrations/integrations/config" "github.com/netbirdio/netbird/management/internals/controllers/network_map" nbconfig "github.com/netbirdio/netbird/management/internals/server/config" "github.com/netbirdio/netbird/management/server/groups" "github.com/netbirdio/netbird/management/server/settings" - "github.com/netbirdio/netbird/shared/management/proto" auth 
// CancelRefresh is a no-op in the stateless implementation: no per-peer
// refresh goroutines exist anymore, so there is nothing to cancel. The method
// is kept so existing callers (peer disconnect paths) work unchanged.
func (m *TimeBasedAuthSecretsManager) CancelRefresh(peerID string) {
}

// SetupRefresh is a no-op in the stateless implementation.
// Credentials are generated on-demand when peers sync or request them.
//
// NOTE(review): the removed implementation proactively pushed fresh TURN and
// relay tokens to each connected peer roughly every 3/4 of CredentialsTTL.
// With this no-op, a peer holding a long-lived session only obtains new
// credentials when it happens to sync; if TimeBasedCredentials is enabled
// with a short TTL, its TURN/relay credentials may expire mid-session.
// Confirm that the sync cadence (or another mechanism) covers refresh before
// relying on this in production.
func (m *TimeBasedAuthSecretsManager) SetupRefresh(ctx context.Context, accountID, peerID string) {
}
"someAccountID").Return(&types.ExtraSettings{}, nil).AnyTimes() groupsManager := groups.NewManagerMock() - tested, err := NewTimeBasedAuthSecretsManager(peersManager, &config.TURNConfig{ + tested, err := NewTimeBasedAuthSecretsManager(nil, &config.TURNConfig{ CredentialsTTL: ttl, Secret: secret, Turns: []*config.Host{TurnTestHost}, @@ -110,86 +104,14 @@ func TestTimeBasedAuthSecretsManager_SetupRefresh(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + // SetupRefresh is a no-op in the stateless implementation. + // It should not panic and should not start any background goroutines. tested.SetupRefresh(ctx, "someAccountID", peer) - - if _, ok := tested.turnCancelMap[peer]; !ok { - t.Errorf("expecting peer to be present in the turn cancel map, got not present") - } - - if _, ok := tested.relayCancelMap[peer]; !ok { - t.Errorf("expecting peer to be present in the relay cancel map, got not present") - } - - var updates []*network_map.UpdateMessage - -loop: - for timeout := time.After(5 * time.Second); ; { - select { - case update := <-updateChannel: - updates = append(updates, update) - case <-timeout: - break loop - } - - if len(updates) >= 2 { - break loop - } - } - - if len(updates) < 2 { - t.Errorf("expecting at least 2 peer credentials updates, got %v", len(updates)) - } - - var turnUpdates, relayUpdates int - var firstTurnUpdate, secondTurnUpdate *proto.ProtectedHostConfig - var firstRelayUpdate, secondRelayUpdate *proto.RelayConfig - - for _, update := range updates { - if turns := update.Update.GetNetbirdConfig().GetTurns(); len(turns) > 0 { - turnUpdates++ - if turnUpdates == 1 { - firstTurnUpdate = turns[0] - } else { - secondTurnUpdate = turns[0] - } - } - if relay := update.Update.GetNetbirdConfig().GetRelay(); relay != nil { - // avoid updating on turn updates since they also send relay credentials - if update.Update.GetNetbirdConfig().GetTurns() == nil { - relayUpdates++ - if relayUpdates == 1 { - firstRelayUpdate = 
relay - } else { - secondRelayUpdate = relay - } - } - } - } - - if turnUpdates < 1 { - t.Errorf("expecting at least 1 TURN credential update, got %v", turnUpdates) - } - if relayUpdates < 1 { - t.Errorf("expecting at least 1 relay credential update, got %v", relayUpdates) - } - - if firstTurnUpdate != nil && secondTurnUpdate != nil { - if firstTurnUpdate.Password == secondTurnUpdate.Password { - t.Errorf("expecting first TURN credential update password %v to be different from second, got equal", firstTurnUpdate.Password) - } - } - - if firstRelayUpdate != nil && secondRelayUpdate != nil { - if firstRelayUpdate.TokenSignature == secondRelayUpdate.TokenSignature { - t.Errorf("expecting first relay credential update signature %v to be different from second, got equal", firstRelayUpdate.TokenSignature) - } - } } func TestTimeBasedAuthSecretsManager_CancelRefresh(t *testing.T) { ttl := util.Duration{Duration: time.Hour} secret := "some_secret" - peersManager := update_channel.NewPeersUpdateManager(nil) peer := "some_peer" rc := &config.Relay{ @@ -203,7 +125,7 @@ func TestTimeBasedAuthSecretsManager_CancelRefresh(t *testing.T) { settingsMockManager := settings.NewMockManager(ctrl) groupsManager := groups.NewManagerMock() - tested, err := NewTimeBasedAuthSecretsManager(peersManager, &config.TURNConfig{ + tested, err := NewTimeBasedAuthSecretsManager(nil, &config.TURNConfig{ CredentialsTTL: ttl, Secret: secret, Turns: []*config.Host{TurnTestHost}, @@ -211,21 +133,12 @@ func TestTimeBasedAuthSecretsManager_CancelRefresh(t *testing.T) { }, rc, settingsMockManager, groupsManager) require.NoError(t, err) + // CancelRefresh is a no-op in the stateless implementation. + // It should not panic even when called before SetupRefresh or multiple times. 
+ tested.CancelRefresh(peer) tested.SetupRefresh(context.Background(), "someAccountID", peer) - if _, ok := tested.turnCancelMap[peer]; !ok { - t.Errorf("expecting peer to be present in turn cancel map, got not present") - } - if _, ok := tested.relayCancelMap[peer]; !ok { - t.Errorf("expecting peer to be present in relay cancel map, got not present") - } - tested.CancelRefresh(peer) - if _, ok := tested.turnCancelMap[peer]; ok { - t.Errorf("expecting peer to be not present in turn cancel map, got present") - } - if _, ok := tested.relayCancelMap[peer]; ok { - t.Errorf("expecting peer to be not present in relay cancel map, got present") - } + tested.CancelRefresh(peer) } func validateMAC(t *testing.T, algo func() hash.Hash, username string, actualMAC string, key []byte) { diff --git a/management/server/distributed/config.go b/management/server/distributed/config.go new file mode 100644 index 00000000000..9ed88f8f951 --- /dev/null +++ b/management/server/distributed/config.go @@ -0,0 +1,145 @@ +// NETBIRD HA FORK - NEW FILE +// management/server/distributed/config.go +// Management-server-specific HA configuration + +package distributed + +import ( + "os" + "time" + + ha "github.com/netbirdio/netbird/shared/distributed" +) + +// ManagementHAConfig extends the shared HAConfig with settings specific to the management server. +// All fields can be configured via environment variables using the NB_MGMT_ prefix. 
+type ManagementHAConfig struct { + ha.HAConfig + + PeersRegistryKey string `yaml:"peers_registry_key" env:"NB_MGMT_PEERS_REGISTRY_KEY"` + AccountChannelPrefix string `yaml:"account_channel_prefix" env:"NB_MGMT_ACCOUNT_CHANNEL_PREFIX"` + LockPrefix string `yaml:"lock_prefix" env:"NB_MGMT_LOCK_PREFIX"` + LoginFilterKey string `yaml:"login_filter_key" env:"NB_MGMT_LOGIN_FILTER_KEY"` + EphemeralKey string `yaml:"ephemeral_key" env:"NB_MGMT_EPHEMERAL_KEY"` + PeerTTL time.Duration `yaml:"peer_ttl" env:"NB_MGMT_PEER_TTL"` + HeartbeatInterval time.Duration `yaml:"heartbeat_interval" env:"NB_MGMT_HEARTBEAT_INTERVAL"` + LockTTL time.Duration `yaml:"lock_ttl" env:"NB_MGMT_LOCK_TTL"` +} + +// DefaultManagementHAConfig returns sensible defaults for the management HA layer. +// HA itself is disabled by default; when enabled Redis defaults to localhost:6379. +func DefaultManagementHAConfig() ManagementHAConfig { + return ManagementHAConfig{ + HAConfig: ha.DefaultHAConfig(), + PeersRegistryKey: "netbird:management:peers:registry", + AccountChannelPrefix: "netbird:management:account:", + LockPrefix: "netbird:management:lock:", + LoginFilterKey: "netbird:management:login:filter", + EphemeralKey: "nb:mgmt:ephemeral", + PeerTTL: 30 * time.Second, + HeartbeatInterval: 10 * time.Second, + LockTTL: 15 * time.Second, + } +} + +// LoadManagementHAConfigFromEnv populates a ManagementHAConfig from environment variables. +// NB_MGMT_* variables override the management-specific fields; NB_HA_* variables override +// the embedded shared HAConfig fields. 
+func LoadManagementHAConfigFromEnv() ManagementHAConfig { + cfg := DefaultManagementHAConfig() + + // Management-specific overrides + if v := os.Getenv("NB_MGMT_PEERS_REGISTRY_KEY"); v != "" { + cfg.PeersRegistryKey = v + } + if v := os.Getenv("NB_MGMT_ACCOUNT_CHANNEL_PREFIX"); v != "" { + cfg.AccountChannelPrefix = v + } + if v := os.Getenv("NB_MGMT_LOCK_PREFIX"); v != "" { + cfg.LockPrefix = v + } + if v := os.Getenv("NB_MGMT_LOGIN_FILTER_KEY"); v != "" { + cfg.LoginFilterKey = v + } + if v := os.Getenv("NB_MGMT_EPHEMERAL_KEY"); v != "" { + cfg.EphemeralKey = v + } + if v := os.Getenv("NB_MGMT_PEER_TTL"); v != "" { + if d, err := time.ParseDuration(v); err == nil { + cfg.PeerTTL = d + } + } + if v := os.Getenv("NB_MGMT_HEARTBEAT_INTERVAL"); v != "" { + if d, err := time.ParseDuration(v); err == nil { + cfg.HeartbeatInterval = d + } + } + if v := os.Getenv("NB_MGMT_LOCK_TTL"); v != "" { + if d, err := time.ParseDuration(v); err == nil { + cfg.LockTTL = d + } + } + + // Embedded HAConfig overrides + if v := os.Getenv("NB_HA_ENABLED"); v == "true" { + cfg.Enabled = true + } + if v := os.Getenv("NB_HA_REDIS_ADDRESS"); v != "" { + cfg.RedisAddress = v + } + if v := os.Getenv("NB_HA_REDIS_PASSWORD"); v != "" { + cfg.RedisPassword = v + } + if v := os.Getenv("NB_HA_REDIS_DB"); v != "" { + // simple atoi fallback ignored for brevity; redis DB 0 is the default + // consumers that need full parsing can do so externally. 
+ } + if v := os.Getenv("NB_HA_REDIS_DIAL_TIMEOUT"); v != "" { + if d, err := time.ParseDuration(v); err == nil { + cfg.DialTimeout = d + } + } + if v := os.Getenv("NB_HA_REDIS_READ_TIMEOUT"); v != "" { + if d, err := time.ParseDuration(v); err == nil { + cfg.ReadTimeout = d + } + } + if v := os.Getenv("NB_HA_REDIS_WRITE_TIMEOUT"); v != "" { + if d, err := time.ParseDuration(v); err == nil { + cfg.WriteTimeout = d + } + } + if v := os.Getenv("NB_HA_REDIS_POOL_SIZE"); v != "" { + // simple atoi fallback ignored for brevity; default is 10 + } + if v := os.Getenv("NB_HA_INSTANCE_ID"); v != "" { + cfg.InstanceID = v + } + + return cfg +} + +// Validate checks the management HA configuration and applies defaults where needed. +func (c *ManagementHAConfig) Validate() error { + if err := c.HAConfig.Validate(); err != nil { + return err + } + if !c.Enabled { + return nil + } + if c.PeerTTL <= 0 { + c.PeerTTL = 30 * time.Second + } + if c.HeartbeatInterval <= 0 { + c.HeartbeatInterval = 10 * time.Second + } + if c.LockTTL <= 0 { + c.LockTTL = 15 * time.Second + } + return nil +} + +// IsEnabled returns true when HA mode is enabled. +func (c *ManagementHAConfig) IsEnabled() bool { + return c != nil && c.Enabled +} diff --git a/management/server/distributed/lock.go b/management/server/distributed/lock.go new file mode 100644 index 00000000000..293a142de8d --- /dev/null +++ b/management/server/distributed/lock.go @@ -0,0 +1,99 @@ +// NETBIRD HA FORK - NEW FILE +// management/server/distributed/lock.go +// Distributed locking primitives for the management server HA mode + +package distributed + +import ( + "context" + "fmt" + "sync" + "time" + + ha "github.com/netbirdio/netbird/shared/distributed" +) + +// Lock provides distributed mutual exclusion. +type Lock interface { + Acquire(ctx context.Context, resource string, ttl time.Duration) (release func(), err error) +} + +// RedisLock implements Lock using Redis SET ... NX EX with a background heartbeat. 
+type RedisLock struct { + client *ha.Client + instanceID string +} + +// NewRedisLock creates a lock backed by the given Redis client. +func NewRedisLock(client *ha.Client) *RedisLock { + return &RedisLock{ + client: client, + instanceID: client.InstanceID(), + } +} + +// Acquire attempts to acquire a distributed lock for the given resource. +// On success it returns a release function that MUST be called to free the lock. +// A background goroutine extends the lock TTL every ttl/3 until released. +func (l *RedisLock) Acquire(ctx context.Context, resource string, ttl time.Duration) (release func(), err error) { + key := fmt.Sprintf("lock:%s", resource) + value := l.instanceID + + ok, err := l.client.SetNX(ctx, key, value, ttl).Result() + if err != nil { + return nil, fmt.Errorf("redis lock acquire failed for %s: %w", resource, err) + } + if !ok { + return nil, fmt.Errorf("lock already held: %s", resource) + } + + stopHeartbeat := make(chan struct{}) + var wg sync.WaitGroup + wg.Add(1) + + go func() { + defer wg.Done() + ticker := time.NewTicker(ttl / 3) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + bgCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + err := l.client.Expire(bgCtx, key, ttl).Err() + cancel() + if err != nil { + // Heartbeat failed; lock will eventually expire. 
+ return + } + case <-stopHeartbeat: + return + } + } + }() + + released := false + var releaseMu sync.Mutex + + release = func() { + releaseMu.Lock() + defer releaseMu.Unlock() + if released { + return + } + released = true + + close(stopHeartbeat) + wg.Wait() + + bgCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + val, err := l.client.Get(bgCtx, key).Result() + if err == nil && val == value { + l.client.Del(bgCtx, key) + } + } + + return release, nil +} diff --git a/management/server/distributed/registry.go b/management/server/distributed/registry.go new file mode 100644 index 00000000000..a29b2497a70 --- /dev/null +++ b/management/server/distributed/registry.go @@ -0,0 +1,84 @@ +// NETBIRD HA FORK - NEW FILE +// management/server/distributed/registry.go +// Peer-to-instance registry for routing requests in HA mode + +package distributed + +import ( + "context" + "fmt" + "time" + + ha "github.com/netbirdio/netbird/shared/distributed" +) + +// Registry maps peers to the management instance that currently handles them. +type Registry interface { + RegisterPeer(ctx context.Context, peerID, instanceID string, ttl time.Duration) error + DeregisterPeer(ctx context.Context, peerID string) error + GetPeerInstance(ctx context.Context, peerID string) (string, error) +} + +// RedisRegistry implements Registry using Redis hashes. +type RedisRegistry struct { + client *ha.Client + config ManagementHAConfig +} + +// NewRedisRegistry creates a new registry backed by Redis. +func NewRedisRegistry(client *ha.Client, config ManagementHAConfig) *RedisRegistry { + return &RedisRegistry{ + client: client, + config: config, + } +} + +// peersRegistryKey returns the Redis key used for the peer registry hash. 
+func (r *RedisRegistry) peersRegistryKey() string { + if r.config.PeersRegistryKey != "" { + return r.config.PeersRegistryKey + } + return "netbird:management:peers:registry" +} + +// RegisterPeer records that the given peer is handled by instanceID. +func (r *RedisRegistry) RegisterPeer(ctx context.Context, peerID, instanceID string, ttl time.Duration) error { + key := r.peersRegistryKey() + + err := r.client.HSet(ctx, key, peerID, instanceID).Err() + if err != nil { + return fmt.Errorf("failed to register peer %s: %w", peerID, err) + } + + // Refresh TTL so the whole registry expires if no heartbeats occur. + err = r.client.Expire(ctx, key, ttl).Err() + if err != nil { + return fmt.Errorf("failed to set peer registry TTL: %w", err) + } + + return nil +} + +// DeregisterPeer removes the peer from the registry. +func (r *RedisRegistry) DeregisterPeer(ctx context.Context, peerID string) error { + key := r.peersRegistryKey() + + err := r.client.HDel(ctx, key, peerID).Err() + if err != nil { + return fmt.Errorf("failed to deregister peer %s: %w", peerID, err) + } + + return nil +} + +// GetPeerInstance returns the instance ID that currently handles the peer. 
+func (r *RedisRegistry) GetPeerInstance(ctx context.Context, peerID string) (string, error) { + key := r.peersRegistryKey() + + instanceID, err := r.client.HGet(ctx, key, peerID).Result() + if err != nil { + return "", fmt.Errorf("failed to get peer instance for %s: %w", peerID, err) + } + + return instanceID, nil +} diff --git a/management/server/management_proto_test.go b/management/server/management_proto_test.go index 18d85315d39..25c84f66468 100644 --- a/management/server/management_proto_test.go +++ b/management/server/management_proto_test.go @@ -391,7 +391,7 @@ func startManagementForTest(t *testing.T, testFile string, config *config.Config return nil, nil, "", cleanup, err } - mgmtServer, err := nbgrpc.NewServer(config, accountManager, settingsMockManager, jobManager, secretsManager, nil, nil, MockIntegratedValidator{}, networkMapController, nil) + mgmtServer, err := nbgrpc.NewServer(config, accountManager, settingsMockManager, jobManager, secretsManager, nil, nil, MockIntegratedValidator{}, networkMapController, nil, nil, nil) if err != nil { return nil, nil, "", cleanup, err } diff --git a/management/server/management_test.go b/management/server/management_test.go index 3ac28cd4ab5..6396de6b6ae 100644 --- a/management/server/management_test.go +++ b/management/server/management_test.go @@ -256,6 +256,8 @@ func startServer( server.MockIntegratedValidator{}, networkMapController, nil, + nil, + nil, ) if err != nil { t.Fatalf("failed creating management server: %v", err) diff --git a/original_readme.md b/original_readme.md new file mode 100644 index 00000000000..dc84af2fd04 --- /dev/null +++ b/original_readme.md @@ -0,0 +1,149 @@ + +
                                                +
                                                +
                                                +

                                                + +

                                                +

                                                + + + + + + +
                                                + + + + + + +
                                                + + + +

                                                +
                                                + + +

                                                + + Start using NetBird at netbird.io +
                                                + See Documentation +
                                                + Join our Slack channel or our Community forum +
                                                + +
                                                +
                                                + + 🚀 We are hiring! Join us at careers.netbird.io + +
                                                +
                                                + + New: NetBird terraform provider + +

                                                + +
                                                + +**NetBird combines a configuration-free peer-to-peer private network and a centralized access control system in a single platform, making it easy to create secure private networks for your organization or home.** + +**Connect.** NetBird creates a WireGuard-based overlay network that automatically connects your machines over an encrypted tunnel, leaving behind the hassle of opening ports, complex firewall rules, VPN gateways, and so forth. + +**Secure.** NetBird enables secure remote access by applying granular access policies while allowing you to manage them intuitively from a single place. Works universally on any infrastructure. + +### Open Source Network Security in a Single Platform + +https://github.com/user-attachments/assets/10cec749-bb56-4ab3-97af-4e38850108d2 + +### Self-Host NetBird (Video) +[![Watch the video](https://img.youtube.com/vi/bZAgpT6nzaQ/0.jpg)](https://youtu.be/bZAgpT6nzaQ) + +### Key features + +| Connectivity | Management | Security | Automation| Platforms | +|----|----|----|----|----| +|
                                                • - \[x] Kernel WireGuard
                                                |
                                                • - \[x] [Admin Web UI](https://github.com/netbirdio/dashboard)
                                                |
                                                • - \[x] [SSO & MFA support](https://docs.netbird.io/how-to/installation#running-net-bird-with-sso-login)
                                                |
                                                • - \[x] [Public API](https://docs.netbird.io/api)
                                                |
                                                • - \[x] Linux
                                                | +|
                                                • - \[x] Peer-to-peer connections
                                                |
                                                • - \[x] Auto peer discovery and configuration
                                                • |
                                                  • - \[x] [Access control - groups & rules](https://docs.netbird.io/how-to/manage-network-access)
                                                  • |
                                                    • - \[x] [Setup keys for bulk network provisioning](https://docs.netbird.io/how-to/register-machines-using-setup-keys)
                                                    • |
                                                      • - \[x] Mac
                                                      • | +|
                                                        • - \[x] Connection relay fallback
                                                        • |
                                                          • - \[x] [IdP integrations](https://docs.netbird.io/selfhosted/identity-providers)
                                                          • |
                                                            • - \[x] [Activity logging](https://docs.netbird.io/how-to/audit-events-logging)
                                                            • |
                                                              • - \[x] [Self-hosting quickstart script](https://docs.netbird.io/selfhosted/selfhosted-quickstart)
                                                              • |
                                                                • - \[x] Windows
                                                                • | +|
                                                                  • - \[x] [Routes to external networks](https://docs.netbird.io/how-to/routing-traffic-to-private-networks)
                                                                  • |
                                                                    • - \[x] [Private DNS](https://docs.netbird.io/how-to/manage-dns-in-your-network)
                                                                    • |
                                                                      • - \[x] [Device posture checks](https://docs.netbird.io/how-to/manage-posture-checks)
                                                                      • |
                                                                        • - \[x] IdP groups sync with JWT
                                                                        • |
                                                                          • - \[x] Android
                                                                          • | +|
                                                                            • - \[x] NAT traversal with BPF
                                                                            • |
                                                                              • - \[x] [Multiuser support](https://docs.netbird.io/how-to/add-users-to-your-network)
                                                                              • |
                                                                                • - \[x] Peer-to-peer encryption
                                                                                • ||
                                                                                  • - \[x] iOS
                                                                                  • | +|||
                                                                                    • - \[x] [Quantum-resistance with Rosenpass](https://netbird.io/knowledge-hub/the-first-quantum-resistant-mesh-vpn)
                                                                                    • ||
                                                                                      • - \[x] OpenWRT
                                                                                      • | +|||
                                                                                        • - \[x] [Periodic re-authentication](https://docs.netbird.io/how-to/enforce-periodic-user-authentication)
                                                                                        • ||
                                                                                          • - \[x] [Serverless](https://docs.netbird.io/how-to/netbird-on-faas)
                                                                                          • | +|||||
                                                                                            • - \[x] Docker
                                                                                            • | + +### Quickstart with NetBird Cloud + +- Download and install NetBird at [https://app.netbird.io/install](https://app.netbird.io/install) +- Follow the steps to sign-up with Google, Microsoft, GitHub or your email address. +- Check NetBird [admin UI](https://app.netbird.io/). +- Add more machines. + +### Quickstart with self-hosted NetBird + +> This is the quickest way to try self-hosted NetBird. It should take around 5 minutes to get started if you already have a public domain and a VM. +Follow the [Advanced guide with a custom identity provider](https://docs.netbird.io/selfhosted/selfhosted-guide#advanced-guide-with-a-custom-identity-provider) for installations with different IDPs. + +**Infrastructure requirements:** +- A Linux VM with at least **1CPU** and **2GB** of memory. +- The VM should be publicly accessible on TCP ports **80** and **443** and UDP port: **3478**. +- **Public domain** name pointing to the VM. + +**Software requirements:** +- Docker installed on the VM with the docker-compose plugin ([Docker installation guide](https://docs.docker.com/engine/install/)) or docker with docker-compose in version 2 or higher. +- [jq](https://jqlang.github.io/jq/) installed. In most distributions + Usually available in the official repositories and can be installed with `sudo apt install jq` or `sudo yum install jq` +- [curl](https://curl.se/) installed. 
+ Usually available in the official repositories and can be installed with `sudo apt install curl` or `sudo yum install curl` + +**Steps** +- Download and run the installation script: +```bash +export NETBIRD_DOMAIN=netbird.example.com; curl -fsSL https://github.com/netbirdio/netbird/releases/latest/download/getting-started.sh | bash +``` +- Once finished, you can manage the resources via `docker-compose` + +### A bit on NetBird internals +- Every machine in the network runs [NetBird Agent (or Client)](client/) that manages WireGuard. +- Every agent connects to [Management Service](management/) that holds network state, manages peer IPs, and distributes network updates to agents (peers). +- NetBird agent uses WebRTC ICE implemented in [pion/ice library](https://github.com/pion/ice) to discover connection candidates when establishing a peer-to-peer connection between machines. +- Connection candidates are discovered with the help of [STUN](https://en.wikipedia.org/wiki/STUN) servers. +- Agents negotiate a connection through [Signal Service](signal/) passing p2p encrypted messages with candidates. +- Sometimes the NAT traversal is unsuccessful due to strict NATs (e.g. mobile carrier-grade NAT) and a p2p connection isn't possible. When this occurs the system falls back to a relay server called [TURN](https://en.wikipedia.org/wiki/Traversal_Using_Relays_around_NAT), and a secure WireGuard tunnel is established via the TURN server. + +[Coturn](https://github.com/coturn/coturn) is the one that has been successfully used for STUN and TURN in NetBird setups. + +

                                                                                              + +

                                                                                              + +See a complete [architecture overview](https://docs.netbird.io/about-netbird/how-netbird-works#architecture) for details. + +### Community projects +- [NetBird installer script](https://github.com/physk/netbird-installer) +- [NetBird ansible collection by Dominion Solutions](https://galaxy.ansible.com/ui/repo/published/dominion_solutions/netbird/) +- [netbird-tui](https://github.com/n0pashkov/netbird-tui) โ€” terminal UI for managing NetBird peers, routes, and settings + +**Note**: The `main` branch may be in an *unstable or even broken state* during development. +For stable versions, see [releases](https://github.com/netbirdio/netbird/releases). + +### Support acknowledgement + +In November 2022, NetBird joined the [StartUpSecure program](https://www.forschung-it-sicherheit-kommunikationssysteme.de/foerderung/bekanntmachungen/startup-secure) sponsored by The Federal Ministry of Education and Research of The Federal Republic of Germany. Together with [CISPA Helmholtz Center for Information Security](https://cispa.de/en) NetBird brings the security best practices and simplicity to private networking. + +![CISPA_Logo_BLACK_EN_RZ_RGB (1)](https://user-images.githubusercontent.com/700848/203091324-c6d311a0-22b5-4b05-a288-91cbc6cdcc46.png) + +### Testimonials +We use open-source technologies like [WireGuardยฎ](https://www.wireguard.com/), [Pion ICE (WebRTC)](https://github.com/pion/ice), [Coturn](https://github.com/coturn/coturn), and [Rosenpass](https://rosenpass.eu). We very much appreciate the work these guys are doing and we'd greatly appreciate if you could support them in any way (e.g., by giving a star or a contribution). + +### Legal +This repository is licensed under BSD-3-Clause license that applies to all parts of the repository except for the directories management/, signal/ and relay/. 
+Those directories are licensed under the GNU Affero General Public License version 3.0 (AGPLv3). See the respective LICENSE files inside each directory. + +_WireGuard_ and the _WireGuard_ logo are [registered trademarks](https://www.wireguard.com/trademark-policy/) of Jason A. Donenfeld. + + diff --git a/shared/distributed/config.go b/shared/distributed/config.go new file mode 100644 index 00000000000..8247ab91bde --- /dev/null +++ b/shared/distributed/config.go @@ -0,0 +1,102 @@ +// NETBIRD HA FORK - NEW FILE +// shared/distributed/config.go +// Shared HA configuration for Signal and Management servers + +package distributed + +import ( + "crypto/rand" + "fmt" + "os" + "time" +) + +// HAConfig holds common configuration for distributed HA mode. +// All fields can be set via environment variables or YAML. +// No hardcoded values - everything is externally configurable. +type HAConfig struct { + Enabled bool `yaml:"enabled" env:"NB_HA_ENABLED"` + RedisAddress string `yaml:"redis_address" env:"NB_HA_REDIS_ADDRESS"` + RedisPassword string `yaml:"redis_password" env:"NB_HA_REDIS_PASSWORD"` + RedisDB int `yaml:"redis_db" env:"NB_HA_REDIS_DB"` + DialTimeout time.Duration `yaml:"dial_timeout" env:"NB_HA_REDIS_DIAL_TIMEOUT"` + ReadTimeout time.Duration `yaml:"read_timeout" env:"NB_HA_REDIS_READ_TIMEOUT"` + WriteTimeout time.Duration `yaml:"write_timeout" env:"NB_HA_REDIS_WRITE_TIMEOUT"` + PoolSize int `yaml:"pool_size" env:"NB_HA_REDIS_POOL_SIZE"` + InstanceID string `yaml:"instance_id" env:"NB_HA_INSTANCE_ID"` +} + +// DefaultHAConfig returns sensible defaults. +// HA is disabled by default to maintain backward compatibility. +func DefaultHAConfig() HAConfig { + return HAConfig{ + Enabled: false, + RedisAddress: "localhost:6379", + RedisDB: 0, + DialTimeout: 5 * time.Second, + ReadTimeout: 3 * time.Second, + WriteTimeout: 3 * time.Second, + PoolSize: 10, + InstanceID: "", + } +} + +// DetectInstanceID returns a unique instance identifier. 
+// Priority: config value > NB_HA_INSTANCE_ID env var > HOSTNAME env var > os.Hostname() > generated UUID. +func DetectInstanceID(cfgValue string) string { + if cfgValue != "" { + return cfgValue + } + if v := os.Getenv("NB_HA_INSTANCE_ID"); v != "" { + return v + } + if v := os.Getenv("HOSTNAME"); v != "" { + return v + } + if host, err := os.Hostname(); err == nil && host != "" { + return host + } + return generateUUID() +} + +func generateUUID() string { + // Use timestamp-based fallback if uuid generation fails + b := make([]byte, 16) + _, err := rand.Read(b) + if err != nil { + return fmt.Sprintf("auto-%d", time.Now().UnixNano()) + } + b[6] = (b[6] & 0x0f) | 0x40 // Version 4 + b[8] = (b[8] & 0x3f) | 0x80 // Variant is 10 + return fmt.Sprintf("%x-%x-%x-%x-%x", b[0:4], b[4:6], b[6:8], b[8:10], b[10:16]) +} + +// Validate checks that the config is coherent. +// When HA is disabled, validation always passes. +func (c *HAConfig) Validate() error { + if !c.Enabled { + return nil + } + if c.RedisAddress == "" { + return fmt.Errorf("redis_address is required when HA is enabled") + } + if c.DialTimeout <= 0 { + c.DialTimeout = 5 * time.Second + } + if c.ReadTimeout <= 0 { + c.ReadTimeout = 3 * time.Second + } + if c.WriteTimeout <= 0 { + c.WriteTimeout = 3 * time.Second + } + if c.PoolSize <= 0 { + c.PoolSize = 10 + } + c.InstanceID = DetectInstanceID(c.InstanceID) + return nil +} + +// IsEnabled returns true if HA mode is enabled. 
+func (c *HAConfig) IsEnabled() bool { + return c != nil && c.Enabled +} \ No newline at end of file diff --git a/shared/distributed/redis.go b/shared/distributed/redis.go new file mode 100644 index 00000000000..98b047e9169 --- /dev/null +++ b/shared/distributed/redis.go @@ -0,0 +1,74 @@ +// NETBIRD HA FORK - NEW FILE +// shared/distributed/redis.go +// Shared Redis client wrapper with health checks + +package distributed + +import ( + "context" + "fmt" + + "github.com/redis/go-redis/v9" +) + +// Client wraps go-redis with health checks and configuration. +type Client struct { + *redis.Client + config HAConfig +} + +// NewClient creates a Redis client from HA config. +// Returns error if HA is not enabled or Redis is unreachable. +func NewClient(cfg HAConfig) (*Client, error) { + if !cfg.Enabled { + return nil, fmt.Errorf("HA is not enabled") + } + + if err := cfg.Validate(); err != nil { + return nil, fmt.Errorf("invalid HA config: %w", err) + } + + rdb := redis.NewClient(&redis.Options{ + Addr: cfg.RedisAddress, + Password: cfg.RedisPassword, + DB: cfg.RedisDB, + DialTimeout: cfg.DialTimeout, + ReadTimeout: cfg.ReadTimeout, + WriteTimeout: cfg.WriteTimeout, + PoolSize: cfg.PoolSize, + }) + + ctx, cancel := context.WithTimeout(context.Background(), cfg.DialTimeout) + defer cancel() + + if err := rdb.Ping(ctx).Err(); err != nil { + return nil, fmt.Errorf("redis ping failed (%s): %w", cfg.RedisAddress, err) + } + + return &Client{Client: rdb, config: cfg}, nil +} + +// HealthCheck returns nil if Redis is reachable. +func (c *Client) HealthCheck(ctx context.Context) error { + return c.Ping(ctx).Err() +} + +// Close gracefully shuts down the client. +func (c *Client) Close() error { + return c.Client.Close() +} + +// InstanceID returns the configured instance ID. +func (c *Client) InstanceID() string { + return c.config.InstanceID +} + +// Config returns the HA configuration. 
+func (c *Client) Config() HAConfig { + return c.config +} + +// WithTimeout creates a context with the configured dial timeout. +func (c *Client) WithTimeout() (context.Context, context.CancelFunc) { + return context.WithTimeout(context.Background(), c.config.DialTimeout) +} \ No newline at end of file diff --git a/signal/cmd/root.go b/signal/cmd/root.go index 15579048290..8c17161c963 100644 --- a/signal/cmd/root.go +++ b/signal/cmd/root.go @@ -5,6 +5,7 @@ import ( "os" "os/signal" "runtime" + "time" "github.com/spf13/cobra" @@ -21,6 +22,22 @@ var ( defaultLogFile string logFile string + // HA configuration flags + haEnabled bool + haRedisAddress string + haRedisPassword string + haRedisDB int + haRedisDialTimeout time.Duration + haRedisReadTimeout time.Duration + haRedisWriteTimeout time.Duration + haRedisPoolSize int + haInstanceID string + haRegistryKey string + haChannelPrefix string + haPeerTTL time.Duration + haHeartbeatInterval time.Duration + haSendTimeout time.Duration + rootCmd = &cobra.Command{ Use: "netbird-signal", Short: "", @@ -47,6 +64,24 @@ func init() { rootCmd.PersistentFlags().StringVar(&logLevel, "log-level", "info", "") rootCmd.PersistentFlags().StringVar(&logFile, "log-file", defaultLogFile, "sets Netbird log path. 
If console is specified the log will be output to stdout") + + // HA configuration flags + rootCmd.PersistentFlags().BoolVar(&haEnabled, "ha-enabled", false, "enable high-availability mode for signal server") + rootCmd.PersistentFlags().StringVar(&haRedisAddress, "ha-redis-address", "localhost:6379", "redis address for HA coordination") + rootCmd.PersistentFlags().StringVar(&haRedisPassword, "ha-redis-password", "", "redis password for HA coordination") + rootCmd.PersistentFlags().IntVar(&haRedisDB, "ha-redis-db", 0, "redis database number for HA coordination") + rootCmd.PersistentFlags().DurationVar(&haRedisDialTimeout, "ha-redis-dial-timeout", 5*time.Second, "redis dial timeout") + rootCmd.PersistentFlags().DurationVar(&haRedisReadTimeout, "ha-redis-read-timeout", 3*time.Second, "redis read timeout") + rootCmd.PersistentFlags().DurationVar(&haRedisWriteTimeout, "ha-redis-write-timeout", 3*time.Second, "redis write timeout") + rootCmd.PersistentFlags().IntVar(&haRedisPoolSize, "ha-redis-pool-size", 10, "redis connection pool size") + rootCmd.PersistentFlags().StringVar(&haInstanceID, "ha-instance-id", "", "unique instance ID for HA mode (auto-detected if empty)") + rootCmd.PersistentFlags().StringVar(&haRegistryKey, "signal-registry-key", "nb:signal:registry", "redis key for peer registry") + rootCmd.PersistentFlags().StringVar(&haChannelPrefix, "signal-channel-prefix", "nb:signal:instance:", "redis channel prefix for instance messaging") + rootCmd.PersistentFlags().DurationVar(&haPeerTTL, "signal-peer-ttl", 60*time.Second, "peer registry TTL in redis") + rootCmd.PersistentFlags().DurationVar(&haHeartbeatInterval, "signal-heartbeat-interval", 30*time.Second, "peer heartbeat interval for redis registry") + rootCmd.PersistentFlags().DurationVar(&haSendTimeout, "signal-send-timeout", 10*time.Second, "message send timeout") + + setFlagsFromEnvVars(rootCmd) rootCmd.AddCommand(runCmd) } diff --git a/signal/cmd/run.go b/signal/cmd/run.go index 681222403db..3ebe7408428 
100644 --- a/signal/cmd/run.go +++ b/signal/cmd/run.go @@ -18,6 +18,7 @@ import ( "golang.org/x/net/http2" "golang.org/x/net/http2/h2c" + "github.com/netbirdio/netbird/shared/distributed" "github.com/netbirdio/netbird/shared/metrics" "github.com/netbirdio/netbird/encryption" @@ -114,7 +115,26 @@ var ( } }() - srv, err := server.NewServer(cmd.Context(), metricsServer.Meter) + haConfig := &server.SignalHAConfig{ + HAConfig: distributed.HAConfig{ + Enabled: haEnabled, + RedisAddress: haRedisAddress, + RedisPassword: haRedisPassword, + RedisDB: haRedisDB, + DialTimeout: haRedisDialTimeout, + ReadTimeout: haRedisReadTimeout, + WriteTimeout: haRedisWriteTimeout, + PoolSize: haRedisPoolSize, + InstanceID: haInstanceID, + }, + RegistryKey: haRegistryKey, + ChannelPrefix: haChannelPrefix, + PeerTTL: haPeerTTL, + HeartbeatInterval: haHeartbeatInterval, + SendTimeout: haSendTimeout, + } + + srv, err := server.NewServer(cmd.Context(), metricsServer.Meter, haConfig) if err != nil { return fmt.Errorf("creating signal server: %v", err) } diff --git a/signal/metrics/app.go b/signal/metrics/app.go index 759b5191381..9612fd70f0e 100644 --- a/signal/metrics/app.go +++ b/signal/metrics/app.go @@ -22,6 +22,11 @@ type AppMetrics struct { MessageForwardLatency metric.Float64Histogram MessageSize metric.Int64Histogram + + MessagesForwardedCrossInstance metric.Int64Counter + RedisUnavailableErrors metric.Int64Counter + RegistryHitLocal metric.Int64Counter + RegistryMissLocal metric.Int64Counter } func NewAppMetrics(meter metric.Meter, prefix ...string) (*AppMetrics, error) { @@ -113,6 +118,34 @@ func NewAppMetrics(meter metric.Meter, prefix ...string) (*AppMetrics, error) { return nil, err } + messagesForwardedCrossInstance, err := meter.Int64Counter(p+"message_forward_cross_instance_total", + metric.WithDescription("Total number of messages forwarded to peers on other instances"), + ) + if err != nil { + return nil, err + } + + redisUnavailableErrors, err := 
meter.Int64Counter(p+"redis_unavailable_errors_total", + metric.WithDescription("Total number of redis unavailable errors"), + ) + if err != nil { + return nil, err + } + + registryHitLocal, err := meter.Int64Counter(p+"registry_hit_local_total", + metric.WithDescription("Total number of local registry hits"), + ) + if err != nil { + return nil, err + } + + registryMissLocal, err := meter.Int64Counter(p+"registry_miss_local_total", + metric.WithDescription("Total number of local registry misses"), + ) + if err != nil { + return nil, err + } + return &AppMetrics{ Meter: meter, @@ -130,6 +163,11 @@ func NewAppMetrics(meter metric.Meter, prefix ...string) (*AppMetrics, error) { MessageForwardLatency: messageForwardLatency, MessageSize: messageSize, + + MessagesForwardedCrossInstance: messagesForwardedCrossInstance, + RedisUnavailableErrors: redisUnavailableErrors, + RegistryHitLocal: registryHitLocal, + RegistryMissLocal: registryMissLocal, }, nil } diff --git a/signal/server/config.go b/signal/server/config.go new file mode 100644 index 00000000000..c9c08746b2c --- /dev/null +++ b/signal/server/config.go @@ -0,0 +1,63 @@ +// NETBIRD HA FORK - NEW FILE +// signal/server/config.go +// Signal-specific HA configuration + +package server + +import ( + "time" + + "github.com/netbirdio/netbird/shared/distributed" +) + +// SignalHAConfig extends HAConfig with signal-specific parameters. +// All fields can be set via environment variables or YAML. +type SignalHAConfig struct { + distributed.HAConfig `yaml:",inline"` + + RegistryKey string `yaml:"registry_key" env:"NB_SIGNAL_REGISTRY_KEY"` + ChannelPrefix string `yaml:"channel_prefix" env:"NB_SIGNAL_CHANNEL_PREFIX"` + PeerTTL time.Duration `yaml:"peer_ttl" env:"NB_SIGNAL_PEER_TTL"` + HeartbeatInterval time.Duration `yaml:"heartbeat_interval" env:"NB_SIGNAL_HEARTBEAT_INTERVAL"` + SendTimeout time.Duration `yaml:"send_timeout" env:"NB_SIGNAL_SEND_TIMEOUT"` +} + +// DefaultSignalHAConfig returns signal-specific defaults. 
+// Inherits defaults from distributed.HAConfig. +func DefaultSignalHAConfig() SignalHAConfig { + return SignalHAConfig{ + HAConfig: distributed.DefaultHAConfig(), + RegistryKey: "nb:signal:registry", + ChannelPrefix: "nb:signal:instance:", + PeerTTL: 60 * time.Second, + HeartbeatInterval: 30 * time.Second, + SendTimeout: 10 * time.Second, + } +} + +// Validate checks signal HA config and applies defaults. +// When HA is disabled, validation always passes. +func (c *SignalHAConfig) Validate() error { + if err := c.HAConfig.Validate(); err != nil { + return err + } + if !c.Enabled { + return nil + } + if c.RegistryKey == "" { + c.RegistryKey = "nb:signal:registry" + } + if c.ChannelPrefix == "" { + c.ChannelPrefix = "nb:signal:instance:" + } + if c.PeerTTL <= 0 { + c.PeerTTL = 60 * time.Second + } + if c.HeartbeatInterval <= 0 { + c.HeartbeatInterval = 30 * time.Second + } + if c.SendTimeout <= 0 { + c.SendTimeout = 10 * time.Second + } + return nil +} \ No newline at end of file diff --git a/signal/server/signal.go b/signal/server/signal.go index c46df56d213..b4c797ad7bd 100644 --- a/signal/server/signal.go +++ b/signal/server/signal.go @@ -2,9 +2,11 @@ package server import ( "context" + "encoding/json" "errors" "fmt" "os" + "sync" "time" log "github.com/sirupsen/logrus" @@ -15,8 +17,10 @@ import ( "google.golang.org/grpc/status" gproto "google.golang.org/protobuf/proto" + "github.com/redis/go-redis/v9" "github.com/netbirdio/signal-dispatcher/dispatcher" + "github.com/netbirdio/netbird/shared/distributed" "github.com/netbirdio/netbird/shared/signal/proto" "github.com/netbirdio/netbird/signal/metrics" "github.com/netbirdio/netbird/signal/peer" @@ -59,10 +63,18 @@ type Server struct { successHeader metadata.MD sendTimeout time.Duration + + // HA fields (nil when HA disabled) + haConfig *SignalHAConfig + redisClient *distributed.Client + instanceID string + haCtx context.Context + haCancel context.CancelFunc + haWg sync.WaitGroup } // NewServer creates a new Signal 
server
+// When haConfig is non-nil and enabled, the server additionally connects to
+// Redis, subscribes to its per-instance channel, and joins the distributed
+// peer registry (see initHA). A nil haConfig preserves the original
+// single-instance behavior.
-func NewServer(ctx context.Context, meter metric.Meter, metricsPrefix ...string) (*Server, error) {
+func NewServer(ctx context.Context, meter metric.Meter, haConfig *SignalHAConfig, metricsPrefix ...string) (*Server, error) {
 	appMetrics, err := metrics.NewAppMetrics(meter, metricsPrefix...)
 	if err != nil {
 		return nil, fmt.Errorf("creating app metrics: %v", err)
@@ -86,23 +98,94 @@ func NewServer(ctx context.Context, meter metric.Meter, metricsPrefix ...string)
 		metrics:       appMetrics,
 		successHeader: metadata.Pairs(proto.HeaderRegistered, "1"),
 		sendTimeout:   sTimeout,
+		haConfig:      haConfig,
+	}
+
+	// Initialize HA if enabled
+	if haConfig != nil && haConfig.Enabled {
+		if err := s.initHA(ctx); err != nil {
+			return nil, fmt.Errorf("initializing HA: %w", err)
+		}
 	}
 
 	return s, nil
 }
 
+// initHA validates the HA config, connects to Redis, subscribes to this
+// instance's pub/sub channel, and starts the cross-instance message listener.
+func (s *Server) initHA(ctx context.Context) error {
+	if err := s.haConfig.Validate(); err != nil {
+		return err
+	}
+
+	client, err := distributed.NewClient(s.haConfig.HAConfig)
+	if err != nil {
+		return fmt.Errorf("connecting to redis: %w", err)
+	}
+
+	s.redisClient = client
+	s.instanceID = s.haConfig.InstanceID
+	s.haCtx, s.haCancel = context.WithCancel(ctx)
+
+	// Subscribe to instance channel
+	channel := s.haConfig.ChannelPrefix + s.instanceID
+	pubsub := client.Subscribe(s.haCtx, channel)
+
+	// Close the subscription when the HA context is cancelled. Without this,
+	// haMessageListener's range over pubsub.Channel() never terminates and
+	// Shutdown always runs into its wait timeout.
+	go func() {
+		<-s.haCtx.Done()
+		_ = pubsub.Close()
+	}()
+
+	// Start message listener
+	s.haWg.Add(1)
+	go s.haMessageListener(pubsub)
+
+	log.Infof("Signal HA initialized: instance=%s, redis=%s", s.instanceID, s.haConfig.RedisAddress)
+	return nil
+}
+
 // Send forwards a message to the signal peer
 func (s *Server) Send(ctx context.Context, msg *proto.EncryptedMessage) (*proto.EncryptedMessage, error) {
 	log.Tracef("received a new message to send from peer [%s] to peer [%s]", msg.Key, msg.RemoteKey)
 
+	// Fast path: local registry
 	if _, found := s.registry.Get(msg.RemoteKey); found {
 		s.forwardMessageToPeer(ctx, msg)
 		return &proto.EncryptedMessage{}, nil
 	}
 
+	// HA path: check distributed registry
+	if s.redisClient != nil {
+		instanceID, err := s.lookupPeerInstance(ctx, msg.RemoteKey)
+		if err == nil && instanceID != "" {
+			if instanceID == s.instanceID {
+				// Peer should be local but isn't — race condition, drop
+				log.Tracef("peer %s reported as local but not in registry", msg.RemoteKey)
+				return &proto.EncryptedMessage{}, nil
+			}
+
+			// Forward to remote instance. On marshal or publish failure we
+			// fall through to the dispatcher below rather than dropping the
+			// message or publishing an empty payload.
+			envelope := signalEnvelope{
+				FromInstance: s.instanceID,
+				ToPeer:       msg.RemoteKey,
+				Message:      msg,
+			}
+			payload, err := json.Marshal(envelope)
+			if err != nil {
+				log.Warnf("failed to marshal HA envelope for peer %s: %v", msg.RemoteKey, err)
+			} else {
+				channel := s.haConfig.ChannelPrefix + instanceID
+				if err := s.redisClient.Publish(ctx, channel, payload).Err(); err != nil {
+					log.Warnf("failed to publish message to instance %s: %v", instanceID, err)
+				} else {
+					log.Tracef("forwarded message to peer %s on instance %s", msg.RemoteKey, instanceID)
+					return &proto.EncryptedMessage{}, nil
+				}
+			}
+		}
+	}
+
+	// Fallback: try dispatcher (legacy behavior)
 	return s.dispatcher.SendMessage(ctx, msg)
 }
 
+// lookupPeerInstance resolves which signal instance currently owns peerID.
+// The Redis lookup is bounded to 3s so Send cannot block indefinitely.
+func (s *Server) lookupPeerInstance(ctx context.Context, peerID string) (string, error) {
+	ctx, cancel := context.WithTimeout(ctx, 3*time.Second)
+	defer cancel()
+
+	return s.redisClient.HGet(ctx, s.haConfig.RegistryKey, peerID).Result()
+}
+
 // ConnectStream connects to the exchange stream
 func (s *Server) ConnectStream(stream proto.SignalExchange_ConnectStreamServer) error {
 	ctx, cancel := context.WithCancel(context.Background())
@@ -143,6 +226,10 @@ func (s *Server) RegisterPeer(stream proto.SignalExchange_ConnectStreamServer, c
 	if err := s.registry.Register(p); err != nil {
 		return nil, err
 	}
+
+	// Register in distributed registry
+	s.registerPeerInRedis(p.Id)
+
 	err := s.dispatcher.ListenForMessages(stream.Context(), p.Id, s.forwardMessageToPeer)
 	if err != nil {
 		s.metrics.RegistrationFailures.Add(stream.Context(), 1, metric.WithAttributes(attribute.String(labelError, labelErrorFailedRegistration)))
@@ -156,6 +243,115 @@ func (s *Server) DeregisterPeer(p *peer.Peer) {
 	log.Debugf("peer disconnected [%s] [streamID %d] ", p.Id, p.StreamID)
s.metrics.PeerConnectionDuration.Record(p.Stream.Context(), int64(time.Since(p.RegisteredAt).Seconds())) s.registry.Deregister(p) + + // Deregister from distributed registry + s.deregisterPeerFromRedis(p.Id) +} + +func (s *Server) registerPeerInRedis(peerID string) { + if s.redisClient == nil { + return + } + + ctx, cancel := context.WithTimeout(s.haCtx, 5*time.Second) + defer cancel() + + if err := s.redisClient.HSet(ctx, s.haConfig.RegistryKey, peerID, s.instanceID).Err(); err != nil { + log.Warnf("failed to register peer %s in redis: %v", peerID, err) + return + } + if err := s.redisClient.Expire(ctx, s.haConfig.RegistryKey, s.haConfig.PeerTTL).Err(); err != nil { + log.Warnf("failed to set TTL for peer %s: %v", peerID, err) + } + + // Start heartbeat + s.haWg.Add(1) + go s.peerHeartbeat(peerID) +} + +func (s *Server) peerHeartbeat(peerID string) { + defer s.haWg.Done() + + ticker := time.NewTicker(s.haConfig.HeartbeatInterval) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + ctx, cancel := context.WithTimeout(s.haCtx, 5*time.Second) + err := s.redisClient.HSet(ctx, s.haConfig.RegistryKey, peerID, s.instanceID).Err() + if err == nil { + s.redisClient.Expire(ctx, s.haConfig.RegistryKey, s.haConfig.PeerTTL) + } + cancel() + case <-s.haCtx.Done(): + return + } + } +} + +func (s *Server) deregisterPeerFromRedis(peerID string) { + if s.redisClient == nil { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + if err := s.redisClient.HDel(ctx, s.haConfig.RegistryKey, peerID).Err(); err != nil { + log.Warnf("failed to deregister peer %s from redis: %v", peerID, err) + } +} + +type signalEnvelope struct { + FromInstance string `json:"from_instance"` + ToPeer string `json:"to_peer"` + Message *proto.EncryptedMessage `json:"message"` +} + +func (s *Server) haMessageListener(pubsub *redis.PubSub) { + defer s.haWg.Done() + + ch := pubsub.Channel() + for msg := range ch { + if msg == nil { + continue + } 
+ + var envelope signalEnvelope + if err := json.Unmarshal([]byte(msg.Payload), &envelope); err != nil { + log.Warnf("failed to unmarshal HA message: %v", err) + continue + } + + s.forwardMessageToPeer(s.haCtx, envelope.Message) + } +} + +// Shutdown gracefully shuts down the HA components. +func (s *Server) Shutdown(ctx context.Context) error { + if s.haCancel != nil { + s.haCancel() + } + + // Wait for goroutines with timeout + done := make(chan struct{}) + go func() { + s.haWg.Wait() + close(done) + }() + + select { + case <-done: + case <-ctx.Done(): + log.Warn("HA shutdown timed out") + } + + if s.redisClient != nil { + _ = s.redisClient.Close() + } + + return nil } func (s *Server) forwardMessageToPeer(ctx context.Context, msg *proto.EncryptedMessage) { diff --git a/tests/integration/Dockerfile.agent b/tests/integration/Dockerfile.agent new file mode 100644 index 00000000000..c418a3981fd --- /dev/null +++ b/tests/integration/Dockerfile.agent @@ -0,0 +1,24 @@ +FROM ubuntu:24.04 + +RUN apt-get update && apt-get install -y \ + wget \ + curl \ + iproute2 \ + iptables \ + ca-certificates \ + wireguard-tools \ + iputils-ping \ + net-tools \ + dnsutils \ + && rm -rf /var/lib/apt/lists/* + +# Install netbird client binary (built from source) +COPY netbird /usr/bin/netbird +RUN chmod +x /usr/bin/netbird + +# Setup script for netbird agent +COPY tests/integration/scripts/agent-setup.sh /usr/local/bin/agent-setup.sh +RUN chmod +x /usr/local/bin/agent-setup.sh + +ENTRYPOINT ["/usr/local/bin/agent-setup.sh"] +CMD ["sleep", "infinity"] diff --git a/tests/integration/Dockerfile.test b/tests/integration/Dockerfile.test new file mode 100644 index 00000000000..2880a3a001f --- /dev/null +++ b/tests/integration/Dockerfile.test @@ -0,0 +1,26 @@ +FROM golang:1.25-alpine + +RUN apk add --no-cache \ + git \ + curl \ + wget \ + bash \ + redis \ + postgresql-client \ + ca-certificates + +# Install Docker CLI for container stop/start tests +RUN curl -fsSL 
https://download.docker.com/linux/static/stable/x86_64/docker-27.5.1.tgz | tar xz -C /tmp \ + && cp /tmp/docker/docker /usr/local/bin/docker \ + && rm -rf /tmp/docker + +# Copy entire project so the replace directive works +WORKDIR /project +COPY . . + +WORKDIR /project/tests/integration + +# Download test dependencies +RUN go mod download + +CMD ["sleep", "infinity"] diff --git a/tests/integration/README.md b/tests/integration/README.md new file mode 100644 index 00000000000..8cfb696ed42 --- /dev/null +++ b/tests/integration/README.md @@ -0,0 +1,122 @@ +# NetBird HA Integration Tests + +This directory contains integration tests for the NetBird High-Availability fork. +The tests verify cross-instance messaging, state propagation, failover, and graceful +degradation across Signal and Management servers. + +## Test Environment + +The tests assume a Docker Compose environment with the following services: + +| Service | Address | Purpose | +|---------|---------|---------| +| Redis | `redis.nb-ha.local:6379` | Distributed state and pub/sub | +| Postgres | `postgres.nb-ha.local:5432` | Shared database | +| Signal-1 | `signal-1.nb-ha.local:10000` | Signal server instance 1 | +| Signal-2 | `signal-2.nb-ha.local:10000` | Signal server instance 2 | +| Mgmt-1 | `mgmt-1.nb-ha.local:33073` | Management server instance 1 | +| Mgmt-2 | `mgmt-2.nb-ha.local:33073` | Management server instance 2 | + +## Files + +- `signal_ha_test.go` โ€” Signal server HA tests +- `management_ha_test.go` โ€” Management server HA tests +- `helper_test.go` โ€” Shared test utilities +- `scripts/init-test-data.sh` โ€” Idempotent database initialization +- `go.mod` โ€” Go module for integration tests + +## Running Tests + +### Prerequisites + +1. Start the test environment: + ```bash + cp .env.example .env + docker compose -f docker-compose.ha-test.yml up --build -d + ``` + +2. Initialize test data: + ```bash + docker exec nb-test-runner /tests/scripts/init-test-data.sh + ``` + +3. 
Run the tests:
+   ```bash
+   docker exec -e MGMT_TOKEN=<pat-token> nb-test-runner go test -v ./...
+   ```
+
+### Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `SIGNAL1_ADDR` | `signal-1.nb-ha.local:10000` | Signal-1 gRPC endpoint |
+| `SIGNAL2_ADDR` | `signal-2.nb-ha.local:10000` | Signal-2 gRPC endpoint |
+| `MGMT1_ADDR` | `mgmt-1.nb-ha.local:33073` | Mgmt-1 HTTP/gRPC endpoint |
+| `MGMT2_ADDR` | `mgmt-2.nb-ha.local:33073` | Mgmt-2 HTTP/gRPC endpoint |
+| `REDIS_ADDR` | `redis.nb-ha.local:6379` | Redis endpoint |
+| `MGMT_TOKEN` | *(none)* | PAT for management HTTP API |
+| `POSTGRES_DSN` | *(see script)* | Postgres connection string |
+
+### Short Mode
+
+Tests that require running infrastructure are skipped when `-short` is passed:
+
+```bash
+go test -short ./...
+```
+
+### Container-Based Tests
+
+Tests that stop/start containers (`TestSignalInstanceFailover`,
+`TestSignalGracefulDegradation`, `TestManagementInstanceFailover`) require the
+Docker CLI to be available inside the test runner. Mount the Docker socket if
+you want to run these:
+
+```yaml
+volumes:
+  - /var/run/docker.sock:/var/run/docker.sock
+```
+
+## Test Coverage
+
+### Signal HA Tests (`signal_ha_test.go`)
+
+1. **`TestSignalCrossInstanceMessaging`** — Peer on signal-1 sends a message to
+   peer on signal-2 via Redis pub/sub.
+2. **`TestSignalRegistryPopulation`** — Connected peers appear in the Redis HSET
+   `nb:signal:registry` with correct instance mappings.
+3. **`TestSignalInstanceFailover`** — Stop signal-1, peer reconnects to signal-2,
+   messaging continues.
+4. **`TestSignalGracefulDegradation`** — Stop Redis, verify local-only mode still
+   works for peers on the same instance.
+
+### Management HA Tests (`management_ha_test.go`)
+
+1. **`TestManagementUpdatePropagation`** — Create a setup key via mgmt-1 HTTP API,
+   verify it is readable via mgmt-2 HTTP API (shared database consistency).
+2.
**`TestManagementPeerRegistry`** — Peer connected to mgmt-1 via Sync has its
+   `peer->instance` mapping stored in Redis `nb:mgmt:peers`.
+3. **`TestManagementDistributedLocks`** — Verify Redis-based lock acquisition
+   (`SET NX EX`) and release (`DEL`) using the management lock prefix.
+4. **`TestManagementInstanceFailover`** — Stop mgmt-1, peer reconnects to mgmt-2,
+   Sync stream resumes.
+
+## Idempotent Initialization
+
+`scripts/init-test-data.sh` is safe to run multiple times. It will:
+
+- Create the owner user (if instance setup is required)
+- Create a reusable setup key for integration tests
+- Create a Personal Access Token (PAT) for HTTP API access
+- Create test peers in the database
+
+All operations check for existing data before inserting.
+
+## Architecture Notes
+
+- Signal instances share the peer registry via Redis HSET and forward messages
+  via Redis pub/sub on per-instance channels (`nb:signal:instance:<instance-id>`).
+- Management instances share state via Postgres and coordinate via Redis
+  distributed locks (`nb:mgmt:lock:*`).
+- Both signal and management write peer->instance mappings to Redis for
+  cross-instance routing and failover detection.
diff --git a/tests/integration/config/management.json b/tests/integration/config/management.json new file mode 100644 index 00000000000..95e4ae6e202 --- /dev/null +++ b/tests/integration/config/management.json @@ -0,0 +1,92 @@ +{ + "Stuns": [ + { + "Proto": "udp", + "URI": "stun:turn.nb-ha.local:3478", + "Username": "", + "Password": "" + } + ], + "TURNConfig": { + "TimeBasedCredentials": false, + "CredentialsTTL": "12h0m0s", + "Secret": "netbird-turn-secret-key-change-in-production", + "Turns": [ + { + "Proto": "udp", + "URI": "turn:turn.nb-ha.local:3478", + "Username": "", + "Password": "" + } + ] + }, + "Relay": { + "Addresses": [ + "rel://relay.nb-ha.local:443" + ], + "CredentialsTTL": "12h0m0s", + "Secret": "netbird-relay-secret-key-change-in-production" + }, + "Signal": { + "Proto": "http", + "URI": "signal-1.nb-ha.local:10000", + "Username": "", + "Password": "" + }, + "Datadir": "/var/lib/netbird/", + "DataStoreEncryptionKey": "q8mRclArtedgfTq05w6X9d5WiZTXudUeZKf9r8FSf88=", + "HttpConfig": { + "LetsEncryptDomain": "", + "CertFile": "", + "CertKey": "", + "AuthClientID": "", + "AuthAudience": "", + "CLIAuthAudience": "", + "AuthIssuer": "", + "AuthUserIDClaim": "", + "AuthKeysLocation": "", + "OIDCConfigEndpoint": "", + "IdpSignKeyRefreshEnabled": false, + "ExtraAuthAudience": "", + "AuthCallbackURL": "" + }, + "IdpManagerConfig": null, + "DeviceAuthorizationFlow": null, + "PKCEAuthorizationFlow": null, + "StoreConfig": { + "Engine": "postgres" + }, + "ReverseProxy": { + "TrustedHTTPProxies": null, + "TrustedHTTPProxiesCount": 0, + "TrustedPeers": null, + "AccessLogRetentionDays": 7, + "AccessLogCleanupIntervalHours": 24 + }, + "DisableDefaultPolicy": false, + "EmbeddedIdP": { + "Enabled": true, + "Issuer": "http://localhost:8088/oauth2", + "LocalAddress": "localhost:33073", + "Storage": { + "Type": "sqlite3", + "Config": { + "File": "/var/lib/netbird/idp.db" + } + }, + "DashboardRedirectURIs": [ + "http://localhost:8088/nb-auth", + 
"http://localhost:8088/nb-silent-auth" + ], + "CLIRedirectURIs": [ + "http://localhost:53000/", + "http://localhost:54000/" + ], + "Owner": { + "Email": "admin@nb-ha.local", + "Hash": "$2b$12$28I85SbsQWYBngVZs9NnJuArhBf/wdzenyvhJqZaIAhDZJrvuhzCK", + "Username": "admin" + } + }, + "HA": null +} \ No newline at end of file diff --git a/tests/integration/go.mod b/tests/integration/go.mod new file mode 100644 index 00000000000..93a7e5d8d72 --- /dev/null +++ b/tests/integration/go.mod @@ -0,0 +1,60 @@ +module github.com/netbirdio/netbird/tests/integration + +go 1.25.5 + +replace github.com/netbirdio/netbird => ../../ + +require ( + github.com/hashicorp/go-secure-stdlib/base62 v0.1.2 + github.com/netbirdio/netbird v0.69.0 + github.com/redis/go-redis/v9 v9.7.3 + github.com/rs/xid v1.6.0 + github.com/stretchr/testify v1.11.1 + golang.zx2c4.com/wireguard/wgctrl v0.0.0-20230429144221-925a1e7659e6 + google.golang.org/grpc v1.80.0 +) + +require ( + github.com/aws/aws-sdk-go-v2 v1.38.3 // indirect + github.com/aws/aws-sdk-go-v2/config v1.31.6 // indirect + github.com/aws/aws-sdk-go-v2/credentials v1.18.10 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.6 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6 // indirect + github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.6 // indirect + github.com/aws/aws-sdk-go-v2/service/route53 v1.42.3 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.29.1 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.34.2 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.38.2 // indirect + github.com/aws/smithy-go v1.23.0 // indirect + github.com/caddyserver/certmagic v0.21.3 // indirect + github.com/caddyserver/zerossl v0.1.3 // indirect + 
github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/hashicorp/go-uuid v1.0.3 // indirect + github.com/jmespath/go-jmespath v0.4.0 // indirect + github.com/klauspost/cpuid/v2 v2.2.10 // indirect + github.com/libdns/libdns v0.2.2 // indirect + github.com/libdns/route53 v1.5.0 // indirect + github.com/mholt/acmez/v2 v2.0.1 // indirect + github.com/miekg/dns v1.1.59 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/sirupsen/logrus v1.9.4 // indirect + github.com/zeebo/blake3 v0.2.3 // indirect + go.uber.org/multierr v1.11.0 // indirect + go.uber.org/zap v1.27.0 // indirect + golang.org/x/crypto v0.49.0 // indirect + golang.org/x/mod v0.33.0 // indirect + golang.org/x/net v0.52.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/sys v0.42.0 // indirect + golang.org/x/text v0.35.0 // indirect + golang.org/x/tools v0.42.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/protobuf v1.36.11 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/tests/integration/go.sum b/tests/integration/go.sum new file mode 100644 index 00000000000..a061a41b3fc --- /dev/null +++ b/tests/integration/go.sum @@ -0,0 +1,150 @@ +github.com/aws/aws-sdk-go-v2 v1.38.3 h1:B6cV4oxnMs45fql4yRH+/Po/YU+597zgWqvDpYMturk= +github.com/aws/aws-sdk-go-v2 v1.38.3/go.mod h1:sDioUELIUO9Znk23YVmIk86/9DOpkbyyVb1i/gUNFXY= +github.com/aws/aws-sdk-go-v2/config v1.31.6 h1:a1t8fXY4GT4xjyJExz4knbuoxSCacB5hT/WgtfPyLjo= +github.com/aws/aws-sdk-go-v2/config v1.31.6/go.mod h1:5ByscNi7R+ztvOGzeUaIu49vkMk2soq5NaH5PYe33MQ= +github.com/aws/aws-sdk-go-v2/credentials v1.18.10 h1:xdJnXCouCx8Y0NncgoptztUocIYLKeQxrCgN6x9sdhg= 
+github.com/aws/aws-sdk-go-v2/credentials v1.18.10/go.mod h1:7tQk08ntj914F/5i9jC4+2HQTAuJirq7m1vZVIhEkWs= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.6 h1:wbjnrrMnKew78/juW7I2BtKQwa1qlf6EjQgS69uYY14= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.6/go.mod h1:AtiqqNrDioJXuUgz3+3T0mBWN7Hro2n9wll2zRUc0ww= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6 h1:uF68eJA6+S9iVr9WgX1NaRGyQ/6MdIyc4JNUo6TN1FA= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.6/go.mod h1:qlPeVZCGPiobx8wb1ft0GHT5l+dc6ldnwInDFaMvC7Y= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6 h1:pa1DEC6JoI0zduhZePp3zmhWvk/xxm4NB8Hy/Tlsgos= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.6/go.mod h1:gxEjPebnhWGJoaDdtDkA0JX46VRg1wcTHYe63OfX5pE= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo= +github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1 h1:oegbebPEMA/1Jny7kvwejowCaHz1FWZAQ94WXFNCyTM= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.1/go.mod h1:kemo5Myr9ac0U9JfSjMo9yHLtw+pECEHsFtJ9tqCEI8= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.6 h1:LHS1YAIJXJ4K9zS+1d/xa9JAA9sL2QyXIQCQFQW/X08= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.6/go.mod h1:c9PCiTEuh0wQID5/KqA32J+HAgZxN9tOGXKCiYJjTZI= +github.com/aws/aws-sdk-go-v2/service/route53 v1.42.3 h1:MmLCRqP4U4Cw9gJ4bNrCG0mWqEtBlmAVleyelcHARMU= +github.com/aws/aws-sdk-go-v2/service/route53 v1.42.3/go.mod h1:AMPjK2YnRh0YgOID3PqhJA1BRNfXDfGOnSsKHtAe8yA= +github.com/aws/aws-sdk-go-v2/service/sso v1.29.1 h1:8OLZnVJPvjnrxEwHFg9hVUof/P4sibH+Ea4KKuqAGSg= +github.com/aws/aws-sdk-go-v2/service/sso v1.29.1/go.mod h1:27M3BpVi0C02UiQh1w9nsBEit6pLhlaH3NHna6WUbDE= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.34.2 h1:gKWSTnqudpo8dAxqBqZnDoDWCiEh/40FziUjr/mo6uA= 
+github.com/aws/aws-sdk-go-v2/service/ssooidc v1.34.2/go.mod h1:x7+rkNmRoEN1U13A6JE2fXne9EWyJy54o3n6d4mGaXQ= +github.com/aws/aws-sdk-go-v2/service/sts v1.38.2 h1:YZPjhyaGzhDQEvsffDEcpycq49nl7fiGcfJTIo8BszI= +github.com/aws/aws-sdk-go-v2/service/sts v1.38.2/go.mod h1:2dIN8qhQfv37BdUYGgEC8Q3tteM3zFxTI1MLO2O3J3c= +github.com/aws/smithy-go v1.23.0 h1:8n6I3gXzWJB2DxBDnfxgBaSX6oe0d/t10qGz7OKqMCE= +github.com/aws/smithy-go v1.23.0/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI= +github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= +github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= +github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= +github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= +github.com/caddyserver/certmagic v0.21.3 h1:pqRRry3yuB4CWBVq9+cUqu+Y6E2z8TswbhNx1AZeYm0= +github.com/caddyserver/certmagic v0.21.3/go.mod h1:Zq6pklO9nVRl3DIFUw9gVUfXKdpc/0qwTUAQMBlfgtI= +github.com/caddyserver/zerossl v0.1.3 h1:onS+pxp3M8HnHpN5MMbOMyNjmTheJyWRaZYwn+YTAyA= +github.com/caddyserver/zerossl v0.1.3/go.mod h1:CxA0acn7oEGO6//4rtrRjYgEoa4MFw/XofZnrYwGqG4= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= +github.com/fsnotify/fsnotify v1.9.0 
h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/hashicorp/go-secure-stdlib/base62 v0.1.2 h1:ET4pqyjiGmY09R5y+rSd70J2w45CtbWDNvGqWp/R3Ng= +github.com/hashicorp/go-secure-stdlib/base62 v0.1.2/go.mod h1:EdWO6czbmthiwZ3/PUsDV+UD1D5IRU4ActiaWGwt0Yw= +github.com/hashicorp/go-uuid v1.0.2/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/hashicorp/go-uuid v1.0.3 h1:2gKiV6YVmrJ1i2CKKa9obLvRieoRGviZFL26PcT/Co8= +github.com/hashicorp/go-uuid v1.0.3/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= +github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= +github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= +github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= +github.com/klauspost/cpuid/v2 v2.0.12/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c= +github.com/klauspost/cpuid/v2 v2.2.10 
h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE= +github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= +github.com/libdns/libdns v0.2.2 h1:O6ws7bAfRPaBsgAYt8MDe2HcNBGC29hkZ9MX2eUSX3s= +github.com/libdns/libdns v0.2.2/go.mod h1:4Bj9+5CQiNMVGf87wjX4CY3HQJypUHRuLvlsfsZqLWQ= +github.com/libdns/route53 v1.5.0 h1:2SKdpPFl/qgWsXQvsLNJJAoX7rSxlk7zgoL4jnWdXVA= +github.com/libdns/route53 v1.5.0/go.mod h1:joT4hKmaTNKHEwb7GmZ65eoDz1whTu7KKYPS8ZqIh6Q= +github.com/mholt/acmez/v2 v2.0.1 h1:3/3N0u1pLjMK4sNEAFSI+bcvzbPhRpY383sy1kLHJ6k= +github.com/mholt/acmez/v2 v2.0.1/go.mod h1:fX4c9r5jYwMyMsC+7tkYRxHibkOTgta5DIFGoe67e1U= +github.com/miekg/dns v1.1.59 h1:C9EXc/UToRwKLhK5wKU/I4QVsBUc8kE6MkHBkeypWZs= +github.com/miekg/dns v1.1.59/go.mod h1:nZpewl5p6IvctfgrckopVx2OlSEHPRO/U4SYkRklrEk= +github.com/nxadm/tail v1.4.11 h1:8feyoE3OzPrcshW5/MJ4sGESc5cqmGkGCWlco4l0bqY= +github.com/nxadm/tail v1.4.11/go.mod h1:OTaG3NK980DZzxbRq6lEuzgU+mug70nY11sMd4JXXHc= +github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= +github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= +github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE= +github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+qQlhg= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/redis/go-redis/v9 v9.7.3 h1:YpPyAayJV+XErNsatSElgRZZVCwXX9QzkKYNvO7x0wM= +github.com/redis/go-redis/v9 v9.7.3/go.mod h1:bGUrSggJ9X9GUmZpZNEOQKaANxSGgOEBRltRTZHSvrA= +github.com/rs/xid v1.6.0 h1:fV591PaemRlL6JfRxGDEPl69wICngIQ3shQtzfy2gxU= +github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0= 
+github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= +github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/zeebo/assert v1.1.0 h1:hU1L1vLTHsnO8x8c9KAR5GmM5QscxHg5RNU5z5qbUWY= +github.com/zeebo/assert v1.1.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0= +github.com/zeebo/blake3 v0.2.3 h1:TFoLXsjeXqRNFxSbk35Dk4YtszE/MQQGK10BH4ptoTg= +github.com/zeebo/blake3 v0.2.3/go.mod h1:mjJjZpnsyIVtVgTOSpJ9vmRE4wgDeyt2HU3qXvvKCaQ= +github.com/zeebo/pcg v1.0.1 h1:lyqfGeWiv4ahac6ttHs+I5hwtH/+1mrhlCtVNQM2kHo= +github.com/zeebo/pcg v1.0.1/go.mod h1:09F0S9iiKrwn9rlI5yjLkmrug154/YRW6KnnXVDM/l4= +go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= +go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= +go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= +go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod 
h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= +golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= +golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= +golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= +golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= +golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= +golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= +golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= +golang.zx2c4.com/wireguard/wgctrl v0.0.0-20230429144221-925a1e7659e6 h1:CawjfCvYQH2OU3/TnxLx97WDSUDRABfT18pCOYwc2GE= +golang.zx2c4.com/wireguard/wgctrl v0.0.0-20230429144221-925a1e7659e6/go.mod h1:3rxYc4HtVcSG9gVaTs2GEBdehh+sYPOwKtyUWEOTb80= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod 
h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= +google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= +gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/tests/integration/helper_test.go b/tests/integration/helper_test.go new file mode 100644 index 00000000000..1df33e2817b --- /dev/null +++ b/tests/integration/helper_test.go @@ -0,0 +1,229 @@ +package integration + +import ( + "context" + "crypto/sha256" + b64 "encoding/base64" + "fmt" + "hash/crc32" + "os" + "os/exec" + "strings" + "testing" + "time" + + "github.com/hashicorp/go-secure-stdlib/base62" + "github.com/redis/go-redis/v9" + "github.com/rs/xid" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" + 
"google.golang.org/grpc/credentials/insecure"
+	"google.golang.org/grpc/metadata"
+
+	mgmtproto "github.com/netbirdio/netbird/shared/management/proto"
+	signalproto "github.com/netbirdio/netbird/shared/signal/proto"
+)
+
+// Environment-based configuration for the HA test environment.
+// Defaults use localhost with exposed host ports so tests can run from the host.
+// When running inside the test-runner container, set env vars to use Docker hostnames.
+var (
+	signal1Addr = getEnv("SIGNAL1_ADDR", "localhost:10000")
+	signal2Addr = getEnv("SIGNAL2_ADDR", "localhost:10001")
+	// NOTE(review): the signal and management Traefik defaults share port
+	// 8088; routing is presumably done by the gRPC :authority set at dial
+	// time — confirm against docker-compose.ha-test.yml.
+	signalTraefikAddr = getEnv("SIGNAL_TRAEFIK_ADDR", "localhost:8088")
+	mgmt1Addr         = getEnv("MGMT1_ADDR", "localhost:33073")
+	mgmt2Addr         = getEnv("MGMT2_ADDR", "localhost:33074")
+	mgmt1MetricsAddr  = getEnv("MGMT1_METRICS", "localhost:9091")
+	mgmt2MetricsAddr  = getEnv("MGMT2_METRICS", "localhost:9092")
+	mgmtTraefikAddr   = getEnv("MGMT_TRAEFIK_ADDR", "localhost:8088")
+	redisAddr         = getEnv("REDIS_ADDR", "localhost:6379")
+)
+
+// getEnv returns the value of environment variable key, or fallback when the
+// variable is unset or empty.
+func getEnv(key, fallback string) string {
+	if v := os.Getenv(key); v != "" {
+		return v
+	}
+	return fallback
+}
+
+// newRedisClient creates a Redis client for verification.
+// The returned client is not closed automatically; callers own its lifecycle.
+func newRedisClient(t *testing.T) *redis.Client {
+	t.Helper()
+	opt, err := redis.ParseURL(fmt.Sprintf("redis://%s", redisAddr))
+	require.NoError(t, err)
+	return redis.NewClient(opt)
+}
+
+// signalClient connects to a signal server gRPC endpoint.
+// Dials with WithBlock and a 5s timeout so an unreachable server fails the
+// test quickly instead of hanging; the connection is closed via t.Cleanup.
+func signalClient(t *testing.T, addr string) signalproto.SignalExchangeClient {
+	t.Helper()
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+	conn, err := grpc.DialContext(ctx, addr,
+		grpc.WithTransportCredentials(insecure.NewCredentials()),
+		grpc.WithBlock(),
+	)
+	require.NoError(t, err)
+	t.Cleanup(func() { _ = conn.Close() })
+	return signalproto.NewSignalExchangeClient(conn)
+}
+
+// signalClientTraefik connects to the signal service through the Traefik load balancer.
func signalClientTraefik(t *testing.T) signalproto.SignalExchangeClient {
	t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	// Traefik routes gRPC based on path prefix; use insecure credentials since it's internal.
	// WithAuthority sets the :authority pseudo-header so Traefik can pick the
	// signal router even though we dial the shared load-balancer address.
	conn, err := grpc.DialContext(ctx, signalTraefikAddr,
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithBlock(),
		grpc.WithAuthority("signal.nb-ha.local"),
	)
	require.NoError(t, err)
	t.Cleanup(func() { _ = conn.Close() })
	return signalproto.NewSignalExchangeClient(conn)
}

// mgmtClientTraefik connects to the management gRPC service through Traefik.
// Same pattern as signalClientTraefik, but with the management authority.
func mgmtClientTraefik(t *testing.T) mgmtproto.ManagementServiceClient {
	t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
	defer cancel()
	conn, err := grpc.DialContext(ctx, mgmtTraefikAddr,
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithBlock(),
		grpc.WithAuthority("mgmt.nb-ha.local"),
	)
	require.NoError(t, err)
	t.Cleanup(func() { _ = conn.Close() })
	return mgmtproto.NewManagementServiceClient(conn)
}

// connectSignalStream registers a peer on a signal server and returns the stream.
// The stream must be consumed in a goroutine to avoid blocking.
// Registration is carried in gRPC metadata (HeaderId); Header() blocks until
// the server responds with its headers, confirming the peer was registered.
func connectSignalStream(t *testing.T, client signalproto.SignalExchangeClient, peerID string) signalproto.SignalExchange_ConnectStreamClient {
	t.Helper()
	ctx := metadata.NewOutgoingContext(context.Background(), metadata.Pairs(signalproto.HeaderId, peerID))
	stream, err := client.ConnectStream(ctx)
	require.NoError(t, err)
	// Wait for registration header confirmation.
	_, err = stream.Header()
	require.NoError(t, err)
	return stream
}

// waitForRedisHashField waits until a Redis hash field has the expected value.
func waitForRedisHashField(t *testing.T, rdb *redis.Client, key, field, expected string, timeout time.Duration) {
	t.Helper()
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	// Poll every 500ms until the field matches or the timeout context expires.
	for {
		val, err := rdb.HGet(ctx, key, field).Result()
		if err == nil && val == expected {
			return
		}
		select {
		case <-time.After(500 * time.Millisecond):
			continue
		case <-ctx.Done():
			t.Fatalf("timeout waiting for Redis HSET %s[%s] = %s", key, field, expected)
		}
	}
}

// requireDocker skips the test if the Docker CLI is not available.
func requireDocker(t *testing.T) {
	t.Helper()
	if _, err := exec.LookPath("docker"); err != nil {
		t.Skip("Docker CLI not available; skipping container-based test")
	}
}

// dockerStop stops a container by name. Output is logged only on failure.
func dockerStop(t *testing.T, container string) {
	t.Helper()
	requireDocker(t)
	out, err := exec.Command("docker", "stop", container).CombinedOutput()
	if err != nil {
		t.Logf("docker stop output: %s", string(out))
	}
	require.NoError(t, err)
}

// dockerStart starts a container by name. Output is logged only on failure.
func dockerStart(t *testing.T, container string) {
	t.Helper()
	requireDocker(t)
	out, err := exec.Command("docker", "start", container).CombinedOutput()
	if err != nil {
		t.Logf("docker start output: %s", string(out))
	}
	require.NoError(t, err)
}

// createTestPAT generates a valid personal access token and inserts it into
// the database for the given user, returning the plain token for API auth.
// Token layout: "nbp_" + 30-char base62 secret + 6-char base62 CRC32 checksum
// (40 chars total); only the base64-encoded SHA-256 of the full token is stored.
func createTestPAT(t *testing.T, userID string) string {
	t.Helper()

	const patPrefix = "nbp_"
	const patSecretLength = 30
	const patChecksumLength = 6
	const patLength = 40

	// Generate random base62 secret.
	secret, err := base62.Random(patSecretLength)
	require.NoError(t, err)

	// Compute CRC32 checksum and encode as base62.
	checksum := crc32.ChecksumIEEE([]byte(secret))
	encodedChecksum := encodeBase62(checksum)
	// fmt's '0' flag zero-pads string verbs, so this left-pads to 6 chars.
	paddedChecksum := fmt.Sprintf("%06s", encodedChecksum)

	plainToken := patPrefix + secret + paddedChecksum
	require.Len(t, plainToken, patLength, "generated PAT should be exactly %d chars", patLength)

	hash := sha256.Sum256([]byte(plainToken))
	hashedToken := b64.StdEncoding.EncodeToString(hash[:])

	// Logging the plain token is acceptable in this disposable test DB only.
	t.Logf("Generated PAT: %s (len=%d), hash: %s", plainToken, len(plainToken), hashedToken)

	// Insert directly into Postgres using psql. Values are locally generated
	// (xid/base62), not untrusted input, so string interpolation is tolerable here.
	query := fmt.Sprintf(
		`INSERT INTO personal_access_tokens (id, user_id, name, hashed_token, expiration_date, created_by, created_at, last_used)
 VALUES ('%s', '%s', 'integration-test', '%s', NOW() + INTERVAL '1 day', '%s', NOW(), NOW())
 ON CONFLICT (id) DO UPDATE SET hashed_token = EXCLUDED.hashed_token;`,
		xid.New().String(), userID, hashedToken, userID,
	)
	out, err := exec.Command("docker", "exec", "nb-postgres", "psql", "-U", "netbird", "-d", "netbird", "-c", query).CombinedOutput()
	if err != nil {
		t.Logf("PAT insert output: %s", string(out))
	}
	require.NoError(t, err, "failed to insert test PAT")
	return plainToken
}

// encodeBase62 encodes a uint32 as a base62 string using the same alphabet as NetBird.
// Digits are emitted least-significant first into the tail of a fixed buffer;
// 20 bytes is more than enough for any uint32 (max 6 base62 digits).
func encodeBase62(n uint32) string {
	const digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
	if n == 0 {
		return "0"
	}
	var buf [20]byte
	i := len(buf)
	for n > 0 {
		i--
		buf[i] = digits[n%62]
		n /= 62
	}
	return string(buf[i:])
}

// getOwnerUserID returns the ID of the first owner user in the database.
func getOwnerUserID(t *testing.T) string {
	t.Helper()
	// -t suppresses headers so the output is just the id plus whitespace.
	out, err := exec.Command("docker", "exec", "nb-postgres", "psql", "-U", "netbird", "-d", "netbird", "-t", "-c",
		"SELECT id FROM users WHERE role = 'owner' LIMIT 1;").CombinedOutput()
	require.NoError(t, err)
	return strings.TrimSpace(string(out))
}

// --- new file: tests/integration/management_ha_test.go ---

package integration

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os/exec"
	"regexp"
	"runtime"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/metadata"
	"golang.zx2c4.com/wireguard/wgctrl/wgtypes"

	"github.com/netbirdio/netbird/encryption"
	mgmtproto "github.com/netbirdio/netbird/shared/management/proto"
)

// mgmtToken authenticates HTTP API calls when provided via MGMT_TOKEN;
// mgmtDomain is the cluster DNS suffix used by the compose environment.
var (
	mgmtToken  = getEnv("MGMT_TOKEN", "")
	mgmtDomain = getEnv("NB_DOMAIN", "nb-ha.local")
)

// Redis key layout shared with the management servers' distributed registry:
// a hash mapping peer key -> owning instance, a per-account pub/sub channel
// prefix, and a prefix for SET NX EX distributed locks.
const (
	mgmtPeersRegistryKey     = "nb:mgmt:peers"
	mgmtAccountChannelPrefix = "nb:mgmt:account:"
	mgmtLockPrefix           = "nb:mgmt:lock:"
)

// mgmtHTTPClient performs an HTTP request against a management server.
+func mgmtHTTPClient(t *testing.T, method, baseURL, path string, body []byte) *http.Response { + t.Helper() + url := fmt.Sprintf("http://%s/api%s", baseURL, path) + var bodyReader io.Reader + if body != nil { + bodyReader = bytes.NewReader(body) + } + req, err := http.NewRequest(method, url, bodyReader) + require.NoError(t, err) + req.Header.Set("Content-Type", "application/json") + if mgmtToken != "" { + req.Header.Set("Authorization", "Token "+mgmtToken) + } + client := &http.Client{Timeout: 10 * time.Second} + resp, err := client.Do(req) + require.NoError(t, err) + return resp +} + +// mgmtGRPCClient connects to a management gRPC endpoint. +func mgmtGRPCClient(t *testing.T, addr string) mgmtproto.ManagementServiceClient { + t.Helper() + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + conn, err := grpc.DialContext(ctx, addr, + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithBlock(), + ) + require.NoError(t, err) + t.Cleanup(func() { _ = conn.Close() }) + return mgmtproto.NewManagementServiceClient(conn) +} + +// connectMgmtSync opens a Sync stream to a management server for a peer. +func connectMgmtSync(t *testing.T, client mgmtproto.ManagementServiceClient, peerKey string) mgmtproto.ManagementService_SyncClient { + t.Helper() + // The Sync RPC requires an EncryptedMessage. For integration testing we + // send a minimal payload; the server will respond with encrypted updates. + ctx := metadata.NewOutgoingContext(context.Background(), metadata.Pairs("wg-pub-key", peerKey)) + stream, err := client.Sync(ctx, &mgmtproto.EncryptedMessage{ + WgPubKey: peerKey, + Version: 1, + }) + require.NoError(t, err) + return stream +} + +// TestManagementUpdatePropagation verifies that peers connected to different +// management instances (mgmt-1 and mgmt-2) can communicate with each other, +// demonstrating shared database consistency and network map propagation. 
+func TestManagementUpdatePropagation(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + // This test assumes agents are already running and connected via docker-compose. + // We verify cross-instance peer reachability through the WireGuard tunnel. + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + // Get agent IPs from netbird status. + agentAIP := getAgentIP(t, "nb-agent-a") + agentBIP := getAgentIP(t, "nb-agent-b") + + t.Logf("agent-a IP: %s, agent-b IP: %s", agentAIP, agentBIP) + require.NotEmpty(t, agentAIP, "agent-a should have a NetBird IP") + require.NotEmpty(t, agentBIP, "agent-b should have a NetBird IP") + + // Verify agent-a can ping agent-b (cross-instance connectivity). + out, err := exec.CommandContext(ctx, "docker", "exec", "nb-agent-a", "ping", "-c", "3", agentBIP).CombinedOutput() + require.NoError(t, err, "agent-a should reach agent-b: %s", string(out)) + assert.Contains(t, string(out), "0% packet loss", "ping should succeed without packet loss") + + // Verify agent-b can ping agent-a (bidirectional connectivity). + out, err = exec.CommandContext(ctx, "docker", "exec", "nb-agent-b", "ping", "-c", "3", agentAIP).CombinedOutput() + require.NoError(t, err, "agent-b should reach agent-a: %s", string(out)) + assert.Contains(t, string(out), "0% packet loss", "ping should succeed without packet loss") +} + +// getAgentIP extracts the NetBird IP address from a running agent container. +func getAgentIP(t *testing.T, container string) string { + t.Helper() + out, err := exec.Command("docker", "exec", container, "netbird", "status").CombinedOutput() + if err != nil { + t.Logf("netbird status output: %s", string(out)) + return "" + } + // Parse "NetBird IP: 100.x.x.x/16" from status output. 
+ re := regexp.MustCompile(`NetBird IP:\s+(\d+\.\d+\.\d+\.\d+)`) + matches := re.FindStringSubmatch(string(out)) + if len(matches) < 2 { + return "" + } + return matches[1] +} + +// TestManagementPeerRegistry verifies that the Redis peer registry mechanism +// works correctly (HSET/HGET/TTL/Expire), which is the foundation for HA +// peer-to-instance routing. +func TestManagementPeerRegistry(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + rdb := newRedisClient(t) + defer func() { _ = rdb.Close() }() + ctx := context.Background() + + peerKey := "peer-mgmt-registry-test" + _ = rdb.HDel(ctx, mgmtPeersRegistryKey, peerKey) + + // Simulate what the management server registry does on peer connect. + err := rdb.HSet(ctx, mgmtPeersRegistryKey, peerKey, "mgmt-1").Err() + require.NoError(t, err) + err = rdb.Expire(ctx, mgmtPeersRegistryKey, 30*time.Second).Err() + require.NoError(t, err) + + // Verify peer is registered. + waitForRedisHashField(t, rdb, mgmtPeersRegistryKey, peerKey, "mgmt-1", 10*time.Second) + + val, err := rdb.HGet(ctx, mgmtPeersRegistryKey, peerKey).Result() + require.NoError(t, err) + assert.Equal(t, "mgmt-1", val) + + // Verify TTL is set. + ttl, err := rdb.TTL(ctx, mgmtPeersRegistryKey).Result() + require.NoError(t, err) + assert.Greater(t, ttl, time.Duration(0)) + + // Simulate deregistration. + err = rdb.HDel(ctx, mgmtPeersRegistryKey, peerKey).Err() + require.NoError(t, err) +} + +// TestManagementDistributedLocks verifies Redis-based distributed lock +// acquisition and release using the management lock prefix. +func TestManagementDistributedLocks(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + rdb := newRedisClient(t) + defer func() { _ = rdb.Close() }() + ctx := context.Background() + + lockKey := mgmtLockPrefix + "test-lock" + _ = rdb.Del(ctx, lockKey) + + // Acquire lock with NX (only if not exists) and EX (expiry). 
	acquired, err := rdb.SetNX(ctx, lockKey, "mgmt-1", 5*time.Second).Result()
	require.NoError(t, err)
	require.True(t, acquired, "lock should be acquired")

	// Verify lock value.
	val, err := rdb.Get(ctx, lockKey).Result()
	require.NoError(t, err)
	assert.Equal(t, "mgmt-1", val)

	// Second acquisition from same or different instance should fail.
	acquired2, err := rdb.SetNX(ctx, lockKey, "mgmt-2", 5*time.Second).Result()
	require.NoError(t, err)
	assert.False(t, acquired2, "lock should not be re-acquired")

	// Release lock.
	// NOTE(review): a plain DEL releases the lock without checking ownership;
	// a compare-and-delete (Lua) would be safer in production — acceptable for
	// exercising the primitive in a test.
	delCount, err := rdb.Del(ctx, lockKey).Result()
	require.NoError(t, err)
	assert.Equal(t, int64(1), delCount)

	// Re-acquire after release.
	acquired3, err := rdb.SetNX(ctx, lockKey, "mgmt-2", 5*time.Second).Result()
	require.NoError(t, err)
	assert.True(t, acquired3, "lock should be re-acquired after release")
}

// TestManagementInstanceFailover verifies that when a management instance is
// stopped, the other instance remains reachable via gRPC, and the Redis
// registry can be updated to reflect the new instance assignment.
func TestManagementInstanceFailover(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test in short mode")
	}
	requireDocker(t)

	rdb := newRedisClient(t)
	defer func() { _ = rdb.Close() }()
	ctx := context.Background()

	// Generate a real WireGuard key pair for this peer.
	peerKey, err := wgtypes.GenerateKey()
	require.NoError(t, err)
	peerPubKeyStr := peerKey.PublicKey().String()

	_ = rdb.HDel(ctx, mgmtPeersRegistryKey, peerPubKeyStr)

	// Helper to login and sync with a management instance.
	// NOTE(review): near-duplicate of the loginAndSync closure in
	// TestManagementFailoverWithSync — consider extracting a shared helper.
	loginAndSync := func(client mgmtproto.ManagementServiceClient, serverAddr string) *mgmtproto.SyncResponse {
		serverKeyResp, err := client.GetServerKey(context.Background(), &mgmtproto.Empty{})
		require.NoError(t, err, "GetServerKey failed for %s", serverAddr)
		serverPubKey, err := wgtypes.ParseKey(serverKeyResp.Key)
		require.NoError(t, err, "invalid server key from %s", serverAddr)

		meta := &mgmtproto.PeerSystemMeta{
			Hostname: peerPubKeyStr, GoOS: runtime.GOOS, OS: runtime.GOOS,
			Core: "core", Platform: "platform", Kernel: "kernel",
		}
		// NOTE(review): hard-coded setup key — presumably provisioned by the
		// test-data init scripts; confirm it exists in the seeded database.
		loginReq := &mgmtproto.LoginRequest{SetupKey: "E2808C99-E7FA-4841-845E-07CE633E50A1", Meta: meta}
		encLogin, err := encryption.EncryptMessage(serverPubKey, peerKey, loginReq)
		require.NoError(t, err, "failed to encrypt login request")

		loginResp, err := client.Login(context.Background(), &mgmtproto.EncryptedMessage{
			WgPubKey: peerPubKeyStr, Body: encLogin,
		})
		require.NoError(t, err, "login failed for %s", serverAddr)

		decryptedLogin := &mgmtproto.LoginResponse{}
		err = encryption.DecryptMessage(serverPubKey, peerKey, loginResp.Body, decryptedLogin)
		require.NoError(t, err, "failed to decrypt login response")

		syncReq := &mgmtproto.SyncRequest{Meta: meta}
		encSync, err := encryption.EncryptMessage(serverPubKey, peerKey, syncReq)
		require.NoError(t, err, "failed to encrypt sync request")

		stream, err := client.Sync(context.Background(), &mgmtproto.EncryptedMessage{
			WgPubKey: peerPubKeyStr, Body: encSync,
		})
		require.NoError(t, err, "sync call failed for %s", serverAddr)

		encResp, err := stream.Recv()
		require.NoError(t, err, "failed to receive sync response from %s", serverAddr)

		resp := &mgmtproto.SyncResponse{}
		err = encryption.DecryptMessage(serverPubKey, peerKey, encResp.Body, resp)
		require.NoError(t, err, "failed to decrypt sync response from %s", serverAddr)
		return resp
	}

	// Simulate peer registered to mgmt-1.
	err = rdb.HSet(ctx, mgmtPeersRegistryKey, peerPubKeyStr, "mgmt-1").Err()
	require.NoError(t, err)
	waitForRedisHashField(t, rdb, mgmtPeersRegistryKey, peerPubKeyStr, "mgmt-1", 10*time.Second)

	// Verify mgmt-1 is reachable with a real peer login+sync.
	client1 := mgmtGRPCClient(t, mgmt1Addr)
	syncResp1 := loginAndSync(client1, mgmt1Addr)
	require.NotNil(t, syncResp1, "should receive valid sync response from mgmt-1")
	t.Logf("sync from mgmt-1 OK")

	// Stop mgmt-1.
	dockerStop(t, "nb-mgmt-1")
	defer dockerStart(t, "nb-mgmt-1")

	// Give the cluster a moment to notice the stopped instance.
	time.Sleep(3 * time.Second)

	// Update registry to reflect failover to mgmt-2.
	err = rdb.HSet(ctx, mgmtPeersRegistryKey, peerPubKeyStr, "mgmt-2").Err()
	require.NoError(t, err)
	waitForRedisHashField(t, rdb, mgmtPeersRegistryKey, peerPubKeyStr, "mgmt-2", 10*time.Second)

	// Verify mgmt-2 is still reachable with a real peer login+sync.
	client2 := mgmtGRPCClient(t, mgmt2Addr)
	syncResp2 := loginAndSync(client2, mgmt2Addr)
	require.NotNil(t, syncResp2, "should receive valid sync response from mgmt-2 after failover")
	t.Logf("sync from mgmt-2 OK after failover")
}

// TestManagementHealthConsistency verifies that both management instances
// report healthy status, confirming independent operation.
func TestManagementHealthConsistency(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test in short mode")
	}

	for _, metricsAddr := range []string{mgmt1MetricsAddr, mgmt2MetricsAddr} {
		// Management exposes metrics on its dedicated metrics port; use it as health indicator.
		// The main API port serves gRPC+HTTP multiplexed and has no dedicated /healthz.
		url := fmt.Sprintf("http://%s/metrics", metricsAddr)

		// Retry with backoff in case a previous test restarted the container.
+ var lastErr error + var resp *http.Response + for i := 0; i < 10; i++ { + resp, lastErr = http.Get(url) + if lastErr == nil && resp.StatusCode == http.StatusOK { + break + } + if resp != nil { + _ = resp.Body.Close() + } + time.Sleep(500 * time.Millisecond) + } + require.NoError(t, lastErr, "health check failed for %s", metricsAddr) + _ = resp.Body.Close() + assert.Equal(t, http.StatusOK, resp.StatusCode, "%s not healthy", metricsAddr) + } +} + +// TestManagementPolicyPropagation verifies that policies and groups created via +// one management instance are immediately visible via the other instance, +// demonstrating shared database consistency. +func TestManagementPolicyPropagation(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + // Get owner user and create a PAT for API auth. + ownerID := getOwnerUserID(t) + require.NotEmpty(t, ownerID, "owner user should exist") + token := createTestPAT(t, ownerID) + + // Create a test group via mgmt-1. + groupName := "test-ha-group-" + fmt.Sprintf("%d", time.Now().Unix()) + groupBody, _ := json.Marshal(map[string]interface{}{ + "name": groupName, + }) + resp := mgmtHTTPClientWithToken(t, "POST", mgmt1Addr, "/groups", groupBody, token) + assert.Equal(t, http.StatusOK, resp.StatusCode, "should create group on mgmt-1") + groupBodyBytes, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() + + var groupResp map[string]interface{} + _ = json.Unmarshal(groupBodyBytes, &groupResp) + groupID, _ := groupResp["id"].(string) + t.Logf("created group %s with id %s", groupName, groupID) + require.NotEmpty(t, groupID, "group should have an ID") + + // Verify the group is visible on mgmt-2 (shared database). 
+ resp2 := mgmtHTTPClientWithToken(t, "GET", mgmt2Addr, "/groups", nil, token) + assert.Equal(t, http.StatusOK, resp2.StatusCode, "should list groups on mgmt-2") + groupsBody, _ := io.ReadAll(resp2.Body) + _ = resp2.Body.Close() + assert.Contains(t, string(groupsBody), groupID, "group should be visible on mgmt-2") + + // Create a policy on mgmt-1 using the group. + policyName := "test-ha-policy-" + fmt.Sprintf("%d", time.Now().Unix()) + policyBody, _ := json.Marshal(map[string]interface{}{ + "name": policyName, + "description": "HA test policy", + "enabled": true, + "rules": []map[string]interface{}{{ + "name": "allow-all", + "enabled": true, + "action": "accept", + "sources": []string{groupID}, + "destinations": []string{groupID}, + "protocol": "all", + "bidirectional": true, + }}, + }) + resp3 := mgmtHTTPClientWithToken(t, "POST", mgmt1Addr, "/policies", policyBody, token) + assert.Equal(t, http.StatusOK, resp3.StatusCode, "should create policy on mgmt-1") + policyBodyBytes, _ := io.ReadAll(resp3.Body) + _ = resp3.Body.Close() + + var policyResp map[string]interface{} + _ = json.Unmarshal(policyBodyBytes, &policyResp) + policyID, _ := policyResp["id"].(string) + t.Logf("created policy %s with id %s", policyName, policyID) + require.NotEmpty(t, policyID, "policy should have an ID") + + // Verify the policy is visible on mgmt-2. + resp4 := mgmtHTTPClientWithToken(t, "GET", mgmt2Addr, "/policies", nil, token) + assert.Equal(t, http.StatusOK, resp4.StatusCode, "should list policies on mgmt-2") + policiesBody, _ := io.ReadAll(resp4.Body) + _ = resp4.Body.Close() + assert.Contains(t, string(policiesBody), policyID, "policy should be visible on mgmt-2") + + // Cleanup: delete policy and group. 
+ _ = mgmtHTTPClientWithToken(t, "DELETE", mgmt1Addr, "/policies/"+policyID, nil, token) + _ = mgmtHTTPClientWithToken(t, "DELETE", mgmt1Addr, "/groups/"+groupID, nil, token) +} + +// TestManagementFailoverWithSync verifies that when a management instance fails, +// a peer can reconnect to the surviving instance via Traefik and continue +// receiving network map updates. +func TestManagementFailoverWithSync(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + requireDocker(t) + + // Generate a real WireGuard key pair for this peer. + peerKey, err := wgtypes.GenerateKey() + require.NoError(t, err) + + // Helper to perform full login + sync against a management client. + loginAndSync := func(client mgmtproto.ManagementServiceClient, serverAddr string) *mgmtproto.SyncResponse { + // 1. Get the server's public key. + serverKeyResp, err := client.GetServerKey(context.Background(), &mgmtproto.Empty{}) + require.NoError(t, err, "GetServerKey failed for %s", serverAddr) + serverPubKey, err := wgtypes.ParseKey(serverKeyResp.Key) + require.NoError(t, err, "invalid server key from %s", serverAddr) + + // 2. Login with the setup key to register the peer. 
		meta := &mgmtproto.PeerSystemMeta{
			Hostname:       peerKey.PublicKey().String(),
			GoOS:           runtime.GOOS,
			OS:             runtime.GOOS,
			Core:           "core",
			Platform:       "platform",
			Kernel:         "kernel",
			NetbirdVersion: "",
		}
		// NOTE(review): same hard-coded setup key and login/sync sequence as in
		// TestManagementInstanceFailover — consider a shared helper.
		loginReq := &mgmtproto.LoginRequest{SetupKey: "E2808C99-E7FA-4841-845E-07CE633E50A1", Meta: meta}
		encLogin, err := encryption.EncryptMessage(serverPubKey, peerKey, loginReq)
		require.NoError(t, err, "failed to encrypt login request")

		loginResp, err := client.Login(context.Background(), &mgmtproto.EncryptedMessage{
			WgPubKey: peerKey.PublicKey().String(),
			Body:     encLogin,
		})
		require.NoError(t, err, "login failed for %s", serverAddr)

		decryptedLogin := &mgmtproto.LoginResponse{}
		err = encryption.DecryptMessage(serverPubKey, peerKey, loginResp.Body, decryptedLogin)
		require.NoError(t, err, "failed to decrypt login response")
		t.Logf("login to %s succeeded, peer registered", serverAddr)

		// 3. Open Sync stream with a properly encrypted SyncRequest.
		syncReq := &mgmtproto.SyncRequest{Meta: meta}
		encSync, err := encryption.EncryptMessage(serverPubKey, peerKey, syncReq)
		require.NoError(t, err, "failed to encrypt sync request")

		stream, err := client.Sync(context.Background(), &mgmtproto.EncryptedMessage{
			WgPubKey: peerKey.PublicKey().String(),
			Body:     encSync,
		})
		require.NoError(t, err, "sync call failed for %s", serverAddr)

		// Read the first encrypted SyncResponse.
		encResp, err := stream.Recv()
		require.NoError(t, err, "failed to receive sync response from %s", serverAddr)

		resp := &mgmtproto.SyncResponse{}
		err = encryption.DecryptMessage(serverPubKey, peerKey, encResp.Body, resp)
		require.NoError(t, err, "failed to decrypt sync response from %s", serverAddr)
		return resp
	}

	// Verify peer can sync via mgmt-1.
	client1 := mgmtGRPCClient(t, mgmt1Addr)
	syncResp1 := loginAndSync(client1, mgmt1Addr)
	require.NotNil(t, syncResp1, "should receive valid sync response from mgmt-1")
	require.NotNil(t, syncResp1.NetbirdConfig, "sync response should contain netbird config")
	t.Logf("sync from mgmt-1 OK, signal URI: %s", syncResp1.NetbirdConfig.GetSignal().GetUri())

	// Stop mgmt-1.
	dockerStop(t, "nb-mgmt-1")
	defer dockerStart(t, "nb-mgmt-1")

	// Give Traefik's health checks time to mark mgmt-1 as down.
	time.Sleep(3 * time.Second)

	// Verify peer can sync via Traefik (should route to mgmt-2).
	client2 := mgmtClientTraefik(t)
	syncResp2 := loginAndSync(client2, mgmtTraefikAddr)
	require.NotNil(t, syncResp2, "should receive valid sync response via Traefik after failover")
	require.NotNil(t, syncResp2.NetbirdConfig, "sync response should contain netbird config after failover")
	t.Logf("sync via Traefik OK after failover, signal URI: %s", syncResp2.NetbirdConfig.GetSignal().GetUri())
}

// mgmtHTTPClientWithToken performs an authenticated HTTP request against a management server.
func mgmtHTTPClientWithToken(t *testing.T, method, baseURL, path string, body []byte, token string) *http.Response {
	t.Helper()
	url := fmt.Sprintf("http://%s/api%s", baseURL, path)
	var bodyReader io.Reader
	if body != nil {
		bodyReader = bytes.NewReader(body)
	}
	req, err := http.NewRequest(method, url, bodyReader)
	require.NoError(t, err)
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", "Token "+token)
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Do(req)
	require.NoError(t, err)
	// Caller is responsible for closing the returned response body.
	return resp
}

# --- new file: tests/integration/scripts/agent-setup.sh ---

#!/bin/bash
set -e

# Agent setup script for NetBird HA testing
# This script configures the netbird agent and connects it to the management server

echo "=== NetBird Agent Setup ==="
echo "Management URL: $NB_MANAGEMENT_URL"
# NOTE(review): echoing the setup key puts a credential into container logs;
# acceptable for a disposable test env only.
echo "Setup Key: $NB_SETUP_KEY"

# Enable IP forwarding (best-effort; may be denied in unprivileged containers)
sysctl -w net.ipv4.ip_forward=1 2>/dev/null || true
sysctl -w net.ipv6.conf.all.forwarding=1 2>/dev/null || true

# Create netbird config directory
mkdir -p /etc/netbird

# If setup key is provided, run netbird up
if [ -n "$NB_SETUP_KEY" ]; then
  echo "Connecting to management server..."
  netbird up --management-url "$NB_MANAGEMENT_URL" --setup-key "$NB_SETUP_KEY" || true
else
  echo "No setup key provided. Waiting for manual registration..."
+ echo "Run: netbird up --management-url $NB_MANAGEMENT_URL" +fi + +# Keep container running +exec "$@" diff --git a/tests/integration/scripts/build.sh b/tests/integration/scripts/build.sh new file mode 100755 index 00000000000..897f9a9d38d --- /dev/null +++ b/tests/integration/scripts/build.sh @@ -0,0 +1,59 @@ +#!/bin/bash +set -euo pipefail + +# Build script for NetBird HA test environment +# Builds all required binaries for Docker images +# +# Usage: +# ./tests/integration/scripts/build.sh + +cd "$(dirname "$0")/../.." +PROJECT_ROOT="$(pwd)" + +echo "=== Building NetBird HA Binaries ===" +echo "Project root: ${PROJECT_ROOT}" + +# Determine version info from git if available +VERSION="${VERSION:-$(git describe --tags --always 2>/dev/null || echo 'dev')}" +COMMIT="${COMMIT:-$(git rev-parse --short HEAD 2>/dev/null || echo 'unknown')}" +DATE="${DATE:-$(date -u +%Y-%m-%dT%H:%M:%SZ)}" + +LDFLAGS="-s -w \ + -X github.com/netbirdio/netbird/version.version=${VERSION} \ + -X main.commit=${COMMIT} \ + -X main.date=${DATE} \ + -X main.builtBy=build.sh" + +echo "Version: ${VERSION}" +echo "Commit: ${COMMIT}" +echo "" + +# Build management server (requires CGO for SQLite support) +echo "Building management server (netbird-mgmt)..." +CGO_ENABLED=1 go build -ldflags "${LDFLAGS}" -o netbird-mgmt ./management/ + +# Build signal server (CGO disabled for static binary) +echo "Building signal server (netbird-signal)..." +CGO_ENABLED=0 go build -ldflags "${LDFLAGS}" -o netbird-signal ./signal/ + +# Build relay server (CGO disabled for static binary) +echo "Building relay server (netbird-relay)..." +CGO_ENABLED=0 go build -ldflags "${LDFLAGS}" -o netbird-relay ./relay/ + +# Build combined server (requires CGO) +echo "Building combined server (netbird-server)..." +CGO_ENABLED=1 go build -ldflags "${LDFLAGS}" -o netbird-server ./combined/ + +# Build client (CGO disabled for static binary) +echo "Building netbird client..." 
+CGO_ENABLED=0 go build -ldflags "${LDFLAGS}" -o netbird ./client/ + +echo "" +echo "=== All binaries built successfully ===" +echo "Binaries:" +ls -la netbird-mgmt netbird-signal netbird-relay netbird-server netbird 2>/dev/null || true +echo "" +echo "Next steps:" +echo " 1. cp .env.example .env" +echo " 2. Edit .env with your desired values" +echo " 3. docker compose -f docker-compose.ha-test.yml up --build" diff --git a/tests/integration/scripts/init-test-data.sh b/tests/integration/scripts/init-test-data.sh new file mode 100755 index 00000000000..817a395b31b --- /dev/null +++ b/tests/integration/scripts/init-test-data.sh @@ -0,0 +1,225 @@ +#!/usr/bin/env bash +# ------------------------------------------------------------------------------ +# init-test-data.sh +# Idempotent initialization of NetBird HA integration test data. +# ------------------------------------------------------------------------------ +set -euo pipefail + +: "${POSTGRES_DSN:=postgres://netbird:netbird@postgres.nb-ha.local:5432/netbird?sslmode=disable}" +: "${MGMT1_ADDR:=mgmt-1.nb-ha.local:33073}" +: "${MGMT_TOKEN:=}" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SETUP_KEY_NAME="ha-integration-test-key" +OWNER_EMAIL="admin@nb-ha.local" +OWNER_PASSWORD="Admin123!" +OWNER_NAME="Test Admin" + +log() { + echo "[init-test-data] $*" +} + +# Wait for Postgres to be ready. +wait_for_postgres() { + log "Waiting for PostgreSQL..." + for i in {1..30}; do + if pg_isready -d "$POSTGRES_DSN" >/dev/null 2>&1; then + log "PostgreSQL is ready" + return 0 + fi + sleep 2 + done + log "ERROR: PostgreSQL did not become ready in time" + return 1 +} + +# Wait for management server to be ready. +wait_for_mgmt() { + log "Waiting for management server at $MGMT1_ADDR..." 
  # NOTE(review): the Go tests comment that the management API port has no
  # dedicated /healthz (it serves gRPC+HTTP multiplexed) — confirm this
  # endpoint actually exists, otherwise this loop always times out.
  for i in {1..30}; do
    if curl -sf "http://$MGMT1_ADDR/healthz" >/dev/null 2>&1; then
      log "Management server is ready"
      return 0
    fi
    sleep 2
  done
  log "ERROR: Management server did not become ready in time"
  return 1
}

# Check if setup is required and create owner if so.
setup_instance() {
  local status_json
  # Fall back to "no setup required" when the endpoint is unreachable.
  status_json=$(curl -sf "http://$MGMT1_ADDR/api/instance" 2>/dev/null || echo '{"setup_required":false}')
  local setup_required
  setup_required=$(echo "$status_json" | python3 -c 'import sys,json; print(json.load(sys.stdin).get("setup_required", False))' 2>/dev/null || echo "false")

  # python3 prints "True"/"False"; accept both Python and JSON casing.
  if [[ "$setup_required" == "True" || "$setup_required" == "true" ]]; then
    log "Instance setup required; creating owner user $OWNER_EMAIL"
    curl -sf "http://$MGMT1_ADDR/api/setup" \
      -X POST \
      -H "Content-Type: application/json" \
      -d "{\"email\":\"$OWNER_EMAIL\",\"password\":\"$OWNER_PASSWORD\",\"name\":\"$OWNER_NAME\"}" \
      >/dev/null
    log "Owner user created"
  else
    log "Instance already set up"
  fi
}

# Fetch the owner user ID from the database.
get_owner_user_id() {
  psql "$POSTGRES_DSN" -tA -c "
    SELECT id FROM users WHERE email = '$OWNER_EMAIL' LIMIT 1;
  " 2>/dev/null || true
}

# Idempotent setup key creation via direct SQL.
# We ensure at least one reusable setup key exists for integration tests.
ensure_setup_key() {
  local account_id
  # Tests assume a single tenant; take the first (only) account.
  account_id=$(psql "$POSTGRES_DSN" -tA -c "
    SELECT id FROM accounts LIMIT 1;
  " 2>/dev/null || true)

  if [[ -z "$account_id" ]]; then
    log "WARNING: No account found in database"
    return 0
  fi

  local existing_key
  existing_key=$(psql "$POSTGRES_DSN" -tA -c "
    SELECT key FROM setup_keys WHERE name = '$SETUP_KEY_NAME' LIMIT 1;
  " 2>/dev/null || true)

  if [[ -n "$existing_key" ]]; then
    log "Setup key '$SETUP_KEY_NAME' already exists"
    return 0
  fi

  local key_id key_value key_secret
  key_id=$(python3 -c 'import uuid; print(uuid.uuid4())')
  key_value=$(python3 -c 'import uuid; print(str(uuid.uuid4()).upper())')
  # NOTE(review): key_secret is stored as hex sha256 of the key — confirm this
  # matches the hashing scheme the management server uses when validating keys.
  key_secret=$(echo -n "$key_value" | sha256sum | awk '{print $1}')

  psql "$POSTGRES_DSN" -c "
    INSERT INTO setup_keys (
      id, account_id, key, key_secret, name, type,
      created_at, updated_at, revoked, used_times,
      auto_groups, usage_limit, ephemeral, allow_extra_dns_labels
    ) VALUES (
      '$key_id', '$account_id', '$key_value', '$key_secret', '$SETUP_KEY_NAME', 'reusable',
      NOW(), NOW(), false, 0,
      '[]'::jsonb, 0, false, false
    )
    ON CONFLICT (id) DO NOTHING;
  " 2>/dev/null || true

  log "Setup key '$SETUP_KEY_NAME' created ($key_value)"
}

# Idempotent PAT creation for HTTP API access in tests.
+ensure_pat() { + local user_id + user_id=$(get_owner_user_id) + if [[ -z "$user_id" ]]; then + log "WARNING: Owner user not found; skipping PAT creation" + return 0 + fi + + local existing_pat + existing_pat=$(psql "$POSTGRES_DSN" -tA -c " + SELECT hashed_token FROM personal_access_tokens WHERE name = 'ha-test-pat' AND user_id = '$user_id' LIMIT 1; + " 2>/dev/null || true) + + if [[ -n "$existing_pat" ]]; then + log "PAT already exists for user $user_id" + return 0 + fi + + local pat_id pat_plain pat_hashed + pat_id=$(python3 -c 'import uuid; print(uuid.uuid4())') + pat_plain="nbp_$(python3 -c 'import secrets; print(secrets.token_urlsafe(30))')" + pat_hashed=$(echo -n "$pat_plain" | sha256sum | awk '{print $1}') + + psql "$POSTGRES_DSN" -c " + INSERT INTO personal_access_tokens ( + id, user_id, name, hashed_token, + expiration_date, created_by, created_at, last_used + ) VALUES ( + '$pat_id', '$user_id', 'ha-test-pat', '$pat_hashed', + NOW() + INTERVAL '7 days', '$user_id', NOW(), NULL + ) + ON CONFLICT (id) DO NOTHING; + " 2>/dev/null || true + + # Export for test runner. + echo "MGMT_TOKEN=$pat_plain" + log "PAT created for user $user_id" +} + +# Idempotent test peer creation in database. +# Creates peers that can be used by integration tests without full client setup. 
+ensure_test_peers() { + local account_id + account_id=$(psql "$POSTGRES_DSN" -tA -c " + SELECT id FROM accounts LIMIT 1; + " 2>/dev/null || true) + + if [[ -z "$account_id" ]]; then + log "WARNING: No account found; skipping test peer creation" + return 0 + fi + + for peer_name in "test-peer-1" "test-peer-2" "test-peer-3"; do + local existing_peer + existing_peer=$(psql "$POSTGRES_DSN" -tA -c " + SELECT id FROM peers WHERE name = '$peer_name' AND account_id = '$account_id' LIMIT 1; + " 2>/dev/null || true) + + if [[ -n "$existing_peer" ]]; then + continue + fi + + local peer_id peer_key peer_ip + peer_id=$(python3 -c 'import uuid; print(uuid.uuid4())') + peer_key="$(echo -n "$peer_name-$account_id" | sha256sum | awk '{print $1}')" + peer_ip="100.64.$(shuf -i 1-254 -n 1).$(shuf -i 1-254 -n 1)" + + psql "$POSTGRES_DSN" -c " + INSERT INTO peers ( + id, account_id, key, ip, name, dns_label, + ssh_key, ssh_enabled, login_expiration_enabled, + inactivity_expiration_enabled, ephemeral, created_at, + meta_hostname, meta_goos, meta_goarch, meta_netbird_version, + meta_kernel_version, meta_ui_version, meta_extra_dns_labels, + peer_status_last_seen, peer_status_connected, peer_status_login_expired, peer_status_requires_approval, + proxy_meta_embedded, proxy_meta_cluster + ) VALUES ( + '$peer_id', '$account_id', '$peer_key', '\"$peer_ip\"'::jsonb, '$peer_name', '$peer_name', + '', false, false, + false, false, NOW(), + '', '', '', '', '', '', '[]'::jsonb, + NOW(), false, false, false, + false, '' + ) + ON CONFLICT (id) DO NOTHING; + " 2>/dev/null || true + + log "Test peer '$peer_name' created" + done +} + +# Main flow. 
+main() { + log "Starting idempotent test data initialization" + wait_for_postgres + wait_for_mgmt + setup_instance + ensure_setup_key + ensure_pat + ensure_test_peers + log "Initialization complete" +} + +main "$@" diff --git a/tests/integration/scripts/run-tests.sh b/tests/integration/scripts/run-tests.sh new file mode 100755 index 00000000000..e89caaa374a --- /dev/null +++ b/tests/integration/scripts/run-tests.sh @@ -0,0 +1,64 @@ +#!/bin/bash +set -euo pipefail + +# NetBird HA Integration Test Runner +# This script runs inside the test-runner container and executes +# integration tests against the HA infrastructure. + +echo "=== NetBird HA Integration Tests ===" +echo "Domain: ${NB_DOMAIN:-nb-ha.local}" +echo "Redis: ${REDIS_ADDR:-not set}" +echo "Management-1: ${MGMT1_ADDR:-not set}" +echo "Management-2: ${MGMT2_ADDR:-not set}" +echo "Signal-1: ${SIGNAL1_ADDR:-not set}" +echo "Signal-2: ${SIGNAL2_ADDR:-not set}" +echo "" + +# Function to wait for a service to be healthy +wait_for_service() { + local name="$1" + local url="$2" + local max_attempts="${3:-30}" + local attempt=0 + + echo "Waiting for ${name} at ${url}..." + while ! wget -qO- "${url}" >/dev/null 2>&1; do + attempt=$((attempt + 1)) + if [ "${attempt}" -ge "${max_attempts}" ]; then + echo "ERROR: ${name} did not become ready in time" + return 1 + fi + sleep 2 + done + echo "${name} is ready!" +} + +# Wait for core services +echo "--- Health Checks ---" +wait_for_service "management-1" "http://${MGMT1_ADDR}/healthz" 30 || true +wait_for_service "management-2" "http://${MGMT2_ADDR}/healthz" 30 || true +wait_for_service "signal-1" "http://signal-1.${NB_DOMAIN}:9090/metrics" 30 || true +wait_for_service "signal-2" "http://signal-2.${NB_DOMAIN}:9090/metrics" 30 || true + +echo "" +echo "--- Running Tests ---" + +# If Go test files exist, run them +if ls *.go >/dev/null 2>&1; then + echo "Running Go tests..." + go test -v ./... 
2>&1 || true +else + echo "No Go test files found in /tests" +fi + +# Run shell-based integration checks +echo "" +echo "--- Redis Connectivity ---" +redis-cli -h redis.${NB_DOMAIN} ping || echo "WARNING: Redis ping failed" + +echo "" +echo "--- PostgreSQL Connectivity ---" +psql "${POSTGRES_DSN}" -c "SELECT 1;" || echo "WARNING: PostgreSQL check failed" + +echo "" +echo "=== Integration Test Run Complete ===" diff --git a/tests/integration/signal_ha_test.go b/tests/integration/signal_ha_test.go new file mode 100644 index 00000000000..274ff203617 --- /dev/null +++ b/tests/integration/signal_ha_test.go @@ -0,0 +1,468 @@ +package integration + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + signalproto "github.com/netbirdio/netbird/shared/signal/proto" +) + +const ( + signalRegistryKey = "nb:signal:registry" + signalChannelPrefix = "nb:signal:instance:" +) + +// TestSignalCrossInstanceMessaging verifies that a message sent from a peer +// connected to signal-1 reaches a peer connected to signal-2 via Redis. +func TestSignalCrossInstanceMessaging(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + rdb := newRedisClient(t) + defer func() { _ = rdb.Close() }() + ctx := context.Background() + + // Clean slate. + _ = rdb.Del(ctx, signalRegistryKey) + + peerA := "peer-a-signal-test" + peerB := "peer-b-signal-test" + + // Connect peer A to signal-1 and peer B to signal-2. + sig1 := signalClient(t, signal1Addr) + sig2 := signalClient(t, signal2Addr) + + streamA := connectSignalStream(t, sig1, peerA) + defer streamA.CloseSend() + + streamB := connectSignalStream(t, sig2, peerB) + defer streamB.CloseSend() + + // Consume stream B in background so server can send to it. 
+ recvCh := make(chan *signalproto.EncryptedMessage, 1) + doneCh := make(chan struct{}) + go func() { + defer close(doneCh) + msg, err := streamB.Recv() + if err == nil { + recvCh <- msg + } + }() + + // Wait for both peers to be registered in Redis. + waitForRedisHashField(t, rdb, signalRegistryKey, peerA, "signal-1", 10*time.Second) + waitForRedisHashField(t, rdb, signalRegistryKey, peerB, "signal-2", 10*time.Second) + + // Send message from peer A to peer B via signal-1. + sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + _, err := sig1.Send(sendCtx, &signalproto.EncryptedMessage{ + Key: peerA, + RemoteKey: peerB, + Body: []byte("hello-from-signal-1"), + }) + require.NoError(t, err) + + // Verify peer B receives the message. + select { + case msg := <-recvCh: + require.NotNil(t, msg) + assert.Equal(t, peerA, msg.Key) + assert.Equal(t, peerB, msg.RemoteKey) + case <-time.After(15 * time.Second): + t.Fatal("timeout waiting for cross-instance message") + } + + <-doneCh +} + +// TestSignalRegistryPopulation verifies that peers are registered in the Redis HSET. +func TestSignalRegistryPopulation(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + rdb := newRedisClient(t) + defer func() { _ = rdb.Close() }() + ctx := context.Background() + + peerID := "peer-registry-test" + _ = rdb.HDel(ctx, signalRegistryKey, peerID) + + sig1 := signalClient(t, signal1Addr) + stream := connectSignalStream(t, sig1, peerID) + defer stream.CloseSend() + + waitForRedisHashField(t, rdb, signalRegistryKey, peerID, "signal-1", 10*time.Second) + + val, err := rdb.HGet(ctx, signalRegistryKey, peerID).Result() + require.NoError(t, err) + assert.Equal(t, "signal-1", val) + + // Verify TTL is set on the registry key. 
+ ttl, err := rdb.TTL(ctx, signalRegistryKey).Result() + require.NoError(t, err) + assert.Greater(t, ttl, time.Duration(0)) +} + +// TestSignalInstanceFailover verifies that when a signal instance is stopped, +// a peer can reconnect to the other instance and communication continues. +func TestSignalInstanceFailover(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + requireDocker(t) + + rdb := newRedisClient(t) + defer func() { _ = rdb.Close() }() + ctx := context.Background() + + peerA := "peer-a-failover" + peerB := "peer-b-failover" + + // Clean slate. + _ = rdb.HDel(ctx, signalRegistryKey, peerA, peerB) + + sig1 := signalClient(t, signal1Addr) + sig2 := signalClient(t, signal2Addr) + + // Peer A on signal-1, peer B on signal-2. + streamA := connectSignalStream(t, sig1, peerA) + defer streamA.CloseSend() + + streamB := connectSignalStream(t, sig2, peerB) + defer streamB.CloseSend() + + waitForRedisHashField(t, rdb, signalRegistryKey, peerA, "signal-1", 10*time.Second) + waitForRedisHashField(t, rdb, signalRegistryKey, peerB, "signal-2", 10*time.Second) + + // Stop signal-1 container. + dockerStop(t, "nb-signal-1") + defer dockerStart(t, "nb-signal-1") + + // Wait for signal-1 peer entry to disappear (or reconnect). + // The old peer connection will drop; we reconnect peer A to signal-2. + time.Sleep(2 * time.Second) + + // Reconnect peer A to signal-2. + streamA2 := connectSignalStream(t, sig2, peerA) + defer streamA2.CloseSend() + + waitForRedisHashField(t, rdb, signalRegistryKey, peerA, "signal-2", 15*time.Second) + + // Verify cross-instance messaging still works (both on signal-2 now). 
+ recvCh := make(chan *signalproto.EncryptedMessage, 1) + doneCh := make(chan struct{}) + go func() { + defer close(doneCh) + msg, err := streamB.Recv() + if err == nil { + recvCh <- msg + } + }() + + sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + _, err := sig2.Send(sendCtx, &signalproto.EncryptedMessage{ + Key: peerA, + RemoteKey: peerB, + Body: []byte("hello-after-failover"), + }) + require.NoError(t, err) + + select { + case msg := <-recvCh: + require.NotNil(t, msg) + assert.Equal(t, "hello-after-failover", string(msg.Body)) + case <-time.After(15 * time.Second): + t.Fatal("timeout waiting for post-failover message") + } + <-doneCh +} + +// TestSignalGracefulDegradation verifies that when Redis is unavailable, +// the signal server falls back to local-only mode and peers on the same +// instance can still communicate. +func TestSignalGracefulDegradation(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + requireDocker(t) + + rdb := newRedisClient(t) + defer func() { _ = rdb.Close() }() + ctx := context.Background() + + peerA := "peer-a-degradation" + peerB := "peer-b-degradation" + + // Stop Redis. + dockerStop(t, "nb-redis") + defer dockerStart(t, "nb-redis") + + // Wait a moment for Redis to be fully down. + time.Sleep(2 * time.Second) + + // Connect both peers to signal-1 (same instance). + sig1 := signalClient(t, signal1Addr) + streamA := connectSignalStream(t, sig1, peerA) + defer streamA.CloseSend() + + streamB := connectSignalStream(t, sig1, peerB) + defer streamB.CloseSend() + + // Consume stream B. + recvCh := make(chan *signalproto.EncryptedMessage, 1) + doneCh := make(chan struct{}) + go func() { + defer close(doneCh) + msg, err := streamB.Recv() + if err == nil { + recvCh <- msg + } + }() + + // Send from peer A to peer B on the same instance. 
+ sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + _, err := sig1.Send(sendCtx, &signalproto.EncryptedMessage{ + Key: peerA, + RemoteKey: peerB, + Body: []byte("local-only-message"), + }) + require.NoError(t, err) + + select { + case msg := <-recvCh: + require.NotNil(t, msg) + assert.Equal(t, "local-only-message", string(msg.Body)) + case <-time.After(10 * time.Second): + t.Fatal("timeout waiting for local-only message") + } + <-doneCh +} + +// TestSignalRedisChannelIsolation verifies that messages published to one +// instance channel are not incorrectly received by another instance's peers. +func TestSignalRedisChannelIsolation(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + rdb := newRedisClient(t) + defer func() { _ = rdb.Close() }() + ctx := context.Background() + + peerA := "peer-a-isolation" + peerB := "peer-b-isolation" + + _ = rdb.HDel(ctx, signalRegistryKey, peerA, peerB) + + // Wait for Redis to be fully up and DNS-resolvable after potential restart. + // Signal servers need Redis to register peers. + for i := 0; i < 30; i++ { + if err := rdb.Ping(ctx).Err(); err == nil { + break + } + time.Sleep(1 * time.Second) + } + require.NoError(t, rdb.Ping(ctx).Err(), "redis not available") + + // Allow signal servers time to reconnect their Redis PubSub and clients. + time.Sleep(3 * time.Second) + + sig1 := signalClient(t, signal1Addr) + sig2 := signalClient(t, signal2Addr) + + streamA := connectSignalStream(t, sig1, peerA) + defer streamA.CloseSend() + + streamB := connectSignalStream(t, sig2, peerB) + defer streamB.CloseSend() + + waitForRedisHashField(t, rdb, signalRegistryKey, peerA, "signal-1", 15*time.Second) + waitForRedisHashField(t, rdb, signalRegistryKey, peerB, "signal-2", 15*time.Second) + + // Verify channel keys exist and are distinct. 
+ ch1 := signalChannelPrefix + "signal-1" + ch2 := signalChannelPrefix + "signal-2" + exists1, err := rdb.Publish(ctx, ch1, "ping").Result() + require.NoError(t, err) + assert.GreaterOrEqual(t, exists1, int64(1)) // at least signal-1 subscriber + + exists2, err := rdb.Publish(ctx, ch2, "ping").Result() + require.NoError(t, err) + assert.GreaterOrEqual(t, exists2, int64(1)) // at least signal-2 subscriber +} + +// TestSignalTraefikLoadBalancing verifies that peers connected through the +// Traefik load balancer are distributed across both signal instances. +func TestSignalTraefikLoadBalancing(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + rdb := newRedisClient(t) + defer func() { _ = rdb.Close() }() + ctx := context.Background() + + // Clean slate. + _ = rdb.Del(ctx, signalRegistryKey) + + // Connect multiple peers through Traefik. + peers := []string{"peer-tlb-1", "peer-tlb-2", "peer-tlb-3", "peer-tlb-4"} + streams := make([]signalproto.SignalExchange_ConnectStreamClient, len(peers)) + + for i, peer := range peers { + client := signalClientTraefik(t) + streams[i] = connectSignalStream(t, client, peer) + defer streams[i].CloseSend() + } + + // Wait for all peers to register and collect their instance assignments. + instances := make(map[string]int) + for _, peer := range peers { + // Poll Redis until the peer appears (with timeout). + var instance string + for i := 0; i < 20; i++ { + val, err := rdb.HGet(ctx, signalRegistryKey, peer).Result() + if err == nil && val != "" { + instance = val + break + } + time.Sleep(500 * time.Millisecond) + } + require.NotEmpty(t, instance, "peer %s should be registered in Redis", peer) + instances[instance]++ + t.Logf("peer %s registered on %s", peer, instance) + } + + // Verify that peers are distributed across BOTH instances. 
+ assert.GreaterOrEqual(t, instances["signal-1"], 1, "at least one peer should land on signal-1") + assert.GreaterOrEqual(t, instances["signal-2"], 1, "at least one peer should land on signal-2") + t.Logf("load distribution: signal-1=%d, signal-2=%d", instances["signal-1"], instances["signal-2"]) +} + +// TestSignalTraefikFailover verifies that when the signal instance serving a +// peer dies, the peer can reconnect through Traefik to the surviving instance +// and cross-instance messaging continues. +func TestSignalTraefikFailover(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + requireDocker(t) + + rdb := newRedisClient(t) + defer func() { _ = rdb.Close() }() + ctx := context.Background() + + peerA := "peer-a-traefik-failover" + peerB := "peer-b-traefik-failover" + + // Clean slate. + _ = rdb.HDel(ctx, signalRegistryKey, peerA, peerB) + + // Connect both peers through Traefik. + client := signalClientTraefik(t) + + // Helper to connect and get instance with retry for cross-instance placement. + connectAndGetInstance := func(peerID string) (signalproto.SignalExchange_ConnectStreamClient, string) { + for attempt := 0; attempt < 10; attempt++ { + _ = rdb.HDel(ctx, signalRegistryKey, peerID) + stream := connectSignalStream(t, client, peerID) + // Poll Redis to find which instance this peer landed on. 
+ var instance string + for i := 0; i < 20; i++ { + val, err := rdb.HGet(ctx, signalRegistryKey, peerID).Result() + if err == nil && val != "" { + instance = val + break + } + time.Sleep(500 * time.Millisecond) + } + if instance != "" { + return stream, instance + } + stream.CloseSend() + } + t.Fatalf("could not connect peer %s to any instance", peerID) + return nil, "" + } + + streamA, instanceA := connectAndGetInstance(peerA) + defer streamA.CloseSend() + + streamB, instanceB := connectAndGetInstance(peerB) + defer streamB.CloseSend() + + t.Logf("peerA on %s, peerB on %s", instanceA, instanceB) + + // Ensure peers are on different instances; if not, reconnect peerB. + if instanceA == instanceB { + t.Logf("both peers on same instance, reconnecting peerB") + streamB.CloseSend() + _ = rdb.HDel(ctx, signalRegistryKey, peerB) + for attempt := 0; attempt < 10; attempt++ { + streamB, instanceB = connectAndGetInstance(peerB) + if instanceB != instanceA { + break + } + streamB.CloseSend() + _ = rdb.HDel(ctx, signalRegistryKey, peerB) + } + require.NotEqual(t, instanceA, instanceB, "peerB should land on different instance") + } + defer streamB.CloseSend() + + // Stop the instance serving peer A. + targetInstance := instanceA + t.Logf("stopping %s", targetInstance) + dockerStop(t, "nb-"+targetInstance) + defer dockerStart(t, "nb-"+targetInstance) + + time.Sleep(2 * time.Second) + + // Reconnect peer A through Traefik; it should land on the other instance. + streamA.CloseSend() + _ = rdb.HDel(ctx, signalRegistryKey, peerA) + streamA2, instanceA2 := connectAndGetInstance(peerA) + defer streamA2.CloseSend() + + survivor := instanceB + t.Logf("peerA reconnected on %s (expected survivor: %s)", instanceA2, survivor) + assert.Equal(t, survivor, instanceA2, "peerA should reconnect to surviving instance") + + // Verify peer B (still on original instance) can receive messages from peer A. 
+ recvCh := make(chan *signalproto.EncryptedMessage, 1) + doneCh := make(chan struct{}) + go func() { + defer close(doneCh) + msg, err := streamB.Recv() + if err == nil { + recvCh <- msg + } + }() + + sendCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + _, err := client.Send(sendCtx, &signalproto.EncryptedMessage{ + Key: peerA, + RemoteKey: peerB, + Body: []byte("hello-after-traefik-failover"), + }) + require.NoError(t, err) + + select { + case msg := <-recvCh: + require.NotNil(t, msg) + assert.Equal(t, "hello-after-traefik-failover", string(msg.Body)) + case <-time.After(15 * time.Second): + t.Fatal("timeout waiting for post-failover message through Traefik") + } + <-doneCh +}