diff --git a/Cargo.lock b/Cargo.lock
index 942c9f8..c6e9989 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2778,7 +2778,7 @@ dependencies = [
 
 [[package]]
 name = "talos-pilot"
-version = "0.1.3"
+version = "0.1.4"
 dependencies = [
  "clap",
  "color-eyre",
@@ -2791,7 +2791,7 @@ dependencies = [
 
 [[package]]
 name = "talos-pilot-core"
-version = "0.1.3"
+version = "0.1.4"
 dependencies = [
  "chrono",
  "serde",
@@ -2802,7 +2802,7 @@ dependencies = [
 
 [[package]]
 name = "talos-pilot-tui"
-version = "0.1.3"
+version = "0.1.4"
 dependencies = [
  "arboard",
  "base64",
@@ -2819,6 +2819,7 @@ dependencies = [
  "tachyonfx",
  "talos-pilot-core",
  "talos-rs",
+ "tempfile",
  "thiserror 2.0.17",
  "tokio",
  "tokio-stream",
@@ -2828,7 +2829,7 @@ dependencies = [
 
 [[package]]
 name = "talos-rs"
-version = "0.1.3"
+version = "0.1.4"
 dependencies = [
  "base64",
  "dirs-next",
diff --git a/Cargo.toml b/Cargo.toml
index 13e3d44..1556c97 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ members = ["crates/*"]
 resolver = "2"
 
 [workspace.package]
-version = "0.1.3"
+version = "0.1.4"
 edition = "2024"
 authors = ["Ken Udovic"]
 license = "MIT"
diff --git a/README.md b/README.md
index 3853f97..6243f47 100644
--- a/README.md
+++ b/README.md
@@ -215,7 +215,7 @@ RUST_LOG=debug cargo run
 tail -f /tmp/talos-pilot.log
 
 # Check for warnings
-cargo clippy --all
+cargo clippy --all --all-targets -- -D warnings
 ```
 
 ### Local Testing with Docker
@@ -225,7 +225,7 @@ See [docs/local-talos-setup.md](docs/local-talos-setup.md) for setting up a loca
 
 ### Current Stats
 
 - **Core library**: ~1,760 lines across 8 modules
-- **Tests**: 70 total (47 core + 6 TUI + 6 talos-rs + 11 doc)
+- **Tests**: 88 total (47 core + 8 TUI + 22 talos-rs + 11 doc)
 - **Components**: 12 TUI components
 - **Build warnings**: 0
diff --git a/crates/talos-pilot-tui/Cargo.toml b/crates/talos-pilot-tui/Cargo.toml
index 3b444b6..3628b3f 100644
--- a/crates/talos-pilot-tui/Cargo.toml
+++ b/crates/talos-pilot-tui/Cargo.toml
@@ -49,3 +49,6 @@ chrono.workspace = true
 
 # Home directory
 dirs-next.workspace = true
+
+[dev-dependencies]
+tempfile = "3"
diff --git a/crates/talos-pilot-tui/src/components/diagnostics/k8s.rs b/crates/talos-pilot-tui/src/components/diagnostics/k8s.rs
index 8af164e..1b2c16a 100644
--- a/crates/talos-pilot-tui/src/components/diagnostics/k8s.rs
+++ b/crates/talos-pilot-tui/src/components/diagnostics/k8s.rs
@@ -35,10 +35,26 @@ pub async fn create_k8s_client(talos_client: &TalosClient) -> Result<Client> {
 pub async fn create_k8s_client_with_kubeconfig_source(
     _talos_client: &TalosClient,
     kubeconfig_client: Option<&TalosClient>,
 ) -> Result<Client> {
+    // Try KUBECONFIG environment variable first (via Config::infer())
+    // This respects standard K8s tooling conventions
+    if let Ok(config) = Config::infer().await
+        && let Ok(client) = Client::try_from(config)
+    {
+        tracing::debug!("Using kubeconfig from environment (KUBECONFIG or default path)");
+        return Ok(client);
+    }
+
+    // Fall back to fetching kubeconfig from Talos API
+    tracing::debug!("Falling back to fetching kubeconfig from Talos API");
+
     // Use the provided kubeconfig_client if available, otherwise use the main client
     let client_for_kubeconfig = kubeconfig_client.unwrap_or(_talos_client);
@@ -1045,3 +1061,117 @@ pub async fn wait_for_node_ready(
         tokio::time::sleep(poll_interval).await;
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::io::Write;
+    use tempfile::NamedTempFile;
+    use tokio::sync::Mutex;
+
+    static ENV_MUTEX: Mutex<()> = Mutex::const_new(());
+
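+    /// RAII guard that restores an environment variable on drop: a minimal
+    /// sketch of how the manual save/restore in the tests below could be
+    /// factored out. `EnvVarGuard` is hypothetical and currently unused.
+    #[allow(dead_code)]
+    struct EnvVarGuard {
+        key: &'static str,
+        old: Option<String>,
+    }
+
+    impl Drop for EnvVarGuard {
+        fn drop(&mut self) {
+            // Mirror the manual restore logic used in the tests.
+            unsafe {
+                match self.old.take() {
+                    Some(v) => std::env::set_var(self.key, v),
+                    None => std::env::remove_var(self.key),
+                }
+            }
+        }
+    }
+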
+    /// Test that KUBECONFIG environment variable is respected.
+    ///
+    /// This test creates a valid kubeconfig file pointing to a non-existent cluster.
+    /// If KUBECONFIG is respected, Config::infer() will find it and try to use it,
+    /// resulting in a connection error (not a kubeconfig fetch error from Talos).
+    ///
+    /// Note: This test manipulates environment variables and must be run serially.
+    #[tokio::test]
+    async fn test_kubeconfig_env_is_tried_first() {
+        let _guard = ENV_MUTEX.lock().await;
+
+        // Create a valid kubeconfig file pointing to a non-existent cluster
+        let kubeconfig_content = r#"
+apiVersion: v1
+kind: Config
+clusters:
+- cluster:
+    server: https://127.0.0.1:64321
+    insecure-skip-tls-verify: true
+  name: test-cluster
+contexts:
+- context:
+    cluster: test-cluster
+    user: test-user
+  name: test-context
+current-context: test-context
+users:
+- name: test-user
+  user:
+    token: test-token
+"#;
+
+        let mut temp_file = NamedTempFile::new().unwrap();
+        temp_file.write_all(kubeconfig_content.as_bytes()).unwrap();
+        let kubeconfig_path = temp_file.path().to_string_lossy().to_string();
+
+        // Set KUBECONFIG to our temp file
+        let old_kubeconfig = std::env::var("KUBECONFIG").ok();
+        unsafe {
+            std::env::set_var("KUBECONFIG", &kubeconfig_path);
+        }
+
+        // Try to infer config - this should succeed in finding the kubeconfig
+        // (it will fail to connect, but that's expected)
+        let config_result = Config::infer().await;
+
+        // Restore original KUBECONFIG
+        unsafe {
+            if let Some(old_value) = old_kubeconfig {
+                std::env::set_var("KUBECONFIG", old_value);
+            } else {
+                std::env::remove_var("KUBECONFIG");
+            }
+        }
+
+        // The config should have been loaded (even though connection would fail)
+        assert!(
+            config_result.is_ok(),
+            "Config::infer() should find our KUBECONFIG file"
+        );
+
+        // Verify it points to our test cluster
+        let config = config_result.unwrap();
+        assert_eq!(
+            config.cluster_url.to_string(),
+            "https://127.0.0.1:64321/",
+            "Should use the cluster URL from our kubeconfig"
+        );
+    }
+
+    /// Test that when KUBECONFIG points to a non-existent file, it falls back gracefully.
+    ///
+    /// Note: This test can't fully verify the Talos fallback without a real Talos client,
+    /// but it verifies that the function handles missing KUBECONFIG correctly.
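+    ///
+    /// Both tests here take ENV_MUTEX, so they serialize against each other; any
+    /// other test that mutates KUBECONFIG should take the same lock (or run with
+    /// `cargo test -- --test-threads=1`) to avoid races.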
+    #[tokio::test]
+    async fn test_kubeconfig_fallback_on_missing_file() {
+        let _guard = ENV_MUTEX.lock().await;
+
+        // Set KUBECONFIG to a non-existent file
+        let old_kubeconfig = std::env::var("KUBECONFIG").ok();
+        unsafe {
+            std::env::set_var("KUBECONFIG", "/non/existent/path/kubeconfig");
+        }
+
+        // Try to infer config - this should fail
+        let config_result = Config::infer().await;
+
+        // Restore original KUBECONFIG
+        unsafe {
+            if let Some(old_value) = old_kubeconfig {
+                std::env::set_var("KUBECONFIG", old_value);
+            } else {
+                std::env::remove_var("KUBECONFIG");
+            }
+        }
+
+        // Config::infer() should fail when kubeconfig doesn't exist
+        // This triggers the fallback to Talos API in create_k8s_client_with_kubeconfig_source
+        assert!(
+            config_result.is_err(),
+            "Config::infer() should fail with non-existent kubeconfig"
+        );
+    }
+}
diff --git a/crates/talos-rs/src/client.rs b/crates/talos-rs/src/client.rs
index 7f935f6..4a7fad7 100644
--- a/crates/talos-rs/src/client.rs
+++ b/crates/talos-rs/src/client.rs
@@ -31,6 +31,8 @@ pub struct TalosClient {
     channel: Channel,
     /// Target nodes for API requests
     nodes: Vec<String>,
+    /// Endpoints from configuration (used to filter out vIPs from node targeting)
+    endpoints: Vec<String>,
 }
 
 impl TalosClient {
@@ -38,8 +40,13 @@ pub async fn from_context(ctx: &Context) -> Result<Self, TalosError> {
         let channel = create_channel(ctx).await?;
         let nodes = ctx.target_nodes().to_vec();
+        let endpoints = ctx.endpoints.clone();
 
-        Ok(Self { channel, nodes })
+        Ok(Self {
+            channel,
+            nodes,
+            endpoints,
+        })
     }
 
     /// Create a new client from the default talosconfig
@@ -65,6 +72,7 @@ impl TalosClient {
         Self {
             channel: self.channel.clone(),
             nodes: vec![node.to_string()],
+            endpoints: self.endpoints.clone(),
         }
     }
 
@@ -111,10 +119,24 @@ impl TalosClient {
         // When nodes is empty or same as endpoints, skip the header
         if !self.nodes.is_empty() {
             // Filter out localhost/127.0.0.1 entries as these are endpoint proxies
+            // Also filter out entries that match endpoints (likely vIPs)
             let valid_nodes: Vec<String> = self
                 .nodes
                 .iter()
-                .filter(|n| !n.starts_with("127.0.0.1") && !n.starts_with("localhost"))
+                .filter(|n| {
+                    let is_localhost = n.starts_with("127.0.0.1") || n.starts_with("localhost");
+
+                    // Extract hostname (without port) for comparison
+                    let node_host = n.split(':').next().unwrap_or(n);
+
+                    // Check if this node matches any endpoint (which could be a vIP)
+                    let is_endpoint = self.endpoints.iter().any(|e| {
+                        let endpoint_host = e.split(':').next().unwrap_or(e);
+                        endpoint_host == node_host
+                    });
+
+                    !is_localhost && !is_endpoint
+                })
                 .map(|n| n.split(':').next().unwrap_or(n).to_string())
                 .collect();
 
@@ -128,6 +150,35 @@ impl TalosClient {
         request
     }
 
+    /// Get the filtered target nodes that would be sent in API requests
+    ///
+    /// This filters out:
+    /// - localhost/127.0.0.1 entries (proxy endpoints)
+    /// - Entries that match configured endpoints (likely vIPs)
+    ///
+    /// Returns the list of actual node hostnames (without ports).
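+    ///
+    /// For example (hypothetical hosts, mirroring the tests below): with
+    /// `nodes = ["vip.example.com", "node1:50000"]` and
+    /// `endpoints = ["vip.example.com:50000"]`, this returns `["node1"]`:
+    /// the vIP is dropped because its host matches an endpoint, and the port
+    /// is stripped from the remaining node.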
+    #[doc(hidden)]
+    pub fn filtered_target_nodes(&self) -> Vec<String> {
+        self.nodes
+            .iter()
+            .filter(|n| {
+                let is_localhost = n.starts_with("127.0.0.1") || n.starts_with("localhost");
+
+                // Extract hostname (without port) for comparison
+                let node_host = n.split(':').next().unwrap_or(n);
+
+                // Check if this node matches any endpoint (which could be a vIP)
+                let is_endpoint = self.endpoints.iter().any(|e| {
+                    let endpoint_host = e.split(':').next().unwrap_or(e);
+                    endpoint_host == node_host
+                });
+
+                !is_localhost && !is_endpoint
+            })
+            .map(|n| n.split(':').next().unwrap_or(n).to_string())
+            .collect()
+    }
+
     /// Get version information from all configured nodes
     pub async fn version(&self) -> Result<Vec<VersionInfo>, TalosError> {
         let mut client = self.machine_client();
@@ -2439,3 +2490,173 @@ impl NodeTimeInfo {
         if self.synced { "synced" } else { "not synced" }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Helper to create a TalosClient for testing without a real connection
+    fn create_test_client(nodes: Vec<String>, endpoints: Vec<String>) -> TalosClient {
+        // Create a dummy channel - we won't actually use it for these tests
+        // This is a bit of a hack, but it allows us to test the filtering logic
+        let channel = tonic::transport::Channel::from_static("http://[::1]:50000").connect_lazy();
+
+        TalosClient {
+            channel,
+            nodes,
+            endpoints,
+        }
+    }
+
+    #[tokio::test]
+    async fn test_filtered_nodes_removes_vip_endpoint() {
+        // Scenario: talosconfig has a vIP endpoint and actual node hostnames
+        // The vIP should be filtered out when targeting nodes
+        let client = create_test_client(
+            vec![
+                "cluster.example.com".to_string(), // vIP (matches endpoint)
+                "kubec01".to_string(),
+                "kubec02".to_string(),
+                "kubec03".to_string(),
+            ],
+            vec!["cluster.example.com:50000".to_string()], // endpoint is the vIP
+        );
+
+        let filtered = client.filtered_target_nodes();
+
+        // vIP should be filtered out, only actual nodes remain
+        assert_eq!(filtered, vec!["kubec01", "kubec02", "kubec03"]);
+        assert!(!filtered.contains(&"cluster.example.com".to_string()));
+    }
+
+    #[tokio::test]
+    async fn test_filtered_nodes_removes_localhost() {
+        // Scenario: local proxy endpoint
+        let client = create_test_client(
+            vec![
+                "127.0.0.1:50000".to_string(),
+                "node1".to_string(),
+                "node2".to_string(),
+            ],
+            vec!["127.0.0.1:50000".to_string()],
+        );
+
+        let filtered = client.filtered_target_nodes();
+
+        assert_eq!(filtered, vec!["node1", "node2"]);
+        assert!(!filtered.iter().any(|n| n.starts_with("127.0.0.1")));
+    }
+
+    #[tokio::test]
+    async fn test_filtered_nodes_removes_localhost_variant() {
+        // Scenario: localhost hostname
+        let client = create_test_client(
+            vec!["localhost:50000".to_string(), "node1".to_string()],
+            vec!["localhost:50000".to_string()],
+        );
+
+        let filtered = client.filtered_target_nodes();
+
+        assert_eq!(filtered, vec!["node1"]);
+    }
+
+    #[tokio::test]
+    async fn test_filtered_nodes_strips_ports() {
+        // Ports should be stripped from node names
+        let client = create_test_client(
+            vec!["node1:50000".to_string(), "node2:50000".to_string()],
+            vec!["vip.example.com:50000".to_string()],
+        );
+
+        let filtered = client.filtered_target_nodes();
+
+        assert_eq!(filtered, vec!["node1", "node2"]);
+    }
+
+    #[tokio::test]
+    async fn test_filtered_nodes_empty_when_only_endpoints() {
+        // Scenario: nodes fallback to endpoints (empty nodes list means endpoints are used)
+        // When target_nodes() returns endpoints, they should all be filtered out
+        let client = create_test_client(
+            vec!["cluster.example.com:50000".to_string()], // same as endpoint
+            vec!["cluster.example.com:50000".to_string()],
+        );
+
+        let filtered = client.filtered_target_nodes();
+
+        // All nodes match the endpoint, so result should be empty
+        assert!(filtered.is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_filtered_nodes_preserves_non_endpoint_nodes() {
+        // Scenario: mix of endpoint and non-endpoint nodes
+        let client = create_test_client(
+            vec![
+                "vip.cluster.local".to_string(),
+                "actual-node-1.cluster.local".to_string(),
+                "actual-node-2.cluster.local".to_string(),
+            ],
+            vec!["vip.cluster.local:50000".to_string()],
+        );
+
+        let filtered = client.filtered_target_nodes();
+
+        assert_eq!(
+            filtered,
+            vec!["actual-node-1.cluster.local", "actual-node-2.cluster.local"]
+        );
+    }
+
+    #[tokio::test]
+    async fn test_filtered_nodes_multiple_endpoints() {
+        // Scenario: multiple endpoints (e.g., multiple vIPs or proxies)
+        let client = create_test_client(
+            vec![
+                "vip1.example.com".to_string(),
+                "vip2.example.com".to_string(),
+                "node1".to_string(),
+                "node2".to_string(),
+            ],
+            vec![
+                "vip1.example.com:50000".to_string(),
+                "vip2.example.com:50000".to_string(),
+            ],
+        );
+
+        let filtered = client.filtered_target_nodes();
+
+        assert_eq!(filtered, vec!["node1", "node2"]);
+    }
+
+    #[tokio::test]
+    async fn test_filtered_nodes_no_endpoints() {
+        // Scenario: no endpoints configured (edge case)
+        let client = create_test_client(vec!["node1".to_string(), "node2".to_string()], vec![]);
+
+        let filtered = client.filtered_target_nodes();
+
+        // With no endpoints to filter, all non-localhost nodes remain
+        assert_eq!(filtered, vec!["node1", "node2"]);
+    }
+
+    #[tokio::test]
+    async fn test_filtered_nodes_with_port_in_endpoint_only() {
+        // Scenario: endpoint has port, nodes don't
+        // This tests the host extraction logic
+        let client = create_test_client(
+            vec![
+                "vip.example.com".to_string(),
+                "node1".to_string(),
+                "node2".to_string(),
+            ],
+            vec!["vip.example.com:50000".to_string()],
+        );
+
+        let filtered = client.filtered_target_nodes();
+
+        // vip.example.com should be filtered out even though it doesn't have a port
+        // because the endpoint vip.example.com:50000 matches after stripping port
+        assert_eq!(filtered, vec!["node1", "node2"]);
+    }
+}
diff --git a/test-clusters/scripts/multi-endpoint-test.sh b/test-clusters/scripts/multi-endpoint-test.sh
new file mode 100755
index 0000000..6a32db6
--- /dev/null
+++ b/test-clusters/scripts/multi-endpoint-test.sh
@@ -0,0 +1,283 @@
+#!/usr/bin/env bash
+#
+# Multi-Endpoint Test Script
+#
+# Creates a talosconfig context with multiple endpoints pointing to the same cluster
+# to reproduce the duplicate node issue.
+#
+# Problem: When multiple endpoints are configured, queries through different endpoints
+# return the same nodes, leading to duplicate entries in the cluster view.
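+# With N endpoints that each report the same M nodes, a naive merge shows N x M
+# entries (for the 3-endpoint, 3-node context below: 3 x 3 = 9); deduplicating by
+# node identity collapses this back to M.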
+#
+# Example user config causing the issue:
+# ```yaml
+# context: prod
+# contexts:
+#   prod:
+#     endpoints:
+#       - cluster.example.com   # vIP
+#       - kubec01.example.com
+#       - kubec02.example.com
+#       - kubec03.example.com
+#     nodes:
+#       - kubec01.example.com
+#       - kubec02.example.com
+#       - kubec03.example.com
+#       - kubew01.example.com
+#       - kubew02.example.com
+# ```
+#
+# Usage:
+#   ./multi-endpoint-test.sh setup     Create test context with multiple endpoints
+#   ./multi-endpoint-test.sh cleanup   Remove test context
+#   ./multi-endpoint-test.sh status    Show current talosconfig contexts
+#   ./multi-endpoint-test.sh help      Show help
+#
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+TEST_CONTEXT="multi-endpoint-test"
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+CYAN='\033[0;36m'
+NC='\033[0m'
+
+log_info()    { echo -e "${BLUE}[INFO]${NC} $*"; }
+log_success() { echo -e "${GREEN}[OK]${NC} $*"; }
+log_warn()    { echo -e "${YELLOW}[WARN]${NC} $*"; }
+log_error()   { echo -e "${RED}[ERROR]${NC} $*" >&2; }
+
+show_help() {
+    cat << 'EOF'
+Multi-Endpoint Test Script
+
+Reproduces the duplicate node issue that occurs when a talosconfig has multiple
+endpoints pointing to the same cluster.
+
+USAGE:
+    ./multi-endpoint-test.sh <command>
+
+COMMANDS:
+    setup     Create a test context with multiple endpoints using cluster-alpha
+    cleanup   Remove the test context
+    status    Show current talosconfig contexts
+    help      Show this help
+
+WHAT THIS TESTS:
+
+The duplicate node bug occurs when:
+1. A talosconfig context has multiple endpoints (e.g., VIP + individual node IPs)
+2. talos-pilot queries through each endpoint
+3. Each endpoint returns the same nodes
+4. Without deduplication, nodes appear multiple times in the UI
+
+After running 'setup', you'll have a 'multi-endpoint-test' context with:
+- 3 endpoints (control plane + 2 workers from cluster-alpha)
+- 3 nodes (the same 3 machines)
+
+This mimics a production setup where users configure:
+- VIP endpoint (for HA)
+- Individual control plane endpoints (for direct access)
+
+Run talos-pilot and switch to this context to verify the deduplication fix.
+
+PREREQUISITES:
+    - cluster-alpha must be running (use multi-cluster-test.sh create first)
+
+EOF
+}
+
+check_prerequisites() {
+    # Check if cluster-alpha exists
+    if ! docker ps --format '{{.Names}}' | grep -q "^cluster-alpha-"; then
+        log_error "cluster-alpha is not running"
+        echo "Run './multi-cluster-test.sh create' first to create test clusters"
+        exit 1
+    fi
+
+    # Check talosctl is available
+    if ! command -v talosctl &>/dev/null; then
+        log_error "talosctl is not installed"
+        exit 1
+    fi
+}
+
+setup_multi_endpoint_context() {
+    check_prerequisites
+
+    log_info "Creating multi-endpoint test context..."
+
+    # The main talosconfig is at ~/.talos/config
+    local talosconfig="${HOME}/.talos/config"
+    if [[ ! -f "${talosconfig}" ]]; then
+        log_error "talosconfig not found at ${talosconfig}"
+        exit 1
+    fi
+
+    # The cluster-alpha nodes are (internal Docker network IPs):
+    #   - 10.5.0.2 (controlplane)
+    #   - 10.5.0.3 (worker-1)
+    #   - 10.5.0.4 (worker-2)
+
+    # First, remove old context if it exists
+    talosctl config remove "${TEST_CONTEXT}" -y 2>/dev/null || true
+
+    # Create output directory
+    mkdir -p "${SCRIPT_DIR}/../output"
+
+    # Use Python to properly extract and create the config (more reliable YAML handling)
+    log_info "Creating ${TEST_CONTEXT} context with multiple endpoints..."
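+    # Note: the heredoc below requires python3 with PyYAML available (it does
+    # `import yaml`); install it with e.g. `pip install pyyaml` if missing.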
+    python3 << 'PYEOF'
+import yaml
+import os
+
+talosconfig_path = os.path.expanduser("~/.talos/config")
+
+with open(talosconfig_path, 'r') as f:
+    config = yaml.safe_load(f)
+
+# Get cluster-alpha credentials
+alpha_ctx = config['contexts']['cluster-alpha']
+
+# Create new context with multiple endpoints
+new_config = {
+    'context': 'multi-endpoint-test',
+    'contexts': {
+        'multi-endpoint-test': {
+            'endpoints': ['10.5.0.2', '10.5.0.3', '10.5.0.4'],
+            'nodes': ['10.5.0.2', '10.5.0.3', '10.5.0.4'],
+            'ca': alpha_ctx['ca'],
+            'crt': alpha_ctx['crt'],
+            'key': alpha_ctx['key']
+        }
+    }
+}
+
+# Write to temp file
+output_file = os.path.expanduser("~/.talos/multi-endpoint-temp.yaml")
+with open(output_file, 'w') as f:
+    yaml.dump(new_config, f, default_flow_style=False)
+
+print(f"Created temp config at {output_file}")
+PYEOF
+
+    local temp_config="${HOME}/.talos/multi-endpoint-temp.yaml"
+
+    if [[ ! -f "${temp_config}" ]]; then
+        log_error "Failed to create temp config"
+        exit 1
+    fi
+
+    # Merge this config
+    log_info "Merging ${TEST_CONTEXT} context..."
+    talosctl config merge "${temp_config}"
+
+    # Switch to the new context
+    talosctl config context "${TEST_CONTEXT}"
+
+    # Clean up temp file
+    rm -f "${temp_config}"
+
+    echo ""
+    log_success "Multi-endpoint test context created!"
+    echo ""
+    echo -e "${CYAN}=== Context Configuration ===${NC}"
+    echo ""
+    echo "Context: ${TEST_CONTEXT}"
+    echo ""
+    echo "Endpoints (3 - simulates VIP + individual nodes):"
+    echo "  - 10.5.0.2 (controlplane)"
+    echo "  - 10.5.0.3 (worker-1)"
+    echo "  - 10.5.0.4 (worker-2)"
+    echo ""
+    echo "Nodes (3 - same machines as endpoints):"
+    echo "  - 10.5.0.2"
+    echo "  - 10.5.0.3"
+    echo "  - 10.5.0.4"
+    echo ""
+    echo -e "${CYAN}=== Testing ===${NC}"
+    echo ""
+    echo "1. Verify the context is active:"
+    echo "   talosctl config contexts"
+    echo ""
+    echo "2. Run talos-pilot:"
+    echo "   cargo run --bin talos-pilot"
+    echo ""
+    echo "3. Expected behavior (with deduplication fix):"
+    echo "   - Should see 3 nodes total (not 9 or more)"
+    echo "   - Each node appears exactly once"
+    echo ""
+    echo "4. Bug behavior (without deduplication):"
+    echo "   - Nodes would appear 3x each (once per endpoint)"
+    echo "   - Total of 9 node entries for 3 actual nodes"
+    echo ""
+}
+
+cleanup_context() {
+    log_info "Removing ${TEST_CONTEXT} context..."
+
+    if talosctl config remove "${TEST_CONTEXT}" -y 2>/dev/null; then
+        log_success "Context ${TEST_CONTEXT} removed"
+    else
+        log_warn "Context ${TEST_CONTEXT} was not found"
+    fi
+
+    # Switch back to cluster-alpha if available
+    if talosctl config contexts 2>/dev/null | grep -q "cluster-alpha"; then
+        talosctl config context cluster-alpha
+        log_info "Switched back to cluster-alpha context"
+    fi
+}
+
+show_status() {
+    echo ""
+    echo -e "${CYAN}=== Talos Contexts ===${NC}"
+    echo ""
+    talosctl config contexts
+    echo ""
+
+    # Check if our test context exists
+    if talosctl config contexts 2>/dev/null | grep -q "${TEST_CONTEXT}"; then
+        echo -e "${GREEN}Test context '${TEST_CONTEXT}' is configured${NC}"
+        echo ""
+        echo "To test:"
+        echo "  1. talosctl config context ${TEST_CONTEXT}"
+        echo "  2. cargo run --bin talos-pilot"
+    else
+        echo -e "${YELLOW}Test context '${TEST_CONTEXT}' is not configured${NC}"
+        echo ""
+        echo "Run './multi-endpoint-test.sh setup' to create it"
+    fi
+}
+
+main() {
+    local command="${1:-help}"
+
+    case "${command}" in
+        setup)
+            setup_multi_endpoint_context
+            ;;
+        cleanup)
+            cleanup_context
+            ;;
+        status)
+            show_status
+            ;;
+        help|--help|-h)
+            show_help
+            ;;
+        *)
+            log_error "Unknown command: ${command}"
+            echo "Run './multi-endpoint-test.sh help' for usage"
+            exit 1
+            ;;
+    esac
+}
+
+main "$@"
diff --git a/test-clusters/scripts/setup-vip-config.sh b/test-clusters/scripts/setup-vip-config.sh
new file mode 100755
index 0000000..e905de8
--- /dev/null
+++ b/test-clusters/scripts/setup-vip-config.sh
@@ -0,0 +1,174 @@
+#!/bin/bash
+# setup-vip-config.sh - Configure talosconfig with vIP test scenarios
+#
+# Run this AFTER creating a cluster with:
+#   sudo -E talosctl cluster create --name test-cluster --cidr 10.5.0.0/24 --controlplanes 1 --workers 2
+#
+# This script sets up multiple talosconfig contexts to test the vIP filtering fix.
+
+set -e
+
+CLUSTER_NAME="test-cluster"
+CP_IP="10.5.0.2"
+WORKER1_IP="10.5.0.3"
+WORKER2_IP="10.5.0.4"
+VIP_HOSTNAME="cluster.local"
+
+echo "=========================================="
+echo "Setting up vIP Test Contexts"
+echo "=========================================="
+echo ""
+
+# Check if talosconfig exists
+if [ ! -f ~/.talos/config ]; then
+    echo "ERROR: ~/.talos/config not found!"
+    echo ""
+    echo "Please create a cluster first:"
+    echo "  sudo -E talosctl cluster create --name test-cluster --cidr 10.5.0.0/24 --controlplanes 1 --workers 2"
+    exit 1
+fi
+
+# Verify we can connect to the cluster
+echo "[1/4] Verifying cluster connectivity..."
+if ! talosctl version -n "$CP_IP" > /dev/null 2>&1; then
+    echo "ERROR: Cannot connect to cluster at $CP_IP"
+    echo "Make sure the cluster is running."
+    exit 1
+fi
+echo "  Connected to cluster"
+
+# Get node hostnames
+echo ""
+echo "[2/4] Getting node information..."
+CP_HOSTNAME=$(talosctl get hostname -n "$CP_IP" -o json 2>/dev/null | grep -o '"hostname":"[^"]*"' | cut -d'"' -f4 || echo "")
+WORKER1_HOSTNAME=$(talosctl get hostname -n "$WORKER1_IP" -o json 2>/dev/null | grep -o '"hostname":"[^"]*"' | cut -d'"' -f4 || echo "")
+WORKER2_HOSTNAME=$(talosctl get hostname -n "$WORKER2_IP" -o json 2>/dev/null | grep -o '"hostname":"[^"]*"' | cut -d'"' -f4 || echo "")
+
+# Fallback to IPs if hostnames not available
+CP_HOSTNAME=${CP_HOSTNAME:-$CP_IP}
+WORKER1_HOSTNAME=${WORKER1_HOSTNAME:-$WORKER1_IP}
+WORKER2_HOSTNAME=${WORKER2_HOSTNAME:-$WORKER2_IP}
+
+echo "  Control Plane: $CP_HOSTNAME ($CP_IP)"
+echo "  Worker 1:      $WORKER1_HOSTNAME ($WORKER1_IP)"
+echo "  Worker 2:      $WORKER2_HOSTNAME ($WORKER2_IP)"
+
+# Add /etc/hosts entry for vIP hostname
+echo ""
+echo "[3/4] Setting up vIP hostname in /etc/hosts..."
+if grep -q "$VIP_HOSTNAME" /etc/hosts 2>/dev/null; then
+    echo "  Entry for $VIP_HOSTNAME already exists, updating..."
+    sudo sed -i "/$VIP_HOSTNAME/d" /etc/hosts 2>/dev/null || true
+fi
+echo "$CP_IP $VIP_HOSTNAME" | sudo tee -a /etc/hosts > /dev/null
+echo "  Added: $CP_IP $VIP_HOSTNAME"
+
+# Extract credentials from existing config
+echo ""
+echo "[4/4] Creating talosconfig with test contexts..."
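+
+# The grep/awk extraction below assumes the flat "ca:"/"crt:"/"key:" lines that
+# talosctl writes (one base64 blob per line); `head -1` takes the first match,
+# i.e. the first context's credentials. A YAML-aware parser would be more robust.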
+ORIGINAL_CONFIG=~/.talos/config
+CA=$(grep "ca:" "$ORIGINAL_CONFIG" | head -1 | awk '{print $2}')
+CRT=$(grep "crt:" "$ORIGINAL_CONFIG" | head -1 | awk '{print $2}')
+KEY=$(grep "key:" "$ORIGINAL_CONFIG" | head -1 | awk '{print $2}')
+
+if [ -z "$CA" ] || [ -z "$CRT" ] || [ -z "$KEY" ]; then
+    echo "ERROR: Could not extract credentials from talosconfig"
+    exit 1
+fi
+
+# Create new config with multiple test contexts
+cat > ~/.talos/config << EOF
+context: vip-with-nodes
+contexts:
+  # Context 1: Normal setup (single endpoint, no nodes specified)
+  # Expected: Works normally, targets endpoint node only
+  normal:
+    endpoints:
+      - $CP_IP
+    ca: $CA
+    crt: $CRT
+    key: $KEY
+
+  # Context 2: vIP endpoint with nodes list including the vIP
+  # This is the BUG SCENARIO we fixed:
+  #   - endpoint is $VIP_HOSTNAME (resolves to $CP_IP)
+  #   - nodes includes $VIP_HOSTNAME AND actual node hostnames
+  # Before fix: vIP would be passed in node header -> empty etcd results
+  # After fix: vIP should be filtered out -> correct etcd results
+  vip-with-nodes:
+    endpoints:
+      - $VIP_HOSTNAME
+    nodes:
+      - $VIP_HOSTNAME
+      - $CP_HOSTNAME
+      - $WORKER1_HOSTNAME
+      - $WORKER2_HOSTNAME
+    ca: $CA
+    crt: $CRT
+    key: $KEY
+
+  # Context 3: IP endpoint with nodes list including the IP
+  # Similar bug scenario but with IP instead of hostname
+  ip-with-nodes:
+    endpoints:
+      - $CP_IP
+    nodes:
+      - $CP_IP
+      - $WORKER1_IP
+      - $WORKER2_IP
+    ca: $CA
+    crt: $CRT
+    key: $KEY
+
+  # Context 4: Multiple endpoints (all real nodes)
+  # Tests that we don't break normal multi-endpoint configs
+  multi-endpoint:
+    endpoints:
+      - $CP_IP
+      - $WORKER1_IP
+      - $WORKER2_IP
+    ca: $CA
+    crt: $CRT
+    key: $KEY
+
+  # Context 5: Original cluster context (preserved)
+  $CLUSTER_NAME:
+    endpoints:
+      - $CP_IP
+    ca: $CA
+    crt: $CRT
+    key: $KEY
+EOF
+
+echo "  Created contexts:"
+echo "    - normal:          Single endpoint, no nodes (baseline)"
+echo "    - vip-with-nodes:  vIP hostname in both endpoints and nodes (BUG SCENARIO)"
+echo "    - ip-with-nodes:   IP in both endpoints and nodes (similar bug)"
+echo "    - multi-endpoint:  Multiple real endpoints"
+echo "    - $CLUSTER_NAME:   Original cluster context"
+
+echo ""
+echo "=========================================="
+echo "Setup Complete!"
+echo "=========================================="
+echo ""
+echo "Current context: vip-with-nodes (the bug scenario)"
+echo ""
+echo "Test commands:"
+echo "  # Run talos-pilot with the vIP bug scenario"
+echo "  cargo run --bin talos-pilot"
+echo ""
+echo "  # Check etcd members directly"
+echo "  talosctl etcd members"
+echo ""
+echo "  # Switch contexts"
+echo "  talosctl config context normal"
+echo "  talosctl config context vip-with-nodes"
+echo "  talosctl config context ip-with-nodes"
+echo ""
+echo "What to verify:"
+echo "  1. In 'vip-with-nodes' context: etcd should show 1/1 (not 0/0)"
+echo "  2. In 'ip-with-nodes' context: etcd should show 1/1 (not 0/0)"
+echo "  3. Cluster view should show all 3 nodes correctly"
+echo ""
diff --git a/test-clusters/scripts/vip-test-setup.sh b/test-clusters/scripts/vip-test-setup.sh
new file mode 100755
index 0000000..90575dc
--- /dev/null
+++ b/test-clusters/scripts/vip-test-setup.sh
@@ -0,0 +1,205 @@
+#!/bin/bash
+# vip-test-setup.sh - Clean setup with vIP endpoint test scenario
+#
+# This script:
+#   1. Cleans up all existing Talos Docker clusters
+#   2. Removes ~/.talos/config
+#   3. Creates a fresh 3-node cluster (1 CP + 2 workers)
+#   4. Sets up talosconfig with vIP scenario to test the etcd fix
+#
+# The vIP scenario: endpoint is a hostname that also appears in nodes list
+# This reproduces the bug where vIP gets passed to node targeting header
+
+set -e
+
+CLUSTER_NAME="test-cluster"
+CLUSTER_NETWORK="10.5.0.0/24"
+CP_IP="10.5.0.2"
+WORKER1_IP="10.5.0.3"
+WORKER2_IP="10.5.0.4"
+VIP_HOSTNAME="cluster.local"
+
+echo "=========================================="
+echo "Talos vIP Test Scenario Setup"
+echo "=========================================="
+echo ""
+
+# Step 1: Clean up existing Talos clusters
+echo "[1/6] Cleaning up existing Talos Docker clusters..."
+
+# Find containers running the Talos image
+TALOS_CONTAINERS=$(docker ps -a --filter "ancestor=ghcr.io/siderolabs/talos:v1.12.1" --format "{{.Names}}" 2>/dev/null || true)
+# Also find by common naming patterns (cluster-*, talos-*)
+CLUSTER_CONTAINERS=$(docker ps -a --format "{{.Names}}" 2>/dev/null | grep -E "^(cluster-|talos-)" || true)
+ALL_CONTAINERS=$(echo -e "$TALOS_CONTAINERS\n$CLUSTER_CONTAINERS" | sort -u | grep -v '^$' || true)
+
+if [ -n "$ALL_CONTAINERS" ]; then
+    echo "  Stopping and removing containers:"
+    for container in $ALL_CONTAINERS; do
+        echo "    - $container"
+        docker rm -f "$container" 2>/dev/null || true
+    done
+else
+    echo "  No Talos containers found"
+fi
+
+# Remove talos/cluster networks
+TALOS_NETWORKS=$(docker network ls --format "{{.Name}}" 2>/dev/null | grep -E "^(talos|cluster-)" || true)
+if [ -n "$TALOS_NETWORKS" ]; then
+    echo "  Removing networks:"
+    for network in $TALOS_NETWORKS; do
+        echo "    - $network"
+        docker network rm "$network" 2>/dev/null || true
+    done
+else
+    echo "  No Talos networks found"
+fi
+
+# Step 2: Clean up ~/.talos/config
+echo ""
+echo "[2/6] Cleaning up ~/.talos/config..."
+if [ -f ~/.talos/config ]; then
+    rm -f ~/.talos/config
+    echo "  Removed ~/.talos/config"
+else
+    echo "  No existing config found"
+fi
+
+# Step 3: Create fresh cluster
+echo ""
+echo "[3/6] Creating fresh Talos cluster: $CLUSTER_NAME"
+echo "  Network: $CLUSTER_NETWORK"
+echo "  Control Plane: $CP_IP"
+echo "  Workers: $WORKER1_IP, $WORKER2_IP"
+echo ""
+
+sudo -E talosctl cluster create \
+    --name "$CLUSTER_NAME" \
+    --cidr "$CLUSTER_NETWORK" \
+    --controlplanes 1 \
+    --workers 2 \
+    --wait-timeout 10m
+
+echo ""
+echo "  Cluster created successfully!"
+
+# Step 4: Get node hostnames
+echo ""
+echo "[4/6] Getting node information..."
+CP_HOSTNAME=$(talosctl get hostname -n "$CP_IP" -o json 2>/dev/null | jq -r '.spec.hostname' || echo "talos-cp-1")
+WORKER1_HOSTNAME=$(talosctl get hostname -n "$WORKER1_IP" -o json 2>/dev/null | jq -r '.spec.hostname' || echo "talos-worker-1")
+WORKER2_HOSTNAME=$(talosctl get hostname -n "$WORKER2_IP" -o json 2>/dev/null | jq -r '.spec.hostname' || echo "talos-worker-2")
+
+echo "  Control Plane: $CP_HOSTNAME ($CP_IP)"
+echo "  Worker 1:      $WORKER1_HOSTNAME ($WORKER1_IP)"
+echo "  Worker 2:      $WORKER2_HOSTNAME ($WORKER2_IP)"
+
+# Step 5: Add /etc/hosts entry for vIP hostname
+echo ""
+echo "[5/6] Setting up vIP hostname in /etc/hosts..."
+# Remove old entry if exists
+sudo sed -i "/$VIP_HOSTNAME/d" /etc/hosts 2>/dev/null || true
+# Add new entry pointing vIP hostname to control plane
+echo "$CP_IP $VIP_HOSTNAME" | sudo tee -a /etc/hosts > /dev/null
+echo "  Added: $CP_IP $VIP_HOSTNAME"
+
+# Step 6: Create talosconfig with vIP test scenarios
+echo ""
+echo "[6/6] Creating talosconfig with test contexts..."
+
+# Backup original config created by talosctl cluster create
+ORIGINAL_CONFIG=~/.talos/config
+
+# Extract CA, CRT, and KEY from the original config using grep/awk
+# The config format has these on their own lines after the context
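+# Note: `grep -A 20 "contexts:"` assumes the credentials appear within 20 lines
+# of the "contexts:" key; if talosctl ever changes that layout, a YAML-aware tool
+# (e.g. yq, or the python3 approach used in multi-endpoint-test.sh) would be safer.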
+CA=$(grep -A 20 "contexts:" "$ORIGINAL_CONFIG" | grep "ca:" | head -1 | awk '{print $2}')
+CRT=$(grep -A 20 "contexts:" "$ORIGINAL_CONFIG" | grep "crt:" | head -1 | awk '{print $2}')
+KEY=$(grep -A 20 "contexts:" "$ORIGINAL_CONFIG" | grep "key:" | head -1 | awk '{print $2}')
+
+# Create new config with multiple test contexts
+cat > ~/.talos/config << EOF
+context: vip-with-nodes
+contexts:
+  # Context 1: Normal setup (single endpoint, no nodes specified)
+  # Expected: Works normally, targets endpoint node only
+  normal:
+    endpoints:
+      - $CP_IP
+    ca: $CA
+    crt: $CRT
+    key: $KEY
+
+  # Context 2: vIP endpoint with nodes list including the vIP
+  # This is the BUG SCENARIO we fixed:
+  #   - endpoint is $VIP_HOSTNAME (resolves to $CP_IP)
+  #   - nodes includes $VIP_HOSTNAME AND actual node hostnames
+  # Before fix: vIP would be passed in node header -> empty etcd results
+  # After fix: vIP should be filtered out -> correct etcd results
+  vip-with-nodes:
+    endpoints:
+      - $VIP_HOSTNAME
+    nodes:
+      - $VIP_HOSTNAME
+      - $CP_HOSTNAME
+      - $WORKER1_HOSTNAME
+      - $WORKER2_HOSTNAME
+    ca: $CA
+    crt: $CRT
+    key: $KEY
+
+  # Context 3: IP endpoint with nodes list including the IP
+  # Similar bug scenario but with IP instead of hostname
+  ip-with-nodes:
+    endpoints:
+      - $CP_IP
+    nodes:
+      - $CP_IP
+      - $WORKER1_IP
+      - $WORKER2_IP
+    ca: $CA
+    crt: $CRT
+    key: $KEY
+
+  # Context 4: Multiple endpoints (all real nodes)
+  # Tests that we don't break normal multi-endpoint configs
+  multi-endpoint:
+    endpoints:
+      - $CP_IP
+      - $WORKER1_IP
+      - $WORKER2_IP
+    ca: $CA
+    crt: $CRT
+    key: $KEY
+EOF
+
+echo "  Created contexts:"
+echo "    - normal:          Single endpoint, no nodes (baseline)"
+echo "    - vip-with-nodes:  vIP hostname in both endpoints and nodes (BUG SCENARIO)"
+echo "    - ip-with-nodes:   IP in both endpoints and nodes (similar bug)"
+echo "    - multi-endpoint:  Multiple real endpoints (should work normally)"
+
+echo ""
+echo "=========================================="
+echo "Setup Complete!"
+echo "=========================================="
+echo ""
+echo "Current context: vip-with-nodes (the bug scenario)"
+echo ""
+echo "Test commands:"
+echo "  # Run talos-pilot with the vIP bug scenario"
+echo "  cargo run --bin talos-pilot"
+echo ""
+echo "  # Check etcd members directly (should show 1/1 if fix works)"
+echo "  talosctl etcd members"
+echo ""
+echo "  # Switch to normal context for comparison"
+echo "  talosctl config context normal"
+echo ""
+echo "  # List all contexts"
+echo "  talosctl config contexts"
+echo ""
+echo "What to verify:"
+echo "  1. In 'vip-with-nodes' context, etcd should show 1/1 members (not 0/0)"
+echo "  2. In 'ip-with-nodes' context, etcd should show 1/1 members (not 0/0)"
+echo "  3. Cluster view should show all 3 nodes correctly"
+echo ""