From 3ae55a7a3801d269f1c184fc916d57e0c1eaa216 Mon Sep 17 00:00:00 2001 From: Kenny Udovic Date: Sat, 17 Jan 2026 08:57:03 -0500 Subject: [PATCH] fix: etcd status now queries all control plane nodes Fixed etcd status calls that only queried a single node, causing "quorum at risk (1/3)" on multi-CP clusters. Added etcd_status_for_nodes() method to target specific nodes. Updated all etcd status consumers (etcd view, lifecycle pre-checks, node operations, cluster summary, diagnostics) to first fetch member hostnames, then query all control planes for accurate quorum display. --- Cargo.lock | 8 ++-- Cargo.toml | 2 +- .../talos-pilot-tui/src/components/cluster.rs | 29 +++++++----- .../src/components/diagnostics/core.rs | 9 +++- crates/talos-pilot-tui/src/components/etcd.rs | 47 +++++++++++++------ .../src/components/lifecycle.rs | 7 ++- .../src/components/node_operations.rs | 45 ++++++++++-------- crates/talos-rs/src/client.rs | 25 ++++++++-- 8 files changed, 114 insertions(+), 58 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c6e9989..2adcd2f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2778,7 +2778,7 @@ dependencies = [ [[package]] name = "talos-pilot" -version = "0.1.4" +version = "0.1.6" dependencies = [ "clap", "color-eyre", @@ -2791,7 +2791,7 @@ dependencies = [ [[package]] name = "talos-pilot-core" -version = "0.1.4" +version = "0.1.6" dependencies = [ "chrono", "serde", @@ -2802,7 +2802,7 @@ dependencies = [ [[package]] name = "talos-pilot-tui" -version = "0.1.4" +version = "0.1.6" dependencies = [ "arboard", "base64", @@ -2829,7 +2829,7 @@ dependencies = [ [[package]] name = "talos-rs" -version = "0.1.4" +version = "0.1.6" dependencies = [ "base64", "dirs-next", diff --git a/Cargo.toml b/Cargo.toml index 3e0a8e4..5bb4255 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["crates/*"] resolver = "2" [workspace.package] -version = "0.1.5" +version = "0.1.6" edition = "2024" authors = ["Ken Udovic"] license = "MIT" diff --git 
a/crates/talos-pilot-tui/src/components/cluster.rs b/crates/talos-pilot-tui/src/components/cluster.rs index 8488117..8e21db2 100644 --- a/crates/talos-pilot-tui/src/components/cluster.rs +++ b/crates/talos-pilot-tui/src/components/cluster.rs @@ -571,18 +571,23 @@ impl ClusterComponent { cluster.load_avg = load_avg; cluster.cpu_info = cpu_info; - // Fetch etcd status for header summary - if let Some(client) = &cluster.client - && let Ok(statuses) = client.etcd_status().await - { - let total = cluster.etcd_members.len(); - let healthy = statuses.len(); - let quorum_needed = total / 2 + 1; - cluster.etcd_summary = Some(EtcdSummary { - healthy, - total, - has_quorum: healthy >= quorum_needed, - }); + // Fetch etcd status for header summary (target all control planes) + if let Some(client) = &cluster.client { + let cp_hostnames: Vec<String> = cluster + .etcd_members + .iter() + .map(|m| m.hostname.clone()) + .collect(); + if let Ok(statuses) = client.etcd_status_for_nodes(&cp_hostnames).await { + let total = cluster.etcd_members.len(); + let healthy = statuses.len(); + let quorum_needed = total / 2 + 1; + cluster.etcd_summary = Some(EtcdSummary { + healthy, + total, + has_quorum: healthy >= quorum_needed, + }); + } } } } diff --git a/crates/talos-pilot-tui/src/components/diagnostics/core.rs b/crates/talos-pilot-tui/src/components/diagnostics/core.rs index 5c7daa8..f7b3f5b 100644 --- a/crates/talos-pilot-tui/src/components/diagnostics/core.rs +++ b/crates/talos-pilot-tui/src/components/diagnostics/core.rs @@ -132,7 +132,12 @@ pub async fn run_kubernetes_checks( // Etcd check (for control plane nodes) if ctx.node_role.contains("controlplane") || ctx.node_role.contains("control") { - match client.etcd_status().await { + // Target this specific node for etcd status + let target_node = ctx + .node_endpoint + .clone() + .unwrap_or_else(|| ctx.hostname.clone()); + match client.etcd_status_for_nodes(&[target_node]).await { Ok(status_list) => { if let Some(status) = status_list.first() 
{ let is_leader = status.is_leader(); @@ -142,6 +147,8 @@ pub async fn run_kubernetes_checks( format!("Follower (leader: {:x})", status.leader_id) }; checks.push(DiagnosticCheck::pass("etcd", "Etcd", &msg)); + } else { + checks.push(DiagnosticCheck::warn("etcd", "Etcd", "No status returned")); } } Err(e) => { diff --git a/crates/talos-pilot-tui/src/components/etcd.rs b/crates/talos-pilot-tui/src/components/etcd.rs index fa962dc..1519e35 100644 --- a/crates/talos-pilot-tui/src/components/etcd.rs +++ b/crates/talos-pilot-tui/src/components/etcd.rs @@ -105,18 +105,46 @@ impl EtcdComponent { self.state.start_loading(); - // Fetch member list, status, and alarms in parallel with timeout let timeout = Duration::from_secs(10); + + // Step 1: Fetch member list first (we need hostnames to target status calls) + let members_result = tokio::time::timeout(timeout, client.etcd_members()).await; + + let member_infos = match members_result { + Ok(Ok(members)) => members, + Ok(Err(e)) => { + let msg = format_talos_error(&e); + self.state + .set_error_with_retry(format!("Failed to fetch members: {}", msg)); + return Ok(()); + } + Err(_) => { + self.state.set_error_with_retry(format!( + "Request timed out after {}s", + timeout.as_secs() + )); + return Ok(()); + } + }; + + // Step 2: Extract control plane hostnames from members + let cp_hostnames: Vec<String> = member_infos.iter().map(|m| m.hostname.clone()).collect(); + + tracing::debug!( + "Fetching etcd status from control planes: {:?}", + cp_hostnames + ); + + // Step 3: Fetch status (targeting all CPs) and alarms in parallel let fetch_result = tokio::time::timeout(timeout, async { tokio::join!( - client.etcd_members(), - client.etcd_status(), + client.etcd_status_for_nodes(&cp_hostnames), client.etcd_alarms() ) }) .await; - let (members_result, status_result, alarms_result) = match fetch_result { + let (status_result, alarms_result) = match fetch_result { Ok(results) => results, Err(_) => { self.state.set_error_with_retry(format!( @@ 
-127,17 +155,6 @@ impl EtcdComponent { } }; - // Process member list - this is critical, fail if we can't get it - let member_infos = match members_result { - Ok(members) => members, - Err(e) => { - let msg = format_talos_error(&e); - self.state - .set_error_with_retry(format!("Failed to fetch members: {}", msg)); - return Ok(()); - } - }; - // Process status - non-critical, just log warning let statuses = match status_result { Ok(s) => s, diff --git a/crates/talos-pilot-tui/src/components/lifecycle.rs b/crates/talos-pilot-tui/src/components/lifecycle.rs index bfaba54..6d70e65 100644 --- a/crates/talos-pilot-tui/src/components/lifecycle.rs +++ b/crates/talos-pilot-tui/src/components/lifecycle.rs @@ -359,8 +359,11 @@ impl LifecycleComponent { let etcd_quorum = match client.etcd_members().await { Ok(members) => { let total = members.len(); - // Try to get status to determine healthy members - let healthy = match client.etcd_status().await { + // Extract control plane hostnames to target status calls + let cp_hostnames: Vec<String> = + members.iter().map(|m| m.hostname.clone()).collect(); + // Try to get status from all control planes + let healthy = match client.etcd_status_for_nodes(&cp_hostnames).await { Ok(statuses) => { // Count members with status members diff --git a/crates/talos-pilot-tui/src/components/node_operations.rs b/crates/talos-pilot-tui/src/components/node_operations.rs index 244b01e..8d98777 100644 --- a/crates/talos-pilot-tui/src/components/node_operations.rs +++ b/crates/talos-pilot-tui/src/components/node_operations.rs @@ -686,34 +686,39 @@ impl NodeOperationsComponent { .iter() .any(|m| m.peer_urls.iter().any(|url| url.contains(node_addr))); - // Try to get leader info - let is_leader = match client.etcd_status().await { - Ok(statuses) => { - // Find the status for this node - statuses.iter().any(|s| { - let member = members.iter().find(|m| m.id == s.member_id); - if let Some(m) = member { - m.peer_urls.iter().any(|url| url.contains(node_addr)) - && 
s.is_leader() - } else { - false - } - }) + // Extract control plane hostnames to target status calls + let cp_hostnames: Vec<String> = + members.iter().map(|m| m.hostname.clone()).collect(); + + // Get status from all control planes (single call for both is_leader and healthy) + let statuses = client + .etcd_status_for_nodes(&cp_hostnames) + .await + .unwrap_or_default(); + + // Check if this node is the leader + let is_leader = statuses.iter().any(|s| { + let member = members.iter().find(|m| m.id == s.member_id); + if let Some(m) = member { + m.peer_urls.iter().any(|url| url.contains(node_addr)) && s.is_leader() + } else { + false } - Err(_) => false, - }; + }); // Calculate members after this node goes down let members_after = if is_member { total - 1 } else { total }; let quorum_maintained = members_after >= quorum_needed; - // Get healthy member count - let healthy = match client.etcd_status().await { - Ok(statuses) => members + // Count healthy members (those with status) + // If no statuses returned, assume all healthy + let healthy = if statuses.is_empty() { + total + } else { + members .iter() .filter(|m| statuses.iter().any(|s| s.member_id == m.id)) - .count(), - Err(_) => total, // Assume all healthy if can't get status + .count() }; if let Some(data) = self.data_mut() { diff --git a/crates/talos-rs/src/client.rs b/crates/talos-rs/src/client.rs index 88eb82e..282c312 100644 --- a/crates/talos-rs/src/client.rs +++ b/crates/talos-rs/src/client.rs @@ -671,11 +671,30 @@ impl TalosClient { /// Get etcd status from control plane nodes /// Returns status for each etcd member that responds /// - /// Note: We don't use node targeting because etcd only runs on control plane nodes. + /// Use `etcd_status_for_nodes()` if you need to target specific control plane nodes. 
pub async fn etcd_status(&self) -> Result, TalosError> { + self.etcd_status_for_nodes(&[]).await + } + + /// Get etcd status from specific control plane nodes + /// + /// Pass the hostnames from `etcd_members()` to get status from all control planes. + /// If nodes is empty, queries only the endpoint node. + pub async fn etcd_status_for_nodes( + &self, + nodes: &[String], + ) -> Result, TalosError> { let mut client = self.machine_client(); - // Don't use with_nodes() - etcd only runs on control plane - let request = Request::new(()); + + let mut request = Request::new(()); + + // Add node targeting if specific nodes provided + if !nodes.is_empty() { + let nodes_str = nodes.join(","); + request + .metadata_mut() + .insert("nodes", nodes_str.parse().unwrap()); + } let response = client.etcd_status(request).await?; let inner = response.into_inner();