Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ members = ["crates/*"]
resolver = "2"

[workspace.package]
version = "0.1.5"
version = "0.1.6"
edition = "2024"
authors = ["Ken Udovic"]
license = "MIT"
Expand Down
29 changes: 17 additions & 12 deletions crates/talos-pilot-tui/src/components/cluster.rs
Original file line number Diff line number Diff line change
Expand Up @@ -571,18 +571,23 @@ impl ClusterComponent {
cluster.load_avg = load_avg;
cluster.cpu_info = cpu_info;

// Fetch etcd status for header summary
if let Some(client) = &cluster.client
&& let Ok(statuses) = client.etcd_status().await
{
let total = cluster.etcd_members.len();
let healthy = statuses.len();
let quorum_needed = total / 2 + 1;
cluster.etcd_summary = Some(EtcdSummary {
healthy,
total,
has_quorum: healthy >= quorum_needed,
});
// Fetch etcd status for header summary (target all control planes)
if let Some(client) = &cluster.client {
let cp_hostnames: Vec<String> = cluster
.etcd_members
.iter()
.map(|m| m.hostname.clone())
.collect();
if let Ok(statuses) = client.etcd_status_for_nodes(&cp_hostnames).await {
let total = cluster.etcd_members.len();
let healthy = statuses.len();
let quorum_needed = total / 2 + 1;
cluster.etcd_summary = Some(EtcdSummary {
healthy,
total,
has_quorum: healthy >= quorum_needed,
});
}
}
}
}
Expand Down
9 changes: 8 additions & 1 deletion crates/talos-pilot-tui/src/components/diagnostics/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,12 @@ pub async fn run_kubernetes_checks(

// Etcd check (for control plane nodes)
if ctx.node_role.contains("controlplane") || ctx.node_role.contains("control") {
match client.etcd_status().await {
// Target this specific node for etcd status
let target_node = ctx
.node_endpoint
.clone()
.unwrap_or_else(|| ctx.hostname.clone());
match client.etcd_status_for_nodes(&[target_node]).await {
Ok(status_list) => {
if let Some(status) = status_list.first() {
let is_leader = status.is_leader();
Expand All @@ -142,6 +147,8 @@ pub async fn run_kubernetes_checks(
format!("Follower (leader: {:x})", status.leader_id)
};
checks.push(DiagnosticCheck::pass("etcd", "Etcd", &msg));
} else {
checks.push(DiagnosticCheck::warn("etcd", "Etcd", "No status returned"));
}
}
Err(e) => {
Expand Down
47 changes: 32 additions & 15 deletions crates/talos-pilot-tui/src/components/etcd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,18 +105,46 @@ impl EtcdComponent {

self.state.start_loading();

// Fetch member list, status, and alarms in parallel with timeout
let timeout = Duration::from_secs(10);

// Step 1: Fetch member list first (we need hostnames to target status calls)
let members_result = tokio::time::timeout(timeout, client.etcd_members()).await;

let member_infos = match members_result {
Ok(Ok(members)) => members,
Ok(Err(e)) => {
let msg = format_talos_error(&e);
self.state
.set_error_with_retry(format!("Failed to fetch members: {}", msg));
return Ok(());
}
Err(_) => {
self.state.set_error_with_retry(format!(
"Request timed out after {}s",
timeout.as_secs()
));
return Ok(());
}
};

// Step 2: Extract control plane hostnames from members
let cp_hostnames: Vec<String> = member_infos.iter().map(|m| m.hostname.clone()).collect();

tracing::debug!(
"Fetching etcd status from control planes: {:?}",
cp_hostnames
);

// Step 3: Fetch status (targeting all CPs) and alarms in parallel
let fetch_result = tokio::time::timeout(timeout, async {
tokio::join!(
client.etcd_members(),
client.etcd_status(),
client.etcd_status_for_nodes(&cp_hostnames),
client.etcd_alarms()
)
})
.await;

let (members_result, status_result, alarms_result) = match fetch_result {
let (status_result, alarms_result) = match fetch_result {
Ok(results) => results,
Err(_) => {
self.state.set_error_with_retry(format!(
Expand All @@ -127,17 +155,6 @@ impl EtcdComponent {
}
};

// Process member list - this is critical, fail if we can't get it
let member_infos = match members_result {
Ok(members) => members,
Err(e) => {
let msg = format_talos_error(&e);
self.state
.set_error_with_retry(format!("Failed to fetch members: {}", msg));
return Ok(());
}
};

// Process status - non-critical, just log warning
let statuses = match status_result {
Ok(s) => s,
Expand Down
7 changes: 5 additions & 2 deletions crates/talos-pilot-tui/src/components/lifecycle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -359,8 +359,11 @@ impl LifecycleComponent {
let etcd_quorum = match client.etcd_members().await {
Ok(members) => {
let total = members.len();
// Try to get status to determine healthy members
let healthy = match client.etcd_status().await {
// Extract control plane hostnames to target status calls
let cp_hostnames: Vec<String> =
members.iter().map(|m| m.hostname.clone()).collect();
// Try to get status from all control planes
let healthy = match client.etcd_status_for_nodes(&cp_hostnames).await {
Ok(statuses) => {
// Count members with status
members
Expand Down
45 changes: 25 additions & 20 deletions crates/talos-pilot-tui/src/components/node_operations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -686,34 +686,39 @@ impl NodeOperationsComponent {
.iter()
.any(|m| m.peer_urls.iter().any(|url| url.contains(node_addr)));

// Try to get leader info
let is_leader = match client.etcd_status().await {
Ok(statuses) => {
// Find the status for this node
statuses.iter().any(|s| {
let member = members.iter().find(|m| m.id == s.member_id);
if let Some(m) = member {
m.peer_urls.iter().any(|url| url.contains(node_addr))
&& s.is_leader()
} else {
false
}
})
// Extract control plane hostnames to target status calls
let cp_hostnames: Vec<String> =
members.iter().map(|m| m.hostname.clone()).collect();

// Get status from all control planes (single call for both is_leader and healthy)
let statuses = client
.etcd_status_for_nodes(&cp_hostnames)
.await
.unwrap_or_default();

// Check if this node is the leader
let is_leader = statuses.iter().any(|s| {
let member = members.iter().find(|m| m.id == s.member_id);
if let Some(m) = member {
m.peer_urls.iter().any(|url| url.contains(node_addr)) && s.is_leader()
} else {
false
}
Err(_) => false,
};
});

// Calculate members after this node goes down
let members_after = if is_member { total - 1 } else { total };
let quorum_maintained = members_after >= quorum_needed;

// Get healthy member count
let healthy = match client.etcd_status().await {
Ok(statuses) => members
// Count healthy members (those with status)
// If no statuses returned, assume all healthy
let healthy = if statuses.is_empty() {
total
} else {
members
.iter()
.filter(|m| statuses.iter().any(|s| s.member_id == m.id))
.count(),
Err(_) => total, // Assume all healthy if can't get status
.count()
};

if let Some(data) = self.data_mut() {
Expand Down
25 changes: 22 additions & 3 deletions crates/talos-rs/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -671,11 +671,30 @@ impl TalosClient {
/// Get etcd status without explicit node targeting.
///
/// Thin wrapper that forwards to `etcd_status_for_nodes()` with an empty node
/// list. With no nodes given, that method attaches no `nodes` metadata to the
/// request and is documented as querying only the endpoint node.
/// Returns status for each etcd member that responds.
///
/// Use `etcd_status_for_nodes()` if you need to target specific control plane nodes.
///
/// # Errors
///
/// Propagates any `TalosError` returned by `etcd_status_for_nodes()`.
pub async fn etcd_status(&self) -> Result<Vec<EtcdMemberStatus>, TalosError> {
    // An empty slice means "no node targeting" for the delegate.
    let untargeted: &[String] = &[];
    self.etcd_status_for_nodes(untargeted).await
}

/// Get etcd status from specific control plane nodes
///
/// Pass the hostnames from `etcd_members()` to get status from all control planes.
/// If nodes is empty, queries only the endpoint node.
pub async fn etcd_status_for_nodes(
&self,
nodes: &[String],
) -> Result<Vec<EtcdMemberStatus>, TalosError> {
let mut client = self.machine_client();
// Don't use with_nodes() - etcd only runs on control plane
let request = Request::new(());

let mut request = Request::new(());

// Add node targeting if specific nodes provided
if !nodes.is_empty() {
let nodes_str = nodes.join(",");
request
.metadata_mut()
.insert("nodes", nodes_str.parse().unwrap());
}

let response = client.etcd_status(request).await?;
let inner = response.into_inner();
Expand Down