From 65708530e807680ab41d554554234b2f6b2eb3fd Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 11 Dec 2025 15:03:43 -0500 Subject: [PATCH 01/25] all_omicron_zones() -> danger_all_omicron_zones() --- dev-tools/omdb/src/bin/omdb/db.rs | 2 +- dev-tools/reconfigurator-cli/src/lib.rs | 4 +- live-tests/tests/test_nexus_add_remove.rs | 4 +- live-tests/tests/test_nexus_handoff.rs | 8 +-- .../db-queries/src/db/datastore/deployment.rs | 14 ++-- nexus/db-queries/src/db/datastore/rack.rs | 16 ++--- .../src/db/datastore/support_bundle.rs | 2 +- nexus/reconfigurator/blippy/src/checks.rs | 10 +-- .../execution/src/clickhouse.rs | 4 +- nexus/reconfigurator/execution/src/dns.rs | 6 +- .../planning/src/blueprint_builder/builder.rs | 6 +- .../allocators/external_networking.rs | 2 +- nexus/reconfigurator/planning/src/example.rs | 4 +- .../tests/integration_tests/planner.rs | 68 +++++++++---------- .../background/tasks/blueprint_execution.rs | 2 +- .../tasks/crdb_node_id_collector.rs | 4 +- nexus/src/app/deployment.rs | 2 +- nexus/src/app/quiesce.rs | 6 +- nexus/src/lib.rs | 2 +- nexus/types/src/deployment.rs | 8 ++- nexus/types/src/deployment/execution/dns.rs | 2 +- nexus/types/src/deployment/execution/utils.rs | 2 +- 22 files changed, 90 insertions(+), 88 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index d7f8af04f8f..7d0831bffe6 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -1623,7 +1623,7 @@ async fn lookup_service_info( blueprint: &Blueprint, ) -> anyhow::Result> { let Some(zone_config) = blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) + .danger_all_omicron_zones(BlueprintZoneDisposition::any) .find_map(|(_sled_id, zone_config)| { if zone_config.id.into_untyped_uuid() == service_id { Some(zone_config) diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs index 5f69431574a..2953aaee23b 100644 --- a/dev-tools/reconfigurator-cli/src/lib.rs +++ b/dev-tools/reconfigurator-cli/src/lib.rs @@ -160,7 +160,7 @@ impl ReconfiguratorSim { // Handle zone networking setup first for (_, zone) in parent_blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) { if let Some((external_ip, nic)) = zone.zone_type.external_networking() @@ -233,7 +233,7 @@ impl ReconfiguratorSim { state.config().active_nexus_zone_generation(); let mut not_yet_nexus_zones = BTreeSet::new(); for (_, zone) in parent_blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) { match &zone.zone_type { nexus_types::deployment::BlueprintZoneType::Nexus( diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index c8852caa07a..365ce016789 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -207,7 +207,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .await .expect("editing blueprint to expunge zone"); let (_, expunged_zone_config) = blueprint3 - .all_omicron_zones(|_| true) + .danger_all_omicron_zones(|_| true) .find(|(_sled_id, zone_config)| zone_config.id == new_zone.id) .expect("expunged zone in new blueprint"); let BlueprintZoneDisposition::Expunged { @@ -306,7 +306,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // We don't need to check this here. 
It just provides a better error // message if something has gone wrong up to this point. let (_, expunged_zone_config) = new_blueprint - .all_omicron_zones(|_| true) + .danger_all_omicron_zones(|_| true) .find(|(_sled_id, zone_config)| zone_config.id == new_zone.id) .expect("expunged zone in new blueprint"); assert!(expunged_zone_config.disposition.is_ready_for_cleanup()); diff --git a/live-tests/tests/test_nexus_handoff.rs b/live-tests/tests/test_nexus_handoff.rs index 37de5318f60..bce8f20adf5 100644 --- a/live-tests/tests/test_nexus_handoff.rs +++ b/live-tests/tests/test_nexus_handoff.rs @@ -69,7 +69,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { // there exist no Nexus zones with a generation newer than the blueprint's // `nexus_generation`. let new_zones = blueprint_initial - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(_sled_id, z)| { let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { nexus_generation, @@ -102,7 +102,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { cfg: &'a blueprint_zone_type::Nexus, } let current_nexus_zones: BTreeMap = blueprint_initial - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(sled_id, z)| { let BlueprintZoneType::Nexus( cfg @ blueprint_zone_type::Nexus { nexus_generation, .. }, @@ -504,7 +504,7 @@ async fn check_internal_dns( // Find the DNS server based on what's currently in the blueprint. let dns_sockaddr = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .find_map(|(_sled_id, zone_cfg)| { if let BlueprintZoneType::InternalDns( blueprint_zone_type::InternalDns { dns_address, .. }, @@ -568,7 +568,7 @@ async fn check_external_dns( // Find the DNS server based on what's currently in the blueprint. let dns_http_sockaddr = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .find_map(|(_sled_id, zone_cfg)| { if let BlueprintZoneType::ExternalDns( blueprint_zone_type::ExternalDns { http_address, .. }, diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 4580c6e4257..e361b215e2f 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -1956,7 +1956,7 @@ impl DataStore { &conn, &opctx.log, blueprint - .all_omicron_zones(|disposition| { + .danger_all_omicron_zones(|disposition| { !disposition.is_in_service() }) .map(|(_sled_id, zone)| zone), @@ -1967,7 +1967,7 @@ impl DataStore { &conn, opctx, blueprint - .all_omicron_zones(|disposition| { + .danger_all_omicron_zones(|disposition| { disposition.is_in_service() }) .map(|(_sled_id, zone)| zone), @@ -3312,7 +3312,7 @@ mod tests { ); assert_eq!(blueprint1.sleds.len(), collection.sled_agents.len()); assert_eq!( - blueprint1.all_omicron_zones(BlueprintZoneDisposition::any).count(), + blueprint1.danger_all_omicron_zones(BlueprintZoneDisposition::any).count(), collection.all_ledgered_omicron_zones().count() ); // All zones should be in service. 
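A quick illustration of the renamed iterator, since every hunk in this patch is the same mechanical change: the method still takes any `FnMut(BlueprintZoneDisposition) -> bool` and still yields `(sled_id, zone)` pairs, so call sites only swap the name. This is a sketch rather than code lifted from any one call site, and it assumes a `blueprint: &Blueprint` is already in scope:

    use nexus_types::deployment::BlueprintZoneDisposition;

    // Every zone in the blueprint, regardless of disposition.
    let all_zone_ids: Vec<_> = blueprint
        .danger_all_omicron_zones(BlueprintZoneDisposition::any)
        .map(|(_sled_id, zone)| zone.id)
        .collect();

    // A closure over the disposition also works, e.g. everything that is
    // *not* in service.
    let num_not_in_service = blueprint
        .danger_all_omicron_zones(|disposition| !disposition.is_in_service())
        .count();
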
@@ -3644,9 +3644,9 @@ mod tests { ); assert_eq!(blueprint1.sleds.len() + 1, blueprint2.sleds.len()); assert_eq!( - blueprint1.all_omicron_zones(BlueprintZoneDisposition::any).count() + blueprint1.danger_all_omicron_zones(BlueprintZoneDisposition::any).count() + num_new_sled_zones, - blueprint2.all_omicron_zones(BlueprintZoneDisposition::any).count() + blueprint2.danger_all_omicron_zones(BlueprintZoneDisposition::any).count() ); // All zones should be in service. @@ -4296,7 +4296,7 @@ mod tests { // Insert an IP pool range covering the one Nexus IP. let nexus_ip = blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .find_map(|(_, zone_config)| { zone_config .zone_type @@ -4516,7 +4516,7 @@ mod tests { fn assert_all_zones_in_service(blueprint: &Blueprint) { let not_in_service = blueprint - .all_omicron_zones(|disposition| !disposition.is_in_service()) + .danger_all_omicron_zones(|disposition| !disposition.is_in_service()) .collect::>(); assert!( not_in_service.is_empty(), diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index b51556e4b34..23118f49a7c 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -842,7 +842,7 @@ impl DataStore { // Insert Nexus database access records self.initialize_nexus_access_from_blueprint_on_connection( &conn, - blueprint.all_omicron_zones(BlueprintZoneDisposition::is_in_service) + blueprint.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(_sled, zone_cfg)| { if zone_cfg.zone_type.is_nexus() { Some(zone_cfg.id) @@ -856,7 +856,7 @@ impl DataStore { })?; // Allocate networking records for all services. - for (_, zone_config) in blueprint.all_omicron_zones(BlueprintZoneDisposition::is_in_service) { + for (_, zone_config) in blueprint.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) { self.rack_populate_service_networking_records( &conn, &log, @@ -1504,7 +1504,7 @@ mod test { let mut ntp2_id = None; let mut ntp3_id = None; for (sled_id, zone) in - blueprint.all_omicron_zones(BlueprintZoneDisposition::is_in_service) + blueprint.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) { match &zone.zone_type { BlueprintZoneType::BoundaryNtp(_) => { @@ -1751,7 +1751,7 @@ mod test { // We should see both of the Nexus services we provisioned. let mut observed_zones: Vec<_> = observed_blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) + .danger_all_omicron_zones(BlueprintZoneDisposition::any) .map(|(_, z)| z) .collect(); observed_zones.sort_by_key(|z| z.id); @@ -1779,7 +1779,7 @@ mod test { external_ip, .. }) = &blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) + .danger_all_omicron_zones(BlueprintZoneDisposition::any) .next() .unwrap() .1 @@ -1798,7 +1798,7 @@ mod test { external_ip, .. }) = &blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) + .danger_all_omicron_zones(BlueprintZoneDisposition::any) .nth(1) .unwrap() .1 @@ -1956,7 +1956,7 @@ mod test { // We should see the Nexus service we provisioned. let mut observed_zones: Vec<_> = observed_blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) + .danger_all_omicron_zones(BlueprintZoneDisposition::any) .map(|(_, z)| z) .collect(); observed_zones.sort_by_key(|z| z.id); @@ -1986,7 +1986,7 @@ mod test { external_ip, .. 
}) = &blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) + .danger_all_omicron_zones(BlueprintZoneDisposition::any) .next() .unwrap() .1 diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index c53bcb04f82..4077c7731a9 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -227,7 +227,7 @@ impl DataStore { // For this blueprint: The set of all expunged Nexus zones that are // ready for cleanup let invalid_nexus_zones = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) .filter_map(|(_sled, zone)| { if zone.zone_type.is_nexus() { Some(zone.id.into_untyped_uuid()) diff --git a/nexus/reconfigurator/blippy/src/checks.rs b/nexus/reconfigurator/blippy/src/checks.rs index eec34c2a34d..35e8b39fb14 100644 --- a/nexus/reconfigurator/blippy/src/checks.rs +++ b/nexus/reconfigurator/blippy/src/checks.rs @@ -67,7 +67,7 @@ fn check_underlay_ips(blippy: &mut Blippy<'_>) { for (sled_id, zone) in blippy .blueprint() - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) { let ip = zone.underlay_ip(); @@ -160,7 +160,7 @@ fn check_external_networking(blippy: &mut Blippy<'_>) { for (sled_id, zone, external_ip, nic) in blippy .blueprint() - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(sled_id, zone)| { zone.zone_type .external_networking() @@ -265,7 +265,7 @@ fn check_dataset_zpool_uniqueness(blippy: &mut Blippy<'_>) { // kind. for (sled_id, zone) in blippy .blueprint() - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) { // Check "one kind per zpool" for transient datasets... let filesystem_dataset = zone.filesystem_dataset(); @@ -691,7 +691,7 @@ fn check_nexus_generation_consistency(blippy: &mut Blippy<'_>) { // Collect all Nexus zones and their generations for (sled_id, zone) in blippy .blueprint() - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) { if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { generation_info.entry(nexus.nexus_generation).or_default().push(( @@ -2143,7 +2143,7 @@ fn check_planning_input_network_records_appear_in_blueprint( // check for duplicates here: we could very well see reuse of IPs // between expunged zones or between expunged -> running zones. 
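A few call sites in this patch, including the blippy external-networking check above and the reconfigurator-cli setup earlier, drive this same iterator through `zone_type.external_networking()` to pull the external IP and NIC out of each in-service zone. A condensed sketch of that pattern, assuming `blueprint: &Blueprint`; the variable names are illustrative rather than copied from the patch:

    use nexus_types::deployment::BlueprintZoneDisposition;

    let external_networking: Vec<_> = blueprint
        .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service)
        .filter_map(|(sled_id, zone)| {
            zone.zone_type
                .external_networking()
                .map(|(external_ip, nic)| (sled_id, zone.id, external_ip, nic))
        })
        .collect();
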
for (_, z) in - blippy.blueprint().all_omicron_zones(BlueprintZoneDisposition::any) + blippy.blueprint().danger_all_omicron_zones(BlueprintZoneDisposition::any) { let zone_type = &z.zone_type; match zone_type { diff --git a/nexus/reconfigurator/execution/src/clickhouse.rs b/nexus/reconfigurator/execution/src/clickhouse.rs index b5ec2beb4c5..190150e0cb4 100644 --- a/nexus/reconfigurator/execution/src/clickhouse.rs +++ b/nexus/reconfigurator/execution/src/clickhouse.rs @@ -74,7 +74,7 @@ pub(crate) async fn deploy_nodes( // This is tracked in https://github.com/oxidecomputer/omicron/issues/7724 deploy_nodes_impl( opctx, - blueprint.all_omicron_zones(BlueprintZoneDisposition::any), + blueprint.danger_all_omicron_zones(BlueprintZoneDisposition::any), clickhouse_cluster_config, ) .await @@ -235,7 +235,7 @@ pub(crate) async fn deploy_single_node( deploy_single_node_impl( opctx, blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| z.zone_type.is_clickhouse()), ) .await diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 202188c2621..ae884fd39f5 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -822,7 +822,7 @@ mod test { // To start, we need a mapping from underlay IP to the corresponding // Omicron zone. let omicron_zones_by_ip: BTreeMap<_, _> = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .map(|(_, zone)| (zone.underlay_ip(), zone.id)) .collect(); println!("omicron zones by IP: {:#?}", omicron_zones_by_ip); @@ -1554,11 +1554,11 @@ mod test { eprintln!("blueprint2: {}", blueprint2.display()); // Figure out the id of the new zone. let zones_before = blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) + .danger_all_omicron_zones(BlueprintZoneDisposition::any) .filter_map(|(_, z)| z.zone_type.is_nexus().then_some(z.id)) .collect::>(); let zones_after = blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::any) + .danger_all_omicron_zones(BlueprintZoneDisposition::any) .filter_map(|(_, z)| z.zone_type.is_nexus().then_some(z.id)) .collect::>(); let new_zones: Vec<_> = zones_after.difference(&zones_before).collect(); diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 217eefd11f2..aed6393ed05 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -3246,7 +3246,7 @@ pub mod test { // Nexus with no remaining external IPs should fail. let mut used_ip_ranges = Vec::new(); for (_, z) in - parent.all_omicron_zones(BlueprintZoneDisposition::any) + parent.danger_all_omicron_zones(BlueprintZoneDisposition::any) { if let Some((external_ip, _)) = z.zone_type.external_networking() @@ -3317,7 +3317,7 @@ pub mod test { // provisions CRDB; this check makes sure we update our use of it if // that changes). 
for (_, z) in - parent.all_omicron_zones(BlueprintZoneDisposition::is_in_service) + parent.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) { assert!( !z.zone_type.is_cockroach(), @@ -3361,7 +3361,7 @@ pub mod test { verify_blueprint(&blueprint, &input); assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(sled_id, z)| { *sled_id == target_sled_id && z.zone_type.kind() == ZoneKind::CockroachDb diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs index c428ec39c7f..a40a74944ce 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs @@ -87,7 +87,7 @@ impl ExternalNetworkingAllocator { ) -> anyhow::Result { Self::new( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .map(|(_sled_id, zone)| zone), external_ip_policy, ) diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index efd690b2815..43ee781d571 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -1446,7 +1446,7 @@ mod tests { } ServiceName::Crucible(_) => { // Each Crucible zone should be queryable. - for (_, zone) in blueprint.all_omicron_zones( + for (_, zone) in blueprint.danger_all_omicron_zones( BlueprintZoneDisposition::is_in_service, ) { if zone.kind() == ZoneKind::Crucible { @@ -1471,7 +1471,7 @@ mod tests { kind: ZoneKind, ) -> Vec<&BlueprintZoneConfig> { blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) + .danger_all_omicron_zones(BlueprintZoneDisposition::any) .filter_map(|(_, zone)| { (zone.zone_type.kind() == kind).then_some(zone) }) diff --git a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs index b9a06e34100..ed5f9bb1596 100644 --- a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs +++ b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs @@ -116,7 +116,7 @@ fn get_nexus_ids_at_generation( generation: Generation, ) -> BTreeSet { blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(_, z)| match &z.zone_type { BlueprintZoneType::Nexus(nexus_zone) if nexus_zone.nexus_generation == generation => @@ -502,7 +502,7 @@ fn test_spread_internal_dns_zones_across_sleds() { // Expunge two of the internal DNS zones; the planner should put new // zones back in their places. 
let zones_to_expunge = blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(sled_id, zone)| { zone.zone_type.is_internal_dns().then_some((sled_id, zone.id)) }) @@ -760,7 +760,7 @@ fn test_reuse_external_dns_ips_from_expunged_zones() { assert_eq!( blueprint1a - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, zone)| zone.zone_type.is_external_dns()) .count(), 3, @@ -775,7 +775,7 @@ fn test_reuse_external_dns_ips_from_expunged_zones() { // This blueprint should have three external DNS zones. assert_eq!( blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, zone)| zone.zone_type.is_external_dns()) .count(), 3, @@ -810,7 +810,7 @@ fn test_reuse_external_dns_ips_from_expunged_zones() { // The IP addresses of the new external DNS zones should be the // same as the original set that we set up. let mut ips = blueprint3 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(_id, zone)| { zone.zone_type .is_external_dns() @@ -1342,7 +1342,7 @@ fn test_disk_expungement_removes_zones_transient_filesystem() { let mut zones_on_pool = BTreeSet::new(); let mut zone_kinds_on_pool = BTreeMap::<_, usize>::new(); for (_, zone_config) in - blueprint1.all_omicron_zones(BlueprintZoneDisposition::is_in_service) + blueprint1.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) { if pool_to_expunge == zone_config.filesystem_pool { zones_on_pool.insert(zone_config.id); @@ -1959,7 +1959,7 @@ fn test_crucible_pantry() { // We should start with CRUCIBLE_PANTRY_REDUNDANCY pantries spread out // to at most 1 per sled. Find one of the sleds running one. let pantry_sleds = blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(sled_id, zone)| { zone.zone_type.is_crucible_pantry().then_some(sled_id) }) @@ -1981,7 +1981,7 @@ fn test_crucible_pantry() { println!("1 -> 2 (expunged sled):\n{}", diff.display()); assert_eq!( blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(sled_id, zone)| *sled_id != expunged_sled_id && zone.zone_type.is_crucible_pantry()) .count(), @@ -2013,7 +2013,7 @@ fn test_single_node_clickhouse() { // We should start with one ClickHouse zone. Find out which sled it's on. 
let clickhouse_sleds = blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::any) + .danger_all_omicron_zones(BlueprintZoneDisposition::any) .filter_map(|(sled, zone)| { zone.zone_type.is_clickhouse().then(|| Some(sled)) }) @@ -2034,7 +2034,7 @@ fn test_single_node_clickhouse() { println!("1 -> 2 (expunged sled):\n{}", diff.display()); assert_eq!( blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(sled, zone)| *sled != clickhouse_sled && zone.zone_type.is_clickhouse()) .count(), @@ -2087,7 +2087,7 @@ fn test_plan_deploy_all_clickhouse_cluster_nodes() { // We should see zones for 3 clickhouse keepers, and 2 servers created let active_zones: Vec<_> = blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .map(|(_, z)| z.clone()) .collect(); @@ -2214,7 +2214,7 @@ fn test_plan_deploy_all_clickhouse_cluster_nodes() { ); let active_zones: Vec<_> = blueprint5 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .map(|(_, z)| z.clone()) .collect(); @@ -2349,7 +2349,7 @@ fn test_expunge_clickhouse_clusters() { // We should see zones for 3 clickhouse keepers, and 2 servers created let active_zones: Vec<_> = blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .map(|(_, z)| z.clone()) .collect(); @@ -2422,7 +2422,7 @@ fn test_expunge_clickhouse_clusters() { // Find the sled containing one of the keeper zones and expunge it let (sled_id, bp_zone_config) = blueprint3 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .find(|(_, z)| z.zone_type.is_clickhouse_keeper()) .unwrap(); @@ -2547,7 +2547,7 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { // We should see zones for 3 clickhouse keepers, and 2 servers created let active_zones: Vec<_> = blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .map(|(_, z)| z.clone()) .collect(); @@ -2582,7 +2582,7 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { // We should have expunged our single-node clickhouse zone let expunged_zones: Vec<_> = blueprint3 - .all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_expunged) .map(|(_, z)| z.clone()) .collect(); @@ -2605,7 +2605,7 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { // All our clickhouse keeper and server zones that we created when we // enabled our clickhouse policy should be expunged when we disable it. let expunged_zones: Vec<_> = blueprint4 - .all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_expunged) .map(|(_, z)| z.clone()) .collect(); @@ -2627,7 +2627,7 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { assert_eq!( 1, blueprint4 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| z.zone_type.is_clickhouse()) .count() ); @@ -3115,7 +3115,7 @@ fn test_update_crucible_pantry_before_nexus() { // default. 
assert!( blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .all(|(_, z)| matches!( &z.image_source, BlueprintZoneImageSource::Artifact { version, hash: _ } @@ -3296,7 +3296,7 @@ fn test_update_crucible_pantry_before_nexus() { // All Crucible Pantries should now be updated. assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_up_to_date_pantry(z)) .count(), CRUCIBLE_PANTRY_REDUNDANCY @@ -3305,7 +3305,7 @@ fn test_update_crucible_pantry_before_nexus() { // All old Pantry zones should now be expunged. assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_expunged) .filter(|(_, z)| is_old_pantry(z)) .count(), CRUCIBLE_PANTRY_REDUNDANCY @@ -3415,14 +3415,14 @@ fn test_update_crucible_pantry_before_nexus() { let blueprint12 = parent; assert_eq!( blueprint12 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_up_to_date_nexus(z)) .count(), NEXUS_REDUNDANCY, ); assert_eq!( blueprint12 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_old_nexus(z)) .count(), 0, @@ -3485,7 +3485,7 @@ fn test_update_cockroach() { eprintln!("{}", blueprint.display()); assert!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .all(|(_, z)| matches!( &z.image_source, BlueprintZoneImageSource::Artifact { version, hash: _ } @@ -3658,14 +3658,14 @@ fn test_update_cockroach() { assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_old_cockroach(z)) .count(), COCKROACHDB_REDUNDANCY - i ); assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_up_to_date_cockroach(z)) .count(), i @@ -3859,7 +3859,7 @@ fn test_update_boundary_ntp() { // All zones should be sourced from the 0.0.1 repo by default. assert!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .all(|(_, z)| matches!( &z.image_source, BlueprintZoneImageSource::Artifact { version, hash: _ } @@ -3939,7 +3939,7 @@ fn test_update_boundary_ntp() { }; let old_boundary_ntp_count = |blueprint: &Blueprint| -> usize { blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_old_boundary_ntp(z)) .count() }; @@ -3948,7 +3948,7 @@ fn test_update_boundary_ntp() { }; let up_to_date_boundary_ntp_count = |blueprint: &Blueprint| -> usize { blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_up_to_date_boundary_ntp(z)) .count() }; @@ -4253,7 +4253,7 @@ fn test_update_internal_dns() { // All zones should be sourced from the initial TUF repo by default. 
assert!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .all(|(_, z)| matches!( &z.image_source, BlueprintZoneImageSource::Artifact { version, hash: _ } @@ -4452,14 +4452,14 @@ fn test_update_internal_dns() { assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_old_internal_dns(z)) .count(), INTERNAL_DNS_REDUNDANCY - i ); assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| is_up_to_date_internal_dns(z)) .count(), i @@ -4509,7 +4509,7 @@ fn test_update_all_zones() { // All zones should be sourced from the 0.0.1 repo by default. assert!( blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .all(|(_, z)| matches!( &z.image_source, BlueprintZoneImageSource::Artifact { version, hash: _ } @@ -4581,7 +4581,7 @@ fn test_update_all_zones() { { assert!( blueprint - .all_omicron_zones( + .danger_all_omicron_zones( BlueprintZoneDisposition::is_in_service ) .all(|(_, zone)| zone.image_source == image_source), diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index be6c31a889c..1c33f7c9f20 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -521,7 +521,7 @@ mod test { // Insert records for the zpools backing the datasets in these zones. for (sled_id, config) in - loaded.blueprint.all_omicron_zones(BlueprintZoneDisposition::any) + loaded.blueprint.danger_all_omicron_zones(BlueprintZoneDisposition::any) { let Some(dataset) = config.zone_type.durable_dataset() else { continue; diff --git a/nexus/src/app/background/tasks/crdb_node_id_collector.rs b/nexus/src/app/background/tasks/crdb_node_id_collector.rs index 529c38b7d0b..304fb0a26c5 100644 --- a/nexus/src/app/background/tasks/crdb_node_id_collector.rs +++ b/nexus/src/app/background/tasks/crdb_node_id_collector.rs @@ -148,7 +148,7 @@ impl CockroachAdminFromBlueprint for CockroachAdminFromBlueprintViaFixedPort { // collected, we have to wait until they're running. let zone_filter = BlueprintZoneDisposition::is_in_service; - blueprint.all_omicron_zones(zone_filter).filter_map( + blueprint.danger_all_omicron_zones(zone_filter).filter_map( |(_sled_id, zone)| match &zone.zone_type { BlueprintZoneType::CockroachDb( blueprint_zone_type::CockroachDb { address, .. }, @@ -269,7 +269,7 @@ mod tests { // `ExampleSystemBuilder` doesn't place any cockroach nodes; assert so // we bail out early if that changes. let ncockroach = bp0 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter(|(_, z)| z.zone_type.is_cockroach()) .count(); assert_eq!(ncockroach, 0); diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index 999ccba3126..c4adb8edd06 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -368,7 +368,7 @@ fn is_target_release_change_allowed( // Now check zone configs. 
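Besides these whole-blueprint scans, a couple of call sites in this patch (the Nexus quiesce check and the by-id lookup in `nexus/types/src/deployment.rs`) use the iterator to find one specific zone and read a field off it. A minimal sketch of that shape, with `my_nexus_id` standing in for an id the caller already knows:

    use nexus_types::deployment::BlueprintZoneDisposition;
    use nexus_types::deployment::BlueprintZoneType;

    let my_generation = blueprint
        .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service)
        .find_map(|(_sled_id, zone)| {
            if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type {
                (zone.id == my_nexus_id).then_some(nexus.nexus_generation)
            } else {
                None
            }
        });
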
for (_, zone_config) in current_blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) { match &zone_config.image_source { BlueprintZoneImageSource::InstallDataset => { diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 623f9ab9d36..080a8b4b8a5 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -352,7 +352,7 @@ async fn check_all_sagas_drained( // This doesn't ever change once we've determined it once. But we don't // know what the value is until we see our first blueprint. let Some(my_generation) = current_blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .find_map(|(_sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { (zone.id == my_nexus_id).then_some(nexus.nexus_generation) @@ -441,7 +441,7 @@ async fn check_all_sagas_drained( // were all drained up to the same point, which is good enough for us to // proceed, too. let our_gen_nexus_ids: BTreeSet = current_blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(_sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { (nexus.nexus_generation == my_generation).then_some(zone.id) @@ -854,7 +854,7 @@ mod test { "test_quiesce_multi", ); let nexus_ids = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(_sled_id, z)| z.zone_type.is_nexus().then_some(z.id)) .collect::>(); // The example system creates three sleds. Each sled gets a Nexus zone. diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index f3e65e5caee..025858a3b26 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -358,7 +358,7 @@ impl nexus_test_interface::NexusServer for Server { // system services. But here, we fake up IP pool ranges based on the // external addresses of services that we start or mock. let internal_services_ip_pool_ranges = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(_, zc)| match &zc.zone_type { BlueprintZoneType::BoundaryNtp( blueprint_zone_type::BoundaryNtp { external_ip, .. }, diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 080b1f3c883..c94c2afa4b2 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -309,7 +309,9 @@ impl Blueprint { /// Iterate over the [`BlueprintZoneConfig`] instances in the blueprint /// that match the provided filter, along with the associated sled id. - pub fn all_omicron_zones( + /// + /// TODO-john Explain danger? Add reason arg? 
+ pub fn danger_all_omicron_zones( &self, mut filter: F, ) -> impl Iterator @@ -334,7 +336,7 @@ impl Blueprint { where F: FnMut(BlueprintZoneDisposition) -> bool, { - self.all_omicron_zones(filter).filter_map(|(sled_id, zone)| { + self.danger_all_omicron_zones(filter).filter_map(|(sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus_config) = &zone.zone_type { Some((sled_id, zone, nexus_config)) } else { @@ -418,7 +420,7 @@ impl Blueprint { nexus_id: OmicronZoneUuid, ) -> Result { let zone = self - .all_omicron_zones(|_z| true) + .danger_all_omicron_zones(|_z| true) .find(|(_sled_id, zone_config)| zone_config.id == nexus_id) .ok_or_else(|| { anyhow!("zone {} does not exist in blueprint", nexus_id) diff --git a/nexus/types/src/deployment/execution/dns.rs b/nexus/types/src/deployment/execution/dns.rs index 2e509d12b28..dd371399d7a 100644 --- a/nexus/types/src/deployment/execution/dns.rs +++ b/nexus/types/src/deployment/execution/dns.rs @@ -41,7 +41,7 @@ pub fn blueprint_internal_dns_config( let mut dns_builder = DnsConfigBuilder::new(); 'all_zones: for (_, zone) in - blueprint.all_omicron_zones(BlueprintZoneDisposition::is_in_service) + blueprint.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) { let (service_name, &address) = match &zone.zone_type { BlueprintZoneType::BoundaryNtp( diff --git a/nexus/types/src/deployment/execution/utils.rs b/nexus/types/src/deployment/execution/utils.rs index fd6bb3f87b3..71015bd9137 100644 --- a/nexus/types/src/deployment/execution/utils.rs +++ b/nexus/types/src/deployment/execution/utils.rs @@ -104,7 +104,7 @@ pub fn blueprint_external_dns_nameserver_ips( blueprint: &Blueprint, ) -> Vec { blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) .filter_map(|(_, z)| match z.zone_type { BlueprintZoneType::ExternalDns( blueprint_zone_type::ExternalDns { dns_address, .. 
}, From 0ea816b63bdec6b91962b9fa29089174de677746 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 11 Dec 2025 15:10:53 -0500 Subject: [PATCH 02/25] add in_service_zones() --- dev-tools/reconfigurator-cli/src/lib.rs | 4 +- live-tests/tests/test_nexus_handoff.rs | 8 +-- .../db-queries/src/db/datastore/deployment.rs | 2 +- nexus/db-queries/src/db/datastore/rack.rs | 6 +- nexus/reconfigurator/blippy/src/checks.rs | 8 +-- .../execution/src/clickhouse.rs | 2 +- nexus/reconfigurator/execution/src/dns.rs | 2 +- .../planning/src/blueprint_builder/builder.rs | 4 +- .../allocators/external_networking.rs | 2 +- .../tests/integration_tests/planner.rs | 58 +++++++++---------- .../tasks/crdb_node_id_collector.rs | 2 +- nexus/src/app/deployment.rs | 2 +- nexus/src/app/quiesce.rs | 6 +- nexus/src/lib.rs | 2 +- nexus/types/src/deployment.rs | 10 ++++ nexus/types/src/deployment/execution/dns.rs | 2 +- nexus/types/src/deployment/execution/utils.rs | 2 +- 17 files changed, 66 insertions(+), 56 deletions(-) diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs index 2953aaee23b..39127cb48f5 100644 --- a/dev-tools/reconfigurator-cli/src/lib.rs +++ b/dev-tools/reconfigurator-cli/src/lib.rs @@ -160,7 +160,7 @@ impl ReconfiguratorSim { // Handle zone networking setup first for (_, zone) in parent_blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() { if let Some((external_ip, nic)) = zone.zone_type.external_networking() @@ -233,7 +233,7 @@ impl ReconfiguratorSim { state.config().active_nexus_zone_generation(); let mut not_yet_nexus_zones = BTreeSet::new(); for (_, zone) in parent_blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() { match &zone.zone_type { nexus_types::deployment::BlueprintZoneType::Nexus( diff --git a/live-tests/tests/test_nexus_handoff.rs b/live-tests/tests/test_nexus_handoff.rs index bce8f20adf5..ad2e553495d 100644 --- a/live-tests/tests/test_nexus_handoff.rs +++ b/live-tests/tests/test_nexus_handoff.rs @@ -69,7 +69,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { // there exist no Nexus zones with a generation newer than the blueprint's // `nexus_generation`. let new_zones = blueprint_initial - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_sled_id, z)| { let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { nexus_generation, @@ -102,7 +102,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { cfg: &'a blueprint_zone_type::Nexus, } let current_nexus_zones: BTreeMap = blueprint_initial - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(sled_id, z)| { let BlueprintZoneType::Nexus( cfg @ blueprint_zone_type::Nexus { nexus_generation, .. }, @@ -504,7 +504,7 @@ async fn check_internal_dns( // Find the DNS server based on what's currently in the blueprint. let dns_sockaddr = blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .find_map(|(_sled_id, zone_cfg)| { if let BlueprintZoneType::InternalDns( blueprint_zone_type::InternalDns { dns_address, .. }, @@ -568,7 +568,7 @@ async fn check_external_dns( // Find the DNS server based on what's currently in the blueprint. 
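Patch 02, which begins above, swaps the very common `danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service)` spelling for the new `in_service_zones()` helper. Going by the helper's definition later in this patch, the two forms are the same iterator, so callers can treat them as interchangeable. A small sketch, assuming `blueprint: &Blueprint`:

    use nexus_types::deployment::BlueprintZoneDisposition;

    // Long form, as written throughout patch 01.
    let n_long = blueprint
        .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service)
        .count();

    // Shorthand introduced here; per its definition it simply forwards to
    // the call above.
    let n_short = blueprint.in_service_zones().count();

    assert_eq!(n_long, n_short);
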
let dns_http_sockaddr = blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .find_map(|(_sled_id, zone_cfg)| { if let BlueprintZoneType::ExternalDns( blueprint_zone_type::ExternalDns { http_address, .. }, diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index e361b215e2f..e418b6661ba 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -4296,7 +4296,7 @@ mod tests { // Insert an IP pool range covering the one Nexus IP. let nexus_ip = blueprint1 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .find_map(|(_, zone_config)| { zone_config .zone_type diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index 23118f49a7c..99cbdee00b1 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -842,7 +842,7 @@ impl DataStore { // Insert Nexus database access records self.initialize_nexus_access_from_blueprint_on_connection( &conn, - blueprint.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + blueprint.in_service_zones() .filter_map(|(_sled, zone_cfg)| { if zone_cfg.zone_type.is_nexus() { Some(zone_cfg.id) @@ -856,7 +856,7 @@ impl DataStore { })?; // Allocate networking records for all services. - for (_, zone_config) in blueprint.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) { + for (_, zone_config) in blueprint.in_service_zones() { self.rack_populate_service_networking_records( &conn, &log, @@ -1504,7 +1504,7 @@ mod test { let mut ntp2_id = None; let mut ntp3_id = None; for (sled_id, zone) in - blueprint.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + blueprint.in_service_zones() { match &zone.zone_type { BlueprintZoneType::BoundaryNtp(_) => { diff --git a/nexus/reconfigurator/blippy/src/checks.rs b/nexus/reconfigurator/blippy/src/checks.rs index 35e8b39fb14..64d0806c8ed 100644 --- a/nexus/reconfigurator/blippy/src/checks.rs +++ b/nexus/reconfigurator/blippy/src/checks.rs @@ -67,7 +67,7 @@ fn check_underlay_ips(blippy: &mut Blippy<'_>) { for (sled_id, zone) in blippy .blueprint() - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() { let ip = zone.underlay_ip(); @@ -160,7 +160,7 @@ fn check_external_networking(blippy: &mut Blippy<'_>) { for (sled_id, zone, external_ip, nic) in blippy .blueprint() - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(sled_id, zone)| { zone.zone_type .external_networking() @@ -265,7 +265,7 @@ fn check_dataset_zpool_uniqueness(blippy: &mut Blippy<'_>) { // kind. for (sled_id, zone) in blippy .blueprint() - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() { // Check "one kind per zpool" for transient datasets... 
let filesystem_dataset = zone.filesystem_dataset(); @@ -691,7 +691,7 @@ fn check_nexus_generation_consistency(blippy: &mut Blippy<'_>) { // Collect all Nexus zones and their generations for (sled_id, zone) in blippy .blueprint() - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() { if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { generation_info.entry(nexus.nexus_generation).or_default().push(( diff --git a/nexus/reconfigurator/execution/src/clickhouse.rs b/nexus/reconfigurator/execution/src/clickhouse.rs index 190150e0cb4..1601a5a2532 100644 --- a/nexus/reconfigurator/execution/src/clickhouse.rs +++ b/nexus/reconfigurator/execution/src/clickhouse.rs @@ -235,7 +235,7 @@ pub(crate) async fn deploy_single_node( deploy_single_node_impl( opctx, blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| z.zone_type.is_clickhouse()), ) .await diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index ae884fd39f5..1b8f3ecebe8 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -822,7 +822,7 @@ mod test { // To start, we need a mapping from underlay IP to the corresponding // Omicron zone. let omicron_zones_by_ip: BTreeMap<_, _> = blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .map(|(_, zone)| (zone.underlay_ip(), zone.id)) .collect(); println!("omicron zones by IP: {:#?}", omicron_zones_by_ip); diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index aed6393ed05..570ddd0fe10 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -3317,7 +3317,7 @@ pub mod test { // provisions CRDB; this check makes sure we update our use of it if // that changes). 
for (_, z) in - parent.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + parent.in_service_zones() { assert!( !z.zone_type.is_cockroach(), @@ -3361,7 +3361,7 @@ pub mod test { verify_blueprint(&blueprint, &input); assert_eq!( blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(sled_id, z)| { *sled_id == target_sled_id && z.zone_type.kind() == ZoneKind::CockroachDb diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs index a40a74944ce..e9bda4924eb 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs @@ -87,7 +87,7 @@ impl ExternalNetworkingAllocator { ) -> anyhow::Result { Self::new( blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .map(|(_sled_id, zone)| zone), external_ip_policy, ) diff --git a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs index ed5f9bb1596..7cf25ba4bf7 100644 --- a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs +++ b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs @@ -116,7 +116,7 @@ fn get_nexus_ids_at_generation( generation: Generation, ) -> BTreeSet { blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_, z)| match &z.zone_type { BlueprintZoneType::Nexus(nexus_zone) if nexus_zone.nexus_generation == generation => @@ -502,7 +502,7 @@ fn test_spread_internal_dns_zones_across_sleds() { // Expunge two of the internal DNS zones; the planner should put new // zones back in their places. let zones_to_expunge = blueprint1 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(sled_id, zone)| { zone.zone_type.is_internal_dns().then_some((sled_id, zone.id)) }) @@ -760,7 +760,7 @@ fn test_reuse_external_dns_ips_from_expunged_zones() { assert_eq!( blueprint1a - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, zone)| zone.zone_type.is_external_dns()) .count(), 3, @@ -775,7 +775,7 @@ fn test_reuse_external_dns_ips_from_expunged_zones() { // This blueprint should have three external DNS zones. assert_eq!( blueprint2 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, zone)| zone.zone_type.is_external_dns()) .count(), 3, @@ -810,7 +810,7 @@ fn test_reuse_external_dns_ips_from_expunged_zones() { // The IP addresses of the new external DNS zones should be the // same as the original set that we set up. 
let mut ips = blueprint3 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_id, zone)| { zone.zone_type .is_external_dns() @@ -1342,7 +1342,7 @@ fn test_disk_expungement_removes_zones_transient_filesystem() { let mut zones_on_pool = BTreeSet::new(); let mut zone_kinds_on_pool = BTreeMap::<_, usize>::new(); for (_, zone_config) in - blueprint1.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + blueprint1.in_service_zones() { if pool_to_expunge == zone_config.filesystem_pool { zones_on_pool.insert(zone_config.id); @@ -1959,7 +1959,7 @@ fn test_crucible_pantry() { // We should start with CRUCIBLE_PANTRY_REDUNDANCY pantries spread out // to at most 1 per sled. Find one of the sleds running one. let pantry_sleds = blueprint1 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(sled_id, zone)| { zone.zone_type.is_crucible_pantry().then_some(sled_id) }) @@ -1981,7 +1981,7 @@ fn test_crucible_pantry() { println!("1 -> 2 (expunged sled):\n{}", diff.display()); assert_eq!( blueprint2 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(sled_id, zone)| *sled_id != expunged_sled_id && zone.zone_type.is_crucible_pantry()) .count(), @@ -2034,7 +2034,7 @@ fn test_single_node_clickhouse() { println!("1 -> 2 (expunged sled):\n{}", diff.display()); assert_eq!( blueprint2 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(sled, zone)| *sled != clickhouse_sled && zone.zone_type.is_clickhouse()) .count(), @@ -2087,7 +2087,7 @@ fn test_plan_deploy_all_clickhouse_cluster_nodes() { // We should see zones for 3 clickhouse keepers, and 2 servers created let active_zones: Vec<_> = blueprint2 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .map(|(_, z)| z.clone()) .collect(); @@ -2214,7 +2214,7 @@ fn test_plan_deploy_all_clickhouse_cluster_nodes() { ); let active_zones: Vec<_> = blueprint5 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .map(|(_, z)| z.clone()) .collect(); @@ -2349,7 +2349,7 @@ fn test_expunge_clickhouse_clusters() { // We should see zones for 3 clickhouse keepers, and 2 servers created let active_zones: Vec<_> = blueprint2 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .map(|(_, z)| z.clone()) .collect(); @@ -2422,7 +2422,7 @@ fn test_expunge_clickhouse_clusters() { // Find the sled containing one of the keeper zones and expunge it let (sled_id, bp_zone_config) = blueprint3 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .find(|(_, z)| z.zone_type.is_clickhouse_keeper()) .unwrap(); @@ -2547,7 +2547,7 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { // We should see zones for 3 clickhouse keepers, and 2 servers created let active_zones: Vec<_> = blueprint2 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .map(|(_, z)| z.clone()) .collect(); @@ -2627,7 +2627,7 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { assert_eq!( 1, blueprint4 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| z.zone_type.is_clickhouse()) .count() ); @@ -3115,7 +3115,7 @@ fn test_update_crucible_pantry_before_nexus() { // default. 
assert!( blueprint1 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .all(|(_, z)| matches!( &z.image_source, BlueprintZoneImageSource::Artifact { version, hash: _ } @@ -3296,7 +3296,7 @@ fn test_update_crucible_pantry_before_nexus() { // All Crucible Pantries should now be updated. assert_eq!( blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_up_to_date_pantry(z)) .count(), CRUCIBLE_PANTRY_REDUNDANCY @@ -3415,14 +3415,14 @@ fn test_update_crucible_pantry_before_nexus() { let blueprint12 = parent; assert_eq!( blueprint12 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_up_to_date_nexus(z)) .count(), NEXUS_REDUNDANCY, ); assert_eq!( blueprint12 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_old_nexus(z)) .count(), 0, @@ -3485,7 +3485,7 @@ fn test_update_cockroach() { eprintln!("{}", blueprint.display()); assert!( blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .all(|(_, z)| matches!( &z.image_source, BlueprintZoneImageSource::Artifact { version, hash: _ } @@ -3658,14 +3658,14 @@ fn test_update_cockroach() { assert_eq!( blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_old_cockroach(z)) .count(), COCKROACHDB_REDUNDANCY - i ); assert_eq!( blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_up_to_date_cockroach(z)) .count(), i @@ -3859,7 +3859,7 @@ fn test_update_boundary_ntp() { // All zones should be sourced from the 0.0.1 repo by default. assert!( blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .all(|(_, z)| matches!( &z.image_source, BlueprintZoneImageSource::Artifact { version, hash: _ } @@ -3939,7 +3939,7 @@ fn test_update_boundary_ntp() { }; let old_boundary_ntp_count = |blueprint: &Blueprint| -> usize { blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_old_boundary_ntp(z)) .count() }; @@ -3948,7 +3948,7 @@ fn test_update_boundary_ntp() { }; let up_to_date_boundary_ntp_count = |blueprint: &Blueprint| -> usize { blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_up_to_date_boundary_ntp(z)) .count() }; @@ -4253,7 +4253,7 @@ fn test_update_internal_dns() { // All zones should be sourced from the initial TUF repo by default. assert!( blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .all(|(_, z)| matches!( &z.image_source, BlueprintZoneImageSource::Artifact { version, hash: _ } @@ -4452,14 +4452,14 @@ fn test_update_internal_dns() { assert_eq!( blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_old_internal_dns(z)) .count(), INTERNAL_DNS_REDUNDANCY - i ); assert_eq!( blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_up_to_date_internal_dns(z)) .count(), i @@ -4509,7 +4509,7 @@ fn test_update_all_zones() { // All zones should be sourced from the 0.0.1 repo by default. 
assert!( blueprint1 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .all(|(_, z)| matches!( &z.image_source, BlueprintZoneImageSource::Artifact { version, hash: _ } diff --git a/nexus/src/app/background/tasks/crdb_node_id_collector.rs b/nexus/src/app/background/tasks/crdb_node_id_collector.rs index 304fb0a26c5..7a672888426 100644 --- a/nexus/src/app/background/tasks/crdb_node_id_collector.rs +++ b/nexus/src/app/background/tasks/crdb_node_id_collector.rs @@ -269,7 +269,7 @@ mod tests { // `ExampleSystemBuilder` doesn't place any cockroach nodes; assert so // we bail out early if that changes. let ncockroach = bp0 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| z.zone_type.is_cockroach()) .count(); assert_eq!(ncockroach, 0); diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index c4adb8edd06..28bfe4457d3 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -368,7 +368,7 @@ fn is_target_release_change_allowed( // Now check zone configs. for (_, zone_config) in current_blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() { match &zone_config.image_source { BlueprintZoneImageSource::InstallDataset => { diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 080a8b4b8a5..0a7c7386902 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -352,7 +352,7 @@ async fn check_all_sagas_drained( // This doesn't ever change once we've determined it once. But we don't // know what the value is until we see our first blueprint. let Some(my_generation) = current_blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .find_map(|(_sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { (zone.id == my_nexus_id).then_some(nexus.nexus_generation) @@ -441,7 +441,7 @@ async fn check_all_sagas_drained( // were all drained up to the same point, which is good enough for us to // proceed, too. let our_gen_nexus_ids: BTreeSet = current_blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { (nexus.nexus_generation == my_generation).then_some(zone.id) @@ -854,7 +854,7 @@ mod test { "test_quiesce_multi", ); let nexus_ids = blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_sled_id, z)| z.zone_type.is_nexus().then_some(z.id)) .collect::>(); // The example system creates three sleds. Each sled gets a Nexus zone. diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index 025858a3b26..2c9482878f9 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -358,7 +358,7 @@ impl nexus_test_interface::NexusServer for Server { // system services. But here, we fake up IP pool ranges based on the // external addresses of services that we start or mock. let internal_services_ip_pool_ranges = blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_, zc)| match &zc.zone_type { BlueprintZoneType::BoundaryNtp( blueprint_zone_type::BoundaryNtp { external_ip, .. 
}, diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index c94c2afa4b2..5d738f57d06 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -307,6 +307,16 @@ impl Blueprint { } } + /// Iterate over the in-service [`BlueprintZoneConfig`] instances in the + /// blueprint, along with the associated sled id. + pub fn in_service_zones( + &self, + ) -> impl Iterator + { + // TODO-john explain + self.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + } + /// Iterate over the [`BlueprintZoneConfig`] instances in the blueprint /// that match the provided filter, along with the associated sled id. /// diff --git a/nexus/types/src/deployment/execution/dns.rs b/nexus/types/src/deployment/execution/dns.rs index dd371399d7a..a5ed7901b74 100644 --- a/nexus/types/src/deployment/execution/dns.rs +++ b/nexus/types/src/deployment/execution/dns.rs @@ -41,7 +41,7 @@ pub fn blueprint_internal_dns_config( let mut dns_builder = DnsConfigBuilder::new(); 'all_zones: for (_, zone) in - blueprint.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + blueprint.in_service_zones() { let (service_name, &address) = match &zone.zone_type { BlueprintZoneType::BoundaryNtp( diff --git a/nexus/types/src/deployment/execution/utils.rs b/nexus/types/src/deployment/execution/utils.rs index 71015bd9137..3e6f745b871 100644 --- a/nexus/types/src/deployment/execution/utils.rs +++ b/nexus/types/src/deployment/execution/utils.rs @@ -104,7 +104,7 @@ pub fn blueprint_external_dns_nameserver_ips( blueprint: &Blueprint, ) -> Vec { blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_, z)| match z.zone_type { BlueprintZoneType::ExternalDns( blueprint_zone_type::ExternalDns { dns_address, .. }, From 9d6265317a13a90407c22b5b0a3daebd86f74d38 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 11 Dec 2025 15:29:53 -0500 Subject: [PATCH 03/25] add expunged_zones() --- live-tests/tests/test_nexus_add_remove.rs | 5 +- .../tests/integration_tests/planner.rs | 134 +++++++----------- nexus/types/src/deployment.rs | 18 ++- nexus/types/src/deployment/execution/dns.rs | 7 +- 4 files changed, 71 insertions(+), 93 deletions(-) diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index 365ce016789..d795292ab10 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -19,6 +19,7 @@ use nexus_reconfigurator_planning::blueprint_editor::ExternalNetworkingAllocator use nexus_reconfigurator_planning::planner::Planner; use nexus_reconfigurator_planning::planner::PlannerRng; use nexus_reconfigurator_preparation::PlanningInputFromDb; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::PlannerConfig; @@ -207,7 +208,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .await .expect("editing blueprint to expunge zone"); let (_, expunged_zone_config) = blueprint3 - .danger_all_omicron_zones(|_| true) + .expunged_zones(BlueprintExpungedZoneAccessReason::Test) .find(|(_sled_id, zone_config)| zone_config.id == new_zone.id) .expect("expunged zone in new blueprint"); let BlueprintZoneDisposition::Expunged { @@ -215,7 +216,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .. 
} = expunged_zone_config.disposition else { - panic!("expected expunged zone to have disposition Expunged"); + unreachable!("expunged_zones() returned a non-expunged zone"); }; // At some point, we should be unable to reach this Nexus any more. diff --git a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs index 7cf25ba4bf7..b915b9c48e3 100644 --- a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs +++ b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs @@ -16,6 +16,7 @@ use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintArtifactVersion; use nexus_types::deployment::BlueprintDatasetDisposition; use nexus_types::deployment::BlueprintDiffSummary; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; use nexus_types::deployment::BlueprintSource; use nexus_types::deployment::BlueprintZoneConfig; @@ -1341,9 +1342,7 @@ fn test_disk_expungement_removes_zones_transient_filesystem() { // Find all the zones using this same zpool. let mut zones_on_pool = BTreeSet::new(); let mut zone_kinds_on_pool = BTreeMap::<_, usize>::new(); - for (_, zone_config) in - blueprint1.in_service_zones() - { + for (_, zone_config) in blueprint1.in_service_zones() { if pool_to_expunge == zone_config.filesystem_pool { zones_on_pool.insert(zone_config.id); *zone_kinds_on_pool @@ -2086,10 +2085,8 @@ fn test_plan_deploy_all_clickhouse_cluster_nodes() { ); // We should see zones for 3 clickhouse keepers, and 2 servers created - let active_zones: Vec<_> = blueprint2 - .in_service_zones() - .map(|(_, z)| z.clone()) - .collect(); + let active_zones: Vec<_> = + blueprint2.in_service_zones().map(|(_, z)| z.clone()).collect(); let keeper_zone_ids: BTreeSet<_> = active_zones .iter() @@ -2213,10 +2210,8 @@ fn test_plan_deploy_all_clickhouse_cluster_nodes() { &diff.display().to_string(), ); - let active_zones: Vec<_> = blueprint5 - .in_service_zones() - .map(|(_, z)| z.clone()) - .collect(); + let active_zones: Vec<_> = + blueprint5.in_service_zones().map(|(_, z)| z.clone()).collect(); let new_keeper_zone_ids: BTreeSet<_> = active_zones .iter() @@ -2348,10 +2343,8 @@ fn test_expunge_clickhouse_clusters() { let blueprint2 = sim.run_planner().expect("planning succeeded"); // We should see zones for 3 clickhouse keepers, and 2 servers created - let active_zones: Vec<_> = blueprint2 - .in_service_zones() - .map(|(_, z)| z.clone()) - .collect(); + let active_zones: Vec<_> = + blueprint2.in_service_zones().map(|(_, z)| z.clone()).collect(); let keeper_zone_ids: BTreeSet<_> = active_zones .iter() @@ -2546,10 +2539,8 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { let blueprint2 = sim.run_planner().expect("planning succeeded"); // We should see zones for 3 clickhouse keepers, and 2 servers created - let active_zones: Vec<_> = blueprint2 - .in_service_zones() - .map(|(_, z)| z.clone()) - .collect(); + let active_zones: Vec<_> = + blueprint2.in_service_zones().map(|(_, z)| z.clone()).collect(); let keeper_zone_ids: BTreeSet<_> = active_zones .iter() @@ -2582,7 +2573,7 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { // We should have expunged our single-node clickhouse zone let expunged_zones: Vec<_> = blueprint3 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .expunged_zones(BlueprintExpungedZoneAccessReason::Test) .map(|(_, z)| z.clone()) .collect(); @@ -2605,7 +2596,7 @@ fn 
test_expunge_clickhouse_zones_after_policy_is_changed() { // All our clickhouse keeper and server zones that we created when we // enabled our clickhouse policy should be expunged when we disable it. let expunged_zones: Vec<_> = blueprint4 - .danger_all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .expunged_zones(BlueprintExpungedZoneAccessReason::Test) .map(|(_, z)| z.clone()) .collect(); @@ -3113,17 +3104,13 @@ fn test_update_crucible_pantry_before_nexus() { // All zones should be sourced from the initial 0.0.1 target release by // default. - assert!( - blueprint1 - .in_service_zones() - .all(|(_, z)| matches!( - &z.image_source, - BlueprintZoneImageSource::Artifact { version, hash: _ } - if version == &BlueprintArtifactVersion::Available { - version: ArtifactVersion::new_const("0.0.1") - } - )) - ); + assert!(blueprint1.in_service_zones().all(|(_, z)| matches!( + &z.image_source, + BlueprintZoneImageSource::Artifact { version, hash: _ } + if version == &BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("0.0.1") + } + ))); // Manually specify a TUF repo with fake zone images. let version = ArtifactVersion::new_static("1.0.0-freeform") @@ -3305,7 +3292,7 @@ fn test_update_crucible_pantry_before_nexus() { // All old Pantry zones should now be expunged. assert_eq!( blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .expunged_zones(BlueprintExpungedZoneAccessReason::Test) .filter(|(_, z)| is_old_pantry(z)) .count(), CRUCIBLE_PANTRY_REDUNDANCY @@ -3421,10 +3408,7 @@ fn test_update_crucible_pantry_before_nexus() { NEXUS_REDUNDANCY, ); assert_eq!( - blueprint12 - .in_service_zones() - .filter(|(_, z)| is_old_nexus(z)) - .count(), + blueprint12.in_service_zones().filter(|(_, z)| is_old_nexus(z)).count(), 0, ); @@ -3483,17 +3467,13 @@ fn test_update_cockroach() { // All zones should be sourced from the initial 0.0.1 target release by // default. eprintln!("{}", blueprint.display()); - assert!( - blueprint - .in_service_zones() - .all(|(_, z)| matches!( - &z.image_source, - BlueprintZoneImageSource::Artifact { version, hash: _ } - if version == &BlueprintArtifactVersion::Available { - version: ArtifactVersion::new_const("0.0.1") - } - )) - ); + assert!(blueprint.in_service_zones().all(|(_, z)| matches!( + &z.image_source, + BlueprintZoneImageSource::Artifact { version, hash: _ } + if version == &BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("0.0.1") + } + ))); // This test "starts" here -- we specify a new TUF repo with an updated // CockroachDB image. We create a new TUF repo where version of @@ -3857,17 +3837,13 @@ fn test_update_boundary_ntp() { ); // All zones should be sourced from the 0.0.1 repo by default. - assert!( - blueprint - .in_service_zones() - .all(|(_, z)| matches!( - &z.image_source, - BlueprintZoneImageSource::Artifact { version, hash: _ } - if version == &BlueprintArtifactVersion::Available { - version: ArtifactVersion::new_const("0.0.1") - } - )) - ); + assert!(blueprint.in_service_zones().all(|(_, z)| matches!( + &z.image_source, + BlueprintZoneImageSource::Artifact { version, hash: _ } + if version == &BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("0.0.1") + } + ))); // This test "starts" here -- we specify a new TUF repo with an updated // Boundary NTP image. 
We create a new TUF repo where version of @@ -4251,17 +4227,13 @@ fn test_update_internal_dns() { let blueprint = sim.assert_latest_blueprint_is_blippy_clean(); // All zones should be sourced from the initial TUF repo by default. - assert!( - blueprint - .in_service_zones() - .all(|(_, z)| matches!( - &z.image_source, - BlueprintZoneImageSource::Artifact { version, hash: _ } - if version == &BlueprintArtifactVersion::Available { - version: ArtifactVersion::new_const("0.0.1") - } - )) - ); + assert!(blueprint.in_service_zones().all(|(_, z)| matches!( + &z.image_source, + BlueprintZoneImageSource::Artifact { version, hash: _ } + if version == &BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("0.0.1") + } + ))); // This test "starts" here -- we specify a new TUF repo with an updated // Internal DNS image. We create a new TUF repo where version of @@ -4507,17 +4479,13 @@ fn test_update_all_zones() { let blueprint1 = sim.assert_latest_blueprint_is_blippy_clean(); // All zones should be sourced from the 0.0.1 repo by default. - assert!( - blueprint1 - .in_service_zones() - .all(|(_, z)| matches!( - &z.image_source, - BlueprintZoneImageSource::Artifact { version, hash: _ } - if version == &BlueprintArtifactVersion::Available { - version: ArtifactVersion::new_const("0.0.1") - } - )) - ); + assert!(blueprint1.in_service_zones().all(|(_, z)| matches!( + &z.image_source, + BlueprintZoneImageSource::Artifact { version, hash: _ } + if version == &BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("0.0.1") + } + ))); // Manually specify a TUF repo with fake images for all zones. // Only the name and kind of the artifacts matter. @@ -4581,9 +4549,7 @@ fn test_update_all_zones() { { assert!( blueprint - .danger_all_omicron_zones( - BlueprintZoneDisposition::is_in_service - ) + .in_service_zones() .all(|(_, zone)| zone.image_source == image_source), "failed to update all zones" ); diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 5d738f57d06..994b83d2a10 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -311,12 +311,20 @@ impl Blueprint { /// blueprint, along with the associated sled id. pub fn in_service_zones( &self, - ) -> impl Iterator - { + ) -> impl Iterator { // TODO-john explain self.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) } + /// TODO-john + pub fn expunged_zones( + &self, + _reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + // TODO-john explain + self.danger_all_omicron_zones(BlueprintZoneDisposition::is_expunged) + } + /// Iterate over the [`BlueprintZoneConfig`] instances in the blueprint /// that match the provided filter, along with the associated sled id. /// @@ -617,6 +625,12 @@ pub struct OperatorNexusConfig<'a> { pub external_dns_servers: &'a [IpAddr], } +/// TODO-john +pub enum BlueprintExpungedZoneAccessReason { + /// TODO-john + Test, +} + /// Description of the source of a blueprint. 
#[derive( Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize, Diffable, diff --git a/nexus/types/src/deployment/execution/dns.rs b/nexus/types/src/deployment/execution/dns.rs index a5ed7901b74..a212db79bde 100644 --- a/nexus/types/src/deployment/execution/dns.rs +++ b/nexus/types/src/deployment/execution/dns.rs @@ -13,8 +13,7 @@ use omicron_common::api::external::{Generation, Name}; use crate::{ deployment::{ - Blueprint, BlueprintZoneDisposition, BlueprintZoneType, SledFilter, - blueprint_zone_type, + Blueprint, BlueprintZoneType, SledFilter, blueprint_zone_type, }, internal_api::params::{DnsConfigZone, DnsRecord}, silo::{default_silo_name, silo_dns_name}, @@ -40,9 +39,7 @@ pub fn blueprint_internal_dns_config( // the details. let mut dns_builder = DnsConfigBuilder::new(); - 'all_zones: for (_, zone) in - blueprint.in_service_zones() - { + 'all_zones: for (_, zone) in blueprint.in_service_zones() { let (service_name, &address) = match &zone.zone_type { BlueprintZoneType::BoundaryNtp( blueprint_zone_type::BoundaryNtp { address, .. }, From 42de0a49c78486b09183163858627be2e19a061d Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Thu, 11 Dec 2025 15:43:00 -0500 Subject: [PATCH 04/25] pruning more danger_all_omicron_zones() calls --- live-tests/tests/test_nexus_add_remove.rs | 2 +- .../db-queries/src/db/datastore/deployment.rs | 23 +++++------- nexus/db-queries/src/db/datastore/rack.rs | 37 ++++--------------- nexus/reconfigurator/execution/src/dns.rs | 4 +- .../planning/src/blueprint_builder/builder.rs | 8 +--- nexus/reconfigurator/planning/src/example.rs | 7 +--- .../tests/integration_tests/planner.rs | 2 +- .../background/tasks/blueprint_execution.rs | 4 +- .../tasks/crdb_node_id_collector.rs | 6 +-- nexus/types/src/deployment.rs | 2 + 10 files changed, 31 insertions(+), 64 deletions(-) diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index d795292ab10..3f5592d580a 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -307,7 +307,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // We don't need to check this here. It just provides a better error // message if something has gone wrong up to this point. 
let (_, expunged_zone_config) = new_blueprint - .danger_all_omicron_zones(|_| true) + .expunged_zones(BlueprintExpungedZoneAccessReason::Test) .find(|(_sled_id, zone_config)| zone_config.id == new_zone.id) .expect("expunged zone in new blueprint"); assert!(expunged_zone_config.disposition.is_ready_for_cleanup()); diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index e418b6661ba..5a9a86c079d 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -75,6 +75,7 @@ use nexus_db_schema::enums::HwM2SlotEnum; use nexus_db_schema::enums::HwRotSlotEnum; use nexus_db_schema::enums::SpTypeEnum; use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintMetadata; use nexus_types::deployment::BlueprintSledConfig; use nexus_types::deployment::BlueprintSource; @@ -1956,9 +1957,10 @@ impl DataStore { &conn, &opctx.log, blueprint - .danger_all_omicron_zones(|disposition| { - !disposition.is_in_service() - }) + .expunged_zones( + BlueprintExpungedZoneAccessReason + ::DeallocateExternalNetworkingResources + ) .map(|(_sled_id, zone)| zone), ) .await @@ -1966,11 +1968,7 @@ impl DataStore { self.ensure_zone_external_networking_allocated_on_connection( &conn, opctx, - blueprint - .danger_all_omicron_zones(|disposition| { - disposition.is_in_service() - }) - .map(|(_sled_id, zone)| zone), + blueprint.in_service_zones().map(|(_sled_id, zone)| zone), ) .await .map_err(|e| err.bail(e))?; @@ -3312,7 +3310,7 @@ mod tests { ); assert_eq!(blueprint1.sleds.len(), collection.sled_agents.len()); assert_eq!( - blueprint1.danger_all_omicron_zones(BlueprintZoneDisposition::any).count(), + blueprint1.in_service_zones().count(), collection.all_ledgered_omicron_zones().count() ); // All zones should be in service. @@ -3644,9 +3642,8 @@ mod tests { ); assert_eq!(blueprint1.sleds.len() + 1, blueprint2.sleds.len()); assert_eq!( - blueprint1.danger_all_omicron_zones(BlueprintZoneDisposition::any).count() - + num_new_sled_zones, - blueprint2.danger_all_omicron_zones(BlueprintZoneDisposition::any).count() + blueprint1.in_service_zones().count() + num_new_sled_zones, + blueprint2.in_service_zones().count() ); // All zones should be in service. @@ -4516,7 +4513,7 @@ mod tests { fn assert_all_zones_in_service(blueprint: &Blueprint) { let not_in_service = blueprint - .danger_all_omicron_zones(|disposition| !disposition.is_in_service()) + .expunged_zones(BlueprintExpungedZoneAccessReason::Test) .collect::>(); assert!( not_in_service.is_empty(), diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index 99cbdee00b1..10dc3caac00 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -1503,9 +1503,7 @@ mod test { let mut ntp1_id = None; let mut ntp2_id = None; let mut ntp3_id = None; - for (sled_id, zone) in - blueprint.in_service_zones() - { + for (sled_id, zone) in blueprint.in_service_zones() { match &zone.zone_type { BlueprintZoneType::BoundaryNtp(_) => { let which = if sled_id == sled1.id() { @@ -1750,10 +1748,8 @@ mod test { assert_eq!(observed_blueprint, blueprint); // We should see both of the Nexus services we provisioned. 
- let mut observed_zones: Vec<_> = observed_blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::any) - .map(|(_, z)| z) - .collect(); + let mut observed_zones: Vec<_> = + observed_blueprint.in_service_zones().map(|(_, z)| z).collect(); observed_zones.sort_by_key(|z| z.id); assert_eq!(observed_zones.len(), 2); @@ -1778,12 +1774,7 @@ mod test { if let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { external_ip, .. - }) = &blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::any) - .next() - .unwrap() - .1 - .zone_type + }) = &blueprint.in_service_zones().next().unwrap().1.zone_type { external_ip.ip } else { @@ -1797,12 +1788,7 @@ mod test { if let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { external_ip, .. - }) = &blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::any) - .nth(1) - .unwrap() - .1 - .zone_type + }) = &blueprint.in_service_zones().nth(1).unwrap().1.zone_type { external_ip.ip } else { @@ -1955,10 +1941,8 @@ mod test { assert_eq!(observed_blueprint, blueprint); // We should see the Nexus service we provisioned. - let mut observed_zones: Vec<_> = observed_blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::any) - .map(|(_, z)| z) - .collect(); + let mut observed_zones: Vec<_> = + observed_blueprint.in_service_zones().map(|(_, z)| z).collect(); observed_zones.sort_by_key(|z| z.id); assert_eq!(observed_zones.len(), 1); @@ -1985,12 +1969,7 @@ mod test { if let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { external_ip, .. - }) = &blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::any) - .next() - .unwrap() - .1 - .zone_type + }) = &blueprint.in_service_zones().next().unwrap().1.zone_type { external_ip.ip } else { diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 1b8f3ecebe8..693f412bd02 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -1554,11 +1554,11 @@ mod test { eprintln!("blueprint2: {}", blueprint2.display()); // Figure out the id of the new zone. let zones_before = blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::any) + .in_service_zones() .filter_map(|(_, z)| z.zone_type.is_nexus().then_some(z.id)) .collect::>(); let zones_after = blueprint2 - .danger_all_omicron_zones(BlueprintZoneDisposition::any) + .in_service_zones() .filter_map(|(_, z)| z.zone_type.is_nexus().then_some(z.id)) .collect::>(); let new_zones: Vec<_> = zones_after.difference(&zones_before).collect(); diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 570ddd0fe10..181b21b9ef3 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -3245,9 +3245,7 @@ pub mod test { // that are already in use by existing zones. Attempting to add a // Nexus with no remaining external IPs should fail. let mut used_ip_ranges = Vec::new(); - for (_, z) in - parent.danger_all_omicron_zones(BlueprintZoneDisposition::any) - { + for (_, z) in parent.in_service_zones() { if let Some((external_ip, _)) = z.zone_type.external_networking() { @@ -3316,9 +3314,7 @@ pub mod test { // Ensure no CRDB zones (currently `ExampleSystemBuilder` never // provisions CRDB; this check makes sure we update our use of it if // that changes). 
- for (_, z) in - parent.in_service_zones() - { + for (_, z) in parent.in_service_zones() { assert!( !z.zone_type.is_cockroach(), "unexpected cockroach zone \ diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index 43ee781d571..84bb23d7f66 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -1018,7 +1018,6 @@ mod tests { use internal_dns_resolver::Resolver; use internal_dns_types::names::ServiceName; use nexus_types::deployment::BlueprintZoneConfig; - use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::execution::blueprint_internal_dns_config; use nexus_types::deployment::execution::overridables; use nexus_types::internal_api::params::DnsConfigParams; @@ -1446,9 +1445,7 @@ mod tests { } ServiceName::Crucible(_) => { // Each Crucible zone should be queryable. - for (_, zone) in blueprint.danger_all_omicron_zones( - BlueprintZoneDisposition::is_in_service, - ) { + for (_, zone) in blueprint.in_service_zones() { if zone.kind() == ZoneKind::Crucible { out.insert(ServiceName::Crucible(zone.id), Ok(())); } @@ -1471,7 +1468,7 @@ mod tests { kind: ZoneKind, ) -> Vec<&BlueprintZoneConfig> { blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::any) + .in_service_zones() .filter_map(|(_, zone)| { (zone.zone_type.kind() == kind).then_some(zone) }) diff --git a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs index b915b9c48e3..e7cbbbdceb3 100644 --- a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs +++ b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs @@ -2012,7 +2012,7 @@ fn test_single_node_clickhouse() { // We should start with one ClickHouse zone. Find out which sled it's on. let clickhouse_sleds = blueprint1 - .danger_all_omicron_zones(BlueprintZoneDisposition::any) + .in_service_zones() .filter_map(|(sled, zone)| { zone.zone_type.is_clickhouse().then(|| Some(sled)) }) diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index 1c33f7c9f20..c77784ec299 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -520,9 +520,7 @@ mod test { .await; // Insert records for the zpools backing the datasets in these zones. - for (sled_id, config) in - loaded.blueprint.danger_all_omicron_zones(BlueprintZoneDisposition::any) - { + for (sled_id, config) in loaded.blueprint.in_service_zones() { let Some(dataset) = config.zone_type.durable_dataset() else { continue; }; diff --git a/nexus/src/app/background/tasks/crdb_node_id_collector.rs b/nexus/src/app/background/tasks/crdb_node_id_collector.rs index 7a672888426..646f7829072 100644 --- a/nexus/src/app/background/tasks/crdb_node_id_collector.rs +++ b/nexus/src/app/background/tasks/crdb_node_id_collector.rs @@ -143,12 +143,10 @@ impl CockroachAdminFromBlueprint for CockroachAdminFromBlueprintViaFixedPort { &'a self, blueprint: &'a Blueprint, ) -> impl Iterator + 'a { - // We can only actively collect from zones that should be running; if + // We can only actively collect from zones that should be in-service; if // there are CRDB zones in other states that still need their node ID // collected, we have to wait until they're running. 
- let zone_filter = BlueprintZoneDisposition::is_in_service; - - blueprint.danger_all_omicron_zones(zone_filter).filter_map( + blueprint.in_service_zones().filter_map( |(_sled_id, zone)| match &zone.zone_type { BlueprintZoneType::CockroachDb( blueprint_zone_type::CockroachDb { address, .. }, diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 994b83d2a10..04ea475ba7d 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -627,6 +627,8 @@ pub struct OperatorNexusConfig<'a> { /// TODO-john pub enum BlueprintExpungedZoneAccessReason { + /// TODO-john + DeallocateExternalNetworkingResources, /// TODO-john Test, } From 0a8b08fb6d5267b9b1476e126216f4acdad3f644 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 12 Dec 2025 11:19:45 -0500 Subject: [PATCH 05/25] split up expunged_zones() --- live-tests/tests/test_nexus_add_remove.rs | 8 ++++-- .../db-queries/src/db/datastore/deployment.rs | 25 +++++++++++++++++-- .../tests/integration_tests/planner.rs | 17 +++++++++---- nexus/types/src/deployment.rs | 25 +++++++++++++++++-- 4 files changed, 64 insertions(+), 11 deletions(-) diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index 3f5592d580a..913e4b77295 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -208,7 +208,9 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .await .expect("editing blueprint to expunge zone"); let (_, expunged_zone_config) = blueprint3 - .expunged_zones(BlueprintExpungedZoneAccessReason::Test) + .expunged_zones_not_ready_for_cleanup( + BlueprintExpungedZoneAccessReason::Test, + ) .find(|(_sled_id, zone_config)| zone_config.id == new_zone.id) .expect("expunged zone in new blueprint"); let BlueprintZoneDisposition::Expunged { @@ -307,7 +309,9 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // We don't need to check this here. It just provides a better error // message if something has gone wrong up to this point. let (_, expunged_zone_config) = new_blueprint - .expunged_zones(BlueprintExpungedZoneAccessReason::Test) + .expunged_zones_ready_for_cleanup( + BlueprintExpungedZoneAccessReason::Test, + ) .find(|(_sled_id, zone_config)| zone_config.id == new_zone.id) .expect("expunged zone in new blueprint"); assert!(expunged_zone_config.disposition.is_ready_for_cleanup()); diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 5a9a86c079d..4759a00f43a 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -1956,11 +1956,27 @@ impl DataStore { self.ensure_zone_external_networking_deallocated_on_connection( &conn, &opctx.log, + // TODO-correctness Currently the planner _does not wait_ + // for zones using external IPs to be ready for cleanup + // before reassigning the external IP to a new zone, so we + // have to deallocate IPs for both "ready for cleanup" and + // "not ready for cleanup" zones. We should fix the planner, + // at which point we can operate on only "ready for cleanup" + // zones here. 
+ // + // blueprint - .expunged_zones( + .expunged_zones_ready_for_cleanup( BlueprintExpungedZoneAccessReason ::DeallocateExternalNetworkingResources ) + .chain( + blueprint + .expunged_zones_not_ready_for_cleanup( + BlueprintExpungedZoneAccessReason + ::DeallocateExternalNetworkingResources + ) + ) .map(|(_sled_id, zone)| zone), ) .await @@ -4513,7 +4529,12 @@ mod tests { fn assert_all_zones_in_service(blueprint: &Blueprint) { let not_in_service = blueprint - .expunged_zones(BlueprintExpungedZoneAccessReason::Test) + .expunged_zones_not_ready_for_cleanup( + BlueprintExpungedZoneAccessReason::Test, + ) + .chain(blueprint.expunged_zones_ready_for_cleanup( + BlueprintExpungedZoneAccessReason::Test, + )) .collect::>(); assert!( not_in_service.is_empty(), diff --git a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs index e7cbbbdceb3..b8728db5f39 100644 --- a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs +++ b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs @@ -2571,9 +2571,12 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { ); let blueprint3 = sim.run_planner().expect("planning succeeded"); - // We should have expunged our single-node clickhouse zone + // We should have expunged our single-node clickhouse zone, but it won't be + // `ready_for_cleanup` yet. let expunged_zones: Vec<_> = blueprint3 - .expunged_zones(BlueprintExpungedZoneAccessReason::Test) + .expunged_zones_not_ready_for_cleanup( + BlueprintExpungedZoneAccessReason::Test, + ) .map(|(_, z)| z.clone()) .collect(); @@ -2596,7 +2599,9 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { // All our clickhouse keeper and server zones that we created when we // enabled our clickhouse policy should be expunged when we disable it. let expunged_zones: Vec<_> = blueprint4 - .expunged_zones(BlueprintExpungedZoneAccessReason::Test) + .expunged_zones_not_ready_for_cleanup( + BlueprintExpungedZoneAccessReason::Test, + ) .map(|(_, z)| z.clone()) .collect(); @@ -3289,10 +3294,12 @@ fn test_update_crucible_pantry_before_nexus() { CRUCIBLE_PANTRY_REDUNDANCY ); - // All old Pantry zones should now be expunged. + // All old Pantry zones should now be expunged and ready for cleanup. 
assert_eq!( blueprint - .expunged_zones(BlueprintExpungedZoneAccessReason::Test) + .expunged_zones_ready_for_cleanup( + BlueprintExpungedZoneAccessReason::Test + ) .filter(|(_, z)| is_old_pantry(z)) .count(), CRUCIBLE_PANTRY_REDUNDANCY diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 04ea475ba7d..b2f32cc0062 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -317,12 +317,33 @@ impl Blueprint { } /// TODO-john - pub fn expunged_zones( + pub fn expunged_zones_not_ready_for_cleanup( &self, _reason: BlueprintExpungedZoneAccessReason, ) -> impl Iterator { // TODO-john explain - self.danger_all_omicron_zones(BlueprintZoneDisposition::is_expunged) + self.danger_all_omicron_zones(|disposition| match disposition { + BlueprintZoneDisposition::InService => false, + BlueprintZoneDisposition::Expunged { + as_of_generation: _, + ready_for_cleanup, + } => !ready_for_cleanup, + }) + } + + /// TODO-john + pub fn expunged_zones_ready_for_cleanup( + &self, + _reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + // TODO-john explain + self.danger_all_omicron_zones(|disposition| match disposition { + BlueprintZoneDisposition::InService => false, + BlueprintZoneDisposition::Expunged { + as_of_generation: _, + ready_for_cleanup, + } => ready_for_cleanup, + }) } /// Iterate over the [`BlueprintZoneConfig`] instances in the blueprint From 89f4ab2ddf4bbb9e1ee4aa52405cfa0cbf061a17 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 12 Dec 2025 11:49:05 -0500 Subject: [PATCH 06/25] break up `all_nexus_zones()` --- dev-tools/omdb/src/bin/omdb/db/db_metadata.rs | 4 +- dev-tools/reconfigurator-cli/src/lib.rs | 4 +- live-tests/tests/test_nexus_handoff.rs | 6 +-- nexus/reconfigurator/blippy/src/checks.rs | 30 ++++------- .../reconfigurator/execution/src/database.rs | 4 +- nexus/reconfigurator/execution/src/sagas.rs | 12 +++-- .../execution/src/test_utils.rs | 2 +- nexus/reconfigurator/planning/src/example.rs | 2 +- nexus/types/src/deployment.rs | 53 +++++++++++++------ nexus/types/src/deployment/execution/utils.rs | 2 +- 10 files changed, 67 insertions(+), 52 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs b/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs index a86b512b52f..11afeb3d071 100644 --- a/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs +++ b/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs @@ -133,7 +133,7 @@ async fn get_db_metadata_nexus_rows( ]; let nexus_generation_by_zone = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .map(|(_, zone, nexus_zone)| (zone.id, nexus_zone.nexus_generation)) .collect::>(); @@ -213,7 +213,7 @@ pub async fn cmd_db_metadata_force_mark_nexus_quiesced( .await .context("loading current target blueprint")?; let nexus_generation = current_target_blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .find_map(|(_, zone, nexus_zone)| { if zone.id == args.id { Some(nexus_zone.nexus_generation) diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs index 39127cb48f5..ff1456222e6 100644 --- a/dev-tools/reconfigurator-cli/src/lib.rs +++ b/dev-tools/reconfigurator-cli/src/lib.rs @@ -214,7 +214,7 @@ impl ReconfiguratorSim { state.config().active_nexus_zone_generation(); let mut active_nexus_zones = BTreeSet::new(); for (_, zone, nexus) in parent_blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() { if 
nexus.nexus_generation == active_nexus_gen { active_nexus_zones.insert(zone.id); @@ -2494,7 +2494,7 @@ fn cmd_blueprint_edit( BlueprintEditCommands::BumpNexusGeneration => { let current_generation = builder.nexus_generation(); let current_max = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .fold( current_generation, |current_max, (_sled_id, _zone_config, nexus_config)| { diff --git a/live-tests/tests/test_nexus_handoff.rs b/live-tests/tests/test_nexus_handoff.rs index ad2e553495d..7d3d5f9ea42 100644 --- a/live-tests/tests/test_nexus_handoff.rs +++ b/live-tests/tests/test_nexus_handoff.rs @@ -215,7 +215,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { // Find the new Nexus zones and make clients for them. let new_nexus_clients = blueprint_new_nexus - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .filter_map(|(_sled_id, zone_cfg, nexus_config)| { (nexus_config.nexus_generation == next_generation).then(|| { ( @@ -495,7 +495,7 @@ async fn check_internal_dns( // Compute what we expect to find, based on which Nexus instances in the // blueprint have the specified generation. let expected_nexus_addrs = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .filter_map(|(_sled_id, _zone_cfg, nexus_config)| { (nexus_config.nexus_generation == active_generation) .then_some(nexus_config.internal_address) @@ -559,7 +559,7 @@ async fn check_external_dns( // Compute which Nexus instances we expect to find in external DNS based on // what's in-service in the blueprint. let expected_nexus_addrs = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .filter_map(|(_sled_id, _zone_cfg, nexus_config)| { (nexus_config.nexus_generation == active_generation) .then_some(nexus_config.external_ip.ip) diff --git a/nexus/reconfigurator/blippy/src/checks.rs b/nexus/reconfigurator/blippy/src/checks.rs index 64d0806c8ed..281868a2723 100644 --- a/nexus/reconfigurator/blippy/src/checks.rs +++ b/nexus/reconfigurator/blippy/src/checks.rs @@ -65,10 +65,7 @@ fn check_underlay_ips(blippy: &mut Blippy<'_>) { > = BTreeMap::new(); let mut rack_dns_subnets: BTreeSet = BTreeSet::new(); - for (sled_id, zone) in blippy - .blueprint() - .in_service_zones() - { + for (sled_id, zone) in blippy.blueprint().in_service_zones() { let ip = zone.underlay_ip(); // There should be no duplicate underlay IPs. @@ -158,10 +155,8 @@ fn check_external_networking(blippy: &mut Blippy<'_>) { let mut used_nic_ips = BTreeMap::new(); let mut used_nic_macs = BTreeMap::new(); - for (sled_id, zone, external_ip, nic) in blippy - .blueprint() - .in_service_zones() - .filter_map(|(sled_id, zone)| { + for (sled_id, zone, external_ip, nic) in + blippy.blueprint().in_service_zones().filter_map(|(sled_id, zone)| { zone.zone_type .external_networking() .map(|(external_ip, nic)| (sled_id, zone, external_ip, nic)) @@ -263,10 +258,7 @@ fn check_dataset_zpool_uniqueness(blippy: &mut Blippy<'_>) { // On any given zpool, we should have at most one zone of any given // kind. - for (sled_id, zone) in blippy - .blueprint() - .in_service_zones() - { + for (sled_id, zone) in blippy.blueprint().in_service_zones() { // Check "one kind per zpool" for transient datasets... 
let filesystem_dataset = zone.filesystem_dataset(); let kind = zone.zone_type.kind(); @@ -689,10 +681,7 @@ fn check_nexus_generation_consistency(blippy: &mut Blippy<'_>) { > = HashMap::new(); // Collect all Nexus zones and their generations - for (sled_id, zone) in blippy - .blueprint() - .in_service_zones() - { + for (sled_id, zone) in blippy.blueprint().in_service_zones() { if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { generation_info.entry(nexus.nexus_generation).or_default().push(( sled_id, @@ -1751,7 +1740,7 @@ mod tests { let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); let crucible_addr_by_zpool = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_, z)| match z.zone_type { BlueprintZoneType::Crucible( blueprint_zone_type::Crucible { address, .. }, @@ -1996,7 +1985,7 @@ mod tests { // Find the Nexus zones let ((sled1, zone1_id), (sled2, zone2_id)) = { let nexus_zones: Vec<_> = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(sled_id, zone)| { if matches!(zone.zone_type, BlueprintZoneType::Nexus(_)) { Some((sled_id, zone)) @@ -2142,8 +2131,9 @@ fn check_planning_input_network_records_appear_in_blueprint( // constructed above in `BuilderExternalNetworking::new()`, we do not // check for duplicates here: we could very well see reuse of IPs // between expunged zones or between expunged -> running zones. - for (_, z) in - blippy.blueprint().danger_all_omicron_zones(BlueprintZoneDisposition::any) + for (_, z) in blippy + .blueprint() + .danger_all_omicron_zones(BlueprintZoneDisposition::any) { let zone_type = &z.zone_type; match zone_type { diff --git a/nexus/reconfigurator/execution/src/database.rs b/nexus/reconfigurator/execution/src/database.rs index df96d8bab3b..000693c49b8 100644 --- a/nexus/reconfigurator/execution/src/database.rs +++ b/nexus/reconfigurator/execution/src/database.rs @@ -31,7 +31,7 @@ pub(crate) async fn deploy_db_metadata_nexus_records( // The actual generation number that's currently active is necessarily the // generation number of the Nexus instance that's doing the execution. 
let active_generation = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .find_map(|(_sled_id, zone_cfg, nexus_config)| { (zone_cfg.id == nexus_id).then_some(nexus_config.nexus_generation) }) @@ -45,7 +45,7 @@ pub(crate) async fn deploy_db_metadata_nexus_records( let mut active = BTreeSet::new(); let mut not_yet = BTreeSet::new(); for (_sled_id, zone_config, nexus_config) in - blueprint.all_nexus_zones(BlueprintZoneDisposition::is_in_service) + blueprint.in_service_nexus_zones() { if nexus_config.nexus_generation == active_generation { active.insert(zone_config.id); diff --git a/nexus/reconfigurator/execution/src/sagas.rs b/nexus/reconfigurator/execution/src/sagas.rs index d973c2c75fe..a8deb511382 100644 --- a/nexus/reconfigurator/execution/src/sagas.rs +++ b/nexus/reconfigurator/execution/src/sagas.rs @@ -7,7 +7,9 @@ use nexus_db_model::SecId; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; -use nexus_types::deployment::{Blueprint, BlueprintZoneDisposition}; +use nexus_types::deployment::{ + Blueprint, BlueprintExpungedZoneAccessReason, BlueprintZoneDisposition, +}; use omicron_common::api::external::Error; use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid}; use slog::{debug, info, warn}; @@ -87,7 +89,9 @@ fn find_expunged_same_generation( let active_nexus_generation = blueprint.find_generation_for_self(nexus_zone_id)?; Ok(blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_ready_for_cleanup) + .expunged_nexus_zones_ready_for_cleanup( + BlueprintExpungedZoneAccessReason::NexusSagaReassignment, + ) .filter_map(|(_sled_id, zone_config, nexus_config)| { (nexus_config.nexus_generation == active_nexus_generation) .then_some(zone_config.id) @@ -126,7 +130,7 @@ mod test { ExampleSystemBuilder::new(log, TEST_NAME).nexus_count(4).build(); let g1 = Generation::new(); let g1_nexus_ids: Vec<_> = blueprint1 - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .map(|(sled_id, zone_config, nexus_config)| { assert_eq!(nexus_config.nexus_generation, g1); (sled_id, zone_config.id, zone_config.image_source.clone()) @@ -181,7 +185,7 @@ mod test { let blueprint2 = builder.build(BlueprintSource::Test); let g2_nexus_ids: Vec<_> = blueprint2 - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .filter_map(|(sled_id, zone_config, nexus_config)| { (nexus_config.nexus_generation == g2) .then_some((sled_id, zone_config.id)) diff --git a/nexus/reconfigurator/execution/src/test_utils.rs b/nexus/reconfigurator/execution/src/test_utils.rs index 737a2b16b59..ffdd62a92ad 100644 --- a/nexus/reconfigurator/execution/src/test_utils.rs +++ b/nexus/reconfigurator/execution/src/test_utils.rs @@ -44,7 +44,7 @@ pub(crate) async fn realize_blueprint_and_expect( // Act on behalf of one of the Nexus instances that could be currently // active in the blueprint. 
let nexus_id = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .find_map(|(_sled_id, zone_config, nexus_config)| { (nexus_config.nexus_generation == blueprint.nexus_generation) .then_some(zone_config.id) diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index 84bb23d7f66..77fbe9f1f1f 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -730,7 +730,7 @@ impl ExampleSystemBuilder { // Find and set the set of active Nexuses let active_nexus_zone_ids: BTreeSet<_> = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .filter_map(|(_, zone, nexus_zone)| { if nexus_zone.nexus_generation == blueprint.nexus_generation { Some(zone.id) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index b2f32cc0062..e390511b6f4 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -365,17 +365,13 @@ impl Blueprint { .filter(move |(_, z)| filter(z.disposition)) } - /// Iterate over all Nexus zones that match the provided filter. - pub fn all_nexus_zones( + /// Iterate over all in-service Nexus zones. + pub fn in_service_nexus_zones( &self, - filter: F, ) -> impl Iterator< Item = (SledUuid, &BlueprintZoneConfig, &blueprint_zone_type::Nexus), - > - where - F: FnMut(BlueprintZoneDisposition) -> bool, - { - self.danger_all_omicron_zones(filter).filter_map(|(sled_id, zone)| { + > { + self.in_service_zones().filter_map(|(sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus_config) = &zone.zone_type { Some((sled_id, zone, nexus_config)) } else { @@ -384,6 +380,26 @@ impl Blueprint { }) } + /// Iterate over all expunged Nexus zones that are ready for cleanup (i.e., + /// have been confirmed to be shut down and will not restart). + pub fn expunged_nexus_zones_ready_for_cleanup( + &self, + reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator< + Item = (SledUuid, &BlueprintZoneConfig, &blueprint_zone_type::Nexus), + > { + self.expunged_zones_ready_for_cleanup(reason).filter_map( + |(sled_id, zone)| { + if let BlueprintZoneType::Nexus(nexus_config) = &zone.zone_type + { + Some((sled_id, zone, nexus_config)) + } else { + None + } + }, + ) + } + /// Iterate over the [`BlueprintPhysicalDiskConfig`] instances in the /// blueprint that match the provided filter, along with the associated /// sled id. 
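A minimal usage sketch for the two Nexus-zone accessors added in the hunk above, assuming the `nexus_types::deployment` paths used elsewhere in this series; the helper name `partition_nexus_zones` and the choice of the `Test` access reason are illustrative assumptions, not code from the patch:

use std::collections::BTreeSet;

use nexus_types::deployment::{Blueprint, BlueprintExpungedZoneAccessReason};
use omicron_uuid_kinds::OmicronZoneUuid;

/// Split Nexus zone ids into those serving the blueprint's active generation
/// and those that are expunged and confirmed safe to clean up.
fn partition_nexus_zones(
    blueprint: &Blueprint,
) -> (BTreeSet<OmicronZoneUuid>, BTreeSet<OmicronZoneUuid>) {
    let active = blueprint
        .in_service_nexus_zones()
        .filter(|(_sled_id, _zone, nexus)| {
            nexus.nexus_generation == blueprint.nexus_generation
        })
        .map(|(_sled_id, zone, _nexus)| zone.id)
        .collect();
    let cleanable = blueprint
        .expunged_nexus_zones_ready_for_cleanup(
            // Placeholder reason; real callers pass a variant describing why
            // they need expunged zones (e.g. saga reassignment).
            BlueprintExpungedZoneAccessReason::Test,
        )
        .map(|(_sled_id, zone, _nexus)| zone.id)
        .collect();
    (active, cleanable)
}

The explicit reason argument presumably exists so that every remaining consumer of expunged zones states why it needs them, making such accesses easy to audit compared with the old catch-all filter.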
@@ -484,9 +500,7 @@ impl Blueprint { nexus_zones: &BTreeSet, ) -> Result, anyhow::Error> { let mut r#gen = None; - for (_, zone, nexus_zone) in - self.all_nexus_zones(BlueprintZoneDisposition::is_in_service) - { + for (_, zone, nexus_zone) in self.in_service_nexus_zones() { if nexus_zones.contains(&zone.id) { let found_gen = nexus_zone.nexus_generation; if let Some(r#gen) = r#gen { @@ -508,11 +522,16 @@ impl Blueprint { &self, nexus_id: OmicronZoneUuid, ) -> Result { - for (_sled_id, zone_config, nexus_config) in - self.all_nexus_zones(BlueprintZoneDisposition::could_be_running) - { - if zone_config.id == nexus_id { - return Ok(nexus_config.nexus_generation); + // TODO-john + for (_sled_id, zone_config) in self.danger_all_omicron_zones( + BlueprintZoneDisposition::could_be_running, + ) { + if let BlueprintZoneType::Nexus(nexus_config) = + &zone_config.zone_type + { + if zone_config.id == nexus_id { + return Ok(nexus_config.nexus_generation); + } } } @@ -651,6 +670,8 @@ pub enum BlueprintExpungedZoneAccessReason { /// TODO-john DeallocateExternalNetworkingResources, /// TODO-john + NexusSagaReassignment, + /// TODO-john Test, } diff --git a/nexus/types/src/deployment/execution/utils.rs b/nexus/types/src/deployment/execution/utils.rs index 3e6f745b871..afabda9bf05 100644 --- a/nexus/types/src/deployment/execution/utils.rs +++ b/nexus/types/src/deployment/execution/utils.rs @@ -90,7 +90,7 @@ pub fn blueprint_nexus_external_ips( active_generation: Generation, ) -> Vec { blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .filter_map(|(_sled_id, _zone_config, nexus_config)| { (nexus_config.nexus_generation == active_generation) .then_some(nexus_config.external_ip.ip) From db9769eaf93f60c4bbf3f2414e4ccf62061b944d Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 12 Dec 2025 13:36:36 -0500 Subject: [PATCH 07/25] break up `all_nexus_zones()` --- nexus/db-queries/src/db/datastore/support_bundle.rs | 7 +++++-- nexus/types/src/deployment.rs | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 4077c7731a9..34bb51a4db4 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -21,6 +21,7 @@ use nexus_db_errors::OptionalError; use nexus_db_errors::public_error_from_diesel; use nexus_db_lookup::LookupPath; use nexus_types::deployment::BlueprintDatasetDisposition; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneDisposition; use omicron_common::api::external; use omicron_common::api::external::CreateResult; @@ -227,8 +228,10 @@ impl DataStore { // For this blueprint: The set of all expunged Nexus zones that are // ready for cleanup let invalid_nexus_zones = blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) - .filter_map(|(_sled, zone)| { + .expunged_nexus_zones_ready_for_cleanup( + BlueprintExpungedZoneAccessReason::NexusSupportBundleMarkFailed, + ) + .filter_map(|(_sled, zone, _nexus_config)| { if zone.zone_type.is_nexus() { Some(zone.id.into_untyped_uuid()) } else { diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index e390511b6f4..bd4db74769a 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -672,6 +672,8 @@ pub enum BlueprintExpungedZoneAccessReason { /// TODO-john NexusSagaReassignment, /// TODO-john + 
NexusSupportBundleMarkFailed, + /// TODO-john Test, } From cb9d939c5783d5e9e4af293372dece1af0586035 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Fri, 12 Dec 2025 14:34:01 -0500 Subject: [PATCH 08/25] fixup after rebase --- nexus/reconfigurator/execution/src/omicron_zones.rs | 12 +++++++++--- nexus/types/src/deployment.rs | 6 +++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index 58b59d5234c..073a473bfcf 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -95,7 +95,9 @@ fn clean_up_expunged_nexus_zones<'a>( blueprint: &'a Blueprint, ) -> impl Stream)> + 'a { let zones_to_clean_up = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) + .danger_all_omicron_zones( + BlueprintZoneDisposition::is_ready_for_cleanup, + ) .filter(|(_sled_id, zone)| zone.zone_type.is_nexus()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -119,7 +121,9 @@ fn clean_up_expunged_cockroach_zones<'a, R: CleanupResolver>( blueprint: &'a Blueprint, ) -> impl Stream)> + 'a { let zones_to_clean_up = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) + .danger_all_omicron_zones( + BlueprintZoneDisposition::is_ready_for_cleanup, + ) .filter(|(_sled_id, zone)| zone.zone_type.is_cockroach()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -142,7 +146,9 @@ fn clean_up_expunged_oximeter_zones<'a>( blueprint: &'a Blueprint, ) -> impl Stream)> + 'a { let zones_to_clean_up = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) + .danger_all_omicron_zones( + BlueprintZoneDisposition::is_ready_for_cleanup, + ) .filter(|(_sled_id, zone)| zone.zone_type.is_oximeter()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index bd4db74769a..5d892ba43bd 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -560,7 +560,7 @@ impl Blueprint { // zones. (Real racks will always have at least one in-service boundary // NTP zone, but some test or test systems may have 0 if they have only // a single sled and that sled's boundary NTP zone is being upgraded.) - self.all_omicron_zones(BlueprintZoneDisposition::any).find_map( + self.danger_all_omicron_zones(BlueprintZoneDisposition::any).find_map( |(_sled_id, zone)| match &zone.zone_type { BlueprintZoneType::BoundaryNtp(ntp_config) => { Some(UpstreamNtpConfig { @@ -590,7 +590,7 @@ impl Blueprint { // zones. (Real racks will always have at least one in-service Nexus // zone - the one calling this code - but some tests create blueprints // without any.) - self.all_omicron_zones(BlueprintZoneDisposition::any).find_map( + self.danger_all_omicron_zones(BlueprintZoneDisposition::any).find_map( |(_sled_id, zone)| match &zone.zone_type { BlueprintZoneType::Nexus(nexus_config) => { Some(OperatorNexusConfig { @@ -641,7 +641,7 @@ impl Blueprint { // don't check for that here. That would be an illegal blueprint; blippy // would complain, and attempting to execute it would fail due to // database constraints on external IP uniqueness. 
- self.all_omicron_zones(BlueprintZoneDisposition::any) + self.danger_all_omicron_zones(BlueprintZoneDisposition::any) .filter_map(|(_id, zone)| match &zone.zone_type { BlueprintZoneType::ExternalDns(dns) => { Some(dns.dns_address.addr.ip()) From 91b716345bf81cd900c3d0d9f36babda2f9a2955 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 15 Dec 2025 11:48:14 -0500 Subject: [PATCH 09/25] remove more danger_all_omicron_zones() uses --- .../execution/src/omicron_zones.rs | 16 +++++++--------- nexus/types/src/deployment.rs | 6 ++++++ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index 073a473bfcf..566a0bc4887 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -18,6 +18,7 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_db_queries::db::datastore::CollectorReassignment; use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneDisposition; use omicron_common::address::COCKROACH_ADMIN_PORT; use omicron_uuid_kinds::GenericUuid; @@ -94,10 +95,9 @@ fn clean_up_expunged_nexus_zones<'a>( datastore: &'a DataStore, blueprint: &'a Blueprint, ) -> impl Stream)> + 'a { + use BlueprintExpungedZoneAccessReason::NexusDeleteMetadataRecord; let zones_to_clean_up = blueprint - .danger_all_omicron_zones( - BlueprintZoneDisposition::is_ready_for_cleanup, - ) + .expunged_zones_ready_for_cleanup(NexusDeleteMetadataRecord) .filter(|(_sled_id, zone)| zone.zone_type.is_nexus()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -120,10 +120,9 @@ fn clean_up_expunged_cockroach_zones<'a, R: CleanupResolver>( resolver: &'a R, blueprint: &'a Blueprint, ) -> impl Stream)> + 'a { + use BlueprintExpungedZoneAccessReason::CockroachDecommission; let zones_to_clean_up = blueprint - .danger_all_omicron_zones( - BlueprintZoneDisposition::is_ready_for_cleanup, - ) + .expunged_zones_ready_for_cleanup(CockroachDecommission) .filter(|(_sled_id, zone)| zone.zone_type.is_cockroach()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -145,10 +144,9 @@ fn clean_up_expunged_oximeter_zones<'a>( datastore: &'a DataStore, blueprint: &'a Blueprint, ) -> impl Stream)> + 'a { + use BlueprintExpungedZoneAccessReason::OximeterExpungeAndReassignProducers; let zones_to_clean_up = blueprint - .danger_all_omicron_zones( - BlueprintZoneDisposition::is_ready_for_cleanup, - ) + .expunged_zones_ready_for_cleanup(OximeterExpungeAndReassignProducers) .filter(|(_sled_id, zone)| zone.zone_type.is_oximeter()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 5d892ba43bd..62716724fd6 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -667,13 +667,19 @@ pub struct OperatorNexusConfig<'a> { /// TODO-john pub enum BlueprintExpungedZoneAccessReason { + /// TODO-john + CockroachDecommission, /// TODO-john DeallocateExternalNetworkingResources, /// TODO-john + NexusDeleteMetadataRecord, + /// TODO-john NexusSagaReassignment, /// TODO-john NexusSupportBundleMarkFailed, /// TODO-john + OximeterExpungeAndReassignProducers, + /// TODO-john Test, } From fbf8663bb94035fa886fc7e66c7c6b667038a905 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 15 Dec 
2025 11:59:33 -0500 Subject: [PATCH 10/25] add clickhouse IP expunge zone access reason --- .../execution/src/clickhouse.rs | 26 ++++++++++++------- nexus/types/src/deployment.rs | 2 ++ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/nexus/reconfigurator/execution/src/clickhouse.rs b/nexus/reconfigurator/execution/src/clickhouse.rs index 1601a5a2532..ec1700e6f7a 100644 --- a/nexus/reconfigurator/execution/src/clickhouse.rs +++ b/nexus/reconfigurator/execution/src/clickhouse.rs @@ -23,6 +23,7 @@ use futures::stream::FuturesUnordered; use futures::stream::StreamExt; use nexus_db_queries::context::OpContext; use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::ClickhouseClusterConfig; @@ -45,9 +46,11 @@ pub(crate) async fn deploy_nodes( blueprint: &Blueprint, clickhouse_cluster_config: &ClickhouseClusterConfig, ) -> Result<(), Vec> { - // Important: We must continue to pass in `BlueprintZoneDisposition::any` - // here, instead of `BlueprintZoneDisposition::is_in_service`, as would - // be expected. + use BlueprintExpungedZoneAccessReason::ClickhouseKeeperServerConfigIps; + + // Important: We must look at all zones here, including expunged zones in + // both "not ready for cleanup" and "ready for cleanup" states, instead of + // just in-service zones, as would be expected. // // We can only add or remove one clickhouse keeper node at a time, // and the planner generates the `ClickhouseClusterConfig` under this @@ -72,12 +75,17 @@ pub(crate) async fn deploy_nodes( // `ClickhouseClusterConfig`. // // This is tracked in https://github.com/oxidecomputer/omicron/issues/7724 - deploy_nodes_impl( - opctx, - blueprint.danger_all_omicron_zones(BlueprintZoneDisposition::any), - clickhouse_cluster_config, - ) - .await + let all_zones = + blueprint + .in_service_zones() + .chain(blueprint.expunged_zones_not_ready_for_cleanup( + ClickhouseKeeperServerConfigIps, + )) + .chain(blueprint.expunged_zones_ready_for_cleanup( + ClickhouseKeeperServerConfigIps, + )); + + deploy_nodes_impl(opctx, all_zones, clickhouse_cluster_config).await } async fn deploy_nodes_impl<'a, I>( diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 62716724fd6..7799b57c394 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -667,6 +667,8 @@ pub struct OperatorNexusConfig<'a> { /// TODO-john pub enum BlueprintExpungedZoneAccessReason { + /// TODO-john + ClickhouseKeeperServerConfigIps, /// TODO-john CockroachDecommission, /// TODO-john From 5a5b07975dc9fd088bc1a2326a183023972a87e6 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 15 Dec 2025 13:03:12 -0500 Subject: [PATCH 11/25] combine two expunged_zones methods into one with a `ReadyForCleanup` arg --- dev-tools/omdb/src/bin/omdb/db.rs | 28 +++++---- live-tests/tests/test_nexus_add_remove.rs | 7 ++- .../db-queries/src/db/datastore/deployment.rs | 17 ++---- .../execution/src/clickhouse.rs | 19 +++---- .../execution/src/omicron_zones.rs | 21 +++++-- .../tests/integration_tests/planner.rs | 10 +++- nexus/types/src/deployment.rs | 57 ++++++++++--------- 7 files changed, 89 insertions(+), 70 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 7d0831bffe6..51e07697a14 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -131,9 +131,11 @@ use 
nexus_db_queries::db::pagination::paginated; use nexus_db_queries::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; use nexus_db_queries::db::queries::region_allocation; use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::DiskFilter; +use nexus_types::deployment::ReadyForCleanup; use nexus_types::deployment::SledFilter; use nexus_types::external_api::params; use nexus_types::external_api::views::PhysicalDiskPolicy; @@ -1622,16 +1624,22 @@ async fn lookup_service_info( service_id: Uuid, blueprint: &Blueprint, ) -> anyhow::Result> { - let Some(zone_config) = blueprint - .danger_all_omicron_zones(BlueprintZoneDisposition::any) - .find_map(|(_sled_id, zone_config)| { - if zone_config.id.into_untyped_uuid() == service_id { - Some(zone_config) - } else { - None - } - }) - else { + // We don't know anything about `service_id`; it may be in-service or it may + // be expunged. Check all the zone states. + let mut all_zones = blueprint.in_service_zones().chain( + blueprint.expunged_zones( + ReadyForCleanup::Both, + BlueprintExpungedZoneAccessReason::Omdb, + ), + ); + + let Some(zone_config) = all_zones.find_map(|(_sled_id, zone_config)| { + if zone_config.id.into_untyped_uuid() == service_id { + Some(zone_config) + } else { + None + } + }) else { return Ok(None); }; diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index 913e4b77295..fc760cd28a9 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -23,6 +23,7 @@ use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::PlannerConfig; +use nexus_types::deployment::ReadyForCleanup; use nexus_types::deployment::SledFilter; use nexus_types::deployment::blueprint_zone_type; use omicron_common::address::NEXUS_LOCKSTEP_PORT; @@ -208,7 +209,8 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .await .expect("editing blueprint to expunge zone"); let (_, expunged_zone_config) = blueprint3 - .expunged_zones_not_ready_for_cleanup( + .expunged_zones( + ReadyForCleanup::No, BlueprintExpungedZoneAccessReason::Test, ) .find(|(_sled_id, zone_config)| zone_config.id == new_zone.id) @@ -309,7 +311,8 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // We don't need to check this here. It just provides a better error // message if something has gone wrong up to this point. 
let (_, expunged_zone_config) = new_blueprint - .expunged_zones_ready_for_cleanup( + .expunged_zones( + ReadyForCleanup::Yes, BlueprintExpungedZoneAccessReason::Test, ) .find(|(_sled_id, zone_config)| zone_config.id == new_zone.id) diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 4759a00f43a..6d07715b9da 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -91,6 +91,7 @@ use nexus_types::deployment::PendingMgsUpdateRotBootloaderDetails; use nexus_types::deployment::PendingMgsUpdateRotDetails; use nexus_types::deployment::PendingMgsUpdateSpDetails; use nexus_types::deployment::PendingMgsUpdates; +use nexus_types::deployment::ReadyForCleanup; use nexus_types::inventory::BaseboardId; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; @@ -1966,17 +1967,11 @@ impl DataStore { // // blueprint - .expunged_zones_ready_for_cleanup( + .expunged_zones( + ReadyForCleanup::Both, BlueprintExpungedZoneAccessReason ::DeallocateExternalNetworkingResources ) - .chain( - blueprint - .expunged_zones_not_ready_for_cleanup( - BlueprintExpungedZoneAccessReason - ::DeallocateExternalNetworkingResources - ) - ) .map(|(_sled_id, zone)| zone), ) .await @@ -4529,12 +4524,10 @@ mod tests { fn assert_all_zones_in_service(blueprint: &Blueprint) { let not_in_service = blueprint - .expunged_zones_not_ready_for_cleanup( + .expunged_zones( + ReadyForCleanup::Both, BlueprintExpungedZoneAccessReason::Test, ) - .chain(blueprint.expunged_zones_ready_for_cleanup( - BlueprintExpungedZoneAccessReason::Test, - )) .collect::>(); assert!( not_in_service.is_empty(), diff --git a/nexus/reconfigurator/execution/src/clickhouse.rs b/nexus/reconfigurator/execution/src/clickhouse.rs index ec1700e6f7a..263f0a6b16e 100644 --- a/nexus/reconfigurator/execution/src/clickhouse.rs +++ b/nexus/reconfigurator/execution/src/clickhouse.rs @@ -27,6 +27,7 @@ use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::ClickhouseClusterConfig; +use nexus_types::deployment::ReadyForCleanup; use omicron_common::address::CLICKHOUSE_ADMIN_PORT; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; @@ -46,8 +47,6 @@ pub(crate) async fn deploy_nodes( blueprint: &Blueprint, clickhouse_cluster_config: &ClickhouseClusterConfig, ) -> Result<(), Vec> { - use BlueprintExpungedZoneAccessReason::ClickhouseKeeperServerConfigIps; - // Important: We must look at all zones here, including expunged zones in // both "not ready for cleanup" and "ready for cleanup" states, instead of // just in-service zones, as would be expected. @@ -75,15 +74,13 @@ pub(crate) async fn deploy_nodes( // `ClickhouseClusterConfig`. 
// // This is tracked in https://github.com/oxidecomputer/omicron/issues/7724 - let all_zones = - blueprint - .in_service_zones() - .chain(blueprint.expunged_zones_not_ready_for_cleanup( - ClickhouseKeeperServerConfigIps, - )) - .chain(blueprint.expunged_zones_ready_for_cleanup( - ClickhouseKeeperServerConfigIps, - )); + use BlueprintExpungedZoneAccessReason::ClickhouseKeeperServerConfigIps; + let all_zones = blueprint.in_service_zones().chain( + blueprint.expunged_zones( + ReadyForCleanup::Both, + ClickhouseKeeperServerConfigIps, + ), + ); deploy_nodes_impl(opctx, all_zones, clickhouse_cluster_config).await } diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index 566a0bc4887..c28bf1ee77f 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -19,7 +19,7 @@ use nexus_db_queries::db::DataStore; use nexus_db_queries::db::datastore::CollectorReassignment; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintExpungedZoneAccessReason; -use nexus_types::deployment::BlueprintZoneDisposition; +use nexus_types::deployment::ReadyForCleanup; use omicron_common::address::COCKROACH_ADMIN_PORT; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; @@ -97,7 +97,10 @@ fn clean_up_expunged_nexus_zones<'a>( ) -> impl Stream)> + 'a { use BlueprintExpungedZoneAccessReason::NexusDeleteMetadataRecord; let zones_to_clean_up = blueprint - .expunged_zones_ready_for_cleanup(NexusDeleteMetadataRecord) + .expunged_zones( + ReadyForCleanup::Yes, + NexusDeleteMetadataRecord, + ) .filter(|(_sled_id, zone)| zone.zone_type.is_nexus()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -122,7 +125,10 @@ fn clean_up_expunged_cockroach_zones<'a, R: CleanupResolver>( ) -> impl Stream)> + 'a { use BlueprintExpungedZoneAccessReason::CockroachDecommission; let zones_to_clean_up = blueprint - .expunged_zones_ready_for_cleanup(CockroachDecommission) + .expunged_zones( + ReadyForCleanup::Yes, + CockroachDecommission, + ) .filter(|(_sled_id, zone)| zone.zone_type.is_cockroach()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -146,7 +152,10 @@ fn clean_up_expunged_oximeter_zones<'a>( ) -> impl Stream)> + 'a { use BlueprintExpungedZoneAccessReason::OximeterExpungeAndReassignProducers; let zones_to_clean_up = blueprint - .expunged_zones_ready_for_cleanup(OximeterExpungeAndReassignProducers) + .expunged_zones( + ReadyForCleanup::Yes, + OximeterExpungeAndReassignProducers, + ) .filter(|(_sled_id, zone)| zone.zone_type.is_oximeter()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -332,8 +341,8 @@ mod test { use httptest::responders::{json_encoded, status_code}; use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::{ - BlueprintZoneConfig, BlueprintZoneImageSource, BlueprintZoneType, - blueprint_zone_type, + BlueprintZoneConfig, BlueprintZoneDisposition, + BlueprintZoneImageSource, BlueprintZoneType, blueprint_zone_type, }; use omicron_common::api::external::Generation; use omicron_common::zpool_name::ZpoolName; diff --git a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs index b8728db5f39..90f714149f6 100644 --- a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs +++ b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs @@ -29,6 +29,7 @@ use 
nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::CockroachDbPreserveDowngrade; use nexus_types::deployment::CockroachDbSettings; use nexus_types::deployment::OmicronZoneExternalSnatIp; +use nexus_types::deployment::ReadyForCleanup; use nexus_types::deployment::SledDisk; use nexus_types::deployment::TargetReleaseDescription; use nexus_types::deployment::blueprint_zone_type; @@ -2574,7 +2575,8 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { // We should have expunged our single-node clickhouse zone, but it won't be // `ready_for_cleanup` yet. let expunged_zones: Vec<_> = blueprint3 - .expunged_zones_not_ready_for_cleanup( + .expunged_zones( + ReadyForCleanup::No, BlueprintExpungedZoneAccessReason::Test, ) .map(|(_, z)| z.clone()) @@ -2599,7 +2601,8 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { // All our clickhouse keeper and server zones that we created when we // enabled our clickhouse policy should be expunged when we disable it. let expunged_zones: Vec<_> = blueprint4 - .expunged_zones_not_ready_for_cleanup( + .expunged_zones( + ReadyForCleanup::No, BlueprintExpungedZoneAccessReason::Test, ) .map(|(_, z)| z.clone()) @@ -3297,7 +3300,8 @@ fn test_update_crucible_pantry_before_nexus() { // All old Pantry zones should now be expunged and ready for cleanup. assert_eq!( blueprint - .expunged_zones_ready_for_cleanup( + .expunged_zones( + ReadyForCleanup::Yes, BlueprintExpungedZoneAccessReason::Test ) .filter(|(_, z)| is_old_pantry(z)) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 7799b57c394..7f473eb3922 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -317,32 +317,25 @@ impl Blueprint { } /// TODO-john - pub fn expunged_zones_not_ready_for_cleanup( + pub fn expunged_zones( &self, + ready_for_cleanup: ReadyForCleanup, _reason: BlueprintExpungedZoneAccessReason, ) -> impl Iterator { // TODO-john explain - self.danger_all_omicron_zones(|disposition| match disposition { - BlueprintZoneDisposition::InService => false, - BlueprintZoneDisposition::Expunged { - as_of_generation: _, - ready_for_cleanup, - } => !ready_for_cleanup, - }) - } - - /// TODO-john - pub fn expunged_zones_ready_for_cleanup( - &self, - _reason: BlueprintExpungedZoneAccessReason, - ) -> impl Iterator { - // TODO-john explain - self.danger_all_omicron_zones(|disposition| match disposition { - BlueprintZoneDisposition::InService => false, - BlueprintZoneDisposition::Expunged { - as_of_generation: _, - ready_for_cleanup, - } => ready_for_cleanup, + self.danger_all_omicron_zones(move |disposition| { + let this_zone_ready_for_cleanup = match disposition { + BlueprintZoneDisposition::InService => return false, + BlueprintZoneDisposition::Expunged { + as_of_generation: _, + ready_for_cleanup: this_zone_ready_for_cleanup, + } => this_zone_ready_for_cleanup, + }; + match ready_for_cleanup { + ReadyForCleanup::Yes => this_zone_ready_for_cleanup, + ReadyForCleanup::No => !this_zone_ready_for_cleanup, + ReadyForCleanup::Both => true, + } }) } @@ -388,16 +381,15 @@ impl Blueprint { ) -> impl Iterator< Item = (SledUuid, &BlueprintZoneConfig, &blueprint_zone_type::Nexus), > { - self.expunged_zones_ready_for_cleanup(reason).filter_map( - |(sled_id, zone)| { + self.expunged_zones(ReadyForCleanup::Yes, reason) + .filter_map(|(sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus_config) = &zone.zone_type { Some((sled_id, zone, nexus_config)) } else { None } - }, - ) + }) } /// Iterate over the 
[`BlueprintPhysicalDiskConfig`] instances in the @@ -666,7 +658,18 @@ pub struct OperatorNexusConfig<'a> { } /// TODO-john +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub enum ReadyForCleanup { + Yes, + No, + Both, +} + +/// TODO-john +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub enum BlueprintExpungedZoneAccessReason { + /// TODO-john + Blippy, /// TODO-john ClickhouseKeeperServerConfigIps, /// TODO-john @@ -680,6 +683,8 @@ pub enum BlueprintExpungedZoneAccessReason { /// TODO-john NexusSupportBundleMarkFailed, /// TODO-john + Omdb, + /// TODO-john OximeterExpungeAndReassignProducers, /// TODO-john Test, From 11ce5b4405aa9edfa8ce420713b7ba5cbaedfdad Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 15 Dec 2025 13:03:47 -0500 Subject: [PATCH 12/25] cargo fmt --- dev-tools/omdb/src/bin/omdb/db.rs | 7 ++--- dev-tools/reconfigurator-cli/src/lib.rs | 29 ++++++------------- .../execution/src/clickhouse.rs | 7 ++--- .../execution/src/omicron_zones.rs | 10 ++----- .../allocators/external_networking.rs | 4 +-- .../tasks/crdb_node_id_collector.rs | 24 +++++++-------- nexus/src/app/deployment.rs | 4 +-- nexus/src/app/quiesce.rs | 5 ++-- nexus/types/src/deployment.rs | 7 +++-- 9 files changed, 37 insertions(+), 60 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 51e07697a14..4a4d4fe450c 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -1626,12 +1626,11 @@ async fn lookup_service_info( ) -> anyhow::Result> { // We don't know anything about `service_id`; it may be in-service or it may // be expunged. Check all the zone states. - let mut all_zones = blueprint.in_service_zones().chain( - blueprint.expunged_zones( + let mut all_zones = + blueprint.in_service_zones().chain(blueprint.expunged_zones( ReadyForCleanup::Both, BlueprintExpungedZoneAccessReason::Omdb, - ), - ); + )); let Some(zone_config) = all_zones.find_map(|(_sled_id, zone_config)| { if zone_config.id.into_untyped_uuid() == service_id { diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs index ff1456222e6..6b494d291dc 100644 --- a/dev-tools/reconfigurator-cli/src/lib.rs +++ b/dev-tools/reconfigurator-cli/src/lib.rs @@ -159,9 +159,7 @@ impl ReconfiguratorSim { builder.set_external_dns_version(parent_blueprint.external_dns_version); // Handle zone networking setup first - for (_, zone) in parent_blueprint - .in_service_zones() - { + for (_, zone) in parent_blueprint.in_service_zones() { if let Some((external_ip, nic)) = zone.zone_type.external_networking() { @@ -213,9 +211,7 @@ impl ReconfiguratorSim { let active_nexus_gen = state.config().active_nexus_zone_generation(); let mut active_nexus_zones = BTreeSet::new(); - for (_, zone, nexus) in parent_blueprint - .in_service_nexus_zones() - { + for (_, zone, nexus) in parent_blueprint.in_service_nexus_zones() { if nexus.nexus_generation == active_nexus_gen { active_nexus_zones.insert(zone.id); } @@ -232,9 +228,7 @@ impl ReconfiguratorSim { let active_nexus_gen = state.config().active_nexus_zone_generation(); let mut not_yet_nexus_zones = BTreeSet::new(); - for (_, zone) in parent_blueprint - .in_service_zones() - { + for (_, zone) in parent_blueprint.in_service_zones() { match &zone.zone_type { nexus_types::deployment::BlueprintZoneType::Nexus( nexus, @@ -2493,17 +2487,12 @@ fn cmd_blueprint_edit( } BlueprintEditCommands::BumpNexusGeneration => { let current_generation = builder.nexus_generation(); - let current_max 
= blueprint - .in_service_nexus_zones() - .fold( - current_generation, - |current_max, (_sled_id, _zone_config, nexus_config)| { - std::cmp::max( - nexus_config.nexus_generation, - current_max, - ) - }, - ); + let current_max = blueprint.in_service_nexus_zones().fold( + current_generation, + |current_max, (_sled_id, _zone_config, nexus_config)| { + std::cmp::max(nexus_config.nexus_generation, current_max) + }, + ); ensure!( current_max > current_generation, "cannot bump blueprint generation (currently \ diff --git a/nexus/reconfigurator/execution/src/clickhouse.rs b/nexus/reconfigurator/execution/src/clickhouse.rs index 263f0a6b16e..88f77db2196 100644 --- a/nexus/reconfigurator/execution/src/clickhouse.rs +++ b/nexus/reconfigurator/execution/src/clickhouse.rs @@ -75,12 +75,11 @@ pub(crate) async fn deploy_nodes( // // This is tracked in https://github.com/oxidecomputer/omicron/issues/7724 use BlueprintExpungedZoneAccessReason::ClickhouseKeeperServerConfigIps; - let all_zones = blueprint.in_service_zones().chain( - blueprint.expunged_zones( + let all_zones = + blueprint.in_service_zones().chain(blueprint.expunged_zones( ReadyForCleanup::Both, ClickhouseKeeperServerConfigIps, - ), - ); + )); deploy_nodes_impl(opctx, all_zones, clickhouse_cluster_config).await } diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index c28bf1ee77f..a924ae65bc4 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -97,10 +97,7 @@ fn clean_up_expunged_nexus_zones<'a>( ) -> impl Stream)> + 'a { use BlueprintExpungedZoneAccessReason::NexusDeleteMetadataRecord; let zones_to_clean_up = blueprint - .expunged_zones( - ReadyForCleanup::Yes, - NexusDeleteMetadataRecord, - ) + .expunged_zones(ReadyForCleanup::Yes, NexusDeleteMetadataRecord) .filter(|(_sled_id, zone)| zone.zone_type.is_nexus()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -125,10 +122,7 @@ fn clean_up_expunged_cockroach_zones<'a, R: CleanupResolver>( ) -> impl Stream)> + 'a { use BlueprintExpungedZoneAccessReason::CockroachDecommission; let zones_to_clean_up = blueprint - .expunged_zones( - ReadyForCleanup::Yes, - CockroachDecommission, - ) + .expunged_zones(ReadyForCleanup::Yes, CockroachDecommission) .filter(|(_sled_id, zone)| zone.zone_type.is_cockroach()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs index e9bda4924eb..e1f41bbc870 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs @@ -86,9 +86,7 @@ impl ExternalNetworkingAllocator { external_ip_policy: &ExternalIpPolicy, ) -> anyhow::Result { Self::new( - blueprint - .in_service_zones() - .map(|(_sled_id, zone)| zone), + blueprint.in_service_zones().map(|(_sled_id, zone)| zone), external_ip_policy, ) } diff --git a/nexus/src/app/background/tasks/crdb_node_id_collector.rs b/nexus/src/app/background/tasks/crdb_node_id_collector.rs index 646f7829072..bc34a428e36 100644 --- a/nexus/src/app/background/tasks/crdb_node_id_collector.rs +++ b/nexus/src/app/background/tasks/crdb_node_id_collector.rs @@ -146,18 +146,18 @@ impl CockroachAdminFromBlueprint for CockroachAdminFromBlueprintViaFixedPort { 
// We can only actively collect from zones that should be in-service; if // there are CRDB zones in other states that still need their node ID // collected, we have to wait until they're running. - blueprint.in_service_zones().filter_map( - |(_sled_id, zone)| match &zone.zone_type { - BlueprintZoneType::CockroachDb( - blueprint_zone_type::CockroachDb { address, .. }, - ) => { - let mut admin_addr = *address; - admin_addr.set_port(COCKROACH_ADMIN_PORT); - Some((zone.id, admin_addr)) - } - _ => None, - }, - ) + blueprint.in_service_zones().filter_map(|(_sled_id, zone)| match &zone + .zone_type + { + BlueprintZoneType::CockroachDb( + blueprint_zone_type::CockroachDb { address, .. }, + ) => { + let mut admin_addr = *address; + admin_addr.set_port(COCKROACH_ADMIN_PORT); + Some((zone.id, admin_addr)) + } + _ => None, + }) } } diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index 28bfe4457d3..9706647d122 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -367,9 +367,7 @@ fn is_target_release_change_allowed( } // Now check zone configs. - for (_, zone_config) in current_blueprint - .in_service_zones() - { + for (_, zone_config) in current_blueprint.in_service_zones() { match &zone_config.image_source { BlueprintZoneImageSource::InstallDataset => { // A mupdate has occurred; we must allow a new target release. diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 0a7c7386902..477e4163195 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -351,9 +351,8 @@ async fn check_all_sagas_drained( // // This doesn't ever change once we've determined it once. But we don't // know what the value is until we see our first blueprint. - let Some(my_generation) = current_blueprint - .in_service_zones() - .find_map(|(_sled_id, zone)| { + let Some(my_generation) = + current_blueprint.in_service_zones().find_map(|(_sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { (zone.id == my_nexus_id).then_some(nexus.nexus_generation) } else { diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 7f473eb3922..d7a8c138ed8 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -381,15 +381,16 @@ impl Blueprint { ) -> impl Iterator< Item = (SledUuid, &BlueprintZoneConfig, &blueprint_zone_type::Nexus), > { - self.expunged_zones(ReadyForCleanup::Yes, reason) - .filter_map(|(sled_id, zone)| { + self.expunged_zones(ReadyForCleanup::Yes, reason).filter_map( + |(sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus_config) = &zone.zone_type { Some((sled_id, zone, nexus_config)) } else { None } - }) + }, + ) } /// Iterate over the [`BlueprintPhysicalDiskConfig`] instances in the From 05c81a96418914fd1138ec2e22c7e7816c3a385c Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 15 Dec 2025 13:06:52 -0500 Subject: [PATCH 13/25] last danger_all_omicron_zones() external caller --- nexus/reconfigurator/blippy/src/checks.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/nexus/reconfigurator/blippy/src/checks.rs b/nexus/reconfigurator/blippy/src/checks.rs index 281868a2723..750bf15cfa7 100644 --- a/nexus/reconfigurator/blippy/src/checks.rs +++ b/nexus/reconfigurator/blippy/src/checks.rs @@ -9,6 +9,7 @@ use crate::blippy::Severity; use crate::blippy::SledKind; use nexus_types::deployment::BlueprintDatasetConfig; use nexus_types::deployment::BlueprintDatasetDisposition; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use 
nexus_types::deployment::BlueprintHostPhase2DesiredContents; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; use nexus_types::deployment::BlueprintSledConfig; @@ -18,6 +19,7 @@ use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::OmicronZoneExternalIp; use nexus_types::deployment::PlanningInput; +use nexus_types::deployment::ReadyForCleanup; use nexus_types::deployment::SledFilter; use nexus_types::deployment::blueprint_zone_type; use omicron_common::address::DnsSubnet; @@ -2131,10 +2133,12 @@ fn check_planning_input_network_records_appear_in_blueprint( // constructed above in `BuilderExternalNetworking::new()`, we do not // check for duplicates here: we could very well see reuse of IPs // between expunged zones or between expunged -> running zones. - for (_, z) in blippy - .blueprint() - .danger_all_omicron_zones(BlueprintZoneDisposition::any) - { + for (_, z) in blippy.blueprint().in_service_zones().chain( + blippy.blueprint().expunged_zones( + ReadyForCleanup::Both, + BlueprintExpungedZoneAccessReason::Blippy, + ), + ) { let zone_type = &z.zone_type; match zone_type { BlueprintZoneType::BoundaryNtp(ntp) => { From e9c005b501d654893551c83e4e22d872f5d5cd1c Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 15 Dec 2025 13:13:58 -0500 Subject: [PATCH 14/25] unused imports --- dev-tools/omdb/src/bin/omdb/db/db_metadata.rs | 1 - live-tests/tests/test_nexus_handoff.rs | 1 - nexus/db-queries/src/db/datastore/rack.rs | 5 +---- nexus/db-queries/src/db/datastore/support_bundle.rs | 2 +- nexus/reconfigurator/blippy/src/checks.rs | 1 - nexus/reconfigurator/execution/src/clickhouse.rs | 2 +- nexus/reconfigurator/execution/src/database.rs | 1 - nexus/reconfigurator/execution/src/sagas.rs | 4 +--- nexus/reconfigurator/execution/src/test_utils.rs | 2 +- nexus/reconfigurator/planning/src/example.rs | 1 - nexus/src/app/background/tasks/crdb_node_id_collector.rs | 1 - nexus/src/app/deployment.rs | 2 +- nexus/src/app/quiesce.rs | 2 -- nexus/src/lib.rs | 1 - nexus/types/src/deployment/execution/utils.rs | 5 +---- 15 files changed, 7 insertions(+), 24 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs b/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs index 11afeb3d071..789b06e87ce 100644 --- a/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs +++ b/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs @@ -17,7 +17,6 @@ use nexus_db_model::DbMetadataNexusState; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; -use nexus_types::deployment::BlueprintZoneDisposition; use omicron_common::api::external::Generation; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::OmicronZoneUuid; diff --git a/live-tests/tests/test_nexus_handoff.rs b/live-tests/tests/test_nexus_handoff.rs index 7d3d5f9ea42..a73a20ff7a7 100644 --- a/live-tests/tests/test_nexus_handoff.rs +++ b/live-tests/tests/test_nexus_handoff.rs @@ -18,7 +18,6 @@ use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; use nexus_reconfigurator_planning::blueprint_editor::ExternalNetworkingAllocator; use nexus_reconfigurator_preparation::PlanningInputFromDb; use nexus_types::deployment::Blueprint; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::PlannerConfig; diff --git a/nexus/db-queries/src/db/datastore/rack.rs 
b/nexus/db-queries/src/db/datastore/rack.rs index 10dc3caac00..89cfd72f4f5 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -48,7 +48,6 @@ use nexus_db_model::SledUnderlaySubnetAllocation; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::OmicronZoneExternalIp; use nexus_types::deployment::blueprint_zone_type; @@ -1068,9 +1067,7 @@ mod test { use nexus_types::deployment::ExternalIpPolicy; use nexus_types::deployment::PendingMgsUpdates; use nexus_types::deployment::SledFilter; - use nexus_types::deployment::{ - BlueprintZoneDisposition, BlueprintZoneImageSource, OximeterReadMode, - }; + use nexus_types::deployment::{BlueprintZoneImageSource, OximeterReadMode}; use nexus_types::external_api::shared::SiloIdentityMode; use nexus_types::identity::Asset; use nexus_types::internal_api::params::DnsRecord; diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 34bb51a4db4..309ef03cab9 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -22,7 +22,6 @@ use nexus_db_errors::public_error_from_diesel; use nexus_db_lookup::LookupPath; use nexus_types::deployment::BlueprintDatasetDisposition; use nexus_types::deployment::BlueprintExpungedZoneAccessReason; -use nexus_types::deployment::BlueprintZoneDisposition; use omicron_common::api::external; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; @@ -547,6 +546,7 @@ mod test { use nexus_reconfigurator_planning::example::ExampleSystemBuilder; use nexus_reconfigurator_planning::example::SimRngState; use nexus_types::deployment::Blueprint; + use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use omicron_common::api::external::ByteCount; use omicron_common::api::external::LookupType; diff --git a/nexus/reconfigurator/blippy/src/checks.rs b/nexus/reconfigurator/blippy/src/checks.rs index 750bf15cfa7..eef6c91ffca 100644 --- a/nexus/reconfigurator/blippy/src/checks.rs +++ b/nexus/reconfigurator/blippy/src/checks.rs @@ -14,7 +14,6 @@ use nexus_types::deployment::BlueprintHostPhase2DesiredContents; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; use nexus_types::deployment::BlueprintSledConfig; use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::OmicronZoneExternalIp; diff --git a/nexus/reconfigurator/execution/src/clickhouse.rs b/nexus/reconfigurator/execution/src/clickhouse.rs index 88f77db2196..5c9c0711cb3 100644 --- a/nexus/reconfigurator/execution/src/clickhouse.rs +++ b/nexus/reconfigurator/execution/src/clickhouse.rs @@ -25,7 +25,6 @@ use nexus_db_queries::context::OpContext; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::ClickhouseClusterConfig; use nexus_types::deployment::ReadyForCleanup; use omicron_common::address::CLICKHOUSE_ADMIN_PORT; @@ -388,6 +387,7 @@ 
mod test { use clickhouse_admin_types::KeeperId; use clickhouse_admin_types::ServerId; use nexus_types::deployment::BlueprintZoneConfig; + use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::blueprint_zone_type; diff --git a/nexus/reconfigurator/execution/src/database.rs b/nexus/reconfigurator/execution/src/database.rs index 000693c49b8..64bb33c3bba 100644 --- a/nexus/reconfigurator/execution/src/database.rs +++ b/nexus/reconfigurator/execution/src/database.rs @@ -8,7 +8,6 @@ use anyhow::anyhow; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; -use nexus_types::deployment::BlueprintZoneDisposition; use omicron_uuid_kinds::OmicronZoneUuid; use std::collections::BTreeSet; diff --git a/nexus/reconfigurator/execution/src/sagas.rs b/nexus/reconfigurator/execution/src/sagas.rs index a8deb511382..5660c3a55a8 100644 --- a/nexus/reconfigurator/execution/src/sagas.rs +++ b/nexus/reconfigurator/execution/src/sagas.rs @@ -7,9 +7,7 @@ use nexus_db_model::SecId; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; -use nexus_types::deployment::{ - Blueprint, BlueprintExpungedZoneAccessReason, BlueprintZoneDisposition, -}; +use nexus_types::deployment::{Blueprint, BlueprintExpungedZoneAccessReason}; use omicron_common::api::external::Error; use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid}; use slog::{debug, info, warn}; diff --git a/nexus/reconfigurator/execution/src/test_utils.rs b/nexus/reconfigurator/execution/src/test_utils.rs index ffdd62a92ad..c606d384cb0 100644 --- a/nexus/reconfigurator/execution/src/test_utils.rs +++ b/nexus/reconfigurator/execution/src/test_utils.rs @@ -10,7 +10,7 @@ use internal_dns_resolver::Resolver; use nexus_db_queries::{context::OpContext, db::DataStore}; use nexus_types::{ deployment::{ - Blueprint, BlueprintZoneDisposition, PendingMgsUpdates, + Blueprint, PendingMgsUpdates, execution::{EventBuffer, Overridables}, }, quiesce::SagaQuiesceHandle, diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index 77fbe9f1f1f..f47bc679cb5 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -30,7 +30,6 @@ use nexus_types::deployment::BlueprintArtifactVersion; use nexus_types::deployment::BlueprintHostPhase2DesiredContents; use nexus_types::deployment::BlueprintHostPhase2DesiredSlots; use nexus_types::deployment::BlueprintSource; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::ExpectedVersion; use nexus_types::deployment::OmicronZoneNic; use nexus_types::deployment::PlanningInput; diff --git a/nexus/src/app/background/tasks/crdb_node_id_collector.rs b/nexus/src/app/background/tasks/crdb_node_id_collector.rs index bc34a428e36..c5957dc4d46 100644 --- a/nexus/src/app/background/tasks/crdb_node_id_collector.rs +++ b/nexus/src/app/background/tasks/crdb_node_id_collector.rs @@ -34,7 +34,6 @@ use futures::stream; use nexus_auth::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::blueprint_zone_type; use omicron_common::address::COCKROACH_ADMIN_PORT; diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index 
9706647d122..b4ea1d6f4cf 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -15,7 +15,6 @@ use nexus_types::deployment::BlueprintHostPhase2DesiredContents; use nexus_types::deployment::BlueprintMetadata; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::BlueprintTargetSet; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::PlannerConfig; use nexus_types::deployment::PlanningInput; @@ -416,6 +415,7 @@ mod tests { use super::*; use nexus_reconfigurator_planning::example::example; use nexus_types::deployment::BlueprintHostPhase2DesiredSlots; + use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::external_api::views::SledState; use omicron_common::api::external::Generation; use omicron_test_utils::dev::test_setup_log; diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 477e4163195..465729bfac6 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -12,7 +12,6 @@ use nexus_db_model::DbMetadataNexusState; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::internal_api::views::QuiesceState; use nexus_types::internal_api::views::QuiesceStatus; @@ -521,7 +520,6 @@ mod test { use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::BlueprintTargetSet; - use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::quiesce::SagaReassignmentDone; use omicron_test_utils::dev::poll::CondCheckError; use omicron_test_utils::dev::poll::wait_for_condition; diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index 2c9482878f9..aa6e3cb484b 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -29,7 +29,6 @@ use nexus_config::NexusConfig; use nexus_db_model::RendezvousDebugDataset; use nexus_db_queries::db; use nexus_types::deployment::Blueprint; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::blueprint_zone_type; use nexus_types::internal_api::params::{ diff --git a/nexus/types/src/deployment/execution/utils.rs b/nexus/types/src/deployment/execution/utils.rs index afabda9bf05..8e5c235de68 100644 --- a/nexus/types/src/deployment/execution/utils.rs +++ b/nexus/types/src/deployment/execution/utils.rs @@ -13,10 +13,7 @@ use omicron_uuid_kinds::SledUuid; use sled_agent_types_versions::latest::inventory::SledRole; use crate::{ - deployment::{ - Blueprint, BlueprintZoneDisposition, BlueprintZoneType, - blueprint_zone_type, - }, + deployment::{Blueprint, BlueprintZoneType, blueprint_zone_type}, external_api::views::SledPolicy, }; From 5e716e35d3cd11d4c8c7ada11c631493d1405032 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 15 Dec 2025 14:00:57 -0500 Subject: [PATCH 15/25] remove more internal danger_all_omicron_zones() calls --- nexus/types/src/deployment.rs | 92 +++++++++++++++++++++++------------ 1 file changed, 60 insertions(+), 32 deletions(-) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index d7a8c138ed8..6dbb181f8bb 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -312,17 +312,28 @@ impl Blueprint { pub fn in_service_zones( &self, ) -> impl Iterator { - // TODO-john explain + // Danger note: this call has no danger of 
accessing expunged zones, + // because we're filtering to in-service. self.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) } - /// TODO-john + /// Iterate over the expunged [`BlueprintZoneConfig`] instances in the + /// blueprint, along with the associated sled id. + /// + /// Each call must specify whether they want zones that are + /// `ready_for_cleanup` (i.e., have been confirmed to be shut down and will + /// not be restarted), and must also specify a + /// [`BlueprintExpungedZoneAccessReason`]. The latter allows us to + /// statically track all uses of expunged zones, each of which we must + /// account for in the planner's logic to permanently prune expunged zones + /// from the blueprint. pub fn expunged_zones( &self, ready_for_cleanup: ReadyForCleanup, _reason: BlueprintExpungedZoneAccessReason, ) -> impl Iterator { - // TODO-john explain + // Danger note: this call will definitely access expunged zones, but we + // know the caller has provided a known reason to do so. self.danger_all_omicron_zones(move |disposition| { let this_zone_ready_for_cleanup = match disposition { BlueprintZoneDisposition::InService => return false, @@ -342,8 +353,13 @@ impl Blueprint { /// Iterate over the [`BlueprintZoneConfig`] instances in the blueprint /// that match the provided filter, along with the associated sled id. /// - /// TODO-john Explain danger? Add reason arg? - pub fn danger_all_omicron_zones( + /// This method is prefixed with `danger_` and is private because it allows + /// the caller to potentially act on expunged zones without providing a + /// reason for doing so. It should only be called by `in_service_zones()` + /// and `expunged_zones()` above; all other callers that want access to + /// expunged zones must go through `expunged_zones()`, which forces them to + /// specify a [`BlueprintExpungedZoneAccessReason`]. + fn danger_all_omicron_zones( &self, mut filter: F, ) -> impl Iterator @@ -461,14 +477,19 @@ impl Blueprint { BlueprintDisplay { blueprint: self } } - /// Returns whether the given Nexus instance should be quiescing or quiesced - /// in preparation for handoff to the next generation + /// Returns whether the Nexus instance `nexus_id`, which is assumed to refer + /// to the currently-running Nexus instance (the current process), should be + /// quiescing or quiesced in preparation for handoff to the next generation pub fn is_nexus_quiescing( &self, nexus_id: OmicronZoneUuid, ) -> Result { let zone = self - .danger_all_omicron_zones(|_z| true) + .in_service_zones() + .chain(self.expunged_zones( + ReadyForCleanup::Both, + BlueprintExpungedZoneAccessReason::NexusSelfIsQuiescing, + )) .find(|(_sled_id, zone_config)| zone_config.id == nexus_id) .ok_or_else(|| { anyhow!("zone {} does not exist in blueprint", nexus_id) @@ -515,10 +536,12 @@ impl Blueprint { &self, nexus_id: OmicronZoneUuid, ) -> Result { - // TODO-john - for (_sled_id, zone_config) in self.danger_all_omicron_zones( - BlueprintZoneDisposition::could_be_running, - ) { + for (_sled_id, zone_config) in + self.in_service_zones().chain(self.expunged_zones( + ReadyForCleanup::Both, + BlueprintExpungedZoneAccessReason::NexusSelfGeneration, + )) + { if let BlueprintZoneType::Nexus(nexus_config) = &zone_config.zone_type { @@ -553,8 +576,12 @@ impl Blueprint { // zones. (Real racks will always have at least one in-service boundary // NTP zone, but some test or test systems may have 0 if they have only // a single sled and that sled's boundary NTP zone is being upgraded.) 
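// Editor's note (not part of the patch): a minimal sketch of how a caller
// might consume the carried-forward upstream NTP configuration described
// above. The helper name `upstream_ntp_server_count` is hypothetical; it
// assumes `Blueprint::upstream_ntp_config()` returns an `Option` whose
// `ntp_servers` field is a slice, as suggested by the surrounding diff.
fn upstream_ntp_server_count(blueprint: &Blueprint) -> usize {
    blueprint
        .upstream_ntp_config()
        .map(|cfg| cfg.ntp_servers.len())
        .unwrap_or(0)
}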
- self.danger_all_omicron_zones(BlueprintZoneDisposition::any).find_map( - |(_sled_id, zone)| match &zone.zone_type { + self.in_service_zones() + .chain(self.expunged_zones( + ReadyForCleanup::Both, + BlueprintExpungedZoneAccessReason::NtpUpstreamConfig, + )) + .find_map(|(_sled_id, zone)| match &zone.zone_type { BlueprintZoneType::BoundaryNtp(ntp_config) => { Some(UpstreamNtpConfig { ntp_servers: &ntp_config.ntp_servers, @@ -563,8 +590,7 @@ impl Blueprint { }) } _ => None, - }, - ) + }) } /// Return the operator-specified configuration of Nexus. @@ -583,8 +609,12 @@ impl Blueprint { // zones. (Real racks will always have at least one in-service Nexus // zone - the one calling this code - but some tests create blueprints // without any.) - self.danger_all_omicron_zones(BlueprintZoneDisposition::any).find_map( - |(_sled_id, zone)| match &zone.zone_type { + self.in_service_zones() + .chain(self.expunged_zones( + ReadyForCleanup::Both, + BlueprintExpungedZoneAccessReason::NexusExternalConfig, + )) + .find_map(|(_sled_id, zone)| match &zone.zone_type { BlueprintZoneType::Nexus(nexus_config) => { Some(OperatorNexusConfig { external_tls: nexus_config.external_tls, @@ -593,8 +623,7 @@ impl Blueprint { }) } _ => None, - }, - ) + }) } /// Returns the complete set of external IP addresses assigned to external @@ -634,7 +663,11 @@ impl Blueprint { // don't check for that here. That would be an illegal blueprint; blippy // would complain, and attempting to execute it would fail due to // database constraints on external IP uniqueness. - self.danger_all_omicron_zones(BlueprintZoneDisposition::any) + self.in_service_zones() + .chain(self.expunged_zones( + ReadyForCleanup::Both, + BlueprintExpungedZoneAccessReason::ExternalDnsExternalIps, + )) .filter_map(|(_id, zone)| match &zone.zone_type { BlueprintZoneType::ExternalDns(dns) => { Some(dns.dns_address.addr.ip()) @@ -666,28 +699,23 @@ pub enum ReadyForCleanup { Both, } -/// TODO-john +/// TODO-john docs on all variants #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub enum BlueprintExpungedZoneAccessReason { - /// TODO-john Blippy, - /// TODO-john ClickhouseKeeperServerConfigIps, - /// TODO-john CockroachDecommission, - /// TODO-john DeallocateExternalNetworkingResources, - /// TODO-john + ExternalDnsExternalIps, NexusDeleteMetadataRecord, - /// TODO-john + NexusExternalConfig, + NexusSelfGeneration, + NexusSelfIsQuiescing, NexusSagaReassignment, - /// TODO-john NexusSupportBundleMarkFailed, - /// TODO-john + NtpUpstreamConfig, Omdb, - /// TODO-john OximeterExpungeAndReassignProducers, - /// TODO-john Test, } From 62241ab5b0e8e6161c9a6b973922cbec168ed15b Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 15 Dec 2025 14:34:28 -0500 Subject: [PATCH 16/25] cleanup --- nexus/db-queries/src/db/datastore/support_bundle.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 309ef03cab9..86b1f4ed174 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -230,13 +230,7 @@ impl DataStore { .expunged_nexus_zones_ready_for_cleanup( BlueprintExpungedZoneAccessReason::NexusSupportBundleMarkFailed, ) - .filter_map(|(_sled, zone, _nexus_config)| { - if zone.zone_type.is_nexus() { - Some(zone.id.into_untyped_uuid()) - } else { - None - } - }) + .map(|(_sled, zone, _nexus_config)| zone.id.into_untyped_uuid()) .collect::>(); // For this blueprint: 
The set of expunged debug datasets From c2f79f29a9cf1161215b7335736ea9aa4807188e Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 15 Dec 2025 14:34:34 -0500 Subject: [PATCH 17/25] documentation --- nexus/types/src/deployment.rs | 130 ++++++++++++++++++++++++++++++++-- 1 file changed, 124 insertions(+), 6 deletions(-) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 6dbb181f8bb..eb8caa745fa 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -579,7 +579,7 @@ impl Blueprint { self.in_service_zones() .chain(self.expunged_zones( ReadyForCleanup::Both, - BlueprintExpungedZoneAccessReason::NtpUpstreamConfig, + BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig, )) .find_map(|(_sled_id, zone)| match &zone.zone_type { BlueprintZoneType::BoundaryNtp(ntp_config) => { Some(UpstreamNtpConfig { ntp_servers: &ntp_config.ntp_servers, @@ -691,31 +691,149 @@ pub struct OperatorNexusConfig<'a> { pub external_dns_servers: &'a [IpAddr], } -/// TODO-john +/// Argument to [`Blueprint::expunged_zones()`] specifying whether the caller +/// wants zones that are ready for cleanup, not ready for cleanup, or both. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub enum ReadyForCleanup { + /// Only return zones that are ready for cleanup (i.e., have been confirmed + /// shut down by sled-agent and will not be restarted). Yes, + /// Only return zones that are NOT YET ready for cleanup. These zones are + /// expunged but could still be running. No, + /// Return expunged zones regardless of their "ready for cleanup" state. Both, } -/// TODO-john docs on all variants +/// Argument to [`Blueprint::expunged_zones()`] requiring the caller to specify +/// the reason they want to access a blueprint's expunged zones. +/// +/// The planner is responsible for permanently pruning expunged zones from the +/// blueprint. However, doing so correctly requires waiting for any cleanup +/// actions that need to happen, all of which are specific to the zone kind. +/// For example, blueprint execution checks for expunged Oximeter zones to +/// perform metric producer reassignment, which means the planner cannot prune +/// an expunged Oximeter zone until it knows that all possible such +/// reassignments are complete. +/// +/// These reasons are not used at runtime; they exist solely to document and +/// attempt to statically guard against new code adding a new call to +/// `expunged_zones()` that the planner doesn't know about (and therefore +/// doesn't get updated to account for!). If you are attempting to call +/// `expunged_zones()` for a new reason, you must: +/// +/// 1. Add a new variant to this enum. +/// 2. Update the planner to account for it, so it does not prune the zone +/// before your use of it is complete. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub enum BlueprintExpungedZoneAccessReason { - Blippy, + // -------------------------------------------------------------------- + // Zone-kind-specific variants. Keep this sorted alphabetically, prefix + // them with the zone kind if applicable, and add details explaining the + // conditions the planner must consider during pruning. + // -------------------------------------------------------------------- + + /// Carrying forward the upstream NTP configuration provided by the operator + /// during rack setup; see [`Blueprint::upstream_ntp_config()`]. + /// + /// The planner must not prune a boundary NTP zone if it's the last zone + /// remaining with this configuration.
+ BoundaryNtpUpstreamConfig, + + /// Multinode Clickhouse configuration changes must operate one node at a + /// time, but the [`ClickhouseClusterConfig`] does not currently include the + /// IP addresses of all nodes. Blueprint execution therefore must scan + /// expunged nodes to find IP addresses. + /// + /// The planner must not prune a Clickhouse keeper or server zone if its + /// zone ID is contained in the current [`ClickhouseClusterConfig`]. ClickhouseKeeperServerConfigIps, + + /// The cockroachdb cluster should be instructed to decommission any + /// expunged cockroach nodes. + /// + /// The planner must not prune a CRDB zone if it has not yet been + /// decommissioned. CockroachDecommission, + + /// This is a catch-all variant for updating the `external_ip` and + /// `network_interface` tables for all zone types that have external + /// networking (Nexus, boundary NTP, and external DNS). + /// + /// The planner must not prune any of these zone types if their external + /// networking bits are still referenced by the active CRDB tables. DeallocateExternalNetworkingResources, + + /// Carrying forward the external DNS external IPs provided by the operator + /// during rack setup; see [`Blueprint::all_external_dns_external_ips()`]. + /// + /// The planner must not prune an external DNS zone if it's the last zone + /// remaining with this set of IPs. ExternalDnsExternalIps, + + /// After handoff, expunged Nexus zones must have their database access row + /// deleted. + /// + /// The planner must not prune a Nexus zone if it's still referenced in the + /// `db_metadata_nexus` table. NexusDeleteMetadataRecord, + + /// Carrying forward the external Nexus configuration provided by the + /// operator during rack setup; see [`Blueprint::operator_nexus_config()`]. + /// + /// The planner must not prune a Nexus zone if it's the last zone + /// remaining with this configuration. NexusExternalConfig, + + /// Nexus needs to determine its own generation. If the actively-running + /// Nexus has been expunged (but not yet shut down), it should still be able + /// to find its own generation! + /// + /// The planner does not need to account for this when pruning Nexus zones. NexusSelfGeneration, + + /// Nexus needs to know whether it itself should be quiescing. If the + /// actively-running Nexus has been expunged (but not yet shut down), it + /// should still be able to determine this! + /// + /// The planner does not need to account for this when pruning Nexus zones. NexusSelfIsQuiescing, + + /// Sagas assigned to any expunged Nexus must be reassigned to an + /// in-service Nexus. + /// + /// The planner must not prune a Nexus zone if it still has any sagas + /// assigned to it. NexusSagaReassignment, + + /// Support bundles assigned to an expunged Nexus must either be reassigned + /// or marked as failed. + /// + /// The planner must not prune a Nexus zone if it still has any support + /// bundles assigned to it. NexusSupportBundleMarkFailed, - NtpUpstreamConfig, - Omdb, + + /// An expunged Oximeter zone must be marked expunged in the `oximeter` CRDB + /// table (so Nexus knows to stop assigning producers to it), and any + /// producers previously assigned to it must be reassigned to new Oximeters. + /// + /// The planner must not prune an Oximeter zone if it's still eligible for + /// new producers or if it has any assigned producers.
OximeterExpungeAndReassignProducers, + + // -------------------------------------------------------------------- + // Catch-all variants for non-production callers. The planner does not need + // to account for these when pruning. + // -------------------------------------------------------------------- + + /// Blippy performs checks that include expunged zones. + Blippy, + + /// Omdb allows support operators to poke at blueprint contents, including + /// expunged zones. + Omdb, + + /// Various unit and integration tests access expunged zones. Test, } From 38a8bca69369433a7e4ce5392fe9ee3c12fb800f Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 15 Dec 2025 14:34:56 -0500 Subject: [PATCH 18/25] cargo fmt --- nexus/types/src/deployment.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index eb8caa745fa..00ad6efdf01 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -732,7 +732,6 @@ pub enum BlueprintExpungedZoneAccessReason { // them by the zone kind if applicable, and add details explaining the // conditions the planner must consider during pruning. // -------------------------------------------------------------------- - /// Carrying forward the upstream NTP configuration provided by the operator /// during rack setup; see [`Blueprint::upstream_ntp_config()`]. /// @@ -825,7 +824,6 @@ pub enum BlueprintExpungedZoneAccessReason { // Catch-all variants for non-production callers. The planner does not need // to account for these when pruning. // -------------------------------------------------------------------- - /// Blippy performs checks that include expunged zones. Blippy, From 08531e2478de4f31a67482adb757c7de20af7a91 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 15 Dec 2025 15:46:53 -0500 Subject: [PATCH 19/25] rename --- nexus/db-queries/src/db/datastore/support_bundle.rs | 2 +- nexus/types/src/deployment.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index 86b1f4ed174..afb0557a44b 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -228,7 +228,7 @@ impl DataStore { // ready for cleanup let invalid_nexus_zones = blueprint .expunged_nexus_zones_ready_for_cleanup( - BlueprintExpungedZoneAccessReason::NexusSupportBundleMarkFailed, + BlueprintExpungedZoneAccessReason::NexusSupportBundleReassign, ) .map(|(_sled, zone, _nexus_config)| zone.id.into_untyped_uuid()) .collect::>(); diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 00ad6efdf01..ea6364ae1a0 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -810,7 +810,7 @@ pub enum BlueprintExpungedZoneAccessReason { /// /// The planner must not prune a Nexus zone if it still has any support /// bundles assigned to it. 
- NexusSupportBundleMarkFailed, + NexusSupportBundleReassign, /// An expunged Oximeter zone must be marked expunged in the `oximeter` CRDB /// table (so Nexus knows to stop assigning producers to it), and any From baf0391726fbd7280e515051745ed6afa9c18f4b Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 16 Dec 2025 11:45:28 -0500 Subject: [PATCH 20/25] rename ReadyForCleanup -> ZoneRunningStatus --- dev-tools/omdb/src/bin/omdb/db.rs | 4 +- live-tests/tests/test_nexus_add_remove.rs | 6 +-- .../db-queries/src/db/datastore/deployment.rs | 6 +-- nexus/reconfigurator/blippy/src/checks.rs | 4 +- .../execution/src/clickhouse.rs | 4 +- .../execution/src/omicron_zones.rs | 8 +-- .../tests/integration_tests/planner.rs | 8 +-- nexus/types/src/deployment.rs | 54 +++++++++++-------- 8 files changed, 53 insertions(+), 41 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 4a4d4fe450c..3442d5b69e4 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -135,8 +135,8 @@ use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::DiskFilter; -use nexus_types::deployment::ReadyForCleanup; use nexus_types::deployment::SledFilter; +use nexus_types::deployment::ZoneRunningStatus; use nexus_types::external_api::params; use nexus_types::external_api::views::PhysicalDiskPolicy; use nexus_types::external_api::views::PhysicalDiskState; @@ -1628,7 +1628,7 @@ async fn lookup_service_info( // be expunged. Check all the zone states. let mut all_zones = blueprint.in_service_zones().chain(blueprint.expunged_zones( - ReadyForCleanup::Both, + ZoneRunningStatus::Any, BlueprintExpungedZoneAccessReason::Omdb, )); diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index fc760cd28a9..09ae6d1ebb8 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -23,8 +23,8 @@ use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::PlannerConfig; -use nexus_types::deployment::ReadyForCleanup; use nexus_types::deployment::SledFilter; +use nexus_types::deployment::ZoneRunningStatus; use nexus_types::deployment::blueprint_zone_type; use omicron_common::address::NEXUS_LOCKSTEP_PORT; use omicron_test_utils::dev::poll::CondCheckError; @@ -210,7 +210,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .expect("editing blueprint to expunge zone"); let (_, expunged_zone_config) = blueprint3 .expunged_zones( - ReadyForCleanup::No, + ZoneRunningStatus::MaybeRunning, BlueprintExpungedZoneAccessReason::Test, ) .find(|(_sled_id, zone_config)| zone_config.id == new_zone.id) @@ -312,7 +312,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // message if something has gone wrong up to this point. 
let (_, expunged_zone_config) = new_blueprint .expunged_zones( - ReadyForCleanup::Yes, + ZoneRunningStatus::Shutdown, BlueprintExpungedZoneAccessReason::Test, ) .find(|(_sled_id, zone_config)| zone_config.id == new_zone.id) diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 6d07715b9da..dcccdc07ad3 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -91,7 +91,7 @@ use nexus_types::deployment::PendingMgsUpdateRotBootloaderDetails; use nexus_types::deployment::PendingMgsUpdateRotDetails; use nexus_types::deployment::PendingMgsUpdateSpDetails; use nexus_types::deployment::PendingMgsUpdates; -use nexus_types::deployment::ReadyForCleanup; +use nexus_types::deployment::ZoneRunningStatus; use nexus_types::inventory::BaseboardId; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; @@ -1968,7 +1968,7 @@ impl DataStore { // blueprint .expunged_zones( - ReadyForCleanup::Both, + ZoneRunningStatus::Any, BlueprintExpungedZoneAccessReason ::DeallocateExternalNetworkingResources ) @@ -4525,7 +4525,7 @@ mod tests { fn assert_all_zones_in_service(blueprint: &Blueprint) { let not_in_service = blueprint .expunged_zones( - ReadyForCleanup::Both, + ZoneRunningStatus::Any, BlueprintExpungedZoneAccessReason::Test, ) .collect::>(); diff --git a/nexus/reconfigurator/blippy/src/checks.rs b/nexus/reconfigurator/blippy/src/checks.rs index eef6c91ffca..82c31d743ed 100644 --- a/nexus/reconfigurator/blippy/src/checks.rs +++ b/nexus/reconfigurator/blippy/src/checks.rs @@ -18,8 +18,8 @@ use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::OmicronZoneExternalIp; use nexus_types::deployment::PlanningInput; -use nexus_types::deployment::ReadyForCleanup; use nexus_types::deployment::SledFilter; +use nexus_types::deployment::ZoneRunningStatus; use nexus_types::deployment::blueprint_zone_type; use omicron_common::address::DnsSubnet; use omicron_common::address::Ipv6Subnet; @@ -2134,7 +2134,7 @@ fn check_planning_input_network_records_appear_in_blueprint( // between expunged zones or between expunged -> running zones. 
for (_, z) in blippy.blueprint().in_service_zones().chain( blippy.blueprint().expunged_zones( - ReadyForCleanup::Both, + ZoneRunningStatus::Any, BlueprintExpungedZoneAccessReason::Blippy, ), ) { diff --git a/nexus/reconfigurator/execution/src/clickhouse.rs b/nexus/reconfigurator/execution/src/clickhouse.rs index 5c9c0711cb3..6313d936cd1 100644 --- a/nexus/reconfigurator/execution/src/clickhouse.rs +++ b/nexus/reconfigurator/execution/src/clickhouse.rs @@ -26,7 +26,7 @@ use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::ClickhouseClusterConfig; -use nexus_types::deployment::ReadyForCleanup; +use nexus_types::deployment::ZoneRunningStatus; use omicron_common::address::CLICKHOUSE_ADMIN_PORT; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; @@ -76,7 +76,7 @@ pub(crate) async fn deploy_nodes( use BlueprintExpungedZoneAccessReason::ClickhouseKeeperServerConfigIps; let all_zones = blueprint.in_service_zones().chain(blueprint.expunged_zones( - ReadyForCleanup::Both, + ZoneRunningStatus::Any, ClickhouseKeeperServerConfigIps, )); diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index a924ae65bc4..d0971f41523 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -19,7 +19,7 @@ use nexus_db_queries::db::DataStore; use nexus_db_queries::db::datastore::CollectorReassignment; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintExpungedZoneAccessReason; -use nexus_types::deployment::ReadyForCleanup; +use nexus_types::deployment::ZoneRunningStatus; use omicron_common::address::COCKROACH_ADMIN_PORT; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; @@ -97,7 +97,7 @@ fn clean_up_expunged_nexus_zones<'a>( ) -> impl Stream)> + 'a { use BlueprintExpungedZoneAccessReason::NexusDeleteMetadataRecord; let zones_to_clean_up = blueprint - .expunged_zones(ReadyForCleanup::Yes, NexusDeleteMetadataRecord) + .expunged_zones(ZoneRunningStatus::Shutdown, NexusDeleteMetadataRecord) .filter(|(_sled_id, zone)| zone.zone_type.is_nexus()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -122,7 +122,7 @@ fn clean_up_expunged_cockroach_zones<'a, R: CleanupResolver>( ) -> impl Stream)> + 'a { use BlueprintExpungedZoneAccessReason::CockroachDecommission; let zones_to_clean_up = blueprint - .expunged_zones(ReadyForCleanup::Yes, CockroachDecommission) + .expunged_zones(ZoneRunningStatus::Shutdown, CockroachDecommission) .filter(|(_sled_id, zone)| zone.zone_type.is_cockroach()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -147,7 +147,7 @@ fn clean_up_expunged_oximeter_zones<'a>( use BlueprintExpungedZoneAccessReason::OximeterExpungeAndReassignProducers; let zones_to_clean_up = blueprint .expunged_zones( - ReadyForCleanup::Yes, + ZoneRunningStatus::Shutdown, OximeterExpungeAndReassignProducers, ) .filter(|(_sled_id, zone)| zone.zone_type.is_oximeter()); diff --git a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs index 90f714149f6..1d229796d29 100644 --- a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs +++ b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs @@ -29,9 +29,9 @@ use 
nexus_types::deployment::CockroachDbClusterVersion; use nexus_types::deployment::CockroachDbPreserveDowngrade; use nexus_types::deployment::CockroachDbSettings; use nexus_types::deployment::OmicronZoneExternalSnatIp; -use nexus_types::deployment::ReadyForCleanup; use nexus_types::deployment::SledDisk; use nexus_types::deployment::TargetReleaseDescription; +use nexus_types::deployment::ZoneRunningStatus; use nexus_types::deployment::blueprint_zone_type; use nexus_types::deployment::blueprint_zone_type::InternalDns; use nexus_types::external_api::views::PhysicalDiskPolicy; @@ -2576,7 +2576,7 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { // `ready_for_cleanup` yet. let expunged_zones: Vec<_> = blueprint3 .expunged_zones( - ReadyForCleanup::No, + ZoneRunningStatus::MaybeRunning, BlueprintExpungedZoneAccessReason::Test, ) .map(|(_, z)| z.clone()) @@ -2602,7 +2602,7 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { // enabled our clickhouse policy should be expunged when we disable it. let expunged_zones: Vec<_> = blueprint4 .expunged_zones( - ReadyForCleanup::No, + ZoneRunningStatus::MaybeRunning, BlueprintExpungedZoneAccessReason::Test, ) .map(|(_, z)| z.clone()) @@ -3301,7 +3301,7 @@ fn test_update_crucible_pantry_before_nexus() { assert_eq!( blueprint .expunged_zones( - ReadyForCleanup::Yes, + ZoneRunningStatus::Shutdown, BlueprintExpungedZoneAccessReason::Test ) .filter(|(_, z)| is_old_pantry(z)) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index ea6364ae1a0..1de74e0867b 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -329,7 +329,7 @@ impl Blueprint { /// from the blueprint. pub fn expunged_zones( &self, - ready_for_cleanup: ReadyForCleanup, + ready_for_cleanup: ZoneRunningStatus, _reason: BlueprintExpungedZoneAccessReason, ) -> impl Iterator { // Danger note: this call will definitely access expunged zones, but we @@ -343,9 +343,9 @@ impl Blueprint { } => this_zone_ready_for_cleanup, }; match ready_for_cleanup { - ReadyForCleanup::Yes => this_zone_ready_for_cleanup, - ReadyForCleanup::No => !this_zone_ready_for_cleanup, - ReadyForCleanup::Both => true, + ZoneRunningStatus::Shutdown => this_zone_ready_for_cleanup, + ZoneRunningStatus::MaybeRunning => !this_zone_ready_for_cleanup, + ZoneRunningStatus::Any => true, } }) } @@ -397,7 +397,7 @@ impl Blueprint { ) -> impl Iterator< Item = (SledUuid, &BlueprintZoneConfig, &blueprint_zone_type::Nexus), > { - self.expunged_zones(ReadyForCleanup::Yes, reason).filter_map( + self.expunged_zones(ZoneRunningStatus::Shutdown, reason).filter_map( |(sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus_config) = &zone.zone_type { @@ -487,7 +487,7 @@ impl Blueprint { let zone = self .in_service_zones() .chain(self.expunged_zones( - ReadyForCleanup::Both, + ZoneRunningStatus::Any, BlueprintExpungedZoneAccessReason::NexusSelfIsQuiescing, )) .find(|(_sled_id, zone_config)| zone_config.id == nexus_id) @@ -538,7 +538,7 @@ impl Blueprint { ) -> Result { for (_sled_id, zone_config) in self.in_service_zones().chain(self.expunged_zones( - ReadyForCleanup::Both, + ZoneRunningStatus::Any, BlueprintExpungedZoneAccessReason::NexusSelfGeneration, )) { @@ -578,7 +578,7 @@ impl Blueprint { // a single sled and that sled's boundary NTP zone is being upgraded.) 
self.in_service_zones() .chain(self.expunged_zones( - ReadyForCleanup::Both, + ZoneRunningStatus::Any, BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig, )) .find_map(|(_sled_id, zone)| match &zone.zone_type { @@ -611,7 +611,7 @@ impl Blueprint { // without any.) self.in_service_zones() .chain(self.expunged_zones( - ReadyForCleanup::Both, + ZoneRunningStatus::Any, BlueprintExpungedZoneAccessReason::NexusExternalConfig, )) .find_map(|(_sled_id, zone)| match &zone.zone_type { @@ -665,7 +665,7 @@ impl Blueprint { // database constraints on external IP uniqueness. self.in_service_zones() .chain(self.expunged_zones( - ReadyForCleanup::Both, + ZoneRunningStatus::Any, BlueprintExpungedZoneAccessReason::ExternalDnsExternalIps, )) .filter_map(|(_id, zone)| match &zone.zone_type { @@ -691,18 +691,30 @@ pub struct OperatorNexusConfig<'a> { pub external_dns_servers: &'a [IpAddr], } -/// Argument to [`Blueprint::expunged_zones()`] allowing the caller whether they -/// zones that are ready for cleanup, not ready for cleanup, or both/either. +/// `ZoneRunningStatus` is an argument to [`Blueprint::expunged_zones()`] +/// allowing the caller to to specify whether they want to operate on zones that +/// are shut down, could still be running, or both/either. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] -pub enum ReadyForCleanup { - /// Only return zones that are ready for cleanup (i.e., have been confirmed - /// shut down by sled-agent and will not be restarted). - Yes, - /// Only return zones that are NOT YET ready for cleanup. These zones are - /// expunged but could still be running. - No, - /// Return expunged zones regardless in either "ready for cleanup" state. - Both, +pub enum ZoneRunningStatus { + /// Only return zones that are guaranteed to be shutdown and will not be + /// restarted. + /// + /// This corresponds to `ready_for_cleanup: true` in a zone's + /// [`BlueprintZoneDisposition::Expunged`] state. + Shutdown, + /// Only return expunged zones that could still be running. This is + /// inherently racy. Zones in this state may already be stopped, and if they + /// aren't they are likely (but not guaranteed) to be stopped soon. + /// + /// This corresponds to `ready_for_cleanup: false` in a zone's + /// [`BlueprintZoneDisposition::Expunged`] state. + MaybeRunning, + /// Return expunged zones regardless of whether they're shut down or could + /// still be running. + /// + /// This corresponds to ignoring the `ready_for_cleanup` field of a zone's + /// [`BlueprintZoneDisposition::Expunged`] state. + Any, } /// Argument to [`Blueprint::expunged_zones()`] requiring the caller to specify From 4203003631c12a601e8012d0ce2e9b282f490733 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 16 Dec 2025 11:55:52 -0500 Subject: [PATCH 21/25] add Blueprint::all_in_service_and_expunged_zones() --- dev-tools/omdb/src/bin/omdb/db.rs | 8 +- nexus/reconfigurator/blippy/src/checks.rs | 7 +- .../execution/src/clickhouse.rs | 7 +- nexus/types/src/deployment.rs | 116 +++++++++--------- 4 files changed, 68 insertions(+), 70 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 3442d5b69e4..504dfcb0e70 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -1626,11 +1626,9 @@ async fn lookup_service_info( ) -> anyhow::Result> { // We don't know anything about `service_id`; it may be in-service or it may // be expunged. Check all the zone states. 
- let mut all_zones = - blueprint.in_service_zones().chain(blueprint.expunged_zones( - ZoneRunningStatus::Any, - BlueprintExpungedZoneAccessReason::Omdb, - )); + let mut all_zones = blueprint.all_in_service_and_expunged_zones( + BlueprintExpungedZoneAccessReason::Omdb, + ); let Some(zone_config) = all_zones.find_map(|(_sled_id, zone_config)| { if zone_config.id.into_untyped_uuid() == service_id { diff --git a/nexus/reconfigurator/blippy/src/checks.rs b/nexus/reconfigurator/blippy/src/checks.rs index 82c31d743ed..ad78e05efda 100644 --- a/nexus/reconfigurator/blippy/src/checks.rs +++ b/nexus/reconfigurator/blippy/src/checks.rs @@ -2132,11 +2132,8 @@ fn check_planning_input_network_records_appear_in_blueprint( // constructed above in `BuilderExternalNetworking::new()`, we do not // check for duplicates here: we could very well see reuse of IPs // between expunged zones or between expunged -> running zones. - for (_, z) in blippy.blueprint().in_service_zones().chain( - blippy.blueprint().expunged_zones( - ZoneRunningStatus::Any, - BlueprintExpungedZoneAccessReason::Blippy, - ), + for (_, z) in blippy.blueprint().all_in_service_and_expunged_zones( + BlueprintExpungedZoneAccessReason::Blippy, ) { let zone_type = &z.zone_type; match zone_type { diff --git a/nexus/reconfigurator/execution/src/clickhouse.rs b/nexus/reconfigurator/execution/src/clickhouse.rs index 6313d936cd1..972f69492b0 100644 --- a/nexus/reconfigurator/execution/src/clickhouse.rs +++ b/nexus/reconfigurator/execution/src/clickhouse.rs @@ -74,11 +74,8 @@ pub(crate) async fn deploy_nodes( // // This is tracked in https://github.com/oxidecomputer/omicron/issues/7724 use BlueprintExpungedZoneAccessReason::ClickhouseKeeperServerConfigIps; - let all_zones = - blueprint.in_service_zones().chain(blueprint.expunged_zones( - ZoneRunningStatus::Any, - ClickhouseKeeperServerConfigIps, - )); + let all_zones = blueprint + .all_in_service_and_expunged_zones(ClickhouseKeeperServerConfigIps); deploy_nodes_impl(opctx, all_zones, clickhouse_cluster_config).await } diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 1de74e0867b..c282e5c4e8e 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -350,15 +350,33 @@ impl Blueprint { }) } + /// Iterate over all zones in the blueprint, regardless of whether they're + /// in-service or expunged. + /// + /// Like [`Self::expunged_zones()`], callers are required to specify a + /// reason to access expunged zones. + /// + /// The set of zones returned by this method is equivalent to the set of + /// zones returned by chaining together calls to `Self::in_service_zones()` + /// and `Self::expunged_zones(ZoneRunningStatus::Any, reason)`, but only + /// iterates over the zones once. + pub fn all_in_service_and_expunged_zones( + &self, + _reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + // Danger note: this call will definitely access expunged zones, but we + // know the caller has provided a known reason to do so. + self.danger_all_omicron_zones(BlueprintZoneDisposition::any) + } + /// Iterate over the [`BlueprintZoneConfig`] instances in the blueprint /// that match the provided filter, along with the associated sled id. /// /// This method is prefixed with `danger_` and is private because it allows /// the caller to potentially act on expunged zones without providing a /// reason for doing so. 
It should only be called by `in_service_zones()` - /// and `expunged_zones()` above; all other callers that want access to - /// expunged zones must go through `expunged_zones()`, which forces them to - /// specify a [`BlueprintExpungedZoneAccessReason`]. + /// and the helper methods that require callers to specify a + /// [`BlueprintExpungedZoneAccessReason`] defined above. fn danger_all_omicron_zones( &self, mut filter: F, @@ -485,11 +503,9 @@ impl Blueprint { nexus_id: OmicronZoneUuid, ) -> Result { let zone = self - .in_service_zones() - .chain(self.expunged_zones( - ZoneRunningStatus::Any, + .all_in_service_and_expunged_zones( BlueprintExpungedZoneAccessReason::NexusSelfIsQuiescing, - )) + ) .find(|(_sled_id, zone_config)| zone_config.id == nexus_id) .ok_or_else(|| { anyhow!("zone {} does not exist in blueprint", nexus_id) @@ -536,12 +552,9 @@ impl Blueprint { &self, nexus_id: OmicronZoneUuid, ) -> Result { - for (_sled_id, zone_config) in - self.in_service_zones().chain(self.expunged_zones( - ZoneRunningStatus::Any, - BlueprintExpungedZoneAccessReason::NexusSelfGeneration, - )) - { + for (_sled_id, zone_config) in self.all_in_service_and_expunged_zones( + BlueprintExpungedZoneAccessReason::NexusSelfGeneration, + ) { if let BlueprintZoneType::Nexus(nexus_config) = &zone_config.zone_type { @@ -576,21 +589,19 @@ impl Blueprint { // zones. (Real racks will always have at least one in-service boundary // NTP zone, but some test or test systems may have 0 if they have only // a single sled and that sled's boundary NTP zone is being upgraded.) - self.in_service_zones() - .chain(self.expunged_zones( - ZoneRunningStatus::Any, - BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig, - )) - .find_map(|(_sled_id, zone)| match &zone.zone_type { - BlueprintZoneType::BoundaryNtp(ntp_config) => { - Some(UpstreamNtpConfig { - ntp_servers: &ntp_config.ntp_servers, - dns_servers: &ntp_config.dns_servers, - domain: ntp_config.domain.as_deref(), - }) - } - _ => None, - }) + self.all_in_service_and_expunged_zones( + BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig, + ) + .find_map(|(_sled_id, zone)| match &zone.zone_type { + BlueprintZoneType::BoundaryNtp(ntp_config) => { + Some(UpstreamNtpConfig { + ntp_servers: &ntp_config.ntp_servers, + dns_servers: &ntp_config.dns_servers, + domain: ntp_config.domain.as_deref(), + }) + } + _ => None, + }) } /// Return the operator-specified configuration of Nexus. @@ -609,21 +620,18 @@ impl Blueprint { // zones. (Real racks will always have at least one in-service Nexus // zone - the one calling this code - but some tests create blueprints // without any.) 
- self.in_service_zones() - .chain(self.expunged_zones( - ZoneRunningStatus::Any, - BlueprintExpungedZoneAccessReason::NexusExternalConfig, - )) - .find_map(|(_sled_id, zone)| match &zone.zone_type { - BlueprintZoneType::Nexus(nexus_config) => { - Some(OperatorNexusConfig { - external_tls: nexus_config.external_tls, - external_dns_servers: &nexus_config - .external_dns_servers, - }) - } - _ => None, - }) + self.all_in_service_and_expunged_zones( + BlueprintExpungedZoneAccessReason::NexusExternalConfig, + ) + .find_map(|(_sled_id, zone)| match &zone.zone_type { + BlueprintZoneType::Nexus(nexus_config) => { + Some(OperatorNexusConfig { + external_tls: nexus_config.external_tls, + external_dns_servers: &nexus_config.external_dns_servers, + }) + } + _ => None, + }) } /// Returns the complete set of external IP addresses assigned to external @@ -663,18 +671,16 @@ impl Blueprint { // don't check for that here. That would be an illegal blueprint; blippy // would complain, and attempting to execute it would fail due to // database constraints on external IP uniqueness. - self.in_service_zones() - .chain(self.expunged_zones( - ZoneRunningStatus::Any, - BlueprintExpungedZoneAccessReason::ExternalDnsExternalIps, - )) - .filter_map(|(_id, zone)| match &zone.zone_type { - BlueprintZoneType::ExternalDns(dns) => { - Some(dns.dns_address.addr.ip()) - } - _ => None, - }) - .collect() + self.all_in_service_and_expunged_zones( + BlueprintExpungedZoneAccessReason::ExternalDnsExternalIps, + ) + .filter_map(|(_id, zone)| match &zone.zone_type { + BlueprintZoneType::ExternalDns(dns) => { + Some(dns.dns_address.addr.ip()) + } + _ => None, + }) + .collect() } } From 940ed0844e9714b7b8a9a4457b4399b6b79be819 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 16 Dec 2025 12:00:01 -0500 Subject: [PATCH 22/25] fix erroneous behavior change to `Blueprint::find_generation_for_self()` --- nexus/types/src/deployment.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index c282e5c4e8e..60c808ca616 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -369,6 +369,28 @@ impl Blueprint { self.danger_all_omicron_zones(BlueprintZoneDisposition::any) } + /// Iterate over all zones in the blueprint, returning any that could be + /// running: either they're in-service, or they're expunged but have not yet + /// been confirmed shut down. + /// + /// Like [`Self::expunged_zones()`], callers are required to specify a + /// reason to access expunged zones. + /// + /// The set of zones returned by this method is equivalent to the set of + /// zones returned by chaining together calls to `Self::in_service_zones()` + /// and `Self::expunged_zones(ZoneRunningStatus::MaybeRunning, reason)`, but + /// only iterates over the zones once. + pub fn all_maybe_running_zones( + &self, + _reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + // Danger note: this call will definitely access expunged zones, but we + // know the caller has provided a known reason to do so. + self.danger_all_omicron_zones( + BlueprintZoneDisposition::could_be_running, + ) + } + /// Iterate over the [`BlueprintZoneConfig`] instances in the blueprint /// that match the provided filter, along with the associated sled id. 
/// @@ -552,7 +574,7 @@ impl Blueprint { &self, nexus_id: OmicronZoneUuid, ) -> Result { - for (_sled_id, zone_config) in self.all_in_service_and_expunged_zones( + for (_sled_id, zone_config) in self.all_maybe_running_zones( BlueprintExpungedZoneAccessReason::NexusSelfGeneration, ) { if let BlueprintZoneType::Nexus(nexus_config) = From 1f87152907f126bbafd733d0092105a5b80adb3b Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 16 Dec 2025 12:00:44 -0500 Subject: [PATCH 23/25] typos --- nexus/types/src/deployment.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 60c808ca616..3a62b9f04d1 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -756,7 +756,7 @@ pub enum ZoneRunningStatus { /// an expunged Oximeter zone until it knows that all possible such /// reassignments are complete. /// -/// These reasons are not used at runtime; they exist soley to document and +/// These reasons are not used at runtime; they exist solely to document and /// attempt to statically guard against new code adding a new call to /// `expunged_zones()` that the planner doesn't know about (and therefore /// doesn't get updated to account for!). If you are attempting to call @@ -764,7 +764,7 @@ pub enum ZoneRunningStatus { /// /// 1. Add a new variant to this enum. /// 2. Update the planner to account for it, to prevent the planner from pruning -/// the zone before whatever your use of it is completes. +/// the zone before whatever your use of it is completed. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub enum BlueprintExpungedZoneAccessReason { // -------------------------------------------------------------------- From 12cfe410d66c6de96df23a92c02290a69f1f727c Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 16 Dec 2025 12:09:44 -0500 Subject: [PATCH 24/25] unused imports --- dev-tools/omdb/src/bin/omdb/db.rs | 1 - nexus/reconfigurator/blippy/src/checks.rs | 1 - nexus/reconfigurator/execution/src/clickhouse.rs | 1 - 3 files changed, 3 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index 504dfcb0e70..ff65784de80 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -136,7 +136,6 @@ use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::DiskFilter; use nexus_types::deployment::SledFilter; -use nexus_types::deployment::ZoneRunningStatus; use nexus_types::external_api::params; use nexus_types::external_api::views::PhysicalDiskPolicy; use nexus_types::external_api::views::PhysicalDiskState; diff --git a/nexus/reconfigurator/blippy/src/checks.rs b/nexus/reconfigurator/blippy/src/checks.rs index ad78e05efda..331a6dbbe79 100644 --- a/nexus/reconfigurator/blippy/src/checks.rs +++ b/nexus/reconfigurator/blippy/src/checks.rs @@ -19,7 +19,6 @@ use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::OmicronZoneExternalIp; use nexus_types::deployment::PlanningInput; use nexus_types::deployment::SledFilter; -use nexus_types::deployment::ZoneRunningStatus; use nexus_types::deployment::blueprint_zone_type; use omicron_common::address::DnsSubnet; use omicron_common::address::Ipv6Subnet; diff --git a/nexus/reconfigurator/execution/src/clickhouse.rs b/nexus/reconfigurator/execution/src/clickhouse.rs index 972f69492b0..defab3197fc 100644 --- a/nexus/reconfigurator/execution/src/clickhouse.rs +++ 
b/nexus/reconfigurator/execution/src/clickhouse.rs @@ -26,7 +26,6 @@ use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneConfig; use nexus_types::deployment::ClickhouseClusterConfig; -use nexus_types::deployment::ZoneRunningStatus; use omicron_common::address::CLICKHOUSE_ADMIN_PORT; use omicron_uuid_kinds::OmicronZoneUuid; use omicron_uuid_kinds::SledUuid; From 7f511d63d4f85805676c2b8920f7254f3136e5d2 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Tue, 16 Dec 2025 15:58:45 -0500 Subject: [PATCH 25/25] remove `reason` from `all_maybe_running_zones()` --- nexus/types/src/deployment.rs | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 3a62b9f04d1..2bc3a6eb540 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -373,19 +373,17 @@ impl Blueprint { /// running: either they're in-service, or they're expunged but have not yet /// been confirmed shut down. /// - /// Like [`Self::expunged_zones()`], callers are required to specify a - /// reason to access expunged zones. - /// /// The set of zones returned by this method is equivalent to the set of /// zones returned by chaining together calls to `Self::in_service_zones()` /// and `Self::expunged_zones(ZoneRunningStatus::MaybeRunning, reason)`, but - /// only iterates over the zones once. + /// only iterates over the zones once and does not require a `reason`. pub fn all_maybe_running_zones( &self, - _reason: BlueprintExpungedZoneAccessReason, ) -> impl Iterator { - // Danger note: this call will definitely access expunged zones, but we - // know the caller has provided a known reason to do so. + // Danger note: this call will definitely access expunged zones, but + // only those that are not yet `ready_for_cleanup`. The planner's + // pruning only acts on `ready_for_cleanup` zones, so we don't need to + // track accesses of this kind of expunged zone. self.danger_all_omicron_zones( BlueprintZoneDisposition::could_be_running, ) @@ -574,9 +572,7 @@ impl Blueprint { &self, nexus_id: OmicronZoneUuid, ) -> Result { - for (_sled_id, zone_config) in self.all_maybe_running_zones( - BlueprintExpungedZoneAccessReason::NexusSelfGeneration, - ) { + for (_sled_id, zone_config) in self.all_maybe_running_zones() { if let BlueprintZoneType::Nexus(nexus_config) = &zone_config.zone_type { @@ -824,13 +820,6 @@ pub enum BlueprintExpungedZoneAccessReason { /// remaining with the set of configuration. NexusExternalConfig, - /// Nexus needs to determine its own generation. If the actively-running - /// Nexus has been expunged (but not yet shut down), it should still be able - /// to find its own generation! - /// - /// The planner does not need to account for this when pruning Nexus zones. - NexusSelfGeneration, - /// Nexus needs to whether it itself should be quiescing. If the /// actively-running Nexus has been expunged (but not yet shut down), it /// should still be able to determine this!
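
Taken together, these patches leave four ways to iterate over a blueprint's zones. The sketch below is illustrative only and is not part of the patch series; it assumes the post-series signatures shown in the diffs above (iterator items of `(SledUuid, &BlueprintZoneConfig)`) and the usual `nexus_types::deployment` imports, and `summarize_zones` is a hypothetical caller written for this example. It uses the `Test` access reason, which the patches reserve for non-production callers.

// Illustrative sketch only; not part of the patch series. Assumes the
// post-patch-25 signatures shown in the diffs above.
use nexus_types::deployment::{
    Blueprint, BlueprintExpungedZoneAccessReason, ZoneRunningStatus,
};

fn summarize_zones(blueprint: &Blueprint) {
    // In-service zones: no access reason required.
    let in_service = blueprint.in_service_zones().count();

    // Expunged zones require both a running-status filter and a documented
    // access reason, so the planner can be taught not to prune them while
    // the caller still needs them.
    let ready_for_cleanup = blueprint
        .expunged_zones(
            ZoneRunningStatus::Shutdown,
            BlueprintExpungedZoneAccessReason::Test,
        )
        .count();

    // Every zone, in service or expunged, still gated on an access reason.
    let total = blueprint
        .all_in_service_and_expunged_zones(
            BlueprintExpungedZoneAccessReason::Test,
        )
        .count();

    // Zones that could still be running (in service, or expunged but not
    // yet confirmed shut down). No reason is needed here, because the
    // planner only prunes zones that are ready for cleanup.
    let maybe_running = blueprint.all_maybe_running_zones().count();

    println!(
        "{in_service} in service, {ready_for_cleanup} ready for cleanup, \
         {maybe_running} possibly running, {total} total"
    );
}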