diff --git a/dev-tools/omdb/src/bin/omdb/db.rs b/dev-tools/omdb/src/bin/omdb/db.rs index d7f8af04f8f..ff65784de80 100644 --- a/dev-tools/omdb/src/bin/omdb/db.rs +++ b/dev-tools/omdb/src/bin/omdb/db.rs @@ -131,6 +131,7 @@ use nexus_db_queries::db::pagination::paginated; use nexus_db_queries::db::queries::ALLOW_FULL_TABLE_SCAN_SQL; use nexus_db_queries::db::queries::region_allocation; use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::DiskFilter; @@ -1622,16 +1623,19 @@ async fn lookup_service_info( service_id: Uuid, blueprint: &Blueprint, ) -> anyhow::Result> { - let Some(zone_config) = blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) - .find_map(|(_sled_id, zone_config)| { - if zone_config.id.into_untyped_uuid() == service_id { - Some(zone_config) - } else { - None - } - }) - else { + // We don't know anything about `service_id`; it may be in-service or it may + // be expunged. Check all the zone states. + let mut all_zones = blueprint.all_in_service_and_expunged_zones( + BlueprintExpungedZoneAccessReason::Omdb, + ); + + let Some(zone_config) = all_zones.find_map(|(_sled_id, zone_config)| { + if zone_config.id.into_untyped_uuid() == service_id { + Some(zone_config) + } else { + None + } + }) else { return Ok(None); }; diff --git a/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs b/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs index a86b512b52f..789b06e87ce 100644 --- a/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs +++ b/dev-tools/omdb/src/bin/omdb/db/db_metadata.rs @@ -17,7 +17,6 @@ use nexus_db_model::DbMetadataNexusState; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; -use nexus_types::deployment::BlueprintZoneDisposition; use omicron_common::api::external::Generation; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::OmicronZoneUuid; @@ -133,7 +132,7 @@ async fn get_db_metadata_nexus_rows( ]; let nexus_generation_by_zone = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .map(|(_, zone, nexus_zone)| (zone.id, nexus_zone.nexus_generation)) .collect::>(); @@ -213,7 +212,7 @@ pub async fn cmd_db_metadata_force_mark_nexus_quiesced( .await .context("loading current target blueprint")?; let nexus_generation = current_target_blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .find_map(|(_, zone, nexus_zone)| { if zone.id == args.id { Some(nexus_zone.nexus_generation) diff --git a/dev-tools/reconfigurator-cli/src/lib.rs b/dev-tools/reconfigurator-cli/src/lib.rs index 68db451ec1b..9579061d8f4 100644 --- a/dev-tools/reconfigurator-cli/src/lib.rs +++ b/dev-tools/reconfigurator-cli/src/lib.rs @@ -159,9 +159,7 @@ impl ReconfiguratorSim { builder.set_external_dns_version(parent_blueprint.external_dns_version); // Handle zone networking setup first - for (_, zone) in parent_blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - { + for (_, zone) in parent_blueprint.in_service_zones() { if let Some((external_ip, nic)) = zone.zone_type.external_networking() { @@ -213,9 +211,7 @@ impl ReconfiguratorSim { let active_nexus_gen = state.config().active_nexus_zone_generation(); let mut active_nexus_zones = BTreeSet::new(); - for (_, zone, nexus) in parent_blueprint - 
.all_nexus_zones(BlueprintZoneDisposition::is_in_service) - { + for (_, zone, nexus) in parent_blueprint.in_service_nexus_zones() { if nexus.nexus_generation == active_nexus_gen { active_nexus_zones.insert(zone.id); } @@ -232,9 +228,7 @@ impl ReconfiguratorSim { let active_nexus_gen = state.config().active_nexus_zone_generation(); let mut not_yet_nexus_zones = BTreeSet::new(); - for (_, zone) in parent_blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - { + for (_, zone) in parent_blueprint.in_service_zones() { match &zone.zone_type { nexus_types::deployment::BlueprintZoneType::Nexus( nexus, @@ -2493,17 +2487,12 @@ fn cmd_blueprint_edit( } BlueprintEditCommands::BumpNexusGeneration => { let current_generation = builder.nexus_generation(); - let current_max = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) - .fold( - current_generation, - |current_max, (_sled_id, _zone_config, nexus_config)| { - std::cmp::max( - nexus_config.nexus_generation, - current_max, - ) - }, - ); + let current_max = blueprint.in_service_nexus_zones().fold( + current_generation, + |current_max, (_sled_id, _zone_config, nexus_config)| { + std::cmp::max(nexus_config.nexus_generation, current_max) + }, + ); ensure!( current_max > current_generation, "cannot bump blueprint generation (currently \ diff --git a/live-tests/tests/test_nexus_add_remove.rs b/live-tests/tests/test_nexus_add_remove.rs index c8852caa07a..09ae6d1ebb8 100644 --- a/live-tests/tests/test_nexus_add_remove.rs +++ b/live-tests/tests/test_nexus_add_remove.rs @@ -19,10 +19,12 @@ use nexus_reconfigurator_planning::blueprint_editor::ExternalNetworkingAllocator use nexus_reconfigurator_planning::planner::Planner; use nexus_reconfigurator_planning::planner::PlannerRng; use nexus_reconfigurator_preparation::PlanningInputFromDb; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::PlannerConfig; use nexus_types::deployment::SledFilter; +use nexus_types::deployment::ZoneRunningStatus; use nexus_types::deployment::blueprint_zone_type; use omicron_common::address::NEXUS_LOCKSTEP_PORT; use omicron_test_utils::dev::poll::CondCheckError; @@ -207,7 +209,10 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .await .expect("editing blueprint to expunge zone"); let (_, expunged_zone_config) = blueprint3 - .all_omicron_zones(|_| true) + .expunged_zones( + ZoneRunningStatus::MaybeRunning, + BlueprintExpungedZoneAccessReason::Test, + ) .find(|(_sled_id, zone_config)| zone_config.id == new_zone.id) .expect("expunged zone in new blueprint"); let BlueprintZoneDisposition::Expunged { @@ -215,7 +220,7 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { .. } = expunged_zone_config.disposition else { - panic!("expected expunged zone to have disposition Expunged"); + unreachable!("expunged_zones() returned a non-expunged zone"); }; // At some point, we should be unable to reach this Nexus any more. @@ -306,7 +311,10 @@ async fn test_nexus_add_remove(lc: &LiveTestContext) { // We don't need to check this here. It just provides a better error // message if something has gone wrong up to this point. 
let (_, expunged_zone_config) = new_blueprint - .all_omicron_zones(|_| true) + .expunged_zones( + ZoneRunningStatus::Shutdown, + BlueprintExpungedZoneAccessReason::Test, + ) .find(|(_sled_id, zone_config)| zone_config.id == new_zone.id) .expect("expunged zone in new blueprint"); assert!(expunged_zone_config.disposition.is_ready_for_cleanup()); diff --git a/live-tests/tests/test_nexus_handoff.rs b/live-tests/tests/test_nexus_handoff.rs index 37de5318f60..a73a20ff7a7 100644 --- a/live-tests/tests/test_nexus_handoff.rs +++ b/live-tests/tests/test_nexus_handoff.rs @@ -18,7 +18,6 @@ use nexus_reconfigurator_planning::blueprint_builder::BlueprintBuilder; use nexus_reconfigurator_planning::blueprint_editor::ExternalNetworkingAllocator; use nexus_reconfigurator_preparation::PlanningInputFromDb; use nexus_types::deployment::Blueprint; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::PlannerConfig; @@ -69,7 +68,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { // there exist no Nexus zones with a generation newer than the blueprint's // `nexus_generation`. let new_zones = blueprint_initial - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_sled_id, z)| { let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { nexus_generation, @@ -102,7 +101,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { cfg: &'a blueprint_zone_type::Nexus, } let current_nexus_zones: BTreeMap = blueprint_initial - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(sled_id, z)| { let BlueprintZoneType::Nexus( cfg @ blueprint_zone_type::Nexus { nexus_generation, .. }, @@ -215,7 +214,7 @@ async fn test_nexus_handoff(lc: &LiveTestContext) { // Find the new Nexus zones and make clients for them. let new_nexus_clients = blueprint_new_nexus - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .filter_map(|(_sled_id, zone_cfg, nexus_config)| { (nexus_config.nexus_generation == next_generation).then(|| { ( @@ -495,7 +494,7 @@ async fn check_internal_dns( // Compute what we expect to find, based on which Nexus instances in the // blueprint have the specified generation. let expected_nexus_addrs = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .filter_map(|(_sled_id, _zone_cfg, nexus_config)| { (nexus_config.nexus_generation == active_generation) .then_some(nexus_config.internal_address) @@ -504,7 +503,7 @@ async fn check_internal_dns( // Find the DNS server based on what's currently in the blueprint. let dns_sockaddr = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .find_map(|(_sled_id, zone_cfg)| { if let BlueprintZoneType::InternalDns( blueprint_zone_type::InternalDns { dns_address, .. }, @@ -559,7 +558,7 @@ async fn check_external_dns( // Compute which Nexus instances we expect to find in external DNS based on // what's in-service in the blueprint. let expected_nexus_addrs = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .filter_map(|(_sled_id, _zone_cfg, nexus_config)| { (nexus_config.nexus_generation == active_generation) .then_some(nexus_config.external_ip.ip) @@ -568,7 +567,7 @@ async fn check_external_dns( // Find the DNS server based on what's currently in the blueprint. 
let dns_http_sockaddr = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .find_map(|(_sled_id, zone_cfg)| { if let BlueprintZoneType::ExternalDns( blueprint_zone_type::ExternalDns { http_address, .. }, diff --git a/nexus/db-queries/src/db/datastore/deployment.rs b/nexus/db-queries/src/db/datastore/deployment.rs index 4580c6e4257..dcccdc07ad3 100644 --- a/nexus/db-queries/src/db/datastore/deployment.rs +++ b/nexus/db-queries/src/db/datastore/deployment.rs @@ -75,6 +75,7 @@ use nexus_db_schema::enums::HwM2SlotEnum; use nexus_db_schema::enums::HwRotSlotEnum; use nexus_db_schema::enums::SpTypeEnum; use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintMetadata; use nexus_types::deployment::BlueprintSledConfig; use nexus_types::deployment::BlueprintSource; @@ -90,6 +91,7 @@ use nexus_types::deployment::PendingMgsUpdateRotBootloaderDetails; use nexus_types::deployment::PendingMgsUpdateRotDetails; use nexus_types::deployment::PendingMgsUpdateSpDetails; use nexus_types::deployment::PendingMgsUpdates; +use nexus_types::deployment::ZoneRunningStatus; use nexus_types::inventory::BaseboardId; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; @@ -1955,10 +1957,21 @@ impl DataStore { self.ensure_zone_external_networking_deallocated_on_connection( &conn, &opctx.log, + // TODO-correctness Currently the planner _does not wait_ + // for zones using external IPs to be ready for cleanup + // before reassigning the external IP to a new zone, so we + // have to deallocate IPs for both "ready for cleanup" and + // "not ready for cleanup" zones. We should fix the planner, + // at which point we can operate on only "ready for cleanup" + // zones here. + // + // blueprint - .all_omicron_zones(|disposition| { - !disposition.is_in_service() - }) + .expunged_zones( + ZoneRunningStatus::Any, + BlueprintExpungedZoneAccessReason + ::DeallocateExternalNetworkingResources + ) .map(|(_sled_id, zone)| zone), ) .await @@ -1966,11 +1979,7 @@ impl DataStore { self.ensure_zone_external_networking_allocated_on_connection( &conn, opctx, - blueprint - .all_omicron_zones(|disposition| { - disposition.is_in_service() - }) - .map(|(_sled_id, zone)| zone), + blueprint.in_service_zones().map(|(_sled_id, zone)| zone), ) .await .map_err(|e| err.bail(e))?; @@ -3312,7 +3321,7 @@ mod tests { ); assert_eq!(blueprint1.sleds.len(), collection.sled_agents.len()); assert_eq!( - blueprint1.all_omicron_zones(BlueprintZoneDisposition::any).count(), + blueprint1.in_service_zones().count(), collection.all_ledgered_omicron_zones().count() ); // All zones should be in service. @@ -3644,9 +3653,8 @@ mod tests { ); assert_eq!(blueprint1.sleds.len() + 1, blueprint2.sleds.len()); assert_eq!( - blueprint1.all_omicron_zones(BlueprintZoneDisposition::any).count() - + num_new_sled_zones, - blueprint2.all_omicron_zones(BlueprintZoneDisposition::any).count() + blueprint1.in_service_zones().count() + num_new_sled_zones, + blueprint2.in_service_zones().count() ); // All zones should be in service. @@ -4296,7 +4304,7 @@ mod tests { // Insert an IP pool range covering the one Nexus IP. 
let nexus_ip = blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .find_map(|(_, zone_config)| { zone_config .zone_type @@ -4516,7 +4524,10 @@ mod tests { fn assert_all_zones_in_service(blueprint: &Blueprint) { let not_in_service = blueprint - .all_omicron_zones(|disposition| !disposition.is_in_service()) + .expunged_zones( + ZoneRunningStatus::Any, + BlueprintExpungedZoneAccessReason::Test, + ) .collect::>(); assert!( not_in_service.is_empty(), diff --git a/nexus/db-queries/src/db/datastore/rack.rs b/nexus/db-queries/src/db/datastore/rack.rs index b51556e4b34..89cfd72f4f5 100644 --- a/nexus/db-queries/src/db/datastore/rack.rs +++ b/nexus/db-queries/src/db/datastore/rack.rs @@ -48,7 +48,6 @@ use nexus_db_model::SledUnderlaySubnetAllocation; use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::OmicronZoneExternalIp; use nexus_types::deployment::blueprint_zone_type; @@ -842,7 +841,7 @@ impl DataStore { // Insert Nexus database access records self.initialize_nexus_access_from_blueprint_on_connection( &conn, - blueprint.all_omicron_zones(BlueprintZoneDisposition::is_in_service) + blueprint.in_service_zones() .filter_map(|(_sled, zone_cfg)| { if zone_cfg.zone_type.is_nexus() { Some(zone_cfg.id) @@ -856,7 +855,7 @@ impl DataStore { })?; // Allocate networking records for all services. - for (_, zone_config) in blueprint.all_omicron_zones(BlueprintZoneDisposition::is_in_service) { + for (_, zone_config) in blueprint.in_service_zones() { self.rack_populate_service_networking_records( &conn, &log, @@ -1068,9 +1067,7 @@ mod test { use nexus_types::deployment::ExternalIpPolicy; use nexus_types::deployment::PendingMgsUpdates; use nexus_types::deployment::SledFilter; - use nexus_types::deployment::{ - BlueprintZoneDisposition, BlueprintZoneImageSource, OximeterReadMode, - }; + use nexus_types::deployment::{BlueprintZoneImageSource, OximeterReadMode}; use nexus_types::external_api::shared::SiloIdentityMode; use nexus_types::identity::Asset; use nexus_types::internal_api::params::DnsRecord; @@ -1503,9 +1500,7 @@ mod test { let mut ntp1_id = None; let mut ntp2_id = None; let mut ntp3_id = None; - for (sled_id, zone) in - blueprint.all_omicron_zones(BlueprintZoneDisposition::is_in_service) - { + for (sled_id, zone) in blueprint.in_service_zones() { match &zone.zone_type { BlueprintZoneType::BoundaryNtp(_) => { let which = if sled_id == sled1.id() { @@ -1750,10 +1745,8 @@ mod test { assert_eq!(observed_blueprint, blueprint); // We should see both of the Nexus services we provisioned. - let mut observed_zones: Vec<_> = observed_blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) - .map(|(_, z)| z) - .collect(); + let mut observed_zones: Vec<_> = + observed_blueprint.in_service_zones().map(|(_, z)| z).collect(); observed_zones.sort_by_key(|z| z.id); assert_eq!(observed_zones.len(), 2); @@ -1778,12 +1771,7 @@ mod test { if let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { external_ip, .. - }) = &blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) - .next() - .unwrap() - .1 - .zone_type + }) = &blueprint.in_service_zones().next().unwrap().1.zone_type { external_ip.ip } else { @@ -1797,12 +1785,7 @@ mod test { if let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { external_ip, .. 
- }) = &blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) - .nth(1) - .unwrap() - .1 - .zone_type + }) = &blueprint.in_service_zones().nth(1).unwrap().1.zone_type { external_ip.ip } else { @@ -1955,10 +1938,8 @@ mod test { assert_eq!(observed_blueprint, blueprint); // We should see the Nexus service we provisioned. - let mut observed_zones: Vec<_> = observed_blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) - .map(|(_, z)| z) - .collect(); + let mut observed_zones: Vec<_> = + observed_blueprint.in_service_zones().map(|(_, z)| z).collect(); observed_zones.sort_by_key(|z| z.id); assert_eq!(observed_zones.len(), 1); @@ -1985,12 +1966,7 @@ mod test { if let BlueprintZoneType::Nexus(blueprint_zone_type::Nexus { external_ip, .. - }) = &blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) - .next() - .unwrap() - .1 - .zone_type + }) = &blueprint.in_service_zones().next().unwrap().1.zone_type { external_ip.ip } else { diff --git a/nexus/db-queries/src/db/datastore/support_bundle.rs b/nexus/db-queries/src/db/datastore/support_bundle.rs index c53bcb04f82..afb0557a44b 100644 --- a/nexus/db-queries/src/db/datastore/support_bundle.rs +++ b/nexus/db-queries/src/db/datastore/support_bundle.rs @@ -21,7 +21,7 @@ use nexus_db_errors::OptionalError; use nexus_db_errors::public_error_from_diesel; use nexus_db_lookup::LookupPath; use nexus_types::deployment::BlueprintDatasetDisposition; -use nexus_types::deployment::BlueprintZoneDisposition; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use omicron_common::api::external; use omicron_common::api::external::CreateResult; use omicron_common::api::external::DataPageParams; @@ -227,14 +227,10 @@ impl DataStore { // For this blueprint: The set of all expunged Nexus zones that are // ready for cleanup let invalid_nexus_zones = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) - .filter_map(|(_sled, zone)| { - if zone.zone_type.is_nexus() { - Some(zone.id.into_untyped_uuid()) - } else { - None - } - }) + .expunged_nexus_zones_ready_for_cleanup( + BlueprintExpungedZoneAccessReason::NexusSupportBundleReassign, + ) + .map(|(_sled, zone, _nexus_config)| zone.id.into_untyped_uuid()) .collect::>(); // For this blueprint: The set of expunged debug datasets @@ -544,6 +540,7 @@ mod test { use nexus_reconfigurator_planning::example::ExampleSystemBuilder; use nexus_reconfigurator_planning::example::SimRngState; use nexus_types::deployment::Blueprint; + use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use omicron_common::api::external::ByteCount; use omicron_common::api::external::LookupType; diff --git a/nexus/reconfigurator/blippy/src/checks.rs b/nexus/reconfigurator/blippy/src/checks.rs index eec34c2a34d..331a6dbbe79 100644 --- a/nexus/reconfigurator/blippy/src/checks.rs +++ b/nexus/reconfigurator/blippy/src/checks.rs @@ -9,11 +9,11 @@ use crate::blippy::Severity; use crate::blippy::SledKind; use nexus_types::deployment::BlueprintDatasetConfig; use nexus_types::deployment::BlueprintDatasetDisposition; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintHostPhase2DesiredContents; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; use nexus_types::deployment::BlueprintSledConfig; use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::BlueprintZoneType; 
use nexus_types::deployment::OmicronZoneExternalIp; @@ -65,10 +65,7 @@ fn check_underlay_ips(blippy: &mut Blippy<'_>) { > = BTreeMap::new(); let mut rack_dns_subnets: BTreeSet = BTreeSet::new(); - for (sled_id, zone) in blippy - .blueprint() - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - { + for (sled_id, zone) in blippy.blueprint().in_service_zones() { let ip = zone.underlay_ip(); // There should be no duplicate underlay IPs. @@ -158,10 +155,8 @@ fn check_external_networking(blippy: &mut Blippy<'_>) { let mut used_nic_ips = BTreeMap::new(); let mut used_nic_macs = BTreeMap::new(); - for (sled_id, zone, external_ip, nic) in blippy - .blueprint() - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter_map(|(sled_id, zone)| { + for (sled_id, zone, external_ip, nic) in + blippy.blueprint().in_service_zones().filter_map(|(sled_id, zone)| { zone.zone_type .external_networking() .map(|(external_ip, nic)| (sled_id, zone, external_ip, nic)) @@ -263,10 +258,7 @@ fn check_dataset_zpool_uniqueness(blippy: &mut Blippy<'_>) { // On any given zpool, we should have at most one zone of any given // kind. - for (sled_id, zone) in blippy - .blueprint() - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - { + for (sled_id, zone) in blippy.blueprint().in_service_zones() { // Check "one kind per zpool" for transient datasets... let filesystem_dataset = zone.filesystem_dataset(); let kind = zone.zone_type.kind(); @@ -689,10 +681,7 @@ fn check_nexus_generation_consistency(blippy: &mut Blippy<'_>) { > = HashMap::new(); // Collect all Nexus zones and their generations - for (sled_id, zone) in blippy - .blueprint() - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - { + for (sled_id, zone) in blippy.blueprint().in_service_zones() { if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { generation_info.entry(nexus.nexus_generation).or_default().push(( sled_id, @@ -1751,7 +1740,7 @@ mod tests { let (_, _, mut blueprint) = example(&logctx.log, TEST_NAME); let crucible_addr_by_zpool = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_, z)| match z.zone_type { BlueprintZoneType::Crucible( blueprint_zone_type::Crucible { address, .. }, @@ -1996,7 +1985,7 @@ mod tests { // Find the Nexus zones let ((sled1, zone1_id), (sled2, zone2_id)) = { let nexus_zones: Vec<_> = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(sled_id, zone)| { if matches!(zone.zone_type, BlueprintZoneType::Nexus(_)) { Some((sled_id, zone)) @@ -2142,9 +2131,9 @@ fn check_planning_input_network_records_appear_in_blueprint( // constructed above in `BuilderExternalNetworking::new()`, we do not // check for duplicates here: we could very well see reuse of IPs // between expunged zones or between expunged -> running zones. 
- for (_, z) in - blippy.blueprint().all_omicron_zones(BlueprintZoneDisposition::any) - { + for (_, z) in blippy.blueprint().all_in_service_and_expunged_zones( + BlueprintExpungedZoneAccessReason::Blippy, + ) { let zone_type = &z.zone_type; match zone_type { BlueprintZoneType::BoundaryNtp(ntp) => { diff --git a/nexus/reconfigurator/execution/src/clickhouse.rs b/nexus/reconfigurator/execution/src/clickhouse.rs index b5ec2beb4c5..defab3197fc 100644 --- a/nexus/reconfigurator/execution/src/clickhouse.rs +++ b/nexus/reconfigurator/execution/src/clickhouse.rs @@ -23,8 +23,8 @@ use futures::stream::FuturesUnordered; use futures::stream::StreamExt; use nexus_db_queries::context::OpContext; use nexus_types::deployment::Blueprint; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintZoneConfig; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::ClickhouseClusterConfig; use omicron_common::address::CLICKHOUSE_ADMIN_PORT; use omicron_uuid_kinds::OmicronZoneUuid; @@ -45,9 +45,9 @@ pub(crate) async fn deploy_nodes( blueprint: &Blueprint, clickhouse_cluster_config: &ClickhouseClusterConfig, ) -> Result<(), Vec> { - // Important: We must continue to pass in `BlueprintZoneDisposition::any` - // here, instead of `BlueprintZoneDisposition::is_in_service`, as would - // be expected. + // Important: We must look at all zones here, including expunged zones in + // both "not ready for cleanup" and "ready for cleanup" states, instead of + // just in-service zones, as would be expected. // // We can only add or remove one clickhouse keeper node at a time, // and the planner generates the `ClickhouseClusterConfig` under this @@ -72,12 +72,11 @@ pub(crate) async fn deploy_nodes( // `ClickhouseClusterConfig`. 
// // This is tracked in https://github.com/oxidecomputer/omicron/issues/7724 - deploy_nodes_impl( - opctx, - blueprint.all_omicron_zones(BlueprintZoneDisposition::any), - clickhouse_cluster_config, - ) - .await + use BlueprintExpungedZoneAccessReason::ClickhouseKeeperServerConfigIps; + let all_zones = blueprint + .all_in_service_and_expunged_zones(ClickhouseKeeperServerConfigIps); + + deploy_nodes_impl(opctx, all_zones, clickhouse_cluster_config).await } async fn deploy_nodes_impl<'a, I>( @@ -235,7 +234,7 @@ pub(crate) async fn deploy_single_node( deploy_single_node_impl( opctx, blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| z.zone_type.is_clickhouse()), ) .await @@ -384,6 +383,7 @@ mod test { use clickhouse_admin_types::KeeperId; use clickhouse_admin_types::ServerId; use nexus_types::deployment::BlueprintZoneConfig; + use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::blueprint_zone_type; diff --git a/nexus/reconfigurator/execution/src/database.rs b/nexus/reconfigurator/execution/src/database.rs index df96d8bab3b..64bb33c3bba 100644 --- a/nexus/reconfigurator/execution/src/database.rs +++ b/nexus/reconfigurator/execution/src/database.rs @@ -8,7 +8,6 @@ use anyhow::anyhow; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; -use nexus_types::deployment::BlueprintZoneDisposition; use omicron_uuid_kinds::OmicronZoneUuid; use std::collections::BTreeSet; @@ -31,7 +30,7 @@ pub(crate) async fn deploy_db_metadata_nexus_records( // The actual generation number that's currently active is necessarily the // generation number of the Nexus instance that's doing the execution. let active_generation = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .find_map(|(_sled_id, zone_cfg, nexus_config)| { (zone_cfg.id == nexus_id).then_some(nexus_config.nexus_generation) }) @@ -45,7 +44,7 @@ pub(crate) async fn deploy_db_metadata_nexus_records( let mut active = BTreeSet::new(); let mut not_yet = BTreeSet::new(); for (_sled_id, zone_config, nexus_config) in - blueprint.all_nexus_zones(BlueprintZoneDisposition::is_in_service) + blueprint.in_service_nexus_zones() { if nexus_config.nexus_generation == active_generation { active.insert(zone_config.id); diff --git a/nexus/reconfigurator/execution/src/dns.rs b/nexus/reconfigurator/execution/src/dns.rs index 202188c2621..693f412bd02 100644 --- a/nexus/reconfigurator/execution/src/dns.rs +++ b/nexus/reconfigurator/execution/src/dns.rs @@ -822,7 +822,7 @@ mod test { // To start, we need a mapping from underlay IP to the corresponding // Omicron zone. let omicron_zones_by_ip: BTreeMap<_, _> = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .map(|(_, zone)| (zone.underlay_ip(), zone.id)) .collect(); println!("omicron zones by IP: {:#?}", omicron_zones_by_ip); @@ -1554,11 +1554,11 @@ mod test { eprintln!("blueprint2: {}", blueprint2.display()); // Figure out the id of the new zone. 
let zones_before = blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) + .in_service_zones() .filter_map(|(_, z)| z.zone_type.is_nexus().then_some(z.id)) .collect::>(); let zones_after = blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::any) + .in_service_zones() .filter_map(|(_, z)| z.zone_type.is_nexus().then_some(z.id)) .collect::>(); let new_zones: Vec<_> = zones_after.difference(&zones_before).collect(); diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs index 58b59d5234c..d0971f41523 100644 --- a/nexus/reconfigurator/execution/src/omicron_zones.rs +++ b/nexus/reconfigurator/execution/src/omicron_zones.rs @@ -18,7 +18,8 @@ use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_db_queries::db::datastore::CollectorReassignment; use nexus_types::deployment::Blueprint; -use nexus_types::deployment::BlueprintZoneDisposition; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; +use nexus_types::deployment::ZoneRunningStatus; use omicron_common::address::COCKROACH_ADMIN_PORT; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::OmicronZoneUuid; @@ -94,8 +95,9 @@ fn clean_up_expunged_nexus_zones<'a>( datastore: &'a DataStore, blueprint: &'a Blueprint, ) -> impl Stream)> + 'a { + use BlueprintExpungedZoneAccessReason::NexusDeleteMetadataRecord; let zones_to_clean_up = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) + .expunged_zones(ZoneRunningStatus::Shutdown, NexusDeleteMetadataRecord) .filter(|(_sled_id, zone)| zone.zone_type.is_nexus()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -118,8 +120,9 @@ fn clean_up_expunged_cockroach_zones<'a, R: CleanupResolver>( resolver: &'a R, blueprint: &'a Blueprint, ) -> impl Stream)> + 'a { + use BlueprintExpungedZoneAccessReason::CockroachDecommission; let zones_to_clean_up = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) + .expunged_zones(ZoneRunningStatus::Shutdown, CockroachDecommission) .filter(|(_sled_id, zone)| zone.zone_type.is_cockroach()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -141,8 +144,12 @@ fn clean_up_expunged_oximeter_zones<'a>( datastore: &'a DataStore, blueprint: &'a Blueprint, ) -> impl Stream)> + 'a { + use BlueprintExpungedZoneAccessReason::OximeterExpungeAndReassignProducers; let zones_to_clean_up = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_ready_for_cleanup) + .expunged_zones( + ZoneRunningStatus::Shutdown, + OximeterExpungeAndReassignProducers, + ) .filter(|(_sled_id, zone)| zone.zone_type.is_oximeter()); stream::iter(zones_to_clean_up).then(move |(sled_id, zone)| async move { @@ -328,8 +335,8 @@ mod test { use httptest::responders::{json_encoded, status_code}; use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::{ - BlueprintZoneConfig, BlueprintZoneImageSource, BlueprintZoneType, - blueprint_zone_type, + BlueprintZoneConfig, BlueprintZoneDisposition, + BlueprintZoneImageSource, BlueprintZoneType, blueprint_zone_type, }; use omicron_common::api::external::Generation; use omicron_common::zpool_name::ZpoolName; diff --git a/nexus/reconfigurator/execution/src/sagas.rs b/nexus/reconfigurator/execution/src/sagas.rs index d973c2c75fe..5660c3a55a8 100644 --- a/nexus/reconfigurator/execution/src/sagas.rs +++ b/nexus/reconfigurator/execution/src/sagas.rs @@ -7,7 +7,7 @@ use nexus_db_model::SecId; use 
nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; -use nexus_types::deployment::{Blueprint, BlueprintZoneDisposition}; +use nexus_types::deployment::{Blueprint, BlueprintExpungedZoneAccessReason}; use omicron_common::api::external::Error; use omicron_uuid_kinds::{GenericUuid, OmicronZoneUuid}; use slog::{debug, info, warn}; @@ -87,7 +87,9 @@ fn find_expunged_same_generation( let active_nexus_generation = blueprint.find_generation_for_self(nexus_zone_id)?; Ok(blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_ready_for_cleanup) + .expunged_nexus_zones_ready_for_cleanup( + BlueprintExpungedZoneAccessReason::NexusSagaReassignment, + ) .filter_map(|(_sled_id, zone_config, nexus_config)| { (nexus_config.nexus_generation == active_nexus_generation) .then_some(zone_config.id) @@ -126,7 +128,7 @@ mod test { ExampleSystemBuilder::new(log, TEST_NAME).nexus_count(4).build(); let g1 = Generation::new(); let g1_nexus_ids: Vec<_> = blueprint1 - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .map(|(sled_id, zone_config, nexus_config)| { assert_eq!(nexus_config.nexus_generation, g1); (sled_id, zone_config.id, zone_config.image_source.clone()) @@ -181,7 +183,7 @@ mod test { let blueprint2 = builder.build(BlueprintSource::Test); let g2_nexus_ids: Vec<_> = blueprint2 - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .filter_map(|(sled_id, zone_config, nexus_config)| { (nexus_config.nexus_generation == g2) .then_some((sled_id, zone_config.id)) diff --git a/nexus/reconfigurator/execution/src/test_utils.rs b/nexus/reconfigurator/execution/src/test_utils.rs index 737a2b16b59..c606d384cb0 100644 --- a/nexus/reconfigurator/execution/src/test_utils.rs +++ b/nexus/reconfigurator/execution/src/test_utils.rs @@ -10,7 +10,7 @@ use internal_dns_resolver::Resolver; use nexus_db_queries::{context::OpContext, db::DataStore}; use nexus_types::{ deployment::{ - Blueprint, BlueprintZoneDisposition, PendingMgsUpdates, + Blueprint, PendingMgsUpdates, execution::{EventBuffer, Overridables}, }, quiesce::SagaQuiesceHandle, @@ -44,7 +44,7 @@ pub(crate) async fn realize_blueprint_and_expect( // Act on behalf of one of the Nexus instances that could be currently // active in the blueprint. let nexus_id = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .find_map(|(_sled_id, zone_config, nexus_config)| { (nexus_config.nexus_generation == blueprint.nexus_generation) .then_some(zone_config.id) diff --git a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs index 217eefd11f2..181b21b9ef3 100644 --- a/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs +++ b/nexus/reconfigurator/planning/src/blueprint_builder/builder.rs @@ -3245,9 +3245,7 @@ pub mod test { // that are already in use by existing zones. Attempting to add a // Nexus with no remaining external IPs should fail. let mut used_ip_ranges = Vec::new(); - for (_, z) in - parent.all_omicron_zones(BlueprintZoneDisposition::any) - { + for (_, z) in parent.in_service_zones() { if let Some((external_ip, _)) = z.zone_type.external_networking() { @@ -3316,9 +3314,7 @@ pub mod test { // Ensure no CRDB zones (currently `ExampleSystemBuilder` never // provisions CRDB; this check makes sure we update our use of it if // that changes). 
- for (_, z) in - parent.all_omicron_zones(BlueprintZoneDisposition::is_in_service) - { + for (_, z) in parent.in_service_zones() { assert!( !z.zone_type.is_cockroach(), "unexpected cockroach zone \ @@ -3361,7 +3357,7 @@ pub mod test { verify_blueprint(&blueprint, &input); assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(sled_id, z)| { *sled_id == target_sled_id && z.zone_type.kind() == ZoneKind::CockroachDb diff --git a/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs index c428ec39c7f..e1f41bbc870 100644 --- a/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs +++ b/nexus/reconfigurator/planning/src/blueprint_editor/allocators/external_networking.rs @@ -86,9 +86,7 @@ impl ExternalNetworkingAllocator { external_ip_policy: &ExternalIpPolicy, ) -> anyhow::Result { Self::new( - blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .map(|(_sled_id, zone)| zone), + blueprint.in_service_zones().map(|(_sled_id, zone)| zone), external_ip_policy, ) } diff --git a/nexus/reconfigurator/planning/src/example.rs b/nexus/reconfigurator/planning/src/example.rs index efd690b2815..f47bc679cb5 100644 --- a/nexus/reconfigurator/planning/src/example.rs +++ b/nexus/reconfigurator/planning/src/example.rs @@ -30,7 +30,6 @@ use nexus_types::deployment::BlueprintArtifactVersion; use nexus_types::deployment::BlueprintHostPhase2DesiredContents; use nexus_types::deployment::BlueprintHostPhase2DesiredSlots; use nexus_types::deployment::BlueprintSource; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::ExpectedVersion; use nexus_types::deployment::OmicronZoneNic; use nexus_types::deployment::PlanningInput; @@ -730,7 +729,7 @@ impl ExampleSystemBuilder { // Find and set the set of active Nexuses let active_nexus_zone_ids: BTreeSet<_> = blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .filter_map(|(_, zone, nexus_zone)| { if nexus_zone.nexus_generation == blueprint.nexus_generation { Some(zone.id) @@ -1018,7 +1017,6 @@ mod tests { use internal_dns_resolver::Resolver; use internal_dns_types::names::ServiceName; use nexus_types::deployment::BlueprintZoneConfig; - use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::execution::blueprint_internal_dns_config; use nexus_types::deployment::execution::overridables; use nexus_types::internal_api::params::DnsConfigParams; @@ -1446,9 +1444,7 @@ mod tests { } ServiceName::Crucible(_) => { // Each Crucible zone should be queryable. 
- for (_, zone) in blueprint.all_omicron_zones( - BlueprintZoneDisposition::is_in_service, - ) { + for (_, zone) in blueprint.in_service_zones() { if zone.kind() == ZoneKind::Crucible { out.insert(ServiceName::Crucible(zone.id), Ok(())); } @@ -1471,7 +1467,7 @@ mod tests { kind: ZoneKind, ) -> Vec<&BlueprintZoneConfig> { blueprint - .all_omicron_zones(BlueprintZoneDisposition::any) + .in_service_zones() .filter_map(|(_, zone)| { (zone.zone_type.kind() == kind).then_some(zone) }) diff --git a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs index b9a06e34100..1d229796d29 100644 --- a/nexus/reconfigurator/planning/tests/integration_tests/planner.rs +++ b/nexus/reconfigurator/planning/tests/integration_tests/planner.rs @@ -16,6 +16,7 @@ use nexus_types::deployment::Blueprint; use nexus_types::deployment::BlueprintArtifactVersion; use nexus_types::deployment::BlueprintDatasetDisposition; use nexus_types::deployment::BlueprintDiffSummary; +use nexus_types::deployment::BlueprintExpungedZoneAccessReason; use nexus_types::deployment::BlueprintPhysicalDiskDisposition; use nexus_types::deployment::BlueprintSource; use nexus_types::deployment::BlueprintZoneConfig; @@ -30,6 +31,7 @@ use nexus_types::deployment::CockroachDbSettings; use nexus_types::deployment::OmicronZoneExternalSnatIp; use nexus_types::deployment::SledDisk; use nexus_types::deployment::TargetReleaseDescription; +use nexus_types::deployment::ZoneRunningStatus; use nexus_types::deployment::blueprint_zone_type; use nexus_types::deployment::blueprint_zone_type::InternalDns; use nexus_types::external_api::views::PhysicalDiskPolicy; @@ -116,7 +118,7 @@ fn get_nexus_ids_at_generation( generation: Generation, ) -> BTreeSet { blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_, z)| match &z.zone_type { BlueprintZoneType::Nexus(nexus_zone) if nexus_zone.nexus_generation == generation => @@ -502,7 +504,7 @@ fn test_spread_internal_dns_zones_across_sleds() { // Expunge two of the internal DNS zones; the planner should put new // zones back in their places. let zones_to_expunge = blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(sled_id, zone)| { zone.zone_type.is_internal_dns().then_some((sled_id, zone.id)) }) @@ -760,7 +762,7 @@ fn test_reuse_external_dns_ips_from_expunged_zones() { assert_eq!( blueprint1a - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, zone)| zone.zone_type.is_external_dns()) .count(), 3, @@ -775,7 +777,7 @@ fn test_reuse_external_dns_ips_from_expunged_zones() { // This blueprint should have three external DNS zones. assert_eq!( blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, zone)| zone.zone_type.is_external_dns()) .count(), 3, @@ -810,7 +812,7 @@ fn test_reuse_external_dns_ips_from_expunged_zones() { // The IP addresses of the new external DNS zones should be the // same as the original set that we set up. let mut ips = blueprint3 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_id, zone)| { zone.zone_type .is_external_dns() @@ -1341,9 +1343,7 @@ fn test_disk_expungement_removes_zones_transient_filesystem() { // Find all the zones using this same zpool. 
let mut zones_on_pool = BTreeSet::new(); let mut zone_kinds_on_pool = BTreeMap::<_, usize>::new(); - for (_, zone_config) in - blueprint1.all_omicron_zones(BlueprintZoneDisposition::is_in_service) - { + for (_, zone_config) in blueprint1.in_service_zones() { if pool_to_expunge == zone_config.filesystem_pool { zones_on_pool.insert(zone_config.id); *zone_kinds_on_pool @@ -1959,7 +1959,7 @@ fn test_crucible_pantry() { // We should start with CRUCIBLE_PANTRY_REDUNDANCY pantries spread out // to at most 1 per sled. Find one of the sleds running one. let pantry_sleds = blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(sled_id, zone)| { zone.zone_type.is_crucible_pantry().then_some(sled_id) }) @@ -1981,7 +1981,7 @@ fn test_crucible_pantry() { println!("1 -> 2 (expunged sled):\n{}", diff.display()); assert_eq!( blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(sled_id, zone)| *sled_id != expunged_sled_id && zone.zone_type.is_crucible_pantry()) .count(), @@ -2013,7 +2013,7 @@ fn test_single_node_clickhouse() { // We should start with one ClickHouse zone. Find out which sled it's on. let clickhouse_sleds = blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::any) + .in_service_zones() .filter_map(|(sled, zone)| { zone.zone_type.is_clickhouse().then(|| Some(sled)) }) @@ -2034,7 +2034,7 @@ fn test_single_node_clickhouse() { println!("1 -> 2 (expunged sled):\n{}", diff.display()); assert_eq!( blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(sled, zone)| *sled != clickhouse_sled && zone.zone_type.is_clickhouse()) .count(), @@ -2086,10 +2086,8 @@ fn test_plan_deploy_all_clickhouse_cluster_nodes() { ); // We should see zones for 3 clickhouse keepers, and 2 servers created - let active_zones: Vec<_> = blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .map(|(_, z)| z.clone()) - .collect(); + let active_zones: Vec<_> = + blueprint2.in_service_zones().map(|(_, z)| z.clone()).collect(); let keeper_zone_ids: BTreeSet<_> = active_zones .iter() @@ -2213,10 +2211,8 @@ fn test_plan_deploy_all_clickhouse_cluster_nodes() { &diff.display().to_string(), ); - let active_zones: Vec<_> = blueprint5 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .map(|(_, z)| z.clone()) - .collect(); + let active_zones: Vec<_> = + blueprint5.in_service_zones().map(|(_, z)| z.clone()).collect(); let new_keeper_zone_ids: BTreeSet<_> = active_zones .iter() @@ -2348,10 +2344,8 @@ fn test_expunge_clickhouse_clusters() { let blueprint2 = sim.run_planner().expect("planning succeeded"); // We should see zones for 3 clickhouse keepers, and 2 servers created - let active_zones: Vec<_> = blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .map(|(_, z)| z.clone()) - .collect(); + let active_zones: Vec<_> = + blueprint2.in_service_zones().map(|(_, z)| z.clone()).collect(); let keeper_zone_ids: BTreeSet<_> = active_zones .iter() @@ -2422,7 +2416,7 @@ fn test_expunge_clickhouse_clusters() { // Find the sled containing one of the keeper zones and expunge it let (sled_id, bp_zone_config) = blueprint3 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .find(|(_, z)| z.zone_type.is_clickhouse_keeper()) .unwrap(); @@ -2546,10 +2540,8 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { let blueprint2 = sim.run_planner().expect("planning succeeded"); // We should see zones for 3 
clickhouse keepers, and 2 servers created - let active_zones: Vec<_> = blueprint2 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .map(|(_, z)| z.clone()) - .collect(); + let active_zones: Vec<_> = + blueprint2.in_service_zones().map(|(_, z)| z.clone()).collect(); let keeper_zone_ids: BTreeSet<_> = active_zones .iter() @@ -2580,9 +2572,13 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { ); let blueprint3 = sim.run_planner().expect("planning succeeded"); - // We should have expunged our single-node clickhouse zone + // We should have expunged our single-node clickhouse zone, but it won't be + // `ready_for_cleanup` yet. let expunged_zones: Vec<_> = blueprint3 - .all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .expunged_zones( + ZoneRunningStatus::MaybeRunning, + BlueprintExpungedZoneAccessReason::Test, + ) .map(|(_, z)| z.clone()) .collect(); @@ -2605,7 +2601,10 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { // All our clickhouse keeper and server zones that we created when we // enabled our clickhouse policy should be expunged when we disable it. let expunged_zones: Vec<_> = blueprint4 - .all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .expunged_zones( + ZoneRunningStatus::MaybeRunning, + BlueprintExpungedZoneAccessReason::Test, + ) .map(|(_, z)| z.clone()) .collect(); @@ -2627,7 +2626,7 @@ fn test_expunge_clickhouse_zones_after_policy_is_changed() { assert_eq!( 1, blueprint4 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| z.zone_type.is_clickhouse()) .count() ); @@ -3113,17 +3112,13 @@ fn test_update_crucible_pantry_before_nexus() { // All zones should be sourced from the initial 0.0.1 target release by // default. - assert!( - blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .all(|(_, z)| matches!( - &z.image_source, - BlueprintZoneImageSource::Artifact { version, hash: _ } - if version == &BlueprintArtifactVersion::Available { - version: ArtifactVersion::new_const("0.0.1") - } - )) - ); + assert!(blueprint1.in_service_zones().all(|(_, z)| matches!( + &z.image_source, + BlueprintZoneImageSource::Artifact { version, hash: _ } + if version == &BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("0.0.1") + } + ))); // Manually specify a TUF repo with fake zone images. let version = ArtifactVersion::new_static("1.0.0-freeform") @@ -3296,16 +3291,19 @@ fn test_update_crucible_pantry_before_nexus() { // All Crucible Pantries should now be updated. assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_up_to_date_pantry(z)) .count(), CRUCIBLE_PANTRY_REDUNDANCY ); - // All old Pantry zones should now be expunged. + // All old Pantry zones should now be expunged and ready for cleanup. 
assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_expunged) + .expunged_zones( + ZoneRunningStatus::Shutdown, + BlueprintExpungedZoneAccessReason::Test + ) .filter(|(_, z)| is_old_pantry(z)) .count(), CRUCIBLE_PANTRY_REDUNDANCY @@ -3415,16 +3413,13 @@ fn test_update_crucible_pantry_before_nexus() { let blueprint12 = parent; assert_eq!( blueprint12 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_up_to_date_nexus(z)) .count(), NEXUS_REDUNDANCY, ); assert_eq!( - blueprint12 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .filter(|(_, z)| is_old_nexus(z)) - .count(), + blueprint12.in_service_zones().filter(|(_, z)| is_old_nexus(z)).count(), 0, ); @@ -3483,17 +3478,13 @@ fn test_update_cockroach() { // All zones should be sourced from the initial 0.0.1 target release by // default. eprintln!("{}", blueprint.display()); - assert!( - blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .all(|(_, z)| matches!( - &z.image_source, - BlueprintZoneImageSource::Artifact { version, hash: _ } - if version == &BlueprintArtifactVersion::Available { - version: ArtifactVersion::new_const("0.0.1") - } - )) - ); + assert!(blueprint.in_service_zones().all(|(_, z)| matches!( + &z.image_source, + BlueprintZoneImageSource::Artifact { version, hash: _ } + if version == &BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("0.0.1") + } + ))); // This test "starts" here -- we specify a new TUF repo with an updated // CockroachDB image. We create a new TUF repo where version of @@ -3658,14 +3649,14 @@ fn test_update_cockroach() { assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_old_cockroach(z)) .count(), COCKROACHDB_REDUNDANCY - i ); assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_up_to_date_cockroach(z)) .count(), i @@ -3857,17 +3848,13 @@ fn test_update_boundary_ntp() { ); // All zones should be sourced from the 0.0.1 repo by default. - assert!( - blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .all(|(_, z)| matches!( - &z.image_source, - BlueprintZoneImageSource::Artifact { version, hash: _ } - if version == &BlueprintArtifactVersion::Available { - version: ArtifactVersion::new_const("0.0.1") - } - )) - ); + assert!(blueprint.in_service_zones().all(|(_, z)| matches!( + &z.image_source, + BlueprintZoneImageSource::Artifact { version, hash: _ } + if version == &BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("0.0.1") + } + ))); // This test "starts" here -- we specify a new TUF repo with an updated // Boundary NTP image. 
We create a new TUF repo where version of @@ -3939,7 +3926,7 @@ fn test_update_boundary_ntp() { }; let old_boundary_ntp_count = |blueprint: &Blueprint| -> usize { blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_old_boundary_ntp(z)) .count() }; @@ -3948,7 +3935,7 @@ fn test_update_boundary_ntp() { }; let up_to_date_boundary_ntp_count = |blueprint: &Blueprint| -> usize { blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_up_to_date_boundary_ntp(z)) .count() }; @@ -4251,17 +4238,13 @@ fn test_update_internal_dns() { let blueprint = sim.assert_latest_blueprint_is_blippy_clean(); // All zones should be sourced from the initial TUF repo by default. - assert!( - blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .all(|(_, z)| matches!( - &z.image_source, - BlueprintZoneImageSource::Artifact { version, hash: _ } - if version == &BlueprintArtifactVersion::Available { - version: ArtifactVersion::new_const("0.0.1") - } - )) - ); + assert!(blueprint.in_service_zones().all(|(_, z)| matches!( + &z.image_source, + BlueprintZoneImageSource::Artifact { version, hash: _ } + if version == &BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("0.0.1") + } + ))); // This test "starts" here -- we specify a new TUF repo with an updated // Internal DNS image. We create a new TUF repo where version of @@ -4452,14 +4435,14 @@ fn test_update_internal_dns() { assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_old_internal_dns(z)) .count(), INTERNAL_DNS_REDUNDANCY - i ); assert_eq!( blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| is_up_to_date_internal_dns(z)) .count(), i @@ -4507,17 +4490,13 @@ fn test_update_all_zones() { let blueprint1 = sim.assert_latest_blueprint_is_blippy_clean(); // All zones should be sourced from the 0.0.1 repo by default. - assert!( - blueprint1 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .all(|(_, z)| matches!( - &z.image_source, - BlueprintZoneImageSource::Artifact { version, hash: _ } - if version == &BlueprintArtifactVersion::Available { - version: ArtifactVersion::new_const("0.0.1") - } - )) - ); + assert!(blueprint1.in_service_zones().all(|(_, z)| matches!( + &z.image_source, + BlueprintZoneImageSource::Artifact { version, hash: _ } + if version == &BlueprintArtifactVersion::Available { + version: ArtifactVersion::new_const("0.0.1") + } + ))); // Manually specify a TUF repo with fake images for all zones. // Only the name and kind of the artifacts matter. @@ -4581,9 +4560,7 @@ fn test_update_all_zones() { { assert!( blueprint - .all_omicron_zones( - BlueprintZoneDisposition::is_in_service - ) + .in_service_zones() .all(|(_, zone)| zone.image_source == image_source), "failed to update all zones" ); diff --git a/nexus/src/app/background/tasks/blueprint_execution.rs b/nexus/src/app/background/tasks/blueprint_execution.rs index be6c31a889c..c77784ec299 100644 --- a/nexus/src/app/background/tasks/blueprint_execution.rs +++ b/nexus/src/app/background/tasks/blueprint_execution.rs @@ -520,9 +520,7 @@ mod test { .await; // Insert records for the zpools backing the datasets in these zones. 
- for (sled_id, config) in - loaded.blueprint.all_omicron_zones(BlueprintZoneDisposition::any) - { + for (sled_id, config) in loaded.blueprint.in_service_zones() { let Some(dataset) = config.zone_type.durable_dataset() else { continue; }; diff --git a/nexus/src/app/background/tasks/crdb_node_id_collector.rs b/nexus/src/app/background/tasks/crdb_node_id_collector.rs index 529c38b7d0b..c5957dc4d46 100644 --- a/nexus/src/app/background/tasks/crdb_node_id_collector.rs +++ b/nexus/src/app/background/tasks/crdb_node_id_collector.rs @@ -34,7 +34,6 @@ use futures::stream; use nexus_auth::context::OpContext; use nexus_db_queries::db::DataStore; use nexus_types::deployment::Blueprint; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::blueprint_zone_type; use omicron_common::address::COCKROACH_ADMIN_PORT; @@ -143,23 +142,21 @@ impl CockroachAdminFromBlueprint for CockroachAdminFromBlueprintViaFixedPort { &'a self, blueprint: &'a Blueprint, ) -> impl Iterator + 'a { - // We can only actively collect from zones that should be running; if + // We can only actively collect from zones that should be in-service; if // there are CRDB zones in other states that still need their node ID // collected, we have to wait until they're running. - let zone_filter = BlueprintZoneDisposition::is_in_service; - - blueprint.all_omicron_zones(zone_filter).filter_map( - |(_sled_id, zone)| match &zone.zone_type { - BlueprintZoneType::CockroachDb( - blueprint_zone_type::CockroachDb { address, .. }, - ) => { - let mut admin_addr = *address; - admin_addr.set_port(COCKROACH_ADMIN_PORT); - Some((zone.id, admin_addr)) - } - _ => None, - }, - ) + blueprint.in_service_zones().filter_map(|(_sled_id, zone)| match &zone + .zone_type + { + BlueprintZoneType::CockroachDb( + blueprint_zone_type::CockroachDb { address, .. }, + ) => { + let mut admin_addr = *address; + admin_addr.set_port(COCKROACH_ADMIN_PORT); + Some((zone.id, admin_addr)) + } + _ => None, + }) } } @@ -269,7 +266,7 @@ mod tests { // `ExampleSystemBuilder` doesn't place any cockroach nodes; assert so // we bail out early if that changes. let ncockroach = bp0 - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter(|(_, z)| z.zone_type.is_cockroach()) .count(); assert_eq!(ncockroach, 0); diff --git a/nexus/src/app/deployment.rs b/nexus/src/app/deployment.rs index 999ccba3126..b4ea1d6f4cf 100644 --- a/nexus/src/app/deployment.rs +++ b/nexus/src/app/deployment.rs @@ -15,7 +15,6 @@ use nexus_types::deployment::BlueprintHostPhase2DesiredContents; use nexus_types::deployment::BlueprintMetadata; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::BlueprintTargetSet; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneImageSource; use nexus_types::deployment::PlannerConfig; use nexus_types::deployment::PlanningInput; @@ -367,9 +366,7 @@ fn is_target_release_change_allowed( } // Now check zone configs. - for (_, zone_config) in current_blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - { + for (_, zone_config) in current_blueprint.in_service_zones() { match &zone_config.image_source { BlueprintZoneImageSource::InstallDataset => { // A mupdate has occurred; we must allow a new target release. 
@@ -418,6 +415,7 @@ mod tests { use super::*; use nexus_reconfigurator_planning::example::example; use nexus_types::deployment::BlueprintHostPhase2DesiredSlots; + use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::external_api::views::SledState; use omicron_common::api::external::Generation; use omicron_test_utils::dev::test_setup_log; diff --git a/nexus/src/app/quiesce.rs b/nexus/src/app/quiesce.rs index 623f9ab9d36..465729bfac6 100644 --- a/nexus/src/app/quiesce.rs +++ b/nexus/src/app/quiesce.rs @@ -12,7 +12,6 @@ use nexus_db_model::DbMetadataNexusState; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::internal_api::views::QuiesceState; use nexus_types::internal_api::views::QuiesceStatus; @@ -351,9 +350,8 @@ async fn check_all_sagas_drained( // // This doesn't ever change once we've determined it once. But we don't // know what the value is until we see our first blueprint. - let Some(my_generation) = current_blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) - .find_map(|(_sled_id, zone)| { + let Some(my_generation) = + current_blueprint.in_service_zones().find_map(|(_sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { (zone.id == my_nexus_id).then_some(nexus.nexus_generation) } else { @@ -441,7 +439,7 @@ async fn check_all_sagas_drained( // were all drained up to the same point, which is good enough for us to // proceed, too. let our_gen_nexus_ids: BTreeSet = current_blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus) = &zone.zone_type { (nexus.nexus_generation == my_generation).then_some(zone.id) @@ -522,7 +520,6 @@ mod test { use nexus_test_utils_macros::nexus_test; use nexus_types::deployment::BlueprintTarget; use nexus_types::deployment::BlueprintTargetSet; - use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::quiesce::SagaReassignmentDone; use omicron_test_utils::dev::poll::CondCheckError; use omicron_test_utils::dev::poll::wait_for_condition; @@ -854,7 +851,7 @@ mod test { "test_quiesce_multi", ); let nexus_ids = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_sled_id, z)| z.zone_type.is_nexus().then_some(z.id)) .collect::>(); // The example system creates three sleds. Each sled gets a Nexus zone. diff --git a/nexus/src/lib.rs b/nexus/src/lib.rs index f3e65e5caee..aa6e3cb484b 100644 --- a/nexus/src/lib.rs +++ b/nexus/src/lib.rs @@ -29,7 +29,6 @@ use nexus_config::NexusConfig; use nexus_db_model::RendezvousDebugDataset; use nexus_db_queries::db; use nexus_types::deployment::Blueprint; -use nexus_types::deployment::BlueprintZoneDisposition; use nexus_types::deployment::BlueprintZoneType; use nexus_types::deployment::blueprint_zone_type; use nexus_types::internal_api::params::{ @@ -358,7 +357,7 @@ impl nexus_test_interface::NexusServer for Server { // system services. But here, we fake up IP pool ranges based on the // external addresses of services that we start or mock. let internal_services_ip_pool_ranges = blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_, zc)| match &zc.zone_type { BlueprintZoneType::BoundaryNtp( blueprint_zone_type::BoundaryNtp { external_ip, .. 
}, diff --git a/nexus/types/src/deployment.rs b/nexus/types/src/deployment.rs index 080b1f3c883..2bc3a6eb540 100644 --- a/nexus/types/src/deployment.rs +++ b/nexus/types/src/deployment.rs @@ -307,9 +307,97 @@ impl Blueprint { } } + /// Iterate over the in-service [`BlueprintZoneConfig`] instances in the + /// blueprint, along with the associated sled id. + pub fn in_service_zones( + &self, + ) -> impl Iterator { + // Danger note: this call has no danger of accessing expunged zones, + // because we're filtering to in-service. + self.danger_all_omicron_zones(BlueprintZoneDisposition::is_in_service) + } + + /// Iterate over the expunged [`BlueprintZoneConfig`] instances in the + /// blueprint, along with the associated sled id. + /// + /// Each call must specify whether they want zones that are + /// `ready_for_cleanup` (i.e., have been confirmed to be shut down and will + /// not be restarted), and must also specify a + /// [`BlueprintExpungedZoneAccessReason`]. The latter allows us to + /// statically track all uses of expunged zones, each of which we must + /// account for in the planner's logic to permanently prune expunged zones + /// from the blueprint. + pub fn expunged_zones( + &self, + ready_for_cleanup: ZoneRunningStatus, + _reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + // Danger note: this call will definitely access expunged zones, but we + // know the caller has provided a known reason to do so. + self.danger_all_omicron_zones(move |disposition| { + let this_zone_ready_for_cleanup = match disposition { + BlueprintZoneDisposition::InService => return false, + BlueprintZoneDisposition::Expunged { + as_of_generation: _, + ready_for_cleanup: this_zone_ready_for_cleanup, + } => this_zone_ready_for_cleanup, + }; + match ready_for_cleanup { + ZoneRunningStatus::Shutdown => this_zone_ready_for_cleanup, + ZoneRunningStatus::MaybeRunning => !this_zone_ready_for_cleanup, + ZoneRunningStatus::Any => true, + } + }) + } + + /// Iterate over all zones in the blueprint, regardless of whether they're + /// in-service or expunged. + /// + /// Like [`Self::expunged_zones()`], callers are required to specify a + /// reason to access expunged zones. + /// + /// The set of zones returned by this method is equivalent to the set of + /// zones returned by chaining together calls to `Self::in_service_zones()` + /// and `Self::expunged_zones(ZoneRunningStatus::Any, reason)`, but only + /// iterates over the zones once. + pub fn all_in_service_and_expunged_zones( + &self, + _reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator { + // Danger note: this call will definitely access expunged zones, but we + // know the caller has provided a known reason to do so. + self.danger_all_omicron_zones(BlueprintZoneDisposition::any) + } + + /// Iterate over all zones in the blueprint, returning any that could be + /// running: either they're in-service, or they're expunged but have not yet + /// been confirmed shut down. + /// + /// The set of zones returned by this method is equivalent to the set of + /// zones returned by chaining together calls to `Self::in_service_zones()` + /// and `Self::expunged_zones(ZoneRunningStatus::MaybeRunning, reason)`, but + /// only iterates over the zones once and does not require a `reason`. + pub fn all_maybe_running_zones( + &self, + ) -> impl Iterator { + // Danger note: this call will definitely access expunged zones, but + // only those that are not yet `ready_for_cleanup`. 
The planner's + // pruning only acts on `ready_for_cleanup` zones, so we don't need to + // track accesses of this kind of expunged zone. + self.danger_all_omicron_zones( + BlueprintZoneDisposition::could_be_running, + ) + } + /// Iterate over the [`BlueprintZoneConfig`] instances in the blueprint /// that match the provided filter, along with the associated sled id. - pub fn all_omicron_zones( + /// + /// This method is prefixed with `danger_` and is private because it allows + /// the caller to potentially act on expunged zones without providing a + /// reason for doing so. It should only be called by `in_service_zones()` + /// and the helper methods that require callers to specify a + /// [`BlueprintExpungedZoneAccessReason`] defined above. + fn danger_all_omicron_zones( &self, mut filter: F, ) -> impl Iterator @@ -324,17 +412,13 @@ impl Blueprint { .filter(move |(_, z)| filter(z.disposition)) } - /// Iterate over all Nexus zones that match the provided filter. - pub fn all_nexus_zones( + /// Iterate over all in-service Nexus zones. + pub fn in_service_nexus_zones( &self, - filter: F, ) -> impl Iterator< Item = (SledUuid, &BlueprintZoneConfig, &blueprint_zone_type::Nexus), - > - where - F: FnMut(BlueprintZoneDisposition) -> bool, - { - self.all_omicron_zones(filter).filter_map(|(sled_id, zone)| { + > { + self.in_service_zones().filter_map(|(sled_id, zone)| { if let BlueprintZoneType::Nexus(nexus_config) = &zone.zone_type { Some((sled_id, zone, nexus_config)) } else { @@ -343,6 +427,26 @@ impl Blueprint { }) } + /// Iterate over all expunged Nexus zones that are ready for cleanup (i.e., + /// have been confirmed to be shut down and will not restart). + pub fn expunged_nexus_zones_ready_for_cleanup( + &self, + reason: BlueprintExpungedZoneAccessReason, + ) -> impl Iterator< + Item = (SledUuid, &BlueprintZoneConfig, &blueprint_zone_type::Nexus), + > { + self.expunged_zones(ZoneRunningStatus::Shutdown, reason).filter_map( + |(sled_id, zone)| { + if let BlueprintZoneType::Nexus(nexus_config) = &zone.zone_type + { + Some((sled_id, zone, nexus_config)) + } else { + None + } + }, + ) + } + /// Iterate over the [`BlueprintPhysicalDiskConfig`] instances in the /// blueprint that match the provided filter, along with the associated /// sled id. 
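The preceding hunk replaces the filter-taking `all_nexus_zones()` with purpose-named accessors. Because the expunged-zone accessor threads a `BlueprintExpungedZoneAccessReason` through, a caller has to name why it is looking at expunged Nexus zones. Below is a minimal sketch of the intended call shape, assuming the `nexus_types` crate context above; the helper `nexus_ids_needing_saga_reassignment` is hypothetical, not an actual call site from this change:

```rust
use nexus_types::deployment::{Blueprint, BlueprintExpungedZoneAccessReason};
use omicron_uuid_kinds::OmicronZoneUuid;

/// Hypothetical helper: IDs of expunged, fully shut down Nexus zones whose
/// sagas may still need to be reassigned to an in-service Nexus.
fn nexus_ids_needing_saga_reassignment(
    blueprint: &Blueprint,
) -> Vec<OmicronZoneUuid> {
    blueprint
        // The reason is a static breadcrumb: the planner must account for
        // every variant of the enum before it prunes a zone for good.
        .expunged_nexus_zones_ready_for_cleanup(
            BlueprintExpungedZoneAccessReason::NexusSagaReassignment,
        )
        .map(|(_sled_id, zone, _nexus_config)| zone.id)
        .collect()
}
```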
@@ -411,14 +515,17 @@ impl Blueprint { BlueprintDisplay { blueprint: self } } - /// Returns whether the given Nexus instance should be quiescing or quiesced - /// in preparation for handoff to the next generation + /// Returns whether the Nexus instance `nexus_id`, which is assumed to refer + /// to the currently-running Nexus instance (the current process), should be + /// quiescing or quiesced in preparation for handoff to the next generation pub fn is_nexus_quiescing( &self, nexus_id: OmicronZoneUuid, ) -> Result { let zone = self - .all_omicron_zones(|_z| true) + .all_in_service_and_expunged_zones( + BlueprintExpungedZoneAccessReason::NexusSelfIsQuiescing, + ) .find(|(_sled_id, zone_config)| zone_config.id == nexus_id) .ok_or_else(|| { anyhow!("zone {} does not exist in blueprint", nexus_id) @@ -443,9 +550,7 @@ impl Blueprint { nexus_zones: &BTreeSet, ) -> Result, anyhow::Error> { let mut r#gen = None; - for (_, zone, nexus_zone) in - self.all_nexus_zones(BlueprintZoneDisposition::is_in_service) - { + for (_, zone, nexus_zone) in self.in_service_nexus_zones() { if nexus_zones.contains(&zone.id) { let found_gen = nexus_zone.nexus_generation; if let Some(r#gen) = r#gen { @@ -467,11 +572,13 @@ impl Blueprint { &self, nexus_id: OmicronZoneUuid, ) -> Result { - for (_sled_id, zone_config, nexus_config) in - self.all_nexus_zones(BlueprintZoneDisposition::could_be_running) - { - if zone_config.id == nexus_id { - return Ok(nexus_config.nexus_generation); + for (_sled_id, zone_config) in self.all_maybe_running_zones() { + if let BlueprintZoneType::Nexus(nexus_config) = + &zone_config.zone_type + { + if zone_config.id == nexus_id { + return Ok(nexus_config.nexus_generation); + } } } @@ -500,18 +607,19 @@ impl Blueprint { // zones. (Real racks will always have at least one in-service boundary // NTP zone, but some test or test systems may have 0 if they have only // a single sled and that sled's boundary NTP zone is being upgraded.) - self.all_omicron_zones(BlueprintZoneDisposition::any).find_map( - |(_sled_id, zone)| match &zone.zone_type { - BlueprintZoneType::BoundaryNtp(ntp_config) => { - Some(UpstreamNtpConfig { - ntp_servers: &ntp_config.ntp_servers, - dns_servers: &ntp_config.dns_servers, - domain: ntp_config.domain.as_deref(), - }) - } - _ => None, - }, + self.all_in_service_and_expunged_zones( + BlueprintExpungedZoneAccessReason::BoundaryNtpUpstreamConfig, ) + .find_map(|(_sled_id, zone)| match &zone.zone_type { + BlueprintZoneType::BoundaryNtp(ntp_config) => { + Some(UpstreamNtpConfig { + ntp_servers: &ntp_config.ntp_servers, + dns_servers: &ntp_config.dns_servers, + domain: ntp_config.domain.as_deref(), + }) + } + _ => None, + }) } /// Return the operator-specified configuration of Nexus. @@ -530,18 +638,18 @@ impl Blueprint { // zones. (Real racks will always have at least one in-service Nexus // zone - the one calling this code - but some tests create blueprints // without any.) 
- self.all_omicron_zones(BlueprintZoneDisposition::any).find_map(
- |(_sled_id, zone)| match &zone.zone_type {
- BlueprintZoneType::Nexus(nexus_config) => {
- Some(OperatorNexusConfig {
- external_tls: nexus_config.external_tls,
- external_dns_servers: &nexus_config
- .external_dns_servers,
- })
- }
- _ => None,
- },
+ self.all_in_service_and_expunged_zones(
+ BlueprintExpungedZoneAccessReason::NexusExternalConfig, )
+ .find_map(|(_sled_id, zone)| match &zone.zone_type {
+ BlueprintZoneType::Nexus(nexus_config) => {
+ Some(OperatorNexusConfig {
+ external_tls: nexus_config.external_tls,
+ external_dns_servers: &nexus_config.external_dns_servers,
+ })
+ }
+ _ => None,
+ }) } /// Returns the complete set of external IP addresses assigned to external
@@ -581,14 +689,16 @@ impl Blueprint { // don't check for that here. That would be an illegal blueprint; blippy // would complain, and attempting to execute it would fail due to // database constraints on external IP uniqueness.
- self.all_omicron_zones(BlueprintZoneDisposition::any)
- .filter_map(|(_id, zone)| match &zone.zone_type {
- BlueprintZoneType::ExternalDns(dns) => {
- Some(dns.dns_address.addr.ip())
- }
- _ => None,
- })
- .collect()
+ self.all_in_service_and_expunged_zones(
+ BlueprintExpungedZoneAccessReason::ExternalDnsExternalIps,
+ )
+ .filter_map(|(_id, zone)| match &zone.zone_type {
+ BlueprintZoneType::ExternalDns(dns) => {
+ Some(dns.dns_address.addr.ip())
+ }
+ _ => None,
+ })
+ .collect() } }
@@ -605,6 +715,155 @@ pub struct OperatorNexusConfig<'a> { pub external_dns_servers: &'a [IpAddr], }
+/// `ZoneRunningStatus` is an argument to [`Blueprint::expunged_zones()`]
+/// allowing the caller to specify whether they want to operate on zones that
+/// are shut down, zones that could still be running, or both.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub enum ZoneRunningStatus {
+ /// Only return zones that are guaranteed to be shut down and will not be
+ /// restarted.
+ ///
+ /// This corresponds to `ready_for_cleanup: true` in a zone's
+ /// [`BlueprintZoneDisposition::Expunged`] state.
+ Shutdown,
+ /// Only return expunged zones that could still be running. This is
+ /// inherently racy. Zones in this state may already be stopped, and if they
+ /// aren't they are likely (but not guaranteed) to be stopped soon.
+ ///
+ /// This corresponds to `ready_for_cleanup: false` in a zone's
+ /// [`BlueprintZoneDisposition::Expunged`] state.
+ MaybeRunning,
+ /// Return expunged zones regardless of whether they're shut down or could
+ /// still be running.
+ ///
+ /// This corresponds to ignoring the `ready_for_cleanup` field of a zone's
+ /// [`BlueprintZoneDisposition::Expunged`] state.
+ Any,
+}
+
+/// Argument to [`Blueprint::expunged_zones()`] requiring the caller to specify
+/// the reason they want to access a blueprint's expunged zones.
+///
+/// The planner is responsible for permanently pruning expunged zones from the
+/// blueprint. However, doing so correctly requires waiting for any cleanup
+/// actions that need to happen, all of which are specific to the zone kind.
+/// For example, blueprint execution checks for expunged Oximeter zones to
+/// perform metric producer reassignment, which means the planner cannot prune
+/// an expunged Oximeter zone until it knows that all possible such
+/// reassignments are complete.
+///
+/// These reasons are not used at runtime; they exist solely to document and
+/// attempt to statically guard against new code adding a new call to
+/// `expunged_zones()` that the planner doesn't know about (and therefore
+/// doesn't get updated to account for!). If you are attempting to call
+/// `expunged_zones()` for a new reason, you must:
+///
+/// 1. Add a new variant to this enum.
+/// 2. Update the planner to account for it, so that it does not prune the
+/// zone before your new use of it has completed.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub enum BlueprintExpungedZoneAccessReason {
+ // --------------------------------------------------------------------
+ // Zone-kind-specific variants. Keep this sorted alphabetically, prefix
+ // them by the zone kind if applicable, and add details explaining the
+ // conditions the planner must consider during pruning.
+ // --------------------------------------------------------------------
+ /// Carrying forward the upstream NTP configuration provided by the operator
+ /// during rack setup; see [`Blueprint::upstream_ntp_config()`].
+ ///
+ /// The planner must not prune a boundary NTP zone if it's the last
+ /// remaining zone carrying this configuration.
+ BoundaryNtpUpstreamConfig,
+
+ /// Multinode Clickhouse configuration changes must operate one node at a
+ /// time, but the [`ClickhouseClusterConfig`] does not currently include the
+ /// IP addresses of all nodes. Blueprint execution therefore must scan
+ /// expunged nodes to find IP addresses.
+ ///
+ /// The planner must not prune a Clickhouse keeper or server zone if its
+ /// zone ID is contained in the current [`ClickhouseClusterConfig`].
+ ClickhouseKeeperServerConfigIps,
+
+ /// The cockroachdb cluster should be instructed to decommission any
+ /// expunged cockroach nodes.
+ ///
+ /// The planner must not prune a CRDB zone if it has not yet been
+ /// decommissioned.
+ CockroachDecommission,
+
+ /// This is a catch-all variant for updating the `external_ip` and
+ /// `network_interface` tables for all zone types that have external
+ /// networking (Nexus, boundary NTP, and external DNS).
+ ///
+ /// The planner must not prune any of these zone types if their external
+ /// networking bits are still referenced by the active CRDB tables.
+ DeallocateExternalNetworkingResources,
+
+ /// Carrying forward the external DNS external IPs provided by the operator
+ /// during rack setup; see [`Blueprint::all_external_dns_external_ips()`].
+ ///
+ /// The planner must not prune an external DNS zone if it's the last
+ /// remaining zone carrying these IPs.
+ ExternalDnsExternalIps,
+
+ /// After handoff, expunged Nexus zones must have their database access row
+ /// deleted.
+ ///
+ /// The planner must not prune a Nexus zone if it's still referenced in the
+ /// `db_metadata_nexus` table.
+ NexusDeleteMetadataRecord,
+
+ /// Carrying forward the external Nexus configuration provided by the
+ /// operator during rack setup; see [`Blueprint::operator_nexus_config()`].
+ ///
+ /// The planner must not prune a Nexus zone if it's the last remaining
+ /// zone carrying this configuration.
+ NexusExternalConfig,
+
+ /// Nexus needs to know whether it itself should be quiescing. If the
+ /// actively-running Nexus has been expunged (but not yet shut down), it
+ /// should still be able to determine this!
+ ///
+ /// The planner does not need to account for this when pruning Nexus zones.
+ NexusSelfIsQuiescing,
+
+ /// Sagas assigned to any expunged Nexus must be reassigned to an
+ /// in-service Nexus.
+ ///
+ /// The planner must not prune a Nexus zone if it still has any sagas
+ /// assigned to it.
+ NexusSagaReassignment,
+
+ /// Support bundles assigned to an expunged Nexus must either be reassigned
+ /// or marked as failed.
+ ///
+ /// The planner must not prune a Nexus zone if it still has any support
+ /// bundles assigned to it.
+ NexusSupportBundleReassign,
+
+ /// An expunged Oximeter zone must be marked expunged in the `oximeter` CRDB
+ /// table (so Nexus knows to stop assigning producers to it), and any
+ /// producers previously assigned to it must be reassigned to new Oximeters.
+ ///
+ /// The planner must not prune an Oximeter zone if it's still eligible for
+ /// new producers or if it has any assigned producers.
+ OximeterExpungeAndReassignProducers,
+
+ // --------------------------------------------------------------------
+ // Catch-all variants for non-production callers. The planner does not need
+ // to account for these when pruning.
+ // --------------------------------------------------------------------
+ /// Blippy performs checks that include expunged zones.
+ Blippy,
+
+ /// Omdb allows support operators to poke at blueprint contents, including
+ /// expunged zones.
+ Omdb,
+
+ /// Various unit and integration tests access expunged zones.
+ Test,
+}
+ /// Description of the source of a blueprint. #[derive( Clone, Debug, Eq, PartialEq, JsonSchema, Deserialize, Serialize, Diffable,
diff --git a/nexus/types/src/deployment/execution/dns.rs b/nexus/types/src/deployment/execution/dns.rs
index 2e509d12b28..a212db79bde 100644
--- a/nexus/types/src/deployment/execution/dns.rs
+++ b/nexus/types/src/deployment/execution/dns.rs
@@ -13,8 +13,7 @@ use omicron_common::api::external::{Generation, Name}; use crate::{ deployment::{
- Blueprint, BlueprintZoneDisposition, BlueprintZoneType, SledFilter,
- blueprint_zone_type,
+ Blueprint, BlueprintZoneType, SledFilter, blueprint_zone_type, }, internal_api::params::{DnsConfigZone, DnsRecord}, silo::{default_silo_name, silo_dns_name},
@@ -40,9 +39,7 @@ pub fn blueprint_internal_dns_config( // the details. let mut dns_builder = DnsConfigBuilder::new();
- 'all_zones: for (_, zone) in
- blueprint.all_omicron_zones(BlueprintZoneDisposition::is_in_service)
- {
+ 'all_zones: for (_, zone) in blueprint.in_service_zones() { let (service_name, &address) = match &zone.zone_type { BlueprintZoneType::BoundaryNtp( blueprint_zone_type::BoundaryNtp { address, ..
}, diff --git a/nexus/types/src/deployment/execution/utils.rs b/nexus/types/src/deployment/execution/utils.rs index fd6bb3f87b3..8e5c235de68 100644 --- a/nexus/types/src/deployment/execution/utils.rs +++ b/nexus/types/src/deployment/execution/utils.rs @@ -13,10 +13,7 @@ use omicron_uuid_kinds::SledUuid; use sled_agent_types_versions::latest::inventory::SledRole; use crate::{ - deployment::{ - Blueprint, BlueprintZoneDisposition, BlueprintZoneType, - blueprint_zone_type, - }, + deployment::{Blueprint, BlueprintZoneType, blueprint_zone_type}, external_api::views::SledPolicy, }; @@ -90,7 +87,7 @@ pub fn blueprint_nexus_external_ips( active_generation: Generation, ) -> Vec { blueprint - .all_nexus_zones(BlueprintZoneDisposition::is_in_service) + .in_service_nexus_zones() .filter_map(|(_sled_id, _zone_config, nexus_config)| { (nexus_config.nexus_generation == active_generation) .then_some(nexus_config.external_ip.ip) @@ -104,7 +101,7 @@ pub fn blueprint_external_dns_nameserver_ips( blueprint: &Blueprint, ) -> Vec { blueprint - .all_omicron_zones(BlueprintZoneDisposition::is_in_service) + .in_service_zones() .filter_map(|(_, z)| match z.zone_type { BlueprintZoneType::ExternalDns( blueprint_zone_type::ExternalDns { dns_address, .. },