diff --git a/nexus/db-queries/src/db/datastore/multicast/members.rs b/nexus/db-queries/src/db/datastore/multicast/members.rs index ce9a2be5666..686ac93ddb1 100644 --- a/nexus/db-queries/src/db/datastore/multicast/members.rs +++ b/nexus/db-queries/src/db/datastore/multicast/members.rs @@ -2734,6 +2734,12 @@ mod tests { logctx.cleanup_successful(); } + /// Test error handling for invalid group/instance in attach operations. + /// + /// Tests three error cases: + /// 1. Attach to non-existent group → fails + /// 2. Attach non-existent instance to existing group → fails + /// 3. Both group and instance don't exist → fails with instance error #[tokio::test] async fn test_member_attach_invalid_group_or_instance() { let logctx = @@ -2741,6 +2747,26 @@ mod tests { let db = TestDatabase::new_with_datastore(&logctx.log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); + // Case: Both invalid -> instance error takes priority + let fake_group_id = Uuid::new_v4(); + let fake_instance_id = Uuid::new_v4(); + let result = datastore + .multicast_group_member_attach_to_instance( + &opctx, + MulticastGroupUuid::from_untyped_uuid(fake_group_id), + InstanceUuid::from_untyped_uuid(fake_instance_id), + Some(vec![]), + ) + .await; + assert!(result.is_err()); + let err = result.unwrap_err(); + assert!(matches!(err, external::Error::InvalidRequest { .. })); + assert!( + err.to_string().contains("Instance does not exist"), + "Expected InstanceNotFound error priority, got: {err}" + ); + + // Setup for remaining tests let setup = multicast::create_test_setup( &opctx, &datastore, @@ -2749,7 +2775,7 @@ mod tests { ) .await; - // Create a valid instance + // Case: Valid instance, but a non-existent group let (instance, _vmm) = create_instance_with_vmm( &opctx, &datastore, @@ -2760,8 +2786,6 @@ mod tests { .await; let instance_id = *instance.as_untyped_uuid(); - // Attach to non-existent group - let fake_group_id = Uuid::new_v4(); let result = datastore .multicast_group_member_attach_to_instance( &opctx, @@ -2770,13 +2794,13 @@ mod tests { Some(vec![]), ) .await; - - // Should fail with GroupNotFound (group doesn't exist) assert!(result.is_err()); - let err = result.unwrap_err(); - assert!(matches!(err, external::Error::InvalidRequest { .. })); + assert!(matches!( + result.unwrap_err(), + external::Error::InvalidRequest { .. } + )); - // Create a valid active group + // Case: Valid group, but non-existent instance let group = multicast::create_test_group_with_state( &opctx, &datastore, @@ -2787,8 +2811,6 @@ mod tests { ) .await; - // Attach non-existent instance - let fake_instance_id = Uuid::new_v4(); let result = datastore .multicast_group_member_attach_to_instance( &opctx, @@ -2797,12 +2819,11 @@ mod tests { Some(vec![]), ) .await; - - // Should fail because CTE validates instance exists atomically assert!(result.is_err()); - let err = result.unwrap_err(); - // The error will be InvalidRequest from the CTE (instance not found) - assert!(matches!(err, external::Error::InvalidRequest { .. })); + assert!(matches!( + result.unwrap_err(), + external::Error::InvalidRequest { .. 
} + )); db.terminate().await; logctx.cleanup_successful(); @@ -3362,40 +3383,6 @@ mod tests { logctx.cleanup_successful(); } - #[tokio::test] - async fn test_member_attach_error_priority_both_invalid() { - let logctx = dev::test_setup_log( - "test_member_attach_error_priority_both_invalid", - ); - let db = TestDatabase::new_with_datastore(&logctx.log).await; - let (opctx, datastore) = (db.opctx(), db.datastore()); - - let fake_group_id = Uuid::new_v4(); - let fake_instance_id = Uuid::new_v4(); - - // Attempt to attach non-existent instance to non-existent group - let result = datastore - .multicast_group_member_attach_to_instance( - &opctx, - MulticastGroupUuid::from_untyped_uuid(fake_group_id), - InstanceUuid::from_untyped_uuid(fake_instance_id), - Some(vec![]), - ) - .await; - - // Should fail with InstanceNotFound (checked first), not GroupNotFound - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!(matches!(err, external::Error::InvalidRequest { .. })); - assert!( - err.to_string().contains("Instance does not exist"), - "Expected InstanceNotFound error, got: {err}" - ); - - db.terminate().await; - logctx.cleanup_successful(); - } - #[tokio::test] async fn test_member_attach_stopped_instance() { let logctx = dev::test_setup_log("test_member_attach_stopped_instance"); @@ -3463,13 +3450,25 @@ mod tests { logctx.cleanup_successful(); } + /// Test for `multicast_groups_source_ips_union` covering: + /// - Empty input returns empty map + /// - Group with no members returns empty source_ips + /// - Union across multiple members with overlapping IPs + /// - ASM members (no source_ips) don't affect union #[tokio::test] - async fn test_source_ips_union_across_members() { - let logctx = - dev::test_setup_log("test_source_ips_union_across_members"); + async fn test_source_ips_union() { + let logctx = dev::test_setup_log("test_source_ips_union"); let db = TestDatabase::new_with_datastore(&logctx.log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); + // Case: Empty input returns empty map without DB query + let result = datastore + .multicast_groups_source_ips_union(&opctx, &[]) + .await + .expect("Empty input should succeed"); + assert!(result.is_empty(), "Empty input should return empty map"); + + // Setup for remaining tests let setup = multicast::create_test_setup( &opctx, &datastore, @@ -3478,7 +3477,33 @@ mod tests { ) .await; - // Create active group + // Case: Group with no members returns empty source_ips + let empty_group = multicast::create_test_group_with_state( + &opctx, + &datastore, + &setup, + "empty-group", + "224.10.1.1", + true, // make_active + ) + .await; + let empty_group_id = + MulticastGroupUuid::from_untyped_uuid(empty_group.id()); + + let result = datastore + .multicast_groups_source_ips_union(&opctx, &[empty_group_id]) + .await + .expect("Should succeed for group with no members"); + assert!( + result.contains_key(&empty_group.id()), + "Group should be present in result map" + ); + assert!( + result.get(&empty_group.id()).unwrap().is_empty(), + "Group with no members should have empty source_ips" + ); + + // Case: Union across members with overlapping IPs let group = multicast::create_test_group_with_state( &opctx, &datastore, @@ -3512,7 +3537,6 @@ mod tests { .await .expect("Should add member1"); - // Verify union with single member let union_map = datastore .multicast_groups_source_ips_union(&opctx, &[group_id]) .await @@ -3542,7 +3566,6 @@ mod tests { .await .expect("Should add member2"); - // Verify union deduplicates overlapping IPs let 
union_map = datastore .multicast_groups_source_ips_union(&opctx, &[group_id]) .await @@ -3551,10 +3574,10 @@ mod tests { assert_eq!( union.len(), 3, - "Union should have 3 unique IPs (10.0.0.1, 10.0.0.2, 10.0.0.3)" + "Union should have 3 unique IPs (deduplicates overlapping)" ); - // Add member3 with no source IPs (ASM member) + // Case: ASM member (no source_ips) doesn't affect union let instance3 = create_stopped_instance_record( &opctx, &datastore, @@ -3573,7 +3596,6 @@ mod tests { .await .expect("Should add ASM member"); - // Union should still be 3 (ASM member contributes nothing) let union_map = datastore .multicast_groups_source_ips_union(&opctx, &[group_id]) .await @@ -3584,8 +3606,6 @@ mod tests { 3, "Union should still be 3 (ASM member has no sources)" ); - - // Verify actual IPs in union assert!(union.contains(&"10.0.0.1".parse().unwrap())); assert!(union.contains(&"10.0.0.2".parse().unwrap())); assert!(union.contains(&"10.0.0.3".parse().unwrap())); @@ -3594,82 +3614,15 @@ mod tests { logctx.cleanup_successful(); } - /// Test that empty group IDs returns empty map without DB query. - #[tokio::test] - async fn test_source_ips_union_empty_input() { - let logctx = dev::test_setup_log("test_source_ips_union_empty_input"); - let db = TestDatabase::new_with_datastore(&logctx.log).await; - let (opctx, datastore) = (db.opctx(), db.datastore()); - - // Call with empty slice - should return empty map without hitting DB - let result = datastore - .multicast_groups_source_ips_union(&opctx, &[]) - .await - .expect("Empty input should succeed"); - - assert!(result.is_empty(), "Empty input should return empty map"); - - db.terminate().await; - logctx.cleanup_successful(); - } - - #[tokio::test] - async fn test_source_ips_union_group_with_no_members() { - let logctx = - dev::test_setup_log("test_source_ips_union_group_with_no_members"); - let db = TestDatabase::new_with_datastore(&logctx.log).await; - let (opctx, datastore) = (db.opctx(), db.datastore()); - - let setup = multicast::create_test_setup( - &opctx, - &datastore, - "no-members-pool", - "no-members-project", - ) - .await; - - // Create active group with no members - let group = multicast::create_test_group_with_state( - &opctx, - &datastore, - &setup, - "empty-group", - "224.10.1.1", - true, // make_active - ) - .await; - let group_id = MulticastGroupUuid::from_untyped_uuid(group.id()); - - // Query source IPs for group with no members - let result = datastore - .multicast_groups_source_ips_union(&opctx, &[group_id]) - .await - .expect("Should succeed for group with no members"); - - // Group should be in result map with empty vector (not missing) - assert!( - result.contains_key(&group.id()), - "Group should be present in result map" - ); - let sources = result.get(&group.id()).unwrap(); - assert!( - sources.is_empty(), - "Group with no members should have empty source_ips" - ); - - db.terminate().await; - logctx.cleanup_successful(); - } - - /// Test that `None` preserves source_ips on reactivation. + /// Test covering source_ips handling on reactivation. 
/// - /// This verifies the distinction between: + /// Verifies the distinction between: /// - `None` → preserve existing source_ips /// - `Some([])` → clear source_ips (switch to ASM) #[tokio::test] - async fn test_member_attach_preserves_sources_on_reactivation() { + async fn test_member_attach_source_ips_on_reactivation() { let logctx = dev::test_setup_log( - "test_member_attach_preserves_sources_on_reactivation", + "test_member_attach_source_ips_on_reactivation", ); let db = TestDatabase::new_with_datastore(&logctx.log).await; let (opctx, datastore) = (db.opctx(), db.datastore()); @@ -3677,8 +3630,8 @@ mod tests { let setup = multicast::create_test_setup( &opctx, &datastore, - "add-preserve-sources-pool", - "add-preserve-sources-project", + "reactivation-sources-pool", + "reactivation-sources-project", ) .await; @@ -3694,173 +3647,106 @@ mod tests { .await; let group_id = MulticastGroupUuid::from_untyped_uuid(group.id()); - // Create stopped instance - let instance = create_stopped_instance_record( + let original_sources = + vec!["10.5.5.1".parse().unwrap(), "10.5.5.2".parse().unwrap()]; + + // Case: `None` preserves source_ips + let instance1 = create_stopped_instance_record( &opctx, &datastore, &setup.authz_project, - "test-instance", + "instance-preserve", ) .await; - // Add member with `source_ips` via HTTP API path - let original_sources = - vec!["10.5.5.1".parse().unwrap(), "10.5.5.2".parse().unwrap()]; - let member = datastore + let member1 = datastore .multicast_group_member_attach_to_instance( &opctx, group_id, - instance, + instance1, Some(original_sources.clone()), ) .await .expect("Should add member with sources"); - - assert_eq!( - member.source_ips.len(), - 2, - "Member should have 2 source IPs" - ); + assert_eq!(member1.source_ips.len(), 2); // Transition to "Left" state datastore - .multicast_group_members_detach_by_instance(&opctx, instance) + .multicast_group_members_detach_by_instance(&opctx, instance1) .await .expect("Should detach"); - // Verify member is in "Left" state let left_member = datastore - .multicast_group_member_get_by_id(&opctx, member.id, false) + .multicast_group_member_get_by_id(&opctx, member1.id, false) .await .expect("Should get member") .expect("Member should exist"); assert_eq!(left_member.state, MulticastGroupMemberState::Left); - // Source IPs should still be stored (just in "Left" state) assert_eq!(left_member.source_ips.len(), 2); - // Reactivate via HTTP API path with `None` (preserve existing sources) - let reactivated = datastore + // Reactivate with `None` → should preserve source_ips + let reactivated1 = datastore .multicast_group_member_attach_to_instance( - &opctx, group_id, instance, + &opctx, group_id, instance1, None, // None = preserve existing source_ips ) .await .expect("Should reactivate member"); - // Verify `source_ips` were preserved (not cleared) assert_eq!( - reactivated.source_ips.len(), + reactivated1.source_ips.len(), 2, "Source IPs should be preserved on reactivation with None" ); - let reactivated_ips: Vec = - reactivated.source_ips.iter().map(|n| n.ip()).collect(); - assert!(reactivated_ips.contains(&"10.5.5.1".parse().unwrap())); - assert!(reactivated_ips.contains(&"10.5.5.2".parse().unwrap())); - - // Verify state is back to Joining - assert_eq!(reactivated.state, MulticastGroupMemberState::Joining); + let ips: Vec = + reactivated1.source_ips.iter().map(|n| n.ip()).collect(); + assert!(ips.contains(&"10.5.5.1".parse().unwrap())); + assert!(ips.contains(&"10.5.5.2".parse().unwrap())); + 
assert_eq!(reactivated1.state, MulticastGroupMemberState::Joining); - db.terminate().await; - logctx.cleanup_successful(); - } - - /// Test that `Some([])` clears `source_ips` on reactivation (switch to ASM). - /// - /// This verifies the distinction between: - /// - `None` → preserve existing `source_ips` - /// - `Some([])` → clear `source_ips` (switch to ASM) - #[tokio::test] - async fn test_member_attach_clears_sources_on_reactivation() { - let logctx = dev::test_setup_log( - "test_member_attach_clears_sources_on_reactivation", - ); - let db = TestDatabase::new_with_datastore(&logctx.log).await; - let (opctx, datastore) = (db.opctx(), db.datastore()); - - let setup = multicast::create_test_setup( - &opctx, - &datastore, - "add-clear-sources-pool", - "add-clear-sources-project", - ) - .await; - - // Create active group - let group = multicast::create_test_group_with_state( - &opctx, - &datastore, - &setup, - "test-group", - "224.10.1.1", - true, // make_active - ) - .await; - let group_id = MulticastGroupUuid::from_untyped_uuid(group.id()); - - // Create stopped instance - let instance = create_stopped_instance_record( + // Case: `Some([])` clears source_ips + let instance2 = create_stopped_instance_record( &opctx, &datastore, &setup.authz_project, - "test-instance", + "instance-clear", ) .await; - // Add member with `source_ips` - let original_sources = - vec!["10.5.5.1".parse().unwrap(), "10.5.5.2".parse().unwrap()]; - let member = datastore + let member2 = datastore .multicast_group_member_attach_to_instance( &opctx, group_id, - instance, + instance2, Some(original_sources.clone()), ) .await .expect("Should add member with sources"); - - assert_eq!( - member.source_ips.len(), - 2, - "Member should have 2 source IPs" - ); + assert_eq!(member2.source_ips.len(), 2); // Transition to "Left" state datastore - .multicast_group_members_detach_by_instance(&opctx, instance) + .multicast_group_members_detach_by_instance(&opctx, instance2) .await .expect("Should detach"); - // Verify member is in "Left" state with sources still stored - let left_member = datastore - .multicast_group_member_get_by_id(&opctx, member.id, false) - .await - .expect("Should get member") - .expect("Member should exist"); - assert_eq!(left_member.state, MulticastGroupMemberState::Left); - assert_eq!(left_member.source_ips.len(), 2); - - // Reactivate to clear sources (switch to ASM) - let reactivated = datastore + // Reactivate with `Some([])` → should clear source_ips + let reactivated2 = datastore .multicast_group_member_attach_to_instance( &opctx, group_id, - instance, + instance2, Some(vec![]), // Some([]) = clear source_ips ) .await .expect("Should reactivate member"); - // Verify `source_ips` were cleared assert_eq!( - reactivated.source_ips.len(), + reactivated2.source_ips.len(), 0, "Source IPs should be cleared on reactivation with Some([])" ); - - // Verify state is back to "Joining" - assert_eq!(reactivated.state, MulticastGroupMemberState::Joining); + assert_eq!(reactivated2.state, MulticastGroupMemberState::Joining); db.terminate().await; logctx.cleanup_successful(); diff --git a/nexus/tests/integration_tests/multicast/api.rs b/nexus/tests/integration_tests/multicast/api.rs index bf9a2ba5056..a6662262356 100644 --- a/nexus/tests/integration_tests/multicast/api.rs +++ b/nexus/tests/integration_tests/multicast/api.rs @@ -16,7 +16,7 @@ //! - SSM (Source-Specific): 232.0.0.0/8, sources required per-member //! - SSM validation: Every SSM member must specify sources (S,G subscription) //! 
- New groups: Validated before creation -//! - Existing groups: Validated on join (by IP, name, or ID) +//! - Existing groups: Validated on join (all paths share `instance_multicast_group_join`) //! - Empty sources array: Treated same as None (invalid for SSM) //! - Source IP validation: ASM can have sources; SSM requires them //! - Pool validation: IP must be in a linked multicast pool @@ -461,7 +461,13 @@ async fn test_join_by_ip_ssm_with_sources(cptestctx: &ControlPlaneTestContext) { } /// Test SSM join-by-IP without sources should fail. +/// /// SSM addresses (232.0.0.0/8) require source IPs for implicit creation. +/// +/// This is the canonical test for SSM source validation. The validation +/// code path is shared regardless of how you join (by IP, name, or ID) - +/// all routes converge on the same `instance_multicast_group_join` logic +/// that checks `is_ssm_address()` and rejects joins without sources. #[nexus_test] async fn test_join_by_ip_ssm_without_sources_fails( cptestctx: &ControlPlaneTestContext, @@ -516,158 +522,6 @@ async fn test_join_by_ip_ssm_without_sources_fails( cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; } -/// Test joining an existing SSM group by ID without sources should fail. -/// -/// This tests the SSM validation for join-by-ID path: if an SSM group exists -/// (created by first instance with sources), a second instance cannot join -/// by group ID without providing sources. -#[nexus_test] -async fn test_join_existing_ssm_group_by_id_without_sources_fails( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.external_client; - let project_name = "ssm-id-fail-project"; - - // Setup: SSM pool - let (_, _, _ssm_pool) = ops::join3( - create_project(client, project_name), - create_default_ip_pool(client), - create_multicast_ip_pool_with_range( - client, - "ssm-id-fail-pool", - (232, 40, 0, 1), - (232, 40, 0, 255), - ), - ) - .await; - - create_instance(client, project_name, "ssm-id-inst-1").await; - create_instance(client, project_name, "ssm-id-inst-2").await; - - // First instance creates SSM group with sources - let ssm_ip = "232.40.0.100"; - let source_ip: IpAddr = "10.40.0.1".parse().unwrap(); - let join_url_1 = format!( - "/v1/instances/ssm-id-inst-1/multicast-groups/{ssm_ip}?project={project_name}" - ); - - let join_body_1 = - InstanceMulticastGroupJoin { source_ips: Some(vec![source_ip]) }; - let member_1: MulticastGroupMember = - put_upsert(client, &join_url_1, &join_body_1).await; - - let group_id = member_1.multicast_group_id; - - // Second instance tries to join by group ID WITHOUT sources - should fail - let join_url_by_id = format!( - "/v1/instances/ssm-id-inst-2/multicast-groups/{group_id}?project={project_name}" - ); - - let error = NexusRequest::new( - RequestBuilder::new(client, Method::PUT, &join_url_by_id) - .body(Some(&InstanceMulticastGroupJoin { - source_ips: None, // No sources! 
- })) - .expect_status(Some(StatusCode::BAD_REQUEST)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Join by ID without sources should fail for SSM group"); - - let error_body: dropshot::HttpErrorResponseBody = - error.parsed_body().unwrap(); - assert!( - error_body.message.contains("SSM") - || error_body.message.contains("source"), - "Error should mention SSM or source IPs: {}", - error_body.message - ); - - let expected_group_name = format!("mcast-{}", ssm_ip.replace('.', "-")); - cleanup_instances( - cptestctx, - client, - project_name, - &["ssm-id-inst-1", "ssm-id-inst-2"], - ) - .await; - wait_for_group_deleted(client, &expected_group_name).await; -} - -/// Test joining an existing SSM group by NAME without sources should fail. -#[nexus_test] -async fn test_join_existing_ssm_group_by_name_without_sources_fails( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.external_client; - let project_name = "ssm-name-fail-project"; - - // Setup: SSM pool - let (_, _, _ssm_pool) = ops::join3( - create_project(client, project_name), - create_default_ip_pool(client), - create_multicast_ip_pool_with_range( - client, - "ssm-name-fail-pool", - (232, 45, 0, 1), - (232, 45, 0, 100), - ), - ) - .await; - - create_instance(client, project_name, "ssm-name-inst-1").await; - create_instance(client, project_name, "ssm-name-inst-2").await; - - // First instance creates SSM group with sources - let ssm_ip = "232.45.0.50"; - let join_url = format!( - "/v1/instances/ssm-name-inst-1/multicast-groups/{ssm_ip}?project={project_name}" - ); - let join_body = InstanceMulticastGroupJoin { - source_ips: Some(vec!["10.0.0.1".parse().unwrap()]), - }; - - put_upsert::<_, MulticastGroupMember>(client, &join_url, &join_body).await; - - // Get the group's auto-generated name - let expected_group_name = format!("mcast-{}", ssm_ip.replace('.', "-")); - - // Second instance tries to join by NAME without sources - should fail - let join_by_name_url = format!( - "/v1/instances/ssm-name-inst-2/multicast-groups/{expected_group_name}?project={project_name}" - ); - let join_body_no_sources = InstanceMulticastGroupJoin { source_ips: None }; - - let error = NexusRequest::new( - RequestBuilder::new(client, Method::PUT, &join_by_name_url) - .body(Some(&join_body_no_sources)) - .expect_status(Some(StatusCode::BAD_REQUEST)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Join by name without sources should fail for SSM group"); - - let error_body: dropshot::HttpErrorResponseBody = - error.parsed_body().unwrap(); - assert!( - error_body.message.contains("SSM") - || error_body.message.contains("source"), - "Error should mention SSM or source IPs: {}", - error_body.message - ); - - cleanup_instances( - cptestctx, - client, - project_name, - &["ssm-name-inst-1", "ssm-name-inst-2"], - ) - .await; - wait_for_group_deleted(client, &expected_group_name).await; -} - /// Test that SSM join-by-IP with empty sources array fails. /// /// `source_ips: Some(vec![])` (empty array) is treated the same as @@ -725,83 +579,6 @@ async fn test_ssm_with_empty_sources_array_fails( cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; } -/// Test joining an existing SSM group by IP without sources fails. -/// -/// When an SSM group already exists (created by first instance with sources), -/// a second instance joining by IP should still fail without sources since -/// the group is SSM. 
-#[nexus_test] -async fn test_join_existing_ssm_group_by_ip_without_sources_fails( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.external_client; - let project_name = "ssm-ip-existing-fail-project"; - - // Setup: SSM pool - let (_, _, _ssm_pool) = ops::join3( - create_project(client, project_name), - create_default_ip_pool(client), - create_multicast_ip_pool_with_range( - client, - "ssm-ip-existing-fail-pool", - (232, 47, 0, 1), - (232, 47, 0, 100), - ), - ) - .await; - - create_instance(client, project_name, "ssm-ip-inst-1").await; - create_instance(client, project_name, "ssm-ip-inst-2").await; - - // First instance creates SSM group with sources - let ssm_ip = "232.47.0.50"; - let join_url = format!( - "/v1/instances/ssm-ip-inst-1/multicast-groups/{ssm_ip}?project={project_name}" - ); - let join_body = InstanceMulticastGroupJoin { - source_ips: Some(vec!["10.0.0.1".parse().unwrap()]), - }; - - put_upsert::<_, MulticastGroupMember>(client, &join_url, &join_body).await; - - let expected_group_name = format!("mcast-{}", ssm_ip.replace('.', "-")); - - // Second instance tries to join by IP without sources - should fail - // Even though the group exists, SSM still requires sources - let join_url_2 = format!( - "/v1/instances/ssm-ip-inst-2/multicast-groups/{ssm_ip}?project={project_name}" - ); - let join_body_no_sources = InstanceMulticastGroupJoin { source_ips: None }; - - let error = NexusRequest::new( - RequestBuilder::new(client, Method::PUT, &join_url_2) - .body(Some(&join_body_no_sources)) - .expect_status(Some(StatusCode::BAD_REQUEST)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Join existing SSM group by IP without sources should fail"); - - let error_body: dropshot::HttpErrorResponseBody = - error.parsed_body().unwrap(); - assert!( - error_body.message.contains("SSM") - || error_body.message.contains("source"), - "Error should mention SSM or source IPs: {}", - error_body.message - ); - - cleanup_instances( - cptestctx, - client, - project_name, - &["ssm-ip-inst-1", "ssm-ip-inst-2"], - ) - .await; - wait_for_group_deleted(client, &expected_group_name).await; -} - /// Test join-by-IP with IP not in any pool should fail. #[nexus_test] async fn test_join_by_ip_not_in_pool_fails( diff --git a/nexus/tests/integration_tests/multicast/cache_invalidation.rs b/nexus/tests/integration_tests/multicast/cache_invalidation.rs index ccf8c69db2e..5928a9b3e62 100644 --- a/nexus/tests/integration_tests/multicast/cache_invalidation.rs +++ b/nexus/tests/integration_tests/multicast/cache_invalidation.rs @@ -11,6 +11,8 @@ //! - Cache TTL refresh: Verifies caches are refreshed when TTL expires //! 
- Backplane cache expiry: Tests that stale backplane mappings are cleaned up +use std::time::Duration; + use gateway_client::types::{PowerState, RotState, SpState}; use nexus_db_queries::context::OpContext; use nexus_test_utils::resource_helpers::{ @@ -57,7 +59,7 @@ async fn test_sled_move_updates_multicast_port_mapping( ) .await; - // Create instance (no multicast groups at creation - implicit model) + // Create instance (no multicast groups at creation, implicit model) let instance = instance_for_multicast_groups( cptestctx, PROJECT_NAME, @@ -279,20 +281,25 @@ async fn test_cache_ttl_driven_refresh() { const GROUP_NAME: &str = "ttl-test-group"; const INSTANCE_NAME: &str = "ttl-test-instance"; + // Test cache TTL values + const SLED_CACHE_TTL: Duration = Duration::from_millis(500); + const BACKPLANE_CACHE_TTL: Duration = Duration::from_millis(250); + // Buffer to ensure TTL has definitely expired + const TTL_EXPIRY_BUFFER: Duration = Duration::from_millis(100); + // Start test server with custom config let cptestctx = nexus_test_utils::ControlPlaneBuilder::new( "test_cache_ttl_driven_refresh", ) .customize_nexus_config(&|config| { - // Set short cache TTLs for testing (2 seconds for sled cache) + // Set short cache TTLs for testing config.pkg.background_tasks.multicast_reconciler.sled_cache_ttl_secs = - chrono::TimeDelta::seconds(2).to_std().unwrap(); + SLED_CACHE_TTL; config .pkg .background_tasks .multicast_reconciler - .backplane_cache_ttl_secs = - chrono::TimeDelta::seconds(1).to_std().unwrap(); + .backplane_cache_ttl_secs = BACKPLANE_CACHE_TTL; // Ensure multicast is enabled config.pkg.multicast.enabled = true; @@ -318,7 +325,7 @@ async fn test_cache_ttl_driven_refresh() { ) .await; - // Create instance (no multicast groups at creation - implicit model) + // Create instance (no multicast groups at creation, implicit model) let instance = instance_for_multicast_groups( &cptestctx, PROJECT_NAME, @@ -435,9 +442,8 @@ async fn test_cache_ttl_driven_refresh() { .await .expect("Should insert new inventory collection"); - // Wait for cache TTL to expire (sled_cache_ttl = 1 second) - // Sleep for 1.5 seconds to ensure TTL has expired - tokio::time::sleep(std::time::Duration::from_millis(1500)).await; + // Wait for cache TTL to expire + tokio::time::sleep(SLED_CACHE_TTL + TTL_EXPIRY_BUFFER).await; wait_for_condition_with_reconciler( &cptestctx.lockstep_client, @@ -458,7 +464,7 @@ async fn test_cache_ttl_driven_refresh() { } }, &POLL_INTERVAL, - &MULTICAST_OPERATION_TIMEOUT, + &MEDIUM_TIMEOUT, // DPD update requires reconciler processing ) .await .expect("DPD should update with new rear port after TTL expiry"); @@ -483,23 +489,26 @@ async fn test_backplane_cache_ttl_expiry() { const GROUP_NAME: &str = "backplane-ttl-group"; const INSTANCE_NAME: &str = "backplane-ttl-instance"; + // Cache TTL values: backplane shorter than sled to test them independently + const BACKPLANE_CACHE_TTL: Duration = Duration::from_millis(250); + const SLED_CACHE_TTL: Duration = Duration::from_secs(2); + // Buffer to ensure TTL has definitely expired (20% margin) + const TTL_EXPIRY_BUFFER: Duration = Duration::from_millis(50); + let cptestctx = nexus_test_utils::ControlPlaneBuilder::new( "test_backplane_cache_ttl_expiry", ) .customize_nexus_config(&|config| { - // Set backplane cache TTL to 1 second (shorter than sled cache to test - // independently) + // Set backplane cache TTL shorter than sled cache to test independently config .pkg .background_tasks .multicast_reconciler - .backplane_cache_ttl_secs = - 
chrono::TimeDelta::seconds(1).to_std().unwrap(); + .backplane_cache_ttl_secs = BACKPLANE_CACHE_TTL; - // Keep sled cache TTL longer to ensure we're testing backplane cache - // expiry + // Keep sled cache TTL longer to ensure we're testing backplane cache expiry config.pkg.background_tasks.multicast_reconciler.sled_cache_ttl_secs = - chrono::TimeDelta::seconds(10).to_std().unwrap(); + SLED_CACHE_TTL; // Ensure multicast is enabled config.pkg.multicast.enabled = true; @@ -519,7 +528,7 @@ async fn test_backplane_cache_ttl_expiry() { ) .await; - // Create instance (no multicast groups at creation - implicit model) + // Create instance (no multicast groups at creation, implicit model) let instance = instance_for_multicast_groups( &cptestctx, PROJECT_NAME, @@ -550,9 +559,8 @@ async fn test_backplane_cache_ttl_expiry() { .await .expect("Should verify initial port mapping"); - // Wait for backplane cache TTL to expire (500ms) but not sled cache (5 seconds) - // Sleep for 1 second to ensure backplane TTL has expired - tokio::time::sleep(std::time::Duration::from_secs(1)).await; + // Wait for backplane cache TTL to expire but not sled cache + tokio::time::sleep(BACKPLANE_CACHE_TTL + TTL_EXPIRY_BUFFER).await; // Force cache access by triggering reconciler // This will cause the reconciler to check backplane cache, find it expired, diff --git a/nexus/tests/integration_tests/multicast/failures.rs b/nexus/tests/integration_tests/multicast/failures.rs index bf67d634915..68dbfaaaa26 100644 --- a/nexus/tests/integration_tests/multicast/failures.rs +++ b/nexus/tests/integration_tests/multicast/failures.rs @@ -50,14 +50,26 @@ use crate::integration_tests::instances::{ instance_simulate, instance_wait_for_state, }; +/// Test DPD communication failure and recovery with multiple groups. +/// +/// This is the canonical test for DPD failure/recovery behavior. It verifies: +/// - Multiple groups remain in "Creating" state when DPD is unavailable +/// - Members stay in "Joining" or "Left" state during DPD failure +/// - All groups recover to "Active" after DPD restart +/// - Running instance members recover to "Joined" state +/// +/// Uses multiple groups to verify the reconciler handles concurrent recovery +/// correctly, and one DPD restart cycle to test the full recovery path. 
#[nexus_test] async fn test_multicast_group_dpd_communication_failure_recovery( cptestctx: &ControlPlaneTestContext, ) { let client = &cptestctx.external_client; let project_name = "test-project"; - let group_name = "dpd-failure-group"; - let instance_name = "dpd-failure-instance"; + let group1_name = "dpd-failure-group-1"; + let group2_name = "dpd-failure-group-2"; + let instance1_name = "dpd-failure-instance-1"; + let instance2_name = "dpd-failure-instance-2"; // Setup: project, pools - parallelize creation ops::join3( @@ -67,53 +79,120 @@ async fn test_multicast_group_dpd_communication_failure_recovery( ) .await; - // Create instance first - create_instance(client, project_name, instance_name).await; + // Create instance1 (will start running by default) + let instance1 = create_instance(client, project_name, instance1_name).await; + let instance1_id = InstanceUuid::from_untyped_uuid(instance1.identity.id); + let nexus = &cptestctx.server.server_context().nexus; + instance_simulate(nexus, &instance1_id).await; + instance_wait_for_state(client, instance1_id, InstanceState::Running).await; + + // Create instance2 as stopped (won't start) + create_instance_with( + client, + project_name, + instance2_name, + &InstanceNetworkInterfaceAttachment::Default, + vec![], + vec![], + false, // don't start + None, + None, + vec![], + ) + .await; // Stop DPD before implicit creation to test failure recovery cptestctx.stop_dendrite(SwitchLocation::Switch0).await; - // Add instance to multicast group via instance-centric API - multicast_group_attach(cptestctx, project_name, instance_name, group_name) - .await; + // Add instances to their respective groups via instance-centric API + ops::join2( + multicast_group_attach( + cptestctx, + project_name, + instance1_name, + group1_name, + ), + multicast_group_attach( + cptestctx, + project_name, + instance2_name, + group2_name, + ), + ) + .await; - // Verify group was implicitly created and is in "Creating" state since DPD is unavailable - // The reconciler can't progress the group to Active without DPD communication - let group_get_url = mcast_group_url(group_name); - let fetched_group: MulticastGroup = - object_get(client, &group_get_url).await; + // Verify both groups are in "Creating" state since DPD is unavailable + for group_name in [group1_name, group2_name] { + let group_get_url = mcast_group_url(group_name); + let fetched_group: MulticastGroup = + object_get(client, &group_get_url).await; - assert_eq!( - fetched_group.state, "Creating", - "Group should remain in Creating state when DPD is unavailable, found: {}", - fetched_group.state - ); + assert_eq!( + fetched_group.state, "Creating", + "Group {group_name} should remain in Creating state when DPD is unavailable, found: {}", + fetched_group.state + ); - // Verify group properties are maintained despite DPD issues - assert_eq!(fetched_group.identity.name.as_str(), group_name); + // Verify member state during DPD failure + let members = list_multicast_group_members(client, group_name).await; + assert_eq!( + members.len(), + 1, + "Group {group_name} should have one member" + ); + assert!( + members[0].state == "Joining" || members[0].state == "Left", + "Member in {group_name} should be Joining or Left when DPD unavailable, got: {}", + members[0].state + ); + } - // Case: Verify member state during DPD failure - // Members should be in "Joining" or "Left" state when DPD is unavailable - // (they can't transition to "Joined" without successful DPD programming) - let members = 
list_multicast_group_members(client, group_name).await; - assert_eq!(members.len(), 1, "Should have exactly one member"); - assert!( - members[0].state == "Joining" || members[0].state == "Left", - "Member should be in Joining or Left state when DPD is unavailable, got: {}", - members[0].state - ); + // Restart DPD and kick off reconciler + cptestctx.restart_dendrite(SwitchLocation::Switch0).await; + activate_multicast_reconciler(&cptestctx.lockstep_client).await; - // Start instance so it has a valid VMM state for recovery - let instance: Instance = object_get( - client, - &format!("/v1/instances/{instance_name}?project={project_name}"), + // Both groups should become "Active" + ops::join2( + wait_for_group_active(client, group1_name), + wait_for_group_active(client, group2_name), ) .await; - let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); - let nexus = &cptestctx.server.server_context().nexus; + + // Running instance's member should recover to Joined + wait_for_member_state( + cptestctx, + group1_name, + instance1.identity.id, + nexus_db_model::MulticastGroupMemberState::Joined, + ) + .await; + + let recovered_members = + list_multicast_group_members(client, group1_name).await; + assert_eq!( + recovered_members[0].state, "Joined", + "Running instance member should recover to Joined after DPD restart" + ); + + // Stopped instance's member should stay in "Left" (group is "Active" but instance not running) + let group2_members = + list_multicast_group_members(client, group2_name).await; + assert_eq!( + group2_members[0].state, "Left", + "Stopped instance member should be Left even after group becomes Active" + ); + + // Case: Lifecycle transitions: verify start/stop state changes + + // Start the stopped instance2 → verify member transitions to "Joined" + let instance2_url = + format!("/v1/instances/{instance2_name}?project={project_name}"); + let instance2: omicron_common::api::external::Instance = + object_get(client, &instance2_url).await; + let instance2_id = InstanceUuid::from_untyped_uuid(instance2.identity.id); let start_url = - format!("/v1/instances/{instance_name}/start?project={project_name}"); + format!("/v1/instances/{instance2_name}/start?project={project_name}"); NexusRequest::new( RequestBuilder::new(client, Method::POST, &start_url) .body(None as Option<&serde_json::Value>) @@ -122,135 +201,71 @@ async fn test_multicast_group_dpd_communication_failure_recovery( .authn_as(AuthnMode::PrivilegedUser) .execute() .await - .expect("Should start instance"); - instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(client, instance_id, InstanceState::Running).await; + .expect("Should start instance2"); - // Restart DPD and verify member recovers to Joined - cptestctx.restart_dendrite(SwitchLocation::Switch0).await; + instance_simulate(nexus, &instance2_id).await; + instance_wait_for_state(client, instance2_id, InstanceState::Running).await; wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Group should become Active and member should recover to Joined - wait_for_group_active(client, group_name).await; wait_for_member_state( cptestctx, - group_name, - instance.identity.id, + group2_name, + instance2.identity.id, nexus_db_model::MulticastGroupMemberState::Joined, ) .await; - let recovered_members = - list_multicast_group_members(client, group_name).await; + let members_after_start = + list_multicast_group_members(client, group2_name).await; assert_eq!( - recovered_members[0].state, "Joined", - "Member should recover to 
Joined after DPD restart" + members_after_start[0].state, "Joined", + "Member should transition to Joined when stopped instance starts" ); - cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; - wait_for_group_deleted(client, group_name).await; -} - -#[nexus_test] -async fn test_multicast_reconciler_state_consistency_validation( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.external_client; - let project_name = "test-project"; - - // Setup: project and pools - let (_, _, _) = ops::join3( - create_project(&client, project_name), - create_default_ip_pool(&client), - create_multicast_ip_pool(&client, "mcast-pool"), + // Stop the running instance1 → verify member transitions to "Left" + let stop_url = + format!("/v1/instances/{instance1_name}/stop?project={project_name}"); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &stop_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), ) - .await; - - // Group names for implicit groups (implicitly created when first member joins) - let group_names = - ["consistency-group-1", "consistency-group-2", "consistency-group-3"]; - - // Create instances first (groups will be implicitly created when members attach) - let instance_names: Vec<_> = group_names - .iter() - .map(|&group_name| format!("instance-{group_name}")) - .collect(); - - // Create all instances in parallel - let create_futures = instance_names.iter().map(|instance_name| { - create_instance(client, project_name, instance_name) - }); - ops::join_all(create_futures).await; - - // Stop DPD before attaching members to test state consistency during failure - // Groups will be implicitly created but stay in "Creating" state - cptestctx.stop_dendrite(SwitchLocation::Switch0).await; - - // Attach instances to their respective groups (triggers implicit creation for each group) - // Since DPD is down, groups will remain in "Creating" state - for (instance_name, &group_name) in instance_names.iter().zip(&group_names) - { - multicast_group_attach( - cptestctx, - project_name, - instance_name, - group_name, - ) - .await; - } + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should stop instance1"); - // Wait for reconciler to attempt processing (will fail due to DPD being down) + instance_simulate(nexus, &instance1_id).await; + instance_wait_for_state(client, instance1_id, InstanceState::Stopped).await; wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Verify each group is in a consistent state (DPD failure prevents reconciliation) - for group_name in group_names.iter() { - let group_get_url = mcast_group_url(group_name); - let fetched_group: MulticastGroup = - object_get(client, &group_get_url).await; - - // State should be "Creating" since DPD is down - assert_eq!( - fetched_group.state, "Creating", - "Group {group_name} should remain in Creating state when DPD is unavailable, found: {}", - fetched_group.state - ); - - // Case: Verify member state during DPD failure - let members = list_multicast_group_members(client, group_name).await; - assert_eq!( - members.len(), - 1, - "Group {group_name} should have exactly one member" - ); - assert!( - members[0].state == "Joining" || members[0].state == "Left", - "Member in group {group_name} should be Joining or Left when DPD unavailable, got: {}", - members[0].state - ); - } - - // Verify groups are still stuck in expected states before restarting DPD - // This explicitly validates that without DPD, groups cannot 
transition - for group_name in group_names.iter() { - verify_group_deleted_or_in_states(client, group_name, &["Creating"]) - .await; - } - - // Restart DPD before cleanup so instances can stop properly - cptestctx.restart_dendrite(SwitchLocation::Switch0).await; - - let instance_name_refs: Vec<&str> = - instance_names.iter().map(|s| s.as_str()).collect(); - cleanup_instances(cptestctx, client, project_name, &instance_name_refs) - .await; + wait_for_member_state( + cptestctx, + group1_name, + instance1.identity.id, + nexus_db_model::MulticastGroupMemberState::Left, + ) + .await; - // With DPD now restored, groups should be cleaned up via implicit deletion - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + let members_after_stop = + list_multicast_group_members(client, group1_name).await; + assert_eq!( + members_after_stop[0].state, "Left", + "Member should transition to Left when running instance stops" + ); - // Verify groups are deleted (implicit deletion completes with DPD available) - for group_name in group_names.iter() { - wait_for_group_deleted(client, group_name).await; - } + cleanup_instances( + cptestctx, + client, + project_name, + &[instance1_name, instance2_name], + ) + .await; + ops::join2( + wait_for_group_deleted(client, group1_name), + wait_for_group_deleted(client, group2_name), + ) + .await; } #[nexus_test] @@ -280,10 +295,7 @@ async fn test_dpd_failure_during_creating_state( multicast_group_attach(cptestctx, project_name, instance_name, group_name) .await; - // Wait for reconciler to process - tests DPD communication handling during "Creating" state - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Check group state after reconciler processes with DPD unavailable + // Check group state should still be "Creating" since DPD is unavailable let group_get_url = mcast_group_url(group_name); let fetched_group: MulticastGroup = object_get(client, &group_get_url).await; @@ -308,10 +320,9 @@ async fn test_dpd_failure_during_creating_state( ); // Test cleanup - remove member, which triggers implicit deletion + // Don't wait for reconciler since DPD is still down multicast_group_detach(client, project_name, instance_name, group_name) .await; - - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; } #[nexus_test] @@ -716,9 +727,9 @@ async fn test_implicit_deletion_with_dpd_failure( ); assert_eq!(group.identity.id, created_group.identity.id); - // Restart DPD and verify cleanup completes + // Restart DPD and kick off reconciler (non-blocking) cptestctx.restart_dendrite(SwitchLocation::Switch0).await; - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + activate_multicast_reconciler(&cptestctx.lockstep_client).await; // Both group and orphaned members should be cleaned up wait_for_group_deleted(client, group_name).await; diff --git a/nexus/tests/integration_tests/multicast/groups.rs b/nexus/tests/integration_tests/multicast/groups.rs index 003b7b60d6a..c811e434ade 100644 --- a/nexus/tests/integration_tests/multicast/groups.rs +++ b/nexus/tests/integration_tests/multicast/groups.rs @@ -298,8 +298,11 @@ async fn test_multicast_group_member_operations( // Implicit deletion model: group is implicitly deleted when last member is removed // Wait for both Nexus group and DPD group to be deleted - wait_for_group_deleted(client, group_name).await; - wait_for_group_deleted_from_dpd(cptestctx, external_multicast_ip).await; + ops::join2( + wait_for_group_deleted(client, group_name), + wait_for_group_deleted_from_dpd(cptestctx, 
external_multicast_ip), + ) + .await; } #[nexus_test] @@ -686,10 +689,12 @@ async fn test_instance_deletion_removes_multicast_memberships( object_get_error(client, &instance_url, StatusCode::NOT_FOUND).await; // Implicit model: group is implicitly deleted when last member (instance) is removed - wait_for_group_deleted(client, group_name).await; - - // Wait for reconciler to clean up DPD state (activates reconciler repeatedly until DPD confirms deletion) - wait_for_group_deleted_from_dpd(cptestctx, multicast_ip).await; + // Wait for both Nexus group and DPD group to be deleted + ops::join2( + wait_for_group_deleted(client, group_name), + wait_for_group_deleted_from_dpd(cptestctx, multicast_ip), + ) + .await; } /// Test that the multicast_ip field is correctly populated in MulticastGroupMember API responses. @@ -763,45 +768,8 @@ async fn test_member_response_includes_multicast_ip( "Listed member should also include multicast_ip field" ); - // Case: Remove and re-add member (reactivation) - verify field preserved - let member_remove_url = format!( - "/v1/instances/{instance_name}/multicast-groups/{group_name}?project={project_name}" - ); - NexusRequest::new( - RequestBuilder::new(client, http::Method::DELETE, &member_remove_url) - .expect_status(Some(StatusCode::NO_CONTENT)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Should remove member"); - - wait_for_group_deleted(client, group_name).await; - - // Re-create group by adding member again - let readded_member: MulticastGroupMember = - put_upsert(client, &join_url, &join_params).await; - - wait_for_group_active(client, group_name).await; - - let new_group: MulticastGroup = - object_get(client, &mcast_group_url(group_name)).await; - - // Verify multicast_ip field is present in re-added member - assert_eq!( - readded_member.multicast_ip, new_group.multicast_ip, - "Re-added member should also have multicast_ip field" - ); - - NexusRequest::new( - RequestBuilder::new(client, http::Method::DELETE, &member_remove_url) - .expect_status(Some(StatusCode::NO_CONTENT)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Should remove member for cleanup"); - + // Cleanup + cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; wait_for_group_deleted(client, group_name).await; } @@ -1192,6 +1160,14 @@ async fn test_default_pool_on_implicit_creation( object_get(client, &mcast_group_url(group_name2)).await; assert_eq!(group.ip_pool_id, mcast_pool.identity.id); + // Verify IP is in pool's range (224.2.x.x from create_multicast_ip_pool) + if let IpAddr::V4(ip) = group.multicast_ip { + assert_eq!(ip.octets()[0], 224); + assert_eq!(ip.octets()[1], 2); + } else { + panic!("Expected IPv4 multicast address"); + } + // Cleanup cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; wait_for_group_deleted(client, group_name2).await; @@ -1287,112 +1263,6 @@ async fn test_pool_range_allocation(cptestctx: &ControlPlaneTestContext) { .await; } -/// Test that groups are allocated from the auto-discovered pool. -/// -/// Pool selection is automatic - when multiple pools exist, the first one -/// alphabetically is used (after preferring any default pool). 
-#[nexus_test] -async fn test_automatic_pool_selection(cptestctx: &ControlPlaneTestContext) { - let client = &cptestctx.external_client; - let project_name = "pool-selection-test-project"; - let instance_name = "pool-selection-instance"; - - // Setup: project and default IP pool in parallel - let (_, _) = ops::join2( - create_project(&client, project_name), - create_default_ip_pool(&client), - ) - .await; - create_instance(client, project_name, instance_name).await; - - // Create a multicast pool (after instance, to test auto-discovery) - let mcast_pool = create_multicast_ip_pool_with_range( - &client, - "mcast-pool", - (224, 20, 0, 1), - (224, 20, 0, 10), - ) - .await; - - // Case: Join group - pool is auto-discovered - let group_name = "auto-pool-group"; - multicast_group_attach(cptestctx, project_name, instance_name, group_name) - .await; - - let group_view: MulticastGroup = - object_get(client, &mcast_group_url(group_name)).await; - // Pool is auto-discovered from available multicast pools - assert_eq!(group_view.ip_pool_id, mcast_pool.identity.id); - // Verify IP is in pool's range (224.20.0.x) - if let IpAddr::V4(ip) = group_view.multicast_ip { - assert_eq!(ip.octets()[0], 224); - assert_eq!(ip.octets()[1], 20); - } else { - panic!("Expected IPv4 multicast address"); - } - - // Cleanup - cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; - wait_for_group_deleted(client, group_name).await; -} - -/// Test validation errors for pool exhaustion. -#[nexus_test] -async fn test_pool_exhaustion(cptestctx: &ControlPlaneTestContext) { - let client = &cptestctx.external_client; - let project_name = "pool-exhaustion-test-project"; - - // Create project and IP pools in parallel (multicast pool has single IP) - let (_, _, _) = ops::join3( - create_project(&client, project_name), - create_default_ip_pool(&client), - create_multicast_ip_pool_with_range( - &client, - "empty-pool", - (224, 99, 0, 1), - (224, 99, 0, 1), // Single IP - ), - ) - .await; - - // Use the single IP - let instance_name = "pool-exhaust-instance"; - create_instance(client, project_name, instance_name).await; - let group_exhaust = "exhaust-empty-pool"; - multicast_group_attach( - cptestctx, - project_name, - instance_name, - group_exhaust, - ) - .await; - - // Now try to create another group - should fail - let instance2_name = "pool-exhaust-instance-2"; - create_instance(client, project_name, instance2_name).await; - let group_fail = "fail-empty-pool"; - let join_url_fail = format!( - "/v1/instances/{instance2_name}/multicast-groups/{group_fail}?project={project_name}" - ); - let join_params_fail = InstanceMulticastGroupJoin { source_ips: None }; - object_put_error( - client, - &join_url_fail, - &join_params_fail, - StatusCode::INSUFFICIENT_STORAGE, - ) - .await; - - // Cleanup - cleanup_instances( - cptestctx, - client, - project_name, - &[instance_name, instance2_name], - ) - .await; -} - /// Test multiple instances joining different SSM groups from the same SSM pool. 
/// /// Verifies: @@ -1550,7 +1420,10 @@ async fn test_multiple_ssm_groups_same_pool( cleanup_instances(cptestctx, client, project_name, &all_instances).await; // Verify all groups are deleted - for (group_name, _) in &group_configs { - wait_for_group_deleted(client, group_name).await; - } + ops::join_all( + group_configs + .iter() + .map(|(group_name, _)| wait_for_group_deleted(client, group_name)), + ) + .await; } diff --git a/nexus/tests/integration_tests/multicast/instances.rs b/nexus/tests/integration_tests/multicast/instances.rs index a6e2c9dc62b..2c1fe65cda8 100644 --- a/nexus/tests/integration_tests/multicast/instances.rs +++ b/nexus/tests/integration_tests/multicast/instances.rs @@ -42,7 +42,7 @@ use nexus_types::internal_api::params::InstanceMigrateRequest; use omicron_common::api::external::{ ByteCount, IdentityMetadataCreateParams, Instance, InstanceCpuCount, - InstanceState, + InstanceState, Nullable, }; use omicron_nexus::TestInterfaces; use omicron_uuid_kinds::{GenericUuid, InstanceUuid}; @@ -468,311 +468,6 @@ async fn test_multicast_group_attach_limits( .await; } -#[nexus_test] -async fn test_multicast_group_instance_state_transitions( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.external_client; - - // Create project and pools in parallel - let (_, _, _) = ops::join3( - create_default_ip_pool(&client), - create_project(client, PROJECT_NAME), - create_multicast_ip_pool(&client, "mcast-pool"), - ) - .await; - - // Create stopped instance (no multicast groups at creation) - let stopped_instance = instance_for_multicast_groups( - cptestctx, - PROJECT_NAME, - "state-test-instance", - false, - &[], - ) - .await; - - // Add instance to group (group implicitly creates if it doesn't exist) - multicast_group_attach( - cptestctx, - PROJECT_NAME, - "state-test-instance", - "state-test-group", - ) - .await; - - // Wait for group to become Active before proceeding - wait_for_group_active(client, "state-test-group").await; - - // Verify instance is stopped and in multicast group - assert_eq!(stopped_instance.runtime.run_state, InstanceState::Stopped); - - // Wait for member to reach "Left" state (stopped instance members start in "Left" state) - wait_for_member_state( - cptestctx, - "state-test-group", - stopped_instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Left, - ) - .await; - - // Start the instance and verify multicast behavior - let instance_id = - InstanceUuid::from_untyped_uuid(stopped_instance.identity.id); - let nexus = &cptestctx.server.server_context().nexus; - - // Start the instance using direct POST request (not PUT) - let start_url = format!( - "/v1/instances/state-test-instance/start?project={PROJECT_NAME}" - ); - NexusRequest::new( - RequestBuilder::new(client, Method::POST, &start_url) - .body(None as Option<&serde_json::Value>) - .expect_status(Some(StatusCode::ACCEPTED)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .unwrap() - .parsed_body::() - .unwrap(); - instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(&client, instance_id, InstanceState::Running).await; - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Stop the instance and verify multicast behavior persists - let stop_url = format!( - "/v1/instances/state-test-instance/stop?project={PROJECT_NAME}" - ); - NexusRequest::new( - RequestBuilder::new(client, Method::POST, &stop_url) - .body(None as Option<&serde_json::Value>) - .expect_status(Some(StatusCode::ACCEPTED)), - ) - 
.authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .unwrap() - .parsed_body::() - .unwrap(); - instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(&client, instance_id, InstanceState::Stopped).await; - - // Verify control plane still shows membership regardless of instance state - let members_url = mcast_group_members_url("state-test-group"); - let final_members: Vec = - nexus_test_utils::http_testing::NexusRequest::iter_collection_authn( - client, - &members_url, - "", - None, - ) - .await - .unwrap() - .all_items; - - assert_eq!( - final_members.len(), - 1, - "Control plane should maintain multicast membership across instance state changes" - ); - assert_eq!(final_members[0].instance_id, stopped_instance.identity.id); - - object_delete( - client, - &format!("/v1/instances/state-test-instance?project={PROJECT_NAME}"), - ) - .await; - - wait_for_group_deleted(client, "state-test-group").await; -} - -/// Test that multicast group membership persists through instance stop/start cycles -/// (parallel to external IP persistence behavior) -#[nexus_test] -async fn test_multicast_group_persistence_through_stop_start( - cptestctx: &ControlPlaneTestContext, -) { - // Ensure inventory and DPD are ready before creating instances with multicast groups - ensure_multicast_test_ready(cptestctx).await; - - let client = &cptestctx.external_client; - - // Create project and pools in parallel - let (_, _, _) = ops::join3( - create_default_ip_pool(&client), - create_project(client, PROJECT_NAME), - create_multicast_ip_pool(&client, "mcast-pool"), - ) - .await; - - // Create instance and start it (no multicast groups at creation) - let instance = instance_for_multicast_groups( - cptestctx, - PROJECT_NAME, - "persist-test-instance", - true, - &[], - ) - .await; - - // Add instance to group (group implicitly creates if it doesn't exist) - multicast_group_attach( - cptestctx, - PROJECT_NAME, - "persist-test-instance", - "persist-test-group", - ) - .await; - - // Wait for group to become Active - wait_for_group_active(client, "persist-test-group").await; - - let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); - - // Simulate the instance transitioning to Running state - let nexus = &cptestctx.server.server_context().nexus; - instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(client, instance_id, InstanceState::Running).await; - - // Wait for member to be joined (reconciler will process the sled_id set by instance start) - wait_for_member_state( - cptestctx, - "persist-test-group", - instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - - // Verify instance is in the group - let members_url = mcast_group_members_url("persist-test-group"); - let members_before_stop = - nexus_test_utils::http_testing::NexusRequest::iter_collection_authn::< - MulticastGroupMember, - >(client, &members_url, "", None) - .await - .expect("Should list group members before stop") - .all_items; - - assert_eq!( - members_before_stop.len(), - 1, - "Group should have 1 member before stop" - ); - assert_eq!(members_before_stop[0].instance_id, instance.identity.id); - - // Stop the instance - let instance_stop_url = format!( - "/v1/instances/persist-test-instance/stop?project={PROJECT_NAME}" - ); - nexus_test_utils::http_testing::NexusRequest::new( - nexus_test_utils::http_testing::RequestBuilder::new( - client, - http::Method::POST, - &instance_stop_url, - ) - .body(None as Option<&serde_json::Value>) - 
.expect_status(Some(http::StatusCode::ACCEPTED)), - ) - .authn_as(nexus_test_utils::http_testing::AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Should stop instance"); - - // Simulate the stop transition - let nexus = &cptestctx.server.server_context().nexus; - instance_simulate(nexus, &instance_id).await; - - // Wait for instance to be stopped - instance_wait_for_state( - client, - instance_id, - omicron_common::api::external::InstanceState::Stopped, - ) - .await; - - // Verify multicast group membership persists while stopped - let members_while_stopped = - nexus_test_utils::http_testing::NexusRequest::iter_collection_authn::< - MulticastGroupMember, - >(client, &members_url, "", None) - .await - .expect("Should list group members while stopped") - .all_items; - - assert_eq!( - members_while_stopped.len(), - 1, - "Group membership should persist while instance is stopped" - ); - assert_eq!(members_while_stopped[0].instance_id, instance.identity.id); - - // Start the instance again - let instance_start_url = format!( - "/v1/instances/persist-test-instance/start?project={PROJECT_NAME}" - ); - nexus_test_utils::http_testing::NexusRequest::new( - nexus_test_utils::http_testing::RequestBuilder::new( - client, - http::Method::POST, - &instance_start_url, - ) - .body(None as Option<&serde_json::Value>) - .expect_status(Some(http::StatusCode::ACCEPTED)), - ) - .authn_as(nexus_test_utils::http_testing::AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Should start instance"); - - // Simulate the instance transitioning back to "Running" state - let nexus = &cptestctx.server.server_context().nexus; - instance_simulate(nexus, &instance_id).await; - - // Wait for instance to be running again - instance_wait_for_state( - client, - instance_id, - omicron_common::api::external::InstanceState::Running, - ) - .await; - - // Verify multicast group membership still exists after restart - let members_after_restart = - nexus_test_utils::http_testing::NexusRequest::iter_collection_authn::< - MulticastGroupMember, - >(client, &members_url, "", None) - .await - .expect("Should list group members after restart") - .all_items; - - assert_eq!( - members_after_restart.len(), - 1, - "Group membership should persist after instance restart" - ); - assert_eq!(members_after_restart[0].instance_id, instance.identity.id); - - // Wait for member to be joined again after restart - wait_for_member_state( - cptestctx, - "persist-test-group", - instance.identity.id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - - cleanup_instances( - cptestctx, - client, - PROJECT_NAME, - &["persist-test-instance"], - ) - .await; - // Group is implicitly deleted when last member (instance) is removed - wait_for_group_deleted(client, "persist-test-group").await; -} - /// Verify concurrent multicast operations maintain correct member states. /// /// The system handles multiple instances joining simultaneously, rapid attach/detach @@ -1059,28 +754,25 @@ async fn test_multicast_member_cleanup_instance_never_started( wait_for_group_deleted_from_dpd(cptestctx, underlay_multicast_ip).await; } -/// Verify multicast group membership persists through instance migration. +/// Test multicast group membership during instance migration. /// -/// The RPW reconciler detects sled_id changes and updates DPD configuration on +/// This test verifies two migration scenarios: +/// 1. Single instance migration: membership persists, DPD is updated, port mapping works +/// 2. 
Concurrent migrations: multiple instances migrate simultaneously without interference +/// +/// The RPW reconciler detects `sled_id` changes and updates DPD configuration on /// both source and target switches to maintain uninterrupted multicast traffic. -/// Member state follows the expected lifecycle: Joined on source `sled` → `sled_id` -/// updated during migration → "Joined" again on target sled after reconciler -/// processes the change. #[nexus_test(extra_sled_agents = 1)] -async fn test_multicast_group_membership_during_migration( +async fn test_multicast_migration_scenarios( cptestctx: &ControlPlaneTestContext, ) { - // Ensure inventory and DPD are ready before creating instances with multicast groups ensure_multicast_test_ready(cptestctx).await; let client = &cptestctx.external_client; let lockstep_client = &cptestctx.lockstep_client; let nexus = &cptestctx.server.server_context().nexus; - let project_name = "migration-test-project"; - let group_name = "migration-test-group"; - let instance_name = "migration-test-instance"; + let project_name = "migration-project"; - // Create project and pools in parallel ops::join3( create_project(client, project_name), create_default_ip_pool(client), @@ -1093,263 +785,137 @@ async fn test_multicast_group_membership_during_migration( ) .await; - // Create and start instance first (no multicast groups at creation) - let instance = instance_for_multicast_groups( + let available_sleds = + [cptestctx.first_sled_id(), cptestctx.second_sled_id()]; + + // Case: Single instance migration with DPD verification + + let group1_name = "single-migration-group"; + let instance1 = instance_for_multicast_groups( cptestctx, project_name, - instance_name, + "single-migration-inst", true, &[], ) .await; + let instance1_id = InstanceUuid::from_untyped_uuid(instance1.identity.id); - // Add instance to group (group implicitly creates if it doesn't exist) - multicast_group_attach(cptestctx, project_name, instance_name, group_name) - .await; - wait_for_group_active(client, group_name).await; - - // Get the group's multicast IP for DPD verification later - let created_group = get_multicast_group(client, group_name).await; - let multicast_ip = created_group.multicast_ip; - - let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); + multicast_group_attach( + cptestctx, + project_name, + "single-migration-inst", + group1_name, + ) + .await; + wait_for_group_active(client, group1_name).await; - // Simulate instance startup and wait for Running state - instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(client, instance_id, InstanceState::Running).await; + let group1 = get_multicast_group(client, group1_name).await; + let multicast_ip = group1.multicast_ip; - // Wait for instance to reach "Joined" state (member creation is processed by reconciler) + instance_simulate(nexus, &instance1_id).await; + instance_wait_for_state(client, instance1_id, InstanceState::Running).await; wait_for_member_state( cptestctx, - group_name, - instance.identity.id, + group1_name, + instance1.identity.id, nexus_db_model::MulticastGroupMemberState::Joined, ) .await; - let pre_migration_members = - list_multicast_group_members(client, group_name).await; - assert_eq!(pre_migration_members.len(), 1); - assert_eq!(pre_migration_members[0].instance_id, instance.identity.id); - assert_eq!(pre_migration_members[0].state, "Joined"); - - // Verify group exists in DPD before migration + // Verify DPD before migration let dpd_client = 
nexus_test_utils::dpd_client(cptestctx); dpd_client .multicast_group_get(&multicast_ip) .await - .expect("Multicast group should exist in DPD before migration"); + .expect("Group should exist in DPD before migration"); - // Get source and target sleds for migration - let source_sled_id = nexus - .active_instance_info(&instance_id, None) + // Migrate instance + let source_sled = nexus + .active_instance_info(&instance1_id, None) .await .unwrap() .expect("Running instance should be on a sled") .sled_id; + let target_sled = + *available_sleds.iter().find(|&&s| s != source_sled).unwrap(); - let target_sled_id = if source_sled_id == cptestctx.first_sled_id() { - cptestctx.second_sled_id() - } else { - cptestctx.first_sled_id() - }; - - // Initiate migration - let migrate_url = format!("/instances/{instance_id}/migrate"); - nexus_test_utils::http_testing::NexusRequest::new( - nexus_test_utils::http_testing::RequestBuilder::new( - lockstep_client, - Method::POST, - &migrate_url, - ) - .body(Some(&InstanceMigrateRequest { dst_sled_id: target_sled_id })) - .expect_status(Some(StatusCode::OK)), + let migrate_url = format!("/instances/{instance1_id}/migrate"); + NexusRequest::new( + RequestBuilder::new(lockstep_client, Method::POST, &migrate_url) + .body(Some(&InstanceMigrateRequest { dst_sled_id: target_sled })) + .expect_status(Some(StatusCode::OK)), ) - .authn_as(nexus_test_utils::http_testing::AuthnMode::PrivilegedUser) + .authn_as(AuthnMode::PrivilegedUser) .execute() .await - .expect("Should initiate instance migration"); + .expect("Should initiate migration"); - // Get propolis IDs for source and target - let info = nexus - .active_instance_info(&instance_id, None) - .await - .unwrap() - .expect("Instance should be on a sled"); - let src_propolis_id = info.propolis_id; - let dst_propolis_id = - info.dst_propolis_id.expect("Instance should have a migration target"); + let info = + nexus.active_instance_info(&instance1_id, None).await.unwrap().unwrap(); + let src_propolis = info.propolis_id; + let dst_propolis = info.dst_propolis_id.unwrap(); - // Complete migration on source sled and wait for instance to enter "Migrating" - vmm_simulate_on_sled(cptestctx, nexus, source_sled_id, src_propolis_id) + vmm_simulate_on_sled(cptestctx, nexus, source_sled, src_propolis).await; + instance_wait_for_state(client, instance1_id, InstanceState::Migrating) .await; - // Instance should transition to "Migrating"; membership should remain "Joined" - instance_wait_for_state(client, instance_id, InstanceState::Migrating) - .await; + // Verify membership persists during migration let migrating_members = - list_multicast_group_members(client, group_name).await; - assert_eq!( - migrating_members.len(), - 1, - "Membership should remain during migration" - ); - assert_eq!(migrating_members[0].instance_id, instance.identity.id); - assert_eq!( - migrating_members[0].state, "Joined", - "Member should stay Joined while migrating" - ); + list_multicast_group_members(client, group1_name).await; + assert_eq!(migrating_members.len(), 1); + assert_eq!(migrating_members[0].state, "Joined"); - // Complete migration on target sled - vmm_simulate_on_sled(cptestctx, nexus, target_sled_id, dst_propolis_id) - .await; + vmm_simulate_on_sled(cptestctx, nexus, target_sled, dst_propolis).await; + instance_wait_for_state(client, instance1_id, InstanceState::Running).await; - // Wait for migration to complete - instance_wait_for_state(client, instance_id, InstanceState::Running).await; - - // Verify instance is now on the target sled - let 
post_migration_sled = nexus - .active_instance_info(&instance_id, None) + // Verify post-migration state + let post_sled = nexus + .active_instance_info(&instance1_id, None) .await .unwrap() - .expect("Migrated instance should still be on a sled") + .unwrap() .sled_id; + assert_eq!(post_sled, target_sled, "Instance should be on target sled"); - assert_eq!( - post_migration_sled, target_sled_id, - "Instance should be on target sled after migration" - ); - - // Wait for multicast reconciler to process the sled_id change - // The RPW reconciler should detect the sled_id change and re-apply DPD configuration wait_for_multicast_reconciler(lockstep_client).await; - - // Verify multicast membership persists after migration - let post_migration_members = - list_multicast_group_members(client, group_name).await; - - assert_eq!( - post_migration_members.len(), - 1, - "Multicast membership should persist through migration" - ); - assert_eq!(post_migration_members[0].instance_id, instance.identity.id); - - // Wait for member to reach "Joined" state on target sled - // The RPW reconciler should transition the member back to "Joined" after re-applying DPD configuration wait_for_member_state( cptestctx, - group_name, - instance.identity.id, + group1_name, + instance1.identity.id, nexus_db_model::MulticastGroupMemberState::Joined, ) .await; - let final_member_state = &post_migration_members[0]; - assert_eq!( - final_member_state.state, "Joined", - "Member should be in 'Joined' state after migration completes" - ); - - // Verify inventory-based port mapping updated correctly after migration - // This confirms the RPW reconciler correctly mapped the new sled to its rear port - verify_inventory_based_port_mapping(cptestctx, &instance_id) + verify_inventory_based_port_mapping(cptestctx, &instance1_id) .await - .expect("Port mapping should be updated after migration"); - - // Verify group still exists in DPD after migration + .expect("Port mapping should be updated"); dpd_client .multicast_group_get(&multicast_ip) .await - .expect("Multicast group should exist in DPD after migration"); - - // Cleanup: Stop and delete instance - let stop_url = - format!("/v1/instances/{instance_name}/stop?project={project_name}"); - nexus_test_utils::http_testing::NexusRequest::new( - nexus_test_utils::http_testing::RequestBuilder::new( - client, - Method::POST, - &stop_url, - ) - .body(None as Option<&serde_json::Value>) - .expect_status(Some(StatusCode::ACCEPTED)), - ) - .authn_as(nexus_test_utils::http_testing::AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Should stop instance"); - - // Simulate stop and wait for stopped state - instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(client, instance_id, InstanceState::Stopped).await; - - // Delete instance; group is implicitly deleted when last member removed - object_delete( - client, - &format!("/v1/instances/{instance_name}?project={project_name}"), - ) - .await; + .expect("Group should exist in DPD after migration"); - // Implicit model: group is implicitly deleted when last member (instance) is removed - wait_for_group_deleted(client, group_name).await; -} - -/// Verify the RPW reconciler handles concurrent instance migrations within the same multicast group. -/// -/// Multiple instances in the same multicast group can migrate simultaneously without -/// interfering with each other's membership states. 
The reconciler correctly processes -/// concurrent sled_id changes for all members, ensuring each reaches Joined state on -/// their respective target sleds. -#[nexus_test(extra_sled_agents = 1)] -async fn test_multicast_group_concurrent_member_migrations( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.external_client; - let lockstep_client = &cptestctx.lockstep_client; - let nexus = &cptestctx.server.server_context().nexus; - let project_name = "concurrent-migration-project"; - let group_name = "concurrent-migration-group"; + // Case: Concurrent migrations - // Create project and pools in parallel - ops::join3( - create_project(client, project_name), - create_default_ip_pool(client), - create_multicast_ip_pool_with_range( - client, - "concurrent-migration-pool", - (224, 62, 0, 1), - (224, 62, 0, 255), - ), - ) - .await; - - // Ensure inventory and DPD are ready before creating instances - ensure_multicast_test_ready(cptestctx).await; - - // Create multiple instances - let instance_names = ["concurrent-instance-1", "concurrent-instance-2"]; - let create_futures = instance_names - .iter() - .map(|name| create_instance(client, project_name, name)); + let group2_name = "concurrent-migration-group"; + let instance_names = ["concurrent-inst-1", "concurrent-inst-2"]; + let create_futures = + instance_names.iter().map(|n| create_instance(client, project_name, n)); let instances = ops::join_all(create_futures).await; - // First instance attach (implicitly creates the group) multicast_group_attach( cptestctx, project_name, instance_names[0], - group_name, + group2_name, ) .await; - wait_for_group_active(client, group_name).await; - - // Second instance attach (group already exists) + wait_for_group_active(client, group2_name).await; multicast_group_attach( cptestctx, project_name, instance_names[1], - group_name, + group2_name, ) .await; @@ -1358,37 +924,25 @@ async fn test_multicast_group_concurrent_member_migrations( .map(|i| InstanceUuid::from_untyped_uuid(i.identity.id)) .collect(); - // Simulate all instances to Running state in parallel - let simulate_futures = instance_ids.iter().map(|&instance_id| async move { + // Start all instances via simulation + for &instance_id in &instance_ids { instance_simulate(nexus, &instance_id).await; instance_wait_for_state(client, instance_id, InstanceState::Running) .await; - }); - ops::join_all(simulate_futures).await; - - // Wait for all members to reach "Joined" state - for instance in &instances { + } + for inst in &instances { wait_for_member_state( cptestctx, - group_name, - instance.identity.id, + group2_name, + inst.identity.id, nexus_db_model::MulticastGroupMemberState::Joined, ) .await; } - // Verify we have 2 members initially - let pre_migration_members = - list_multicast_group_members(client, group_name).await; - assert_eq!(pre_migration_members.len(), 2); - - // Get current sleds for all instances + // Get source/target sleds for each instance let mut source_sleds = Vec::new(); let mut target_sleds = Vec::new(); - - let available_sleds = - [cptestctx.first_sled_id(), cptestctx.second_sled_id()]; - for &instance_id in &instance_ids { let current_sled = nexus .active_instance_info(&instance_id, None) @@ -1397,121 +951,101 @@ async fn test_multicast_group_concurrent_member_migrations( .expect("Running instance should be on a sled") .sled_id; source_sleds.push(current_sled); - - // Find a different sled for migration target - let target_sled = available_sleds - .iter() - .find(|&&sled| sled != current_sled) - .copied() - 
.expect("Should have available target sled"); - target_sleds.push(target_sled); + target_sleds.push( + *available_sleds.iter().find(|&&s| s != current_sled).unwrap(), + ); } - // Initiate both migrations concurrently - let migration_futures = instance_ids.iter().zip(target_sleds.iter()).map( - |(&instance_id, &target_sled)| { - let migrate_url = format!("/instances/{instance_id}/migrate"); - nexus_test_utils::http_testing::NexusRequest::new( - nexus_test_utils::http_testing::RequestBuilder::new( - lockstep_client, - Method::POST, - &migrate_url, - ) - .body(Some(&InstanceMigrateRequest { - dst_sled_id: target_sled, - })) - .expect_status(Some(StatusCode::OK)), + // Initiate concurrent migrations + let migration_futures = + instance_ids.iter().zip(target_sleds.iter()).map(|(&id, &target)| { + let url = format!("/instances/{id}/migrate"); + NexusRequest::new( + RequestBuilder::new(lockstep_client, Method::POST, &url) + .body(Some(&InstanceMigrateRequest { dst_sled_id: target })) + .expect_status(Some(StatusCode::OK)), ) - .authn_as(nexus_test_utils::http_testing::AuthnMode::PrivilegedUser) + .authn_as(AuthnMode::PrivilegedUser) .execute() - }, - ); - - // Execute both migrations concurrently - let migration_responses = ops::join_all(migration_futures).await; - - // Verify both migrations were initiated successfully - for response in migration_responses { - response.expect("Migration should initiate successfully"); + }); + let responses = ops::join_all(migration_futures).await; + for r in responses { + r.expect("Migration should initiate"); } - // Complete both migrations by simulating on both source and target sleds + // Complete all migrations for (i, &instance_id) in instance_ids.iter().enumerate() { - // Get propolis IDs for this instance let info = nexus .active_instance_info(&instance_id, None) .await .unwrap() - .expect("Instance should be on a sled"); - let src_propolis_id = info.propolis_id; - let dst_propolis_id = info - .dst_propolis_id - .expect("Instance should have a migration target"); - - // Complete migration on source and target + .unwrap(); vmm_simulate_on_sled( cptestctx, nexus, source_sleds[i], - src_propolis_id, + info.propolis_id, ) .await; vmm_simulate_on_sled( cptestctx, nexus, target_sleds[i], - dst_propolis_id, + info.dst_propolis_id.unwrap(), ) .await; - instance_wait_for_state(client, instance_id, InstanceState::Running) .await; } - // Verify all instances are on their target sleds + // Verify all on target sleds for (i, &instance_id) in instance_ids.iter().enumerate() { - let current_sled = nexus + let sled = nexus .active_instance_info(&instance_id, None) .await .unwrap() - .expect("Migrated instance should be on target sled") + .unwrap() .sled_id; - assert_eq!( - current_sled, + sled, target_sleds[i], - "Instance {} should be on target sled after migration", + "Instance {} should be on target sled", i + 1 ); } - // Wait for multicast reconciler to process all sled_id changes wait_for_multicast_reconciler(lockstep_client).await; - // Verify all members are still in the group and reach "Joined" state - let post_migration_members = - list_multicast_group_members(client, group_name).await; - + let post_members = list_multicast_group_members(client, group2_name).await; assert_eq!( - post_migration_members.len(), + post_members.len(), 2, - "Both instances should remain multicast group members after concurrent migration" + "Both members should persist after concurrent migration" ); - // Verify both members reach "Joined" state on their new sleds - for instance in 
&instances { + for inst in &instances { wait_for_member_state( cptestctx, - group_name, - instance.identity.id, + group2_name, + inst.identity.id, nexus_db_model::MulticastGroupMemberState::Joined, ) .await; } - // Cleanup and delete instances (group is automatically deleted when last member removed) - cleanup_instances(cptestctx, client, project_name, &instance_names).await; - wait_for_group_deleted(client, group_name).await; + // Cleanup + cleanup_instances( + cptestctx, + client, + project_name, + &["single-migration-inst", instance_names[0], instance_names[1]], + ) + .await; + ops::join2( + wait_for_group_deleted(client, group1_name), + wait_for_group_deleted(client, group2_name), + ) + .await; } /// Test that source_ips are preserved across instance stop/start. @@ -1836,7 +1370,67 @@ async fn test_source_ips_preserved_on_instance_reconfigure( "ASM member added via reconfigure should have empty source_ips" ); + // Case: Reconfigure with only ASM group, verify SSM membership is removed + let update_body_to_remove_ssm = serde_json::json!({ + "ncpus": 2, + "memory": 4294967296_u64, + "boot_disk": null, + "auto_restart_policy": null, + "cpu_platform": null, + "multicast_groups": [ + // Only ASM group, SSM group omitted (should be removed) + { "group": asm_ip, "source_ips": null }, + ] + }); + + NexusRequest::new( + RequestBuilder::new(client, Method::PUT, &update_url) + .body(Some(&update_body_to_remove_ssm)) + .expect_status(Some(StatusCode::OK)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Should reconfigure instance, removing SSM group"); + + // Verify SSM membership was removed + let ssm_members_after: Vec<MulticastGroupMember> = + NexusRequest::iter_collection_authn( + client, + &ssm_members_url, + "", + Some(10), + ) + .await + .expect("Should list SSM group members") + .all_items; + + assert!( + ssm_members_after.is_empty(), + "SSM membership should be removed when group omitted from reconfigure" + ); + + // Verify ASM membership still exists + let asm_members_after: Vec<MulticastGroupMember> = + NexusRequest::iter_collection_authn( + client, + &asm_members_url, + "", + Some(10), + ) + .await + .expect("Should list ASM group members") + .all_items; + + assert_eq!( + asm_members_after.len(), + 1, + "ASM membership should persist when included in reconfigure" + ); + cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; + // SSM group should already be deleted (implicit deletion when last member removed) + // but wait for it anyway in case of timing issues wait_for_group_deleted(client, &ssm_group_name).await; wait_for_group_deleted(client, &asm_group_name).await; } @@ -1938,21 +1532,20 @@ async fn test_instance_create_with_ssm_multicast_groups( wait_for_group_deleted(client, &ssm_group_name).await; } -/// Test that creating an instance with SSM multicast group (by IP) without -/// sources fails validation. +/// Test that SSM multicast groups without sources fail validation on both +/// instance create and reconfigure paths. /// /// SSM addresses (232/8 for IPv4) require source IPs to be specified. This -/// test verifies the validation happens during instance creation and prevents -/// creating the instance without proper SSM sources. +/// test verifies the validation happens during both: +/// a). Instance creation (POST /v1/instances) with SSM group without sources +/// b). 
Instance reconfigure (PUT /v1/instances) adding new SSM group without sources #[nexus_test] -async fn test_instance_create_with_ssm_without_sources_fails( +async fn test_ssm_without_sources_fails_create_and_reconfigure( cptestctx: &ControlPlaneTestContext, ) { - use nexus_types::external_api::params::MulticastGroupJoinSpec; - let client = &cptestctx.external_client; - let project_name = "ssm-nosrc-create-project"; - let instance_name = "ssm-nosrc-create-instance"; + let project_name = "ssm-nosrc-project"; + let instance_name = "ssm-nosrc-instance"; // Setup: create pools and project ops::join3( @@ -1960,18 +1553,17 @@ async fn test_instance_create_with_ssm_without_sources_fails( create_project(client, project_name), create_multicast_ip_pool_with_range( client, - "ssm-nosrc-create-pool", + "ssm-nosrc-pool", (232, 80, 0, 1), (232, 80, 0, 100), ), ) .await; - // Try to create instance with SSM multicast group WITHOUT source_ips - // This should fail validation let ssm_ip: IpAddr = "232.80.0.10".parse().unwrap(); - let instance_params = InstanceCreate { + // Case: Instance creation with SSM group without sources should fail + let instance_params_with_ssm = InstanceCreate { identity: IdentityMetadataCreateParams { name: instance_name.parse().unwrap(), description: "Instance should fail with SSM without sources".into(), @@ -1989,7 +1581,6 @@ async fn test_instance_create_with_ssm_without_sources_fails( auto_restart_policy: None, anti_affinity_groups: Vec::new(), cpu_platform: None, - // SSM group by IP with NO source_ips - should fail multicast_groups: vec![MulticastGroupJoinSpec { group: ssm_ip.to_string().parse().unwrap(), source_ips: None, // Missing sources for SSM! @@ -1997,9 +1588,9 @@ async fn test_instance_create_with_ssm_without_sources_fails( }; let instance_url = format!("/v1/instances?project={project_name}"); - let error = NexusRequest::new( + let create_error = NexusRequest::new( RequestBuilder::new(client, Method::POST, &instance_url) - .body(Some(&instance_params)) + .body(Some(&instance_params_with_ssm)) .expect_status(Some(StatusCode::BAD_REQUEST)), ) .authn_as(AuthnMode::PrivilegedUser) @@ -2007,47 +1598,21 @@ async fn test_instance_create_with_ssm_without_sources_fails( .await .expect("Creating instance with SSM without sources should fail"); - // Verify the error message mentions SSM/source requirements - let error_body: serde_json::Value = - serde_json::from_slice(&error.body).unwrap(); - let error_message = error_body["message"].as_str().unwrap_or(""); + let create_error_body: serde_json::Value = + serde_json::from_slice(&create_error.body).unwrap(); + let create_error_message = + create_error_body["message"].as_str().unwrap_or(""); assert!( - error_message.contains("SSM") || error_message.contains("source"), - "Error should mention SSM or source IPs: {error_message}" + create_error_message.contains("SSM") + || create_error_message.contains("source"), + "Create error should mention SSM or source IPs: {create_error_message}" ); -} - -/// Test that instance reconfigure adding a new SSM group without sources fails. 
-/// -/// When reconfiguring an instance to add new multicast groups: -/// - For existing memberships: `source_ips = None` means "preserve existing" -/// - For new memberships: `source_ips = None` means "no sources" which is -/// invalid for SSM -#[nexus_test] -async fn test_instance_reconfigure_add_new_ssm_without_sources_fails( - cptestctx: &ControlPlaneTestContext, -) { - use omicron_common::api::external::Nullable; - let client = &cptestctx.external_client; - let project_name = "ssm-nosrc-reconfig-project"; - let instance_name = "ssm-nosrc-reconfig-instance"; - - // Setup: create pools and project - ops::join3( - create_default_ip_pool(&client), - create_project(client, project_name), - create_multicast_ip_pool_with_range( - client, - "ssm-nosrc-reconfig-pool", - (232, 81, 0, 1), - (232, 81, 0, 100), - ), - ) - .await; - - // First: create an instance WITHOUT any multicast groups - let instance_params = InstanceCreate { + // Case: Instance reconfiguration while adding SSM group without sources + // should fail + // + // We first create an instance without multicast groups + let instance_params_no_mcast = InstanceCreate { identity: IdentityMetadataCreateParams { name: instance_name.parse().unwrap(), description: "Instance for SSM reconfigure test".into(), @@ -2065,29 +1630,24 @@ async fn test_instance_reconfigure_add_new_ssm_without_sources_fails( auto_restart_policy: None, anti_affinity_groups: Vec::new(), cpu_platform: None, - multicast_groups: vec![], // No multicast groups initially + multicast_groups: vec![], // No multicast groups initially }; - let instance_url = format!("/v1/instances?project={project_name}"); let instance: Instance = - object_create(client, &instance_url, &instance_params).await; + object_create(client, &instance_url, &instance_params_no_mcast).await; let instance_id = InstanceUuid::from_untyped_uuid(instance.identity.id); - // Simulate instance to running state let nexus = &cptestctx.server.server_context().nexus; instance_simulate(nexus, &instance_id).await; instance_wait_for_state(client, instance_id, InstanceState::Running).await; - // Now try to reconfigure to add a new SSM group without sources - let ssm_ip: IpAddr = "232.81.0.10".parse().unwrap(); - + // Try to reconfigure to add SSM group without sources let update_params = InstanceUpdate { ncpus: InstanceCpuCount(2), memory: ByteCount::from_gibibytes_u32(4), boot_disk: Nullable(None), auto_restart_policy: Nullable(None), cpu_platform: Nullable(None), - // Try to add new SSM group without sources - should fail multicast_groups: Some(vec![MulticastGroupJoinSpec { group: ssm_ip.to_string().parse().unwrap(), source_ips: None, // Missing sources for new SSM group! 
@@ -2096,7 +1656,7 @@ async fn test_instance_reconfigure_add_new_ssm_without_sources_fails( let update_url = format!("/v1/instances/{instance_name}?project={project_name}"); - let error = NexusRequest::new( + let reconfig_error = NexusRequest::new( RequestBuilder::new(client, Method::PUT, &update_url) .body(Some(&update_params)) .expect_status(Some(StatusCode::BAD_REQUEST)), @@ -2104,32 +1664,35 @@ async fn test_instance_reconfigure_add_new_ssm_without_sources_fails( .authn_as(AuthnMode::PrivilegedUser) .execute() .await - .expect("Reconfigure adding new SSM group without sources should fail"); + .expect("Reconfigure adding SSM group without sources should fail"); - // Verify the error message mentions SSM/source requirements - let error_body: serde_json::Value = - serde_json::from_slice(&error.body).unwrap(); - let error_message = error_body["message"].as_str().unwrap_or(""); + let reconfig_error_body: serde_json::Value = + serde_json::from_slice(&reconfig_error.body).unwrap(); + let reconfig_error_message = + reconfig_error_body["message"].as_str().unwrap_or(""); assert!( - error_message.contains("SSM") || error_message.contains("source"), - "Error should mention SSM or source IPs: {error_message}" + reconfig_error_message.contains("SSM") + || reconfig_error_message.contains("source"), + "Reconfigure error should mention SSM or source IPs: {reconfig_error_message}" ); - // Cleanup cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; } -/// Test explicit member state transitions during reactivation (Left → Joining → Joined). +/// Test member state transitions through instance start/stop cycle. /// /// Verifies the 3-state lifecycle: /// - Create instance with multicast group → member in "Left" state (stopped) /// - Start instance → RPW transitions to "Joined" -/// - Stop instance → RPW transitions to "Left" -/// - Start instance again → RPW transitions Left → Joining → Joined (reactivation) +/// - Stop instance → RPW transitions back to "Left" +/// +/// This is the canonical test for the multicast member state machine. The +/// RPW reconciler handles all state transitions (Left ↔ Joining ↔ Joined) +/// using the same code path regardless of how the instance lifecycle is +/// triggered. Other tests (e.g., `test_source_ips_preserved_on_instance_restart`) +/// also exercise start/stop but focus on different invariants. 
#[nexus_test] -async fn test_member_state_transitions_on_reactivation( - cptestctx: &ControlPlaneTestContext, -) { +async fn test_member_state_transitions(cptestctx: &ControlPlaneTestContext) { let client = &cptestctx.external_client; let project_name = "state-transition-project"; let instance_name = "state-transition-inst"; @@ -2240,28 +1803,6 @@ async fn test_member_state_transitions_on_reactivation( ) .await; - // Start instance again (reactivation) - NexusRequest::new( - RequestBuilder::new(client, Method::POST, &start_url) - .expect_status(Some(StatusCode::ACCEPTED)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .expect("Restart should succeed"); - - instance_simulate(nexus, &instance_id).await; - instance_wait_for_state(client, instance_id, InstanceState::Running).await; - - // Case: Reactivation complete -> member goes back to "Joined" state - wait_for_member_state( - cptestctx, - &expected_group_name, - member.instance_id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - // Cleanup cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; wait_for_group_deleted(client, &expected_group_name).await; diff --git a/nexus/tests/integration_tests/multicast/mod.rs b/nexus/tests/integration_tests/multicast/mod.rs index 563ba039b1e..2fc0adad6b0 100644 --- a/nexus/tests/integration_tests/multicast/mod.rs +++ b/nexus/tests/integration_tests/multicast/mod.rs @@ -66,8 +66,15 @@ mod networking_integration; mod pool_selection; // Timeout constants for test operations -const POLL_INTERVAL: Duration = Duration::from_millis(80); -const MULTICAST_OPERATION_TIMEOUT: Duration = Duration::from_secs(120); +const POLL_INTERVAL: Duration = Duration::from_millis(50); + +// Tiered timeouts for different operation types: +// - Short: For operations that should complete quickly (group state checks, member lists) +// - Medium: For operations requiring reconciler processing or simulation (state transitions) +// - Long: For test setup operations requiring external systems (inventory population) +const SHORT_TIMEOUT: Duration = Duration::from_secs(15); +const MEDIUM_TIMEOUT: Duration = Duration::from_secs(60); +const LONG_TIMEOUT: Duration = Duration::from_secs(90); /// Generic helper for PUT upsert requests that return 201 Created. /// @@ -202,8 +209,8 @@ pub(crate) async fn activate_multicast_reconciler( /// Wait for a condition to be true, activating the reconciler periodically. /// /// This is like `wait_for_condition` but activates the multicast reconciler -/// periodically (not on every poll) to drive state changes. We activate the -/// reconciler every 500ms. +/// periodically (not on every poll) to drive state changes. We first wait for +/// any in-progress run, then activate every 60ms in the poll loop. /// /// Useful for tests that need to wait for reconciler-driven state changes /// (e.g., member state transitions). 
@@ -217,18 +224,16 @@ where F: Fn() -> Fut, Fut: Future>>, { - // Activate reconciler less frequently than we check the condition - // This reduces overhead while still driving state changes forward - const RECONCILER_ACTIVATION_INTERVAL: Duration = Duration::from_millis(500); + const RECONCILER_ACTIVATION_INTERVAL: Duration = Duration::from_millis(60); let last_reconciler_activation = Arc::new(Mutex::new(Instant::now())); - // Activate once at the start to kick things off + // Wait for any in-progress run to complete first wait_for_multicast_reconciler(lockstep_client).await; wait_for_condition( || async { - // Only activate reconciler if enough time has passed + // Activate reconciler if enough time has passed since last activation let now = Instant::now(); let should_activate = { let last = last_reconciler_activation.lock().unwrap(); @@ -236,7 +241,7 @@ where }; if should_activate { - wait_for_multicast_reconciler(lockstep_client).await; + activate_multicast_reconciler(lockstep_client).await; *last_reconciler_activation.lock().unwrap() = now; } @@ -331,8 +336,8 @@ pub(crate) async fn ensure_inventory_ready( Err(CondCheckError::::NotYet) } }, - &Duration::from_millis(500), // Check every 500ms - &Duration::from_secs(120), // Wait up to 120s + &Duration::from_millis(150), + &LONG_TIMEOUT, ) .await { @@ -394,8 +399,8 @@ pub(crate) async fn ensure_dpd_ready(cptestctx: &ControlPlaneTestContext) { } } }, - &Duration::from_millis(200), // Check every 200ms - &Duration::from_secs(30), // Wait up to 30 seconds for switches + &Duration::from_millis(100), + &Duration::from_secs(30), ) .await { @@ -467,7 +472,7 @@ pub(crate) async fn wait_for_group_state( } }, &POLL_INTERVAL, - &MULTICAST_OPERATION_TIMEOUT, + &SHORT_TIMEOUT, // Group state checks should be quick ) .await { @@ -564,7 +569,8 @@ pub(crate) async fn wait_for_member_state( } }; - // Use reconciler-activating wait for "Joined" state + // Use reconciler-activating wait for "Joined" state (requires DPD programming) + // For other states, use shorter timeout since they're simpler state changes let res = if expected_state == nexus_db_model::MulticastGroupMemberState::Joined { @@ -572,14 +578,14 @@ pub(crate) async fn wait_for_member_state( lockstep_client, check_member, &POLL_INTERVAL, - &MULTICAST_OPERATION_TIMEOUT, + &MEDIUM_TIMEOUT, // Joined requires reconciler + DPD ) .await } else { wait_for_condition( check_member, &POLL_INTERVAL, - &MULTICAST_OPERATION_TIMEOUT, + &SHORT_TIMEOUT, // Left/Joining are quick state changes ) .await }; @@ -666,7 +672,7 @@ pub(crate) async fn wait_for_instance_sled_assignment( } }, &POLL_INTERVAL, - &MULTICAST_OPERATION_TIMEOUT, + &MEDIUM_TIMEOUT, // VMM creation requires sled agent simulation ) .await { @@ -838,7 +844,7 @@ pub(crate) async fn wait_for_member_count( } }, &POLL_INTERVAL, - &MULTICAST_OPERATION_TIMEOUT, + &SHORT_TIMEOUT, // Member count checks are simple queries ) .await { @@ -880,7 +886,7 @@ pub(crate) async fn wait_for_group_deleted( } }, &POLL_INTERVAL, - &MULTICAST_OPERATION_TIMEOUT, + &MEDIUM_TIMEOUT, // Group deletion requires reconciler processing ) .await { @@ -896,39 +902,6 @@ pub(crate) async fn wait_for_group_deleted( } } -/// Verify a group is either deleted or in one of the expected states. -/// -/// Useful when DPD is unavailable and groups can't complete state transitions. -/// For example, when DPD is down during deletion, groups may be stuck in -/// "Creating" or "Deleting" state rather than being fully deleted. 
-pub(crate) async fn verify_group_deleted_or_in_states( - client: &ClientTestContext, - group_name: &str, - expected_states: &[&str], -) { - let groups_result = - nexus_test_utils::resource_helpers::objects_list_page_authz::< - MulticastGroup, - >(client, "/v1/multicast-groups") - .await; - - let matching_groups: Vec<_> = groups_result - .items - .into_iter() - .filter(|g| g.identity.name == group_name) - .collect(); - - if !matching_groups.is_empty() { - // Group still exists - should be in one of the expected states - let actual_state = &matching_groups[0].state; - assert!( - expected_states.contains(&actual_state.as_str()), - "Group {group_name} should be in one of {expected_states:?} states, found: \"{actual_state}\"" - ); - } - // If group is gone, that's also valid - operation completed -} - /// Wait for a multicast group to be deleted from DPD (dataplane) with reconciler activation. /// /// This function waits for the DPD to report that the multicast group no longer exists @@ -952,7 +925,7 @@ pub(crate) async fn wait_for_group_deleted_from_dpd( } }, &POLL_INTERVAL, - &MULTICAST_OPERATION_TIMEOUT, + &MEDIUM_TIMEOUT, // DPD deletion requires reconciler ) .await { diff --git a/nexus/tests/integration_tests/multicast/networking_integration.rs b/nexus/tests/integration_tests/multicast/networking_integration.rs index 0f397c5c1d1..c51e8e79352 100644 --- a/nexus/tests/integration_tests/multicast/networking_integration.rs +++ b/nexus/tests/integration_tests/multicast/networking_integration.rs @@ -34,40 +34,43 @@ use crate::integration_tests::instances::{ fetch_instance_external_ips, instance_simulate, instance_wait_for_state, }; -/// Verify instances can have both external IPs and multicast group membership. +/// Verify external IP allocation/deallocation works with multicast group members. /// -/// External IP allocation works for multicast group members, multicast state persists -/// through external IP operations, and no conflicts occur between external IP and multicast -/// DPD configuration. +/// External IP attach/detach doesn't affect multicast state, and dataplane +/// configuration remains consistent throughout the lifecycle. +/// +/// This is the canonical test for external IP + multicast coexistence. The +/// DPD configuration paths for external IPs and multicast are independent, +/// so testing one attach/detach cycle is sufficient to verify no interference. 
#[nexus_test] -async fn test_multicast_with_external_ip_basic( +async fn test_multicast_external_ip_lifecycle( cptestctx: &nexus_test_utils::ControlPlaneTestContext< omicron_nexus::Server, >, ) { let client = &cptestctx.external_client; - let project_name = "external-ip-mcast-project"; - let group_name = "external-ip-mcast-group"; - let instance_name = "external-ip-mcast-instance"; + let project_name = "external-ip-lifecycle-project"; + let group_name = "external-ip-lifecycle-group"; + let instance_name = "external-ip-lifecycle-instance"; - // Setup: project and IP pools in parallel + // Setup in parallel let (_, _, _) = ops::join3( create_project(client, project_name), - create_default_ip_pool(client), // For external IPs + create_default_ip_pool(client), create_multicast_ip_pool_with_range( client, - "external-ip-mcast-pool", - (224, 100, 0, 1), - (224, 100, 0, 255), + "external-ip-lifecycle-pool", + (224, 101, 0, 1), + (224, 101, 0, 255), ), ) .await; - // Create instance (will start by default) + // Create instance let instance_params = InstanceCreate { identity: IdentityMetadataCreateParams { name: instance_name.parse().unwrap(), - description: "Instance with external IP and multicast".to_string(), + description: "Instance for external IP lifecycle test".to_string(), }, ncpus: InstanceCpuCount::try_from(1).unwrap(), memory: ByteCount::from_gibibytes_u32(1), @@ -75,12 +78,12 @@ async fn test_multicast_with_external_ip_basic( user_data: vec![], ssh_public_keys: None, network_interfaces: InstanceNetworkInterfaceAttachment::Default, - external_ips: vec![], // Start without external IP + external_ips: vec![], multicast_groups: vec![], disks: vec![], boot_disk: None, cpu_platform: None, - start: true, // Start the instance + start: true, auto_restart_policy: Default::default(), anti_affinity_groups: Vec::new(), }; @@ -90,7 +93,7 @@ async fn test_multicast_with_external_ip_basic( object_create(client, &instance_url, &instance_params).await; let instance_id = instance.identity.id; - // Transition instance to Running state + // Start instance and add to multicast group let nexus = &cptestctx.server.server_context().nexus; let instance_uuid = InstanceUuid::from_untyped_uuid(instance_id); instance_simulate(nexus, &instance_uuid).await; @@ -106,7 +109,7 @@ async fn test_multicast_with_external_ip_basic( .await; wait_for_group_active(client, group_name).await; - // Wait for multicast member to reach "Joined" state + // Wait for member to transition from "Joining"->"Joined" wait_for_member_state( cptestctx, group_name, @@ -115,11 +118,14 @@ async fn test_multicast_with_external_ip_basic( ) .await; - // Verify member count - let members = list_multicast_group_members(client, group_name).await; - assert_eq!(members.len(), 1, "Should have one multicast member"); + // Verify initial multicast state + let initial_members = + list_multicast_group_members(client, group_name).await; + assert_eq!(initial_members.len(), 1); + assert_eq!(initial_members[0].state, "Joined"); - // Allocate ephemeral external IP to the same instance + // Test external IP allocation/deallocation cycle + // Allocate ephemeral external IP let ephemeral_ip_url = format!( "/v1/instances/{instance_name}/external-ips/ephemeral?project={project_name}" ); @@ -135,33 +141,31 @@ async fn test_multicast_with_external_ip_basic( .await .unwrap(); - // Verify both multicast and external IP work together + // Wait for dataplane configuration to settle + wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Check that multicast 
membership is preserved - let members_after_ip = + // Verify multicast state is preserved + let members_with_ip = list_multicast_group_members(client, group_name).await; assert_eq!( - members_after_ip.len(), + members_with_ip.len(), 1, - "Multicast member should still exist after external IP allocation" + "Multicast member should persist during external IP allocation" ); - assert_eq!(members_after_ip[0].instance_id, instance_id); assert_eq!( - members_after_ip[0].state, "Joined", - "Member state should remain Joined" + members_with_ip[0].state, "Joined", + "Member should remain Joined" ); - // Check that external IP is properly attached - let external_ips_after_attach = + // Verify external IP is attached + let external_ips_with_ip = fetch_instance_external_ips(client, instance_name, project_name).await; assert!( - !external_ips_after_attach.is_empty(), + !external_ips_with_ip.is_empty(), "Instance should have external IP" ); - // Note: external_ip.ip() from the response may differ from what's actually attached, - // so we just verify that an external IP exists - // Remove ephemeral external IP and verify multicast is unaffected + // Deallocate ephemeral external IP let external_ip_detach_url = format!( "/v1/instances/{instance_name}/external-ips/ephemeral?project={project_name}" ); @@ -170,197 +174,27 @@ async fn test_multicast_with_external_ip_basic( // Wait for operations to settle wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - // Verify multicast membership is still intact after external IP removal - let members_after_detach = + // Verify multicast state is still preserved + let members_without_ip = list_multicast_group_members(client, group_name).await; assert_eq!( - members_after_detach.len(), + members_without_ip.len(), 1, "Multicast member should persist after external IP removal" ); - assert_eq!(members_after_detach[0].instance_id, instance_id); assert_eq!( - members_after_detach[0].state, "Joined", - "Member should remain Joined" + members_without_ip[0].state, "Joined", + "Member should remain Joined after IP removal" ); // Verify ephemeral external IP is removed (SNAT IP may still be present) - let external_ips_after_detach = + let external_ips_without_ip = fetch_instance_external_ips(client, instance_name, project_name).await; - // Instance should have at most 1 IP left (the SNAT IP), not the ephemeral IP we attached assert!( - external_ips_after_detach.len() <= 1, + external_ips_without_ip.len() <= 1, "Instance should have at most SNAT IP remaining" ); - // Cleanup - cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; - // Implicit deletion model: group is implicitly deleted when last member (instance) is removed - wait_for_group_deleted(client, group_name).await; -} - -/// Verify external IP allocation/deallocation lifecycle for multicast group members. -/// -/// Multiple external IP attach/detach cycles don't affect multicast state, concurrent -/// operations don't cause race conditions, and dataplane configuration remains consistent -/// throughout the lifecycle. 
-#[nexus_test] -async fn test_multicast_external_ip_lifecycle( - cptestctx: &nexus_test_utils::ControlPlaneTestContext< - omicron_nexus::Server, - >, -) { - let client = &cptestctx.external_client; - let project_name = "external-ip-lifecycle-project"; - let group_name = "external-ip-lifecycle-group"; - let instance_name = "external-ip-lifecycle-instance"; - - // Setup in parallel - let (_, _, _) = ops::join3( - create_project(client, project_name), - create_default_ip_pool(client), - create_multicast_ip_pool_with_range( - client, - "external-ip-lifecycle-pool", - (224, 101, 0, 1), - (224, 101, 0, 255), - ), - ) - .await; - - // Create instance - let instance_params = InstanceCreate { - identity: IdentityMetadataCreateParams { - name: instance_name.parse().unwrap(), - description: "Instance for external IP lifecycle test".to_string(), - }, - ncpus: InstanceCpuCount::try_from(1).unwrap(), - memory: ByteCount::from_gibibytes_u32(1), - hostname: instance_name.parse().unwrap(), - user_data: vec![], - ssh_public_keys: None, - network_interfaces: InstanceNetworkInterfaceAttachment::Default, - external_ips: vec![], - multicast_groups: vec![], - disks: vec![], - boot_disk: None, - cpu_platform: None, - start: true, - auto_restart_policy: Default::default(), - anti_affinity_groups: Vec::new(), - }; - - let instance_url = format!("/v1/instances?project={project_name}"); - let instance: Instance = - object_create(client, &instance_url, &instance_params).await; - let instance_id = instance.identity.id; - - // Start instance and add to multicast group - let nexus = &cptestctx.server.server_context().nexus; - let instance_uuid = InstanceUuid::from_untyped_uuid(instance_id); - instance_simulate(nexus, &instance_uuid).await; - instance_wait_for_state(client, instance_uuid, InstanceState::Running) - .await; - - // Ensure multicast test prerequisites (inventory + DPD) are ready - ensure_multicast_test_ready(cptestctx).await; - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Add instance to multicast group via instance-centric API - multicast_group_attach(cptestctx, project_name, instance_name, group_name) - .await; - wait_for_group_active(client, group_name).await; - - // Wait for member to transition from "Joining"->"Joined" - wait_for_member_state( - cptestctx, - group_name, - instance_id, - nexus_db_model::MulticastGroupMemberState::Joined, - ) - .await; - - // Verify initial multicast state - let initial_members = - list_multicast_group_members(client, group_name).await; - assert_eq!(initial_members.len(), 1); - assert_eq!(initial_members[0].state, "Joined"); - - // Test multiple external IP allocation/deallocation cycles - for cycle in 1..=3 { - // Allocate ephemeral external IP - let ephemeral_ip_url = format!( - "/v1/instances/{instance_name}/external-ips/ephemeral?project={project_name}" - ); - NexusRequest::new( - RequestBuilder::new(client, Method::POST, &ephemeral_ip_url) - .body(Some(&EphemeralIpCreate { - pool: None, // Use default pool - })) - .expect_status(Some(StatusCode::ACCEPTED)), - ) - .authn_as(AuthnMode::PrivilegedUser) - .execute() - .await - .unwrap(); - - // Wait for dataplane configuration to settle - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Verify multicast state is preserved - let members_with_ip = - list_multicast_group_members(client, group_name).await; - assert_eq!( - members_with_ip.len(), - 1, - "Cycle {cycle}: Multicast member should persist during external IP allocation" - ); - assert_eq!( - members_with_ip[0].state, 
"Joined", - "Cycle {cycle}: Member should remain Joined" - ); - - // Verify external IP is attached - let external_ips_with_ip = - fetch_instance_external_ips(client, instance_name, project_name) - .await; - assert!( - !external_ips_with_ip.is_empty(), - "Cycle {cycle}: Instance should have external IP" - ); - - // Deallocate ephemeral external IP - let external_ip_detach_url = format!( - "/v1/instances/{instance_name}/external-ips/ephemeral?project={project_name}" - ); - object_delete(client, &external_ip_detach_url).await; - - // Wait for operations to settle - wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; - - // Verify multicast state is still preserved - let members_without_ip = - list_multicast_group_members(client, group_name).await; - assert_eq!( - members_without_ip.len(), - 1, - "Cycle {cycle}: Multicast member should persist after external IP removal" - ); - assert_eq!( - members_without_ip[0].state, "Joined", - "Cycle {cycle}: Member should remain Joined after IP removal" - ); - - // Verify ephemeral external IP is removed (SNAT IP may still be present) - let external_ips_without_ip = - fetch_instance_external_ips(client, instance_name, project_name) - .await; - assert!( - external_ips_without_ip.len() <= 1, - "Cycle {cycle}: Instance should have at most SNAT IP remaining" - ); - } - cleanup_instances(cptestctx, client, project_name, &[instance_name]).await; wait_for_group_deleted(client, group_name).await; } diff --git a/nexus/tests/integration_tests/multicast/pool_selection.rs b/nexus/tests/integration_tests/multicast/pool_selection.rs index 1f5100c44fe..a3e4701e985 100644 --- a/nexus/tests/integration_tests/multicast/pool_selection.rs +++ b/nexus/tests/integration_tests/multicast/pool_selection.rs @@ -97,11 +97,13 @@ async fn test_ssm_to_asm_fallback_with_sources( assert_eq!(group.state, "Active"); } -/// Test that SSM pool is preferred when both ASM and SSM pools exist. +/// Test pool selection: SSM preferred with sources, ASM used without. 
+/// +/// When both ASM and SSM pools exist: +/// - With sources → SSM pool is preferred (232.x.x.x) +/// - Without sources → ASM pool is used directly (224.x.x.x) #[nexus_test] -async fn test_ssm_pool_preferred_with_sources( - cptestctx: &ControlPlaneTestContext, -) { +async fn test_pool_selection_ssm_vs_asm(cptestctx: &ControlPlaneTestContext) { let client = &cptestctx.external_client; // Setup: create both ASM and SSM pools @@ -116,8 +118,8 @@ async fn test_ssm_pool_preferred_with_sources( ) .await; - // Create an instance - let instance = instance_for_multicast_groups( + // Case: With sources → SSM pool preferred + let ssm_instance = instance_for_multicast_groups( cptestctx, PROJECT_NAME, "ssm-prefer-instance", @@ -126,65 +128,37 @@ async fn test_ssm_pool_preferred_with_sources( ) .await; - // Join with sources - should use SSM pool (not ASM) - let join_url = format!( + let ssm_join_url = format!( "/v1/instances/{}/multicast-groups/ssm-preferred-group?project={PROJECT_NAME}", - instance.identity.id + ssm_instance.identity.id ); - put_upsert::<_, MulticastGroupMember>( client, - &join_url, + &ssm_join_url, &InstanceMulticastGroupJoin { source_ips: Some(vec!["10.0.0.1".parse::().unwrap()]), }, ) .await; - // Trigger reconciler to process the new group ("Creating" → "Active") wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + let ssm_group = wait_for_group_active(client, "ssm-preferred-group").await; - // Wait for group to become Active - let group = wait_for_group_active(client, "ssm-preferred-group").await; - - // Verify the group got an SSM IP (232.x.x.x) - let ip = group.multicast_ip; - match ip { + // Verify SSM IP (232.x.x.x) + match ssm_group.multicast_ip { IpAddr::V4(v4) => { assert!( v4.octets()[0] == 232, - "Expected SSM IP (232.x.x.x), got {ip}" + "Expected SSM IP (232.x.x.x), got {}", + ssm_group.multicast_ip ); } - IpAddr::V6(_) => { - panic!("Expected IPv4 SSM address, got IPv6: {ip}"); - } + IpAddr::V6(_) => panic!("Expected IPv4 SSM address"), } + assert_eq!(ssm_group.state, "Active"); - assert_eq!(group.state, "Active"); -} - -/// Test that ASM pool is used directly when no sources provided. 
-#[nexus_test] -async fn test_asm_pool_used_without_sources( - cptestctx: &ControlPlaneTestContext, -) { - let client = &cptestctx.external_client; - - // Setup: create both ASM and SSM pools - create_default_ip_pool(client).await; - create_project(client, PROJECT_NAME).await; - create_multicast_ip_pool(client, "asm-pool").await; - create_multicast_ip_pool_with_range( - client, - "ssm-pool", - (232, 1, 0, 0), - (232, 1, 0, 255), - ) - .await; - - // Create an instance - let instance = instance_for_multicast_groups( + // Case: Without sources → ASM pool used + let asm_instance = instance_for_multicast_groups( cptestctx, PROJECT_NAME, "asm-direct-instance", @@ -193,38 +167,30 @@ async fn test_asm_pool_used_without_sources( ) .await; - // Join without sources - should use ASM pool directly (skip SSM) - let join_url = format!( + let asm_join_url = format!( "/v1/instances/{}/multicast-groups/asm-direct-group?project={PROJECT_NAME}", - instance.identity.id + asm_instance.identity.id ); - put_upsert::<_, MulticastGroupMember>( client, - &join_url, + &asm_join_url, &InstanceMulticastGroupJoin { source_ips: None }, ) .await; - // Trigger reconciler to process the new group ("Creating" → "Active") wait_for_multicast_reconciler(&cptestctx.lockstep_client).await; + let asm_group = wait_for_group_active(client, "asm-direct-group").await; - // Wait for group to become Active - let group = wait_for_group_active(client, "asm-direct-group").await; - - // Verify the group got an ASM IP (224.x.x.x) - let ip = group.multicast_ip; - match ip { + // Verify ASM IP (224.x.x.x) + match asm_group.multicast_ip { IpAddr::V4(v4) => { assert!( v4.octets()[0] == 224, - "Expected ASM IP (224.x.x.x), got {ip}" + "Expected ASM IP (224.x.x.x), got {}", + asm_group.multicast_ip ); } - IpAddr::V6(_) => { - panic!("Expected IPv4 ASM address, got IPv6: {ip}"); - } + IpAddr::V6(_) => panic!("Expected IPv4 ASM address"), } - - assert_eq!(group.state, "Active"); + assert_eq!(asm_group.state, "Active"); }