diff --git a/platform-manageability-agent/cmd/platform-manageability-agent/platform-manageability-agent.go b/platform-manageability-agent/cmd/platform-manageability-agent/platform-manageability-agent.go index 19f6d7b9..42905fd3 100644 --- a/platform-manageability-agent/cmd/platform-manageability-agent/platform-manageability-agent.go +++ b/platform-manageability-agent/cmd/platform-manageability-agent/platform-manageability-agent.go @@ -135,6 +135,9 @@ func main() { hostID := confs.GUID + // Declare lastCheckTimestamp here so it can be used in the callback + var lastCheckTimestamp int64 + op := func() error { log.Infof("Reporting AMT status for host %s", hostID) status, err := dmMgrClient.ReportAMTStatus(auth.GetAuthContext(ctx, confs.AccessTokenPath), hostID) @@ -178,11 +181,16 @@ func main() { } log.Infof("Successfully reported AMT status for host %s", hostID) + // Set up status update callback for long-running activation operations + dmMgrClient.SetStatusUpdateCallback(func() { + atomic.StoreInt64(&lastCheckTimestamp, time.Now().Unix()) + log.Info("Status timestamp updated via callback during activation operation") + }) + var ( wg sync.WaitGroup activationCheckInterval = confs.Manageability.HeartbeatInterval lastActivationCheckTimestamp int64 - lastCheckTimestamp int64 ) wg.Add(1) diff --git a/platform-manageability-agent/internal/comms/comms.go b/platform-manageability-agent/internal/comms/comms.go index 6cc67bd4..ee0654a8 100644 --- a/platform-manageability-agent/internal/comms/comms.go +++ b/platform-manageability-agent/internal/comms/comms.go @@ -73,9 +73,18 @@ type Client struct { connectingStateStartTime *time.Time // Track when AMT entered "connecting" state previousState string // Track the previous AMT state to detect direct transitions deactivationInProgress bool // Track if deactivation is currently in progress + activationInProgress bool // Track if activation is currently in progress + statusUpdateCallback func() // Callback to update timestamp before long-running activation operations mu sync.RWMutex // Protects concurrent access to the fields above } +// SetStatusUpdateCallback sets the callback function to update status timestamp +func (cli *Client) SetStatusUpdateCallback(callback func()) { + cli.mu.Lock() + defer cli.mu.Unlock() + cli.statusUpdateCallback = callback +} + func WithNetworkDialer(serviceAddr string) func(*Client) { return func(s *Client) { s.Dialer = grpc.WithContextDialer(func(ctx context.Context, s string) (net.Conn, error) { @@ -279,38 +288,8 @@ func (cli *Client) RetrieveActivationDetails(ctx context.Context, hostID string, cli.connectingStateStartTime = nil cli.mu.Unlock() - // Execute activation command - rpsAddress := fmt.Sprintf("wss://%s/activate", conf.RPSAddress) - password := resp.ActionPassword - activationOutput, activationErr := cli.Executor.ExecuteAMTActivate(rpsAddress, resp.ProfileName, password) - - // handles intermittent activation failures - // and allows the main periodic timer to retry activation in the next cycle - outputStr := string(activationOutput) - if strings.Contains(outputStr, `msg="interrupted system call"`) || - strings.Contains(outputStr, "exit code: 10") { - log.Logger.Warnf("Interrupted system call detected for host %s - retrying next cycle", hostID) - } - if strings.Contains(outputStr, `msg="Unable to authenticate with AMT"`) { - log.Logger.Warnf("Unable to authenticate with AMT for host %s - triggering deactivation", hostID) - cli.triggerDeactivationAsync(hostID) - } - - if activationErr != nil { - log.Logger.Errorf("Failed to execute activation command for host %s: %v", hostID, activationErr) - } - - // Check provisioning status - ok := cli.isProvisioned(ctx, outputStr, hostID) - if !ok { - log.Logger.Errorf("Failed to execute activation command for host %s: %v, Output: %s", - hostID, activationErr, outputStr) - activationStatus = pb.ActivationStatus_ACTIVATION_FAILED - } else { - log.Logger.Debugf("Activation command output for host %s: %s", hostID, outputStr) - activationStatus = pb.ActivationStatus_ACTIVATING - log.Logger.Debugf("setting activation status to %s: %s", activationStatus, hostID) - } + // Trigger async activation and return immediately + activationStatus = cli.triggerActivationAsync(ctx, hostID, conf, resp.ProfileName, resp.ActionPassword) case "connecting": log.Logger.Infof("host %s is in 'connecting' state", hostID) activationStatus = cli.handleConnectingState(hostID, normalizedStatus) @@ -387,6 +366,95 @@ func (cli *Client) handleConnectingState(hostID string, currentState string) pb. return pb.ActivationStatus_ACTIVATING } +// triggerActivationAsync launches activation in background goroutine and returns immediately +// The periodic timer will retry if activation fails and RAS status remains "not connected" +func (cli *Client) triggerActivationAsync(ctx context.Context, hostID string, conf *config.Config, profileName, password string) pb.ActivationStatus { + cli.mu.Lock() + defer cli.mu.Unlock() + + // Only trigger activation if not already in progress + if cli.activationInProgress { + log.Logger.Infof("Activation already in progress for host %s, skipping", hostID) + return pb.ActivationStatus_ACTIVATING + } + + // Mark activation as in progress + cli.activationInProgress = true + log.Logger.Infof("Starting async activation for host %s", hostID) + + // Launch goroutine for activation + go cli.performActivationAsync(ctx, hostID, conf, profileName, password) + // Return ACTIVATING immediately to indicate activation started + return pb.ActivationStatus_ACTIVATING +} + +// performActivationAsync executes activation in background and updates status timestamp periodically +func (cli *Client) performActivationAsync(ctx context.Context, hostID string, conf *config.Config, profileName, password string) { + log.Logger.Infof("Starting async activation for host %s", hostID) + + // Always reset activation in progress flag when done + defer func() { + cli.mu.Lock() + cli.activationInProgress = false + cli.mu.Unlock() + }() + + // Update status timestamp before starting long-running operation + cli.mu.RLock() + if cli.statusUpdateCallback != nil { + cli.statusUpdateCallback() + log.Logger.Debug("Updated status timestamp before async activation") + } + cli.mu.RUnlock() + + // Execute activation command + rpsAddress := fmt.Sprintf("wss://%s/activate", conf.RPSAddress) + activationOutput, activationErr := cli.Executor.ExecuteAMTActivate(rpsAddress, profileName, password) + + // Update status timestamp after activation completes + cli.mu.RLock() + if cli.statusUpdateCallback != nil { + cli.statusUpdateCallback() + log.Logger.Debug("Updated status timestamp after async activation completed") + } + cli.mu.RUnlock() + + // Handle intermittent activation failures + outputStr := string(activationOutput) + if strings.Contains(outputStr, `msg="interrupted system call"`) || + strings.Contains(outputStr, "exit code: 10") { + log.Logger.Warnf("Interrupted system call detected for host %s - will retry next cycle", hostID) + return + } + if strings.Contains(outputStr, `msg="Unable to authenticate with AMT"`) { + log.Logger.Warnf("Unable to authenticate with AMT for host %s - triggering deactivation", hostID) + cli.triggerDeactivationAsync(hostID) + // Report activation failed before deactivation + _ = cli.reportActivationResult(ctx, hostID, pb.ActivationStatus_ACTIVATION_FAILED) + return + } + + if activationErr != nil { + log.Logger.Errorf("Failed to execute activation command for host %s: %v", hostID, activationErr) + // Report activation failed to dm-manager + _ = cli.reportActivationResult(ctx, hostID, pb.ActivationStatus_ACTIVATION_FAILED) + return + } + + // Check provisioning status + ok := cli.isProvisioned(ctx, outputStr, hostID) + if !ok { + log.Logger.Errorf("Failed to provision host %s - Output: %s", hostID, outputStr) + // Report activation failed to dm-manager + _ = cli.reportActivationResult(ctx, hostID, pb.ActivationStatus_ACTIVATION_FAILED) + } else { + log.Logger.Infof("Activation completed successfully for host %s", hostID) + // Activation is still in progress - will transition to ACTIVATED when RAS status becomes "connected" + // The next periodic check will detect "connected" state and report ACTIVATED + // No need to report here as activation is still progressing + } +} + // triggerDeactivationAsync launches deactivation in background goroutine and returns immediately func (cli *Client) triggerDeactivationAsync(hostID string) pb.ActivationStatus { cli.mu.Lock() diff --git a/platform-manageability-agent/internal/comms/comms_test.go b/platform-manageability-agent/internal/comms/comms_test.go index 9887d21a..9f36c477 100644 --- a/platform-manageability-agent/internal/comms/comms_test.go +++ b/platform-manageability-agent/internal/comms/comms_test.go @@ -8,6 +8,7 @@ import ( "crypto/tls" "fmt" "net" + "sync" "testing" "time" @@ -179,12 +180,15 @@ func TestRetrieveActivationDetails_Success(t *testing.T) { } func TestRetrieveActivationDetails_Failed(t *testing.T) { - var capturedRequest *pb.ActivationResultRequest + var capturedRequests []*pb.ActivationResultRequest + var mu sync.Mutex // Protect capturedRequests from concurrent access mockServer := &mockDeviceManagementServer{ operationType: pb.OperationType_ACTIVATE, onReportActivationResults: func(ctx context.Context, req *pb.ActivationResultRequest) (*pb.ActivationResultResponse, error) { - capturedRequest = req // Capture the request to verify later. + mu.Lock() + capturedRequests = append(capturedRequests, req) // Capture all requests + mu.Unlock() log.Logger.Infof("Received ReportActivationResults request: %v", req) return &pb.ActivationResultResponse{}, nil }, @@ -217,11 +221,23 @@ func TestRetrieveActivationDetails_Failed(t *testing.T) { }) assert.NoError(t, err, "RetrieveActivationDetails should succeed") + // Wait briefly for async activation to complete + time.Sleep(100 * time.Millisecond) + // Verify that the activation result was reported with ACTIVATION_FAILED status since CIRA is not configured. - assert.NotNil(t, capturedRequest, "Activation result should have been reported") - assert.Equal(t, "host-id", capturedRequest.HostId, "Host ID should match") - assert.Equal(t, pb.ActivationStatus_ACTIVATION_FAILED, capturedRequest.ActivationStatus, - "Activation status should be ACTIVATION_FAILED when CIRA is not configured") + mu.Lock() + assert.NotEmpty(t, capturedRequests, "At least one activation result should have been reported") + + // Check that final status is ACTIVATION_FAILED + hasActivationFailed := false + for _, req := range capturedRequests { + assert.Equal(t, "host-id", req.HostId, "Host ID should match") + if req.ActivationStatus == pb.ActivationStatus_ACTIVATION_FAILED { + hasActivationFailed = true + } + } + mu.Unlock() + assert.True(t, hasActivationFailed, "Should have received ACTIVATION_FAILED status when CIRA is not configured") } func TestRetrieveActivationDetails_Connecting_Timeout(t *testing.T) { @@ -511,10 +527,13 @@ func TestRetrieveActivationDetails_InterruptedSystemCall(t *testing.T) { }) assert.NoError(t, err, "RetrieveActivationDetails should succeed") - // Verify activation failed due to interrupted system call + // Wait briefly for async activation to complete + time.Sleep(100 * time.Millisecond) + + // Verify activation status - interrupted system call should return ACTIVATING for retry assert.NotNil(t, capturedRequest, "Activation result should have been reported") - assert.Equal(t, pb.ActivationStatus_ACTIVATION_FAILED, capturedRequest.ActivationStatus, - "Activation should fail when interrupted system call occurs") + assert.Equal(t, pb.ActivationStatus_ACTIVATING, capturedRequest.ActivationStatus, + "Activation should return ACTIVATING when interrupted system call occurs (retryable)") } // TestRetrieveActivationDetails_InterruptedSystemCallWithExitCode tests exit code 10 detection @@ -556,10 +575,13 @@ func TestRetrieveActivationDetails_InterruptedSystemCallWithExitCode(t *testing. }) assert.NoError(t, err, "RetrieveActivationDetails should succeed") - // Verify activation failed due to exit code 10 + // Wait briefly for async activation to complete + time.Sleep(100 * time.Millisecond) + + // Verify activation status - exit code 10 should return ACTIVATING for retry assert.NotNil(t, capturedRequest, "Activation result should have been reported") - assert.Equal(t, pb.ActivationStatus_ACTIVATION_FAILED, capturedRequest.ActivationStatus, - "Activation should fail when exit code 10 occurs") + assert.Equal(t, pb.ActivationStatus_ACTIVATING, capturedRequest.ActivationStatus, + "Activation should return ACTIVATING when exit code 10 occurs (retryable)") } // TestRetrieveActivationDetails_ConnectingStateTimeout tests the 3-minute timeout for connecting state @@ -849,11 +871,14 @@ func TestPerformDeactivationAsync_AMTInfoFailures(t *testing.T) { // TestRetrieveActivationDetails_UnableToAuthenticateWithAMT tests the "Unable to authenticate with AMT" scenario func TestRetrieveActivationDetails_UnableToAuthenticateWithAMT(t *testing.T) { var capturedRequests []*pb.ActivationResultRequest + var mu sync.Mutex // Protect capturedRequests from concurrent access mockServer := &mockDeviceManagementServer{ operationType: pb.OperationType_ACTIVATE, onReportActivationResults: func(ctx context.Context, req *pb.ActivationResultRequest) (*pb.ActivationResultResponse, error) { + mu.Lock() capturedRequests = append(capturedRequests, req) + mu.Unlock() return &pb.ActivationResultResponse{}, nil }, } @@ -893,7 +918,11 @@ func TestRetrieveActivationDetails_UnableToAuthenticateWithAMT(t *testing.T) { // Should succeed despite the authentication failure (async deactivation triggered) assert.NoError(t, err, "RetrieveActivationDetails should succeed with async deactivation logic") + // Wait for async operations to complete + time.Sleep(100 * time.Millisecond) + // Verify that at least one activation result was reported + mu.Lock() assert.NotEmpty(t, capturedRequests, "At least one activation result should have been reported") // The "Unable to authenticate with AMT" error should trigger async deactivation @@ -905,6 +934,7 @@ func TestRetrieveActivationDetails_UnableToAuthenticateWithAMT(t *testing.T) { break } } + mu.Unlock() assert.True(t, hasActivationFailedStatus, "Should have received ACTIVATION_FAILED status due to authentication failure triggering deactivation") // Give a brief moment for async deactivation to complete