Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,9 @@ func main() {

hostID := confs.GUID

// Declare lastCheckTimestamp here so it can be used in the callback
var lastCheckTimestamp int64

op := func() error {
log.Infof("Reporting AMT status for host %s", hostID)
status, err := dmMgrClient.ReportAMTStatus(auth.GetAuthContext(ctx, confs.AccessTokenPath), hostID)
Expand Down Expand Up @@ -178,11 +181,16 @@ func main() {
}
log.Infof("Successfully reported AMT status for host %s", hostID)

// Set up status update callback for long-running activation operations
dmMgrClient.SetStatusUpdateCallback(func() {
atomic.StoreInt64(&lastCheckTimestamp, time.Now().Unix())
log.Info("Status timestamp updated via callback during activation operation")
})

var (
wg sync.WaitGroup
activationCheckInterval = confs.Manageability.HeartbeatInterval
lastActivationCheckTimestamp int64
lastCheckTimestamp int64
)

wg.Add(1)
Expand Down
132 changes: 100 additions & 32 deletions platform-manageability-agent/internal/comms/comms.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,18 @@ type Client struct {
connectingStateStartTime *time.Time // Track when AMT entered "connecting" state
previousState string // Track the previous AMT state to detect direct transitions
deactivationInProgress bool // Track if deactivation is currently in progress
activationInProgress bool // Track if activation is currently in progress
statusUpdateCallback func() // Callback to update timestamp before long-running activation operations
mu sync.RWMutex // Protects concurrent access to the fields above
}

// SetStatusUpdateCallback sets the callback function to update status timestamp
func (cli *Client) SetStatusUpdateCallback(callback func()) {
cli.mu.Lock()
defer cli.mu.Unlock()
cli.statusUpdateCallback = callback
}

func WithNetworkDialer(serviceAddr string) func(*Client) {
return func(s *Client) {
s.Dialer = grpc.WithContextDialer(func(ctx context.Context, s string) (net.Conn, error) {
Expand Down Expand Up @@ -279,38 +288,8 @@ func (cli *Client) RetrieveActivationDetails(ctx context.Context, hostID string,
cli.connectingStateStartTime = nil
cli.mu.Unlock()

// Execute activation command
rpsAddress := fmt.Sprintf("wss://%s/activate", conf.RPSAddress)
password := resp.ActionPassword
activationOutput, activationErr := cli.Executor.ExecuteAMTActivate(rpsAddress, resp.ProfileName, password)

// handles intermittent activation failures
// and allows the main periodic timer to retry activation in the next cycle
outputStr := string(activationOutput)
if strings.Contains(outputStr, `msg="interrupted system call"`) ||
strings.Contains(outputStr, "exit code: 10") {
log.Logger.Warnf("Interrupted system call detected for host %s - retrying next cycle", hostID)
}
if strings.Contains(outputStr, `msg="Unable to authenticate with AMT"`) {
log.Logger.Warnf("Unable to authenticate with AMT for host %s - triggering deactivation", hostID)
cli.triggerDeactivationAsync(hostID)
}

if activationErr != nil {
log.Logger.Errorf("Failed to execute activation command for host %s: %v", hostID, activationErr)
}

// Check provisioning status
ok := cli.isProvisioned(ctx, outputStr, hostID)
if !ok {
log.Logger.Errorf("Failed to execute activation command for host %s: %v, Output: %s",
hostID, activationErr, outputStr)
activationStatus = pb.ActivationStatus_ACTIVATION_FAILED
} else {
log.Logger.Debugf("Activation command output for host %s: %s", hostID, outputStr)
activationStatus = pb.ActivationStatus_ACTIVATING
log.Logger.Debugf("setting activation status to %s: %s", activationStatus, hostID)
}
// Trigger async activation and return immediately
activationStatus = cli.triggerActivationAsync(ctx, hostID, conf, resp.ProfileName, resp.ActionPassword)
case "connecting":
log.Logger.Infof("host %s is in 'connecting' state", hostID)
activationStatus = cli.handleConnectingState(hostID, normalizedStatus)
Expand Down Expand Up @@ -387,6 +366,95 @@ func (cli *Client) handleConnectingState(hostID string, currentState string) pb.
return pb.ActivationStatus_ACTIVATING
}

// triggerActivationAsync launches activation in background goroutine and returns immediately
// The periodic timer will retry if activation fails and RAS status remains "not connected"
func (cli *Client) triggerActivationAsync(ctx context.Context, hostID string, conf *config.Config, profileName, password string) pb.ActivationStatus {
cli.mu.Lock()
defer cli.mu.Unlock()

// Only trigger activation if not already in progress
if cli.activationInProgress {
log.Logger.Infof("Activation already in progress for host %s, skipping", hostID)
return pb.ActivationStatus_ACTIVATING
}

// Mark activation as in progress
cli.activationInProgress = true
log.Logger.Infof("Starting async activation for host %s", hostID)

// Launch goroutine for activation
go cli.performActivationAsync(ctx, hostID, conf, profileName, password)
// Return ACTIVATING immediately to indicate activation started
return pb.ActivationStatus_ACTIVATING
}

// performActivationAsync executes activation in background and updates status timestamp periodically
func (cli *Client) performActivationAsync(ctx context.Context, hostID string, conf *config.Config, profileName, password string) {
log.Logger.Infof("Starting async activation for host %s", hostID)

// Always reset activation in progress flag when done
defer func() {
cli.mu.Lock()
cli.activationInProgress = false
cli.mu.Unlock()
}()

// Update status timestamp before starting long-running operation
cli.mu.RLock()
if cli.statusUpdateCallback != nil {
cli.statusUpdateCallback()
log.Logger.Debug("Updated status timestamp before async activation")
}
cli.mu.RUnlock()

// Execute activation command
rpsAddress := fmt.Sprintf("wss://%s/activate", conf.RPSAddress)
activationOutput, activationErr := cli.Executor.ExecuteAMTActivate(rpsAddress, profileName, password)

// Update status timestamp after activation completes
cli.mu.RLock()
if cli.statusUpdateCallback != nil {
cli.statusUpdateCallback()
log.Logger.Debug("Updated status timestamp after async activation completed")
}
cli.mu.RUnlock()

// Handle intermittent activation failures
outputStr := string(activationOutput)
if strings.Contains(outputStr, `msg="interrupted system call"`) ||
strings.Contains(outputStr, "exit code: 10") {
log.Logger.Warnf("Interrupted system call detected for host %s - will retry next cycle", hostID)
return
}
if strings.Contains(outputStr, `msg="Unable to authenticate with AMT"`) {
log.Logger.Warnf("Unable to authenticate with AMT for host %s - triggering deactivation", hostID)
cli.triggerDeactivationAsync(hostID)
// Report activation failed before deactivation
_ = cli.reportActivationResult(ctx, hostID, pb.ActivationStatus_ACTIVATION_FAILED)
return
}

if activationErr != nil {
log.Logger.Errorf("Failed to execute activation command for host %s: %v", hostID, activationErr)
// Report activation failed to dm-manager
_ = cli.reportActivationResult(ctx, hostID, pb.ActivationStatus_ACTIVATION_FAILED)
return
}

// Check provisioning status
ok := cli.isProvisioned(ctx, outputStr, hostID)
if !ok {
log.Logger.Errorf("Failed to provision host %s - Output: %s", hostID, outputStr)
// Report activation failed to dm-manager
_ = cli.reportActivationResult(ctx, hostID, pb.ActivationStatus_ACTIVATION_FAILED)
} else {
log.Logger.Infof("Activation completed successfully for host %s", hostID)
// Activation is still in progress - will transition to ACTIVATED when RAS status becomes "connected"
// The next periodic check will detect "connected" state and report ACTIVATED
// No need to report here as activation is still progressing
}
}

// triggerDeactivationAsync launches deactivation in background goroutine and returns immediately
func (cli *Client) triggerDeactivationAsync(hostID string) pb.ActivationStatus {
cli.mu.Lock()
Expand Down
54 changes: 42 additions & 12 deletions platform-manageability-agent/internal/comms/comms_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"crypto/tls"
"fmt"
"net"
"sync"
"testing"
"time"

Expand Down Expand Up @@ -179,12 +180,15 @@ func TestRetrieveActivationDetails_Success(t *testing.T) {
}

func TestRetrieveActivationDetails_Failed(t *testing.T) {
var capturedRequest *pb.ActivationResultRequest
var capturedRequests []*pb.ActivationResultRequest
var mu sync.Mutex // Protect capturedRequests from concurrent access

mockServer := &mockDeviceManagementServer{
operationType: pb.OperationType_ACTIVATE,
onReportActivationResults: func(ctx context.Context, req *pb.ActivationResultRequest) (*pb.ActivationResultResponse, error) {
capturedRequest = req // Capture the request to verify later.
mu.Lock()
capturedRequests = append(capturedRequests, req) // Capture all requests
mu.Unlock()
log.Logger.Infof("Received ReportActivationResults request: %v", req)
return &pb.ActivationResultResponse{}, nil
},
Expand Down Expand Up @@ -217,11 +221,23 @@ func TestRetrieveActivationDetails_Failed(t *testing.T) {
})
assert.NoError(t, err, "RetrieveActivationDetails should succeed")

// Wait briefly for async activation to complete
time.Sleep(100 * time.Millisecond)

// Verify that the activation result was reported with ACTIVATION_FAILED status since CIRA is not configured.
assert.NotNil(t, capturedRequest, "Activation result should have been reported")
assert.Equal(t, "host-id", capturedRequest.HostId, "Host ID should match")
assert.Equal(t, pb.ActivationStatus_ACTIVATION_FAILED, capturedRequest.ActivationStatus,
"Activation status should be ACTIVATION_FAILED when CIRA is not configured")
mu.Lock()
assert.NotEmpty(t, capturedRequests, "At least one activation result should have been reported")

// Check that final status is ACTIVATION_FAILED
hasActivationFailed := false
for _, req := range capturedRequests {
assert.Equal(t, "host-id", req.HostId, "Host ID should match")
if req.ActivationStatus == pb.ActivationStatus_ACTIVATION_FAILED {
hasActivationFailed = true
}
}
mu.Unlock()
assert.True(t, hasActivationFailed, "Should have received ACTIVATION_FAILED status when CIRA is not configured")
}

func TestRetrieveActivationDetails_Connecting_Timeout(t *testing.T) {
Expand Down Expand Up @@ -511,10 +527,13 @@ func TestRetrieveActivationDetails_InterruptedSystemCall(t *testing.T) {
})
assert.NoError(t, err, "RetrieveActivationDetails should succeed")

// Verify activation failed due to interrupted system call
// Wait briefly for async activation to complete
time.Sleep(100 * time.Millisecond)

// Verify activation status - interrupted system call should return ACTIVATING for retry
assert.NotNil(t, capturedRequest, "Activation result should have been reported")
assert.Equal(t, pb.ActivationStatus_ACTIVATION_FAILED, capturedRequest.ActivationStatus,
"Activation should fail when interrupted system call occurs")
assert.Equal(t, pb.ActivationStatus_ACTIVATING, capturedRequest.ActivationStatus,
"Activation should return ACTIVATING when interrupted system call occurs (retryable)")
}

// TestRetrieveActivationDetails_InterruptedSystemCallWithExitCode tests exit code 10 detection
Expand Down Expand Up @@ -556,10 +575,13 @@ func TestRetrieveActivationDetails_InterruptedSystemCallWithExitCode(t *testing.
})
assert.NoError(t, err, "RetrieveActivationDetails should succeed")

// Verify activation failed due to exit code 10
// Wait briefly for async activation to complete
time.Sleep(100 * time.Millisecond)

// Verify activation status - exit code 10 should return ACTIVATING for retry
assert.NotNil(t, capturedRequest, "Activation result should have been reported")
assert.Equal(t, pb.ActivationStatus_ACTIVATION_FAILED, capturedRequest.ActivationStatus,
"Activation should fail when exit code 10 occurs")
assert.Equal(t, pb.ActivationStatus_ACTIVATING, capturedRequest.ActivationStatus,
"Activation should return ACTIVATING when exit code 10 occurs (retryable)")
}

// TestRetrieveActivationDetails_ConnectingStateTimeout tests the 3-minute timeout for connecting state
Expand Down Expand Up @@ -849,11 +871,14 @@ func TestPerformDeactivationAsync_AMTInfoFailures(t *testing.T) {
// TestRetrieveActivationDetails_UnableToAuthenticateWithAMT tests the "Unable to authenticate with AMT" scenario
func TestRetrieveActivationDetails_UnableToAuthenticateWithAMT(t *testing.T) {
var capturedRequests []*pb.ActivationResultRequest
var mu sync.Mutex // Protect capturedRequests from concurrent access

mockServer := &mockDeviceManagementServer{
operationType: pb.OperationType_ACTIVATE,
onReportActivationResults: func(ctx context.Context, req *pb.ActivationResultRequest) (*pb.ActivationResultResponse, error) {
mu.Lock()
capturedRequests = append(capturedRequests, req)
mu.Unlock()
return &pb.ActivationResultResponse{}, nil
},
}
Expand Down Expand Up @@ -893,7 +918,11 @@ func TestRetrieveActivationDetails_UnableToAuthenticateWithAMT(t *testing.T) {
// Should succeed despite the authentication failure (async deactivation triggered)
assert.NoError(t, err, "RetrieveActivationDetails should succeed with async deactivation logic")

// Wait for async operations to complete
time.Sleep(100 * time.Millisecond)

// Verify that at least one activation result was reported
mu.Lock()
assert.NotEmpty(t, capturedRequests, "At least one activation result should have been reported")

// The "Unable to authenticate with AMT" error should trigger async deactivation
Expand All @@ -905,6 +934,7 @@ func TestRetrieveActivationDetails_UnableToAuthenticateWithAMT(t *testing.T) {
break
}
}
mu.Unlock()
assert.True(t, hasActivationFailedStatus, "Should have received ACTIVATION_FAILED status due to authentication failure triggering deactivation")

// Give a brief moment for async deactivation to complete
Expand Down