From cbb5a982644bae16afb394f00697943175bf9496 Mon Sep 17 00:00:00 2001 From: Matthew Barnes Date: Mon, 23 Feb 2026 08:57:03 -0500 Subject: [PATCH] backend: Add missing operation controllers This adds operation controllers to monitor asynchronous create, update, and delete operations for node pool and external auth resource types and removes this logic from OperationsScanner. Although it's dead code now, I left operations_scanner.go in place because there's a good deal of metrics code there that we need to decide whether to adapt to the operation controlers or discard. There was some debate about whether the current backend metrics are actually useful; tracking CS polling success rate rather than async operation success rate. --- .../oldoperationscanner/operations_scanner.go | 167 +----------------- backend/pkg/app/backend.go | 86 +++++++-- .../operation_external_auth_create.go | 79 +++++++++ .../operation_external_auth_delete.go | 97 ++++++++++ .../operation_external_auth_update.go | 84 +++++++++ .../operation_node_pool_create.go | 79 +++++++++ .../operation_node_pool_delete.go | 107 +++++++++++ .../operation_node_pool_update.go | 79 +++++++++ .../controllers/operationcontrollers/utils.go | 132 ++++++++++++++ 9 files changed, 734 insertions(+), 176 deletions(-) create mode 100644 backend/pkg/controllers/operationcontrollers/operation_external_auth_create.go create mode 100644 backend/pkg/controllers/operationcontrollers/operation_external_auth_delete.go create mode 100644 backend/pkg/controllers/operationcontrollers/operation_external_auth_update.go create mode 100644 backend/pkg/controllers/operationcontrollers/operation_node_pool_create.go create mode 100644 backend/pkg/controllers/operationcontrollers/operation_node_pool_delete.go create mode 100644 backend/pkg/controllers/operationcontrollers/operation_node_pool_update.go diff --git a/backend/oldoperationscanner/operations_scanner.go b/backend/oldoperationscanner/operations_scanner.go index 3c3ed3ad55..68b30943d2 100644 --- a/backend/oldoperationscanner/operations_scanner.go +++ b/backend/oldoperationscanner/operations_scanner.go @@ -31,10 +31,6 @@ import ( "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/trace" - arohcpv1alpha1 "github.com/openshift-online/ocm-sdk-go/arohcp/v1alpha1" - cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" - ocmerrors "github.com/openshift-online/ocm-sdk-go/errors" - "github.com/Azure/ARO-HCP/backend/pkg/controllers/operationcontrollers" "github.com/Azure/ARO-HCP/backend/pkg/listers" backendtracing "github.com/Azure/ARO-HCP/backend/pkg/tracing" @@ -60,23 +56,6 @@ const ( pollExternalAuthOperationLabel = "poll_external_auth" ) -// Copied from uhc-clusters-service, because the -// OCM SDK does not define this for some reason. -type NodePoolStateValue string - -const ( - NodePoolStateValidating NodePoolStateValue = "validating" - NodePoolStatePending NodePoolStateValue = "pending" - NodePoolStateInstalling NodePoolStateValue = "installing" - NodePoolStateReady NodePoolStateValue = "ready" - NodePoolStateUpdating NodePoolStateValue = "updating" - NodePoolStateValidatingUpdate NodePoolStateValue = "validating_update" - NodePoolStatePendingUpdate NodePoolStateValue = "pending_update" - NodePoolStateUninstalling NodePoolStateValue = "uninstalling" - NodePoolStateRecoverableError NodePoolStateValue = "recoverable_error" - NodePoolStateError NodePoolStateValue = "error" -) - type operation struct { id string doc *api.Operation @@ -95,25 +74,6 @@ func listOperationLabelValues() iter.Seq[string] { }) } -// setSpanAttributes sets the operation and resource attributes on the span. -func (o *operation) setSpanAttributes(span trace.Span) { - // Operation attributes. - span.SetAttributes( - tracing.OperationIDKey.String(string(o.id)), - tracing.OperationTypeKey.String(string(o.doc.Request)), - tracing.OperationStatusKey.String(string(o.doc.Status)), - ) - - // Resource attributes. - if o.doc.ExternalID != nil { - span.SetAttributes( - tracing.ResourceGroupNameKey.String(o.doc.ExternalID.ResourceGroupName), - tracing.ResourceNameKey.String(o.doc.ExternalID.Name), - tracing.ResourceTypeKey.String(o.doc.ExternalID.ResourceType.Type), - ) - } -} - type OperationsScanner struct { dbClient database.DBClient lockClient database.LockClientInterface @@ -466,18 +426,15 @@ func (s *OperationsScanner) processOperations(ctx context.Context, subscriptionI // processOperation processes a single operation on a resource. func (s *OperationsScanner) processOperation(ctx context.Context, op operation) { logger := utils.LoggerFromContext(ctx) - ctx, span := startChildSpan(ctx, "processOperation") + _, span := startChildSpan(ctx, "processOperation") defer span.End() logger.Info("Processing") defer logger.Info("Processed") - switch op.doc.InternalID.Kind() { - case arohcpv1alpha1.NodePoolKind: - s.pollNodePoolOperation(ctx, op) - case cmv1.ExternalAuthKind: - s.pollExternalAuthOperation(ctx, op) - } + // XXX The previous business logic of OperationsScanner has + // been converted to various Kubernetes-style controllers + // that fulfill the OperationSynchronizer interface. } func (s *OperationsScanner) recordOperationError(ctx context.Context, operationName string, err error) { @@ -490,76 +447,6 @@ func (s *OperationsScanner) recordOperationError(ctx context.Context, operationN span.RecordError(err) } -// pollNodePoolOperation updates the status of a node pool operation. -func (s *OperationsScanner) pollNodePoolOperation(ctx context.Context, op operation) { - logger := utils.LoggerFromContext(ctx) - ctx, span := startChildSpan(ctx, "pollNodePoolOperation") - defer span.End() - defer s.updateOperationMetrics(pollNodePoolOperationLabel)() - op.setSpanAttributes(span) - - nodePoolStatus, err := s.clusterService.GetNodePoolStatus(ctx, op.doc.InternalID) - if err != nil { - var ocmError *ocmerrors.Error - if errors.As(err, &ocmError) && ocmError.Status() == http.StatusNotFound && op.doc.Request == database.OperationRequestDelete { - err = operationcontrollers.SetDeleteOperationAsCompleted(ctx, s.dbClient, op.doc, s.postAsyncNotification) - if err != nil { - s.recordOperationError(ctx, pollNodePoolOperationLabel, err) - logger.Error(err, "Failed to handle a completed deletion") - } - } else { - s.recordOperationError(ctx, pollNodePoolOperationLabel, err) - logger.Error(err, "Failed to get node pool status") - } - - return - } - - opStatus, opError, err := convertNodePoolStatus(op, nodePoolStatus) - if err != nil { - s.recordOperationError(ctx, pollNodePoolOperationLabel, err) - logger.Info("Node pool status conversion warning", "error", err.Error()) - return - } - - err = operationcontrollers.UpdateOperationStatus(ctx, s.dbClient, op.doc, opStatus, opError, s.postAsyncNotification) - if err != nil { - s.recordOperationError(ctx, pollNodePoolOperationLabel, err) - logger.Error(err, "Failed to update operation status") - } -} - -// pollExternalAuthOperation updates the status of an external auth operation. -func (s *OperationsScanner) pollExternalAuthOperation(ctx context.Context, op operation) { - logger := utils.LoggerFromContext(ctx) - ctx, span := startChildSpan(ctx, "pollExternalAuthOperation") - defer span.End() - defer s.updateOperationMetrics(pollExternalAuthOperationLabel)() - op.setSpanAttributes(span) - - _, err := s.clusterService.GetExternalAuth(ctx, op.doc.InternalID) - if err != nil { - var ocmError *ocmerrors.Error - if errors.As(err, &ocmError) && ocmError.Status() == http.StatusNotFound && op.doc.Request == database.OperationRequestDelete { - err = operationcontrollers.SetDeleteOperationAsCompleted(ctx, s.dbClient, op.doc, s.postAsyncNotification) - if err != nil { - s.recordOperationError(ctx, pollExternalAuthOperationLabel, err) - logger.Error(err, "Failed to handle a completed deletion") - } - } else { - s.recordOperationError(ctx, pollExternalAuthOperationLabel, err) - logger.Error(err, "Failed to get external auth status") - } - - return - } - err = operationcontrollers.UpdateOperationStatus(ctx, s.dbClient, op.doc, arm.ProvisioningStateSucceeded, nil, s.postAsyncNotification) - if err != nil { - s.recordOperationError(ctx, pollExternalAuthOperationLabel, err) - logger.Error(err, "Failed to update operation status") - } -} - // withSubscriptionLock holds a subscription lock while executing the given function. // In the event the subscription lock is lost, the context passed to the function will // be canceled. @@ -602,52 +489,6 @@ func (s *OperationsScanner) postAsyncNotification(ctx context.Context, operation return operationcontrollers.PostAsyncNotification(ctx, s.notificationClient, operation) } -// convertNodePoolStatus attempts to translate a NodePoolStatus object -// from Cluster Service into an ARM provisioning state and, if necessary, -// a structured OData error. -func convertNodePoolStatus(op operation, nodePoolStatus *arohcpv1alpha1.NodePoolStatus) (arm.ProvisioningState, *arm.CloudErrorBody, error) { - var opStatus = op.doc.Status - var opError *arm.CloudErrorBody - var err error - - switch state := NodePoolStateValue(nodePoolStatus.State().NodePoolStateValue()); state { - case NodePoolStateValidating, NodePoolStatePending, NodePoolStateValidatingUpdate, NodePoolStatePendingUpdate: - // These are valid node pool states for ARO-HCP but there are - // no unique ProvisioningState values for them. They should - // only occur when ProvisioningState is Accepted. - if opStatus != arm.ProvisioningStateAccepted { - err = fmt.Errorf("got NodePoolStatusValue '%s' while ProvisioningState was '%s' instead of '%s'", state, opStatus, arm.ProvisioningStateAccepted) - } - case NodePoolStateInstalling: - opStatus = arm.ProvisioningStateProvisioning - case NodePoolStateReady: - // Resource deletion is successful when fetching its state - // from Cluster Service returns a "404 Not Found" error. If - // we see the resource in a "Ready" state during a deletion - // operation, leave the current provisioning state as is. - if op.doc.Request != database.OperationRequestDelete { - opStatus = arm.ProvisioningStateSucceeded - } - case NodePoolStateUpdating: - opStatus = arm.ProvisioningStateUpdating - case NodePoolStateUninstalling: - opStatus = arm.ProvisioningStateDeleting - case NodePoolStateRecoverableError, NodePoolStateError: - // XXX OCM SDK offers no error code or message for failed node pool - // operations so "Internal Server Error" is all we can do for now. - // https://issues.redhat.com/browse/ARO-14969 - opStatus = arm.ProvisioningStateFailed - opError = arm.NewInternalServerError().CloudErrorBody - if msg, ok := nodePoolStatus.GetMessage(); ok { - opError.Message = msg - } - default: - err = fmt.Errorf("unhandled NodePoolState '%s'", state) - } - - return opStatus, opError, err -} - // StartRootSpan initiates a new parent trace. func StartRootSpan(ctx context.Context, name string) (context.Context, trace.Span) { return otel.GetTracerProvider(). diff --git a/backend/pkg/app/backend.go b/backend/pkg/app/backend.go index 29fe585ebf..3fb41a711a 100644 --- a/backend/pkg/app/backend.go +++ b/backend/pkg/app/backend.go @@ -20,7 +20,6 @@ import ( "fmt" "net/http" "sync" - "sync/atomic" "time" "github.com/prometheus/client_golang/prometheus" @@ -31,7 +30,6 @@ import ( "k8s.io/client-go/tools/leaderelection/resourcelock" utilsclock "k8s.io/utils/clock" - "github.com/Azure/ARO-HCP/backend/oldoperationscanner" "github.com/Azure/ARO-HCP/backend/pkg/controllers" "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" "github.com/Azure/ARO-HCP/backend/pkg/controllers/mismatchcontrollers" @@ -240,9 +238,6 @@ func (b *Backend) runBackendControllersUnderLeaderElection(ctx context.Context, activeOperationInformer, activeOperationLister := backendInformers.ActiveOperations() clusterInformer, _ := backendInformers.Clusters() - startedLeading := atomic.Bool{} - operationsScanner := oldoperationscanner.NewOperationsScanner( - b.options.CosmosDBClient, b.options.ClustersServiceClient, b.options.AzureLocation, subscriptionLister) dataDumpController := controllerutils.NewClusterWatchingController( "DataDump", b.options.CosmosDBClient, clusterInformer, 1*time.Minute, controllers.NewDataDumpController(activeOperationLister, b.options.CosmosDBClient)) doNothingController := controllers.NewDoNothingExampleController(b.options.CosmosDBClient, subscriptionLister) @@ -280,6 +275,72 @@ func (b *Backend) runBackendControllersUnderLeaderElection(ctx context.Context, activeOperationInformer, b.options.CosmosDBClient, ) + operationNodePoolCreateController := operationcontrollers.NewGenericOperationController( + "OperationNodePoolCreate", + operationcontrollers.NewOperationNodePoolCreateSynchronizer( + b.options.CosmosDBClient, + b.options.ClustersServiceClient, + http.DefaultClient, + ), + 10*time.Second, + activeOperationInformer, + b.options.CosmosDBClient, + ) + operationNodePoolUpdateController := operationcontrollers.NewGenericOperationController( + "OperationNodePoolUpdate", + operationcontrollers.NewOperationNodePoolUpdateSynchronizer( + b.options.CosmosDBClient, + b.options.ClustersServiceClient, + http.DefaultClient, + ), + 10*time.Second, + activeOperationInformer, + b.options.CosmosDBClient, + ) + operationNodePoolDeleteController := operationcontrollers.NewGenericOperationController( + "OperationNodePoolDelete", + operationcontrollers.NewOperationNodePoolDeleteSynchronizer( + b.options.CosmosDBClient, + b.options.ClustersServiceClient, + http.DefaultClient, + ), + 10*time.Second, + activeOperationInformer, + b.options.CosmosDBClient, + ) + operationExternalAuthCreateController := operationcontrollers.NewGenericOperationController( + "OperationExternalAuthCreate", + operationcontrollers.NewOperationExternalAuthCreateSynchronizer( + b.options.CosmosDBClient, + b.options.ClustersServiceClient, + http.DefaultClient, + ), + 10*time.Second, + activeOperationInformer, + b.options.CosmosDBClient, + ) + operationExternalAuthUpdateController := operationcontrollers.NewGenericOperationController( + "OperationExternalAuthUpdate", + operationcontrollers.NewOperationExternalAuthUpdateSynchronizer( + b.options.CosmosDBClient, + b.options.ClustersServiceClient, + http.DefaultClient, + ), + 10*time.Second, + activeOperationInformer, + b.options.CosmosDBClient, + ) + operationExternalAuthDeleteController := operationcontrollers.NewGenericOperationController( + "OperationExternalAuthDelete", + operationcontrollers.NewOperationExternalAuthDeleteSynchronizer( + b.options.CosmosDBClient, + b.options.ClustersServiceClient, + http.DefaultClient, + ), + 10*time.Second, + activeOperationInformer, + b.options.CosmosDBClient, + ) operationRequestCredentialController := operationcontrollers.NewGenericOperationController( "OperationRequestCredential", operationcontrollers.NewOperationRequestCredentialSynchronizer( @@ -333,18 +394,20 @@ func (b *Backend) runBackendControllersUnderLeaderElection(ctx context.Context, RetryPeriod: leaderElectionRetryPeriod, Callbacks: leaderelection.LeaderCallbacks{ OnStartedLeading: func(ctx context.Context) { - operationsScanner.LeaderGauge.Set(1) - startedLeading.Store(true) - // start the SharedInformers go backendInformers.RunWithContext(ctx) - go operationsScanner.Run(ctx) go dataDumpController.Run(ctx, 20) go doNothingController.Run(ctx, 20) go operationClusterCreateController.Run(ctx, 20) go operationClusterUpdateController.Run(ctx, 20) go operationClusterDeleteController.Run(ctx, 20) + go operationNodePoolCreateController.Run(ctx, 20) + go operationNodePoolUpdateController.Run(ctx, 20) + go operationNodePoolDeleteController.Run(ctx, 20) + go operationExternalAuthCreateController.Run(ctx, 20) + go operationExternalAuthUpdateController.Run(ctx, 20) + go operationExternalAuthDeleteController.Run(ctx, 20) go operationRequestCredentialController.Run(ctx, 20) go operationRevokeCredentialsController.Run(ctx, 20) go clusterServiceMatchingClusterController.Run(ctx, 20) @@ -356,10 +419,7 @@ func (b *Backend) runBackendControllersUnderLeaderElection(ctx context.Context, go triggerControlPlaneUpgradeController.Run(ctx, 20) }, OnStoppedLeading: func() { - operationsScanner.LeaderGauge.Set(0) - if startedLeading.Load() { - operationsScanner.Join() - } + // This needs to be defined even though it does nothing. }, }, ReleaseOnCancel: true, diff --git a/backend/pkg/controllers/operationcontrollers/operation_external_auth_create.go b/backend/pkg/controllers/operationcontrollers/operation_external_auth_create.go new file mode 100644 index 0000000000..e53b8f49f9 --- /dev/null +++ b/backend/pkg/controllers/operationcontrollers/operation_external_auth_create.go @@ -0,0 +1,79 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operationcontrollers + +import ( + "context" + "fmt" + "net/http" + "strings" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type operationExternalAuthCreate struct { + cosmosClient database.DBClient + clusterServiceClient ocm.ClusterServiceClientSpec + notificationClient *http.Client +} + +func NewOperationExternalAuthCreateSynchronizer( + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + notificationClient *http.Client, +) OperationSynchronizer { + c := &operationExternalAuthCreate{ + cosmosClient: cosmosClient, + clusterServiceClient: clusterServiceClient, + notificationClient: notificationClient, + } + + return c +} + +func (c *operationExternalAuthCreate) ShouldProcess(ctx context.Context, operation *api.Operation) bool { + if operation.Status.IsTerminal() { + return false + } + if operation.Request != database.OperationRequestCreate { + return false + } + if operation.ExternalID == nil || !strings.EqualFold(operation.ExternalID.ResourceType.String(), api.ExternalAuthResourceType.String()) { + return false + } + return true +} + +func (c *operationExternalAuthCreate) SynchronizeOperation(ctx context.Context, key controllerutils.OperationKey) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("checking operation") + + operation, err := c.cosmosClient.Operations(key.SubscriptionID).Get(ctx, key.OperationName) + if database.IsResponseError(err, http.StatusNotFound) { + return nil // no work to do + } + if err != nil { + return fmt.Errorf("failed to get active operation: %w", err) + } + if !c.ShouldProcess(ctx, operation) { + return nil // no work to do + } + + return pollExternalAuthStatus(ctx, c.cosmosClient, c.clusterServiceClient, operation, c.notificationClient) +} diff --git a/backend/pkg/controllers/operationcontrollers/operation_external_auth_delete.go b/backend/pkg/controllers/operationcontrollers/operation_external_auth_delete.go new file mode 100644 index 0000000000..5a2fc77a15 --- /dev/null +++ b/backend/pkg/controllers/operationcontrollers/operation_external_auth_delete.go @@ -0,0 +1,97 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operationcontrollers + +import ( + "context" + "errors" + "fmt" + "net/http" + "strings" + + ocmerrors "github.com/openshift-online/ocm-sdk-go/errors" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type operationExternalAuthDelete struct { + cosmosClient database.DBClient + clusterServiceClient ocm.ClusterServiceClientSpec + notificationClient *http.Client +} + +// NewOperationExternalAuthDeleteSynchronizer periodically lists all clusters and for each out when the cluster was deleted and its state. +func NewOperationExternalAuthDeleteSynchronizer( + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + notificationClient *http.Client, +) OperationSynchronizer { + c := &operationExternalAuthDelete{ + cosmosClient: cosmosClient, + clusterServiceClient: clusterServiceClient, + notificationClient: notificationClient, + } + + return c +} + +func (c *operationExternalAuthDelete) ShouldProcess(ctx context.Context, operation *api.Operation) bool { + if operation.Status.IsTerminal() { + return false + } + if operation.Request != database.OperationRequestDelete { + return false + } + if operation.ExternalID == nil || !strings.EqualFold(operation.ExternalID.ResourceType.String(), api.ExternalAuthResourceType.String()) { + return false + } + return true +} + +func (c *operationExternalAuthDelete) SynchronizeOperation(ctx context.Context, key controllerutils.OperationKey) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("checking operation") + + operation, err := c.cosmosClient.Operations(key.SubscriptionID).Get(ctx, key.OperationName) + if database.IsResponseError(err, http.StatusNotFound) { + return nil // no work to do + } + if err != nil { + return fmt.Errorf("failed to get active operation: %w", err) + } + if !c.ShouldProcess(ctx, operation) { + return nil // no work to do + } + + _, err = c.clusterServiceClient.GetExternalAuth(ctx, operation.InternalID) + var ocmGetExternalAuthError *ocmerrors.Error + if err != nil && errors.As(err, &ocmGetExternalAuthError) && ocmGetExternalAuthError.Status() == http.StatusNotFound { + logger.Info("node pool was deleted") + + err = SetDeleteOperationAsCompleted(ctx, c.cosmosClient, operation, postAsyncNotificationFn(c.notificationClient)) + if err != nil { + logger.Error(err, "Failed to handle a completed deletion") + } + } + if err != nil { + return utils.TrackError(err) + } + + return nil +} diff --git a/backend/pkg/controllers/operationcontrollers/operation_external_auth_update.go b/backend/pkg/controllers/operationcontrollers/operation_external_auth_update.go new file mode 100644 index 0000000000..ebc5395dad --- /dev/null +++ b/backend/pkg/controllers/operationcontrollers/operation_external_auth_update.go @@ -0,0 +1,84 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operationcontrollers + +import ( + "context" + "fmt" + "net/http" + "strings" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type operationExternalAuthUpdate struct { + cosmosClient database.DBClient + clusterServiceClient ocm.ClusterServiceClientSpec + notificationClient *http.Client +} + +func NewOperationExternalAuthUpdateSynchronizer( + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + notificationClient *http.Client, +) OperationSynchronizer { + c := &operationExternalAuthUpdate{ + cosmosClient: cosmosClient, + clusterServiceClient: clusterServiceClient, + notificationClient: notificationClient, + } + + return c +} + +func (c *operationExternalAuthUpdate) ShouldProcess(ctx context.Context, operation *api.Operation) bool { + if operation.Status.IsTerminal() { + return false + } + if operation.Request != database.OperationRequestUpdate { + return false + } + if operation.ExternalID == nil || !strings.EqualFold(operation.ExternalID.ResourceType.String(), api.ExternalAuthResourceType.String()) { + return false + } + return true +} + +func (c *operationExternalAuthUpdate) SynchronizeOperation(ctx context.Context, key controllerutils.OperationKey) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("checking operation") + + operation, err := c.cosmosClient.Operations(key.SubscriptionID).Get(ctx, key.OperationName) + if database.IsResponseError(err, http.StatusNotFound) { + return nil // no work to do + } + if err != nil { + return fmt.Errorf("failed to get active operation: %w", err) + } + if !c.ShouldProcess(ctx, operation) { + return nil // no work to do + } + + _, err = c.clusterServiceClient.GetExternalAuth(ctx, operation.InternalID) + if err != nil { + return utils.TrackError(err) + } + + return pollExternalAuthStatus(ctx, c.cosmosClient, c.clusterServiceClient, operation, c.notificationClient) +} diff --git a/backend/pkg/controllers/operationcontrollers/operation_node_pool_create.go b/backend/pkg/controllers/operationcontrollers/operation_node_pool_create.go new file mode 100644 index 0000000000..b0c4c899ea --- /dev/null +++ b/backend/pkg/controllers/operationcontrollers/operation_node_pool_create.go @@ -0,0 +1,79 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operationcontrollers + +import ( + "context" + "fmt" + "net/http" + "strings" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type operationNodePoolCreate struct { + cosmosClient database.DBClient + clusterServiceClient ocm.ClusterServiceClientSpec + notificationClient *http.Client +} + +func NewOperationNodePoolCreateSynchronizer( + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + notificationClient *http.Client, +) OperationSynchronizer { + c := &operationNodePoolCreate{ + cosmosClient: cosmosClient, + clusterServiceClient: clusterServiceClient, + notificationClient: notificationClient, + } + + return c +} + +func (c *operationNodePoolCreate) ShouldProcess(ctx context.Context, operation *api.Operation) bool { + if operation.Status.IsTerminal() { + return false + } + if operation.Request != database.OperationRequestCreate { + return false + } + if operation.ExternalID == nil || !strings.EqualFold(operation.ExternalID.ResourceType.String(), api.NodePoolResourceType.String()) { + return false + } + return true +} + +func (c *operationNodePoolCreate) SynchronizeOperation(ctx context.Context, key controllerutils.OperationKey) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("checking operation") + + operation, err := c.cosmosClient.Operations(key.SubscriptionID).Get(ctx, key.OperationName) + if database.IsResponseError(err, http.StatusNotFound) { + return nil // no work to do + } + if err != nil { + return fmt.Errorf("failed to get active operation: %w", err) + } + if !c.ShouldProcess(ctx, operation) { + return nil // no work to do + } + + return pollNodePoolStatus(ctx, c.cosmosClient, c.clusterServiceClient, operation, c.notificationClient) +} diff --git a/backend/pkg/controllers/operationcontrollers/operation_node_pool_delete.go b/backend/pkg/controllers/operationcontrollers/operation_node_pool_delete.go new file mode 100644 index 0000000000..6e6e67a147 --- /dev/null +++ b/backend/pkg/controllers/operationcontrollers/operation_node_pool_delete.go @@ -0,0 +1,107 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operationcontrollers + +import ( + "context" + "errors" + "fmt" + "net/http" + "strings" + + ocmerrors "github.com/openshift-online/ocm-sdk-go/errors" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type operationNodePoolDelete struct { + cosmosClient database.DBClient + clusterServiceClient ocm.ClusterServiceClientSpec + notificationClient *http.Client +} + +// NewOperationNodePoolDeleteSynchronizer periodically lists all clusters and for each out when the cluster was deleted and its state. +func NewOperationNodePoolDeleteSynchronizer( + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + notificationClient *http.Client, +) OperationSynchronizer { + c := &operationNodePoolDelete{ + cosmosClient: cosmosClient, + clusterServiceClient: clusterServiceClient, + notificationClient: notificationClient, + } + + return c +} + +func (c *operationNodePoolDelete) ShouldProcess(ctx context.Context, operation *api.Operation) bool { + if operation.Status.IsTerminal() { + return false + } + if operation.Request != database.OperationRequestDelete { + return false + } + if operation.ExternalID == nil || !strings.EqualFold(operation.ExternalID.ResourceType.String(), api.NodePoolResourceType.String()) { + return false + } + return true +} + +func (c *operationNodePoolDelete) SynchronizeOperation(ctx context.Context, key controllerutils.OperationKey) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("checking operation") + + operation, err := c.cosmosClient.Operations(key.SubscriptionID).Get(ctx, key.OperationName) + if database.IsResponseError(err, http.StatusNotFound) { + return nil // no work to do + } + if err != nil { + return fmt.Errorf("failed to get active operation: %w", err) + } + if !c.ShouldProcess(ctx, operation) { + return nil // no work to do + } + + clusterStatus, err := c.clusterServiceClient.GetNodePoolStatus(ctx, operation.InternalID) + var ocmGetNodePoolError *ocmerrors.Error + if err != nil && errors.As(err, &ocmGetNodePoolError) && ocmGetNodePoolError.Status() == http.StatusNotFound { + logger.Info("node pool was deleted") + + err = SetDeleteOperationAsCompleted(ctx, c.cosmosClient, operation, postAsyncNotificationFn(c.notificationClient)) + if err != nil { + logger.Error(err, "Failed to handle a completed deletion") + } + } + if err != nil { + return utils.TrackError(err) + } + + newOperationStatus, newOperationError, err := convertNodePoolStatus(operation, clusterStatus) + if err != nil { + return utils.TrackError(err) + } + + err = UpdateOperationStatus(ctx, c.cosmosClient, operation, newOperationStatus, newOperationError, postAsyncNotificationFn(c.notificationClient)) + if err != nil { + return utils.TrackError(err) + } + + return nil +} diff --git a/backend/pkg/controllers/operationcontrollers/operation_node_pool_update.go b/backend/pkg/controllers/operationcontrollers/operation_node_pool_update.go new file mode 100644 index 0000000000..39ef2784c6 --- /dev/null +++ b/backend/pkg/controllers/operationcontrollers/operation_node_pool_update.go @@ -0,0 +1,79 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operationcontrollers + +import ( + "context" + "fmt" + "net/http" + "strings" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type operationNodePoolUpdate struct { + cosmosClient database.DBClient + clusterServiceClient ocm.ClusterServiceClientSpec + notificationClient *http.Client +} + +func NewOperationNodePoolUpdateSynchronizer( + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + notificationClient *http.Client, +) OperationSynchronizer { + c := &operationNodePoolUpdate{ + cosmosClient: cosmosClient, + clusterServiceClient: clusterServiceClient, + notificationClient: notificationClient, + } + + return c +} + +func (c *operationNodePoolUpdate) ShouldProcess(ctx context.Context, operation *api.Operation) bool { + if operation.Status.IsTerminal() { + return false + } + if operation.Request != database.OperationRequestUpdate { + return false + } + if operation.ExternalID == nil || !strings.EqualFold(operation.ExternalID.ResourceType.String(), api.NodePoolResourceType.String()) { + return false + } + return true +} + +func (c *operationNodePoolUpdate) SynchronizeOperation(ctx context.Context, key controllerutils.OperationKey) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("checking operation") + + operation, err := c.cosmosClient.Operations(key.SubscriptionID).Get(ctx, key.OperationName) + if database.IsResponseError(err, http.StatusNotFound) { + return nil // no work to do + } + if err != nil { + return fmt.Errorf("failed to get active operation: %w", err) + } + if !c.ShouldProcess(ctx, operation) { + return nil // no work to do + } + + return pollNodePoolStatus(ctx, c.cosmosClient, c.clusterServiceClient, operation, c.notificationClient) +} diff --git a/backend/pkg/controllers/operationcontrollers/utils.go b/backend/pkg/controllers/operationcontrollers/utils.go index b8b8babae8..85c4efbf88 100644 --- a/backend/pkg/controllers/operationcontrollers/utils.go +++ b/backend/pkg/controllers/operationcontrollers/utils.go @@ -44,6 +44,23 @@ var localClock clock.Clock = clock.RealClock{} type PostAsyncNotificationFunc func(ctx context.Context, operation *api.Operation) error +// Copied from uhc-clusters-service, because the +// OCM SDK does not define this for some reason. +type NodePoolStateValue string + +const ( + NodePoolStateValidating NodePoolStateValue = "validating" + NodePoolStatePending NodePoolStateValue = "pending" + NodePoolStateInstalling NodePoolStateValue = "installing" + NodePoolStateReady NodePoolStateValue = "ready" + NodePoolStateUpdating NodePoolStateValue = "updating" + NodePoolStateValidatingUpdate NodePoolStateValue = "validating_update" + NodePoolStatePendingUpdate NodePoolStateValue = "pending_update" + NodePoolStateUninstalling NodePoolStateValue = "uninstalling" + NodePoolStateRecoverableError NodePoolStateValue = "recoverable_error" + NodePoolStateError NodePoolStateValue = "error" +) + // UpdateOperationStatus updates Cosmos DB to reflect an updated resource status. func UpdateOperationStatus(ctx context.Context, cosmosClient database.DBClient, existingOperation *api.Operation, newOperationStatus arm.ProvisioningState, newOperationError *arm.CloudErrorBody, postAsyncNotificationFn PostAsyncNotificationFunc) error { logger := utils.LoggerFromContext(ctx) @@ -341,6 +358,121 @@ func convertClusterStatus(ctx context.Context, clusterServiceClient ocm.ClusterS return newOperationStatus, opError, err } +// pollNodePoolStatus converts a node pool status from Cluster +// Service to info for an Azure async operation status endpoint. +func pollNodePoolStatus( + ctx context.Context, + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + operation *api.Operation, + notificationClient *http.Client) error { + // XXX This is currently called by the operationNodePoolCreate and + // operationNodePoolUpdate controllers because the logic flows + // are identical. If the logic flows ever diverge, then this + // function should be split up and the pieces moved back to + // their respective controllers. + + logger := utils.LoggerFromContext(ctx) + + nodePoolStatus, err := clusterServiceClient.GetNodePoolStatus(ctx, operation.InternalID) + if err != nil { + return utils.TrackError(err) + } + + newOperationStatus, newOperationError, err := convertNodePoolStatus(operation, nodePoolStatus) + if err != nil { + return utils.TrackError(err) + } + logger.Info("new status", "newStatus", newOperationStatus) + + logger.Info("updating status") + err = UpdateOperationStatus(ctx, cosmosClient, operation, newOperationStatus, newOperationError, postAsyncNotificationFn(notificationClient)) + if err != nil { + return utils.TrackError(err) + } + + return nil +} + +// convertNodePoolStatus attempts to translate a NodePoolStatus object +// from Cluster Service into an ARM provisioning state and, if necessary, +// a structured OData error. +func convertNodePoolStatus(operation *api.Operation, nodePoolStatus *arohcpv1alpha1.NodePoolStatus) (arm.ProvisioningState, *arm.CloudErrorBody, error) { + var newOperationStatus = operation.Status + var opError *arm.CloudErrorBody + var err error + + switch state := NodePoolStateValue(nodePoolStatus.State().NodePoolStateValue()); state { + case NodePoolStateValidating, NodePoolStatePending, NodePoolStateValidatingUpdate, NodePoolStatePendingUpdate: + // These are valid node pool states for ARO-HCP but there are + // no unique ProvisioningState values for them. They should + // only occur when ProvisioningState is Accepted. + if operation.Status != arm.ProvisioningStateAccepted { + err = fmt.Errorf("got NodePoolStatusValue '%s' while ProvisioningState was '%s' instead of '%s'", state, operation.Status, arm.ProvisioningStateAccepted) + } + case NodePoolStateInstalling: + newOperationStatus = arm.ProvisioningStateProvisioning + case NodePoolStateReady: + // Resource deletion is successful when fetching its state + // from Cluster Service returns a "404 Not Found" error. If + // we see the resource in a "Ready" state during a deletion + // operation, leave the current provisioning state as is. + if operation.Request != database.OperationRequestDelete { + newOperationStatus = arm.ProvisioningStateSucceeded + } + case NodePoolStateUpdating: + newOperationStatus = arm.ProvisioningStateUpdating + case NodePoolStateUninstalling: + newOperationStatus = arm.ProvisioningStateDeleting + case NodePoolStateRecoverableError, NodePoolStateError: + // XXX OCM SDK offers no error code or message for failed node pool + // operations so "Internal Server Error" is all we can do for now. + // https://issues.redhat.com/browse/ARO-14969 + newOperationStatus = arm.ProvisioningStateFailed + opError = arm.NewInternalServerError().CloudErrorBody + if msg, ok := nodePoolStatus.GetMessage(); ok { + opError.Message = msg + } + default: + err = fmt.Errorf("unhandled NodePoolState '%s'", state) + } + + return newOperationStatus, opError, err +} + +// pollExternalAuthStatus converts an external auth status from Cluster +// Service to info for an Azure async operation status endpoint. +func pollExternalAuthStatus( + ctx context.Context, + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + operation *api.Operation, + notificationClient *http.Client) error { + // XXX This is currently called by the operationExternalAuthCreate and + // operationExternalAuthUpdate controllers because the logic flows + // are identical. If the logic flows ever diverge, then this + // function should be split up and the pieces moved back to + // their respective controllers. + + logger := utils.LoggerFromContext(ctx) + + _, err := clusterServiceClient.GetExternalAuth(ctx, operation.InternalID) + if err != nil { + return utils.TrackError(err) + } + + newOperationStatus := arm.ProvisioningStateSucceeded + logger.Info("new status", "newStatus", newOperationStatus) + + logger.Info("updating status") + err = UpdateOperationStatus(ctx, cosmosClient, operation, newOperationStatus, nil, postAsyncNotificationFn(notificationClient)) + if err != nil { + return utils.TrackError(err) + } + + return nil +} + // convertInflightChecks gets a cluster internal ID, fetches inflight check errors from CS endpoint, and converts them // to arm.CloudErrorBody type. // The function should be triggered only if inflight errors occurred with provision error code OCM4001.