diff --git a/backend/oldoperationscanner/operations_scanner.go b/backend/oldoperationscanner/operations_scanner.go index 3c3ed3ad55..68b30943d2 100644 --- a/backend/oldoperationscanner/operations_scanner.go +++ b/backend/oldoperationscanner/operations_scanner.go @@ -31,10 +31,6 @@ import ( "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/trace" - arohcpv1alpha1 "github.com/openshift-online/ocm-sdk-go/arohcp/v1alpha1" - cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" - ocmerrors "github.com/openshift-online/ocm-sdk-go/errors" - "github.com/Azure/ARO-HCP/backend/pkg/controllers/operationcontrollers" "github.com/Azure/ARO-HCP/backend/pkg/listers" backendtracing "github.com/Azure/ARO-HCP/backend/pkg/tracing" @@ -60,23 +56,6 @@ const ( pollExternalAuthOperationLabel = "poll_external_auth" ) -// Copied from uhc-clusters-service, because the -// OCM SDK does not define this for some reason. -type NodePoolStateValue string - -const ( - NodePoolStateValidating NodePoolStateValue = "validating" - NodePoolStatePending NodePoolStateValue = "pending" - NodePoolStateInstalling NodePoolStateValue = "installing" - NodePoolStateReady NodePoolStateValue = "ready" - NodePoolStateUpdating NodePoolStateValue = "updating" - NodePoolStateValidatingUpdate NodePoolStateValue = "validating_update" - NodePoolStatePendingUpdate NodePoolStateValue = "pending_update" - NodePoolStateUninstalling NodePoolStateValue = "uninstalling" - NodePoolStateRecoverableError NodePoolStateValue = "recoverable_error" - NodePoolStateError NodePoolStateValue = "error" -) - type operation struct { id string doc *api.Operation @@ -95,25 +74,6 @@ func listOperationLabelValues() iter.Seq[string] { }) } -// setSpanAttributes sets the operation and resource attributes on the span. -func (o *operation) setSpanAttributes(span trace.Span) { - // Operation attributes. - span.SetAttributes( - tracing.OperationIDKey.String(string(o.id)), - tracing.OperationTypeKey.String(string(o.doc.Request)), - tracing.OperationStatusKey.String(string(o.doc.Status)), - ) - - // Resource attributes. - if o.doc.ExternalID != nil { - span.SetAttributes( - tracing.ResourceGroupNameKey.String(o.doc.ExternalID.ResourceGroupName), - tracing.ResourceNameKey.String(o.doc.ExternalID.Name), - tracing.ResourceTypeKey.String(o.doc.ExternalID.ResourceType.Type), - ) - } -} - type OperationsScanner struct { dbClient database.DBClient lockClient database.LockClientInterface @@ -466,18 +426,15 @@ func (s *OperationsScanner) processOperations(ctx context.Context, subscriptionI // processOperation processes a single operation on a resource. func (s *OperationsScanner) processOperation(ctx context.Context, op operation) { logger := utils.LoggerFromContext(ctx) - ctx, span := startChildSpan(ctx, "processOperation") + _, span := startChildSpan(ctx, "processOperation") defer span.End() logger.Info("Processing") defer logger.Info("Processed") - switch op.doc.InternalID.Kind() { - case arohcpv1alpha1.NodePoolKind: - s.pollNodePoolOperation(ctx, op) - case cmv1.ExternalAuthKind: - s.pollExternalAuthOperation(ctx, op) - } + // XXX The previous business logic of OperationsScanner has + // been converted to various Kubernetes-style controllers + // that fulfill the OperationSynchronizer interface. } func (s *OperationsScanner) recordOperationError(ctx context.Context, operationName string, err error) { @@ -490,76 +447,6 @@ func (s *OperationsScanner) recordOperationError(ctx context.Context, operationN span.RecordError(err) } -// pollNodePoolOperation updates the status of a node pool operation. -func (s *OperationsScanner) pollNodePoolOperation(ctx context.Context, op operation) { - logger := utils.LoggerFromContext(ctx) - ctx, span := startChildSpan(ctx, "pollNodePoolOperation") - defer span.End() - defer s.updateOperationMetrics(pollNodePoolOperationLabel)() - op.setSpanAttributes(span) - - nodePoolStatus, err := s.clusterService.GetNodePoolStatus(ctx, op.doc.InternalID) - if err != nil { - var ocmError *ocmerrors.Error - if errors.As(err, &ocmError) && ocmError.Status() == http.StatusNotFound && op.doc.Request == database.OperationRequestDelete { - err = operationcontrollers.SetDeleteOperationAsCompleted(ctx, s.dbClient, op.doc, s.postAsyncNotification) - if err != nil { - s.recordOperationError(ctx, pollNodePoolOperationLabel, err) - logger.Error(err, "Failed to handle a completed deletion") - } - } else { - s.recordOperationError(ctx, pollNodePoolOperationLabel, err) - logger.Error(err, "Failed to get node pool status") - } - - return - } - - opStatus, opError, err := convertNodePoolStatus(op, nodePoolStatus) - if err != nil { - s.recordOperationError(ctx, pollNodePoolOperationLabel, err) - logger.Info("Node pool status conversion warning", "error", err.Error()) - return - } - - err = operationcontrollers.UpdateOperationStatus(ctx, s.dbClient, op.doc, opStatus, opError, s.postAsyncNotification) - if err != nil { - s.recordOperationError(ctx, pollNodePoolOperationLabel, err) - logger.Error(err, "Failed to update operation status") - } -} - -// pollExternalAuthOperation updates the status of an external auth operation. -func (s *OperationsScanner) pollExternalAuthOperation(ctx context.Context, op operation) { - logger := utils.LoggerFromContext(ctx) - ctx, span := startChildSpan(ctx, "pollExternalAuthOperation") - defer span.End() - defer s.updateOperationMetrics(pollExternalAuthOperationLabel)() - op.setSpanAttributes(span) - - _, err := s.clusterService.GetExternalAuth(ctx, op.doc.InternalID) - if err != nil { - var ocmError *ocmerrors.Error - if errors.As(err, &ocmError) && ocmError.Status() == http.StatusNotFound && op.doc.Request == database.OperationRequestDelete { - err = operationcontrollers.SetDeleteOperationAsCompleted(ctx, s.dbClient, op.doc, s.postAsyncNotification) - if err != nil { - s.recordOperationError(ctx, pollExternalAuthOperationLabel, err) - logger.Error(err, "Failed to handle a completed deletion") - } - } else { - s.recordOperationError(ctx, pollExternalAuthOperationLabel, err) - logger.Error(err, "Failed to get external auth status") - } - - return - } - err = operationcontrollers.UpdateOperationStatus(ctx, s.dbClient, op.doc, arm.ProvisioningStateSucceeded, nil, s.postAsyncNotification) - if err != nil { - s.recordOperationError(ctx, pollExternalAuthOperationLabel, err) - logger.Error(err, "Failed to update operation status") - } -} - // withSubscriptionLock holds a subscription lock while executing the given function. // In the event the subscription lock is lost, the context passed to the function will // be canceled. @@ -602,52 +489,6 @@ func (s *OperationsScanner) postAsyncNotification(ctx context.Context, operation return operationcontrollers.PostAsyncNotification(ctx, s.notificationClient, operation) } -// convertNodePoolStatus attempts to translate a NodePoolStatus object -// from Cluster Service into an ARM provisioning state and, if necessary, -// a structured OData error. -func convertNodePoolStatus(op operation, nodePoolStatus *arohcpv1alpha1.NodePoolStatus) (arm.ProvisioningState, *arm.CloudErrorBody, error) { - var opStatus = op.doc.Status - var opError *arm.CloudErrorBody - var err error - - switch state := NodePoolStateValue(nodePoolStatus.State().NodePoolStateValue()); state { - case NodePoolStateValidating, NodePoolStatePending, NodePoolStateValidatingUpdate, NodePoolStatePendingUpdate: - // These are valid node pool states for ARO-HCP but there are - // no unique ProvisioningState values for them. They should - // only occur when ProvisioningState is Accepted. - if opStatus != arm.ProvisioningStateAccepted { - err = fmt.Errorf("got NodePoolStatusValue '%s' while ProvisioningState was '%s' instead of '%s'", state, opStatus, arm.ProvisioningStateAccepted) - } - case NodePoolStateInstalling: - opStatus = arm.ProvisioningStateProvisioning - case NodePoolStateReady: - // Resource deletion is successful when fetching its state - // from Cluster Service returns a "404 Not Found" error. If - // we see the resource in a "Ready" state during a deletion - // operation, leave the current provisioning state as is. - if op.doc.Request != database.OperationRequestDelete { - opStatus = arm.ProvisioningStateSucceeded - } - case NodePoolStateUpdating: - opStatus = arm.ProvisioningStateUpdating - case NodePoolStateUninstalling: - opStatus = arm.ProvisioningStateDeleting - case NodePoolStateRecoverableError, NodePoolStateError: - // XXX OCM SDK offers no error code or message for failed node pool - // operations so "Internal Server Error" is all we can do for now. - // https://issues.redhat.com/browse/ARO-14969 - opStatus = arm.ProvisioningStateFailed - opError = arm.NewInternalServerError().CloudErrorBody - if msg, ok := nodePoolStatus.GetMessage(); ok { - opError.Message = msg - } - default: - err = fmt.Errorf("unhandled NodePoolState '%s'", state) - } - - return opStatus, opError, err -} - // StartRootSpan initiates a new parent trace. func StartRootSpan(ctx context.Context, name string) (context.Context, trace.Span) { return otel.GetTracerProvider(). diff --git a/backend/pkg/app/backend.go b/backend/pkg/app/backend.go index 29fe585ebf..3fb41a711a 100644 --- a/backend/pkg/app/backend.go +++ b/backend/pkg/app/backend.go @@ -20,7 +20,6 @@ import ( "fmt" "net/http" "sync" - "sync/atomic" "time" "github.com/prometheus/client_golang/prometheus" @@ -31,7 +30,6 @@ import ( "k8s.io/client-go/tools/leaderelection/resourcelock" utilsclock "k8s.io/utils/clock" - "github.com/Azure/ARO-HCP/backend/oldoperationscanner" "github.com/Azure/ARO-HCP/backend/pkg/controllers" "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" "github.com/Azure/ARO-HCP/backend/pkg/controllers/mismatchcontrollers" @@ -240,9 +238,6 @@ func (b *Backend) runBackendControllersUnderLeaderElection(ctx context.Context, activeOperationInformer, activeOperationLister := backendInformers.ActiveOperations() clusterInformer, _ := backendInformers.Clusters() - startedLeading := atomic.Bool{} - operationsScanner := oldoperationscanner.NewOperationsScanner( - b.options.CosmosDBClient, b.options.ClustersServiceClient, b.options.AzureLocation, subscriptionLister) dataDumpController := controllerutils.NewClusterWatchingController( "DataDump", b.options.CosmosDBClient, clusterInformer, 1*time.Minute, controllers.NewDataDumpController(activeOperationLister, b.options.CosmosDBClient)) doNothingController := controllers.NewDoNothingExampleController(b.options.CosmosDBClient, subscriptionLister) @@ -280,6 +275,72 @@ func (b *Backend) runBackendControllersUnderLeaderElection(ctx context.Context, activeOperationInformer, b.options.CosmosDBClient, ) + operationNodePoolCreateController := operationcontrollers.NewGenericOperationController( + "OperationNodePoolCreate", + operationcontrollers.NewOperationNodePoolCreateSynchronizer( + b.options.CosmosDBClient, + b.options.ClustersServiceClient, + http.DefaultClient, + ), + 10*time.Second, + activeOperationInformer, + b.options.CosmosDBClient, + ) + operationNodePoolUpdateController := operationcontrollers.NewGenericOperationController( + "OperationNodePoolUpdate", + operationcontrollers.NewOperationNodePoolUpdateSynchronizer( + b.options.CosmosDBClient, + b.options.ClustersServiceClient, + http.DefaultClient, + ), + 10*time.Second, + activeOperationInformer, + b.options.CosmosDBClient, + ) + operationNodePoolDeleteController := operationcontrollers.NewGenericOperationController( + "OperationNodePoolDelete", + operationcontrollers.NewOperationNodePoolDeleteSynchronizer( + b.options.CosmosDBClient, + b.options.ClustersServiceClient, + http.DefaultClient, + ), + 10*time.Second, + activeOperationInformer, + b.options.CosmosDBClient, + ) + operationExternalAuthCreateController := operationcontrollers.NewGenericOperationController( + "OperationExternalAuthCreate", + operationcontrollers.NewOperationExternalAuthCreateSynchronizer( + b.options.CosmosDBClient, + b.options.ClustersServiceClient, + http.DefaultClient, + ), + 10*time.Second, + activeOperationInformer, + b.options.CosmosDBClient, + ) + operationExternalAuthUpdateController := operationcontrollers.NewGenericOperationController( + "OperationExternalAuthUpdate", + operationcontrollers.NewOperationExternalAuthUpdateSynchronizer( + b.options.CosmosDBClient, + b.options.ClustersServiceClient, + http.DefaultClient, + ), + 10*time.Second, + activeOperationInformer, + b.options.CosmosDBClient, + ) + operationExternalAuthDeleteController := operationcontrollers.NewGenericOperationController( + "OperationExternalAuthDelete", + operationcontrollers.NewOperationExternalAuthDeleteSynchronizer( + b.options.CosmosDBClient, + b.options.ClustersServiceClient, + http.DefaultClient, + ), + 10*time.Second, + activeOperationInformer, + b.options.CosmosDBClient, + ) operationRequestCredentialController := operationcontrollers.NewGenericOperationController( "OperationRequestCredential", operationcontrollers.NewOperationRequestCredentialSynchronizer( @@ -333,18 +394,20 @@ func (b *Backend) runBackendControllersUnderLeaderElection(ctx context.Context, RetryPeriod: leaderElectionRetryPeriod, Callbacks: leaderelection.LeaderCallbacks{ OnStartedLeading: func(ctx context.Context) { - operationsScanner.LeaderGauge.Set(1) - startedLeading.Store(true) - // start the SharedInformers go backendInformers.RunWithContext(ctx) - go operationsScanner.Run(ctx) go dataDumpController.Run(ctx, 20) go doNothingController.Run(ctx, 20) go operationClusterCreateController.Run(ctx, 20) go operationClusterUpdateController.Run(ctx, 20) go operationClusterDeleteController.Run(ctx, 20) + go operationNodePoolCreateController.Run(ctx, 20) + go operationNodePoolUpdateController.Run(ctx, 20) + go operationNodePoolDeleteController.Run(ctx, 20) + go operationExternalAuthCreateController.Run(ctx, 20) + go operationExternalAuthUpdateController.Run(ctx, 20) + go operationExternalAuthDeleteController.Run(ctx, 20) go operationRequestCredentialController.Run(ctx, 20) go operationRevokeCredentialsController.Run(ctx, 20) go clusterServiceMatchingClusterController.Run(ctx, 20) @@ -356,10 +419,7 @@ func (b *Backend) runBackendControllersUnderLeaderElection(ctx context.Context, go triggerControlPlaneUpgradeController.Run(ctx, 20) }, OnStoppedLeading: func() { - operationsScanner.LeaderGauge.Set(0) - if startedLeading.Load() { - operationsScanner.Join() - } + // This needs to be defined even though it does nothing. }, }, ReleaseOnCancel: true, diff --git a/backend/pkg/controllers/operationcontrollers/operation_external_auth_create.go b/backend/pkg/controllers/operationcontrollers/operation_external_auth_create.go new file mode 100644 index 0000000000..e53b8f49f9 --- /dev/null +++ b/backend/pkg/controllers/operationcontrollers/operation_external_auth_create.go @@ -0,0 +1,79 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operationcontrollers + +import ( + "context" + "fmt" + "net/http" + "strings" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type operationExternalAuthCreate struct { + cosmosClient database.DBClient + clusterServiceClient ocm.ClusterServiceClientSpec + notificationClient *http.Client +} + +func NewOperationExternalAuthCreateSynchronizer( + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + notificationClient *http.Client, +) OperationSynchronizer { + c := &operationExternalAuthCreate{ + cosmosClient: cosmosClient, + clusterServiceClient: clusterServiceClient, + notificationClient: notificationClient, + } + + return c +} + +func (c *operationExternalAuthCreate) ShouldProcess(ctx context.Context, operation *api.Operation) bool { + if operation.Status.IsTerminal() { + return false + } + if operation.Request != database.OperationRequestCreate { + return false + } + if operation.ExternalID == nil || !strings.EqualFold(operation.ExternalID.ResourceType.String(), api.ExternalAuthResourceType.String()) { + return false + } + return true +} + +func (c *operationExternalAuthCreate) SynchronizeOperation(ctx context.Context, key controllerutils.OperationKey) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("checking operation") + + operation, err := c.cosmosClient.Operations(key.SubscriptionID).Get(ctx, key.OperationName) + if database.IsResponseError(err, http.StatusNotFound) { + return nil // no work to do + } + if err != nil { + return fmt.Errorf("failed to get active operation: %w", err) + } + if !c.ShouldProcess(ctx, operation) { + return nil // no work to do + } + + return pollExternalAuthStatus(ctx, c.cosmosClient, c.clusterServiceClient, operation, c.notificationClient) +} diff --git a/backend/pkg/controllers/operationcontrollers/operation_external_auth_delete.go b/backend/pkg/controllers/operationcontrollers/operation_external_auth_delete.go new file mode 100644 index 0000000000..5a2fc77a15 --- /dev/null +++ b/backend/pkg/controllers/operationcontrollers/operation_external_auth_delete.go @@ -0,0 +1,97 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operationcontrollers + +import ( + "context" + "errors" + "fmt" + "net/http" + "strings" + + ocmerrors "github.com/openshift-online/ocm-sdk-go/errors" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type operationExternalAuthDelete struct { + cosmosClient database.DBClient + clusterServiceClient ocm.ClusterServiceClientSpec + notificationClient *http.Client +} + +// NewOperationExternalAuthDeleteSynchronizer periodically lists all clusters and for each out when the cluster was deleted and its state. +func NewOperationExternalAuthDeleteSynchronizer( + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + notificationClient *http.Client, +) OperationSynchronizer { + c := &operationExternalAuthDelete{ + cosmosClient: cosmosClient, + clusterServiceClient: clusterServiceClient, + notificationClient: notificationClient, + } + + return c +} + +func (c *operationExternalAuthDelete) ShouldProcess(ctx context.Context, operation *api.Operation) bool { + if operation.Status.IsTerminal() { + return false + } + if operation.Request != database.OperationRequestDelete { + return false + } + if operation.ExternalID == nil || !strings.EqualFold(operation.ExternalID.ResourceType.String(), api.ExternalAuthResourceType.String()) { + return false + } + return true +} + +func (c *operationExternalAuthDelete) SynchronizeOperation(ctx context.Context, key controllerutils.OperationKey) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("checking operation") + + operation, err := c.cosmosClient.Operations(key.SubscriptionID).Get(ctx, key.OperationName) + if database.IsResponseError(err, http.StatusNotFound) { + return nil // no work to do + } + if err != nil { + return fmt.Errorf("failed to get active operation: %w", err) + } + if !c.ShouldProcess(ctx, operation) { + return nil // no work to do + } + + _, err = c.clusterServiceClient.GetExternalAuth(ctx, operation.InternalID) + var ocmGetExternalAuthError *ocmerrors.Error + if err != nil && errors.As(err, &ocmGetExternalAuthError) && ocmGetExternalAuthError.Status() == http.StatusNotFound { + logger.Info("node pool was deleted") + + err = SetDeleteOperationAsCompleted(ctx, c.cosmosClient, operation, postAsyncNotificationFn(c.notificationClient)) + if err != nil { + logger.Error(err, "Failed to handle a completed deletion") + } + } + if err != nil { + return utils.TrackError(err) + } + + return nil +} diff --git a/backend/pkg/controllers/operationcontrollers/operation_external_auth_update.go b/backend/pkg/controllers/operationcontrollers/operation_external_auth_update.go new file mode 100644 index 0000000000..ebc5395dad --- /dev/null +++ b/backend/pkg/controllers/operationcontrollers/operation_external_auth_update.go @@ -0,0 +1,84 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operationcontrollers + +import ( + "context" + "fmt" + "net/http" + "strings" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type operationExternalAuthUpdate struct { + cosmosClient database.DBClient + clusterServiceClient ocm.ClusterServiceClientSpec + notificationClient *http.Client +} + +func NewOperationExternalAuthUpdateSynchronizer( + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + notificationClient *http.Client, +) OperationSynchronizer { + c := &operationExternalAuthUpdate{ + cosmosClient: cosmosClient, + clusterServiceClient: clusterServiceClient, + notificationClient: notificationClient, + } + + return c +} + +func (c *operationExternalAuthUpdate) ShouldProcess(ctx context.Context, operation *api.Operation) bool { + if operation.Status.IsTerminal() { + return false + } + if operation.Request != database.OperationRequestUpdate { + return false + } + if operation.ExternalID == nil || !strings.EqualFold(operation.ExternalID.ResourceType.String(), api.ExternalAuthResourceType.String()) { + return false + } + return true +} + +func (c *operationExternalAuthUpdate) SynchronizeOperation(ctx context.Context, key controllerutils.OperationKey) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("checking operation") + + operation, err := c.cosmosClient.Operations(key.SubscriptionID).Get(ctx, key.OperationName) + if database.IsResponseError(err, http.StatusNotFound) { + return nil // no work to do + } + if err != nil { + return fmt.Errorf("failed to get active operation: %w", err) + } + if !c.ShouldProcess(ctx, operation) { + return nil // no work to do + } + + _, err = c.clusterServiceClient.GetExternalAuth(ctx, operation.InternalID) + if err != nil { + return utils.TrackError(err) + } + + return pollExternalAuthStatus(ctx, c.cosmosClient, c.clusterServiceClient, operation, c.notificationClient) +} diff --git a/backend/pkg/controllers/operationcontrollers/operation_node_pool_create.go b/backend/pkg/controllers/operationcontrollers/operation_node_pool_create.go new file mode 100644 index 0000000000..b0c4c899ea --- /dev/null +++ b/backend/pkg/controllers/operationcontrollers/operation_node_pool_create.go @@ -0,0 +1,79 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operationcontrollers + +import ( + "context" + "fmt" + "net/http" + "strings" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type operationNodePoolCreate struct { + cosmosClient database.DBClient + clusterServiceClient ocm.ClusterServiceClientSpec + notificationClient *http.Client +} + +func NewOperationNodePoolCreateSynchronizer( + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + notificationClient *http.Client, +) OperationSynchronizer { + c := &operationNodePoolCreate{ + cosmosClient: cosmosClient, + clusterServiceClient: clusterServiceClient, + notificationClient: notificationClient, + } + + return c +} + +func (c *operationNodePoolCreate) ShouldProcess(ctx context.Context, operation *api.Operation) bool { + if operation.Status.IsTerminal() { + return false + } + if operation.Request != database.OperationRequestCreate { + return false + } + if operation.ExternalID == nil || !strings.EqualFold(operation.ExternalID.ResourceType.String(), api.NodePoolResourceType.String()) { + return false + } + return true +} + +func (c *operationNodePoolCreate) SynchronizeOperation(ctx context.Context, key controllerutils.OperationKey) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("checking operation") + + operation, err := c.cosmosClient.Operations(key.SubscriptionID).Get(ctx, key.OperationName) + if database.IsResponseError(err, http.StatusNotFound) { + return nil // no work to do + } + if err != nil { + return fmt.Errorf("failed to get active operation: %w", err) + } + if !c.ShouldProcess(ctx, operation) { + return nil // no work to do + } + + return pollNodePoolStatus(ctx, c.cosmosClient, c.clusterServiceClient, operation, c.notificationClient) +} diff --git a/backend/pkg/controllers/operationcontrollers/operation_node_pool_delete.go b/backend/pkg/controllers/operationcontrollers/operation_node_pool_delete.go new file mode 100644 index 0000000000..6e6e67a147 --- /dev/null +++ b/backend/pkg/controllers/operationcontrollers/operation_node_pool_delete.go @@ -0,0 +1,107 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operationcontrollers + +import ( + "context" + "errors" + "fmt" + "net/http" + "strings" + + ocmerrors "github.com/openshift-online/ocm-sdk-go/errors" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type operationNodePoolDelete struct { + cosmosClient database.DBClient + clusterServiceClient ocm.ClusterServiceClientSpec + notificationClient *http.Client +} + +// NewOperationNodePoolDeleteSynchronizer periodically lists all clusters and for each out when the cluster was deleted and its state. +func NewOperationNodePoolDeleteSynchronizer( + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + notificationClient *http.Client, +) OperationSynchronizer { + c := &operationNodePoolDelete{ + cosmosClient: cosmosClient, + clusterServiceClient: clusterServiceClient, + notificationClient: notificationClient, + } + + return c +} + +func (c *operationNodePoolDelete) ShouldProcess(ctx context.Context, operation *api.Operation) bool { + if operation.Status.IsTerminal() { + return false + } + if operation.Request != database.OperationRequestDelete { + return false + } + if operation.ExternalID == nil || !strings.EqualFold(operation.ExternalID.ResourceType.String(), api.NodePoolResourceType.String()) { + return false + } + return true +} + +func (c *operationNodePoolDelete) SynchronizeOperation(ctx context.Context, key controllerutils.OperationKey) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("checking operation") + + operation, err := c.cosmosClient.Operations(key.SubscriptionID).Get(ctx, key.OperationName) + if database.IsResponseError(err, http.StatusNotFound) { + return nil // no work to do + } + if err != nil { + return fmt.Errorf("failed to get active operation: %w", err) + } + if !c.ShouldProcess(ctx, operation) { + return nil // no work to do + } + + clusterStatus, err := c.clusterServiceClient.GetNodePoolStatus(ctx, operation.InternalID) + var ocmGetNodePoolError *ocmerrors.Error + if err != nil && errors.As(err, &ocmGetNodePoolError) && ocmGetNodePoolError.Status() == http.StatusNotFound { + logger.Info("node pool was deleted") + + err = SetDeleteOperationAsCompleted(ctx, c.cosmosClient, operation, postAsyncNotificationFn(c.notificationClient)) + if err != nil { + logger.Error(err, "Failed to handle a completed deletion") + } + } + if err != nil { + return utils.TrackError(err) + } + + newOperationStatus, newOperationError, err := convertNodePoolStatus(operation, clusterStatus) + if err != nil { + return utils.TrackError(err) + } + + err = UpdateOperationStatus(ctx, c.cosmosClient, operation, newOperationStatus, newOperationError, postAsyncNotificationFn(c.notificationClient)) + if err != nil { + return utils.TrackError(err) + } + + return nil +} diff --git a/backend/pkg/controllers/operationcontrollers/operation_node_pool_update.go b/backend/pkg/controllers/operationcontrollers/operation_node_pool_update.go new file mode 100644 index 0000000000..39ef2784c6 --- /dev/null +++ b/backend/pkg/controllers/operationcontrollers/operation_node_pool_update.go @@ -0,0 +1,79 @@ +// Copyright 2025 Microsoft Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package operationcontrollers + +import ( + "context" + "fmt" + "net/http" + "strings" + + "github.com/Azure/ARO-HCP/backend/pkg/controllers/controllerutils" + "github.com/Azure/ARO-HCP/internal/api" + "github.com/Azure/ARO-HCP/internal/database" + "github.com/Azure/ARO-HCP/internal/ocm" + "github.com/Azure/ARO-HCP/internal/utils" +) + +type operationNodePoolUpdate struct { + cosmosClient database.DBClient + clusterServiceClient ocm.ClusterServiceClientSpec + notificationClient *http.Client +} + +func NewOperationNodePoolUpdateSynchronizer( + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + notificationClient *http.Client, +) OperationSynchronizer { + c := &operationNodePoolUpdate{ + cosmosClient: cosmosClient, + clusterServiceClient: clusterServiceClient, + notificationClient: notificationClient, + } + + return c +} + +func (c *operationNodePoolUpdate) ShouldProcess(ctx context.Context, operation *api.Operation) bool { + if operation.Status.IsTerminal() { + return false + } + if operation.Request != database.OperationRequestUpdate { + return false + } + if operation.ExternalID == nil || !strings.EqualFold(operation.ExternalID.ResourceType.String(), api.NodePoolResourceType.String()) { + return false + } + return true +} + +func (c *operationNodePoolUpdate) SynchronizeOperation(ctx context.Context, key controllerutils.OperationKey) error { + logger := utils.LoggerFromContext(ctx) + logger.Info("checking operation") + + operation, err := c.cosmosClient.Operations(key.SubscriptionID).Get(ctx, key.OperationName) + if database.IsResponseError(err, http.StatusNotFound) { + return nil // no work to do + } + if err != nil { + return fmt.Errorf("failed to get active operation: %w", err) + } + if !c.ShouldProcess(ctx, operation) { + return nil // no work to do + } + + return pollNodePoolStatus(ctx, c.cosmosClient, c.clusterServiceClient, operation, c.notificationClient) +} diff --git a/backend/pkg/controllers/operationcontrollers/utils.go b/backend/pkg/controllers/operationcontrollers/utils.go index b8b8babae8..85c4efbf88 100644 --- a/backend/pkg/controllers/operationcontrollers/utils.go +++ b/backend/pkg/controllers/operationcontrollers/utils.go @@ -44,6 +44,23 @@ var localClock clock.Clock = clock.RealClock{} type PostAsyncNotificationFunc func(ctx context.Context, operation *api.Operation) error +// Copied from uhc-clusters-service, because the +// OCM SDK does not define this for some reason. +type NodePoolStateValue string + +const ( + NodePoolStateValidating NodePoolStateValue = "validating" + NodePoolStatePending NodePoolStateValue = "pending" + NodePoolStateInstalling NodePoolStateValue = "installing" + NodePoolStateReady NodePoolStateValue = "ready" + NodePoolStateUpdating NodePoolStateValue = "updating" + NodePoolStateValidatingUpdate NodePoolStateValue = "validating_update" + NodePoolStatePendingUpdate NodePoolStateValue = "pending_update" + NodePoolStateUninstalling NodePoolStateValue = "uninstalling" + NodePoolStateRecoverableError NodePoolStateValue = "recoverable_error" + NodePoolStateError NodePoolStateValue = "error" +) + // UpdateOperationStatus updates Cosmos DB to reflect an updated resource status. func UpdateOperationStatus(ctx context.Context, cosmosClient database.DBClient, existingOperation *api.Operation, newOperationStatus arm.ProvisioningState, newOperationError *arm.CloudErrorBody, postAsyncNotificationFn PostAsyncNotificationFunc) error { logger := utils.LoggerFromContext(ctx) @@ -341,6 +358,121 @@ func convertClusterStatus(ctx context.Context, clusterServiceClient ocm.ClusterS return newOperationStatus, opError, err } +// pollNodePoolStatus converts a node pool status from Cluster +// Service to info for an Azure async operation status endpoint. +func pollNodePoolStatus( + ctx context.Context, + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + operation *api.Operation, + notificationClient *http.Client) error { + // XXX This is currently called by the operationNodePoolCreate and + // operationNodePoolUpdate controllers because the logic flows + // are identical. If the logic flows ever diverge, then this + // function should be split up and the pieces moved back to + // their respective controllers. + + logger := utils.LoggerFromContext(ctx) + + nodePoolStatus, err := clusterServiceClient.GetNodePoolStatus(ctx, operation.InternalID) + if err != nil { + return utils.TrackError(err) + } + + newOperationStatus, newOperationError, err := convertNodePoolStatus(operation, nodePoolStatus) + if err != nil { + return utils.TrackError(err) + } + logger.Info("new status", "newStatus", newOperationStatus) + + logger.Info("updating status") + err = UpdateOperationStatus(ctx, cosmosClient, operation, newOperationStatus, newOperationError, postAsyncNotificationFn(notificationClient)) + if err != nil { + return utils.TrackError(err) + } + + return nil +} + +// convertNodePoolStatus attempts to translate a NodePoolStatus object +// from Cluster Service into an ARM provisioning state and, if necessary, +// a structured OData error. +func convertNodePoolStatus(operation *api.Operation, nodePoolStatus *arohcpv1alpha1.NodePoolStatus) (arm.ProvisioningState, *arm.CloudErrorBody, error) { + var newOperationStatus = operation.Status + var opError *arm.CloudErrorBody + var err error + + switch state := NodePoolStateValue(nodePoolStatus.State().NodePoolStateValue()); state { + case NodePoolStateValidating, NodePoolStatePending, NodePoolStateValidatingUpdate, NodePoolStatePendingUpdate: + // These are valid node pool states for ARO-HCP but there are + // no unique ProvisioningState values for them. They should + // only occur when ProvisioningState is Accepted. + if operation.Status != arm.ProvisioningStateAccepted { + err = fmt.Errorf("got NodePoolStatusValue '%s' while ProvisioningState was '%s' instead of '%s'", state, operation.Status, arm.ProvisioningStateAccepted) + } + case NodePoolStateInstalling: + newOperationStatus = arm.ProvisioningStateProvisioning + case NodePoolStateReady: + // Resource deletion is successful when fetching its state + // from Cluster Service returns a "404 Not Found" error. If + // we see the resource in a "Ready" state during a deletion + // operation, leave the current provisioning state as is. + if operation.Request != database.OperationRequestDelete { + newOperationStatus = arm.ProvisioningStateSucceeded + } + case NodePoolStateUpdating: + newOperationStatus = arm.ProvisioningStateUpdating + case NodePoolStateUninstalling: + newOperationStatus = arm.ProvisioningStateDeleting + case NodePoolStateRecoverableError, NodePoolStateError: + // XXX OCM SDK offers no error code or message for failed node pool + // operations so "Internal Server Error" is all we can do for now. + // https://issues.redhat.com/browse/ARO-14969 + newOperationStatus = arm.ProvisioningStateFailed + opError = arm.NewInternalServerError().CloudErrorBody + if msg, ok := nodePoolStatus.GetMessage(); ok { + opError.Message = msg + } + default: + err = fmt.Errorf("unhandled NodePoolState '%s'", state) + } + + return newOperationStatus, opError, err +} + +// pollExternalAuthStatus converts an external auth status from Cluster +// Service to info for an Azure async operation status endpoint. +func pollExternalAuthStatus( + ctx context.Context, + cosmosClient database.DBClient, + clusterServiceClient ocm.ClusterServiceClientSpec, + operation *api.Operation, + notificationClient *http.Client) error { + // XXX This is currently called by the operationExternalAuthCreate and + // operationExternalAuthUpdate controllers because the logic flows + // are identical. If the logic flows ever diverge, then this + // function should be split up and the pieces moved back to + // their respective controllers. + + logger := utils.LoggerFromContext(ctx) + + _, err := clusterServiceClient.GetExternalAuth(ctx, operation.InternalID) + if err != nil { + return utils.TrackError(err) + } + + newOperationStatus := arm.ProvisioningStateSucceeded + logger.Info("new status", "newStatus", newOperationStatus) + + logger.Info("updating status") + err = UpdateOperationStatus(ctx, cosmosClient, operation, newOperationStatus, nil, postAsyncNotificationFn(notificationClient)) + if err != nil { + return utils.TrackError(err) + } + + return nil +} + // convertInflightChecks gets a cluster internal ID, fetches inflight check errors from CS endpoint, and converts them // to arm.CloudErrorBody type. // The function should be triggered only if inflight errors occurred with provision error code OCM4001.