diff --git a/CHANGELOG.md b/CHANGELOG.md index 86ca367d0..c186c94ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ - (Feature) (Platform) Enable HTTP to HTTPS Redirect - (Bugfix) (Platform) Fix NoAuth Mode - (Feature) Define OptionalReplace Upgrade Mode option +- (Feature) OptionalReplace Upgrade Mode ## [1.2.50](https://github.com/arangodb/kube-arangodb/tree/1.2.50) (2025-07-04) - (Feature) (Platform) MetaV1 Integration Service diff --git a/pkg/deployment/reconcile/action_compact_member.go b/pkg/deployment/reconcile/action_compact_member.go index 36f84d576..14648eb0a 100644 --- a/pkg/deployment/reconcile/action_compact_member.go +++ b/pkg/deployment/reconcile/action_compact_member.go @@ -54,7 +54,7 @@ func (a *actionCompactMember) Start(ctx context.Context) (bool, error) { } switch g { - case api.ServerGroupDBServers: + case api.ServerGroupDBServers, api.ServerGroupSingle: dbc, err := a.actionCtx.GetServerAsyncClient(m.ID) if err != nil { return false, errors.Wrapf(err, "Unable to create client") @@ -91,7 +91,7 @@ func (a actionCompactMember) CheckProgress(ctx context.Context) (bool, bool, err } switch g { - case api.ServerGroupDBServers: + case api.ServerGroupDBServers, api.ServerGroupSingle: dbc, err := a.actionCtx.GetServerAsyncClient(m.ID) if err != nil { return false, false, errors.Wrapf(err, "Unable to create client") diff --git a/pkg/deployment/reconcile/plan_builder_rotate_upgrade.go b/pkg/deployment/reconcile/plan_builder_rotate_upgrade.go index 8b77ebfca..d3f1f7a31 100644 --- a/pkg/deployment/reconcile/plan_builder_rotate_upgrade.go +++ b/pkg/deployment/reconcile/plan_builder_rotate_upgrade.go @@ -287,6 +287,11 @@ func (r *Reconciler) createUpgradePlanInternal(apiObject k8sutil.APIObject, spec return nil, false } + if m.Member.Conditions.IsTrue(api.ConditionTypeMarkedToRemove) { + // Server is supposed to be removed, lets not continue + return nil, false + } + if decision.AreGroupsPendingUpgrade(upgradeOrder[:id]...) 
{ // Not all members before were upgraded return nil, false @@ -313,20 +318,19 @@ func (r *Reconciler) createUpgradePlanInternal(apiObject k8sutil.APIObject, spec switch group { case api.ServerGroupDBServers: // Members are supposed to be replaced - if !m.Member.Conditions.IsTrue(api.ConditionTypeMarkedToRemove) { - return api.Plan{actions.NewAction(api.ActionTypeMarkToRemoveMember, m.Group, m.Member, "Replace by Upgrade")}, false - } - return nil, false + return api.Plan{actions.NewAction(api.ActionTypeMarkToRemoveMember, m.Group, m.Member, "Replace by Upgrade")}, false } } + compact := um.Get() == api.ServerGroupUpgradeModeOptionalReplace + if d.updateAllowed { // We are fine, group is alive so we can proceed r.planLogger.Str("member", m.Member.ID).Str("Reason", d.updateMessage).Info("Upgrade allowed") - return r.createUpgradeMemberPlan(m.Member, m.Group, "Version upgrade", spec, status, !d.upgradeDecision.AutoUpgradeNeeded, agencyCache), false + return r.createUpgradeMemberPlan(m.Member, m.Group, "Version upgrade", spec, status, !d.upgradeDecision.AutoUpgradeNeeded, compact, agencyCache), false } else if d.unsafeUpdateAllowed { r.planLogger.Str("member", m.Member.ID).Str("Reason", d.updateMessage).Info("Pod needs upgrade but cluster is not ready. Either some shards are not in sync or some member is not ready, but unsafe upgrade is allowed") - return r.createUpgradeMemberPlan(m.Member, m.Group, "Version upgrade", spec, status, !d.upgradeDecision.AutoUpgradeNeeded, agencyCache), false + return r.createUpgradeMemberPlan(m.Member, m.Group, "Version upgrade", spec, status, !d.upgradeDecision.AutoUpgradeNeeded, compact, agencyCache), false } else { r.planLogger.Str("member", m.Member.ID).Str("Reason", d.updateMessage).Info("Pod needs upgrade but cluster is not ready. 
Either some shards are not in sync or some member is not ready.") return nil, true @@ -596,7 +600,7 @@ func (r *Reconciler) createMemberAllowUpgradeConditionPlan(ctx context.Context, // createUpgradeMemberPlan creates a plan to upgrade (stop-recreateWithAutoUpgrade-stop-start) an existing // member. func (r *Reconciler) createUpgradeMemberPlan(member api.MemberStatus, - group api.ServerGroup, reason string, spec api.DeploymentSpec, status api.DeploymentStatus, rotateStatefull bool, agencyCache state.State) api.Plan { + group api.ServerGroup, reason string, spec api.DeploymentSpec, status api.DeploymentStatus, rotateStatefull, compact bool, agencyCache state.State) api.Plan { upgradeAction := api.ActionTypeUpgradeMember if rotateStatefull || group.IsStateless() { upgradeAction = api.ActionTypeRotateMember @@ -608,7 +612,7 @@ func (r *Reconciler) createUpgradeMemberPlan(member api.MemberStatus, Str("action", string(upgradeAction)). Info("Creating upgrade plan") - plan := createRotateMemberPlanWithAction(member, group, upgradeAction, spec, reason, util.CheckConditionalP1Nil(agencyCache.GetRebootID, state.Server(member.ID))) + plan := createRotateMemberPlanWithAction(member, group, upgradeAction, spec, reason, util.CheckConditionalP1Nil(agencyCache.GetRebootID, state.Server(member.ID)), compact) if member.Image == nil || member.Image.Image != spec.GetImage() { plan = plan.Before(actions.NewAction(api.ActionTypeSetMemberCurrentImage, group, member, reason).SetImage(spec.GetImage())) diff --git a/pkg/deployment/reconcile/plan_builder_utils.go b/pkg/deployment/reconcile/plan_builder_utils.go index c423f699e..c76d17a12 100644 --- a/pkg/deployment/reconcile/plan_builder_utils.go +++ b/pkg/deployment/reconcile/plan_builder_utils.go @@ -37,19 +37,25 @@ func (r *Reconciler) createRotateMemberPlan(member api.MemberStatus, Str("role", group.AsRole()). Str("reason", reason). 
Debug("Creating rotation plan") - return createRotateMemberPlanWithAction(member, group, api.ActionTypeRotateMember, spec, reason, rebootId) + return createRotateMemberPlanWithAction(member, group, api.ActionTypeRotateMember, spec, reason, rebootId, false) } // createRotateMemberPlanWithAction creates a plan to rotate (stop->-start) an existing // member. func createRotateMemberPlanWithAction(member api.MemberStatus, - group api.ServerGroup, action api.ActionType, spec api.DeploymentSpec, reason string, rebootId *int) api.Plan { + group api.ServerGroup, action api.ActionType, spec api.DeploymentSpec, reason string, rebootId *int, compact bool) api.Plan { var plan = api.Plan{ actions.NewAction(api.ActionTypeCleanTLSKeyfileCertificate, group, member, "Remove server keyfile and enforce renewal/recreation"), } plan = withSecureWrap(member, group, spec, rebootId, plan...) + if compact { + plan = plan.After( + actions.NewAction(api.ActionTypeCompactMember, group, member, reason), + ) + } + plan = plan.After( actions.NewAction(api.ActionTypeKillMemberPod, group, member, reason), actions.NewAction(action, group, member, reason), diff --git a/pkg/deployment/resources/pod_creator_arangod.go b/pkg/deployment/resources/pod_creator_arangod.go index d444402b3..a0592c46e 100644 --- a/pkg/deployment/resources/pod_creator_arangod.go +++ b/pkg/deployment/resources/pod_creator_arangod.go @@ -638,6 +638,13 @@ func (a *ArangoUpgradeContainer) GetCommand() ([]string, error) { pod.UpgradeDebug().Args(a.input).Sort().AsArgs()..., ) + if a.input.Group == api.ServerGroupDBServers || a.input.Group == api.ServerGroupSingle { + if a.input.GroupSpec.UpgradeMode.Get() == api.ServerGroupUpgradeModeOptionalReplace || + (a.input.GroupSpec.UpgradeMode.Get() == api.ServerGroupUpgradeModeManual && a.input.GroupSpec.ManualUpgradeMode.Get() == api.ServerGroupUpgradeModeOptionalReplace) { + upgradeArgs = append(upgradeArgs, "--database.auto-upgrade-full-compaction") + } + } + return append(args, 
upgradeArgs...), nil } diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go index 585def28b..7cb2382c6 100644 --- a/pkg/deployment/resources/pod_inspector.go +++ b/pkg/deployment/resources/pod_inspector.go @@ -23,6 +23,7 @@ package resources import ( "context" "fmt" + "slices" goStrings "strings" "time" @@ -136,13 +137,24 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter switch containers[id] { case api.ServerGroupReservedInitContainerNameVersionCheck: if c, ok := kresources.GetAnyContainerStatusByName(pod.Status.InitContainerStatuses, containers[id]); ok { - if t := c.State.Terminated; t != nil && t.ExitCode == 11 { + if t := c.State.Terminated; t != nil && t.ExitCode == constants.ArangoDBExitCodeUpgradeRequired { memberStatus.Upgrade = true updateMemberStatusNeeded = true } } case api.ServerGroupReservedInitContainerNameUpgrade: memberStatus.Conditions.Update(api.ConditionTypeUpgradeFailed, true, "Upgrade Failed", "") + if group == api.ServerGroupDBServers { + if c, ok := kresources.GetAnyContainerStatusByName(pod.Status.InitContainerStatuses, containers[id]); ok { + if t := c.State.Terminated; t != nil && slices.Contains([]int32{ + constants.ArangoDBExitCodeUpgradeFailedCompaction, + //constants.ContainerExitCodeSegmentationFault, // Also in case of Segv + constants.ArangoDBExitCodeInvalidArgument, // If Arg is not known + }, t.ExitCode) { + memberStatus.Conditions.Update(api.ConditionTypeMarkedToRemove, true, "Replace Required due to the mismatch", "") + } + } + } } } } diff --git a/pkg/util/constants/adb_codes.go b/pkg/util/constants/adb_codes.go new file mode 100644 index 000000000..2c17bb2a3 --- /dev/null +++ b/pkg/util/constants/adb_codes.go @@ -0,0 +1,29 @@ +// +// DISCLAIMER +// +// Copyright 2025 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// + +package constants + +const ( + ArangoDBExitCodeInvalidArgument = 3 // Upgrade init container exit code treated by the pod inspector as "argument is not known"; a DBServer terminating with it is marked for replacement. NOTE(review): exact arangod meaning not shown in this patch, confirm. + ArangoDBExitCodeUpgradeRequired = 11 // Version-check init container exit code; the pod inspector sets memberStatus.Upgrade when it sees it. + ArangoDBExitCodeUpgradeFailedCompaction = 30 // Upgrade init container exit code; presumably the upgrade-time full compaction failed (TODO confirm). A DBServer terminating with it is marked for replacement. + + ContainerExitCodeSegmentationFault = 139 // Presumably 128 plus SIGSEGV (11), TODO confirm; within this patch it is referenced only from commented-out code in the pod inspector. +)