Skip to content

[Feature] OptionalReplace Upgrade Mode #1939

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
- (Feature) (Platform) Enable HTTP to HTTPS Redirect
- (Bugfix) (Platform) Fix NoAuth Mode
- (Feature) Define OptionalReplace Upgrade Mode option
- (Feature) OptionalReplace Upgrade Mode

## [1.2.50](https://github.com/arangodb/kube-arangodb/tree/1.2.50) (2025-07-04)
- (Feature) (Platform) MetaV1 Integration Service
Expand Down
4 changes: 2 additions & 2 deletions pkg/deployment/reconcile/action_compact_member.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func (a *actionCompactMember) Start(ctx context.Context) (bool, error) {
}

switch g {
case api.ServerGroupDBServers:
case api.ServerGroupDBServers, api.ServerGroupSingle:
dbc, err := a.actionCtx.GetServerAsyncClient(m.ID)
if err != nil {
return false, errors.Wrapf(err, "Unable to create client")
Expand Down Expand Up @@ -91,7 +91,7 @@ func (a actionCompactMember) CheckProgress(ctx context.Context) (bool, bool, err
}

switch g {
case api.ServerGroupDBServers:
case api.ServerGroupDBServers, api.ServerGroupSingle:
dbc, err := a.actionCtx.GetServerAsyncClient(m.ID)
if err != nil {
return false, false, errors.Wrapf(err, "Unable to create client")
Expand Down
20 changes: 12 additions & 8 deletions pkg/deployment/reconcile/plan_builder_rotate_upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,11 @@ func (r *Reconciler) createUpgradePlanInternal(apiObject k8sutil.APIObject, spec
return nil, false
}

if m.Member.Conditions.IsTrue(api.ConditionTypeMarkedToRemove) {
// Server is supposed to be removed, let's not continue
return nil, false
}

if decision.AreGroupsPendingUpgrade(upgradeOrder[:id]...) {
// Not all members before were upgraded
return nil, false
Expand All @@ -313,20 +318,19 @@ func (r *Reconciler) createUpgradePlanInternal(apiObject k8sutil.APIObject, spec
switch group {
case api.ServerGroupDBServers:
// Members are supposed to be replaced
if !m.Member.Conditions.IsTrue(api.ConditionTypeMarkedToRemove) {
return api.Plan{actions.NewAction(api.ActionTypeMarkToRemoveMember, m.Group, m.Member, "Replace by Upgrade")}, false
}
return nil, false
return api.Plan{actions.NewAction(api.ActionTypeMarkToRemoveMember, m.Group, m.Member, "Replace by Upgrade")}, false
Copy link
Preview

Copilot AI Jul 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line appears to be missing proper indentation and logical structure. The condition check for !m.Member.Conditions.IsTrue(api.ConditionTypeMarkedToRemove) was removed but the action is still being executed unconditionally.

Suggested change
return api.Plan{actions.NewAction(api.ActionTypeMarkToRemoveMember, m.Group, m.Member, "Replace by Upgrade")}, false
if !m.Member.Conditions.IsTrue(api.ConditionTypeMarkedToRemove) {
return api.Plan{actions.NewAction(api.ActionTypeMarkToRemoveMember, m.Group, m.Member, "Replace by Upgrade")}, false
}

Copilot uses AI. Check for mistakes.

}
Copy link
Preview

Copilot AI Jul 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This closing brace appears to be orphaned after the removal of the condition check and return statement on the previous lines, which could cause compilation errors.

Suggested change
}
}
default:
r.planLogger.Str("member", m.Member.ID).Str("UpgradeMode", string(um.Get())).Error("Unhandled upgrade mode")
return nil, true

Copilot uses AI. Check for mistakes.

}

compact := um.Get() == api.ServerGroupUpgradeModeOptionalReplace

if d.updateAllowed {
// We are fine, group is alive so we can proceed
r.planLogger.Str("member", m.Member.ID).Str("Reason", d.updateMessage).Info("Upgrade allowed")
return r.createUpgradeMemberPlan(m.Member, m.Group, "Version upgrade", spec, status, !d.upgradeDecision.AutoUpgradeNeeded, agencyCache), false
return r.createUpgradeMemberPlan(m.Member, m.Group, "Version upgrade", spec, status, !d.upgradeDecision.AutoUpgradeNeeded, compact, agencyCache), false
} else if d.unsafeUpdateAllowed {
r.planLogger.Str("member", m.Member.ID).Str("Reason", d.updateMessage).Info("Pod needs upgrade but cluster is not ready. Either some shards are not in sync or some member is not ready, but unsafe upgrade is allowed")
return r.createUpgradeMemberPlan(m.Member, m.Group, "Version upgrade", spec, status, !d.upgradeDecision.AutoUpgradeNeeded, agencyCache), false
return r.createUpgradeMemberPlan(m.Member, m.Group, "Version upgrade", spec, status, !d.upgradeDecision.AutoUpgradeNeeded, compact, agencyCache), false
} else {
r.planLogger.Str("member", m.Member.ID).Str("Reason", d.updateMessage).Info("Pod needs upgrade but cluster is not ready. Either some shards are not in sync or some member is not ready.")
return nil, true
Expand Down Expand Up @@ -596,7 +600,7 @@ func (r *Reconciler) createMemberAllowUpgradeConditionPlan(ctx context.Context,
// createUpgradeMemberPlan creates a plan to upgrade (stop-recreateWithAutoUpgrade-stop-start) an existing
// member.
func (r *Reconciler) createUpgradeMemberPlan(member api.MemberStatus,
group api.ServerGroup, reason string, spec api.DeploymentSpec, status api.DeploymentStatus, rotateStatefull bool, agencyCache state.State) api.Plan {
group api.ServerGroup, reason string, spec api.DeploymentSpec, status api.DeploymentStatus, rotateStatefull, compact bool, agencyCache state.State) api.Plan {
upgradeAction := api.ActionTypeUpgradeMember
if rotateStatefull || group.IsStateless() {
upgradeAction = api.ActionTypeRotateMember
Expand All @@ -608,7 +612,7 @@ func (r *Reconciler) createUpgradeMemberPlan(member api.MemberStatus,
Str("action", string(upgradeAction)).
Info("Creating upgrade plan")

plan := createRotateMemberPlanWithAction(member, group, upgradeAction, spec, reason, util.CheckConditionalP1Nil(agencyCache.GetRebootID, state.Server(member.ID)))
plan := createRotateMemberPlanWithAction(member, group, upgradeAction, spec, reason, util.CheckConditionalP1Nil(agencyCache.GetRebootID, state.Server(member.ID)), compact)

if member.Image == nil || member.Image.Image != spec.GetImage() {
plan = plan.Before(actions.NewAction(api.ActionTypeSetMemberCurrentImage, group, member, reason).SetImage(spec.GetImage()))
Expand Down
10 changes: 8 additions & 2 deletions pkg/deployment/reconcile/plan_builder_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,25 @@ func (r *Reconciler) createRotateMemberPlan(member api.MemberStatus,
Str("role", group.AsRole()).
Str("reason", reason).
Debug("Creating rotation plan")
return createRotateMemberPlanWithAction(member, group, api.ActionTypeRotateMember, spec, reason, rebootId)
return createRotateMemberPlanWithAction(member, group, api.ActionTypeRotateMember, spec, reason, rebootId, false)
}

// createRotateMemberPlanWithAction creates a plan to rotate (stop-<action>-start) an existing
// member.
func createRotateMemberPlanWithAction(member api.MemberStatus,
group api.ServerGroup, action api.ActionType, spec api.DeploymentSpec, reason string, rebootId *int) api.Plan {
group api.ServerGroup, action api.ActionType, spec api.DeploymentSpec, reason string, rebootId *int, compact bool) api.Plan {

var plan = api.Plan{
actions.NewAction(api.ActionTypeCleanTLSKeyfileCertificate, group, member, "Remove server keyfile and enforce renewal/recreation"),
}
plan = withSecureWrap(member, group, spec, rebootId, plan...)

if compact {
plan = plan.After(
actions.NewAction(api.ActionTypeCompactMember, group, member, reason),
)
}

plan = plan.After(
actions.NewAction(api.ActionTypeKillMemberPod, group, member, reason),
actions.NewAction(action, group, member, reason),
Expand Down
7 changes: 7 additions & 0 deletions pkg/deployment/resources/pod_creator_arangod.go
Original file line number Diff line number Diff line change
Expand Up @@ -638,6 +638,13 @@ func (a *ArangoUpgradeContainer) GetCommand() ([]string, error) {
pod.UpgradeDebug().Args(a.input).Sort().AsArgs()...,
)

if a.input.Group == api.ServerGroupDBServers || a.input.Group == api.ServerGroupSingle {
if a.input.GroupSpec.UpgradeMode.Get() == api.ServerGroupUpgradeModeOptionalReplace ||
(a.input.GroupSpec.UpgradeMode.Get() == api.ServerGroupUpgradeModeManual && a.input.GroupSpec.ManualUpgradeMode.Get() == api.ServerGroupUpgradeModeOptionalReplace) {
upgradeArgs = append(upgradeArgs, "--database.auto-upgrade-full-compaction")
}
}

return append(args, upgradeArgs...), nil
}

Expand Down
14 changes: 13 additions & 1 deletion pkg/deployment/resources/pod_inspector.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ package resources
import (
"context"
"fmt"
"slices"
goStrings "strings"
"time"

Expand Down Expand Up @@ -136,13 +137,24 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
switch containers[id] {
case api.ServerGroupReservedInitContainerNameVersionCheck:
if c, ok := kresources.GetAnyContainerStatusByName(pod.Status.InitContainerStatuses, containers[id]); ok {
if t := c.State.Terminated; t != nil && t.ExitCode == 11 {
if t := c.State.Terminated; t != nil && t.ExitCode == constants.ArangoDBExitCodeUpgradeRequired {
memberStatus.Upgrade = true
updateMemberStatusNeeded = true
}
}
case api.ServerGroupReservedInitContainerNameUpgrade:
memberStatus.Conditions.Update(api.ConditionTypeUpgradeFailed, true, "Upgrade Failed", "")
if group == api.ServerGroupDBServers {
if c, ok := kresources.GetAnyContainerStatusByName(pod.Status.InitContainerStatuses, containers[id]); ok {
if t := c.State.Terminated; t != nil && slices.Contains([]int32{
constants.ArangoDBExitCodeUpgradeFailedCompaction,
//constants.ContainerExitCodeSegmentationFault, // Also in case of Segv
Copy link
Preview

Copilot AI Jul 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove commented code or implement it. The TODO-style comment suggests this might be needed but is commented out without clear reasoning.

Suggested change
//constants.ContainerExitCodeSegmentationFault, // Also in case of Segv
constants.ContainerExitCodeSegmentationFault, // Also in case of Segv

Copilot uses AI. Check for mistakes.

constants.ArangoDBExitCodeInvalidArgument, // If Arg is not known
}, t.ExitCode) {
memberStatus.Conditions.Update(api.ConditionTypeMarkedToRemove, true, "Replace Required due to the mismatch", "")
}
}
}
}
}
}
Expand Down
29 changes: 29 additions & 0 deletions pkg/util/constants/adb_codes.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
//
// DISCLAIMER
//
// Copyright 2025 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright holder is ArangoDB GmbH, Cologne, Germany
//

package constants

// Exit codes associated with the ArangoDB server process.
//
// NOTE(review): the meanings below are taken from the constant names and
// their visible call sites — confirm against the ArangoDB exit code list.
const (
	// ArangoDBExitCodeInvalidArgument is the exit code for an invalid
	// (for example unknown) command line argument.
	ArangoDBExitCodeInvalidArgument = 3

	// ArangoDBExitCodeUpgradeRequired is the exit code signalling that a
	// database upgrade is required.
	ArangoDBExitCodeUpgradeRequired = 11

	// ArangoDBExitCodeUpgradeFailedCompaction is the exit code signalling
	// that an upgrade failed during compaction.
	ArangoDBExitCodeUpgradeFailedCompaction = 30
)

// Generic container exit codes, not specific to ArangoDB.
const (
	// ContainerExitCodeSegmentationFault is the exit code of a container
	// terminated by a segmentation fault (presumably 128 + SIGSEGV).
	ContainerExitCodeSegmentationFault = 139
)