Skip to content

Commit 6dfb485

Browse files
authored
[Feature] Remove stuck Pods (#1148)
1 parent 16aed13 commit 6dfb485

File tree

4 files changed

+48
-2
lines changed

4 files changed

+48
-2
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
- (Feature) PVC Member Status info
1111
- (Feature) Respect ToBeCleanedServers in Agency
1212
- (Improvement) Unify K8S Error Handling
13+
- (Feature) Remove stuck Pods
1314

1415
## [1.2.19](https://github.com/arangodb/kube-arangodb/tree/1.2.19) (2022-10-05)
1516
- (Bugfix) Prevent changes when UID is wrong

pkg/deployment/resources/pod_inspector.go

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@ import (
3737
"github.com/arangodb/kube-arangodb/pkg/metrics"
3838
"github.com/arangodb/kube-arangodb/pkg/util"
3939
"github.com/arangodb/kube-arangodb/pkg/util/errors"
40+
"github.com/arangodb/kube-arangodb/pkg/util/globals"
4041
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
4142
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/info"
4243
inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
@@ -372,6 +373,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
372373
updateMemberStatusNeeded = true
373374
log.Str("pod-name", pod.GetName()).Debug("Pod marked as terminating")
374375
}
376+
375377
// Process finalizers
376378
if x, err := r.runPodFinalizers(ctx, pod, memberStatus, func(m api.MemberStatus) error {
377379
updateMemberStatusNeeded = true
@@ -383,6 +385,26 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
383385
} else {
384386
nextInterval = nextInterval.ReduceTo(x)
385387
}
388+
389+
// Check if any additional deletion request is required
390+
if !k8sutil.IsPodAlive(pod) {
391+
// Pod is dead, but still not removed. Send additional deletion request
392+
nctx, c := globals.GetGlobals().Timeouts().Kubernetes().WithTimeout(ctx)
393+
defer c()
394+
395+
if err := cachedStatus.PodsModInterface().V1().Delete(nctx, pod.GetName(), meta.DeleteOptions{
396+
GracePeriodSeconds: util.NewInt64(10),
397+
Preconditions: meta.NewUIDPreconditions(string(pod.GetUID())),
398+
}); err != nil {
399+
if k8sutil.IsNotFound(err) {
400+
// Pod is already gone, we are fine with it
401+
} else if k8sutil.IsConflict(err) {
402+
log.Warn("UID of Pod Changed")
403+
} else {
404+
log.Err(err).Error("Unknown error while deleting Pod")
405+
}
406+
}
407+
}
386408
}
387409

388410
if updateMemberStatusNeeded {

pkg/util/k8sutil/errors.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -75,11 +75,11 @@ func isNotFoundC(err error) bool {
7575
// IsInvalid returns true if the given error is or is caused by a
7676
// kubernetes InvalidError,
7777
func IsInvalid(err error) bool {
78-
return apierrors.IsInvalid(errors.Cause(err))
78+
return isError(err, isInvalidC)
7979
}
8080

8181
func isInvalidC(err error) bool {
82-
return isError(err, isInvalidC)
82+
return apierrors.IsInvalid(errors.Cause(err))
8383
}
8484

8585
// IsForbiddenOrNotFound returns true if the given error is or is caused by a

pkg/util/k8sutil/pods.go

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -360,6 +360,29 @@ func MasterJWTVolumeMount() core.VolumeMount {
360360
}
361361
}
362362

363+
// IsPodAlive returns true if any of the containers within pod is running
364+
func IsPodAlive(pod *core.Pod) bool {
365+
return IsAnyContainerAlive(pod.Status.ContainerStatuses) ||
366+
IsAnyContainerAlive(pod.Status.InitContainerStatuses) ||
367+
IsAnyContainerAlive(pod.Status.EphemeralContainerStatuses)
368+
}
369+
370+
// IsAnyContainerAlive returns true if any of the containers is running
371+
func IsAnyContainerAlive(containers []core.ContainerStatus) bool {
372+
for _, c := range containers {
373+
if IsContainerAlive(c) {
374+
return true
375+
}
376+
}
377+
378+
return false
379+
}
380+
381+
// IsContainerAlive returns true if container is running
382+
func IsContainerAlive(container core.ContainerStatus) bool {
383+
return container.State.Running != nil
384+
}
385+
363386
// ClusterJWTVolumeMount creates a volume mount structure for a cluster JWT secret (token).
364387
func ClusterJWTVolumeMount() core.VolumeMount {
365388
return core.VolumeMount{

0 commit comments

Comments
 (0)