From 41c52a6ef670396ba234773f458664abaa994cf9 Mon Sep 17 00:00:00 2001 From: Matheus Pimenta Date: Tue, 3 Feb 2026 13:01:19 +0000 Subject: [PATCH] Fix waiting and erroring out on garbage-collected Jobs Signed-off-by: Matheus Pimenta --- .github/workflows/e2e.yaml | 10 +++++++- config/testdata/job-ttl/helmrelease.yaml | 25 ++++++++++++++++++++ go.mod | 2 +- go.sum | 4 ++-- internal/action/wait.go | 29 +++++++++++++++++++----- 5 files changed, 60 insertions(+), 10 deletions(-) create mode 100644 config/testdata/job-ttl/helmrelease.yaml diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index e5b51fc49..35c2130f3 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -80,6 +80,14 @@ jobs: kubectl -n helm-system wait helmreleases/podinfo-git --for=condition=ready --timeout=4m kubectl -n helm-system wait helmreleases/podinfo-oci --for=condition=ready --timeout=4m kubectl -n helm-system delete -f config/testdata/podinfo + - name: Run Job with TTL test + run: | + # This test verifies that the wait logic correctly handles Jobs with + # ttlSecondsAfterFinished that get garbage-collected after completion. + # Without the fix, the wait would fail with a NotFound error. 
+ kubectl -n helm-system apply -f config/testdata/job-ttl + kubectl -n helm-system wait helmreleases/job-ttl --for=condition=ready --timeout=4m + kubectl -n helm-system delete -f config/testdata/job-ttl - name: Run client-side apply upgrade test run: | set -euo pipefail @@ -1147,7 +1155,7 @@ jobs: exit 1 fi kubectl -n helm-system delete -f config/testdata/post-renderer-kustomize - - name: Bootstrap CRDs Upgrade Tests + - name: Bootstrap Tests Using Local Helm Chart if: ${{ startsWith(github.ref, 'refs/tags/') || startsWith(github.ref, 'refs/heads/') }} run: | REF=${{ github.ref }} diff --git a/config/testdata/job-ttl/helmrelease.yaml b/config/testdata/job-ttl/helmrelease.yaml new file mode 100644 index 000000000..8f759b9ad --- /dev/null +++ b/config/testdata/job-ttl/helmrelease.yaml @@ -0,0 +1,25 @@ +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: job-ttl +spec: + interval: 5m + timeout: 2m + chart: + spec: + chart: podinfo + version: '>=6.10.1 <7.0.0' + sourceRef: + kind: HelmRepository + name: podinfo + interval: 1m + values: + # Enable a post-install hook Job with ttlSecondsAfterFinished. + # The Job completes immediately and gets garbage-collected by the TTL controller. + # The fix ensures the wait doesn't fail when the Job is NotFound after being + # garbage-collected. 
+ hooks: + postInstall: + job: + enabled: true + ttlSecondsAfterFinished: 0 diff --git a/go.mod b/go.mod index eb53efa33..63af7d62a 100644 --- a/go.mod +++ b/go.mod @@ -26,7 +26,7 @@ require ( github.com/fluxcd/pkg/cache v0.13.0 github.com/fluxcd/pkg/chartutil v1.21.0 github.com/fluxcd/pkg/runtime v0.97.0 - github.com/fluxcd/pkg/ssa v0.65.0 + github.com/fluxcd/pkg/ssa v0.66.0 github.com/fluxcd/pkg/testserver v0.13.0 github.com/fluxcd/source-controller/api v1.7.2 github.com/go-logr/logr v1.4.3 diff --git a/go.sum b/go.sum index 209ad8757..fd7e98284 100644 --- a/go.sum +++ b/go.sum @@ -162,8 +162,8 @@ github.com/fluxcd/pkg/chartutil v1.21.0 h1:NJYhlekwBwuqMpRgsOlcsJrw2Xq0cBJW0Nmvz github.com/fluxcd/pkg/chartutil v1.21.0/go.mod h1:Gv50bF3SS4OvvKCyyIMRkGeNzZk6Fsh4+lAdrjx97T4= github.com/fluxcd/pkg/runtime v0.97.0 h1:NZ9ZL2RE6J4yNDHI7ZNmL/OnCsDWurzVIvws73FYMz8= github.com/fluxcd/pkg/runtime v0.97.0/go.mod h1:FyjNMFNAERkCsF/muTWJYU9MZOsq/m4Sc4aQk/EgQ9E= -github.com/fluxcd/pkg/ssa v0.65.0 h1:pvPYmBmfqS0sRNoi4/IUoocBbglYuD/OJLgOuxMJ9+o= -github.com/fluxcd/pkg/ssa v0.65.0/go.mod h1:RjvVjJIoRo1ecsv91yMuiqzO6cpNag80M6MOB/vrJdc= +github.com/fluxcd/pkg/ssa v0.66.0 h1:CYbCTuws8Sn1xAaOFNz92yS2iS5YBqJgd+DrzmRxfaQ= +github.com/fluxcd/pkg/ssa v0.66.0/go.mod h1:RjvVjJIoRo1ecsv91yMuiqzO6cpNag80M6MOB/vrJdc= github.com/fluxcd/pkg/testserver v0.13.0 h1:xEpBcEYtD7bwvZ+i0ZmChxKkDo/wfQEV3xmnzVybSSg= github.com/fluxcd/pkg/testserver v0.13.0/go.mod h1:akRYv3FLQUsme15na9ihECRG6hBuqni4XEY9W8kzs8E= github.com/fluxcd/source-controller/api v1.7.2 h1:/lg/xoyRjxwdhHKqjTxQS2o1cp+DMKJ8W4rpm+ZLemQ= diff --git a/internal/action/wait.go b/internal/action/wait.go index 6fbe75ecb..ebab3cc6e 100644 --- a/internal/action/wait.go +++ b/internal/action/wait.go @@ -25,14 +25,16 @@ import ( "github.com/fluxcd/cli-utils/pkg/kstatus/status" "github.com/fluxcd/cli-utils/pkg/object" helmkube "helm.sh/helm/v4/pkg/kube" + appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" apimeta 
"k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" "github.com/fluxcd/pkg/runtime/controller" runtimestatusreaders "github.com/fluxcd/pkg/runtime/statusreaders" "github.com/fluxcd/pkg/ssa" + ssautils "github.com/fluxcd/pkg/ssa/utils" v2 "github.com/fluxcd/helm-controller/api/v2" ) @@ -152,10 +154,12 @@ func (w *waiter) wait(ctx context.Context, resources helmkube.ResourceList, timeout time.Duration, failFast bool, sr ...NewStatusReaderFunc) error { // WaitForSetWithContext expects a list of ObjMetadata. - objs := []object.ObjMetadata{} + var objs object.ObjMetadataSet + var jobs []*unstructured.Unstructured for _, res := range resources { - // Skip paused apps/v1/Deployment (copied from Helm). gvk := res.Object.GetObjectKind().GroupVersionKind() + + // Skip paused apps/v1/Deployment (copied from Helm). if gvk == deploymentGVK { uns, err := runtime.DefaultUnstructuredConverter.ToUnstructured(res.Object) if err != nil { @@ -167,6 +171,15 @@ func (w *waiter) wait(ctx context.Context, resources helmkube.ResourceList, } } + // Collect Jobs with TTL for special handling. + if gvk == jobGVK { + uns, err := runtime.DefaultUnstructuredConverter.ToUnstructured(res.Object) + if err != nil { + return err + } + jobs = append(jobs, &unstructured.Unstructured{Object: uns}) + } + // Convert to ObjMetadata. obj, err := object.RuntimeToObjMeta(res.Object) if err != nil { @@ -176,12 +189,16 @@ func (w *waiter) wait(ctx context.Context, resources helmkube.ResourceList, } return w.newResourceManager(sr...).WaitForSetWithContext(ctx, objs, ssa.WaitOptions{ - Interval: 5 * time.Second, // Copied from kustomize-controller. - Timeout: timeout, + JobsWithTTL: ssautils.ExtractJobsWithTTL(jobs), + Interval: 5 * time.Second, // Copied from kustomize-controller. 
+ Timeout: timeout, // The kustomize-controller has an opt-in feature gate that disables // fail fast here: DisableFailFastBehavior. FailFast: failFast, }) } -var deploymentGVK = schema.GroupVersionKind{Group: "apps", Version: "v1", Kind: "Deployment"} +var ( + deploymentGVK = appsv1.SchemeGroupVersion.WithKind("Deployment") + jobGVK = batchv1.SchemeGroupVersion.WithKind("Job") +)