diff --git a/.github/ci/operator-postgres-values.yaml b/.github/ci/operator-postgres-values.yaml new file mode 100644 index 00000000..4330d9ff --- /dev/null +++ b/.github/ci/operator-postgres-values.yaml @@ -0,0 +1,77 @@ +# E2E values consumed by the "operator + postgres E2E" step in test.yml. +# Not under charts/openfga/ci/ on purpose — chart-testing's helm-test runs +# a gRPC probe immediately after install, which would race the operator's +# scale-up. The dedicated workflow step waits for the migration ConfigMap +# and the scale-up explicitly, then verifies readiness. +replicaCount: 1 + +operator: + enabled: true + +migration: + enabled: true + +datastore: + engine: postgres + uriSecret: openfga-e2e-postgres-credentials + +openfga-operator: + image: + pullPolicy: Never + +extraObjects: + - apiVersion: v1 + kind: Secret + metadata: + name: openfga-e2e-postgres-credentials + stringData: + uri: "postgres://openfga:changeme@openfga-e2e-postgres:5432/openfga?sslmode=disable" + - apiVersion: apps/v1 + kind: Deployment + metadata: + name: openfga-e2e-postgres + spec: + replicas: 1 + selector: + matchLabels: + app: openfga-e2e-postgres + template: + metadata: + labels: + app: openfga-e2e-postgres + spec: + containers: + - name: postgres + image: postgres:17 + ports: + - containerPort: 5432 + env: + - name: POSTGRES_USER + value: openfga + - name: POSTGRES_PASSWORD + value: changeme + - name: POSTGRES_DB + value: openfga + - name: PGDATA + value: /var/lib/postgresql/data/pgdata + volumeMounts: + - name: data + mountPath: /var/lib/postgresql/data + readinessProbe: + exec: + command: ["pg_isready", "-U", "openfga", "-d", "openfga"] + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: data + emptyDir: {} + - apiVersion: v1 + kind: Service + metadata: + name: openfga-e2e-postgres + spec: + selector: + app: openfga-e2e-postgres + ports: + - port: 5432 + targetPort: 5432 diff --git a/.github/workflows/operator.yml b/.github/workflows/operator.yml new file mode 
100644 index 00000000..001d22f5 --- /dev/null +++ b/.github/workflows/operator.yml @@ -0,0 +1,116 @@ +name: Operator + +on: + push: + branches: + - main + paths: + - "operator/**" + - "charts/openfga-operator/**" + - ".github/workflows/operator.yml" + pull_request: + paths: + - "operator/**" + - "charts/openfga-operator/**" + - ".github/workflows/operator.yml" + workflow_dispatch: + inputs: + push_image: + description: "Push the operator image to GHCR" + type: boolean + default: true + +env: + IMAGE_NAME: ghcr.io/${{ github.repository_owner }}/openfga-operator + +jobs: + test: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: operator/go.mod + cache-dependency-path: operator/go.sum + + - name: Run tests + working-directory: operator + run: go test ./... -v + + - name: Run vet + working-directory: operator + run: go vet ./... + + build-and-push: + needs: test + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Extract version from Chart.yaml + id: version + run: | + version=$(grep '^appVersion:' charts/openfga-operator/Chart.yaml | awk '{print $2}' | tr -d '"') + echo "version=${version}" >> "$GITHUB_OUTPUT" + short_sha="${GITHUB_SHA::7}" + echo "short_sha=${short_sha}" >> "$GITHUB_OUTPUT" + echo "Operator version: ${version} (sha: ${short_sha})" + + - name: Determine image tags and push policy + id: tags + run: | + if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]]; then + # Main push: publish floating :<version> and :latest plus an + # immutable :<version>-<short_sha> so consumers pinning a specific + # commit have a stable reference.
+ echo "tags=${{ env.IMAGE_NAME }}:${{ steps.version.outputs.version }},${{ env.IMAGE_NAME }}:latest,${{ env.IMAGE_NAME }}:${{ steps.version.outputs.version }}-${{ steps.version.outputs.short_sha }}" >> "$GITHUB_OUTPUT" + echo "push=true" >> "$GITHUB_OUTPUT" + elif [[ "${{ github.event_name }}" == "workflow_dispatch" && "${{ inputs.push_image }}" == "true" ]]; then + echo "tags=${{ env.IMAGE_NAME }}:${{ steps.version.outputs.version }}-${{ steps.version.outputs.short_sha }}" >> "$GITHUB_OUTPUT" + echo "push=true" >> "$GITHUB_OUTPUT" + else + # Pull request (or workflow_dispatch with push_image=false): + # build both platforms but don't publish — catches arm64-incompatible + # changes (build tags, syscalls, CGO) before they merge. + echo "tags=${{ env.IMAGE_NAME }}:pr-${{ steps.version.outputs.short_sha }}" >> "$GITHUB_OUTPUT" + echo "push=false" >> "$GITHUB_OUTPUT" + fi + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to GHCR + if: steps.tags.outputs.push == 'true' + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and (conditionally) push + uses: docker/build-push-action@v6 + with: + context: operator + push: ${{ steps.tags.outputs.push }} + platforms: linux/amd64,linux/arm64 + tags: ${{ steps.tags.outputs.tags }} + cache-from: type=gha + cache-to: type=gha,mode=max + labels: | + org.opencontainers.image.source=https://github.com/${{ github.repository }} + org.opencontainers.image.version=${{ steps.version.outputs.version }} + org.opencontainers.image.revision=${{ github.sha }} + org.opencontainers.image.title=openfga-operator + org.opencontainers.image.description=OpenFGA Kubernetes operator for migration orchestration diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 49030831..bc035c32 100644 --- a/.github/workflows/test.yml +++
b/.github/workflows/test.yml @@ -59,6 +59,97 @@ jobs: if: steps.list-changed.outputs.changed == 'true' uses: helm/kind-action@v1.14.0 + - name: Build and load operator image into kind + if: steps.list-changed.outputs.changed == 'true' + run: | + version=$(grep '^appVersion:' charts/openfga-operator/Chart.yaml | awk '{print $2}' | tr -d '"') + docker build -t "ghcr.io/openfga/openfga-operator:${version}" operator/ + kind load docker-image "ghcr.io/openfga/openfga-operator:${version}" --name chart-testing + - name: Run chart-testing (install) if: steps.list-changed.outputs.changed == 'true' run: ct install --target-branch ${{ github.event.repository.default_branch }} + + - name: E2E test — operator-managed migration across schema boundary + id: e2e-operator + if: steps.list-changed.outputs.changed == 'true' + env: + NS: openfga-e2e + REL: openfga + # v1.9.5 → v1.14.1 crosses the v1.10.0 "!!REQUIRES MIGRATION!!" + # boundary (collation spec change in openfga/openfga#2661). + OLD_VER: v1.9.5 + NEW_VER: v1.14.1 + run: | + set -euo pipefail + kubectl create namespace "$NS" + helm dependency build charts/openfga + + echo "=== Phase 1: fresh install at ${OLD_VER} ===" + helm install "$REL" charts/openfga \ + --namespace "$NS" \ + --values .github/ci/operator-postgres-values.yaml \ + --set image.tag="${OLD_VER}" \ + --wait --timeout=3m + + # Operator pod must reach Ready (validates /readyz, RBAC, env vars). + kubectl wait deployment -n "$NS" \ + -l app.kubernetes.io/name=openfga-operator \ + --for=condition=Available=True --timeout=2m + + # Operator must run the migration Job and write ConfigMap at OLD_VER. + # Poll because kubectl wait --for=create requires kubectl >=1.31. 
+ for i in $(seq 1 60); do + ver=$(kubectl get configmap "${REL}-migration-status" -n "$NS" \ + -o jsonpath='{.data.version}' 2>/dev/null || true) + if [ "$ver" = "${OLD_VER}" ]; then + echo "Phase 1: migration ConfigMap version=${ver}" + break + fi + sleep 3 + done + test "$ver" = "${OLD_VER}" + + # Operator must scale the openfga Deployment from 0 to 1 ready replica. + # condition=Available alone returns true at 0/0 before scale-up; + # readyReplicas=1 is the load-bearing signal. + kubectl wait deployment/"$REL" -n "$NS" \ + --for=jsonpath='{.status.readyReplicas}'=1 --timeout=3m + + echo "=== Phase 2: helm upgrade ${OLD_VER} → ${NEW_VER} ===" + helm upgrade "$REL" charts/openfga \ + --namespace "$NS" \ + --values .github/ci/operator-postgres-values.yaml \ + --set image.tag="${NEW_VER}" \ + --wait --timeout=3m + + # Operator must detect the version change, delete the stale Job, + # run a new migration, and update the ConfigMap to NEW_VER. + for i in $(seq 1 60); do + ver=$(kubectl get configmap "${REL}-migration-status" -n "$NS" \ + -o jsonpath='{.data.version}' 2>/dev/null || true) + if [ "$ver" = "${NEW_VER}" ]; then + echo "Phase 2: migration ConfigMap version=${ver}" + break + fi + sleep 3 + done + test "$ver" = "${NEW_VER}" + + # New pods must roll out at NEW_VER and become Ready. 
+ kubectl wait deployment/"$REL" -n "$NS" \ + --for=jsonpath='{.status.readyReplicas}'=1 --timeout=3m + image=$(kubectl get deployment/"$REL" -n "$NS" \ + -o jsonpath='{.spec.template.spec.containers[0].image}') + echo "Phase 2 running image: $image" + echo "$image" | grep -q ":${NEW_VER}" + + - name: Dump operator E2E diagnostics on failure + if: failure() && steps.e2e-operator.conclusion == 'failure' + env: + NS: openfga-e2e + run: | + kubectl get all,configmap,job -n "$NS" -o wide || true + kubectl describe deployment -n "$NS" || true + kubectl logs -n "$NS" -l app.kubernetes.io/name=openfga-operator --tail=200 || true + kubectl logs -n "$NS" -l job-name --tail=200 || true diff --git a/charts/openfga-operator/.helmignore b/charts/openfga-operator/.helmignore new file mode 100644 index 00000000..edf9e7ef --- /dev/null +++ b/charts/openfga-operator/.helmignore @@ -0,0 +1,18 @@ +# Patterns to ignore when building packages. +.DS_Store +.git +.gitignore +.bzr +.bzrignore +.hg +.hgignore +.svn +*.swp +*.bak +*.tmp +*.orig +*~ +.project +.idea +*.tmproj +.vscode diff --git a/charts/openfga-operator/Chart.yaml b/charts/openfga-operator/Chart.yaml new file mode 100644 index 00000000..95da06ba --- /dev/null +++ b/charts/openfga-operator/Chart.yaml @@ -0,0 +1,19 @@ +apiVersion: v2 +name: openfga-operator +description: Helm chart for the OpenFGA Kubernetes operator. 
+ +type: application +version: 0.1.0 +appVersion: "0.1.0" + +home: "https://openfga.github.io/helm-charts" +icon: https://github.com/openfga/community/raw/main/brand-assets/icon/color/openfga-icon-color.svg + +maintainers: + - name: OpenFGA Authors + url: https://github.com/openfga +sources: + - https://github.com/openfga/helm-charts + +annotations: + artifacthub.io/license: Apache-2.0 diff --git a/charts/openfga-operator/ci/default-values.yaml b/charts/openfga-operator/ci/default-values.yaml new file mode 100644 index 00000000..93797cd5 --- /dev/null +++ b/charts/openfga-operator/ci/default-values.yaml @@ -0,0 +1,4 @@ +# Standalone install exercise for chart-testing. +# kind has the operator image preloaded, so skip the registry pull. +image: + pullPolicy: Never diff --git a/charts/openfga-operator/crds/README.md b/charts/openfga-operator/crds/README.md new file mode 100644 index 00000000..060b0d0c --- /dev/null +++ b/charts/openfga-operator/crds/README.md @@ -0,0 +1,4 @@ +# CRDs + +This directory is reserved for Custom Resource Definitions added in later stages. +No CRDs are installed in Stage 1 (migration orchestration). diff --git a/charts/openfga-operator/templates/NOTES.txt b/charts/openfga-operator/templates/NOTES.txt new file mode 100644 index 00000000..bcfcf801 --- /dev/null +++ b/charts/openfga-operator/templates/NOTES.txt @@ -0,0 +1,16 @@ +The openfga-operator has been deployed. + +NOTE: Ensure the operator image ({{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}) is available in your registry. +If unavailable, the operator pod may remain in ImagePullBackOff until the image is pushed. + +To check operator status: + kubectl get deployment --namespace {{ include "openfga-operator.namespace" . }} {{ include "openfga-operator.fullname" . }} + +To view operator logs: + kubectl logs --namespace {{ include "openfga-operator.namespace" . }} -l "app.kubernetes.io/name={{ include "openfga-operator.name" . 
}}" + +To check migration status: + kubectl get configmap -n {{ include "openfga-operator.namespace" . }} -l app.kubernetes.io/managed-by=openfga-operator + +To inspect migration jobs: + kubectl get jobs -n {{ include "openfga-operator.namespace" . }} -l app.kubernetes.io/part-of=openfga,app.kubernetes.io/component=migration diff --git a/charts/openfga-operator/templates/_helpers.tpl b/charts/openfga-operator/templates/_helpers.tpl new file mode 100644 index 00000000..f63057d6 --- /dev/null +++ b/charts/openfga-operator/templates/_helpers.tpl @@ -0,0 +1,72 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "openfga-operator.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "openfga-operator.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Expand the namespace of the release. +Allows overriding it for multi-namespace deployments in combined charts. +*/}} +{{- define "openfga-operator.namespace" -}} +{{- default .Release.Namespace .Values.namespaceOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. 
+*/}} +{{- define "openfga-operator.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "openfga-operator.labels" -}} +helm.sh/chart: {{ include "openfga-operator.chart" . }} +{{ include "openfga-operator.selectorLabels" . }} +app.kubernetes.io/component: operator +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +app.kubernetes.io/part-of: openfga +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "openfga-operator.selectorLabels" -}} +app.kubernetes.io/name: {{ include "openfga-operator.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "openfga-operator.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "openfga-operator.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- required "serviceAccount.name must be set when serviceAccount.create=false" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/charts/openfga-operator/templates/deployment.yaml b/charts/openfga-operator/templates/deployment.yaml new file mode 100644 index 00000000..5b83a6bd --- /dev/null +++ b/charts/openfga-operator/templates/deployment.yaml @@ -0,0 +1,85 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "openfga-operator.fullname" . }} + namespace: {{ include "openfga-operator.namespace" . }} + labels: + {{- include "openfga-operator.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "openfga-operator.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "openfga-operator.selectorLabels" . 
| nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "openfga-operator.serviceAccountName" . }} + {{- with .Values.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: operator + {{- with .Values.securityContext }} + securityContext: + {{- toYaml . | nindent 12 }} + {{- end }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + args: + {{- if .Values.leaderElection.enabled }} + - --leader-elect + {{- end }} + {{- if .Values.watchNamespace }} + - --watch-namespace={{ .Values.watchNamespace }} + {{- end }} + - --backoff-limit={{ .Values.migrationJob.backoffLimit }} + - --active-deadline-seconds={{ .Values.migrationJob.activeDeadlineSeconds }} + - --ttl-seconds-after-finished={{ .Values.migrationJob.ttlSecondsAfterFinished }} + env: + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + ports: + - name: healthz + containerPort: 8081 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: healthz + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: healthz + initialDelaySeconds: 5 + periodSeconds: 10 + {{- with .Values.resources }} + resources: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} diff --git a/charts/openfga-operator/templates/pdb.yaml b/charts/openfga-operator/templates/pdb.yaml new file mode 100644 index 00000000..6c3514eb --- /dev/null +++ b/charts/openfga-operator/templates/pdb.yaml @@ -0,0 +1,18 @@ +{{- if .Values.podDisruptionBudget.enabled -}} +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ include "openfga-operator.fullname" . }} + namespace: {{ include "openfga-operator.namespace" . }} + labels: + {{- include "openfga-operator.labels" . | nindent 4 }} +spec: + {{- if .Values.podDisruptionBudget.minAvailable }} + minAvailable: {{ .Values.podDisruptionBudget.minAvailable }} + {{- else }} + maxUnavailable: {{ .Values.podDisruptionBudget.maxUnavailable | default 1 }} + {{- end }} + selector: + matchLabels: + {{- include "openfga-operator.selectorLabels" . | nindent 6 }} +{{- end }} diff --git a/charts/openfga-operator/templates/role.yaml b/charts/openfga-operator/templates/role.yaml new file mode 100644 index 00000000..dd17870b --- /dev/null +++ b/charts/openfga-operator/templates/role.yaml @@ -0,0 +1,26 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "openfga-operator.fullname" . }} + namespace: {{ include "openfga-operator.namespace" . }} + labels: + {{- include "openfga-operator.labels" . 
| nindent 4 }} +rules: + - apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "list", "watch", "patch"] + - apiGroups: ["apps"] + resources: ["deployments/status"] + verbs: ["patch"] + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "create", "delete"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get", "list", "watch", "create", "update"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["get", "list", "watch", "create", "update"] + - apiGroups: [""] + resources: ["events"] + verbs: ["create", "patch"] diff --git a/charts/openfga-operator/templates/rolebinding.yaml b/charts/openfga-operator/templates/rolebinding.yaml new file mode 100644 index 00000000..afacb98a --- /dev/null +++ b/charts/openfga-operator/templates/rolebinding.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "openfga-operator.fullname" . }} + namespace: {{ include "openfga-operator.namespace" . }} + labels: + {{- include "openfga-operator.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "openfga-operator.fullname" . }} +subjects: + - kind: ServiceAccount + name: {{ include "openfga-operator.serviceAccountName" . }} + namespace: {{ include "openfga-operator.namespace" . }} diff --git a/charts/openfga-operator/templates/serviceaccount.yaml b/charts/openfga-operator/templates/serviceaccount.yaml new file mode 100644 index 00000000..8b1f8941 --- /dev/null +++ b/charts/openfga-operator/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "openfga-operator.serviceAccountName" . }} + namespace: {{ include "openfga-operator.namespace" . }} + labels: + {{- include "openfga-operator.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} diff --git a/charts/openfga-operator/values.schema.json b/charts/openfga-operator/values.schema.json new file mode 100644 index 00000000..324465ff --- /dev/null +++ b/charts/openfga-operator/values.schema.json @@ -0,0 +1,110 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "type": "object", + "properties": { + "global": { + "type": "object" + }, + "replicaCount": { + "type": "integer", + "minimum": 1 + }, + "image": { + "type": "object", + "properties": { + "repository": { + "type": "string", + "minLength": 1 + }, + "pullPolicy": { + "type": "string", + "enum": ["Always", "IfNotPresent", "Never"] + }, + "tag": { + "type": "string" + } + }, + "required": ["repository"], + "additionalProperties": false + }, + "imagePullSecrets": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { "type": "string" } + }, + "required": ["name"], + "additionalProperties": false + } + }, + "nameOverride": { "type": "string" }, + "fullnameOverride": { "type": "string" }, + "namespaceOverride": { "type": "string" }, + "serviceAccount": { + "type": "object", + "properties": { + "create": { "type": "boolean" }, + "annotations": { "type": "object" }, + "name": { "type": "string" } + }, + "additionalProperties": false + }, + "podAnnotations": { "type": "object" }, + "podSecurityContext": { "type": "object" }, + "securityContext": { "type": "object" }, + "watchNamespace": { "type": "string" }, + "leaderElection": { + "type": "object", + "properties": { + "enabled": { "type": "boolean" } + }, + "additionalProperties": false + }, + "migrationJob": { + "type": "object", + "properties": { + "backoffLimit": { + "type": "integer", + "minimum": 0 + }, + "activeDeadlineSeconds": { + "type": "integer", + "minimum": 1 + }, + "ttlSecondsAfterFinished": { + "type": "integer", + "minimum": 0 + } + }, + "additionalProperties": false + }, + "resources": { "type": "object" }, + "podDisruptionBudget": { + "type": 
"object", + "properties": { + "enabled": { "type": "boolean" }, + "minAvailable": { + "oneOf": [ + { "type": "string" }, + { "type": "integer", "minimum": 0 } + ] + }, + "maxUnavailable": { + "oneOf": [ + { "type": "string" }, + { "type": "integer", "minimum": 0 } + ] + } + }, + "additionalProperties": false + }, + "nodeSelector": { "type": "object" }, + "tolerations": { + "type": "array", + "items": { "type": "object" } + }, + "affinity": { "type": "object" } + }, + "additionalProperties": false +} diff --git a/charts/openfga-operator/values.yaml b/charts/openfga-operator/values.yaml new file mode 100644 index 00000000..921dcef7 --- /dev/null +++ b/charts/openfga-operator/values.yaml @@ -0,0 +1,83 @@ +replicaCount: 1 + +image: + repository: ghcr.io/openfga/openfga-operator + pullPolicy: IfNotPresent + # -- Overrides the image tag whose default is the chart appVersion. + tag: "" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" +# -- Override the namespace for all operator resources. +# Useful when the parent chart deploys subcharts into a different namespace. +namespaceOverride: "" + +serviceAccount: + # -- Specifies whether a service account should be created. + create: true + # -- Annotations to add to the service account. + annotations: {} + # -- The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template. + name: "" + +podAnnotations: {} + +podSecurityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + +securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 65532 + +# -- Namespace to watch for OpenFGA Deployments. +# Leave empty to default to the operator pod's own namespace (read from +# the POD_NAMESPACE env var, set via the downward API). 
This usually +# equals the release namespace, but when `namespaceOverride` puts the +# operator in a different namespace than the release, the watch follows +# the pod — not the release. Set this explicitly to watch a specific +# namespace independent of where the operator runs. +watchNamespace: "" + +leaderElection: + # -- Enable leader election for controller manager. + enabled: true + +migrationJob: + # -- Number of pod failures before a migration Job is considered failed. + backoffLimit: 3 + # -- Maximum wall-clock seconds a migration Job can run before being terminated. + activeDeadlineSeconds: 300 + # -- Seconds to keep completed/failed Job pods for log inspection before garbage collection. + ttlSecondsAfterFinished: 300 + +resources: + requests: + cpu: 10m + memory: 64Mi + limits: + memory: 128Mi + +podDisruptionBudget: + # -- Enable a PodDisruptionBudget for the operator. + enabled: false + # -- Minimum number of pods that must be available during disruption. + # Cannot be set together with maxUnavailable. + minAvailable: "" + # -- Maximum number of pods that can be unavailable during disruption. + # Defaults to 1 when enabled and minAvailable is not set. 
+ maxUnavailable: 1 + +nodeSelector: {} + +tolerations: [] + +affinity: {} diff --git a/charts/openfga/Chart.lock b/charts/openfga/Chart.lock index e82ffa5a..80114538 100644 --- a/charts/openfga/Chart.lock +++ b/charts/openfga/Chart.lock @@ -8,5 +8,8 @@ dependencies: - name: common repository: oci://registry-1.docker.io/bitnamicharts version: 2.13.3 -digest: sha256:4bbfb25821b0dfb6c70aabb5caf4c5ec7e6526261f93a8f531f507f1d4c43e3e -generated: "2026-03-18T11:41:40.1785546-04:00" +- name: openfga-operator + repository: file://../openfga-operator + version: 0.1.0 +digest: sha256:d502dc105790995a4368a049c0f593820d08f2f82dc9c9a70480a343c7affe8b +generated: "2026-04-10T11:45:16.638975-04:00" diff --git a/charts/openfga/Chart.yaml b/charts/openfga/Chart.yaml index c7eeb76d..624d5ca3 100644 --- a/charts/openfga/Chart.yaml +++ b/charts/openfga/Chart.yaml @@ -29,3 +29,7 @@ dependencies: repository: oci://registry-1.docker.io/bitnamicharts tags: - bitnami-common + - name: openfga-operator + version: "0.1.0" + repository: "file://../openfga-operator" + condition: operator.enabled diff --git a/charts/openfga/ci/operator-mode-values.yaml b/charts/openfga/ci/operator-mode-values.yaml new file mode 100644 index 00000000..b85a6af2 --- /dev/null +++ b/charts/openfga/ci/operator-mode-values.yaml @@ -0,0 +1,25 @@ +# Exercises operator-managed mode end-to-end via chart-testing. +# +# The openfga-operator subchart auto-installs (conditional dependency on +# operator.enabled). With the memory datastore, the chart starts the +# Deployment at replicas=1 immediately, so `helm test` runs without racing +# the operator's reconcile loop. Migration is skipped (memory engine), but +# the rest of the wiring is exercised: subchart resolution, operator RBAC, +# pod/SA/annotation rendering, and the operator pod actually running and +# reconciling against the openfga Deployment in its release namespace. 
+# +# Postgres + operator (which exercises the migration Job path) is left to +# a follow-up E2E test — it requires waiting for the operator to scale the +# Deployment up before `helm test` runs the gRPC probe. +operator: + enabled: true + +migration: + enabled: true + +datastore: + engine: memory + +openfga-operator: + image: + pullPolicy: Never diff --git a/charts/openfga/templates/NOTES.txt b/charts/openfga/templates/NOTES.txt index 0048291e..628c3558 100644 --- a/charts/openfga/templates/NOTES.txt +++ b/charts/openfga/templates/NOTES.txt @@ -1,3 +1,20 @@ +{{- if and .Values.operator.enabled .Values.migration.enabled }} +NOTE: operator-managed migration is enabled. The OpenFGA Deployment starts at +0 replicas and is scaled up by the openfga-operator only after the migration +Job completes successfully. + +If pods don't appear within ~2 minutes, check the operator and the migration +Job: + + kubectl get deployment -A -l app.kubernetes.io/name=openfga-operator + kubectl logs -n {{ .Release.Namespace }} -l app.kubernetes.io/name=openfga-operator --tail=100 + kubectl get job/{{ include "openfga.fullname" . }}-migrate -n {{ .Release.Namespace }} -o yaml + kubectl describe deployment/{{ include "openfga.fullname" . }} -n {{ .Release.Namespace }} + +A `MigrationFailed` condition on the Deployment indicates the migration Job +failed; the operator will retry every 60s once the underlying issue is fixed. + +{{ end -}} 1. 
Get the application URL by running these commands: {{- if .Values.ingress.enabled }} {{- range $host := .Values.ingress.hosts }} diff --git a/charts/openfga/templates/_helpers.tpl b/charts/openfga/templates/_helpers.tpl index 5889497a..cc50e03d 100644 --- a/charts/openfga/templates/_helpers.tpl +++ b/charts/openfga/templates/_helpers.tpl @@ -74,6 +74,17 @@ Create the name of the service account to use {{- end }} {{- end }} +{{/* +Create the name of the migration service account to use (operator mode only) +*/}} +{{- define "openfga.migrationServiceAccountName" -}} +{{- if .Values.migration.serviceAccount.create }} +{{- default (printf "%s-migration" (include "openfga.fullname" .)) .Values.migration.serviceAccount.name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- required "migration.serviceAccount.name must be set when migration.serviceAccount.create=false" .Values.migration.serviceAccount.name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} + {{/* Return true if a secret object should be created */}} diff --git a/charts/openfga/templates/deployment.yaml b/charts/openfga/templates/deployment.yaml index e6c1fff9..7318ddea 100644 --- a/charts/openfga/templates/deployment.yaml +++ b/charts/openfga/templates/deployment.yaml @@ -4,12 +4,35 @@ metadata: name: {{ include "openfga.fullname" . }} labels: {{- include "openfga.labels" . | nindent 4 }} - {{- with .Values.annotations }} + {{- $hasOperatorAnnotations := and .Values.operator.enabled .Values.migration.enabled }} + {{- if or $hasOperatorAnnotations .Values.annotations }} annotations: + {{- if $hasOperatorAnnotations }} + openfga.dev/migration-enabled: "true" + openfga.dev/container-name: "{{ .Chart.Name }}" + openfga.dev/desired-replicas: '{{ ternary 1 .Values.replicaCount (eq .Values.datastore.engine "memory") }}' + {{- if or .Values.migration.serviceAccount.create .Values.migration.serviceAccount.name }} + openfga.dev/migration-service-account: '{{ include "openfga.migrationServiceAccountName" . 
}}' + {{- end }} + {{- end }} + {{- with .Values.annotations }} {{- toYaml . | nindent 4 }} + {{- end }} {{- end }} spec: - {{- if not .Values.autoscaling.enabled }} + {{- if and .Values.operator.enabled .Values.migration.enabled }} + {{- if .Values.autoscaling.enabled }} + {{- fail "operator.enabled and autoscaling.enabled cannot both be true" }} + {{- end }} + {{- /* On upgrade: preserve live replicas (zero-downtime). On fresh install: lookup returns empty, fall back to 0. + OpenFGA gates readiness on MinimumSupportedDatastoreSchemaRevision — see ADR-002. */ -}} + {{- $existing := (lookup "apps/v1" "Deployment" (include "openfga.namespace" .) (include "openfga.fullname" .)) }} + {{- if and $existing (hasKey ($existing) "spec") }} + replicas: {{ $existing.spec.replicas }} + {{- else }} + replicas: {{ ternary 1 0 (eq .Values.datastore.engine "memory") }} + {{- end }} + {{- else if not .Values.autoscaling.enabled }} replicas: {{ ternary 1 .Values.replicaCount (eq .Values.datastore.engine "memory")}} {{- end }} selector: @@ -37,9 +60,11 @@ spec: serviceAccountName: {{ include "openfga.serviceAccountName" . 
}} securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} - {{ if or (and (has .Values.datastore.engine (list "postgres" "mysql")) .Values.datastore.applyMigrations .Values.datastore.waitForMigrations) .Values.extraInitContainers }} + {{- $needsMigrationInit := and (not .Values.operator.enabled) (has .Values.datastore.engine (list "postgres" "mysql")) .Values.datastore.applyMigrations .Values.datastore.waitForMigrations }} + {{- if or $needsMigrationInit .Values.extraInitContainers }} initContainers: - {{- if and (has .Values.datastore.engine (list "postgres" "mysql")) .Values.datastore.applyMigrations .Values.datastore.waitForMigrations (eq .Values.datastore.migrationType "job") }} + {{- if $needsMigrationInit }} + {{- if eq .Values.datastore.migrationType "job" }} - name: wait-for-migration securityContext: {{- toYaml .Values.securityContext | nindent 12 }} @@ -49,7 +74,7 @@ spec: resources: {{- toYaml .Values.datastore.migrations.resources | nindent 12 }} {{- end }} - {{- if and (has .Values.datastore.engine (list "postgres" "mysql")) (eq .Values.datastore.migrationType "initContainer") }} + {{- if eq .Values.datastore.migrationType "initContainer" }} {{- with .Values.migrate.extraInitContainers }} {{- toYaml . | nindent 8 }} {{- end }} @@ -77,6 +102,7 @@ spec: {{- include "common.tplvalues.render" ( dict "value" .Values.migrate.sidecars "context" $) | nindent 8 }} {{- end }} {{- end }} + {{- end }} {{- with .Values.extraInitContainers }} {{- toYaml . 
| nindent 8 }} {{- end }} diff --git a/charts/openfga/templates/job.yaml b/charts/openfga/templates/job.yaml index fc70228d..d46d938f 100644 --- a/charts/openfga/templates/job.yaml +++ b/charts/openfga/templates/job.yaml @@ -1,4 +1,4 @@ -{{- if and (has .Values.datastore.engine (list "postgres" "mysql")) .Values.datastore.applyMigrations (eq .Values.datastore.migrationType "job") -}} +{{- if and (not .Values.operator.enabled) (has .Values.datastore.engine (list "postgres" "mysql")) .Values.datastore.applyMigrations (eq .Values.datastore.migrationType "job") -}} apiVersion: batch/v1 kind: Job metadata: diff --git a/charts/openfga/templates/rbac.yaml b/charts/openfga/templates/rbac.yaml index 3c8e0f8b..71d3c096 100644 --- a/charts/openfga/templates/rbac.yaml +++ b/charts/openfga/templates/rbac.yaml @@ -1,4 +1,4 @@ -{{- if .Values.serviceAccount.create -}} +{{- if and (not .Values.operator.enabled) .Values.serviceAccount.create -}} apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: diff --git a/charts/openfga/templates/serviceaccount.yaml b/charts/openfga/templates/serviceaccount.yaml index bbe191c9..f732c46e 100644 --- a/charts/openfga/templates/serviceaccount.yaml +++ b/charts/openfga/templates/serviceaccount.yaml @@ -10,3 +10,16 @@ metadata: {{- toYaml . | nindent 4 }} {{- end }} {{- end }} +{{- if and .Values.operator.enabled .Values.migration.enabled .Values.migration.serviceAccount.create }} +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "openfga.migrationServiceAccountName" . }} + labels: + {{- include "openfga.labels" . | nindent 4 }} + {{- with .Values.migration.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} diff --git a/charts/openfga/tests/operator_mode_job_test.yaml b/charts/openfga/tests/operator_mode_job_test.yaml new file mode 100644 index 00000000..31d57607 --- /dev/null +++ b/charts/openfga/tests/operator_mode_job_test.yaml @@ -0,0 +1,28 @@ +suite: operator mode - job template +templates: + - templates/job.yaml +tests: + - it: should not render migration job when operator is enabled + set: + operator.enabled: true + migration.enabled: true + datastore.engine: postgres + datastore.uri: "postgres://localhost/openfga" + datastore.applyMigrations: true + datastore.migrationType: job + asserts: + - hasDocuments: + count: 0 + + - it: should render migration job when operator is disabled + set: + operator.enabled: false + datastore.engine: postgres + datastore.uri: "postgres://localhost/openfga" + datastore.applyMigrations: true + datastore.migrationType: job + asserts: + - hasDocuments: + count: 1 + - isKind: + of: Job diff --git a/charts/openfga/tests/operator_mode_rbac_test.yaml b/charts/openfga/tests/operator_mode_rbac_test.yaml new file mode 100644 index 00000000..bb60846c --- /dev/null +++ b/charts/openfga/tests/operator_mode_rbac_test.yaml @@ -0,0 +1,25 @@ +suite: operator mode - RBAC +templates: + - templates/rbac.yaml +tests: + - it: should not render legacy RBAC when operator is enabled + set: + operator.enabled: true + serviceAccount.create: true + asserts: + - hasDocuments: + count: 0 + + - it: should render legacy RBAC when operator is disabled + set: + operator.enabled: false + serviceAccount.create: true + asserts: + - hasDocuments: + count: 2 + - isKind: + of: Role + documentIndex: 0 + - isKind: + of: RoleBinding + documentIndex: 1 diff --git a/charts/openfga/tests/operator_mode_serviceaccount_test.yaml b/charts/openfga/tests/operator_mode_serviceaccount_test.yaml new file mode 100644 index 00000000..cbeab1a0 --- /dev/null +++ b/charts/openfga/tests/operator_mode_serviceaccount_test.yaml @@ -0,0 +1,66 @@ +suite: 
operator mode - service accounts +templates: + - templates/serviceaccount.yaml +tests: + - it: should render migration service account when operator is enabled + set: + operator.enabled: true + migration.enabled: true + migration.serviceAccount.create: true + serviceAccount.create: true + asserts: + - hasDocuments: + count: 2 + - isKind: + of: ServiceAccount + documentIndex: 1 + - equal: + path: metadata.name + value: RELEASE-NAME-openfga-migration + documentIndex: 1 + + - it: should not render migration service account when operator is disabled + set: + operator.enabled: false + serviceAccount.create: true + asserts: + - hasDocuments: + count: 1 + + - it: should not render migration service account when migration SA creation is disabled + set: + operator.enabled: true + migration.enabled: true + migration.serviceAccount.create: false + migration.serviceAccount.name: external-sa + serviceAccount.create: true + asserts: + - hasDocuments: + count: 1 + + - it: should render migration service account with custom annotations + set: + operator.enabled: true + migration.enabled: true + migration.serviceAccount.create: true + migration.serviceAccount.annotations: + eks.amazonaws.com/role-arn: "arn:aws:iam::123456789012:role/openfga-migrator" + serviceAccount.create: true + asserts: + - equal: + path: metadata.annotations["eks.amazonaws.com/role-arn"] + value: "arn:aws:iam::123456789012:role/openfga-migrator" + documentIndex: 1 + + - it: should use custom migration service account name + set: + operator.enabled: true + migration.enabled: true + migration.serviceAccount.create: true + migration.serviceAccount.name: my-migrator + serviceAccount.create: true + asserts: + - equal: + path: metadata.name + value: my-migrator + documentIndex: 1 diff --git a/charts/openfga/tests/operator_mode_test.yaml b/charts/openfga/tests/operator_mode_test.yaml new file mode 100644 index 00000000..164d8bd1 --- /dev/null +++ b/charts/openfga/tests/operator_mode_test.yaml @@ -0,0 +1,170 @@ 
+suite: operator mode +templates: + - templates/deployment.yaml +tests: + # --- Deployment annotations --- + - it: should set operator annotations when operator and migration are enabled + set: + operator.enabled: true + migration.enabled: true + replicaCount: 3 + datastore.engine: postgres + asserts: + - equal: + path: metadata.annotations["openfga.dev/migration-enabled"] + value: "true" + - equal: + path: metadata.annotations["openfga.dev/desired-replicas"] + value: "3" + - equal: + path: metadata.annotations["openfga.dev/migration-service-account"] + value: RELEASE-NAME-openfga-migration + + - it: should not set operator annotations when operator is disabled + set: + operator.enabled: false + annotations: + custom: value + asserts: + - isNull: + path: metadata.annotations["openfga.dev/migration-enabled"] + - isNull: + path: metadata.annotations["openfga.dev/desired-replicas"] + + - it: should set desired-replicas to 1 for memory datastore + set: + operator.enabled: true + migration.enabled: true + replicaCount: 5 + datastore.engine: memory + asserts: + - equal: + path: metadata.annotations["openfga.dev/desired-replicas"] + value: "1" + + - it: should use custom migration service account name when set + set: + operator.enabled: true + migration.enabled: true + datastore.engine: postgres + migration.serviceAccount.name: my-custom-sa + asserts: + - equal: + path: metadata.annotations["openfga.dev/migration-service-account"] + value: my-custom-sa + + - it: should not set migration-service-account annotation when SA creation is disabled and no name set + set: + operator.enabled: true + migration.enabled: true + datastore.engine: postgres + migration.serviceAccount.create: false + asserts: + - isNull: + path: metadata.annotations["openfga.dev/migration-service-account"] + + # --- Replica count --- + # When no live cluster is available (helm template / test), lookup returns empty, + # so the template falls back to replicas: 0 (fresh install behavior). 
+ # On a real cluster, lookup preserves the existing replica count for zero-downtime upgrades. + - it: should set replicas to 0 on fresh install when operator is enabled with database datastore + set: + operator.enabled: true + migration.enabled: true + replicaCount: 3 + datastore.engine: postgres + asserts: + - equal: + path: spec.replicas + value: 0 + + - it: should set replicas to 1 when operator is enabled with memory datastore + set: + operator.enabled: true + migration.enabled: true + replicaCount: 5 + datastore.engine: memory + asserts: + - equal: + path: spec.replicas + value: 1 + + - it: should set replicas to replicaCount when operator is disabled + set: + operator.enabled: false + replicaCount: 5 + datastore.engine: postgres + asserts: + - equal: + path: spec.replicas + value: 5 + + # --- Autoscaling conflict --- + - it: should fail when operator and autoscaling are both enabled + set: + operator.enabled: true + migration.enabled: true + autoscaling.enabled: true + datastore.engine: postgres + asserts: + - failedTemplate: + errorMessage: "operator.enabled and autoscaling.enabled cannot both be true" + + # --- initContainers gating --- + - it: should not render migration initContainers when operator is enabled + set: + operator.enabled: true + migration.enabled: true + datastore.engine: postgres + datastore.uri: "postgres://localhost/openfga" + datastore.applyMigrations: true + datastore.waitForMigrations: true + datastore.migrationType: job + asserts: + - isNull: + path: spec.template.spec.initContainers + + - it: should render migration initContainers when operator is disabled + set: + operator.enabled: false + datastore.engine: postgres + datastore.uri: "postgres://localhost/openfga" + datastore.applyMigrations: true + datastore.waitForMigrations: true + datastore.migrationType: job + asserts: + - isNotNull: + path: spec.template.spec.initContainers + + # --- Pod template labels --- + # The pod template must carry the full common label set 
(helm.sh/chart, + # component, version, managed-by, part-of) — not just selectorLabels — + # so logging/monitoring tooling that filters on these labels keeps working + # across upgrades. Regression guard for the operator-migration branch. + - it: should include common labels on pod template metadata when operator is disabled + set: + operator.enabled: false + asserts: + - isNotEmpty: + path: spec.template.metadata.labels["helm.sh/chart"] + - equal: + path: spec.template.metadata.labels["app.kubernetes.io/component"] + value: authorization-controller + - equal: + path: spec.template.metadata.labels["app.kubernetes.io/part-of"] + value: openfga + + - it: should include common labels on pod template metadata when operator is enabled + set: + operator.enabled: true + migration.enabled: true + datastore.engine: postgres + asserts: + - isNotEmpty: + path: spec.template.metadata.labels["helm.sh/chart"] + - equal: + path: spec.template.metadata.labels["app.kubernetes.io/component"] + value: authorization-controller + - equal: + path: spec.template.metadata.labels["app.kubernetes.io/part-of"] + value: openfga diff --git a/charts/openfga/values.schema.json b/charts/openfga/values.schema.json index 151cc21a..4fb19a27 100644 --- a/charts/openfga/values.schema.json +++ b/charts/openfga/values.schema.json @@ -1289,6 +1289,58 @@ "type": "boolean", "description": "This value is not used by this chart, but allows a common pattern of enabling/disabling subchart dependencies (where OpenFGA is a subchart)", "default": false + }, + "operator": { + "type": "object", + "description": "Controls the openfga-operator subchart. 
When enabled, migration is managed by the operator instead of the Helm job hook.", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable the openfga-operator subchart for operator-managed migrations", + "default": false + } + }, + "additionalProperties": false + }, + "openfga-operator": { + "type": "object", + "description": "Values passed through to the openfga-operator subchart (validated by that chart's own schema)" + }, + "migration": { + "type": "object", + "description": "Controls operator-driven migration behavior. Only used when operator.enabled is true.", + "properties": { + "enabled": { + "type": "boolean", + "description": "Enable operator-managed database migrations", + "default": true + }, + "serviceAccount": { + "type": "object", + "properties": { + "create": { + "type": "boolean", + "description": "Create a dedicated service account for migration Jobs", + "default": true + }, + "annotations": { + "type": "object", + "description": "Annotations to add to the migration service account", + "additionalProperties": { + "type": "string" + }, + "default": {} + }, + "name": { + "type": "string", + "description": "The name of the migration service account. Defaults to {fullname}-migration. Must be set explicitly when create=false and a dedicated migration SA is desired; leave empty to skip the annotation entirely.", + "default": "" + } + }, + "additionalProperties": false + } + }, + "additionalProperties": false } }, "additionalProperties": false diff --git a/charts/openfga/values.yaml b/charts/openfga/values.yaml index 75fa19d9..fcca5b09 100644 --- a/charts/openfga/values.yaml +++ b/charts/openfga/values.yaml @@ -385,6 +385,45 @@ testContainerSpec: {} # -- Array of extra K8s manifests to deploy ## Note: Supports use of custom Helm templates extraObjects: [] + +# -- operator controls the openfga-operator subchart. +# When enabled, migration is managed by the operator instead of the Helm job hook. 
+operator: + enabled: false + +# -- Values passed to the openfga-operator subchart (when operator.enabled is true). +# See charts/openfga-operator/values.yaml for all available options. +openfga-operator: {} + # migrationJob: + # backoffLimit: 3 + # activeDeadlineSeconds: 300 + # ttlSecondsAfterFinished: 300 + # leaderElection: + # enabled: true + # watchNamespace: "" + # resources: + # requests: + # cpu: 10m + # memory: 64Mi + # limits: + # memory: 128Mi + +# -- migration controls operator-driven migration behavior. +# Only used when operator.enabled is true. +migration: + # -- Enable operator-managed migrations. Set to false if you manage migrations externally. + enabled: true + serviceAccount: + # -- Create a dedicated service account for migration Jobs. + # The migration Job inherits env vars (including secretKeyRef) from the OpenFGA container. + # If your datastore secret has RBAC restrictions, ensure this service account can read it. + create: true + # -- Annotations to add to the migration service account. + # Use this to attach cloud IAM roles (e.g., eks.amazonaws.com/role-arn) for DDL permissions. + annotations: {} + # -- The name of the migration service account. + # If not set and create is true, defaults to {fullname}-migration. + name: "" ## Example: Deploy a PostgreSQL instance for dev/test using official Docker images. ## For production, use a managed database service or an operator like CloudnativePG. ## Configure the chart to use the secret: diff --git a/docs/adr/000-template.md b/docs/adr/000-template.md new file mode 100644 index 00000000..2cc78bb7 --- /dev/null +++ b/docs/adr/000-template.md @@ -0,0 +1,48 @@ +# ADR-NNN: Title + +- **Status:** Proposed +- **Date:** YYYY-MM-DD +- **Deciders:** [list of people involved] +- **Related Issues:** # +- **Related ADR:** [ADR-NNN](NNN-filename.md) + +## Context + +What is the problem or situation that motivates this decision? What constraints exist? What forces are at play? 
+ +Include enough background that someone unfamiliar with the project can understand why this decision matters. + +## Decision + +What is the change being proposed or decided? + +### Alternatives Considered + +**A. [Alternative name]** + +[Description of the alternative] + +*Pros:* ... +*Cons:* ... + +**B. [Alternative name]** + +[Description of the alternative] + +*Pros:* ... +*Cons:* ... + +## Consequences + +### Positive + +- What improves as a result of this decision? + +### Negative + +- What gets harder, more complex, or more costly? + +### Risks + +- What assumptions might prove false? +- What could go wrong? diff --git a/docs/adr/001-adopt-openfga-operator.md b/docs/adr/001-adopt-openfga-operator.md new file mode 100644 index 00000000..ca845537 --- /dev/null +++ b/docs/adr/001-adopt-openfga-operator.md @@ -0,0 +1,95 @@ +# ADR-001: Adopt a Kubernetes Operator for OpenFGA Lifecycle Management + +- **Status:** Proposed +- **Date:** 2026-04-06 +- **Deciders:** OpenFGA Helm Charts maintainers +- **Related Issues:** #211, #107, #120, #100, #95, #126, #132, #143, #144 + +## Context + +The OpenFGA Helm chart currently handles all lifecycle concerns — deployment, configuration, database migrations, and secret management — through Helm templates and hooks. This approach works for simple installations but breaks down in several important scenarios: + +1. **Database migrations rely on Helm hooks**, which are incompatible with GitOps tools (ArgoCD, FluxCD) and Helm's own `--wait` flag. This is the single biggest pain point for users, accounting for 6 open issues (#211, #107, #120, #100, #95, #126). + +2. **Store provisioning, authorization model updates, and tuple management** are runtime operations that happen through the OpenFGA API. There is no declarative, GitOps-native way to manage these. Teams must use imperative scripts, CI pipelines, or manual API calls to set up stores and push models after deployment. + +3. 
**The migration init container** depends on `groundnuty/k8s-wait-for`, an unmaintained image with known CVEs, pinned by mutable tag (#132, #144). + +4. **Migration and runtime workloads share a single ServiceAccount**, violating least-privilege when cloud IAM-based database authentication (AWS IRSA, GCP Workload Identity) maps the ServiceAccount directly to a database role (#95). + +### Alternatives Considered + +**A. Fix migrations within the Helm chart (no operator)** + +- Strip Helm hook annotations from the migration Job by default, rendering it as a regular resource. +- Replace `k8s-wait-for` with a shell-based init container that polls the database schema version directly. +- Add a separate ServiceAccount for the migration Job. + +*Pros:* Lower complexity, no new component to maintain. +*Cons:* Doesn't solve the ordering problem cleanly — the Job and Deployment are created simultaneously, requiring an init container to gate startup. Still requires an image or script to poll. Doesn't address store/model/tuple lifecycle at all. + +**B. Recommend initContainer mode as default** + +- Change `datastore.migrationType` default from `"job"` to `"initContainer"`, running migrations inside each pod. + +*Pros:* No separate Job, no hooks, no `k8s-wait-for`. +*Cons:* Every pod runs migrations on startup (wasteful). Rolling updates trigger redundant migrations. Crash-loops on migration failure. Still shares ServiceAccount. No path to store lifecycle management. + +**C. Build an operator (selected)** + +- A Kubernetes operator manages migrations as internal reconciliation logic and exposes CRDs for store, model, and tuple lifecycle. + +*Pros:* Solves all migration issues. Enables GitOps-native authorization management. Follows established Kubernetes patterns (CNPG, Strimzi, cert-manager). Separates concerns cleanly. +*Cons:* Significant development and maintenance investment. New component to deploy and monitor. Learning curve for contributors. + +**D. 
External migration tool (e.g., Flyway, golang-migrate)** + +- Remove migrations from the chart entirely and document using an external tool. + +*Pros:* Simplifies the chart completely. +*Cons:* Shifts complexity to the user. Every user must build their own migration pipeline. No standard approach across the community. + +## Decision + +We will build an **OpenFGA Kubernetes Operator** that handles: + +1. **Database migration orchestration** (Stage 1) — replacing Helm hooks, the `k8s-wait-for` init container, and shared ServiceAccount with operator-managed migration Jobs and deployment readiness gating. + +2. **Declarative store lifecycle management** (Stages 2-4) — exposing `FGAStore`, `FGAModel`, and `FGATuples` CRDs for GitOps-native authorization configuration. + +The operator will be: +- Written in Go using `controller-runtime` / kubebuilder +- Distributed as a Helm subchart dependency of the main OpenFGA chart +- Optional — users who don't need it can set `operator.enabled: false` and fall back to the existing behavior + +Development will follow a staged approach to deliver value incrementally: + +| Stage | Scope | Outcome | +|-------|-------|---------| +| 1 | Operator scaffolding + migration handling | All 6 migration issues resolved | +| 2 | `FGAStore` CRD | Declarative store provisioning | +| 3 | `FGAModel` CRD | Declarative authorization model management | +| 4 | `FGATuples` CRD | Declarative tuple management | + +## Consequences + +### Positive + +- **Resolves all 6 migration issues** (#211, #107, #120, #100, #95, #126) and related dependency issues (#132, #144) +- **Eliminates `k8s-wait-for` dependency** — removes an unmaintained, CVE-carrying image from the supply chain +- **Enables GitOps-native authorization management** — stores, models, and tuples become declarative Kubernetes resources that ArgoCD/FluxCD can sync +- **Enforces least-privilege** — separate ServiceAccounts for migration (DDL) and runtime (CRUD) +- **Simplifies the Helm chart** — 
removes migration Job template, init container logic, RBAC for job-status-reading, and hook annotations +- **Follows Kubernetes ecosystem conventions** — operators are the standard pattern for managing stateful application lifecycle + +### Negative + +- **New component to maintain** — the operator is a full Go project with its own release cycle, CI, testing, and CVE surface +- **Increased deployment footprint** — an additional pod running in the cluster (though resource requirements are minimal: ~50m CPU, ~64Mi memory) +- **Learning curve** — contributors need to understand controller-runtime patterns to modify the operator +- **CRD management complexity** — Helm does not upgrade or delete CRDs; users may need to apply CRD manifests separately on operator upgrades + +### Neutral + +- **Backward compatibility preserved** — the `operator.enabled: false` fallback maintains the existing Helm hook behavior for users who haven't migrated +- **No change for memory-datastore users** — users running with `datastore.engine: memory` are unaffected (no migrations, no operator needed) diff --git a/docs/adr/002-operator-managed-migrations.md b/docs/adr/002-operator-managed-migrations.md new file mode 100644 index 00000000..1f0dc741 --- /dev/null +++ b/docs/adr/002-operator-managed-migrations.md @@ -0,0 +1,241 @@ +# ADR-002: Replace Helm Hook Migrations with Operator-Managed Migrations + +- **Status:** Proposed +- **Date:** 2026-04-06 +- **Deciders:** OpenFGA Helm Charts maintainers +- **Related ADR:** [ADR-001](001-adopt-openfga-operator.md) +- **Related Issues:** #211, #107, #120, #100, #95, #126, #132, #144 + +## Context + +### How Migrations Work Today + +The current Helm chart uses a **Helm hook Job** to run database migrations (`openfga migrate`) and a **`k8s-wait-for` init container** on the Deployment to block server startup until the migration completes. 
+ +Seven files are involved: + +| File | Role | +|------|------| +| `templates/job.yaml` | Migration Job with Helm hook annotations | +| `templates/deployment.yaml` | OpenFGA Deployment + `wait-for-migration` init container | +| `templates/serviceaccount.yaml` | Shared ServiceAccount (migration + runtime) | +| `templates/rbac.yaml` | Role + RoleBinding so init container can poll Job status | +| `templates/_helpers.tpl` | Datastore environment variable helpers | +| `values.yaml` | `datastore.*`, `migrate.*`, `initContainer.*` configuration | +| `Chart.yaml` | `bitnami/common` dependency for migration sidecars | + +**The migration Job** (`templates/job.yaml`) is annotated as a Helm hook: + +```yaml +annotations: + "helm.sh/hook": post-install,post-upgrade,post-rollback,post-delete + "helm.sh/hook-delete-policy": before-hook-creation + "helm.sh/hook-weight": "1" +``` + +This means Helm manages it outside the normal release lifecycle — it only runs after Helm finishes creating/upgrading all other resources. + +**The wait-for init container** blocks the Deployment pods from starting: + +```yaml +initContainers: + - name: wait-for-migration + image: "groundnuty/k8s-wait-for:v2.0" + args: ["job-wr", "openfga-migrate"] +``` + +It polls the Kubernetes API (`GET /apis/batch/v1/.../jobs/openfga-migrate`) until `.status.succeeded >= 1`. This requires RBAC permissions (Role/RoleBinding for `batch/jobs` `get`/`list`). + +**The alternative mode** (`datastore.migrationType: initContainer`) runs migration directly inside each Deployment pod as an init container, avoiding hooks entirely but introducing redundant migration runs across replicas. + +### The Six Issues + +| Issue | Tool | Root Cause | +|-------|------|-----------| +| **#211** | ArgoCD | ArgoCD ignores Helm hook annotations. The migration Job is never created as a managed resource. The init container waits forever for a Job that doesn't exist. | +| **#107** | ArgoCD | Same root cause. 
The Job is invisible in ArgoCD's UI — users can't see, debug, or manually sync it. | +| **#120** | Helm `--wait` | Circular deadlock. Helm waits for the Deployment to be ready before running post-install hooks. The Deployment is never ready because the init container waits for the hook Job. The Job never runs because Helm is waiting. | +| **#100** | FluxCD | FluxCD waits for all resources by default. The `hook-delete-policy: before-hook-creation` removes the completed Job before FluxCD can confirm the Deployment is healthy. | +| **#95** | AWS IRSA | Migration and runtime share a ServiceAccount. With IAM-based DB auth, the runtime gets DDL permissions it doesn't need (CREATE TABLE, ALTER TABLE). | +| **#126** | All | The `k8s-wait-for` image is configured in two separate places in `values.yaml`, leading to inconsistency. Related: #132 (image unmaintained, has CVEs) and #144 (pinned by mutable tag). | + +### Why Helm Hooks Are Fundamentally Wrong for This + +Helm hooks are a **deploy-time orchestration mechanism**. They assume Helm is the active agent running the deployment. GitOps tools (ArgoCD, FluxCD) break this assumption — they render the chart to manifests and apply them declaratively. The hook annotations are either ignored (ArgoCD) or cause ordering/cleanup conflicts (FluxCD). + +This is not a bug in ArgoCD or FluxCD. It is a fundamental mismatch between Helm's imperative hook model and the declarative GitOps model. + +## Decision + +Replace the Helm hook migration Job and `k8s-wait-for` init container with **operator-managed migrations** as part of Stage 1 of the OpenFGA Operator (see [ADR-001](001-adopt-openfga-operator.md)). + +### How It Works + +The operator runs a **migration controller** that reconciles the OpenFGA Deployment: + +``` +┌──────────────────────────────────────────────────────────┐ +│ Operator Reconciliation │ +│ │ +│ 1. Read Deployment → extract image tag (e.g. v1.14.0) │ +│ 2. 
Read ConfigMap/openfga-migration-status │ +│ └── "Last migrated version: v1.13.0" │ +│ 3. Versions differ → migration needed │ +│ 4. Create Job/openfga-migrate │ +│ ├── ServiceAccount: openfga-migrator (DDL perms) │ +│ ├── Image: openfga/openfga:v1.14.0 │ +│ ├── Args: ["migrate"] │ +│ └── ttlSecondsAfterFinished: 300 │ +│ 5. Watch Job until succeeded │ +│ 6. Update ConfigMap → "version: v1.14.0" │ +│ 7. Ensure Deployment at desired replicas │ +│ (fresh install: 0 → N; upgrade: already running) │ +│ 8. New pods pass readiness, serve requests │ +└──────────────────────────────────────────────────────────┘ +``` + +**Key design decisions within this approach:** + +#### Zero-downtime upgrades via lookup and readiness gating + +On **fresh install**, the Helm chart renders the Deployment with `replicas: 0` (no existing Deployment found via `lookup`). The operator runs the migration Job and scales the Deployment to the desired replica count afterward. + +On **upgrade**, the chart uses Helm's `lookup` function to read the current replica count from the live Deployment and preserves it. Kubernetes starts a rolling update with the new image. OpenFGA has a **built-in schema version gate**: on startup, each instance calls `IsReady()` which checks the database schema revision against `MinimumSupportedDatastoreSchemaRevision` (via goose). If the schema is behind, the gRPC health endpoint returns `NOT_SERVING`, the readiness probe fails, and Kubernetes does not route traffic to the pod. Old pods continue serving on the migrated schema (OpenFGA migrations are additive/backward-compatible — this is how the existing Helm hook flow has operated for years with rolling updates). Once the operator's migration Job completes, new pods pass readiness and the rolling update proceeds. + +This matches the existing zero-downtime behavior of the non-operator chart. 
The previous approach (always starting at `replicas: 0`) introduced a full outage on every `helm upgrade` — even for config-only changes — which was a regression from the existing rolling update model. + +**`lookup` caveat:** `helm template` and `--dry-run=client` cannot query the cluster, so `lookup` returns empty and the template falls back to `replicas: 0`. This is correct for CI rendering (no live cluster) and does not affect real installs/upgrades. `--dry-run=server` works correctly. + +#### Version tracking via ConfigMap + +A ConfigMap (`openfga-migration-status`) records the last successfully migrated version. The operator compares this to the Deployment's image tag to determine if migration is needed. This is: +- Simple to inspect (`kubectl get configmap openfga-migration-status -o yaml`) +- Survives operator restarts +- Can be manually deleted to force re-migration + +#### Separate ServiceAccount for migrations + +The operator creates a dedicated `openfga-migrator` ServiceAccount for migration Jobs. Users can annotate it with cloud IAM roles that grant DDL permissions, while the runtime ServiceAccount retains only CRUD permissions. + +#### Migration Job is a regular resource + +The Job created by the operator has no Helm hook annotations. It is a standard Kubernetes Job, visible to ArgoCD, FluxCD, and all Kubernetes tooling. It has an owner reference to the operator's managed resource for proper garbage collection. + +#### Failure handling + +| Failure | Behavior | +|---------|----------| +| Job fails | Operator sets `MigrationFailed` condition on Deployment. Does NOT scale up. User inspects Job logs. | +| Job hangs | `activeDeadlineSeconds` (default 300s) kills it. Operator sees failure. | +| Operator crashes | On restart, re-reads ConfigMap and Job status. Resumes from where it left off. | +| Database unreachable | Job fails to connect. 
After exhausting `backoffLimit`, operator deletes the failed Job, sets a `retry-after` annotation, and recreates a fresh Job after a fixed 60-second cooldown. Cycle repeats until the database becomes available. | + +### Sequence Comparison + +**Before (Helm hooks):** + +``` +helm install + ├── Create ServiceAccount, RBAC, Secret, Service + ├── Create Deployment (with wait-for-migration init container) + │ └── Pod starts → init container polls for Job → waits... + ├── [Helm finishes regular resources] + ├── Run post-install hooks: + │ └── Create Job/openfga-migrate → runs openfga migrate + │ └── Job succeeds + ├── Init container sees Job succeeded → exits + └── Main container starts +``` + +Problems: ArgoCD skips step 4. FluxCD deletes Job in step 4. `--wait` deadlocks between steps 2 and 4. + +**After (operator-managed, fresh install):** + +``` +helm install + ├── Create ServiceAccount (runtime), ServiceAccount (migrator) + ├── Create Secret, Service + ├── Create Deployment (replicas: 0 via lookup fallback, no init containers) + ├── Create Operator Deployment + └── [Helm is done — all resources are regular, no hooks] + +Operator starts: + ├── Detects Deployment image version + ├── No migration status ConfigMap → migration needed + ├── Creates Job/openfga-migrate (regular Job, no hooks) + │ └── Uses openfga-migrator ServiceAccount + │ └── Runs openfga migrate → succeeds + ├── Creates ConfigMap with migrated version + └── Scales Deployment 0 → 3 replicas → pods start +``` + +**After (operator-managed, upgrade with new image):** + +``` +helm upgrade + ├── lookup finds existing Deployment at 3 replicas → preserves replicas: 3 + ├── Patches Deployment with new image tag + ├── Kubernetes starts rolling update + │ ├── New pods (v1.14) start → schema is behind → + │ │ readiness fails (gRPC NOT_SERVING) → no traffic routed + │ └── Old pods (v1.13) continue serving traffic + └── [Helm is done] + +Operator reconciles: + ├── Detects image version differs from ConfigMap + ├── 
Creates Job/openfga-migrate → runs migration + ├── Updates ConfigMap → "version: v1.14.0" + └── New pods pass readiness → rolling update completes + (operator does NOT scale to zero — zero downtime) +``` + +No hooks. No init containers. No `k8s-wait-for`. No downtime on upgrade. All resources are regular Kubernetes objects. + +### What Changes in the Helm Chart + +Nothing is deleted outright — every change is gated on `operator.enabled` so the legacy flow remains the default for backward compatibility. + +**Gated on `operator.enabled: false` (legacy Helm-hook flow, rendered when the operator is disabled):** + +| File/Section | Behavior when operator is enabled | +|--------------|-----------------------------------| +| `templates/job.yaml` | Skipped — operator creates migration Jobs dynamically | +| `templates/rbac.yaml` | Skipped — no init container needs to poll Job status | +| `values.yaml`: `initContainer.*` | Unused — `k8s-wait-for` not deployed | +| `values.yaml`: `datastore.migrationType`, `datastore.waitForMigrations` | Unused — operator always uses a Job and handles ordering | +| `values.yaml`: `migrate.annotations` | Unused — no Helm hooks | +| Deployment migration init containers | Skipped — operator manages readiness via replica scaling | + +**Added (active only when `operator.enabled: true`):** + +| File/Section | Purpose | +|--------------|---------| +| `values.yaml`: `operator.enabled` | Toggle the operator subchart | +| `values.yaml`: `migration.serviceAccount.*` | Separate ServiceAccount for migration Jobs | +| `values.yaml`: `migration.backoffLimit`, `activeDeadlineSeconds`, `ttlSecondsAfterFinished` | Migration Job configuration | +| `templates/serviceaccount.yaml`: second SA | Migration ServiceAccount | +| `charts/openfga-operator/` | Operator subchart (conditional dependency) | + +Users on `operator.enabled: false` (the default) see identical rendered output to the pre-operator chart, so gradual adoption is possible with no forced migration. 
+ +## Consequences + +### Positive + +- **All 6 migration issues resolved** — no Helm hooks means no ArgoCD/FluxCD/`--wait` incompatibility +- **`k8s-wait-for` eliminated** — removes an unmaintained image with CVEs from the supply chain (#132, #144) +- **Least-privilege enforced** — separate ServiceAccounts for migration (DDL) and runtime (CRUD) (#95) +- **Runtime surface area reduced** — when `operator.enabled: true`, the legacy migration Job, init-container `k8s-wait-for` logic, and job-watching RBAC are skipped from the rendered manifest +- **Migration is observable** — Job is a regular resource visible in all tools; ConfigMap records migration history; operator conditions surface errors +- **Idempotent and crash-safe** — operator can restart at any point and resume correctly + +### Negative + +- **Operator is a new runtime dependency** — if the operator pod is unavailable, migrations don't run (but existing running pods are unaffected) +- **`lookup` limitation** — `helm template` and `--dry-run=client` cannot query the cluster; the template falls back to `replicas: 0` in these contexts. This does not affect real installs/upgrades. +- **Two upgrade paths to document** — `operator.enabled: true` (new) vs `operator.enabled: false` (legacy) + +### Risks + +- **Readiness gate relies on OpenFGA's built-in schema check** — the zero-downtime upgrade model depends on `MinimumSupportedDatastoreSchemaRevision` in `pkg/storage/sqlcommon/sqlcommon.go` causing `NOT_SERVING` when the schema is behind. If a future OpenFGA release removes or weakens this check, new pods could serve traffic against an unmigrated schema. This coupling should be documented and monitored across OpenFGA releases. +- **ConfigMap as state store** — if the ConfigMap is accidentally deleted, the operator re-runs migration (which is safe — `openfga migrate` is idempotent). This is a feature, not a bug, but should be documented. 
diff --git a/docs/adr/README.md b/docs/adr/README.md new file mode 100644 index 00000000..d6b3445e --- /dev/null +++ b/docs/adr/README.md @@ -0,0 +1,178 @@ +# Architecture Decision Records + +This directory contains Architecture Decision Records (ADRs) for the OpenFGA Helm Charts project. + +ADRs are short documents that capture significant architectural decisions along with their context, alternatives considered, and consequences. They serve as a decision log — not a living design doc, but a point-in-time record of *why* a decision was made. + +We follow the format described by [Michael Nygard](https://cognitect.com/blog/2011/11/15/documenting-architecture-decisions). + +## Index + +| ADR | Title | Status | Date | +|-----|-------|--------|------| +| [ADR-001](001-adopt-openfga-operator.md) | Adopt a Kubernetes Operator for OpenFGA Lifecycle Management | Proposed | 2026-04-06 | +| [ADR-002](002-operator-managed-migrations.md) | Replace Helm Hook Migrations with Operator-Managed Migrations | Proposed | 2026-04-06 | + +--- + +## What is an ADR? + +An ADR captures a single architectural decision. It records: + +- **What** was decided +- **Why** it was decided (the context and constraints at the time) +- **What alternatives** were considered and why they were rejected +- **What consequences** follow from the decision (positive, negative, and neutral) + +ADRs are **immutable once accepted** — if a decision changes, you write a new ADR that supersedes the old one rather than editing it. This preserves the history of *why* things changed over time. + +## ADR Lifecycle + +```text +Proposed → Accepted → (optionally) Superseded or Deprecated + ↑ + │ feedback loop + │ + Discussion +``` + +### Statuses + +| Status | Meaning | +|--------|---------| +| **Proposed** | The ADR has been written and is open for discussion. No commitment has been made. | +| **Accepted** | The decision has been agreed upon by maintainers. Implementation can proceed. 
| +| **Deprecated** | The decision is no longer relevant (e.g., the feature was removed). | +| **Superseded by ADR-XXX** | A newer ADR has replaced this decision. The old ADR links to the new one. | + +## How to Propose an ADR + +1. **Create a branch** — e.g., `docs/adr-005-my-decision` + +2. **Copy the template** — use `000-template.md` as a starting point + +3. **Write the ADR** — fill in Context, Decision, and Consequences. Focus on *why*, not *how*. The most valuable part is the Alternatives Considered section — it shows reviewers what you evaluated and why you chose this path. + +4. **Assign a number** — use the next sequential number. Check the index above. + +5. **Open a pull request** — the PR is where discussion happens. Title it: `ADR-005: <Short Title>` + +6. **Add to the index** — update the table in this README with the new entry (status: Proposed) + +### Proposing related ADRs together + +When multiple ADRs are part of a single cohesive proposal — e.g., a foundational decision and several downstream decisions that depend on it — they can be submitted in a single PR. This lets reviewers see the full picture instead of bouncing between separate PRs. + +When doing this: + +- **Explain the relationship in the PR description** — identify which ADR is the foundational decision and which are downstream. For example: "ADR-001 is the core decision to build an operator. ADR-002 is a downstream decision about how the operator handles migrations." +- **Each ADR can be accepted or rejected independently** — a reviewer might approve the foundational decision but push back on a downstream one. If that happens, split the PR: merge the accepted ADRs and keep the contested ones open for further discussion. +- **Keep each ADR self-contained** — even though they're in the same PR, each ADR should stand on its own. A reader should be able to understand a downstream ADR without reading the foundational one first (though they may reference each other). 
+ +## How to Give Feedback on an ADR + +ADR review happens in the **pull request**, not by editing the ADR directly. This keeps the discussion visible and linked to the decision. + +### As a reviewer + +- **Comment on the PR** — ask questions, challenge assumptions, suggest alternatives. Good review questions: + - "Did you consider X as an alternative?" + - "What happens if Y fails?" + - "This conflicts with how we do Z — can you address that?" + - "I agree with the decision but the consequence about X should mention Y" + +- **Request changes** if you believe the decision is wrong or incomplete + +- **Approve** when you're satisfied the decision is sound and well-documented + +### As the author responding to feedback + +- **Update the ADR in the PR** based on feedback: + - Add alternatives that reviewers suggested (with your evaluation of them) + - Expand the Consequences section if reviewers identified impacts you missed + - Clarify the Context if reviewers were confused about the problem + - Adjust the Decision if feedback reveals a better approach + +- **Do NOT delete feedback-driven changes** — if a reviewer raised a valid alternative and you addressed it, the ADR is stronger for including it + +- **Resolve PR comments** as you address them so reviewers can track progress + +### Reaching consensus + +- ADRs move to **Accepted** when maintainers approve the PR +- Not every maintainer needs to approve — follow the project's normal review standards +- If consensus can't be reached, escalate to a synchronous discussion (meeting, call) and record the outcome in the PR +- Disagreement is fine — document it in the Consequences section as a risk or trade-off rather than hiding it + +## How to Supersede an ADR + +When a decision needs to change: + +1. **Do NOT edit the original ADR** — it's a historical record + +2. **Write a new ADR** that references the old one: + ```markdown + - **Supersedes:** [ADR-002](002-operator-managed-migrations.md) + ``` + +3. 
**Update the old ADR's status** — change it to: + ```markdown + - **Status:** Superseded by [ADR-007](007-new-approach.md) + ``` + +4. **Update the index** in this README + +This way, anyone reading ADR-002 knows it's been replaced and can follow the link to understand what changed and why. + +## ADR Format + +Every ADR follows this structure: + +```markdown +# ADR-NNN: Title + +- **Status:** Proposed | Accepted | Deprecated | Superseded by ADR-XXX +- **Date:** YYYY-MM-DD +- **Deciders:** Who was involved in the decision +- **Related Issues:** GitHub issue references +- **Related ADR:** Links to related ADRs + +## Context + +What is the problem or situation that motivates this decision? +Include enough background that someone unfamiliar with the project +can understand why this decision matters. + +## Decision + +What is the decision and why was it chosen? + +### Alternatives Considered + +What other options were evaluated? Why were they rejected? +This is often the most valuable section — it prevents future +contributors from re-proposing rejected approaches. + +## Consequences + +### Positive +What improves as a result of this decision? + +### Negative +What gets harder or more complex? Be honest — every decision has costs. + +### Risks +What could go wrong? What assumptions might prove false? +``` + +## Template + +A blank template is available at [000-template.md](000-template.md). + +## Tips for Writing Good ADRs + +- **Keep it short** — an ADR is one decision, not a design doc. If it's longer than 2-3 pages, consider splitting it. +- **Focus on why, not how** — implementation details change; the reasoning behind the decision is what matters long-term. +- **Be honest about trade-offs** — an ADR that lists only positive consequences isn't credible. Every decision has costs. +- **Write for your future self** — in 18 months, you won't remember why you chose this. The ADR should tell you. 
+- **Not every decision needs an ADR** — use ADRs for decisions that are hard to reverse, affect multiple components, or where the reasoning isn't obvious from the code. diff --git a/operator/.dockerignore b/operator/.dockerignore new file mode 100644 index 00000000..3efb8a0e --- /dev/null +++ b/operator/.dockerignore @@ -0,0 +1,6 @@ +**/.git +**/.gitignore +**/README.md +**/LICENSE +**/Makefile +**/.dockerignore diff --git a/operator/Dockerfile b/operator/Dockerfile new file mode 100644 index 00000000..034e22d0 --- /dev/null +++ b/operator/Dockerfile @@ -0,0 +1,25 @@ +# pinned multi-arch index for golang:1.26.2 (linux/amd64, linux/arm64, ...) +FROM --platform=$BUILDPLATFORM golang:1.26.2@sha256:5f3787b7f902c07c7ec4f3aa91a301a3eda8133aa32661a3b3a3a86ab3a68a36 AS builder + +# buildx provides these automatically; declare so Go cross-compiles to the +# requested target instead of the build host's arch. +ARG TARGETOS +ARG TARGETARCH + +WORKDIR /workspace +COPY go.mod go.sum ./ +RUN go mod download + +COPY cmd/ cmd/ +COPY internal/ internal/ + +RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} \ + go build -ldflags="-s -w" -o /operator ./cmd/ + +# pinned multi-arch index for gcr.io/distroless/static:nonroot +FROM gcr.io/distroless/static:nonroot@sha256:e3f945647ffb95b5839c07038d64f9811adf17308b9121d8a2b87b6a22a80a39 +WORKDIR / +COPY --from=builder /operator . +USER 65532:65532 + +ENTRYPOINT ["/operator"] diff --git a/operator/Makefile b/operator/Makefile new file mode 100644 index 00000000..4b97c0cb --- /dev/null +++ b/operator/Makefile @@ -0,0 +1,27 @@ +IMG ?= openfga/openfga-operator:dev + +.PHONY: build test vet fmt lint docker-build docker-push clean + +build: + mkdir -p bin + go build -o bin/operator ./cmd/ + +test: + go test ./... -v + +vet: + go vet ./... + +fmt: + go fmt ./... + +lint: vet fmt + +docker-build: + docker build -t $(IMG) . 
+ +docker-push: + docker push $(IMG) + +clean: + rm -rf bin/ diff --git a/operator/README.md b/operator/README.md new file mode 100644 index 00000000..c9efbe8e --- /dev/null +++ b/operator/README.md @@ -0,0 +1,134 @@ +# OpenFGA Operator + +A Kubernetes operator that manages database migrations for OpenFGA deployments. Instead of relying on Helm hooks and init containers, the operator watches OpenFGA Deployments, detects version changes, and orchestrates migrations as regular Jobs. + +This is **Stage 1** of the operator — focused solely on migration orchestration. See [ADR-001](../docs/adr/001-adopt-openfga-operator.md) for the full roadmap. + +## How It Works + +1. The operator watches Deployments **in its own namespace** labeled `app.kubernetes.io/part-of: openfga` and `app.kubernetes.io/component: authorization-controller` +2. When a version change is detected (comparing the container image tag to the `{name}-migration-status` ConfigMap), the operator: + - Keeps the Deployment at 0 replicas + - Creates a migration Job running `openfga migrate` + - Waits for the Job to complete + - Updates the ConfigMap with the new version + - Scales the Deployment up to the desired replica count +3. On failure, a `MigrationFailed` condition is set on the Deployment and replicas stay at 0 + +## Prerequisites + +- Go 1.26.2+ +- Docker +- Helm 3.6+ +- A Kubernetes cluster (Rancher Desktop, kind, etc.) + +## Development + +### Build + +```bash +cd operator +go build ./... +``` + +### Test + +```bash +go test ./... -v +``` + +### Lint + +```bash +go vet ./... +``` + +### Docker Image + +```bash +docker build -t openfga/openfga-operator:dev . +``` + +## Local Testing + +Integration test values and instructions are in [`tests/`](tests/). 
Three scenarios are provided: + +| Scenario | Values File | What It Tests | +|----------|-------------|---------------| +| Happy path | `tests/values-happy-path.yaml` | Full lifecycle: Postgres up, migration succeeds, OpenFGA scales to 3/3 | +| DB outage & recovery | `tests/values-db-outage.yaml` | Postgres starts at 0 replicas; scale it up later to verify self-healing | +| No database | `tests/values-no-db.yaml` | Permanent failure: operator retries without crashing, app stays at 0 | + +Quick start: + +```bash +# 1. Build the operator image +cd operator +docker build -t openfga/openfga-operator:dev . + +# 2. Update chart dependencies +cd .. +helm dependency update charts/openfga + +# 3. Run the happy-path test +kubectl create namespace openfga-test +helm install openfga-test charts/openfga -n openfga-test \ + -f operator/tests/values-happy-path.yaml + +# 4. Verify (wait ~30s) +kubectl get all -n openfga-test + +# 5. Clean up +helm uninstall openfga-test -n openfga-test +kubectl delete namespace openfga-test +``` + +See [`tests/README.md`](tests/README.md) for detailed verification steps and all three scenarios. + +## Project Structure + +``` +operator/ +├── cmd/ +│ └── main.go # Entry point, manager setup +├── internal/ +│ └── controller/ +│ ├── migration_controller.go # Reconciliation loop +│ ├── migration_controller_test.go # Unit tests +│ └── helpers.go # Job builder, scaling, ConfigMap helpers +├── Dockerfile # Multi-stage build (distroless runtime) +├── Makefile +├── go.mod +└── go.sum +``` + +## Configuration + +The operator accepts the following flags: + +| Flag | Default | Description | +|------|---------|-------------| +| `--leader-elect` | `false` | Enable leader election so only one replica actively reconciles at a time. Required when running multiple operator replicas for high availability; standby pods wait for the leader's Lease to expire before taking over. Not needed for single-replica deployments. 
| +| `--watch-namespace` | `""` | Namespace to watch for OpenFGA Deployments. Defaults to the operator pod's own namespace (via `POD_NAMESPACE` env var). Each operator instance manages only its own namespace, so multiple independent OpenFGA installations can coexist safely. | +| `--metrics-bind-address` | `:8080` | Address the Prometheus metrics endpoint binds to. Change only if the default port conflicts with other containers in the pod. | +| `--health-probe-bind-address` | `:8081` | Address the Kubernetes liveness and readiness probe endpoints bind to. Change only if the default port conflicts. | +| `--backoff-limit` | `3` | Number of times a migration Job's pod can fail before the Job is considered failed. After hitting this limit the operator deletes the Job, sets a `MigrationFailed` condition on the Deployment, and retries after a 60-second cooldown. | +| `--active-deadline-seconds` | `300` | Maximum wall-clock seconds a migration Job can run before Kubernetes terminates it. Prevents stuck migrations from blocking the pipeline indefinitely. Increase for very large databases. | +| `--ttl-seconds-after-finished` | `300` | Seconds Kubernetes keeps a completed or failed Job (and its pods) before garbage-collecting them, giving you time to inspect logs. | + +When deployed via the Helm subchart, these are configured through `values.yaml`. See `charts/openfga-operator/values.yaml` for all available options. + +## Annotations + +The operator reads these annotations from the OpenFGA Deployment: + +| Annotation | Description | +|------------|-------------| +| `openfga.dev/migration-enabled` | Must be `"true"` for the operator to manage migrations. Deployments without this annotation are ignored. Set by the Helm chart when `operator.enabled` and `migration.enabled` are both true. | +| `openfga.dev/desired-replicas` | The replica count to restore after migration succeeds. Set by the Helm chart. 
| +| `openfga.dev/migration-service-account` | The ServiceAccount to use for migration Jobs. Defaults to the Deployment's SA. | + +## Limitations + +- **Mutable image tags:** The operator detects version changes by comparing the container image tag (or digest). If you deploy with a mutable tag like `latest` or reuse the same tag for different builds, the operator will not detect changes and will skip the migration. Use immutable tags (e.g., `v1.14.0`) or pin images by digest for reliable migration triggering. +- **Migration-specific volumes:** The legacy Helm chart values `migrate.extraVolumes` and `migrate.extraVolumeMounts` have no effect in operator mode. The operator inherits volumes and mounts from the main Deployment pod spec. If you need additional volumes for migrations (e.g., CA bundles or TLS certs), add them to the top-level `extraVolumes` and `extraVolumeMounts` values instead. diff --git a/operator/cmd/main.go b/operator/cmd/main.go new file mode 100644 index 00000000..ac9bac64 --- /dev/null +++ b/operator/cmd/main.go @@ -0,0 +1,123 @@ +package main + +import ( + "flag" + "fmt" + "math" + "os" + + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/healthz" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" + + "github.com/openfga/openfga-operator/internal/controller" +) + +var scheme = runtime.NewScheme() + +func init() { + _ = clientgoscheme.AddToScheme(scheme) +} + +func main() { + var ( + leaderElect bool + watchNamespace string + metricsAddr string + healthProbeAddr string + backoffLimit int + activeDeadline int + ttlAfterFinished int + ) + + flag.BoolVar(&leaderElect, "leader-elect", false, "Enable leader election for the controller manager.") + flag.StringVar(&watchNamespace, "watch-namespace", "", "Namespace to watch. 
Defaults to the operator pod namespace.") + flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") + flag.StringVar(&healthProbeAddr, "health-probe-bind-address", ":8081", "The address the health probe endpoint binds to.") + flag.IntVar(&backoffLimit, "backoff-limit", int(controller.DefaultBackoffLimit), "BackoffLimit for migration Jobs.") + flag.IntVar(&activeDeadline, "active-deadline-seconds", int(controller.DefaultActiveDeadlineSeconds), "ActiveDeadlineSeconds for migration Jobs.") + flag.IntVar(&ttlAfterFinished, "ttl-seconds-after-finished", int(controller.DefaultTTLSecondsAfterFinished), "TTLSecondsAfterFinished for migration Jobs.") + + opts := zap.Options{Development: false} + opts.BindFlags(flag.CommandLine) + flag.Parse() + + // Validate flag values. + for _, v := range []struct { + name string + value int + max int + }{ + {"backoff-limit", backoffLimit, math.MaxInt32}, + {"active-deadline-seconds", activeDeadline, math.MaxInt32}, + {"ttl-seconds-after-finished", ttlAfterFinished, math.MaxInt32}, + } { + if v.value < 0 || v.value > v.max { + fmt.Fprintf(os.Stderr, "invalid value for --%s: must be between 0 and %d\n", v.name, v.max) + os.Exit(1) + } + } + + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + logger := ctrl.Log.WithName("setup") + + // Fall back to the pod's namespace when no explicit scope is set. + if watchNamespace == "" { + if podNS, ok := os.LookupEnv("POD_NAMESPACE"); ok && podNS != "" { + watchNamespace = podNS + logger.Info("defaulting watch scope to pod namespace", "namespace", podNS) + } + } + + // Configure cache namespace restrictions. 
+ var cacheOpts cache.Options + if watchNamespace != "" { + cacheOpts.DefaultNamespaces = map[string]cache.Config{ + watchNamespace: {}, + } + } + + mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ + Scheme: scheme, + Metrics: metricsserver.Options{BindAddress: metricsAddr}, + HealthProbeBindAddress: healthProbeAddr, + LeaderElection: leaderElect, + LeaderElectionID: "openfga-operator-leader", + Cache: cacheOpts, + }) + if err != nil { + logger.Error(err, "unable to create manager") + os.Exit(1) + } + + reconciler := &controller.MigrationReconciler{ + Client: mgr.GetClient(), + BackoffLimit: int32(backoffLimit), + ActiveDeadlineSeconds: int64(activeDeadline), + TTLSecondsAfterFinished: int32(ttlAfterFinished), + } + + if err := reconciler.SetupWithManager(mgr); err != nil { + logger.Error(err, "unable to create controller", "controller", "MigrationReconciler") + os.Exit(1) + } + + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { + logger.Error(err, "unable to set up health check") + os.Exit(1) + } + if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + logger.Error(err, "unable to set up readiness check") + os.Exit(1) + } + + logger.Info("starting manager") + if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { + logger.Error(err, "problem running manager") + os.Exit(1) + } +} diff --git a/operator/go.mod b/operator/go.mod new file mode 100644 index 00000000..8cd2bbe4 --- /dev/null +++ b/operator/go.mod @@ -0,0 +1,66 @@ +module github.com/openfga/openfga-operator + +go 1.26.2 + +require ( + k8s.io/api v0.35.3 + k8s.io/apimachinery v0.35.3 + k8s.io/client-go v0.35.3 + k8s.io/utils v0.0.0-20260319190234-28399d86e0b5 + sigs.k8s.io/controller-runtime v0.23.3 +) + +require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/emicklei/go-restful/v3 v3.12.2 // indirect + github.com/evanphx/json-patch/v5 v5.9.11 
// indirect + github.com/fsnotify/fsnotify v1.9.0 // indirect + github.com/fxamacker/cbor/v2 v2.9.0 // indirect + github.com/go-logr/logr v1.4.3 // indirect + github.com/go-logr/zapr v1.3.0 // indirect + github.com/go-openapi/jsonpointer v0.21.0 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.23.0 // indirect + github.com/google/btree v1.1.3 // indirect + github.com/google/gnostic-models v0.7.0 // indirect + github.com/google/go-cmp v0.7.0 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/josharian/intern v1.0.0 // indirect + github.com/json-iterator/go v1.1.12 // indirect + github.com/mailru/easyjson v0.7.7 // indirect + github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect + github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/prometheus/client_golang v1.23.2 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.66.1 // indirect + github.com/prometheus/procfs v0.16.1 // indirect + github.com/spf13/pflag v1.0.9 // indirect + github.com/x448/float16 v0.8.4 // indirect + go.uber.org/multierr v1.11.0 // indirect + go.uber.org/zap v1.27.0 // indirect + go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/net v0.47.0 // indirect + golang.org/x/oauth2 v0.30.0 // indirect + golang.org/x/sync v0.18.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/term v0.37.0 // indirect + golang.org/x/text v0.31.0 // indirect + golang.org/x/time v0.9.0 // indirect + gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + k8s.io/apiextensions-apiserver 
v0.35.0 // indirect + k8s.io/klog/v2 v2.130.1 // indirect + k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect + sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect + sigs.k8s.io/randfill v1.0.0 // indirect + sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect +) diff --git a/operator/go.sum b/operator/go.sum new file mode 100644 index 00000000..79e74816 --- /dev/null +++ b/operator/go.sum @@ -0,0 +1,171 @@ +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= +github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/evanphx/json-patch v0.5.2 h1:xVCHIVMUu1wtM/VkR9jVZ45N3FhZfYMMYGorLCR8P3k= +github.com/evanphx/json-patch v0.5.2/go.mod h1:ZWS5hhDbVDyob71nXKNL0+PWn6ToqBHMikGIFbs31qQ= +github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= +github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= +github.com/fsnotify/fsnotify v1.9.0 
h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= +github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= +github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= +github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= +github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= +github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= +github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= +github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= +github.com/google/go-cmp v0.7.0 
h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= +github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= +github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug 
v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= +github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= +github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod 
h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= +github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= +github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= +github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= +github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= +github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= 
+github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= +go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= +go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= +go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= +golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= +golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= +golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= +golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= +golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +golang.org/x/time v0.9.0 
h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= +golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= +gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= +gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +k8s.io/api v0.35.3 h1:pA2fiBc6+N9PDf7SAiluKGEBuScsTzd2uYBkA5RzNWQ= +k8s.io/api v0.35.3/go.mod h1:9Y9tkBcFwKNq2sxwZTQh1Njh9qHl81D0As56tu42GA4= +k8s.io/apiextensions-apiserver v0.35.0 h1:3xHk2rTOdWXXJM+RDQZJvdx0yEOgC0FgQ1PlJatA5T4= +k8s.io/apiextensions-apiserver v0.35.0/go.mod h1:E1Ahk9SADaLQ4qtzYFkwUqusXTcaV2uw3l14aqpL2LU= +k8s.io/apimachinery v0.35.3 h1:MeaUwQCV3tjKP4bcwWGgZ/cp/vpsRnQzqO6J6tJyoF8= +k8s.io/apimachinery v0.35.3/go.mod 
h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/client-go v0.35.3 h1:s1lZbpN4uI6IxeTM2cpdtrwHcSOBML1ODNTCCfsP1pg= +k8s.io/client-go v0.35.3/go.mod h1:RzoXkc0mzpWIDvBrRnD+VlfXP+lRzqQjCmKtiwZ8Q9c= +k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= +k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= +k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= +k8s.io/utils v0.0.0-20260319190234-28399d86e0b5 h1:kBawHLSnx/mYHmRnNUf9d4CpjREbeZuxoSGOX/J+aYM= +k8s.io/utils v0.0.0-20260319190234-28399d86e0b5/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= +sigs.k8s.io/controller-runtime v0.23.3 h1:VjB/vhoPoA9l1kEKZHBMnQF33tdCLQKJtydy4iqwZ80= +sigs.k8s.io/controller-runtime v0.23.3/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= +sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= +sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482 h1:2WOzJpHUBVrrkDjU4KBT8n5LDcj824eX0I5UKcgeRUs= +sigs.k8s.io/structured-merge-diff/v6 v6.3.2-0.20260122202528-d9cc6641c482/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= +sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= +sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= diff --git a/operator/internal/controller/helpers.go b/operator/internal/controller/helpers.go new file mode 100644 index 00000000..da1c7179 --- /dev/null +++ b/operator/internal/controller/helpers.go @@ -0,0 +1,274 @@ +package controller + +import ( + "context" + 
"fmt" + "strconv" + "strings" + "time" + + appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + // Labels used to discover OpenFGA Deployments. + LabelPartOf = "app.kubernetes.io/part-of" + LabelComponent = "app.kubernetes.io/component" + + LabelPartOfValue = "openfga" + LabelComponentValue = "authorization-controller" + + // Annotations set on the Deployment by the Helm chart / operator. + AnnotationMigrationEnabled = "openfga.dev/migration-enabled" + AnnotationContainerName = "openfga.dev/container-name" + AnnotationDesiredReplicas = "openfga.dev/desired-replicas" + AnnotationMigrationServiceAccount = "openfga.dev/migration-service-account" + AnnotationRetryAfter = "openfga.dev/migration-retry-after" + + // Defaults for migration Job configuration. + DefaultBackoffLimit int32 = 3 + DefaultActiveDeadlineSeconds int64 = 300 + DefaultTTLSecondsAfterFinished int32 = 300 +) + +// extractImageTag returns the tag portion of a container image reference. +// For "openfga/openfga:v1.14.0" it returns "v1.14.0". +// For "openfga/openfga@sha256:abc..." it returns the digest. +// If there is no tag or digest, it returns "latest". +func extractImageTag(image string) string { + // Handle digest references. + if idx := strings.LastIndex(image, "@"); idx != -1 { + return image[idx+1:] + } + + // Handle tag references — be careful not to split on the port in a registry URL. + // Find the last '/' to isolate the image name from the registry. + lastSlash := strings.LastIndex(image, "/") + nameAndTag := image + if lastSlash != -1 { + nameAndTag = image[lastSlash+1:] + } + + if idx := strings.LastIndex(nameAndTag, ":"); idx != -1 { + return nameAndTag[idx+1:] + } + + return "latest" +} + +// migrationConfigMapName returns the name of the ConfigMap used to track migration state. 
+func migrationConfigMapName(deploymentName string) string { + return deploymentName + "-migration-status" +} + +// migrationJobName returns the name of the migration Job. +func migrationJobName(deploymentName string) string { + return deploymentName + "-migrate" +} + +// findOpenFGAContainer finds the OpenFGA container in the Deployment's pod spec. +// It checks the openfga.dev/container-name annotation first, then looks for a +// container named "openfga". Returns an error if no containers exist or the +// target container is not found. +func findOpenFGAContainer(deployment *appsv1.Deployment) (*corev1.Container, error) { + containers := deployment.Spec.Template.Spec.Containers + if len(containers) == 0 { + return nil, fmt.Errorf("deployment %s/%s has no containers", deployment.Namespace, deployment.Name) + } + + targetName := deployment.Annotations[AnnotationContainerName] + if targetName == "" { + targetName = "openfga" + } + for i := range containers { + if containers[i].Name == targetName { + return &containers[i], nil + } + } + return nil, fmt.Errorf("container %q not found in deployment %s/%s", targetName, deployment.Namespace, deployment.Name) +} + +// buildMigrationJob constructs a migration Job for the given Deployment. +func buildMigrationJob( + deployment *appsv1.Deployment, + mainContainer *corev1.Container, + desiredVersion string, + backoffLimit int32, + activeDeadlineSeconds int64, + ttlSecondsAfterFinished int32, +) *batchv1.Job { + // Determine the migration service account. + migrationSA := deployment.Annotations[AnnotationMigrationServiceAccount] + if migrationSA == "" { + migrationSA = deployment.Spec.Template.Spec.ServiceAccountName + } + + // Sanitize version for use as a label value (must match [a-zA-Z0-9._-], max 63 chars). + // The full version is stored in an annotation for accurate comparison. 
+ labelVersion := strings.ReplaceAll(desiredVersion, ":", "_") + if len(labelVersion) > 63 { + labelVersion = labelVersion[:63] + } + + return &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: migrationJobName(deployment.Name), + Namespace: deployment.Namespace, + Labels: map[string]string{ + LabelPartOf: LabelPartOfValue, + LabelComponent: "migration", + "app.kubernetes.io/managed-by": "openfga-operator", + "app.kubernetes.io/version": labelVersion, + }, + Annotations: map[string]string{ + "openfga.dev/desired-version": desiredVersion, + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: deployment.Name, + UID: deployment.UID, + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + }, + }, + }, + Spec: batchv1.JobSpec{ + BackoffLimit: ptr.To(backoffLimit), + ActiveDeadlineSeconds: ptr.To(activeDeadlineSeconds), + TTLSecondsAfterFinished: ptr.To(ttlSecondsAfterFinished), + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + LabelPartOf: LabelPartOfValue, + LabelComponent: "migration", + }, + }, + Spec: corev1.PodSpec{ + ServiceAccountName: migrationSA, + RestartPolicy: corev1.RestartPolicyNever, + ImagePullSecrets: deployment.Spec.Template.Spec.ImagePullSecrets, + SecurityContext: deployment.Spec.Template.Spec.SecurityContext, + Containers: []corev1.Container{ + { + Name: "migrate-database", + Image: mainContainer.Image, + Args: []string{"migrate"}, + Env: mainContainer.Env, + EnvFrom: mainContainer.EnvFrom, + Resources: mainContainer.Resources, + VolumeMounts: mainContainer.VolumeMounts, + SecurityContext: mainContainer.SecurityContext, + }, + }, + // Inherit volumes and scheduling constraints from the parent Deployment. 
+ Volumes: deployment.Spec.Template.Spec.Volumes, + NodeSelector: deployment.Spec.Template.Spec.NodeSelector, + Tolerations: deployment.Spec.Template.Spec.Tolerations, + Affinity: deployment.Spec.Template.Spec.Affinity, + }, + }, + }, + } +} + +// updateMigrationStatus creates or updates the migration-status ConfigMap. +func updateMigrationStatus( + ctx context.Context, + c client.Client, + deployment *appsv1.Deployment, + version string, + jobName string, +) error { + cmName := migrationConfigMapName(deployment.Name) + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: cmName, + Namespace: deployment.Namespace, + Labels: map[string]string{ + LabelPartOf: LabelPartOfValue, + LabelComponent: "migration", + "app.kubernetes.io/managed-by": "openfga-operator", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: deployment.Name, + UID: deployment.UID, + Controller: ptr.To(true), + BlockOwnerDeletion: ptr.To(true), + }, + }, + }, + Data: map[string]string{ + "version": version, + "migratedAt": time.Now().UTC().Format(time.RFC3339), + "jobName": jobName, + }, + } + + // Try to get existing ConfigMap first. + existing := &corev1.ConfigMap{} + err := c.Get(ctx, client.ObjectKeyFromObject(cm), existing) + if err != nil { + if client.IgnoreNotFound(err) != nil { + return fmt.Errorf("getting migration status ConfigMap: %w", err) + } + // ConfigMap doesn't exist — create it. + if createErr := c.Create(ctx, cm); createErr != nil { + return fmt.Errorf("creating migration status ConfigMap: %w", createErr) + } + return nil + } + + // Update existing ConfigMap (including OwnerReferences in case the Deployment + // was deleted and recreated with a new UID). 
+ existing.Data = cm.Data + existing.Labels = cm.Labels + existing.OwnerReferences = cm.OwnerReferences + if updateErr := c.Update(ctx, existing); updateErr != nil { + return fmt.Errorf("updating migration status ConfigMap: %w", updateErr) + } + return nil +} + +// ensureDeploymentScaled ensures the Deployment is scaled to the desired replica count. +// The desired count is read from the AnnotationDesiredReplicas annotation. +// Returns true if the Deployment was already at the desired scale. +func ensureDeploymentScaled(ctx context.Context, c client.Client, deployment *appsv1.Deployment) (bool, error) { + desiredStr, ok := deployment.Annotations[AnnotationDesiredReplicas] + if !ok || desiredStr == "" { + // No annotation — nothing to do. The Deployment may not have been scaled down yet. + return true, nil + } + + desired, err := strconv.ParseInt(desiredStr, 10, 32) + if err != nil { + return false, fmt.Errorf("parsing desired replicas annotation: %w", err) + } + + desiredInt32 := int32(desired) + current := int32(1) + if deployment.Spec.Replicas != nil { + current = *deployment.Spec.Replicas + } + + if current == desiredInt32 { + return true, nil + } + + patch := client.MergeFrom(deployment.DeepCopy()) + deployment.Spec.Replicas = ptr.To(desiredInt32) + if patchErr := c.Patch(ctx, deployment, patch); patchErr != nil { + return false, fmt.Errorf("scaling deployment to %d replicas: %w", desiredInt32, patchErr) + } + return false, nil +} + diff --git a/operator/internal/controller/migration_controller.go b/operator/internal/controller/migration_controller.go new file mode 100644 index 00000000..ae096b63 --- /dev/null +++ b/operator/internal/controller/migration_controller.go @@ -0,0 +1,343 @@ +package controller + +import ( + "context" + "fmt" + "strings" + "time" + + appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + 
	"k8s.io/apimachinery/pkg/types"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/builder"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

// MigrationReconciler watches OpenFGA Deployments and orchestrates database
// migrations when the application version changes. It compares the image tag
// on the Deployment's OpenFGA container against the version recorded in the
// per-Deployment migration-status ConfigMap, runs a migration Job when they
// differ, and scales the Deployment up once migration succeeds.
type MigrationReconciler struct {
	client.Client

	// BackoffLimit for migration Jobs.
	BackoffLimit int32
	// ActiveDeadlineSeconds for migration Jobs.
	ActiveDeadlineSeconds int64
	// TTLSecondsAfterFinished for migration Jobs.
	TTLSecondsAfterFinished int32
}

// Reconcile handles a single reconciliation for an OpenFGA Deployment.
// Returning a non-nil error causes controller-runtime to requeue the request
// with its default backoff.
func (r *MigrationReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	// 1. Get the OpenFGA Deployment. A NotFound here means it was deleted
	// between enqueue and reconcile — nothing to do.
	deployment := &appsv1.Deployment{}
	if err := r.Get(ctx, req.NamespacedName, deployment); err != nil {
		if apierrors.IsNotFound(err) {
			return ctrl.Result{}, nil
		}
		return ctrl.Result{}, err
	}

	// 2. Skip if migration is not opted-in via annotation. Only Deployments
	// carrying AnnotationMigrationEnabled == "true" are managed.
	if len(deployment.Annotations) == 0 || deployment.Annotations[AnnotationMigrationEnabled] != "true" {
		logger.V(1).Info("migration not enabled for this deployment, skipping")
		return ctrl.Result{}, nil
	}

	// 3. Find the OpenFGA container and extract the desired version from its
	// image reference (tag, digest, or "latest").
	mainContainer, err := findOpenFGAContainer(deployment)
	if err != nil {
		logger.Error(err, "unable to find OpenFGA container")
		return ctrl.Result{}, err
	}
	desiredVersion := extractImageTag(mainContainer.Image)

	// 3b. Skip migration for memory datastore — just ensure the Deployment is scaled up.
+ if isMemoryDatastore(mainContainer) { + logger.V(1).Info("memory datastore detected, skipping migration") + if _, scaleErr := ensureDeploymentScaled(ctx, r.Client, deployment); scaleErr != nil { + return ctrl.Result{}, scaleErr + } + return ctrl.Result{}, nil + } + + // 4. Check current migration status from ConfigMap. + configMap := &corev1.ConfigMap{} + cmName := migrationConfigMapName(req.Name) + err = r.Get(ctx, types.NamespacedName{Name: cmName, Namespace: req.Namespace}, configMap) + + currentVersion := "" + if err == nil { + currentVersion = configMap.Data["version"] + } else if !apierrors.IsNotFound(err) { + return ctrl.Result{}, fmt.Errorf("getting migration status: %w", err) + } + + // 5. If versions match, ensure Deployment is scaled up and return. + if currentVersion == desiredVersion { + logger.V(1).Info("migration up to date", "version", desiredVersion) + statusPatch := client.MergeFrom(deployment.DeepCopy()) + clearMigrationFailedCondition(deployment) + if patchErr := r.Status().Patch(ctx, deployment, statusPatch); patchErr != nil { + logger.Error(patchErr, "failed to clear MigrationFailed condition") + } + if _, scaleErr := ensureDeploymentScaled(ctx, r.Client, deployment); scaleErr != nil { + return ctrl.Result{}, scaleErr + } + return ctrl.Result{}, nil + } + + logger.Info("migration needed", "currentVersion", currentVersion, "desiredVersion", desiredVersion) + + // 6. Check retry-after annotation to honor backoff cooldown. + if retryAfter, ok := deployment.Annotations[AnnotationRetryAfter]; ok { + retryTime, parseErr := time.Parse(time.RFC3339, retryAfter) + if parseErr == nil && time.Now().Before(retryTime) { + remaining := time.Until(retryTime) + logger.V(1).Info("in retry cooldown", "retryAfter", retryAfter, "remaining", remaining) + return ctrl.Result{RequeueAfter: remaining}, nil + } + } + + // 7. Check if a migration Job already exists. 
+ jobName := migrationJobName(req.Name) + job := &batchv1.Job{} + err = r.Get(ctx, types.NamespacedName{Name: jobName, Namespace: req.Namespace}, job) + + if apierrors.IsNotFound(err) { + // Create the migration Job. + job = buildMigrationJob( + deployment, + mainContainer, + desiredVersion, + r.BackoffLimit, + r.ActiveDeadlineSeconds, + r.TTLSecondsAfterFinished, + ) + if createErr := r.Create(ctx, job); createErr != nil { + if apierrors.IsAlreadyExists(createErr) { + // A concurrent reconcile already created the Job; requeue to pick it up. + logger.V(1).Info("migration job already exists, will recheck", "job", jobName) + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } + // Leave the retry-after annotation intact so the cooldown survives this failure. + return ctrl.Result{}, fmt.Errorf("creating migration job: %w", createErr) + } + // Clear the retry-after annotation now that the Job is created. + if _, hasRetry := deployment.Annotations[AnnotationRetryAfter]; hasRetry { + patch := client.MergeFrom(deployment.DeepCopy()) + delete(deployment.Annotations, AnnotationRetryAfter) + if patchErr := r.Patch(ctx, deployment, patch); patchErr != nil { + logger.Error(patchErr, "failed to clear retry-after annotation") + } + } + logger.Info("created migration job", "job", jobName, "version", desiredVersion) + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } else if err != nil { + return ctrl.Result{}, fmt.Errorf("getting migration job: %w", err) + } + + // 8. If the existing Job is for a different (or unknown) version, delete it + // and recreate. Check annotation first (supports digests > 63 chars), fall + // back to label. A Job with neither marker is treated as stale: we cannot + // trust its outcome to represent the current desired version, so trusting + // JobComplete in step 9 would write a wrong version into the status ConfigMap. 
+ jobVersion := job.Annotations["openfga.dev/desired-version"] + versionMatch := jobVersion == desiredVersion + if jobVersion == "" { + // Label values have ":" replaced with "_", so sanitize desiredVersion for comparison. + sanitized := strings.ReplaceAll(desiredVersion, ":", "_") + if len(sanitized) > 63 { + sanitized = sanitized[:63] + } + jobVersion = job.Labels["app.kubernetes.io/version"] + versionMatch = jobVersion != "" && jobVersion == sanitized + } + if !versionMatch { + logger.Info("existing migration job is for a different or unknown version, deleting", "jobVersion", jobVersion, "desiredVersion", desiredVersion) + propagation := metav1.DeletePropagationBackground + if delErr := r.Delete(ctx, job, &client.DeleteOptions{ + PropagationPolicy: &propagation, + }); delErr != nil && !apierrors.IsNotFound(delErr) { + return ctrl.Result{}, fmt.Errorf("deleting stale migration job: %w", delErr) + } + return ctrl.Result{RequeueAfter: 5 * time.Second}, nil + } + + // 9. Check Job status using conditions for authoritative completion signals. + if isJobConditionTrue(job, batchv1.JobComplete) { + logger.Info("migration succeeded", "version", desiredVersion) + + // Clear MigrationFailed condition. + statusPatch := client.MergeFrom(deployment.DeepCopy()) + clearMigrationFailedCondition(deployment) + if patchErr := r.Status().Patch(ctx, deployment, statusPatch); patchErr != nil { + logger.Error(patchErr, "failed to clear MigrationFailed condition") + } + + // Update migration status ConfigMap. + if statusErr := updateMigrationStatus(ctx, r.Client, deployment, desiredVersion, jobName); statusErr != nil { + return ctrl.Result{}, statusErr + } + + // Scale Deployment back up. 
+ if _, scaleErr := ensureDeploymentScaled(ctx, r.Client, deployment); scaleErr != nil { + return ctrl.Result{}, scaleErr + } + + return ctrl.Result{}, nil + } + + // JobFailureTarget is set as soon as the Job controller decides the Job + // will fail (backoff limit reached, deadline exceeded, etc.); JobFailed + // only flips after pods finish terminating, which can take BackoffLimit × + // ActiveDeadlineSeconds. Treating either as "failed" surfaces the failure + // to users within seconds instead of minutes. + if isJobConditionTrue(job, batchv1.JobFailed) || isJobConditionTrue(job, batchv1.JobFailureTarget) { + logger.Info("migration job failed, will delete and retry", "job", jobName, "version", desiredVersion) + + // Set condition so kubectl describe shows the failure. + statusPatch := client.MergeFrom(deployment.DeepCopy()) + setMigrationFailedCondition(deployment, desiredVersion) + if patchErr := r.Status().Patch(ctx, deployment, statusPatch); patchErr != nil { + logger.Error(patchErr, "failed to set MigrationFailed condition") + } + + // Persist a retry-after annotation so the cooldown is honored even + // when the Job deletion triggers an immediate re-enqueue. + retryAfter := time.Now().Add(60 * time.Second).UTC().Format(time.RFC3339) + patch := client.MergeFrom(deployment.DeepCopy()) + if deployment.Annotations == nil { + deployment.Annotations = make(map[string]string) + } + deployment.Annotations[AnnotationRetryAfter] = retryAfter + if patchErr := r.Patch(ctx, deployment, patch); patchErr != nil { + return ctrl.Result{}, fmt.Errorf("persisting retry-after annotation: %w", patchErr) + } + + // Delete the failed Job so a fresh one is created on the next reconcile. 
+ propagation := metav1.DeletePropagationBackground + if delErr := r.Delete(ctx, job, &client.DeleteOptions{ + PropagationPolicy: &propagation, + }); delErr != nil && !apierrors.IsNotFound(delErr) { + return ctrl.Result{}, fmt.Errorf("deleting failed migration job: %w", delErr) + } + logger.Info("deleted failed migration job, will retry", "job", jobName) + + // Requeue after the cooldown period. + return ctrl.Result{RequeueAfter: 60 * time.Second}, nil + } + + // 10. Job still running — requeue. + logger.V(1).Info("migration job in progress", "job", jobName) + return ctrl.Result{RequeueAfter: 10 * time.Second}, nil +} + +// isJobConditionTrue returns true if the Job has a condition of the given type +// with status True. This is more reliable than comparing status counters because +// the Job controller sets conditions atomically when it makes its final decision. +func isJobConditionTrue(job *batchv1.Job, conditionType batchv1.JobConditionType) bool { + for _, c := range job.Status.Conditions { + if c.Type == conditionType && c.Status == corev1.ConditionTrue { + return true + } + } + return false +} + +// isMemoryDatastore checks if the Deployment is using the memory datastore +// (no database migration needed). +// +// NOTE: This only inspects explicit env vars on the container spec. If +// OPENFGA_DATASTORE_ENGINE is injected via envFrom (ConfigMap/Secret), it +// will not be detected here and the operator will attempt a migration. +func isMemoryDatastore(container *corev1.Container) bool { + for _, env := range container.Env { + if env.Name == "OPENFGA_DATASTORE_ENGINE" { + return strings.EqualFold(env.Value, "memory") + } + } + return false +} + +// setMigrationFailedCondition sets a MigrationFailed condition on the Deployment. 
+func setMigrationFailedCondition(deployment *appsv1.Deployment, version string) {
+	condition := appsv1.DeploymentCondition{
+		Type:               "MigrationFailed",
+		Status:             corev1.ConditionTrue,
+		LastTransitionTime: metav1.Now(),
+		Reason:             "MigrationJobFailed",
+		Message:            fmt.Sprintf("Database migration failed for version %s. Check migration job logs.", version),
+	}
+
+	// Replace existing MigrationFailed condition if present.
+	for i, c := range deployment.Status.Conditions {
+		if c.Type == "MigrationFailed" {
+			deployment.Status.Conditions[i] = condition
+			return
+		}
+	}
+	deployment.Status.Conditions = append(deployment.Status.Conditions, condition)
+}
+
+// clearMigrationFailedCondition sets an existing MigrationFailed condition to False; it is a no-op when the condition is absent.
+func clearMigrationFailedCondition(deployment *appsv1.Deployment) {
+	for i, c := range deployment.Status.Conditions {
+		if c.Type == "MigrationFailed" {
+			deployment.Status.Conditions[i].Status = corev1.ConditionFalse
+			deployment.Status.Conditions[i].LastTransitionTime = metav1.Now()
+			deployment.Status.Conditions[i].Reason = "MigrationSucceeded"
+			deployment.Status.Conditions[i].Message = "Migration completed successfully."
+			return
+		}
+	}
+}
+
+// SetupWithManager sets up the controller with the Manager.
+func (r *MigrationReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	// Only watch Deployments that are part of OpenFGA.
+	labelPredicate, err := predicate.LabelSelectorPredicate(metav1.LabelSelector{
+		MatchLabels: map[string]string{
+			LabelPartOf:    LabelPartOfValue,
+			LabelComponent: LabelComponentValue,
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("creating label predicate: %w", err)
+	}
+
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&appsv1.Deployment{}, builder.WithPredicates(labelPredicate)).
+		Owns(&batchv1.Job{}).
+ Watches(&corev1.ConfigMap{}, handler.EnqueueRequestsFromMapFunc( + func(ctx context.Context, obj client.Object) []reconcile.Request { + // Only watch ConfigMaps that are migration status ConfigMaps. + if obj.GetLabels()[LabelPartOf] != LabelPartOfValue || + obj.GetLabels()["app.kubernetes.io/managed-by"] != "openfga-operator" { + return nil + } + // Map back to the owning Deployment. + for _, ref := range obj.GetOwnerReferences() { + if ref.Kind == "Deployment" { + return []reconcile.Request{ + {NamespacedName: types.NamespacedName{ + Name: ref.Name, + Namespace: obj.GetNamespace(), + }}, + } + } + } + return nil + }, + )). + Complete(r) +} diff --git a/operator/internal/controller/migration_controller_test.go b/operator/internal/controller/migration_controller_test.go new file mode 100644 index 00000000..1bd2eb0b --- /dev/null +++ b/operator/internal/controller/migration_controller_test.go @@ -0,0 +1,1104 @@ +package controller + +import ( + "context" + "fmt" + "testing" + "time" + + appsv1 "k8s.io/api/apps/v1" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" +) + +func newScheme() *runtime.Scheme { + s := runtime.NewScheme() + _ = clientgoscheme.AddToScheme(s) + return s +} + +func newTestDeployment(name, namespace, image string, replicas int32) *appsv1.Deployment { + return &appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + UID: "test-uid-123", + Labels: map[string]string{ + LabelPartOf: LabelPartOfValue, + LabelComponent: LabelComponentValue, + }, + Annotations: map[string]string{ + AnnotationMigrationEnabled: "true", + }, + 
}, + Spec: appsv1.DeploymentSpec{ + Replicas: ptr.To(replicas), + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "openfga"}, + }, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "openfga"}, + }, + Spec: corev1.PodSpec{ + ServiceAccountName: "openfga", + Containers: []corev1.Container{ + { + Name: "openfga", + Image: image, + Env: []corev1.EnvVar{ + {Name: "OPENFGA_DATASTORE_ENGINE", Value: "postgres"}, + {Name: "OPENFGA_DATASTORE_URI", Value: "postgres://localhost/openfga"}, + {Name: "OPENFGA_LOG_LEVEL", Value: "info"}, + }, + }, + }, + }, + }, + }, + } +} + +func newReconciler(objects ...runtime.Object) *MigrationReconciler { + scheme := newScheme() + clientBuilder := fake.NewClientBuilder().WithScheme(scheme). + WithStatusSubresource(&appsv1.Deployment{}) + for _, obj := range objects { + clientBuilder = clientBuilder.WithRuntimeObjects(obj) + } + return &MigrationReconciler{ + Client: clientBuilder.Build(), + BackoffLimit: DefaultBackoffLimit, + ActiveDeadlineSeconds: DefaultActiveDeadlineSeconds, + TTLSecondsAfterFinished: DefaultTTLSecondsAfterFinished, + } +} + +func findCondition(conditions []appsv1.DeploymentCondition, condType string) *appsv1.DeploymentCondition { + for i := range conditions { + if string(conditions[i].Type) == condType { + return &conditions[i] + } + } + return nil +} + +func TestReconcile_FirstInstall_CreatesJob(t *testing.T) { + // Given: a Deployment with no migration-status ConfigMap. + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + r := newReconciler(dep) + + // When: reconciling. + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: a migration Job should be created and requeue requested. 
+ if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter == 0 { + t.Error("expected requeue, got none") + } + + // Verify the Job was created. + job := &batchv1.Job{} + if err := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migrate", Namespace: "default", + }, job); err != nil { + t.Fatalf("expected migration job to be created: %v", err) + } + + if job.Spec.Template.Spec.Containers[0].Image != "openfga/openfga:v1.14.0" { + t.Errorf("expected job image openfga/openfga:v1.14.0, got %s", job.Spec.Template.Spec.Containers[0].Image) + } + + if job.Spec.Template.Spec.Containers[0].Args[0] != "migrate" { + t.Errorf("expected job args [migrate], got %v", job.Spec.Template.Spec.Containers[0].Args) + } + + // Verify all env vars from the main container were passed. + jobEnvNames := make(map[string]bool) + for _, env := range job.Spec.Template.Spec.Containers[0].Env { + jobEnvNames[env.Name] = true + } + for _, expected := range []string{"OPENFGA_DATASTORE_ENGINE", "OPENFGA_DATASTORE_URI", "OPENFGA_LOG_LEVEL"} { + if !jobEnvNames[expected] { + t.Errorf("expected env var %s to be passed to migration job", expected) + } + } +} + +func TestReconcile_VersionMatch_ScalesUp(t *testing.T) { + // Given: a Deployment at 0 replicas with matching migration-status ConfigMap. + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "3" + + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "openfga-migration-status", + Namespace: "default", + }, + Data: map[string]string{ + "version": "v1.14.0", + "migratedAt": "2026-04-06T12:00:00Z", + "jobName": "openfga-migrate", + }, + } + + r := newReconciler(dep, cm) + + // When: reconciling. + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: no error, no requeue. 
+ if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != 0 { + t.Error("expected no requeue when versions match") + } + + // Verify Deployment was scaled up. + updated := &appsv1.Deployment{} + if err := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga", Namespace: "default", + }, updated); err != nil { + t.Fatalf("getting deployment: %v", err) + } + if *updated.Spec.Replicas != 3 { + t.Errorf("expected 3 replicas, got %d", *updated.Spec.Replicas) + } +} + +func TestReconcile_JobSucceeded_UpdatesConfigMapAndScalesUp(t *testing.T) { + // Given: a Deployment at 0 replicas, no ConfigMap, a succeeded migration Job, + // and a pre-existing MigrationFailed condition from a prior attempt. + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "3" + dep.Status.Conditions = []appsv1.DeploymentCondition{ + { + Type: "MigrationFailed", + Status: corev1.ConditionTrue, + Reason: "MigrationJobFailed", + Message: "Database migration failed for version v1.13.0.", + }, + } + + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "openfga-migrate", + Namespace: "default", + Annotations: map[string]string{ + "openfga.dev/desired-version": "v1.14.0", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "openfga", + UID: "test-uid-123", + }, + }, + }, + Spec: batchv1.JobSpec{ + BackoffLimit: ptr.To(int32(3)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "migrate", Image: "openfga/openfga:v1.14.0"}}, + RestartPolicy: corev1.RestartPolicyNever, + }, + }, + }, + Status: batchv1.JobStatus{ + Succeeded: 1, + Conditions: []batchv1.JobCondition{ + { + Type: batchv1.JobComplete, + Status: corev1.ConditionTrue, + }, + }, + }, + } + + r := newReconciler(dep, job) + + // When: reconciling. 
+ _, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: no error. + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Verify ConfigMap was created. + cm := &corev1.ConfigMap{} + if err := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migration-status", Namespace: "default", + }, cm); err != nil { + t.Fatalf("expected ConfigMap to be created: %v", err) + } + if cm.Data["version"] != "v1.14.0" { + t.Errorf("expected version v1.14.0 in ConfigMap, got %s", cm.Data["version"]) + } + + // Verify Deployment was scaled up. + updated := &appsv1.Deployment{} + if err := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga", Namespace: "default", + }, updated); err != nil { + t.Fatalf("getting deployment: %v", err) + } + if *updated.Spec.Replicas != 3 { + t.Errorf("expected 3 replicas, got %d", *updated.Spec.Replicas) + } + + // Verify MigrationFailed condition was cleared. + cond := findCondition(updated.Status.Conditions, "MigrationFailed") + if cond == nil { + t.Fatal("expected MigrationFailed condition to exist") + } + if cond.Status != corev1.ConditionFalse { + t.Errorf("expected MigrationFailed status False after success, got %s", cond.Status) + } + if cond.Reason != "MigrationSucceeded" { + t.Errorf("expected reason MigrationSucceeded, got %s", cond.Reason) + } +} + +func TestReconcile_JobFailed_SetsRetryAnnotationAndRequeues(t *testing.T) { + // Given: a Deployment at 0 replicas and a failed migration Job. 
+ dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "3" + + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "openfga-migrate", + Namespace: "default", + Annotations: map[string]string{ + "openfga.dev/desired-version": "v1.14.0", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "openfga", + UID: "test-uid-123", + }, + }, + }, + Spec: batchv1.JobSpec{ + BackoffLimit: ptr.To(int32(3)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "migrate", Image: "openfga/openfga:v1.14.0"}}, + RestartPolicy: corev1.RestartPolicyNever, + }, + }, + }, + Status: batchv1.JobStatus{ + Failed: 3, + Conditions: []batchv1.JobCondition{ + { + Type: batchv1.JobFailed, + Status: corev1.ConditionTrue, + }, + }, + }, + } + + r := newReconciler(dep, job) + + // When: reconciling. + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: no error, but requeue after 60s for retry. + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != 60*time.Second { + t.Errorf("expected 60s requeue, got %v", result.RequeueAfter) + } + + // Verify Deployment replicas unchanged (still at 0 from fresh install). + updated := &appsv1.Deployment{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga", Namespace: "default", + }, updated); getErr != nil { + t.Fatalf("getting deployment: %v", getErr) + } + if *updated.Spec.Replicas != 0 { + t.Errorf("expected 0 replicas after failed migration, got %d", *updated.Spec.Replicas) + } + + // Verify the failed Job was deleted. 
+ deletedJob := &batchv1.Job{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migrate", Namespace: "default", + }, deletedJob); getErr == nil { + t.Error("expected failed migration job to be deleted") + } + + // Verify retry-after annotation was set on the Deployment. + if _, ok := updated.Annotations[AnnotationRetryAfter]; !ok { + t.Error("expected retry-after annotation to be set on Deployment") + } + + // Verify MigrationFailed condition was set. + cond := findCondition(updated.Status.Conditions, "MigrationFailed") + if cond == nil { + t.Fatal("expected MigrationFailed condition to be set") + } + if cond.Status != corev1.ConditionTrue { + t.Errorf("expected MigrationFailed status True, got %s", cond.Status) + } + if cond.Reason != "MigrationJobFailed" { + t.Errorf("expected reason MigrationJobFailed, got %s", cond.Reason) + } +} + +func TestReconcile_JobFailureTarget_TreatedAsFailed(t *testing.T) { + // Given: a Job with only JobFailureTarget=True (no JobFailed yet). The + // Job controller sets this as soon as it decides the Job will fail, + // before pods finish terminating and JobFailed is recorded. The operator + // should treat this as a failure to surface the error in seconds rather + // than waiting the full BackoffLimit × ActiveDeadlineSeconds. 
+ dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "3" + + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "openfga-migrate", + Namespace: "default", + Annotations: map[string]string{ + "openfga.dev/desired-version": "v1.14.0", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "openfga", + UID: "test-uid-123", + }, + }, + }, + Status: batchv1.JobStatus{ + Conditions: []batchv1.JobCondition{ + {Type: batchv1.JobFailureTarget, Status: corev1.ConditionTrue, Reason: "BackoffLimitExceeded"}, + }, + }, + } + + r := newReconciler(dep, job) + + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != 60*time.Second { + t.Errorf("expected 60s requeue, got %v", result.RequeueAfter) + } + + updated := &appsv1.Deployment{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga", Namespace: "default", + }, updated); getErr != nil { + t.Fatalf("getting deployment: %v", getErr) + } + + deletedJob := &batchv1.Job{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migrate", Namespace: "default", + }, deletedJob); getErr == nil { + t.Error("expected migration job to be deleted on JobFailureTarget") + } + if _, ok := updated.Annotations[AnnotationRetryAfter]; !ok { + t.Error("expected retry-after annotation to be set") + } + cond := findCondition(updated.Status.Conditions, "MigrationFailed") + if cond == nil || cond.Status != corev1.ConditionTrue { + t.Fatal("expected MigrationFailed condition True") + } +} + +func TestReconcile_RetryAfterCooldown_SkipsJobCreation(t *testing.T) { + // Given: a Deployment with a retry-after annotation in the future. 
+ dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "3" + dep.Annotations[AnnotationRetryAfter] = time.Now().Add(30 * time.Second).UTC().Format(time.RFC3339) + + r := newReconciler(dep) + + // When: reconciling. + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: no error, requeue with remaining cooldown time. + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter == 0 { + t.Error("expected requeue during cooldown") + } + if result.RequeueAfter > 30*time.Second { + t.Errorf("expected requeue within 30s, got %v", result.RequeueAfter) + } + + // Verify no Job was created. + job := &batchv1.Job{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migrate", Namespace: "default", + }, job); getErr == nil { + t.Error("expected no migration job during cooldown") + } +} + +func TestReconcile_UnknownVersionJob_DeletedNotTrusted(t *testing.T) { + // Given: a Deployment desiring v1.14.0 and a JobComplete migration Job that + // carries no version annotation or label (e.g. left over from an older + // operator or created by a third-party tool). Trusting its outcome would + // write the wrong version into the migration-status ConfigMap. + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "3" + + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "openfga-migrate", + Namespace: "default", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "openfga", + UID: "test-uid-123", + }, + }, + }, + Status: batchv1.JobStatus{ + Conditions: []batchv1.JobCondition{ + {Type: batchv1.JobComplete, Status: corev1.ConditionTrue}, + }, + }, + } + + r := newReconciler(dep, job) + + // When: reconciling. 
+ result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: the Job is deleted and a requeue is scheduled; the ConfigMap is + // NOT created from the unknown-version Job's outcome. + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter == 0 { + t.Error("expected requeue after deleting unknown-version job") + } + + deletedJob := &batchv1.Job{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migrate", Namespace: "default", + }, deletedJob); getErr == nil { + t.Error("expected unknown-version job to be deleted") + } + + cm := &corev1.ConfigMap{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migration-status", Namespace: "default", + }, cm); getErr == nil { + t.Errorf("expected no migration-status ConfigMap; got version=%q", cm.Data["version"]) + } +} + +func TestReconcile_RetryAfterPersistsOnJobCreateFailure(t *testing.T) { + // Given: a Deployment with an elapsed retry-after annotation, and a client + // that fails Job creation with a non-AlreadyExists error. + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "3" + dep.Annotations[AnnotationRetryAfter] = time.Now().Add(-1 * time.Second).UTC().Format(time.RFC3339) + + scheme := newScheme() + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithStatusSubresource(&appsv1.Deployment{}). + WithRuntimeObjects(dep). + WithInterceptorFuncs(interceptor.Funcs{ + Create: func(ctx context.Context, c client.WithWatch, obj client.Object, opts ...client.CreateOption) error { + if _, ok := obj.(*batchv1.Job); ok { + return fmt.Errorf("simulated transient API error") + } + return c.Create(ctx, obj, opts...) + }, + }). 
+ Build() + r := &MigrationReconciler{ + Client: c, + BackoffLimit: DefaultBackoffLimit, + ActiveDeadlineSeconds: DefaultActiveDeadlineSeconds, + TTLSecondsAfterFinished: DefaultTTLSecondsAfterFinished, + } + + // When: reconciling. + _, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: an error is returned and the retry-after annotation is preserved + // so the next reconcile honors the cooldown. + if err == nil { + t.Fatal("expected error from failed job creation") + } + + updated := &appsv1.Deployment{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga", Namespace: "default", + }, updated); getErr != nil { + t.Fatalf("getting deployment: %v", getErr) + } + if _, ok := updated.Annotations[AnnotationRetryAfter]; !ok { + t.Error("expected retry-after annotation to persist after Job creation failure") + } +} + +func TestReconcile_RetryAfterClearedAfterJobCreated(t *testing.T) { + // Given: a Deployment with an elapsed retry-after annotation. + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "3" + dep.Annotations[AnnotationRetryAfter] = time.Now().Add(-1 * time.Second).UTC().Format(time.RFC3339) + + r := newReconciler(dep) + + // When: reconciling. + if _, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }); err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Then: the Job exists and the retry-after annotation has been cleared. 
+ job := &batchv1.Job{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migrate", Namespace: "default", + }, job); getErr != nil { + t.Fatalf("expected migration job to be created: %v", getErr) + } + + updated := &appsv1.Deployment{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga", Namespace: "default", + }, updated); getErr != nil { + t.Fatalf("getting deployment: %v", getErr) + } + if _, ok := updated.Annotations[AnnotationRetryAfter]; ok { + t.Error("expected retry-after annotation to be cleared after Job created") + } +} + +func TestReconcile_MemoryDatastore_SkipsMigration(t *testing.T) { + // Given: a Deployment using the memory datastore. + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "1" + dep.Spec.Template.Spec.Containers[0].Env = []corev1.EnvVar{ + {Name: "OPENFGA_DATASTORE_ENGINE", Value: "memory"}, + } + + r := newReconciler(dep) + + // When: reconciling. + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: no error, no requeue. + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != 0 { + t.Error("expected no requeue for memory datastore") + } + + // Verify Deployment was scaled up (no migration needed). + updated := &appsv1.Deployment{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga", Namespace: "default", + }, updated); getErr != nil { + t.Fatalf("getting deployment: %v", getErr) + } + if *updated.Spec.Replicas != 1 { + t.Errorf("expected 1 replica, got %d", *updated.Spec.Replicas) + } + + // Verify no Job was created. 
+ job := &batchv1.Job{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migrate", Namespace: "default", + }, job); getErr == nil { + t.Error("expected no migration job for memory datastore") + } +} + +func TestReconcile_DeploymentNotFound_NoError(t *testing.T) { + r := newReconciler() + + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "nonexistent", Namespace: "default"}, + }) + + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != 0 { + t.Error("expected no requeue for missing deployment") + } +} + +func TestReconcile_FindContainerByName(t *testing.T) { + // Given: a Deployment with a sidecar before the openfga container. + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + dep.Spec.Template.Spec.Containers = []corev1.Container{ + { + Name: "sidecar", + Image: "envoyproxy/envoy:v1.30", + }, + { + Name: "openfga", + Image: "openfga/openfga:v1.14.0", + Env: []corev1.EnvVar{ + {Name: "OPENFGA_DATASTORE_ENGINE", Value: "postgres"}, + {Name: "OPENFGA_DATASTORE_URI", Value: "postgres://localhost/openfga"}, + }, + }, + } + + r := newReconciler(dep) + + // When: reconciling. + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: Job should use the openfga container's image, not the sidecar's. 
+ if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter == 0 { + t.Error("expected requeue, got none") + } + + job := &batchv1.Job{} + if err := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migrate", Namespace: "default", + }, job); err != nil { + t.Fatalf("expected migration job to be created: %v", err) + } + + if job.Spec.Template.Spec.Containers[0].Image != "openfga/openfga:v1.14.0" { + t.Errorf("expected job image openfga/openfga:v1.14.0, got %s", job.Spec.Template.Spec.Containers[0].Image) + } +} + +func TestReconcile_StaleJob_DeletedAndRequeued(t *testing.T) { + // Given: a Deployment at v1.15.0 with an existing migration Job for v1.14.0. + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.15.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "3" + + staleJob := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "openfga-migrate", + Namespace: "default", + Labels: map[string]string{ + "app.kubernetes.io/version": "v1.14.0", + }, + Annotations: map[string]string{ + "openfga.dev/desired-version": "v1.14.0", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "openfga", + UID: "test-uid-123", + }, + }, + }, + Spec: batchv1.JobSpec{ + BackoffLimit: ptr.To(int32(3)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "migrate", Image: "openfga/openfga:v1.14.0"}}, + RestartPolicy: corev1.RestartPolicyNever, + }, + }, + }, + Status: batchv1.JobStatus{ + Succeeded: 1, + Conditions: []batchv1.JobCondition{ + { + Type: batchv1.JobComplete, + Status: corev1.ConditionTrue, + }, + }, + }, + } + + r := newReconciler(dep, staleJob) + + // When: reconciling. + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: no error, requeue to recreate with correct version. 
+ if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter == 0 { + t.Error("expected requeue after deleting stale job") + } + + // Verify the stale Job was deleted. + deletedJob := &batchv1.Job{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migrate", Namespace: "default", + }, deletedJob); getErr == nil { + t.Error("expected stale migration job to be deleted") + } + + // Verify ConfigMap was NOT updated (migration didn't actually run for v1.15.0). + cm := &corev1.ConfigMap{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migration-status", Namespace: "default", + }, cm); getErr == nil { + if cm.Data["version"] == "v1.15.0" { + t.Error("ConfigMap should not be updated to v1.15.0 from a stale v1.14.0 job") + } + } +} + +func TestReconcile_MigrationNotEnabled_Skips(t *testing.T) { + // Given: a Deployment without the migration-enabled annotation. + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 3) + delete(dep.Annotations, AnnotationMigrationEnabled) + + r := newReconciler(dep) + + // When: reconciling. + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: no error, no requeue, no Job created, replicas unchanged. + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != 0 { + t.Error("expected no requeue when migration is not enabled") + } + + // Verify no Job was created. + job := &batchv1.Job{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migrate", Namespace: "default", + }, job); getErr == nil { + t.Error("expected no migration job when migration is not enabled") + } + + // Verify replicas unchanged. 
+ updated := &appsv1.Deployment{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga", Namespace: "default", + }, updated); getErr != nil { + t.Fatalf("getting deployment: %v", getErr) + } + if *updated.Spec.Replicas != 3 { + t.Errorf("expected 3 replicas unchanged, got %d", *updated.Spec.Replicas) + } +} + +func TestReconcile_StaleJob_LabelOnlyFallback_DeletedAndRequeued(t *testing.T) { + // Given: a Deployment at v1.15.0 with an existing Job that only has a label (no annotation). + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.15.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "3" + + staleJob := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "openfga-migrate", + Namespace: "default", + Labels: map[string]string{ + "app.kubernetes.io/version": "v1.14.0", + }, + // No annotation — forces the label-only fallback path. + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "openfga", + UID: "test-uid-123", + }, + }, + }, + Spec: batchv1.JobSpec{ + BackoffLimit: ptr.To(int32(3)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "migrate", Image: "openfga/openfga:v1.14.0"}}, + RestartPolicy: corev1.RestartPolicyNever, + }, + }, + }, + } + + r := newReconciler(dep, staleJob) + + // When: reconciling. + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: stale Job should be deleted and requeue requested. 
+ if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter == 0 { + t.Error("expected requeue after deleting stale job") + } + + deletedJob := &batchv1.Job{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migrate", Namespace: "default", + }, deletedJob); getErr == nil { + t.Error("expected stale migration job to be deleted") + } +} + +func TestReconcile_JobSucceeded_UpdatesExistingConfigMap(t *testing.T) { + // Given: a Deployment with a pre-existing ConfigMap from v1.13.0 and a succeeded Job for v1.14.0. + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "3" + + existingCM := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "openfga-migration-status", + Namespace: "default", + Labels: map[string]string{ + LabelPartOf: LabelPartOfValue, + LabelComponent: "migration", + "app.kubernetes.io/managed-by": "openfga-operator", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "openfga", + UID: "test-uid-123", + }, + }, + }, + Data: map[string]string{ + "version": "v1.13.0", + "migratedAt": "2026-04-01T12:00:00Z", + "jobName": "openfga-migrate", + }, + } + + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "openfga-migrate", + Namespace: "default", + Annotations: map[string]string{ + "openfga.dev/desired-version": "v1.14.0", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "openfga", + UID: "test-uid-123", + }, + }, + }, + Spec: batchv1.JobSpec{ + BackoffLimit: ptr.To(int32(3)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "migrate", Image: "openfga/openfga:v1.14.0"}}, + RestartPolicy: corev1.RestartPolicyNever, + }, + }, + }, + Status: batchv1.JobStatus{ + Succeeded: 1, + Conditions: []batchv1.JobCondition{ + { + Type: batchv1.JobComplete, + 
Status: corev1.ConditionTrue, + }, + }, + }, + } + + r := newReconciler(dep, existingCM, job) + + // When: reconciling. + _, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: no error. + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Verify ConfigMap was updated to v1.14.0. + cm := &corev1.ConfigMap{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga-migration-status", Namespace: "default", + }, cm); getErr != nil { + t.Fatalf("expected ConfigMap to exist: %v", getErr) + } + if cm.Data["version"] != "v1.14.0" { + t.Errorf("expected version v1.14.0 in ConfigMap, got %s", cm.Data["version"]) + } +} + +func TestReconcile_MigrationNeeded_DoesNotScaleToZero(t *testing.T) { + // Given: a Deployment with replicas > 0 and no migration-status ConfigMap. + // The operator should create the migration Job WITHOUT scaling to zero, + // relying on OpenFGA's built-in schema version check to gate readiness. + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 3) + dep.Annotations = map[string]string{ + AnnotationMigrationEnabled: "true", + AnnotationDesiredReplicas: "3", + } + + r := newReconciler(dep) + + // When: reconciling. + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: no error, Job created. + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter == 0 { + t.Error("expected requeue after creating job") + } + + // Verify Deployment replicas were NOT changed — pods keep running during migration. 
+ updated := &appsv1.Deployment{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga", Namespace: "default", + }, updated); getErr != nil { + t.Fatalf("getting deployment: %v", getErr) + } + if *updated.Spec.Replicas != 3 { + t.Errorf("expected replicas to remain at 3, got %d", *updated.Spec.Replicas) + } +} + +func TestReconcile_JobInProgress_Requeues(t *testing.T) { + // Given: a Deployment with a running Job (no conditions set yet). + dep := newTestDeployment("openfga", "default", "openfga/openfga:v1.14.0", 0) + dep.Annotations[AnnotationDesiredReplicas] = "3" + + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: "openfga-migrate", + Namespace: "default", + Annotations: map[string]string{ + "openfga.dev/desired-version": "v1.14.0", + }, + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "openfga", + UID: "test-uid-123", + }, + }, + }, + Spec: batchv1.JobSpec{ + BackoffLimit: ptr.To(int32(3)), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{Name: "migrate", Image: "openfga/openfga:v1.14.0"}}, + RestartPolicy: corev1.RestartPolicyNever, + }, + }, + }, + Status: batchv1.JobStatus{ + Active: 1, + }, + } + + r := newReconciler(dep, job) + + // When: reconciling. + result, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "openfga", Namespace: "default"}, + }) + + // Then: no error, requeue after 10s to poll progress. + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if result.RequeueAfter != 10*time.Second { + t.Errorf("expected 10s requeue for in-progress job, got %v", result.RequeueAfter) + } + + // Verify Deployment still at 0 replicas. 
+ updated := &appsv1.Deployment{} + if getErr := r.Get(context.Background(), types.NamespacedName{ + Name: "openfga", Namespace: "default", + }, updated); getErr != nil { + t.Fatalf("getting deployment: %v", getErr) + } + if *updated.Spec.Replicas != 0 { + t.Errorf("expected 0 replicas while job in progress, got %d", *updated.Spec.Replicas) + } +} + +func TestExtractImageTag(t *testing.T) { + tests := []struct { + image string + expected string + }{ + {"openfga/openfga:v1.14.0", "v1.14.0"}, + {"openfga/openfga:latest", "latest"}, + {"openfga/openfga", "latest"}, + {"ghcr.io/openfga/openfga:v1.14.0", "v1.14.0"}, + {"registry.example.com:5000/openfga/openfga:v1.14.0", "v1.14.0"}, + {"openfga/openfga@sha256:abcdef1234567890", "sha256:abcdef1234567890"}, + } + + for _, tt := range tests { + t.Run(tt.image, func(t *testing.T) { + got := extractImageTag(tt.image) + if got != tt.expected { + t.Errorf("extractImageTag(%q) = %q, want %q", tt.image, got, tt.expected) + } + }) + } +} diff --git a/operator/tests/README.md b/operator/tests/README.md new file mode 100644 index 00000000..e4377a99 --- /dev/null +++ b/operator/tests/README.md @@ -0,0 +1,186 @@ +# Local Integration Tests + +Manual integration tests for the OpenFGA operator on a local Kubernetes cluster (Rancher Desktop, kind, minikube, etc.). + +## Prerequisites + +- A running local Kubernetes cluster +- Helm 3.6+ +- The operator image built locally: + ```bash + cd operator + docker build -t openfga/openfga-operator:dev . + ``` +- Chart dependencies updated: + ```bash + helm dependency update charts/openfga + ``` + +All test values files use `imagePullPolicy: Never`, so the locally-built image must be available to the cluster's container runtime. On Rancher Desktop (dockerd) and Docker Desktop this works automatically. For kind, load the image first: + +```bash +kind load docker-image openfga/openfga-operator:dev +``` + +## Test Scenarios + +### 1. Happy Path + +Deploys OpenFGA with a Postgres instance. 
The operator should run the migration and scale OpenFGA up within ~30 seconds. + +```bash +kubectl create namespace openfga-test +helm install openfga-test charts/openfga -n openfga-test \ + -f operator/tests/values-happy-path.yaml +``` + +**Expected outcome:** + +| Resource | State | +|----------|-------| +| `openfga-test-openfga-operator` | `1/1 Running` | +| `openfga-test-postgres` | `1/1 Running` | +| `openfga-test-migrate-xxxxx` | `0/1 Completed` | +| `openfga-test` (OpenFGA) | `3/3 Running` | + +**Verify:** + +```bash +# All resources healthy +kubectl get all -n openfga-test + +# Operator logs show full lifecycle +kubectl logs -n openfga-test deployment/openfga-test-openfga-operator + +# Migration status recorded +kubectl get configmap openfga-test-migration-status -n openfga-test -o jsonpath='{.data}' + +# Database tables created +kubectl exec -n openfga-test deployment/openfga-test-postgres -- \ + psql -U openfga -d openfga -c '\dt' + +# OpenFGA responding +kubectl run curl-test --image=curlimages/curl -n openfga-test \ + --rm -it --restart=Never -- curl -s http://openfga-test:8080/healthz +# Expected: {"status":"SERVING"} +``` + +**Clean up:** + +```bash +helm uninstall openfga-test -n openfga-test +kubectl delete namespace openfga-test +``` + +--- + +### 2. Database Outage and Recovery + +Deploys OpenFGA with a Postgres instance scaled to 0 replicas (simulating a database that isn't ready yet). The operator should retry migrations until Postgres becomes available, then self-heal. 
+ +```bash +kubectl create namespace openfga-test +helm install openfga-test charts/openfga -n openfga-test \ + -f operator/tests/values-db-outage.yaml +``` + +**Expected behavior while Postgres is down:** + +- Migration Job runs and fails (each pod times out after ~60s) +- After 3 failures (backoffLimit), the operator: + - Sets `MigrationFailed: True` condition on the Deployment + - Deletes the failed Job + - Creates a fresh Job after a 60-second delay +- This cycle repeats indefinitely +- OpenFGA stays at 0 replicas throughout (safe — no unmigrated app running) + +**Watch the failure cycle:** + +```bash +# Check deployment conditions +kubectl get deployment openfga-test -n openfga-test \ + -o jsonpath='{range .status.conditions[*]}{.type}: {.status} - {.message}{"\n"}{end}' + +# Watch operator logs for delete/retry cycle +kubectl logs -n openfga-test deployment/openfga-test-openfga-operator -f +# Look for: +# "migration job failed, will delete and retry" +# "deleted failed migration job, will retry" +# "created migration job" +``` + +**Bring Postgres back (after a few minutes):** + +```bash +kubectl scale deployment openfga-test-postgres -n openfga-test --replicas=1 +``` + +**Expected recovery (within ~60s of Postgres becoming ready):** + +- The currently running migration pod connects and succeeds +- Operator updates the ConfigMap with the new version +- Operator scales OpenFGA to 3/3 replicas +- `{"status":"SERVING"}` from the health endpoint + +**Verify recovery:** + +```bash +# OpenFGA should be 3/3 Running +kubectl get all -n openfga-test + +# Migration status recorded +kubectl get configmap openfga-test-migration-status -n openfga-test -o jsonpath='{.data}' + +# Health check +kubectl run curl-test --image=curlimages/curl -n openfga-test \ + --rm -it --restart=Never -- curl -s http://openfga-test:8080/healthz +``` + +**Clean up:** + +```bash +helm uninstall openfga-test -n openfga-test +kubectl delete namespace openfga-test +``` + +--- + +### 3. 
No Database (Permanent Failure) + +Deploys OpenFGA pointing at a Postgres hostname that doesn't exist. The operator should continuously retry without crashing or leaving the app in a broken state. + +```bash +kubectl create namespace openfga-test +helm install openfga-test charts/openfga -n openfga-test \ + -f operator/tests/values-no-db.yaml +``` + +**Expected behavior:** + +- Migration Jobs fail repeatedly (DNS resolution fails for `postgres-does-not-exist`) +- Operator sets `MigrationFailed: True` on the Deployment +- Operator deletes failed Jobs and retries every ~60 seconds +- OpenFGA stays at 0 replicas indefinitely — never starts against an unmigrated database + +This scenario verifies the operator doesn't crash-loop or consume excessive resources when the database is permanently unavailable. + +**Verify:** + +```bash +# OpenFGA at 0/0, operator at 1/1 +kubectl get deployments -n openfga-test + +# MigrationFailed condition present +kubectl get deployment openfga-test -n openfga-test \ + -o jsonpath='{range .status.conditions[*]}{.type}: {.status} - {.message}{"\n"}{end}' + +# Operator logs show retry cycle +kubectl logs -n openfga-test deployment/openfga-test-openfga-operator --tail=20 +``` + +**Clean up:** + +```bash +helm uninstall openfga-test -n openfga-test +kubectl delete namespace openfga-test +``` diff --git a/operator/tests/values-db-outage.yaml b/operator/tests/values-db-outage.yaml new file mode 100644 index 00000000..a7c59720 --- /dev/null +++ b/operator/tests/values-db-outage.yaml @@ -0,0 +1,70 @@ +# Test values: Postgres deployed but scaled to 0 (simulates DB outage) +operator: + enabled: true + +openfga-operator: + image: + repository: openfga/openfga-operator + tag: dev + pullPolicy: Never + resources: + requests: + cpu: 10m + memory: 64Mi + +datastore: + engine: postgres + uri: "postgres://openfga:changeme@openfga-test-postgres:5432/openfga?sslmode=disable" + +migration: + enabled: true + serviceAccount: + create: true + +extraObjects: + - 
apiVersion: v1 + kind: Secret + metadata: + name: openfga-test-postgres-creds + stringData: + POSTGRES_USER: openfga + POSTGRES_PASSWORD: changeme + POSTGRES_DB: openfga + - apiVersion: apps/v1 + kind: Deployment + metadata: + name: openfga-test-postgres + spec: + replicas: 0 # Start with Postgres DOWN + selector: + matchLabels: + app: openfga-test-postgres + template: + metadata: + labels: + app: openfga-test-postgres + spec: + containers: + - name: postgres + image: postgres:17 + ports: + - containerPort: 5432 + envFrom: + - secretRef: + name: openfga-test-postgres-creds + volumeMounts: + - name: data + mountPath: /var/lib/postgresql/data + volumes: + - name: data + emptyDir: {} + - apiVersion: v1 + kind: Service + metadata: + name: openfga-test-postgres + spec: + selector: + app: openfga-test-postgres + ports: + - port: 5432 + targetPort: 5432 diff --git a/operator/tests/values-happy-path.yaml b/operator/tests/values-happy-path.yaml new file mode 100644 index 00000000..77e6306f --- /dev/null +++ b/operator/tests/values-happy-path.yaml @@ -0,0 +1,70 @@ +# Local test values for operator-managed migration on Rancher Desktop +operator: + enabled: true + +openfga-operator: + image: + repository: openfga/openfga-operator + tag: dev + pullPolicy: Never + resources: + requests: + cpu: 10m + memory: 64Mi + +datastore: + engine: postgres + uri: "postgres://openfga:changeme@openfga-test-postgres:5432/openfga?sslmode=disable" + +migration: + enabled: true + serviceAccount: + create: true + +extraObjects: + - apiVersion: v1 + kind: Secret + metadata: + name: openfga-test-postgres-creds + stringData: + POSTGRES_USER: openfga + POSTGRES_PASSWORD: changeme + POSTGRES_DB: openfga + - apiVersion: apps/v1 + kind: Deployment + metadata: + name: openfga-test-postgres + spec: + replicas: 1 + selector: + matchLabels: + app: openfga-test-postgres + template: + metadata: + labels: + app: openfga-test-postgres + spec: + containers: + - name: postgres + image: postgres:17 + ports: + - 
containerPort: 5432 + envFrom: + - secretRef: + name: openfga-test-postgres-creds + volumeMounts: + - name: data + mountPath: /var/lib/postgresql/data + volumes: + - name: data + emptyDir: {} + - apiVersion: v1 + kind: Service + metadata: + name: openfga-test-postgres + spec: + selector: + app: openfga-test-postgres + ports: + - port: 5432 + targetPort: 5432 diff --git a/operator/tests/values-no-db.yaml b/operator/tests/values-no-db.yaml new file mode 100644 index 00000000..2d1cd762 --- /dev/null +++ b/operator/tests/values-no-db.yaml @@ -0,0 +1,23 @@ +# Test values with NO postgres — simulates database unavailable +operator: + enabled: true + +openfga-operator: + image: + repository: openfga/openfga-operator + tag: dev + pullPolicy: Never + resources: + requests: + cpu: 10m + memory: 64Mi + +datastore: + engine: postgres + # Points to a service that doesn't exist + uri: "postgres://openfga:changeme@postgres-does-not-exist:5432/openfga?sslmode=disable" + +migration: + enabled: true + serviceAccount: + create: true