diff --git a/.github/hack/cleanup-odh.sh b/.github/hack/cleanup-odh.sh index e9538384b..e8f5fee29 100755 --- a/.github/hack/cleanup-odh.sh +++ b/.github/hack/cleanup-odh.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# cleanup-odh.sh - Remove OpenDataHub operator and all related resources +# cleanup-odh.sh - Remove OpenDataHub/RHOAI MaaS resources and related operators # # This script removes: # - DataScienceCluster and DSCInitialization custom resources @@ -8,7 +8,9 @@ # - Custom CatalogSource (odh-custom-catalog) # - ODH operator namespace (odh-operator) # - OpenDataHub application namespace (opendatahub) +# - MaaS resources from RHOAI namespace (redhat-ods-applications) # - MaaS subscription namespace (models-as-a-service) +# - Policy engine artifacts (Kuadrant/RHCL OLM resources, AuthConfig CRs) # - Keycloak identity provider (if deployed) # - ODH CRDs (optional) # @@ -40,6 +42,19 @@ fi echo "Connected to cluster. Starting cleanup..." echo "" +# Detect operator type to find the right application namespace +MAAS_APP_NAMESPACE="" +if kubectl get subscription rhods-operator -n redhat-ods-operator &>/dev/null; then + MAAS_APP_NAMESPACE="redhat-ods-applications" + echo "Detected RHOAI operator (application namespace: $MAAS_APP_NAMESPACE)" +elif kubectl get subscription -A -o name 2>/dev/null | grep -q opendatahub-operator; then + MAAS_APP_NAMESPACE="opendatahub" + echo "Detected ODH operator (application namespace: $MAAS_APP_NAMESPACE)" +else + echo "No operator detected, will clean both namespaces" +fi +echo "" + # 1. Delete DataScienceCluster instances echo "1. Deleting DataScienceCluster instances..." kubectl delete datasciencecluster --all -A --ignore-not-found --timeout=120s 2>/dev/null || true @@ -82,6 +97,41 @@ kubectl delete ns odh-operator --ignore-not-found --timeout=120s 2>/dev/null || echo "8. Deleting opendatahub namespace..." kubectl delete ns opendatahub --ignore-not-found --timeout=120s 2>/dev/null || true +# 8b. 
Clean MaaS resources from RHOAI application namespace +# On RHOAI clusters, MaaS resources live in redhat-ods-applications which is +# operator-managed. We delete MaaS resources individually instead of the namespace. +cleanup_maas_resources() { + local ns=$1 + if ! kubectl get namespace "$ns" &>/dev/null; then + echo " $ns not found, skipping" + return 0 + fi + + echo " Cleaning MaaS resources from $ns..." + kubectl delete deployment maas-api maas-controller postgres -n "$ns" --ignore-not-found 2>/dev/null || true + kubectl delete service maas-api postgres -n "$ns" --ignore-not-found 2>/dev/null || true + kubectl delete secret maas-db-config postgres-creds -n "$ns" --ignore-not-found 2>/dev/null || true + kubectl delete authpolicy maas-api-auth-policy -n "$ns" --ignore-not-found 2>/dev/null || true + kubectl delete httproute maas-api-route -n "$ns" --ignore-not-found 2>/dev/null || true + kubectl delete destinationrule maas-api-backend-tls -n "$ns" --ignore-not-found 2>/dev/null || true + kubectl delete networkpolicy maas-api-cleanup-restrict maas-authorino-allow -n "$ns" --ignore-not-found 2>/dev/null || true + kubectl delete cronjob maas-api-key-cleanup -n "$ns" --ignore-not-found 2>/dev/null || true + kubectl delete role maas-api-db-secret maas-controller-leader-election-role -n "$ns" --ignore-not-found 2>/dev/null || true + kubectl delete rolebinding maas-api-db-secret maas-controller-leader-election-rolebinding -n "$ns" --ignore-not-found 2>/dev/null || true + kubectl delete serviceaccount maas-api maas-controller -n "$ns" --ignore-not-found 2>/dev/null || true + echo " ✅ MaaS resources cleaned from $ns" +} + +if [[ "$MAAS_APP_NAMESPACE" == "redhat-ods-applications" ]]; then + echo "8b. Cleaning MaaS resources from RHOAI namespace..." + cleanup_maas_resources "redhat-ods-applications" +elif [[ -z "$MAAS_APP_NAMESPACE" ]]; then + # No operator detected, clean both just in case + echo "8b. Cleaning MaaS resources from both possible namespaces..." 
+ cleanup_maas_resources "redhat-ods-applications" + cleanup_maas_resources "opendatahub" +fi + force_delete_namespace() { local ns=$1 shift @@ -172,6 +222,11 @@ if kubectl get namespace rh-connectivity-link &>/dev/null; then echo " ✅ RHCL OLM resources cleaned" fi +# 11b. Delete AuthConfig CRs cluster-wide +# Old AuthConfig CRs can block new policy engine installs if the CRD schema changes. +echo "11b. Deleting AuthConfig CRs..." +kubectl delete authconfig --all --all-namespaces --ignore-not-found 2>/dev/null || true + # 12. Delete policy engine namespaces (Kuadrant or RHCL) for policy_ns in kuadrant-system rh-connectivity-link; do echo "12. Deleting $policy_ns namespace (if installed)..." @@ -210,21 +265,30 @@ kubectl delete envoyfilter kuadrant-auth-tls-fix -n openshift-ingress --ignore-n kubectl delete authpolicy -n openshift-ingress --all --ignore-not-found 2>/dev/null || true kubectl delete ratelimitpolicy -n openshift-ingress --all --ignore-not-found 2>/dev/null || true kubectl delete tokenratelimitpolicy -n openshift-ingress --all --ignore-not-found 2>/dev/null || true +kubectl delete gatewayclass openshift-default --ignore-not-found 2>/dev/null || true # 16. Delete MaaS RBAC (ClusterRoles, ClusterRoleBindings - can conflict with other managers) echo "16. Deleting MaaS RBAC..." kubectl delete clusterrolebinding maas-api maas-controller-rolebinding --ignore-not-found 2>/dev/null || true kubectl delete clusterrole maas-api maas-controller-role --ignore-not-found 2>/dev/null || true -# 17. Optionally delete CRDs +# 17. Delete CRDs +# Always delete KServe/MaaS CRDs to prevent storedVersions schema conflicts on reinstall. +# ODH-internal CRDs are only deleted with --include-crds. +echo "17. Deleting KServe/MaaS CRDs (always removed to prevent version conflicts)..." 
+for crd in $(kubectl get crd -o name 2>/dev/null | grep -E 'serving\.kserve\.io|maas\.opendatahub\.io'); do + echo " Deleting $crd" + kubectl delete "$crd" --ignore-not-found --timeout=30s 2>/dev/null || true +done + if $INCLUDE_CRDS; then - echo "17. Deleting ODH CRDs..." - kubectl delete crd datascienceclusters.datasciencecluster.opendatahub.io --ignore-not-found 2>/dev/null || true - kubectl delete crd dscinitializations.dscinitialization.opendatahub.io --ignore-not-found 2>/dev/null || true - kubectl delete crd datasciencepipelinesapplications.datasciencepipelinesapplications.opendatahub.io --ignore-not-found 2>/dev/null || true - # Add more CRDs as needed + echo "17b. Deleting all ODH CRDs..." + for crd in $(kubectl get crd -o name 2>/dev/null | grep -E 'opendatahub\.io|trustyai\.opendatahub'); do + echo " Deleting $crd" + kubectl delete "$crd" --ignore-not-found --timeout=30s 2>/dev/null || true + done else - echo "17. Skipping CRD deletion (use --include-crds to remove CRDs)" + echo "17b. Skipping ODH-internal CRD deletion (use --include-crds to remove all)" fi echo "" @@ -233,4 +297,5 @@ echo "" echo "Verify cleanup with:" echo " kubectl get subscription -A | grep -i odh" echo " kubectl get csv -A | grep -i odh" -echo " kubectl get ns | grep -E 'odh|opendatahub|models-as-a-service|kuadrant|rh-connectivity-link|keycloak-system|llm'" \ No newline at end of file +echo " kubectl get ns | grep -E 'odh|opendatahub|models-as-a-service|kuadrant|rh-connectivity-link|keycloak-system|llm' + kubectl get deployment maas-api maas-controller postgres -n redhat-ods-applications 2>/dev/null || echo ' (no MaaS resources in redhat-ods-applications)'" \ No newline at end of file diff --git a/.github/hack/install-odh.sh b/.github/hack/install-odh.sh index 7a898d7bd..524cce30e 100755 --- a/.github/hack/install-odh.sh +++ b/.github/hack/install-odh.sh @@ -12,6 +12,8 @@ # OPERATOR_INSTALL_PLAN_APPROVAL - Manual (default) or Automatic; use "-" to omit. 
# Manual: blocks auto-upgrades; this script auto-approves only the first InstallPlan so install does not stall. # OPERATOR_IMAGE - Custom operator image to patch into CSV (optional) +# OPERATOR_OPERANDS_MAP - Path to operands-map.yaml for RELATED_IMAGE env var injection (optional) +# Used with OPERATOR_IMAGE to ensure component images match the operator. # # Usage: ./install-odh.sh @@ -59,6 +61,51 @@ patch_operator_csv_if_needed() { {\"op\": \"replace\", \"path\": \"/spec/install/spec/deployments/0/spec/template/spec/containers/0/image\", \"value\": \"$OPERATOR_IMAGE\"} ]" log_info "CSV $csv_name patched with image $OPERATOR_IMAGE" + + # When using a custom operator image, the community CSV may lack RELATED_IMAGE env vars + # that the operator needs to deploy the correct component versions. + # If OPERATOR_OPERANDS_MAP points to a local operands-map.yaml, inject its env vars into the CSV. + if [[ -n "${OPERATOR_OPERANDS_MAP:-}" && -f "$OPERATOR_OPERANDS_MAP" ]]; then + log_info "Injecting RELATED_IMAGE env vars from $OPERATOR_OPERANDS_MAP into CSV" + local env_patches="[" + local first=true + while IFS= read -r line; do + local name value + name=$(echo "$line" | sed -n 's/.*name: \(RELATED_IMAGE_[^ ]*\)/\1/p') + if [[ -n "$name" ]]; then + read -r value_line + value=$(echo "$value_line" | sed -n 's/.*value: \(.*\)/\1/p') + if [[ -n "$value" ]]; then + $first || env_patches+="," + first=false + env_patches+="{\"name\":\"$name\",\"value\":\"$value\"}" + fi + fi + done < "$OPERATOR_OPERANDS_MAP" + + if [[ "$env_patches" != "[" ]]; then + env_patches+="]" + local container_path="/spec/install/spec/deployments/0/spec/template/spec/containers/0" + local existing_env + existing_env=$(kubectl get csv "$csv_name" -n "$namespace" -o jsonpath='{.spec.install.spec.deployments[0].spec.template.spec.containers[0].env}' 2>/dev/null || echo "[]") + + local merged_env + merged_env=$(EXISTING_ENV="$existing_env" python3 -c " +import json, sys, os +existing = json.loads(os.environ.get('EXISTING_ENV') or '[]') +new_envs = json.loads(sys.stdin.read()) +existing_names = {e['name'] for e 
in existing} +for e in new_envs: + if e['name'] not in existing_names: + existing.append(e) +print(json.dumps(existing)) +" <<< "$env_patches") + + kubectl patch csv "$csv_name" -n "$namespace" --type='json' \ + -p="[{\"op\": \"replace\", \"path\": \"${container_path}/env\", \"value\": ${merged_env}}]" + log_info "CSV env vars patched with RELATED_IMAGE entries" + fi + fi } echo "=== Installing OpenDataHub operator ===" diff --git a/.github/workflows/openapi-validation.yml b/.github/workflows/openapi-validation.yml new file mode 100644 index 000000000..c4825b3ea --- /dev/null +++ b/.github/workflows/openapi-validation.yml @@ -0,0 +1,212 @@ +name: OpenAPI Validation + +on: + pull_request: + paths: + - 'maas-api/openapi3.yaml' + - '.spectral.yml' + - '.github/workflows/openapi-validation.yml' + push: + branches: [main, master] + paths: + - 'maas-api/openapi3.yaml' + +permissions: + contents: read + +jobs: + validate-spec: + name: Validate OpenAPI Specification + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: Setup Node.js + uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 # v4.0.2 + with: + node-version: '20' + + - name: Install Spectral + run: npm install -g @stoplight/spectral-cli@6.13.1 + + - name: Validate OpenAPI Spec + # NOTE: continue-on-error is temporary until existing errors are fixed in PR #694 + # Once #694 merges, remove this line to enforce strict validation + continue-on-error: true + run: | + echo "🔍 Validating OpenAPI specification..." + spectral lint maas-api/openapi3.yaml --format stylish --verbose + + - name: Generate Validation Report + if: always() + run: | + echo "📊 Generating detailed validation report..." 
+ spectral lint maas-api/openapi3.yaml --format json > openapi-validation-report.json || true + + # Show summary + if [ -f openapi-validation-report.json ]; then + echo "## OpenAPI Validation Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + ERROR_COUNT=$(jq '[.[] | select(.severity == 0)] | length' openapi-validation-report.json) + WARN_COUNT=$(jq '[.[] | select(.severity == 1)] | length' openapi-validation-report.json) + INFO_COUNT=$(jq '[.[] | select(.severity == 2)] | length' openapi-validation-report.json) + HINT_COUNT=$(jq '[.[] | select(.severity == 3)] | length' openapi-validation-report.json) + + echo "- ❌ Errors: $ERROR_COUNT" >> $GITHUB_STEP_SUMMARY + echo "- ⚠️ Warnings: $WARN_COUNT" >> $GITHUB_STEP_SUMMARY + echo "- ℹ️ Info: $INFO_COUNT" >> $GITHUB_STEP_SUMMARY + echo "- 💡 Hints: $HINT_COUNT" >> $GITHUB_STEP_SUMMARY + + if [ "$ERROR_COUNT" -eq 0 ]; then + echo "" >> $GITHUB_STEP_SUMMARY + echo "✅ **No errors found!** Spec is valid." >> $GITHUB_STEP_SUMMARY + fi + fi + + - name: Upload Validation Report + if: always() + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + with: + name: openapi-validation-report + path: openapi-validation-report.json + retention-days: 30 + + breaking-changes: + name: Detect Breaking Changes + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' + + steps: + - name: Checkout PR + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + fetch-depth: 0 # Need full history for comparison + + - name: Setup Node.js + uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 # v4.0.2 + with: + node-version: '20' + + - name: Install oasdiff + run: | + curl -fsSL https://raw.githubusercontent.com/oasdiff/oasdiff/main/install.sh | sh + oasdiff --version + + - name: Check for Breaking Changes + id: breaking_changes + # NOTE: continue-on-error allows PR to proceed even with breaking changes + # Review breaking-changes-report 
artifact and document intentional changes in PR + continue-on-error: true + env: + BASE_REF: ${{ github.base_ref }} + run: | + # Validate base ref to prevent script injection + if [[ ! "$BASE_REF" =~ ^[A-Za-z0-9._/-]+$ ]]; then + echo "❌ Invalid base ref format" + exit 1 + fi + + echo "🔍 Checking for API breaking changes..." + + # Get base branch spec + git fetch origin "$BASE_REF" + git show "origin/$BASE_REF:maas-api/openapi3.yaml" > base-spec.yaml + + # Run breaking change detection + oasdiff breaking base-spec.yaml maas-api/openapi3.yaml > breaking-changes.txt || true + + if [ -s breaking-changes.txt ]; then + echo "has_breaking_changes=true" >> $GITHUB_OUTPUT + echo "## ⚠️ Breaking Changes Detected" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + cat breaking-changes.txt >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + else + echo "has_breaking_changes=false" >> $GITHUB_OUTPUT + echo "## ✅ No Breaking Changes" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "This PR does not introduce any breaking API changes." >> $GITHUB_STEP_SUMMARY + fi + + - name: Upload Breaking Changes Report + if: always() + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + with: + name: breaking-changes-report + path: breaking-changes.txt + retention-days: 30 + + - name: Fail on Breaking Changes + if: steps.breaking_changes.outputs.has_breaking_changes == 'true' + # NOTE: continue-on-error is temporary to allow PR #693 to merge + # This establishes the validation infrastructure. Once #694 fixes existing issues, + # remove this line to enforce strict breaking change checks + continue-on-error: true + run: | + echo "❌ Breaking changes detected. If intentional, document in PR and get approval." + echo " Consider:" + echo " - Is this a major version bump?" + echo " - Are clients given migration time?" + echo " - Is there a deprecation notice?" 
+ exit 1 + + changelog-check: + name: Check API Changelog + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' + + steps: + - name: Checkout + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + fetch-depth: 0 # Need full history for git diff + + - name: Check if API changed + id: api_changed + env: + BASE_REF: ${{ github.base_ref }} + run: | + # Validate base ref to prevent script injection + if [[ ! "$BASE_REF" =~ ^[A-Za-z0-9._/-]+$ ]]; then + echo "❌ Invalid base ref format" + exit 1 + fi + + if git diff --name-only "origin/$BASE_REF...HEAD" | grep -q "maas-api/openapi3.yaml"; then + echo "changed=true" >> $GITHUB_OUTPUT + else + echo "changed=false" >> $GITHUB_OUTPUT + fi + + - name: Verify Changelog Entry + if: steps.api_changed.outputs.changed == 'true' + env: + BASE_REF: ${{ github.base_ref }} + run: | + # Validate base ref to prevent script injection + if [[ ! "$BASE_REF" =~ ^[A-Za-z0-9._/-]+$ ]]; then + echo "❌ Invalid base ref format" + exit 1 + fi + + echo "📝 Checking for API changelog entry..." + + # Check if there's an API changelog file + if [ ! -f docs/content/release-notes/api-changelog.md ]; then + echo "⚠️ No API changelog found at docs/content/release-notes/api-changelog.md" >> $GITHUB_STEP_SUMMARY + echo " Consider creating one to track API changes over time." 
>> $GITHUB_STEP_SUMMARY + exit 0 + fi + + # Check if changelog was updated in this PR + if git diff --name-only "origin/$BASE_REF...HEAD" | grep -q "docs/content/release-notes/api-changelog.md"; then + echo "✅ API changelog updated" >> $GITHUB_STEP_SUMMARY + else + echo "⚠️ API spec changed but changelog not updated" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "Consider adding an entry to docs/content/release-notes/api-changelog.md" >> $GITHUB_STEP_SUMMARY + fi diff --git a/.spectral.yml b/.spectral.yml new file mode 100644 index 000000000..f44da63f7 --- /dev/null +++ b/.spectral.yml @@ -0,0 +1,36 @@ +# Spectral OpenAPI Linting Rules +# https://stoplight.io/open-source/spectral + +extends: [[spectral:oas, all]] + +rules: + # Require operation IDs for all endpoints + operation-operationId: error + + # Require descriptions + info-description: error + operation-description: warn + + # Require a documented success (2xx/3xx) response for each operation + operation-success-response: error + + # Security + oas3-api-servers: error + + # Documentation quality + info-contact: warn + info-license: off # May not have public license + + # Schema quality + oas3-schema: error + + # Custom rules for MaaS API + maas-subscription-header: + description: Endpoints should document X-MaaS-Subscription header behavior + severity: hint + given: $.paths[*][*] + then: + field: description + function: pattern + functionOptions: + match: ".*subscription.*|.*Subscription.*" diff --git a/.tekton/maas-group-test.yaml b/.tekton/maas-group-test.yaml new file mode 100644 index 000000000..11174a430 --- /dev/null +++ b/.tekton/maas-group-test.yaml @@ -0,0 +1,49 @@ +--- +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + annotations: + build.appstudio.openshift.io/repo: https://github.com/opendatahub-io/models-as-a-service?rev={{revision}} + build.appstudio.redhat.com/commit_sha: '{{revision}}' + build.appstudio.redhat.com/target_branch: '{{target_branch}}' + 
build.appstudio.redhat.com/pull_request_number: '{{pull_request_number}}' + pipelinesascode.tekton.dev/cancel-in-progress: "false" + pipelinesascode.tekton.dev/max-keep-runs: "3" + pipelinesascode.tekton.dev/on-cel-expression: event == "group-test" + pipelinesascode.tekton.dev/on-comment: "^/group-test" + name: maas-group-test + namespace: open-data-hub-tenant + labels: + appstudio.openshift.io/application: group-testing + appstudio.openshift.io/component: maas-group + pipelines.appstudio.openshift.io/type: test +spec: + params: + - name: group-components + value: '{ "odh-maas-api-ci": "opendatahub/maas-api", "odh-maas-controller-ci": "opendatahub/maas-controller" }' + pipelineRef: + resolver: git + params: + - name: url + value: https://github.com/opendatahub-io/odh-konflux-central.git + - name: revision + value: main + - name: pathInRepo + value: integration-tests/models-as-a-service/pr-group-testing-pipeline.yaml + taskRunTemplate: + podTemplate: + nodeSelector: + konflux-ci.dev/workload: konflux-tenants + tolerations: + - effect: NoSchedule + key: konflux-ci.dev/workload + operator: Equal + value: konflux-tenants + serviceAccountName: konflux-integration-runner + timeouts: + pipeline: 4h0m0s + tasks: 3h + workspaces: + - name: git-auth + secret: + secretName: '{{ git_auth_secret }}' diff --git a/.tekton/odh-maas-controller-pull-request.yaml b/.tekton/odh-maas-controller-pull-request.yaml index 61771b4bf..90b8937b1 100644 --- a/.tekton/odh-maas-controller-pull-request.yaml +++ b/.tekton/odh-maas-controller-pull-request.yaml @@ -28,14 +28,16 @@ spec: - name: output-image value: quay.io/opendatahub/maas-controller:odh-pr - name: dockerfile - value: Dockerfile + value: maas-controller/Dockerfile - name: path-context - value: maas-controller + value: . 
- name: additional-tags value: - 'odh-pr-{{revision}}' - name: pipeline-type value: pull-request + - name: enable-group-testing + value: "true" pipelineRef: resolver: git params: diff --git a/.tekton/odh-maas-controller-push.yaml b/.tekton/odh-maas-controller-push.yaml index 8f882f415..141d20f18 100644 --- a/.tekton/odh-maas-controller-push.yaml +++ b/.tekton/odh-maas-controller-push.yaml @@ -25,9 +25,9 @@ spec: - name: output-image value: quay.io/opendatahub/maas-controller:odh-stable - name: dockerfile - value: Dockerfile + value: maas-controller/Dockerfile - name: path-context - value: maas-controller + value: . pipelineRef: resolver: git params: diff --git a/.tekton/odh-maas-controller-v3-4-push.yaml b/.tekton/odh-maas-controller-v3-4-push.yaml index 795c67462..dd7088e21 100644 --- a/.tekton/odh-maas-controller-v3-4-push.yaml +++ b/.tekton/odh-maas-controller-v3-4-push.yaml @@ -32,9 +32,9 @@ spec: - name: rhoai-version value: "3.4.0" - name: dockerfile - value: Dockerfile.konflux + value: maas-controller/Dockerfile.konflux - name: path-context - value: maas-controller + value: . - name: hermetic value: true - name: prefetch-input diff --git a/README.md b/README.md index e0be87e05..7e1f278ae 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ Note: The `scripts/deploy.sh` script creates a development PostgreSQL instance a ### Deploy Infrastructure -Use the unified deployment script for all deployment scenarios: +Use the unified deployment script for all deployment scenarios. 
The script installs prerequisites (policy engine, Gateway, PostgreSQL, Authorino TLS) and deploys `maas-controller`, which then deploys `maas-api` automatically via its **Tenant reconciler**: ```bash # Deploy ODH (default) @@ -107,7 +107,7 @@ For detailed instructions, see the [Deployment Guide](docs/content/quickstart.md | Variable | Description | Example | |----------|-------------|---------| -| `MAAS_API_IMAGE` | Custom MaaS API container image (works in both operator and kustomize modes) | `quay.io/user/maas-api:pr-123` | +| `MAAS_API_IMAGE` | Custom MaaS API container image (passed to the Tenant reconciler via `RELATED_IMAGE_ODH_MAAS_API_IMAGE`) | `quay.io/user/maas-api:pr-123` | | `MAAS_CONTROLLER_IMAGE` | Custom MaaS controller container image | `quay.io/user/maas-controller:pr-123` | | `METADATA_CACHE_TTL` | TTL in seconds for Authorino metadata HTTP caching | `60` (default), `300` | | `AUTHZ_CACHE_TTL` | TTL in seconds for Authorino OPA authorization caching | `60` (default), `30` | @@ -158,6 +158,7 @@ MAAS_API_IMAGE=quay.io/myuser/maas-api:pr-123 \ - [Deployment Guide](docs/content/quickstart.md) - Complete deployment instructions - [MaaS API Documentation](maas-api/README.md) - Go API for key management +- [MaaS Controller Documentation](maas-controller/README.md) - Controller, Tenant reconciler, and subscription model - [Authorino Caching Configuration](docs/content/configuration-and-management/authorino-caching.md) - Cache tuning for metadata and authorization Online Documentation: [https://opendatahub-io.github.io/models-as-a-service/](https://opendatahub-io.github.io/models-as-a-service/) diff --git a/deployment/base/maas-api/core/cronjob-cleanup.yaml b/deployment/base/maas-api/core/cronjob-cleanup.yaml index 8acb85f89..0782c769f 100644 --- a/deployment/base/maas-api/core/cronjob-cleanup.yaml +++ b/deployment/base/maas-api/core/cronjob-cleanup.yaml @@ -22,7 +22,7 @@ spec: runAsNonRoot: true containers: - name: cleanup - image: curlimages/curl:8.18.0 
+ image: registry.redhat.io/ubi9/ubi-minimal:9.7 command: - /bin/sh - -c diff --git a/deployment/base/maas-api/overlays/tls/README.md b/deployment/base/maas-api/overlays/tls/README.md index 463c01e1f..3fdcfa651 100644 --- a/deployment/base/maas-api/overlays/tls/README.md +++ b/deployment/base/maas-api/overlays/tls/README.md @@ -41,12 +41,11 @@ Client → Gateway (TLS termination) → [DestinationRule] → maas-api:8443 (TL kustomize build deployment/overlays/tls | kubectl apply -f - ``` -### As part of full TLS backend +### As part of Tenant reconciler -This overlay is referenced by `overlays/tls-backend` which adds: -- Authorino TLS configuration -- HTTPRoute port patches for HTTPS backend -- Service CA bundle for inter-service trust +This overlay is referenced by `maas-api/deploy/overlays/odh` (the Tenant reconciler overlay) +and `deployment/overlays/odh` (the ODH operator overlay). The Tenant reconciler also applies +gateway policies and configures DestinationRule namespace via PostRender. ## Certificate Management diff --git a/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_maasauthpolicies.yaml b/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_maasauthpolicies.yaml index 7d41407dc..b58754a44 100644 --- a/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_maasauthpolicies.yaml +++ b/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_maasauthpolicies.yaml @@ -123,37 +123,64 @@ spec: properties: authPolicies: description: AuthPolicies lists the underlying Kuadrant AuthPolicies - and their Accepted/Enforced state. + and their status. items: - description: AuthPolicyRefStatus reports the status of one underlying - Kuadrant AuthPolicy created by this MaaSAuthPolicy. + description: |- + AuthPolicyRefStatus reports the status of a generated Kuadrant AuthPolicy. + Embeds ResourceRefStatus for common fields (Ready, Reason, Message). 
properties: - accepted: - description: Accepted reports whether the AuthPolicy has been - accepted (e.g. status.conditions type=Accepted). - type: string - enforced: - description: Enforced reports whether the AuthPolicy is enforced - (e.g. status.conditions type=Enforced). + message: + description: Message is a human-readable description of the + status + maxLength: 1024 type: string model: description: Model is the MaaSModelRef name this AuthPolicy targets. + maxLength: 63 + minLength: 1 type: string modelNamespace: description: ModelNamespace is the namespace of the MaaSModelRef. + maxLength: 63 + minLength: 1 type: string name: - description: Name is the name of the AuthPolicy resource. + description: Name of the referenced resource + maxLength: 253 type: string namespace: - description: Namespace is the namespace of the AuthPolicy resource. + description: Namespace of the referenced resource + maxLength: 63 + type: string + ready: + description: Ready indicates whether the resource is valid and + healthy + type: boolean + reason: + description: Reason is a machine-readable reason code + enum: + - Reconciled + - ReconcileFailed + - PartialFailure + - Valid + - NotFound + - GetFailed + - Accepted + - AcceptedEnforced + - NotAccepted + - Enforced + - NotEnforced + - BackendNotReady + - ConditionsNotFound + - Unknown type: string required: - model - modelNamespace - name - namespace + - ready type: object type: array conditions: @@ -219,6 +246,7 @@ spec: enum: - Pending - Active + - Degraded - Failed type: string type: object diff --git a/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_maassubscriptions.yaml b/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_maassubscriptions.yaml index a0c3a98ef..df4cc0d40 100644 --- a/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_maassubscriptions.yaml +++ b/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_maassubscriptions.yaml @@ -88,9 +88,14 @@ spec: minimum: 1 type: 
integer window: - description: Window is the time window (e.g., "1m", "1h", - "24h") - pattern: ^(\d+)(s|m|h|d)$ + description: |- + Window is the time window for rate limiting (e.g., "1m", "1h", "24h"). + Allowed units: s (seconds), m (minutes), h (hours). Days (d) are not + supported; use hours instead (e.g., "24h" for one day). + The numeric part must be between 1 and 9999. + maxLength: 5 + minLength: 2 + pattern: ^[1-9]\d{0,3}(s|m|h)$ type: string required: - limit @@ -217,13 +222,116 @@ spec: - type type: object type: array + modelRefStatuses: + description: ModelRefStatuses reports the status of each referenced + MaaSModelRef + items: + description: ModelRefStatus reports the status of a referenced MaaSModelRef. + properties: + message: + description: Message is a human-readable description of the + status + maxLength: 1024 + type: string + name: + description: Name of the referenced resource + maxLength: 253 + type: string + namespace: + description: Namespace of the referenced resource + maxLength: 63 + type: string + ready: + description: Ready indicates whether the resource is valid and + healthy + type: boolean + reason: + description: Reason is a machine-readable reason code + enum: + - Reconciled + - ReconcileFailed + - PartialFailure + - Valid + - NotFound + - GetFailed + - Accepted + - AcceptedEnforced + - NotAccepted + - Enforced + - NotEnforced + - BackendNotReady + - ConditionsNotFound + - Unknown + type: string + required: + - name + - namespace + - ready + type: object + type: array phase: description: Phase represents the current phase of the subscription enum: - Pending - Active + - Degraded - Failed type: string + tokenRateLimitStatuses: + description: TokenRateLimitStatuses reports the status of each generated + TokenRateLimitPolicy + items: + description: TokenRateLimitStatus reports the status of a generated + TokenRateLimitPolicy. 
+ properties: + message: + description: Message is a human-readable description of the + status + maxLength: 1024 + type: string + model: + description: Model is the MaaSModelRef name this TokenRateLimitPolicy + targets + maxLength: 63 + minLength: 1 + type: string + name: + description: Name of the referenced resource + maxLength: 253 + type: string + namespace: + description: Namespace of the referenced resource + maxLength: 63 + type: string + ready: + description: Ready indicates whether the resource is valid and + healthy + type: boolean + reason: + description: Reason is a machine-readable reason code + enum: + - Reconciled + - ReconcileFailed + - PartialFailure + - Valid + - NotFound + - GetFailed + - Accepted + - AcceptedEnforced + - NotAccepted + - Enforced + - NotEnforced + - BackendNotReady + - ConditionsNotFound + - Unknown + type: string + required: + - model + - name + - namespace + - ready + type: object + type: array type: object type: object served: true diff --git a/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_tenants.yaml b/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_tenants.yaml new file mode 100644 index 000000000..8a5208a6d --- /dev/null +++ b/deployment/base/maas-controller/crd/bases/maas.opendatahub.io_tenants.yaml @@ -0,0 +1,216 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.4 + name: tenants.maas.opendatahub.io +spec: + group: maas.opendatahub.io + names: + kind: Tenant + listKind: TenantList + plural: tenants + singular: tenant + scope: Namespaced + versions: + - additionalPrinterColumns: + - description: Ready + jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - description: Reason + jsonPath: .status.conditions[?(@.type=="Ready")].reason + name: Reason + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + 
openAPIV3Schema: + description: |- + Tenant is the namespace-scoped API for the MaaS platform tenant. + The CEL validation above enforces a singleton (name == "default-tenant") during v1alpha1. + To enable multi-tenancy later, remove the XValidation rule — no CRD migration required + because removing a validation is a non-breaking schema change. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: TenantSpec defines the desired state of Tenant. + properties: + apiKeys: + description: APIKeys contains configuration for API key management. + properties: + maxExpirationDays: + format: int32 + minimum: 1 + type: integer + type: object + externalOIDC: + description: ExternalOIDC configures an external OIDC identity provider + for the maas-api AuthPolicy. + properties: + clientId: + description: ClientID is the OAuth2 client ID. + maxLength: 256 + minLength: 1 + pattern: ^\S+$ + type: string + issuerUrl: + description: IssuerURL is the OIDC issuer URL (e.g. https://keycloak.example.com/realms/maas). + maxLength: 2048 + minLength: 9 + pattern: ^https://\S+$ + type: string + ttl: + default: 300 + description: TTL is the JWKS cache duration in seconds. 
+ minimum: 30 + type: integer + required: + - clientId + - issuerUrl + type: object + gatewayRef: + description: |- + GatewayRef specifies which Gateway (Gateway API) to use for exposing model endpoints. + If omitted, defaults to openshift-ingress/maas-default-gateway. + properties: + name: + default: maas-default-gateway + maxLength: 63 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?)?$ + type: string + namespace: + default: openshift-ingress + maxLength: 63 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?)?$ + type: string + type: object + telemetry: + description: Telemetry contains configuration for telemetry and metrics + collection. + properties: + enabled: + default: true + type: boolean + metrics: + description: TenantMetricsConfig defines optional metric dimensions. + properties: + captureGroup: + default: false + type: boolean + captureModelUsage: + default: true + type: boolean + captureOrganization: + default: true + type: boolean + captureUser: + default: false + description: |- + CaptureUser adds a "user" dimension to telemetry metrics containing + the authenticated user ID. Defaults to false. Enabling this may + have GDPR / privacy implications — ensure compliance before use. + type: boolean + type: object + type: object + type: object + status: + description: TenantStatus defines the observed state of Tenant. + properties: + conditions: + description: |- + Conditions represent the latest available observations. + Types mirror ODH modelsasservice / internal controller status for DSC aggregation: Ready, + DependenciesAvailable, MaaSPrerequisitesAvailable, DeploymentsAvailable, Degraded. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. 
If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + phase: + description: Phase is a high-level lifecycle phase for the platform + reconcile. 
+ enum: + - Pending + - Active + - Degraded + - Failed + type: string + type: object + type: object + x-kubernetes-validations: + - message: Tenant name must be default-tenant + rule: self.metadata.name == 'default-tenant' + served: true + storage: true + subresources: + status: {} diff --git a/deployment/base/maas-controller/crd/kustomization.yaml b/deployment/base/maas-controller/crd/kustomization.yaml index 8fca2319c..8753afeb6 100644 --- a/deployment/base/maas-controller/crd/kustomization.yaml +++ b/deployment/base/maas-controller/crd/kustomization.yaml @@ -3,4 +3,5 @@ resources: - bases/maas.opendatahub.io_externalmodels.yaml - bases/maas.opendatahub.io_maasauthpolicies.yaml - bases/maas.opendatahub.io_maasmodelrefs.yaml + - bases/maas.opendatahub.io_tenants.yaml - bases/maas.opendatahub.io_maassubscriptions.yaml diff --git a/deployment/base/maas-controller/manager/manager.yaml b/deployment/base/maas-controller/manager/manager.yaml index 2b459579a..85a62e199 100644 --- a/deployment/base/maas-controller/manager/manager.yaml +++ b/deployment/base/maas-controller/manager/manager.yaml @@ -32,7 +32,6 @@ spec: - --gateway-namespace=$(GATEWAY_NAMESPACE) - --maas-api-namespace=$(MAAS_API_NAMESPACE) - --maas-subscription-namespace=$(MAAS_SUBSCRIPTION_NAMESPACE) - - --cluster-audience=$(CLUSTER_AUDIENCE) - --metadata-cache-ttl=60 - --authz-cache-ttl=60 env: @@ -46,11 +45,6 @@ spec: fieldPath: metadata.namespace - name: MAAS_SUBSCRIPTION_NAMESPACE value: "models-as-a-service" - - name: CLUSTER_AUDIENCE - valueFrom: - configMapKeyRef: - name: maas-parameters - key: cluster-audience image: maas-controller name: manager imagePullPolicy: Always diff --git a/deployment/base/maas-controller/rbac/clusterrole.yaml b/deployment/base/maas-controller/rbac/clusterrole.yaml index a6816bbc5..c771d3497 100644 --- a/deployment/base/maas-controller/rbac/clusterrole.yaml +++ b/deployment/base/maas-controller/rbac/clusterrole.yaml @@ -7,32 +7,113 @@ rules: - apiGroups: - "" resources: - - 
namespaces + - configmaps + - serviceaccounts verbs: - create + - delete - get + - list + - patch + - watch - apiGroups: - "" resources: + - endpoints + - pods - secrets verbs: - get + - list + - watch +- apiGroups: + - "" + resources: + - namespaces + verbs: + - create + - get + - list + - watch +- apiGroups: + - "" + resources: + - serviceaccounts/token + verbs: + - create - apiGroups: - "" resources: - services verbs: - create + - delete - get - list + - patch - update - watch +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - get + - list + - watch +- apiGroups: + - apps + resources: + - deployments + verbs: + - create + - delete + - get + - list + - patch + - watch +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create +- apiGroups: + - batch + resources: + - cronjobs + verbs: + - create + - delete + - get + - list + - patch + - watch - apiGroups: - config.openshift.io resources: - authentications verbs: - get + - list + - watch +- apiGroups: + - extensions.kuadrant.io + resources: + - telemetrypolicies + verbs: + - create + - delete + - get + - list + - patch + - watch - apiGroups: - gateway.networking.k8s.io resources: @@ -72,6 +153,17 @@ rules: - patch - update - watch +- apiGroups: + - kuadrant.io + resources: + - ratelimitpolicies + verbs: + - create + - delete + - get + - list + - patch + - watch - apiGroups: - maas.opendatahub.io resources: @@ -80,12 +172,23 @@ rules: - get - list - watch +- apiGroups: + - maas.opendatahub.io + resources: + - externalmodels/finalizers + - maasauthpolicies/finalizers + - maasmodelrefs/finalizers + - maassubscriptions/finalizers + - tenants/finalizers + verbs: + - update - apiGroups: - maas.opendatahub.io resources: - maasauthpolicies - maasmodelrefs - maassubscriptions + - tenants verbs: - create - delete @@ -94,20 +197,13 @@ rules: - patch - 
update - watch -- apiGroups: - - maas.opendatahub.io - resources: - - maasauthpolicies/finalizers - - maasmodelrefs/finalizers - - maassubscriptions/finalizers - verbs: - - update - apiGroups: - maas.opendatahub.io resources: - maasauthpolicies/status - maasmodelrefs/status - maassubscriptions/status + - tenants/status verbs: - get - patch @@ -121,8 +217,20 @@ rules: - delete - get - list + - patch - update - watch +- apiGroups: + - networking.istio.io + resources: + - envoyfilters + verbs: + - create + - delete + - get + - list + - patch + - watch - apiGroups: - networking.istio.io resources: @@ -133,6 +241,37 @@ rules: - list - update - watch +- apiGroups: + - networking.k8s.io + resources: + - networkpolicies + verbs: + - create + - delete + - get + - list + - patch + - watch +- apiGroups: + - operator.authorino.kuadrant.io + resources: + - authorinos + verbs: + - get + - list + - watch +- apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - clusterroles + verbs: + - create + - delete + - get + - list + - patch + - watch - apiGroups: - serving.kserve.io resources: @@ -141,3 +280,14 @@ rules: - get - list - watch +- apiGroups: + - telemetry.istio.io + resources: + - telemetries + verbs: + - create + - delete + - get + - list + - patch + - watch diff --git a/deployment/base/maas-controller/rbac/kustomization.yaml b/deployment/base/maas-controller/rbac/kustomization.yaml index 6fcc77e75..c25ed593b 100644 --- a/deployment/base/maas-controller/rbac/kustomization.yaml +++ b/deployment/base/maas-controller/rbac/kustomization.yaml @@ -4,3 +4,5 @@ resources: - service_account.yaml - leader_election_role.yaml - leader_election_role_binding.yaml + - owner_role.yaml + - viewer_role.yaml diff --git a/deployment/base/maas-controller/rbac/owner_role.yaml b/deployment/base/maas-controller/rbac/owner_role.yaml new file mode 100644 index 000000000..bafff227f --- /dev/null +++ b/deployment/base/maas-controller/rbac/owner_role.yaml @@ -0,0 +1,22 @@ +--- 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: maas-owner-role + labels: + rbac.authorization.k8s.io/aggregate-to-admin: "true" + rbac.authorization.k8s.io/aggregate-to-edit: "true" +rules: +- apiGroups: + - maas.opendatahub.io + resources: + - maasmodelrefs + - externalmodels + verbs: + - create + - delete + - get + - list + - patch + - update + - watch diff --git a/deployment/base/maas-controller/rbac/viewer_role.yaml b/deployment/base/maas-controller/rbac/viewer_role.yaml new file mode 100644 index 000000000..f76885e56 --- /dev/null +++ b/deployment/base/maas-controller/rbac/viewer_role.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: maas-viewer-role + labels: + rbac.authorization.k8s.io/aggregate-to-view: "true" + rbac.authorization.k8s.io/aggregate-to-admin: "true" + rbac.authorization.k8s.io/aggregate-to-edit: "true" +rules: +- apiGroups: + - maas.opendatahub.io + resources: + - maasmodelrefs + - externalmodels + verbs: + - get + - list + - watch diff --git a/deployment/components/observability/observability/dashboards/prometheus-data-source.yaml b/deployment/components/observability/observability/dashboards/kuadrant-prometheus-datasource.yaml similarity index 94% rename from deployment/components/observability/observability/dashboards/prometheus-data-source.yaml rename to deployment/components/observability/observability/dashboards/kuadrant-prometheus-datasource.yaml index 541b21b0a..97480f2fa 100644 --- a/deployment/components/observability/observability/dashboards/prometheus-data-source.yaml +++ b/deployment/components/observability/observability/dashboards/kuadrant-prometheus-datasource.yaml @@ -1,7 +1,7 @@ apiVersion: perses.dev/v1alpha1 kind: PersesDatasource metadata: - name: prometheus + name: kuadrant-prometheus-datasource namespace: opendatahub spec: client: diff --git a/deployment/components/observability/observability/dashboards/kustomization.yaml 
b/deployment/components/observability/observability/dashboards/kustomization.yaml index 659606e98..489bb9ab1 100644 --- a/deployment/components/observability/observability/dashboards/kustomization.yaml +++ b/deployment/components/observability/observability/dashboards/kustomization.yaml @@ -8,7 +8,7 @@ metadata: resources: - usage-dashboard.yaml - - prometheus-data-source.yaml + - kuadrant-prometheus-datasource.yaml labels: - pairs: diff --git a/deployment/components/observability/observability/dashboards/usage-dashboard.yaml b/deployment/components/observability/observability/dashboards/usage-dashboard.yaml index 3fa2dfe96..94ecc68a0 100644 --- a/deployment/components/observability/observability/dashboards/usage-dashboard.yaml +++ b/deployment/components/observability/observability/dashboards/usage-dashboard.yaml @@ -51,7 +51,7 @@ spec: display: collapse: open: true - title: Token Consumption by User + title: Token Consumption items: - content: $ref: '#/spec/panels/tokenConsumptionByUser' @@ -80,7 +80,7 @@ spec: spec: datasource: kind: PrometheusDatasource - name: prometheus + name: kuadrant-prometheus-datasource query: 'count(count by (user) (increase(authorized_calls{user!="", user=~"$user", subscription=~"$subscription"}[$__range]) > 0)) or vector(0)' seriesNameFormat: Users successRate: @@ -104,7 +104,7 @@ spec: spec: datasource: kind: PrometheusDatasource - name: prometheus + name: kuadrant-prometheus-datasource query: '((sum(increase(authorized_calls{user!="", user=~"$user", subscription=~"$subscription"}[$__range]))) / ((sum(increase(authorized_calls{user!="", user=~"$user", subscription=~"$subscription"}[$__range])) + (sum(increase(limited_calls{user!="", user=~"$user", subscription=~"$subscription"}[$__range])) or vector(0))) > 0)) or vector(1)' seriesNameFormat: Success Rate tokenConsumptionByUser: @@ -112,7 +112,7 @@ spec: spec: display: description: Per-user totals over the Range dropdown window. 
- name: Token Consumption by User + name: Token Consumption plugin: kind: Table spec: @@ -167,7 +167,7 @@ spec: spec: datasource: kind: PrometheusDatasource - name: prometheus + name: kuadrant-prometheus-datasource query: 'round(sum by (user, subscription, model) (increase(authorized_hits{user!="", user=~"$user", subscription=~"$subscription", model=~"$model"}[$__range])))' - kind: TimeSeriesQuery spec: @@ -176,7 +176,7 @@ spec: spec: datasource: kind: PrometheusDatasource - name: prometheus + name: kuadrant-prometheus-datasource query: |- round( sum by (user, subscription, model) ( @@ -194,7 +194,7 @@ spec: spec: datasource: kind: PrometheusDatasource - name: prometheus + name: kuadrant-prometheus-datasource query: |- round( sum by (user, subscription, model) ( @@ -227,7 +227,7 @@ spec: spec: datasource: kind: PrometheusDatasource - name: prometheus + name: kuadrant-prometheus-datasource query: 'sum(increase(limited_calls{user!="", user=~"$user", subscription=~"$subscription"}[$__range])) or vector(0)' seriesNameFormat: Errors totalRequests: @@ -252,7 +252,7 @@ spec: spec: datasource: kind: PrometheusDatasource - name: prometheus + name: kuadrant-prometheus-datasource query: '(sum(increase(authorized_calls{user!="", user=~"$user", subscription=~"$subscription"}[$__range])) or vector(0)) + (sum(increase(limited_calls{user!="", user=~"$user", subscription=~"$subscription"}[$__range])) or vector(0))' seriesNameFormat: Requests totalTokens: @@ -277,7 +277,7 @@ spec: spec: datasource: kind: PrometheusDatasource - name: prometheus + name: kuadrant-prometheus-datasource query: 'sum(increase(authorized_hits{user!="", user=~"$user", subscription=~"$subscription", model=~"$model"}[$__range])) or vector(0)' seriesNameFormat: Tokens variables: @@ -294,7 +294,7 @@ spec: spec: datasource: kind: PrometheusDatasource - name: prometheus + name: kuadrant-prometheus-datasource labelName: user matchers: - 'authorized_hits{user!=""}' @@ -311,7 +311,7 @@ spec: spec: datasource: kind: 
PrometheusDatasource - name: prometheus + name: kuadrant-prometheus-datasource labelName: subscription matchers: - 'authorized_hits{subscription!=""}' @@ -328,7 +328,7 @@ spec: spec: datasource: kind: PrometheusDatasource - name: prometheus + name: kuadrant-prometheus-datasource labelName: model matchers: - 'authorized_hits{model!=""}' diff --git a/deployment/components/shared-patches/kustomization.yaml b/deployment/components/shared-patches/kustomization.yaml index 72a2b5e88..70f6e47a3 100644 --- a/deployment/components/shared-patches/kustomization.yaml +++ b/deployment/components/shared-patches/kustomization.yaml @@ -4,14 +4,13 @@ kind: Component # ============================================================================= # SHARED PATCHES COMPONENT # ============================================================================= -# Provides common configuration for all MaaS deployment overlays (tls-backend, -# http-backend, odh). Centralizes patches and replacements to eliminate duplication +# Provides common configuration for MaaS deployment overlays (ODH operator, +# Tenant reconciler). Centralizes patches and replacements to eliminate duplication # and maintain DRY principle. # # USED BY: -# - deployment/overlays/tls-backend/ -# - deployment/overlays/http-backend/ -# - deployment/overlays/odh/ +# - deployment/overlays/odh/ (ODH operator overlay) +# - maas-api/deploy/overlays/odh/ (Tenant reconciler overlay) # # PROVIDES: # - Environment variables for maas-api deployment @@ -154,6 +153,30 @@ replacements: delimiter: "." index: 1 +# ----------------------------------------------------------------------------- +# 3b. 
DESTINATIONRULE HOST FIX (TLS overlay) +# ----------------------------------------------------------------------------- +# When the TLS overlay is used, a DestinationRule is created with placeholder +# host: maas-api.maas-api.svc → maas-api..svc +# NOTE: DestinationRule namespace must be fixed by each overlay directly +# (not here) because the overlay's `namespace:` directive overrides component +# replacements. See overlays/odh for the pattern. +# ----------------------------------------------------------------------------- +- source: + kind: ConfigMap + version: v1 + name: maas-parameters + fieldPath: data.app-namespace + targets: + - select: + kind: DestinationRule + name: maas-api-backend-tls + fieldPaths: + - spec.host + options: + delimiter: "." + index: 1 + # ----------------------------------------------------------------------------- # 4. CLUSTER AUDIENCE FOR KUBERNETESTOKENREVIEW # ----------------------------------------------------------------------------- diff --git a/deployment/overlays/http-backend/README.md b/deployment/overlays/http-backend/README.md deleted file mode 100644 index 6b430091e..000000000 --- a/deployment/overlays/http-backend/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# HTTP Backend Overlay - -This overlay deploys the MaaS API with HTTP (no TLS) and includes all gateway-level policies. - -## What's Included - -- `base/maas-api` — Deployment, Service, HTTPRoute, RBAC, maas-api-auth-policy -- maas-controller provides gateway-level auth and rate limit policies - -## Usage - -```bash -kustomize build --load-restrictor LoadRestrictionsNone deployment/overlays/http-backend | kubectl apply -f - -``` - -`LoadRestrictionsNone` is required because this overlay references `../odh/params.env` outside the overlay root. - -## When to Use - -- Development environments -- When TLS is handled at the ingress/mesh layer -- Testing without certificate complexity - -For production with end-to-end TLS, use `overlays/tls-backend` instead. 
- diff --git a/deployment/overlays/http-backend/kustomization.yaml b/deployment/overlays/http-backend/kustomization.yaml deleted file mode 100644 index 66fba03d6..000000000 --- a/deployment/overlays/http-backend/kustomization.yaml +++ /dev/null @@ -1,54 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -# HTTP backend overlay for kustomize deployment mode -# This overlay configures MaaS with HTTP backend (Authorino ↔ MaaS API communication over HTTP) -# -# STRUCTURE: -# - Base resources: maas-api (without TLS), maas-controller -# - Shared config: Inherits common patches/replacements via shared-patches component -# - HTTP-specific: NetworkPolicy patch to fix Authorino podSelector -# -# SHARED CONFIGURATION (from components/shared-patches): -# - Environment variables for maas-api deployment (maas-parameters from ../odh/params.env) -# - Image replacements for maas-api and maas-controller -# - Gateway configuration (namespace, name) -# - AuthPolicy URL placeholder replacement (placehold → actual namespace) -# -# For details, see: deployment/components/shared-patches/README.md - -# IMPORTANT: This namespace must match 'app-namespace' in params.env -# The AuthPolicy URL uses app-namespace for DNS resolution -namespace: opendatahub - -# Reuse overlays/odh/params.env (single source of truth). Kustomize requires -# --load-restrictor LoadRestrictionsNone when building this overlay (see deploy.sh). 
-configMapGenerator: -- envs: - - ../odh/params.env - name: maas-parameters -generatorOptions: - disableNameSuffixHash: true - -resources: - - ../../base/maas-api # Without TLS (no DestinationRule) - - ../../base/maas-controller/default - -# Include shared-patches component for common configuration -# This provides: env vars, image replacements, gateway config, URL placeholder fix -components: - - ../../components/shared-patches - -# HTTP-SPECIFIC PATCHES -# Fix NetworkPolicy ingress podSelector - the labels transformer in maas-api default -# adds MaaS labels to the Authorino podSelector. Authorino pods only have -# authorino-resource: authorino, so we restore the correct selector. -patches: - - target: - kind: NetworkPolicy - name: maas-authorino-allow - patch: | - - op: replace - path: /spec/ingress/0/from/0/podSelector/matchLabels - value: - authorino-resource: authorino diff --git a/deployment/overlays/odh/kustomization.yaml b/deployment/overlays/odh/kustomization.yaml index cd3e2c1b0..7fd40b551 100644 --- a/deployment/overlays/odh/kustomization.yaml +++ b/deployment/overlays/odh/kustomization.yaml @@ -6,12 +6,12 @@ metadata: # ODH operator overlay # This overlay configures MaaS for deployment via OpenDataHub operator -# Includes: maas-api with TLS, maas-controller, and gateway-level default policies +# Includes: maas-api with TLS, maas-controller (gateway policies managed separately) # # STRUCTURE: -# - Base resources: maas-api with TLS, maas-controller, gateway policies +# - Base resources: maas-api with TLS, maas-controller # - Shared config: Inherits common patches/replacements via shared-patches component -# - ODH-specific: Additional replacements for gateway policies and DestinationRule +# - ODH-specific: Additional replacements for DestinationRule # # SHARED CONFIGURATION (from components/shared-patches): # - Environment variables for maas-api deployment (maas-parameters from params.env) @@ -20,8 +20,9 @@ metadata: # - AuthPolicy URL placeholder 
replacement (placehold → actual namespace) # # ODH-SPECIFIC CONFIGURATION: -# - Gateway-level policies (gateway-default-auth, gateway-default-deny) # - DestinationRule for TLS backend configuration +# - Note: Gateway-level policies (gateway-default-auth, gateway-default-deny) +# are deployed separately to avoid WasmPlugin timeout issues # # For details, see: deployment/components/shared-patches/README.md @@ -41,8 +42,8 @@ generatorOptions: resources: - ../../base/maas-api/overlays/tls # maas-api with TLS (includes DestinationRule + NetworkPolicy) - ../../base/maas-controller/default - - ../../base/maas-controller/policies # gateway-default-auth, gateway-default-deny - ../../base/payload-processing/default # BBR ext_proc for external model payload processing + - ../../components/observability/observability/dashboards/ # Include shared-patches component for common configuration # This provides: env vars, image replacements, gateway config, URL placeholder fix @@ -194,23 +195,13 @@ replacements: options: delimiter: "." 
index: 1 -# Replace gateway namespace in gateway policies +# Replace gateway namespace in DestinationRule - source: kind: ConfigMap version: v1 name: maas-parameters fieldPath: data.gateway-namespace targets: - - select: - kind: AuthPolicy - name: gateway-default-auth - fieldPaths: - - metadata.namespace - - select: - kind: TokenRateLimitPolicy - name: gateway-default-deny - fieldPaths: - - metadata.namespace - select: kind: DestinationRule name: maas-api-backend-tls diff --git a/deployment/overlays/odh/params.env b/deployment/overlays/odh/params.env index af0b60a74..13d977ae9 100644 --- a/deployment/overlays/odh/params.env +++ b/deployment/overlays/odh/params.env @@ -1,6 +1,7 @@ maas-api-image=quay.io/opendatahub/maas-api:latest maas-controller-image=quay.io/opendatahub/maas-controller:latest payload-processing-image=quay.io/opendatahub/odh-ai-gateway-payload-processing:odh-stable +maas-api-key-cleanup-image=registry.redhat.io/ubi9/ubi-minimal:9.7 payload-processing-replicas=1 gateway-namespace=openshift-ingress gateway-name=maas-default-gateway diff --git a/deployment/overlays/tls-backend-disk/kustomization.yaml b/deployment/overlays/tls-backend-disk/kustomization.yaml deleted file mode 100644 index 8ad85cfe3..000000000 --- a/deployment/overlays/tls-backend-disk/kustomization.yaml +++ /dev/null @@ -1,37 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -# TLS Backend with Disk Storage -# Extends tls-backend with PVC-based persistent storage -# -# Usage: -# kustomize build deployment/overlays/tls-backend-disk | kubectl apply -f - - -resources: - - ../tls-backend - - pvc.yaml - -patches: - - target: - kind: Deployment - name: maas-api - patch: |- - - op: add - path: /spec/template/spec/volumes/- - value: - name: data - persistentVolumeClaim: - claimName: maas-api-data - - op: add - path: /spec/template/spec/containers/0/volumeMounts/- - value: - name: data - mountPath: /data - - op: add - path: /spec/template/spec/containers/0/command - 
value: - - ./maas-api - - op: add - path: /spec/template/spec/containers/0/args - value: - - --storage=disk diff --git a/deployment/overlays/tls-backend-disk/pvc.yaml b/deployment/overlays/tls-backend-disk/pvc.yaml deleted file mode 100644 index 20e399056..000000000 --- a/deployment/overlays/tls-backend-disk/pvc.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: maas-api-data - labels: - app.kubernetes.io/name: maas-api - app.kubernetes.io/component: storage -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 1Gi diff --git a/deployment/overlays/tls-backend/README.md b/deployment/overlays/tls-backend/README.md deleted file mode 100644 index 9bfbc2b02..000000000 --- a/deployment/overlays/tls-backend/README.md +++ /dev/null @@ -1,67 +0,0 @@ -# TLS Backend Overlay - -Enables end-to-end TLS for maas-api using OpenShift serving certificates. - -## Contents - -| File | Purpose | -|------|---------| -| `kustomization.yaml` | References base TLS overlay and policies, applies HTTPS patches | - -Authorino TLS is configured by `scripts/setup-authorino-tls.sh` (run automatically by `deploy.sh` or manually). - - -## Traffic Flow - -**External (client → gateway → maas-api):** - -``` -Client :443 → Gateway (TLS termination) → DestinationRule → maas-api :8443 -``` - -**Internal (Authorino → maas-api for API key validation and metadata):** - -``` -Authorino → maas-api :8443 → /internal/v1/api-keys/validate -``` - -## Usage - -### Using Unified Deployment Script (Recommended) - -```bash -# TLS is enabled by default -./scripts/deploy.sh --deployment-mode kustomize - -# Or explicitly enable TLS -./scripts/deploy.sh --deployment-mode kustomize --enable-tls-backend -``` - -The deployment script automatically: -1. Applies the kustomize overlay -2. Configures Authorino for TLS using `scripts/setup-authorino-tls.sh` -3. 
Restarts deployments to pick up certificates - -### Manual Deployment (Advanced) - -```bash -# Apply Kustomize overlay (LoadRestrictionsNone: overlay uses ../odh/params.env) -kustomize build --load-restrictor LoadRestrictionsNone deployment/overlays/tls-backend | kubectl apply -f - - -# Configure Authorino for TLS (operator-managed, can't be patched via Kustomize) -./scripts/setup-authorino-tls.sh - -# Restart to pick up certificates -kubectl rollout restart deployment/maas-api -n maas-api -kubectl rollout restart deployment/authorino -n kuadrant-system -``` - -**Note:** `scripts/setup-authorino-tls.sh` patches Authorino's service, CR, and deployment. Use `--disable-tls-backend` with `deploy.sh` to skip if you manage Authorino TLS separately. - -## Why the script? - -Authorino resources are managed by the Kuadrant operator. Kustomize can't patch them because they don't exist in our manifests; they're created by the operator. The script uses `kubectl patch` to configure TLS on the live resources. 
- -## See also - -- [Securing Authorino for llm-d in RHOAI](https://github.com/opendatahub-io/kserve/tree/release-v0.15/docs/samples/llmisvc/ocp-setup-for-GA#ssl-authorino) diff --git a/deployment/overlays/tls-backend/kustomization.yaml b/deployment/overlays/tls-backend/kustomization.yaml deleted file mode 100644 index 519a68970..000000000 --- a/deployment/overlays/tls-backend/kustomization.yaml +++ /dev/null @@ -1,72 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -# TLS backend overlay for kustomize deployment mode -# This overlay configures MaaS with TLS-enabled backend (Authorino ↔ MaaS API communication over TLS) -# -# STRUCTURE: -# - Base resources: maas-api with TLS (includes DestinationRule), maas-controller -# - Shared config: Inherits common patches/replacements via shared-patches component -# - TLS-specific: Additional replacements for DestinationRule configuration -# -# SHARED CONFIGURATION (from components/shared-patches): -# - Environment variables for maas-api deployment (maas-parameters from ../odh/params.env) -# - Image replacements for maas-api and maas-controller -# - Gateway configuration (namespace, name) -# - AuthPolicy URL placeholder replacement (placehold → actual namespace) -# -# For details, see: deployment/components/shared-patches/README.md - -# IMPORTANT: This namespace must match 'app-namespace' in params.env -# The AuthPolicy URL uses app-namespace for DNS resolution -namespace: opendatahub - -# Reuse overlays/odh/params.env (single source of truth). Kustomize requires -# --load-restrictor LoadRestrictionsNone when building this overlay (see deploy.sh). 
-configMapGenerator: -- envs: - - ../odh/params.env - name: maas-parameters -generatorOptions: - disableNameSuffixHash: true - -resources: - - ../../base/maas-api/overlays/tls # Includes DestinationRule for TLS - - ../../base/maas-controller/default - -# Include shared-patches component for common configuration -# This provides: env vars, image replacements, gateway config, URL placeholder fix -components: - - ../../components/shared-patches - -# TLS-SPECIFIC REPLACEMENTS -# These are in addition to shared-patches and handle TLS-specific resources (DestinationRule) -replacements: -# Replace gateway namespace in DestinationRule -- source: - kind: ConfigMap - version: v1 - name: maas-parameters - fieldPath: data.gateway-namespace - targets: - - select: - kind: DestinationRule - name: maas-api-backend-tls - fieldPaths: - - metadata.namespace - -# Replace app-namespace in DestinationRule host -- source: - kind: ConfigMap - version: v1 - name: maas-parameters - fieldPath: data.app-namespace - targets: - - select: - kind: DestinationRule - name: maas-api-backend-tls - fieldPaths: - - spec.host - options: - delimiter: "." 
- index: 1 diff --git a/docs/content/advanced-administration/observability.md b/docs/content/advanced-administration/observability.md index 0fe6c0632..37cda60ad 100644 --- a/docs/content/advanced-administration/observability.md +++ b/docs/content/advanced-administration/observability.md @@ -405,7 +405,7 @@ MaaS supports three model serving backends that expose Prometheus metrics on `/m - **vLLM** (current stable) — full-featured LLM inference server - **llm-d** — llm-d inference platform (runs vLLM as backend + EPP routing layer) -- **llm-d-inference-sim** (v0.7.1) — lightweight simulator for testing without GPUs +- **llm-d-inference-sim** (v0.8.2) — lightweight simulator for testing without GPUs **Supported versions:** @@ -413,7 +413,7 @@ MaaS supports three model serving backends that expose Prometheus metrics on `/m |---------|----------------|------------------| | vLLM | v0.7.x stable | — | | llm-d | v0.1.x | — | -| llm-d-inference-sim | **v0.7.1** | `docs/samples/models/simulator/` | +| llm-d-inference-sim | **v0.8.2** | `docs/samples/models/simulator/` | #### vLLM Metrics (port 8000) @@ -443,7 +443,7 @@ All three backends expose `vllm:`-prefixed metrics. The table below shows which | `vllm:time_per_output_token_seconds` | Histogram | Y | — | — | Legacy ITL name (kept by simulator for backward compat; not used by dashboards) | !!! note "Simulator metric alignment" - As of v0.7.1, the simulator fully aligns with current vLLM metric names (`kv_cache_usage_perc`, `inter_token_latency_seconds`, `prompt_tokens_total`, `generation_tokens_total`). Older simulator versions (v0.6.x) used different names (`gpu_cache_usage_perc`, `time_per_output_token_seconds`) and are **no longer supported** by MaaS dashboards. The simulator also exposes additional metrics not used by MaaS dashboards (e.g. `request_inference_time_seconds`, `request_params_max_tokens`). 
+ As of v0.7.1 (still true in v0.8.x), the simulator fully aligns with current vLLM metric names (`kv_cache_usage_perc`, `inter_token_latency_seconds`, `prompt_tokens_total`, `generation_tokens_total`). Older simulator versions (v0.6.x) used different names (`gpu_cache_usage_perc`, `time_per_output_token_seconds`) and are **no longer supported** by MaaS dashboards. The simulator also exposes additional metrics not used by MaaS dashboards (e.g. `request_inference_time_seconds`, `request_params_max_tokens`). !!! note "Lazily registered metrics" Some vLLM/simulator metrics are **lazily registered** — they only appear in `/metrics` output after the first event that triggers them. For example, `request_queue_time_seconds` (on real vLLM) only appears after a request actually queues (when `max-num-seqs` is exceeded). Similarly, histogram counters like `e2e_request_latency_seconds` only appear after the first inference request completes. Dashboard panels will show "No Data" until sufficient traffic has been generated. This is normal Prometheus client behavior, not a configuration issue. @@ -475,7 +475,7 @@ When using llm-d, the inference gateway's Endpoint Picker (EPP) exposes addition #### Dashboard Metric Queries -Dashboard panels use histogram `_sum` as primary data source. All queries work across vLLM, llm-d, and llm-d-inference-sim v0.7.1: +Dashboard panels use histogram `_sum` as primary data source. 
All queries work across vLLM, llm-d, and llm-d-inference-sim v0.8.2: | Panel | PromQL metric | |-------|---------------| diff --git a/docs/content/assets/concepts/personas-resource-model-dark.png b/docs/content/assets/concepts/personas-resource-model-dark.png new file mode 100644 index 000000000..87d1db094 Binary files /dev/null and b/docs/content/assets/concepts/personas-resource-model-dark.png differ diff --git a/docs/content/assets/concepts/personas-resource-model-light.png b/docs/content/assets/concepts/personas-resource-model-light.png new file mode 100644 index 000000000..56bfe5817 Binary files /dev/null and b/docs/content/assets/concepts/personas-resource-model-light.png differ diff --git a/docs/content/architecture.md b/docs/content/concepts/architecture.md similarity index 50% rename from docs/content/architecture.md rename to docs/content/concepts/architecture.md index 30bc8af3d..220d01f5d 100644 --- a/docs/content/architecture.md +++ b/docs/content/concepts/architecture.md @@ -2,20 +2,29 @@ ## Overview -The MaaS Platform is a Kubernetes-native layer for AI model serving built on [Gateway API](https://gateway-api.sigs.k8s.io/) and policy controllers ([Kuadrant](https://docs.kuadrant.io/), [Authorino](https://docs.kuadrant.io/1.0.x/authorino/), [Limitador](https://docs.kuadrant.io/1.0.x/limitador/)). It provides policy-based authentication and authorization, plus subscription-based rate limiting. Future work includes improved request routing and discovery. +The MaaS Platform is a Kubernetes-native layer for AI model serving built on [Gateway API](https://gateway-api.sigs.k8s.io/) and policy controllers ([Kuadrant](https://docs.kuadrant.io/), [Authorino](https://docs.kuadrant.io/1.0.x/authorino/), [Limitador](https://docs.kuadrant.io/1.0.x/limitador/)). It provides policy-based authentication and authorization, plus subscription-based rate limiting. 
+ +Future plans include improved request routing and discovery, with further enhancements to follow. ## Architecture ### đŸ—ī¸ High-Level Architecture -The MaaS Platform is an end-to-end solution built on [Kuadrant](https://docs.kuadrant.io/). +The MaaS Platform is a layer for **authorization and rate limiting** built on [Kuadrant](https://docs.kuadrant.io/). It sits in front of **models** you deploy on the cluster; the same pattern is expected to extend to models hosted outside the cluster over time. + +**Our main components include:** -All traffic flows through the Gateway **maas-default-gateway** (Gateway API). Then utilizes [Authorino](https://docs.kuadrant.io/1.0.x/authorino/) to enforcing authentication, authorization and [Limitador](https://docs.kuadrant.io/1.0.x/limitador/) to enforce and track token usage. Auth policies use **caching** (e.g., subscription selection results, API key validation) to reduce latency on the hot path. +- **Gateway (`maas-default-gateway`)** — Entry point for traffic using [Gateway API](https://gateway-api.sigs.k8s.io/); HTTPRoutes attach here. +- **[Kuadrant](https://docs.kuadrant.io/1.4.x/)** — Policy engine: connects routes and **AuthPolicy** resources to the Gateway and orchestrates enforcement on the hot path. +- **[Authorino](https://docs.kuadrant.io/1.4.x/authorino/)** — **Authentication and authorization** at the edge. +- **[Limitador](https://docs.kuadrant.io/1.4.x/limitador/)** — **Rate limiting** and tracking usage against subscription limits. +- **maas-api** — Custom service for **API key minting** and **validation** (including the internal endpoint the gateway calls for `sk-oai-*` keys). **Main Flows:** -- **Key Minting** — For obtaining API keys to authenticate programmatic access. -- **Inference** — For calling models to generate completions.
+- **Key minting** (blue) — Obtain `sk-oai-*` API keys for programmatic access to models (after authenticating with your cluster identity or configured OIDC). Each mint **binds a subscription** to the key; that association is stored with the key and used on inference. +- **Inference** (green) — Call deployed models to generate completions using an API key (and subscription) on the inference route. + ```mermaid graph TB @@ -35,8 +44,10 @@ graph TB end subgraph ModelServingLayer["Model Serving Layer"] + MaaSModelRef["MaaSModelRef"] InferenceService[Inference Service] LLM[LLM] + ExternalModel["ExternalModel /
external API"] end User -->|"Request Key"| Gateway @@ -48,105 +59,84 @@ graph TB Gateway --> MaaSAuthPolicy MaaSAuthPolicy -.->|"Validate API Key"| MaaSAPI MaaSAuthPolicy -->|"Rate Limit"| MaaSSubscription - MaaSSubscription --> InferenceService + MaaSSubscription --> MaaSModelRef + MaaSModelRef -->|"On-cluster"| InferenceService + MaaSModelRef -.->|"Tech Preview"| ExternalModel InferenceService --> LLM LLM -->|"Return Response"| User + ExternalModel -.->|"Return Response"| User linkStyle 0,1,2,3 stroke:#1976d2,stroke-width:2px - linkStyle 4,5,6,7,8,9,10 stroke:#388e3c,stroke-width:2px + linkStyle 4,5,6,7,8,9,11,12 stroke:#388e3c,stroke-width:2px + linkStyle 10,13 stroke:#388e3c,stroke-width:2px,stroke-dasharray: 6 4 style MaaSAPI fill:#1976d2,stroke:#333,stroke-width:2px,color:#fff style Gateway fill:#7b1fa2,stroke:#333,stroke-width:2px,color:#fff style AuthPolicy fill:#e65100,stroke:#333,stroke-width:2px,color:#fff style MaaSAuthPolicy fill:#e65100,stroke:#333,stroke-width:2px,color:#fff style MaaSSubscription fill:#e65100,stroke:#333,stroke-width:2px,color:#fff + style MaaSModelRef fill:#e65100,stroke:#333,stroke-width:2px,color:#fff style InferenceService fill:#388e3c,stroke:#333,stroke-width:2px,color:#fff style LLM fill:#388e3c,stroke:#333,stroke-width:2px,color:#fff + style ExternalModel fill:#00695c,stroke:#333,stroke-width:2px,color:#fff ``` ### Key Minting Flow — Request & Validation **Flow summary:** -1. User sends `POST /v1/api-keys` with Bearer `{identity-token}`. -2. Gateway routes the request to AuthPolicy (Authorino). -3. AuthPolicy validates the presented identity token via the configured auth method (`kubernetesTokenReview` for OpenShift, or OIDC JWT validation when enabled). -4. Gateway forwards the authenticated request and user context to the Key Minting Service. +1. User sends `POST /maas-api/v1/api-keys` with `Authorization: Bearer {identity-token}`. 
+ - The body sets which **MaaSSubscription** to bind (`subscription`), or omits it so the platform picks an accessible one (for example by priority). + - That subscription is **stored on the key** at mint; inference later reads it from the key record, not from per-request headers. +2. **Validate identity** — **Authorino** (AuthPolicy) checks the token using the configured method: + - **`kubernetesTokenReview`** — OpenShift cluster tokens + - **OIDC JWT validation** — external IdP (for example Keycloak) — **Tech Preview** +3. After authentication, the **request** is forwarded to **MaaS API** (key minting) on the gateway upstream path, with identity context available for minting—**Authorino** validates the request; it does not proxy or forward the HTTP call to MaaS API itself. +4. **MaaS API** handles key minting using that authenticated identity and the requested subscription binding. +5. The service generates a random `sk-oai-*` key and hashes it with SHA-256. +6. Only the hash and metadata (username, groups, name, `subscription` — the MaaSSubscription name bound at mint, `expiresAt`) are stored in PostgreSQL. +7. The plaintext key is returned to the user **only in this minting response** (show-once), along with `expiresAt`; it is **not** exposed again on later reads. The diagram below stops at storage and does not show the HTTP response back to the user. -```mermaid -graph TB - subgraph UserLayer["User"] - U[User] - end - - subgraph GatewayLayer["Gateway & Policy"] - G[Gateway] - AP[AuthPolicy
Authorino] - end - - subgraph KeyMintingLayer["MaaS API"] - KMS[MaaS API] - end - - U -->|"1. POST /v1/api-keys
Bearer {identity-token}"| G - G -->|"2. Route /maas-api"| AP - AP -->|"3. Validate identity token
TokenReview or OIDC JWT"| G - G -->|"4. Forward + user context"| KMS - - style KMS fill:#1976d2,stroke:#333,stroke-width:2px,color:#fff - style G fill:#7b1fa2,stroke:#333,stroke-width:2px,color:#fff - style AP fill:#e65100,stroke:#333,stroke-width:2px,color:#fff -``` - -!!! Tip "OIDC Support" - The `maas-api` route can be configured to validate external OIDC tokens (for example Keycloak-issued JWTs) in addition to the existing OpenShift TokenReview flow. Model routes still use the current API-key policy, so the interim OIDC flow is: authenticate with OIDC at `maas-api`, mint an `sk-oai-*` key, then use that key for model discovery and inference. - - -### Key Minting Service (Default Implementation) - -**Flow summary:** - -1. Gateway forwards the authenticated request and user context to the Key Minting Service (MaaS API). -2. The service generates a random `sk-oai-*` key and hashes it with SHA-256. -3. Only the hash and metadata (username, groups, name, optional `expiresAt` when TTL is set) are stored in PostgreSQL. -4. The plaintext key is returned to the user **once**, along with `expiresAt` when a TTL was specified; the key cannot be retrieved again. - -Keys can be permanent (no expiration) or have an optional **TTL** (`expiresIn`, e.g., `30d`, `90d`, `1h`); the response includes `expiresAt` when a TTL is set. +Every key expires. With **operator-managed** MaaS, the cluster operator sets the maximum lifetime on the **`ModelsAsService`** CR: **`spec.apiKeys.maxExpirationDays`** (see [ModelsAsService CR](../install/maas-setup.md#modelsasservice-cr)). **`maas-api`** applies that cap as **`API_KEY_MAX_EXPIRATION_DAYS`** (for example 90 days by default when defaults apply). Omit **`expiresIn`** on create to use that maximum, or set a shorter **`expiresIn`** (e.g., `30d`, `90d`, `1h`) within the configured cap. The response always includes **`expiresAt`** (RFC3339). 
```mermaid graph TB subgraph UserLayer["User"] U[User] end - + subgraph GatewayLayer["Gateway & Policy"] G[Gateway] + AP["AuthPolicy
Authorino"] end - - subgraph KeyMintingService["Key Minting Service (Default)"] + + subgraph KeyMinting["MaaS API"] API[MaaS API] Gen[Generate sk-oai-* key] Hash[Hash with SHA-256] end - - subgraph Storage["Storage (Default)"] - DB[(PostgreSQL
key hashes + metadata + TTL)] + + subgraph Storage["Storage"] + DB[(PostgreSQL
hashes + subscription + metadata + TTL)] end - - U --> G - G -->|"Forward + user context"| API + + U -->|"POST /maas-api/v1/api-keys"| G + G -->|"Validate identity"| AP + AP -->|"Request continues upstream"| API API --> Gen Gen --> Hash - Hash -->|"Store hash + expiresAt"| DB - API -->|"Return key ONCE"| U - + Hash -->|"Store hash + metadata"| DB + style API fill:#1976d2,stroke:#333,stroke-width:2px,color:#fff + style Gen fill:#1976d2,stroke:#333,stroke-width:2px,color:#fff + style Hash fill:#1976d2,stroke:#333,stroke-width:2px,color:#fff style G fill:#7b1fa2,stroke:#333,stroke-width:2px,color:#fff + style AP fill:#e65100,stroke:#333,stroke-width:2px,color:#fff style DB fill:#336791,stroke:#333,stroke-width:2px,color:#fff ``` -!!! tip "Future Plans" - This is the **default implementation**. Future plans include integration with other key store providers (e.g., HashiCorp Vault, cloud secret managers). +!!! Tip "OIDC Support" + **Tech Preview:** OIDC JWT validation on the `maas-api` route is optional alongside OpenShift `kubernetesTokenReview`. Model routes still rely on API-key auth; the typical flow is authenticate at `maas-api`, mint an `sk-oai-*` key, then use that key for discovery and inference. !!! note "PostgreSQL" A **PostgreSQL database is required** and is **not included** with the MaaS deployment. The deploy script provides a basic PostgreSQL deployment for development and testing—**this is not intended for production use**. For production, provision and configure your own PostgreSQL instance. @@ -156,10 +146,12 @@ graph TB **Flow summary:** 1. User sends inference request with an API key. -2. Gateway routes to MaaSAuthPolicy (Authorino). -3. MaaSAuthPolicy validates the key via MaaS API and selects subscription; on failure returns 401/403. -4. MaaSSubscription (Limitador) checks token rate limits; on exceed returns 429. -5. Request reaches Inference Service and LLM; completion returned to user. +2. 
**Validate identity** — request reaches **MaaSAuthPolicy (Authorino)** via the Gateway. +3. **MaaSAuthPolicy** validates the key via **MaaS API**; on failure returns 401/403. +4. **Check limits** — **MaaSSubscription (Limitador)** enforces token rate limits; on exceed returns 429. +5. Request reaches Inference Service when within limits. +6. Inference Service forwards to the LLM. +7. Completion Response is returned to the user. ```mermaid graph TB @@ -183,9 +175,9 @@ graph TB end U -->|"1. Inference + API key"| G - G -->|"2. Route"| MAP + G -->|"2. Validate identity"| MAP MAP -.->|"3. Validate key"| API - MAP -->|"4. Auth OK"| MS + MAP -->|"4. Check limits"| MS MS -->|"5. Within limits"| INV INV -->|"6. Forward"| LLM LLM -->|"7. Completion"| U @@ -206,43 +198,45 @@ graph TB ### Auth & Validation Flow (Deep Dive) -The MaaSAuthPolicy delegates to the MaaS API for key validation and subscription selection. The subscription name comes from the PostgreSQL key record (set at key creation). +For inference with an `sk-oai-*` API key, the policy layer performs **two MaaS API steps** in order. **First** the key is validated against PostgreSQL. **Subscription** is not read from request headers for API keys—it is **stored on the key record** when the key was minted and is returned as part of validation. **Second**, that subscription name, together with the username and groups from the key record, is used to confirm the caller may use that subscription for the target model (for example, that the subscription exists, the user still has access, and the model is part of that subscription). **Flow summary:** -1. Authorino calls MaaS API to validate the API key. -2. MaaS API validates the key (format, not revoked, not expired) and returns username, groups, and subscription. -3. Authorino calls MaaS API to check subscription (groups, username, requested subscription from the key). -4. If the user lacks access to the requested subscription → error (403). -5. 
On success, returns selected subscription; Authorino caches the result (e.g., 60s TTL). Identity information (username, groups, subscription, key ID) is made available to TokenRateLimitPolicy and observability through AuthPolicy's `filters.identity` mechanism, but is **not forwarded** as HTTP headers to upstream model workloads (defense-in-depth security). Clients do not send subscription headers on inference; subscription comes from the API key record created at mint time. +1. The **policy layer** sends the API key to the MaaS API **validate-key** path. +2. **Validate key** — MaaS API parses the key, looks up the salted hash in PostgreSQL, and rejects unknown, revoked, expired, or malformed keys (and keys with no subscription bound). On success it returns identity (username, groups, key ID) and the **subscription name from the key record** (mint-time binding). +3. **Subscription from the key** — The next step uses that subscription name as the requested subscription—**not** a client-supplied `X-MaaS-Subscription` value. For API keys, the subscription passed to subscription selection is exactly the one returned from validation. +4. **Confirm subscription access** — MaaS API subscription selection checks that the user and groups can use that subscription and that the requested model is allowed; failures surface as access denied (for example 403) to the policy layer. +5. On success, identity and subscription context are available for rate limiting and metrics. That context is **not** forwarded as HTTP headers to upstream model workloads (defense in depth). Results may be cached briefly by the policy layer to avoid repeating work on every request.
```mermaid graph TB - subgraph AuthLayer["MaaSAuthPolicy (Authorino)"] - A[Authorino] + subgraph PolicyLayer["Policy layer"] + P[Policy] end subgraph MaaSLayer["MaaS API"] - Validate[Validate API Key] - SubSelect[Check Subscription] + V[Validate API key] + S[Confirm subscription access] end subgraph Storage["Storage"] DB[(PostgreSQL)] end - A -->|"1. Validate key"| Validate - Validate -->|"Lookup hash, check not expired"| DB - DB -->|"metadata"| Validate - - A -->|"2. Check subscription"| SubSelect - SubSelect -.->|"3. No access to requested sub → 403"| A - SubSelect -->|"4. Selected subscription"| A + P -->|"1. API key"| V + V -->|"2. Lookup key record"| DB + DB -->|"3. Subscription stored on key"| V + V -.->|"Invalid key"| P + P -->|"4. Groups, username, subscription from key"| S + S -.->|"Access denied"| P + S -->|"5. Authorized"| P - linkStyle 4 stroke:#c62828,stroke-width:2px,stroke-dasharray:5,5 + linkStyle 3 stroke:#c62828,stroke-width:2px,stroke-dasharray:5,5 + linkStyle 5 stroke:#c62828,stroke-width:2px,stroke-dasharray:5,5 - style Validate fill:#1976d2,stroke:#333,stroke-width:2px,color:#fff - style SubSelect fill:#1976d2,stroke:#333,stroke-width:2px,color:#fff + style P fill:#e65100,stroke:#333,stroke-width:2px,color:#fff + style V fill:#1976d2,stroke:#333,stroke-width:2px,color:#fff + style S fill:#1976d2,stroke:#333,stroke-width:2px,color:#fff style DB fill:#336791,stroke:#333,stroke-width:2px,color:#fff ``` diff --git a/docs/content/concepts/model-reference.md b/docs/content/concepts/model-reference.md new file mode 100644 index 000000000..73075cf8f --- /dev/null +++ b/docs/content/concepts/model-reference.md @@ -0,0 +1,33 @@ +# Model Reference + +**MaaSModelRef** is a pointer to an **inference service** (on-cluster or external). + +The controller **collects metadata** from that service and uses it to **wire routing on the default gateway** (`maas-default-gateway`). 
**MaaSAuthPolicy** and **MaaSSubscription** reference the same `MaaSModelRef` names so **access** and **quota** apply on the inference path. + +```mermaid +flowchart LR + subgraph Downstream ["Downstream (cluster or external)"] + OnCluster["Inference service
(e.g. LLMInferenceService)"] + External["External model
(API endpoint)"] + end + + MaaSModelRef["MaaSModelRef"] + + subgraph Policies ["Policies"] + MaaSAuthPolicy["MaaSAuthPolicy"] + MaaSSubscription["MaaSSubscription"] + end + + OnCluster -->|"1. Endpoint, status"| MaaSModelRef + External -->|"1. Endpoint, status"| MaaSModelRef + MaaSModelRef -->|"2. For policies"| MaaSAuthPolicy + MaaSModelRef -->|"2. For policies"| MaaSSubscription + + style MaaSModelRef fill:#1976d2,stroke:#333,stroke-width:2px,color:#fff + style MaaSAuthPolicy fill:#e65100,stroke:#333,stroke-width:2px,color:#fff + style MaaSSubscription fill:#e65100,stroke:#333,stroke-width:2px,color:#fff + style OnCluster fill:#388e3c,stroke:#333,stroke-width:2px,color:#fff + style External fill:#00695c,stroke:#333,stroke-width:2px,color:#fff +``` + +For configuration steps, see [Quota and Access Configuration](../configuration-and-management/quota-and-access-configuration.md). diff --git a/docs/content/concepts/personas.md b/docs/content/concepts/personas.md new file mode 100644 index 000000000..421b68c71 --- /dev/null +++ b/docs/content/concepts/personas.md @@ -0,0 +1,42 @@ +# Personas and responsibilities + +This page follows the same idea as the [Gateway API personas](https://gateway-api.sigs.k8s.io/#personas): short **who** and **what they own**, focused on **MaaS day-to-day operation** (not cluster install). Anything not listed as in scope is out of scope for that persona. + +## Resource model + + +![Personas resource model](../assets/concepts/personas-resource-model-light.png#only-light) +![Personas resource model](../assets/concepts/personas-resource-model-dark.png#only-dark) + +**How to read it** + +- **Model owners** deploy **`MaaSModelRef`** and the **model server** workload in their namespace (often one stack per model line). +- **ODH administrators** configure **`MaaSAuthPolicy`** and **`MaaSSubscription`** so the right callers and quotas apply to those models. 
+- **`MaaSSubscription`** ties subscriptions to model references; parallel **MaaSModelRef → model server** branches can represent multiple models under one subscription pattern. +- **API consumers** call inference through the **Gateway** with an **`sk-oai-*`** key and use **maas-api** for self-service key minting—they do not manage **`MaaSAuthPolicy`**, **`MaaSSubscription`**, or **`MaaSModelRef`** (those sit with administrators and model owners). + +--- + +## Model owners + +**Who:** Teams that ship and operate a model in their namespace—often **model owners**, ML engineers, or project admins (not a special “data scientist” role required by MaaS). + +**Owns:** **`MaaSModelRef`** in the same namespace as the **model server** (for example KServe `LLMInferenceService` or your inference `Deployment`)—the serving workload the reference points at. + +--- + +## ODH administrators + +**Who:** OpenShift or ODH **administrators** who govern access and quota for MaaS. + +**Owns:** **`MaaSAuthPolicy`**, **`MaaSSubscription`**, and the **Gateway** / **HTTPRoute** surface that exposes MaaS to users—at the level of **MaaS and Gateway API resources**, not the inference images or weights in application namespaces. + +--- + +## API consumers + +**Who:** Application developers, automation, or anyone calling inference with an **`sk-oai-*`** key. + +**Owns:** **Self-service** use of **maas-api** (mint and manage keys within policy) and **inference** through the **Gateway**, subject to **`MaaSSubscription`** limits—shown on the **inference** arc in the diagram above. 
+ +--- diff --git a/docs/content/configuration-and-management/subscription-overview.md b/docs/content/concepts/subscription-overview.md similarity index 92% rename from docs/content/configuration-and-management/subscription-overview.md rename to docs/content/concepts/subscription-overview.md index e8cad48b4..116916b1f 100644 --- a/docs/content/configuration-and-management/subscription-overview.md +++ b/docs/content/concepts/subscription-overview.md @@ -20,7 +20,7 @@ flowchart TD A -- "Pass" --> BothPass B -- "Pass" --> BothPass - BothPass{Access Granted} --> InferenceService["Inference Service"] + BothPass{Access Granted} --> InferenceService["Inference server
(MaaSModelRef)"] style User fill:#1976d2,stroke:#333,stroke-width:2px,color:#fff style Gateway fill:#7b1fa2,stroke:#333,stroke-width:2px,color:#fff @@ -61,7 +61,7 @@ The team can use only the 5 models specified in the policy. Their usage is gover For configuration details, see: -- [Quota and Access Configuration](quota-and-access-configuration.md) — Step-by-step configuration for MaaSModelRef, MaaSAuthPolicy, and MaaSSubscription +- [Quota and Access Configuration](../configuration-and-management/quota-and-access-configuration.md) — Step-by-step configuration for MaaSModelRef, MaaSAuthPolicy, and MaaSSubscription Additional references: diff --git a/docs/content/configuration-and-management/external-models.md b/docs/content/configuration-and-management/external-models.md new file mode 100644 index 000000000..43be00ba2 --- /dev/null +++ b/docs/content/configuration-and-management/external-models.md @@ -0,0 +1,53 @@ +# External models + +!!! warning "Documentation in progress" + This section is **still under development**. Behavior, field names, and operator steps may change in upcoming releases. For authoritative CRD fields, see the **[ExternalModel](../reference/crds/external-model.md)** reference. For registration steps aligned with the API, see [Registering external models](model-listing-flow.md#registering-external-models). + +**External models** are inference backends that run **outside** the cluster (for example managed APIs or a reachable HTTP endpoint). In MaaS they are still represented by a **MaaSModelRef** with **`spec.modelRef.kind: ExternalModel`**, so listing, API keys, subscriptions, and gateway policies work the same way as for on-cluster models—the difference is how traffic is routed to the upstream provider. + +## Flow (high level) + +1. 
**Provider configuration** — You define an **[ExternalModel](../reference/crds/external-model.md)** CR in the model namespace: `provider`, `endpoint` (FQDN), `credentialRef` (Secret with API keys), and `targetModel` (upstream model id). The Secret must live in the **same namespace** as the ExternalModel and the MaaSModelRef you will create. + +2. **Registration** — You create a **MaaSModelRef** whose **`spec.modelRef.name`** matches the ExternalModel’s name (same namespace). The MaaS controller treats this as an external backend. + +3. **Route and mesh** — The **ExternalModel** reconciler creates namespaced resources so traffic can leave the mesh to the provider: an ExternalName **Service**, Istio **ServiceEntry**, optional **DestinationRule** (TLS origination), and a Gateway API **HTTPRoute** attached to **`maas-default-gateway`**. Those objects are **owned by the ExternalModel** CR (not by the MaaSModelRef name). + +### HTTPRoute name: one route per ExternalModel + +The **HTTPRoute** `metadata.name` is the **ExternalModel** name (the same string as **`spec.modelRef.name`** on the `MaaSModelRef`). It is **not** the `MaaSModelRef`’s own `metadata.name`, so you can name the model reference `maas-model-my-model` while the ExternalModel (and its HTTPRoute) stay `my-model`. + +If you see **two** HTTPRoutes (for example `my-model` and `maas-model-my-model`), only **`my-model`** is created by the ExternalModel flow in this project. A route whose name matches the **MaaSModelRef** name is usually from **KServe** (an `LLMInferenceService` / `InferenceService` with that name), an old experiment, or a manually applied manifest—not from the ExternalModel reconciler. + +4. **Status** — The **MaaSModelRef** reconciler waits until the HTTPRoute is **Accepted** and **Programmed** on the gateway, then sets **`status.endpoint`** to the **client-facing MaaS URL** (not the raw provider URL). Clients and **`GET /v1/models`** both use that URL. + +5. 
**Access and quota** — You apply **MaaSAuthPolicy** and **MaaSSubscription** the same way as for on-cluster models. + +```mermaid +flowchart LR + subgraph config [Configuration] + EM[ExternalModel CR] + SEC[Secret credentials] + MMR[MaaSModelRef kind ExternalModel] + end + subgraph control [Controller] + R[Reconciler creates mesh and HTTPRoute] + S[status.endpoint on MaaSModelRef] + end + subgraph policies [Policies] + AP[MaaSAuthPolicy] + SUB[MaaSSubscription] + end + EM --> MMR + SEC --> EM + MMR --> R + R --> S + MMR --> AP + MMR --> SUB +``` + +## Related documentation + +- [MaaSModelRef kinds — ExternalModel](maas-model-kinds.md#externalmodel) — reconciler responsibilities and optional annotations +- [Model listing flow — Registering external models](model-listing-flow.md#registering-external-models) — numbered steps and catalog behavior +- [On-cluster models](model-gateway-and-serving.md) — LLMInferenceService and `maas-default-gateway` (contrast with external routing) diff --git a/docs/content/configuration-and-management/maas-models.md b/docs/content/configuration-and-management/maas-models.md deleted file mode 100644 index f07da0a08..000000000 --- a/docs/content/configuration-and-management/maas-models.md +++ /dev/null @@ -1,54 +0,0 @@ -# MaaS Models - -MaaS uses **MaaSModelRef** to identify model servers that live on the cluster. Each MaaSModelRef is a reference to a model server—it holds the information MaaS needs to perform authentication, authorization, and rate limiting. - -By using a single unified object (MaaSModelRef) for all model types, MaaS can handle different kinds of model servers—each with its own backend and lifecycle—through one consistent interface. The controller uses a **provider paradigm** to distinguish between types: each model type (for example, LLMInferenceService, external APIs) has a provider that knows how to reconcile and resolve that type. 
- -**Supported LLMs:** Most model families should work; an official validated list is in progress. - -**Supported inference services:** vLLM through LLMInferenceService (KServe) is the initial supported release for on-cluster models; additional backends are planned for future releases. - -## The Model Reference - -A MaaS model is a reference to a model server (for example, an LLMInferenceService or external API). The MaaS controller, running in the **control plane**, reconciles these references and gathers the information needed to route requests and enforce policies—such as the model's endpoint URL and readiness status. - -That information is then used by MaaSSubscription and MaaSAuthPolicy to complete their logic: validating access, selecting subscriptions, and enforcing rate limits. - -## How Model Information Is Used - -Both **MaaSAuthPolicy** (access) and **MaaSSubscription** (quota) reference models by their **MaaSModelRef** name. They rely on the information that MaaSModelRef provides—gathered at the control plane—to: - -- Route requests to the correct model endpoint -- Validate that the user has access to the requested model -- Apply the correct rate limits for that model - -```mermaid -flowchart LR - subgraph Downstream ["Downstream (cluster)"] - ModelServer["Model Server
(e.g. LLMInferenceService)"] - end - - MaaSModelRef["MaaSModelRef"] - - subgraph Policies ["Policies"] - MaaSAuthPolicy["MaaSAuthPolicy"] - MaaSSubscription["MaaSSubscription"] - end - - ModelServer -->|"1. Fetches endpoint,
status, etc."| MaaSModelRef - MaaSModelRef -->|"2. Feeds model info"| MaaSAuthPolicy - MaaSModelRef -->|"2. Feeds model info"| MaaSSubscription - - style MaaSModelRef fill:#1976d2,stroke:#333,stroke-width:2px,color:#fff - style MaaSAuthPolicy fill:#e65100,stroke:#333,stroke-width:2px,color:#fff - style MaaSSubscription fill:#e65100,stroke:#333,stroke-width:2px,color:#fff - style ModelServer fill:#388e3c,stroke:#333,stroke-width:2px,color:#fff -``` - -## Summary - -- **MaaSModelRef** — Stores the reference to a model server; the controller gathers the information needed for auth and routing. -- **MaaSAuthPolicy** and **MaaSSubscription** — Reference models by name and use that information to enforce access and quota. -- **Control plane** — The MaaS controller reconciles model references and populates the data that policies and subscriptions depend on. - -For configuration details and how to create and use MaaSModelRef, see [Quota and Access Configuration](quota-and-access-configuration.md) in the Administration Guide. diff --git a/docs/content/configuration-and-management/model-gateway-and-serving.md b/docs/content/configuration-and-management/model-gateway-and-serving.md new file mode 100644 index 000000000..0ea7c9759 --- /dev/null +++ b/docs/content/configuration-and-management/model-gateway-and-serving.md @@ -0,0 +1,119 @@ +# On-cluster models + +This page covers **on-cluster models**: point an **LLMInferenceService** at **`maas-default-gateway`** so traffic uses MaaS authentication, subscriptions, and rate limits. For end-to-end samples (LLMInferenceService + MaaSModelRef + policies), see [Deploy sample models](../install/model-setup.md). 
+ +**Related topics (canonical detail elsewhere):** + +- Catalog and **`GET /v1/models`** behavior: [Model listing flow](model-listing-flow.md) +- **`spec.modelRef` kinds** (`LLMInferenceService`, `ExternalModel`): [MaaSModelRef kinds](maas-model-kinds.md) +- Access and quotas: [Quota and Access Configuration](quota-and-access-configuration.md) + +!!! tip "Subscription model" + Model access and rate limits use **MaaSModelRef**, **MaaSAuthPolicy**, and **MaaSSubscription**. See [Access and Quota Overview](../concepts/subscription-overview.md). + +## Backends at a glance + +On-cluster models typically use **LLMInferenceService** (for example vLLM via KServe). For **off-cluster** providers, see [External models](external-models.md). See [MaaSModelRef kinds](maas-model-kinds.md) for field semantics. + +## Standard vs MaaS gateway + +MaaS uses a **separate** Gateway API instance for policy enforcement. Only workloads attached to **`maas-default-gateway`** participate in MaaS listing (via **MaaSModelRef**), API keys, and subscription limits. The default KServe/ODH gateway path does not apply those policies. + +The diagram summarizes the split; for platform-wide context see [Architecture](../concepts/architecture.md). + +```mermaid +graph TB + subgraph cluster["OpenShift/K8s Cluster"] + subgraph gateways["Gateway Layer"] + defaultGW["Default Gateway ODH/KServe"] + maasGW["MaaS gateway maas-default-gateway"] + end + + subgraph models["Model Deployments"] + standardModel["LLMInferenceService standard"] + maasModel["LLMInferenceService MaaS-enabled router.gateway.refs"] + end + + defaultGW -.->|routes| standardModel + maasGW -->|routes| maasModel + end + + users["Users"] -->|ODH auth| defaultGW + apiUsers["API clients"] -->|Bearer token| maasGW +``` + +!!! note + **`maas-default-gateway`** is created during MaaS installation; you do not create it by hand for normal setups. 
+ +## Prerequisites + +- MaaS installed with **`maas-default-gateway`** +- An **LLMInferenceService** to configure (or plan to create one) +- Permissions to edit **LLMInferenceService** in the target namespace + +## Configure the gateway reference + +Set **`spec.router.gateway.refs`** so the inference route attaches to **`maas-default-gateway`** in **`openshift-ingress`**. Without this, KServe uses the default gateway and **MaaS policies do not apply**. + +```yaml +apiVersion: serving.kserve.io/v1alpha1 +kind: LLMInferenceService +metadata: + name: my-production-model + namespace: llm +spec: + model: + uri: hf://Qwen/Qwen3-0.6B + name: Qwen/Qwen3-0.6B + replicas: 1 + router: + route: { } + gateway: + refs: + - name: maas-default-gateway + namespace: openshift-ingress + template: + # ... your container / resources ... +``` + +GPU, image, and resource blocks vary by model; see [Deploy sample models](../install/model-setup.md) for full samples. + +!!! warning "Legacy tier annotation" + The annotation **`alpha.maas.opendatahub.io/tiers`** applied to **LLMInferenceService** was part of the **legacy tier-based** access model (automatic tier RBAC). Current deployments should use **MaaSSubscription** and **MaaSAuthPolicy** instead. If you still maintain tier annotations, see [Tier to Subscription migration](../migration/tier-to-subscription.md). + +## MaaSModelRef metadata (optional) + +After the LLMInferenceService uses the MaaS gateway, register it with a **MaaSModelRef** and optional display annotations for **`GET /v1/models`**. See [CRD annotations](crd-annotations.md) for the full list. 
+ +```yaml +apiVersion: maas.opendatahub.io/v1alpha1 +kind: MaaSModelRef +metadata: + name: my-production-model + namespace: llm + annotations: + openshift.io/display-name: "My Production Model" +spec: + modelRef: + kind: LLMInferenceService + name: my-production-model +``` + +## Update an existing LLMInferenceService + +**Patch:** + +```bash +kubectl patch llminferenceservice my-production-model -n llm --type='json' -p='[ + { + "op": "add", + "path": "/spec/router/gateway/refs/-", + "value": { + "name": "maas-default-gateway", + "namespace": "openshift-ingress" + } + } +]' +``` + +Or **`kubectl edit llminferenceservice my-production-model -n llm`** and set **`spec.router.gateway.refs`** as in the YAML above. diff --git a/docs/content/configuration-and-management/model-listing-flow.md b/docs/content/configuration-and-management/model-listing-flow.md index f898ade3e..64ed3a898 100644 --- a/docs/content/configuration-and-management/model-listing-flow.md +++ b/docs/content/configuration-and-management/model-listing-flow.md @@ -138,28 +138,6 @@ All models in the response include a `subscriptions` array with metadata for eac } ``` -### Deduplication Behavior - -Models are deduplicated by `(id, url, ownedBy)` key: - -- **Same id + same URL + same MaaSModelRef (ownedBy)**: Single entry with subscriptions aggregated into the `subscriptions` array -- **Different id, URL, or MaaSModelRef**: Separate entries - -**User token authentication** (multiple subscriptions): -- Model `gpt-3.5` from MaaSModelRef `namespace-a/model-a` at URL `https://example.com/gpt-3.5` is accessible via subscriptions A and B - - Result: One entry with `subscriptions: [{name: "A"}, {name: "B"}]` -- Model `gpt-3.5` from MaaSModelRef `namespace-b/model-b` at the same URL is only in subscription B - - Result: Separate entry with `subscriptions: [{name: "B"}]` (different MaaSModelRef) -- Model `gpt-3.5` at URL `https://example.com/gpt-3.5-premium` from `namespace-a/model-a` is only in subscription B - - 
Result: Separate entry with `subscriptions: [{name: "B"}]` (different URL) - -**API key authentication** (single subscription): -- Deduplication handles edge cases where multiple MaaSModelRef resources point to the same model endpoint -- Each unique MaaSModelRef resource appears as a separate entry - -!!! tip "Subscription metadata fields" - The `displayName` and `description` fields are read from the MaaSSubscription CRD's `spec.displayName` and `spec.description` fields. If these fields are not set in the CRD, they will be empty strings in the response. - ## Registering models To have models appear via the **MaaSModelRef** flow: @@ -195,4 +173,4 @@ You can use the [maas-system samples](https://github.com/opendatahub-io/models-a - [MaaS Controller README](https://github.com/opendatahub-io/models-as-a-service/tree/main/maas-controller) — install and MaaSModelRef/MaaSAuthPolicy/MaaSSubscription - [Model setup](./model-setup.md) — configuring LLMInferenceServices (gateway reference) as backends for MaaSModelRef -- [Architecture](../architecture.md) — overall MaaS architecture +- [Architecture](../concepts/architecture.md) — overall MaaS architecture diff --git a/docs/content/configuration-and-management/model-setup.md b/docs/content/configuration-and-management/model-setup.md index 5d64939e3..4218f57b8 100644 --- a/docs/content/configuration-and-management/model-setup.md +++ b/docs/content/configuration-and-management/model-setup.md @@ -318,7 +318,7 @@ curl -sSk -H "Authorization: Bearer $TOKEN" \ ## References -- [Access and Quota Overview](subscription-overview.md) - Configure policies and subscriptions +- [Access and Quota Overview](../concepts/subscription-overview.md) - Configure policies and subscriptions - [Quota and Access Configuration](quota-and-access-configuration.md) - Detailed configuration -- [Architecture Overview](../architecture.md) - Understand the overall MaaS architecture +- [Architecture Overview](../concepts/architecture.md) - Understand the 
overall MaaS architecture - [KServe LLMInferenceService Documentation](https://kserve.github.io/website/) - Official KServe documentation diff --git a/docs/content/configuration-and-management/namespace-rbac.md b/docs/content/configuration-and-management/namespace-rbac.md new file mode 100644 index 000000000..a42b84909 --- /dev/null +++ b/docs/content/configuration-and-management/namespace-rbac.md @@ -0,0 +1,85 @@ +# Namespace User Permissions + +This page describes the RBAC permissions for MaaS custom resources in user namespaces. + +## ClusterRoles + +MaaS provides two aggregated ClusterRoles that extend the standard Kubernetes/OpenShift roles with permissions for MaaS resources: + +- **`maas-owner-role`** - Aggregates to `admin` and `edit` roles +- **`maas-viewer-role`** - Aggregates to `view`, `admin`, and `edit` roles + +This allows namespace admins and contributors to create and manage MaaS resources without requiring cluster-admin intervention. + +## Permission Matrix + +| User Role | Resources | Permissions | +|-----------|-----------|-------------| +| **admin** | `MaaSModelRef`, `ExternalModel` | `create`, `delete`, `get`, `list`, `patch`, `update`, `watch` | +| **edit** | `MaaSModelRef`, `ExternalModel` | `create`, `delete`, `get`, `list`, `patch`, `update`, `watch` | +| **view** | `MaaSModelRef`, `ExternalModel` | `get`, `list`, `watch` | + +### Included Resources + +- **MaaSModelRef** - References to model backends (LLMInferenceService or ExternalModel backend) +- **ExternalModel** - External LLM provider definitions (OpenAI, Anthropic, etc.) 
+ +### Excluded Resources + +The following platform-managed resources are **not** included: +- **MaaSSubscription** - Managed in the `models-as-a-service` namespace by platform admins +- **MaaSAuthPolicy** - Managed in the `models-as-a-service` namespace by platform admins + + +## Verification + +### For Namespace Users + +To verify your permissions in a namespace: + +```bash +# Check if you can create MaaSModelRef +kubectl auth can-i create maasmodelref -n my-models + +# Check if you can list MaaSModelRef +kubectl auth can-i list maasmodelref -n my-models +``` + +### For Platform Administrators + +To verify the ClusterRoles are correctly installed and aggregated, run the RBAC verification script at `scripts/verify-rbac-aggregation.sh` in the repository root: + +```bash +./scripts/verify-rbac-aggregation.sh +``` + +## Troubleshooting + +### "Forbidden" Error When Creating MaaSModelRef + +**Problem:** +```text +Error from server (Forbidden): maasmodelrefs.maas.opendatahub.io is forbidden: +User "user@example.com" cannot create resource "maasmodelrefs" in API group +"maas.opendatahub.io" in the namespace "my-models" +``` + +**Solution:** + +You need the `admin` or `edit` role in the namespace. Ask your platform administrator to grant you access: + +```bash +kubectl create rolebinding my-models-admin --clusterrole=admin --user=user@example.com -n my-models +``` + +### Cannot Create MaaSSubscription + +**Problem:** You get a "Forbidden" error when trying to create a MaaSSubscription. + +**Solution:** This is expected. `MaaSSubscription` and `MaaSAuthPolicy` are platform-managed resources and can only be created by cluster administrators. Contact your platform administrator if you need a new subscription. 
+ +## Related Documentation + +- [Model Setup Guide](model-setup.md) - How to configure models for MaaS +- [Quota and Access Configuration](quota-and-access-configuration.md) - Platform admin guide for subscriptions +- [Self-Service Model Access](../user-guide/self-service-model-access.md) - End user guide for using models via API diff --git a/docs/content/configuration-and-management/quota-and-access-configuration.md b/docs/content/configuration-and-management/quota-and-access-configuration.md index 62c9d1826..63684fe11 100644 --- a/docs/content/configuration-and-management/quota-and-access-configuration.md +++ b/docs/content/configuration-and-management/quota-and-access-configuration.md @@ -1,6 +1,6 @@ # Quota and Access Configuration -This guide provides step-by-step instructions for configuring MaaSModelRef, MaaSAuthPolicy, and MaaSSubscription. For conceptual overview, see [Access and Quota Overview](subscription-overview.md) and [MaaS Models](maas-models.md). +This guide provides step-by-step instructions for configuring MaaSModelRef, MaaSAuthPolicy, and MaaSSubscription. For conceptual overview, see [Access and Quota Overview](../concepts/subscription-overview.md) and [Model Reference](../concepts/model-reference.md). ## Prerequisites @@ -136,6 +136,13 @@ TRLP=$(kubectl get tokenratelimitpolicy -n ${MODEL_NS} -l maas.opendatahub.io/mo [[ -n "$TRLP" ]] && kubectl wait --for=condition=Enforced=true tokenratelimitpolicy/${TRLP} -n ${MODEL_NS} --timeout=120s ``` +!!! warning "Multiple model references on one HTTPRoute" + **This limitation affects v3.4 deployments.** More than one **MaaSModelRef** on the same route can break independent per-subscription limits—only one **TokenRateLimitPolicy** is fully effective at the gateway. For **MaaSSubscription** readiness, the controller checks each TRLP’s **`Accepted`** condition; Kuadrant may still show **`Enforced`** and **`Overridden`** (or similar **`reason`**) when policies conflict on one route. 
+ + **Planning guidance:** Prefer **one HTTPRoute per model** when different subscriptions need separate limits. Putting models on a shared route “by tier” still implies **multiple TRLPs** if **multiple** **MaaSModelRef** resources target that route—it only aligns with this limitation when **every** model on the route is meant to share **one** **MaaSSubscription** (and access policy) story. + + See [Subscription limitations and known issues](subscription-known-issues.md#token-rate-limits-when-multiple-model-references-share-one-httproute) for `kubectl`/`jq` examples and workarounds. + !!! note "Namespace requirements" Both **MaaSAuthPolicy** and **MaaSSubscription** must be installed in the `models-as-a-service` namespace. Each `modelRefs` entry must specify the `namespace` where the MaaSModelRef lives (e.g. `llm`). @@ -257,7 +264,7 @@ kubectl wait --for=condition=Enforced=true tokenratelimitpolicy/ -n ## Related Documentation -- [Access and Quota Overview](subscription-overview.md) — How policies and subscriptions work together -- [MaaS Models](maas-models.md) — Conceptual overview +- [Access and Quota Overview](../concepts/subscription-overview.md) — How policies and subscriptions work together +- [Model Reference](../concepts/model-reference.md) — Conceptual overview - [Token Management](token-management.md) - [Validation](../install/validation.md) diff --git a/docs/content/configuration-and-management/subscription-known-issues.md b/docs/content/configuration-and-management/subscription-known-issues.md index 8f55af505..6dfac9ffe 100644 --- a/docs/content/configuration-and-management/subscription-known-issues.md +++ b/docs/content/configuration-and-management/subscription-known-issues.md @@ -1,4 +1,4 @@ -# Subscription Known Issues +# Subscription limitations and known issues This document describes known issues and operational considerations for the subscription-based MaaS Platform. 
@@ -40,8 +40,57 @@ API keys store the user's groups and bound subscription name at creation time. I - Revoke and recreate API keys when users change groups - Use OpenShift tokens for interactive use when group membership changes frequently (tokens reflect live group membership) +## Token rate limits when multiple model references share one HTTPRoute + +**Impact:** High + +**Description:** + +When more than one **MaaSModelRef** resolves to the **same** **HTTPRoute**, the controller creates multiple **TokenRateLimitPolicy** resources targeting that route. Kuadrant then **enforces only one** of them in practice, so **per-subscription token limits may not all apply** even though CRs look valid. + +The **MaaS controller** treats a TRLP as healthy for **MaaSSubscription** status using the Kuadrant **`Accepted`** condition on each `TokenRateLimitPolicy`. Kuadrant also publishes runtime conditions such as **`Enforced`**; when multiple TRLPs conflict on one route you may see **`Enforced`** = True on one policy and **`Overridden`** (or similar) on others—check **`status.conditions`** (and **`reason`** / **`message`**) on each TRLP. + +**Detection:** + +List TRLPs that target an HTTPRoute, then inspect **`Accepted`** (controller readiness) and **`Enforced`** (gateway application): + +```bash +# List TRLPs that target an HTTPRoute (namespace/name → route name) +kubectl get tokenratelimitpolicy -A -o json | jq -r '.items[] | select(.spec.targetRef.kind=="HTTPRoute") | "\(.metadata.namespace)/\(.metadata.name) → \(.spec.targetRef.name)"' | sort + +# Accepted + Enforced condition status per TRLP (needs jq; if this fails, use kubectl describe on each TRLP) +kubectl get tokenratelimitpolicy -A -o json | jq -r ' + .items[] | select(.spec.targetRef.kind == "HTTPRoute") + | . 
as $i + | (($i.status.conditions // []) | map(select(.type == "Accepted")) | .[0]) as $a + | (($i.status.conditions // []) | map(select(.type == "Enforced")) | .[0]) as $e + | [ + $i.metadata.namespace, + $i.metadata.name, + $i.spec.targetRef.name, + (($a // {}) | .status // "?"), + (($e // {}) | .status // "?"), + (($e // {}) | .reason // "") + ] | @tsv' +``` + +**How to recognize it:** Several TRLPs share the same `spec.targetRef.name`. Compare **`Accepted`** (what the MaaS controller uses for subscription readiness) and **`Enforced`** / **`reason`** (for example **`Overridden`**) on each policy—one route may show one TRLP fully effective and others superseded. + +**Workarounds:** + +1. **Dedicated routes per model** — Deploy each model with its own HTTPRoute to ensure independent rate limiting +2. **Shared subscription design** — If models share an **HTTPRoute**, use **one** **MaaSSubscription** that lists every **MaaSModelRef** on that route so you are not applying **different** subscription limits to the same route. The controller may still create **one TRLP per model ref**; **prefer (1)** when each subscription must enforce limits independently until **Tracking** below ships. +3. **Route consolidation by tier** — If **multiple** **MaaSModelRef** resources still target the **same** **HTTPRoute**, you still get **multiple TRLPs**; grouping models by tier on shared routes does **not** change that by itself. Treat “premium” vs “free” as an operational label only. This pattern is **only** appropriate when **every** model on that shared route is meant to share **one** **MaaSSubscription** and a consistent **MaaSAuthPolicy** access story—**not** when different teams or subscriptions each register their own model refs on one route. If you need **separate** subscriptions with **separate** limits on the same route, use **dedicated routes per model** (1). + +**Status in v3.4:** + +This limitation **remains in Models-as-a-Service v3.4**.
The fix requiring merge strategy support for TokenRateLimitPolicy is not included. Plan your model deployment topology accordingly. + +**Tracking:** [opendatahub-io/models-as-a-service#585](https://github.com/opendatahub-io/models-as-a-service/pull/585) proposes the controller change for coexisting token rate limit policies on a shared route. + ## Related Documentation - [Understanding Token Management](token-management.md) -- [Access and Quota Overview](subscription-overview.md) +- [Access and Quota Overview](../concepts/subscription-overview.md) - [Quota and Access Configuration](quota-and-access-configuration.md) +- [MaaS Controller Overview](maas-controller-overview.md) diff --git a/docs/content/configuration-and-management/tls-configuration.md b/docs/content/configuration-and-management/tls-configuration.md index 904747c8c..a6757ba2a 100644 --- a/docs/content/configuration-and-management/tls-configuration.md +++ b/docs/content/configuration-and-management/tls-configuration.md @@ -191,9 +191,8 @@ Pre-configured overlays are available for common scenarios: | Overlay | Description | |---------|-------------| | `deployment/base/maas-api/overlays/tls` | Base TLS overlay for maas-api (deployment patch, service annotation, DestinationRule) | -| `deployment/overlays/tls-backend` | Full TLS deployment with Authorino configuration | -| `deployment/overlays/tls-backend-disk` | TLS + persistent storage (PVC) | -| `deployment/overlays/http-backend` | HTTP only (development/testing) | +| `maas-api/deploy/overlays/odh` | Tenant reconciler overlay (TLS, gateway policies, shared-patches) | +| `deployment/overlays/odh` | ODH operator overlay (TLS, controller, gateway policies, observability) | The `tls` base overlay includes: @@ -203,11 +202,9 @@ The `tls` base overlay includes: | `service-patch.yaml` | Add serving-cert annotation, expose port 8443 | | `destinationrule.yaml` | Configure gateway TLS to maas-api backend | -Deploy using: - -```bash -kustomize build 
--load-restrictor LoadRestrictionsNone deployment/overlays/tls-backend | kubectl apply -f - -``` +maas-api is deployed by the Tenant reconciler in `maas-controller`. The `deploy.sh` script +installs prerequisites (policy engine, PostgreSQL, Authorino TLS) and then deploys +`maas-controller`, which creates the `default-tenant` CR and reconciles maas-api via SSA. ## Verifying TLS Configuration diff --git a/docs/content/configuration-and-management/token-management.md b/docs/content/configuration-and-management/token-management.md index c15b3f76c..ae3b4caac 100644 --- a/docs/content/configuration-and-management/token-management.md +++ b/docs/content/configuration-and-management/token-management.md @@ -3,7 +3,7 @@ This guide explains the authentication and credential management used to access models in the MaaS Platform. !!! tip "API keys (current)" - The platform uses **API keys** (`sk-oai-*`) stored in PostgreSQL for programmatic access. Create keys via `POST /v1/api-keys` (authenticate with your OpenShift token) and use them with the `Authorization: Bearer` header. Each key is bound to one MaaSSubscription at creation time (optional `subscription` in the request body; if omitted, the **highest `spec.priority`** subscription you can access is chosen). See [Quota and Access Configuration](quota-and-access-configuration.md) and [Subscription Known Issues](subscription-known-issues.md). + The platform uses **API keys** (`sk-oai-*`) stored in PostgreSQL for programmatic access. Create keys via `POST /v1/api-keys` (authenticate with your OpenShift token) and use them with the `Authorization: Bearer` header. Each key is bound to one MaaSSubscription at creation time (optional `subscription` in the request body; if omitted, the **highest `spec.priority`** subscription you can access is chosen). See [Quota and Access Configuration](quota-and-access-configuration.md) and [Subscription limitations and known issues](subscription-known-issues.md). !!! 
note "Prerequisites" This document assumes you have configured subscriptions (MaaSAuthPolicy, MaaSSubscription). diff --git a/docs/content/configuration-and-management/troubleshooting-external-model-rbac.md b/docs/content/configuration-and-management/troubleshooting-external-model-rbac.md new file mode 100644 index 000000000..8ff70e378 --- /dev/null +++ b/docs/content/configuration-and-management/troubleshooting-external-model-rbac.md @@ -0,0 +1,83 @@ +# Troubleshooting: ExternalModel Service `ownerReference` / finalizers (RBAC) + +## Symptoms + +- `external-model-reconciler` logs show Service create failing with: + + ```text + cannot set blockOwnerDeletion if an ownerReference refers to a resource you can't set finalizers on + ``` + +- `MaaSModelRef` objects that reference `ExternalModel` backends stay `Pending` with a backend-not-ready reason. + +## Cause + +The reconciler sets a **controller `ownerReference`** on the `Service` it creates for an `ExternalModel`. With that pattern, the API server checks that the controller identity can **`update` the `externalmodels/finalizers` subresource** (OwnerReferencesPermissionEnforcement). + +If the **`maas-controller` ServiceAccount** is not allowed that verb on that subresource, Service creation fails before routes are healthy. + +## What to fix + +1. **ClusterRole** `maas-controller-role` must include a rule that allows `update` on `externalmodels/finalizers` for API group `maas.opendatahub.io`. + + Source manifest in this repository: `deployment/base/maas-controller/rbac/clusterrole.yaml`. + +2. **ClusterRoleBinding** `maas-controller-rolebinding` must bind that `ClusterRole` to the **`maas-controller` ServiceAccount** in the namespace where the controller runs (commonly `opendatahub` when using the ODH overlay). 
+ +On OpenShift, the `ModelsAsService` component may **own** these objects; if your live `ClusterRole` is missing the `externalmodels/finalizers` rule, upgrade or re-apply the manifest from this repo, or reconcile the component so the shipped RBAC matches. + +## How to verify (important) + +`oc auth can-i` **does not** treat `externalmodels/finalizers` as a single resource name the same way RBAC does. Using the slash form often returns **`no` even when the rule is present**. + +Use the **`--subresource=finalizers`** form instead: + +```bash +# Replace NAMESPACE with the namespace where ExternalModel CRs live (e.g. llm) +# Replace SA_NAMESPACE with the controller ServiceAccount namespace (e.g. opendatahub) + +oc auth can-i update externalmodels --subresource=finalizers \ + -n NAMESPACE \ + --as=system:serviceaccount:SA_NAMESPACE:maas-controller +``` + +You should see **`yes`**. + +**Incorrect (misleading false negative):** + +```bash +# Often prints "no" even when RBAC is correct — do not use for verification +oc auth can-i update externalmodels/finalizers -n NAMESPACE \ + --as=system:serviceaccount:SA_NAMESPACE:maas-controller +``` + +## Optional: add the rule with `oc patch` + +If you must patch the live `ClusterRole` (for example before an operator update ships the rule): + +```bash +oc patch clusterrole maas-controller-role --type=json -p='[ + { + "op": "add", + "path": "/rules/-", + "value": { + "apiGroups": ["maas.opendatahub.io"], + "resources": ["externalmodels/finalizers"], + "verbs": ["update"] + } + } +]' +``` + +Then verify with the **`--subresource=finalizers`** command above, not the slash form. + +## What we changed in docs (2026-04-14) + +- Documented that **`oc auth can-i update externalmodels/finalizers`** can incorrectly report **`no`** when permission exists. +- Documented the supported check: **`oc auth can-i update externalmodels --subresource=finalizers`**. 
+- Pointed to **`deployment/base/maas-controller/rbac/clusterrole.yaml`** as the in-repo source for the `maas-controller-role` rules. + +## Related + +- [Namespace user permissions (RBAC)](namespace-rbac.md) +- [MaaS controller overview](maas-controller-overview.md) diff --git a/docs/content/index.md b/docs/content/index.md index beb90168e..843862a6c 100644 --- a/docs/content/index.md +++ b/docs/content/index.md @@ -11,16 +11,18 @@ Use this platform to streamline the deployment of your models, monitor usage, an ### 🚀 Getting Started - **[QuickStart Guide](quickstart.md)** - Complete platform deployment instructions -- **[Architecture](architecture.md)** - Overview of the MaaS Platform architecture +- **[Architecture](concepts/architecture.md)** - Overview of the MaaS Platform architecture ### âš™ī¸ Configuration & Management -- **[Access and Quota Overview](configuration-and-management/subscription-overview.md)** - Policies (access) and subscriptions (quota) for model access +- **[Access and Quota Overview](concepts/subscription-overview.md)** - Policies (access) and subscriptions (quota) for model access +- **[Subscription limitations and known issues](configuration-and-management/subscription-known-issues.md)** - Rate limits on shared routes, API keys, caching, and other planning notes - **[Model Setup (On Cluster)](configuration-and-management/model-setup.md)** - Setting up models for MaaS - **[Self-Service Model Access](user-guide/self-service-model-access.md)** - Managing model access and policies ### 📋 Release Notes +- **[Release notes](release-notes/index.md)** - Version highlights and known limitations by release ### 🔧 Advanced Administration diff --git a/docs/content/install/maas-setup.md b/docs/content/install/maas-setup.md index f175eb475..3ddb7276b 100644 --- a/docs/content/install/maas-setup.md +++ b/docs/content/install/maas-setup.md @@ -185,12 +185,52 @@ After creating the database Secret and Gateways, create or update your DataScien - **MaaS API 
AuthPolicy** (maas-api-auth-policy) - Protects the MaaS API endpoint - **NetworkPolicy** (maas-authorino-allow) - Allows Authorino to reach MaaS API + ### ModelsAsService CR + + With `modelsAsService` **Managed**, the [Open Data Hub operator](https://github.com/opendatahub-io/opendatahub-operator) reconciles a **cluster-scoped** `ModelsAsService` object. The resource name **must** be `default-modelsasservice` (only one instance per cluster). The authoritative API definition is in the operator repo: [`modelsasservice_types.go`](https://github.com/opendatahub-io/opendatahub-operator/blob/main/api/components/v1alpha1/modelsasservice_types.go). + + **Nothing in `spec` is required for a default install.** If you omit `spec`, the operator uses the same defaults as this guide: Gateway **`openshift-ingress` / `maas-default-gateway`**, and telemetry metric toggles use the defaults described below. + + | Field | What to set | + | ----- | ----------- | + | `spec.gatewayRef.namespace` | Namespace of your Gateway API `Gateway` (default `openshift-ingress`). | + | `spec.gatewayRef.name` | Name of that `Gateway` (default `maas-default-gateway`). Set these if your MaaS hostname is exposed through a different Gateway than the default. | + | `spec.apiKeys.maxExpirationDays` | Maximum allowed API key lifetime in **days**. When set, users cannot mint keys with a longer lifetime than this value (via `expiresIn`). Optional; if unset, the operator does not apply a cap through this field (see also `maas-api` / `API_KEY_MAX_EXPIRATION_DAYS` in your deployment). | + | `spec.telemetry.metrics.captureOrganization` | Include `organization_id` on metrics (default `true`). | + | `spec.telemetry.metrics.captureUser` | Include user labels on metrics (default `false`; privacy-sensitive). | + | `spec.telemetry.metrics.captureGroup` | Include group labels on metrics (default `false`; higher cardinality). 
| + | `spec.telemetry.metrics.captureModelUsage` | Include model labels on usage metrics (default `true`). | + + Example (patch common values): + + ```yaml + apiVersion: components.platform.opendatahub.io/v1alpha1 + kind: ModelsAsService + metadata: + name: default-modelsasservice + spec: + gatewayRef: + namespace: openshift-ingress + name: maas-default-gateway + apiKeys: + maxExpirationDays: 90 + telemetry: + metrics: + captureUser: false + captureGroup: false + ``` + + ```bash + kubectl apply -f modelsasservice.yaml + kubectl get modelsasservice default-modelsasservice -o yaml + ``` + === "Kustomize" !!! note "Development and early testing" Kustomize deployment can be used for **development and early testing purposes**. For production, use the Managed tab above. - Set `modelsAsService` to **Unmanaged** so the operator does not deploy the MaaS API, then deploy MaaS via the ODH overlay: + Set `modelsAsService` to **Removed** so the operator does not deploy the MaaS API, then deploy MaaS via the ODH overlay: ```yaml kubectl apply -f - < + +``` + +**Benefits**: +- Better UX than swagger-ui +- Supports examples, tutorials +- Can embed in existing docs + +### Phase 4: Automation (Lower Priority) + +#### 4.1 Auto-generate from Code Annotations +**Goal**: Generate spec from Go code + +**Tool**: [swaggo/swag](https://github.com/swaggo/swag) + +**Trade-offs**: +- Pro: Single source of truth (code) +- Pro: Can't drift from implementation +- Con: Requires refactoring all handlers +- Con: Annotations clutter code +- **Decision**: Defer until spec stabilizes + +#### 4.2 Mock Server for Development +**Goal**: Frontend can develop against spec before backend ready + +**Tool**: [Prism](https://stoplight.io/open-source/prism) + +**Implementation**: +```bash +# Run mock server +prism mock maas-api/openapi3.yaml +# Returns example responses from spec +``` + +**Benefits**: +- Parallel frontend/backend development +- Can test edge cases +- Useful for demos + +## Recommended 
Implementation Order + +1. **Week 1**: Spectral validation in CI +2. **Week 2**: Breaking change detection +3. **Week 3**: Contract testing (dredd) +4. **Week 4**: Client SDK generation (Python) + +## Success Metrics + +- Zero spec validation errors +- No breaking changes merged without approval +- 100% endpoint coverage in contract tests +- Client SDK published to PyPI + +## Open Questions + +1. Should we version the API (v1, v2)? +2. Who owns spec updates (backend team only or shared)? +3. Should we enforce spec-first development (spec then code)? +4. Do we want runtime validation in production (performance impact)? diff --git a/docs/samples/maas-system/README.md b/docs/samples/maas-system/README.md index 3a44b40e6..67499017d 100644 --- a/docs/samples/maas-system/README.md +++ b/docs/samples/maas-system/README.md @@ -10,6 +10,7 @@ Bundled samples that deploy LLMInferenceService + MaaSModelRef + MaaSAuthPolicy | **premium** | premium-user | premium-simulated-simulated-premium | 1000/min | | **facebook-opt-125m-cpu** | system:authenticated | facebook-opt-125m-cpu-single-node-no-scheduler-cpu | 100/min | | **qwen3** | system:authenticated | qwen3-single-node-no-scheduler-nvidia-gpu | 100/min | +| **granite-3-1-8b-rhelai-modelcar** | system:authenticated | granite-3-1-8b-rhelai-modelcar-single-node-cpu (LLMIS in `llm`) | 10000/min | ## Usage @@ -25,6 +26,7 @@ kustomize build docs/samples/maas-system/ | kubectl apply -f - # Or deploy a specific sample kustomize build docs/samples/maas-system/facebook-opt-125m-cpu/ | kubectl apply -f - kustomize build docs/samples/maas-system/qwen3/ | kubectl apply -f - +kustomize build docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/ | kubectl apply -f - # Verify kubectl get maasmodelref -n llm diff --git a/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/kustomization.yaml b/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/kustomization.yaml new file mode 100644 index 000000000..b19766004 --- /dev/null +++ 
b/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - llm + - maas diff --git a/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/llm/kustomization.yaml b/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/llm/kustomization.yaml new file mode 100644 index 000000000..f06dac931 --- /dev/null +++ b/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/llm/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../../../models/granite-3-1-8b-rhelai-modelcar diff --git a/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/maas/kustomization.yaml b/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/maas/kustomization.yaml new file mode 100644 index 000000000..849ad5197 --- /dev/null +++ b/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/maas/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + + +resources: + - maas-model.yaml + - maas-auth-policy.yaml + - maas-subscription.yaml diff --git a/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/maas/maas-auth-policy.yaml b/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/maas/maas-auth-policy.yaml new file mode 100644 index 000000000..0539186bd --- /dev/null +++ b/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/maas/maas-auth-policy.yaml @@ -0,0 +1,12 @@ +apiVersion: maas.opendatahub.io/v1alpha1 +kind: MaaSAuthPolicy +metadata: + name: granite-3-1-8b-starter-access + namespace: models-as-a-service +spec: + modelRefs: + - name: granite-3-1-8b-starter + subjects: + groups: + - name: system:authenticated + users: [] diff --git a/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/maas/maas-model.yaml b/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/maas/maas-model.yaml new file mode 100644 index 000000000..eabc8d1a7 --- /dev/null +++ 
b/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/maas/maas-model.yaml @@ -0,0 +1,14 @@ +apiVersion: maas.opendatahub.io/v1alpha1 +kind: MaaSModelRef +metadata: + name: granite-3-1-8b-starter + namespace: llm + annotations: + openshift.io/display-name: "IBM Granite 3.1 8B Instruct (RHEL AI model car + vLLM CPU)" + openshift.io/description: "MaaS model ref for LLMIS from docs/samples/models/granite-3-1-8b-rhelai-modelcar (kustomize namePrefix + single-node-cpu)." +spec: + modelRef: + kind: LLMInferenceService + # kustomize build docs/samples/models/granite-3-1-8b-rhelai-modelcar → granite-3-1-8b-rhelai-modelcar-single-node-cpu + name: granite-3-1-8b-rhelai-modelcar-single-node-cpu + namespace: llm diff --git a/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/maas/maas-subscription.yaml b/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/maas/maas-subscription.yaml new file mode 100644 index 000000000..7aa36e667 --- /dev/null +++ b/docs/samples/maas-system/granite-3-1-8b-rhelai-modelcar/maas/maas-subscription.yaml @@ -0,0 +1,16 @@ +apiVersion: maas.opendatahub.io/v1alpha1 +kind: MaaSSubscription +metadata: + name: granite-3-1-8b-starter-subscription + namespace: models-as-a-service +spec: + owner: + groups: + - name: system:authenticated + users: [] + modelRefs: + - name: granite-3-1-8b-starter + namespace: llm + tokenRateLimits: + - limit: 10000 + window: 1m diff --git a/docs/samples/models/README.md b/docs/samples/models/README.md index 98976f155..a378cb903 100644 --- a/docs/samples/models/README.md +++ b/docs/samples/models/README.md @@ -9,6 +9,7 @@ This directory contains `LLMInferenceService`s for deploying sample models. 
Plea - **facebook-opt-125m-cpu** - Facebook OPT 125M model (CPU-based) - **qwen3** - Qwen3 model (GPU-based with autoscaling) - **ibm-granite-2b-gpu** - IBM Granite 2B Instruct model (GPU-based, supports instructions) +- **granite-3-1-8b-rhelai-modelcar** - Granite 3.1 8B Instruct via Red Hat model car OCI + `vllm-cpu-rhel9` (CPU; see comments in `model.yaml`) ## Deployment @@ -23,7 +24,7 @@ kubectl create namespace llm Deploy any model using: ```bash -MODEL_NAME=simulator # or simulator-premium, facebook-opt-125m-cpu, qwen3, or ibm-granite-2b-gpu +MODEL_NAME=simulator # or simulator-premium, facebook-opt-125m-cpu, qwen3, ibm-granite-2b-gpu, granite-3-1-8b-rhelai-modelcar kustomize build docs/samples/models/$MODEL_NAME | kubectl apply -f - ``` diff --git a/docs/samples/models/e2e-distinct-2-simulated/model.yaml b/docs/samples/models/e2e-distinct-2-simulated/model.yaml index 0821d0cb1..347a27dac 100644 --- a/docs/samples/models/e2e-distinct-2-simulated/model.yaml +++ b/docs/samples/models/e2e-distinct-2-simulated/model.yaml @@ -17,9 +17,8 @@ spec: template: containers: - name: main - image: "ghcr.io/llm-d/llm-d-inference-sim:v0.7.1" + image: "ghcr.io/llm-d/llm-d-inference-sim:v0.8.2" imagePullPolicy: Always - command: ["/app/llm-d-inference-sim"] args: - --port - "8000" @@ -27,6 +26,7 @@ spec: - test/e2e-distinct-model-2 - --mode - random + - --no-mm-encoder-only - --ssl-certfile - /var/run/kserve/tls/tls.crt - --ssl-keyfile @@ -42,6 +42,11 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP ports: - name: https containerPort: 8000 diff --git a/docs/samples/models/e2e-distinct-simulated/model.yaml b/docs/samples/models/e2e-distinct-simulated/model.yaml index 4e1164b9a..22fde7752 100644 --- a/docs/samples/models/e2e-distinct-simulated/model.yaml +++ b/docs/samples/models/e2e-distinct-simulated/model.yaml @@ -17,9 +17,8 @@ spec: template: containers: - name: main - image: 
"ghcr.io/llm-d/llm-d-inference-sim:v0.7.1" + image: "ghcr.io/llm-d/llm-d-inference-sim:v0.8.2" imagePullPolicy: Always - command: ["/app/llm-d-inference-sim"] args: - --port - "8000" @@ -27,6 +26,7 @@ spec: - test/e2e-distinct-model - --mode - random + - --no-mm-encoder-only - --ssl-certfile - /var/run/kserve/tls/tls.crt - --ssl-keyfile @@ -42,6 +42,11 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP ports: - name: https containerPort: 8000 diff --git a/docs/samples/models/granite-3-1-8b-rhelai-modelcar/kustomization.yaml b/docs/samples/models/granite-3-1-8b-rhelai-modelcar/kustomization.yaml new file mode 100644 index 000000000..85af139c4 --- /dev/null +++ b/docs/samples/models/granite-3-1-8b-rhelai-modelcar/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +metadata: + name: granite-3-1-8b-rhelai-modelcar-single-node-cpu + +namespace: llm + +namePrefix: granite-3-1-8b-rhelai-modelcar- + +resources: + - model.yaml diff --git a/docs/samples/models/granite-3-1-8b-rhelai-modelcar/model.yaml b/docs/samples/models/granite-3-1-8b-rhelai-modelcar/model.yaml new file mode 100644 index 000000000..34a890972 --- /dev/null +++ b/docs/samples/models/granite-3-1-8b-rhelai-modelcar/model.yaml @@ -0,0 +1,73 @@ +# LLMInferenceService — Granite 3.1 8B: Red Hat *model car* (weights) + Red Hat *vLLM runtime* (server). +# +# ## Why you saw: `vllm: command not found` (often around “line 129” of a generated script) +# KServe / OpenShift AI generates a launcher script that runs `vllm ...`. That only works when +# the **predictor container image** is a vLLM runtime (e.g. rhaiis/vllm-*-rhel9) where `vllm` +# is on PATH. +# +# The image `registry.redhat.io/rhelai1/modelcar-granite-3-1-8b-starter-v1:1.4` is a **ModelCar**: +# it carries model artifacts (typically under `/models`). 
# It is **not** a substitute for the +# vLLM ServingRuntime image. Using the modelcar as `template.containers[].image` leaves no +# `vllm` binary where the script expects it → `vllm: command not found`. +# +# **Correct split:** `spec.model.uri` = `oci://…` modelcar (weights) + `template.containers[].image` +# = Red Hat AI Inference Server vLLM image (CPU or CUDA tag per your cluster). +# +# Pin `rhaiis/vllm-cpu-rhel9` / `vllm-cuda-rhel9` tags to what your OpenShift AI / subscription documents. +# GPU: add nodeSelector/tolerations and `nvidia.com/gpu` limits; switch image to vllm-cuda-rhel9. +# +# Requires: registry.redhat.io pull secret, AVX2 for CPU inference (see RHAIIS docs). +apiVersion: serving.kserve.io/v1alpha1 +kind: LLMInferenceService +metadata: + # With namePrefix in kustomization.yaml → LLMIS name granite-3-1-8b-rhelai-modelcar-single-node-cpu + name: single-node-cpu +spec: + model: + # Model weights (ModelCar OCI) — mounted/consumed by the platform; not the predictor image. + uri: oci://registry.redhat.io/rhelai1/modelcar-granite-3-1-8b-starter-v1:1.4 + name: ibm-granite/granite-3.1-8b-instruct + replicas: 1 + router: + route: {} + gateway: + refs: + - name: maas-default-gateway + namespace: openshift-ingress + template: + containers: + - name: main + # vLLM runtime (has `vllm` on PATH for the KServe launcher). NOT the modelcar image. + image: registry.redhat.io/rhaiis/vllm-cpu-rhel9:3.3.0 + imagePullPolicy: IfNotPresent + env: + - name: VLLM_CPU_KVCACHE_SPACE + value: "3" + # Omit --trust-remote-code: vLLM warns it only applies to HF AutoModel paths; no effect for mounted/OCI weights.
+ - name: VLLM_ADDITIONAL_ARGS + value: "--max-model-len 4096" + resources: + requests: + cpu: "3" + memory: 16Gi + limits: + cpu: "6" + memory: 24Gi + livenessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTPS + initialDelaySeconds: 300 + periodSeconds: 30 + timeoutSeconds: 30 + failureThreshold: 5 + readinessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTPS + initialDelaySeconds: 300 + periodSeconds: 15 + timeoutSeconds: 15 + failureThreshold: 30 diff --git a/docs/samples/models/simulator-premium/model.yaml b/docs/samples/models/simulator-premium/model.yaml index 256856425..9378c2894 100644 --- a/docs/samples/models/simulator-premium/model.yaml +++ b/docs/samples/models/simulator-premium/model.yaml @@ -17,16 +17,16 @@ spec: template: containers: - name: main - image: "ghcr.io/llm-d/llm-d-inference-sim:v0.7.1" + image: "ghcr.io/llm-d/llm-d-inference-sim:v0.8.2" imagePullPolicy: Always - command: ["/app/llm-d-inference-sim"] args: - --port - - "8000" + - "8000" - --model - facebook/opt-125m - --mode - random + - --no-mm-encoder-only - --ssl-certfile - /var/run/kserve/tls/tls.crt - --ssl-keyfile @@ -42,6 +42,11 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP ports: - name: https containerPort: 8000 diff --git a/docs/samples/models/simulator/model.yaml b/docs/samples/models/simulator/model.yaml index 32fdf7efa..2579c0979 100644 --- a/docs/samples/models/simulator/model.yaml +++ b/docs/samples/models/simulator/model.yaml @@ -16,17 +16,21 @@ spec: namespace: openshift-ingress template: containers: + # llm-d-inference-sim: OpenAI-compatible HTTP on --port; /health + /ready for probes. + # Image ENTRYPOINT is /app/llm-d-inference-sim (v0.8.x); args are appended to it. 
+ # See: https://github.com/llm-d/llm-d-inference-sim - name: main - image: "ghcr.io/llm-d/llm-d-inference-sim:v0.7.1" + image: "ghcr.io/llm-d/llm-d-inference-sim:v0.8.2" imagePullPolicy: Always - command: ["/app/llm-d-inference-sim"] args: - --port - - "8000" + - "8000" - --model - facebook/opt-125m - --mode - random + # Keep full OpenAI surface (e.g. /v1/embeddings); do not use encoder-only mode. + - --no-mm-encoder-only - --ssl-certfile - /var/run/kserve/tls/tls.crt - --ssl-keyfile @@ -42,6 +46,11 @@ spec: fieldRef: apiVersion: v1 fieldPath: metadata.namespace + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP ports: - name: https containerPort: 8000 diff --git a/maas-api/README.md b/maas-api/README.md index 842bae8a4..3e2193309 100644 --- a/maas-api/README.md +++ b/maas-api/README.md @@ -13,9 +13,8 @@ !!! warning "Database Required" The maas-api **requires** a PostgreSQL database and will fail to start without it. You must create a Secret named `maas-db-config` with the `DB_CONNECTION_URL` key before deploying. - For development, the `scripts/deploy.sh` script creates this automatically. - For production ODH/RHOAI deployments, see [Database Prerequisites](../docs/content/install/prerequisites.md#database-prerequisite). + For production ODH/RHOAI deployments, see [Database Setup](../docs/content/install/maas-setup.md#database-setup). ### Setup @@ -271,6 +270,56 @@ curl -sSk \ "${HOST}/maas-api/v1/api-keys/search" | jq . ``` +## Configuration + +The maas-api server is configured via **environment variables** or **CLI flags** (CLI flags take precedence). + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `DEBUG_MODE` | `false` | Enable debug logging. Set to `true` or `1`. | +| `NAMESPACE` | `maas-api` | Namespace where maas-api is deployed. | +| `GATEWAY_NAME` | `maas-default-gateway` | Name of the Gateway resource used for model routing. 
| +| `GATEWAY_NAMESPACE` | `openshift-ingress` | Namespace of the Gateway resource. | +| `MAAS_SUBSCRIPTION_NAMESPACE` | `models-as-a-service` | Namespace where MaaSSubscription CRs are located. | +| `INSTANCE_NAME` | Value of `GATEWAY_NAME` | Name of the MaaS instance (for logging/identification). | +| `SECURE` | `false` | Enable HTTPS. Requires TLS configuration. | +| `ADDRESS` | `:8443` (HTTPS) or `:8080` (HTTP) | Server listen address (host:port). | +| `PORT` | - | **DEPRECATED.** Use `ADDRESS` with `SECURE=false` instead. | +| `API_KEY_MAX_EXPIRATION_DAYS` | `90` | Maximum allowed API key lifetime in days. Users cannot create keys with longer expiration. Minimum: 1. | +| `ACCESS_CHECK_TIMEOUT_SECONDS` | `15` | Timeout for model access validation during `/v1/models` requests. Models that don't respond within this window are excluded. Minimum: 1. | +| `TLS_CERT` | - | Path to TLS certificate file (PEM format). Required if `SECURE=true` and not using self-signed cert. | +| `TLS_KEY` | - | Path to TLS private key file (PEM format). Required if `SECURE=true` and not using self-signed cert. | +| `TLS_SELF_SIGNED` | `false` | Generate self-signed certificate. Alternative to providing `TLS_CERT`/`TLS_KEY`. | + +!!! note "Database Configuration" + The database connection URL is loaded from the Kubernetes secret `maas-db-config` (key: `DB_CONNECTION_URL`) in the same namespace as the maas-api pod. See [Database Configuration](#database-configuration) below. + +!!! note "TLS Minimum Version" + The minimum TLS version can be configured via the `--tls-min-version` CLI flag (default: `1.2`). Environment variable configuration is not currently supported. + +### CLI Flags + +Most environment variables have corresponding CLI flags. When both are provided, CLI flags take precedence. Note that `API_KEY_MAX_EXPIRATION_DAYS` and `ACCESS_CHECK_TIMEOUT_SECONDS` are environment variable only and have no CLI flag equivalents. 
+ +| Flag | Env Var | Default | Description | +|------|---------|---------|-------------| +| `--debug` | `DEBUG_MODE` | `false` | Enable debug mode. | +| `--namespace` | `NAMESPACE` | `maas-api` | Namespace of the MaaS instance. | +| `--name` | `INSTANCE_NAME` | Value of `--gateway-name` | Name of the MaaS instance. | +| `--gateway-name` | `GATEWAY_NAME` | `maas-default-gateway` | Name of the Gateway resource. | +| `--gateway-namespace` | `GATEWAY_NAMESPACE` | `openshift-ingress` | Namespace where Gateway is deployed. | +| `--maas-subscription-namespace` | `MAAS_SUBSCRIPTION_NAMESPACE` | `models-as-a-service` | Namespace where MaaSSubscription CRs are located. | +| `--secure` | `SECURE` | `false` | Use HTTPS. Requires TLS configuration. | +| `--address` | `ADDRESS` | `:8443` or `:8080` | HTTPS listen address. | +| `--port` | `PORT` | - | **DEPRECATED.** Use `--address` with `--secure=false`. | +| `--tls-cert` | `TLS_CERT` | - | Path to TLS certificate. | +| `--tls-key` | `TLS_KEY` | - | Path to TLS private key. | +| `--tls-self-signed` | `TLS_SELF_SIGNED` | `false` | Generate self-signed certificate. | +| `--tls-min-version` | - | `1.2` | Minimum TLS version (`1.2` or `1.3`). | + + ### Database Configuration maas-api uses PostgreSQL for persistent storage of API key metadata. The database connection is configured via a Kubernetes Secret. @@ -278,7 +327,7 @@ maas-api uses PostgreSQL for persistent storage of API key metadata. The databas !!! note "Automatic Setup" When using `scripts/deploy.sh` for development, PostgreSQL is deployed automatically with the secret created. -For production deployments, see the [Database Prerequisites](../docs/content/install/prerequisites.md#database-prerequisite) guide. +For production deployments, see the [Database Setup](../docs/content/install/maas-setup.md#database-setup) guide. 
#### Listing models with subscription filtering diff --git a/maas-api/cmd/main.go b/maas-api/cmd/main.go index a32262721..443212289 100644 --- a/maas-api/cmd/main.go +++ b/maas-api/cmd/main.go @@ -148,7 +148,7 @@ func registerHandlers(ctx context.Context, log *logger.Logger, router *gin.Engin subscriptionSelector := subscription.NewSelector(log, cluster.MaaSSubscriptionLister) - modelManager, err := models.NewManager(log) + modelManager, err := models.NewManager(log, cfg.AccessCheckTimeoutSeconds) if err != nil { log.Fatal("Failed to create model manager", "error", err) } diff --git a/maas-api/deploy/overlays/odh/kustomization.yaml b/maas-api/deploy/overlays/odh/kustomization.yaml index e9101516e..80c51dd4d 100644 --- a/maas-api/deploy/overlays/odh/kustomization.yaml +++ b/maas-api/deploy/overlays/odh/kustomization.yaml @@ -3,35 +3,47 @@ kind: Kustomization metadata: name: maas-api-opendatahub - -# Overlay to be used by OpenDataHub Operator to install MaaS component itself. + +# Overlay used by the Tenant reconciler (maas-controller) to render maas-api +# platform workloads. CustomizeParams writes gateway, namespace, and audience +# values into params.env before kustomize build; shared-patches then +# substitutes them into Deployments, HTTPRoutes, and AuthPolicies. resources: -- ../../../../deployment/base/maas-api +- ../../../../deployment/base/maas-api/overlays/tls +- ../../../../deployment/base/maas-controller/policies namespace: opendatahub labels: -- includeSelectors: true - pairs: +- pairs: app.kubernetes.io/component: api app.kubernetes.io/name: maas-api -# This approach is used across ODH components to set the image through GitOps +components: + - ../../../../deployment/components/shared-patches + configMapGenerator: -- name: maas-api-deployment-config +- name: maas-parameters envs: - params.env options: disableNameSuffixHash: true +generatorOptions: + disableNameSuffixHash: true + +# DestinationRule must live in the gateway namespace, not app namespace. 
+# The overlay's `namespace:` directive overrides all resources to opendatahub, +# so we fix it here (replacements run after namespace transformer). replacements: - source: - fieldPath: data.maas-api-image kind: ConfigMap - name: maas-api-deployment-config + version: v1 + name: maas-parameters + fieldPath: data.gateway-namespace targets: - - fieldPaths: - - spec.template.spec.containers.[name=maas-api].image - select: - kind: Deployment - name: maas-api + - select: + kind: DestinationRule + name: maas-api-backend-tls + fieldPaths: + - metadata.namespace diff --git a/maas-api/deploy/overlays/odh/params.env b/maas-api/deploy/overlays/odh/params.env index 1efbcd69d..0835f628d 100644 --- a/maas-api/deploy/overlays/odh/params.env +++ b/maas-api/deploy/overlays/odh/params.env @@ -1,2 +1,12 @@ -# Image configuration +# Image configuration (overridden by RELATED_IMAGE_* env vars at runtime) maas-api-image=quay.io/opendatahub/maas-api:latest +maas-controller-image=quay.io/opendatahub/maas-controller:latest +# Gateway configuration (overridden by CustomizeParams from Tenant spec) +gateway-namespace=openshift-ingress +gateway-name=maas-default-gateway +# Application namespace (overridden by CustomizeParams) +app-namespace=opendatahub +# Cluster audience for kubernetesTokenReview (overridden by CustomizeParams from Authentication/cluster) +cluster-audience=https://kubernetes.default.svc +# API key cleanup CronJob image +maas-api-key-cleanup-image=registry.redhat.io/ubi9/ubi-minimal:9.7 diff --git a/maas-api/go.mod b/maas-api/go.mod index 009584e34..6ece3c3b2 100644 --- a/maas-api/go.mod +++ b/maas-api/go.mod @@ -8,13 +8,13 @@ require ( github.com/golang-jwt/jwt/v5 v5.3.0 github.com/golang-migrate/migrate/v4 v4.19.1 github.com/google/uuid v1.6.0 - github.com/jackc/pgx/v5 v5.7.6 + github.com/jackc/pgx/v5 v5.9.2 github.com/kserve/kserve v0.0.0-20251121160314-57d83d202f36 github.com/lib/pq v1.10.9 github.com/openai/openai-go/v2 v2.3.1 github.com/stretchr/testify v1.11.1 
go.uber.org/zap v1.27.0 - golang.org/x/sync v0.18.0 + golang.org/x/sync v0.19.0 k8s.io/api v0.34.1 k8s.io/apimachinery v0.34.1 k8s.io/client-go v0.34.1 @@ -24,11 +24,11 @@ require ( ) require ( - cel.dev/expr v0.24.0 // indirect + cel.dev/expr v0.25.1 // indirect cloud.google.com/go v0.121.6 // indirect cloud.google.com/go/auth v0.16.4 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect - cloud.google.com/go/compute/metadata v0.8.0 // indirect + cloud.google.com/go/compute/metadata v0.9.0 // indirect cloud.google.com/go/iam v1.5.2 // indirect cloud.google.com/go/monitoring v1.24.2 // indirect cloud.google.com/go/storage v1.56.0 // indirect @@ -37,7 +37,7 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.2 // indirect github.com/AzureAD/microsoft-authentication-library-for-go v1.5.0 // indirect - github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.29.0 // indirect + github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.53.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.53.0 // indirect github.com/aws/aws-sdk-go v1.55.6 // indirect @@ -46,11 +46,11 @@ require ( github.com/bytedance/sonic/loader v0.2.4 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cloudwego/base64x v0.1.5 // indirect - github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443 // indirect + github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/emicklei/go-restful/v3 v3.13.0 // indirect - github.com/envoyproxy/go-control-plane/envoy v1.32.4 // indirect - github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect + github.com/envoyproxy/go-control-plane/envoy v1.36.0 // indirect + 
github.com/envoyproxy/protoc-gen-validate v1.3.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect @@ -97,7 +97,7 @@ require ( github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.17.0 // indirect github.com/spf13/pflag v1.0.10 // indirect - github.com/spiffe/go-spiffe/v2 v2.5.0 // indirect + github.com/spiffe/go-spiffe/v2 v2.6.0 // indirect github.com/tidwall/gjson v1.18.0 // indirect github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.1 // indirect @@ -105,9 +105,8 @@ require ( github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.3.0 // indirect github.com/x448/float16 v0.8.4 // indirect - github.com/zeebo/errs v1.4.0 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect - go.opentelemetry.io/contrib/detectors/gcp v1.36.0 // indirect + go.opentelemetry.io/contrib/detectors/gcp v1.39.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.61.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 // indirect go.opentelemetry.io/otel v1.43.0 // indirect @@ -119,20 +118,20 @@ require ( go.yaml.in/yaml/v2 v2.4.2 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/arch v0.18.0 // indirect - golang.org/x/crypto v0.45.0 // indirect - golang.org/x/net v0.47.0 // indirect - golang.org/x/oauth2 v0.30.0 // indirect + golang.org/x/crypto v0.46.0 // indirect + golang.org/x/net v0.48.0 // indirect + golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/sys v0.42.0 // indirect - golang.org/x/term v0.37.0 // indirect - golang.org/x/text v0.31.0 // indirect + golang.org/x/term v0.38.0 // indirect + golang.org/x/text v0.32.0 // indirect golang.org/x/time v0.12.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect google.golang.org/api v0.247.0 // indirect google.golang.org/genproto 
v0.0.0-20250603155806-513f23925822 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20250818200422-3122310a409c // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250826171959-ef028d996bc1 // indirect - google.golang.org/grpc v1.75.1 // indirect - google.golang.org/protobuf v1.36.8 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect + google.golang.org/grpc v1.79.3 // indirect + google.golang.org/protobuf v1.36.10 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/go-playground/validator.v9 v9.31.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect diff --git a/maas-api/go.sum b/maas-api/go.sum index 5ef918e0b..877aa9d99 100644 --- a/maas-api/go.sum +++ b/maas-api/go.sum @@ -1,5 +1,5 @@ -cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY= -cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= +cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= +cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= @@ -13,8 +13,8 @@ cloud.google.com/go/auth v0.16.4/go.mod h1:j10ncYwjX/g3cdX7GpEzsdM+d+ZNsXAbb6qXA cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= -cloud.google.com/go/compute/metadata v0.8.0 h1:HxMRIbao8w17ZX6wBnjhcDkW6lTFpgcaobyVfZWqRLA= -cloud.google.com/go/compute/metadata v0.8.0/go.mod h1:sYOGTp851OV9bOFJ9CH7elVvyzopvWQFNNghtDQ/Biw= 
+cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs= +cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/iam v1.5.2 h1:qgFRAGEmd8z6dJ/qyEchAuL9jpswyODjA2lS+w234g8= cloud.google.com/go/iam v1.5.2/go.mod h1:SE1vg0N81zQqLzQEwxL2WI6yhetBdbNQuTvIKCSkUHE= @@ -49,8 +49,8 @@ github.com/AzureAD/microsoft-authentication-library-for-go v1.5.0 h1:XkkQbfMyuH2 github.com/AzureAD/microsoft-authentication-library-for-go v1.5.0/go.mod h1:HKpQxkWaGLJ+D/5H8QRpyQXA1eKjxkFlOMwck5+33Jk= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= -github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.29.0 h1:UQUsRi8WTzhZntp5313l+CHIAT95ojUI2lpP/ExlZa4= -github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.29.0/go.mod h1:Cz6ft6Dkn3Et6l2v2a9/RpN7epQ1GtDlO6lj8bEcOvw= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0 h1:sBEjpZlNHzK1voKq9695PJSX2o5NEXl7/OL3coiIY0c= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0/go.mod h1:P4WPRUkOhJC13W//jWpyfJNDAIpvRbAUIYLX/4jtlE0= github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.53.0 h1:owcC2UnmsZycprQ5RfRgjydWhuoxg71LUfyiQdijZuM= github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.53.0/go.mod h1:ZPpqegjbE99EPKsu3iUWV22A04wzGPcAY/ziSIQEEgs= github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/cloudmock v0.53.0 h1:4LP6hvB4I5ouTbGgWtixJhgED6xdf67twf9PoY96Tbg= @@ -76,8 +76,8 @@ github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDk github.com/cloudwego/base64x v0.1.5 h1:XPciSp1xaq2VCSt6lF0phncD4koWyULpl5bUxbfCyP4= 
github.com/cloudwego/base64x v0.1.5/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w= github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY= -github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443 h1:aQ3y1lwWyqYPiWZThqv1aFbZMiM9vblcSArJRf2Irls= -github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= +github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 h1:6xNmx7iTtyBRev0+D/Tv1FZd4SCg8axKApyNyRsAt/w= +github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5/go.mod h1:KdCmV+x/BuvyMxRnYBlmVaq4OLiKW6iRQfvC62cvdkI= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= github.com/containerd/errdefs v1.0.0/go.mod h1:+YBYIdtsnF4Iw6nWZhJcqGSg/dwvV7tyJ/kCkyJ2k+M= github.com/containerd/errdefs/pkg v0.3.0 h1:9IKJ06FvyNlexW690DXuQNx2KA2cUJXx151Xdx3ZPPE= @@ -98,14 +98,14 @@ github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4 github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= -github.com/envoyproxy/go-control-plane v0.13.4 h1:zEqyPVyku6IvWCFwux4x9RxkLOMUL+1vC9xUFv5l2/M= -github.com/envoyproxy/go-control-plane v0.13.4/go.mod h1:kDfuBlDVsSj2MjrLEtRWtHlsWIFcGyB2RMO44Dc5GZA= -github.com/envoyproxy/go-control-plane/envoy v1.32.4 h1:jb83lalDRZSpPWW2Z7Mck/8kXZ5CQAFYVjQcdVIr83A= -github.com/envoyproxy/go-control-plane/envoy v1.32.4/go.mod h1:Gzjc5k8JcJswLjAx1Zm+wSYE20UrLtt7JZMWiWQXQEw= +github.com/envoyproxy/go-control-plane v0.14.0 h1:hbG2kr4RuFj222B6+7T83thSPqLjwBIfQawTkC++2HA= +github.com/envoyproxy/go-control-plane v0.14.0/go.mod h1:NcS5X47pLl/hfqxU70yPwL9ZMkUlwlKxtAohpi2wBEU= +github.com/envoyproxy/go-control-plane/envoy v1.36.0 h1:yg/JjO5E7ubRyKX3m07GF3reDNEnfOboJ0QySbH736g= 
+github.com/envoyproxy/go-control-plane/envoy v1.36.0/go.mod h1:ty89S1YCCVruQAm9OtKeEkQLTb+Lkz0k8v9W0Oxsv98= github.com/envoyproxy/go-control-plane/ratelimit v0.1.0 h1:/G9QYbddjL25KvtKTv3an9lx6VBE2cnb8wp1vEGNYGI= github.com/envoyproxy/go-control-plane/ratelimit v0.1.0/go.mod h1:Wk+tMFAFbCXaJPzVVHnPgRKdUdwW/KdbRt94AzgRee4= -github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8= -github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU= +github.com/envoyproxy/protoc-gen-validate v1.3.0 h1:TvGH1wof4H33rezVKWSpqKz5NXWg5VPuZ0uONDT6eb4= +github.com/envoyproxy/protoc-gen-validate v1.3.0/go.mod h1:HvYl7zwPa5mffgyeTUHA9zHIH36nmrm7oCbo4YKoSWA= github.com/evanphx/json-patch v5.9.0+incompatible h1:fBXyNpNMuTTDdquAq/uisOr2lShz4oaXpDTX2bLe7ls= github.com/evanphx/json-patch v5.9.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= @@ -205,8 +205,8 @@ github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsI github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= -github.com/jackc/pgx/v5 v5.7.6 h1:rWQc5FwZSPX58r1OQmkuaNicxdmExaEz5A2DO2hUuTk= -github.com/jackc/pgx/v5 v5.7.6/go.mod h1:aruU7o91Tc2q2cFp5h4uP3f6ztExVpyVv88Xl/8Vl8M= +github.com/jackc/pgx/v5 v5.9.2 h1:3ZhOzMWnR4yJ+RW1XImIPsD1aNSz4T4fyP7zlQb56hw= +github.com/jackc/pgx/v5 v5.9.2/go.mod h1:mal1tBGAFfLHvZzaYh77YS/eC6IX9OWbRV1QIIM0Jn4= github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= github.com/jmespath/go-jmespath 
v0.4.1-0.20220621161143-b0104c826a24 h1:liMMTbpW34dhU4az1GN0pTPADwNmvoRSeoZ6PItiqnY= @@ -294,8 +294,8 @@ github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0t github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/spiffe/go-spiffe/v2 v2.5.0 h1:N2I01KCUkv1FAjZXJMwh95KK1ZIQLYbPfhaxw8WS0hE= -github.com/spiffe/go-spiffe/v2 v2.5.0/go.mod h1:P+NxobPc6wXhVtINNtFjNWGBTreew1GBUCwT2wPmb7g= +github.com/spiffe/go-spiffe/v2 v2.6.0 h1:l+DolpxNWYgruGQVV0xsfeya3CsC7m8iBzDnMpsbLuo= +github.com/spiffe/go-spiffe/v2 v2.6.0/go.mod h1:gm2SeUoMZEtpnzPNs2Csc0D/gX33k1xIx7lEzqblHEs= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= @@ -326,14 +326,12 @@ github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -github.com/zeebo/errs v1.4.0 h1:XNdoD/RRMKP7HD0UhJnIzUy74ISdGGxURlYG8HSWSfM= -github.com/zeebo/errs v1.4.0/go.mod h1:sgbWHsvVuTPHcqJJGQ1WhI5KbWlHYz+2+2C/LSEtCw4= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= -go.opentelemetry.io/contrib/detectors/gcp v1.36.0 
h1:F7q2tNlCaHY9nMKHR6XH9/qkp8FktLnIcy6jJNyOCQw= -go.opentelemetry.io/contrib/detectors/gcp v1.36.0/go.mod h1:IbBN8uAIIx734PTonTPxAxnjc2pQTxWNkwfstZ+6H2k= +go.opentelemetry.io/contrib/detectors/gcp v1.39.0 h1:kWRNZMsfBHZ+uHjiH4y7Etn2FK26LAGkNFw7RHv1DhE= +go.opentelemetry.io/contrib/detectors/gcp v1.39.0/go.mod h1:t/OGqzHBa5v6RHZwrDBJ2OirWc+4q/w2fTbLZwAKjTk= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.61.0 h1:q4XOmH/0opmeuJtPsbFNivyl7bCt7yRBbeEm2sC/XtQ= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.61.0/go.mod h1:snMWehoOh2wsEwnvvwtDyFCxVeDAODenXHtn5vzrKjo= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.62.0 h1:Hf9xI/XLML9ElpiHVDNwvqI0hIFlzV8dgIr35kV1kRU= @@ -366,8 +364,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.45.0 h1:jMBrvKuj23MTlT0bQEOBcAE0mjg8mK9RXFhRH6nyF3Q= -golang.org/x/crypto v0.45.0/go.mod h1:XTGrrkGJve7CYK7J8PEww4aY7gM3qMCElcJQ8n8JdX4= +golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU= +golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= @@ -379,8 +377,8 @@ golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHl golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod 
h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= -golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= +golang.org/x/mod v0.30.0 h1:fDEXFVZ/fmCKProc/yAXXUijritrDzahmwwefnjoPFk= +golang.org/x/mod v0.30.0/go.mod h1:lAsf5O2EvJeSFMiBxXDki7sCgAxEUcZHXoXMKT4GJKc= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -393,13 +391,13 @@ golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= -golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= +golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= -golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= -golang.org/x/oauth2 
v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= +golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= +golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -407,8 +405,8 @@ golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= -golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= +golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -422,14 +420,14 @@ golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= -golang.org/x/term v0.37.0 
h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= -golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= +golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q= +golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= -golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= +golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= @@ -447,8 +445,8 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= -golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= +golang.org/x/tools v0.39.0 h1:ik4ho21kwuQln40uelmciQPp9SipgNDdrafrYA4TmQQ= +golang.org/x/tools v0.39.0/go.mod h1:JnefbkDPyD8UU2kI5fuf8ZX4/yUeh9W877ZeBONxUqQ= 
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -475,17 +473,17 @@ google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRn google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20250603155806-513f23925822 h1:rHWScKit0gvAPuOnu87KpaYtjK5zBMLcULh7gxkCXu4= google.golang.org/genproto v0.0.0-20250603155806-513f23925822/go.mod h1:HubltRL7rMh0LfnQPkMH4NPDFEWp0jw3vixw7jEM53s= -google.golang.org/genproto/googleapis/api v0.0.0-20250818200422-3122310a409c h1:AtEkQdl5b6zsybXcbz00j1LwNodDuH6hVifIaNqk7NQ= -google.golang.org/genproto/googleapis/api v0.0.0-20250818200422-3122310a409c/go.mod h1:ea2MjsO70ssTfCjiwHgI0ZFqcw45Ksuk2ckf9G468GA= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250826171959-ef028d996bc1 h1:pmJpJEvT846VzausCQ5d7KreSROcDqmO388w5YbnltA= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250826171959-ef028d996bc1/go.mod h1:GmFNa4BdJZ2a8G+wCe9Bg3wwThLrJun751XstdJt5Og= +google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 h1:fCvbg86sFXwdrl5LgVcTEvNC+2txB5mgROGmRL5mrls= +google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:+rXWjjaukWZun3mLfjmVnQi18E1AsFbDN9QdJ5YXLto= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww= +google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= 
google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= -google.golang.org/grpc v1.75.1 h1:/ODCNEuf9VghjgO3rqLcfg8fiOP0nSluljWFlDxELLI= -google.golang.org/grpc v1.75.1/go.mod h1:JtPAzKiq4v1xcAB2hydNlWI2RnF85XXcV0mhKXr2ecQ= -google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= -google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= +google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= +google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= +google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= +google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= diff --git a/maas-api/internal/api_keys/handler.go b/maas-api/internal/api_keys/handler.go index 06b360b07..539622ddc 100644 --- a/maas-api/internal/api_keys/handler.go +++ b/maas-api/internal/api_keys/handler.go @@ -136,7 +136,7 @@ func (h *Handler) GetAPIKey(c *gin.Context) { // If expiresIn is not provided, defaults to API_KEY_MAX_EXPIRATION_DAYS (or 1hr for ephemeral). // Users can only create keys for themselves - the key inherits the user's groups. 
type CreateAPIKeyRequest struct { - Name string `json:"name,omitempty"` // Required for regular keys, optional for ephemeral + Name string `json:"name,omitempty"` // Required for regular keys, optional for ephemeral Description string `json:"description,omitempty"` Subscription string `json:"subscription,omitempty"` // Optional MaaSSubscription name; when omitted, highest-priority accessible subscription is used ExpiresIn *token.Duration `json:"expiresIn,omitempty"` // Optional - defaults to API_KEY_MAX_EXPIRATION_DAYS (1hr for ephemeral) @@ -194,6 +194,7 @@ func (h *Handler) CreateAPIKey(c *gin.Context) { var notFound *subscription.SubscriptionNotFoundError var accessDenied *subscription.AccessDeniedError var noSub *subscription.NoSubscriptionError + var modelUnhealthy *subscription.ModelUnhealthyError if errors.As(err, ¬Found) || errors.As(err, &accessDenied) || errors.As(err, &noSub) { c.JSON(http.StatusBadRequest, gin.H{ "error": apiKeySubscriptionResolutionErrMsg, @@ -201,6 +202,19 @@ func (h *Handler) CreateAPIKey(c *gin.Context) { }) return } + if errors.As(err, &modelUnhealthy) { + // Unreconciled (empty phase): 400 - temporary state, retry later + // Failed phase: 403 - authorization denied, subscription broken + statusCode := http.StatusBadRequest + if modelUnhealthy.Phase == "Failed" { + statusCode = http.StatusForbidden + } + c.JSON(statusCode, gin.H{ + "error": modelUnhealthy.Message, + "code": "subscription_not_ready", + }) + return + } c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to create API key"}) return } diff --git a/maas-api/internal/api_keys/handler_test.go b/maas-api/internal/api_keys/handler_test.go index 68a97196e..91f741ada 100644 --- a/maas-api/internal/api_keys/handler_test.go +++ b/maas-api/internal/api_keys/handler_test.go @@ -29,13 +29,13 @@ type fixedSubSelector struct{} func (fixedSubSelector) Select(_ []string, _ string, requested string, _ string) (*subscription.SelectResponse, error) { if requested != "" { - 
return &subscription.SelectResponse{Name: requested}, nil + return &subscription.SelectResponse{Name: requested, Phase: "Active"}, nil } - return &subscription.SelectResponse{Name: testSubscriptionName}, nil + return &subscription.SelectResponse{Name: testSubscriptionName, Phase: "Active"}, nil } func (fixedSubSelector) SelectHighestPriority(_ []string, _ string) (*subscription.SelectResponse, error) { - return &subscription.SelectResponse{Name: testSubscriptionName}, nil + return &subscription.SelectResponse{Name: testSubscriptionName, Phase: "Active"}, nil } // errSubSelector returns fixed errors from Select / SelectHighestPriority (for handler HTTP mapping tests). @@ -48,14 +48,14 @@ func (e errSubSelector) Select(_ []string, _ string, _ string, _ string) (*subsc if e.selectErr != nil { return nil, e.selectErr } - return &subscription.SelectResponse{Name: "stub-sub"}, nil + return &subscription.SelectResponse{Name: "stub-sub", Phase: "Active"}, nil } func (e errSubSelector) SelectHighestPriority(_ []string, _ string) (*subscription.SelectResponse, error) { if e.highestPriorityErr != nil { return nil, e.highestPriorityErr } - return &subscription.SelectResponse{Name: testSubscriptionName}, nil + return &subscription.SelectResponse{Name: testSubscriptionName, Phase: "Active"}, nil } // Test constants. 
diff --git a/maas-api/internal/api_keys/service_test.go b/maas-api/internal/api_keys/service_test.go index 932f82d6d..af63a91a6 100644 --- a/maas-api/internal/api_keys/service_test.go +++ b/maas-api/internal/api_keys/service_test.go @@ -18,13 +18,13 @@ type serviceTestSubSelector struct{} func (serviceTestSubSelector) Select(_ []string, _ string, requested string, _ string) (*subscription.SelectResponse, error) { if requested != "" { - return &subscription.SelectResponse{Name: requested}, nil + return &subscription.SelectResponse{Name: requested, Phase: "Active"}, nil } - return &subscription.SelectResponse{Name: "default-sub"}, nil + return &subscription.SelectResponse{Name: "default-sub", Phase: "Active"}, nil } func (serviceTestSubSelector) SelectHighestPriority(_ []string, _ string) (*subscription.SelectResponse, error) { - return &subscription.SelectResponse{Name: "default-sub"}, nil + return &subscription.SelectResponse{Name: "default-sub", Phase: "Active"}, nil } func createTestService(t *testing.T) (*api_keys.Service, *api_keys.MockStore) { @@ -641,7 +641,7 @@ func (s subSelectorStub) Select(_ []string, _ string, requested string, _ string if s.selectErr != nil { return nil, s.selectErr } - return &subscription.SelectResponse{Name: requested}, nil + return &subscription.SelectResponse{Name: requested, Phase: "Active"}, nil } func (s subSelectorStub) SelectHighestPriority(_ []string, _ string) (*subscription.SelectResponse, error) { @@ -652,7 +652,7 @@ func (s subSelectorStub) SelectHighestPriority(_ []string, _ string) (*subscript if name == "" { name = "from-priority" } - return &subscription.SelectResponse{Name: name}, nil + return &subscription.SelectResponse{Name: name, Phase: "Active"}, nil } func TestCreateAPIKey_Subscription(t *testing.T) { @@ -799,3 +799,116 @@ func createTestAPIKey(t *testing.T) (string, string) { require.NoError(t, err) return plainKey, hash } + +func TestCreateAPIKey_ValidatesSubscriptionPhase(t *testing.T) { + ctx := 
context.Background() + cfg := &config.Config{} + user := "testuser" + groups := []string{"g1"} + + tests := []struct { + name string + phase string + deleting bool + expectError bool + errorMsg string + }{ + { + name: "rejects Failed subscription (prevents key spam)", + phase: "Failed", + deleting: false, + expectError: true, + errorMsg: "Failed phase", + }, + { + name: "allows Pending subscription (enforcement at inference time)", + phase: "Pending", + deleting: false, + expectError: false, + }, + { + name: "allows Degraded subscription (enforcement at inference time)", + phase: "Degraded", + deleting: false, + expectError: false, + }, + { + name: "rejects unreconciled subscription (empty phase)", + phase: "", + deleting: false, + expectError: true, + errorMsg: "unreconciled", + }, + { + name: "allows Active subscription", + phase: "Active", + deleting: false, + expectError: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Mock selector that returns subscription with specific health status + selector := &mockHealthSelector{ + phase: tt.phase, + deleting: tt.deleting, + } + + store := api_keys.NewMockStore() + svc := api_keys.NewServiceWithLogger(store, cfg, selector, logger.Development()) + + _, err := svc.CreateAPIKey(ctx, user, groups, "test-key", "", nil, false, "test-sub") + + if tt.expectError { + require.Error(t, err, "Expected error for %s", tt.name) + var modelErr *subscription.ModelUnhealthyError + require.ErrorAs(t, err, &modelErr, "Expected ModelUnhealthyError") + require.Contains(t, modelErr.Message, tt.errorMsg, "Error message should contain: %s", tt.errorMsg) + } else { + require.NoError(t, err, "Expected no error for %s", tt.name) + } + }) + } +} + +// mockHealthSelector implements SubscriptionSelector for health testing. 
+type mockHealthSelector struct { + phase string + deleting bool +} + +func (m *mockHealthSelector) Select(_ []string, _ string, _ string, _ string) (*subscription.SelectResponse, error) { + // Simulate health validation that real selector does for API key creation + // API key creation path blocks Failed and unreconciled (empty phase) + if m.phase == "" { + return nil, &subscription.ModelUnhealthyError{ + Subscription: "test-sub", + Phase: "", + Reason: "SubscriptionNotReady", + Message: "subscription is unreconciled (no status.phase set)", + } + } + if m.phase == "Failed" { + return nil, &subscription.ModelUnhealthyError{ + Subscription: "test-sub", + Phase: "Failed", + Reason: "SubscriptionNotReady", + Message: "subscription is in Failed phase (cannot create API keys)", + } + } + + resp := &subscription.SelectResponse{ + Name: "test-sub", + Phase: m.phase, + } + if m.deleting { + resp.DeletionTimestamp = "2026-04-08T12:00:00Z" + } + return resp, nil +} + +func (m *mockHealthSelector) SelectHighestPriority(_ []string, _ string) (*subscription.SelectResponse, error) { + //nolint:unqueryvet // False positive - not a SQL query + return m.Select(nil, "", "", "") +} diff --git a/maas-api/internal/config/config.go b/maas-api/internal/config/config.go index 3c9627163..f42b5b493 100644 --- a/maas-api/internal/config/config.go +++ b/maas-api/internal/config/config.go @@ -46,6 +46,12 @@ type Config struct { // Default: 30 days. Minimum: 1 day. APIKeyMaxExpirationDays int + // AccessCheckTimeoutSeconds bounds the total duration of model access validation. + // This limits the staleness window between when access is checked and when the + // response reaches the client. Models whose probes don't complete within this + // window are excluded (fail-closed). Default: 15 seconds. Minimum: 1 second. 
+ AccessCheckTimeoutSeconds int + // Deprecated flag (backward compatibility with pre-TLS version) deprecatedHTTPPort string } @@ -56,6 +62,7 @@ func Load() *Config { gatewayName := env.GetString("GATEWAY_NAME", constant.DefaultGatewayName) secure, _ := env.GetBool("SECURE", false) maxExpirationDays, _ := env.GetInt("API_KEY_MAX_EXPIRATION_DAYS", constant.DefaultAPIKeyMaxExpirationDays) + accessCheckTimeoutSeconds, _ := env.GetInt("ACCESS_CHECK_TIMEOUT_SECONDS", 15) c := &Config{ Name: env.GetString("INSTANCE_NAME", gatewayName), @@ -69,6 +76,7 @@ func Load() *Config { DebugMode: debugMode, DBConnectionURL: "", // Loaded from K8s secret via LoadDatabaseURL() APIKeyMaxExpirationDays: maxExpirationDays, + AccessCheckTimeoutSeconds: accessCheckTimeoutSeconds, // Deprecated env var (backward compatibility with pre-TLS version) deprecatedHTTPPort: env.GetString("PORT", ""), } @@ -141,6 +149,10 @@ func (c *Config) Validate() error { return errors.New("API_KEY_MAX_EXPIRATION_DAYS must be at least 1") } + if c.AccessCheckTimeoutSeconds < 1 { + return errors.New("ACCESS_CHECK_TIMEOUT_SECONDS must be at least 1") + } + return nil } diff --git a/maas-api/internal/config/config_test.go b/maas-api/internal/config/config_test.go index 4d4958409..30073ea62 100644 --- a/maas-api/internal/config/config_test.go +++ b/maas-api/internal/config/config_test.go @@ -120,6 +120,7 @@ func TestValidate(t *testing.T) { DBConnectionURL: "postgresql://localhost/test", Secure: false, APIKeyMaxExpirationDays: 30, + AccessCheckTimeoutSeconds: 15, MaaSSubscriptionNamespace: "models-as-a-service", }, }, @@ -129,6 +130,7 @@ func TestValidate(t *testing.T) { DBConnectionURL: "postgresql://localhost/test", TLS: TLSConfig{SelfSigned: true, MinVersion: TLSVersion(tls.VersionTLS12)}, APIKeyMaxExpirationDays: 30, + AccessCheckTimeoutSeconds: 15, MaaSSubscriptionNamespace: "models-as-a-service", }, }, @@ -138,6 +140,7 @@ func TestValidate(t *testing.T) { DBConnectionURL: "postgresql://localhost/test", TLS: 
TLSConfig{Cert: "/cert.pem", Key: "/key.pem", MinVersion: TLSVersion(tls.VersionTLS12)}, APIKeyMaxExpirationDays: 30, + AccessCheckTimeoutSeconds: 15, MaaSSubscriptionNamespace: "models-as-a-service", }, }, @@ -146,6 +149,7 @@ func TestValidate(t *testing.T) { cfg: Config{ DBConnectionURL: "postgresql://localhost/test", APIKeyMaxExpirationDays: 1, + AccessCheckTimeoutSeconds: 15, MaaSSubscriptionNamespace: "models-as-a-service", }, }, @@ -154,6 +158,7 @@ func TestValidate(t *testing.T) { cfg: Config{ DBConnectionURL: "postgresql://localhost/test", APIKeyMaxExpirationDays: 30, + AccessCheckTimeoutSeconds: 15, MaaSSubscriptionNamespace: "models-as-a-service", }, }, @@ -162,6 +167,7 @@ func TestValidate(t *testing.T) { cfg: Config{ DBConnectionURL: "postgresql://localhost/test", APIKeyMaxExpirationDays: 365, + AccessCheckTimeoutSeconds: 15, MaaSSubscriptionNamespace: "models-as-a-service", }, }, diff --git a/maas-api/internal/handlers/models.go b/maas-api/internal/handlers/models.go index 155a41aa5..a4db61782 100644 --- a/maas-api/internal/handlers/models.go +++ b/maas-api/internal/handlers/models.go @@ -5,6 +5,7 @@ import ( "net/http" "sort" "strings" + "time" "github.com/gin-gonic/gin" "github.com/openai/openai-go/v2/packages/pagination" @@ -170,9 +171,9 @@ func (h *ModelsHandler) addSubscriptionIfNew(model *models.Model, subInfo models model.Subscriptions = append(model.Subscriptions, subInfo) } -// ListLLMs handles GET /v1/models. -func (h *ModelsHandler) ListLLMs(c *gin.Context) { - // Require Authorization header and pass it through as-is to list and access validation. +// extractAndValidateAuth validates and extracts authentication details. +// Returns authHeader, requestedSubscription, isAPIKeyRequest, and error. 
+func (h *ModelsHandler) extractAndValidateAuth(c *gin.Context) (string, string, bool, error) { authHeader := strings.TrimSpace(c.GetHeader("Authorization")) if authHeader == "" { h.logger.Error("Authorization header missing") @@ -181,14 +182,10 @@ func (h *ModelsHandler) ListLLMs(c *gin.Context) { "message": "Authorization required", "type": "authentication_error", }}) - return + return "", "", false, errors.New("missing authorization") } // Extract x-maas-subscription header. - // For API keys: Authorino injects this from auth.metadata.apiKeyValidation.subscription - // For user tokens: This header is not present (Authorino doesn't inject it) - // Note: If client sends x-maas-subscription header, there may be multiple values. - // Authorino appends its value, so we take the last non-empty value. requestedSubscription := "" headerValues := c.Request.Header.Values("X-Maas-Subscription") for i := len(headerValues) - 1; i >= 0; i-- { @@ -208,39 +205,133 @@ func (h *ModelsHandler) ListLLMs(c *gin.Context) { "message": "API key has no subscription bound", "type": "permission_error", }}) + return "", "", false, errors.New("api key missing subscription") + } + + return authHeader, requestedSubscription, isAPIKeyRequest, nil +} + +// getUserContextIfNeeded retrieves user context from the request if subscription selector is configured. 
+func (h *ModelsHandler) getUserContextIfNeeded(c *gin.Context) (*token.UserContext, error) { + if h.subscriptionSelector == nil { + return nil, nil + } + + userContextVal, exists := c.Get("user") + if !exists { + h.logger.Error("User context not found - ExtractUserInfo middleware not called") + c.JSON(http.StatusInternalServerError, gin.H{ + "error": gin.H{ + "message": "Internal server error", + "type": "server_error", + }}) + return nil, errors.New("user context not found") + } + + userContext, ok := userContextVal.(*token.UserContext) + if !ok { + h.logger.Error("Invalid user context type") + c.JSON(http.StatusInternalServerError, gin.H{ + "error": gin.H{ + "message": "Internal server error", + "type": "server_error", + }}) + return nil, errors.New("invalid user context type") + } + + return userContext, nil +} + +// aggregateModelsFromSubscriptions filters and aggregates models across multiple subscriptions. +func (h *ModelsHandler) aggregateModelsFromSubscriptions( + c *gin.Context, + list []models.Model, + subscriptionsToUse []*subscription.SelectResponse, + authHeader string, +) []models.Model { + type modelKey struct { + id string + url string + ownedBy string + } + modelsByKey := make(map[modelKey]*models.Model) + + for _, sub := range subscriptionsToUse { + // Pre-filter by modelRefs if available (optimization to reduce HTTP calls) + modelsToCheck := list + if len(sub.ModelRefs) > 0 { + h.logger.Debug("Pre-filtering models by subscription modelRefs", + "subscription", sub.Name, + "totalModels", len(list), + "modelRefsCount", len(sub.ModelRefs), + ) + modelsToCheck = filterModelsBySubscription(list, sub.ModelRefs) + h.logger.Debug("After modelRef filtering", "modelsToCheck", len(modelsToCheck)) + } + + probeSubscriptionHeader := sub.Name + h.logger.Debug("Filtering models by subscription", "subscription", sub.Name, "modelCount", len(modelsToCheck), "probeWithSubscriptionHeader", probeSubscriptionHeader != "") + filteredModels := 
h.modelMgr.FilterModelsByAccess(c.Request.Context(), modelsToCheck, authHeader, probeSubscriptionHeader) + + for _, model := range filteredModels { + subInfo := models.SubscriptionInfo{ + Name: sub.Name, + DisplayName: sub.DisplayName, + Description: sub.Description, + } + + urlStr := "" + if model.URL != nil { + urlStr = model.URL.String() + } + key := modelKey{id: model.ID, url: urlStr, ownedBy: model.OwnedBy} + + if existingModel, exists := modelsByKey[key]; exists { + h.addSubscriptionIfNew(existingModel, subInfo) + } else { + model.Subscriptions = []models.SubscriptionInfo{subInfo} + modelsByKey[key] = &model + } + } + } + + // Convert map to slice with deterministic ordering + keys := make([]modelKey, 0, len(modelsByKey)) + for k := range modelsByKey { + keys = append(keys, k) + } + sort.Slice(keys, func(i, j int) bool { + if keys[i].id != keys[j].id { + return keys[i].id < keys[j].id + } + if keys[i].url != keys[j].url { + return keys[i].url < keys[j].url + } + return keys[i].ownedBy < keys[j].ownedBy + }) + + modelList := make([]models.Model, 0, len(keys)) + for _, k := range keys { + modelList = append(modelList, *modelsByKey[k]) + } + return modelList +} + +// ListLLMs handles GET /v1/models. 
+func (h *ModelsHandler) ListLLMs(c *gin.Context) { + // Validate and extract authentication details + authHeader, requestedSubscription, isAPIKeyRequest, err := h.extractAndValidateAuth(c) + if err != nil { return } - // Determine behavior based on auth method: - // - API key with subscription → filter by that subscription (requestedSubscription != "") - // - User token → return all accessible models (requestedSubscription == "") + // Determine behavior based on auth method returnAllModels := !isAPIKeyRequest && requestedSubscription == "" // Get user context for subscription selection - var userContext *token.UserContext - if h.subscriptionSelector != nil { - // Extract user info from context (set by ExtractUserInfo middleware) - userContextVal, exists := c.Get("user") - if !exists { - h.logger.Error("User context not found - ExtractUserInfo middleware not called") - c.JSON(http.StatusInternalServerError, gin.H{ - "error": gin.H{ - "message": "Internal server error", - "type": "server_error", - }}) - return - } - var ok bool - userContext, ok = userContextVal.(*token.UserContext) - if !ok { - h.logger.Error("Invalid user context type") - c.JSON(http.StatusInternalServerError, gin.H{ - "error": gin.H{ - "message": "Internal server error", - "type": "server_error", - }}) - return - } + userContext, err := h.getUserContextIfNeeded(c) + if err != nil { + return } // Log the authentication method and filtering behavior @@ -260,6 +351,7 @@ func (h *ModelsHandler) ListLLMs(c *gin.Context) { // Initialize to empty slice (not nil) so JSON marshals as [] instead of null modelList := []models.Model{} + accessCheckedAt := time.Now().UTC() if h.maasModelRefLister != nil { h.logger.Debug("Listing models from MaaSModelRef cache (all namespaces)") list, err := models.ListFromMaaSModelRefLister(h.maasModelRefLister) @@ -286,86 +378,21 @@ func (h *ModelsHandler) ListLLMs(c *gin.Context) { } } else { // Filter models by subscription(s) and aggregate subscriptions - // Deduplication 
key is (model ID, URL, OwnedBy) - models with the same ID, URL, and - // MaaSModelRef (namespace/name) are the same instance and should have their - // subscriptions aggregated into an array. - type modelKey struct { - id string - url string - ownedBy string - } - modelsByKey := make(map[modelKey]*models.Model) - - for _, sub := range subscriptionsToUse { - // Pre-filter by modelRefs if available (optimization to reduce HTTP calls) - modelsToCheck := list - if len(sub.ModelRefs) > 0 { - h.logger.Debug("Pre-filtering models by subscription modelRefs", - "subscription", sub.Name, - "totalModels", len(list), - "modelRefsCount", len(sub.ModelRefs), - ) - modelsToCheck = filterModelsBySubscription(list, sub.ModelRefs) - h.logger.Debug("After modelRef filtering", "modelsToCheck", len(modelsToCheck)) - } - - // Always probe with the subscription header for access validation - // For API keys: uses the subscription bound to the key (bare name format) - // For user tokens: uses each accessible subscription to check which models are available - // Using bare name format to match what's stored in API keys - probeSubscriptionHeader := sub.Name - h.logger.Debug("Filtering models by subscription", "subscription", sub.Name, "modelCount", len(modelsToCheck), "probeWithSubscriptionHeader", probeSubscriptionHeader != "") - filteredModels := h.modelMgr.FilterModelsByAccess(c.Request.Context(), modelsToCheck, authHeader, probeSubscriptionHeader) - - for _, model := range filteredModels { - subInfo := models.SubscriptionInfo{ - Name: sub.Name, - DisplayName: sub.DisplayName, - Description: sub.Description, - } - - // Create key from model ID, URL, and OwnedBy (namespace/name of MaaSModelRef) - urlStr := "" - if model.URL != nil { - urlStr = model.URL.String() - } - key := modelKey{id: model.ID, url: urlStr, ownedBy: model.OwnedBy} - - if existingModel, exists := modelsByKey[key]; exists { - // Model already exists - append subscription if not already present - 
h.addSubscriptionIfNew(existingModel, subInfo) - } else { - // New model - create entry with subscriptions array - model.Subscriptions = []models.SubscriptionInfo{subInfo} - modelsByKey[key] = &model - } - } - } - - // Convert map to slice with deterministic ordering - keys := make([]modelKey, 0, len(modelsByKey)) - for k := range modelsByKey { - keys = append(keys, k) - } - sort.Slice(keys, func(i, j int) bool { - if keys[i].id != keys[j].id { - return keys[i].id < keys[j].id - } - if keys[i].url != keys[j].url { - return keys[i].url < keys[j].url - } - return keys[i].ownedBy < keys[j].ownedBy - }) - for _, k := range keys { - modelList = append(modelList, *modelsByKey[k]) - } + modelList = h.aggregateModelsFromSubscriptions(c, list, subscriptionsToUse, authHeader) } + accessCheckedAt = time.Now().UTC() h.logger.Debug("Access validation complete", "listed", len(list), "accessible", len(modelList), "subscriptions", len(subscriptionsToUse)) } else { h.logger.Debug("MaaSModelRef lister not configured, returning empty model list") } + // Prevent clients and proxies from caching authorization-checked model listings. + // The access check is a point-in-time snapshot; auth policies may change at any moment. + // X-Access-Checked-At lets clients assess the freshness of the authorization decision. 
+ c.Header("Cache-Control", "no-store") + c.Header("X-Access-Checked-At", accessCheckedAt.Format(time.RFC3339)) + h.logger.Debug("GET /v1/models returning models", "count", len(modelList)) c.JSON(http.StatusOK, pagination.Page[models.Model]{ Object: "list", diff --git a/maas-api/internal/handlers/models_test.go b/maas-api/internal/handlers/models_test.go index 2a1f25259..5a5674188 100644 --- a/maas-api/internal/handlers/models_test.go +++ b/maas-api/internal/handlers/models_test.go @@ -90,6 +90,12 @@ func (f *fakeSubscriptionLister) List() ([]*unstructured.Unstructured, error) { map[string]any{"name": "premium-users"}, }, "spec", "owner", "groups") + // Set status.phase to Active (required for subscription filtering) + _ = unstructured.SetNestedField(sub.Object, "Active", "status", "phase") + _ = unstructured.SetNestedSlice(sub.Object, []any{ + map[string]any{"type": "Ready", "status": "True"}, + }, "status", "conditions") + return []*unstructured.Unstructured{sub}, nil } @@ -115,6 +121,12 @@ func (f fakeMultiSubscriptionLister) List() ([]*unstructured.Unstructured, error } _ = unstructured.SetNestedSlice(sub.Object, groupSlice, "spec", "owner", "groups") + // Set status.phase to Active (required for subscription filtering) + _ = unstructured.SetNestedField(sub.Object, "Active", "status", "phase") + _ = unstructured.SetNestedSlice(sub.Object, []any{ + map[string]any{"type": "Ready", "status": "True"}, + }, "status", "conditions") + result = append(result, sub) } return result, nil @@ -320,7 +332,7 @@ func TestListingModels(t *testing.T) { } router, _ := fixtures.SetupTestServer(t, config) - modelMgr, errMgr := models.NewManager(testLogger) + modelMgr, errMgr := models.NewManager(testLogger, 15) require.NoError(t, errMgr) // Set up test fixtures @@ -350,6 +362,16 @@ func TestListingModels(t *testing.T) { require.Equal(t, http.StatusOK, w.Code, "Expected status OK") + // Verify anti-caching and freshness headers (authorization timing race mitigation) + 
assert.Equal(t, "no-store", w.Header().Get("Cache-Control"), + "Expected Cache-Control: no-store to prevent caching of authorization-checked listings") + accessCheckedAt := w.Header().Get("X-Access-Checked-At") + assert.NotEmpty(t, accessCheckedAt, "Expected X-Access-Checked-At header with RFC3339 timestamp") + if accessCheckedAt != "" { + _, parseErr := time.Parse(time.RFC3339, accessCheckedAt) + require.NoError(t, parseErr, "X-Access-Checked-At should be valid RFC3339") + } + var response pagination.Page[models.Model] err = json.Unmarshal(w.Body.Bytes(), &response) require.NoError(t, err, "Failed to unmarshal response body") @@ -425,7 +447,7 @@ func TestListingModelsWithSubscriptionHeader(t *testing.T) { } router, _ := fixtures.SetupTestServer(t, config) - modelMgr, errMgr := models.NewManager(testLogger) + modelMgr, errMgr := models.NewManager(testLogger, 15) require.NoError(t, errMgr) _, cleanup := fixtures.StubTokenProviderAPIs(t) @@ -637,6 +659,12 @@ func TestListModels_ReturnAllModels(t *testing.T) { sub.SetAnnotations(annotations) } + // Set status.phase to Active (required for subscription filtering) + _ = unstructured.SetNestedField(sub.Object, "Active", "status", "phase") + _ = unstructured.SetNestedSlice(sub.Object, []any{ + map[string]any{"type": "Ready", "status": "True"}, + }, "status", "conditions") + return sub } @@ -647,7 +675,7 @@ func TestListModels_ReturnAllModels(t *testing.T) { }, } - modelMgr, err := models.NewManager(testLogger) + modelMgr, err := models.NewManager(testLogger, 15) require.NoError(t, err) subscriptionSelector := subscription.NewSelector(testLogger, subscriptionLister) @@ -819,6 +847,13 @@ func TestListModels_DeduplicationBySubscription(t *testing.T) { "groups": groupSlice, }, }, "spec") + + // Set status.phase to Active (required for subscription filtering) + _ = unstructured.SetNestedField(sub.Object, "Active", "status", "phase") + _ = unstructured.SetNestedSlice(sub.Object, []any{ + map[string]any{"type": "Ready", 
"status": "True"}, + }, "status", "conditions") + return sub } @@ -829,7 +864,7 @@ func TestListModels_DeduplicationBySubscription(t *testing.T) { }, } - modelMgr, err := models.NewManager(testLogger) + modelMgr, err := models.NewManager(testLogger, 15) require.NoError(t, err) subscriptionSelector := subscription.NewSelector(testLogger, subscriptionLister) @@ -931,6 +966,13 @@ func TestListModels_DifferentModelRefsWithSameModelID(t *testing.T) { "groups": groupSlice, }, }, "spec") + + // Set status.phase to Active (required for subscription filtering) + _ = unstructured.SetNestedField(sub.Object, "Active", "status", "phase") + _ = unstructured.SetNestedSlice(sub.Object, []any{ + map[string]any{"type": "Ready", "status": "True"}, + }, "status", "conditions") + return sub } @@ -940,7 +982,7 @@ func TestListModels_DifferentModelRefsWithSameModelID(t *testing.T) { }, } - modelMgr, err := models.NewManager(testLogger) + modelMgr, err := models.NewManager(testLogger, 15) require.NoError(t, err) subscriptionSelector := subscription.NewSelector(testLogger, subscriptionLister) @@ -1031,6 +1073,13 @@ func TestListModels_DifferentModelRefsWithSameURLAndModelID(t *testing.T) { "groups": groupSlice, }, }, "spec") + + // Set status.phase to Active (required for subscription filtering) + _ = unstructured.SetNestedField(sub.Object, "Active", "status", "phase") + _ = unstructured.SetNestedSlice(sub.Object, []any{ + map[string]any{"type": "Ready", "status": "True"}, + }, "status", "conditions") + return sub } @@ -1040,7 +1089,7 @@ func TestListModels_DifferentModelRefsWithSameURLAndModelID(t *testing.T) { }, } - modelMgr, err := models.NewManager(testLogger) + modelMgr, err := models.NewManager(testLogger, 15) require.NoError(t, err) subscriptionSelector := subscription.NewSelector(testLogger, subscriptionLister) @@ -1129,6 +1178,13 @@ func TestListModels_DifferentModelRefsWithSameModelIDAndDifferentSubscriptions(t "groups": groupSlice, }, }, "spec") + + // Set status.phase to 
Active (required for subscription filtering) + _ = unstructured.SetNestedField(sub.Object, "Active", "status", "phase") + _ = unstructured.SetNestedSlice(sub.Object, []any{ + map[string]any{"type": "Ready", "status": "True"}, + }, "status", "conditions") + return sub } @@ -1139,7 +1195,7 @@ func TestListModels_DifferentModelRefsWithSameModelIDAndDifferentSubscriptions(t }, } - modelMgr, err := models.NewManager(testLogger) + modelMgr, err := models.NewManager(testLogger, 15) require.NoError(t, err) subscriptionSelector := subscription.NewSelector(testLogger, subscriptionLister) diff --git a/maas-api/internal/models/discovery.go b/maas-api/internal/models/discovery.go index a08fd1d64..eb030615d 100644 --- a/maas-api/internal/models/discovery.go +++ b/maas-api/internal/models/discovery.go @@ -32,28 +32,43 @@ const maxModelsResponseBytes int64 = 4 << 20 // 4 MiB // HTTP client and concurrency for access-validation probes. const ( - httpClientTimeout = 5 * time.Second httpMaxIdleConns = 100 httpIdleConnTimeout = 90 * time.Second maxDiscoveryConcurrency = 10 + + // defaultAccessCheckTimeout bounds the total duration of FilterModelsByAccess. + // This limits the staleness window between when access is checked and when + // the response reaches the client. Models whose probes don't complete within + // this window are excluded (fail-closed). + defaultAccessCheckTimeout = 15 * time.Second ) // Manager runs access validation (probe model endpoints) for models listed from MaaSModelRef. type Manager struct { - logger *logger.Logger - httpClient *http.Client + logger *logger.Logger + httpClient *http.Client + accessCheckTimeout time.Duration } // NewManager creates a Manager for filtering models by access. The client uses InsecureSkipVerify // for cluster-internal probes; auth is enforced by the gateway/model server. 
-func NewManager(log *logger.Logger) (*Manager, error) { +// accessCheckTimeoutSeconds controls the total duration bound for access validation; +// if <= 0, the default of 15 seconds is used. +func NewManager(log *logger.Logger, accessCheckTimeoutSeconds int) (*Manager, error) { if log == nil { return nil, errors.New("log is required") } + timeout := defaultAccessCheckTimeout + if accessCheckTimeoutSeconds > 0 { + timeout = time.Duration(accessCheckTimeoutSeconds) * time.Second + } return &Manager{ - logger: log, + logger: log, + accessCheckTimeout: timeout, httpClient: &http.Client{ - Timeout: httpClientTimeout, + // No per-client Timeout — each request inherits the accessCheckTimeout + // deadline via its context. This ensures that configuring a longer + // ACCESS_CHECK_TIMEOUT_SECONDS actually allows slower backends to respond. Transport: &http.Transport{ TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, //nolint:gosec // cluster-internal only MaxIdleConns: httpMaxIdleConns, @@ -65,12 +80,26 @@ func NewManager(log *logger.Logger) (*Manager, error) { } // FilterModelsByAccess returns only models the user can access by probing each model's -// /v1/models endpoint with the given Authorization and x-maas-subscription headers (passed through as-is). 2xx or 405 → include, 401/403/404 → exclude. +// /v1/models endpoint with the given Authorization and x-maas-subscription headers (passed through as-is). +// 2xx or 405 → include, 401/403/404 → exclude. // Models with nil URL are skipped. Concurrency is limited by maxDiscoveryConcurrency. +// +// Because authorization policies propagate asynchronously through the gateway, there is an +// inherent eventual-consistency window: a model listed here may become inaccessible (or vice versa) +// by the time the client acts on the response. Actual enforcement always happens at the gateway +// when the model is invoked for inference. 
Callers should set Cache-Control: no-store and expose +// a freshness timestamp via response headers so clients can assess freshness. +// +// The access check is bounded by accessCheckTimeout to limit the staleness window. func (m *Manager) FilterModelsByAccess(ctx context.Context, models []Model, authHeader string, subscriptionHeader string) []Model { if len(models) == 0 { return models } + + // Bound the total access-check duration to limit the staleness window. + ctx, cancel := context.WithTimeout(ctx, m.accessCheckTimeout) + defer cancel() + m.logger.Debug("FilterModelsByAccess: validating access for models", "count", len(models), "subscriptionHeaderProvided", subscriptionHeader != "") // Initialize to empty slice (not nil) so JSON marshals as [] instead of null when no models are accessible out := []Model{} @@ -222,7 +251,11 @@ func (m *Manager) fetchModelsWithRetry(ctx context.Context, authHeader string, s lastResult = authRes return lastResult != authRetry, nil }); err != nil { - m.logger.Debug("Access validation failed: model fetch backoff exhausted", "service", meta.ServiceName, "endpoint", meta.Endpoint, "error", err) + if errors.Is(err, context.DeadlineExceeded) || ctx.Err() == context.DeadlineExceeded { + m.logger.Debug("Access validation failed: context deadline exceeded", "service", meta.ServiceName, "endpoint", meta.Endpoint, "timeout", m.accessCheckTimeout) + } else { + m.logger.Debug("Access validation failed: model fetch backoff exhausted", "service", meta.ServiceName, "endpoint", meta.Endpoint, "error", err) + } return nil // explicit fail-closed on error } @@ -249,6 +282,10 @@ func (m *Manager) fetchModels(ctx context.Context, authHeader string, subscripti // #nosec G704 -- Intentional HTTP request to probe model endpoint for authorization check resp, err := m.httpClient.Do(req) if err != nil { + if errors.Is(err, context.DeadlineExceeded) || ctx.Err() == context.DeadlineExceeded { + m.logger.Debug("Access validation: request timed out (context 
deadline exceeded)", "service", meta.ServiceName, "endpoint", meta.Endpoint) + return nil, authDenied // fail-closed, no point retrying a deadline + } m.logger.Debug("Access validation: GET request failed", "service", meta.ServiceName, "endpoint", meta.Endpoint, "error", err) return nil, authRetry } diff --git a/maas-api/internal/subscription/handler.go b/maas-api/internal/subscription/handler.go index b62d13240..1e6446cda 100644 --- a/maas-api/internal/subscription/handler.go +++ b/maas-api/internal/subscription/handler.go @@ -69,11 +69,15 @@ func (h *Handler) SelectSubscription(c *gin.Context) { response, err := h.selector.Select(req.Groups, req.Username, req.RequestedSubscription, req.RequestedModel) if err != nil { + // NOTE: All error responses return http.StatusOK with error fields populated in SelectResponse. + // This is intentional for Authorino integration, which expects 200 OK responses with metadata + // fields (not HTTP error codes). See SelectResponse type documentation in types.go. 
var noSubErr *NoSubscriptionError var notFoundErr *SubscriptionNotFoundError var accessDeniedErr *AccessDeniedError var multipleSubsErr *MultipleSubscriptionsError var modelNotInSubErr *ModelNotInSubscriptionError + var modelUnhealthyErr *ModelUnhealthyError if errors.As(err, &noSubErr) { h.logger.Debug("No subscription found for user", @@ -134,6 +138,21 @@ func (h *Handler) SelectSubscription(c *gin.Context) { return } + if errors.As(err, &modelUnhealthyErr) { + h.logger.Debug("Requested model is unhealthy", + "subscription", modelUnhealthyErr.Subscription, + "phase", modelUnhealthyErr.Phase, + "reason", modelUnhealthyErr.Reason, + "message", modelUnhealthyErr.Message, + ) + c.JSON(http.StatusOK, SelectResponse{ + Error: "model_unhealthy", + Message: modelUnhealthyErr.Message, + Phase: modelUnhealthyErr.Phase, + }) + return + } + // All other errors are internal server errors h.logger.Error("Subscription selection failed", "error", err.Error(), diff --git a/maas-api/internal/subscription/handler_test.go b/maas-api/internal/subscription/handler_test.go index 3bdb679c6..6820e4a5d 100644 --- a/maas-api/internal/subscription/handler_test.go +++ b/maas-api/internal/subscription/handler_test.go @@ -62,6 +62,15 @@ func createTestSubscription(name string, groups []string, priority int32, orgID, }, }, }, + "status": map[string]any{ + "phase": "Active", + "conditions": []any{ + map[string]any{ + "type": "Ready", + "status": "True", + }, + }, + }, }, } } @@ -307,6 +316,15 @@ func TestHandler_SelectSubscription_UserWithoutGroups(t *testing.T) { }, }, }, + "status": map[string]any{ + "phase": "Active", + "conditions": []any{ + map[string]any{ + "type": "Ready", + "status": "True", + }, + }, + }, }, } @@ -474,6 +492,15 @@ func createTestSubscriptionWithLimit(name string, groups []string, priority int3 }, }, }, + "status": map[string]any{ + "phase": "Active", + "conditions": []any{ + map[string]any{ + "type": "Ready", + "status": "True", + }, + }, + }, }, } } @@ -619,6 +646,15 
@@ func createTestSubscriptionWithModels( }, }, }, + "status": map[string]any{ + "phase": "Active", + "conditions": []any{ + map[string]any{ + "type": "Ready", + "status": "True", + }, + }, + }, }, } } @@ -665,6 +701,15 @@ func createTestSubscriptionWithAnnotations(name string, groups []string, modelNa "priority": int64(10), "modelRefs": modelRefs, }, + "status": map[string]any{ + "phase": "Active", + "conditions": []any{ + map[string]any{ + "type": "Ready", + "status": "True", + }, + }, + }, }, } } diff --git a/maas-api/internal/subscription/selector.go b/maas-api/internal/subscription/selector.go index ea71e3c89..915037ab8 100644 --- a/maas-api/internal/subscription/selector.go +++ b/maas-api/internal/subscription/selector.go @@ -13,6 +13,15 @@ import ( "github.com/opendatahub-io/models-as-a-service/maas-api/internal/logger" ) +// Phase constants for MaaSSubscription status. +// These must match the Phase values defined in maas-controller/api/maas/v1alpha1/common_types.go. +const ( + PhasePending = "Pending" + PhaseActive = "Active" + PhaseDegraded = "Degraded" + PhaseFailed = "Failed" +) + // Lister provides access to MaaSSubscription resources from an informer cache. type Lister interface { List() ([]*unstructured.Unstructured, error) @@ -37,18 +46,22 @@ func NewSelector(log *logger.Logger, lister Lister) *Selector { // subscription represents a parsed MaaSSubscription for selection. 
type subscription struct { - Name string - Namespace string - DisplayName string - Description string - Groups []string - Users []string - Priority int32 - MaxLimit int64 - OrganizationID string - CostCenter string - Labels map[string]string - ModelRefs []ModelRefInfo + Name string + Namespace string + DisplayName string + Description string + Groups []string + Users []string + Priority int32 + MaxLimit int64 + OrganizationID string + CostCenter string + Labels map[string]string + ModelRefs []ModelRefInfo + Phase string // status.phase: "Active", "Failed", "Pending", or "" + Ready bool // computed from status.conditions Ready condition + DeletionTimestamp *string // metadata.deletionTimestamp (set when being deleted) + TokenRateLimitStatuses []TokenRateLimitStatus // per-model TRLP status from status.tokenRateLimitStatuses } // GetAllAccessible returns all subscriptions the user has access to. @@ -62,11 +75,25 @@ func (s *Selector) GetAllAccessible(groups []string, username string) ([]*Select return nil, fmt.Errorf("failed to load subscriptions: %w", err) } - var accessible []*SelectResponse + accessible := make([]*SelectResponse, 0, len(subscriptions)) for _, sub := range subscriptions { - if userHasAccess(&sub, username, groups) { - accessible = append(accessible, toResponse(&sub)) + // Check user access + if !userHasAccess(&sub, username, groups) { + continue } + + // Allowlist: only include Active and Degraded subscriptions + // Exclude Failed, Pending, empty (unreconciled), unknown phases, and deleting subscriptions + if sub.Phase != PhaseActive && sub.Phase != PhaseDegraded { + continue + } + + // Exclude subscriptions being deleted + if sub.DeletionTimestamp != nil { + continue + } + + accessible = append(accessible, toResponse(&sub)) } // Sort for deterministic ordering @@ -111,6 +138,10 @@ func (s *Selector) Select(groups []string, username string, requestedSubscriptio if requestedModel != "" && !subscriptionIncludesModel(&sub, requestedModel) { return 
nil, &ModelNotInSubscriptionError{Subscription: requestedSubscription, Model: requestedModel} } + // Check model health for Degraded subscriptions + if err := checkModelHealth(&sub, requestedModel); err != nil { + return nil, err + } return toResponse(&sub), nil } } @@ -127,6 +158,10 @@ func (s *Selector) Select(groups []string, username string, requestedSubscriptio if requestedModel != "" && !subscriptionIncludesModel(&sub, requestedModel) { return nil, &ModelNotInSubscriptionError{Subscription: requestedSubscription, Model: requestedModel} } + // Check model health for Degraded subscriptions + if err := checkModelHealth(&sub, requestedModel); err != nil { + return nil, err + } return toResponse(&sub), nil } } @@ -152,6 +187,10 @@ func (s *Selector) Select(groups []string, username string, requestedSubscriptio } if len(accessibleSubs) == 1 { + // Check model health for Degraded subscriptions + if err := checkModelHealth(&accessibleSubs[0], requestedModel); err != nil { + return nil, err + } return toResponse(&accessibleSubs[0]), nil } @@ -219,6 +258,8 @@ func (s *Selector) loadSubscriptions() ([]subscription, error) { } // parseSubscription extracts subscription data from unstructured object. 
+// +//nolint:gocyclo // TODO: refactor to reduce cyclomatic complexity func parseSubscription(obj *unstructured.Unstructured) (subscription, error) { spec, found, err := unstructured.NestedMap(obj.Object, "spec") if err != nil || !found { @@ -280,6 +321,72 @@ func parseSubscription(obj *unstructured.Unstructured) (subscription, error) { // Parse tokenMetadata parseTokenMetadata(spec, &sub) + // Parse status.phase with validation + if status, found, _ := unstructured.NestedMap(obj.Object, "status"); found { + if phase, ok := status["phase"].(string); ok { + // Normalize whitespace and validate against known phases + phase = strings.TrimSpace(phase) + switch phase { + case PhaseActive, PhaseDegraded, PhaseFailed, PhasePending: + sub.Phase = phase + default: + // Unknown phase value - keep raw for debugging but will be rejected by health checks + sub.Phase = phase + } + } + + // Parse status.conditions to extract Ready condition + if conditions, found, _ := unstructured.NestedSlice(status, "conditions"); found { + for _, condRaw := range conditions { + if condMap, ok := condRaw.(map[string]any); ok { + condType, _ := condMap["type"].(string) + if condType == "Ready" { + condStatus, _ := condMap["status"].(string) + sub.Ready = condStatus == "True" + break + } + } + } + } + + // Parse status.tokenRateLimitStatuses to extract TRLP health + if trlpStatuses, found, _ := unstructured.NestedSlice(status, "tokenRateLimitStatuses"); found { + for _, statusRaw := range trlpStatuses { + if statusMap, ok := statusRaw.(map[string]any); ok { + trlpStatus := TokenRateLimitStatus{} + if model, ok := statusMap["model"].(string); ok { + trlpStatus.Model = model + } + if name, ok := statusMap["name"].(string); ok { + trlpStatus.Name = name + } + if namespace, ok := statusMap["namespace"].(string); ok { + trlpStatus.Namespace = namespace + } + if ready, ok := statusMap["ready"].(bool); ok { + trlpStatus.Ready = ready + } + if reason, ok := statusMap["reason"].(string); ok { + 
trlpStatus.Reason = reason + } + if message, ok := statusMap["message"].(string); ok { + trlpStatus.Message = message + } + sub.TokenRateLimitStatuses = append(sub.TokenRateLimitStatuses, trlpStatus) + } + } + } + } + + // Parse metadata.deletionTimestamp + if metadata := obj.Object["metadata"]; metadata != nil { + if metadataMap, ok := metadata.(map[string]any); ok { + if deletionTimestamp, ok := metadataMap["deletionTimestamp"].(string); ok && deletionTimestamp != "" { + sub.DeletionTimestamp = &deletionTimestamp + } + } + } + return sub, nil } @@ -383,6 +490,114 @@ func subscriptionIncludesModel(sub *subscription, requestedModel string) bool { return false } +// checkModelHealth validates subscription phase and model health. +// Returns error if subscription is not in Active/Degraded phase or if model is unhealthy in Degraded subscriptions. +// +// Two validation paths: +// 1. API key creation (requestedModel=""): Allow Active/Degraded/Pending, block Failed/unreconciled. +// Rationale: Users can create keys while subscription is setting up (Pending), but enforcement +// happens at inference time. Failed subscriptions blocked to prevent key spam on broken subscriptions. +// 2. Inference (requestedModel set): Strict allowlist of Active/Degraded only. +// Blocks Pending/Failed/unreconciled at authorization time. 
+func checkModelHealth(sub *subscription, requestedModel string) error { + // API key creation path: Allow Active, Degraded, Pending + // Block Failed (prevents key spam on permanently broken subscriptions) + // Block unreconciled (empty phase) + if requestedModel == "" { + if sub.Phase == "" { + return &ModelUnhealthyError{ + Subscription: sub.Name, + Phase: sub.Phase, + Reason: "SubscriptionNotReady", + Message: "subscription is unreconciled (no status.phase set)", + } + } + if sub.Phase == PhaseFailed { + return &ModelUnhealthyError{ + Subscription: sub.Name, + Phase: sub.Phase, + Reason: "SubscriptionNotReady", + Message: "subscription is in Failed phase (cannot create API keys)", + } + } + return nil // Allow Active, Degraded, Pending for API key creation + } + + // Inference path: Allowlist only Active and Degraded subscriptions + // Reject Failed, Pending, unreconciled, and unknown phases + if sub.Phase != PhaseActive && sub.Phase != PhaseDegraded { + phaseDisplay := sub.Phase + if phaseDisplay == "" { + phaseDisplay = "unreconciled" + } + return &ModelUnhealthyError{ + Subscription: sub.Name, + Phase: sub.Phase, + Reason: "SubscriptionNotReady", + Message: fmt.Sprintf("subscription is in %s phase (allowed: Active, Degraded)", phaseDisplay), + } + } + + // Active subscriptions are allowed without TRLP checks (already validated above) + if sub.Phase != PhaseDegraded { + return nil + } + + // For Degraded subscriptions, verify rate limits can be enforced (if defined) + // Parse the requested model (format: "namespace/name") + parts := strings.SplitN(requestedModel, "/", 2) + if len(parts) != 2 { + return &ModelUnhealthyError{ + Subscription: sub.Name, + Phase: sub.Phase, + Reason: "InvalidModelFormat", + Message: "invalid model format: must be namespace/name", + } + } + requestedNS := parts[0] + requestedName := parts[1] + + // Check if this model has tokenRateLimits defined in the subscription spec + hasRateLimits := false + for _, ref := range sub.ModelRefs 
{ + if ref.Namespace == requestedNS && ref.Name == requestedName { + if len(ref.TokenRateLimits) > 0 { + hasRateLimits = true + } + break + } + } + + // If model doesn't have rate limits defined, allow inference (no TRLP to check) + if !hasRateLimits { + return nil + } + + // Model has rate limits defined - verify TRLP is ready + for _, trlp := range sub.TokenRateLimitStatuses { + if trlp.Model == requestedName { + if !trlp.Ready { + return &ModelUnhealthyError{ + Subscription: sub.Name, + Phase: sub.Phase, + Reason: "RateLimitNotEnforced", + Message: "subscription rate limiting policies are not ready", + } + } + // TRLP is ready - allow inference + return nil + } + } + + // Model has rate limits defined but TRLP status missing - fail closed + return &ModelUnhealthyError{ + Subscription: sub.Name, + Phase: sub.Phase, + Reason: "RateLimitNotEnforced", + Message: "subscription rate limiting policies are not ready", + } +} + // hasModel returns true if the subscription includes the given model name. func (s subscription) hasModel(modelID string) bool { for _, ref := range s.ModelRefs { @@ -442,7 +657,7 @@ func toSubscriptionInfo(sub *subscription) SubscriptionInfo { if modelRefs == nil { modelRefs = []ModelRefInfo{} } - return SubscriptionInfo{ + info := SubscriptionInfo{ SubscriptionIDHeader: sub.Name, SubscriptionDescription: desc, DisplayName: sub.DisplayName, @@ -452,6 +667,7 @@ func toSubscriptionInfo(sub *subscription) SubscriptionInfo { CostCenter: sub.CostCenter, Labels: sub.Labels, } + return info } // ResponseToSubscriptionInfo converts a SelectResponse to a SubscriptionInfo. 
@@ -485,7 +701,7 @@ func toResponse(sub *subscription) *SelectResponse { if modelRefs == nil { modelRefs = []ModelRefInfo{} } - return &SelectResponse{ + resp := &SelectResponse{ Name: sub.Name, Namespace: sub.Namespace, DisplayName: sub.DisplayName, @@ -495,7 +711,13 @@ func toResponse(sub *subscription) *SelectResponse { OrganizationID: sub.OrganizationID, CostCenter: sub.CostCenter, Labels: sub.Labels, + Phase: sub.Phase, + Ready: sub.Ready, + } + if sub.DeletionTimestamp != nil { + resp.DeletionTimestamp = *sub.DeletionTimestamp } + return resp } // NoSubscriptionError indicates no matching subscription found. @@ -541,3 +763,16 @@ type ModelNotInSubscriptionError struct { func (e *ModelNotInSubscriptionError) Error() string { return fmt.Sprintf("subscription %s does not include model %s", e.Subscription, e.Model) } + +// ModelUnhealthyError indicates the requested model is not healthy in a Degraded subscription. +// Note: Model field is intentionally omitted to prevent XSS attacks. +type ModelUnhealthyError struct { + Subscription string + Phase string // Subscription phase for Authorino OPA evaluation + Reason string + Message string +} + +func (e *ModelUnhealthyError) Error() string { + return "requested model is unhealthy in subscription" +} diff --git a/maas-api/internal/subscription/selector_test.go b/maas-api/internal/subscription/selector_test.go index b78fd0cbc..8ffa63987 100644 --- a/maas-api/internal/subscription/selector_test.go +++ b/maas-api/internal/subscription/selector_test.go @@ -10,7 +10,13 @@ import ( "github.com/opendatahub-io/models-as-a-service/maas-api/internal/subscription" ) -const defaultTestTokenRateLimit int64 = 1000 +const ( + defaultTestTokenRateLimit int64 = 1000 + phaseActive = "Active" + phaseFailed = "Failed" + phasePending = "Pending" + phaseDegraded = "Degraded" +) // fakeLister implements subscription.Lister for testing. 
type fakeLister struct { @@ -72,12 +78,24 @@ func createSubscription(name string, groups []string, users []string, priority i metadata["annotations"] = annotations } + // Add Active status by default (real subscriptions are reconciled) + status := map[string]any{ + "phase": phaseActive, + "conditions": []any{ + map[string]any{ + "type": "Ready", + "status": "True", + }, + }, + } + return &unstructured.Unstructured{ Object: map[string]any{ "apiVersion": "maas.opendatahub.io/v1alpha1", "kind": "MaaSSubscription", "metadata": metadata, "spec": spec, + "status": status, }, } } @@ -193,6 +211,63 @@ func TestGetAllAccessible(t *testing.T) { expectedCount: 1, expectedNames: []string{"basic-sub"}, }, + { + name: "exclude Failed subscriptions", + subscriptions: []*unstructured.Unstructured{ + createSubscriptionWithHealth("failed-sub", []string{"basic-users"}, nil, 10, defaultTestTokenRateLimit, phaseFailed, false, false), + createSubscriptionWithHealth("active-sub", []string{"basic-users"}, nil, 20, defaultTestTokenRateLimit, phaseActive, true, false), + }, + groups: []string{"basic-users"}, + username: "alice", + expectedCount: 1, + expectedNames: []string{"active-sub"}, + }, + { + name: "exclude Pending subscriptions", + subscriptions: []*unstructured.Unstructured{ + createSubscriptionWithHealth("pending-sub", []string{"basic-users"}, nil, 10, defaultTestTokenRateLimit, phasePending, false, false), + createSubscriptionWithHealth("active-sub", []string{"basic-users"}, nil, 20, defaultTestTokenRateLimit, phaseActive, true, false), + }, + groups: []string{"basic-users"}, + username: "alice", + expectedCount: 1, + expectedNames: []string{"active-sub"}, + }, + { + name: "include Degraded subscriptions", + subscriptions: []*unstructured.Unstructured{ + createSubscriptionWithHealth("degraded-sub", []string{"basic-users"}, nil, 10, defaultTestTokenRateLimit, phaseDegraded, true, false), + createSubscriptionWithHealth("active-sub", []string{"basic-users"}, nil, 20, 
defaultTestTokenRateLimit, phaseActive, true, false), + }, + groups: []string{"basic-users"}, + username: "alice", + expectedCount: 2, + expectedNames: []string{"active-sub", "degraded-sub"}, + }, + { + name: "exclude deleting subscriptions", + subscriptions: []*unstructured.Unstructured{ + createSubscriptionWithHealth("deleting-sub", []string{"basic-users"}, nil, 10, defaultTestTokenRateLimit, phaseActive, true, true), + createSubscriptionWithHealth("active-sub", []string{"basic-users"}, nil, 20, defaultTestTokenRateLimit, phaseActive, true, false), + }, + groups: []string{"basic-users"}, + username: "alice", + expectedCount: 1, + expectedNames: []string{"active-sub"}, + }, + { + name: "filter by phase - only Active and Degraded included", + subscriptions: []*unstructured.Unstructured{ + createSubscriptionWithHealth("active-sub", []string{"basic-users"}, nil, 10, defaultTestTokenRateLimit, phaseActive, true, false), + createSubscriptionWithHealth("degraded-sub", []string{"basic-users"}, nil, 20, defaultTestTokenRateLimit, phaseDegraded, true, false), + createSubscriptionWithHealth("failed-sub", []string{"basic-users"}, nil, 30, defaultTestTokenRateLimit, phaseFailed, false, false), + createSubscriptionWithHealth("pending-sub", []string{"basic-users"}, nil, 40, defaultTestTokenRateLimit, phasePending, false, false), + }, + groups: []string{"basic-users"}, + username: "alice", + expectedCount: 2, + expectedNames: []string{"active-sub", "degraded-sub"}, + }, } for _, tt := range tests { @@ -333,3 +408,478 @@ func TestSelectHighestPriority(t *testing.T) { } }) } + +// createSubscriptionWithHealth creates a subscription with health status fields. 
+// +//nolint:unparam // Test helper - parameters provide flexibility for future tests +func createSubscriptionWithHealth( + name string, groups []string, users []string, priority int32, + tokenLimit int64, phase string, ready bool, deleting bool, +) *unstructured.Unstructured { + sub := createSubscription(name, groups, users, priority, tokenLimit, "", "") + + // Add status + if phase != "" || ready { + status := map[string]any{} + if phase != "" { + status["phase"] = phase + } + + // Add Ready condition + if phase != "" { + conditions := []any{ + map[string]any{ + "type": "Ready", + "status": func() string { + if ready { + return "True" + } + return "False" + }(), + "reason": "Test", + "message": "Test condition", + }, + } + status["conditions"] = conditions + } + + sub.Object["status"] = status + } + + // Add deletionTimestamp if deleting + if deleting { + metadata, ok := sub.Object["metadata"].(map[string]any) + if !ok { + panic("metadata should be map[string]any") + } + metadata["deletionTimestamp"] = "2026-04-08T12:00:00Z" + } + + return sub +} + +func TestSelector_HealthFieldParsing(t *testing.T) { + log := logger.New(false) + + tests := []struct { + name string + subscription *unstructured.Unstructured + expectedPhase string + expectedReady bool + expectedDeleting bool + expectError bool // Failed/Pending subscriptions should error + }{ + { + name: "Active subscription with Ready=True", + subscription: createSubscriptionWithHealth("active-sub", []string{"g1"}, nil, 10, 1000, phaseActive, true, false), + expectedPhase: phaseActive, + expectedReady: true, + expectedDeleting: false, + expectError: false, + }, + { + name: "Failed subscription with Ready=False - rejected for API key creation", + subscription: createSubscriptionWithHealth("failed-sub", []string{"g1"}, nil, 10, 1000, phaseFailed, false, false), + expectedPhase: phaseFailed, + expectedReady: false, + expectedDeleting: false, + expectError: true, // Failed subscriptions rejected to prevent key spam + 
}, + { + name: "Pending subscription with Ready=False - allowed for API key creation", + subscription: createSubscriptionWithHealth("pending-sub", []string{"g1"}, nil, 10, 1000, phasePending, false, false), + expectedPhase: phasePending, + expectedReady: false, + expectedDeleting: false, + expectError: false, // Pending subscriptions allowed (optimistic - might become Active) + }, + { + name: "Degraded subscription with Ready=False", + subscription: createSubscriptionWithHealth("degraded-sub", []string{"g1"}, nil, 10, 1000, phaseDegraded, false, false), + expectedPhase: phaseDegraded, + expectedReady: false, + expectedDeleting: false, + expectError: false, + }, + { + name: "Subscription being deleted", + subscription: createSubscriptionWithHealth("deleting-sub", []string{"g1"}, nil, 10, 1000, phaseActive, true, true), + expectedPhase: phaseActive, + expectedReady: true, + expectedDeleting: true, + expectError: false, + }, + { + name: "Subscription without status - rejected (unreconciled)", + subscription: func() *unstructured.Unstructured { + // Create subscription without status (unreconciled) + return &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "maas.opendatahub.io/v1alpha1", + "kind": "MaaSSubscription", + "metadata": map[string]any{ + "name": "no-status-sub", + "namespace": "test-ns", + }, + "spec": map[string]any{ + "owner": map[string]any{ + "groups": []any{map[string]any{"name": "g1"}}, + "users": []any{}, + }, + "priority": int64(10), + "modelRefs": []any{ + map[string]any{ + "name": "test-model", + "tokenRateLimits": []any{ + map[string]any{ + "limit": int64(1000), + "window": "1m", + }, + }, + }, + }, + }, + // No status field - simulates unreconciled subscription + }, + } + }(), + expectedPhase: "", + expectedReady: false, + expectedDeleting: false, + expectError: true, // Empty phase means unreconciled - now rejected + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + lister := 
&fakeLister{subscriptions: []*unstructured.Unstructured{tt.subscription}} + selector := subscription.NewSelector(log, lister) + + //nolint:unqueryvet,nolintlint // False positive - not a SQL query + result, err := selector.Select([]string{"g1"}, "", "", "") + + if tt.expectError { + if err == nil { + t.Fatalf("Expected error for %s subscription, got nil", tt.expectedPhase) + } + // Error expected - test passes + return + } + + if err != nil { + t.Fatalf("Select() error = %v", err) + } + + if result.Phase != tt.expectedPhase { + t.Errorf("Phase = %v, want %v", result.Phase, tt.expectedPhase) + } + + if result.Ready != tt.expectedReady { + t.Errorf("Ready = %v, want %v", result.Ready, tt.expectedReady) + } + + gotDeleting := result.DeletionTimestamp != "" + if gotDeleting != tt.expectedDeleting { + t.Errorf("DeletionTimestamp set = %v, want %v", gotDeleting, tt.expectedDeleting) + } + }) + } +} + +func TestSelector_ListAccessibleWithHealth(t *testing.T) { + log := logger.New(false) + + subscriptions := []*unstructured.Unstructured{ + createSubscriptionWithHealth("active-sub", []string{"g1"}, nil, 10, 1000, phaseActive, true, false), + createSubscriptionWithHealth("degraded-sub", []string{"g1"}, nil, 9, 1000, phaseDegraded, true, false), + createSubscriptionWithHealth("failed-sub", []string{"g1"}, nil, 5, 1000, phaseFailed, false, false), + createSubscriptionWithHealth("deleting-sub", []string{"g1"}, nil, 8, 1000, phaseActive, true, true), + } + + lister := &fakeLister{subscriptions: subscriptions} + selector := subscription.NewSelector(log, lister) + + results, err := selector.GetAllAccessible([]string{"g1"}, "") + if err != nil { + t.Fatalf("GetAllAccessible() error = %v", err) + } + + // Only Active and Degraded subscriptions are returned (Failed and deleting are filtered out) + if len(results) != 2 { + t.Fatalf("Expected 2 subscriptions (Active and Degraded only), got %d", len(results)) + } + + // Check that health fields are populated in returned results + for _, 
result := range results { + switch result.Name { + case "active-sub": + if result.Phase != phaseActive || !result.Ready || result.DeletionTimestamp != "" { + t.Errorf("active-sub health fields incorrect: Phase=%s, Ready=%v, DeletionTimestamp=%s", + result.Phase, result.Ready, result.DeletionTimestamp) + } + case "degraded-sub": + if result.Phase != phaseDegraded || !result.Ready || result.DeletionTimestamp != "" { + t.Errorf("degraded-sub health fields incorrect: Phase=%s, Ready=%v, DeletionTimestamp=%s", + result.Phase, result.Ready, result.DeletionTimestamp) + } + case "failed-sub": + t.Errorf("failed-sub should have been filtered out") + case "deleting-sub": + t.Errorf("deleting-sub should have been filtered out") + } + } +} + +func TestSelector_DegradedSubscriptionTRLPFiltering(t *testing.T) { + log := logger.Production() + + tests := []struct { + name string + subscription *unstructured.Unstructured + requestedModel string + expectError bool + expectedErrorReason string + }{ + { + name: "Degraded subscription with TRLP not ready - blocks inference", + subscription: createSubscriptionWithTRLPStatus("degraded-sub", []string{"g1"}, phaseDegraded, []map[string]any{ + { + "name": "model-a", + "namespace": "ns", + "ready": true, + "reason": "Valid", + }, + }, []map[string]any{ + { + "model": "model-a", + "name": "maas-trlp-model-a", + "namespace": "ns", + "ready": false, + "reason": "NotAccepted", + "message": "status not available", + }, + }), + requestedModel: "ns/model-a", + expectError: true, + expectedErrorReason: "RateLimitNotEnforced", + }, + { + name: "Degraded subscription with all TRLPs ready - allows inference (partial model failure)", + subscription: createSubscriptionWithTRLPStatus("degraded-sub", []string{"g1"}, phaseDegraded, []map[string]any{ + { + "name": "model-a", + "namespace": "ns", + "ready": true, + "reason": "Valid", + }, + { + "name": "model-b", + "namespace": "ns", + "ready": false, + "reason": "NotFound", + "message": "model not found", + 
}, + }, []map[string]any{ + { + "model": "model-a", + "name": "maas-trlp-model-a", + "namespace": "ns", + "ready": true, + "reason": "Accepted", + }, + }), + requestedModel: "ns/model-a", + expectError: false, + }, + { + name: "Active subscription - TRLP status doesn't matter", + subscription: createSubscriptionWithTRLPStatus("active-sub", []string{"g1"}, phaseActive, []map[string]any{ + { + "name": "model-a", + "namespace": "ns", + "ready": true, + "reason": "Valid", + }, + }, []map[string]any{ + { + "model": "model-a", + "name": "maas-trlp-model-a", + "namespace": "ns", + "ready": false, + "reason": "NotAccepted", + }, + }), + requestedModel: "ns/model-a", + expectError: false, + }, + { + name: "Degraded subscription with multiple TRLPs - requested model TRLP ready allows inference", + subscription: createSubscriptionWithTRLPStatus("degraded-sub", []string{"g1"}, phaseDegraded, []map[string]any{ + { + "name": "model-a", + "namespace": "ns", + "ready": true, + "reason": "Valid", + }, + { + "name": "model-b", + "namespace": "ns", + "ready": true, + "reason": "Valid", + }, + }, []map[string]any{ + { + "model": "model-a", + "name": "maas-trlp-model-a", + "namespace": "ns", + "ready": true, + "reason": "Accepted", + }, + { + "model": "model-b", + "name": "maas-trlp-model-b", + "namespace": "ns", + "ready": false, + "reason": "NotAccepted", + "message": "policy not accepted", + }, + }), + requestedModel: "ns/model-a", + expectError: false, + }, + { + name: "Degraded subscription with multiple TRLPs - requested model TRLP not ready blocks inference", + subscription: createSubscriptionWithTRLPStatus("degraded-sub", []string{"g1"}, phaseDegraded, []map[string]any{ + { + "name": "model-a", + "namespace": "ns", + "ready": true, + "reason": "Valid", + }, + { + "name": "model-b", + "namespace": "ns", + "ready": true, + "reason": "Valid", + }, + }, []map[string]any{ + { + "model": "model-a", + "name": "maas-trlp-model-a", + "namespace": "ns", + "ready": true, + "reason": 
"Accepted", + }, + { + "model": "model-b", + "name": "maas-trlp-model-b", + "namespace": "ns", + "ready": false, + "reason": "NotAccepted", + "message": "policy not accepted", + }, + }), + requestedModel: "ns/model-b", + expectError: true, + expectedErrorReason: "RateLimitNotEnforced", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + lister := &fakeLister{subscriptions: []*unstructured.Unstructured{tt.subscription}} + selector := subscription.NewSelector(log, lister) + + //nolint:unqueryvet,nolintlint // False positive - not a SQL query + result, err := selector.Select([]string{"g1"}, "", "", tt.requestedModel) + + if tt.expectError { + if err == nil { + t.Fatalf("Expected error but got none") + } + var modelUnhealthyErr *subscription.ModelUnhealthyError + if !errors.As(err, &modelUnhealthyErr) { + t.Fatalf("Expected ModelUnhealthyError, got %T: %v", err, err) + } + if tt.expectedErrorReason != "" && modelUnhealthyErr.Reason != tt.expectedErrorReason { + t.Fatalf("Expected error reason %q, got %q", tt.expectedErrorReason, modelUnhealthyErr.Reason) + } + } else { + if err != nil { + t.Fatalf("Expected no error but got: %v", err) + } + if result == nil { + t.Fatal("Expected result but got nil") + } + } + }) + } +} + +// createSubscriptionWithTRLPStatus creates a test subscription with model and TRLP status. 
+func createSubscriptionWithTRLPStatus(name string, groups []string, phase string, modelStatuses []map[string]any, trlpStatuses []map[string]any) *unstructured.Unstructured { + groupsSlice := make([]any, len(groups)) + for i, g := range groups { + groupsSlice[i] = map[string]any{"name": g} + } + + // Convert []map[string]any to []any for k8s deep copy compatibility + modelStatusesAny := make([]any, len(modelStatuses)) + for i, status := range modelStatuses { + modelStatusesAny[i] = status + } + + trlpStatusesAny := make([]any, len(trlpStatuses)) + for i, status := range trlpStatuses { + trlpStatusesAny[i] = status + } + + // Build modelRefs from modelStatuses + modelRefs := make([]any, 0, len(modelStatuses)) + for _, status := range modelStatuses { + modelName, _ := status["name"].(string) + modelNamespace, _ := status["namespace"].(string) + modelRefs = append(modelRefs, map[string]any{ + "name": modelName, + "namespace": modelNamespace, + "tokenRateLimits": []any{ + map[string]any{ + "limit": int64(100), + "window": "1m", + }, + }, + }) + } + + obj := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "maas.opendatahub.io/v1alpha1", + "kind": "MaaSSubscription", + "metadata": map[string]any{ + "name": name, + "namespace": "test-ns", + }, + "spec": map[string]any{ + "owner": map[string]any{ + "groups": groupsSlice, + }, + "priority": int64(10), + "modelRefs": modelRefs, + }, + "status": map[string]any{ + "phase": phase, + "conditions": []any{ + map[string]any{ + "type": "Ready", + "status": "True", + "reason": phase, + "message": "test", + }, + }, + "modelRefStatuses": modelStatusesAny, + "tokenRateLimitStatuses": trlpStatusesAny, + }, + }, + } + return obj +} diff --git a/maas-api/internal/subscription/types.go b/maas-api/internal/subscription/types.go index ae67efc5f..b0b397166 100644 --- a/maas-api/internal/subscription/types.go +++ b/maas-api/internal/subscription/types.go @@ -28,6 +28,11 @@ type SelectResponse struct { CostCenter string 
`json:"costCenter,omitempty"` // Cost center for attribution Labels map[string]string `json:"labels,omitempty"` // Additional tracking labels + // Health fields (populated from status and metadata) + Phase string `json:"phase"` // Subscription phase: "Active", "Degraded", "Failed", "Pending", or "" (always serialized for Authorino OPA rules) + Ready bool `json:"ready"` // Whether subscription is ready (from Ready condition) + DeletionTimestamp string `json:"deletionTimestamp,omitempty"` // Set when subscription is being deleted + // Error fields (populated when selection fails) Error string `json:"error,omitempty"` // Error code (e.g., "bad_request", "not_found", "access_denied", "multiple_subscriptions") Message string `json:"message,omitempty"` // Human-readable error message @@ -60,6 +65,16 @@ type TokenRateLimit struct { Window string `json:"window"` } +// TokenRateLimitStatus represents the status of a TokenRateLimitPolicy for a model. +type TokenRateLimitStatus struct { + Model string `json:"model"` + Name string `json:"name"` + Namespace string `json:"namespace"` + Ready bool `json:"ready"` + Reason string `json:"reason"` + Message string `json:"message"` +} + // BillingRate defines billing information. 
type BillingRate struct { PerToken string `json:"per_token"` diff --git a/maas-api/openapi3.yaml b/maas-api/openapi3.yaml index f38d6ec3b..280704a49 100644 --- a/maas-api/openapi3.yaml +++ b/maas-api/openapi3.yaml @@ -3,6 +3,13 @@ info: title: Models as a Service API description: Models as a Service Billing and Management API version: "1.0" + contact: + name: MaaS API Support + url: https://github.com/opendatahub-io/models-as-a-service + email: opendatahub@redhat.com + license: + name: Apache 2.0 + url: https://www.apache.org/licenses/LICENSE-2.0 servers: - url: '{serverUrl}' variables: @@ -174,7 +181,9 @@ paths: schema: $ref: '#/components/schemas/ErrorResponse' example: - error: Failed to retrieve LLM models + error: + message: Failed to retrieve LLM models + type: server_error /v1/api-keys: post: tags: @@ -453,7 +462,9 @@ paths: schema: $ref: '#/components/schemas/ErrorResponse' example: - error: "Access denied: you can only bulk revoke your own API keys" + error: + message: "Access denied: you can only bulk revoke your own API keys" + type: forbidden /v1/api-keys/{id}: get: tags: @@ -531,8 +542,12 @@ paths: example: - subscription_id_header: free-tier subscription_description: Free Tier + priority: 0 + model_refs: [] - subscription_id_header: premium subscription_description: Premium Plan + priority: 10 + model_refs: [] "500": description: Internal Server Error response. content: @@ -565,6 +580,8 @@ paths: example: - subscription_id_header: premium subscription_description: Premium Plan + priority: 10 + model_refs: [] "400": description: Bad Request. Missing model-id parameter. content: @@ -834,7 +851,8 @@ components: example: 100 window: type: string - description: "Time window (e.g., 1m, 1h, 24h)" + description: "Time window (e.g., 1m, 1h, 24h). Allowed units: s, m, h (1-9999)." + pattern: "^[1-9]\\d{0,3}(s|m|h)$" example: 1m required: - limit @@ -925,9 +943,11 @@ components: tags: - name: api-keys description: "\U0001F5DDī¸ Named API Key Management service. 
Long-lived, trackable tokens for applications." + - name: api-keys-v2 + description: "\U0001F511 API Key Management v2. OpenAI-compatible API keys with hash-based storage." + - name: health + description: â¤ī¸ Health check service - name: models description: "\U0001F916 Model management service" - name: subscriptions description: Subscription listing service - - name: health - description: â¤ī¸ Health check service diff --git a/maas-controller/.golangci.yml b/maas-controller/.golangci.yml index 40f2a00b1..4b5aaa18c 100644 --- a/maas-controller/.golangci.yml +++ b/maas-controller/.golangci.yml @@ -120,6 +120,9 @@ linters: - linters: - ireturn path: pkg/controller/maas/maassubscription_controller\.go + - linters: + - ireturn + path: pkg/controller/maas/tenant_controller\.go - linters: - ireturn path: pkg/reconciler/externalmodel/reconciler\.go diff --git a/maas-controller/Dockerfile b/maas-controller/Dockerfile index 24e5edbd9..483a98415 100644 --- a/maas-controller/Dockerfile +++ b/maas-controller/Dockerfile @@ -10,9 +10,9 @@ ARG TARGETOS ARG TARGETARCH WORKDIR /app -COPY go.mod go.sum ./ +COPY maas-controller/go.mod maas-controller/go.sum ./ RUN go mod download -COPY . . +COPY maas-controller/ ./ USER root @@ -22,9 +22,14 @@ FROM --platform=$TARGETPLATFORM registry.access.redhat.com/ubi9/ubi-minimal:late WORKDIR / -COPY --from=builder /app/manager . 
+COPY --from=builder /app/manager /manager RUN chmod +x /manager +COPY maas-api/deploy /maas-api/deploy +COPY deployment/base/maas-api /deployment/base/maas-api +COPY deployment/base/maas-controller/policies /deployment/base/maas-controller/policies +COPY deployment/components /deployment/components +RUN chmod -R g=u /maas-api /deployment # Use a non-root user (OpenShift will assign random UID) USER 1001 diff --git a/maas-controller/Dockerfile.konflux b/maas-controller/Dockerfile.konflux index c44291291..3dfec4efd 100644 --- a/maas-controller/Dockerfile.konflux +++ b/maas-controller/Dockerfile.konflux @@ -10,9 +10,9 @@ ARG TARGETOS ARG TARGETARCH WORKDIR /app -COPY go.mod go.sum ./ +COPY maas-controller/go.mod maas-controller/go.sum ./ RUN go mod download -COPY . . +COPY maas-controller/ ./ USER root RUN CGO_ENABLED=${CGO_ENABLED} GOEXPERIMENT=${GOEXPERIMENT} GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH:-amd64} go build -a -trimpath -ldflags="-s -w" -o manager ./cmd/manager @@ -21,9 +21,14 @@ FROM --platform=$TARGETPLATFORM registry.access.redhat.com/ubi9/ubi-minimal@sha2 WORKDIR / -COPY --from=builder /app/manager . 
+COPY --from=builder /app/manager /manager RUN chmod +x /manager +COPY maas-api/deploy /maas-api/deploy +COPY deployment/base/maas-api /deployment/base/maas-api +COPY deployment/base/maas-controller/policies /deployment/base/maas-controller/policies +COPY deployment/components /deployment/components +RUN chmod -R g=u /maas-api /deployment # Use a non-root user (OpenShift will assign random UID) USER 1001 diff --git a/maas-controller/Makefile b/maas-controller/Makefile index 33872df48..4f6653d22 100644 --- a/maas-controller/Makefile +++ b/maas-controller/Makefile @@ -54,15 +54,18 @@ CONTROLLER_GEN = $(BUILD_DIR)/controller-gen ##@ Development .PHONY: build -build: tidy $(BUILD_DIR) ## build manager binary to bin/manager - $(GO_ENV) go build -o $(BUILD_DIR)/$(BINARY_NAME) ./cmd/manager +build: tidy generate manifests lint test binary ## run full build: tidy, generate, manifests, lint, test, binary + +.PHONY: binary +binary: $(BUILD_DIR) ## build manager binary to bin/manager (skip checks) + $(GO_ENV) go build -o "$(BUILD_DIR)/$(BINARY_NAME)" ./cmd/manager $(BUILD_DIR): - mkdir -p $(BUILD_DIR) + mkdir -p "$(BUILD_DIR)" .PHONY: run -run: build ## build and run manager locally - $(BUILD_DIR)/$(BINARY_NAME) +run: binary ## build and run manager locally + "$(BUILD_DIR)/$(BINARY_NAME)" TEST_FLAGS ?= -race -coverprofile=coverage.out .PHONY: test diff --git a/maas-controller/README.md b/maas-controller/README.md index 5c844abcb..0f043bcf6 100644 --- a/maas-controller/README.md +++ b/maas-controller/README.md @@ -1,11 +1,27 @@ # MaaS Controller -Control plane for the Models-as-a-Service (MaaS) subscription model. It reconciles **MaaSModelRef**, **MaaSAuthPolicy**, and **MaaSSubscription** custom resources and creates the corresponding Kuadrant AuthPolicies and TokenRateLimitPolicies, plus HTTPRoutes where needed. +Control plane for the Models-as-a-Service (MaaS) platform. The controller has two main responsibilities: + +1. 
**Tenant reconciler** — deploys and manages `maas-api` via Server-Side Apply (SSA). The controller image includes the kustomize manifests and renders them at runtime, applying namespace, image, and configuration overrides from the `Tenant` CR and environment variables. +2. **Subscription reconciler** — reconciles **MaaSModelRef**, **MaaSAuthPolicy**, and **MaaSSubscription** custom resources and creates the corresponding Kuadrant AuthPolicies and TokenRateLimitPolicies, plus HTTPRoutes where needed. For a comparison of the old tier-based flow vs the new subscription flow, see [docs/old-vs-new-flow.md](docs/old-vs-new-flow.md). ## Architecture +### Tenant reconciler + +The Tenant reconciler watches `Tenant` CRs and deploys `maas-api` into the target namespace. On startup the controller creates a `default-tenant` CR if one does not exist. The reconciler: + +- Renders the embedded kustomize overlay (`maas-api/deploy/overlays/odh`) with runtime parameters (namespace, image, TLS settings) +- Applies the rendered manifests via SSA with `ForceOwnership`, so the controller is the sole owner +- Deploys gateway default policies (`AuthPolicy` for deny-unauthenticated, `TokenRateLimitPolicy` for deny-unsubscribed) +- Annotates the `maas-api` AuthPolicy with `opendatahub.io/managed=false` to prevent the ODH operator from reverting customizations + +The `RELATED_IMAGE_ODH_MAAS_API_IMAGE` environment variable controls which `maas-api` image the Tenant reconciler deploys. When set on the controller Deployment, it overrides the default image in the kustomize manifests. 
+ +### Subscription model + The controller implements a **dual-gate** model where both gates must pass for a request to succeed: ```text @@ -108,13 +124,11 @@ MaaSModelRef's `spec.modelRef.kind` selects how the controller discovers and exp | Kind (CRD value) | Behaviour | | ---------------- | --------- | | **LLMInferenceService** | Validates that an HTTPRoute exists for the referenced LLMInferenceService (created by KServe). Reads endpoint and readiness from the LLMInferenceService/HTTPRoute. | -| **ExternalModel** | Stub: not yet implemented. Controller sets status **Phase=Failed** and condition **Reason=Unsupported**. When implemented, users supply the HTTPRoute (controller does not create it); see `providers_external.go`. | +| **ExternalModel** | References an [ExternalModel](../docs/content/reference/crds/external-model.md) CR that defines an external AI/ML provider (e.g., OpenAI, Anthropic). The ExternalModel controller creates an HTTPRoute named `` in the same namespace. MaaSModelRef validates the HTTPRoute exists and references the configured gateway, then derives the endpoint from the gateway's hostname. Model is ready once the HTTPRoute is accepted by the gateway. See `providers_external.go` for implementation. | -The CRD enum for `kind` is `LLMInferenceService` and `ExternalModel` (see `api/maas/v1alpha1/maasmodelref_types.go`). The registry accepts **LLMInferenceService** (and the alias **llmisvc** for backwards compatibility). Use `kind: LLMInferenceService` in MaaSModelRef specs. +The CRD enum for `kind` is `LLMInferenceService` and `ExternalModel` (see `api/maas/v1alpha1/maasmodelref_types.go`). The registry accepts **LLMInferenceService**, **ExternalModel**, and the alias **llmisvc** (for backwards compatibility). -**Endpoint override:** MaaSModel supports an optional `spec.endpointOverride` field. When set, the controller uses this value for `status.endpoint` instead of the auto-discovered endpoint. 
This applies to all kinds and is useful when the discovered endpoint is wrong (e.g. wrong gateway or hostname). The controller still validates the backend normally — only the final endpoint URL is overridden. - -**Status for unimplemented kinds:** If a kind returns `ErrKindNotImplemented` (e.g. ExternalModel), the controller updates status with Phase=Failed and Ready condition Reason=**Unsupported** (instead of ReconcileFailed), so UIs can distinguish "not implemented" from other failures. +**Endpoint override:** MaaSModelRef supports an optional `spec.endpointOverride` field. When set, the controller uses this value for `status.endpoint` instead of the auto-discovered endpoint. This applies to all kinds and is useful when the discovered endpoint is wrong (e.g. wrong gateway or hostname). The controller still validates the backend normally — only the final endpoint URL is overridden. ### Adding a new provider @@ -213,18 +227,15 @@ Common groups: `dedicated-admins`, `system:authenticated`, `system:authenticated All commands below are meant to be run from the **repository root** (the directory containing `maas-controller/`). -### Option A: Full deploy with subscription controller (recommended) +### Option A: Full deploy (recommended) -Deploy the entire MaaS stack including the subscription controller in one command: +Deploy the entire MaaS stack in one command. The script installs prerequisites (policy engine, Gateway, PostgreSQL, Authorino TLS) and deploys `maas-controller`, which then deploys `maas-api` via the Tenant reconciler: ```bash ./scripts/deploy.sh --operator-type odh ``` -This installs all infrastructure (cert-manager, LWS, Kuadrant, ODH, gateway, policies) -plus the subscription controller. 
- -### Option B: Add subscription controller to an existing deployment +### Option B: Add controller to an existing deployment If MaaS infrastructure is already deployed, install just the controller: @@ -249,11 +260,12 @@ kubectl get crd | grep maas.opendatahub.io | Component | Path | Description | | --------- | ---- | ----------- | -| CRDs | `deployment/base/maas-controller/crd/` | MaaSModelRef, MaaSAuthPolicy, MaaSSubscription | +| CRDs | `deployment/base/maas-controller/crd/` | MaaSModelRef, MaaSAuthPolicy, MaaSSubscription, Tenant | | RBAC | `deployment/base/maas-controller/rbac/` | ClusterRole, ServiceAccount, bindings | | Controller | `deployment/base/maas-controller/manager/` | Deployment (`quay.io/opendatahub/maas-controller:latest`) | | Default auth policy | `deployment/base/maas-controller/policies/` | Gateway-level AuthPolicy (deny unauthenticated, 401/403) | | Default deny policy | `deployment/base/maas-controller/policies/` | Gateway-level TokenRateLimitPolicy with 0 tokens (deny unsubscribed, 429) | +| maas-api (via Tenant) | Embedded kustomize manifests | Deployed at runtime by the Tenant reconciler | ## Examples @@ -352,8 +364,10 @@ kubectl annotate tokenratelimitpolicy -n opendatahub.io/manag The default deployment uses `quay.io/opendatahub/maas-controller:latest`. +The Dockerfile builds from the **repository root** context (not `maas-controller/`) because the controller image includes kustomize manifests from `maas-api/deploy/` and `deployment/`. 
+ ```bash -make -C maas-controller image-build # build with podman/buildah/docker +make -C maas-controller image-build # build with podman/buildah/docker (from repo root) make -C maas-controller image-push # push to quay.io/opendatahub/maas-controller:latest (this image is created automatically on main branch, so preferably push images with different tag and/or to your temp registry if you are doing some testing and verification) # Custom image/tag @@ -392,13 +406,39 @@ CI will fail if the generated files are out of date. ## Troubleshooting +### Understanding Status Phases + +MaaSSubscription and MaaSAuthPolicy use these phases: + +| Phase | Meaning | +| ----- | ------- | +| **Active** | All model references valid, all operands healthy | +| **Degraded** | Partial functionality — some models valid, others missing/invalid | +| **Failed** | No functionality — all model references invalid or missing | +| **Pending** | Transitional state — resources or model references are being created/updated and validity/health is not yet determined | + +Check per-item status to identify specific issues: + +```bash +# Find resources with issues +kubectl get maassubscription -n models-as-a-service -o jsonpath='{range .items[?(@.status.phase!="Active")]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}' + +# Check which model refs are failing +kubectl get maassubscription my-subscription -n models-as-a-service -o jsonpath='{.status.modelRefStatuses}' | jq . +``` + +### Common Issues + **MaaS CRs stuck in `Failed` state:** -The controller retries with exponential backoff. If the HTTPRoute doesn't exist yet (KServe still deploying), the CRs will auto-recover when it appears. If they stay stuck, check controller logs: +The controller retries with exponential backoff. If the HTTPRoute doesn't exist yet (KServe still deploying), the CRs will auto-recover when it appears. 
If they stay stuck, check `status.modelRefStatuses` for `NotFound` reasons, or check controller logs: ```bash kubectl logs deployment/maas-controller -n opendatahub --tail=20 ``` +**MaaS CRs in `Degraded` state:** +Some model references are invalid. Check `status.modelRefStatuses` (subscription) or `status.authPolicies` (auth policy) to identify which models are failing and why (`NotFound`, `NotAccepted`, `NotEnforced`). + **Auth returns 403 even though user is in the right group:** The groups in MaaSAuthPolicy must match your identity provider's groups, not OpenShift Group objects. Check your actual token groups (see Authentication section above). @@ -410,7 +450,66 @@ Check that the WasmPlugin exists: `kubectl get wasmplugins -n openshift-ingress` ## Configuration +### CLI Flags + +The controller accepts the following command-line flags (configured via `deployment/overlays/odh/params.env` when using kustomize): + +| Flag | Default | Description | +|------|---------|-------------| +| `--metrics-bind-address` | `:8080` | The address the metrics endpoint binds to. | +| `--health-probe-bind-address` | `:8081` | The address the probe endpoint binds to. | +| `--leader-elect` | `false` | Enable leader election for the controller manager. | +| `--gateway-name` | `maas-default-gateway` | The name of the Gateway resource to use for model HTTPRoutes. | +| `--gateway-namespace` | `openshift-ingress` | The namespace of the Gateway resource. | +| `--maas-api-namespace` | `opendatahub` | The namespace where the maas-api service is deployed. | +| `--maas-subscription-namespace` | `models-as-a-service` | The namespace to watch for MaaSAuthPolicy and MaaSSubscription CRs. | +| `--cluster-audience` | `https://kubernetes.default.svc` | **The OIDC audience of the cluster for TokenReview.** HyperShift/ROSA clusters use a custom OIDC provider URL and must override this value.
| +| `--metadata-cache-ttl` | `60` | TTL in seconds for Authorino metadata HTTP caching (apiKeyValidation, subscription-info). | +| `--authz-cache-ttl` | `60` | TTL in seconds for Authorino OPA authorization caching (auth-valid, subscription-valid, require-group-membership). | + +### Configuring for HyperShift/ROSA Clusters + +HyperShift and ROSA clusters use custom OIDC provider URLs. You **must** configure `cluster-audience` to match your cluster's OIDC audience. + +**Find your cluster's OIDC issuer:** + +```bash +kubectl get --raw /.well-known/openid-configuration | jq -r .issuer +``` + +Use this issuer URL as the `cluster-audience` value. + +**Configure via params.env (kustomize deployment):** + +Edit `deployment/overlays/odh/params.env` and update the `cluster-audience` line: + +```env +cluster-audience=https://your-cluster-oidc-issuer +``` + +Then redeploy: + +```bash +kustomize build deployment/overlays/odh | kubectl apply -f - +``` + +**Configure via kubectl patch (running deployment):** + +```bash +# Replace 'opendatahub' with your controller namespace if different +CONTROLLER_NS=opendatahub + +kubectl patch configmap maas-parameters -n $CONTROLLER_NS \ + --type merge \ + -p '{"data":{"cluster-audience":"https://your-cluster-oidc-issuer"}}' + +# Restart controller to pick up new config +kubectl rollout restart deployment/maas-controller -n $CONTROLLER_NS +``` + +### Other Configuration + - **Controller namespace**: Default is `opendatahub`. Override via `kustomize build deployment/base/maas-controller/default | sed "s/namespace: opendatahub/namespace: /g" | kubectl apply -f -`. -- **MaaS subscription namespace**: Default is `models-as-a-service`. Override in the deployment or via Kustomize. -- **Image**: Default is `quay.io/opendatahub/maas-controller:latest`. Override in the deployment or via Kustomize. -- **Gateway name**: The default auth policy targets `maas-default-gateway` in `openshift-ingress`. 
Edit `deployment/base/maas-controller/policies/gateway-default-auth.yaml` if your gateway has a different name. +- **MaaS subscription namespace**: Default is `models-as-a-service`. Override `maas-subscription-namespace` in `params.env`. +- **Image**: Default is `quay.io/opendatahub/maas-controller:latest`. Override `maas-controller-image` in `params.env`. +- **Gateway name/namespace**: Override `gateway-name` and `gateway-namespace` in `params.env`. diff --git a/maas-controller/api/maas/v1alpha1/common_types.go b/maas-controller/api/maas/v1alpha1/common_types.go new file mode 100644 index 000000000..5546e58ed --- /dev/null +++ b/maas-controller/api/maas/v1alpha1/common_types.go @@ -0,0 +1,99 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +// Phase represents the lifecycle phase of a MaaS resource. +// +kubebuilder:validation:Enum=Pending;Active;Degraded;Failed +type Phase string + +// Phase constants for MaaS resources (MaaSSubscription, MaaSAuthPolicy, MaaSModelRef) +const ( + PhasePending Phase = "Pending" + PhaseActive Phase = "Active" + PhaseDegraded Phase = "Degraded" + PhaseFailed Phase = "Failed" +) + +// ConditionReason represents a machine-readable reason for a status condition. 
+// +kubebuilder:validation:Enum=Reconciled;ReconcileFailed;PartialFailure;Valid;NotFound;GetFailed;Accepted;AcceptedEnforced;NotAccepted;Enforced;NotEnforced;BackendNotReady;ConditionsNotFound;Unknown +type ConditionReason string + +// Reason constants for status conditions and per-item statuses. +// These follow Kubernetes conventions: CamelCase, past tense for completed actions. +const ( + // ReasonReconciled indicates successful reconciliation. + ReasonReconciled ConditionReason = "Reconciled" + + // ReasonReconcileFailed indicates reconciliation failed. + ReasonReconcileFailed ConditionReason = "ReconcileFailed" + + // ReasonPartialFailure indicates some items succeeded, others failed. + ReasonPartialFailure ConditionReason = "PartialFailure" + + // ReasonValid indicates a referenced resource exists and is valid. + ReasonValid ConditionReason = "Valid" + + // ReasonNotFound indicates a referenced resource was not found. + ReasonNotFound ConditionReason = "NotFound" + + // ReasonGetFailed indicates a failure when fetching a resource. + ReasonGetFailed ConditionReason = "GetFailed" + + // ReasonAccepted indicates the resource was accepted by the target system (e.g., Kuadrant). + ReasonAccepted ConditionReason = "Accepted" + + // ReasonAcceptedEnforced indicates the policy is both accepted and enforced. + ReasonAcceptedEnforced ConditionReason = "AcceptedEnforced" + + // ReasonNotAccepted indicates the resource was not accepted by the target system. + ReasonNotAccepted ConditionReason = "NotAccepted" + + // ReasonEnforced indicates the policy is actively enforced. + ReasonEnforced ConditionReason = "Enforced" + + // ReasonNotEnforced indicates the policy is not yet enforced. + ReasonNotEnforced ConditionReason = "NotEnforced" + + // ReasonBackendNotReady indicates the backend service is not ready. + ReasonBackendNotReady ConditionReason = "BackendNotReady" + + // ReasonConditionsNotFound indicates status conditions are not available. 
+ ReasonConditionsNotFound ConditionReason = "ConditionsNotFound" + + // ReasonUnknown indicates an unknown or unhandled state. + ReasonUnknown ConditionReason = "Unknown" +) + +// ResourceRefStatus is the common status for any referenced Kubernetes resource. +// Embedded by specific status types for type safety (follows metav1.Condition pattern). +type ResourceRefStatus struct { + // Name of the referenced resource + // +kubebuilder:validation:MaxLength=253 + Name string `json:"name"` + // Namespace of the referenced resource + // +kubebuilder:validation:MaxLength=63 + Namespace string `json:"namespace"` + // Ready indicates whether the resource is valid and healthy + Ready bool `json:"ready"` + // Reason is a machine-readable reason code + // +optional + Reason ConditionReason `json:"reason,omitempty"` + // Message is a human-readable description of the status + // +kubebuilder:validation:MaxLength=1024 + // +optional + Message string `json:"message,omitempty"` +} diff --git a/maas-controller/api/maas/v1alpha1/maasauthpolicy_types.go b/maas-controller/api/maas/v1alpha1/maasauthpolicy_types.go index 4efc2fa82..8d8b02439 100644 --- a/maas-controller/api/maas/v1alpha1/maasauthpolicy_types.go +++ b/maas-controller/api/maas/v1alpha1/maasauthpolicy_types.go @@ -80,35 +80,30 @@ type MeteringMetadata struct { Labels map[string]string `json:"labels,omitempty"` } -// AuthPolicyRefStatus reports the status of one underlying Kuadrant AuthPolicy created by this MaaSAuthPolicy. +// AuthPolicyRefStatus reports the status of a generated Kuadrant AuthPolicy. +// Embeds ResourceRefStatus for common fields (Ready, Reason, Message). type AuthPolicyRefStatus struct { - // Name is the name of the AuthPolicy resource. - Name string `json:"name"` - // Namespace is the namespace of the AuthPolicy resource. - Namespace string `json:"namespace"` + ResourceRefStatus `json:",inline"` // Model is the MaaSModelRef name this AuthPolicy targets. 
+ // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=63 Model string `json:"model"` // ModelNamespace is the namespace of the MaaSModelRef. + // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=63 ModelNamespace string `json:"modelNamespace"` - // Accepted reports whether the AuthPolicy has been accepted (e.g. status.conditions type=Accepted). - // +optional - Accepted string `json:"accepted,omitempty"` - // Enforced reports whether the AuthPolicy is enforced (e.g. status.conditions type=Enforced). - // +optional - Enforced string `json:"enforced,omitempty"` } // MaaSAuthPolicyStatus defines the observed state of MaaSAuthPolicy type MaaSAuthPolicyStatus struct { // Phase represents the current phase of the policy - // +kubebuilder:validation:Enum=Pending;Active;Failed - Phase string `json:"phase,omitempty"` + Phase Phase `json:"phase,omitempty"` // Conditions represent the latest available observations of the policy's state // +optional Conditions []metav1.Condition `json:"conditions,omitempty"` - // AuthPolicies lists the underlying Kuadrant AuthPolicies and their Accepted/Enforced state. + // AuthPolicies lists the underlying Kuadrant AuthPolicies and their status. // +optional AuthPolicies []AuthPolicyRefStatus `json:"authPolicies,omitempty"` } diff --git a/maas-controller/api/maas/v1alpha1/maassubscription_types.go b/maas-controller/api/maas/v1alpha1/maassubscription_types.go index b6f81a678..0d4df5cb3 100644 --- a/maas-controller/api/maas/v1alpha1/maassubscription_types.go +++ b/maas-controller/api/maas/v1alpha1/maassubscription_types.go @@ -78,8 +78,13 @@ type TokenRateLimit struct { // +kubebuilder:validation:Minimum=1 Limit int64 `json:"limit"` - // Window is the time window (e.g., "1m", "1h", "24h") - // +kubebuilder:validation:Pattern=`^(\d+)(s|m|h|d)$` + // Window is the time window for rate limiting (e.g., "1m", "1h", "24h"). + // Allowed units: s (seconds), m (minutes), h (hours). 
Days (d) are not + // supported; use hours instead (e.g., "24h" for one day). + // The numeric part must be between 1 and 9999. + // +kubebuilder:validation:MinLength=2 + // +kubebuilder:validation:MaxLength=5 + // +kubebuilder:validation:Pattern=`^[1-9]\d{0,3}(s|m|h)$` Window string `json:"window"` } @@ -104,15 +109,36 @@ type TokenMetadata struct { Labels map[string]string `json:"labels,omitempty"` } +// ModelRefStatus reports the status of a referenced MaaSModelRef. +type ModelRefStatus struct { + ResourceRefStatus `json:",inline"` +} + +// TokenRateLimitStatus reports the status of a generated TokenRateLimitPolicy. +type TokenRateLimitStatus struct { + ResourceRefStatus `json:",inline"` + // Model is the MaaSModelRef name this TokenRateLimitPolicy targets + // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=63 + Model string `json:"model"` +} + // MaaSSubscriptionStatus defines the observed state of MaaSSubscription type MaaSSubscriptionStatus struct { // Phase represents the current phase of the subscription - // +kubebuilder:validation:Enum=Pending;Active;Failed - Phase string `json:"phase,omitempty"` + Phase Phase `json:"phase,omitempty"` // Conditions represent the latest available observations of the subscription's state // +optional Conditions []metav1.Condition `json:"conditions,omitempty"` + + // ModelRefStatuses reports the status of each referenced MaaSModelRef + // +optional + ModelRefStatuses []ModelRefStatus `json:"modelRefStatuses,omitempty"` + + // TokenRateLimitStatuses reports the status of each generated TokenRateLimitPolicy + // +optional + TokenRateLimitStatuses []TokenRateLimitStatus `json:"tokenRateLimitStatuses,omitempty"` } //+kubebuilder:object:root=true diff --git a/maas-controller/api/maas/v1alpha1/tenant_types.go b/maas-controller/api/maas/v1alpha1/tenant_types.go new file mode 100644 index 000000000..e96618087 --- /dev/null +++ b/maas-controller/api/maas/v1alpha1/tenant_types.go @@ -0,0 +1,168 @@ +/* 
+Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +const ( + // TenantKind is the API kind for the cluster MaaS tenant / platform singleton. + TenantKind = "Tenant" + // TenantInstanceName is the singleton resource name enforced by the API. + TenantInstanceName = "default-tenant" +) + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Namespaced +// +kubebuilder:validation:XValidation:rule="self.metadata.name == 'default-tenant'",message="Tenant name must be default-tenant" +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].status`,description="Ready" +// +kubebuilder:printcolumn:name="Reason",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].reason`,description="Reason" +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// Tenant is the namespace-scoped API for the MaaS platform tenant. +// The CEL validation above enforces a singleton (name == "default-tenant") during v1alpha1. +// To enable multi-tenancy later, remove the XValidation rule — no CRD migration required +// because removing a validation is a non-breaking schema change. 
+type Tenant struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec TenantSpec `json:"spec,omitempty"` + Status TenantStatus `json:"status,omitempty"` +} + +// TenantSpec defines the desired state of Tenant. +type TenantSpec struct { + // GatewayRef specifies which Gateway (Gateway API) to use for exposing model endpoints. + // If omitted, defaults to openshift-ingress/maas-default-gateway. + // +kubebuilder:validation:Optional + GatewayRef TenantGatewayRef `json:"gatewayRef,omitempty"` + + // APIKeys contains configuration for API key management. + // +kubebuilder:validation:Optional + APIKeys *TenantAPIKeysConfig `json:"apiKeys,omitempty"` + + // ExternalOIDC configures an external OIDC identity provider for the maas-api AuthPolicy. + // +kubebuilder:validation:Optional + ExternalOIDC *TenantExternalOIDCConfig `json:"externalOIDC,omitempty"` + + // Telemetry contains configuration for telemetry and metrics collection. + // +kubebuilder:validation:Optional + Telemetry *TenantTelemetryConfig `json:"telemetry,omitempty"` +} + +// TenantExternalOIDCConfig defines the external OIDC provider settings. +type TenantExternalOIDCConfig struct { + // IssuerURL is the OIDC issuer URL (e.g. https://keycloak.example.com/realms/maas). + // +kubebuilder:validation:MinLength=9 + // +kubebuilder:validation:MaxLength=2048 + // +kubebuilder:validation:Pattern=`^https://\S+$` + IssuerURL string `json:"issuerUrl"` + + // ClientID is the OAuth2 client ID. + // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=256 + // +kubebuilder:validation:Pattern=`^\S+$` + ClientID string `json:"clientId"` + + // TTL is the JWKS cache duration in seconds. + // +kubebuilder:validation:Optional + // +kubebuilder:default=300 + // +kubebuilder:validation:Minimum=30 + TTL int `json:"ttl,omitempty"` +} + +// TenantTelemetryConfig defines configuration for telemetry collection. 
+type TenantTelemetryConfig struct { + // +kubebuilder:default=true + // +kubebuilder:validation:Optional + Enabled *bool `json:"enabled,omitempty"` + + // +kubebuilder:validation:Optional + Metrics *TenantMetricsConfig `json:"metrics,omitempty"` +} + +// TenantMetricsConfig defines optional metric dimensions. +type TenantMetricsConfig struct { + // +kubebuilder:default=true + // +kubebuilder:validation:Optional + CaptureOrganization *bool `json:"captureOrganization,omitempty"` + + // CaptureUser adds a "user" dimension to telemetry metrics containing + // the authenticated user ID. Defaults to false. Enabling this may + // have GDPR / privacy implications — ensure compliance before use. + // +kubebuilder:default=false + // +kubebuilder:validation:Optional + CaptureUser *bool `json:"captureUser,omitempty"` + + // +kubebuilder:default=false + // +kubebuilder:validation:Optional + CaptureGroup *bool `json:"captureGroup,omitempty"` + + // +kubebuilder:default=true + // +kubebuilder:validation:Optional + CaptureModelUsage *bool `json:"captureModelUsage,omitempty"` +} + +// TenantAPIKeysConfig defines configuration options for API key management. +type TenantAPIKeysConfig struct { + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Minimum=1 + MaxExpirationDays *int32 `json:"maxExpirationDays,omitempty"` +} + +// TenantGatewayRef defines the reference to the global Gateway (Gateway API). +type TenantGatewayRef struct { + // +kubebuilder:default="openshift-ingress" + // +kubebuilder:validation:Pattern="^([a-z0-9]([-a-z0-9]*[a-z0-9])?)?$" + // +kubebuilder:validation:MaxLength=63 + Namespace string `json:"namespace,omitempty"` + + // +kubebuilder:default="maas-default-gateway" + // +kubebuilder:validation:Pattern="^([a-z0-9]([-a-z0-9]*[a-z0-9])?)?$" + // +kubebuilder:validation:MaxLength=63 + Name string `json:"name,omitempty"` +} + +// TenantStatus defines the observed state of Tenant. 
+type TenantStatus struct { + // Phase is a high-level lifecycle phase for the platform reconcile. + // +kubebuilder:validation:Optional + // +kubebuilder:validation:Enum=Pending;Active;Degraded;Failed + Phase string `json:"phase,omitempty"` + + // Conditions represent the latest available observations. + // Types mirror ODH modelsasservice / internal controller status for DSC aggregation: Ready, + // DependenciesAvailable, MaaSPrerequisitesAvailable, DeploymentsAvailable, Degraded. + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// +kubebuilder:object:root=true + +// TenantList contains a list of Tenant. +type TenantList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []Tenant `json:"items"` +} + +func init() { + SchemeBuilder.Register(&Tenant{}, &TenantList{}) +} diff --git a/maas-controller/api/maas/v1alpha1/zz_generated.deepcopy.go b/maas-controller/api/maas/v1alpha1/zz_generated.deepcopy.go index 578e4a28c..e72f35c7e 100644 --- a/maas-controller/api/maas/v1alpha1/zz_generated.deepcopy.go +++ b/maas-controller/api/maas/v1alpha1/zz_generated.deepcopy.go @@ -12,6 +12,7 @@ import ( // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AuthPolicyRefStatus) DeepCopyInto(out *AuthPolicyRefStatus) { *out = *in + out.ResourceRefStatus = in.ResourceRefStatus } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AuthPolicyRefStatus. 
@@ -477,6 +478,16 @@ func (in *MaaSSubscriptionStatus) DeepCopyInto(out *MaaSSubscriptionStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.ModelRefStatuses != nil { + in, out := &in.ModelRefStatuses, &out.ModelRefStatuses + *out = make([]ModelRefStatus, len(*in)) + copy(*out, *in) + } + if in.TokenRateLimitStatuses != nil { + in, out := &in.TokenRateLimitStatuses, &out.TokenRateLimitStatuses + *out = make([]TokenRateLimitStatus, len(*in)) + copy(*out, *in) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MaaSSubscriptionStatus. @@ -526,6 +537,22 @@ func (in *ModelRef) DeepCopy() *ModelRef { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ModelRefStatus) DeepCopyInto(out *ModelRefStatus) { + *out = *in + out.ResourceRefStatus = in.ResourceRefStatus +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelRefStatus. +func (in *ModelRefStatus) DeepCopy() *ModelRefStatus { + if in == nil { + return nil + } + out := new(ModelRefStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ModelReference) DeepCopyInto(out *ModelReference) { *out = *in @@ -591,6 +618,21 @@ func (in *OwnerSpec) DeepCopy() *OwnerSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ResourceRefStatus) DeepCopyInto(out *ResourceRefStatus) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceRefStatus. 
+func (in *ResourceRefStatus) DeepCopy() *ResourceRefStatus { + if in == nil { + return nil + } + out := new(ResourceRefStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SubjectSpec) DeepCopyInto(out *SubjectSpec) { *out = *in @@ -616,6 +658,228 @@ func (in *SubjectSpec) DeepCopy() *SubjectSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Tenant) DeepCopyInto(out *Tenant) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Tenant. +func (in *Tenant) DeepCopy() *Tenant { + if in == nil { + return nil + } + out := new(Tenant) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *Tenant) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TenantAPIKeysConfig) DeepCopyInto(out *TenantAPIKeysConfig) { + *out = *in + if in.MaxExpirationDays != nil { + in, out := &in.MaxExpirationDays, &out.MaxExpirationDays + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TenantAPIKeysConfig. +func (in *TenantAPIKeysConfig) DeepCopy() *TenantAPIKeysConfig { + if in == nil { + return nil + } + out := new(TenantAPIKeysConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TenantExternalOIDCConfig) DeepCopyInto(out *TenantExternalOIDCConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TenantExternalOIDCConfig. +func (in *TenantExternalOIDCConfig) DeepCopy() *TenantExternalOIDCConfig { + if in == nil { + return nil + } + out := new(TenantExternalOIDCConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TenantGatewayRef) DeepCopyInto(out *TenantGatewayRef) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TenantGatewayRef. +func (in *TenantGatewayRef) DeepCopy() *TenantGatewayRef { + if in == nil { + return nil + } + out := new(TenantGatewayRef) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TenantList) DeepCopyInto(out *TenantList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]Tenant, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TenantList. +func (in *TenantList) DeepCopy() *TenantList { + if in == nil { + return nil + } + out := new(TenantList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *TenantList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TenantMetricsConfig) DeepCopyInto(out *TenantMetricsConfig) { + *out = *in + if in.CaptureOrganization != nil { + in, out := &in.CaptureOrganization, &out.CaptureOrganization + *out = new(bool) + **out = **in + } + if in.CaptureUser != nil { + in, out := &in.CaptureUser, &out.CaptureUser + *out = new(bool) + **out = **in + } + if in.CaptureGroup != nil { + in, out := &in.CaptureGroup, &out.CaptureGroup + *out = new(bool) + **out = **in + } + if in.CaptureModelUsage != nil { + in, out := &in.CaptureModelUsage, &out.CaptureModelUsage + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TenantMetricsConfig. +func (in *TenantMetricsConfig) DeepCopy() *TenantMetricsConfig { + if in == nil { + return nil + } + out := new(TenantMetricsConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TenantSpec) DeepCopyInto(out *TenantSpec) { + *out = *in + out.GatewayRef = in.GatewayRef + if in.APIKeys != nil { + in, out := &in.APIKeys, &out.APIKeys + *out = new(TenantAPIKeysConfig) + (*in).DeepCopyInto(*out) + } + if in.ExternalOIDC != nil { + in, out := &in.ExternalOIDC, &out.ExternalOIDC + *out = new(TenantExternalOIDCConfig) + **out = **in + } + if in.Telemetry != nil { + in, out := &in.Telemetry, &out.Telemetry + *out = new(TenantTelemetryConfig) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TenantSpec. +func (in *TenantSpec) DeepCopy() *TenantSpec { + if in == nil { + return nil + } + out := new(TenantSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TenantStatus) DeepCopyInto(out *TenantStatus) { + *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TenantStatus. +func (in *TenantStatus) DeepCopy() *TenantStatus { + if in == nil { + return nil + } + out := new(TenantStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TenantTelemetryConfig) DeepCopyInto(out *TenantTelemetryConfig) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.Metrics != nil { + in, out := &in.Metrics, &out.Metrics + *out = new(TenantMetricsConfig) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TenantTelemetryConfig. +func (in *TenantTelemetryConfig) DeepCopy() *TenantTelemetryConfig { + if in == nil { + return nil + } + out := new(TenantTelemetryConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *TokenMetadata) DeepCopyInto(out *TokenMetadata) { *out = *in @@ -652,3 +916,19 @@ func (in *TokenRateLimit) DeepCopy() *TokenRateLimit { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TokenRateLimitStatus) DeepCopyInto(out *TokenRateLimitStatus) { + *out = *in + out.ResourceRefStatus = in.ResourceRefStatus +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TokenRateLimitStatus. 
+func (in *TokenRateLimitStatus) DeepCopy() *TokenRateLimitStatus { + if in == nil { + return nil + } + out := new(TokenRateLimitStatus) + in.DeepCopyInto(out) + return out +} diff --git a/maas-controller/cmd/manager/main.go b/maas-controller/cmd/manager/main.go index cbc994612..333417fd1 100644 --- a/maas-controller/cmd/manager/main.go +++ b/maas-controller/cmd/manager/main.go @@ -20,11 +20,14 @@ import ( "context" "flag" "fmt" + "net/http" "os" + "path/filepath" "time" kservev1alpha1 "github.com/kserve/kserve/pkg/apis/serving/v1alpha1" corev1 "k8s.io/api/core/v1" + extv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" @@ -40,11 +43,13 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" + "sigs.k8s.io/controller-runtime/pkg/manager" metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" gatewayapiv1 "sigs.k8s.io/gateway-api/apis/v1" maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" "github.com/opendatahub-io/models-as-a-service/maas-controller/pkg/controller/maas" + "github.com/opendatahub-io/models-as-a-service/maas-controller/pkg/platform/tenantreconcile" "github.com/opendatahub-io/models-as-a-service/maas-controller/pkg/reconciler/externalmodel" ) @@ -55,6 +60,7 @@ var ( func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(extv1.AddToScheme(scheme)) utilruntime.Must(kservev1alpha1.AddToScheme(scheme)) utilruntime.Must(gatewayapiv1.Install(scheme)) utilruntime.Must(maasv1alpha1.AddToScheme(scheme)) @@ -62,23 +68,70 @@ func init() { //+kubebuilder:rbac:groups="",resources=namespaces,verbs=get;create -// ensureSubscriptionNamespaceExists checks whether the subscription namespace exists +// ensureSubscriptionNamespaceWithClient checks whether 
the subscription namespace exists // and creates it if missing. It checks for existence first so that the controller can // start even when the service account lacks namespace-create permission (common in // operator-managed deployments where the operator pre-creates the namespace). // Permanent errors such as Forbidden are not retried. -func ensureSubscriptionNamespaceExists(ctx context.Context, namespace string) error { - cfg := ctrl.GetConfigOrDie() - clientset, err := kubernetes.NewForConfig(cfg) - if err != nil { - return fmt.Errorf("unable to create Kubernetes client: %w", err) - } - - _, err = clientset.CoreV1().Namespaces().Get(ctx, namespace, metav1.GetOptions{}) +// +// Handles the edge case where the namespace is in Terminating phase during RHOAI +// reinstall/upgrade - waits for deletion to complete before attempting creation. +func ensureSubscriptionNamespaceWithClient(ctx context.Context, namespace string, clientset kubernetes.Interface) error { + ns, err := clientset.CoreV1().Namespaces().Get(ctx, namespace, metav1.GetOptions{}) if err == nil { - setupLog.Info("subscription namespace already exists", "namespace", namespace) - return nil + if ns.Status.Phase == corev1.NamespaceTerminating { + setupLog.Info("subscription namespace is terminating, waiting for deletion to complete", + "namespace", namespace) + + pollErr := wait.PollUntilContextTimeout(ctx, 2*time.Second, 90*time.Second, true, + func(ctx context.Context) (bool, error) { + checkNs, pollErr := clientset.CoreV1().Namespaces().Get(ctx, namespace, metav1.GetOptions{}) + if errors.IsNotFound(pollErr) { + setupLog.Info("terminating namespace has been deleted", "namespace", namespace) + return true, nil + } + if errors.IsForbidden(pollErr) { + setupLog.Info("insufficient permissions to poll namespace deletion status, "+ + "assuming namespace is managed externally", + "namespace", namespace, "error", pollErr) + return true, nil + } + if pollErr != nil { + return false, fmt.Errorf("error checking 
namespace status during deletion wait: %w", pollErr) + } + if checkNs.Status.Phase == corev1.NamespaceActive || checkNs.Status.Phase == "" { + setupLog.Info("subscription namespace became active during deletion wait "+ + "(recreated by operator or external process)", + "namespace", namespace) + return true, nil + } + setupLog.V(1).Info("namespace still terminating, will retry", + "namespace", namespace, "phase", checkNs.Status.Phase) + return false, nil + }) + + if pollErr != nil { + return fmt.Errorf("failed waiting for terminating namespace %q to be deleted: %w", + namespace, pollErr) + } + + finalNs, finalErr := clientset.CoreV1().Namespaces().Get(ctx, namespace, metav1.GetOptions{}) + doneErr, fallThrough := resolveNamespaceAfterTerminationWait(namespace, finalNs, finalErr) + if fallThrough { + err = finalErr + } else { + if doneErr != nil { + return doneErr + } + return nil + } + } else { + setupLog.Info("subscription namespace already exists", + "namespace", namespace, "phase", ns.Status.Phase) + return nil + } } + if errors.IsForbidden(err) { setupLog.Info("insufficient permissions to check namespace existence, assuming it exists — "+ "verify that the ClusterRoleBinding references the correct namespace for the controller ServiceAccount", @@ -100,15 +153,34 @@ func ensureSubscriptionNamespaceExists(ctx context.Context, namespace string) er Name: namespace, Labels: map[string]string{ "opendatahub.io/generated-namespace": "true", + "app.kubernetes.io/managed-by": "maas-controller", + "app.kubernetes.io/part-of": "maas-controller", }, }, } _, err := clientset.CoreV1().Namespaces().Create(ctx, ns, metav1.CreateOptions{}) - if err == nil || errors.IsAlreadyExists(err) { + if err == nil { setupLog.Info("subscription namespace ready", "namespace", namespace) return true, nil } + if errors.IsAlreadyExists(err) { + // Re-check phase: AlreadyExists only proves the name is occupied, but the namespace + // could still be Terminating. 
Verify it's actually ready before returning success. + existingNs, getErr := clientset.CoreV1().Namespaces().Get(ctx, namespace, metav1.GetOptions{}) + if getErr != nil { + setupLog.Info("namespace already exists but failed to verify phase, will retry", + "namespace", namespace, "error", getErr) + return false, nil + } + if existingNs.Status.Phase == corev1.NamespaceActive || existingNs.Status.Phase == "" { + setupLog.Info("subscription namespace ready", "namespace", namespace) + return true, nil + } + setupLog.Info("namespace already exists but is not ready, will retry", + "namespace", namespace, "phase", existingNs.Status.Phase) + return false, nil + } if errors.IsForbidden(err) { return false, fmt.Errorf("service account lacks permission to create namespace %q — "+ "either pre-create the namespace or grant 'create' on namespaces to the controller service account: %w", @@ -119,6 +191,111 @@ func ensureSubscriptionNamespaceExists(ctx context.Context, namespace string) er }) } +// resolveNamespaceAfterTerminationWait interprets the namespace GET after a successful termination poll. +// If fallThroughToCreate is true, the caller must assign the original finalErr to the outer GET error and +// continue into namespace creation. If fallThroughToCreate is false and the returned error is nil, the +// subscription namespace is already satisfied (Active or assumed external management). 
+func resolveNamespaceAfterTerminationWait(namespace string, finalNs *corev1.Namespace, finalErr error) (doneErr error, fallThroughToCreate bool) { + if finalErr == nil && (finalNs.Status.Phase == corev1.NamespaceActive || finalNs.Status.Phase == "") { + setupLog.Info("subscription namespace exists and is active "+ + "(recreated externally during deletion wait)", + "namespace", namespace) + return nil, false + } + if errors.IsForbidden(finalErr) { + setupLog.Info("insufficient permissions to verify namespace state after deletion wait, "+ + "assuming it exists", + "namespace", namespace, "error", finalErr) + return nil, false + } + if errors.IsNotFound(finalErr) { + return nil, true + } + if finalErr != nil { + return fmt.Errorf("unable to verify namespace %q after termination wait: %w", namespace, finalErr), false + } + if finalNs.Status.Phase == corev1.NamespaceTerminating { + return fmt.Errorf("namespace %q is still terminating after wait; retry after it is fully deleted", + namespace), false + } + return fmt.Errorf("namespace %q exists in unexpected state after termination wait (phase=%q)", + namespace, finalNs.Status.Phase), false +} + +// checkSubscriptionNamespaceReady returns nil if the subscription namespace exists and controllers can rely on it. +// Terminating and missing namespaces are not ready. Forbidden on GET matches startup behavior (assume operator-managed). +// +// Namespace.Status.Phase is documented as Active or Terminating; an empty string is treated as ready because it is +// commonly seen before status is fully populated and matches Kubernetes' defaulting to an active namespace. 
+func checkSubscriptionNamespaceReady(ctx context.Context, clientset kubernetes.Interface, namespace string) error { + ns, err := clientset.CoreV1().Namespaces().Get(ctx, namespace, metav1.GetOptions{}) + if errors.IsNotFound(err) { + return fmt.Errorf("subscription namespace %q does not exist", namespace) + } + if errors.IsForbidden(err) { + setupLog.V(1).Info("readiness: insufficient permissions to check namespace, assuming ready", "namespace", namespace, "error", err) + return nil + } + if err != nil { + return fmt.Errorf("subscription namespace %q ready check: %w", namespace, err) + } + if ns.Status.Phase == corev1.NamespaceTerminating { + return fmt.Errorf("subscription namespace %q is terminating", namespace) + } + if ns.Status.Phase == corev1.NamespaceActive || ns.Status.Phase == "" { + return nil + } + return fmt.Errorf("subscription namespace %q is not ready (phase=%q)", namespace, ns.Status.Phase) +} + +// subscriptionNamespaceReadiness performs an uncached Namespace GET on each probe for an accurate signal. +// Load is bounded by the kubelet readiness probe interval (often ~10s); avoid short-lived caching here so +// Terminating / deleted namespaces are reflected promptly. +func subscriptionNamespaceReadiness(clientset kubernetes.Interface, namespace string) healthz.Checker { + return func(req *http.Request) error { + return checkSubscriptionNamespaceReady(req.Context(), clientset, namespace) + } +} + +// subscriptionNamespaceMonitor periodically re-runs ensureSubscriptionNamespaceWithClient so a namespace +// removed while the process is running can be recreated. When leader election is enabled, only the leader runs this. 
+type subscriptionNamespaceMonitor struct { + clientset kubernetes.Interface + namespace string + interval time.Duration + needLeaderElection bool +} + +func (m *subscriptionNamespaceMonitor) NeedLeaderElection() bool { + return m.needLeaderElection +} + +func (m *subscriptionNamespaceMonitor) Start(ctx context.Context) error { + if m.interval <= 0 { + return fmt.Errorf("subscription namespace maintain interval must be positive, got %v", m.interval) + } + run := func() { + innerCtx, cancel := context.WithTimeout(ctx, 2*time.Minute) + defer cancel() + if err := ensureSubscriptionNamespaceWithClient(innerCtx, m.namespace, m.clientset); err != nil { + // Keep running; the next tick will retry. Alerting on sustained failure is better done via + // metrics (e.g. Prometheus counter) in a follow-up if product needs it. + setupLog.Error(err, "subscription namespace maintenance failed", "namespace", m.namespace) + } + } + run() + ticker := time.NewTicker(m.interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + run() + } + } +} + // getClusterServiceAccountIssuer fetches the cluster's service account issuer from OpenShift/ROSA configuration. // Returns empty string if not found or not running on OpenShift/ROSA. // Uses client.Reader (not client.Client) so it can be called before the manager cache starts. @@ -148,6 +325,61 @@ func getClusterServiceAccountIssuer(c client.Reader) (string, error) { return issuer, nil } +// ensureDefaultTenantRunnable returns a manager.Runnable that periodically ensures the +// default-tenant CR exists. If the Tenant is deleted (e.g. during testing or operator +// lifecycle), it will be recreated on the next tick. 
+func ensureDefaultTenantRunnable(mgr ctrl.Manager, tenantNamespace string) manager.RunnableFunc { + return func(ctx context.Context) error { + log := ctrl.Log.WithName("setup").WithName("ensureDefaultTenant") + c := mgr.GetClient() + + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + + ensure := func() { + key := client.ObjectKey{Name: maasv1alpha1.TenantInstanceName, Namespace: tenantNamespace} + var existing maasv1alpha1.Tenant + if err := c.Get(ctx, key, &existing); err == nil { + return + } else if !errors.IsNotFound(err) { + log.Error(err, "failed to check for default-tenant") + return + } + + tenant := &maasv1alpha1.Tenant{ + TypeMeta: metav1.TypeMeta{ + APIVersion: maasv1alpha1.GroupVersion.String(), + Kind: maasv1alpha1.TenantKind, + }, + ObjectMeta: metav1.ObjectMeta{ + Name: maasv1alpha1.TenantInstanceName, + Namespace: tenantNamespace, + }, + } + tenantreconcile.EnsureTenantGatewayDefaults(tenant) + + if err := c.Create(ctx, tenant); err != nil { + if errors.IsAlreadyExists(err) { + return + } + log.Error(err, "failed to create default-tenant", "namespace", tenantNamespace) + return + } + log.Info("created default-tenant", "namespace", tenantNamespace) + } + + ensure() + for { + select { + case <-ctx.Done(): + return nil + case <-ticker.C: + ensure() + } + } + } +} + func main() { var metricsAddr string var enableLeaderElection bool @@ -159,6 +391,7 @@ func main() { var clusterAudience string var metadataCacheTTL int64 var authzCacheTTL int64 + var subscriptionNamespaceMaintainInterval time.Duration flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metrics endpoint binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") @@ -171,6 +404,9 @@ func main() { flag.StringVar(&clusterAudience, "cluster-audience", "https://kubernetes.default.svc", "The OIDC audience of the cluster for TokenReview. 
HyperShift/ROSA clusters use a custom OIDC provider URL.") flag.Int64Var(&metadataCacheTTL, "metadata-cache-ttl", 60, "TTL in seconds for Authorino metadata HTTP caching (apiKeyValidation, subscription-info).") flag.Int64Var(&authzCacheTTL, "authz-cache-ttl", 60, "TTL in seconds for Authorino OPA authorization caching (auth-valid, subscription-valid, require-group-membership).") + flag.DurationVar(&subscriptionNamespaceMaintainInterval, "subscription-namespace-maintain-interval", 30*time.Second, + "How often to re-check the subscription namespace while the manager is running (recreate if deleted). "+ + "Larger values reduce apiserver load; smaller values detect external deletions sooner.") opts := zap.Options{Development: false} opts.BindFlags(flag.CommandLine) @@ -178,21 +414,28 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) - // Ensure subscription namespace exists before starting controllers - if err := ensureSubscriptionNamespaceExists(context.Background(), maasSubscriptionNamespace); err != nil { + cfg := ctrl.GetConfigOrDie() + clientset, err := kubernetes.NewForConfig(cfg) + if err != nil { + setupLog.Error(err, "unable to create Kubernetes client for subscription namespace setup") + os.Exit(1) + } + if err := ensureSubscriptionNamespaceWithClient(context.Background(), maasSubscriptionNamespace, clientset); err != nil { setupLog.Error(err, "unable to ensure subscription namespace exists", "namespace", maasSubscriptionNamespace) os.Exit(1) } - setupLog.Info("watching namespace for MaaS AuthPolicy and MaaSSubscription", "namespace", maasSubscriptionNamespace) + setupLog.Info("watching namespace for MaaS CRs", "namespace", maasSubscriptionNamespace) + nsCfg := map[string]cache.Config{maasSubscriptionNamespace: {}} cacheOpts := cache.Options{ ByObject: map[client.Object]cache.ByObject{ - &maasv1alpha1.MaaSAuthPolicy{}: {Namespaces: map[string]cache.Config{maasSubscriptionNamespace: {}}}, - &maasv1alpha1.MaaSSubscription{}: {Namespaces: 
map[string]cache.Config{maasSubscriptionNamespace: {}}}, + &maasv1alpha1.Tenant{}: {Namespaces: nsCfg}, + &maasv1alpha1.MaaSAuthPolicy{}: {Namespaces: nsCfg}, + &maasv1alpha1.MaaSSubscription{}: {Namespaces: nsCfg}, }, } - mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ + mgr, err := ctrl.NewManager(cfg, ctrl.Options{ Scheme: scheme, Cache: cacheOpts, Metrics: metricsserver.Options{BindAddress: metricsAddr}, @@ -256,11 +499,50 @@ func main() { os.Exit(1) } + if err := mgr.Add(&subscriptionNamespaceMonitor{ + clientset: clientset, + namespace: maasSubscriptionNamespace, + interval: subscriptionNamespaceMaintainInterval, + needLeaderElection: enableLeaderElection, + }); err != nil { + setupLog.Error(err, "unable to add subscription namespace monitor") + os.Exit(1) + } + + // Ensure the default-tenant CR exists in the MaaS subscription namespace + // (same namespace as MaaSSubscription / MaaSAuthPolicy CRs). + // maas-controller owns creation; ODH operator only reads status and deletes on disable. 
+ if err := mgr.Add(ensureDefaultTenantRunnable(mgr, maasSubscriptionNamespace)); err != nil { + setupLog.Error(err, "unable to register ensureDefaultTenant runnable") + os.Exit(1) + } + + manifestPath := os.Getenv("MAAS_PLATFORM_MANIFESTS") + if manifestPath == "" { + manifestPath = tenantreconcile.DefaultManifestPath() + } + if abs, err := filepath.Abs(manifestPath); err == nil { + manifestPath = abs + } + setupLog.Info("Tenant platform kustomize path", "path", manifestPath) + + if err := (&maas.TenantReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + ManifestPath: manifestPath, + AppNamespace: maasAPINamespace, + TenantNamespace: maasSubscriptionNamespace, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Tenant") + os.Exit(1) + } + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { setupLog.Error(err, "unable to set up health check") os.Exit(1) } - if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { + // readyz: uncached Namespace GET each probe — see subscriptionNamespaceReadiness. 
+ if err := mgr.AddReadyzCheck("readyz", subscriptionNamespaceReadiness(clientset, maasSubscriptionNamespace)); err != nil { setupLog.Error(err, "unable to set up ready check") os.Exit(1) } diff --git a/maas-controller/go.mod b/maas-controller/go.mod index 1e527aedf..95f0c1cc1 100644 --- a/maas-controller/go.mod +++ b/maas-controller/go.mod @@ -5,13 +5,18 @@ go 1.25.0 require ( github.com/go-logr/logr v1.4.3 github.com/kserve/kserve v0.15.0 + github.com/onsi/gomega v1.37.0 github.com/stretchr/testify v1.11.1 k8s.io/api v0.33.1 + k8s.io/apiextensions-apiserver v0.33.1 k8s.io/apimachinery v0.33.1 k8s.io/client-go v0.33.1 knative.dev/pkg v0.0.0-20250326102644-9f3e60a9244c sigs.k8s.io/controller-runtime v0.20.4 sigs.k8s.io/gateway-api v1.2.1 + sigs.k8s.io/kustomize/api v0.19.0 + sigs.k8s.io/kustomize/kyaml v0.19.0 + sigs.k8s.io/yaml v1.4.0 ) require ( @@ -28,6 +33,7 @@ require ( github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.51.0 // indirect github.com/aws/aws-sdk-go v1.55.6 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/blang/semver/v4 v4.0.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect @@ -38,6 +44,7 @@ require ( github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/fxamacker/cbor/v2 v2.8.0 // indirect + github.com/go-errors/errors v1.4.2 // indirect github.com/go-jose/go-jose/v4 v4.1.4 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-logr/zapr v1.3.0 // indirect @@ -51,6 +58,7 @@ require ( github.com/google/gnostic-models v0.6.9 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/s2a-go v0.1.9 // indirect + github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect github.com/google/uuid v1.6.0 // indirect 
github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect github.com/googleapis/gax-go/v2 v2.14.1 // indirect @@ -62,8 +70,8 @@ require ( github.com/mailru/easyjson v0.9.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_golang v1.22.0 // indirect @@ -73,6 +81,7 @@ require ( github.com/spf13/pflag v1.0.6 // indirect github.com/spiffe/go-spiffe/v2 v2.6.0 // indirect github.com/x448/float16 v0.8.4 // indirect + github.com/xlab/treeprint v1.2.0 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/detectors/gcp v1.39.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.59.0 // indirect @@ -99,11 +108,10 @@ require ( google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect google.golang.org/grpc v1.79.3 // indirect google.golang.org/protobuf v1.36.10 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/go-playground/validator.v9 v9.31.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/apiextensions-apiserver v0.33.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250318190949-c8a335a9a2ff // indirect k8s.io/utils v0.0.0-20241210054802-24370beab758 // indirect @@ -112,7 +120,6 @@ require ( sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v4 
v4.7.0 // indirect - sigs.k8s.io/yaml v1.4.0 // indirect ) replace github.com/kserve/kserve => github.com/opendatahub-io/kserve v0.0.0-20260112171902-47894470ea49 diff --git a/maas-controller/go.sum b/maas-controller/go.sum index 125ae9529..7639959f2 100644 --- a/maas-controller/go.sum +++ b/maas-controller/go.sum @@ -42,6 +42,8 @@ github.com/aws/aws-sdk-go v1.55.6 h1:cSg4pvZ3m8dgYcgqB97MrcdjUmZ1BeMYKUxMMB89IPk github.com/aws/aws-sdk-go v1.55.6/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/blendle/zapdriver v1.3.1 h1:C3dydBOWYRiOk+B8X9IVZ5IOe+7cl+tGOexN4QqHfpE= github.com/blendle/zapdriver v1.3.1/go.mod h1:mdXfREi6u5MArG4j9fewC+FGnXaBR+T4Ox4J2u4eHCc= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= @@ -73,6 +75,8 @@ github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.8.0 h1:fFtUGXUzXPHTIUdne5+zzMPTfffl3RD5qYnkY40vtxU= github.com/fxamacker/cbor/v2 v2.8.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/go-errors/errors v1.4.2 h1:J6MZopCL4uSllY1OfXM374weqZFFItUbrImctkmUxIA= +github.com/go-errors/errors v1.4.2/go.mod h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-jose/go-jose/v4 v4.1.4 h1:moDMcTHmvE6Groj34emNPLs/qtYXRVcd6S7NHbHz3kA= github.com/go-jose/go-jose/v4 v4.1.4/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= @@ -131,6 +135,8 @@ github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 
h1:BHT72Gu3keYf3ZEu2J github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0= github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM= +github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4= +github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy v0.3.6 h1:GW/XbdyBFQ8Qe+YAmFU9uHLo7OnF5tL52HFAgMmyrf4= @@ -171,6 +177,8 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/2gBQ3RWajuToeY6ZtZTIKv2v7ThUy5KKusIT0yc0= +github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/onsi/ginkgo/v2 v2.23.4 h1:ktYTpKJAVZnDT4VjxSbiBenUjmlL/5QkBEocaWXiQus= @@ -198,6 +206,8 @@ github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzM github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/rogpeppe/go-internal v1.14.1 
h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= +github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= +github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spiffe/go-spiffe/v2 v2.6.0 h1:l+DolpxNWYgruGQVV0xsfeya3CsC7m8iBzDnMpsbLuo= @@ -206,10 +216,13 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ= +github.com/xlab/treeprint v1.2.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= @@ -367,8 +380,8 @@ google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/go-playground/assert.v1 v1.2.1 h1:xoYuJVE7KT85PYWrN730RguIQO0ePzVRfFMXadIrXTM= gopkg.in/go-playground/assert.v1 v1.2.1/go.mod h1:9RXL0bg/zibRAgZUYszZSwO/z8Y/a8bDuhia5mkpMnE= gopkg.in/go-playground/validator.v9 v9.31.0 h1:bmXmP2RSNtFES+bn4uYuHT7iJFJv7Vj+an+ZQdDaD1M= @@ -378,6 +391,7 @@ gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= @@ -412,6 +426,10 @@ sigs.k8s.io/gateway-api-inference-extension v0.3.0 h1:jLFNxWfG8GeosTa4KWOMr4eTIL sigs.k8s.io/gateway-api-inference-extension v0.3.0/go.mod h1:x6g5FKSs4MsivsIAZJigVEjrvDAtgxNNynoWyid4v28= sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= +sigs.k8s.io/kustomize/api v0.19.0 
h1:F+2HB2mU1MSiR9Hp1NEgoU2q9ItNOaBJl0I4Dlus5SQ= +sigs.k8s.io/kustomize/api v0.19.0/go.mod h1:/BbwnivGVcBh1r+8m3tH1VNxJmHSk1PzP5fkP6lbL1o= +sigs.k8s.io/kustomize/kyaml v0.19.0 h1:RFge5qsO1uHhwJsu3ipV7RNolC7Uozc0jUBC/61XSlA= +sigs.k8s.io/kustomize/kyaml v0.19.0/go.mod h1:FeKD5jEOH+FbZPpqUghBP8mrLjJ3+zD3/rf9NNu1cwY= sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= diff --git a/maas-controller/pkg/controller/maas/cross_namespace_test.go b/maas-controller/pkg/controller/maas/cross_namespace_test.go index 5edf9e0a0..a2bd92c2e 100644 --- a/maas-controller/pkg/controller/maas/cross_namespace_test.go +++ b/maas-controller/pkg/controller/maas/cross_namespace_test.go @@ -42,7 +42,7 @@ func TestMaaSAuthPolicyReconciler_CrossNamespace(t *testing.T) { modelNamespaceA = "model-ns-a" modelNamespaceB = "model-ns-b" modelName = "test-model" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName authPolicyName = "maas-auth-" + modelName maasPolicyName = "cross-ns-policy" ) @@ -139,7 +139,7 @@ func TestMaaSAuthPolicyReconciler_SelectiveModelManagement(t *testing.T) { modelNamespaceA = "model-ns-a" modelNamespaceB = "model-ns-b" modelName = "test-model" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName authPolicyName = "maas-auth-" + modelName maasPolicyName = "selective-policy" ) @@ -213,7 +213,7 @@ func TestMaaSAuthPolicyReconciler_SameNameDifferentNamespaces(t *testing.T) { modelName = "shared-model" namespaceA = "team-a" namespaceB = "team-b" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName authPolicyName = "maas-auth-" + modelName ) @@ -305,7 +305,7 @@ func TestMaaSSubscriptionReconciler_CrossNamespace(t *testing.T) { modelNamespaceA = "model-ns-a" modelNamespaceB = "model-ns-b" modelName = "test-model" 
- httpRouteName = "maas-model-" + modelName + httpRouteName = modelName trlpName = "maas-trlp-" + modelName subName = "cross-ns-subscription" ) @@ -417,7 +417,7 @@ func TestMaaSSubscriptionReconciler_DuplicateNameIsolation(t *testing.T) { const ( modelName = "llm" modelNamespace = "models" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName trlpName = "maas-trlp-" + modelName subscriptionName = "gold" // SAME name in both namespaces namespaceA = "tenant-a" @@ -538,7 +538,7 @@ func TestMaaSSubscriptionReconciler_DuplicateNameIsolation(t *testing.T) { if !ok { t.Fatal("predicate is not string") } - expectedPredA := `auth.identity.selected_subscription_key == "` + namespaceA + "/" + subscriptionName + "@" + modelNamespace + "/" + modelName + `"` + expectedPredA := `auth.identity.selected_subscription_key == "` + namespaceA + "/" + subscriptionName + "@" + modelNamespace + "/" + modelName + `" && !request.path.endsWith("/v1/models")` if pred != expectedPredA { t.Errorf("Tenant-a predicate = %q, want %q", pred, expectedPredA) } @@ -564,7 +564,7 @@ func TestMaaSSubscriptionReconciler_DuplicateNameIsolation(t *testing.T) { if !ok { t.Fatal("predicate is not string") } - expectedPredB := `auth.identity.selected_subscription_key == "` + namespaceB + "/" + subscriptionName + "@" + modelNamespace + "/" + modelName + `"` + expectedPredB := `auth.identity.selected_subscription_key == "` + namespaceB + "/" + subscriptionName + "@" + modelNamespace + "/" + modelName + `" && !request.path.endsWith("/v1/models")` if pred != expectedPredB { t.Errorf("Tenant-b predicate = %q, want %q", pred, expectedPredB) } @@ -610,7 +610,7 @@ func TestMaaSModelRefDeletion_CrossNamespaceIsolation(t *testing.T) { modelName = "shared-model" namespaceA = "team-a" namespaceB = "team-b" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName authPolicyName = "maas-auth-" + modelName ) diff --git a/maas-controller/pkg/controller/maas/helpers_test.go 
b/maas-controller/pkg/controller/maas/helpers_test.go index 38c1de3fb..90d98e0fd 100644 --- a/maas-controller/pkg/controller/maas/helpers_test.go +++ b/maas-controller/pkg/controller/maas/helpers_test.go @@ -18,6 +18,7 @@ package maas import ( "context" + "regexp" "testing" "time" @@ -91,6 +92,97 @@ func TestDeletionTimestampSet(t *testing.T) { } } +// TestTokenRateLimitWindowPattern validates the kubebuilder regex pattern applied to +// TokenRateLimit.Window (defined in maassubscription_types.go). +// +// Background: MaaSSubscription.tokenRateLimits[].window values are passed through +// verbatim into Kuadrant TokenRateLimitPolicy rates[].window. Kuadrant only accepts +// s (seconds), m (minutes), and h (hours) with short numeric segments. The previous +// pattern (^(\d+)(s|m|h|d)$) allowed d (days) and unbounded numbers, both of which +// Kuadrant rejects at TRLP apply time. The tightened pattern (^[1-9]\d{0,3}(s|m|h)$) +// ensures CRD admission catches invalid values before they reach the controller. +// +// Pattern breakdown: +// - ^[1-9] — first digit must be 1-9 (no leading zeros, no zero window) +// - \d{0,3} — up to 3 more digits (total 1-4 digits → range 1-9999) +// - (s|m|h) — only Kuadrant-compatible time units +// - $ — no trailing characters +func TestTokenRateLimitWindowPattern(t *testing.T) { + // This must stay in sync with the +kubebuilder:validation:Pattern marker on + // TokenRateLimit.Window in maassubscription_types.go. If the marker changes, + // update this constant and re-run the test to verify. 
+ windowPattern := regexp.MustCompile(`^[1-9]\d{0,3}(s|m|h)$`) + + tests := []struct { + name string + value string + valid bool + }{ + // --- valid: each Kuadrant-accepted unit with typical values --- + {"1 second", "1s", true}, + {"1 minute", "1m", true}, + {"1 hour", "1h", true}, + {"30 seconds", "30s", true}, + {"5 minutes", "5m", true}, + {"24 hours", "24h", true}, // common replacement for "1d" + + // --- valid: numeric boundary values (1-9999) --- + {"max 4-digit value", "9999h", true}, // upper boundary + {"3-digit value", "100m", true}, + {"2-digit value", "10s", true}, + {"single digit", "9s", true}, // lower boundary (besides 1) + + // --- invalid: days unit --- + // Previously allowed by the old pattern. Kuadrant does not support "d"; + // users should convert to hours (e.g. "1d" → "24h", "7d" → "168h"). + {"days not allowed", "1d", false}, + {"7 days not allowed", "7d", false}, + {"30 days not allowed", "30d", false}, + + // --- invalid: leading zero --- + // Leading zeros produce ambiguous values and are not valid Kuadrant input. + {"leading zero", "01m", false}, + {"leading zero hours", "024h", false}, + + // --- invalid: zero value --- + // A zero-length window is meaningless for rate limiting. + {"zero seconds", "0s", false}, + {"zero minutes", "0m", false}, + {"zero hours", "0h", false}, + + // --- invalid: exceeds 4-digit cap --- + // Kuadrant rejects oversized numeric segments. The pattern caps at 9999. + {"5-digit value", "10000s", false}, + {"6-digit value", "100000m", false}, + + // --- invalid: unsupported units --- + // Kuadrant does not accept milliseconds, and the pattern is case-sensitive. + {"milliseconds not allowed", "100ms", false}, + {"uppercase day", "1D", false}, + {"weeks not allowed", "1w", false}, + + // --- invalid: malformed input --- + // Catch-all cases for input that doesn't match the expected format at all. 
+ {"no unit", "100", false}, + {"no number", "m", false}, + {"empty string", "", false}, + {"leading whitespace", " 1m", false}, + {"trailing whitespace", "1m ", false}, + {"decimal", "1.5h", false}, + {"negative", "-1m", false}, + {"go duration", "1h30m", false}, // compound durations are not supported + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := windowPattern.MatchString(tt.value) + if got != tt.valid { + t.Errorf("windowPattern.MatchString(%q) = %v, want %v", tt.value, got, tt.valid) + } + }) + } +} + func TestValidateCELValue(t *testing.T) { tests := []struct { name string diff --git a/maas-controller/pkg/controller/maas/maasauthpolicy_controller.go b/maas-controller/pkg/controller/maas/maasauthpolicy_controller.go index 69ea62f89..9adcdf1b4 100644 --- a/maas-controller/pkg/controller/maas/maasauthpolicy_controller.go +++ b/maas-controller/pkg/controller/maas/maasauthpolicy_controller.go @@ -165,18 +165,82 @@ func (r *MaaSAuthPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque statusSnapshot := policy.Status.DeepCopy() + // Track missing models to include in status even when reconciliation skips them + missingModels := r.findMissingModelRefs(ctx, policy) + refs, err := r.reconcileModelAuthPolicies(ctx, log, policy) if err != nil { log.Error(err, "failed to reconcile model AuthPolicies") - r.updateStatus(ctx, policy, "Failed", fmt.Sprintf("Failed to reconcile: %v", err), statusSnapshot) + r.updateStatus(ctx, policy, maasv1alpha1.PhaseFailed, fmt.Sprintf("Failed to reconcile: %v", err), statusSnapshot) return ctrl.Result{}, err } + // Update per-AuthPolicy status r.updateAuthPolicyRefStatus(ctx, log, policy, refs) - r.updateStatus(ctx, policy, "Active", "Successfully reconciled", statusSnapshot) + + // Derive final phase based on model and AuthPolicy health + phase, message := r.deriveAuthPolicyPhase(policy, missingModels) + r.updateStatus(ctx, policy, phase, message, statusSnapshot) return ctrl.Result{}, nil } +// 
findMissingModelRefs returns a list of model refs that don't exist or couldn't be fetched. +// Treats both NotFound and transient errors as "missing" to fail-safe (avoid falsely reporting Active). +func (r *MaaSAuthPolicyReconciler) findMissingModelRefs(ctx context.Context, policy *maasv1alpha1.MaaSAuthPolicy) []maasv1alpha1.ModelRef { + log := logr.FromContextOrDiscard(ctx) + var missing []maasv1alpha1.ModelRef + for _, ref := range policy.Spec.ModelRefs { + model := &maasv1alpha1.MaaSModelRef{} + if err := r.Get(ctx, types.NamespacedName{Namespace: ref.Namespace, Name: ref.Name}, model); err != nil { + // Treat both NotFound and transient errors as missing to fail-safe + if !apierrors.IsNotFound(err) { + log.Error(err, "transient error fetching MaaSModelRef, treating as missing", "model", ref.Namespace+"/"+ref.Name) + } + missing = append(missing, ref) + } + } + return missing +} + +// deriveAuthPolicyPhase determines the MaaSAuthPolicy phase based on model and AuthPolicy health. +func (r *MaaSAuthPolicyReconciler) deriveAuthPolicyPhase(policy *maasv1alpha1.MaaSAuthPolicy, missingModels []maasv1alpha1.ModelRef) (phase maasv1alpha1.Phase, message string) { + totalModels := len(policy.Spec.ModelRefs) + missingCount := len(missingModels) + validModels := totalModels - missingCount + + // All models missing -> Failed + if validModels == 0 { + return maasv1alpha1.PhaseFailed, fmt.Sprintf("all %d model references are invalid or missing", totalModels) + } + + // Check AuthPolicy health for valid models + var healthyPolicies, unhealthyPolicies int + for _, ap := range policy.Status.AuthPolicies { + if ap.Ready { + healthyPolicies++ + } else { + unhealthyPolicies++ + } + } + + // Some models missing -> Degraded + if missingCount > 0 { + return maasv1alpha1.PhaseDegraded, fmt.Sprintf("%d of %d model references are missing", missingCount, totalModels) + } + + // All models valid but some AuthPolicies unhealthy -> Degraded + if unhealthyPolicies > 0 { + return 
maasv1alpha1.PhaseDegraded, fmt.Sprintf("%d of %d AuthPolicies not accepted/enforced", unhealthyPolicies, len(policy.Status.AuthPolicies)) + } + + // No AuthPolicies generated yet -> Degraded + if healthyPolicies == 0 { + return maasv1alpha1.PhaseDegraded, "no generated AuthPolicies attached to models" + } + + return maasv1alpha1.PhaseActive, "successfully reconciled" +} + type authPolicyRef struct { Name string Namespace string @@ -397,12 +461,24 @@ allow { }, } - // Fail-close: require successful subscription selection (name must be present) + // Fail-close: require successful subscription selection AND health checks + // Allowlist approach: only Active and Degraded phases are permitted + // Rejects Failed, Pending, empty (unreconciled), unknown phases, and deleting subscriptions authRules["subscription-valid"] = map[string]any{ "metrics": false, "priority": int64(0), "opa": map[string]any{ - "rego": `allow { object.get(input.auth.metadata["subscription-info"], "name", "") != "" }`, + "rego": `allow { + # Subscription name must be present (selector succeeded) + object.get(input.auth.metadata["subscription-info"], "name", "") != "" + # Error field must be empty (no validation errors from selector) + object.get(input.auth.metadata["subscription-info"], "error", "") == "" + # Allowlist: phase must be exactly "Active" or "Degraded" (reject empty/unreconciled) + phase := object.get(input.auth.metadata["subscription-info"], "phase", "") + any([phase == "Active", phase == "Degraded"]) + # Subscription must not be deleting + object.get(input.auth.metadata["subscription-info"], "deletionTimestamp", "") == "" +}`, }, // Cache authorization result keyed by subscription selection inputs. // Uses same key dimensions as subscription-info metadata to ensure cache coherence. 
@@ -773,26 +849,49 @@ func (r *MaaSAuthPolicyReconciler) updateAuthPolicyRefStatus(ctx context.Context ap.SetGroupVersionKind(schema.GroupVersionKind{Group: "kuadrant.io", Version: "v1", Kind: "AuthPolicy"}) ap.SetNamespace(ref.Namespace) ap.SetName(ref.Name) + + status := maasv1alpha1.AuthPolicyRefStatus{ + ResourceRefStatus: maasv1alpha1.ResourceRefStatus{ + Name: ref.Name, + Namespace: ref.Namespace, + }, + Model: ref.Model, + ModelNamespace: ref.ModelNamespace, + } + if err := r.Get(ctx, client.ObjectKeyFromObject(ap), ap); err != nil { log.Info("could not get AuthPolicy for status", "name", ref.Name, "namespace", ref.Namespace, "error", err) - policy.Status.AuthPolicies = append(policy.Status.AuthPolicies, maasv1alpha1.AuthPolicyRefStatus{ - Name: ref.Name, Namespace: ref.Namespace, Model: ref.Model, ModelNamespace: ref.ModelNamespace, Accepted: "Unknown", Enforced: "Unknown", - }) + status.Ready = false + if apierrors.IsNotFound(err) { + status.Reason = maasv1alpha1.ReasonNotFound + status.Message = "AuthPolicy not created yet" + } else { + status.Reason = maasv1alpha1.ReasonGetFailed + status.Message = fmt.Sprintf("failed to get AuthPolicy: %v", err) + } + policy.Status.AuthPolicies = append(policy.Status.AuthPolicies, status) continue } - accepted, enforced := getAuthPolicyConditionState(ap) - policy.Status.AuthPolicies = append(policy.Status.AuthPolicies, maasv1alpha1.AuthPolicyRefStatus{ - Name: ref.Name, Namespace: ref.Namespace, Model: ref.Model, ModelNamespace: ref.ModelNamespace, Accepted: accepted, Enforced: enforced, - }) + + ready, reason, message := getAuthPolicyReadyState(ap) + status.Ready = ready + status.Reason = reason + status.Message = message + policy.Status.AuthPolicies = append(policy.Status.AuthPolicies, status) } } -func getAuthPolicyConditionState(ap *unstructured.Unstructured) (accepted, enforced string) { - accepted, enforced = "Unknown", "Unknown" +// getAuthPolicyReadyState checks if an AuthPolicy is accepted and enforced. 
+// Returns ready=true only if both Accepted and Enforced conditions are True. +func getAuthPolicyReadyState(ap *unstructured.Unstructured) (ready bool, reason maasv1alpha1.ConditionReason, message string) { conditions, found, err := unstructured.NestedSlice(ap.Object, "status", "conditions") if err != nil || !found || len(conditions) == 0 { - return accepted, enforced + return false, maasv1alpha1.ReasonConditionsNotFound, "status conditions not available" } + + var accepted, enforced bool + var acceptedMsg, enforcedMsg string + for _, c := range conditions { cond, ok := c.(map[string]any) if !ok { @@ -800,30 +899,55 @@ func getAuthPolicyConditionState(ap *unstructured.Unstructured) (accepted, enfor } typ, _ := cond["type"].(string) status, _ := cond["status"].(string) + msg, _ := cond["message"].(string) + switch typ { case "Accepted": - accepted = status + accepted = status == "True" + if !accepted { + acceptedMsg = msg + } case "Enforced": - enforced = status + enforced = status == "True" + if !enforced { + enforcedMsg = msg + } } } - return accepted, enforced + + if accepted && enforced { + return true, maasv1alpha1.ReasonAcceptedEnforced, "" + } + if !accepted { + return false, maasv1alpha1.ReasonNotAccepted, acceptedMsg + } + return false, maasv1alpha1.ReasonNotEnforced, enforcedMsg } -func (r *MaaSAuthPolicyReconciler) updateStatus(ctx context.Context, policy *maasv1alpha1.MaaSAuthPolicy, phase, message string, statusSnapshot *maasv1alpha1.MaaSAuthPolicyStatus) { +func (r *MaaSAuthPolicyReconciler) updateStatus(ctx context.Context, policy *maasv1alpha1.MaaSAuthPolicy, phase maasv1alpha1.Phase, message string, statusSnapshot *maasv1alpha1.MaaSAuthPolicyStatus) { policy.Status.Phase = phase - status := metav1.ConditionTrue - reason := "Reconciled" - if phase == "Failed" { + var status metav1.ConditionStatus + var reason maasv1alpha1.ConditionReason + switch phase { + case maasv1alpha1.PhaseActive: + status = metav1.ConditionTrue + reason = 
maasv1alpha1.ReasonReconciled + case maasv1alpha1.PhaseDegraded: + status = metav1.ConditionFalse + reason = maasv1alpha1.ReasonPartialFailure + case maasv1alpha1.PhaseFailed: status = metav1.ConditionFalse - reason = "ReconcileFailed" + reason = maasv1alpha1.ReasonReconcileFailed + default: + status = metav1.ConditionUnknown + reason = maasv1alpha1.ReasonUnknown } apimeta.SetStatusCondition(&policy.Status.Conditions, metav1.Condition{ Type: "Ready", Status: status, - Reason: reason, + Reason: string(reason), Message: message, ObservedGeneration: policy.GetGeneration(), }) diff --git a/maas-controller/pkg/controller/maas/maasauthpolicy_controller_test.go b/maas-controller/pkg/controller/maas/maasauthpolicy_controller_test.go index 0a70e41e2..aa548ec06 100644 --- a/maas-controller/pkg/controller/maas/maasauthpolicy_controller_test.go +++ b/maas-controller/pkg/controller/maas/maasauthpolicy_controller_test.go @@ -21,6 +21,7 @@ import ( "testing" apierrors "k8s.io/apimachinery/pkg/api/errors" + apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" @@ -61,8 +62,8 @@ func TestMaaSAuthPolicyReconciler_ManagedAnnotation(t *testing.T) { const ( modelName = "llm" namespace = "default" - httpRouteName = "maas-model-" + modelName // ExternalModel naming convention - authPolicyName = "maas-auth-" + modelName // generated by the controller + httpRouteName = modelName // ExternalModel naming convention + authPolicyName = "maas-auth-" + modelName // generated by the controller maasPolicyName = "policy-a" ) @@ -144,7 +145,7 @@ func TestMaaSAuthPolicyReconciler_DuplicateReconciliation(t *testing.T) { const ( modelName = "llm" namespace = "default" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName authPolicyName = "maas-auth-" + modelName ) @@ -280,8 +281,8 @@ func TestMaaSAuthPolicyReconciler_RemoveModelRef(t *testing.T) { modelA = 
"model-a" modelB = "model-b" namespace = "default" - httpRouteA = "maas-model-" + modelA - httpRouteB = "maas-model-" + modelB + httpRouteA = modelA + httpRouteB = modelB authPolicyA = "maas-auth-" + modelA authPolicyB = "maas-auth-" + modelB maasPolicyName = "policy-1" @@ -359,8 +360,8 @@ func TestMaaSAuthPolicyReconciler_RemoveModelRef_Aggregation(t *testing.T) { modelA = "model-a" modelB = "model-b" namespace = "default" - httpRouteA = "maas-model-" + modelA - httpRouteB = "maas-model-" + modelB + httpRouteA = modelA + httpRouteB = modelB authPolicyB = "maas-auth-" + modelB ) @@ -453,7 +454,7 @@ func TestMaaSAuthPolicyReconciler_MultiplePoliciesDeletion(t *testing.T) { const ( modelName = "shared-model" modelNamespace = "llm" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName authPolicyName = "maas-auth-" + modelName policy1Name = "policy-1" policy2Name = "policy-2" @@ -565,7 +566,7 @@ func TestMaaSAuthPolicyReconciler_CachingConfiguration(t *testing.T) { const ( modelName = "llm" namespace = "default" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName authPolicyName = "maas-auth-" + modelName maasPolicyName = "policy-a" ) @@ -792,7 +793,7 @@ func TestMaaSAuthPolicyReconciler_CacheKeyIsolation(t *testing.T) { const ( modelName = "llm" namespace = "default" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName authPolicyName = "maas-auth-" + modelName maasPolicyName = "policy-a" ) @@ -976,11 +977,11 @@ func TestMaaSAuthPolicyReconciler_CacheKeyModelIsolation(t *testing.T) { model2Name := "llm-2" model1 := newMaaSModelRef(model1Name, namespace, "ExternalModel", model1Name) - route1 := newHTTPRoute("maas-model-"+model1Name, namespace) + route1 := newHTTPRoute(model1Name, namespace) policy1 := newMaaSAuthPolicy("policy-1", namespace, "team-a", maasv1alpha1.ModelRef{Name: model1Name, Namespace: namespace}) model2 := newMaaSModelRef(model2Name, namespace, "ExternalModel", model2Name) - route2 := 
newHTTPRoute("maas-model-"+model2Name, namespace) + route2 := newHTTPRoute(model2Name, namespace) policy2 := newMaaSAuthPolicy("policy-2", namespace, "team-a", maasv1alpha1.ModelRef{Name: model2Name, Namespace: namespace}) c := fake.NewClientBuilder(). @@ -1072,7 +1073,7 @@ func TestMaaSAuthPolicyReconciler_NoIdentityHeadersUpstream(t *testing.T) { const ( modelName = "llm" namespace = "default" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName authPolicyName = "maas-auth-" + modelName maasPolicyName = "policy-a" ) @@ -1232,3 +1233,202 @@ func contains(s, substr string) bool { return false }()) } + +// TestMaaSAuthPolicyReconciler_MissingModelRef_FailedPhase verifies that an auth policy +// with all missing model refs gets Failed phase. +func TestMaaSAuthPolicyReconciler_MissingModelRef_FailedPhase(t *testing.T) { + const ( + namespace = "default" + maasAuthName = "auth-missing" + missingModel = "non-existent-model" + ) + + // Create auth policy referencing a non-existent model + maasAuth := newMaaSAuthPolicy(maasAuthName, namespace, "team-a", + maasv1alpha1.ModelRef{Name: missingModel, Namespace: namespace}) + + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithRESTMapper(testRESTMapper()). + WithObjects(maasAuth). + WithStatusSubresource(&maasv1alpha1.MaaSAuthPolicy{}). 
+ Build() + + r := &MaaSAuthPolicyReconciler{ + Client: c, + Scheme: scheme, + MaaSAPINamespace: namespace, + GatewayName: "openshift-ingress/maas-default-gateway", + } + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: maasAuthName, Namespace: namespace}} + if _, err := r.Reconcile(context.Background(), req); err != nil { + t.Fatalf("Reconcile: unexpected error: %v", err) + } + + // Fetch updated auth policy + var policy maasv1alpha1.MaaSAuthPolicy + if err := c.Get(context.Background(), req.NamespacedName, &policy); err != nil { + t.Fatalf("Get MaaSAuthPolicy: %v", err) + } + + // Verify phase is Failed (all models missing) + if policy.Status.Phase != maasv1alpha1.PhaseFailed { + t.Errorf("expected phase Failed, got %q", policy.Status.Phase) + } + + // Verify Ready condition is False + readyCond := apimeta.FindStatusCondition(policy.Status.Conditions, "Ready") + if readyCond == nil { + t.Fatal("Ready condition not found") + } + if readyCond.Status != metav1.ConditionFalse { + t.Errorf("expected Ready=False, got %v", readyCond.Status) + } +} + +// TestMaaSAuthPolicyReconciler_PartialModelRefs_DegradedPhase verifies that an auth policy +// with some valid and some invalid model refs gets Degraded phase. +func TestMaaSAuthPolicyReconciler_PartialModelRefs_DegradedPhase(t *testing.T) { + const ( + namespace = "default" + maasAuthName = "auth-partial" + validModel = "valid-model" + missingModel = "missing-model" + httpRouteName = validModel + ) + + // Create valid model and route + model := newMaaSModelRef(validModel, namespace, "ExternalModel", validModel) + route := newHTTPRoute(httpRouteName, namespace) + + // Create auth policy referencing both valid and invalid models + maasAuth := newMaaSAuthPolicy(maasAuthName, namespace, "team-a", + maasv1alpha1.ModelRef{Name: validModel, Namespace: namespace}, + maasv1alpha1.ModelRef{Name: missingModel, Namespace: namespace}) + + c := fake.NewClientBuilder(). + WithScheme(scheme). 
+ WithRESTMapper(testRESTMapper()). + WithObjects(model, route, maasAuth). + WithStatusSubresource(&maasv1alpha1.MaaSAuthPolicy{}). + Build() + + r := &MaaSAuthPolicyReconciler{ + Client: c, + Scheme: scheme, + MaaSAPINamespace: namespace, + GatewayName: "openshift-ingress/maas-default-gateway", + } + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: maasAuthName, Namespace: namespace}} + if _, err := r.Reconcile(context.Background(), req); err != nil { + t.Fatalf("Reconcile: unexpected error: %v", err) + } + + var policy maasv1alpha1.MaaSAuthPolicy + if err := c.Get(context.Background(), req.NamespacedName, &policy); err != nil { + t.Fatalf("Get MaaSAuthPolicy: %v", err) + } + + // Verify phase is Degraded (partial functionality) + if policy.Status.Phase != maasv1alpha1.PhaseDegraded { + t.Errorf("expected phase Degraded, got %q", policy.Status.Phase) + } + + // Verify Ready condition is False with PartialFailure reason + readyCond := apimeta.FindStatusCondition(policy.Status.Conditions, "Ready") + if readyCond == nil { + t.Fatal("Ready condition not found") + } + if readyCond.Status != metav1.ConditionFalse { + t.Errorf("expected Ready=False, got %v", readyCond.Status) + } + if readyCond.Reason != "PartialFailure" { + t.Errorf("expected reason PartialFailure, got %q", readyCond.Reason) + } +} + +// TestMaaSAuthPolicyReconciler_AllValidModelRefs_ActivePhase verifies that an auth policy +// with all valid model refs and accepted/enforced AuthPolicy gets Active phase. 
+func TestMaaSAuthPolicyReconciler_AllValidModelRefs_ActivePhase(t *testing.T) { + const ( + namespace = "default" + maasAuthName = "auth-valid" + modelName = "valid-model" + httpRouteName = modelName + authPolicyName = "maas-auth-" + modelName + ) + + model := newMaaSModelRef(modelName, namespace, "ExternalModel", modelName) + route := newHTTPRoute(httpRouteName, namespace) + maasAuth := newMaaSAuthPolicy(maasAuthName, namespace, "team-a", + maasv1alpha1.ModelRef{Name: modelName, Namespace: namespace}) + + // Pre-create AuthPolicy with Accepted=True and Enforced=True (simulates Kuadrant accepting) + existingAP := newPreexistingAuthPolicy(authPolicyName, namespace, modelName, map[string]string{ + "maas.opendatahub.io/auth-policies": maasAuthName, + }) + if err := unstructured.SetNestedSlice(existingAP.Object, []any{ + map[string]any{ + "type": "Accepted", + "status": "True", + }, + map[string]any{ + "type": "Enforced", + "status": "True", + }, + }, "status", "conditions"); err != nil { + t.Fatalf("SetNestedSlice status.conditions: %v", err) + } + + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithRESTMapper(testRESTMapper()). + WithObjects(model, route, maasAuth, existingAP). + WithStatusSubresource(&maasv1alpha1.MaaSAuthPolicy{}). 
+ Build() + + r := &MaaSAuthPolicyReconciler{ + Client: c, + Scheme: scheme, + MaaSAPINamespace: namespace, + GatewayName: "openshift-ingress/maas-default-gateway", + } + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: maasAuthName, Namespace: namespace}} + if _, err := r.Reconcile(context.Background(), req); err != nil { + t.Fatalf("Reconcile: unexpected error: %v", err) + } + + var policy maasv1alpha1.MaaSAuthPolicy + if err := c.Get(context.Background(), req.NamespacedName, &policy); err != nil { + t.Fatalf("Get MaaSAuthPolicy: %v", err) + } + + // Verify phase is Active + if policy.Status.Phase != maasv1alpha1.PhaseActive { + t.Errorf("expected phase Active, got %q", policy.Status.Phase) + } + + // Verify Ready condition is True + readyCond := apimeta.FindStatusCondition(policy.Status.Conditions, "Ready") + if readyCond == nil { + t.Fatal("Ready condition not found") + } + if readyCond.Status != metav1.ConditionTrue { + t.Errorf("expected Ready=True, got %v", readyCond.Status) + } + + // Verify authPolicies status is populated with Ready=true + if len(policy.Status.AuthPolicies) != 1 { + t.Fatalf("expected 1 authPolicy status, got %d", len(policy.Status.AuthPolicies)) + } + apStatus := policy.Status.AuthPolicies[0] + if apStatus.Model != modelName { + t.Errorf("expected model %q, got %q", modelName, apStatus.Model) + } + if !apStatus.Ready { + t.Error("expected authPolicies[0].Ready=true") + } + if apStatus.Reason != maasv1alpha1.ReasonAcceptedEnforced { + t.Errorf("expected reason %q, got %q", maasv1alpha1.ReasonAcceptedEnforced, apStatus.Reason) + } +} diff --git a/maas-controller/pkg/controller/maas/maasmodelref_controller.go b/maas-controller/pkg/controller/maas/maasmodelref_controller.go index 6ca5bf840..da65d6db8 100644 --- a/maas-controller/pkg/controller/maas/maasmodelref_controller.go +++ b/maas-controller/pkg/controller/maas/maasmodelref_controller.go @@ -43,12 +43,13 @@ import ( gatewayapiv1 "sigs.k8s.io/gateway-api/apis/v1" 
maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" + "github.com/opendatahub-io/models-as-a-service/maas-controller/pkg/platform/tenantreconcile" ) // Default gateway name and namespace when not set via flags. const ( - defaultGatewayName = "maas-default-gateway" - defaultGatewayNamespace = "openshift-ingress" + defaultGatewayName = tenantreconcile.DefaultGatewayName + defaultGatewayNamespace = tenantreconcile.DefaultGatewayNamespace defaultClusterAudience = "https://kubernetes.default.svc" ) diff --git a/maas-controller/pkg/controller/maas/maassubscription_controller.go b/maas-controller/pkg/controller/maas/maassubscription_controller.go index d08fa279c..8e7371ce0 100644 --- a/maas-controller/pkg/controller/maas/maassubscription_controller.go +++ b/maas-controller/pkg/controller/maas/maassubscription_controller.go @@ -72,6 +72,202 @@ const ( // (API key mint and selector use deterministic tie-break; admins should set distinct priorities). const ConditionSpecPriorityDuplicate = "SpecPriorityDuplicate" +// validateModelRefs checks each model reference and returns per-model status. 
+func (r *MaaSSubscriptionReconciler) validateModelRefs(ctx context.Context, subscription *maasv1alpha1.MaaSSubscription) []maasv1alpha1.ModelRefStatus { + statuses := make([]maasv1alpha1.ModelRefStatus, 0, len(subscription.Spec.ModelRefs)) + seen := make(map[string]struct{}) + + for _, ref := range subscription.Spec.ModelRefs { + key := ref.Namespace + "/" + ref.Name + if _, ok := seen[key]; ok { + continue + } + seen[key] = struct{}{} + + status := maasv1alpha1.ModelRefStatus{ + ResourceRefStatus: maasv1alpha1.ResourceRefStatus{ + Name: ref.Name, + Namespace: ref.Namespace, + }, + } + + model := &maasv1alpha1.MaaSModelRef{} + if err := r.Get(ctx, types.NamespacedName{Namespace: ref.Namespace, Name: ref.Name}, model); err != nil { + if apierrors.IsNotFound(err) { + status.Ready = false + status.Reason = maasv1alpha1.ReasonNotFound + status.Message = fmt.Sprintf("MaaSModelRef %s/%s not found", ref.Namespace, ref.Name) + } else { + status.Ready = false + status.Reason = maasv1alpha1.ReasonGetFailed + status.Message = fmt.Sprintf("failed to get MaaSModelRef: %v", err) + } + } else { + status.Ready = true + status.Reason = maasv1alpha1.ReasonValid + } + statuses = append(statuses, status) + } + return statuses +} + +// checkTokenRateLimitHealth checks the health of generated TokenRateLimitPolicies. 
+func (r *MaaSSubscriptionReconciler) checkTokenRateLimitHealth(ctx context.Context, subscription *maasv1alpha1.MaaSSubscription) []maasv1alpha1.TokenRateLimitStatus { + statuses := make([]maasv1alpha1.TokenRateLimitStatus, 0, len(subscription.Spec.ModelRefs)) + seen := make(map[string]struct{}) + + for _, ref := range subscription.Spec.ModelRefs { + key := ref.Namespace + "/" + ref.Name + if _, ok := seen[key]; ok { + continue + } + seen[key] = struct{}{} + + policyName := fmt.Sprintf("maas-trlp-%s", ref.Name) + status := maasv1alpha1.TokenRateLimitStatus{ + ResourceRefStatus: maasv1alpha1.ResourceRefStatus{ + Name: policyName, + Namespace: ref.Namespace, + }, + Model: ref.Name, + } + + // Find the TRLP for this model (TRLP lives in HTTPRoute namespace) + _, httpRouteNS, err := findHTTPRouteForModel(ctx, r.Client, ref.Namespace, ref.Name) + if err != nil { + // Record status even when HTTPRoute not found - makes diagnosing issues easier + status.Ready = false + if errors.Is(err, ErrHTTPRouteNotFound) || errors.Is(err, ErrModelNotFound) { + status.Reason = maasv1alpha1.ReasonBackendNotReady + status.Message = fmt.Sprintf("HTTPRoute not found yet; TokenRateLimitPolicy cannot be created: %v", err) + } else { + status.Reason = maasv1alpha1.ReasonGetFailed + status.Message = fmt.Sprintf("failed to find HTTPRoute for model: %v", err) + } + statuses = append(statuses, status) + continue + } + status.Namespace = httpRouteNS + + trlp := &unstructured.Unstructured{} + trlp.SetGroupVersionKind(schema.GroupVersionKind{Group: "kuadrant.io", Version: "v1alpha1", Kind: "TokenRateLimitPolicy"}) + + if err := r.Get(ctx, types.NamespacedName{Name: policyName, Namespace: httpRouteNS}, trlp); err != nil { + if apierrors.IsNotFound(err) { + status.Ready = false + status.Reason = maasv1alpha1.ReasonNotFound + status.Message = "TokenRateLimitPolicy not created yet" + } else { + status.Ready = false + status.Reason = maasv1alpha1.ReasonGetFailed + status.Message = fmt.Sprintf("failed to 
get TokenRateLimitPolicy: %v", err) + } + } else { + // Check Accepted condition from TRLP status + accepted, message := getTRLPAcceptedCondition(trlp) + status.Ready = accepted + if accepted { + status.Reason = maasv1alpha1.ReasonAccepted + } else { + status.Reason = maasv1alpha1.ReasonNotAccepted + status.Message = message + } + } + statuses = append(statuses, status) + } + return statuses +} + +// getTRLPAcceptedCondition extracts the Accepted condition from a TokenRateLimitPolicy. +func getTRLPAcceptedCondition(trlp *unstructured.Unstructured) (accepted bool, message string) { + status, found, err := unstructured.NestedMap(trlp.Object, "status") + if err != nil || !found { + return false, "status not available" + } + + conditions, found, err := unstructured.NestedSlice(status, "conditions") + if err != nil || !found { + return false, "conditions not found" + } + + for _, c := range conditions { + cond, ok := c.(map[string]any) + if !ok { + continue + } + if cond["type"] == "Accepted" { + if cond["status"] == "True" { + return true, "" + } + if msg, ok := cond["message"].(string); ok { + return false, msg + } + return false, "Accepted condition is False" + } + } + return false, "Accepted condition not found" +} + +// deriveFinalPhase determines the subscription phase based on model and TRLP statuses. 
+func deriveFinalPhase(modelStatuses []maasv1alpha1.ModelRefStatus, trlpStatuses []maasv1alpha1.TokenRateLimitStatus) (phase maasv1alpha1.Phase, message string) { + if len(modelStatuses) == 0 { + return maasv1alpha1.PhaseFailed, "no model references specified" + } + + // Build a set of models that validateModelRefs reported as valid + validModelSet := make(map[string]struct{}) + var validModels, invalidModels int + for _, s := range modelStatuses { + if s.Ready { + validModels++ + validModelSet[s.Name] = struct{}{} + } else { + invalidModels++ + } + } + + // Check TRLP health + // Also detect race condition: model reported as valid by validateModelRefs but + // deleted before checkTokenRateLimitHealth ran (TRLP reports BackendNotReady) + var healthyTRLPs, unhealthyTRLPs, modelsWithBackendIssues int + for _, s := range trlpStatuses { + if s.Ready { + healthyTRLPs++ + } else { + unhealthyTRLPs++ + // Only count as backend issue if the model was reported as valid + // (avoids double-counting models already marked as invalid) + if s.Reason == maasv1alpha1.ReasonBackendNotReady { + if _, wasValid := validModelSet[s.Model]; wasValid { + modelsWithBackendIssues++ + } + } + } + } + + // Adjust counts for race condition: models thought to be valid but actually unavailable + effectiveValidModels := validModels - modelsWithBackendIssues + effectiveInvalidModels := invalidModels + modelsWithBackendIssues + + // All models invalid -> Failed + if effectiveValidModels <= 0 { + return maasv1alpha1.PhaseFailed, fmt.Sprintf("all %d model references are invalid or unavailable", len(modelStatuses)) + } + + // Partial model failure -> Degraded + if effectiveInvalidModels > 0 { + return maasv1alpha1.PhaseDegraded, fmt.Sprintf("%d of %d model references are invalid or unavailable", effectiveInvalidModels, len(modelStatuses)) + } + + // All models valid but some TRLPs unhealthy (not due to backend issues) -> Degraded + trlpOnlyIssues := unhealthyTRLPs - modelsWithBackendIssues + if 
trlpOnlyIssues > 0 { + return maasv1alpha1.PhaseDegraded, fmt.Sprintf("%d of %d TokenRateLimitPolicies not accepted", trlpOnlyIssues, len(trlpStatuses)) + } + + return maasv1alpha1.PhaseActive, "successfully reconciled" +} + // Reconcile is part of the main kubernetes reconciliation loop func (r *MaaSSubscriptionReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { log := logr.FromContextOrDiscard(ctx).WithValues("MaaSSubscription", req.NamespacedName) @@ -100,15 +296,69 @@ func (r *MaaSSubscriptionReconciler) Reconcile(ctx context.Context, req ctrl.Req statusSnapshot := subscription.Status.DeepCopy() - // Reconcile TokenRateLimitPolicy for each model - // IMPORTANT: TokenRateLimitPolicy targets the HTTPRoute for each model - if err := r.reconcileTokenRateLimitPolicies(ctx, log, subscription); err != nil { - log.Error(err, "failed to reconcile TokenRateLimitPolicies") - r.updateStatus(ctx, subscription, "Failed", fmt.Sprintf("Failed to reconcile: %v", err), statusSnapshot) - return ctrl.Result{}, err + // Validate model references and populate per-model status + modelStatuses := r.validateModelRefs(ctx, subscription) + subscription.Status.ModelRefStatuses = modelStatuses + + // Check if we have any valid models to proceed with TRLP reconciliation + hasValidModels := false + for _, s := range modelStatuses { + if s.Ready { + hasValidModels = true + break + } } - r.updateStatus(ctx, subscription, "Active", "Successfully reconciled", statusSnapshot) + // Only reconcile TRLPs if we have valid models + if hasValidModels { + // Reconcile TokenRateLimitPolicy for each model + // IMPORTANT: TokenRateLimitPolicy targets the HTTPRoute for each model + if err := r.reconcileTokenRateLimitPolicies(ctx, log, subscription); err != nil { + log.Error(err, "failed to reconcile TokenRateLimitPolicies") + subscription.Status.Phase = maasv1alpha1.PhaseFailed + r.updateStatus(ctx, subscription, maasv1alpha1.PhaseFailed, fmt.Sprintf("failed to reconcile 
TokenRateLimitPolicies: %v", err), statusSnapshot) + return ctrl.Result{}, err + } + } else { + // No valid models - clean up any stale TRLPs from previous reconciliations + if err := r.cleanupStaleTRLPs(ctx, log, subscription); err != nil { + log.Error(err, "failed to clean up stale TokenRateLimitPolicies") + r.updateStatus(ctx, subscription, maasv1alpha1.PhaseFailed, fmt.Sprintf("failed to clean up stale TokenRateLimitPolicies: %v", err), statusSnapshot) + return ctrl.Result{}, err + } + } + + // Check TRLP health and populate status + trlpStatuses := r.checkTokenRateLimitHealth(ctx, subscription) + subscription.Status.TokenRateLimitStatuses = trlpStatuses + + // Correct stale modelRefStatuses: validateModelRefs may have reported a model + // as valid (informer cache still had it) while the model is actually being + // deleted (finalizer present). checkTokenRateLimitHealth detects this via + // findHTTPRouteForModel's deletionTimestamp check and reports BackendNotReady. + // Propagate that information back into modelRefStatuses so the status is + // consistent with the derived phase. 
+ backendNotReady := make(map[string]string, len(trlpStatuses)) + for _, ts := range trlpStatuses { + if ts.Reason == maasv1alpha1.ReasonBackendNotReady { + backendNotReady[ts.Namespace+"/"+ts.Model] = ts.Message + } + } + for i := range modelStatuses { + if modelStatuses[i].Ready { + if msg, found := backendNotReady[modelStatuses[i].Namespace+"/"+modelStatuses[i].Name]; found { + modelStatuses[i].Ready = false + modelStatuses[i].Reason = maasv1alpha1.ReasonNotFound + modelStatuses[i].Message = msg + } + } + } + subscription.Status.ModelRefStatuses = modelStatuses + + // Derive final phase based on model and TRLP health + phase, message := deriveFinalPhase(modelStatuses, trlpStatuses) + r.updateStatus(ctx, subscription, phase, message, statusSnapshot) + return ctrl.Result{}, nil } @@ -245,7 +495,10 @@ func (r *MaaSSubscriptionReconciler) reconcileTRLPForModel(ctx context.Context, "rates": si.rates, "when": []any{ map[string]any{ - "predicate": fmt.Sprintf(`auth.identity.selected_subscription_key == "%s"`, modelScopedRef), + // Exempt /v1/models endpoint from token rate limiting. + // This endpoint is used for model discovery/metadata and does not consume inference tokens. + // Users should be able to query model capabilities even when their token quota is exhausted. 
+ "predicate": fmt.Sprintf(`auth.identity.selected_subscription_key == "%s" && !request.path.endsWith("/v1/models")`, modelScopedRef), }, }, "counters": []any{ @@ -461,7 +714,7 @@ func (r *MaaSSubscriptionReconciler) handleDeletion(ctx context.Context, log log return ctrl.Result{}, nil } -func (r *MaaSSubscriptionReconciler) updateStatus(ctx context.Context, subscription *maasv1alpha1.MaaSSubscription, phase, message string, statusSnapshot *maasv1alpha1.MaaSSubscriptionStatus) { +func (r *MaaSSubscriptionReconciler) updateStatus(ctx context.Context, subscription *maasv1alpha1.MaaSSubscription, phase maasv1alpha1.Phase, message string, statusSnapshot *maasv1alpha1.MaaSSubscriptionStatus) { // Status-only updates do not bump metadata.generation, so this reconcile may not re-queue. // Merge SpecPriorityDuplicate from the API server so we do not clobber the async duplicate-priority scan. latest := &maasv1alpha1.MaaSSubscription{} @@ -473,17 +726,27 @@ func (r *MaaSSubscriptionReconciler) updateStatus(ctx context.Context, subscript subscription.Status.Phase = phase - status := metav1.ConditionTrue - reason := "Reconciled" - if phase == "Failed" { + var status metav1.ConditionStatus + var reason maasv1alpha1.ConditionReason + switch phase { + case maasv1alpha1.PhaseActive: + status = metav1.ConditionTrue + reason = maasv1alpha1.ReasonReconciled + case maasv1alpha1.PhaseDegraded: + status = metav1.ConditionFalse + reason = maasv1alpha1.ReasonPartialFailure + case maasv1alpha1.PhaseFailed: status = metav1.ConditionFalse - reason = "ReconcileFailed" + reason = maasv1alpha1.ReasonReconcileFailed + default: + status = metav1.ConditionUnknown + reason = maasv1alpha1.ReasonUnknown } apimeta.SetStatusCondition(&subscription.Status.Conditions, metav1.Condition{ Type: "Ready", Status: status, - Reason: reason, + Reason: string(reason), Message: message, ObservedGeneration: subscription.GetGeneration(), }) diff --git 
a/maas-controller/pkg/controller/maas/maassubscription_controller_test.go b/maas-controller/pkg/controller/maas/maassubscription_controller_test.go index 17e93a441..608d46866 100644 --- a/maas-controller/pkg/controller/maas/maassubscription_controller_test.go +++ b/maas-controller/pkg/controller/maas/maassubscription_controller_test.go @@ -80,8 +80,8 @@ func TestMaaSSubscriptionReconciler_ManagedAnnotation(t *testing.T) { const ( modelName = "llm" namespace = "default" - httpRouteName = "maas-model-" + modelName // ExternalModel naming convention - trlpName = "maas-trlp-" + modelName // generated by the controller + httpRouteName = modelName // ExternalModel naming convention + trlpName = "maas-trlp-" + modelName // generated by the controller maasSubName = "sub-a" ) @@ -167,7 +167,7 @@ func TestMaaSSubscriptionReconciler_DuplicateReconciliation(t *testing.T) { const ( modelName = "llm" namespace = "default" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName trlpName = "maas-trlp-" + modelName ) @@ -226,7 +226,7 @@ func TestMaaSSubscriptionReconciler_SpecPriorityDuplicateCondition(t *testing.T) const ( modelName = "llm" namespace = "default" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName ) model := newMaaSModelRef(modelName, namespace, "ExternalModel", modelName) @@ -372,8 +372,8 @@ func TestMaaSSubscriptionReconciler_RemoveModelRef(t *testing.T) { modelA = "model-a" modelB = "model-b" namespace = "default" - httpRouteA = "maas-model-" + modelA - httpRouteB = "maas-model-" + modelB + httpRouteA = modelA + httpRouteB = modelB trlpA = "maas-trlp-" + modelA trlpB = "maas-trlp-" + modelB subName = "sub-1" @@ -460,8 +460,8 @@ func TestMaaSSubscriptionReconciler_RemoveModelRef_Aggregation(t *testing.T) { modelA = "model-a" modelB = "model-b" namespace = "default" - httpRouteA = "maas-model-" + modelA - httpRouteB = "maas-model-" + modelB + httpRouteA = modelA + httpRouteB = modelB trlpB = "maas-trlp-" + modelB ) @@ -567,7 
+567,7 @@ func TestMaaSSubscriptionReconciler_MultipleSubscriptionsDeletion(t *testing.T) const ( modelName = "shared-model" modelNamespace = "llm" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName trlpName = "maas-trlp-" + modelName sub1Name = "subscription-1" sub2Name = "subscription-2" @@ -708,7 +708,7 @@ func TestMaaSSubscriptionReconciler_SimplifiedTRLP(t *testing.T) { const ( modelName = "llm" namespace = "default" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName trlpName = "maas-trlp-" + modelName maasSubName = "sub-a" ) @@ -778,7 +778,8 @@ func TestMaaSSubscriptionReconciler_SimplifiedTRLP(t *testing.T) { } // Predicate now uses model-scoped key: namespace/name@modelNamespace/modelName - expected := fmt.Sprintf(`auth.identity.selected_subscription_key == "%s/%s@%s/%s"`, namespace, maasSubName, namespace, modelName) + // and exempts /v1/models endpoint from rate limiting + expected := fmt.Sprintf(`auth.identity.selected_subscription_key == "%s/%s@%s/%s" && !request.path.endsWith("/v1/models")`, namespace, maasSubName, namespace, modelName) if pred != expected { t.Errorf("predicate = %q, want %q", pred, expected) } @@ -802,7 +803,7 @@ func TestMaaSSubscriptionReconciler_MultipleSubscriptionsSimplified(t *testing.T const ( modelName = "llm" namespace = "default" - httpRouteName = "maas-model-" + modelName + httpRouteName = modelName trlpName = "maas-trlp-" + modelName ) @@ -866,7 +867,8 @@ func TestMaaSSubscriptionReconciler_MultipleSubscriptionsSimplified(t *testing.T t.Fatalf("sub-a predicate not a string: %T", predMap["predicate"]) } // Predicate now uses model-scoped key: namespace/name@modelNamespace/modelName - expected := fmt.Sprintf(`auth.identity.selected_subscription_key == "%s/sub-a@%s/%s"`, namespace, namespace, modelName) + // and exempts /v1/models endpoint from rate limiting + expected := fmt.Sprintf(`auth.identity.selected_subscription_key == "%s/sub-a@%s/%s" && !request.path.endsWith("/v1/models")`, 
namespace, namespace, modelName) if pred != expected { t.Errorf("sub-a predicate = %q, want %q", pred, expected) } @@ -901,7 +903,8 @@ func TestMaaSSubscriptionReconciler_MultipleSubscriptionsSimplified(t *testing.T t.Fatalf("sub-b predicate not a string: %T", predMap["predicate"]) } // Predicate now uses model-scoped key: namespace/name@modelNamespace/modelName - expected := fmt.Sprintf(`auth.identity.selected_subscription_key == "%s/sub-b@%s/%s"`, namespace, namespace, modelName) + // and exempts /v1/models endpoint from rate limiting + expected := fmt.Sprintf(`auth.identity.selected_subscription_key == "%s/sub-b@%s/%s" && !request.path.endsWith("/v1/models")`, namespace, namespace, modelName) if pred != expected { t.Errorf("sub-b predicate = %q, want %q", pred, expected) } @@ -929,3 +932,417 @@ func getKeys(m map[string]any) []string { } return keys } + +// TestMaaSSubscriptionReconciler_MissingModelRef_FailedPhase verifies that a subscription +// with all missing model refs gets Failed phase and correct modelRefStatuses. +func TestMaaSSubscriptionReconciler_MissingModelRef_FailedPhase(t *testing.T) { + const ( + namespace = "default" + maasSubName = "sub-missing" + ) + + // Create subscription referencing a non-existent model + maasSub := newMaaSSubscription(maasSubName, namespace, "team-a", "non-existent-model", 100) + + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithRESTMapper(testRESTMapper()). + WithObjects(maasSub). + WithStatusSubresource(&maasv1alpha1.MaaSSubscription{}). + WithIndex(&maasv1alpha1.MaaSSubscription{}, "spec.modelRef", subscriptionModelRefIndexer). 
+ Build() + + r := &MaaSSubscriptionReconciler{Client: c, Scheme: scheme} + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: maasSubName, Namespace: namespace}} + if _, err := r.Reconcile(context.Background(), req); err != nil { + t.Fatalf("Reconcile: unexpected error: %v", err) + } + + // Fetch updated subscription + var sub maasv1alpha1.MaaSSubscription + if err := c.Get(context.Background(), req.NamespacedName, &sub); err != nil { + t.Fatalf("Get MaaSSubscription: %v", err) + } + + // Verify phase is Failed + if sub.Status.Phase != maasv1alpha1.PhaseFailed { + t.Errorf("expected phase Failed, got %q", sub.Status.Phase) + } + + // Verify Ready condition is False with ReconcileFailed reason + readyCond := apimeta.FindStatusCondition(sub.Status.Conditions, "Ready") + if readyCond == nil { + t.Fatal("Ready condition not found") + } + if readyCond.Status != metav1.ConditionFalse { + t.Errorf("expected Ready=False, got %v", readyCond.Status) + } + + // Verify modelRefStatuses contains the missing model with NotFound reason + if len(sub.Status.ModelRefStatuses) != 1 { + t.Fatalf("expected 1 modelRefStatus, got %d", len(sub.Status.ModelRefStatuses)) + } + modelStatus := sub.Status.ModelRefStatuses[0] + if modelStatus.Name != "non-existent-model" { + t.Errorf("expected model name 'non-existent-model', got %q", modelStatus.Name) + } + if modelStatus.Ready { + t.Error("expected modelRefStatus.Ready=false") + } + if modelStatus.Reason != maasv1alpha1.ReasonNotFound { + t.Errorf("expected reason %q, got %q", maasv1alpha1.ReasonNotFound, modelStatus.Reason) + } +} + +// TestMaaSSubscriptionReconciler_DeletingModelRef_FailedPhase verifies that when a model +// has deletionTimestamp set (finalizer keeps it in the informer cache), the subscription +// corrects modelRefStatuses to ready=false based on TRLP BackendNotReady health. 
+func TestMaaSSubscriptionReconciler_DeletingModelRef_FailedPhase(t *testing.T) { + const ( + namespace = "default" + maasSubName = "sub-deleting" + modelName = "deleting-model" + ) + + // Model exists but is being deleted (deletionTimestamp set, finalizer present). + now := metav1.Now() + model := newMaaSModelRef(modelName, namespace, "ExternalModel", modelName) + model.DeletionTimestamp = &now + model.Finalizers = []string{"maas.opendatahub.io/model-cleanup"} + + maasSub := newMaaSSubscription(maasSubName, namespace, "team-a", modelName, 100) + + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithRESTMapper(testRESTMapper()). + WithObjects(maasSub, model). + WithStatusSubresource(&maasv1alpha1.MaaSSubscription{}). + WithIndex(&maasv1alpha1.MaaSSubscription{}, "spec.modelRef", subscriptionModelRefIndexer). + Build() + + r := &MaaSSubscriptionReconciler{Client: c, Scheme: scheme} + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: maasSubName, Namespace: namespace}} + if _, err := r.Reconcile(context.Background(), req); err != nil { + t.Fatalf("Reconcile: unexpected error: %v", err) + } + + var sub maasv1alpha1.MaaSSubscription + if err := c.Get(context.Background(), req.NamespacedName, &sub); err != nil { + t.Fatalf("Get MaaSSubscription: %v", err) + } + + // Phase must be Failed — model backend is gone + if sub.Status.Phase != maasv1alpha1.PhaseFailed { + t.Errorf("expected phase Failed, got %q", sub.Status.Phase) + } + + // modelRefStatuses must reflect the deletion even though the object is + // still in the cache (correction via TRLP BackendNotReady health). 
+ if len(sub.Status.ModelRefStatuses) != 1 { + t.Fatalf("expected 1 modelRefStatus, got %d", len(sub.Status.ModelRefStatuses)) + } + modelStatus := sub.Status.ModelRefStatuses[0] + if modelStatus.Ready { + t.Error("expected modelRefStatus.Ready=false for deleting model") + } + if modelStatus.Reason != maasv1alpha1.ReasonNotFound { + t.Errorf("expected reason %q, got %q", maasv1alpha1.ReasonNotFound, modelStatus.Reason) + } +} + +// TestMaaSSubscriptionReconciler_PartialModelRefs_DegradedPhase verifies that a subscription +// with some valid and some invalid model refs gets Degraded phase. +func TestMaaSSubscriptionReconciler_PartialModelRefs_DegradedPhase(t *testing.T) { + const ( + namespace = "default" + maasSubName = "sub-partial" + validModel = "valid-model" + missingModel = "missing-model" + httpRouteName = validModel + ) + + // Create valid model and route + model := newMaaSModelRef(validModel, namespace, "ExternalModel", validModel) + route := newHTTPRoute(httpRouteName, namespace) + + // Create subscription referencing both valid and invalid models + maasSub := &maasv1alpha1.MaaSSubscription{ + ObjectMeta: metav1.ObjectMeta{Name: maasSubName, Namespace: namespace}, + Spec: maasv1alpha1.MaaSSubscriptionSpec{ + Owner: maasv1alpha1.OwnerSpec{ + Groups: []maasv1alpha1.GroupReference{{Name: "team-a"}}, + }, + ModelRefs: []maasv1alpha1.ModelSubscriptionRef{ + {Name: validModel, Namespace: namespace, TokenRateLimits: []maasv1alpha1.TokenRateLimit{{Limit: 100, Window: "1m"}}}, + {Name: missingModel, Namespace: namespace, TokenRateLimits: []maasv1alpha1.TokenRateLimit{{Limit: 100, Window: "1m"}}}, + }, + }, + } + + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithRESTMapper(testRESTMapper()). + WithObjects(model, route, maasSub). + WithStatusSubresource(&maasv1alpha1.MaaSSubscription{}). + WithIndex(&maasv1alpha1.MaaSSubscription{}, "spec.modelRef", subscriptionModelRefIndexer). 
+ Build() + + r := &MaaSSubscriptionReconciler{Client: c, Scheme: scheme} + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: maasSubName, Namespace: namespace}} + if _, err := r.Reconcile(context.Background(), req); err != nil { + t.Fatalf("Reconcile: unexpected error: %v", err) + } + + // Fetch updated subscription + var sub maasv1alpha1.MaaSSubscription + if err := c.Get(context.Background(), req.NamespacedName, &sub); err != nil { + t.Fatalf("Get MaaSSubscription: %v", err) + } + + // Verify phase is Degraded (partial functionality) + if sub.Status.Phase != maasv1alpha1.PhaseDegraded { + t.Errorf("expected phase Degraded, got %q", sub.Status.Phase) + } + + // Verify Ready condition is False with PartialFailure reason + readyCond := apimeta.FindStatusCondition(sub.Status.Conditions, "Ready") + if readyCond == nil { + t.Fatal("Ready condition not found") + } + if readyCond.Status != metav1.ConditionFalse { + t.Errorf("expected Ready=False, got %v", readyCond.Status) + } + if readyCond.Reason != "PartialFailure" { + t.Errorf("expected reason PartialFailure, got %q", readyCond.Reason) + } + + // Verify modelRefStatuses contains both models with correct status + if len(sub.Status.ModelRefStatuses) != 2 { + t.Fatalf("expected 2 modelRefStatuses, got %d", len(sub.Status.ModelRefStatuses)) + } + + // Find and verify each status + var foundValid, foundMissing bool + for _, status := range sub.Status.ModelRefStatuses { + switch status.Name { + case validModel: + foundValid = true + if !status.Ready { + t.Errorf("expected valid model Ready=true") + } + if status.Reason != maasv1alpha1.ReasonValid { + t.Errorf("expected valid model reason %q, got %q", maasv1alpha1.ReasonValid, status.Reason) + } + case missingModel: + foundMissing = true + if status.Ready { + t.Errorf("expected missing model Ready=false") + } + if status.Reason != maasv1alpha1.ReasonNotFound { + t.Errorf("expected missing model reason %q, got %q", maasv1alpha1.ReasonNotFound, status.Reason) + 
} + } + } + if !foundValid { + t.Error("valid model status not found") + } + if !foundMissing { + t.Error("missing model status not found") + } +} + +// TestMaaSSubscriptionReconciler_AllValidModelRefs_ActivePhase verifies that a subscription +// with all valid model refs and accepted TRLP gets Active phase. +func TestMaaSSubscriptionReconciler_AllValidModelRefs_ActivePhase(t *testing.T) { + const ( + namespace = "default" + maasSubName = "sub-valid" + modelName = "valid-model" + httpRouteName = modelName + trlpName = "maas-trlp-" + modelName + ) + + model := newMaaSModelRef(modelName, namespace, "ExternalModel", modelName) + route := newHTTPRoute(httpRouteName, namespace) + maasSub := newMaaSSubscription(maasSubName, namespace, "team-a", modelName, 100) + + // Pre-create TRLP with Accepted=True status (simulates Kuadrant accepting the policy) + existingTRLP := newPreexistingTRLP(trlpName, namespace, modelName, map[string]string{ + "maas.opendatahub.io/subscriptions": maasSubName, + }) + if err := unstructured.SetNestedSlice(existingTRLP.Object, []any{ + map[string]any{ + "type": "Accepted", + "status": "True", + }, + }, "status", "conditions"); err != nil { + t.Fatalf("SetNestedSlice status.conditions: %v", err) + } + + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithRESTMapper(testRESTMapper()). + WithObjects(model, route, maasSub, existingTRLP). + WithStatusSubresource(&maasv1alpha1.MaaSSubscription{}). + WithIndex(&maasv1alpha1.MaaSSubscription{}, "spec.modelRef", subscriptionModelRefIndexer). 
+ Build() + + r := &MaaSSubscriptionReconciler{Client: c, Scheme: scheme} + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: maasSubName, Namespace: namespace}} + if _, err := r.Reconcile(context.Background(), req); err != nil { + t.Fatalf("Reconcile: unexpected error: %v", err) + } + + var sub maasv1alpha1.MaaSSubscription + if err := c.Get(context.Background(), req.NamespacedName, &sub); err != nil { + t.Fatalf("Get MaaSSubscription: %v", err) + } + + // Verify phase is Active + if sub.Status.Phase != maasv1alpha1.PhaseActive { + t.Errorf("expected phase Active, got %q", sub.Status.Phase) + } + + // Verify Ready condition is True + readyCond := apimeta.FindStatusCondition(sub.Status.Conditions, "Ready") + if readyCond == nil { + t.Fatal("Ready condition not found") + } + if readyCond.Status != metav1.ConditionTrue { + t.Errorf("expected Ready=True, got %v", readyCond.Status) + } + + // Verify modelRefStatuses shows valid model + if len(sub.Status.ModelRefStatuses) != 1 { + t.Fatalf("expected 1 modelRefStatus, got %d", len(sub.Status.ModelRefStatuses)) + } + if !sub.Status.ModelRefStatuses[0].Ready { + t.Error("expected modelRefStatus.Ready=true") + } + + // Verify tokenRateLimitStatuses shows accepted TRLP + if len(sub.Status.TokenRateLimitStatuses) != 1 { + t.Fatalf("expected 1 tokenRateLimitStatus, got %d", len(sub.Status.TokenRateLimitStatuses)) + } + if !sub.Status.TokenRateLimitStatuses[0].Ready { + t.Error("expected tokenRateLimitStatus.Ready=true") + } +} + +// TestMaaSSubscriptionReconciler_WindowValuesInTRLP verifies that valid window values +// (seconds, minutes, hours) are correctly propagated into the generated TokenRateLimitPolicy +// rates, and that the previously allowed "d" (days) unit is no longer used. 
+// +// This is an end-to-end reconciliation test: it creates a MaaSSubscription with a specific +// window value, runs the reconciler, and then inspects the resulting Kuadrant +// TokenRateLimitPolicy to confirm that spec.limits.{limitKey}.rates[0].window carries the +// exact value from the subscription. This complements TestTokenRateLimitWindowPattern +// (in helpers_test.go) which validates the CRD admission regex in isolation — here we +// verify the controller doesn't silently drop, transform, or default the window on its +// way into the TRLP. +func TestMaaSSubscriptionReconciler_WindowValuesInTRLP(t *testing.T) { + tests := []struct { + name string + window string + }{ + {"seconds", "30s"}, // short window, typical for burst limits + {"minutes", "5m"}, // default-like value used across the codebase + {"hours", "24h"}, // common replacement for the now-removed "1d" + {"max digits", "9999h"}, // upper bound of the 4-digit numeric cap + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + const ( + modelName = "llm" + namespace = "default" + httpRouteName = modelName + trlpName = "maas-trlp-" + modelName + maasSubName = "sub-window" + ) + + // Set up the minimum objects the reconciler needs: a MaaSModelRef (so the + // model lookup succeeds) and an HTTPRoute (so the TRLP has a valid target). + model := newMaaSModelRef(modelName, namespace, "ExternalModel", modelName) + route := newHTTPRoute(httpRouteName, namespace) + + // Build the subscription inline (instead of using newMaaSSubscription) so we + // can set a custom Window value per test case. 
+ maasSub := &maasv1alpha1.MaaSSubscription{ + ObjectMeta: metav1.ObjectMeta{Name: maasSubName, Namespace: namespace}, + Spec: maasv1alpha1.MaaSSubscriptionSpec{ + Owner: maasv1alpha1.OwnerSpec{ + Groups: []maasv1alpha1.GroupReference{{Name: "team-a"}}, + }, + ModelRefs: []maasv1alpha1.ModelSubscriptionRef{ + { + Name: modelName, + Namespace: namespace, + TokenRateLimits: []maasv1alpha1.TokenRateLimit{ + {Limit: 500, Window: tc.window}, + }, + }, + }, + }, + } + + c := fake.NewClientBuilder(). + WithScheme(scheme). + WithRESTMapper(testRESTMapper()). + WithObjects(model, route, maasSub). + WithStatusSubresource(&maasv1alpha1.MaaSSubscription{}). + WithIndex(&maasv1alpha1.MaaSSubscription{}, "spec.modelRef", subscriptionModelRefIndexer). + Build() + + r := &MaaSSubscriptionReconciler{Client: c, Scheme: scheme} + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: maasSubName, Namespace: namespace}} + if _, err := r.Reconcile(context.Background(), req); err != nil { + t.Fatalf("Reconcile: unexpected error: %v", err) + } + + // Fetch the generated TokenRateLimitPolicy that the reconciler should have + // created for this model. + trlp := &unstructured.Unstructured{} + trlp.SetGroupVersionKind(schema.GroupVersionKind{Group: "kuadrant.io", Version: "v1alpha1", Kind: "TokenRateLimitPolicy"}) + if err := c.Get(context.Background(), types.NamespacedName{Name: trlpName, Namespace: namespace}, trlp); err != nil { + t.Fatalf("Get TokenRateLimitPolicy %q: %v", trlpName, err) + } + + // Navigate into spec.limits.{limitKey}.rates to find the rate entry produced + // from the subscription's TokenRateLimit. The key format is + // "{namespace}-{subscription}-{model}-tokens". 
+ limitKey := namespace + "-" + maasSubName + "-" + modelName + "-tokens" + ratesRaw, found, err := unstructured.NestedSlice(trlp.Object, "spec", "limits", limitKey, "rates") + if err != nil || !found { + t.Fatalf("spec.limits.%s.rates not found: found=%v err=%v", limitKey, found, err) + } + if len(ratesRaw) != 1 { + t.Fatalf("expected 1 rate entry, got %d", len(ratesRaw)) + } + + rateMap, ok := ratesRaw[0].(map[string]any) + if !ok { + t.Fatalf("rate entry is not map[string]any: %T", ratesRaw[0]) + } + + // Verify the window value was passed through verbatim — no conversion, + // defaulting, or normalization should occur between the CRD and the TRLP. + gotWindow, ok := rateMap["window"].(string) + if !ok { + t.Fatalf("window is not a string: %T", rateMap["window"]) + } + if gotWindow != tc.window { + t.Errorf("TRLP window = %q, want %q", gotWindow, tc.window) + } + + // Also verify the limit to ensure the full rate entry is intact. + gotLimit, ok := rateMap["limit"].(int64) + if !ok { + t.Fatalf("limit is not int64: %T", rateMap["limit"]) + } + if gotLimit != 500 { + t.Errorf("TRLP limit = %d, want 500", gotLimit) + } + }) + } +} diff --git a/maas-controller/pkg/controller/maas/providers_external.go b/maas-controller/pkg/controller/maas/providers_external.go index 0447d3c36..00022ce93 100644 --- a/maas-controller/pkg/controller/maas/providers_external.go +++ b/maas-controller/pkg/controller/maas/providers_external.go @@ -28,7 +28,6 @@ import ( gatewayapiv1 "sigs.k8s.io/gateway-api/apis/v1" maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" - "github.com/opendatahub-io/models-as-a-service/maas-controller/pkg/reconciler/externalmodel" ) // routeConditionProgrammed is the "Programmed" condition type for route parent status. 
@@ -58,7 +57,7 @@ func (h *externalModelHandler) ReconcileRoute(ctx context.Context, log logr.Logg return fmt.Errorf("failed to get ExternalModel %s: %w", model.Spec.ModelRef.Name, err) } - routeName := externalmodel.ModelRouteName(model.Name) + routeName := model.Spec.ModelRef.Name routeNS := model.Namespace route := &gatewayapiv1.HTTPRoute{} @@ -113,18 +112,14 @@ func (h *externalModelHandler) ReconcileRoute(ctx context.Context, log logr.Logg pNS = string(*parent.ParentRef.Namespace) } if pName == expectedGatewayName && pNS == expectedGatewayNamespace { - accepted := false - programmed := false for _, cond := range parent.Conditions { if cond.Type == string(gatewayapiv1.RouteConditionAccepted) && cond.Status == metav1.ConditionTrue { - accepted = true - } - if cond.Type == routeConditionProgrammed && cond.Status == metav1.ConditionTrue { - programmed = true + gatewayAccepted = true } } - gatewayAccepted = accepted && programmed - break + if gatewayAccepted { + break + } } } } @@ -182,11 +177,13 @@ func (h *externalModelHandler) Status(ctx context.Context, log logr.Logger, mode } // GetModelEndpoint returns the endpoint URL for the ExternalModel. -// Follows the same resolution order as llmisvc: HTTPRoute hostnames > gateway listeners > gateway addresses. +// Uses ExternalModel name (spec.modelRef.name) in the path to match the HTTPRoute +// created by the reconciler and BBR's model-provider-resolver store key. 
func (h *externalModelHandler) GetModelEndpoint(ctx context.Context, log logr.Logger, model *maasv1alpha1.MaaSModelRef) (string, error) { + extModelName := model.Spec.ModelRef.Name if len(model.Status.HTTPRouteHostnames) > 0 { hostname := model.Status.HTTPRouteHostnames[0] - return fmt.Sprintf("https://%s/%s", hostname, model.Name), nil + return fmt.Sprintf("https://%s/%s/%s", hostname, model.Namespace, extModelName), nil } gatewayName := h.r.gatewayName() @@ -199,19 +196,19 @@ func (h *externalModelHandler) GetModelEndpoint(ctx context.Context, log logr.Lo for _, listener := range gateway.Spec.Listeners { if listener.Hostname != nil { - return fmt.Sprintf("https://%s/%s", string(*listener.Hostname), model.Name), nil + return fmt.Sprintf("https://%s/%s/%s", string(*listener.Hostname), model.Namespace, extModelName), nil } } for _, addr := range gateway.Status.Addresses { if addr.Type != nil && *addr.Type == gatewayapiv1.HostnameAddressType { - return fmt.Sprintf("https://%s/%s", addr.Value, model.Name), nil + return fmt.Sprintf("https://%s/%s/%s", addr.Value, model.Namespace, extModelName), nil } } if len(gateway.Status.Addresses) > 0 { log.Info("Using IP-based gateway address; TLS hostname verification may fail", - "address", gateway.Status.Addresses[0].Value, "model", model.Name) - return fmt.Sprintf("https://%s/%s", gateway.Status.Addresses[0].Value, model.Name), nil + "address", gateway.Status.Addresses[0].Value, "model", extModelName) + return fmt.Sprintf("https://%s/%s/%s", gateway.Status.Addresses[0].Value, model.Namespace, extModelName), nil } return "", fmt.Errorf("unable to determine endpoint: gateway %s/%s has no hostname or addresses", gatewayNS, gatewayName) @@ -228,7 +225,7 @@ func (h *externalModelHandler) CleanupOnDelete(ctx context.Context, log logr.Log type externalModelRouteResolver struct{} func (externalModelRouteResolver) HTTPRouteForModel(ctx context.Context, c client.Reader, model *maasv1alpha1.MaaSModelRef) (routeName, routeNamespace 
string, err error) { - routeName = externalmodel.ModelRouteName(model.Name) + routeName = model.Spec.ModelRef.Name routeNamespace = model.Namespace return routeName, routeNamespace, nil } diff --git a/maas-controller/pkg/controller/maas/providers_external_test.go b/maas-controller/pkg/controller/maas/providers_external_test.go index ebd7c8e06..b6ffdf256 100644 --- a/maas-controller/pkg/controller/maas/providers_external_test.go +++ b/maas-controller/pkg/controller/maas/providers_external_test.go @@ -98,7 +98,7 @@ func newGatewayWithHostname(name, ns, hostname string) *gatewayapiv1.Gateway { func TestExternalModel_ReconcileRoute_Success(t *testing.T) { model := newExternalModel("gpt-4o", "default", "openai", "api.openai.com") externalModelCR := newExternalModelCR("gpt-4o", "default", "openai", "api.openai.com") - route := newHTTPRouteWithGateway("maas-model-gpt-4o", "default", "maas-default-gateway", "openshift-ingress") + route := newHTTPRouteWithGateway("gpt-4o", "default", "maas-default-gateway", "openshift-ingress") r, _ := newTestReconciler(model, externalModelCR, route) r.GatewayName = "maas-default-gateway" @@ -111,8 +111,8 @@ func TestExternalModel_ReconcileRoute_Success(t *testing.T) { t.Fatalf("ReconcileRoute: unexpected error: %v", err) } - if model.Status.HTTPRouteName != "maas-model-gpt-4o" { - t.Errorf("HTTPRouteName = %q, want %q", model.Status.HTTPRouteName, "maas-model-gpt-4o") + if model.Status.HTTPRouteName != "gpt-4o" { + t.Errorf("HTTPRouteName = %q, want %q", model.Status.HTTPRouteName, "gpt-4o") } if model.Status.HTTPRouteGatewayName != "maas-default-gateway" { t.Errorf("HTTPRouteGatewayName = %q, want %q", model.Status.HTTPRouteGatewayName, "maas-default-gateway") @@ -164,7 +164,7 @@ func TestExternalModel_ReconcileRoute_MissingExternalModel(t *testing.T) { func TestExternalModel_ReconcileRoute_WrongGateway(t *testing.T) { model := newExternalModel("gpt-4o", "default", "openai", "api.openai.com") externalModelCR := 
newExternalModelCR("gpt-4o", "default", "openai", "api.openai.com") - route := newHTTPRouteWithGateway("maas-model-gpt-4o", "default", "wrong-gateway", "wrong-ns") + route := newHTTPRouteWithGateway("gpt-4o", "default", "wrong-gateway", "wrong-ns") r, _ := newTestReconciler(model, externalModelCR, route) r.GatewayName = "maas-default-gateway" @@ -183,7 +183,7 @@ func TestExternalModel_ReconcileRoute_WrongGateway(t *testing.T) { func TestExternalModel_Status_Ready(t *testing.T) { model := newExternalModel("gpt-4o", "default", "openai", "api.openai.com") - model.Status.HTTPRouteName = "maas-model-gpt-4o" + model.Status.HTTPRouteName = "gpt-4o" model.Status.HTTPRouteGatewayName = "maas-default-gateway" model.Status.HTTPRouteHostnames = []string{"maas.example.com"} @@ -198,15 +198,15 @@ func TestExternalModel_Status_Ready(t *testing.T) { if !ready { t.Error("Status: ready = false, want true") } - if endpoint != "https://maas.example.com/gpt-4o" { - t.Errorf("Status: endpoint = %q, want %q", endpoint, "https://maas.example.com/gpt-4o") + if endpoint != "https://maas.example.com/default/gpt-4o" { + t.Errorf("Status: endpoint = %q, want %q", endpoint, "https://maas.example.com/default/gpt-4o") } } func TestExternalModel_Status_NotReadyWhenGatewayNotAccepted(t *testing.T) { model := newExternalModel("gpt-4o", "default", "openai", "api.openai.com") // HTTPRouteName set but gateway not yet accepted (no HTTPRouteGatewayName) - model.Status.HTTPRouteName = "maas-model-gpt-4o" + model.Status.HTTPRouteName = "gpt-4o" r, _ := newTestReconciler(model) handler := &externalModelHandler{r: r} @@ -249,8 +249,8 @@ func TestExternalModel_GetModelEndpoint_FromHostnames(t *testing.T) { if err != nil { t.Fatalf("GetModelEndpoint: unexpected error: %v", err) } - if endpoint != "https://maas.example.com/claude-sonnet" { - t.Errorf("GetModelEndpoint = %q, want %q", endpoint, "https://maas.example.com/claude-sonnet") + if endpoint != "https://maas.example.com/default/claude-sonnet" { + 
t.Errorf("GetModelEndpoint = %q, want %q", endpoint, "https://maas.example.com/default/claude-sonnet") } } @@ -268,8 +268,8 @@ func TestExternalModel_GetModelEndpoint_FromGateway(t *testing.T) { if err != nil { t.Fatalf("GetModelEndpoint: unexpected error: %v", err) } - if endpoint != "https://maas.cluster.example.com/gpt-4o" { - t.Errorf("GetModelEndpoint = %q, want %q", endpoint, "https://maas.cluster.example.com/gpt-4o") + if endpoint != "https://maas.cluster.example.com/default/gpt-4o" { + t.Errorf("GetModelEndpoint = %q, want %q", endpoint, "https://maas.cluster.example.com/default/gpt-4o") } } @@ -305,8 +305,8 @@ func TestExternalModelRouteResolver(t *testing.T) { if err != nil { t.Fatalf("HTTPRouteForModel: unexpected error: %v", err) } - if routeName != "maas-model-gpt-4o" { - t.Errorf("routeName = %q, want %q", routeName, "maas-model-gpt-4o") + if routeName != "gpt-4o" { + t.Errorf("routeName = %q, want %q", routeName, "gpt-4o") } if routeNS != "default" { t.Errorf("routeNS = %q, want %q", routeNS, "default") diff --git a/maas-controller/pkg/controller/maas/providers_test.go b/maas-controller/pkg/controller/maas/providers_test.go index 6a9ca9f93..a551b7d51 100644 --- a/maas-controller/pkg/controller/maas/providers_test.go +++ b/maas-controller/pkg/controller/maas/providers_test.go @@ -227,15 +227,15 @@ func TestFindHTTPRouteForModel_ExternalModel_Success(t *testing.T) { }, } route := &gatewayapiv1.HTTPRoute{ - ObjectMeta: metav1.ObjectMeta{Name: "maas-model-foo", Namespace: "default"}, + ObjectMeta: metav1.ObjectMeta{Name: "foo", Namespace: "default"}, } c := fake.NewClientBuilder().WithScheme(scheme).WithObjects(model, route).Build() routeName, routeNS, err := findHTTPRouteForModel(ctx, c, "default", "foo") if err != nil { t.Fatalf("findHTTPRouteForModel: %v", err) } - if routeName != "maas-model-foo" || routeNS != "default" { - t.Errorf("findHTTPRouteForModel: got (%q, %q), want (\"maas-model-foo\", \"default\")", routeName, routeNS) + if routeName != 
"foo" || routeNS != "default" { + t.Errorf("findHTTPRouteForModel: got (%q, %q), want (\"foo\", \"default\")", routeName, routeNS) } } diff --git a/maas-controller/pkg/controller/maas/tenant_conditions.go b/maas-controller/pkg/controller/maas/tenant_conditions.go new file mode 100644 index 000000000..0d3bb5490 --- /dev/null +++ b/maas-controller/pkg/controller/maas/tenant_conditions.go @@ -0,0 +1,69 @@ +package maas + +import ( + "strings" + + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" + "github.com/opendatahub-io/models-as-a-service/maas-controller/pkg/platform/tenantreconcile" +) + +func setTenantCondition(tenant *maasv1alpha1.Tenant, typ string, status metav1.ConditionStatus, reason, message string) { + apimeta.SetStatusCondition(&tenant.Status.Conditions, metav1.Condition{ + Type: typ, + Status: status, + Reason: reason, + Message: message, + ObservedGeneration: tenant.Generation, + LastTransitionTime: metav1.Now(), + }) +} + +func setDependenciesCondition(tenant *maasv1alpha1.Tenant, ok bool, detail string) { + if ok { + setTenantCondition(tenant, tenantreconcile.ConditionDependenciesAvailable, metav1.ConditionTrue, + "DependenciesMet", "AuthConfig CRD (Kuadrant) is available on the cluster") + return + } + setTenantCondition(tenant, tenantreconcile.ConditionDependenciesAvailable, metav1.ConditionFalse, + "DependencyMissing", detail) +} + +func setPrerequisiteConditionsFromReport(tenant *maasv1alpha1.Tenant, rep tenantreconcile.PrerequisiteReport) { + switch { + case len(rep.Blocking) > 0: + agg := strings.Join(append(append([]string{}, rep.Blocking...), rep.Warnings...), "; ") + setTenantCondition(tenant, tenantreconcile.ConditionMaaSPrerequisitesAvailable, metav1.ConditionFalse, + "PrerequisitesMissing", agg) + setTenantCondition(tenant, tenantreconcile.ConditionTypeDegraded, metav1.ConditionTrue, + 
"PrerequisitesMissing", agg) + case len(rep.Warnings) > 0: + agg := strings.Join(rep.Warnings, "; ") + setTenantCondition(tenant, tenantreconcile.ConditionMaaSPrerequisitesAvailable, metav1.ConditionTrue, + "PrerequisitesMet", "Prerequisites satisfied; see Degraded for warnings") + setTenantCondition(tenant, tenantreconcile.ConditionTypeDegraded, metav1.ConditionTrue, + "PrerequisitesWarning", agg) + default: + setTenantCondition(tenant, tenantreconcile.ConditionMaaSPrerequisitesAvailable, metav1.ConditionTrue, + "PrerequisitesMet", "All prerequisites are satisfied") + setTenantCondition(tenant, tenantreconcile.ConditionTypeDegraded, metav1.ConditionFalse, + "PrerequisitesMet", "All prerequisites are satisfied") + } +} + +func setDeploymentsAvailableCondition(tenant *maasv1alpha1.Tenant, ok bool, reason, message string) { + st := metav1.ConditionFalse + if ok { + st = metav1.ConditionTrue + } + setTenantCondition(tenant, tenantreconcile.ConditionDeploymentsAvailable, st, reason, message) +} + +func prerequisitesUnevaluatedCondition(tenant *maasv1alpha1.Tenant, detail string) { + setTenantCondition(tenant, tenantreconcile.ConditionMaaSPrerequisitesAvailable, metav1.ConditionUnknown, + "DependenciesNotMet", detail) + setTenantCondition(tenant, tenantreconcile.ConditionTypeDegraded, metav1.ConditionFalse, + "DependenciesNotMet", detail) +} diff --git a/maas-controller/pkg/controller/maas/tenant_controller.go b/maas-controller/pkg/controller/maas/tenant_controller.go new file mode 100644 index 000000000..28280285c --- /dev/null +++ b/maas-controller/pkg/controller/maas/tenant_controller.go @@ -0,0 +1,184 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package maas + +import ( + "context" + + corev1 "k8s.io/api/core/v1" + extv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" + "github.com/opendatahub-io/models-as-a-service/maas-controller/pkg/platform/tenantreconcile" +) + +// TenantReconciler reconciles cluster Tenant (platform singleton). +// Platform manifest logic mirrors opendatahub-operator modelsasservice (kustomize + post-render + SSA apply). +type TenantReconciler struct { + client.Client + Scheme *runtime.Scheme + // OperatorNamespace overrides POD_NAMESPACE / WATCH_NAMESPACE when discovering namespaced platform workloads (tests). + OperatorNamespace string + // ManifestPath is the directory containing kustomization.yaml for the ODH maas-api overlay (e.g. maas-api/deploy/overlays/odh). + ManifestPath string + // AppNamespace is the namespace where maas-api workloads are deployed (--maas-api-namespace, default opendatahub). 
+ AppNamespace string + // TenantNamespace is the namespace where the Tenant CR lives (--maas-subscription-namespace, default models-as-a-service). + TenantNamespace string +} + +// +kubebuilder:rbac:groups=maas.opendatahub.io,resources=tenants,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=maas.opendatahub.io,resources=tenants/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=maas.opendatahub.io,resources=tenants/finalizers,verbs=update +// +kubebuilder:rbac:groups=gateway.networking.k8s.io,resources=gateways,verbs=get;list;watch +// +kubebuilder:rbac:groups="",resources=configmaps,verbs=get;list;watch;create;patch;delete +// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch +// +kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create;patch;delete +// +kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;patch;delete +// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;patch;delete +// +kubebuilder:rbac:groups=networking.k8s.io,resources=networkpolicies,verbs=get;list;watch;create;patch;delete +// +kubebuilder:rbac:groups=gateway.networking.k8s.io,resources=httproutes,verbs=get;list;watch;create;patch;delete +// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterroles,verbs=get;list;watch;create;patch;delete +// +kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=clusterrolebindings,verbs=get;list;watch;create;patch;delete +// +kubebuilder:rbac:groups=config.openshift.io,resources=authentications,verbs=get;list;watch +// +kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=get;list;watch +// +kubebuilder:rbac:groups=operator.authorino.kuadrant.io,resources=authorinos,verbs=get;list;watch +// +kubebuilder:rbac:groups=kuadrant.io,resources=ratelimitpolicies,verbs=get;list;watch;create;patch;delete +// 
+kubebuilder:rbac:groups=extensions.kuadrant.io,resources=telemetrypolicies,verbs=get;list;watch;create;patch;delete +// +kubebuilder:rbac:groups=networking.istio.io,resources=destinationrules,verbs=get;list;watch;create;patch;delete +// +kubebuilder:rbac:groups=networking.istio.io,resources=envoyfilters,verbs=get;list;watch;create;patch;delete +// +kubebuilder:rbac:groups=telemetry.istio.io,resources=telemetries,verbs=get;list;watch;create;patch;delete +// +kubebuilder:rbac:groups=batch,resources=cronjobs,verbs=get;list;watch;create;patch;delete + +// maas-controller creates the maas-api ClusterRole via SSA. +// The rules below mirror the maas-api ClusterRole so the controller can pass the API-server escalation check. +// +// +kubebuilder:rbac:groups="",resources=endpoints,verbs=get;list;watch +// +kubebuilder:rbac:groups="",resources=namespaces,verbs=get;list;watch;create +// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch +// +kubebuilder:rbac:groups="",resources=serviceaccounts/token,verbs=create +// +kubebuilder:rbac:groups=authentication.k8s.io,resources=tokenreviews,verbs=create +// +kubebuilder:rbac:groups=authorization.k8s.io,resources=subjectaccessreviews,verbs=create +// +kubebuilder:rbac:groups=maas.opendatahub.io,resources=maasmodelrefs,verbs=get;list;watch +// +kubebuilder:rbac:groups=maas.opendatahub.io,resources=maassubscriptions,verbs=get;list;watch + +// Reconcile drives Tenant platform lifecycle (ODH no longer runs the modelsasservice deploy pipeline). 
+func (r *TenantReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + return r.reconcile(ctx, req) +} + +const openshiftAuthenticationClusterName = "cluster" + +func (r *TenantReconciler) enqueueDefaultTenant(_ context.Context, _ client.Object) []reconcile.Request { + return []reconcile.Request{{NamespacedName: types.NamespacedName{ + Name: maasv1alpha1.TenantInstanceName, + Namespace: r.TenantNamespace, + }}} +} + +// crdLabeledForMaaSComponent matches ODH modelsasservice watch: app.opendatahub.io/modelsasservice=true. +func crdLabeledForMaaSComponent() predicate.Predicate { + key := tenantreconcile.LabelODHAppPrefix + "/" + tenantreconcile.ComponentName + return predicate.NewPredicateFuncs(func(o client.Object) bool { + l := o.GetLabels() + return l != nil && l[key] == "true" + }) +} + +func secretNamedMaaSDB() predicate.Predicate { + return predicate.NewPredicateFuncs(func(o client.Object) bool { + return o.GetName() == tenantreconcile.MaaSDBSecretName + }) +} + +// inTenantWorkNamespaces limits watches to the namespaces where Tenant children live, +// avoiding cluster-wide informer noise on busy clusters. +func (r *TenantReconciler) inTenantWorkNamespaces() predicate.Predicate { + return predicate.NewPredicateFuncs(func(o client.Object) bool { + ns := o.GetNamespace() + return ns == r.AppNamespace || ns == r.operatorNamespace() + }) +} + +func authenticationClusterSingleton() predicate.Predicate { + return predicate.NewPredicateFuncs(func(o client.Object) bool { + return o.GetName() == openshiftAuthenticationClusterName + }) +} + +// deletedConfigMapOnly mirrors ODH: unmanaged ConfigMaps are recreated when deleted. 
+func deletedConfigMapOnly() predicate.Predicate { + return predicate.Funcs{ + CreateFunc: func(event.CreateEvent) bool { + return false + }, + UpdateFunc: func(event.UpdateEvent) bool { + return false + }, + DeleteFunc: func(event.DeleteEvent) bool { + return true + }, + GenericFunc: func(event.GenericEvent) bool { + return false + }, + } +} + +// SetupWithManager registers the Tenant controller. +func (r *TenantReconciler) SetupWithManager(mgr ctrl.Manager) error { + authMeta := &metav1.PartialObjectMetadata{} + authMeta.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "config.openshift.io", + Version: "v1", + Kind: "Authentication", + }) + + return ctrl.NewControllerManagedBy(mgr). + For(&maasv1alpha1.Tenant{}). + Watches( + &extv1.CustomResourceDefinition{}, + handler.EnqueueRequestsFromMapFunc(r.enqueueDefaultTenant), + builder.WithPredicates(crdLabeledForMaaSComponent()), + ). + Watches( + &corev1.ConfigMap{}, + handler.EnqueueRequestsFromMapFunc(r.enqueueDefaultTenant), + builder.WithPredicates(deletedConfigMapOnly(), r.inTenantWorkNamespaces()), + ). + Watches( + &corev1.Secret{}, + handler.EnqueueRequestsFromMapFunc(r.enqueueDefaultTenant), + builder.WithPredicates(secretNamedMaaSDB(), r.inTenantWorkNamespaces()), + ). + WatchesMetadata( + authMeta, + handler.EnqueueRequestsFromMapFunc(r.enqueueDefaultTenant), + builder.WithPredicates(authenticationClusterSingleton()), + ). + Complete(r) +} diff --git a/maas-controller/pkg/controller/maas/tenant_finalize.go b/maas-controller/pkg/controller/maas/tenant_finalize.go new file mode 100644 index 000000000..f788809db --- /dev/null +++ b/maas-controller/pkg/controller/maas/tenant_finalize.go @@ -0,0 +1,339 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package maas + +import ( + "context" + "fmt" + "os" + "time" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + netwv1 "k8s.io/api/networking/v1" + rbacv1 "k8s.io/api/rbac/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/util/sets" + "sigs.k8s.io/controller-runtime/pkg/client" + gwapiv1 "sigs.k8s.io/gateway-api/apis/v1" + + maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" + "github.com/opendatahub-io/models-as-a-service/maas-controller/pkg/platform/tenantreconcile" +) + +// deletePropagation is used for child deletes so the Tenant finalizer does not block on foreground chains. +var deletePropagation = client.PropagationPolicy(metav1.DeletePropagationBackground) + +// optionalPlatformGVKs are extension resources created by the legacy ODH modelsasservice pipeline (and future +// maas-controller apply) that may reference Tenant as controller owner. List failures are ignored when the +// API is not installed. 
+var optionalPlatformGVKs = []schema.GroupVersionKind{ + {Group: "kuadrant.io", Version: "v1", Kind: "AuthPolicy"}, + {Group: "kuadrant.io", Version: "v1", Kind: "RateLimitPolicy"}, + {Group: "extensions.kuadrant.io", Version: "v1alpha1", Kind: "TelemetryPolicy"}, + {Group: "networking.istio.io", Version: "v1", Kind: "DestinationRule"}, + {Group: "networking.istio.io", Version: "v1alpha3", Kind: "EnvoyFilter"}, + {Group: "telemetry.istio.io", Version: "v1", Kind: "Telemetry"}, +} + +func (r *TenantReconciler) operatorNamespace() string { + if r.OperatorNamespace != "" { + return r.OperatorNamespace + } + if ns := os.Getenv("POD_NAMESPACE"); ns != "" { + return ns + } + return os.Getenv("WATCH_NAMESPACE") +} + +func ownedByTenantRef(obj metav1.Object, tenant *maasv1alpha1.Tenant) bool { + for _, ref := range obj.GetOwnerReferences() { + if ref.UID == tenant.UID && + ref.Kind == maasv1alpha1.TenantKind && + ref.APIVersion == maasv1alpha1.GroupVersion.String() { + return true + } + } + return false +} + +func ownedByTenantLabel(obj metav1.Object, tenant *maasv1alpha1.Tenant) bool { + labels := obj.GetLabels() + return labels != nil && + labels[tenantreconcile.LabelTenantName] == tenant.Name && + labels[tenantreconcile.LabelTenantNamespace] == tenant.Namespace +} + +func isOwnedByTenant(obj metav1.Object, tenant *maasv1alpha1.Tenant) bool { + return ownedByTenantRef(obj, tenant) || ownedByTenantLabel(obj, tenant) +} + +func tenantWorkNamespaces(tenant *maasv1alpha1.Tenant, operatorNS, appNS string) []string { + out := sets.New[string]() + if tenant.Namespace != "" { + out.Insert(tenant.Namespace) + } + if appNS != "" { + out.Insert(appNS) + } + if operatorNS != "" { + out.Insert(operatorNS) + } + if tenant.Spec.GatewayRef.Namespace != "" { + out.Insert(tenant.Spec.GatewayRef.Namespace) + } + return sets.List(out) +} + +// finalizeTenantDeletion deletes API objects owned by the tenant (owner refs). 
It returns +// (stillPending, err): stillPending means children are present or terminating — requeue without removing the finalizer. +func (r *TenantReconciler) finalizeTenantDeletion(ctx context.Context, tenant *maasv1alpha1.Tenant) (bool, error) { + opNS := r.operatorNamespace() + namespaces := tenantWorkNamespaces(tenant, opNS, r.AppNamespace) + if len(namespaces) == 0 { + return false, fmt.Errorf("cannot finalize Tenant %s/%s: no work namespaces resolved (operator namespace and GatewayRef.Namespace are both empty); namespaced children may be orphaned", tenant.Namespace, tenant.Name) + } + + pending := false + + for _, ns := range namespaces { + p, err := r.deleteOwnedInNamespace(ctx, tenant, ns) + if err != nil { + return false, err + } + pending = pending || p + } + + p, err := r.deleteOwnedClusterScoped(ctx, tenant) + if err != nil { + return false, err + } + pending = pending || p + + return pending, nil +} + +func (r *TenantReconciler) deleteOwnedInNamespace(ctx context.Context, tenant *maasv1alpha1.Tenant, ns string) (bool, error) { + pending := false + + var cmList corev1.ConfigMapList + if err := r.List(ctx, &cmList, client.InNamespace(ns)); err != nil { + return false, fmt.Errorf("list ConfigMaps in %q: %w", ns, err) + } + for i := range cmList.Items { + item := &cmList.Items[i] + if !isOwnedByTenant(item, tenant) { + continue + } + if !item.GetDeletionTimestamp().IsZero() { + pending = true + continue + } + if err := r.Delete(ctx, item, deletePropagation); err != nil && !apierrors.IsNotFound(err) { + return false, fmt.Errorf("delete ConfigMap %s/%s: %w", ns, item.Name, err) + } + pending = true + } + + var svcList corev1.ServiceList + if err := r.List(ctx, &svcList, client.InNamespace(ns)); err != nil { + return false, fmt.Errorf("list Services in %q: %w", ns, err) + } + for i := range svcList.Items { + item := &svcList.Items[i] + if !isOwnedByTenant(item, tenant) { + continue + } + if !item.GetDeletionTimestamp().IsZero() { + pending = true + continue 
+ } + if err := r.Delete(ctx, item, deletePropagation); err != nil && !apierrors.IsNotFound(err) { + return false, fmt.Errorf("delete Service %s/%s: %w", ns, item.Name, err) + } + pending = true + } + + var saList corev1.ServiceAccountList + if err := r.List(ctx, &saList, client.InNamespace(ns)); err != nil { + return false, fmt.Errorf("list ServiceAccounts in %q: %w", ns, err) + } + for i := range saList.Items { + item := &saList.Items[i] + if !isOwnedByTenant(item, tenant) { + continue + } + if !item.GetDeletionTimestamp().IsZero() { + pending = true + continue + } + if err := r.Delete(ctx, item, deletePropagation); err != nil && !apierrors.IsNotFound(err) { + return false, fmt.Errorf("delete ServiceAccount %s/%s: %w", ns, item.Name, err) + } + pending = true + } + + var depList appsv1.DeploymentList + if err := r.List(ctx, &depList, client.InNamespace(ns)); err != nil { + return false, fmt.Errorf("list Deployments in %q: %w", ns, err) + } + for i := range depList.Items { + item := &depList.Items[i] + if !isOwnedByTenant(item, tenant) { + continue + } + if !item.GetDeletionTimestamp().IsZero() { + pending = true + continue + } + if err := r.Delete(ctx, item, deletePropagation); err != nil && !apierrors.IsNotFound(err) { + return false, fmt.Errorf("delete Deployment %s/%s: %w", ns, item.Name, err) + } + pending = true + } + + var npList netwv1.NetworkPolicyList + if err := r.List(ctx, &npList, client.InNamespace(ns)); err != nil { + return false, fmt.Errorf("list NetworkPolicies in %q: %w", ns, err) + } + for i := range npList.Items { + item := &npList.Items[i] + if !isOwnedByTenant(item, tenant) { + continue + } + if !item.GetDeletionTimestamp().IsZero() { + pending = true + continue + } + if err := r.Delete(ctx, item, deletePropagation); err != nil && !apierrors.IsNotFound(err) { + return false, fmt.Errorf("delete NetworkPolicy %s/%s: %w", ns, item.Name, err) + } + pending = true + } + + var hrList gwapiv1.HTTPRouteList + if err := r.List(ctx, &hrList, 
client.InNamespace(ns)); err != nil { + return false, fmt.Errorf("list HTTPRoutes in %q: %w", ns, err) + } + for i := range hrList.Items { + item := &hrList.Items[i] + if !isOwnedByTenant(item, tenant) { + continue + } + if !item.GetDeletionTimestamp().IsZero() { + pending = true + continue + } + if err := r.Delete(ctx, item, deletePropagation); err != nil && !apierrors.IsNotFound(err) { + return false, fmt.Errorf("delete HTTPRoute %s/%s: %w", ns, item.Name, err) + } + pending = true + } + + for _, gvk := range optionalPlatformGVKs { + p, err := r.deleteOwnedUnstructured(ctx, tenant, ns, gvk) + if err != nil { + return false, err + } + pending = pending || p + } + + return pending, nil +} + +func (r *TenantReconciler) deleteOwnedUnstructured(ctx context.Context, tenant *maasv1alpha1.Tenant, ns string, gvk schema.GroupVersionKind) (bool, error) { + listGVK := gvk + listGVK.Kind = gvk.Kind + "List" + + ul := &unstructured.UnstructuredList{} + ul.SetGroupVersionKind(listGVK) + + if err := r.List(ctx, ul, client.InNamespace(ns)); err != nil { + if meta.IsNoMatchError(err) { + return false, nil + } + return false, fmt.Errorf("list %s in namespace %q: %w", listGVK.String(), ns, err) + } + + pending := false + for i := range ul.Items { + obj := &ul.Items[i] + if !isOwnedByTenant(obj, tenant) { + continue + } + if !obj.GetDeletionTimestamp().IsZero() { + pending = true + continue + } + if err := r.Delete(ctx, obj, deletePropagation); err != nil && !apierrors.IsNotFound(err) { + return false, fmt.Errorf("delete %s %s/%s: %w", obj.GetKind(), ns, obj.GetName(), err) + } + pending = true + } + return pending, nil +} + +func (r *TenantReconciler) deleteOwnedClusterScoped(ctx context.Context, tenant *maasv1alpha1.Tenant) (bool, error) { + pending := false + + var crList rbacv1.ClusterRoleList + if err := r.List(ctx, &crList); err != nil { + return false, fmt.Errorf("list ClusterRoles: %w", err) + } + for i := range crList.Items { + item := &crList.Items[i] + if 
!isOwnedByTenant(item, tenant) { + continue + } + if !item.GetDeletionTimestamp().IsZero() { + pending = true + continue + } + if err := r.Delete(ctx, item, deletePropagation); err != nil && !apierrors.IsNotFound(err) { + return false, fmt.Errorf("delete ClusterRole %s: %w", item.Name, err) + } + pending = true + } + + var crbList rbacv1.ClusterRoleBindingList + if err := r.List(ctx, &crbList); err != nil { + return false, fmt.Errorf("list ClusterRoleBindings: %w", err) + } + for i := range crbList.Items { + item := &crbList.Items[i] + if !isOwnedByTenant(item, tenant) { + continue + } + if !item.GetDeletionTimestamp().IsZero() { + pending = true + continue + } + if err := r.Delete(ctx, item, deletePropagation); err != nil && !apierrors.IsNotFound(err) { + return false, fmt.Errorf("delete ClusterRoleBinding %s: %w", item.Name, err) + } + pending = true + } + + return pending, nil +} + +// finalizeRequeueInterval is used while owned children are still terminating. +const finalizeRequeueInterval = 5 * time.Second diff --git a/maas-controller/pkg/controller/maas/tenant_reconcile.go b/maas-controller/pkg/controller/maas/tenant_reconcile.go new file mode 100644 index 000000000..578c2a46a --- /dev/null +++ b/maas-controller/pkg/controller/maas/tenant_reconcile.go @@ -0,0 +1,291 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package maas + +import ( + "context" + "errors" + "fmt" + "strings" + "time" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + gwapiv1 "sigs.k8s.io/gateway-api/apis/v1" + + maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" + "github.com/opendatahub-io/models-as-a-service/maas-controller/pkg/platform/tenantreconcile" +) + +// Annotations mirrored from ODH (avoid importing opendatahub-operator). +const ( + managementStateAnnotation = "component.opendatahub.io/management-state" + managementStateManaged = "Managed" + managementStateRemoved = "Removed" + managementStateUnmanaged = "Unmanaged" +) + +const ( + tenantFinalizer = "maas.opendatahub.io/tenant-finalizer" +) + +func managementState(ann map[string]string) string { + if ann == nil { + return "" + } + return ann[managementStateAnnotation] +} + +func (r *TenantReconciler) reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx) + + var tenant maasv1alpha1.Tenant + if err := r.Get(ctx, req.NamespacedName, &tenant); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + if tenant.Name != maasv1alpha1.TenantInstanceName { + return ctrl.Result{}, nil + } + + // Handle delete before Removed/Unmanaged idle so we still run teardown when the CR is being deleted. 
+ if !tenant.DeletionTimestamp.IsZero() { + if !controllerutil.ContainsFinalizer(&tenant, tenantFinalizer) { + return ctrl.Result{}, nil + } + pending, err := r.finalizeTenantDeletion(ctx, &tenant) + if err != nil { + return ctrl.Result{}, err + } + if pending { + return ctrl.Result{RequeueAfter: finalizeRequeueInterval}, nil + } + patchBase := client.MergeFrom(tenant.DeepCopy()) + controllerutil.RemoveFinalizer(&tenant, tenantFinalizer) + if err := r.Patch(ctx, &tenant, patchBase); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } + + ms := managementState(tenant.Annotations) + if ms == managementStateRemoved || ms == managementStateUnmanaged { + return r.handleIdleManagementState(ctx, &tenant, ms) + } + + if !controllerutil.ContainsFinalizer(&tenant, tenantFinalizer) { + patchBase := client.MergeFrom(tenant.DeepCopy()) + controllerutil.AddFinalizer(&tenant, tenantFinalizer) + if err := r.Patch(ctx, &tenant, patchBase); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{Requeue: true}, nil + } + + if ms != "" && ms != managementStateManaged { + if err := r.patchStatus(ctx, &tenant, "Failed", metav1.ConditionFalse, "UnexpectedManagementState", + fmt.Sprintf("unsupported %s=%q", managementStateAnnotation, ms)); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + } + + orig := tenant.DeepCopy() + if err := applyGatewayDefaults(&tenant); err != nil { + if err2 := r.patchStatus(ctx, &tenant, "Failed", metav1.ConditionFalse, "InvalidGateway", err.Error()); err2 != nil { + return ctrl.Result{}, err2 + } + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + } + if orig.Spec.GatewayRef != tenant.Spec.GatewayRef { + if err := r.Patch(ctx, &tenant, client.MergeFrom(orig)); err != nil { + return ctrl.Result{}, err + } + if err := r.Get(ctx, req.NamespacedName, &tenant); err != nil { + return ctrl.Result{}, err + } + } + + if err := validateGatewayExists(ctx, r.Client, 
tenant.Spec.GatewayRef.Namespace, tenant.Spec.GatewayRef.Name); err != nil { + log.Info("gateway validation failed", "error", err) + if err2 := r.patchStatus(ctx, &tenant, "Pending", metav1.ConditionFalse, "GatewayNotReady", err.Error()); err2 != nil { + return ctrl.Result{}, err2 + } + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + } + + if r.ManifestPath == "" { + if err := r.patchStatus(ctx, &tenant, "Failed", metav1.ConditionFalse, "ManifestPathUnset", + "MAAS_PLATFORM_MANIFESTS is not set and no default kustomize path resolved; cannot apply platform manifests"); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: 2 * time.Minute}, nil + } + + if err := tenantreconcile.CheckDependencies(ctx, r.Client); err != nil { + log.Info("Tenant dependency check failed", "error", err) + setDependenciesCondition(&tenant, false, err.Error()) + setDeploymentsAvailableCondition(&tenant, false, "DependenciesNotMet", err.Error()) + prerequisitesUnevaluatedCondition(&tenant, "Prerequisites were not evaluated because required dependencies are not met") + if err2 := r.patchStatus(ctx, &tenant, "Pending", metav1.ConditionFalse, "DependenciesNotAvailable", err.Error()); err2 != nil { + return ctrl.Result{}, err2 + } + return ctrl.Result{RequeueAfter: 45 * time.Second}, nil + } + setDependenciesCondition(&tenant, true, "") + + appNs := r.AppNamespace + rep := tenantreconcile.CollectPrerequisiteReport(ctx, r.Client, appNs) + setPrerequisiteConditionsFromReport(&tenant, rep) + if len(rep.Blocking) > 0 { + tenant.Status.Phase = "Failed" + agg := strings.Join(append(append([]string{}, rep.Blocking...), rep.Warnings...), "; ") + setDeploymentsAvailableCondition(&tenant, false, "PrerequisitesMissing", agg) + apimeta.SetStatusCondition(&tenant.Status.Conditions, metav1.Condition{ + Type: tenantreconcile.ReadyConditionType, + Status: metav1.ConditionFalse, + Reason: "PrerequisitesNotMet", + Message: agg, + ObservedGeneration: tenant.Generation, + 
LastTransitionTime: metav1.Now(), + }) + if err := r.Status().Update(ctx, &tenant); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: 45 * time.Second}, nil + } + + runRes, err := tenantreconcile.RunPlatform(ctx, log, r.Client, r.Scheme, &tenant, r.ManifestPath, appNs) + if err != nil { + log.Error(err, "Tenant platform reconcile failed") + setDeploymentsAvailableCondition(&tenant, false, "PlatformReconcileFailed", err.Error()) + if err2 := r.patchStatus(ctx, &tenant, "Failed", metav1.ConditionFalse, "PlatformReconcileFailed", err.Error()); err2 != nil { + return ctrl.Result{}, err2 + } + return ctrl.Result{RequeueAfter: 45 * time.Second}, nil + } + + if runRes.DeploymentPending { + tenant.Status.Phase = "Pending" + setDeploymentsAvailableCondition(&tenant, false, "DeploymentsNotReady", runRes.Detail) + apimeta.SetStatusCondition(&tenant.Status.Conditions, metav1.Condition{ + Type: tenantreconcile.ReadyConditionType, + Status: metav1.ConditionFalse, + Reason: "DeploymentsNotReady", + Message: runRes.Detail, + ObservedGeneration: tenant.Generation, + LastTransitionTime: metav1.Now(), + }) + if err := r.Status().Update(ctx, &tenant); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{RequeueAfter: 20 * time.Second}, nil + } + + tenant.Status.Phase = "Active" + if apimeta.IsStatusConditionTrue(tenant.Status.Conditions, tenantreconcile.ConditionTypeDegraded) { + tenant.Status.Phase = "Degraded" + } + setDeploymentsAvailableCondition(&tenant, true, "DeploymentsReady", "maas-api deployment is available") + apimeta.SetStatusCondition(&tenant.Status.Conditions, metav1.Condition{ + Type: tenantreconcile.ReadyConditionType, + Status: metav1.ConditionTrue, + Reason: "Reconciled", + Message: "MaaS platform manifests applied and maas-api deployment is available", + ObservedGeneration: tenant.Generation, + LastTransitionTime: metav1.Now(), + }) + if err := r.Status().Update(ctx, &tenant); err != nil { + return ctrl.Result{}, err + } + 
+ log.V(1).Info("Tenant platform reconciled", "name", tenant.Name) + return ctrl.Result{RequeueAfter: 5 * time.Minute}, nil +} + +// handleIdleManagementState handles Removed and Unmanaged states. +// Removed tears down owned resources before dropping the finalizer; +// Unmanaged simply drops the finalizer, leaving resources in place. +func (r *TenantReconciler) handleIdleManagementState(ctx context.Context, tenant *maasv1alpha1.Tenant, ms string) (ctrl.Result, error) { + if err := r.patchStatus(ctx, tenant, "", metav1.ConditionFalse, "ManagementStateIdle", + fmt.Sprintf("management state is %q; platform workloads are not driven by this reconciler in this state", ms)); err != nil { + return ctrl.Result{}, err + } + if controllerutil.ContainsFinalizer(tenant, tenantFinalizer) { + if ms == managementStateRemoved { + pending, err := r.finalizeTenantDeletion(ctx, tenant) + if err != nil { + return ctrl.Result{}, err + } + if pending { + return ctrl.Result{RequeueAfter: finalizeRequeueInterval}, nil + } + } + patchBase := client.MergeFrom(tenant.DeepCopy()) + controllerutil.RemoveFinalizer(tenant, tenantFinalizer) + if err := r.Patch(ctx, tenant, patchBase); err != nil { + return ctrl.Result{}, err + } + } + return ctrl.Result{}, nil +} + +func applyGatewayDefaults(tenant *maasv1alpha1.Tenant) error { + ref := &tenant.Spec.GatewayRef + if ref.Namespace == "" && ref.Name == "" { + ref.Namespace = tenantreconcile.DefaultGatewayNamespace + ref.Name = tenantreconcile.DefaultGatewayName + return nil + } + if ref.Namespace == "" || ref.Name == "" { + return errors.New("invalid gateway specification: when specifying a custom gateway, both namespace and name must be provided") + } + return nil +} + +func validateGatewayExists(ctx context.Context, c client.Client, namespace, name string) error { + gw := &gwapiv1.Gateway{} + key := types.NamespacedName{Namespace: namespace, Name: name} + if err := c.Get(ctx, key, gw); err != nil { + if apierrors.IsNotFound(err) { + return 
fmt.Errorf("gateway %s/%s not found: the specified Gateway must exist before enabling MaaS platform reconcile", namespace, name) + } + return fmt.Errorf("failed to look up gateway %s/%s: %w", namespace, name, err) + } + return nil +} + +func (r *TenantReconciler) patchStatus(ctx context.Context, tenant *maasv1alpha1.Tenant, phase string, status metav1.ConditionStatus, reason, message string) error { + tenant.Status.Phase = phase + apimeta.SetStatusCondition(&tenant.Status.Conditions, metav1.Condition{ + Type: tenantreconcile.ReadyConditionType, + Status: status, + Reason: reason, + Message: message, + ObservedGeneration: tenant.Generation, + LastTransitionTime: metav1.Now(), + }) + return r.Status().Update(ctx, tenant) +} diff --git a/maas-controller/pkg/controller/maas/tenant_reconcile_test.go b/maas-controller/pkg/controller/maas/tenant_reconcile_test.go new file mode 100644 index 000000000..f7ab0242e --- /dev/null +++ b/maas-controller/pkg/controller/maas/tenant_reconcile_test.go @@ -0,0 +1,428 @@ +//nolint:testpackage +package maas + +import ( + "context" + "testing" + "time" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + apimeta "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + gwapiv1 "sigs.k8s.io/gateway-api/apis/v1" + + maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" + "github.com/opendatahub-io/models-as-a-service/maas-controller/pkg/platform/tenantreconcile" + + . 
"github.com/onsi/gomega" +) + +func tenantTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(s)) + utilruntime.Must(maasv1alpha1.AddToScheme(s)) + utilruntime.Must(gwapiv1.Install(s)) + return s +} + +func TestTenantReconcile_DeletionRemovesFinalizerAfterOwnedConfigMapDeleted(t *testing.T) { + g := NewWithT(t) + s := tenantTestScheme(t) + + const testNS = "opendatahub" + now := metav1.NewTime(time.Now()) + tenant := &maasv1alpha1.Tenant{ + ObjectMeta: metav1.ObjectMeta{ + Name: maasv1alpha1.TenantInstanceName, + Namespace: testNS, + UID: types.UID("tenant-uid"), + DeletionTimestamp: &now, + Finalizers: []string{tenantFinalizer}, + }, + } + trueRef := true + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "maas-owned", + Namespace: testNS, + OwnerReferences: []metav1.OwnerReference{{ + APIVersion: maasv1alpha1.GroupVersion.String(), + Kind: maasv1alpha1.TenantKind, + Name: tenant.Name, + UID: tenant.UID, + Controller: &trueRef, + BlockOwnerDeletion: &trueRef, + }}, + }, + } + + cl := fake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&maasv1alpha1.Tenant{}). + WithObjects(tenant, cm). 
+ Build() + + r := &TenantReconciler{ + Client: cl, + Scheme: s, + OperatorNamespace: testNS, + AppNamespace: testNS, + } + + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: tenant.Name, Namespace: testNS}} + + res1, err := r.Reconcile(context.Background(), req) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(res1.RequeueAfter).To(Equal(finalizeRequeueInterval), "first pass issues child deletes and requeues") + + res2, err := r.Reconcile(context.Background(), req) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(res2.RequeueAfter).To(BeNumerically("==", 0)) + + var updated maasv1alpha1.Tenant + err = cl.Get(context.Background(), client.ObjectKey{Name: tenant.Name, Namespace: testNS}, &updated) + if apierrors.IsNotFound(err) { + // Fake client may remove the tenant once the finalizer is gone while deletionTimestamp is set. + } else { + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(updated.Finalizers).NotTo(ContainElement(tenantFinalizer)) + } + + var cms corev1.ConfigMapList + g.Expect(cl.List(context.Background(), &cms, client.InNamespace("opendatahub"))).To(Succeed()) + g.Expect(cms.Items).To(BeEmpty()) +} + +func TestTenantReconcile_DeletionRequeuesWhileOwnedChildTerminating(t *testing.T) { + g := NewWithT(t) + s := tenantTestScheme(t) + + const testNS = "opendatahub" + now := metav1.NewTime(time.Now()) + tenant := &maasv1alpha1.Tenant{ + ObjectMeta: metav1.ObjectMeta{ + Name: maasv1alpha1.TenantInstanceName, + Namespace: testNS, + UID: types.UID("tenant-uid"), + DeletionTimestamp: &now, + Finalizers: []string{tenantFinalizer}, + }, + } + trueRef := true + cm := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "maas-owned", + Namespace: testNS, + OwnerReferences: []metav1.OwnerReference{{ + APIVersion: maasv1alpha1.GroupVersion.String(), + Kind: maasv1alpha1.TenantKind, + Name: tenant.Name, + UID: tenant.UID, + Controller: &trueRef, + BlockOwnerDeletion: &trueRef, + }}, + DeletionTimestamp: &now, + Finalizers: 
[]string{"test-finalizer"}, + }, + } + + cl := fake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&maasv1alpha1.Tenant{}). + WithObjects(tenant, cm). + Build() + + r := &TenantReconciler{ + Client: cl, + Scheme: s, + OperatorNamespace: testNS, + AppNamespace: testNS, + } + + req := ctrl.Request{NamespacedName: types.NamespacedName{Name: tenant.Name, Namespace: testNS}} + + res, err := r.Reconcile(context.Background(), req) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(res.RequeueAfter).To(Equal(finalizeRequeueInterval)) + + var updated maasv1alpha1.Tenant + g.Expect(cl.Get(context.Background(), client.ObjectKey{Name: tenant.Name, Namespace: testNS}, &updated)).To(Succeed()) + g.Expect(updated.Finalizers).To(ContainElement(tenantFinalizer)) +} + +func TestTenantReconcile_NonSingletonNameIsNoOp(t *testing.T) { + g := NewWithT(t) + s := tenantTestScheme(t) + + const testNS = "models-as-a-service" + tenant := &maasv1alpha1.Tenant{ + ObjectMeta: metav1.ObjectMeta{ + Name: "not-default-tenant", + Namespace: testNS, + }, + } + + cl := fake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&maasv1alpha1.Tenant{}). + WithObjects(tenant). 
+ Build() + + r := &TenantReconciler{ + Client: cl, + Scheme: s, + AppNamespace: testNS, + } + + res, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "not-default-tenant", Namespace: testNS}, + }) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(res).To(Equal(ctrl.Result{})) + + var updated maasv1alpha1.Tenant + g.Expect(cl.Get(context.Background(), client.ObjectKey{Name: "not-default-tenant", Namespace: testNS}, &updated)).To(Succeed()) + g.Expect(updated.Finalizers).To(BeEmpty(), "non-singleton should not get a finalizer") +} + +func TestTenantReconcile_FinalizerAddedOnFirstReconcile(t *testing.T) { + g := NewWithT(t) + s := tenantTestScheme(t) + + const testNS = "models-as-a-service" + tenant := &maasv1alpha1.Tenant{ + ObjectMeta: metav1.ObjectMeta{ + Name: maasv1alpha1.TenantInstanceName, + Namespace: testNS, + }, + } + + cl := fake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&maasv1alpha1.Tenant{}). + WithObjects(tenant). 
+ Build() + + r := &TenantReconciler{ + Client: cl, + Scheme: s, + AppNamespace: testNS, + } + + res, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: maasv1alpha1.TenantInstanceName, Namespace: testNS}, + }) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(res.Requeue).To(BeTrue(), "should requeue after adding finalizer") + + var updated maasv1alpha1.Tenant + g.Expect(cl.Get(context.Background(), client.ObjectKey{Name: maasv1alpha1.TenantInstanceName, Namespace: testNS}, &updated)).To(Succeed()) + g.Expect(updated.Finalizers).To(ContainElement(tenantFinalizer)) +} + +func TestTenantReconcile_ManagementStateRemovedSetsIdleAndRemovesFinalizer(t *testing.T) { + g := NewWithT(t) + s := tenantTestScheme(t) + + const testNS = "models-as-a-service" + tenant := &maasv1alpha1.Tenant{ + ObjectMeta: metav1.ObjectMeta{ + Name: maasv1alpha1.TenantInstanceName, + Namespace: testNS, + Annotations: map[string]string{ + managementStateAnnotation: managementStateRemoved, + }, + Finalizers: []string{tenantFinalizer}, + }, + } + + cl := fake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&maasv1alpha1.Tenant{}). + WithObjects(tenant). 
+ Build() + + r := &TenantReconciler{ + Client: cl, + Scheme: s, + AppNamespace: testNS, + } + + res, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: maasv1alpha1.TenantInstanceName, Namespace: testNS}, + }) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(res).To(Equal(ctrl.Result{})) + + var updated maasv1alpha1.Tenant + g.Expect(cl.Get(context.Background(), client.ObjectKey{Name: maasv1alpha1.TenantInstanceName, Namespace: testNS}, &updated)).To(Succeed()) + g.Expect(updated.Finalizers).NotTo(ContainElement(tenantFinalizer), "finalizer should be removed in Removed state") + + readyCond := apimeta.FindStatusCondition(updated.Status.Conditions, tenantreconcile.ReadyConditionType) + g.Expect(readyCond).NotTo(BeNil()) + g.Expect(readyCond.Status).To(Equal(metav1.ConditionFalse)) + g.Expect(readyCond.Reason).To(Equal("ManagementStateIdle")) +} + +func TestTenantReconcile_ManagementStateUnmanagedSetsIdle(t *testing.T) { + g := NewWithT(t) + s := tenantTestScheme(t) + + const testNS = "models-as-a-service" + tenant := &maasv1alpha1.Tenant{ + ObjectMeta: metav1.ObjectMeta{ + Name: maasv1alpha1.TenantInstanceName, + Namespace: testNS, + Annotations: map[string]string{ + managementStateAnnotation: managementStateUnmanaged, + }, + }, + } + + cl := fake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&maasv1alpha1.Tenant{}). + WithObjects(tenant). 
+ Build() + + r := &TenantReconciler{ + Client: cl, + Scheme: s, + AppNamespace: testNS, + } + + res, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: maasv1alpha1.TenantInstanceName, Namespace: testNS}, + }) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(res).To(Equal(ctrl.Result{})) + + var updated maasv1alpha1.Tenant + g.Expect(cl.Get(context.Background(), client.ObjectKey{Name: maasv1alpha1.TenantInstanceName, Namespace: testNS}, &updated)).To(Succeed()) + readyCond := apimeta.FindStatusCondition(updated.Status.Conditions, tenantreconcile.ReadyConditionType) + g.Expect(readyCond).NotTo(BeNil()) + g.Expect(readyCond.Reason).To(Equal("ManagementStateIdle")) +} + +func TestTenantReconcile_UnexpectedManagementStateSetsFailedPhase(t *testing.T) { + g := NewWithT(t) + s := tenantTestScheme(t) + + const testNS = "models-as-a-service" + tenant := &maasv1alpha1.Tenant{ + ObjectMeta: metav1.ObjectMeta{ + Name: maasv1alpha1.TenantInstanceName, + Namespace: testNS, + Annotations: map[string]string{ + managementStateAnnotation: "InvalidState", + }, + Finalizers: []string{tenantFinalizer}, + }, + } + + cl := fake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&maasv1alpha1.Tenant{}). + WithObjects(tenant). 
+ Build() + + r := &TenantReconciler{ + Client: cl, + Scheme: s, + AppNamespace: testNS, + } + + res, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: maasv1alpha1.TenantInstanceName, Namespace: testNS}, + }) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(res.RequeueAfter).To(Equal(30 * time.Second)) + + var updated maasv1alpha1.Tenant + g.Expect(cl.Get(context.Background(), client.ObjectKey{Name: maasv1alpha1.TenantInstanceName, Namespace: testNS}, &updated)).To(Succeed()) + g.Expect(updated.Status.Phase).To(Equal("Failed")) + readyCond := apimeta.FindStatusCondition(updated.Status.Conditions, tenantreconcile.ReadyConditionType) + g.Expect(readyCond).NotTo(BeNil()) + g.Expect(readyCond.Reason).To(Equal("UnexpectedManagementState")) +} + +func TestTenantReconcile_DeletionIncludesAppNamespace(t *testing.T) { + g := NewWithT(t) + s := tenantTestScheme(t) + + const testNS = "models-as-a-service" + now := metav1.NewTime(time.Now()) + tenant := &maasv1alpha1.Tenant{ + ObjectMeta: metav1.ObjectMeta{ + Name: maasv1alpha1.TenantInstanceName, + Namespace: testNS, + UID: types.UID("tenant-uid"), + DeletionTimestamp: &now, + Finalizers: []string{tenantFinalizer}, + }, + Spec: maasv1alpha1.TenantSpec{ + GatewayRef: maasv1alpha1.TenantGatewayRef{ + Namespace: "openshift-ingress", + Name: "maas-default-gateway", + }, + }, + } + + cl := fake.NewClientBuilder(). + WithScheme(s). + WithStatusSubresource(&maasv1alpha1.Tenant{}). + WithObjects(tenant). + Build() + + r := &TenantReconciler{ + Client: cl, + Scheme: s, + OperatorNamespace: "opendatahub", + AppNamespace: testNS, + } + + _, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: maasv1alpha1.TenantInstanceName, Namespace: testNS}, + }) + // Finalization should succeed (no owned resources) and the object is deleted + // (fake client removes the object once finalizers are cleared on a deleted resource). 
+ // The reconciler may return NotFound when trying the final status update — that's OK. + if err != nil { + g.Expect(apierrors.IsNotFound(err)).To(BeTrue(), "expected NotFound (object finalized and deleted), got: %v", err) + } + + var updated maasv1alpha1.Tenant + err = cl.Get(context.Background(), client.ObjectKey{Name: maasv1alpha1.TenantInstanceName, Namespace: testNS}, &updated) + // Object should be gone (finalizer removed → fake client deletes it) + g.Expect(apierrors.IsNotFound(err)).To(BeTrue(), "tenant should be fully deleted after finalization") +} + +func TestTenantReconcile_NotFoundIsNoOp(t *testing.T) { + g := NewWithT(t) + s := tenantTestScheme(t) + + cl := fake.NewClientBuilder(). + WithScheme(s). + Build() + + r := &TenantReconciler{ + Client: cl, + Scheme: s, + AppNamespace: "models-as-a-service", + } + + res, err := r.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: types.NamespacedName{Name: maasv1alpha1.TenantInstanceName, Namespace: "models-as-a-service"}, + }) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(res).To(Equal(ctrl.Result{})) +} diff --git a/maas-controller/pkg/platform/tenantreconcile/apply.go b/maas-controller/pkg/platform/tenantreconcile/apply.go new file mode 100644 index 000000000..766b28cf8 --- /dev/null +++ b/maas-controller/pkg/platform/tenantreconcile/apply.go @@ -0,0 +1,151 @@ +package tenantreconcile + +import ( + "bufio" + "context" + "fmt" + "os" + "path/filepath" + "strings" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + + maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" +) + +const ssaFieldOwner = "maas-controller" + +func parseParams(fileName string) (map[string]string, error) { + paramsEnv, err := os.Open(fileName) + if err != nil { + return nil, err + } + defer paramsEnv.Close() + + paramsEnvMap := 
make(map[string]string) + scanner := bufio.NewScanner(paramsEnv) + for scanner.Scan() { + line := scanner.Text() + key, value, found := strings.Cut(line, "=") + if found { + paramsEnvMap[key] = value + } + } + if err := scanner.Err(); err != nil { + return nil, err + } + + return paramsEnvMap, nil +} + +func writeParamsToTmp(params map[string]string, tmpDir string) (string, error) { + tmp, err := os.CreateTemp(tmpDir, "params.env-") + if err != nil { + return "", err + } + defer tmp.Close() + + writer := bufio.NewWriter(tmp) + for key, value := range params { + if _, err := fmt.Fprintf(writer, "%s=%s\n", key, value); err != nil { + return "", err + } + } + if err := writer.Flush(); err != nil { + return "", fmt.Errorf("failed to write to file: %w", err) + } + + return tmp.Name(), nil +} + +func updateMap(m *map[string]string, key, val string) int { + old := (*m)[key] + if old == val { + return 0 + } + (*m)[key] = val + return 1 +} + +// ApplyParams mirrors opendatahub-operator/pkg/deploy.ApplyParams for params.env substitution. 
+func ApplyParams(componentPath, file string, imageParamsMap map[string]string, extraParamsMaps ...map[string]string) error { + paramsFile := filepath.Join(componentPath, file) + + paramsEnvMap, err := parseParams(paramsFile) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + + updated := 0 + for i := range paramsEnvMap { + relatedImageValue := os.Getenv(imageParamsMap[i]) + if relatedImageValue != "" { + updated |= updateMap(¶msEnvMap, i, relatedImageValue) + } + } + for _, extraParamsMap := range extraParamsMaps { + for eKey, eValue := range extraParamsMap { + updated |= updateMap(¶msEnvMap, eKey, eValue) + } + } + + if updated == 0 { + return nil + } + + tmp, err := writeParamsToTmp(paramsEnvMap, componentPath) + if err != nil { + return err + } + + if err = os.Rename(tmp, paramsFile); err != nil { + _ = os.Remove(tmp) + return fmt.Errorf("failed rename %s to %s: %w", tmp, paramsFile, err) + } + + return nil +} + +// ApplyRendered server-side-applies rendered objects with Tenant as controller owner (ODH deploy parity). +// Same-namespace children get a standard ownerReference; cluster-scoped and cross-namespace children +// get tracking labels instead (Kubernetes forbids cross-namespace and namespaced-to-cluster ownerReferences). 
+func ApplyRendered(ctx context.Context, c client.Client, scheme *runtime.Scheme, tenant *maasv1alpha1.Tenant, objs []unstructured.Unstructured) error { + for i := range objs { + u := objs[i].DeepCopy() + + childNs := u.GetNamespace() + if childNs != "" && childNs == tenant.Namespace { + if err := controllerutil.SetControllerReference(tenant, u, scheme); err != nil { + return fmt.Errorf("set controller reference on %s %s/%s: %w", u.GetKind(), u.GetNamespace(), u.GetName(), err) + } + } else { + setTenantTrackingLabels(u, tenant) + } + unstructured.RemoveNestedField(u.Object, "metadata", "managedFields") + unstructured.RemoveNestedField(u.Object, "metadata", "resourceVersion") + unstructured.RemoveNestedField(u.Object, "status") + // ForceOwnership is intentional: maas-controller is the sole manager for + // Tenant platform resources. During migration from the ODH modelsasservice + // pipeline, force ensures a clean field-manager handoff without conflicts. + if err := c.Patch(ctx, u, client.Apply, client.FieldOwner(ssaFieldOwner), client.ForceOwnership); err != nil { + return fmt.Errorf("apply %s %s/%s: %w", u.GetKind(), u.GetNamespace(), u.GetName(), err) + } + } + return nil +} + +func setTenantTrackingLabels(obj *unstructured.Unstructured, tenant *maasv1alpha1.Tenant) { + labels := obj.GetLabels() + if labels == nil { + labels = make(map[string]string) + } + labels[LabelTenantName] = tenant.Name + labels[LabelTenantNamespace] = tenant.Namespace + obj.SetLabels(labels) +} diff --git a/maas-controller/pkg/platform/tenantreconcile/constants.go b/maas-controller/pkg/platform/tenantreconcile/constants.go new file mode 100644 index 000000000..4a45ec77f --- /dev/null +++ b/maas-controller/pkg/platform/tenantreconcile/constants.go @@ -0,0 +1,60 @@ +// Package tenantreconcile mirrors the Open Data Hub operator modelsasservice component pipeline +// (initialize → dependencies → prerequisites → gateway → params → kustomize → post-render → apply → deployment status). 
+package tenantreconcile + +import "k8s.io/apimachinery/pkg/runtime/schema" + +const ( + // ComponentName matches the ODH modelsasservice component label key suffix (app.opendatahub.io/). + ComponentName = "modelsasservice" + + LabelODHAppPrefix = "app.opendatahub.io" + LabelK8sPartOf = "app.kubernetes.io/part-of" + LabelTenantName = "maas.opendatahub.io/tenant-name" + LabelTenantNamespace = "maas.opendatahub.io/tenant-namespace" + + DefaultGatewayNamespace = "openshift-ingress" + DefaultGatewayName = "maas-default-gateway" + + GatewayDefaultAuthPolicyName = "gateway-default-auth" + GatewayTokenRateLimitDefaultDenyPolicyName = "gateway-default-deny" + MaaSAPIAuthPolicyName = "maas-api-auth-policy" + GatewayDestinationRuleName = "maas-api-backend-tls" + TelemetryPolicyName = "maas-telemetry" + IstioTelemetryName = "latency-per-subscription" + MaaSParametersConfigMapName = "maas-parameters" + MaaSAPIDeploymentName = "maas-api" + MaaSDBSecretName = "maas-db-config" //nolint:gosec // secret name reference, not a credential + MaaSDBSecretKey = "DB_CONNECTION_URL" + + MonitoringNamespace = "openshift-monitoring" + ClusterMonitoringConfigName = "cluster-monitoring-config" + + // Condition types aligned with ODH internal/controller/status for DSC aggregation parity. + ConditionDependenciesAvailable = "DependenciesAvailable" + ConditionMaaSPrerequisitesAvailable = "MaaSPrerequisitesAvailable" + ConditionDeploymentsAvailable = "DeploymentsAvailable" + ConditionTypeDegraded = "Degraded" + ReadyConditionType = "Ready" +) + +// ImageParamKeys maps params.env keys to RELATED_IMAGE_* env vars (same as ODH modelsasservice_support.go). 
+var ImageParamKeys = map[string]string{ + "maas-api-image": "RELATED_IMAGE_ODH_MAAS_API_IMAGE", + "maas-controller-image": "RELATED_IMAGE_ODH_MAAS_CONTROLLER_IMAGE", + "payload-processing-image": "RELATED_IMAGE_ODH_AI_GATEWAY_PAYLOAD_PROCESSING_IMAGE", +} + +// GVKs used for post-render and readiness (mirrors opendatahub-operator/pkg/cluster/gvk selections for modelsasservice). +var ( + GVKConfigMap = schema.GroupVersionKind{Group: "", Version: "v1", Kind: "ConfigMap"} + GVKDeployment = schema.GroupVersionKind{Group: "apps", Version: "v1", Kind: "Deployment"} + GVKAuthPolicy = schema.GroupVersionKind{Group: "kuadrant.io", Version: "v1", Kind: "AuthPolicy"} + GVKTokenRateLimitPolicy = schema.GroupVersionKind{Group: "kuadrant.io", Version: "v1alpha1", Kind: "TokenRateLimitPolicy"} + GVKDestinationRule = schema.GroupVersionKind{Group: "networking.istio.io", Version: "v1", Kind: "DestinationRule"} + GVKTelemetryPolicy = schema.GroupVersionKind{Group: "extensions.kuadrant.io", Version: "v1alpha1", Kind: "TelemetryPolicy"} + GVKEnvoyFilter = schema.GroupVersionKind{Group: "networking.istio.io", Version: "v1alpha3", Kind: "EnvoyFilter"} + GVKIstioTelemetry = schema.GroupVersionKind{Group: "telemetry.istio.io", Version: "v1", Kind: "Telemetry"} + GVKAuthConfig = schema.GroupVersionKind{Group: "authorino.kuadrant.io", Version: "v1beta3", Kind: "AuthConfig"} + GVKAuthorino = schema.GroupVersionKind{Group: "operator.authorino.kuadrant.io", Version: "v1beta1", Kind: "Authorino"} +) diff --git a/maas-controller/pkg/platform/tenantreconcile/kustomize.go b/maas-controller/pkg/platform/tenantreconcile/kustomize.go new file mode 100644 index 000000000..ce49fcfb0 --- /dev/null +++ b/maas-controller/pkg/platform/tenantreconcile/kustomize.go @@ -0,0 +1,141 @@ +package tenantreconcile + +import ( + "fmt" + "os" + "path/filepath" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "sigs.k8s.io/kustomize/api/builtins" //nolint:staticcheck // no replacement until kustomize API v1 + 
"sigs.k8s.io/kustomize/api/filters/namespace" + "sigs.k8s.io/kustomize/api/krusty" + "sigs.k8s.io/kustomize/api/types" + "sigs.k8s.io/kustomize/kyaml/filesys" + "sigs.k8s.io/kustomize/kyaml/resid" + + maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" +) + +// createNamespaceApplierPlugin mirrors opendatahub-operator/pkg/plugins.CreateNamespaceApplierPlugin. +func createNamespaceApplierPlugin(targetNamespace string) *builtins.NamespaceTransformerPlugin { + return &builtins.NamespaceTransformerPlugin{ + ObjectMeta: types.ObjectMeta{ + Name: "maas-namespace-plugin", + Namespace: targetNamespace, + }, + FieldSpecs: []types.FieldSpec{ + {Gvk: resid.Gvk{}, Path: "metadata/namespace", CreateIfNotPresent: true}, + {Gvk: resid.Gvk{Group: "rbac.authorization.k8s.io", Kind: "ClusterRoleBinding"}, Path: "subjects/namespace", CreateIfNotPresent: true}, + {Gvk: resid.Gvk{Group: "rbac.authorization.k8s.io", Kind: "RoleBinding"}, Path: "subjects/namespace", CreateIfNotPresent: true}, + {Gvk: resid.Gvk{Group: "admissionregistration.k8s.io", Kind: "ValidatingWebhookConfiguration"}, Path: "webhooks/clientConfig/service/namespace", CreateIfNotPresent: false}, + {Gvk: resid.Gvk{Group: "admissionregistration.k8s.io", Kind: "MutatingWebhookConfiguration"}, Path: "webhooks/clientConfig/service/namespace", CreateIfNotPresent: false}, + {Gvk: resid.Gvk{Group: "apiextensions.k8s.io", Kind: "CustomResourceDefinition"}, Path: "spec/conversion/webhook/clientConfig/service/namespace", CreateIfNotPresent: false}, + }, + UnsetOnly: false, + SetRoleBindingSubjects: namespace.AllServiceAccountSubjects, + } +} + +func odhComponentLabels() map[string]string { + return map[string]string{ + LabelODHAppPrefix + "/" + ComponentName: "true", + LabelK8sPartOf: "models-as-a-service", + } +} + +func createSetLabelsPlugin(labels map[string]string) *builtins.LabelTransformerPlugin { + return &builtins.LabelTransformerPlugin{ + Labels: labels, + FieldSpecs: 
[]types.FieldSpec{ + {Gvk: resid.Gvk{Kind: "Deployment"}, Path: "spec/template/metadata/labels", CreateIfNotPresent: true}, + {Gvk: resid.Gvk{Kind: "Deployment"}, Path: "spec/selector/matchLabels", CreateIfNotPresent: true}, + {Gvk: resid.Gvk{}, Path: "metadata/labels", CreateIfNotPresent: true}, + }, + } +} + +// RenderKustomize runs kustomize build for the ODH maas-api overlay and applies ODH-equivalent namespace + labels. +func RenderKustomize(manifestDir, appNamespace string) ([]unstructured.Unstructured, error) { + kustomizationPath := manifestDir + if !fileExists(filepath.Join(manifestDir, "kustomization.yaml")) { + kustomizationPath = filepath.Join(manifestDir, "default") + } + + k := krusty.MakeKustomizer(krusty.MakeDefaultOptions()) + fs := filesys.MakeFsOnDisk() + resMap, err := k.Run(fs, kustomizationPath) + if err != nil { + return nil, fmt.Errorf("kustomize build %q: %w", kustomizationPath, err) + } + + if appNamespace != "" { + plugin := createNamespaceApplierPlugin(appNamespace) + if err := plugin.Transform(resMap); err != nil { + return nil, fmt.Errorf("namespace transform: %w", err) + } + } + + labelPlugin := createSetLabelsPlugin(odhComponentLabels()) + if err := labelPlugin.Transform(resMap); err != nil { + return nil, fmt.Errorf("labels transform: %w", err) + } + + rendered := resMap.Resources() + out := make([]unstructured.Unstructured, 0, len(rendered)) + for i := range rendered { + m, err := rendered[i].Map() + if err != nil { + return nil, fmt.Errorf("resource map: %w", err) + } + normalizeJSONTypes(m) + out = append(out, unstructured.Unstructured{Object: m}) + } + return out, nil +} + +// normalizeJSONTypes converts Go int values to int64 in an unstructured map. +// Kustomize's resMap.Map() returns int for YAML integers, but +// k8s.io/apimachinery DeepCopyJSONValue only handles int64/float64. 
+func normalizeJSONTypes(obj map[string]any) { + for k, v := range obj { + obj[k] = normalizeValue(v) + } +} + +func normalizeValue(v any) any { + switch val := v.(type) { + case int: + return int64(val) + case map[string]any: + normalizeJSONTypes(val) + return val + case []any: + for i, item := range val { + val[i] = normalizeValue(item) + } + return val + default: + return v + } +} + +func fileExists(p string) bool { + fs := filesys.MakeFsOnDisk() + return fs.Exists(p) +} + +// DefaultManifestPath returns MAAS_PLATFORM_MANIFESTS or a dev default relative to cwd (models-as-a-service repo layout). +func DefaultManifestPath() string { + if v := os.Getenv("MAAS_PLATFORM_MANIFESTS"); v != "" { + return v + } + return "../maas-api/deploy/overlays/odh" +} + +// EnsureTenantGatewayDefaults applies the same default gateway ref as ODH when unset. +func EnsureTenantGatewayDefaults(t *maasv1alpha1.Tenant) { + if t.Spec.GatewayRef.Namespace == "" && t.Spec.GatewayRef.Name == "" { + t.Spec.GatewayRef.Namespace = DefaultGatewayNamespace + t.Spec.GatewayRef.Name = DefaultGatewayName + } +} diff --git a/maas-controller/pkg/platform/tenantreconcile/pipeline.go b/maas-controller/pkg/platform/tenantreconcile/pipeline.go new file mode 100644 index 000000000..06fb374a8 --- /dev/null +++ b/maas-controller/pkg/platform/tenantreconcile/pipeline.go @@ -0,0 +1,140 @@ +package tenantreconcile + +import ( + "context" + "errors" + "fmt" + "path/filepath" + + "github.com/go-logr/logr" + appsv1 "k8s.io/api/apps/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/validation" + "sigs.k8s.io/controller-runtime/pkg/client" + gwapiv1 "sigs.k8s.io/gateway-api/apis/v1" + + maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" +) + +// RunResult is returned from Run for reconcile pacing. 
+type RunResult struct { + DeploymentPending bool + Detail string +} + +// CheckDependencies verifies required CRDs (AuthConfig) are registered on the cluster. +func CheckDependencies(ctx context.Context, c client.Client) error { + if ok, err := IsGVKAvailable(c, GVKAuthConfig); err != nil { + return fmt.Errorf("dependencies: %w", err) + } else if !ok { + return errors.New("dependency missing: AuthConfig CRD (authorino.kuadrant.io/v1beta3) not available on cluster") + } + return nil +} + +// RunPlatform runs kustomize render, apply, and deployment readiness after dependencies and prerequisites +// have succeeded and gateway ref is valid (caller validates gateway existence). +func RunPlatform(ctx context.Context, log logr.Logger, c client.Client, scheme *runtime.Scheme, tenant *maasv1alpha1.Tenant, manifestPath string, appNs string) (*RunResult, error) { + manifestPath, err := filepath.Abs(manifestPath) + if err != nil { + return nil, fmt.Errorf("manifest path: %w", err) + } + + if errs := validation.IsDNS1123Subdomain(appNs); len(errs) > 0 { + return nil, fmt.Errorf("invalid application namespace %q: %v", appNs, errs) + } + + if tenant.Spec.GatewayRef.Namespace == "" || tenant.Spec.GatewayRef.Name == "" { + return nil, errors.New("gateway ref must be set (reconciler should default gateway before calling RunPlatform)") + } + gw := &gwapiv1.Gateway{} + if err := c.Get(ctx, types.NamespacedName{Namespace: tenant.Spec.GatewayRef.Namespace, Name: tenant.Spec.GatewayRef.Name}, gw); err != nil { + if apierrors.IsNotFound(err) { + return nil, fmt.Errorf("gateway %s/%s not found", tenant.Spec.GatewayRef.Namespace, tenant.Spec.GatewayRef.Name) + } + return nil, fmt.Errorf("gateway lookup: %w", err) + } + + audience, err := GetClusterServiceAccountIssuer(ctx, c) + if err != nil { + return nil, fmt.Errorf("cluster audience: %w", err) + } + if err := CustomizeParams(manifestPath, tenant, appNs, audience); err != nil { + return nil, fmt.Errorf("customize params: %w", err) + } + 
+ rendered, err := RenderKustomize(manifestPath, appNs) + if err != nil { + return nil, fmt.Errorf("kustomize: %w", err) + } + + resources, err := PostRender(ctx, log, tenant, rendered) + if err != nil { + return nil, fmt.Errorf("post-render: %w", err) + } + + if err := ApplyRendered(ctx, c, scheme, tenant, resources); err != nil { + return nil, fmt.Errorf("apply: %w", err) + } + + ready, detail, err := MaasAPIDeploymentReady(ctx, c, appNs) + if err != nil { + return nil, fmt.Errorf("deployment status: %w", err) + } + if !ready { + return &RunResult{DeploymentPending: true, Detail: detail}, nil + } + return &RunResult{}, nil +} + +// Run executes the ODH-equivalent modelsasservice pipeline against Tenant. +// The application namespace is derived from tenant.Namespace (Tenant CR is co-located with workloads). +func Run(ctx context.Context, log logr.Logger, c client.Client, scheme *runtime.Scheme, tenant *maasv1alpha1.Tenant, manifestPath string) (*RunResult, error) { + manifestPath, err := filepath.Abs(manifestPath) + if err != nil { + return nil, fmt.Errorf("manifest path: %w", err) + } + + if err := CheckDependencies(ctx, c); err != nil { + return nil, err + } + + appNs := tenant.Namespace + if errs := validation.IsDNS1123Subdomain(appNs); len(errs) > 0 { + return nil, fmt.Errorf("invalid application namespace %q: %v", appNs, errs) + } + + if err := ValidatePrerequisites(ctx, c, appNs); err != nil { + return nil, fmt.Errorf("prerequisites: %w", err) + } + + return RunPlatform(ctx, log, c, scheme, tenant, manifestPath, appNs) +} + +// MaasAPIDeploymentReady mirrors ODH deployments action for maas-api. 
+func MaasAPIDeploymentReady(ctx context.Context, c client.Client, appNamespace string) (ready bool, detail string, err error) { + dep := &appsv1.Deployment{} + key := types.NamespacedName{Namespace: appNamespace, Name: MaaSAPIDeploymentName} + if err := c.Get(ctx, key, dep); err != nil { + if apierrors.IsNotFound(err) { + return false, fmt.Sprintf("deployment %s/%s not found", appNamespace, MaaSAPIDeploymentName), nil + } + return false, "", err + } + desired := int32(1) + if dep.Spec.Replicas != nil { + desired = *dep.Spec.Replicas + } + if dep.Status.ObservedGeneration < dep.Generation { + return false, "waiting for deployment spec to be observed", nil + } + if dep.Status.UpdatedReplicas < desired { + return false, fmt.Sprintf("updated replicas %d/%d", dep.Status.UpdatedReplicas, desired), nil + } + if dep.Status.AvailableReplicas < desired { + return false, fmt.Sprintf("available replicas %d/%d", dep.Status.AvailableReplicas, desired), nil + } + return true, "", nil +} diff --git a/maas-controller/pkg/platform/tenantreconcile/postrender.go b/maas-controller/pkg/platform/tenantreconcile/postrender.go new file mode 100644 index 000000000..1c187e419 --- /dev/null +++ b/maas-controller/pkg/platform/tenantreconcile/postrender.go @@ -0,0 +1,423 @@ +package tenantreconcile + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "fmt" + "sort" + "strconv" + "strings" + + "github.com/go-logr/logr" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + + maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" +) + +// PostRender mutates rendered resources the same way as ODH modelsasservice post-kustomize actions. 
+func PostRender(ctx context.Context, log logr.Logger, tenant *maasv1alpha1.Tenant, resources []unstructured.Unstructured) ([]unstructured.Unstructured, error) { + gatewayNamespace := tenant.Spec.GatewayRef.Namespace + gatewayName := tenant.Spec.GatewayRef.Name + + // Filter out resources with opendatahub.io/managed: false annotation + var filteredResources []unstructured.Unstructured + for i := range resources { + resource := &resources[i] + + // Skip resources with opendatahub.io/managed: false annotation + annotations := resource.GetAnnotations() + if annotations != nil && annotations["opendatahub.io/managed"] == "false" { + log.V(2).Info("Skipping resource due to opendatahub.io/managed=false annotation", + "kind", resource.GetKind(), "name", resource.GetName(), "namespace", resource.GetNamespace()) + continue + } + + gvk := resource.GroupVersionKind() + switch { + case gvk == GVKAuthPolicy && resource.GetName() == GatewayDefaultAuthPolicyName: + if err := configureAuthPolicy(log, resource, gatewayNamespace, gatewayName); err != nil { + return nil, err + } + case gvk == GVKTokenRateLimitPolicy && resource.GetName() == GatewayTokenRateLimitDefaultDenyPolicyName: + if err := configureTokenRateLimitPolicy(log, resource, gatewayNamespace, gatewayName); err != nil { + return nil, err + } + case gvk == GVKDestinationRule && resource.GetName() == GatewayDestinationRuleName: + configureDestinationRule(log, resource, gatewayNamespace) + } + + filteredResources = append(filteredResources, *resource) + } + + setManagedFalseAnnotation(filteredResources) + + if err := configureExternalOIDC(log, tenant, filteredResources); err != nil { + return nil, err + } + if err := configureTelemetryPolicyResources(log, tenant, &filteredResources); err != nil { + return nil, err + } + if err := configureIstioTelemetryResources(log, tenant, &filteredResources); err != nil { + return nil, err + } + if err := configureConfigHashAnnotation(log, filteredResources); err != nil { + return nil, 
err + } + _ = ctx + return filteredResources, nil +} + +func configureAuthPolicy(log logr.Logger, resource *unstructured.Unstructured, gatewayNamespace, gatewayName string) error { + log.V(4).Info("Configuring AuthPolicy", "name", resource.GetName(), "newNamespace", gatewayNamespace, "newTargetGateway", gatewayName) + resource.SetNamespace(gatewayNamespace) + if err := unstructured.SetNestedField(resource.Object, gatewayName, "spec", "targetRef", "name"); err != nil { + return fmt.Errorf("failed to set spec.targetRef.name on AuthPolicy: %w", err) + } + return nil +} + +func configureTokenRateLimitPolicy(log logr.Logger, resource *unstructured.Unstructured, gatewayNamespace, gatewayName string) error { + log.V(4).Info("Configuring TokenRateLimitPolicy", "name", resource.GetName(), "newNamespace", gatewayNamespace, "newTargetGateway", gatewayName) + resource.SetNamespace(gatewayNamespace) + if err := unstructured.SetNestedField(resource.Object, gatewayName, "spec", "targetRef", "name"); err != nil { + return fmt.Errorf("failed to set spec.targetRef.name on TokenRateLimitPolicy: %w", err) + } + return nil +} + +func configureDestinationRule(log logr.Logger, resource *unstructured.Unstructured, gatewayNamespace string) { + log.V(4).Info("Configuring DestinationRule", "name", resource.GetName(), "newNamespace", gatewayNamespace) + resource.SetNamespace(gatewayNamespace) +} + +// setManagedFalseAnnotation marks the maas-api AuthPolicy with opendatahub.io/managed=false +// so the ODH operator does not reconcile it back to its defaults after the Tenant reconciler +// has applied OIDC, audience, and other customizations. 
+func setManagedFalseAnnotation(resources []unstructured.Unstructured) { + for i := range resources { + r := &resources[i] + if r.GroupVersionKind() == GVKAuthPolicy && r.GetName() == MaaSAPIAuthPolicyName { + ann := r.GetAnnotations() + if ann == nil { + ann = make(map[string]string) + } + ann["opendatahub.io/managed"] = "false" + r.SetAnnotations(ann) + return + } + } +} + +func configureExternalOIDC(log logr.Logger, tenant *maasv1alpha1.Tenant, resources []unstructured.Unstructured) error { + if tenant.Spec.ExternalOIDC == nil { + return nil + } + oidc := tenant.Spec.ExternalOIDC + for i := range resources { + resource := &resources[i] + if resource.GroupVersionKind() == GVKAuthPolicy && resource.GetName() == MaaSAPIAuthPolicyName { + return patchAuthPolicyWithOIDC(log, resource, oidc) + } + } + return fmt.Errorf("rendered resources are missing AuthPolicy %q while spec.externalOIDC is configured — refusing to deploy without OIDC rules", MaaSAPIAuthPolicyName) +} + +func patchAuthPolicyWithOIDC(log logr.Logger, resource *unstructured.Unstructured, oidc *maasv1alpha1.TenantExternalOIDCConfig) error { + ttl := int64(oidc.TTL) + if ttl == 0 { + ttl = 300 + } + if err := unstructured.SetNestedField(resource.Object, map[string]any{ + "when": []any{ + map[string]any{ + "predicate": `!request.headers.authorization.startsWith("Bearer sk-oai-") && request.headers.authorization.matches("^Bearer [^.]+\\.[^.]+\\.[^.]+$")`, + }, + }, + "jwt": map[string]any{ + "issuerUrl": oidc.IssuerURL, + "ttl": ttl, + }, + "priority": int64(1), + }, "spec", "rules", "authentication", "oidc-identities"); err != nil { + return fmt.Errorf("failed to set oidc-identities: %w", err) + } + if err := unstructured.SetNestedField(resource.Object, int64(2), + "spec", "rules", "authentication", "openshift-identities", "priority"); err != nil { + return fmt.Errorf("failed to set openshift-identities priority: %w", err) + } + if err := unstructured.SetNestedField(resource.Object, []any{ + 
map[string]any{ + "predicate": `!request.headers.authorization.startsWith("Bearer sk-oai-")`, + }, + }, "spec", "rules", "authentication", "openshift-identities", "when"); err != nil { + return fmt.Errorf("failed to set openshift-identities when: %w", err) + } + if err := unstructured.SetNestedField(resource.Object, map[string]any{ + "when": []any{ + map[string]any{ + "predicate": `!request.headers.authorization.startsWith("Bearer sk-oai-") && request.headers.authorization.matches("^Bearer [^.]+\\.[^.]+\\.[^.]+$")`, + }, + }, + "patternMatching": map[string]any{ + "patterns": []any{ + map[string]any{ + "selector": "auth.identity.azp", + "operator": "eq", + "value": oidc.ClientID, + }, + }, + }, + "priority": int64(1), + }, "spec", "rules", "authorization", "oidc-client-bound"); err != nil { + return fmt.Errorf("failed to set oidc-client-bound: %w", err) + } + if err := unstructured.SetNestedField(resource.Object, map[string]any{ + "expression": `has(auth.identity.preferred_username) ? auth.identity.preferred_username : (has(auth.identity.sub) ? auth.identity.sub : auth.identity.user.username)`, + }, "spec", "rules", "response", "success", "headers", "X-MaaS-Username-OC", "plain"); err != nil { + return fmt.Errorf("failed to set X-MaaS-Username-OC: %w", err) + } + groupsExpr := `has(auth.identity.groups) ? ` + + `(size(auth.identity.groups) > 0 ? 
` + + `'["system:authenticated","' + auth.identity.groups.join('","') + '"]' : ` + + `'["system:authenticated"]') : ` + + `'["' + auth.identity.user.groups.join('","') + '"]'` + if err := unstructured.SetNestedField(resource.Object, map[string]any{ + "expression": groupsExpr, + }, "spec", "rules", "response", "success", "headers", "X-MaaS-Group-OC", "plain"); err != nil { + return fmt.Errorf("failed to set X-MaaS-Group-OC: %w", err) + } + log.Info("Patched maas-api AuthPolicy with external OIDC configuration", "issuerUrl", oidc.IssuerURL, "clientId", oidc.ClientID) + return nil +} + +func isTelemetryEnabled(t *maasv1alpha1.TenantTelemetryConfig) bool { + if t == nil { + return false + } + if t.Enabled == nil { + return false + } + return *t.Enabled +} + +func configureTelemetryPolicyResources(log logr.Logger, tenant *maasv1alpha1.Tenant, resources *[]unstructured.Unstructured) error { + if !isTelemetryEnabled(tenant.Spec.Telemetry) { + return nil + } + // Caller should have checked CRD; still skip if API missing at apply time. 
+ gatewayNamespace := tenant.Spec.GatewayRef.Namespace + gatewayName := tenant.Spec.GatewayRef.Name + metricLabels := buildTelemetryLabels(log, tenant.Spec.Telemetry) + tp := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "extensions.kuadrant.io/v1alpha1", + "kind": "TelemetryPolicy", + "metadata": map[string]any{ + "name": TelemetryPolicyName, + "namespace": gatewayNamespace, + "labels": map[string]any{ + "app.kubernetes.io/part-of": "maas-observability", + LabelTenantName: tenant.Name, + LabelTenantNamespace: tenant.Namespace, + }, + }, + "spec": map[string]any{ + "targetRef": map[string]any{ + "group": "gateway.networking.k8s.io", + "kind": "Gateway", + "name": gatewayName, + }, + "metrics": map[string]any{ + "default": map[string]any{ + "labels": metricLabels, + }, + }, + }, + }, + } + log.V(2).Info("Appending TelemetryPolicy", "name", TelemetryPolicyName, "namespace", gatewayNamespace) + *resources = append(*resources, *tp) + return nil +} + +func configureIstioTelemetryResources(log logr.Logger, tenant *maasv1alpha1.Tenant, resources *[]unstructured.Unstructured) error { + if !isTelemetryEnabled(tenant.Spec.Telemetry) { + return nil + } + gatewayNamespace := tenant.Spec.GatewayRef.Namespace + gatewayName := tenant.Spec.GatewayRef.Name + istioTelemetry := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "telemetry.istio.io/v1", + "kind": "Telemetry", + "metadata": map[string]any{ + "name": IstioTelemetryName, + "namespace": gatewayNamespace, + "labels": map[string]any{ + "app.kubernetes.io/part-of": "maas-observability", + LabelTenantName: tenant.Name, + LabelTenantNamespace: tenant.Namespace, + }, + }, + "spec": map[string]any{ + "selector": map[string]any{ + "matchLabels": map[string]any{ + "gateway.networking.k8s.io/gateway-name": gatewayName, + }, + }, + "metrics": []any{ + map[string]any{ + "providers": []any{map[string]any{"name": "prometheus"}}, + "overrides": []any{ + map[string]any{ + "match": 
map[string]any{"metric": "REQUEST_DURATION", "mode": "CLIENT_AND_SERVER"}, + "tagOverrides": map[string]any{ + "subscription": map[string]any{ + "operation": "UPSERT", + "value": `request.headers["x-maas-subscription"]`, + }, + }, + }, + }, + }, + }, + }, + }, + } + log.V(2).Info("Appending Istio Telemetry", "name", IstioTelemetryName, "namespace", gatewayNamespace) + *resources = append(*resources, *istioTelemetry) + return nil +} + +func buildTelemetryLabels(log logr.Logger, config *maasv1alpha1.TenantTelemetryConfig) map[string]any { + captureOrganization := true + captureUser := false + captureGroup := false + captureModelUsage := true + if config != nil && config.Metrics != nil { + metrics := config.Metrics + if metrics.CaptureOrganization != nil { + captureOrganization = *metrics.CaptureOrganization + } + if metrics.CaptureUser != nil { + captureUser = *metrics.CaptureUser + } + if metrics.CaptureGroup != nil { + captureGroup = *metrics.CaptureGroup + } + if metrics.CaptureModelUsage != nil { + captureModelUsage = *metrics.CaptureModelUsage + } + } + labels := map[string]any{ + "subscription": "auth.identity.selected_subscription", + "cost_center": "auth.identity.subscription_info.costCenter", + } + if captureOrganization { + labels["organization_id"] = "auth.identity.subscription_info.organizationId" + } + if captureUser { + log.Info("WARNING: User identity metrics enabled - ensure GDPR/privacy compliance", "field", "captureUser", "value", true) + labels["user"] = "auth.identity.userid" + } + if captureGroup { + labels["group"] = "auth.identity.group" + } + if captureModelUsage { + labels["model"] = "responseBodyJSON(\"/model\")" + } + return labels +} + +func configureConfigHashAnnotation(log logr.Logger, resources []unstructured.Unstructured) error { + var configMap *corev1.ConfigMap + for idx := range resources { + resource := &resources[idx] + if resource.GroupVersionKind() == GVKConfigMap && resource.GetName() == MaaSParametersConfigMapName { + cm := 
&corev1.ConfigMap{} + if err := runtime.DefaultUnstructuredConverter.FromUnstructured(resource.Object, cm); err != nil { + return fmt.Errorf("failed to convert ConfigMap: %w", err) + } + configMap = cm + break + } + } + if configMap == nil { + log.V(1).Info("ConfigMap not found in rendered resources, skipping config hash annotation", "expectedName", MaaSParametersConfigMapName) + return nil + } + + configHash := hashConfigMapData(configMap.Data) + log.V(4).Info("Computed ConfigMap hash", "hash", configHash, "configMap", configMap.Name) + + var deployment *appsv1.Deployment + depIdx := -1 + for idx := range resources { + resource := &resources[idx] + if resource.GroupVersionKind() == GVKDeployment && resource.GetName() == MaaSAPIDeploymentName { + dep := &appsv1.Deployment{} + if err := runtime.DefaultUnstructuredConverter.FromUnstructured(resource.Object, dep); err != nil { + return fmt.Errorf("failed to convert Deployment: %w", err) + } + deployment = dep + depIdx = idx + break + } + } + if deployment == nil { + log.V(1).Info("Deployment not found in rendered resources, skipping config hash annotation", "expectedName", MaaSAPIDeploymentName) + return nil + } + + if deployment.Spec.Template.Annotations == nil { + deployment.Spec.Template.Annotations = make(map[string]string) + } + annotationKey := LabelODHAppPrefix + "/maas-config-hash" + deployment.Spec.Template.Annotations[annotationKey] = configHash + + u, err := runtime.DefaultUnstructuredConverter.ToUnstructured(deployment) + if err != nil { + return fmt.Errorf("failed to convert Deployment back to unstructured: %w", err) + } + resources[depIdx].Object = u + + return nil +} + +func hashConfigMapData(data map[string]string) string { + keys := make([]string, 0, len(data)) + for k := range data { + keys = append(keys, k) + } + sort.Strings(keys) + var sb strings.Builder + for _, k := range keys { + sb.WriteString(k) + sb.WriteString("=") + sb.WriteString(data[k]) + sb.WriteString("\n") + } + hash := 
sha256.Sum256([]byte(sb.String())) + return hex.EncodeToString(hash[:]) +} + +// CustomizeParams writes gateway/app-namespace/cluster-audience and optional API key days into overlay params.env +// (same keys as ODH customizeManifests; images use RELATED_IMAGE_* like ODH Init + ApplyParams). +func CustomizeParams(manifestDir string, tenant *maasv1alpha1.Tenant, appNamespace string, clusterAudience string) error { + params := map[string]string{ + "gateway-namespace": tenant.Spec.GatewayRef.Namespace, + "gateway-name": tenant.Spec.GatewayRef.Name, + "app-namespace": appNamespace, + } + if tenant.Spec.APIKeys != nil && tenant.Spec.APIKeys.MaxExpirationDays != nil { + params["api-key-max-expiration-days"] = strconv.FormatInt(int64(*tenant.Spec.APIKeys.MaxExpirationDays), 10) + } + if clusterAudience != "" { + params["cluster-audience"] = clusterAudience + } + return ApplyParams(manifestDir, "params.env", ImageParamKeys, params) +} diff --git a/maas-controller/pkg/platform/tenantreconcile/prerequisites.go b/maas-controller/pkg/platform/tenantreconcile/prerequisites.go new file mode 100644 index 000000000..375d3c2d6 --- /dev/null +++ b/maas-controller/pkg/platform/tenantreconcile/prerequisites.go @@ -0,0 +1,207 @@ +package tenantreconcile + +import ( + "context" + "fmt" + "strings" + + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/yaml" +) + +// IsGVKAvailable uses the REST mapper (same spirit as ODH dependency checks). 
+func IsGVKAvailable(c client.Client, gvk schema.GroupVersionKind) (bool, error) { + _, err := c.RESTMapper().RESTMapping(gvk.GroupKind(), gvk.Version) + if err != nil { + if meta.IsNoMatchError(err) { + return false, nil + } + return false, err + } + return true, nil +} + +// GetClusterServiceAccountIssuer returns spec.serviceAccountIssuer from OpenShift Authentication/cluster, or "". +func GetClusterServiceAccountIssuer(ctx context.Context, c client.Reader) (string, error) { + u := &unstructured.Unstructured{} + u.SetGroupVersionKind(schema.GroupVersionKind{Group: "config.openshift.io", Version: "v1", Kind: "Authentication"}) + if err := c.Get(ctx, client.ObjectKey{Name: "cluster"}, u); err != nil { + if meta.IsNoMatchError(err) || apierrors.IsNotFound(err) { + return "", nil + } + return "", err + } + issuer, found, err := unstructured.NestedString(u.Object, "spec", "serviceAccountIssuer") + if err != nil { + return "", fmt.Errorf("reading spec.serviceAccountIssuer: %w", err) + } + if !found { + return "", nil + } + return issuer, nil +} + +func gvkListKind(gvk schema.GroupVersionKind) schema.GroupVersionKind { + out := gvk + out.Kind = gvk.Kind + "List" + return out +} + +// PrerequisiteReport separates blocking errors from warnings (ODH modelsasservice validatePrerequisites parity). +type PrerequisiteReport struct { + Blocking []string + Warnings []string +} + +// CollectPrerequisiteReport runs prerequisite checks and returns blocking vs warning messages. 
+func CollectPrerequisiteReport(ctx context.Context, c client.Client, appNamespace string) PrerequisiteReport { + log := log.FromContext(ctx) + var rep PrerequisiteReport + + if msg := checkAuthorinoTLS(ctx, c); msg != "" { + rep.Warnings = append(rep.Warnings, msg) + log.V(1).Info("MaaS prerequisite warning", "check", "authorino-tls", "message", msg) + } + if msg := checkDatabaseSecret(ctx, c, appNamespace); msg != "" { + rep.Blocking = append(rep.Blocking, msg) + log.Error(nil, "MaaS prerequisite error", "check", "database-secret", "message", msg) + } + if msg := checkUserWorkloadMonitoring(ctx, c); msg != "" { + rep.Warnings = append(rep.Warnings, msg) + log.V(1).Info("MaaS prerequisite warning", "check", "user-workload-monitoring", "message", msg) + } + + return rep +} + +// ValidatePrerequisites mirrors modelsasservice validatePrerequisites (blocking + warnings). +// Warnings do not return an error; callers may surface them on status separately. +func ValidatePrerequisites(ctx context.Context, c client.Client, appNamespace string) error { + rep := CollectPrerequisiteReport(ctx, c, appNamespace) + if len(rep.Blocking) > 0 { + all := append(append([]string{}, rep.Blocking...), rep.Warnings...) 
+ return fmt.Errorf("blocking prerequisites missing: %s", strings.Join(all, "; ")) + } + return nil +} + +func checkAuthorinoTLS(ctx context.Context, c client.Client) string { + has, err := IsGVKAvailable(c, GVKAuthorino) + if err != nil { + log.FromContext(ctx).Error(err, "failed to check Authorino API availability") + return "failed to check Authorino CRD availability due to a cluster API error" + } + if !has { + return "" + } + + authorinoList := &unstructured.UnstructuredList{} + authorinoList.SetGroupVersionKind(gvkListKind(GVKAuthorino)) + if err := c.List(ctx, authorinoList); err != nil { + log.FromContext(ctx).Error(err, "failed to list Authorino instances") + return "failed to list Authorino instances due to a cluster API error" + } + + if len(authorinoList.Items) == 0 { + return "no Authorino instances found. " + + "Authorino must be deployed and configured with TLS for MaaS authentication" + } + + for i := range authorinoList.Items { + item := &authorinoList.Items[i] + enabled, _, err := unstructured.NestedBool(item.Object, "spec", "listener", "tls", "enabled") + if err != nil { + log.FromContext(ctx).Error(err, "failed to read spec.listener.tls.enabled from Authorino", "name", item.GetName()) + continue + } + certName, _, err := unstructured.NestedString(item.Object, "spec", "listener", "tls", "certSecretRef", "name") + if err != nil { + log.FromContext(ctx).Error(err, "failed to read spec.listener.tls.certSecretRef.name from Authorino", "name", item.GetName()) + continue + } + if enabled && certName != "" { + return "" + } + } + + return "Authorino TLS is not configured: no Authorino instance has listener.tls.enabled=true with a certSecretRef. " + + "Patch Authorino with spec.listener.tls.enabled=true and spec.listener.tls.certSecretRef to enable TLS. 
" + + "See https://docs.kuadrant.io/1.0.x/authorino/docs/user-guides/mtls-authentication/" +} + +func checkDatabaseSecret(ctx context.Context, c client.Client, appNamespace string) string { + secret := &corev1.Secret{} + err := c.Get(ctx, types.NamespacedName{ + Namespace: appNamespace, + Name: MaaSDBSecretName, + }, secret) + + if err != nil { + if apierrors.IsNotFound(err) { + return fmt.Sprintf("database Secret '%s' not found in namespace '%s'. "+ + "Create the Secret with key '%s' containing the PostgreSQL connection URL. "+ + "MaaS API cannot start without a database connection", + MaaSDBSecretName, appNamespace, MaaSDBSecretKey) + } + log.FromContext(ctx).Error(err, "failed to check database Secret", "name", MaaSDBSecretName, "namespace", appNamespace) + return fmt.Sprintf("failed to check database Secret '%s' in namespace '%s' due to a cluster API error", + MaaSDBSecretName, appNamespace) + } + + value, ok := secret.Data[MaaSDBSecretKey] + if !ok || strings.TrimSpace(string(value)) == "" { + return fmt.Sprintf("database Secret '%s' in namespace '%s' is missing required key '%s'. "+ + "The Secret must contain a valid PostgreSQL connection URL", + MaaSDBSecretName, appNamespace, MaaSDBSecretKey) + } + + return "" +} + +func checkUserWorkloadMonitoring(ctx context.Context, c client.Client) string { + cm := &corev1.ConfigMap{} + err := c.Get(ctx, types.NamespacedName{ + Namespace: MonitoringNamespace, + Name: ClusterMonitoringConfigName, + }, cm) + + if err != nil { + if apierrors.IsNotFound(err) { + return "User Workload Monitoring not configured: ConfigMap 'cluster-monitoring-config' not found in 'openshift-monitoring'. " + + "Showback/FinOps usage views will not work without User Workload Monitoring enabled" + } + log.FromContext(ctx).Error(err, "unable to verify User Workload Monitoring status") + return "unable to verify User Workload Monitoring status due to a cluster API error. 
" + + "Ensure User Workload Monitoring is enabled for showback functionality" + } + + configData, ok := cm.Data["config.yaml"] + if !ok { + return "User Workload Monitoring is not enabled. " + + "Set enableUserWorkload: true in 'cluster-monitoring-config' ConfigMap in 'openshift-monitoring' namespace. " + + "Showback/FinOps usage views will not work without it" + } + + var cfg struct { + EnableUserWorkload bool `yaml:"enableUserWorkload"` + } + if err := yaml.Unmarshal([]byte(configData), &cfg); err != nil { + return "User Workload Monitoring config is invalid in 'cluster-monitoring-config'. " + + "Ensure config.yaml is valid YAML and sets enableUserWorkload: true" + } + + if !cfg.EnableUserWorkload { + return "User Workload Monitoring is not enabled. " + + "Set enableUserWorkload: true in 'cluster-monitoring-config' ConfigMap in 'openshift-monitoring' namespace. " + + "Showback/FinOps usage views will not work without it" + } + + return "" +} diff --git a/maas-controller/pkg/reconciler/externalmodel/reconciler.go b/maas-controller/pkg/reconciler/externalmodel/reconciler.go index d0eff5509..8d0c91a0f 100644 --- a/maas-controller/pkg/reconciler/externalmodel/reconciler.go +++ b/maas-controller/pkg/reconciler/externalmodel/reconciler.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "strconv" - "strings" "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" @@ -18,38 +17,30 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/event" - "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/log" gatewayapiv1 "sigs.k8s.io/gateway-api/apis/v1" maasv1alpha1 "github.com/opendatahub-io/models-as-a-service/maas-controller/api/maas/v1alpha1" + "github.com/opendatahub-io/models-as-a-service/maas-controller/pkg/platform/tenantreconcile" ) const ( - // AnnExtraHeaders allows setting additional headers on the 
HTTPRoute. - // Format: "key1=value1,key2=value2" - AnnExtraHeaders = "maas.opendatahub.io/extra-headers" + // annotationPort overrides the default port (443). + annotationPort = "maas.opendatahub.io/port" - // AnnPort overrides the default port (443). - AnnPort = "maas.opendatahub.io/port" + // annotationTLS controls TLS origination (default "true"). + annotationTLS = "maas.opendatahub.io/tls" - // AnnTLS controls TLS origination (default "true"). - AnnTLS = "maas.opendatahub.io/tls" - - // AnnPathPrefix overrides the default path prefix (/external//). - AnnPathPrefix = "maas.opendatahub.io/path-prefix" - - // Default gateway (matches MaaS controller defaults) - defaultGatewayName = "maas-default-gateway" - defaultGatewayNamespace = "openshift-ingress" + defaultGatewayName = tenantreconcile.DefaultGatewayName + defaultGatewayNamespace = tenantreconcile.DefaultGatewayNamespace ) -// Reconciler watches MaaSModelRef CRs with kind=ExternalModel and creates -// the Istio resources needed to route to the external provider. +// Reconciler watches ExternalModel CRs and creates the Istio resources +// needed to route to the external provider. // -// All resources are created in the model's namespace (same as the MaaSModelRef). -// OwnerReferences on each resource ensure Kubernetes garbage collection handles -// cleanup when the MaaSModelRef is deleted — no finalizer needed. +// All resources are created in the ExternalModel's namespace. +// OwnerReferences on each resource ensure Kubernetes garbage collection +// handles cleanup when the ExternalModel is deleted — no finalizer needed. type Reconciler struct { client.Client Scheme *runtime.Scheme @@ -72,20 +63,60 @@ func (r *Reconciler) gatewayNamespace() string { return defaultGatewayNamespace } +// commonLabels returns labels applied to all managed resources. 
+func commonLabels(modelName string) map[string]string { + return map[string]string{ + "app.kubernetes.io/managed-by": "maas-external-model-reconciler", + "maas.opendatahub.io/external-model": modelName, + } +} + +// getTLSInfo reads optional TLS overrides from ExternalModel annotations. +// Returns tls enabled (default true) and port (default 443). +func getTLSInfo(extModel *maasv1alpha1.ExternalModel) (tls bool, port int32, err error) { + tls = true + port = 443 + + annotations := extModel.GetAnnotations() + if annotations == nil { + return + } + + if portStr, ok := annotations[annotationPort]; ok { + p, parseErr := strconv.ParseInt(portStr, 10, 32) + if parseErr != nil { + return false, 0, fmt.Errorf("invalid port %q: %w", portStr, parseErr) + } + if p < 1 || p > 65535 { + return false, 0, fmt.Errorf("port %d out of range (1-65535)", p) + } + port = int32(p) + } + + if tlsStr, ok := annotations[annotationTLS]; ok { + parsed, parseErr := strconv.ParseBool(tlsStr) + if parseErr != nil { + return false, 0, fmt.Errorf("invalid tls value %q: %w", tlsStr, parseErr) + } + tls = parsed + } + + return +} + //+kubebuilder:rbac:groups=gateway.networking.k8s.io,resources=httproutes,verbs=get;list;watch;create;update //+kubebuilder:rbac:groups=maas.opendatahub.io,resources=externalmodels,verbs=get;list;watch -//+kubebuilder:rbac:groups=maas.opendatahub.io,resources=maasmodelrefs,verbs=get;list;watch -//+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;update +//+kubebuilder:rbac:groups=maas.opendatahub.io,resources=externalmodels/finalizers,verbs=update +//+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;update;delete //+kubebuilder:rbac:groups=networking.istio.io,resources=serviceentries,verbs=get;list;watch;create;update //+kubebuilder:rbac:groups=networking.istio.io,resources=destinationrules,verbs=get;list;watch;create;update;delete -// Reconcile handles create/update/delete of MaaSModelRef CRs with kind=ExternalModel. 
-// The ExternalModel kind filter is handled by the predicate in SetupWithManager. +// Reconcile handles create/update/delete of ExternalModel CRs. func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - log := r.Log.WithValues("maasmodelref", req.NamespacedName) + log.FromContext(ctx).Info("Reconciling ExternalModel", "namespace", req.Namespace, "name", req.Name) - model := &maasv1alpha1.MaaSModelRef{} - if err := r.Get(ctx, req.NamespacedName, model); err != nil { + extModel := &maasv1alpha1.ExternalModel{} + if err := r.Get(ctx, req.NamespacedName, extModel); err != nil { if apierrors.IsNotFound(err) { return ctrl.Result{}, nil } @@ -93,87 +124,73 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu } // Nothing to do on deletion — OwnerReferences handle cleanup - if !model.GetDeletionTimestamp().IsZero() { + if !extModel.GetDeletionTimestamp().IsZero() { return ctrl.Result{}, nil } - // Fetch the referenced ExternalModel CR to get provider configuration - extModel := &maasv1alpha1.ExternalModel{} - extModelKey := types.NamespacedName{ - Name: model.Spec.ModelRef.Name, - Namespace: model.Namespace, - } - if err := r.Get(ctx, extModelKey, extModel); err != nil { - if apierrors.IsNotFound(err) { - log.Info("ExternalModel CR not found, waiting", "name", model.Spec.ModelRef.Name) - return ctrl.Result{}, nil - } - return ctrl.Result{}, fmt.Errorf("failed to get ExternalModel %s: %w", model.Spec.ModelRef.Name, err) - } - - spec, err := specFromExternalModel(extModel, model) + tls, port, err := getTLSInfo(extModel) if err != nil { - log.Error(err, "Failed to parse ExternalModel spec") - return ctrl.Result{}, fmt.Errorf("invalid ExternalModel spec: %w", err) + return ctrl.Result{}, fmt.Errorf("invalid ExternalModel annotations: %w", err) } - log.Info("Reconciling ExternalModel", - "provider", spec.Provider, - "endpoint", spec.Endpoint, - "tls", spec.TLS, + logger := r.Log.WithValues("externalmodel", 
req.NamespacedName) + logger.Info("Reconciling ExternalModel", + "provider", extModel.Spec.Provider, + "endpoint", extModel.Spec.Endpoint, + "tls", tls, ) - ns := model.Namespace + ns := extModel.Namespace + name := extModel.Name gwName := r.gatewayName() gwNamespace := r.gatewayNamespace() - labels := commonLabels(model.GetName()) + labels := commonLabels(name) // 1. ExternalName Service (backend for HTTPRoute) - svc := BuildService(spec, model.Name, ns, labels) - if err := controllerutil.SetControllerReference(model, svc, r.Scheme); err != nil { + svc := buildService(extModel.Spec.Endpoint, name, ns, port, labels) + if err := controllerutil.SetControllerReference(extModel, svc, r.Scheme); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set owner on Service: %w", err) } - if err := r.applyService(ctx, log, svc); err != nil { + if err := r.applyService(ctx, logger, svc); err != nil { return ctrl.Result{}, fmt.Errorf("failed to create Service: %w", err) } // 2. ServiceEntry (registers external host in mesh) - se := BuildServiceEntry(spec, model.Name, ns, labels) - if err := r.setUnstructuredOwner(model, se); err != nil { + se := buildServiceEntry(extModel.Spec.Endpoint, name, ns, port, tls, labels) + if err := r.setUnstructuredOwner(extModel, se); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set owner on ServiceEntry: %w", err) } - if err := r.applyUnstructured(ctx, log, se); err != nil { + if err := r.applyUnstructured(ctx, logger, se); err != nil { return ctrl.Result{}, fmt.Errorf("failed to create ServiceEntry: %w", err) } // 3. 
DestinationRule (only if TLS; delete stale DR when TLS is disabled) - drName := ModelDestinationRuleName(model.Name) - if spec.TLS { - dr := BuildDestinationRule(spec, model.Name, ns, labels) - if err := r.setUnstructuredOwner(model, dr); err != nil { + if tls { + dr := buildDestinationRule(extModel.Spec.Endpoint, name, ns, labels) + if err := r.setUnstructuredOwner(extModel, dr); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set owner on DestinationRule: %w", err) } - if err := r.applyUnstructured(ctx, log, dr); err != nil { + if err := r.applyUnstructured(ctx, logger, dr); err != nil { return ctrl.Result{}, fmt.Errorf("failed to create DestinationRule: %w", err) } } else { - if err := r.deleteIfExists(ctx, log, "DestinationRule", drName, ns, schema.GroupVersionKind{ + if err := r.deleteIfExists(ctx, logger, "DestinationRule", name, ns, schema.GroupVersionKind{ Group: "networking.istio.io", Version: "v1", Kind: "DestinationRule", }); err != nil { - log.Error(err, "Failed to delete stale DestinationRule", "name", drName) + return ctrl.Result{}, fmt.Errorf("failed to delete stale DestinationRule: %w", err) } } // 4. 
HTTPRoute (routes requests to external provider via gateway) - hr := BuildHTTPRoute(spec, model.Name, ns, gwName, gwNamespace, labels) - if err := controllerutil.SetControllerReference(model, hr, r.Scheme); err != nil { + hr := buildHTTPRoute(extModel.Spec.Endpoint, name, extModel.Spec.TargetModel, ns, port, gwName, gwNamespace, labels) + if err := controllerutil.SetControllerReference(extModel, hr, r.Scheme); err != nil { return ctrl.Result{}, fmt.Errorf("failed to set owner on HTTPRoute: %w", err) } - if err := r.applyHTTPRoute(ctx, log, hr); err != nil { + if err := r.applyHTTPRoute(ctx, logger, hr); err != nil { return ctrl.Result{}, fmt.Errorf("failed to create HTTPRoute: %w", err) } - log.Info("ExternalModel resources reconciled successfully", + logger.Info("ExternalModel resources reconciled successfully", "service", svc.Name, "serviceEntry", se.GetName(), "httpRoute", hr.Name, @@ -184,7 +201,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu } // setUnstructuredOwner sets the controller OwnerReference on an unstructured resource. 
-func (r *Reconciler) setUnstructuredOwner(owner *maasv1alpha1.MaaSModelRef, obj *unstructured.Unstructured) error { +func (r *Reconciler) setUnstructuredOwner(owner *maasv1alpha1.ExternalModel, obj *unstructured.Unstructured) error { isController := true blockDeletion := true obj.SetOwnerReferences([]metav1.OwnerReference{ @@ -228,7 +245,10 @@ func (r *Reconciler) applyService(ctx context.Context, log logr.Logger, desired if err != nil { return err } - if !equality.Semantic.DeepEqual(existing.Spec, desired.Spec) { + specChanged := !equality.Semantic.DeepEqual(existing.Spec, desired.Spec) + ownerChanged := !equality.Semantic.DeepEqual(existing.OwnerReferences, desired.OwnerReferences) + labelsChanged := !equality.Semantic.DeepEqual(existing.Labels, desired.Labels) + if specChanged || ownerChanged || labelsChanged { existing.Spec = desired.Spec existing.Labels = desired.Labels existing.OwnerReferences = desired.OwnerReferences @@ -273,95 +293,10 @@ func (r *Reconciler) applyHTTPRoute(ctx context.Context, log logr.Logger, desire return r.Update(ctx, existing) } -// specFromExternalModel reads ExternalModelSpec from the ExternalModel CR and -// optional annotation overrides from the MaaSModelRef. -// Provider and endpoint come from the ExternalModel CR (PR #586). -// Port, TLS, path-prefix, and extra-headers are optional annotation overrides on the MaaSModelRef. 
-func specFromExternalModel(extModel *maasv1alpha1.ExternalModel, model *maasv1alpha1.MaaSModelRef) (ExternalModelSpec, error) { - ann := model.GetAnnotations() - if ann == nil { - ann = map[string]string{} - } - - spec := ExternalModelSpec{ - Provider: extModel.Spec.Provider, - Endpoint: extModel.Spec.Endpoint, - PathPrefix: ann[AnnPathPrefix], - TLS: true, - Port: 443, - // TLSInsecureSkipVerify: extModel.Spec.TLSInsecureSkipVerify, // requires issue #627 CRD change - } - - if spec.Provider == "" { - return spec, fmt.Errorf("provider is required on ExternalModel %s", extModel.Name) - } - if spec.Endpoint == "" { - return spec, fmt.Errorf("endpoint is required on ExternalModel %s", extModel.Name) - } - - if portStr, ok := ann[AnnPort]; ok { - p, err := strconv.ParseInt(portStr, 10, 32) - if err != nil { - return spec, fmt.Errorf("invalid port %q: %w", portStr, err) - } - if p < 1 || p > 65535 { - return spec, fmt.Errorf("port %d out of range (1-65535)", p) - } - spec.Port = int32(p) - } - - if tlsStr, ok := ann[AnnTLS]; ok { - parsed, err := strconv.ParseBool(tlsStr) - if err != nil { - return spec, fmt.Errorf("invalid tls value %q: %w", tlsStr, err) - } - spec.TLS = parsed - } - - if extraStr, ok := ann[AnnExtraHeaders]; ok && extraStr != "" { - spec.ExtraHeaders = map[string]string{} - for pair := range strings.SplitSeq(extraStr, ",") { - kv := strings.SplitN(pair, "=", 2) - if len(kv) == 2 { - spec.ExtraHeaders[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1]) - } - } - } - - return spec, nil -} - -// externalModelPredicate filters MaaSModelRef events to only ExternalModel kind. 
-func externalModelPredicate() predicate.Predicate { - isExternalModel := func(obj client.Object) bool { - model, ok := obj.(*maasv1alpha1.MaaSModelRef) - if !ok { - return false - } - return model.Spec.ModelRef.Kind == "ExternalModel" - } - return predicate.Funcs{ - CreateFunc: func(e event.CreateEvent) bool { - return isExternalModel(e.Object) - }, - UpdateFunc: func(e event.UpdateEvent) bool { - return isExternalModel(e.ObjectOld) || isExternalModel(e.ObjectNew) - }, - DeleteFunc: func(e event.DeleteEvent) bool { - return isExternalModel(e.Object) - }, - GenericFunc: func(e event.GenericEvent) bool { - return isExternalModel(e.Object) - }, - } -} - -// SetupWithManager registers the reconciler to watch MaaSModelRef CRs -// with kind=ExternalModel only (filtered by predicate). +// SetupWithManager registers the reconciler to watch ExternalModel CRs. func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). - For(&maasv1alpha1.MaaSModelRef{}). - WithEventFilter(externalModelPredicate()). + For(&maasv1alpha1.ExternalModel{}). Named("external-model-reconciler"). Complete(r) } diff --git a/maas-controller/pkg/reconciler/externalmodel/resources.go b/maas-controller/pkg/reconciler/externalmodel/resources.go index 4f0dd72f0..05ee393a1 100644 --- a/maas-controller/pkg/reconciler/externalmodel/resources.go +++ b/maas-controller/pkg/reconciler/externalmodel/resources.go @@ -1,8 +1,6 @@ package externalmodel import ( - "strings" - corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" @@ -10,39 +8,34 @@ import ( gatewayapiv1 "sigs.k8s.io/gateway-api/apis/v1" ) -// BuildService creates a Kubernetes ExternalName Service that maps an in-cluster -// DNS name to the external FQDN. This allows HTTPRoute backendRefs to reference -// external hosts via standard k8s Service names. 
-func BuildService(spec ExternalModelSpec, modelName, namespace string, labels map[string]string) *corev1.Service { - svcName := ModelBackendServiceName(modelName) +// buildService creates a Kubernetes ExternalName Service that maps an in-cluster +// DNS name to the external FQDN. Uses the ExternalModel name directly. +func buildService(endpoint, name, namespace string, port int32, labels map[string]string) *corev1.Service { return &corev1.Service{ ObjectMeta: metav1.ObjectMeta{ - Name: svcName, + Name: name, Namespace: namespace, Labels: labels, }, Spec: corev1.ServiceSpec{ Type: corev1.ServiceTypeExternalName, - ExternalName: spec.Endpoint, + ExternalName: endpoint, Ports: []corev1.ServicePort{ { - Port: spec.Port, - TargetPort: intstr.FromInt32(spec.Port), + Port: port, + TargetPort: intstr.FromInt32(port), }, }, }, } } -// BuildServiceEntry creates an Istio ServiceEntry that registers the external -// FQDN in the mesh service registry. Required when outboundTrafficPolicy is -// REGISTRY_ONLY. -func BuildServiceEntry(spec ExternalModelSpec, modelName, namespace string, labels map[string]string) *unstructured.Unstructured { - seName := ModelServiceEntryName(modelName) - +// buildServiceEntry creates an Istio ServiceEntry that registers the external +// FQDN in the mesh service registry. 
+func buildServiceEntry(endpoint, name, namespace string, port int32, tls bool, labels map[string]string) *unstructured.Unstructured { protocol := "HTTPS" portName := "https" - if !spec.TLS { + if !tls { protocol = "HTTP" portName = "http" } @@ -50,17 +43,17 @@ func BuildServiceEntry(spec ExternalModelSpec, modelName, namespace string, labe se := &unstructured.Unstructured{} se.SetAPIVersion("networking.istio.io/v1") se.SetKind("ServiceEntry") - se.SetName(seName) + se.SetName(name) se.SetNamespace(namespace) se.SetLabels(labels) se.Object["spec"] = map[string]any{ - "hosts": []any{spec.Endpoint}, + "hosts": []any{endpoint}, "location": "MESH_EXTERNAL", "resolution": "DNS", "ports": []any{ map[string]any{ - "number": int64(spec.Port), + "number": int64(port), "name": portName, "protocol": protocol, }, @@ -69,106 +62,69 @@ func BuildServiceEntry(spec ExternalModelSpec, modelName, namespace string, labe return se } -// BuildDestinationRule creates an Istio DestinationRule that configures TLS -// origination for the external host. Skipped when TLS is false. -func BuildDestinationRule(spec ExternalModelSpec, modelName, namespace string, labels map[string]string) *unstructured.Unstructured { - drName := ModelDestinationRuleName(modelName) - +// buildDestinationRule creates an Istio DestinationRule that configures TLS +// origination for the external host. 
+func buildDestinationRule(endpoint, name, namespace string, labels map[string]string) *unstructured.Unstructured { dr := &unstructured.Unstructured{} dr.SetAPIVersion("networking.istio.io/v1") dr.SetKind("DestinationRule") - dr.SetName(drName) + dr.SetName(name) dr.SetNamespace(namespace) dr.SetLabels(labels) - tlsConfig := map[string]any{ - "mode": "SIMPLE", - } - if spec.TLSInsecureSkipVerify { - tlsConfig["insecureSkipVerify"] = true - } - dr.Object["spec"] = map[string]any{ - "host": spec.Endpoint, + "host": endpoint, "trafficPolicy": map[string]any{ - "tls": tlsConfig, + "tls": map[string]any{ + "mode": "SIMPLE", + }, }, } return dr } -// BuildHTTPRoute creates the maas-model- HTTPRoute in the model's namespace. -// This route is used by the MaaS auth and subscription controllers to attach -// AuthPolicy and TokenRateLimitPolicy. -// -// It contains two match rules: -// 1. Path-based match (PathPrefix: /) — required for the Kuadrant Wasm plugin -// which runs before BBR in the Envoy filter chain. Without a path predicate, auth + -// rate limiting are bypassed. -// 2. Header-based match (X-Gateway-Model-Name: ) — required for BBR's -// ClearRouteCache flow. After BBR extracts the model name from the request body, -// it sets this header and Envoy re-matches to this route. -// -// Both rules route to the backend ExternalName Service in the same namespace and apply -// a URLRewrite filter to strip the path prefix before forwarding to the external provider. -func BuildHTTPRoute(spec ExternalModelSpec, modelName, namespace, gatewayName, gatewayNamespace string, labels map[string]string) *gatewayapiv1.HTTPRoute { - routeName := ModelRouteName(modelName) - backendSvcName := ModelBackendServiceName(modelName) - +// buildHTTPRoute creates the HTTPRoute in the model's namespace. +// Path prefix is // for namespace isolation. +// Only a Host header filter is set (required for TLS SNI). +// BBR ext-proc handles path rewriting and provider-specific headers. 
+func buildHTTPRoute(endpoint, name, targetModel, namespace string, port int32, gatewayName, gatewayNamespace string, labels map[string]string) *gatewayapiv1.HTTPRoute { gwNamespace := gatewayapiv1.Namespace(gatewayNamespace) pathType := gatewayapiv1.PathMatchPathPrefix - pathPrefix := "/" + modelName + pathPrefix := "/" + namespace + "/" + name headerType := gatewayapiv1.HeaderMatchExact - port := gatewayapiv1.PortNumber(spec.Port) + gwPort := gatewayapiv1.PortNumber(port) timeout := gatewayapiv1.Duration("300s") backendRefs := []gatewayapiv1.HTTPBackendRef{ { BackendRef: gatewayapiv1.BackendRef{ BackendObjectReference: gatewayapiv1.BackendObjectReference{ - Name: gatewayapiv1.ObjectName(backendSvcName), - Port: &port, + Name: gatewayapiv1.ObjectName(name), + Port: &gwPort, }, }, }, } - // Build header modifiers (Host + any extra headers) - headers := []gatewayapiv1.HTTPHeader{ - { - Name: "Host", - Value: spec.Endpoint, - }, - } - for k, v := range spec.ExtraHeaders { - headers = append(headers, gatewayapiv1.HTTPHeader{ - Name: gatewayapiv1.HTTPHeaderName(k), - Value: v, - }) - } - - // Filters shared by both rules: rewrite path prefix and set Host header + // Host header is required for TLS SNI — must be set before TLS handshake, + // which happens before BBR ext-proc runs. 
filters := []gatewayapiv1.HTTPRouteFilter{ - { - Type: gatewayapiv1.HTTPRouteFilterURLRewrite, - URLRewrite: &gatewayapiv1.HTTPURLRewriteFilter{ - Path: &gatewayapiv1.HTTPPathModifier{ - Type: gatewayapiv1.PrefixMatchHTTPPathModifier, - ReplacePrefixMatch: strPtr("/"), - }, - }, - }, { Type: gatewayapiv1.HTTPRouteFilterRequestHeaderModifier, RequestHeaderModifier: &gatewayapiv1.HTTPHeaderFilter{ - Set: headers, + Set: []gatewayapiv1.HTTPHeader{ + { + Name: "Host", + Value: endpoint, + }, + }, }, }, } return &gatewayapiv1.HTTPRoute{ ObjectMeta: metav1.ObjectMeta{ - Name: routeName, + Name: name, Namespace: namespace, Labels: labels, }, @@ -204,7 +160,7 @@ func BuildHTTPRoute(spec ExternalModelSpec, modelName, namespace, gatewayName, g { Name: "X-Gateway-Model-Name", Type: &headerType, - Value: modelName, + Value: targetModel, }, }, }, @@ -217,22 +173,3 @@ func BuildHTTPRoute(spec ExternalModelSpec, modelName, namespace, gatewayName, g }, } } - -func sanitize(s string) string { - // Convert to lowercase and replace non-alphanumeric characters with dashes - // for RFC 1123 DNS label compatibility. 
- var result []byte - for _, c := range []byte(strings.ToLower(s)) { - if (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') { - result = append(result, c) - } else { - result = append(result, '-') - } - } - // Trim leading/trailing dashes - return strings.Trim(string(result), "-") -} - -func strPtr(s string) *string { - return &s -} diff --git a/maas-controller/pkg/reconciler/externalmodel/resources_test.go b/maas-controller/pkg/reconciler/externalmodel/resources_test.go index 5c2e22754..8ed89bc24 100644 --- a/maas-controller/pkg/reconciler/externalmodel/resources_test.go +++ b/maas-controller/pkg/reconciler/externalmodel/resources_test.go @@ -5,240 +5,104 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + gatewayapiv1 "sigs.k8s.io/gateway-api/apis/v1" ) -func TestSanitize(t *testing.T) { - assert.Equal(t, "api-openai-com", sanitize("api.openai.com")) - assert.Equal(t, "vllm-internal", sanitize("vllm.internal")) - assert.Equal(t, "simple", sanitize("simple")) - assert.Equal(t, "api-openai-com", sanitize("API.OpenAI.com")) // uppercase - assert.Equal(t, "host-8000", sanitize("host:8000")) // colon - assert.Equal(t, "my-host", sanitize("my_host")) // underscore -} - -func TestModelNameHelpers(t *testing.T) { - // Normal names - assert.Equal(t, "maas-model-my-gpt4", ModelRouteName("my-gpt4")) - assert.Equal(t, "maas-model-my-gpt4-backend", ModelBackendServiceName("my-gpt4")) - assert.Equal(t, "maas-model-my-gpt4-se", ModelServiceEntryName("my-gpt4")) - assert.Equal(t, "maas-model-my-gpt4-dr", ModelDestinationRuleName("my-gpt4")) - - // Names with dots (e.g., model names like "gpt-4o.v2") - assert.Equal(t, "maas-model-gpt-4o-v2", ModelRouteName("gpt-4o.v2")) - assert.Equal(t, "maas-model-gpt-4o-v2-backend", ModelBackendServiceName("gpt-4o.v2")) - - // Long names get truncated to 63 chars - longName := "this-is-a-very-long-model-name-that-exceeds-sixty-three-characters-limit" - assert.LessOrEqual(t, len(ModelRouteName(longName)), 63) 
- assert.LessOrEqual(t, len(ModelBackendServiceName(longName)), 63) - assert.LessOrEqual(t, len(ModelServiceEntryName(longName)), 63) - assert.LessOrEqual(t, len(ModelDestinationRuleName(longName)), 63) -} - func TestBuildService(t *testing.T) { - spec := ExternalModelSpec{ - Provider: "openai", - Endpoint: "api.openai.com", - Port: 443, - TLS: true, - } - labels := commonLabels("my-gpt4") - - svc := BuildService(spec, "my-gpt4", "llm", labels) + svc := buildService("api.openai.com", "gpt-4o", "llm", 443, commonLabels("gpt-4o")) - assert.Equal(t, ModelBackendServiceName("my-gpt4"), svc.Name) + assert.Equal(t, "gpt-4o", svc.Name) assert.Equal(t, "llm", svc.Namespace) assert.Equal(t, "api.openai.com", svc.Spec.ExternalName) assert.Equal(t, int32(443), svc.Spec.Ports[0].Port) - assert.Contains(t, svc.Labels, "maas.opendatahub.io/external-model") } func TestBuildServiceEntry(t *testing.T) { - spec := ExternalModelSpec{ - Provider: "openai", - Endpoint: "api.openai.com", - Port: 443, - TLS: true, - } - labels := commonLabels("my-gpt4") - - se := BuildServiceEntry(spec, "my-gpt4", "llm", labels) + se := buildServiceEntry("api.openai.com", "gpt-4o", "llm", 443, true, commonLabels("gpt-4o")) assert.Equal(t, "ServiceEntry", se.GetKind()) - assert.Equal(t, "networking.istio.io/v1", se.GetAPIVersion()) - assert.Equal(t, ModelServiceEntryName("my-gpt4"), se.GetName()) + assert.Equal(t, "gpt-4o", se.GetName()) assert.Equal(t, "llm", se.GetNamespace()) seSpec, ok := se.Object["spec"].(map[string]any) - require.True(t, ok, "spec must be map[string]any") + require.True(t, ok) hosts, ok := seSpec["hosts"].([]any) - require.True(t, ok, "hosts must be []any") + require.True(t, ok) assert.Equal(t, "api.openai.com", hosts[0]) ports, ok := seSpec["ports"].([]any) - require.True(t, ok, "ports must be []any") + require.True(t, ok) port, ok := ports[0].(map[string]any) - require.True(t, ok, "port must be map[string]any") + require.True(t, ok) assert.Equal(t, "https", port["name"]) 
assert.Equal(t, "HTTPS", port["protocol"]) } func TestBuildServiceEntryNoTLS(t *testing.T) { - spec := ExternalModelSpec{ - Provider: "vllm", - Endpoint: "vllm.internal", - Port: 8000, - TLS: false, - } - labels := commonLabels("test-model") + se := buildServiceEntry("vllm.internal", "my-vllm", "llm", 8000, false, commonLabels("my-vllm")) - se := BuildServiceEntry(spec, "test-model", "llm", labels) seSpec, ok := se.Object["spec"].(map[string]any) - require.True(t, ok, "spec must be map[string]any") + require.True(t, ok) ports, ok := seSpec["ports"].([]any) - require.True(t, ok, "ports must be []any") + require.True(t, ok) port, ok := ports[0].(map[string]any) - require.True(t, ok, "port must be map[string]any") + require.True(t, ok) assert.Equal(t, "HTTP", port["protocol"]) assert.Equal(t, "http", port["name"]) } func TestBuildDestinationRule(t *testing.T) { - spec := ExternalModelSpec{ - Provider: "openai", - Endpoint: "api.openai.com", - Port: 443, - TLS: true, - } - labels := commonLabels("my-gpt4") - - dr := BuildDestinationRule(spec, "my-gpt4", "llm", labels) + dr := buildDestinationRule("api.openai.com", "gpt-4o", "llm", commonLabels("gpt-4o")) assert.Equal(t, "DestinationRule", dr.GetKind()) - assert.Equal(t, "networking.istio.io/v1", dr.GetAPIVersion()) - assert.Equal(t, ModelDestinationRuleName("my-gpt4"), dr.GetName()) + assert.Equal(t, "gpt-4o", dr.GetName()) assert.Equal(t, "llm", dr.GetNamespace()) drSpec, ok := dr.Object["spec"].(map[string]any) - require.True(t, ok, "spec must be map[string]any") + require.True(t, ok) assert.Equal(t, "api.openai.com", drSpec["host"]) - - // Default: no insecureSkipVerify key - tp, ok := drSpec["trafficPolicy"].(map[string]any) - require.True(t, ok, "trafficPolicy must be map[string]any") - tlsCfg, ok := tp["tls"].(map[string]any) - require.True(t, ok, "tls must be map[string]any") - assert.Equal(t, "SIMPLE", tlsCfg["mode"]) - _, hasInsecure := tlsCfg["insecureSkipVerify"] - assert.False(t, hasInsecure, 
"insecureSkipVerify should not be set by default") -} - -func TestBuildDestinationRuleInsecureSkipVerify(t *testing.T) { - spec := ExternalModelSpec{ - Provider: "openai", - Endpoint: "3.150.113.9", - Port: 443, - TLS: true, - TLSInsecureSkipVerify: true, - } - labels := commonLabels("simulator-model") - - dr := BuildDestinationRule(spec, "simulator-model", "llm", labels) - - drSpec, ok := dr.Object["spec"].(map[string]any) - require.True(t, ok, "spec must be map[string]any") - assert.Equal(t, "3.150.113.9", drSpec["host"]) - - tp, ok := drSpec["trafficPolicy"].(map[string]any) - require.True(t, ok, "trafficPolicy must be map[string]any") - tlsCfg, ok := tp["tls"].(map[string]any) - require.True(t, ok, "tls must be map[string]any") - assert.Equal(t, "SIMPLE", tlsCfg["mode"]) - assert.Equal(t, true, tlsCfg["insecureSkipVerify"], "insecureSkipVerify must be true when opted in") } func TestBuildHTTPRoute(t *testing.T) { - spec := ExternalModelSpec{ - Provider: "openai", - Endpoint: "api.openai.com", - Port: 443, - TLS: true, - ExtraHeaders: map[string]string{}, - } - labels := commonLabels("my-gpt4") + hr := buildHTTPRoute("api.openai.com", "gpt-4o", "gpt-4o", "llm", 443, "maas-default-gateway", "openshift-ingress", commonLabels("gpt-4o")) - hr := BuildHTTPRoute(spec, "my-gpt4", "llm", "maas-default-gateway", "openshift-ingress", labels) - - assert.Equal(t, ModelRouteName("my-gpt4"), hr.Name) + assert.Equal(t, "gpt-4o", hr.Name) assert.Equal(t, "llm", hr.Namespace) assert.Len(t, hr.Spec.ParentRefs, 1) assert.Equal(t, "maas-default-gateway", string(hr.Spec.ParentRefs[0].Name)) // Must have 2 rules: path-based and header-based - assert.Len(t, hr.Spec.Rules, 2, "must have path-based and header-based rules") + assert.Len(t, hr.Spec.Rules, 2) - // Rule 1: path-based match + // Rule 1: path-based match with namespace prefix rule1 := hr.Spec.Rules[0] - assert.Len(t, rule1.Matches, 1) - assert.NotNil(t, rule1.Matches[0].Path) - assert.Equal(t, "/my-gpt4", 
*rule1.Matches[0].Path.Value) - assert.Equal(t, ModelBackendServiceName("my-gpt4"), string(rule1.BackendRefs[0].Name)) + assert.Equal(t, "/llm/gpt-4o", *rule1.Matches[0].Path.Value) + assert.Equal(t, "gpt-4o", string(rule1.BackendRefs[0].Name)) - // Rule 2: header-based match + // Rule 2: header-based match uses targetModel rule2 := hr.Spec.Rules[1] - assert.Len(t, rule2.Matches, 1) - assert.Len(t, rule2.Matches[0].Headers, 1) assert.Equal(t, "X-Gateway-Model-Name", string(rule2.Matches[0].Headers[0].Name)) - assert.Equal(t, "my-gpt4", rule2.Matches[0].Headers[0].Value) - assert.Equal(t, ModelBackendServiceName("my-gpt4"), string(rule2.BackendRefs[0].Name)) + assert.Equal(t, "gpt-4o", rule2.Matches[0].Headers[0].Value) - // Both rules should have URLRewrite filter + // Only Host header filter (required for TLS SNI), no URLRewrite for i, rule := range hr.Spec.Rules { - foundRewrite := false - for _, f := range rule.Filters { - if f.URLRewrite != nil { - foundRewrite = true - assert.Equal(t, "/", *f.URLRewrite.Path.ReplacePrefixMatch, - "rule %d: URLRewrite should strip prefix to /", i) - } - } - assert.True(t, foundRewrite, "rule %d: must have URLRewrite filter", i) + assert.Len(t, rule.Filters, 1, "rule %d: must have exactly 1 filter (Host header)", i) + assert.Equal(t, gatewayapiv1.HTTPRouteFilterRequestHeaderModifier, rule.Filters[0].Type) + assert.Equal(t, "Host", string(rule.Filters[0].RequestHeaderModifier.Set[0].Name)) + assert.Equal(t, "api.openai.com", rule.Filters[0].RequestHeaderModifier.Set[0].Value) } } -func TestBuildHTTPRouteWithExtraHeaders(t *testing.T) { - spec := ExternalModelSpec{ - Provider: "anthropic", - Endpoint: "api.anthropic.com", - Port: 443, - TLS: true, - ExtraHeaders: map[string]string{ - "anthropic-version": "2023-06-01", - }, - } - labels := commonLabels("my-claude") - - hr := BuildHTTPRoute(spec, "my-claude", "llm", "maas-default-gateway", "openshift-ingress", labels) - - // Check both rules have the extra header - for _, rule := 
range hr.Spec.Rules { - for _, f := range rule.Filters { - if f.RequestHeaderModifier != nil { - foundHost := false - foundExtra := false - for _, h := range f.RequestHeaderModifier.Set { - if string(h.Name) == "Host" { - foundHost = true - assert.Equal(t, "api.anthropic.com", h.Value) - } - if string(h.Name) == "anthropic-version" { - foundExtra = true - assert.Equal(t, "2023-06-01", h.Value) - } - } - assert.True(t, foundHost, "must set Host header") - assert.True(t, foundExtra, "must set anthropic-version header") - } - } - } +func TestBuildHTTPRoute_TargetModelDiffersFromName(t *testing.T) { + hr := buildHTTPRoute("bedrock-mantle.us-east-2.api.aws", "my-bedrock", "openai.gpt-oss-20b", "llm", 443, "maas-default-gateway", "openshift-ingress", commonLabels("my-bedrock")) + + // Name and path use ExternalModel name + assert.Equal(t, "my-bedrock", hr.Name) + assert.Equal(t, "/llm/my-bedrock", *hr.Spec.Rules[0].Matches[0].Path.Value) + + // Header match uses targetModel (what the user sends in body.model) + assert.Equal(t, "openai.gpt-oss-20b", hr.Spec.Rules[1].Matches[0].Headers[0].Value) + + // BackendRef uses ExternalModel name (Service name) + assert.Equal(t, "my-bedrock", string(hr.Spec.Rules[0].BackendRefs[0].Name)) } diff --git a/maas-controller/pkg/reconciler/externalmodel/types.go b/maas-controller/pkg/reconciler/externalmodel/types.go deleted file mode 100644 index 47be9b8c3..000000000 --- a/maas-controller/pkg/reconciler/externalmodel/types.go +++ /dev/null @@ -1,79 +0,0 @@ -// Package externalmodel implements a reconciler that watches MaaSModelRef CRs -// with kind=ExternalModel and creates the Istio resources required to route -// traffic to an external AI model provider: -// -// 1. ExternalName Service - DNS bridge for HTTPRoute backendRef -// 2. ServiceEntry - Registers external host in Istio mesh -// 3. DestinationRule - TLS origination (HTTP -> HTTPS) -// 4. 
HTTPRoute - Routes requests and sets Host header -// -// All resources are created in the model's namespace (same as the MaaSModelRef). -// OwnerReferences on each resource ensure Kubernetes garbage collection handles -// cleanup when the MaaSModelRef is deleted. -package externalmodel - -import ( - "strings" -) - -// ExternalModelSpec holds the configuration for routing to an external model. -// Provider and endpoint are read from the referenced ExternalModel CR (PR #586). -// Port, TLS, path-prefix, and extra-headers are optional annotation overrides on the MaaSModelRef. -type ExternalModelSpec struct { - // Provider identifies the API format (e.g. "openai", "anthropic", "vllm") - Provider string - // Endpoint is the external FQDN (e.g. "api.openai.com") - Endpoint string - // ExtraHeaders are additional headers to set (e.g. "anthropic-version=2023-06-01") - ExtraHeaders map[string]string - // Port is the external service port (default 443) - Port int32 - // TLS indicates whether TLS origination is needed (default true) - TLS bool - // PathPrefix is the path prefix to match (default "/external//") - PathPrefix string - // TLSInsecureSkipVerify disables certificate verification (testing only) - TLSInsecureSkipVerify bool -} - -// truncateName ensures base + suffix fits within 63 characters. -func truncateName(base, suffix string) string { - const maxLen = 63 - limit := max(maxLen-len(suffix), 1) - if len(base) == 0 { - base = "model" - } - if len(base) > limit { - base = base[:limit] - base = strings.TrimRight(base, "-") - } - return base + suffix -} - -// ModelRouteName returns the sanitized, length-safe name for the maas-model-* HTTPRoute. -func ModelRouteName(modelName string) string { - return truncateName("maas-model-"+sanitize(modelName), "") -} - -// ModelBackendServiceName returns the sanitized, length-safe name for the backend Service. 
-func ModelBackendServiceName(modelName string) string { - return truncateName("maas-model-"+sanitize(modelName), "-backend") -} - -// ModelServiceEntryName returns the sanitized, length-safe name for the ServiceEntry. -func ModelServiceEntryName(modelName string) string { - return truncateName("maas-model-"+sanitize(modelName), "-se") -} - -// ModelDestinationRuleName returns the sanitized, length-safe name for the DestinationRule. -func ModelDestinationRuleName(modelName string) string { - return truncateName("maas-model-"+sanitize(modelName), "-dr") -} - -// commonLabels returns labels applied to all managed resources. -func commonLabels(modelName string) map[string]string { - return map[string]string{ - "app.kubernetes.io/managed-by": "maas-external-model-reconciler", - "maas.opendatahub.io/external-model": modelName, - } -} diff --git a/scripts/README.md b/scripts/README.md index b55bb8a25..6fdc9016c 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -29,6 +29,8 @@ Automated deployment script for OpenShift clusters supporting both operator-base - Installs primary operator (RHOAI or ODH) or deploys via kustomize - Applies custom resources (DSC, DSCI) - Configures TLS backend (enabled by default, use `--disable-tls-backend` to skip) +- Deploys `maas-controller`, which then deploys `maas-api` via the **Tenant reconciler** (SSA) +- Passes `MAAS_API_IMAGE` to the controller as `RELATED_IMAGE_ODH_MAAS_API_IMAGE` so the Tenant reconciler uses the correct image - Supports custom operator catalogs and MaaS API images for PR testing **Options:** @@ -53,7 +55,7 @@ Automated deployment script for OpenShift clusters supporting both operator-base - `kustomize` installed **Environment Variables:** -- `MAAS_API_IMAGE` - Custom MaaS API container image (works in both operator and kustomize modes) +- `MAAS_API_IMAGE` - Custom MaaS API container image (passed to the Tenant reconciler via `RELATED_IMAGE_ODH_MAAS_API_IMAGE` on the controller Deployment) - 
`MAAS_CONTROLLER_IMAGE` - Custom MaaS controller container image - `OPERATOR_CATALOG` - Custom operator catalog for PR testing - `OPERATOR_IMAGE` - Custom operator image for PR testing @@ -221,7 +223,7 @@ Installs individual dependencies (Kuadrant, ODH, etc.). ### Initial Deployment (Operator Mode - Recommended) ```bash -# 1. Deploy the platform using ODH operator (default) +# 1. Deploy the platform (installs prerequisites + maas-controller; Tenant reconciler deploys maas-api) ./scripts/deploy.sh # 2. Validate the deployment @@ -236,7 +238,7 @@ kustomize build docs/samples/models/simulator | kubectl apply -f - ### Initial Deployment (Kustomize Mode) ```bash -# 1. Deploy the platform using kustomize +# 1. Deploy the platform via kustomize (maas-controller Tenant reconciler deploys maas-api) ./scripts/deploy.sh --deployment-mode kustomize # 2. Validate the deployment diff --git a/scripts/deploy.sh b/scripts/deploy.sh index c8fd4b4d4..0106f5e8a 100755 --- a/scripts/deploy.sh +++ b/scripts/deploy.sh @@ -27,7 +27,7 @@ # --channel Operator channel override # # ENVIRONMENT VARIABLES: -# MAAS_API_IMAGE Custom MaaS API container image +# MAAS_API_IMAGE Custom MaaS API image (passed to Tenant reconciler via RELATED_IMAGE) # MAAS_CONTROLLER_IMAGE Custom MaaS controller container image # OPERATOR_TYPE Operator type (rhoai/odh) # LOG_LEVEL Logging verbosity (DEBUG, INFO, WARN, ERROR) @@ -506,10 +506,11 @@ main() { ;; esac - # Install subscription controller (always deployed) - # In kustomize mode, maas-controller is included in the overlay; in operator mode, install via script. + # Install maas-controller (all deployment modes). + # The Tenant reconciler in maas-controller is the sole deployer of maas-api. + # In operator mode, skip if the ODH operator already created the deployment (3.4+). log_info "" - log_info "MaaS Subscription Controller..." + log_info "MaaS Controller..." 
local script_dir script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" local project_root="$script_dir/.." @@ -517,62 +518,114 @@ main() { local config_dir="$project_root/deployment/base/maas-controller/default" if [[ ! -d "$controller_dir" ]]; then - log_error "maas-controller directory not found at $controller_dir — subscription controller required" + log_error "maas-controller directory not found at $controller_dir — controller is required" return 1 + fi + + if ! kubectl get namespace "$NAMESPACE" &>/dev/null; then + log_error "Namespace $NAMESPACE does not exist." + return 1 + fi + + if kubectl get deployment maas-controller -n "$NAMESPACE" &>/dev/null; then + log_info " maas-controller already exists in $NAMESPACE (e.g. operator-managed), skipping manifest apply" else - if [[ "$DEPLOYMENT_MODE" != "kustomize" ]]; then - log_info " Installing controller (CRDs, RBAC, deployment, default-deny policy)..." - if ! kubectl get namespace "$NAMESPACE" &>/dev/null; then - log_error "Namespace $NAMESPACE does not exist. Create it first (e.g. via ODH operator)." + log_info " Installing controller (CRDs, RBAC, deployment)..." 
+ if [[ "$NAMESPACE" != "opendatahub" ]]; then + (cd "$project_root" && kustomize build deployment/base/maas-controller/default | \ + sed "s/namespace: opendatahub/namespace: $NAMESPACE/g") | kubectl apply -f - || { + log_error "Failed to apply maas-controller manifests" return 1 - fi - set_maas_controller_image - if [[ "$NAMESPACE" != "opendatahub" ]]; then - (cd "$project_root" && kustomize build deployment/base/maas-controller/default | \ - sed "s/namespace: opendatahub/namespace: $NAMESPACE/g") | kubectl apply -f - || { - cleanup_maas_controller_image - log_error "Failed to apply maas-controller manifests" - return 1 - } - else - kubectl apply -k "$config_dir" || { - cleanup_maas_controller_image - log_error "Failed to apply maas-controller manifests" - return 1 - } - fi - cleanup_maas_controller_image + } else - log_info " Controller deployed via kustomize overlay (deployment/base/maas-controller/default)" + kubectl apply -k "$config_dir" || { + log_error "Failed to apply maas-controller manifests" + return 1 + } fi + fi - log_info " Waiting for maas-controller to be ready..." - if ! kubectl rollout status deployment/maas-controller -n "$NAMESPACE" --timeout="${ROLLOUT_TIMEOUT}s"; then - log_error "maas-controller deployment not ready (timeout: ${ROLLOUT_TIMEOUT}s)" + if [[ -n "${MAAS_CONTROLLER_IMAGE:-}" ]]; then + log_info " Custom MaaS controller image: $MAAS_CONTROLLER_IMAGE" + kubectl set image deployment/maas-controller manager="${MAAS_CONTROLLER_IMAGE}" -n "$NAMESPACE" || { + log_error "Failed to set maas-controller container image" + return 1 + } + kubectl set env deployment/maas-controller -n "$NAMESPACE" \ + "RELATED_IMAGE_ODH_MAAS_CONTROLLER_IMAGE=${MAAS_CONTROLLER_IMAGE}" || { + log_error "Failed to set RELATED_IMAGE_ODH_MAAS_CONTROLLER_IMAGE on maas-controller" return 1 + } + fi + + log_info " Waiting for maas-controller to be ready..." + if ! 
kubectl rollout status deployment/maas-controller -n "$NAMESPACE" --timeout="${ROLLOUT_TIMEOUT}s"; then + log_error "maas-controller deployment not ready (timeout: ${ROLLOUT_TIMEOUT}s)" + return 1 + fi + log_info " Controller ready." + + # Pass custom maas-api image to the Tenant reconciler via RELATED_IMAGE env var. + # The reconciler reads this when building params.env for kustomize (ApplyParams). + local env_patches=() + if [[ -n "${MAAS_API_IMAGE:-}" ]]; then + log_info " Configuring custom MaaS API image: $MAAS_API_IMAGE" + env_patches+=("RELATED_IMAGE_ODH_MAAS_API_IMAGE=$MAAS_API_IMAGE") + fi + # Patch controller with correct audience for HyperShift/ROSA clusters. + local cluster_aud + cluster_aud=$(get_cluster_audience 2>/dev/null || echo "") + if [[ -n "$cluster_aud" && "$cluster_aud" != "https://kubernetes.default.svc" ]]; then + log_info " Non-standard cluster audience detected: $cluster_aud" + env_patches+=("CLUSTER_AUDIENCE=$cluster_aud") + fi + + if [[ ${#env_patches[@]} -gt 0 ]]; then + log_info " Patching maas-controller env vars: ${env_patches[*]}" + kubectl set env deployment/maas-controller -n "$NAMESPACE" "${env_patches[@]}" + if ! kubectl rollout status deployment/maas-controller -n "$NAMESPACE" --timeout="${ROLLOUT_TIMEOUT}s"; then + log_warn "maas-controller rollout after env patch did not complete in time (timeout: ${ROLLOUT_TIMEOUT}s)" fi + fi - log_info " Subscription controller ready." - log_info " Create MaaSModelRef, MaaSAuthPolicy, and MaaSSubscription to enable per-model auth and rate limiting." - - # Patch controller with correct audience for HyperShift/ROSA clusters. - # The controller creates AuthPolicies with kubernetesTokenReview.audiences; - # on non-standard clusters the default audience (https://kubernetes.default.svc) - # causes Authorino token validation to fail with 401. 
- local cluster_aud - cluster_aud=$(get_cluster_audience 2>/dev/null || echo "") - if [[ -n "$cluster_aud" && "$cluster_aud" != "https://kubernetes.default.svc" ]]; then - log_info " Non-standard cluster audience detected: $cluster_aud" - log_info " Patching maas-controller with correct CLUSTER_AUDIENCE..." - kubectl set env deployment/maas-controller -n "$NAMESPACE" CLUSTER_AUDIENCE="$cluster_aud" - if ! kubectl rollout status deployment/maas-controller -n "$NAMESPACE" --timeout="${ROLLOUT_TIMEOUT}s"; then - log_warn "maas-controller rollout after audience patch did not complete in time (timeout: ${ROLLOUT_TIMEOUT}s)" + # Wait for the Tenant reconciler to deploy maas-api. + # The controller creates a default-tenant CR on startup, and the Tenant + # reconciler renders and SSA-applies maas-api manifests + gateway policies. + log_info "" + log_info "Waiting for Tenant reconciler to deploy maas-api..." + local maas_api_timeout="${CUSTOM_RESOURCE_TIMEOUT:-600}" + local elapsed=0 + while [[ $elapsed -lt $maas_api_timeout ]]; do + if kubectl get deployment maas-api -n "$NAMESPACE" &>/dev/null; then + log_info " maas-api deployment found, waiting for rollout..." + if kubectl rollout status deployment/maas-api -n "$NAMESPACE" --timeout="$((maas_api_timeout - elapsed))s" 2>/dev/null; then + log_info " maas-api is ready" + break fi fi + sleep 10 + elapsed=$((elapsed + 10)) + if (( elapsed % 60 == 0 )); then + log_info " Still waiting for maas-api deployment... (${elapsed}s / ${maas_api_timeout}s)" + fi + done + + if ! kubectl get deployment maas-api -n "$NAMESPACE" &>/dev/null; then + log_error "maas-api deployment not created by Tenant reconciler after ${maas_api_timeout}s" + log_error "Check maas-controller logs: kubectl logs -l app.kubernetes.io/name=maas-controller -n $NAMESPACE" + return 1 fi + log_info "" + log_info "MaaS API and MaaS Controller deployment completed successfully!" 
+ local deployed_api_image deployed_ctrl_image + deployed_api_image=$(kubectl get deployment/maas-api -n "$NAMESPACE" -o jsonpath='{.spec.template.spec.containers[0].image}' 2>/dev/null || echo "unknown") + deployed_ctrl_image=$(kubectl get deployment/maas-controller -n "$NAMESPACE" -o jsonpath='{.spec.template.spec.containers[0].image}' 2>/dev/null || echo "unknown") + log_info " maas-api image: $deployed_api_image" + log_info " maas-controller image: $deployed_ctrl_image" + log_info "===================================================" - log_info " Deployment completed successfully!" + log_info " Models-as-a-Service Deployment completed successfully!" log_info "===================================================" } @@ -609,17 +662,15 @@ deploy_via_operator() { deploy_keycloak fi - # Inject custom MaaS API image if specified - inject_maas_api_image_operator_mode "$NAMESPACE" - # Configure TLS backend (if enabled) if [[ "$ENABLE_TLS_BACKEND" == "true" ]]; then configure_tls_backend fi - - # Configure audience for non-standard clusters (Hypershift/ROSA) - configure_cluster_audience + # Custom maas-api image injection and cluster audience configuration + # are now handled by the Tenant reconciler in maas-controller (common + # block in main). The controller receives RELATED_IMAGE_ODH_MAAS_API_IMAGE + # and CLUSTER_AUDIENCE env vars and applies them during kustomize render. log_info "Operator deployment completed" } @@ -631,33 +682,12 @@ deploy_via_operator() { deploy_via_kustomize() { log_info "Starting kustomize-based deployment..." 
- local project_root - project_root="$(find_project_root)" || { - log_error "Could not find project root" - exit 1 - } - # Install rate limiter component (RHCL or Kuadrant) install_policy_engine - local overlay="$project_root/deployment/overlays/http-backend" - if [[ "$ENABLE_TLS_BACKEND" == "true" ]]; then - log_info "Using TLS backend overlay" - overlay="$project_root/deployment/overlays/tls-backend" - else - log_info "Using HTTP backend overlay" - fi - - # Set namespace and image from script (overlay kustomization is restored on exit) - trap 'cleanup_maas_api_image; cleanup_maas_controller_image; cleanup_overlay_namespace' EXIT INT TERM - set_maas_api_image - set_maas_controller_image - set_overlay_namespace "$overlay" "$NAMESPACE" - # Create namespace (idempotent - treat AlreadyExists as success to avoid TOCTOU races) log_info "Ensuring namespace exists: $NAMESPACE" if ! kubectl create namespace "$NAMESPACE" 2>/dev/null; then - # Create failed - check if it's because namespace already exists if kubectl get namespace "$NAMESPACE" &>/dev/null; then log_debug "Namespace $NAMESPACE already exists" else @@ -668,11 +698,6 @@ deploy_via_kustomize() { log_info "Created namespace: $NAMESPACE" fi - # Note: The subscription namespace (default: models-as-a-service) is automatically - # created by maas-controller when it starts (see maas-controller/cmd/manager/main.go). - # We only set the variable here for use in manifest patching below. - local subscription_namespace="${MAAS_SUBSCRIPTION_NAMESPACE:-models-as-a-service}" - # Deploy PostgreSQL for API key storage (requires namespace to exist) deploy_postgresql @@ -681,35 +706,17 @@ deploy_via_kustomize() { deploy_keycloak fi - log_info "Applying kustomize manifests..." - # Patch MAAS_SUBSCRIPTION_NAMESPACE env var with the configured subscription namespace - # tls/http overlays reference ../odh/params.env outside the overlay root. 
- kubectl apply --server-side=true --force-conflicts="$KUSTOMIZE_FORCE_CONFLICTS" -f <( - kustomize build --load-restrictor LoadRestrictionsNone "$overlay" | \ - perl -pe 'BEGIN{undef $/;} s/(name: MAAS_SUBSCRIPTION_NAMESPACE\n\s+value: ")[^"]*"/${1}'"$subscription_namespace"'"/smg' - ) - - # Apply gateway policies separately so they stay in openshift-ingress (overlay - # namespace would otherwise overwrite them to $NAMESPACE) - local policies_dir="$project_root/deployment/base/maas-controller/policies" - if [[ -d "$policies_dir" ]]; then - log_info "Applying gateway policies (openshift-ingress)..." - kubectl apply --server-side=true --force-conflicts="$KUSTOMIZE_FORCE_CONFLICTS" -f <(kustomize build "$policies_dir") - fi - - # Configure TLS backend (if enabled) + # Configure TLS backend (Authorino only — maas-api is deployed later by the Tenant reconciler) if [[ "$ENABLE_TLS_BACKEND" == "true" ]]; then configure_tls_backend fi - # Patch the live AuthPolicy after kustomize apply so OIDC and API key - # behavior matches operator mode when configured. - configure_maas_api_authpolicy + # maas-api, gateway policies, AuthPolicy configuration, and cluster audience + # are now handled by the Tenant reconciler in maas-controller. After the + # controller starts it creates the default-tenant CR, which triggers the + # reconciler to apply maas-api manifests and gateway policies via SSA. 
- # Configure audience for non-standard clusters (HyperShift/ROSA) - configure_cluster_audience - - log_info "Kustomize deployment completed" + log_info "Kustomize prerequisite deployment completed" } #────────────────────────────────────────────────────────────── @@ -818,104 +825,7 @@ install_optional_operators() { #────────────────────────────────────────────────────────────── # RATE LIMITER INSTALLATION #────────────────────────────────────────────────────────────── - -# Patch Kuadrant/RHCL CSV to recognize OpenShift Gateway controller -# This is required because Kuadrant needs to know about the Gateway API provider -# Without this patch, Kuadrant shows "MissingDependency" and AuthPolicies won't be enforced -patch_kuadrant_csv_for_gateway() { - local namespace=$1 - local operator_prefix=$2 - - log_info "Patching $operator_prefix CSV for OpenShift Gateway controller..." - - # Find the CSV - local csv_name - csv_name=$(kubectl get csv -n "$namespace" --no-headers 2>/dev/null | grep "^${operator_prefix}" | awk '{print $1}' | head -1) - - if [[ -z "$csv_name" ]]; then - log_warn "Could not find CSV for $operator_prefix in $namespace, skipping Gateway controller patch" - return 0 - fi - - # Check if ISTIO_GATEWAY_CONTROLLER_NAMES already has both values - local current_value - current_value=$(kubectl get csv "$csv_name" -n "$namespace" -o jsonpath='{.spec.install.spec.deployments[0].spec.template.spec.containers[0].env[?(@.name=="ISTIO_GATEWAY_CONTROLLER_NAMES")].value}' 2>/dev/null || echo "") - - if [[ "$current_value" == *"istio.io/gateway-controller"* && "$current_value" == *"openshift.io/gateway-controller"* ]]; then - log_debug "CSV already has correct ISTIO_GATEWAY_CONTROLLER_NAMES value" - return 0 - fi - - # Find the index of ISTIO_GATEWAY_CONTROLLER_NAMES env var - local env_index - env_index=$(kubectl get csv "$csv_name" -n "$namespace" -o json | jq '.spec.install.spec.deployments[0].spec.template.spec.containers[0].env | to_entries | .[] | 
select(.value.name=="ISTIO_GATEWAY_CONTROLLER_NAMES") | .key' 2>/dev/null || echo "") - - if [[ -z "$env_index" ]]; then - # Env var doesn't exist, add it - log_debug "Adding ISTIO_GATEWAY_CONTROLLER_NAMES to CSV" - kubectl patch csv "$csv_name" -n "$namespace" --type='json' -p='[ - { - "op": "add", - "path": "/spec/install/spec/deployments/0/spec/template/spec/containers/0/env/-", - "value": { - "name": "ISTIO_GATEWAY_CONTROLLER_NAMES", - "value": "istio.io/gateway-controller,openshift.io/gateway-controller/v1" - } - } - ]' 2>/dev/null || log_warn "Failed to add ISTIO_GATEWAY_CONTROLLER_NAMES to CSV" - else - # Env var exists, update it - log_debug "Updating ISTIO_GATEWAY_CONTROLLER_NAMES in CSV (index: $env_index)" - kubectl patch csv "$csv_name" -n "$namespace" --type='json' -p="[ - { - \"op\": \"replace\", - \"path\": \"/spec/install/spec/deployments/0/spec/template/spec/containers/0/env/${env_index}/value\", - \"value\": \"istio.io/gateway-controller,openshift.io/gateway-controller/v1\" - } - ]" 2>/dev/null || log_warn "Failed to update ISTIO_GATEWAY_CONTROLLER_NAMES in CSV" - fi - - log_info "CSV patched for OpenShift Gateway controller" - - # CRITICAL: Force delete the operator pod to pick up the new env var - # OLM updates the deployment spec but doesn't always trigger a pod restart - # The operator must have ISTIO_GATEWAY_CONTROLLER_NAMES set BEFORE Kuadrant CR is created - log_info "Forcing operator restart to apply new Gateway controller configuration..." 
- - # The kuadrant operator deployment is always named kuadrant-operator-controller-manager - # regardless of whether we're using rhcl-operator or kuadrant-operator - local operator_deployment="kuadrant-operator-controller-manager" - if kubectl get deployment "$operator_deployment" -n "$namespace" &>/dev/null; then - # Force delete the operator pod - this ensures the new env var is picked up - kubectl delete pod -n "$namespace" -l control-plane=controller-manager --force --grace-period=0 2>/dev/null || \ - kubectl delete pod -n "$namespace" -l app.kubernetes.io/name=kuadrant-operator --force --grace-period=0 2>/dev/null || \ - kubectl delete pod -n "$namespace" -l app=kuadrant --force --grace-period=0 2>/dev/null || true - - # Wait for the new pod to be ready - log_info "Waiting for operator pod to restart..." - sleep 5 - kubectl rollout status deployment/"$operator_deployment" -n "$namespace" --timeout="${ROLLOUT_TIMEOUT}s" 2>/dev/null || \ - log_warn "Operator rollout status check timed out (timeout: ${ROLLOUT_TIMEOUT}s)" - - # Verify the env var is in the RUNNING pod - local pod_env - pod_env=$(kubectl exec -n "$namespace" deployment/"$operator_deployment" -- env 2>/dev/null | grep ISTIO_GATEWAY_CONTROLLER_NAMES || echo "") - - if [[ "$pod_env" == *"openshift.io/gateway-controller/v1"* ]]; then - log_info "Operator pod is running with OpenShift Gateway controller configuration" - else - log_warn "Operator pod may not have correct env yet: $pod_env" - fi - - # Give the operator time to fully initialize with the new Gateway controller configuration - # This is critical - the operator needs to register as a Gateway controller before Kuadrant CR is created - log_info "Waiting 15s for operator to fully initialize with Gateway controller configuration..." 
- sleep 15 - else - log_warn "Could not find operator deployment, waiting 60s for env propagation" - sleep 60 - fi -} +# patch_csv_operator_container_env and patch_kuadrant_csv live in deployment-helpers.sh install_policy_engine() { log_info "Installing policy engine: $POLICY_ENGINE" @@ -937,7 +847,7 @@ install_policy_engine() { fi # Patch RHCL CSV to recognize OpenShift Gateway controller - patch_kuadrant_csv_for_gateway "rh-connectivity-link" "rhcl-operator" + patch_kuadrant_csv "rh-connectivity-link" "rhcl-operator" # Apply RHCL/Kuadrant custom resource apply_kuadrant_cr "rh-connectivity-link" @@ -1000,7 +910,7 @@ EOF fi # Patch Kuadrant CSV to recognize OpenShift Gateway controller - patch_kuadrant_csv_for_gateway "$kuadrant_ns" "kuadrant-operator" + patch_kuadrant_csv "$kuadrant_ns" "kuadrant-operator" # Apply Kuadrant custom resource apply_kuadrant_cr "$kuadrant_ns" @@ -1165,7 +1075,7 @@ apply_custom_resources() { local webhook_deployment if [[ "$OPERATOR_TYPE" == "rhoai" ]]; then - webhook_deployment="rhods-operator-controller-manager" + webhook_deployment="rhods-operator" else webhook_deployment="opendatahub-operator-controller-manager" fi diff --git a/scripts/deployment-helpers.sh b/scripts/deployment-helpers.sh index 80d3f3a7a..c64472929 100755 --- a/scripts/deployment-helpers.sh +++ b/scripts/deployment-helpers.sh @@ -223,6 +223,137 @@ log_error() { # OLM Subscription and CSV Helper Functions # ========================================== +# Patch one env var on spec.install.spec.deployments[0].containers[0] of a ClusterServiceVersion. +# Returns 0 if a patch was applied, 1 if the value was already correct, 2 if patch failed. 
+patch_csv_operator_container_env() { + local namespace=$1 + local csv_name=$2 + local env_name=$3 + local env_value=$4 + + local current + current=$(kubectl get csv "$csv_name" -n "$namespace" -o jsonpath="{.spec.install.spec.deployments[0].spec.template.spec.containers[0].env[?(@.name==\"${env_name}\")].value}" 2>/dev/null || echo "") + + if [[ "$current" == "$env_value" ]]; then + return 1 + fi + + local env_index + env_index=$(kubectl get csv "$csv_name" -n "$namespace" -o json | jq -r --arg n "$env_name" '.spec.install.spec.deployments[0].spec.template.spec.containers[0].env | to_entries[] | select(.value.name == $n) | .key' 2>/dev/null | head -1) + + if [[ -z "$env_index" ]]; then + log_debug "Adding ${env_name} to CSV ${csv_name}" + kubectl patch csv "$csv_name" -n "$namespace" --type='json' -p="[ + { + \"op\": \"add\", + \"path\": \"/spec/install/spec/deployments/0/spec/template/spec/containers/0/env/-\", + \"value\": { + \"name\": \"${env_name}\", + \"value\": \"${env_value}\" + } + } + ]" 2>/dev/null || { + log_warn "Failed to add ${env_name} to CSV" + return 2 + } + else + log_debug "Updating ${env_name} in CSV ${csv_name} (index: $env_index)" + kubectl patch csv "$csv_name" -n "$namespace" --type='json' -p="[ + { + \"op\": \"replace\", + \"path\": \"/spec/install/spec/deployments/0/spec/template/spec/containers/0/env/${env_index}/value\", + \"value\": \"${env_value}\" + } + ]" 2>/dev/null || { + log_warn "Failed to update ${env_name} in CSV" + return 2 + } + fi + return 0 +} + +# Patch Kuadrant/RHCL CSV to recognize OpenShift Gateway controller +# This is required because Kuadrant needs to know about the Gateway API provider +# Without this patch, Kuadrant shows "MissingDependency" and AuthPolicies won't be enforced +# +# Also sets RATELIMIT_*_SERVICE_FAILURE_MODE=deny so policy fails closed when Limitador +# service is unavailable (see Kuadrant operator deployment env). +# +# Arguments: e.g. 
patch_kuadrant_csv "kuadrant-system" "kuadrant-operator" +patch_kuadrant_csv() { + local namespace=$1 + local operator_prefix=$2 + + log_info "Patching $operator_prefix CSV (Gateway API, rate limit failure modes)..." + + # Find the CSV + local csv_name + csv_name=$(kubectl get csv -n "$namespace" --no-headers 2>/dev/null | grep "^${operator_prefix}" | awk '{print $1}' | head -1) + + if [[ -z "$csv_name" ]]; then + log_warn "Could not find CSV for $operator_prefix in $namespace, skipping Gateway controller patch" + return 0 + fi + + local patched_any=false + + # --- ISTIO_GATEWAY_CONTROLLER_NAMES (OpenShift Gateway controller) --- + local gateway_controller_names="istio.io/gateway-controller,openshift.io/gateway-controller/v1" + patch_csv_operator_container_env "$namespace" "$csv_name" "ISTIO_GATEWAY_CONTROLLER_NAMES" "$gateway_controller_names" && patched_any=true + + # --- Rate limit dependency failure modes (fail closed) --- + patch_csv_operator_container_env "$namespace" "$csv_name" "RATELIMIT_CHECK_SERVICE_FAILURE_MODE" "deny" && patched_any=true + patch_csv_operator_container_env "$namespace" "$csv_name" "RATELIMIT_REPORT_SERVICE_FAILURE_MODE" "deny" && patched_any=true + + if [[ "$patched_any" != "true" ]]; then + log_debug "CSV already has all required operator env (Gateway + rate limit failure modes)" + return 0 + fi + + log_info "CSV patched (Gateway controller and/or rate limit failure modes)" + + # CRITICAL: Force delete the operator pod to pick up the new env var + # OLM updates the deployment spec but doesn't always trigger a pod restart + # The operator must have ISTIO_GATEWAY_CONTROLLER_NAMES set BEFORE Kuadrant CR is created + log_info "Forcing operator restart to apply CSV env configuration..." 
+ + # The kuadrant operator deployment is always named kuadrant-operator-controller-manager + # regardless of whether we're using rhcl-operator or kuadrant-operator + local operator_deployment="kuadrant-operator-controller-manager" + if kubectl get deployment "$operator_deployment" -n "$namespace" &>/dev/null; then + # Force delete the operator pod - this ensures the new env var is picked up + kubectl delete pod -n "$namespace" -l control-plane=controller-manager --force --grace-period=0 2>/dev/null || \ + kubectl delete pod -n "$namespace" -l app.kubernetes.io/name=kuadrant-operator --force --grace-period=0 2>/dev/null || \ + kubectl delete pod -n "$namespace" -l app=kuadrant --force --grace-period=0 2>/dev/null || true + + # Wait for the new pod to be ready + log_info "Waiting for operator pod to restart..." + sleep 5 + kubectl rollout status deployment/"$operator_deployment" -n "$namespace" --timeout="${ROLLOUT_TIMEOUT}s" 2>/dev/null || \ + log_warn "Operator rollout status check timed out (timeout: ${ROLLOUT_TIMEOUT}s)" + + # Verify required env vars are in the RUNNING pod + local pod_env + pod_env=$(kubectl exec -n "$namespace" deployment/"$operator_deployment" -- env 2>/dev/null || true) + + if echo "$pod_env" | grep '^ISTIO_GATEWAY_CONTROLLER_NAMES=' | grep -q 'openshift.io/gateway-controller/v1' \ + && echo "$pod_env" | grep -Fq 'RATELIMIT_CHECK_SERVICE_FAILURE_MODE=deny' \ + && echo "$pod_env" | grep -Fq 'RATELIMIT_REPORT_SERVICE_FAILURE_MODE=deny'; then + log_info "Operator pod has required CSV env (ISTIO gateway controller + RATELIMIT_* failure modes)" + else + log_warn "Operator pod may not have correct env yet (ISTIO / RATELIMIT_* failure modes)" + fi + + # Give the operator time to fully initialize with the new Gateway controller configuration + # This is critical - the operator needs to register as a Gateway controller before Kuadrant CR is created + log_info "Waiting 15s for operator to fully initialize with Gateway controller configuration..." 
+ sleep 15 + else + log_warn "Could not find operator deployment, waiting 60s for env propagation" + sleep 60 + fi +} + # waitsubscriptioninstalled namespace subscription_name # Waits for an OLM Subscription to finish installing its CSV. # Exits with error if the installation times out. @@ -739,6 +870,29 @@ find_project_root() { fi } +# _patch_params_env key value project_root +# Patches a key=value line in params.env. Creates a backup on first call. +_patch_params_env() { + local key="$1" value="$2" project_root="$3" + export _MAAS_PARAMS_ENV="$project_root/deployment/overlays/odh/params.env" + [ -f "$_MAAS_PARAMS_ENV" ] || return 0 + export _MAAS_PARAMS_ENV_BACKUP="${_MAAS_PARAMS_ENV}.backup" + if [ ! -f "$_MAAS_PARAMS_ENV_BACKUP" ]; then + cp "$_MAAS_PARAMS_ENV" "$_MAAS_PARAMS_ENV_BACKUP" + fi + local sed_cmd="sed" + [[ "$(uname -s)" == "Darwin" ]] && sed_cmd="gsed" + $sed_cmd -i "s|^${key}=.*|${key}=${value}|" "$_MAAS_PARAMS_ENV" +} + +# _cleanup_params_env +# Restores params.env from backup. Safe to call multiple times. +_cleanup_params_env() { + if [ -n "${_MAAS_PARAMS_ENV_BACKUP:-}" ] && [ -f "$_MAAS_PARAMS_ENV_BACKUP" ]; then + mv -f "$_MAAS_PARAMS_ENV_BACKUP" "$_MAAS_PARAMS_ENV" 2>/dev/null || true + fi +} + # set_maas_api_image # Sets the MaaS API container image in base kustomization using MAAS_API_IMAGE env var. # If MAAS_API_IMAGE is not set, does nothing. @@ -774,58 +928,20 @@ set_maas_api_image() { mv -f "$_MAAS_API_BACKUP" "$_MAAS_API_KUSTOMIZATION" 2>/dev/null || true return 1 } + + # Patch params.env — kustomize replacements in shared-patches read from this + # file and override the base images: transformer set above. + _patch_params_env "maas-api-image" "$MAAS_API_IMAGE" "$project_root" } # cleanup_maas_api_image -# Restores the original kustomization.yaml from backup. +# Restores the original kustomization.yaml and params.env from backup. # Safe to call even if set_maas_api_image was not called or MAAS_API_IMAGE was not set. 
cleanup_maas_api_image() { if [ -n "${_MAAS_API_BACKUP:-}" ] && [ -f "$_MAAS_API_BACKUP" ]; then mv -f "$_MAAS_API_BACKUP" "$_MAAS_API_KUSTOMIZATION" 2>/dev/null || true fi -} - -# set_maas_controller_image -# Sets the MaaS controller container image in config/manager kustomization using MAAS_CONTROLLER_IMAGE env var. -# If MAAS_CONTROLLER_IMAGE is not set, does nothing. -# Creates a backup that must be restored by calling cleanup_maas_controller_image. -# -# Environment: -# MAAS_CONTROLLER_IMAGE - Container image to use (e.g., quay.io/opendatahub/maas-controller:pr-42) -set_maas_controller_image() { - if [ -z "${MAAS_CONTROLLER_IMAGE:-}" ]; then - return 0 - fi - if [ -n "${_MAAS_CONTROLLER_IMAGE_SET:-}" ]; then - return 0 - fi - - local project_root - project_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" - - export _MAAS_CONTROLLER_KUSTOMIZATION="$project_root/deployment/base/maas-controller/manager/kustomization.yaml" - export _MAAS_CONTROLLER_BACKUP="${_MAAS_CONTROLLER_KUSTOMIZATION}.backup" - export _MAAS_CONTROLLER_IMAGE_SET=1 - - echo " Setting MaaS controller image: ${MAAS_CONTROLLER_IMAGE}" - cp "$_MAAS_CONTROLLER_KUSTOMIZATION" "$_MAAS_CONTROLLER_BACKUP" || { - echo "Error: failed to create backup of controller kustomization.yaml" >&2 - return 1 - } - (cd "$(dirname "$_MAAS_CONTROLLER_KUSTOMIZATION")" && kustomize edit set image "maas-controller=${MAAS_CONTROLLER_IMAGE}") || { - echo "Error: failed to set image in controller kustomization.yaml" >&2 - mv -f "$_MAAS_CONTROLLER_BACKUP" "$_MAAS_CONTROLLER_KUSTOMIZATION" 2>/dev/null || true - return 1 - } -} - -# cleanup_maas_controller_image -# Restores the original controller kustomization.yaml from backup. -# Safe to call even if set_maas_controller_image was not called or MAAS_CONTROLLER_IMAGE was not set. 
-cleanup_maas_controller_image() { - if [ -n "${_MAAS_CONTROLLER_BACKUP:-}" ] && [ -f "$_MAAS_CONTROLLER_BACKUP" ]; then - mv -f "$_MAAS_CONTROLLER_BACKUP" "$_MAAS_CONTROLLER_KUSTOMIZATION" 2>/dev/null || true - fi + _cleanup_params_env } # set_overlay_namespace overlay_dir namespace @@ -833,7 +949,7 @@ cleanup_maas_controller_image() { # Creates a backup that must be restored by calling cleanup_overlay_namespace. # # Arguments: -# overlay_dir - Path to overlay directory (e.g. deployment/overlays/tls-backend) +# overlay_dir - Path to overlay directory # namespace - Namespace to set (e.g. opendatahub) set_overlay_namespace() { local overlay_dir="${1?overlay_dir is required}" @@ -1488,3 +1604,116 @@ create_maas_db_config_secret() { kubectl label --local -f - app=maas-api --dry-run=client -o yaml | \ kubectl apply -n "$namespace" -f - } + +# ========================================== +# Diagnostic Helpers +# ========================================== + +# dump_llmis_diagnostics +# Dumps comprehensive diagnostic information when an LLMInferenceService +# fails to become ready. Captures pod status, logs, events, and node resources +# to help diagnose deployment failures. +# +# Usage: +# if ! 
kubectl wait llminferenceservice/my-model --for=condition=Ready; then +# dump_llmis_diagnostics "my-model" "llm" +# fi +# +# Output: +# - LLMInferenceService status (conditions, observedGeneration) +# - Pod status (wide format) +# - ReplicaSet/Deployment status +# - Container logs (current and previous) +# - Namespace events +# - Node resource allocation +dump_llmis_diagnostics() { + local llmis_name="$1" + local namespace="$2" + + if [[ -z "$llmis_name" || -z "$namespace" ]]; then + echo "Usage: dump_llmis_diagnostics " + return 1 + fi + + echo "" + echo "==========================================" + echo "LLMInferenceService Diagnostics: $llmis_name" + echo "==========================================" + + echo "" + echo "========== LLMInferenceService Status ==========" + # Only output status (not full YAML) to avoid logging potentially sensitive spec fields + kubectl get llminferenceservice/"$llmis_name" -n "$namespace" -o jsonpath='{.status}' 2>&1 | jq -C '.' 2>/dev/null || \ + kubectl get llminferenceservice/"$llmis_name" -n "$namespace" -o jsonpath='{.status}' 2>&1 || \ + echo " (failed to get LLMIS status)" + + echo "" + echo "========== Pod Status ==========" + # KServe creates resources with pattern: ${llmis_name}-kserve-* + # Use name-based filtering since label selectors may not match + if kubectl get pods -n "$namespace" 2>/dev/null | grep -q "^${llmis_name}-"; then + kubectl get pods -n "$namespace" 2>&1 | grep "^NAME\|^${llmis_name}-" || echo " (no matching pods found)" + else + echo " (no pods found matching pattern: ${llmis_name}-*)" + fi + + echo "" + echo "========== ReplicaSet Status ==========" + if kubectl get rs -n "$namespace" 2>/dev/null | grep -q "^${llmis_name}-"; then + kubectl get rs -n "$namespace" -o wide 2>&1 | grep "^NAME\|^${llmis_name}-" || echo " (no matching replicasets found)" + else + echo " (no replicasets found matching pattern: ${llmis_name}-*)" + fi + + echo "" + echo "========== Deployment Status ==========" + if kubectl 
get deployment -n "$namespace" 2>/dev/null | grep -q "^${llmis_name}-"; then + kubectl get deployment -n "$namespace" -o wide 2>&1 | grep "^NAME\|^${llmis_name}-" || echo " (no matching deployments found)" + else + echo " (no deployments found matching pattern: ${llmis_name}-*)" + fi + + echo "" + echo "========== Container Logs ==========" + local pods + # Use awk alone to avoid grep exit code 1 when no matches found + pods=$(kubectl get pods -n "$namespace" --no-headers 2>/dev/null | awk '/^'"${llmis_name}"'-/ {print $1}') + + if [[ -z "$pods" ]]; then + echo " (no pods found - container logs unavailable)" + else + for pod in $pods; do + echo "" + echo "--- Pod: $pod ---" + + # Try main container + echo "Main container (current):" + kubectl logs "$pod" -n "$namespace" -c main --tail=100 2>&1 || echo " (no logs available)" + + echo "" + echo "Main container (previous - if crashed):" + kubectl logs "$pod" -n "$namespace" -c main --previous --tail=100 2>&1 || echo " (no previous logs)" + + echo "" + echo "Storage initializer container:" + kubectl logs "$pod" -n "$namespace" -c storage-initializer --tail=50 2>&1 || echo " (no logs available)" + done + fi + + echo "" + echo "========== Namespace Events (Recent 100) ==========" + kubectl get events -n "$namespace" --sort-by='.lastTimestamp' 2>&1 | tail -100 || echo " (failed to get events)" + + echo "" + echo "========== Node Status ==========" + kubectl get nodes -o wide 2>&1 || echo " (failed to get nodes)" + + echo "" + echo "========== Node Resource Allocation ==========" + kubectl describe nodes 2>&1 | grep -A 10 "Allocated resources:" || echo " (failed to get node resources)" + + echo "" + echo "==========================================" + echo "End of diagnostics for: $llmis_name" + echo "==========================================" +} diff --git a/scripts/install-dependencies.sh b/scripts/install-dependencies.sh index ea2ef5a39..e466520ab 100755 --- a/scripts/install-dependencies.sh +++ 
b/scripts/install-dependencies.sh @@ -168,29 +168,9 @@ EOF sleep 5 - # Patch Kuadrant for OpenShift Gateway Controller - echo " Patching Kuadrant operator..." - if ! kubectl -n kuadrant-system get deployment kuadrant-operator-controller-manager -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="ISTIO_GATEWAY_CONTROLLER_NAMES")]}' | grep -q "ISTIO_GATEWAY_CONTROLLER_NAMES"; then - # Find the actual CSV name instead of hardcoding it - KUADRANT_CSV=$(find_csv_with_min_version "kuadrant-operator" "$KUADRANT_MIN_VERSION" "kuadrant-system" || echo "") - if [ -n "$KUADRANT_CSV" ]; then - kubectl patch csv "$KUADRANT_CSV" -n kuadrant-system --type='json' -p='[ - { - "op": "add", - "path": "/spec/install/spec/deployments/0/spec/template/spec/containers/0/env/-", - "value": { - "name": "ISTIO_GATEWAY_CONTROLLER_NAMES", - "value": "istio.io/gateway-controller,openshift.io/gateway-controller/v1" - } - } - ]' - echo " ✅ Kuadrant operator patched ($KUADRANT_CSV)" - else - echo " âš ī¸ Kuadrant CSV not found, skipping patch" - fi - else - echo " ✅ Kuadrant operator already configured" - fi + # Gateway API + fail-close rate limits (same as deploy.sh patch_kuadrant_csv) + echo "🚀 Patching Kuadrant operator CSV..." 
+ patch_kuadrant_csv "kuadrant-system" "kuadrant-operator" echo "✅ Successfully installed kuadrant" echo "" diff --git a/scripts/validate-deployment.sh b/scripts/validate-deployment.sh index 01d5dcf4c..843779d65 100755 --- a/scripts/validate-deployment.sh +++ b/scripts/validate-deployment.sh @@ -394,7 +394,7 @@ print_header "3ī¸âƒŖ Policy Status" print_check "AuthPolicy" AUTHPOLICY_COUNT=$(kubectl get authpolicy -A --no-headers 2>/dev/null | wc -l || echo "0") if [ "$AUTHPOLICY_COUNT" -gt 0 ]; then - AUTHPOLICY_STATUS=$(kubectl get authpolicy -n openshift-ingress gateway-auth-policy -o jsonpath='{.status.conditions[?(@.type=="Accepted")].status}' 2>/dev/null || echo "NotFound") + AUTHPOLICY_STATUS=$(kubectl get authpolicy -n openshift-ingress gateway-default-auth -o jsonpath='{.status.conditions[?(@.type=="Accepted")].status}' 2>/dev/null || echo "NotFound") if [ "$AUTHPOLICY_STATUS" = "True" ]; then print_success "AuthPolicy is configured and accepted" else diff --git a/scripts/verify-rbac-aggregation.sh b/scripts/verify-rbac-aggregation.sh new file mode 100755 index 000000000..d575a98f5 --- /dev/null +++ b/scripts/verify-rbac-aggregation.sh @@ -0,0 +1,289 @@ +#!/usr/bin/env bash + +# verify-rbac-aggregation.sh +# +# PURPOSE: +# Manual validation helper for platform administrators to verify that MaaS RBAC +# aggregation is correctly configured after deployment. +# +# USAGE: +# ./scripts/verify-rbac-aggregation.sh +# +# REQUIREMENTS: +# - Kubernetes cluster with MaaS deployed +# - kubectl configured with cluster-admin permissions +# - jq command-line JSON processor +# - ClusterRoles must be created (maas-owner-role, maas-user-view-role) +# +# WHAT IT CHECKS: +# 1. Aggregated ClusterRoles exist (maas-owner-role, maas-user-view-role) +# 2. ClusterRoles have correct aggregation labels +# 3. Built-in admin/edit/view roles include MaaS permissions via aggregation +# 4. 
Correct verbs are assigned to each role (create/delete for admin, read-only for view) +# +# WHEN TO USE: +# - After initial MaaS deployment +# - When troubleshooting namespace user permission issues +# - After MaaS upgrades to verify RBAC configuration +# +# NOT USED IN CI/CD: +# This is a manual diagnostic tool. CI validates manifests via validate-manifests.sh, +# but runtime cluster state validation requires a live deployment and is done manually. + +set -euo pipefail + +# Color output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Test results +PASSED=0 +FAILED=0 + +log_info() { + echo -e "${BLUE}ℹ${NC} $*" +} + +log_success() { + echo -e "${GREEN}✓${NC} $*" + ((PASSED++)) || true +} + +log_error() { + echo -e "${RED}✗${NC} $*" + ((FAILED++)) || true +} + +log_warning() { + echo -e "${YELLOW}⚠${NC} $*" +} + +echo "==========================================" +echo "MaaS RBAC Aggregation Verification" +echo "==========================================" +echo "" + +# Verify jq is installed +if ! command -v jq &>/dev/null; then + echo -e "${RED}✗${NC} jq is not installed. This script requires jq for precise RBAC verification." + echo " Install jq: https://jqlang.github.io/jq/download/" + exit 1 +fi + +# Check 1: Verify aggregated ClusterRoles exist +log_info "Checking for aggregated ClusterRoles..." + +if kubectl get clusterrole maas-owner-role &>/dev/null; then + log_success "ClusterRole 'maas-owner-role' exists" +else + log_error "ClusterRole 'maas-owner-role' not found" +fi + +if kubectl get clusterrole maas-user-view-role &>/dev/null; then + log_success "ClusterRole 'maas-user-view-role' exists" +else + log_error "ClusterRole 'maas-user-view-role' not found" +fi + +echo "" + +# Check 2: Verify aggregation labels on maas-owner-role +log_info "Checking aggregation labels on maas-owner-role..." 
+ +AGGREGATE_TO_ADMIN=$(kubectl get clusterrole maas-owner-role -o jsonpath='{.metadata.labels.rbac\.authorization\.k8s\.io/aggregate-to-admin}' 2>/dev/null || echo "") +if [ "$AGGREGATE_TO_ADMIN" = "true" ]; then + log_success "maas-owner-role has 'aggregate-to-admin: true' label" +else + log_error "maas-owner-role missing 'aggregate-to-admin: true' label" +fi + +AGGREGATE_TO_EDIT=$(kubectl get clusterrole maas-owner-role -o jsonpath='{.metadata.labels.rbac\.authorization\.k8s\.io/aggregate-to-edit}' 2>/dev/null || echo "") +if [ "$AGGREGATE_TO_EDIT" = "true" ]; then + log_success "maas-owner-role has 'aggregate-to-edit: true' label" +else + log_error "maas-owner-role missing 'aggregate-to-edit: true' label" +fi + +echo "" + +# Check 3: Verify aggregation labels on maas-user-view-role +log_info "Checking aggregation labels on maas-user-view-role..." + +AGGREGATE_TO_VIEW=$(kubectl get clusterrole maas-user-view-role -o jsonpath='{.metadata.labels.rbac\.authorization\.k8s\.io/aggregate-to-view}' 2>/dev/null || echo "") +if [ "$AGGREGATE_TO_VIEW" = "true" ]; then + log_success "maas-user-view-role has 'aggregate-to-view: true' label" +else + log_error "maas-user-view-role missing 'aggregate-to-view: true' label" +fi + +AGGREGATE_TO_ADMIN=$(kubectl get clusterrole maas-user-view-role -o jsonpath='{.metadata.labels.rbac\.authorization\.k8s\.io/aggregate-to-admin}' 2>/dev/null || echo "") +if [ "$AGGREGATE_TO_ADMIN" = "true" ]; then + log_success "maas-user-view-role has 'aggregate-to-admin: true' label" +else + log_error "maas-user-view-role missing 'aggregate-to-admin: true' label" +fi + +AGGREGATE_TO_EDIT=$(kubectl get clusterrole maas-user-view-role -o jsonpath='{.metadata.labels.rbac\.authorization\.k8s\.io/aggregate-to-edit}' 2>/dev/null || echo "") +if [ "$AGGREGATE_TO_EDIT" = "true" ]; then + log_success "maas-user-view-role has 'aggregate-to-edit: true' label" +else + log_error "maas-user-view-role missing 'aggregate-to-edit: true' label" +fi + +echo "" + +# 
Check 4: Verify built-in admin role includes MaaS permissions +log_info "Checking if 'admin' ClusterRole includes MaaS permissions..." + +ADMIN_RULES=$(kubectl get clusterrole admin -o yaml 2>/dev/null || echo "") + +if echo "$ADMIN_RULES" | grep -q "maas.opendatahub.io"; then + log_success "'admin' ClusterRole includes maas.opendatahub.io API group" + + # Check for specific resources - fail if missing + if echo "$ADMIN_RULES" | grep -A5 "maas.opendatahub.io" | grep -q "maasmodelrefs"; then + log_success "'admin' ClusterRole includes maasmodelrefs resource" + else + log_error "'admin' ClusterRole missing required maasmodelrefs resource" + fi + + if echo "$ADMIN_RULES" | grep -A5 "maas.opendatahub.io" | grep -q "externalmodels"; then + log_success "'admin' ClusterRole includes externalmodels resource" + else + log_error "'admin' ClusterRole missing required externalmodels resource" + fi +else + log_error "'admin' ClusterRole does not include maas.opendatahub.io API group" + log_warning "RBAC aggregation may take a few seconds after ClusterRole creation" +fi + +echo "" + +# Check 5: Verify built-in edit role includes MaaS permissions +log_info "Checking if 'edit' ClusterRole includes MaaS permissions..." 
+ +EDIT_RULES=$(kubectl get clusterrole edit -o yaml 2>/dev/null || echo "") + +if echo "$EDIT_RULES" | grep -q "maas.opendatahub.io"; then + log_success "'edit' ClusterRole includes maas.opendatahub.io API group" + + # Check for specific resources - fail if missing + if echo "$EDIT_RULES" | grep -A5 "maas.opendatahub.io" | grep -q "maasmodelrefs"; then + log_success "'edit' ClusterRole includes maasmodelrefs resource" + else + log_error "'edit' ClusterRole missing required maasmodelrefs resource" + fi + + if echo "$EDIT_RULES" | grep -A5 "maas.opendatahub.io" | grep -q "externalmodels"; then + log_success "'edit' ClusterRole includes externalmodels resource" + else + log_error "'edit' ClusterRole missing required externalmodels resource" + fi +else + log_error "'edit' ClusterRole does not include maas.opendatahub.io API group" + log_warning "RBAC aggregation may take a few seconds after ClusterRole creation" +fi + +echo "" + +# Check 6: Verify built-in view role includes MaaS permissions +log_info "Checking if 'view' ClusterRole includes MaaS permissions..." 
+ +VIEW_RULES=$(kubectl get clusterrole view -o yaml 2>/dev/null || echo "") + +if echo "$VIEW_RULES" | grep -q "maas.opendatahub.io"; then + log_success "'view' ClusterRole includes maas.opendatahub.io API group" + + # Check for specific resources - fail if missing + if echo "$VIEW_RULES" | grep -A5 "maas.opendatahub.io" | grep -q "maasmodelrefs"; then + log_success "'view' ClusterRole includes maasmodelrefs resource" + else + log_error "'view' ClusterRole missing required maasmodelrefs resource" + fi + + if echo "$VIEW_RULES" | grep -A5 "maas.opendatahub.io" | grep -q "externalmodels"; then + log_success "'view' ClusterRole includes externalmodels resource" + else + log_error "'view' ClusterRole missing required externalmodels resource" + fi +else + log_error "'view' ClusterRole does not include maas.opendatahub.io API group" + log_warning "RBAC aggregation may take a few seconds after ClusterRole creation" +fi + +echo "" + +# Check 7: Verify correct verbs for admin role +log_info "Checking verbs for 'admin' ClusterRole MaaS permissions..." + +# Extract verbs only from the MaaS rule using jq +ADMIN_VERBS=$(kubectl get clusterrole admin -o json 2>/dev/null | jq -r '.rules[] | select(.apiGroups[]? == "maas.opendatahub.io") | .verbs[]' 2>/dev/null || echo "") + +EXPECTED_VERBS=("create" "delete" "get" "list" "patch" "update" "watch") +for verb in "${EXPECTED_VERBS[@]}"; do + if echo "$ADMIN_VERBS" | grep -Fx "$verb" >/dev/null; then + log_success "'admin' role has '$verb' verb for MaaS resources" + else + log_error "'admin' role missing required '$verb' verb for MaaS resources" + fi +done + +echo "" + +# Check 8: Verify correct verbs for view role (read-only) +log_info "Checking verbs for 'view' ClusterRole MaaS permissions..." + +# Extract verbs only from the MaaS rule using jq +VIEW_VERBS=$(kubectl get clusterrole view -o json 2>/dev/null | jq -r '.rules[] | select(.apiGroups[]? 
== "maas.opendatahub.io") | .verbs[]' 2>/dev/null || echo "") + +READ_VERBS=("get" "list" "watch") +for verb in "${READ_VERBS[@]}"; do + if echo "$VIEW_VERBS" | grep -Fx "$verb" >/dev/null; then + log_success "'view' role has '$verb' verb for MaaS resources" + else + log_error "'view' role missing required '$verb' verb for MaaS resources" + fi +done + +# Ensure view role doesn't have write verbs +WRITE_VERBS=("create" "delete" "patch" "update") +for verb in "${WRITE_VERBS[@]}"; do + if echo "$VIEW_VERBS" | grep -Fx "$verb" >/dev/null; then + log_error "'view' role incorrectly has '$verb' verb (should be read-only)" + fi +done + +echo "" +echo "==========================================" +echo "Summary" +echo "==========================================" +echo -e "${GREEN}Passed:${NC} $PASSED" +echo -e "${RED}Failed:${NC} $FAILED" +echo "" + +if [[ $FAILED -eq 0 ]]; then + echo -e "${GREEN}✓ All RBAC aggregation checks passed!${NC}" + echo "" + echo "Next steps:" + echo " 1. Grant namespace users 'admin' or 'edit' role to enable MaaSModelRef creation" + echo " 2. Grant namespace users 'view' role for read-only access" + echo "" + echo "Example: Grant admin role to a user in namespace 'my-models'" + echo " kubectl create rolebinding my-models-admin \\" + echo " --clusterrole=admin \\" + echo " --user=user@example.com \\" + echo " -n my-models" + exit 0 +else + echo -e "${RED}✗ Some RBAC aggregation checks failed${NC}" + echo "" + echo "Troubleshooting:" + echo " 1. Verify MaaS controller is deployed: kubectl get deployment maas-controller -n opendatahub" + echo " 2. Check ClusterRole definitions: kubectl get clusterrole | grep maas-user" + echo " 3. Wait a few seconds for RBAC aggregation to propagate" + echo " 4. 
Check for RBAC controller errors: kubectl logs -n kube-system -l component=kube-controller-manager" + exit 1 +fi diff --git a/semgrep.yaml b/semgrep.yaml index 516925627..94a3768e8 100644 --- a/semgrep.yaml +++ b/semgrep.yaml @@ -21,7 +21,7 @@ rules: # SECTION 1: GENERIC SECRETS DETECTION — Applies to all file types # ========================================================================== - - id: generic-hardcoded-secret + - id: generic-hardcoded-secret # pragma: allowlist secret languages: [generic] severity: ERROR message: | @@ -77,7 +77,7 @@ rules: cwe: "CWE-798" category: "security" - - id: generic-aws-secret-access-key + - id: generic-aws-secret-access-key # pragma: allowlist secret languages: [generic] severity: ERROR message: | @@ -361,7 +361,7 @@ rules: category: "security" note: "Not necessarily dangerous, but aggregated roles can accumulate unexpected permissions if selectors are too broad" - - id: k8s-rbac-secrets-cluster-access + - id: k8s-rbac-secrets-cluster-access # pragma: allowlist secret languages: [yaml] severity: WARNING message: | @@ -498,7 +498,7 @@ rules: cwe: "CWE-653" category: "security" - - id: k8s-secret-in-configmap + - id: k8s-secret-in-configmap # pragma: allowlist secret languages: [yaml] severity: ERROR message: | @@ -541,7 +541,7 @@ rules: cwe: "CWE-522" category: "security" - - id: yaml-hardcoded-secret + - id: yaml-hardcoded-secret # pragma: allowlist secret languages: [yaml] severity: WARNING message: | @@ -744,10 +744,10 @@ rules: # SECTION 4: GITHUB ACTIONS SECURITY — Workflow files # ========================================================================== - - id: github-actions-hardcoded-secret + - id: github-actions-hardcoded-secret # pragma: allowlist secret languages: [yaml] severity: ERROR - message: | + message: | # pragma: allowlist secret Hardcoded secret in GitHub Actions workflow. Security Risk: Secrets in workflows are visible in git history and to all collaborators. 
@@ -1814,7 +1814,7 @@ rules: metadata: category: "security" - - id: dockerfile-secret-in-env + - id: dockerfile-secret-in-env # pragma: allowlist secret languages: [dockerfile] severity: ERROR message: | diff --git a/test/e2e/README.md b/test/e2e/README.md index 9d98c5760..33310a43a 100644 --- a/test/e2e/README.md +++ b/test/e2e/README.md @@ -135,6 +135,42 @@ The `/v1/models` endpoint implements subscription-aware model filtering: - HTTP header handling follows standards (case-insensitive) - Model metadata is accurately preserved from source +### Negative & Security Tests + +```bash +cd test/e2e +source .venv/bin/activate + +pytest tests/test_negative_security.py -v +``` + +**Test Coverage (8 tests):** + +- Header spoofing: client-injected identity headers (`X-MaaS-Username`, `X-MaaS-Group`, `X-MaaS-Key-Id`) are stripped +- Duplicate `X-MaaS-Subscription` headers don't override API key binding +- Expired API keys rejected at gateway (403) +- Cross-model access denied when subscription doesn't cover the model (403) +- AuthPolicy deletion revokes gateway access +- MaaSSubscription referencing non-existent model does not reach Active +- MaaSAuthPolicy referencing non-existent model does not reach Active +- Special characters / injection payloads in `X-MaaS-Subscription` header handled safely + +These tests validate the platform's security invariants. 
+ +### Namespace Scoping Tests + +Tests that MaaS controller and API only watch the subscription namespace: + +```bash +pytest tests/test_namespace_scoping.py -v +``` + +**Test Coverage (3 test classes):** +- MaaS API only sees subscriptions in the subscription namespace +- Controller only reconciles CRs in the subscription namespace +- AuthPolicy model ref scoping (only reconciled into the referenced model's namespace) +- Subscription model ref scoping (TRLP only created in the referenced model's namespace) + ## CI Integration These tests run automatically in CI via: @@ -146,8 +182,10 @@ The `prow_run_smoke_test.sh` script: 2. Deploys test models (free + premium simulators) 3. Runs E2E tests: - API key management (`test_api_keys.py`) - - Subscription controller (`test_subscription.py`) + - Subscription management (`test_subscription.py`) - Models endpoint (`test_models_endpoint.py`) + - Negative & security (`test_negative_security.py`) + - Namespace scoping (`test_namespace_scoping.py`) - External OIDC (`test_external_oidc.py`) when `EXTERNAL_OIDC=true` 4. Requires externally provided OIDC settings when `EXTERNAL_OIDC=true` 5. 
Runs deployment validation and token metadata verification diff --git a/test/e2e/fixtures/kustomization.yaml b/test/e2e/fixtures/kustomization.yaml index 0c91a2dc7..567fa2e14 100644 --- a/test/e2e/fixtures/kustomization.yaml +++ b/test/e2e/fixtures/kustomization.yaml @@ -13,3 +13,4 @@ resources: - unconfigured # No auth/subscription (validates 403) - distinct # Distinct model ID (validates multiple distinct models) - distinct-2 # Second distinct model ID (validates multiple distinct models) + - trlp-test # TRLP test model (validates TRLP validation behavior) diff --git a/test/e2e/fixtures/trlp-test/kustomization.yaml b/test/e2e/fixtures/trlp-test/kustomization.yaml new file mode 100644 index 000000000..b19766004 --- /dev/null +++ b/test/e2e/fixtures/trlp-test/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - llm + - maas diff --git a/test/e2e/fixtures/trlp-test/llm/kustomization.yaml b/test/e2e/fixtures/trlp-test/llm/kustomization.yaml new file mode 100644 index 000000000..bddbceadc --- /dev/null +++ b/test/e2e/fixtures/trlp-test/llm/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: llm + +resources: + - llmis.yaml diff --git a/test/e2e/fixtures/trlp-test/llm/llmis.yaml b/test/e2e/fixtures/trlp-test/llm/llmis.yaml new file mode 100644 index 000000000..722c133b6 --- /dev/null +++ b/test/e2e/fixtures/trlp-test/llm/llmis.yaml @@ -0,0 +1,65 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: LLMInferenceService +metadata: + name: e2e-trlp-test-simulated +spec: + model: + uri: hf://sshleifer/tiny-gpt2 # ~2MB test model, simulator ignores it anyway + name: test/e2e-trlp-test-model + replicas: 1 + router: + route: {} + # Connect to MaaS-enabled gateway + gateway: + refs: + - name: maas-default-gateway + namespace: openshift-ingress + template: + containers: + - name: main + image: "ghcr.io/llm-d/llm-d-inference-sim:v0.7.1" + imagePullPolicy: 
Always + command: ["/app/llm-d-inference-sim"] + args: + - --port + - "8000" + - --model + - test/e2e-trlp-test-model + - --mode + - random + - --ssl-certfile + - /var/run/kserve/tls/tls.crt + - --ssl-keyfile + - /var/run/kserve/tls/tls.key + env: + - name: POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + ports: + - name: https + containerPort: 8000 + protocol: TCP + livenessProbe: + httpGet: + path: /health + port: https + scheme: HTTPS + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + readinessProbe: + httpGet: + path: /ready + port: https + scheme: HTTPS diff --git a/test/e2e/fixtures/trlp-test/maas/kustomization.yaml b/test/e2e/fixtures/trlp-test/maas/kustomization.yaml new file mode 100644 index 000000000..6497285bc --- /dev/null +++ b/test/e2e/fixtures/trlp-test/maas/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - maas-model.yaml diff --git a/test/e2e/fixtures/trlp-test/maas/maas-model.yaml b/test/e2e/fixtures/trlp-test/maas/maas-model.yaml new file mode 100644 index 000000000..0cfc5de31 --- /dev/null +++ b/test/e2e/fixtures/trlp-test/maas/maas-model.yaml @@ -0,0 +1,13 @@ +# MaaSModelRef for the TRLP test simulator. +# Used by e2e tests to validate TRLP validation behavior (Degraded with TRLP not ready blocks inference). +# LLMIS from docs/samples/models/e2e-trlp-test-simulated (name: e2e-trlp-test-simulated in namespace llm). 
+# Serves model ID: test/e2e-trlp-test-model +apiVersion: maas.opendatahub.io/v1alpha1 +kind: MaaSModelRef +metadata: + name: e2e-trlp-test-simulated + namespace: llm +spec: + modelRef: + kind: LLMInferenceService + name: e2e-trlp-test-simulated diff --git a/test/e2e/scripts/auth_utils.sh b/test/e2e/scripts/auth_utils.sh index 5b3c59c75..39f8bf489 100755 --- a/test/e2e/scripts/auth_utils.sh +++ b/test/e2e/scripts/auth_utils.sh @@ -7,6 +7,17 @@ # artifact collection for Prow/CI. Use for diagnosing 403/401 issues, # DNS/connectivity problems, and collecting logs for analysis. # +# Collected artifacts (under $ARTIFACT_DIR): +# authorino-debug.log - Authorino pod logs (token-redacted) +# cluster-state.log - Cluster snapshot (nodes, namespaces, policies, CRs) +# maas-debug-report.log - Full MaaS debug report +# maas-crs/ - Full YAML of MaaS custom resources: +# maasmodelrefs.yaml - MaaSModelRef definitions +# maasauthpolicies.yaml - MaaSAuthPolicy definitions +# maassubscriptions.yaml - MaaSSubscription definitions +# externalmodels.yaml - ExternalModel definitions +# pod-logs/ - Per-pod logs from the deployment namespace +# # Usage: # source test/e2e/scripts/auth_utils.sh # patch_authorino_debug @@ -16,10 +27,15 @@ # ./test/e2e/scripts/auth_utils.sh # # Environment: -# DEPLOYMENT_NAMESPACE - Namespace of MaaS API and controller (default: opendatahub) -# MAAS_SUBSCRIPTION_NAMESPACE - Namespace of MaaS CRs (default: models-as-a-service) -# AUTHORINO_NAMESPACE - Namespace for Authorino (default: kuadrant-system) -# ARTIFACT_DIR - Prow artifact dir; also ARTIFACTS, LOG_DIR (default: test/e2e/reports) +# DEPLOYMENT_NAMESPACE - MaaS API and controller namespace (default: opendatahub) +# MAAS_SUBSCRIPTION_NAMESPACE - MaaS CRs namespace (default: models-as-a-service) +# AUTHORINO_NAMESPACE - Authorino namespace (default: kuadrant-system) +# OPERATOR_NAMESPACE - RHOAI operator namespace (default: redhat-ods-operator) +# APPLICATIONS_NAMESPACE - RHOAI applications namespace 
(default: redhat-ods-applications) +# GATEWAY_NAMESPACE - Gateway/ingress namespace (default: openshift-ingress) +# LLM_NAMESPACE - LLM workload namespace (default: llm) +# ISTIO_NAMESPACE - Istio/service mesh namespace (default: istio-system) +# ARTIFACT_DIR - Prow artifact dir; also ARTIFACTS, LOG_DIR (default: test/e2e/reports) # # ============================================================================= @@ -38,6 +54,11 @@ PROJECT_ROOT="$(_find_root)" DEPLOYMENT_NAMESPACE="${DEPLOYMENT_NAMESPACE:-opendatahub}" MAAS_SUBSCRIPTION_NAMESPACE="${MAAS_SUBSCRIPTION_NAMESPACE:-models-as-a-service}" AUTHORINO_NAMESPACE="${AUTHORINO_NAMESPACE:-kuadrant-system}" +OPERATOR_NAMESPACE="${OPERATOR_NAMESPACE:-redhat-ods-operator}" +APPLICATIONS_NAMESPACE="${APPLICATIONS_NAMESPACE:-redhat-ods-applications}" +GATEWAY_NAMESPACE="${GATEWAY_NAMESPACE:-openshift-ingress}" +LLM_NAMESPACE="${LLM_NAMESPACE:-llm}" +ISTIO_NAMESPACE="${ISTIO_NAMESPACE:-istio-system}" # OpenShift CI/Prow use ARTIFACT_DIR; respect ARTIFACTS_DIR if already set by caller ARTIFACTS_DIR="${ARTIFACTS_DIR:-${ARTIFACT_DIR:-${ARTIFACTS:-${LOG_DIR:-$PROJECT_ROOT/test/e2e/reports}}}}" @@ -93,7 +114,62 @@ collect_authorino_logs_redacted() { fi done done - [[ -s "$outfile" ]] && echo " Saved to $outfile" + [[ -s "$outfile" ]] && echo " Saved to $outfile" || true +} + +# ----------------------------------------------------------------------------- +# Collect full MaaS CR YAML definitions to artifact dir +# Mirrors the CRD list from red-hat-data-services/must-gather: +# gather_models_as_a_service +# ----------------------------------------------------------------------------- +MAAS_CRDS=( + "maasmodelrefs.maas.opendatahub.io" + "maasauthpolicies.maas.opendatahub.io" + "maassubscriptions.maas.opendatahub.io" + "externalmodels.maas.opendatahub.io" +) + +collect_maas_crs() { + local outdir="${1:-$ARTIFACTS_DIR/maas-crs}" + mkdir -p "$outdir" + echo "Collecting MaaS CR definitions to $outdir" + + local ns_list="" + for 
crd in "${MAAS_CRDS[@]}"; do + local nss + nss=$(kubectl get "$crd" --all-namespaces -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{end}' 2>/dev/null || true) + ns_list+=" $nss" + done + ns_list=$(echo "$ns_list" | tr ' ' '\n' | sort -u | grep -v '^$' || true) + + if [[ -z "$ns_list" ]]; then + echo " No MaaS CRs found in any namespace" + echo "No MaaS CRs found at $(date -Iseconds 2>/dev/null || date)" > "$outdir/no-crs-found.log" + return 0 + fi + + local total=0 + for crd in "${MAAS_CRDS[@]}"; do + local short_name="${crd%%.*}" + local outfile="$outdir/${short_name}.yaml" + : > "$outfile" + for ns in $ns_list; do + local yaml + yaml=$(kubectl get "$crd" -n "$ns" -o yaml 2>/dev/null || true) + if [[ -n "$yaml" ]] && ! echo "$yaml" | grep -q 'items: \[\]'; then + { + echo "# --- namespace: $ns ---" + echo "$yaml" + echo "" + } | redact_tokens >> "$outfile" + total=$((total + 1)) + fi + done + if [[ ! -s "$outfile" ]]; then + rm -f "$outfile" + fi + done + echo " Saved CRs from $(echo "$ns_list" | wc -w | tr -d ' ') namespace(s) to $outdir ($total resource group(s))" } # ----------------------------------------------------------------------------- @@ -111,6 +187,18 @@ collect_cluster_state() { echo "--- MaaS deployment namespace ($DEPLOYMENT_NAMESPACE) ---" kubectl get all -n "$DEPLOYMENT_NAMESPACE" 2>/dev/null || true echo "" + echo "--- RHOAI Operator namespace ($OPERATOR_NAMESPACE) ---" + kubectl get pods,deployments,csv -n "$OPERATOR_NAMESPACE" -o wide 2>/dev/null || true + echo "" + echo "--- RHOAI Applications namespace ($APPLICATIONS_NAMESPACE) ---" + kubectl get pods,deployments,services -n "$APPLICATIONS_NAMESPACE" -o wide 2>/dev/null || true + echo "" + echo "--- DSC / DSCI ---" + kubectl get datasciencecluster,dscinitialization -o wide 2>/dev/null || true + echo "" + echo "--- Gateway namespace ($GATEWAY_NAMESPACE) ---" + kubectl get pods,services -n "$GATEWAY_NAMESPACE" -o wide 2>/dev/null || true + echo "" echo "--- AuthPolicies ---" kubectl 
get authpolicies -A 2>/dev/null || true echo "" @@ -156,7 +244,24 @@ collect_e2e_artifacts() { echo "Artifact dir: $ARTIFACTS_DIR" collect_authorino_logs_redacted "$ARTIFACTS_DIR/authorino-debug.log" collect_cluster_state "$ARTIFACTS_DIR" - collect_namespace_pod_logs "$DEPLOYMENT_NAMESPACE" "$ARTIFACTS_DIR/pod-logs" + collect_maas_crs "$ARTIFACTS_DIR/maas-crs" + local ns + for ns in \ + "$DEPLOYMENT_NAMESPACE" \ + "$MAAS_SUBSCRIPTION_NAMESPACE" \ + "$OPERATOR_NAMESPACE" \ + "$APPLICATIONS_NAMESPACE" \ + "$AUTHORINO_NAMESPACE" \ + "$GATEWAY_NAMESPACE" \ + "$LLM_NAMESPACE" \ + "$ISTIO_NAMESPACE" \ + ; do + if kubectl get namespace "$ns" &>/dev/null; then + collect_namespace_pod_logs "$ns" "$ARTIFACTS_DIR/pod-logs/$ns" + else + echo " Skipping namespace $ns (not found)" + fi + done echo "==============================================" } @@ -320,7 +425,7 @@ EOF fi # Fallback to deployment namespace if still empty - [[ -z "$maas_api_ns" ]] && maas_api_ns="$DEPLOYMENT_NAMESPACE" + [[ -z "$maas_api_ns" ]] && maas_api_ns="$DEPLOYMENT_NAMESPACE" || true local sub_select_url="https://maas-api.${maas_api_ns}.svc.cluster.local:8443/internal/v1/subscriptions/select" _section "Subscription Selector Endpoint Validation" @@ -421,11 +526,16 @@ main() { patch_authorino_debug return 0 fi - # Default: collect artifacts, then print auth debug report + # Default: collect artifacts, then print auth debug report (also saved to file) collect_e2e_artifacts echo "" - echo "========== Auth Debug Report ==========" - run_auth_debug_report + echo "========== MaaS Debug Report ==========" + local report + report=$(run_auth_debug_report) + echo "$report" + mkdir -p "$ARTIFACTS_DIR" + echo "$report" > "$ARTIFACTS_DIR/maas-debug-report.log" + echo "MaaS debug report saved to $ARTIFACTS_DIR/maas-debug-report.log" } # Run main only when executed directly (not sourced) diff --git a/test/e2e/scripts/prow_run_smoke_test.sh b/test/e2e/scripts/prow_run_smoke_test.sh index 360eba273..aa94b62b9 100755 --- 
a/test/e2e/scripts/prow_run_smoke_test.sh +++ b/test/e2e/scripts/prow_run_smoke_test.sh @@ -275,14 +275,12 @@ deploy_models() { echo "Waiting for models to be ready (timeout: ${LLMIS_TIMEOUT}s)..." if ! oc wait llminferenceservice/facebook-opt-125m-simulated -n llm --for=condition=Ready --timeout="${LLMIS_TIMEOUT}s"; then echo "❌ ERROR: Timed out after ${LLMIS_TIMEOUT}s waiting for free simulator to be ready" - oc get llminferenceservice/facebook-opt-125m-simulated -n llm -o yaml || true - oc get events -n llm --sort-by='.lastTimestamp' || true + dump_llmis_diagnostics "facebook-opt-125m-simulated" "llm" exit 1 fi if ! oc wait llminferenceservice/premium-simulated-simulated-premium -n llm --for=condition=Ready --timeout="${LLMIS_TIMEOUT}s"; then echo "❌ ERROR: Timed out after ${LLMIS_TIMEOUT}s waiting for premium simulator to be ready" - oc get llminferenceservice/premium-simulated-simulated-premium -n llm -o yaml || true - oc get events -n llm --sort-by='.lastTimestamp' || true + dump_llmis_diagnostics "premium-simulated-simulated-premium" "llm" exit 1 fi echo "✅ Simulator models ready" @@ -453,7 +451,38 @@ setup_premium_test_token() { export E2E_TEST_TOKEN_SA_NAMESPACE="$PREMIUM_USERS_NS" export E2E_TEST_TOKEN_SA_NAME="$PREMIUM_SA" - # TODO: Add brief reconcile wait if controller is slow to pick up patches. + + # Wait for subscriptions to reconcile after patches (race condition fix) + # Subscriptions must reach Active or Degraded phase before tests start, + # otherwise the OPA rule in subscription-valid will reject empty phase. + echo "Waiting for MaaSSubscriptions to reconcile after patch (timeout: 60s)..." 
+ local timeout=60 + local deadline=$((SECONDS + timeout)) + local both_ready=false + + while [[ $SECONDS -lt $deadline ]]; do + local sim_phase premium_phase + sim_phase=$(oc get maassubscription simulator-subscription -n "$MAAS_SUBSCRIPTION_NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + premium_phase=$(oc get maassubscription premium-simulator-subscription -n "$MAAS_SUBSCRIPTION_NAMESPACE" -o jsonpath='{.status.phase}' 2>/dev/null || echo "") + + # Accept Active or Degraded (both are valid for tests) + if [[ "$sim_phase" == "Active" || "$sim_phase" == "Degraded" ]] && \ + [[ "$premium_phase" == "Active" || "$premium_phase" == "Degraded" ]]; then + echo "✅ Both subscriptions ready: simulator-subscription=$sim_phase, premium-simulator-subscription=$premium_phase" + both_ready=true + break + fi + + sleep 2 + done + + if ! $both_ready; then + echo "❌ ERROR: Subscriptions did not reach Active/Degraded phase within ${timeout}s" + echo "Subscription status:" + oc get maassubscriptions -n "$MAAS_SUBSCRIPTION_NAMESPACE" -o yaml || true + exit 1 + fi + echo "✅ Premium test token setup complete (E2E_TEST_TOKEN_SA_* exported)" } @@ -517,7 +546,7 @@ run_e2e_tests() { echo "âš ī¸ WARNING: Gateway not reachable after ${gw_timeout}s, proceeding anyway (tests may fail)" fi - # Run all e2e tests: API keys, subscription, models endpoint, and namespace scoping tests + # Run all e2e tests: API keys, namespace scoping, negative security, subscription, models endpoint if ! 
PYTHONPATH="$test_dir:${PYTHONPATH:-}" pytest \ -v --maxfail=5 --disable-warnings \ --junitxml="$xml" \ @@ -525,9 +554,10 @@ run_e2e_tests() { --capture=tee-sys --show-capture=all --log-level=INFO \ "$test_dir/tests/test_api_keys.py" \ "$test_dir/tests/test_namespace_scoping.py" \ + "$test_dir/tests/test_negative_security.py" \ "$test_dir/tests/test_subscription.py" \ "$test_dir/tests/test_models_endpoint.py" \ - "$test_dir/tests/test_external_oidc.py" ; then + "$test_dir/tests/test_external_models.py" ; then echo "❌ ERROR: E2E tests failed" exit 1 fi diff --git a/test/e2e/smoke.sh b/test/e2e/smoke.sh index 043bb5997..33a4e5faf 100755 --- a/test/e2e/smoke.sh +++ b/test/e2e/smoke.sh @@ -82,30 +82,72 @@ fi USER="$(oc whoami)" echo "[smoke] Performing smoke test for user: ${USER}" -# 1) Get OC token directly (no more /v1/tokens minting endpoint) +# 1) Get bootstrap token and mint API key for tests mkdir -p "${DIR}/reports" LOG="${DIR}/reports/smoke-${USER}.log" : > "${LOG}" -TOKEN="$(oc whoami -t || true)" -if [[ -z "${TOKEN}" ]]; then - echo "[smoke] ERROR: could not get OC token via 'oc whoami -t'" | tee -a "${LOG}" +# Get bootstrap token (cluster token used only for minting API keys) +BOOTSTRAP_TOKEN="$(oc whoami -t || true)" +if [[ -z "${BOOTSTRAP_TOKEN}" ]]; then + echo "[smoke] ERROR: could not get bootstrap token via 'oc whoami -t'" | tee -a "${LOG}" echo "[smoke] Make sure you are logged into OpenShift" | tee -a "${LOG}" exit 1 fi -export TOKEN -# Log a masked preview of the token to the log (not the console) -echo "[token] using OC token: len=$((${#TOKEN})) head=${TOKEN:0:12}â€Ļtail=${TOKEN: -8}" >> "${LOG}" +# Log token acquisition without exposing token content +echo "[bootstrap] acquired cluster token (len=${#BOOTSTRAP_TOKEN})" >> "${LOG}" -# Admin token setup - use current user if possible, add to odh-admins -setup_admin_token() { - if [[ -n "${ADMIN_OC_TOKEN:-}" ]]; then - echo "[smoke] ADMIN_OC_TOKEN already set externally" - export ADMIN_OC_TOKEN - 
return 0 +# Mint an API key using a bootstrap token +# Usage: mint_api_key [bootstrap_token] +# All logs go to stderr; only the key is written to stdout +mint_api_key() { + local key_name="${1:-e2e-smoke-key}" + local token="${2:-${BOOTSTRAP_TOKEN}}" + local response + local api_key + + # Pre-flight check for jq + if ! command -v jq >/dev/null 2>&1; then + echo "[smoke] ERROR: jq is required to mint API keys" | tee -a "${LOG}" >&2 + return 1 fi + + echo "[smoke] Minting API key '${key_name}' via ${MAAS_API_BASE_URL}/v1/api-keys..." | tee -a "${LOG}" >&2 + + if ! response=$(curl -skS --max-time 30 -X POST \ + -H "Authorization: Bearer ${token}" \ + -H "Content-Type: application/json" \ + -d "{\"name\": \"${key_name}\", \"expiresIn\": \"2h\"}" \ + "${MAAS_API_BASE_URL}/v1/api-keys" 2>&1); then + echo "[smoke] ERROR: Failed to reach ${MAAS_API_BASE_URL}/v1/api-keys" | tee -a "${LOG}" >&2 + return 1 + fi + + api_key=$(echo "${response}" | jq -r '.key // empty' 2>/dev/null || true) + + if [[ -z "${api_key}" || "${api_key}" == "null" ]]; then + echo "[smoke] ERROR: Failed to mint API key" | tee -a "${LOG}" >&2 + echo "[smoke] Response from /v1/api-keys was not parseable (may contain sensitive data)" | tee -a "${LOG}" >&2 + return 1 + fi + + echo "[smoke] Successfully minted API key (len=${#api_key})" | tee -a "${LOG}" >&2 + printf '%s\n' "${api_key}" +} +# Mint API key for tests +if ! TOKEN=$(mint_api_key "e2e-smoke-${USER}"); then + echo "[smoke] ERROR: Failed to mint API key for tests" | tee -a "${LOG}" + exit 1 +fi +export TOKEN + +# Admin token setup - add to odh-admins, then mint admin API key +setup_admin_token() { + # Clear any stale inherited value to prevent false positive admin tests + unset ADMIN_OC_TOKEN + echo "[smoke] Setting up admin token for admin tests..." 
local current_user @@ -155,13 +197,25 @@ subjects: name: odh-admins RBAC_EOF - # Use current user's token - ADMIN_OC_TOKEN="$(oc whoami -t 2>/dev/null || true)" - if [[ -n "${ADMIN_OC_TOKEN}" ]]; then + # Get admin bootstrap token + local admin_bootstrap_token + admin_bootstrap_token="$(oc whoami -t 2>/dev/null || true)" + if [[ -z "${admin_bootstrap_token}" ]]; then + echo "[smoke] Failed to get admin bootstrap token - admin tests will be skipped" + return 0 + fi + + # Mint admin API key + local admin_api_key + if ! admin_api_key=$(mint_api_key "e2e-admin-${current_user}" "${admin_bootstrap_token}"); then + echo "[smoke] Failed to mint admin API key - admin tests will be skipped" + return 0 + fi + + if [[ -n "${admin_api_key}" ]]; then + ADMIN_OC_TOKEN="${admin_api_key}" export ADMIN_OC_TOKEN - echo "[smoke] ADMIN_OC_TOKEN configured - admin tests will run" - else - echo "[smoke] Failed to get token (cert-based auth?) - admin tests will be skipped" + echo "[smoke] Admin API key minted successfully - admin tests will run" fi } diff --git a/test/e2e/tests/conftest.py b/test/e2e/tests/conftest.py index cea745cca..af1d953f2 100644 --- a/test/e2e/tests/conftest.py +++ b/test/e2e/tests/conftest.py @@ -97,12 +97,12 @@ def model_id(model_catalog: dict): @pytest.fixture(scope="session") def model_base_url(model_catalog: dict, model_id: str, gateway_url: str) -> str: items = (model_catalog.get("data") or model_catalog.get("models") or []) - first = items[0] if items else {} - url = (first or {}).get("url") - if not url: - # Build from gateway URL - url = f"{gateway_url}/llm/{model_id}" - return url.rstrip("/") + match = next((m for m in items if m.get("id") == model_id), None) + if match: + url = match.get("url") + if url: + return url.rstrip("/") + return f"{gateway_url}/llm/{model_id}".rstrip("/") @pytest.fixture(scope="session") def model_v1(model_base_url: str) -> str: diff --git a/test/e2e/tests/test_api_keys.py b/test/e2e/tests/test_api_keys.py index 
f7af8afc1..8e726f318 100644 --- a/test/e2e/tests/test_api_keys.py +++ b/test/e2e/tests/test_api_keys.py @@ -29,14 +29,39 @@ 3. Get token: export ADMIN_OC_TOKEN=$(oc create token tester-admin -n default) """ +import json import logging import os +import subprocess +import time +from datetime import datetime + import pytest import requests -import time from conftest import TLS_VERIFY -from test_subscription import SIMULATOR_SUBSCRIPTION +from test_helper import ( + MODEL_NAME, + MODEL_NAMESPACE, + MODEL_REF, + SIMULATOR_SUBSCRIPTION, + TIMEOUT, + _create_api_key, + _create_api_key_raw, + _create_sa_token, + _create_test_auth_policy, + _create_test_subscription, + _delete_cr, + _delete_sa, + _get_cr, + _maas_api_url, + _ns, + _sa_to_user, + _scale_controller_down, + _scale_controller_up, + _wait_for_maas_subscription_phase, + _wait_reconcile, +) log = logging.getLogger(__name__) @@ -50,7 +75,7 @@ def model_completions_url(model_v1: str) -> str: @pytest.fixture def inference_model_name() -> str: """Model name for inference requests. Override with INFERENCE_MODEL_NAME env var.""" - return os.environ.get("INFERENCE_MODEL_NAME", "facebook/opt-125m") + return os.environ.get("INFERENCE_MODEL_NAME", MODEL_NAME) class TestAPIKeyCRUD: @@ -310,7 +335,7 @@ class TestAPIKeyExpiration: Environment Variables: - API_KEY_MAX_EXPIRATION_DAYS: The configured max expiration in days (set on maas-api deployment). Must be explicitly set by the e2e test harness to match the maas-api deployment configuration. - Default is 30 days. Minimum is 1 day. + Default is 90 days. Minimum is 1 day. 
""" @pytest.fixture @@ -1061,3 +1086,269 @@ def test_trigger_cleanup_preserves_active_keys( assert r_get.json().get("status") == "active", \ f"Key should still be active after cleanup, got: {r_get.json().get('status')}" print(f"[cleanup] Active ephemeral key {key_id} survived cleanup (correct behavior)") + + +class TestAPIKeySubscriptionPhases: + """ + Test API key creation with subscriptions in different phases. + + Tests verify that API keys can be created for any reconciled subscription + phase (Active, Degraded, Failed, Pending), but not for unreconciled subscriptions. + + Note: Inference behavior is tested separately in test_subscription.py::TestDegradedSubscriptionFiltering + """ + + def test_create_key_for_active_subscription(self): + """API key creation succeeds for Active subscription.""" + ns = _ns() + subscription_name = "e2e-apikey-active-sub" + auth_name = "e2e-apikey-active-auth" + sa_name = "e2e-apikey-active-sa" + + try: + oc_token = _create_sa_token(sa_name, namespace="default") + sa_user = _sa_to_user(sa_name, namespace="default") + + _create_test_auth_policy(auth_name, MODEL_REF, users=[sa_user]) + _create_test_subscription(subscription_name, MODEL_REF, users=[sa_user]) + _wait_for_maas_subscription_phase(subscription_name, namespace=ns) + + cr = _get_cr("maassubscription", subscription_name, namespace=ns) + phase = cr.get("status", {}).get("phase") + assert phase == "Active", f"Expected Active, got {phase}" + + # Create API key (should succeed) + api_key = _create_api_key( + oc_token, + name="active-sub-test", + subscription=subscription_name + ) + assert api_key is not None and api_key.startswith("sk-"), \ + f"Expected valid API key, got: {api_key[:20] if api_key else None}" + log.info("✅ API key created successfully for Active subscription") + + finally: + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + + def 
test_create_key_for_degraded_subscription(self): + """API key creation succeeds for Degraded subscription.""" + ns = _ns() + subscription_name = "e2e-apikey-degraded-sub" + auth_name = "e2e-apikey-degraded-auth" + sa_name = "e2e-apikey-degraded-sa" + missing_model = "nonexistent-model-apikey" + + try: + oc_token = _create_sa_token(sa_name, namespace="default") + sa_user = _sa_to_user(sa_name, namespace="default") + + _create_test_auth_policy(auth_name, MODEL_REF, users=[sa_user]) + # Create with valid + missing model to trigger Degraded phase + _create_test_subscription( + subscription_name, + [MODEL_REF, missing_model], + users=[sa_user] + ) + _wait_reconcile(seconds=10) + + cr = _get_cr("maassubscription", subscription_name, namespace=ns) + phase = cr.get("status", {}).get("phase") + assert phase == "Degraded", f"Expected Degraded, got {phase}" + + # Create API key (should succeed) + api_key = _create_api_key( + oc_token, + name="degraded-sub-test", + subscription=subscription_name + ) + assert api_key is not None and api_key.startswith("sk-"), \ + f"Expected valid API key, got: {api_key[:20] if api_key else None}" + log.info("✅ API key created successfully for Degraded subscription") + + finally: + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + + def test_create_key_for_failed_subscription(self): + """API key creation is rejected for Failed subscription to prevent key spam.""" + ns = _ns() + subscription_name = "e2e-apikey-failed-sub" + auth_name = "e2e-apikey-failed-auth" + sa_name = "e2e-apikey-failed-sa" + + try: + oc_token = _create_sa_token(sa_name, namespace="default") + sa_user = _sa_to_user(sa_name, namespace="default") + + _create_test_auth_policy(auth_name, MODEL_REF, users=[sa_user]) + _create_test_subscription(subscription_name, MODEL_REF, users=[sa_user]) + _wait_reconcile(seconds=10) + + # Patch to Failed phase + 
patch_data = { + "status": { + "phase": "Failed", + "conditions": [{ + "type": "Ready", + "status": "False", + "reason": "Failed", + "message": "Test scenario", + "lastTransitionTime": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") + }], + "modelRefStatuses": [{ + "name": MODEL_REF, + "namespace": MODEL_NAMESPACE, + "ready": False, + "reason": "ReconcileFailed", + "message": "Test failure" + }] + } + } + + cmd = [ + "kubectl", "patch", "maassubscription", subscription_name, + "-n", ns, "--type=merge", "--subresource=status", + "-p", json.dumps(patch_data) + ] + result = subprocess.run(cmd, capture_output=True, text=True) + assert result.returncode == 0, f"Failed to patch: {result.stderr}" + + cr = _get_cr("maassubscription", subscription_name, namespace=ns) + phase = cr.get("status", {}).get("phase") + assert phase == "Failed", f"Expected Failed, got {phase}" + + # Create API key (should be rejected for Failed subscriptions) + resp = _create_api_key_raw( + oc_token, + name="failed-sub-test", + subscription=subscription_name + ) + assert resp.status_code == 403, \ + f"Expected 403 Forbidden for Failed subscription, got {resp.status_code}: {resp.text}" + log.info("✅ API key creation rejected for Failed subscription (prevents key spam)") + + finally: + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + + def test_create_key_for_pending_subscription(self): + """API key creation succeeds for Pending subscription.""" + ns = _ns() + subscription_name = "e2e-apikey-pending-sub" + auth_name = "e2e-apikey-pending-auth" + sa_name = "e2e-apikey-pending-sa" + + try: + oc_token = _create_sa_token(sa_name, namespace="default") + sa_user = _sa_to_user(sa_name, namespace="default") + + _create_test_auth_policy(auth_name, MODEL_REF, users=[sa_user]) + _create_test_subscription(subscription_name, MODEL_REF, users=[sa_user]) + 
_wait_reconcile(seconds=10) + + # Patch to Pending phase + patch_data = { + "status": { + "phase": "Pending", + "conditions": [{ + "type": "Ready", + "status": "False", + "reason": "Pending", + "message": "Reconciliation in progress", + "lastTransitionTime": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") + }], + } + } + + cmd = [ + "kubectl", "patch", "maassubscription", subscription_name, + "-n", ns, "--type=merge", "--subresource=status", + "-p", json.dumps(patch_data) + ] + result = subprocess.run(cmd, capture_output=True, text=True) + assert result.returncode == 0, f"Failed to patch: {result.stderr}" + + cr = _get_cr("maassubscription", subscription_name, namespace=ns) + phase = cr.get("status", {}).get("phase") + assert phase == "Pending", f"Expected Pending, got {phase}" + + # Create API key (should succeed) + api_key = _create_api_key( + oc_token, + name="pending-sub-test", + subscription=subscription_name + ) + assert api_key is not None and api_key.startswith("sk-"), \ + f"Expected valid API key, got: {api_key[:20] if api_key else None}" + log.info("✅ API key created successfully for Pending subscription") + + finally: + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + + def test_reject_key_for_unreconciled_subscription(self): + """ + API key creation is rejected for unreconciled subscription (empty phase). + + This test scales down the controller to ensure deterministic behavior. 
+ """ + ns = _ns() + subscription_name = "e2e-apikey-unreconciled-sub" + auth_name = "e2e-apikey-unreconciled-auth" + sa_name = "e2e-apikey-unreconciled-sa" + + try: + # Scale down controller to prevent reconciliation + _scale_controller_down() + + oc_token = _create_sa_token(sa_name, namespace="default") + sa_user = _sa_to_user(sa_name, namespace="default") + + _create_test_auth_policy(auth_name, MODEL_REF, users=[sa_user]) + # Create subscription (won't reconcile with controller scaled down) + _create_test_subscription(subscription_name, MODEL_REF, users=[sa_user]) + + # Verify subscription is unreconciled + cr = _get_cr("maassubscription", subscription_name, namespace=ns) + phase = cr.get("status", {}).get("phase", "") + assert phase == "", f"Expected empty phase, got: {phase}" + log.info("✅ Subscription is unreconciled (empty phase)") + + # Try to create API key (should fail with 400) + response = requests.post( + f"{_maas_api_url()}/v1/api-keys", + headers={ + "Authorization": f"Bearer {oc_token}", + "Content-Type": "application/json" + }, + json={ + "name": "unreconciled-sub-test", + "subscription": subscription_name + }, + timeout=TIMEOUT, + verify=TLS_VERIFY, + ) + + assert response.status_code == 400, \ + f"Expected 400 for unreconciled subscription, got {response.status_code}: {response.text}" + response_data = response.json() + assert "code" in response_data and response_data["code"] == "subscription_not_ready", \ + f"Expected subscription_not_ready error code, got: {response_data}" + log.info("✅ API key creation rejected for unreconciled subscription") + + finally: + # Scale controller back up + _scale_controller_up() + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() diff --git a/test/e2e/tests/test_external_models.py b/test/e2e/tests/test_external_models.py new file mode 100644 index 000000000..e22e4e8b2 --- /dev/null 
+++ b/test/e2e/tests/test_external_models.py @@ -0,0 +1,328 @@ +""" +E2E tests for external model (egress) support. + +Tests that MaaS can route requests to an external endpoint via ExternalModel CRD, +including reconciler resource creation, auth enforcement, and egress connectivity. + +Prerequisites: +- MaaS deployed with ExternalModel reconciler +- External endpoint accessible from the cluster (default: httpbin.org) + +Environment variables: +- E2E_EXTERNAL_ENDPOINT: External endpoint hostname (default: httpbin.org) +- E2E_EXTERNAL_SUBSCRIPTION: Subscription name (default: e2e-external-subscription) +- GATEWAY_HOST: MaaS gateway hostname (required) +""" + +import json +import logging +import os +import subprocess +import time + +import pytest +import requests + +from test_helper import ( + MODEL_NAMESPACE, + TLS_VERIFY, + _apply_cr, + _delete_cr, + _get_cr, + _wait_for_maas_auth_policy_phase, + _wait_for_maas_subscription_phase, +) + +log = logging.getLogger(__name__) + +# ─── Configuration ────────────────────────────────────────────────────────── + +EXTERNAL_ENDPOINT = os.environ.get("E2E_EXTERNAL_ENDPOINT", os.environ.get("E2E_SIMULATOR_ENDPOINT", "httpbin.org")) +SUBSCRIPTION_NAMESPACE = os.environ.get("E2E_SUBSCRIPTION_NAMESPACE", os.environ.get("MAAS_SUBSCRIPTION_NAMESPACE", "models-as-a-service")) +EXTERNAL_SUBSCRIPTION = os.environ.get("E2E_EXTERNAL_SUBSCRIPTION", "e2e-external-subscription") +EXTERNAL_AUTH_POLICY = os.environ.get("E2E_EXTERNAL_AUTH_POLICY", "e2e-external-access") +RECONCILE_WAIT = int(os.environ.get("E2E_RECONCILE_WAIT", "12")) + +EXTERNAL_MODEL_NAME = "e2e-external-model" + + +# ─── Helpers ───────────────────────────────────────────────────────────────── + +def _patch_cr(kind: str, name: str, namespace: str, patch: dict): + """Patch a Kubernetes resource.""" + subprocess.run( + ["oc", "patch", kind, name, "-n", namespace, "--type=merge", "-p", json.dumps(patch)], + capture_output=True, text=True, + ) + + + +# ─── Connectivity check 
────────────────────────────────────────────────────── + +def _check_external_endpoint_reachable(): + """Verify the external endpoint is reachable. Skip tests if not.""" + try: + r = requests.get(f"https://{EXTERNAL_ENDPOINT}/get", timeout=10, verify=False) + if r.status_code == 200: + return True + except Exception: + pass + # Try HTTP fallback + try: + r = requests.get(f"http://{EXTERNAL_ENDPOINT}/get", timeout=10) + if r.status_code == 200: + return True + except Exception: + pass + return False + + +pytestmark = pytest.mark.skipif( + not _check_external_endpoint_reachable(), + reason=f"External endpoint {EXTERNAL_ENDPOINT} is not reachable (disconnected environment?)", +) + + +# ─── Fixture: Create external model resources ──────────────────────────────── + +@pytest.fixture(scope="module") +def external_models_setup(gateway_url, headers, api_keys_base_url): + """ + Create a single ExternalModel CR, MaaSModelRef, AuthPolicy, and + Subscription pointing to an external endpoint. Cleanup after tests. 
+ """ + log.info(f"Setting up external model test fixture (endpoint: {EXTERNAL_ENDPOINT})...") + + # Create a dummy secret (ExternalModel requires credentialRef) + _apply_cr({ + "apiVersion": "v1", + "kind": "Secret", + "metadata": { + "name": f"{EXTERNAL_MODEL_NAME}-api-key", + "namespace": MODEL_NAMESPACE, + }, + "type": "Opaque", + "stringData": {"api-key": "e2e-test-key"}, + }) + + # Create ExternalModel CR + _apply_cr({ + "apiVersion": "maas.opendatahub.io/v1alpha1", + "kind": "ExternalModel", + "metadata": {"name": EXTERNAL_MODEL_NAME, "namespace": MODEL_NAMESPACE}, + "spec": { + "provider": "openai", + "targetModel": "gpt-3.5-turbo", + "endpoint": EXTERNAL_ENDPOINT, + "credentialRef": { + "name": f"{EXTERNAL_MODEL_NAME}-api-key", + "namespace": MODEL_NAMESPACE, + }, + }, + }) + + # Create MaaSModelRef + _apply_cr({ + "apiVersion": "maas.opendatahub.io/v1alpha1", + "kind": "MaaSModelRef", + "metadata": { + "name": EXTERNAL_MODEL_NAME, + "namespace": MODEL_NAMESPACE, + "annotations": { + "maas.opendatahub.io/endpoint": EXTERNAL_ENDPOINT, + "maas.opendatahub.io/provider": "openai", + }, + }, + "spec": { + "modelRef": {"kind": "ExternalModel", "name": EXTERNAL_MODEL_NAME}, + }, + }) + + # Create MaaSAuthPolicy + _apply_cr({ + "apiVersion": "maas.opendatahub.io/v1alpha1", + "kind": "MaaSAuthPolicy", + "metadata": {"name": EXTERNAL_AUTH_POLICY, "namespace": SUBSCRIPTION_NAMESPACE}, + "spec": { + "modelRefs": [{"name": EXTERNAL_MODEL_NAME, "namespace": MODEL_NAMESPACE}], + "subjects": {"groups": [{"name": "system:authenticated"}]}, + }, + }) + + # Create MaaSSubscription + _apply_cr({ + "apiVersion": "maas.opendatahub.io/v1alpha1", + "kind": "MaaSSubscription", + "metadata": {"name": EXTERNAL_SUBSCRIPTION, "namespace": SUBSCRIPTION_NAMESPACE}, + "spec": { + "owner": {"groups": [{"name": "system:authenticated"}]}, + "modelRefs": [ + { + "name": EXTERNAL_MODEL_NAME, + "namespace": MODEL_NAMESPACE, + "tokenRateLimits": [{"limit": 10000, "window": "1h"}], + }, + ], + 
}, + }) + + # Wait for CRs to reconcile + _wait_for_maas_auth_policy_phase(EXTERNAL_AUTH_POLICY, namespace=SUBSCRIPTION_NAMESPACE) + _wait_for_maas_subscription_phase(EXTERNAL_SUBSCRIPTION, namespace=SUBSCRIPTION_NAMESPACE) + + # Create API key for tests + log.info("Creating API key for external model tests...") + r = requests.post( + api_keys_base_url, + headers=headers, + json={"name": "e2e-external-model-key", "subscription": EXTERNAL_SUBSCRIPTION}, + timeout=30, + verify=TLS_VERIFY, + ) + if r.status_code not in (200, 201): + pytest.fail(f"Failed to create API key: {r.status_code} {r.text}") + + api_key = r.json().get("key") + log.info(f"API key created: {api_key[:15]}...") + + yield { + "api_key": api_key, + "gateway_url": gateway_url, + } + + # ── Cleanup ── + log.info("Cleaning up external model test fixtures...") + _delete_cr("maasauthpolicy", EXTERNAL_AUTH_POLICY, SUBSCRIPTION_NAMESPACE) + _delete_cr("maassubscription", EXTERNAL_SUBSCRIPTION, SUBSCRIPTION_NAMESPACE) + _patch_cr("maasmodelref", EXTERNAL_MODEL_NAME, MODEL_NAMESPACE, + {"metadata": {"finalizers": []}}) + _delete_cr("maasmodelref", EXTERNAL_MODEL_NAME, MODEL_NAMESPACE) + _delete_cr("externalmodel", EXTERNAL_MODEL_NAME, MODEL_NAMESPACE) + _delete_cr("secret", f"{EXTERNAL_MODEL_NAME}-api-key", MODEL_NAMESPACE) + + +# ─── Tests: Discovery ─────────────────────────────────────────────────────── + +class TestExternalModelDiscovery: + """Verify ExternalModel reconciler creates the expected Istio resources.""" + + def test_maasmodelref_created(self, external_models_setup): + """MaaSModelRef exists for the external model.""" + cr = _get_cr("maasmodelref", EXTERNAL_MODEL_NAME, MODEL_NAMESPACE) + assert cr is not None, f"MaaSModelRef {EXTERNAL_MODEL_NAME} not found" + + def test_reconciler_created_httproute(self, external_models_setup): + """Reconciler created HTTPRoute matching the ExternalModel name.""" + cr = _get_cr("httproute", EXTERNAL_MODEL_NAME, MODEL_NAMESPACE) + assert cr is not None, 
f"HTTPRoute {EXTERNAL_MODEL_NAME} not found" + + def test_reconciler_created_backend_service(self, external_models_setup): + """Reconciler created backend service.""" + cr = _get_cr("service", EXTERNAL_MODEL_NAME, MODEL_NAMESPACE) + assert cr is not None, f"Service {EXTERNAL_MODEL_NAME} not found" + + +# ─── Tests: Auth ───────────────────────────────────────────────────────────── + +class TestExternalModelAuth: + """Verify auth enforcement for external model routes.""" + + def test_invalid_key_returns_401(self, external_models_setup): + """Invalid API key returns 401/403.""" + setup = external_models_setup + url = f"{setup['gateway_url']}/{MODEL_NAMESPACE}/{EXTERNAL_MODEL_NAME}/v1/chat/completions" + headers = { + "Content-Type": "application/json", + "Authorization": "Bearer INVALID-KEY-12345", + } + body = {"model": EXTERNAL_MODEL_NAME, "messages": [{"role": "user", "content": "hello"}]} + + r = requests.post(url, headers=headers, json=body, timeout=30, verify=TLS_VERIFY) + assert r.status_code in (401, 403), f"Expected 401/403, got {r.status_code}" + + def test_no_key_returns_401(self, external_models_setup): + """No API key returns 401/403.""" + setup = external_models_setup + url = f"{setup['gateway_url']}/{MODEL_NAMESPACE}/{EXTERNAL_MODEL_NAME}/v1/chat/completions" + headers = {"Content-Type": "application/json"} + body = {"model": EXTERNAL_MODEL_NAME, "messages": [{"role": "user", "content": "hello"}]} + + r = requests.post(url, headers=headers, json=body, timeout=30, verify=TLS_VERIFY) + assert r.status_code in (401, 403), f"Expected 401/403, got {r.status_code}" + + +# ─── Tests: Egress ─────────────────────────────────────────────────────────── + +class TestExternalModelEgress: + """Verify requests are forwarded to the external endpoint.""" + + def test_request_forwarded_returns_200(self, external_models_setup): + """ + With a valid API key, the request passes auth and reaches the + external endpoint. Expect 200 confirming egress connectivity. 
+ """ + setup = external_models_setup + url = f"{setup['gateway_url']}/{MODEL_NAMESPACE}/{EXTERNAL_MODEL_NAME}/v1/chat/completions" + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {setup['api_key']}", + } + body = {"model": EXTERNAL_MODEL_NAME, "messages": [{"role": "user", "content": "hello"}]} + + r = requests.post(url, headers=headers, json=body, timeout=30, verify=TLS_VERIFY) + assert r.status_code not in (401, 403), ( + f"Request was blocked by auth (HTTP {r.status_code}). " + f"Expected the request to reach the external endpoint." + ) + # Any non-auth response confirms egress connectivity. + # httpbin.org may return 404 for unknown paths — that's fine, + # it means the request left the cluster and reached the endpoint. + log.info(f"Egress test: HTTP {r.status_code} from external endpoint") + + +# ─── Tests: Cleanup ───────────────────────────────────────────────────────── + +class TestExternalModelCleanup: + """Verify resource cleanup when external models are deleted.""" + + def test_delete_removes_httproute(self, external_models_setup): + """ + Deleting an ExternalModel removes the HTTPRoute via OwnerReference + garbage collection (ExternalModel owns all reconciled resources). 
+ """ + temp_name = "e2e-cleanup-test" + + # Create temporary model + _apply_cr({ + "apiVersion": "maas.opendatahub.io/v1alpha1", + "kind": "ExternalModel", + "metadata": {"name": temp_name, "namespace": MODEL_NAMESPACE}, + "spec": { + "provider": "openai", + "targetModel": "gpt-3.5-turbo", + "endpoint": EXTERNAL_ENDPOINT, + "credentialRef": { + "name": f"{EXTERNAL_MODEL_NAME}-api-key", + "namespace": MODEL_NAMESPACE, + }, + }, + }) + + try: + # Wait for reconciler to create resources + time.sleep(RECONCILE_WAIT * 2) + + # Verify HTTPRoute was created + route = _get_cr("httproute", temp_name, MODEL_NAMESPACE) + assert route is not None, f"HTTPRoute {temp_name} should exist before deletion" + + # Delete the ExternalModel (owns the HTTPRoute via OwnerReference) + _delete_cr("externalmodel", temp_name, MODEL_NAMESPACE) + time.sleep(RECONCILE_WAIT) + + # Verify HTTPRoute was cleaned up by garbage collection + route = _get_cr("httproute", temp_name, MODEL_NAMESPACE) + assert route is None, f"HTTPRoute {temp_name} should be cleaned up after ExternalModel deletion" + finally: + # Always clean up to avoid resource leaks + _delete_cr("externalmodel", temp_name, MODEL_NAMESPACE) diff --git a/test/e2e/tests/test_helper.py b/test/e2e/tests/test_helper.py index 32e3740bb..774eb1130 100644 --- a/test/e2e/tests/test_helper.py +++ b/test/e2e/tests/test_helper.py @@ -1,12 +1,634 @@ +""" +Shared helpers and constants for MaaS E2E tests. 
+ +This module centralizes common utilities used across multiple test files: +- Environment-based constants (timeouts, model refs, namespaces) +- Cluster authentication (OC tokens, service account tokens) +- API key management (create, revoke) +- Custom Resource management (apply, delete, get, list, snapshot) +- Inference helpers (send requests, poll for expected status) +- Wait/polling utilities (reconciliation, CR readiness, phase checks) +- CR creation helpers (MaaSAuthPolicy, MaaSSubscription) + +Environment variables (all optional unless noted): + - GATEWAY_HOST: Gateway hostname (required) + - MAAS_API_BASE_URL: MaaS API URL (auto-derived from GATEWAY_HOST if not set) + - MAAS_SUBSCRIPTION_NAMESPACE: MaaS CRs namespace (default: models-as-a-service) + - E2E_TEST_TOKEN_SA_NAMESPACE, E2E_TEST_TOKEN_SA_NAME: SA token source for Prow + - E2E_TIMEOUT: Request timeout in seconds (default: 45) + - E2E_RECONCILE_WAIT: Wait time for reconciliation in seconds (default: 8) + - E2E_SKIP_TLS_VERIFY: Set to "true" to skip TLS verification + - E2E_MODEL_PATH: Path to free model (default: /llm/facebook-opt-125m-simulated) + - E2E_MODEL_NAME: Model name for API requests (default: facebook/opt-125m) + - E2E_MODEL_REF: Model ref for CRs (default: facebook-opt-125m-simulated) + - E2E_MODEL_NAMESPACE: Namespace where models live (default: llm) + - E2E_SIMULATOR_SUBSCRIPTION: Free-tier subscription (default: simulator-subscription) + - E2E_PREMIUM_MODEL_REF: Premium model ref (default: premium-simulated-simulated-premium) + - E2E_PREMIUM_SIMULATOR_SUBSCRIPTION: Premium subscription (default: premium-simulator-subscription) + - E2E_SIMULATOR_ACCESS_POLICY: Simulator auth policy name (default: simulator-access) + - E2E_UNCONFIGURED_MODEL_REF: Unconfigured model ref (default: e2e-unconfigured-facebook-opt-125m-simulated) + - E2E_UNCONFIGURED_MODEL_PATH: Path to unconfigured model (default: /llm/e2e-unconfigured-facebook-opt-125m-simulated) + - E2E_DISTINCT_MODEL_REF: First distinct 
model ref (default: e2e-distinct-simulated) + - E2E_DISTINCT_MODEL_ID: Model ID for first distinct model (default: test/e2e-distinct-model) + - E2E_DISTINCT_MODEL_2_REF: Second distinct model ref (default: e2e-distinct-2-simulated) + - E2E_DISTINCT_MODEL_2_ID: Model ID for second distinct model (default: test/e2e-distinct-model-2) + - E2E_TRLP_TEST_MODEL_REF: TRLP test model ref (default: e2e-trlp-test-simulated) + - E2E_TRLP_TEST_MODEL_PATH: Path to TRLP test model (default: /llm/e2e-trlp-test-simulated) + - E2E_TRLP_TEST_MODEL_ID: Model ID for TRLP test model (default: test/e2e-trlp-test-model) +""" + +import base64 +import json +import logging import os +import subprocess +import time +import uuid +from typing import Optional + import requests -from conftest import TLS_VERIFY +log = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Constants (override with env vars) +# --------------------------------------------------------------------------- +TIMEOUT = int(os.environ.get("E2E_TIMEOUT", "45")) +RECONCILE_WAIT = int(os.environ.get("E2E_RECONCILE_WAIT", "8")) +TLS_VERIFY = os.environ.get("E2E_SKIP_TLS_VERIFY", "").lower() != "true" +MODEL_PATH = os.environ.get("E2E_MODEL_PATH", "/llm/facebook-opt-125m-simulated") +MODEL_NAME = os.environ.get("E2E_MODEL_NAME", "facebook/opt-125m") +MODEL_REF = os.environ.get("E2E_MODEL_REF", "facebook-opt-125m-simulated") +MODEL_NAMESPACE = os.environ.get("E2E_MODEL_NAMESPACE", "llm") +SIMULATOR_SUBSCRIPTION = os.environ.get("E2E_SIMULATOR_SUBSCRIPTION", "simulator-subscription") +PREMIUM_MODEL_REF = os.environ.get("E2E_PREMIUM_MODEL_REF", "premium-simulated-simulated-premium") +PREMIUM_SIMULATOR_SUBSCRIPTION = os.environ.get("E2E_PREMIUM_SIMULATOR_SUBSCRIPTION", "premium-simulator-subscription") +SIMULATOR_ACCESS_POLICY = os.environ.get("E2E_SIMULATOR_ACCESS_POLICY", "simulator-access") +UNCONFIGURED_MODEL_REF = os.environ.get("E2E_UNCONFIGURED_MODEL_REF", 
"e2e-unconfigured-facebook-opt-125m-simulated") +UNCONFIGURED_MODEL_PATH = os.environ.get("E2E_UNCONFIGURED_MODEL_PATH", "/llm/e2e-unconfigured-facebook-opt-125m-simulated") +DISTINCT_MODEL_REF = os.environ.get("E2E_DISTINCT_MODEL_REF", "e2e-distinct-simulated") +DISTINCT_MODEL_ID = os.environ.get("E2E_DISTINCT_MODEL_ID", "test/e2e-distinct-model") +DISTINCT_MODEL_2_REF = os.environ.get("E2E_DISTINCT_MODEL_2_REF", "e2e-distinct-2-simulated") +DISTINCT_MODEL_2_ID = os.environ.get("E2E_DISTINCT_MODEL_2_ID", "test/e2e-distinct-model-2") +TRLP_TEST_MODEL_REF = os.environ.get("E2E_TRLP_TEST_MODEL_REF", "e2e-trlp-test-simulated") +TRLP_TEST_MODEL_PATH = os.environ.get("E2E_TRLP_TEST_MODEL_PATH", "/llm/e2e-trlp-test-simulated") +TRLP_TEST_MODEL_ID = os.environ.get("E2E_TRLP_TEST_MODEL_ID", "test/e2e-trlp-test-model") + + +# --------------------------------------------------------------------------- +# Environment / URL helpers +# --------------------------------------------------------------------------- + +def _ns(): + """Default MaaS subscription namespace.""" + return os.environ.get("MAAS_SUBSCRIPTION_NAMESPACE", "models-as-a-service") + + +def _gateway_url(): + host = os.environ.get("GATEWAY_HOST", "") + if not host: + raise RuntimeError("GATEWAY_HOST env var is required") + scheme = "http" if os.environ.get("INSECURE_HTTP", "").lower() == "true" else "https" + return f"{scheme}://{host}" + + +def _maas_api_url(): + """Get the MaaS API base URL for API key operations.""" + url = os.environ.get("MAAS_API_BASE_URL", "") + if not url: + host = os.environ.get("GATEWAY_HOST", "") + if not host: + raise RuntimeError("MAAS_API_BASE_URL or GATEWAY_HOST env var is required") + scheme = "http" if os.environ.get("INSECURE_HTTP", "").lower() == "true" else "https" + url = f"{scheme}://{host}/maas-api" + return url + + +# --------------------------------------------------------------------------- +# Authentication helpers +# 
--------------------------------------------------------------------------- + +def _decode_jwt_payload(token: str) -> Optional[dict]: + """Decode JWT payload (no verification, for debugging). Returns claims dict or None.""" + try: + parts = token.split(".") + if len(parts) != 3: + return None + payload_b64 = parts[1] + payload_b64 += "=" * (4 - len(payload_b64) % 4) + payload_bytes = base64.urlsafe_b64decode(payload_b64) + return json.loads(payload_bytes) + except Exception: + return None + + +def _create_sa_token(sa_name, namespace=None, duration="10m"): + namespace = namespace or _ns() + sa_result = subprocess.run( + ["oc", "create", "sa", sa_name, "-n", namespace], capture_output=True, text=True + ) + if sa_result.returncode != 0 and "already exists" not in sa_result.stderr: + raise RuntimeError(f"Failed to create SA {sa_name}: {sa_result.stderr}") + result = subprocess.run( + ["oc", "create", "token", sa_name, "-n", namespace, f"--duration={duration}"], + capture_output=True, text=True, + ) + token = result.stdout.strip() + if not token: + raise RuntimeError(f"Could not create token for SA {sa_name}: {result.stderr}") + return token + + +def _delete_sa(sa_name, namespace=None): + """Delete a service account (best-effort, for cleanup).""" + namespace = namespace or _ns() + result = subprocess.run( + ["oc", "delete", "sa", sa_name, "-n", namespace, "--ignore-not-found"], + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode != 0: + log.warning( + "Failed to delete serviceaccount/%s in %s: %s", + sa_name, + namespace, + result.stderr.strip(), + ) + + +def _sa_to_user(sa_name, namespace=None): + """Convert service account name to Kubernetes user principal.""" + namespace = namespace or _ns() + return f"system:serviceaccount:{namespace}:{sa_name}" + + +def _get_cluster_token(): + """Get OC token for API key management operations (not for inference). + + Priority: + 1. TOKEN env var (set by prow script for regular user) + 2. 
def _get_cluster_token():
    """Get OC token for API key management operations (not for inference).

    Priority:
    1. TOKEN env var (set by prow script for regular user)
    2. E2E_TEST_TOKEN_SA_* env vars (for SA-based tokens)
    3. oc whoami -t (fallback for local testing)

    Raises:
        RuntimeError: If no token source yields a token.
    """
    env_token = os.environ.get("TOKEN", "")
    if env_token:
        log.info("Using TOKEN env var for API key operations")
        return env_token

    sa_namespace = os.environ.get("E2E_TEST_TOKEN_SA_NAMESPACE")
    sa_account = os.environ.get("E2E_TEST_TOKEN_SA_NAME")
    if sa_namespace and sa_account:
        token = _create_sa_token(sa_account, namespace=sa_namespace)
    else:
        whoami = subprocess.run(["oc", "whoami", "-t"], capture_output=True, text=True)
        token = whoami.stdout.strip() if whoami.returncode == 0 else ""
        if not token:
            raise RuntimeError("Could not get cluster token via `oc whoami -t`; run with oc login first")

    # Log non-sensitive claims only, to aid debugging without leaking the token.
    claims = _decode_jwt_payload(token)
    if claims:
        safe_keys = {k: v for k, v in claims.items() if k in ("iss", "aud", "exp", "iat")}
        log.debug("Token claims (non-sensitive): %s", json.dumps(safe_keys))
    return token
def _create_api_key_raw(oc_token: str, name: str = None, subscription: str = None):
    """Create an API key and return the raw response (for testing error cases).

    Args:
        oc_token: OC token for authentication with maas-api
        name: Optional name for the key (auto-generated if not provided)
        subscription: Optional MaaSSubscription name to bind (highest-priority auto-bind if omitted)

    Returns:
        requests.Response object
    """
    url = f"{_maas_api_url()}/v1/api-keys"
    key_name = name or f"e2e-test-{uuid.uuid4().hex[:8]}"

    body = {"name": key_name}
    if subscription:
        body["subscription"] = subscription

    return requests.post(
        url,
        headers={
            "Authorization": f"Bearer {oc_token}",
            "Content-Type": "application/json",
        },
        json=body,
        timeout=TIMEOUT,
        verify=TLS_VERIFY,
    )


def _create_api_key(oc_token: str, name: str = None, subscription: str = None) -> str:
    """Create an API key using the MaaS API and return the plaintext key.

    Args:
        oc_token: OC token for authentication with maas-api
        name: Optional name for the key (auto-generated if not provided)
        subscription: Optional MaaSSubscription name to bind (highest-priority auto-bind if omitted)

    Returns:
        The plaintext API key (sk-oai-xxx format)

    Raises:
        RuntimeError: On a non-2xx response or a response missing the 'key' field.
    """
    r = _create_api_key_raw(oc_token, name, subscription)
    if r.status_code not in (200, 201):
        raise RuntimeError(f"Failed to create API key: {r.status_code} {r.text}")

    data = r.json()
    api_key = data.get("key")
    if not api_key:
        raise RuntimeError(f"API key response missing 'key' field: {data}")

    # Fix: previously this logged the raw `name`/`subscription` arguments,
    # producing "Created API key 'None' bound to subscription 'None'" when the
    # name was auto-generated inside _create_api_key_raw.
    log.info(
        "Created API key '%s' bound to subscription '%s'",
        name or "<auto-generated>",
        subscription or "<auto-selected>",
    )
    return api_key


def _revoke_api_key(oc_token: str, key_id: str):
    """Revoke an API key (best-effort, for cleanup)."""
    url = f"{_maas_api_url()}/v1/api-keys/{key_id}"
    try:
        r = requests.delete(
            url,
            headers={"Authorization": f"Bearer {oc_token}"},
            timeout=TIMEOUT,
            verify=TLS_VERIFY,
        )
        # 404 is acceptable: the key may already be gone.
        if r.status_code not in (200, 204, 404):
            log.warning("Failed to revoke API key %s: %s %s", key_id, r.status_code, r.text[:200])
    except requests.RequestException as e:
        log.warning("Failed to revoke API key %s: %s", key_id, e)
API key %s: %s", key_id, e) + + +# --------------------------------------------------------------------------- +# CR Management +# --------------------------------------------------------------------------- + +def _apply_cr(cr_dict): + subprocess.run(["oc", "apply", "-f", "-"], input=json.dumps(cr_dict), capture_output=True, text=True, check=True) + + +def _delete_cr(kind, name, namespace=None): + namespace = namespace or _ns() + result = subprocess.run( + ["oc", "delete", kind, name, "-n", namespace, "--ignore-not-found", "--timeout=30s"], + capture_output=True, text=True, + ) + if result.returncode != 0: + log.warning("Failed to delete %s/%s in %s: %s", kind, name, namespace, result.stderr.strip()) + + +def _is_transient_kubectl_error(stderr): + """Check if kubectl error is likely transient (network, timeout).""" + transient_patterns = [ + "TLS handshake timeout", + "connection refused", + "connection reset", + "i/o timeout", + "dial tcp", + "EOF", + "temporary failure", + "network is unreachable", + ] + stderr_lower = stderr.lower() + return any(pattern.lower() in stderr_lower for pattern in transient_patterns) + + +def _is_not_found_error(stderr): + """Check if kubectl error indicates the resource was not found.""" + stderr_lower = stderr.lower() + return "notfound" in stderr_lower or "not found" in stderr_lower + + +def _get_cr(kind, name, namespace=None): + """Get a CR as dict, or None if not found. Retries on transient errors. + + Returns None only when the resource genuinely does not exist (server NotFound). + Raises RuntimeError for other failures (RBAC, missing CRD, transport errors + that persist after retries) so callers can distinguish infrastructure issues + from true absence. 
+ """ + namespace = namespace or _ns() + max_retries = 3 + retry_delay = 2 + + for attempt in range(max_retries): + result = subprocess.run(["oc", "get", kind, name, "-n", namespace, "-o", "json"], capture_output=True, text=True) + + if result.returncode == 0: + return json.loads(result.stdout) + + if attempt < max_retries - 1 and _is_transient_kubectl_error(result.stderr): + log.warning( + f"Transient kubectl error getting {kind}/{name} (attempt {attempt + 1}/{max_retries}): {result.stderr.strip()}" + ) + time.sleep(retry_delay * (attempt + 1)) + continue + + # Terminal failure — distinguish not-found from other errors + if _is_not_found_error(result.stderr): + return None + + log.error( + f"Failed to get {kind}/{name} in namespace '{namespace}' after {attempt + 1} attempts. " + f"Last error: {result.stderr.strip()}" + ) + raise RuntimeError( + f"Failed to get {kind}/{name} in namespace '{namespace}': {result.stderr.strip()}" + ) + + +def _snapshot_cr(kind, name, namespace=None): + """Capture a CR for later restoration (strips runtime metadata).""" + cr = _get_cr(kind, name, namespace) + if not cr: + return None + meta = cr.get("metadata", {}) + for key in ("resourceVersion", "uid", "creationTimestamp", "generation", "managedFields"): + meta.pop(key, None) + annotations = meta.get("annotations", {}) + annotations.pop("kubectl.kubernetes.io/last-applied-configuration", None) + if not annotations: + meta.pop("annotations", None) + cr.pop("status", None) + return cr + + +def _list_crs(kind, namespace=None): + """List all CRs of a given kind. 
def _list_crs(kind, namespace=None):
    """List all CRs of a given kind.

    Args:
        kind: CR kind (e.g., 'maasmodelref', 'maasauthpolicy')
        namespace: Namespace to search (defaults to _ns())

    Returns:
        List of CR dictionaries

    Raises:
        RuntimeError: If kubectl command fails with contextual error details
    """
    namespace = namespace or _ns()
    # Kinds whose plural is not a plain "+s".
    irregular_plurals = {
        "maasmodelref": "maasmodelrefs",
        "maasauthpolicy": "maasauthpolicies",
        "maassubscription": "maassubscriptions",
    }
    plural = irregular_plurals.get(kind, f"{kind}s")

    cmd = ["oc", "get", plural, "-n", namespace, "-o", "json"]

    max_retries = 3
    retry_delay = 2  # seconds, scaled linearly per attempt

    for attempt in range(max_retries):
        result = subprocess.run(cmd, capture_output=True, text=True, check=False)
        if result.returncode == 0:
            return json.loads(result.stdout).get("items", [])

        last_try = attempt == max_retries - 1
        if not last_try and _is_transient_kubectl_error(result.stderr):
            log.warning(
                f"Transient kubectl error (attempt {attempt + 1}/{max_retries}): {result.stderr.strip()}"
            )
            time.sleep(retry_delay * (attempt + 1))
            continue

        # Final attempt or non-transient error.
        raise RuntimeError(
            f"Failed to list {plural} in namespace '{namespace}'.\n"
            f"Command: {' '.join(cmd)}\n"
            f"Exit code: {result.returncode}\n"
            f"Stderr: {result.stderr}\n"
            f"Guidance: Ensure the CRD exists, namespace is correct, and you have permissions."
        )

    return []  # unreachable; keeps static checkers satisfied
+ + Args: + model_ref: Name of the MaaSModelRef + namespace: Namespace to search for policies (defaults to _ns()) + model_namespace: Expected namespace of the modelRef (defaults to MODEL_NAMESPACE) + + Returns: + List of auth policy names that reference the model + """ + namespace = namespace or _ns() + model_namespace = model_namespace or MODEL_NAMESPACE + policies = _list_crs("maasauthpolicy", namespace) + + matching = [] + for policy in policies: + model_refs = policy.get("spec", {}).get("modelRefs", []) + for ref in model_refs: + if isinstance(ref, dict): + ref_name = ref.get("name") + ref_ns = ref.get("namespace") + else: + ref_name = ref + ref_ns = None + if ref_name == model_ref and ref_ns == model_namespace: + matching.append(policy["metadata"]["name"]) + break + return matching + + +def _get_subscriptions_for_model(model_ref, namespace=None, model_namespace=None): + """Get all MaaSSubscriptions that reference a model. + + Args: + model_ref: Name of the MaaSModelRef + namespace: Namespace to search for subscriptions (defaults to _ns()) + model_namespace: Expected namespace of the modelRef (defaults to MODEL_NAMESPACE) + + Returns: + List of subscription names that reference the model + """ + namespace = namespace or _ns() + model_namespace = model_namespace or MODEL_NAMESPACE + subs = _list_crs("maassubscription", namespace) -TIMEOUT = (45, 45) # (connect, read) + matching = [] + for sub in subs: + model_refs = sub.get("spec", {}).get("modelRefs", []) + for ref in model_refs: + if isinstance(ref, dict): + ref_name = ref.get("name") + ref_ns = ref.get("namespace") + else: + ref_name = ref + ref_ns = None + if ref_name == model_ref and ref_ns == model_namespace: + matching.append(sub["metadata"]["name"]) + break + return matching + + +# --------------------------------------------------------------------------- +# CR Creation Helpers +# --------------------------------------------------------------------------- + +def _create_test_auth_policy(name, 
def _create_test_auth_policy(name, model_refs, users=None, groups=None, namespace=None):
    """Create a MaaSAuthPolicy CR for testing.

    Args:
        name: Name of the auth policy
        model_refs: Model ref(s) - can be string or list
        users: List of user principals (e.g., ["system:serviceaccount:ns:sa"])
        groups: List of group names (e.g., ["system:authenticated"])
        namespace: Namespace for the auth policy (defaults to _ns())
    """
    namespace = namespace or _ns()
    # Accept a single ref or a list of refs uniformly.
    refs = model_refs if isinstance(model_refs, list) else [model_refs]

    log.info("Creating MaaSAuthPolicy: %s", name)
    _apply_cr({
        "apiVersion": "maas.opendatahub.io/v1alpha1",
        "kind": "MaaSAuthPolicy",
        "metadata": {"name": name, "namespace": namespace},
        "spec": {
            "modelRefs": [{"name": r, "namespace": MODEL_NAMESPACE} for r in refs],
            "subjects": {
                "users": users or [],
                "groups": [{"name": g} for g in (groups or [])],
            },
        },
    })
def _create_test_subscription(
    name,
    model_refs,
    users=None,
    groups=None,
    token_limit=100,
    window="1m",
    namespace=None,
    priority=None,
):
    """Create a MaaSSubscription CR for testing.

    Args:
        name: Name of the subscription
        model_refs: Model ref(s) - can be string or list
        users: List of user principals
        groups: List of group names
        token_limit: Token rate limit (default: 100)
        window: Rate limit window (default: "1m")
        namespace: Namespace for the subscription (defaults to _ns())
        priority: Optional spec.priority (higher wins for default API key binding)
    """
    namespace = namespace or _ns()
    refs = model_refs if isinstance(model_refs, list) else [model_refs]

    spec = {
        "owner": {
            "users": users or [],
            "groups": [{"name": g} for g in (groups or [])],
        },
        "modelRefs": [
            {
                "name": r,
                "namespace": MODEL_NAMESPACE,
                "tokenRateLimits": [{"limit": token_limit, "window": window}],
            }
            for r in refs
        ],
    }
    if priority is not None:
        spec["priority"] = int(priority)

    log.info("Creating MaaSSubscription: %s", name)
    _apply_cr(
        {
            "apiVersion": "maas.opendatahub.io/v1alpha1",
            "kind": "MaaSSubscription",
            "metadata": {"name": name, "namespace": namespace},
            "spec": spec,
        }
    )


# ---------------------------------------------------------------------------
# Inference Helpers
# ---------------------------------------------------------------------------

def _inference(api_key, path=None, extra_headers=None, model_name=None, max_tokens=3):
    """POST completions using an API key only (subscription is bound at mint)."""
    target = f"{_gateway_url()}{path or MODEL_PATH}/v1/completions"
    request_headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
    if extra_headers:
        request_headers.update(extra_headers)
    payload = {"model": model_name or MODEL_NAME, "prompt": "Hello", "max_tokens": max_tokens}
    return requests.post(target, headers=request_headers, json=payload, timeout=TIMEOUT, verify=TLS_VERIFY)
def _poll_status(api_key, expected, path=None, extra_headers=None, model_name=None, timeout=None, poll_interval=2):
    """Poll inference endpoint until expected HTTP status or timeout."""
    timeout = timeout or max(RECONCILE_WAIT * 3, 60)
    deadline = time.time() + timeout
    last_response = None
    last_exception = None

    def _matches(status):
        # `expected` may be a single int or a collection of acceptable codes.
        return status == expected if isinstance(expected, int) else status in expected

    while time.time() < deadline:
        try:
            resp = _inference(api_key, path=path, extra_headers=extra_headers, model_name=model_name)
            last_exception = None
            if _matches(resp.status_code):
                return resp
            last_response = resp
        except requests.RequestException as exc:
            # Network hiccups are expected while policies reconcile; keep polling.
            last_exception = exc
            log.debug(f"Transient request error while polling: {exc}")
        except Exception as exc:
            log.exception(f"Non-transient error while polling, failing fast: {exc}")
            raise
        time.sleep(poll_interval)

    exp_str = expected if isinstance(expected, int) else " or ".join(str(e) for e in expected)
    err_msg = f"Expected {exp_str} within {timeout}s"
    if last_response is not None:
        err_msg += f", last status: {last_response.status_code}"
    if last_exception is not None:
        err_msg += f", last error: {last_exception}"
    if last_response is None and last_exception is None:
        err_msg += ", no response (all requests may have raised non-RequestException)"
    raise AssertionError(err_msg)


# ---------------------------------------------------------------------------
# HTTP helpers (used by test_smoke.py)
# ---------------------------------------------------------------------------

def _post(url: str, payload: dict, headers: dict, timeout_sec: int = 45) -> requests.Response:
    # NOTE(review): the middle of this function was elided in the source hunk;
    # the json/timeout/verify arguments are reconstructed from surrounding
    # context — confirm against the original file.
    return requests.post(
        url,
        headers=headers,
        json=payload,
        timeout=timeout_sec,
        verify=TLS_VERIFY,
        stream=False,
    )


def chat(prompt: str, model_v1: str, headers: dict, model_name: str):
    """POST a single-user-message chat completion to /chat/completions."""
    url = f"{model_v1}/chat/completions"
    body = {"model": model_name, "messages": [{"role": "user", "content": prompt}]}
    return requests.post(url, headers=headers, json=body, timeout=30, verify=TLS_VERIFY)
str): url = f"{model_v1}/completions" body = {"model": model_name, "prompt": prompt, "max_tokens": 16} return requests.post(url, headers=headers, json=body, timeout=30, verify=TLS_VERIFY) + + +# --------------------------------------------------------------------------- +# Wait / Polling Helpers +# --------------------------------------------------------------------------- + +def _wait_reconcile(seconds=None): + time.sleep(seconds or RECONCILE_WAIT) + + +def _wait_for_token_rate_limit_policy(model_ref, model_namespace=MODEL_NAMESPACE, timeout=60): + """Wait for TokenRateLimitPolicy to be created and enforced for a model. + + Args: + model_ref: Name of the model (e.g., "e2e-distinct-simulated") + model_namespace: Namespace where the TRLP should be created (default: MODEL_NAMESPACE) + timeout: Maximum wait time in seconds (default: 60) + + Raises: + TimeoutError: If TRLP isn't created and enforced within timeout + """ + trlp_name = f"maas-trlp-{model_ref}" + deadline = time.time() + timeout + log.info(f"Waiting for TokenRateLimitPolicy {trlp_name} in {model_namespace} (timeout: {timeout}s)...") + + while time.time() < deadline: + result = subprocess.run( + ["oc", "get", "tokenratelimitpolicy", trlp_name, "-n", model_namespace, "-o", "json"], + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode == 0: + try: + trlp = json.loads(result.stdout) + conditions = trlp.get("status", {}).get("conditions", []) + enforced = next((c for c in conditions if c.get("type") == "Enforced"), None) + if enforced and enforced.get("status") == "True": + log.info(f"TokenRateLimitPolicy {trlp_name} is enforced") + return + log.debug(f"TokenRateLimitPolicy {trlp_name} exists but not enforced yet") + except (json.JSONDecodeError, KeyError) as e: + log.debug(f"Failed to parse TRLP status: {e}") + elif _is_not_found_error(result.stderr): + log.debug(f"TokenRateLimitPolicy {trlp_name} not found yet...") + elif _is_transient_kubectl_error(result.stderr): + log.debug( + 
def _wait_for_maas_subscription_phase(name, expected_phase="Active", namespace=None, timeout=60, require_model_statuses=False):
    """Wait for MaaSSubscription to reach a specific phase.

    Args:
        name: Name of the MaaSSubscription
        expected_phase: Phase to wait for (default: "Active")
        namespace: Namespace (defaults to _ns())
        timeout: Maximum wait time in seconds (default: 60)
        require_model_statuses: If True, also requires modelRefStatuses to be populated
            (default: False). Set to True for status reporting tests.

    Returns:
        The subscription CR dict when the expected phase is reached

    Raises:
        TimeoutError: If MaaSSubscription doesn't reach expected phase within timeout
    """
    namespace = namespace or _ns()
    deadline = time.time() + timeout
    log.info(f"Waiting for MaaSSubscription {name} to reach phase '{expected_phase}' (timeout: {timeout}s)...")

    while time.time() < deadline:
        cr = _get_cr("maassubscription", name, namespace)
        if cr:
            status = cr.get("status", {})
            phase = status.get("phase")
            model_statuses = status.get("modelRefStatuses", [])

            if phase == expected_phase:
                if not require_model_statuses:
                    log.info(f"MaaSSubscription {name} reached phase '{expected_phase}'")
                    return cr
                # Phase matches, but also require one status entry per spec modelRef.
                expected_count = len(cr.get("spec", {}).get("modelRefs", []))
                if len(model_statuses) >= expected_count:
                    log.info(f"MaaSSubscription {name} reached phase '{expected_phase}' with {len(model_statuses)}/{expected_count} modelRefStatuses")
                    return cr
            log.debug(f"MaaSSubscription {name}: phase={phase}, modelRefStatuses={len(model_statuses)}")
        time.sleep(2)

    # Timeout — re-read current state so the error message aids debugging.
    cr = _get_cr("maassubscription", name, namespace)
    status = cr.get("status", {}) if cr else {}
    raise TimeoutError(
        f"MaaSSubscription {name} did not reach phase '{expected_phase}' within {timeout}s "
        f"(current: phase={status.get('phase')}, modelRefStatuses={len(status.get('modelRefStatuses', []))})"
    )
def _wait_for_subscription_trlp_status(name, expected_ready=True, namespace=None, timeout=60):
    """Wait for MaaSSubscription's TokenRateLimitPolicy status to reach expected ready state.

    Args:
        name: Name of the MaaSSubscription
        expected_ready: Expected ready state for all TRLPs (True or False)
        namespace: Namespace (defaults to _ns())
        timeout: Maximum wait time in seconds (default: 60)

    Returns:
        The subscription CR dict when all TRLPs reach the expected ready state

    Raises:
        TimeoutError: If TRLPs don't reach expected state within timeout
    """
    namespace = namespace or _ns()
    deadline = time.time() + timeout
    log.info(f"Waiting for MaaSSubscription {name} TRLP ready={expected_ready} (timeout: {timeout}s)...")

    while time.time() < deadline:
        cr = _get_cr("maassubscription", name, namespace)
        if cr:
            trlp_statuses = cr.get("status", {}).get("tokenRateLimitStatuses", [])

            if expected_ready and not trlp_statuses:
                # Expecting ready but no TRLPs reported yet — keep waiting.
                log.debug(f"MaaSSubscription {name}: waiting for TRLP statuses to appear")
            elif trlp_statuses and all(t.get("ready") == expected_ready for t in trlp_statuses):
                log.info(f"✅ MaaSSubscription {name} has {len(trlp_statuses)} TRLP(s) with ready={expected_ready}")
                return cr
            else:
                log.debug(f"MaaSSubscription {name}: TRLP statuses={trlp_statuses}")
        time.sleep(2)

    # Timeout — re-read current state so the error message aids debugging.
    cr = _get_cr("maassubscription", name, namespace)
    status = cr.get("status", {}) if cr else {}
    trlp_statuses = status.get("tokenRateLimitStatuses", [])
    raise TimeoutError(
        f"MaaSSubscription {name} TRLPs did not reach ready={expected_ready} within {timeout}s "
        f"(current TRLPs: {trlp_statuses})"
    )
def _wait_for_maas_auth_policy_phase(name, expected_phase="Active", namespace=None, timeout=60,
                                     require_auth_policies=True, require_enforced=True):
    """Wait for MaaSAuthPolicy to reach a specific phase.

    Args:
        name: Name of the MaaSAuthPolicy
        expected_phase: Phase to wait for (default: "Active")
        namespace: Namespace (defaults to _ns())
        timeout: Maximum wait time in seconds (default: 60)
        require_auth_policies: If True, requires authPolicies to be populated (default: True).
            Set to False for Failed phase with missing models.
        require_enforced: If True, requires all authPolicies to have ready=True
            (default: True). Only applies when require_auth_policies is True.

    Returns:
        The auth policy CR dict when the expected phase is reached

    Raises:
        TimeoutError: If MaaSAuthPolicy doesn't reach expected phase within timeout
    """
    namespace = namespace or _ns()
    deadline = time.time() + timeout
    log.info(f"Waiting for MaaSAuthPolicy {name} to reach phase '{expected_phase}' (timeout: {timeout}s)...")

    while time.time() < deadline:
        cr = _get_cr("maasauthpolicy", name, namespace)
        if cr:
            status = cr.get("status", {})
            phase = status.get("phase")
            auth_policies = status.get("authPolicies", [])

            if phase == expected_phase:
                if not require_auth_policies:
                    # Phase match alone is sufficient.
                    log.info(f"MaaSAuthPolicy {name} reached phase '{expected_phase}'")
                    return cr
                if auth_policies:
                    if not require_enforced:
                        log.info(f"MaaSAuthPolicy {name} reached phase '{expected_phase}' with {len(auth_policies)} auth policy status(es)")
                        return cr
                    if all(ap.get("ready") is True for ap in auth_policies):
                        log.info(f"MaaSAuthPolicy {name} reached phase '{expected_phase}' and all enforced")
                        return cr

            log.debug(f"MaaSAuthPolicy {name}: phase={phase}, authPolicies={len(auth_policies)}")
        time.sleep(2)

    # Timeout — re-read current state so the error message aids debugging.
    cr = _get_cr("maasauthpolicy", name, namespace)
    status = cr.get("status", {}) if cr else {}
    raise TimeoutError(
        f"MaaSAuthPolicy {name} did not reach phase '{expected_phase}' within {timeout}s "
        f"(current: phase={status.get('phase')}, authPolicies={len(status.get('authPolicies', []))})"
    )
# ---------------------------------------------------------------------------
# Controller scaling utilities
# ---------------------------------------------------------------------------

def _scale_controller(replicas, namespace=None, timeout=60):
    """
    Scale the maas-controller deployment.

    Args:
        replicas: Target replica count (0 to disable, 1+ to enable)
        namespace: Deployment namespace (defaults to DEPLOYMENT_NAMESPACE env or 'opendatahub')
        timeout: Max seconds to wait for scaling operation (default: 60)

    Raises:
        subprocess.CalledProcessError: If the `oc scale` command fails
        subprocess.TimeoutExpired: If the scale command itself hangs past `timeout`

    Note: scaling up is best-effort — a readiness-wait failure is logged, not
    raised (the previous docstring incorrectly advertised a TimeoutError).
    """
    namespace = namespace or os.environ.get("DEPLOYMENT_NAMESPACE", "opendatahub")

    log.info(f"Scaling maas-controller to {replicas} replicas in namespace {namespace}...")

    # Scale the deployment. Fix: dropped the unused `result` binding;
    # check=True already raises on failure.
    subprocess.run(
        ["oc", "scale", "deployment", "maas-controller",
         f"--replicas={replicas}", "-n", namespace],
        check=True,
        capture_output=True,
        text=True,
        timeout=timeout,
    )

    if replicas == 0:
        # Wait for all pods to terminate; check=False because `oc wait`
        # fails when no pods match the selector.
        log.debug(f"Waiting for maas-controller pods to terminate (timeout: {timeout}s)...")
        subprocess.run(
            ["oc", "wait", "--for=delete", "pod",
             "-l", "app=maas-controller", "-n", namespace,
             f"--timeout={timeout}s"],
            check=False,
            capture_output=True,
            text=True,
        )
        log.info("✓ maas-controller scaled down to 0 replicas")
    else:
        # Wait for pods to become ready.
        log.debug(f"Waiting for maas-controller pods to become ready (timeout: {timeout}s)...")
        try:
            subprocess.run(
                ["oc", "wait", "--for=condition=ready", "pod",
                 "-l", "app=maas-controller", "-n", namespace,
                 f"--timeout={timeout}s"],
                check=True,
                capture_output=True,
                text=True,
            )
            log.info(f"✓ maas-controller scaled to {replicas} replica(s)")
        except subprocess.CalledProcessError as e:
            # Log but don't fail — sometimes pods need extra time.
            log.warning(f"Pods may not be ready yet: {e.stderr}")
            time.sleep(5)  # Give it a bit more time


def _scale_controller_down(namespace=None, timeout=60):
    """Scale maas-controller to 0 replicas (convenience wrapper)."""
    _scale_controller(0, namespace, timeout)


def _scale_controller_up(namespace=None, timeout=60):
    """Scale maas-controller to 1 replica (convenience wrapper)."""
    _scale_controller(1, namespace, timeout)
def _scale_kuadrant_controller(replicas, namespace="kuadrant-system", timeout=60):
    """
    Scale the kuadrant-operator deployment.

    Args:
        replicas: Target replica count (0 to disable, 1+ to enable)
        namespace: Deployment namespace (default: kuadrant-system)
        timeout: Max seconds to wait for scaling operation (default: 60)

    Raises:
        subprocess.CalledProcessError: If the `oc scale` command fails, or if
            the readiness wait fails when scaling up
        subprocess.TimeoutExpired: If the scale command itself hangs past `timeout`
    """
    log.info(f"Scaling kuadrant-operator to {replicas} replicas in namespace {namespace}...")

    # Scale the deployment. Fix: dropped the unused `result` binding;
    # check=True already raises on failure.
    subprocess.run(
        ["oc", "scale", "deployment", "kuadrant-operator-controller-manager",
         f"--replicas={replicas}", "-n", namespace],
        check=True,
        capture_output=True,
        text=True,
        timeout=timeout,
    )

    if replicas == 0:
        # Wait for all pods to terminate; check=False because `oc wait`
        # fails when no pods match the selector.
        log.debug(f"Waiting for kuadrant-operator pods to terminate (timeout: {timeout}s)...")
        subprocess.run(
            ["oc", "wait", "--for=delete", "pod",
             "-l", "control-plane=controller-manager", "-n", namespace,
             f"--timeout={timeout}s"],
            check=False,
            capture_output=True,
            text=True,
        )
        log.info("✓ kuadrant-operator scaled down to 0 replicas")
    else:
        # Wait for pods to become ready; unlike _scale_controller, a readiness
        # failure here is fatal (behavior preserved from the original).
        log.debug(f"Waiting for kuadrant-operator pods to become ready (timeout: {timeout}s)...")
        try:
            subprocess.run(
                ["oc", "wait", "--for=condition=ready", "pod",
                 "-l", "control-plane=controller-manager", "-n", namespace,
                 f"--timeout={timeout}s"],
                check=True,
                capture_output=True,
                text=True,
            )
            log.info(f"✓ kuadrant-operator scaled to {replicas} replica(s)")
        except subprocess.CalledProcessError:
            log.warning(f"Pods may not be ready yet (timeout: {timeout}s)")
            raise


def _scale_kuadrant_controller_down(namespace="kuadrant-system", timeout=60):
    """Scale kuadrant-operator to 0 replicas (convenience wrapper)."""
    _scale_kuadrant_controller(0, namespace, timeout)


def _scale_kuadrant_controller_up(namespace="kuadrant-system", timeout=60):
    """Scale kuadrant-operator to 1 replica (convenience wrapper)."""
    _scale_kuadrant_controller(1, namespace, timeout)
to 1 replica (convenience wrapper).""" + _scale_kuadrant_controller(1, namespace, timeout) diff --git a/test/e2e/tests/test_models_endpoint.py b/test/e2e/tests/test_models_endpoint.py index 72981d5f5..3034dc9c9 100644 --- a/test/e2e/tests/test_models_endpoint.py +++ b/test/e2e/tests/test_models_endpoint.py @@ -4,11 +4,15 @@ Tests the /v1/models endpoint in maas-api/internal/handlers/models.go which lists available models filtered by the user's subscription access. -Requires same environment setup as test_subscription.py: +Requires: - GATEWAY_HOST env var (e.g. maas.apps.cluster.example.com) - MAAS_API_BASE_URL env var (e.g. https://maas.apps.cluster.example.com/maas-api) - maas-controller deployed with example CRs applied - oc/kubectl access to create service account tokens + +Environment variables: + See test_helper.py module docstring for shared environment variables. + This file uses no additional file-specific environment variables. """ import json @@ -16,12 +20,27 @@ import os import subprocess import time +import uuid import pytest import requests -# Import helpers from test_subscription module -from test_subscription import ( +from test_helper import ( + DISTINCT_MODEL_2_ID, + DISTINCT_MODEL_2_REF, + DISTINCT_MODEL_ID, + DISTINCT_MODEL_REF, + MODEL_NAME, + MODEL_NAMESPACE, + MODEL_REF, + PREMIUM_MODEL_REF, + PREMIUM_SIMULATOR_SUBSCRIPTION, + SIMULATOR_ACCESS_POLICY, + SIMULATOR_SUBSCRIPTION, + TIMEOUT, + TLS_VERIFY, + UNCONFIGURED_MODEL_PATH, + UNCONFIGURED_MODEL_REF, _apply_cr, _create_api_key, _create_sa_token, @@ -30,28 +49,18 @@ _delete_cr, _delete_sa, _get_auth_policies_for_model, + _get_cluster_token, _get_cr, _get_subscriptions_for_model, + _inference, _maas_api_url, _ns, _sa_to_user, _snapshot_cr, - _wait_for_maas_auth_policy_ready, - _wait_for_maas_subscription_ready, + _wait_for_maas_auth_policy_phase, + _wait_for_maas_subscription_phase, + _wait_for_token_rate_limit_policy, _wait_reconcile, - DISTINCT_MODEL_ID, - DISTINCT_MODEL_REF, - 
DISTINCT_MODEL_2_ID, - DISTINCT_MODEL_2_REF, - MODEL_NAMESPACE, - MODEL_REF, - PREMIUM_MODEL_REF, - PREMIUM_SIMULATOR_SUBSCRIPTION, - UNCONFIGURED_MODEL_REF, - SIMULATOR_ACCESS_POLICY, - SIMULATOR_SUBSCRIPTION, - TIMEOUT, - TLS_VERIFY, ) log = logging.getLogger(__name__) @@ -151,7 +160,7 @@ class TestModelsEndpoint: ═══════════════════════════════════════════════════════════════════════════ ERROR CASES (HTTP 401) - Authentication Errors ═══════════════════════════════════════════════════════════════════════════ - 18. test_unauthenticated_request_401 + 22. test_unauthenticated_request_401 → No Authorization header → 401 authentication_error """ @@ -240,6 +249,9 @@ def test_single_subscription_auto_select(self): _create_test_auth_policy(auth_policy_name, DISTINCT_MODEL_REF, users=[sa_user]) _create_test_subscription(subscription_name, DISTINCT_MODEL_REF, users=[sa_user]) + # Wait for subscription to reconcile before creating API key + _wait_for_maas_subscription_phase(subscription_name, namespace=maas_ns) + # Create API key for inference api_key = _create_api_key(sa_token, name=f"{sa_name}-key") @@ -359,7 +371,7 @@ def test_explicit_subscription_header(self): # Add SA to premium-simulator-subscription to give it access to a second subscription log.info(f"Adding {sa_user} to premium-simulator-subscription users") subprocess.run([ - "kubectl", "patch", "maassubscription", PREMIUM_SIMULATOR_SUBSCRIPTION, + "oc", "patch", "maassubscription", PREMIUM_SIMULATOR_SUBSCRIPTION, "-n", maas_ns, "--type=merge", "-p", json.dumps({"spec": {"owner": {"users": [sa_user]}}}) @@ -409,7 +421,7 @@ def test_explicit_subscription_header(self): log.info(f"Removing {sa_user} from premium-simulator-subscription users") # Get current users list, remove our SA, then patch result = subprocess.run([ - "kubectl", "get", "maassubscription", PREMIUM_SIMULATOR_SUBSCRIPTION, + "oc", "get", "maassubscription", PREMIUM_SIMULATOR_SUBSCRIPTION, "-n", maas_ns, "-o", "jsonpath={.spec.owner.users}" ], 
capture_output=True, text=True, check=True, timeout=30) @@ -417,7 +429,7 @@ def test_explicit_subscription_header(self): users = json.loads(result.stdout) if result.stdout and result.stdout.strip() else [] users = [u for u in users if u != sa_user] subprocess.run([ - "kubectl", "patch", "maassubscription", PREMIUM_SIMULATOR_SUBSCRIPTION, + "oc", "patch", "maassubscription", PREMIUM_SIMULATOR_SUBSCRIPTION, "-n", maas_ns, "--type=merge", "-p", json.dumps({"spec": {"owner": {"users": users}}}) @@ -500,7 +512,7 @@ def test_models_filtered_by_subscription(self): # Add SA to premium subscription log.info(f"Adding {sa_user} to premium-simulator-subscription") subprocess.run([ - "kubectl", "patch", "maassubscription", PREMIUM_SIMULATOR_SUBSCRIPTION, + "oc", "patch", "maassubscription", PREMIUM_SIMULATOR_SUBSCRIPTION, "-n", maas_ns, "--type=merge", "-p", json.dumps({"spec": {"owner": {"users": [sa_user]}}}) @@ -564,7 +576,7 @@ def test_models_filtered_by_subscription(self): # Cleanup if sa_user is not None: result = subprocess.run([ - "kubectl", "get", "maassubscription", PREMIUM_SIMULATOR_SUBSCRIPTION, + "oc", "get", "maassubscription", PREMIUM_SIMULATOR_SUBSCRIPTION, "-n", maas_ns, "-o", "jsonpath={.spec.owner.users}" ], capture_output=True, text=True, check=True, timeout=30) @@ -572,7 +584,7 @@ def test_models_filtered_by_subscription(self): users = json.loads(result.stdout) if result.stdout and result.stdout.strip() else [] users = [u for u in users if u != sa_user] subprocess.run([ - "kubectl", "patch", "maassubscription", PREMIUM_SIMULATOR_SUBSCRIPTION, + "oc", "patch", "maassubscription", PREMIUM_SIMULATOR_SUBSCRIPTION, "-n", maas_ns, "--type=merge", "-p", json.dumps({"spec": {"owner": {"users": users}}}) @@ -623,7 +635,7 @@ def test_deduplication_same_model_multiple_refs(self): }, } subprocess.run( - ["kubectl", "apply", "-f", "-"], + ["oc", "apply", "-f", "-"], input=json.dumps(auth_policy_cr), text=True, check=True, @@ -658,12 +670,15 @@ def 
test_deduplication_same_model_multiple_refs(self): }, } subprocess.run( - ["kubectl", "apply", "-f", "-"], + ["oc", "apply", "-f", "-"], input=json.dumps(subscription_cr), text=True, check=True, ) + # Wait for subscription to reconcile before creating API key + _wait_for_maas_subscription_phase(subscription_name, namespace=maas_ns) + # Create API key bound to our test subscription api_key_response = requests.post( f"{_maas_api_url()}/v1/api-keys", @@ -788,7 +803,7 @@ def test_different_modelrefs_same_model_id(self): }, } subprocess.run( - ["kubectl", "apply", "-f", "-"], + ["oc", "apply", "-f", "-"], input=json.dumps(auth_policy_cr), text=True, check=True, @@ -823,12 +838,15 @@ def test_different_modelrefs_same_model_id(self): }, } subprocess.run( - ["kubectl", "apply", "-f", "-"], + ["oc", "apply", "-f", "-"], input=json.dumps(subscription_cr), text=True, check=True, ) + # Wait for subscription to reconcile before creating API key + _wait_for_maas_subscription_phase(subscription_name, namespace=maas_ns) + # Create API key bound to our test subscription api_key_response = requests.post( f"{_maas_api_url()}/v1/api-keys", @@ -871,10 +889,10 @@ def test_different_modelrefs_same_model_id(self): # Both modelRefs serve the same model ID assert len(unique_ids) == 1, \ - f"Expected only 1 unique model ID (both modelRefs serve facebook/opt-125m), got {len(unique_ids)}: {unique_ids}" + f"Expected only 1 unique model ID (both modelRefs serve {MODEL_NAME}), got {len(unique_ids)}: {unique_ids}" # Verify it's the expected model ID - expected_id = "facebook/opt-125m" + expected_id = MODEL_NAME assert expected_id in unique_ids, \ f"Expected to find '{expected_id}', but got {unique_ids}" @@ -955,7 +973,7 @@ def test_multiple_distinct_models_in_subscription(self): }, } subprocess.run( - ["kubectl", "apply", "-f", "-"], + ["oc", "apply", "-f", "-"], input=json.dumps(auth_policy_cr), text=True, check=True, @@ -990,12 +1008,15 @@ def test_multiple_distinct_models_in_subscription(self): 
}, } subprocess.run( - ["kubectl", "apply", "-f", "-"], + ["oc", "apply", "-f", "-"], input=json.dumps(subscription_cr), text=True, check=True, ) + # Wait for subscription to reconcile before creating API key + _wait_for_maas_subscription_phase(subscription_name, namespace=maas_ns) + # Create API key bound to our test subscription api_key_response = requests.post( f"{_maas_api_url()}/v1/api-keys", @@ -1102,10 +1123,10 @@ def test_user_token_returns_all_models(self): _create_test_auth_policy(auth2_name, DISTINCT_MODEL_2_REF, users=[sa_user]) _create_test_subscription(sub2_name, DISTINCT_MODEL_2_REF, users=[sa_user]) - _wait_for_maas_auth_policy_ready(auth1_name) - _wait_for_maas_auth_policy_ready(auth2_name) - _wait_for_maas_subscription_ready(sub1_name) - _wait_for_maas_subscription_ready(sub2_name) + _wait_for_maas_auth_policy_phase(auth1_name) + _wait_for_maas_auth_policy_phase(auth2_name) + _wait_for_maas_subscription_phase(sub1_name) + _wait_for_maas_subscription_phase(sub2_name) # Query with user token (no X-MaaS-Subscription header) log.info("Querying /v1/models with user token (no header)") @@ -1241,6 +1262,9 @@ def test_empty_model_list(self): log.info(f"Creating subscription with {UNCONFIGURED_MODEL_REF} (no auth policy = no access)") _create_test_subscription(subscription_name, UNCONFIGURED_MODEL_REF, users=[sa_user]) + # Wait for subscription to reconcile before creating API key + _wait_for_maas_subscription_phase(subscription_name, namespace=maas_ns) + # Create API key bound to test subscription api_key = _create_api_key(sa_token, name=f"{sa_name}-key", subscription=subscription_name) @@ -1285,11 +1309,11 @@ def test_empty_model_list(self): def test_response_schema_matches_openapi(self): """ - Test 10: Response structure matches OpenAPI schema. + Test 16: Response structure matches OpenAPI schema. Validates all required fields and types match the API specification. 
""" - log.info("Test 9: Response schema matches OpenAPI spec") + log.info("Test 16: Response schema matches OpenAPI spec") sa_name = "e2e-models-schema-test-sa" sa_ns = "default" @@ -1358,11 +1382,11 @@ def test_response_schema_matches_openapi(self): def test_model_metadata_preserved(self): """ - Test 11: Model metadata is correctly preserved. + Test 17: Model metadata is correctly preserved. Validates that url, ready, created, owned_by fields are accurate. """ - log.info("Test 10: Model metadata preserved") + log.info("Test 17: Model metadata preserved") sa_name = "e2e-models-metadata-sa" sa_ns = "default" @@ -1442,6 +1466,9 @@ def test_api_key_scoped_to_subscription(self): _create_test_auth_policy(auth_policy_name, MODEL_REF, users=[sa_user]) _create_test_subscription(subscription_name, MODEL_REF, users=[sa_user]) + # Wait for subscription to reconcile before creating API key + _wait_for_maas_subscription_phase(subscription_name, namespace=ns) + # Create API key bound to subscription_name api_key = _create_api_key(oc_token, name=f"{sa_name}-key", subscription=subscription_name) @@ -1506,6 +1533,9 @@ def test_api_key_with_deleted_subscription_403(self): _create_test_auth_policy(auth_policy_name, MODEL_REF, users=[sa_user]) _create_test_subscription(subscription_name, MODEL_REF, users=[sa_user]) + # Wait for subscription to reconcile before creating API key + _wait_for_maas_subscription_phase(subscription_name, namespace=ns) + # Create API key bound to subscription api_key = _create_api_key(oc_token, name=f"{sa_name}-key", subscription=subscription_name) @@ -1773,7 +1803,9 @@ def test_api_key_ignores_subscription_header(self): _create_test_auth_policy(auth2_name, DISTINCT_MODEL_2_REF, users=[sa_user]) _create_test_subscription(sub2_name, DISTINCT_MODEL_2_REF, users=[sa_user], priority=5) - _wait_reconcile() + # Wait for both subscriptions to reconcile before creating API key + _wait_for_maas_subscription_phase(sub1_name, namespace=maas_ns) + 
_wait_for_maas_subscription_phase(sub2_name, namespace=maas_ns) # Create API key - will be bound to highest priority subscription (sub1) log.info(f"Creating API key (will bind to {sub1_name} - highest priority)") @@ -1855,7 +1887,9 @@ def test_multiple_api_keys_different_subscriptions(self): _create_test_auth_policy(auth2_name, DISTINCT_MODEL_2_REF, users=[sa_user]) _create_test_subscription(sub2_name, DISTINCT_MODEL_2_REF, users=[sa_user]) - _wait_reconcile() + # Wait for both subscriptions to reconcile before creating API keys + _wait_for_maas_subscription_phase(sub1_name, namespace=maas_ns) + _wait_for_maas_subscription_phase(sub2_name, namespace=maas_ns) # Create two API keys, each bound to a different subscription log.info(f"Creating API key 1 bound to {sub1_name}") @@ -1960,10 +1994,10 @@ def test_service_account_token_multiple_subs_no_header(self): _create_test_auth_policy(auth2_name, DISTINCT_MODEL_2_REF, users=[sa_user]) _create_test_subscription(sub2_name, DISTINCT_MODEL_2_REF, users=[sa_user]) - _wait_for_maas_auth_policy_ready(auth1_name) - _wait_for_maas_auth_policy_ready(auth2_name) - _wait_for_maas_subscription_ready(sub1_name) - _wait_for_maas_subscription_ready(sub2_name) + _wait_for_maas_auth_policy_phase(auth1_name) + _wait_for_maas_auth_policy_phase(auth2_name) + _wait_for_maas_subscription_phase(sub1_name) + _wait_for_maas_subscription_phase(sub2_name) # Query with K8s token (no header) log.info("Querying /v1/models with K8s token (no header) - should return models from both subscriptions") @@ -2027,10 +2061,10 @@ def test_service_account_token_multiple_subs_with_header(self): _create_test_auth_policy(auth2_name, DISTINCT_MODEL_2_REF, users=[sa_user]) _create_test_subscription(sub2_name, DISTINCT_MODEL_2_REF, users=[sa_user]) - _wait_for_maas_auth_policy_ready(auth1_name) - _wait_for_maas_auth_policy_ready(auth2_name) - _wait_for_maas_subscription_ready(sub1_name) - _wait_for_maas_subscription_ready(sub2_name) + 
_wait_for_maas_auth_policy_phase(auth1_name) + _wait_for_maas_auth_policy_phase(auth2_name) + _wait_for_maas_subscription_phase(sub1_name) + _wait_for_maas_subscription_phase(sub2_name) # Query with K8s token and header specifying sub1 log.info(f"Querying /v1/models with K8s token and header: {sub1_name}") @@ -2114,3 +2148,155 @@ def test_unauthenticated_request_401(self): pass log.info(f"✅ Unauthenticated request → {r.status_code}") + + def test_central_models_endpoint_exempt_from_rate_limiting(self): + """ + Test that the central /v1/models endpoint remains accessible when token quota is exhausted. + + This test validates the end-to-end flow: + 1. User exhausts token quota with inference requests (gets 429) + 2. Central /v1/models endpoint is exempt at gateway level (gateway-default-deny TRLP) + 3. Central endpoint calls model-specific /v1/models endpoints for discovery + 4. Model-specific endpoints are also exempt (per-route TRLP fix) + 5. Central endpoint successfully aggregates and returns model list + + This ensures the entire discovery chain works when quota is exhausted. + + Ref: https://issues.redhat.com/browse/RHOAIENG-46770 + """ + # Use unconfigured model to isolate this test + model_ref = UNCONFIGURED_MODEL_REF + model_path = UNCONFIGURED_MODEL_PATH + + # Create unique subscription and auth policy names + auth_policy_name = "e2e-central-models-exempt-auth" + subscription_name = "e2e-central-models-exempt-sub" + + # Very low limit for fast, deterministic test + # With 3 token limit and max_tokens=1, we're guaranteed to exhaust quota within 5 requests + # (each successful request consumes ≥1 token, so 5 requests > 3 token limit) + token_limit = 3 + window = "1m" + max_tokens = 1 + + try: + # 1. 
Create auth policy allowing system:authenticated + log.info(f"Creating auth policy for {model_ref}") + _create_test_auth_policy( + name=auth_policy_name, + model_refs=[model_ref], + groups=["system:authenticated"] + ) + _wait_reconcile() + _wait_for_maas_auth_policy_phase(auth_policy_name, timeout=90) + + # 2. Create subscription with low token limit + log.info(f"Creating subscription with {token_limit} token limit") + _create_test_subscription( + name=subscription_name, + model_refs=[model_ref], + groups=["system:authenticated"], + token_limit=token_limit, + window=window + ) + _wait_reconcile() + _wait_for_maas_subscription_phase(subscription_name, timeout=90) + + # Wait for TRLP to be created and enforced + _wait_for_token_rate_limit_policy(model_ref, model_namespace=MODEL_NAMESPACE, timeout=90) + + # 3. Create API key for this subscription + oc_token = _get_cluster_token() + api_key = _create_api_key( + oc_token, + name=f"e2e-central-exempt-{uuid.uuid4().hex[:8]}", + subscription=subscription_name, + ) + + # 4. Exhaust the token limit + # With 3 token limit and 5 requests, we're guaranteed to hit the limit + # (each successful request consumes ≥1 token, so 5 requests > 3 token limit) + max_requests = 5 + success_count = 0 + rate_limited = False + + log.info(f"Exhausting token quota: sending up to {max_requests} requests") + for i in range(max_requests): + r = _inference(api_key, path=model_path) + request_num = i + 1 + log.info(f"Request {request_num}: {r.status_code}") + + if r.status_code == 200: + success_count += 1 + elif r.status_code == 429: + log.info(f"Rate limit hit after {success_count} successful requests") + rate_limited = True + break + + # Verify we hit rate limit (otherwise test setup is broken) + assert rate_limited, \ + f"Expected to hit rate limit within {max_requests} requests with {token_limit} token limit, " \ + f"but got {success_count} successful requests without hitting limit" + + # 5. 
Verify inference is blocked + log.info("Verifying inference endpoint is blocked...") + r_inference = _inference(api_key, path=model_path) + assert r_inference.status_code == 429, \ + f"Expected 429 for inference after exhausting tokens, got {r_inference.status_code}" + log.info("✓ Inference endpoint correctly blocked with 429") + + # 6. Verify central /v1/models endpoint still works + log.info("Verifying central /v1/models endpoint is still accessible...") + url = f"{_maas_api_url()}/v1/models" + headers = {"Authorization": f"Bearer {api_key}"} + r_models = requests.get(url, headers=headers, timeout=TIMEOUT, verify=TLS_VERIFY) + + assert r_models.status_code == 200, \ + f"Expected 200 for central /v1/models endpoint even when quota exhausted, got {r_models.status_code}. " \ + f"The central /v1/models endpoint should be exempt from rate limiting (gateway-level) and " \ + f"should be able to call model-specific /v1/models endpoints (per-route exemption). " \ + f"Response: {r_models.text[:500]}" + + # 7. 
Verify response structure and contains our model + try: + models_data = r_models.json() + assert "data" in models_data, \ + f"Expected 'data' field in response, got: {list(models_data.keys())}" + + models = models_data["data"] + assert isinstance(models, list), "Expected 'data' to be a list" + + # Verify at least one model is tied to our test subscription + model_ids = [m.get("id") for m in models] + log.info(f"✅ Central /v1/models returned {len(models)} models: {model_ids}") + + # Check that at least one model belongs to our test subscription + models_in_our_subscription = [] + for model in models: + # Models have a subscriptions array with subscription info + model_subs = model.get("subscriptions", []) + for sub in model_subs: + if sub.get("name") == subscription_name: + models_in_our_subscription.append(model.get("id")) + break + + assert len(models_in_our_subscription) >= 1, \ + f"Expected at least 1 model tied to subscription '{subscription_name}', " \ + f"but found none. Returned models: {model_ids}, subscription: {subscription_name}" + + log.info(f"✓ Found {len(models_in_our_subscription)} model(s) in our subscription: {models_in_our_subscription}") + + except json.JSONDecodeError as e: + pytest.fail(f"Central /v1/models response is not valid JSON: {e}. 
Response: {r_models.text[:500]}") + + log.info("✅ Central /v1/models endpoint works correctly when quota exhausted") + log.info(" - Gateway-level exemption: ✓") + log.info(" - Model-specific endpoint exemption: ✓") + log.info(" - End-to-end discovery flow: ✓") + + finally: + # Clean up + _delete_cr("maassubscription", subscription_name) + _delete_cr("maasauthpolicy", auth_policy_name) + _wait_reconcile() + log.info("Cleaned up central models endpoint exemption test resources") diff --git a/test/e2e/tests/test_namespace_scoping.py b/test/e2e/tests/test_namespace_scoping.py index ce2df69ca..b4ba2bcd5 100644 --- a/test/e2e/tests/test_namespace_scoping.py +++ b/test/e2e/tests/test_namespace_scoping.py @@ -12,15 +12,10 @@ - LLMInferenceService deployed in llm namespace (facebook-opt-125m-simulated) - oc/kubectl access with cluster-admin or sufficient RBAC permissions -Environment variables (all optional, with defaults): - - GATEWAY_HOST: Gateway hostname (required) - - MAAS_API_BASE_URL: MaaS API URL (required) - - MAAS_SUBSCRIPTION_NAMESPACE: MaaS subscription namespace (default: models-as-a-service) - - E2E_TIMEOUT: Request timeout in seconds (default: 30) - - E2E_RECONCILE_WAIT: Wait time for controller reconciliation (default: 8) - - E2E_SKIP_TLS_VERIFY: Set to "true" to skip TLS verification - - E2E_MODEL_REF: Model ref for tests (default: facebook-opt-125m-simulated) - - E2E_MODEL_NAMESPACE: Namespace where model MaaSModelRef lives (default: llm) +Environment variables: + See test_helper.py module docstring for shared environment variables + (GATEWAY_HOST, MAAS_API_BASE_URL, MAAS_SUBSCRIPTION_NAMESPACE, etc.). + This file uses no additional file-specific environment variables. 
""" import json @@ -34,31 +29,22 @@ import pytest import requests -log = logging.getLogger(__name__) - -# Constants -TIMEOUT = int(os.environ.get("E2E_TIMEOUT", "30")) -RECONCILE_WAIT = int(os.environ.get("E2E_RECONCILE_WAIT", "8")) -TLS_VERIFY = os.environ.get("E2E_SKIP_TLS_VERIFY", "").lower() != "true" -MODEL_REF = os.environ.get("E2E_MODEL_REF", "facebook-opt-125m-simulated") -MODEL_NAMESPACE = os.environ.get("E2E_MODEL_NAMESPACE", "llm") - - -def _ns(): - """Default MaaS subscription namespace.""" - return os.environ.get("MAAS_SUBSCRIPTION_NAMESPACE", "models-as-a-service") +from test_helper import ( + MODEL_NAMESPACE, + MODEL_REF, + TIMEOUT, + TLS_VERIFY, + _apply_cr, + _delete_cr, + _get_cr, + _maas_api_url, + _ns, + _revoke_api_key, + _wait_for_maas_subscription_phase, + _wait_reconcile, +) - -def _maas_api_url(): - """MaaS API base URL.""" - url = os.environ.get("MAAS_API_BASE_URL", "") - if not url: - host = os.environ.get("GATEWAY_HOST", "") - if not host: - raise RuntimeError("MAAS_API_BASE_URL or GATEWAY_HOST env var is required") - scheme = "http" if os.environ.get("INSECURE_HTTP", "").lower() == "true" else "https" - url = f"{scheme}://{host}/maas-api" - return url +log = logging.getLogger(__name__) def _get_token(): @@ -73,8 +59,13 @@ def _get_token(): return token -def _create_api_key(name: str = None) -> tuple[str, str]: - """Create an API key and return (key_id, plaintext_key).""" +def _create_ns_api_key(name: str = None) -> tuple[str, str]: + """Create an API key and return (key_id, plaintext_key). + + Note: This differs from test_helper._create_api_key which takes an oc_token + and returns only the key string. This version manages its own token and + returns (key_id, plaintext_key) tuple for namespace scoping tests. 
+ """ token = _get_token() url = f"{_maas_api_url()}/v1/api-keys" key_name = name or f"e2e-ns-test-{uuid.uuid4().hex[:8]}" @@ -93,30 +84,10 @@ def _create_api_key(name: str = None) -> tuple[str, str]: return data.get("id"), data.get("key") -def _apply_cr(cr_dict: dict): - """Apply CR from dict.""" - subprocess.run( - ["oc", "apply", "-f", "-"], - input=json.dumps(cr_dict), - capture_output=True, - text=True, - check=True, - ) - - -def _delete_cr(kind: str, name: str, namespace: str): - """Delete CR (best effort).""" - subprocess.run( - ["oc", "delete", kind, name, "-n", namespace, "--ignore-not-found", "--timeout=30s"], - capture_output=True, - text=True, - ) - - -def _create_external_model(name: str, - namespace: str, +def _create_external_model(name: str, + namespace: str, provider: str = "openai", - endpoint: str = "test.example.com", + endpoint: str = "test.example.com", target_model: Optional[str] = None): """ Create an ExternalModel CR with the given name and namespace. Note: targetModel is required by the ExternalModel CRD. 
""" @@ -133,18 +104,6 @@ def _create_external_model(name: str, }) -def _get_cr(kind: str, name: str, namespace: str) -> Optional[dict]: - """Get CR as dict, or None if not found.""" - result = subprocess.run( - ["oc", "get", kind, name, "-n", namespace, "-o", "json"], - capture_output=True, - text=True, - ) - if result.returncode != 0: - return None - return json.loads(result.stdout) - - def _create_namespace(name: str): """Create namespace if it doesn't exist.""" result = subprocess.run( @@ -181,11 +140,6 @@ def _call_subscriptions_select(api_key: str, username: str, groups: list, reques ) -def _wait_reconcile(seconds=None): - """Wait for controller reconciliation.""" - time.sleep(seconds or RECONCILE_WAIT) - - def _get_cr_annotation(kind: str, name: str, namespace: str, key: str): """Return the annotation value for key on the CR, or \"\" if not found.""" result = subprocess.run( @@ -203,8 +157,12 @@ def _get_cr_annotation(kind: str, name: str, namespace: str, key: str): @pytest.fixture(scope="module") def api_key(): """Create an API key for tests.""" - _, key = _create_api_key("e2e-ns-scoping-key") - return key + key_id, key = _create_ns_api_key("e2e-ns-scoping-key") + try: + yield key + finally: + if key_id: + _revoke_api_key(_get_token(), key_id) class TestMaaSAPIWatchNamespace: @@ -227,7 +185,7 @@ def test_subscription_in_subscription_namespace_visible_to_api(self, api_key): "modelRefs": [{"name": MODEL_REF, "namespace": MODEL_NAMESPACE, "tokenRateLimits": [{"limit": 1, "window": "1m"}]}], }, }) - _wait_reconcile() + _wait_for_maas_subscription_phase(sub_name, "Active", namespace=ns) r = _call_subscriptions_select(api_key, "e2e-api-user", ["system:authenticated"], requested_subscription=sub_name) assert r.status_code == 200, f"subscriptions/select failed: {r.status_code} {r.text}" diff --git a/test/e2e/tests/test_negative_security.py b/test/e2e/tests/test_negative_security.py new file mode 100644 index 000000000..36c3126bd --- /dev/null +++ 
b/test/e2e/tests/test_negative_security.py @@ -0,0 +1,431 @@ +""" +Negative-path and security-oriented E2E tests for MaaS. + +Validates that the platform correctly rejects abuse scenarios: +- Header spoofing: client-supplied identity headers are stripped +- Expired API keys: rejected at gateway level +- Cross-model access: subscription-model binding enforced +- AuthPolicy removal: access revoked when policy deleted +- Missing resources: CRs referencing non-existent models + +Requires: + - GATEWAY_HOST env var + - MAAS_API_BASE_URL env var (for API key creation) + - oc/kubectl access to manage CRs + - Pre-deployed test models (free-tier simulator) + +Environment variables: + See test_helper.py module docstring for shared environment variables + (GATEWAY_HOST, MAAS_API_BASE_URL, MAAS_SUBSCRIPTION_NAMESPACE, etc.). + This file uses no additional file-specific environment variables. +""" + +import http.client +import json +import logging +import ssl +import time +import uuid +from urllib.parse import urlparse + +import pytest +import requests + +from test_helper import ( + MODEL_NAME, + MODEL_NAMESPACE, + MODEL_PATH, + MODEL_REF, + SIMULATOR_SUBSCRIPTION, + TIMEOUT, + TLS_VERIFY, + UNCONFIGURED_MODEL_PATH, + UNCONFIGURED_MODEL_REF, + _create_api_key, + _create_test_auth_policy, + _create_test_subscription, + _delete_cr, + _gateway_url, + _get_cluster_token, + _get_cr, + _inference, + _maas_api_url, + _poll_status, + _wait_for_maas_auth_policy_phase, + _wait_for_maas_subscription_phase, +) + +log = logging.getLogger(__name__) + + +# ============================================================================ +# P0: Header Spoofing Tests +# ============================================================================ + +class TestHeaderSpoofing: + """Verify that client-supplied identity headers cannot influence authorization. + + The AuthPolicy is configured to strip identity headers (X-MaaS-Username, + X-MaaS-Group, X-MaaS-Key-Id) before forwarding to the model backend. 
+ Only X-MaaS-Subscription is injected (from key-derived identity, not client). + + Security invariant: key-derived identity always wins over client-supplied headers. + """ + + def test_injected_identity_headers_ignored(self): + """Client injects X-MaaS-Username/Group/Key-Id — platform ignores them. + + Validates that Authorino strips attacker-controlled identity headers. + The request should succeed (200) using the real key-derived identity, + proving the spoofed headers had no effect on authorization. + """ + api_key = _create_api_key(_get_cluster_token(), subscription=SIMULATOR_SUBSCRIPTION) + + spoofed_headers = { + "X-MaaS-Username": "cluster-admin", + "X-MaaS-Group": "system:cluster-admins,system:masters", + "X-MaaS-Key-Id": "fake-key-id-00000", + } + + r = _inference(api_key, extra_headers=spoofed_headers) + + # Request succeeds with the REAL identity (API key owner), not the spoofed one. + # If spoofed headers were honored, the test user would gain cluster-admin access. + log.info("Spoofed identity headers -> %s", r.status_code) + assert r.status_code == 200, ( + f"Expected 200 (spoofed headers stripped, real identity used), " + f"got {r.status_code}: {r.text[:500]}" + ) + + def test_duplicate_subscription_headers_ignored(self): + """Client sends multiple X-MaaS-Subscription headers — API key binding wins. + + For API key requests, the subscription is fixed at mint time. + Duplicate or conflicting X-MaaS-Subscription headers must not override + the key-derived subscription. + """ + api_key = _create_api_key(_get_cluster_token(), subscription=SIMULATOR_SUBSCRIPTION) + + # Use http.client to send genuinely duplicate X-MaaS-Subscription headers. + # The requests library uses a dict for headers, so it cannot send two + # headers with the same name — the second value overwrites the first. 
+ gateway = _gateway_url() + parsed = urlparse(gateway) + path = f"{MODEL_PATH}/v1/completions" + body = json.dumps({"model": MODEL_NAME, "prompt": "Hello", "max_tokens": 3}) + + if parsed.scheme == "https": + ctx = ssl.create_default_context() + if not TLS_VERIFY: + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + conn = http.client.HTTPSConnection( + parsed.hostname, parsed.port or 443, timeout=TIMEOUT, context=ctx, + ) + else: + conn = http.client.HTTPConnection( + parsed.hostname, parsed.port or 80, timeout=TIMEOUT, + ) + + # Two separate X-MaaS-Subscription header lines + headers = [ + ("Authorization", f"Bearer {api_key}"), + ("Content-Type", "application/json"), + ("X-MaaS-Subscription", SIMULATOR_SUBSCRIPTION), + ("X-MaaS-Subscription", "nonexistent-fake-sub"), + ] + + conn.putrequest("POST", path) + for key, value in headers: + conn.putheader(key, value) + conn.putheader("Content-Length", str(len(body))) + conn.endheaders(body.encode()) + + resp = conn.getresponse() + status = resp.status + resp_body = resp.read().decode(errors="replace") + conn.close() + + # API key binding wins — request succeeds with key-derived subscription. + log.info("Duplicate X-MaaS-Subscription headers -> %s", status) + assert status == 200, ( + f"Expected 200 (API key subscription binding wins over duplicate headers), " + f"got {status}: {resp_body[:500]}" + ) + + +# ============================================================================ +# P1: Expired Key Rejection +# ============================================================================ + +class TestExpiredKeyRejection: + """Verify that expired API keys are rejected at the gateway.""" + + def test_expired_key_rejected_at_gateway(self): + """Create a short-lived API key, wait for expiration, assert 403. 
+ + This validates that Authorino's apiKeyValidation metadata evaluator + calls /internal/v1/api-keys/validate which returns valid=false for + expired keys, causing the auth-valid OPA rule to deny the request. + """ + oc_token = _get_cluster_token() + + # Create key with shortest supported expiration + url = f"{_maas_api_url()}/v1/api-keys" + r = requests.post( + url, + headers={"Authorization": f"Bearer {oc_token}", "Content-Type": "application/json"}, + json={ + "name": f"e2e-expired-{uuid.uuid4().hex[:8]}", + "subscription": SIMULATOR_SUBSCRIPTION, + "expiresIn": "1s", + }, + timeout=TIMEOUT, + verify=TLS_VERIFY, + ) + if r.status_code not in (200, 201): + pytest.skip(f"Could not create short-lived key: {r.status_code} {r.text}") + + expired_key = r.json().get("key") + if not expired_key: + pytest.skip("API key response missing 'key' field") + + # Wait for expiration + cache TTL propagation + time.sleep(5) + + # Expired key should be rejected at gateway + r = _poll_status(expired_key, (401, 403), timeout=30) + log.info("Expired API key -> %s", r.status_code) + assert r.status_code in (401, 403), ( + f"Expected 401 or 403 for expired key, got {r.status_code}: {r.text[:500]}" + ) + + +# ============================================================================ +# P1: Cross-Model Access +# ============================================================================ + +class TestCrossModelAccess: + """Verify subscription-model binding is enforced at gateway. + + A key bound to subscription S (which grants access to model A) must NOT + be able to access model B (not in subscription S). + """ + + def test_key_cannot_access_model_outside_subscription(self): + """Key for model A cannot infer on model B outside its subscription. + + Uses the pre-deployed unconfigured model (a model with no subscription + granting access to it) to test cross-model access denial. 
+ """ + api_key = _create_api_key(_get_cluster_token(), subscription=SIMULATOR_SUBSCRIPTION) + + # The unconfigured model exists but has no subscription granting access. + # Using the same API key (bound to simulator-subscription which covers MODEL_REF) + # should fail because the subscription doesn't cover UNCONFIGURED_MODEL_REF. + r = _inference(api_key, path=UNCONFIGURED_MODEL_PATH) + + log.info("Cross-model access (model outside subscription) -> %s", r.status_code) + assert r.status_code in (401, 403), ( + f"Expected 401 or 403 for model outside subscription scope, " + f"got {r.status_code}: {r.text[:500]}" + ) + + +# ============================================================================ +# P1: AuthPolicy Removal +# ============================================================================ + +class TestAuthPolicyRemoval: + """Verify that deleting a MaaSAuthPolicy revokes gateway access. + + When an AuthPolicy is removed, the generated Kuadrant AuthPolicy is also + deleted, and subsequent requests with the API key should be denied. + """ + + def test_authpolicy_deletion_revokes_access(self): + """Create auth policy, delete it, verify Kuadrant AuthPolicy is removed. + + Uses the unconfigured model to avoid interfering with other tests. + Creates a MaaSAuthPolicy, waits for the generated Kuadrant AuthPolicy + to appear, then deletes the MaaSAuthPolicy and verifies the controller + removes the downstream Kuadrant AuthPolicy. + + This tests the controller's cleanup logic. Gateway enforcement of + AuthPolicy is already covered by other tests (e.g. test_wrong_group_gets_403). 
+ """ + suffix = uuid.uuid4().hex[:8] + policy_name = f"e2e-neg-policy-{suffix}" + model_ref = UNCONFIGURED_MODEL_REF + kuadrant_auth_name = f"maas-auth-{model_ref}" + + try: + # Create auth policy granting access + _create_test_auth_policy( + policy_name, + model_ref, + groups=["system:authenticated"], + ) + + _wait_for_maas_auth_policy_phase(policy_name) + + # Verify Kuadrant AuthPolicy was generated + ap = _get_cr("authpolicy", kuadrant_auth_name, namespace=MODEL_NAMESPACE) + assert ap is not None, ( + f"Kuadrant AuthPolicy '{kuadrant_auth_name}' should exist after MaaSAuthPolicy creation" + ) + log.info("Kuadrant AuthPolicy %s exists in %s", kuadrant_auth_name, MODEL_NAMESPACE) + + # Delete the MaaSAuthPolicy + log.info("Deleting MaaSAuthPolicy %s", policy_name) + _delete_cr("maasauthpolicy", policy_name) + + # Poll until the Kuadrant AuthPolicy is removed by the controller + deadline = time.time() + 60 + while time.time() < deadline: + ap = _get_cr("authpolicy", kuadrant_auth_name, namespace=MODEL_NAMESPACE) + if ap is None: + break + time.sleep(2) + + assert ap is None, ( + f"Kuadrant AuthPolicy '{kuadrant_auth_name}' should be removed " + f"after MaaSAuthPolicy deletion" + ) + log.info("Kuadrant AuthPolicy %s removed after MaaSAuthPolicy deletion", kuadrant_auth_name) + + finally: + _delete_cr("maasauthpolicy", policy_name) + + +# ============================================================================ +# P2: Missing MaaSModelRef References +# ============================================================================ + +class TestMissingModelRef: + """Verify CRs don't generate gateway resources for non-existent MaaSModelRefs. + + Uses a Degraded/partial approach: each CR references one valid model + (MODEL_REF) and one ghost model. The CR reaches Degraded phase, proving + the controller processed it successfully. We then verify that downstream + Kuadrant resources were created only for the valid model, not the ghost. 
+ + This is stronger than testing with all-ghost models (which just go Failed), + because it proves the controller selectively generates resources per model + rather than failing early before resource generation. + """ + + def test_subscription_with_nonexistent_model_ref(self): + """MaaSSubscription generates TRLP only for valid model, not ghost model. + + Creates a subscription referencing one valid model and one ghost model, + waits for Degraded phase, then asserts that a TRLP exists for the valid + model but not for the ghost model. + """ + suffix = uuid.uuid4().hex[:8] + sub_name = f"e2e-neg-ghost-sub-{suffix}" + auth_name = f"e2e-neg-ghost-sub-auth-{suffix}" + ghost_model = f"nonexistent-model-{suffix}" + + try: + _create_test_auth_policy(auth_name, MODEL_REF, groups=["system:authenticated"]) + _create_test_subscription( + sub_name, + [MODEL_REF, ghost_model], + groups=["system:authenticated"], + ) + + _wait_for_maas_subscription_phase(sub_name, "Degraded", timeout=60) + + # No TRLP should exist for the ghost model + ghost_trlp_name = f"maas-trlp-{ghost_model}" + ghost_trlp = _get_cr("tokenratelimitpolicy", ghost_trlp_name, namespace=MODEL_NAMESPACE) + log.info("Ghost model TRLP exists: %s", ghost_trlp is not None) + assert ghost_trlp is None, ( + f"TokenRateLimitPolicy '{ghost_trlp_name}' should not exist for non-existent model" + ) + + # TRLP should exist for the valid model + valid_trlp_name = f"maas-trlp-{MODEL_REF}" + valid_trlp = _get_cr("tokenratelimitpolicy", valid_trlp_name, namespace=MODEL_NAMESPACE) + log.info("Valid model TRLP exists: %s", valid_trlp is not None) + assert valid_trlp is not None, ( + f"TokenRateLimitPolicy '{valid_trlp_name}' should exist for valid model" + ) + + finally: + _delete_cr("maassubscription", sub_name) + _delete_cr("maasauthpolicy", auth_name) + + def test_authpolicy_with_nonexistent_model_ref(self): + """MaaSAuthPolicy generates AuthPolicy only for valid model, not ghost model. 
+ + Creates an auth policy referencing one valid model and one ghost model, + waits for Degraded phase, then asserts that a Kuadrant AuthPolicy exists + for the valid model but not for the ghost model. + """ + suffix = uuid.uuid4().hex[:8] + policy_name = f"e2e-neg-ghost-policy-{suffix}" + ghost_model = f"nonexistent-model-{suffix}" + + try: + _create_test_auth_policy( + policy_name, + [MODEL_REF, ghost_model], + groups=["system:authenticated"], + ) + + _wait_for_maas_auth_policy_phase(policy_name, "Degraded", timeout=60, require_auth_policies=False) + + # No AuthPolicy should exist for the ghost model + ghost_auth_name = f"maas-auth-{ghost_model}" + ghost_ap = _get_cr("authpolicy", ghost_auth_name, namespace=MODEL_NAMESPACE) + log.info("Ghost model AuthPolicy exists: %s", ghost_ap is not None) + assert ghost_ap is None, ( + f"AuthPolicy '{ghost_auth_name}' should not exist for non-existent model" + ) + + # AuthPolicy should exist for the valid model + valid_auth_name = f"maas-auth-{MODEL_REF}" + valid_ap = _get_cr("authpolicy", valid_auth_name, namespace=MODEL_NAMESPACE) + log.info("Valid model AuthPolicy exists: %s", valid_ap is not None) + assert valid_ap is not None, ( + f"AuthPolicy '{valid_auth_name}' should exist for valid model" + ) + + finally: + _delete_cr("maasauthpolicy", policy_name) + + +# ============================================================================ +# P2: Header Abuse +# ============================================================================ + +class TestHeaderAbuse: + """Verify malicious header values are handled safely.""" + + def test_special_characters_in_subscription_header(self): + """Injection-style characters in X-MaaS-Subscription header. + + Ensures the platform returns a clean 403 (subscription not found) + without leaking errors, stack traces, or SQL/NoSQL injection. 
+ """ + api_key = _create_api_key(_get_cluster_token(), subscription=SIMULATOR_SUBSCRIPTION) + + injection_payloads = [ + "'; DROP TABLE subscriptions; --", + '{"$gt": ""}', + "../../../etc/passwd", + "", + ] + + for payload in injection_payloads: + r = _inference(api_key, extra_headers={"X-MaaS-Subscription": payload}) + + log.info("Injection payload %r -> %s", payload, r.status_code) + # API key binding wins — request should succeed (200) because + # the spoofed header is ignored for API key requests. + # If the platform processes the header, it should return 403, not 500. + assert r.status_code != 500, ( + f"Server error with injection payload '{payload}': {r.text[:500]}" + ) diff --git a/test/e2e/tests/test_subscription.py b/test/e2e/tests/test_subscription.py index b063eb25d..026c5a602 100644 --- a/test/e2e/tests/test_subscription.py +++ b/test/e2e/tests/test_subscription.py @@ -33,34 +33,14 @@ - maas-controller deployed with example CRs applied - oc/kubectl access to create service account tokens (for API key creation) -Environment variables (all optional, with defaults): - - GATEWAY_HOST: Gateway hostname (required) - - MAAS_API_BASE_URL: MaaS API URL (required for API key creation) - - MAAS_SUBSCRIPTION_NAMESPACE: MaaS CRs namespace (default: models-as-a-service) - - E2E_TEST_TOKEN_SA_NAMESPACE, E2E_TEST_TOKEN_SA_NAME: When set, use this SA token - instead of oc whoami -t (e.g. 
for Prow where oc whoami -t is unavailable) - - E2E_TIMEOUT: Request timeout in seconds (default: 30) - - E2E_RECONCILE_WAIT: Wait time for reconciliation in seconds (default: 8) - - E2E_MODEL_PATH: Path to free model (default: /llm/facebook-opt-125m-simulated) - - E2E_PREMIUM_MODEL_PATH: Path to premium model (default: /llm/premium-simulated-simulated-premium) - - E2E_MODEL_NAME: Model name for API requests (default: facebook/opt-125m) - - E2E_MODEL_REF: Model ref for CRs (default: facebook-opt-125m-simulated) - - E2E_PREMIUM_MODEL_REF: Premium model ref for CRs (default: premium-simulated-simulated-premium) - - E2E_UNCONFIGURED_MODEL_REF: Unconfigured model ref (default: e2e-unconfigured-facebook-opt-125m-simulated) - - E2E_UNCONFIGURED_MODEL_PATH: Path to unconfigured model (default: /llm/e2e-unconfigured-facebook-opt-125m-simulated) - - E2E_DISTINCT_MODEL_REF: First distinct model ref for multi-model tests (default: e2e-distinct-simulated) - - E2E_DISTINCT_MODEL_PATH: Path to first distinct model (default: /llm/e2e-distinct-simulated) - - E2E_DISTINCT_MODEL_ID: Model ID served by first distinct model (default: test/e2e-distinct-model) - - E2E_DISTINCT_MODEL_2_REF: Second distinct model ref for multi-model tests (default: e2e-distinct-2-simulated) - - E2E_DISTINCT_MODEL_2_PATH: Path to second distinct model (default: /llm/e2e-distinct-2-simulated) - - E2E_DISTINCT_MODEL_2_ID: Model ID served by second distinct model (default: test/e2e-distinct-model-2) - - E2E_SIMULATOR_SUBSCRIPTION: Free-tier subscription (default: simulator-subscription) - - E2E_PREMIUM_SIMULATOR_SUBSCRIPTION: Premium-tier subscription (default: premium-simulator-subscription) - - E2E_SIMULATOR_ACCESS_POLICY: Simulator auth policy name (default: simulator-access) - - E2E_INVALID_SUBSCRIPTION: Invalid subscription name for 403 test (default: nonexistent-sub) +Environment variables: + See test_helper.py module docstring for shared environment variables + (GATEWAY_HOST, MAAS_API_BASE_URL, 
MAAS_SUBSCRIPTION_NAMESPACE, etc.). + + File-specific variables (all optional, with defaults): + - E2E_PREMIUM_MODEL_PATH: Gateway path for premium model (default: /llm/premium-simulated-simulated-premium) """ -import base64 import copy import json import logging @@ -68,39 +48,59 @@ import subprocess import time import uuid -from typing import Optional from urllib.parse import urlparse import pytest import requests +from test_helper import ( + MODEL_NAME, + MODEL_NAMESPACE, + MODEL_PATH, + MODEL_REF, + PREMIUM_MODEL_REF, + SIMULATOR_ACCESS_POLICY, + SIMULATOR_SUBSCRIPTION, + TIMEOUT, + TLS_VERIFY, + TRLP_TEST_MODEL_REF, + TRLP_TEST_MODEL_PATH, + TRLP_TEST_MODEL_ID, + UNCONFIGURED_MODEL_PATH, + UNCONFIGURED_MODEL_REF, + _apply_cr, + _create_api_key, + _create_sa_token, + _create_test_auth_policy, + _create_test_subscription, + _delete_cr, + _delete_sa, + _gateway_url, + _get_auth_policies_for_model, + _get_cluster_token, + _get_cr, + _get_subscriptions_for_model, + _inference, + _maas_api_url, + _ns, + _poll_status, + _revoke_api_key, + _sa_to_user, + _snapshot_cr, + _wait_for_maas_auth_policy_phase, + _wait_for_maas_subscription_phase, + _wait_for_token_rate_limit_policy, + _scale_kuadrant_controller_down, + _scale_kuadrant_controller_up, + _wait_for_subscription_trlp_status, + _wait_reconcile, +) + log = logging.getLogger(__name__) -# Constants (override with env vars) -TIMEOUT = int(os.environ.get("E2E_TIMEOUT", "30")) -RECONCILE_WAIT = int(os.environ.get("E2E_RECONCILE_WAIT", "8")) -TLS_VERIFY = os.environ.get("E2E_SKIP_TLS_VERIFY", "").lower() != "true" -MODEL_PATH = os.environ.get("E2E_MODEL_PATH", "/llm/facebook-opt-125m-simulated") +# Constants specific to test_subscription.py (not shared) PREMIUM_MODEL_PATH = os.environ.get("E2E_PREMIUM_MODEL_PATH", "/llm/premium-simulated-simulated-premium") -MODEL_NAME = os.environ.get("E2E_MODEL_NAME", "facebook/opt-125m") -MODEL_REF = os.environ.get("E2E_MODEL_REF", "facebook-opt-125m-simulated") -PREMIUM_MODEL_REF = 
os.environ.get("E2E_PREMIUM_MODEL_REF", "premium-simulated-simulated-premium") -MODEL_NAMESPACE = os.environ.get("E2E_MODEL_NAMESPACE", "llm") -UNCONFIGURED_MODEL_REF = os.environ.get("E2E_UNCONFIGURED_MODEL_REF", "e2e-unconfigured-facebook-opt-125m-simulated") -UNCONFIGURED_MODEL_PATH = os.environ.get("E2E_UNCONFIGURED_MODEL_PATH", "/llm/e2e-unconfigured-facebook-opt-125m-simulated") -DISTINCT_MODEL_REF = os.environ.get("E2E_DISTINCT_MODEL_REF", "e2e-distinct-simulated") -DISTINCT_MODEL_PATH = os.environ.get("E2E_DISTINCT_MODEL_PATH", "/llm/e2e-distinct-simulated") -DISTINCT_MODEL_ID = os.environ.get("E2E_DISTINCT_MODEL_ID", "test/e2e-distinct-model") -DISTINCT_MODEL_2_REF = os.environ.get("E2E_DISTINCT_MODEL_2_REF", "e2e-distinct-2-simulated") -DISTINCT_MODEL_2_PATH = os.environ.get("E2E_DISTINCT_MODEL_2_PATH", "/llm/e2e-distinct-2-simulated") -DISTINCT_MODEL_2_ID = os.environ.get("E2E_DISTINCT_MODEL_2_ID", "test/e2e-distinct-model-2") -SIMULATOR_SUBSCRIPTION = os.environ.get("E2E_SIMULATOR_SUBSCRIPTION", "simulator-subscription") -PREMIUM_SIMULATOR_SUBSCRIPTION = os.environ.get( - "E2E_PREMIUM_SIMULATOR_SUBSCRIPTION", "premium-simulator-subscription" -) -SIMULATOR_ACCESS_POLICY = os.environ.get("E2E_SIMULATOR_ACCESS_POLICY", "simulator-access") -INVALID_SUBSCRIPTION = os.environ.get("E2E_INVALID_SUBSCRIPTION", "nonexistent-sub") # Generated resource names (for TestManagedAnnotation) AUTH_POLICY_NAME = f"maas-auth-{MODEL_REF}" @@ -108,156 +108,6 @@ MANAGED_ANNOTATION = "opendatahub.io/managed" -def _ns(): - return os.environ.get("MAAS_SUBSCRIPTION_NAMESPACE", "models-as-a-service") - - -def _gateway_url(): - host = os.environ.get("GATEWAY_HOST", "") - if not host: - raise RuntimeError("GATEWAY_HOST env var is required") - scheme = "http" if os.environ.get("INSECURE_HTTP", "").lower() == "true" else "https" - return f"{scheme}://{host}" - - -def _maas_api_url(): - """Get the MaaS API base URL for API key operations.""" - url = os.environ.get("MAAS_API_BASE_URL", 
"") - if not url: - # Derive from GATEWAY_HOST if MAAS_API_BASE_URL not set - host = os.environ.get("GATEWAY_HOST", "") - if not host: - raise RuntimeError("MAAS_API_BASE_URL or GATEWAY_HOST env var is required") - scheme = "http" if os.environ.get("INSECURE_HTTP", "").lower() == "true" else "https" - url = f"{scheme}://{host}/maas-api" - return url - - -# Used for debugging -def _decode_jwt_payload(token: str) -> Optional[dict]: - """Decode JWT payload (no verification, for debugging). Returns claims dict or None.""" - try: - parts = token.split(".") - if len(parts) != 3: - return None - payload_b64 = parts[1] - payload_b64 += "=" * (4 - len(payload_b64) % 4) # add padding - payload_bytes = base64.urlsafe_b64decode(payload_b64) - return json.loads(payload_bytes) - except Exception: - return None - - -def _get_cluster_token(): - """Get OC token for API key management operations (not for inference). - - Priority: - 1. TOKEN env var (set by prow script for regular user) - 2. E2E_TEST_TOKEN_SA_* env vars (for SA-based tokens) - 3. 
oc whoami -t (fallback for local testing) - """ - # Priority 1: TOKEN env var (regular user token from prow script) - token = os.environ.get("TOKEN", "") - if token: - log.info("Using TOKEN env var for API key operations") - return token - - # Priority 2: SA token if configured - sa_ns = os.environ.get("E2E_TEST_TOKEN_SA_NAMESPACE") - sa_name = os.environ.get("E2E_TEST_TOKEN_SA_NAME") - if sa_ns and sa_name: - token = _create_sa_token(sa_name, namespace=sa_ns) - else: - # Priority 3: oc whoami -t fallback - token_result = subprocess.run(["oc", "whoami", "-t"], capture_output=True, text=True) - token = token_result.stdout.strip() if token_result.returncode == 0 else "" - if not token: - raise RuntimeError("Could not get cluster token via `oc whoami -t`; run with oc login first") - claims = _decode_jwt_payload(token) - if claims: - log.info("Token claims (decoded): %s", json.dumps(claims, indent=2)) - return token - - -def _create_sa_token(sa_name, namespace=None, duration="10m"): - namespace = namespace or _ns() - sa_result = subprocess.run( - ["oc", "create", "sa", sa_name, "-n", namespace], capture_output=True, text=True - ) - if sa_result.returncode != 0 and "already exists" not in sa_result.stderr: - raise RuntimeError(f"Failed to create SA {sa_name}: {sa_result.stderr}") - result = subprocess.run( - ["oc", "create", "token", sa_name, "-n", namespace, f"--duration={duration}"], - capture_output=True, text=True, - ) - token = result.stdout.strip() - if not token: - raise RuntimeError(f"Could not create token for SA {sa_name}: {result.stderr}") - return token - - -# --------------------------------------------------------------------------- -# API Key Management Helpers -# --------------------------------------------------------------------------- - -def _create_api_key(oc_token: str, name: str = None, subscription: str = None) -> str: - """Create an API key using the MaaS API and return the plaintext key. 
- - Note: API keys inherit the authenticated user's groups automatically. - Users can only create keys for themselves with their own groups. - Pass ``subscription`` to bind a specific MaaSSubscription at mint time. - - Args: - oc_token: OC token for authentication with maas-api - name: Optional name for the key (auto-generated if not provided) - subscription: Optional MaaSSubscription name to bind (highest-priority auto-bind if omitted) - - Returns: - The plaintext API key (sk-oai-xxx format) - """ - url = f"{_maas_api_url()}/v1/api-keys" - key_name = name or f"e2e-sub-test-{uuid.uuid4().hex[:8]}" - - body = {"name": key_name} - if subscription: - body["subscription"] = subscription - - r = requests.post( - url, - headers={ - "Authorization": f"Bearer {oc_token}", - "Content-Type": "application/json", - }, - json=body, - timeout=TIMEOUT, - verify=TLS_VERIFY, - ) - if r.status_code not in (200, 201): - raise RuntimeError(f"Failed to create API key: {r.status_code} {r.text}") - - data = r.json() - api_key = data.get("key") - if not api_key: - raise RuntimeError(f"API key response missing 'key' field: {data}") - - log.info(f"Created API key '{key_name}' (inherits user's groups), bound to subscription '{subscription}'") - return api_key - - -def _revoke_api_key(oc_token: str, key_id: str): - """Revoke an API key (best-effort, for cleanup).""" - url = f"{_maas_api_url()}/v1/api-keys/{key_id}" - try: - requests.delete( - url, - headers={"Authorization": f"Bearer {oc_token}"}, - timeout=TIMEOUT, - verify=TLS_VERIFY, - ) - except Exception as e: - log.warning(f"Failed to revoke API key {key_id}: {e}") - - # Cache for API keys to avoid creating too many during test runs. # Keyed by process ID to ensure test isolation when running in parallel workers. 
_default_api_key_cache: dict = {} @@ -281,47 +131,6 @@ def _get_default_api_key() -> str: return _default_api_key_cache[pid] -def _delete_sa(sa_name, namespace=None): - namespace = namespace or _ns() - subprocess.run(["oc", "delete", "sa", sa_name, "-n", namespace, "--ignore-not-found"], capture_output=True, text=True) - - -def _apply_cr(cr_dict): - subprocess.run(["oc", "apply", "-f", "-"], input=json.dumps(cr_dict), capture_output=True, text=True, check=True) - - -def _delete_cr(kind, name, namespace=None): - namespace = namespace or _ns() - subprocess.run(["oc", "delete", kind, name, "-n", namespace, "--ignore-not-found", "--timeout=30s"], capture_output=True, text=True) - - -def _get_cr(kind, name, namespace=None): - namespace = namespace or _ns() - max_retries = 3 - retry_delay = 2 - - for attempt in range(max_retries): - result = subprocess.run(["oc", "get", kind, name, "-n", namespace, "-o", "json"], capture_output=True, text=True) - - if result.returncode == 0: - return json.loads(result.stdout) - - # Retry transient errors - if attempt < max_retries - 1 and _is_transient_kubectl_error(result.stderr): - log.warning( - f"Transient kubectl error getting {kind}/{name} (attempt {attempt + 1}/{max_retries}): {result.stderr.strip()}" - ) - time.sleep(retry_delay * (attempt + 1)) - continue - - # Non-transient error or final attempt - return None (existing behavior) - log.error( - f"Failed to get {kind}/{name} in namespace '{namespace}' after {max_retries} retries. " - f"Last error: {result.stderr.strip()}" - ) - return None - - def _cr_exists(kind, name, namespace=None): namespace = namespace or _ns() result = subprocess.run(["oc", "get", kind, name, "-n", namespace], capture_output=True, text=True) @@ -343,62 +152,6 @@ def _annotate(kind, name, annotation, namespace=None): ) -def _get_auth_policies_for_model(model_ref, namespace=None): - """Get all MaaSAuthPolicies that reference a model. 
- - Args: - model_ref: Name of the MaaSModelRef - namespace: Namespace to search (defaults to _ns()) - - Returns: - List of auth policy names that reference the model - """ - namespace = namespace or _ns() - policies = _list_crs("maasauthpolicy", namespace) - - matching = [] - for policy in policies: - model_refs = policy.get("spec", {}).get("modelRefs", []) - for ref in model_refs: - # Handle both string refs and dict refs with 'name' field - ref_name = ref.get("name") if isinstance(ref, dict) else ref - if ref_name == model_ref: - matching.append(policy["metadata"]["name"]) - break - return matching - - -def _get_subscriptions_for_model(model_ref, namespace=None): - """Get all MaaSSubscriptions that reference a model. - - Args: - model_ref: Name of the MaaSModelRef - namespace: Namespace to search (defaults to _ns()) - - Returns: - List of subscription names that reference the model - """ - namespace = namespace or _ns() - subs = _list_crs("maassubscription", namespace) - - matching = [] - for sub in subs: - model_refs = sub.get("spec", {}).get("modelRefs", []) - for ref in model_refs: - # Handle both string refs and dict refs with 'name' field - ref_name = ref.get("name") if isinstance(ref, dict) else ref - if ref_name == model_ref: - matching.append(sub["metadata"]["name"]) - break - return matching - - -def _sa_to_user(sa_name, namespace=None): - """Convert service account name to Kubernetes user principal.""" - namespace = namespace or _ns() - return f"system:serviceaccount:{namespace}:{sa_name}" - - def _create_test_maas_model(name, llmis_name=MODEL_REF, llmis_namespace=MODEL_NAMESPACE, namespace=None): """Create a MaaSModelRef CR for testing. @@ -420,116 +173,6 @@ def _create_test_maas_model(name, llmis_name=MODEL_REF, llmis_namespace=MODEL_NA }) -def _create_test_auth_policy(name, model_refs, users=None, groups=None, namespace=None): - """Create a MaaSAuthPolicy CR for testing. 
- - Args: - name: Name of the auth policy - model_refs: Model ref(s) - can be string or list - users: List of user principals (e.g., ["system:serviceaccount:ns:sa"]) - groups: List of group names (e.g., ["system:authenticated"]) - will be converted to required format - namespace: Namespace for the auth policy (defaults to _ns()) - """ - namespace = namespace or _ns() - if not isinstance(model_refs, list): - model_refs = [model_refs] - - # Convert model refs to required format: [{"name": "model1", "namespace": "llm"}, ...] - model_refs_formatted = [{"name": ref, "namespace": MODEL_NAMESPACE} for ref in model_refs] - - # Convert groups list to required format: [{"name": "group1"}, {"name": "group2"}] - groups_formatted = [{"name": g} for g in (groups or [])] - - log.info("Creating MaaSAuthPolicy: %s", name) - _apply_cr({ - "apiVersion": "maas.opendatahub.io/v1alpha1", - "kind": "MaaSAuthPolicy", - "metadata": {"name": name, "namespace": namespace}, - "spec": { - "modelRefs": model_refs_formatted, - "subjects": { - "users": users or [], - "groups": groups_formatted - } - } - }) - - -def _create_test_subscription( - name, - model_refs, - users=None, - groups=None, - token_limit=100, - window="1m", - namespace=None, - priority=None, -): - """Create a MaaSSubscription CR for testing. 
- - Args: - name: Name of the subscription - model_refs: Model ref(s) - can be string or list - users: List of user principals (e.g., ["system:serviceaccount:ns:sa"]) - groups: List of group names (e.g., ["system:authenticated"]) - will be converted to required format - token_limit: Token rate limit (default: 100) - window: Rate limit window (default: "1m") - namespace: Namespace for the subscription (defaults to _ns()) - priority: Optional spec.priority (higher wins for default API key binding when omitted) - """ - namespace = namespace or _ns() - if not isinstance(model_refs, list): - model_refs = [model_refs] - - # Convert groups list to required format: [{"name": "group1"}, {"name": "group2"}] - groups_formatted = [{"name": g} for g in (groups or [])] - - spec = { - "owner": { - "users": users or [], - "groups": groups_formatted, - }, - "modelRefs": [ - { - "name": ref, - "namespace": MODEL_NAMESPACE, - "tokenRateLimits": [{"limit": token_limit, "window": window}], - } - for ref in model_refs - ], - } - if priority is not None: - spec["priority"] = int(priority) - - log.info("Creating MaaSSubscription: %s", name) - _apply_cr( - { - "apiVersion": "maas.opendatahub.io/v1alpha1", - "kind": "MaaSSubscription", - "metadata": {"name": name, "namespace": namespace}, - "spec": spec, - } - ) - - -def _inference(api_key, path=None, extra_headers=None, model_name=None): - """POST completions using an API key only (subscription is bound at mint).""" - path = path or MODEL_PATH - url = f"{_gateway_url()}{path}/v1/completions" - headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"} - if extra_headers: - headers.update(extra_headers) - return requests.post( - url, headers=headers, - json={"model": model_name or MODEL_NAME, "prompt": "Hello", "max_tokens": 3}, - timeout=TIMEOUT, verify=TLS_VERIFY, - ) - - -def _wait_reconcile(seconds=None): - time.sleep(seconds or RECONCILE_WAIT) - - def _wait_for_maas_model_ready(name, namespace=None, 
timeout=120): """Wait for MaaSModelRef to reach Ready phase. @@ -567,247 +210,6 @@ def _wait_for_maas_model_ready(name, namespace=None, timeout=120): ) -def _wait_for_maas_auth_policy_ready(name, namespace=None, timeout=60): - """Wait for MaaSAuthPolicy to reach Active phase with enforced AuthPolicies. - - Args: - name: Name of the MaaSAuthPolicy - namespace: Namespace (defaults to _ns()) - timeout: Maximum wait time in seconds (default: 60) - - Raises: - TimeoutError: If MaaSAuthPolicy doesn't become Active/enforced within timeout - """ - namespace = namespace or _ns() - deadline = time.time() + timeout - log.info(f"Waiting for MaaSAuthPolicy {name} to become Active (timeout: {timeout}s)...") - - while time.time() < deadline: - cr = _get_cr("maasauthpolicy", name, namespace) - if cr: - phase = cr.get("status", {}).get("phase") - auth_policies = cr.get("status", {}).get("authPolicies", []) - - # Check if all auth policies are accepted and enforced - all_enforced = all( - ap.get("accepted") == "True" and ap.get("enforced") == "True" - for ap in auth_policies - ) - - if phase == "Active" and auth_policies and all_enforced: - log.info(f"✅ MaaSAuthPolicy {name} is Active and enforced") - return - log.debug(f"MaaSAuthPolicy {name} phase: {phase}, authPolicies: {len(auth_policies)}, all_enforced: {all_enforced}") - time.sleep(2) - - # Timeout - log current state for debugging - cr = _get_cr("maasauthpolicy", name, namespace) - current_phase = cr.get("status", {}).get("phase") if cr else "not found" - auth_policies = cr.get("status", {}).get("authPolicies", []) if cr else [] - raise TimeoutError( - f"MaaSAuthPolicy {name} did not become Active/enforced within {timeout}s " - f"(current phase: {current_phase}, authPolicies: {len(auth_policies)})" - ) - - -def _wait_for_maas_subscription_ready(name, namespace=None, timeout=30): - """Wait for MaaSSubscription to reach Active phase. 
- - Args: - name: Name of the MaaSSubscription - namespace: Namespace (defaults to _ns()) - timeout: Maximum wait time in seconds (default: 30) - - Raises: - TimeoutError: If MaaSSubscription doesn't become Active within timeout - """ - namespace = namespace or _ns() - deadline = time.time() + timeout - log.info(f"Waiting for MaaSSubscription {name} to become Active (timeout: {timeout}s)...") - - while time.time() < deadline: - cr = _get_cr("maassubscription", name, namespace) - if cr: - phase = cr.get("status", {}).get("phase") - if phase == "Active": - log.info(f"✅ MaaSSubscription {name} is Active") - return - log.debug(f"MaaSSubscription {name} phase: {phase}") - time.sleep(2) - - # Timeout - log current state for debugging - cr = _get_cr("maassubscription", name, namespace) - current_phase = cr.get("status", {}).get("phase") if cr else "not found" - raise TimeoutError( - f"MaaSSubscription {name} did not become Active within {timeout}s (current phase: {current_phase})" - ) - - -def _wait_for_token_rate_limit_policy(model_ref, model_namespace="llm", timeout=60): - """Wait for TokenRateLimitPolicy to be created and enforced for a model. 
- - Args: - model_ref: Name of the model (e.g., "e2e-distinct-simulated") - model_namespace: Namespace where the TRLP should be created (default: "llm") - timeout: Maximum wait time in seconds (default: 60) - - Raises: - TimeoutError: If TRLP isn't created and enforced within timeout - """ - trlp_name = f"maas-trlp-{model_ref}" - deadline = time.time() + timeout - log.info(f"Waiting for TokenRateLimitPolicy {trlp_name} in {model_namespace} (timeout: {timeout}s)...") - - while time.time() < deadline: - result = subprocess.run( - ["oc", "get", "tokenratelimitpolicy", trlp_name, "-n", model_namespace, "-o", "json"], - capture_output=True, text=True - ) - if result.returncode == 0: - try: - trlp = json.loads(result.stdout) - conditions = trlp.get("status", {}).get("conditions", []) - # Check if TRLP is enforced - enforced = next((c for c in conditions if c.get("type") in ["Enforced", "Ready"]), None) - if enforced and enforced.get("status") == "True": - log.info(f"✅ TokenRateLimitPolicy {trlp_name} is enforced") - return - log.debug(f"TokenRateLimitPolicy {trlp_name} exists but not enforced yet") - except (json.JSONDecodeError, KeyError) as e: - log.debug(f"Failed to parse TRLP status: {e}") - else: - log.debug(f"TokenRateLimitPolicy {trlp_name} not found yet...") - time.sleep(3) - - raise TimeoutError( - f"TokenRateLimitPolicy {trlp_name} was not created and enforced in {model_namespace} within {timeout}s" - ) - - -def _poll_status(api_key, expected, path=None, extra_headers=None, model_name=None, timeout=None, poll_interval=2): - """Poll inference endpoint until expected HTTP status or timeout.""" - timeout = timeout or max(RECONCILE_WAIT * 3, 60) - deadline = time.time() + timeout - last = None - last_err = None - while time.time() < deadline: - try: - r = _inference(api_key, path=path, extra_headers=extra_headers, model_name=model_name) - last_err = None - ok = r.status_code == expected if isinstance(expected, int) else r.status_code in expected - if ok: - return r 
- last = r - except requests.RequestException as exc: - last_err = exc - log.debug(f"Transient request error while polling: {exc}") - except Exception as exc: - # Catch-all to surface non-RequestException (e.g. JSON decode, timeout config) - last_err = exc - log.warning(f"Unexpected error while polling: {exc}") - time.sleep(poll_interval) - # Build failure message with all available context - exp_str = expected if isinstance(expected, int) else " or ".join(str(e) for e in expected) - err_msg = f"Expected {exp_str} within {timeout}s" - if last is not None: - err_msg += f", last status: {last.status_code}" - if last_err is not None: - err_msg += f", last error: {last_err}" - if last is None and last_err is None: - err_msg += ", no response (all requests may have raised non-RequestException)" - raise AssertionError(err_msg) - - -def _snapshot_cr(kind, name, namespace=None): - """Capture a CR for later restoration (strips runtime metadata).""" - cr = _get_cr(kind, name, namespace) - if not cr: - return None - meta = cr.get("metadata", {}) - for key in ("resourceVersion", "uid", "creationTimestamp", "generation", "managedFields"): - meta.pop(key, None) - annotations = meta.get("annotations", {}) - annotations.pop("kubectl.kubernetes.io/last-applied-configuration", None) - if not annotations: - meta.pop("annotations", None) - cr.pop("status", None) - return cr - - -def _is_transient_kubectl_error(stderr): - """Check if kubectl error is likely transient (network, timeout).""" - transient_patterns = [ - "TLS handshake timeout", - "connection refused", - "connection reset", - "i/o timeout", - "dial tcp", - "EOF", - "temporary failure", - "network is unreachable", - ] - stderr_lower = stderr.lower() - return any(pattern.lower() in stderr_lower for pattern in transient_patterns) - - -def _list_crs(kind, namespace=None): - """List all CRs of a given kind. 
- - Args: - kind: CR kind (e.g., 'maasmodelref', 'maasauthpolicy') - namespace: Namespace to search (defaults to _ns()) - - Returns: - List of CR dictionaries - - Raises: - RuntimeError: If kubectl command fails with contextual error details - """ - namespace = namespace or _ns() - plural = { - "maasmodelref": "maasmodelrefs", - "maasauthpolicy": "maasauthpolicies", - "maassubscription": "maassubscriptions", - }.get(kind, f"{kind}s") - - cmd = ["kubectl", "get", plural, "-n", namespace, "-o", "json"] - - # Retry transient network errors with exponential backoff - max_retries = 3 - retry_delay = 2 # seconds - - for attempt in range(max_retries): - result = subprocess.run( - cmd, - capture_output=True, - text=True, - check=False - ) - - if result.returncode == 0: - return json.loads(result.stdout).get("items", []) - - # Check if error is transient and we have retries left - if attempt < max_retries - 1 and _is_transient_kubectl_error(result.stderr): - log.warning( - f"Transient kubectl error (attempt {attempt + 1}/{max_retries}): {result.stderr.strip()}" - ) - time.sleep(retry_delay * (attempt + 1)) # exponential backoff - continue - - # Final attempt or non-transient error - raise RuntimeError( - f"Failed to list {plural} in namespace '{namespace}'.\n" - f"Command: {' '.join(cmd)}\n" - f"Exit code: {result.returncode}\n" - f"Stderr: {result.stderr}\n" - f"Guidance: Ensure the CRD exists, namespace is correct, and you have permissions." 
- ) - - # Unreachable: loop always exits via return (line 684) or raise (line 695) - # Included for type checker and defensive programming - return [] - # --------------------------------------------------------------------------- # Tests @@ -873,7 +275,7 @@ def high_priority_subscription_name_for_api_key_binding(): groups=["system:authenticated"], priority=_E2E_API_KEY_BINDING_HIGH_PRIORITY, ) - _wait_for_maas_subscription_ready(name, ns, timeout=90) + _wait_for_maas_subscription_phase(name, namespace=ns, timeout=90) yield name finally: _delete_cr("maassubscription", name) @@ -1021,13 +423,13 @@ def test_rate_limit_exhaustion_gets_429(self): auth_policy_name = "e2e-rate-limit-test-auth" subscription_name = "e2e-rate-limit-test-subscription" - # Very low limit for fast test: 15 tokens/min with max_tokens=3 per request - # Expected behavior: - # - Requests 1-5 succeed (use 15 tokens total) - # - Request 6 gets 429 (would need 18 tokens total) - token_limit = 15 + # Low limit so we exhaust it quickly. Actual tokens consumed per + # response are non-deterministic (max_tokens is a ceiling, not exact), + # so we send enough requests to be confident we hit the limit without + # asserting exactly when the 429 arrives. + token_limit = 10 window = "1m" - max_tokens = 3 # Explicitly track tokens per request for clarity + total_requests = 15 try: # 1. Create auth policy allowing system:authenticated @@ -1061,16 +463,11 @@ def test_rate_limit_exhaustion_gets_429(self): ) # 4. 
Send requests to exhaust the limit - # Calculate expected successful requests: token_limit / max_tokens = 15 / 3 = 5 - expected_success = token_limit // max_tokens - # Send 2 extra requests to ensure we hit the limit - total_requests = expected_success + 2 - rate_limited = False success_count = 0 for i in range(total_requests): - r = _inference(api_key, path=model_path) + r = _inference(api_key, path=model_path, max_tokens=1) request_num = i + 1 log.info(f"Request {request_num}/{total_requests}: {r.status_code}") @@ -1078,12 +475,7 @@ def test_rate_limit_exhaustion_gets_429(self): success_count += 1 elif r.status_code == 429: rate_limited = True - log.info(f"Rate limit exceeded after {success_count} successful requests " - f"({success_count * max_tokens} tokens used)") - - # Verify we hit the limit at approximately the right point (Âą1 for rounding) - assert abs(success_count - expected_success) <= 1, \ - f"Expected ~{expected_success} successful requests before 429, got {success_count}" + log.info(f"Rate limit exceeded after {success_count} successful requests") # Verify it's a rate limit 429, not a subscription error response_text = r.text.lower() if r.text else "" @@ -1109,8 +501,13 @@ def test_rate_limit_exhaustion_gets_429(self): # Brief pause to avoid overwhelming the system, but stay within the window time.sleep(0.1) + # Verify we actually exhausted the limit (at least one successful request) + assert success_count > 0, \ + f"Got 429 on request #{request_num} without any successful requests. " \ + f"This indicates a configuration issue, not rate limit exhaustion. 
Response: {r.text[:500]}"
+
         assert rate_limited, \
-            f"Expected 429 after ~{expected_success} requests with {token_limit} tokens/{window} limit, " \
+            f"Expected 429 with {token_limit} tokens/{window} limit, " \
             f"but got {success_count} successful requests without hitting limit"
 
         # Note: Skipping rate limit reset test to keep test fast (<5s)
@@ -1123,6 +520,135 @@
             _wait_reconcile()
             log.info("Cleaned up rate limit test resources")
 
+    def test_models_endpoint_exempt_from_rate_limiting(self):
+        """
+        Test that /v1/models endpoint remains accessible when token quota is exhausted.
+
+        This verifies that users can discover model capabilities even when they've
+        used all their inference tokens. The /v1/models endpoint is a discovery/metadata
+        endpoint that does not consume tokens and should remain accessible.
+
+        Ref: https://issues.redhat.com/browse/RHOAIENG-46770
+
+        Test steps:
+        1. Create subscription with very low token limit (3 tokens)
+        2. Exhaust the limit with inference requests (up to 5 requests, each consuming at least 1 token)
+        3. Verify inference requests get 429 (rate limited)
+        4. Verify /v1/models endpoint still returns 200 (not rate limited)
+        """
+        # Use unconfigured model to isolate this test
+        model_ref = UNCONFIGURED_MODEL_REF
+        model_path = UNCONFIGURED_MODEL_PATH
+
+        # Create unique subscription and auth policy names
+        auth_policy_name = "e2e-models-exempt-test-auth"
+        subscription_name = "e2e-models-exempt-test-subscription"
+
+        # Very low limit for a fast, deterministic test
+        # With a 3 token limit, five requests are guaranteed to exhaust the quota
+        # (even if each request uses exactly 1 token: 5 requests > 3 token limit)
+        token_limit = 3
+        window = "1m"
+        max_tokens = 1
+
+        try:
+            # 1.
Create auth policy allowing system:authenticated + _create_test_auth_policy( + name=auth_policy_name, + model_refs=[model_ref], + groups=["system:authenticated"] + ) + _wait_reconcile() + _wait_for_maas_auth_policy_phase(auth_policy_name, timeout=90) + + # 2. Create subscription with low token limit + _create_test_subscription( + name=subscription_name, + model_refs=[model_ref], + groups=["system:authenticated"], + token_limit=token_limit, + window=window + ) + _wait_reconcile() + _wait_for_maas_subscription_phase(subscription_name, timeout=90) + + # Wait for TRLP to be created AND enforced by Kuadrant/Limitador + _wait_for_token_rate_limit_policy(model_ref, model_namespace=MODEL_NAMESPACE, timeout=90) + + # 3. Create API key for this subscription + oc_token = _get_cluster_token() + api_key = _create_api_key( + oc_token, + name=f"e2e-models-exempt-{uuid.uuid4().hex[:8]}", + subscription=subscription_name, + ) + + # 4. Exhaust the token limit + # With 3 token limit and 5 requests, we're guaranteed to hit the limit + # (each successful request consumes â‰Ĩ1 token, so 5 requests > 3 token limit) + max_requests = 5 + success_count = 0 + rate_limited = False + + log.info(f"Exhausting token quota: sending up to {max_requests} requests") + for i in range(max_requests): + r = _inference(api_key, path=model_path) + request_num = i + 1 + log.info(f"Request {request_num}: status {r.status_code}") + + if r.status_code == 200: + success_count += 1 + elif r.status_code == 429: + log.info(f"Rate limit hit after {success_count} successful requests") + rate_limited = True + break + else: + # Unexpected status during exhaustion + log.warning(f"Unexpected status during quota exhaustion: {r.status_code}") + + # Verify we hit rate limit (otherwise test setup is broken) + assert rate_limited, \ + f"Expected to hit rate limit within {max_requests} requests with {token_limit} token limit, " \ + f"but got {success_count} successful requests without hitting limit" + + # 5. 
Verify inference is now blocked with 429 + log.info("Verifying inference endpoint is blocked...") + r_inference = _inference(api_key, path=model_path) + assert r_inference.status_code == 429, \ + f"Expected 429 for inference after exhausting tokens, got {r_inference.status_code}. " \ + f"Response: {r_inference.text[:500]}" + log.info("✓ Inference endpoint correctly blocked with 429") + + # 6. Verify /v1/models endpoint is still accessible with 200 + log.info("Verifying /v1/models endpoint is still accessible...") + url = f"{_gateway_url()}{model_path}/v1/models" + headers = {"Authorization": f"Bearer {api_key}"} + r_models = requests.get(url, headers=headers, timeout=TIMEOUT, verify=TLS_VERIFY) + + assert r_models.status_code == 200, \ + f"Expected 200 for /v1/models endpoint even when quota exhausted, got {r_models.status_code}. " \ + f"The /v1/models endpoint does not consume tokens and should remain accessible. " \ + f"Response: {r_models.text[:500]}" + + # Verify it returns valid model metadata (sanity check) + try: + models_data = r_models.json() + except (json.JSONDecodeError, ValueError) as e: + # Non-JSON response is acceptable for some vLLM versions + log.info(f"✓ /v1/models endpoint accessible (200), non-JSON response: {r_models.text[:200]}") + else: + # JSON response - validate structure + assert "data" in models_data or "object" in models_data, \ + f"Expected valid models response with 'data' or 'object' field, got: {models_data}" + log.info(f"✓ /v1/models endpoint accessible (200) despite exhausted quota. Response keys: {list(models_data.keys())}") + + finally: + # Clean up + _delete_cr("maassubscription", subscription_name) + _delete_cr("maasauthpolicy", auth_policy_name) + _wait_reconcile() + log.info("Cleaned up models endpoint exemption test resources") + class TestMultipleSubscriptionsPerModel: """Multiple subscriptions for one model — API key in ONE subscription should get access. 
@@ -1390,22 +916,52 @@ def test_delete_last_subscription_denies_access(self): _apply_cr(original) _wait_reconcile() - # TODO: Uncomment this test once we validated unconfigured models - # def test_unconfigured_model_denied_by_gateway_auth(self): - # """New model with no MaaSAuthPolicy/MaaSSubscription -> gateway default auth denies (403).""" - # api_key = _get_default_api_key() - # r = _inference(api_key, path=UNCONFIGURED_MODEL_PATH) - # log.info(f"Unconfigured model (no auth policy) -> {r.status_code}") - # assert r.status_code == 403, f"Expected 403 (gateway default deny), got {r.status_code}" + def test_unconfigured_model_denied_by_gateway_auth(self): + """New model with no MaaSAuthPolicy/MaaSSubscription -> gateway default auth denies (403).""" + # Precondition: unconfigured model fixture is deployed + model = _get_cr("maasmodelref", UNCONFIGURED_MODEL_REF, namespace=MODEL_NAMESPACE) + assert model is not None, ( + f"MaaSModelRef {UNCONFIGURED_MODEL_REF} must exist in {MODEL_NAMESPACE} " + f"(deploy test/e2e/fixtures/unconfigured first)" + ) + # Precondition: no per-route auth policy exists for this model + assert not _cr_exists("maasauthpolicy", UNCONFIGURED_MODEL_REF, namespace=MODEL_NAMESPACE), ( + f"MaaSAuthPolicy for {UNCONFIGURED_MODEL_REF} must NOT exist — " + f"this test validates gateway-level deny-by-default" + ) -class TestOrderingEdgeCases: - """Tests that resource creation order doesn't matter.""" + # Precondition: no subscription exists for this model + assert not _cr_exists("maassubscription", UNCONFIGURED_MODEL_REF, namespace=MODEL_NAMESPACE), ( + f"MaaSSubscription for {UNCONFIGURED_MODEL_REF} must NOT exist — " + f"this test validates gateway-level deny-by-default" + ) - def test_subscription_before_auth_policy(self): - """Create subscription first, then auth policy -> should work once both exist.""" - ns = _ns() - try: + # Precondition: gateway-default-auth is in place and accepted + gw_auth = _get_cr("authpolicy", "gateway-default-auth", 
namespace="openshift-ingress") + assert gw_auth is not None, ( + "gateway-default-auth AuthPolicy must exist in openshift-ingress" + ) + conditions = gw_auth.get("status", {}).get("conditions", []) + accepted = [c for c in conditions if c.get("type") == "Accepted"] + assert accepted and accepted[0].get("status") == "True", ( + f"gateway-default-auth must be Accepted, got: {accepted}" + ) + + # Verify deny-by-default: inference to unconfigured model should be denied + api_key = _get_default_api_key() + r = _inference(api_key, path=UNCONFIGURED_MODEL_PATH) + log.info(f"Unconfigured model (no auth policy) -> {r.status_code}") + assert r.status_code == 403, f"Expected 403 (gateway default deny), got {r.status_code}" + + +class TestOrderingEdgeCases: + """Tests that resource creation order doesn't matter.""" + + def test_subscription_before_auth_policy(self): + """Create subscription first, then auth policy -> should work once both exist.""" + ns = _ns() + try: # Subscription CR must exist before minting a key bound to it _apply_cr({ "apiVersion": "maas.opendatahub.io/v1alpha1", @@ -1417,7 +973,7 @@ def test_subscription_before_auth_policy(self): }, }) _wait_reconcile() - _wait_for_maas_subscription_ready("e2e-ordering-sub", namespace=ns, timeout=90) + _wait_for_maas_subscription_phase("e2e-ordering-sub", namespace=ns, timeout=90) api_key = _create_api_key( _get_cluster_token(), @@ -2159,3 +1715,799 @@ def test_e2e_group_based_subscription_but_no_auth_gets_403(self): _delete_cr("maasauthpolicy", auth_policy_name, namespace=ns) _delete_sa(sa_name, namespace=ns) _wait_reconcile() + + +class TestStatusReporting: + """ + Tests for MaaSSubscription and MaaSAuthPolicy status reporting. 
+ + Validates that the controller correctly reports: + - Phase (Active, Degraded, Failed) + - Per-item status (modelRefStatuses, tokenRateLimitStatuses, authPolicies) + - Ready/Reason fields on per-item statuses + """ + + def test_subscription_active_status_with_valid_model(self): + """ + Test: MaaSSubscription shows Active phase with valid model reference. + + Creates a subscription with a valid model ref and verifies: + - Phase is "Active" + - modelRefStatuses contains entry with ready=true + - tokenRateLimitStatuses contains entry with ready=true (after TRLP created) + """ + ns = _ns() + subscription_name = "e2e-status-active-sub" + auth_name = "e2e-status-active-auth" + sa_name = "e2e-status-active-sa" + + try: + _create_sa_token(sa_name, namespace="default") + sa_user = f"system:serviceaccount:default:{sa_name}" + + _create_test_auth_policy(auth_name, MODEL_REF, users=[sa_user]) + _create_test_subscription(subscription_name, MODEL_REF, users=[sa_user]) + + _wait_for_maas_auth_policy_phase(auth_name) + + # Wait for subscription to reach Active phase with populated status + cr = _wait_for_maas_subscription_phase(subscription_name, "Active", timeout=60, require_model_statuses=True) + + status = cr.get("status", {}) + model_statuses = status.get("modelRefStatuses", []) + trlp_statuses = status.get("tokenRateLimitStatuses", []) + + log.info(f"Subscription status: phase={status.get('phase')}, modelRefStatuses={len(model_statuses)}, tokenRateLimitStatuses={len(trlp_statuses)}") + + # Check model ref status + model_status = model_statuses[0] + assert model_status.get("ready") is True, "Expected modelRefStatus ready=true" + assert model_status.get("reason") == "Valid", f"Expected reason 'Valid', got {model_status.get('reason')}" + + log.info("✅ MaaSSubscription Active status verified") + + finally: + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + 
_wait_reconcile() + + def test_subscription_failed_status_with_missing_model(self): + """ + Test: MaaSSubscription shows Failed phase when all model refs are missing. + + Creates a subscription referencing a non-existent model and verifies: + - Phase is "Failed" + - modelRefStatuses contains entry with ready=false, reason="NotFound" + """ + ns = _ns() + subscription_name = "e2e-status-failed-sub" + sa_name = "e2e-status-failed-sa" + missing_model = "nonexistent-model-xyz" + + try: + _create_sa_token(sa_name, namespace="default") + sa_user = f"system:serviceaccount:default:{sa_name}" + + # Create subscription with non-existent model + _create_test_subscription(subscription_name, missing_model, users=[sa_user]) + + # Wait for subscription to reach Failed phase with populated status + cr = _wait_for_maas_subscription_phase(subscription_name, "Failed", timeout=60, require_model_statuses=True) + + status = cr.get("status", {}) + model_statuses = status.get("modelRefStatuses", []) + + log.info(f"Subscription status: phase={status.get('phase')}, modelRefStatuses={model_statuses}") + + # Check model ref status shows NotFound + model_status = model_statuses[0] + assert model_status.get("ready") is False, "Expected modelRefStatus ready=false" + assert model_status.get("reason") == "NotFound", f"Expected reason 'NotFound', got {model_status.get('reason')}" + + log.info("✅ MaaSSubscription Failed status verified") + + finally: + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + + def test_authpolicy_active_status_with_valid_model(self): + """ + Test: MaaSAuthPolicy shows Active phase with valid model reference. 
+ + Creates an auth policy with a valid model ref and verifies: + - Phase is "Active" + - authPolicies contains entry with ready=true, reason="AcceptedEnforced" + """ + ns = _ns() + auth_name = "e2e-status-active-auth-only" + sa_name = "e2e-status-active-auth-sa" + + try: + _create_sa_token(sa_name, namespace="default") + sa_user = f"system:serviceaccount:default:{sa_name}" + + _create_test_auth_policy(auth_name, MODEL_REF, users=[sa_user]) + + # Wait for auth policy to reach Active phase with populated status + cr = _wait_for_maas_auth_policy_phase(auth_name, "Active", timeout=90) + + status = cr.get("status", {}) + auth_policies = status.get("authPolicies", []) + + log.info(f"AuthPolicy status: phase={status.get('phase')}, authPolicies={auth_policies}") + + # Check auth policy status + ap_status = auth_policies[0] + assert ap_status.get("ready") is True, "Expected authPolicy ready=true" + assert ap_status.get("reason") == "AcceptedEnforced", f"Expected reason 'AcceptedEnforced', got {ap_status.get('reason')}" + + log.info("✅ MaaSAuthPolicy Active status verified") + + finally: + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + + def test_authpolicy_failed_status_with_missing_model(self): + """ + Test: MaaSAuthPolicy shows Failed phase when all model refs are missing. 
+ + Creates an auth policy referencing a non-existent model and verifies: + - Phase is "Failed" + - authPolicies array is empty (no AuthPolicy generated for missing model) + """ + ns = _ns() + auth_name = "e2e-status-failed-auth" + sa_name = "e2e-status-failed-auth-sa" + missing_model = "nonexistent-model-abc" + + try: + _create_sa_token(sa_name, namespace="default") + sa_user = f"system:serviceaccount:default:{sa_name}" + + # Create auth policy with non-existent model + _create_test_auth_policy(auth_name, missing_model, users=[sa_user]) + + # Wait for auth policy to reach Failed phase (no authPolicies expected for missing model) + cr = _wait_for_maas_auth_policy_phase(auth_name, "Failed", timeout=60, require_auth_policies=False) + + status = cr.get("status", {}) + log.info(f"AuthPolicy status: phase={status.get('phase')}, authPolicies={status.get('authPolicies', [])}") + + log.info("✅ MaaSAuthPolicy Failed status verified") + + finally: + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + + def test_subscription_degraded_status_with_partial_models(self): + """ + Test: MaaSSubscription shows Degraded phase when some models are valid, some missing. 
+ + Creates a subscription with one valid and one missing model ref and verifies: + - Phase is "Degraded" + - modelRefStatuses contains entries for both (one ready=true, one ready=false) + """ + ns = _ns() + subscription_name = "e2e-status-degraded-sub" + auth_name = "e2e-status-degraded-auth" + sa_name = "e2e-status-degraded-sa" + missing_model = "nonexistent-model-partial" + + try: + _create_sa_token(sa_name, namespace="default") + sa_user = f"system:serviceaccount:default:{sa_name}" + + # Create auth policy for valid model only + _create_test_auth_policy(auth_name, MODEL_REF, users=[sa_user]) + + # Create subscription with both valid and missing models + _create_test_subscription(subscription_name, [MODEL_REF, missing_model], users=[sa_user]) + + # Wait for subscription to reach Degraded phase with polling + cr = _wait_for_maas_subscription_phase(subscription_name, "Degraded", timeout=60) + + status = cr.get("status", {}) + model_statuses = status.get("modelRefStatuses", []) + + log.info(f"Subscription status: phase={status.get('phase')}, modelRefStatuses={model_statuses}") + + assert len(model_statuses) == 2, f"Expected 2 modelRefStatuses, got {len(model_statuses)}" + + # Check we have one valid and one invalid + ready_count = sum(1 for s in model_statuses if s.get("ready") is True) + not_ready_count = sum(1 for s in model_statuses if s.get("ready") is False) + + assert ready_count == 1, f"Expected 1 ready modelRefStatus, got {ready_count}" + assert not_ready_count == 1, f"Expected 1 not-ready modelRefStatus, got {not_ready_count}" + + log.info("✅ MaaSSubscription Degraded status verified") + + finally: + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + + def test_subscription_degraded_trlp_blocks_inference(self): + """ + Test: Degraded subscription with TRLP not ready blocks inference. 
+ + This test verifies that when a subscription enters Degraded phase due to + TokenRateLimitPolicy not being ready (e.g., Kuadrant controller down), + inference requests are blocked with appropriate error to prevent rate + limits from being bypassed. + + Uses pre-deployed e2e-trlp-test-simulated model to avoid TRLP sharing with concurrent tests. + + Test flow: + 1. Scale down Kuadrant controller + 2. Create subscription with valid model - TRLP created but not accepted + 3. Wait for subscription to enter Degraded phase (TRLP ready=false) + 4. Create API key and verify inference is blocked (403 Forbidden) + 5. Scale Kuadrant controller back up + 6. Wait for subscription to reach Active phase (TRLP ready=true) + 7. Verify inference works (200 OK) + """ + ns = _ns() + subscription_name = "e2e-trlp-degraded-sub" + auth_name = "e2e-trlp-degraded-auth" + sa_name = "e2e-trlp-degraded-sa" + + try: + # Step 1: Scale down Kuadrant controller BEFORE creating subscription + log.info("Step 1: Scaling down Kuadrant controller...") + _scale_kuadrant_controller_down() + time.sleep(5) # Give time for controller to fully stop + + # Step 2: Create auth policy and subscription + log.info("Step 2: Creating subscription with Kuadrant controller down...") + sa_token = _create_sa_token(sa_name, namespace="default") + sa_user = f"system:serviceaccount:default:{sa_name}" + + _create_test_auth_policy(auth_name, TRLP_TEST_MODEL_REF, users=[sa_user]) + _create_test_subscription(subscription_name, TRLP_TEST_MODEL_REF, users=[sa_user]) + + # Wait for auth policy - will be Degraded since Kuadrant is down + log.info("Waiting for MaaSAuthPolicy (will be Degraded with Kuadrant down)...") + _wait_for_maas_auth_policy_phase(auth_name, "Degraded", timeout=60, require_auth_policies=True, require_enforced=False) + + # Step 3: Wait for subscription to reach Degraded phase with TRLP not ready + log.info("Step 3: Waiting for subscription to enter Degraded phase (TRLP not ready)...") + cr = 
_wait_for_maas_subscription_phase(subscription_name, "Degraded", timeout=120) + _wait_for_subscription_trlp_status(subscription_name, expected_ready=False, timeout=120) + + status = cr.get("status", {}) + trlp_statuses = status.get("tokenRateLimitStatuses", []) + log.info(f"Subscription Degraded: phase={status.get('phase')}, trlpStatuses={trlp_statuses}") + + # Verify at least one TRLP is not ready + assert len(trlp_statuses) > 0, "Expected at least one TRLP status" + assert any(not trlp.get("ready") for trlp in trlp_statuses), "Expected at least one TRLP to be not ready" + log.info("✅ Subscription in Degraded phase with TRLP not ready") + + # Step 4: Create API key and verify inference is blocked + log.info("Step 4: Creating API key and verifying inference is blocked...") + api_key = _create_api_key(sa_token, name="e2e-trlp-test-key", subscription=subscription_name) + + resp = _inference(api_key, path=TRLP_TEST_MODEL_PATH, model_name=TRLP_TEST_MODEL_ID) + assert resp.status_code == 403, f"Expected 403 Forbidden for Degraded subscription with TRLP not ready, got {resp.status_code}: {resp.text}" + log.info("✅ Inference blocked for Degraded subscription with TRLP not ready") + + # Step 5: Scale Kuadrant controller back up + log.info("Step 5: Scaling Kuadrant controller back up...") + _scale_kuadrant_controller_up() + time.sleep(10) # Give time for TRLP to reconcile and be accepted + + # Step 6: Wait for subscription to reach Active phase with TRLP ready + log.info("Step 6: Waiting for subscription to reach Active phase (TRLP ready)...") + _wait_for_maas_subscription_phase(subscription_name, "Active", timeout=120) + _wait_for_subscription_trlp_status(subscription_name, expected_ready=True, timeout=120) + + cr = _get_cr("maassubscription", subscription_name, namespace=ns) + status = cr.get("status", {}) + trlp_statuses = status.get("tokenRateLimitStatuses", []) + log.info(f"Subscription Active: phase={status.get('phase')}, trlpStatuses={trlp_statuses}") + + # Verify 
all TRLPs are now ready + assert all(trlp.get("ready") for trlp in trlp_statuses), "Expected all TRLPs to be ready" + log.info("✅ Subscription returned to Active phase with all TRLPs ready") + + # Step 7: Verify inference works + log.info("Step 7: Verifying inference works with Active subscription...") + resp = _inference(api_key, path=TRLP_TEST_MODEL_PATH, model_name=TRLP_TEST_MODEL_ID) + assert resp.status_code == 200, f"Expected 200 OK for Active subscription, got {resp.status_code}: {resp.text}" + log.info("✅ Inference works with Active subscription after Kuadrant recovery") + + log.info("✅ TRLP validation e2e test complete") + + finally: + # Ensure Kuadrant controller is scaled back up even if test fails + try: + log.info("Cleanup: Ensuring Kuadrant controller is scaled up...") + _scale_kuadrant_controller_up() + except Exception as e: + log.warning(f"Failed to scale Kuadrant controller up during cleanup: {e}") + + # Revoke API key + try: + oc_token = _get_cluster_token() + _revoke_api_key(oc_token, "e2e-trlp-test-key") + except Exception as e: + log.warning(f"Failed to revoke API key during cleanup: {e}") + + # Clean up resources (but not the model - it's pre-deployed) + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + + def test_authpolicy_degraded_status_with_partial_models(self): + """ + Test: MaaSAuthPolicy shows Degraded phase when some models are valid, some missing. 
+ + Creates an auth policy with one valid and one missing model ref and verifies: + - Phase is "Degraded" + - authPolicies contains entry for the valid model (ready=true) + """ + ns = _ns() + auth_name = "e2e-status-degraded-auth" + sa_name = "e2e-status-degraded-auth-sa" + missing_model = "nonexistent-model-auth-partial" + + try: + _create_sa_token(sa_name, namespace="default") + sa_user = f"system:serviceaccount:default:{sa_name}" + + # Create auth policy with both valid and missing models + _create_test_auth_policy(auth_name, [MODEL_REF, missing_model], users=[sa_user]) + + # Wait for auth policy to reach Degraded phase with polling + cr = _wait_for_maas_auth_policy_phase(auth_name, "Degraded", timeout=60) + + status = cr.get("status", {}) + auth_policies = status.get("authPolicies", []) + + log.info(f"AuthPolicy status: phase={status.get('phase')}, authPolicies={auth_policies}") + + # Should have at least one entry for the valid model + if len(auth_policies) > 0: + ready_count = sum(1 for ap in auth_policies if ap.get("ready") is True) + log.info(f"Found {ready_count} ready authPolicies out of {len(auth_policies)}") + + log.info("✅ MaaSAuthPolicy Degraded status verified") + + finally: + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + + def test_subscription_status_transitions_on_model_deletion(self): + """ + Test: MaaSSubscription transitions from Active to Degraded/Failed when model is deleted. + + Creates a subscription with a temporary model, verifies Active status, + then deletes the model and verifies status transitions appropriately. 
+ """ + ns = _ns() + subscription_name = "e2e-status-transition-sub" + auth_name = "e2e-status-transition-auth" + model_name = "e2e-temp-model-status" + sa_name = "e2e-status-transition-sa" + + try: + _create_sa_token(sa_name, namespace="default") + sa_user = f"system:serviceaccount:default:{sa_name}" + + # Create a temporary model + _create_test_maas_model(model_name, llmis_name=MODEL_REF, namespace=MODEL_NAMESPACE) + _wait_reconcile() + + # Create auth policy and subscription for the model + _create_test_auth_policy(auth_name, model_name, users=[sa_user]) + _create_test_subscription(subscription_name, model_name, users=[sa_user]) + + _wait_for_maas_auth_policy_phase(auth_name) + _wait_for_maas_subscription_phase(subscription_name) + + # Verify initial Active status + cr = _get_cr("maassubscription", subscription_name, namespace=ns) + assert cr is not None + status = cr.get("status", {}) + initial_phase = status.get("phase") + log.info(f"Initial subscription status: phase={initial_phase}") + assert initial_phase == "Active", f"Expected initial phase 'Active', got '{initial_phase}'" + + # Delete the model + _delete_cr("maasmodelref", model_name, namespace=MODEL_NAMESPACE) + + # Wait for subscription to transition to Failed phase with polling + # Use longer timeout to allow for cache invalidation + cr = _wait_for_maas_subscription_phase(subscription_name, "Failed", timeout=120) + + # Poll for modelRefStatuses to also reflect the deletion + # (cache may take additional time to invalidate) + deadline = time.time() + 60 + while time.time() < deadline: + cr = _get_cr("maassubscription", subscription_name, namespace=ns) + status = cr.get("status", {}) + model_statuses = status.get("modelRefStatuses", []) + if len(model_statuses) > 0 and model_statuses[0].get("ready") is False: + break + time.sleep(2) + + status = cr.get("status", {}) + model_statuses = status.get("modelRefStatuses", []) + + log.info(f"Final subscription status: phase={status.get('phase')}, 
modelRefStatuses={model_statuses}") + + # Check model ref status shows NotFound + if len(model_statuses) > 0: + model_status = model_statuses[0] + assert model_status.get("ready") is False, "Expected modelRefStatus ready=false after deletion" + assert model_status.get("reason") == "NotFound", "Expected reason 'NotFound' after deletion" + + log.info("✅ MaaSSubscription status transition verified (Active → Failed)") + + finally: + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_cr("maasmodelref", model_name, namespace=MODEL_NAMESPACE) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + +class TestDegradedSubscriptionFiltering: + """ + Test active filtering for Degraded subscriptions. + + Verifies inference behavior with subscriptions in different phases: + - Degraded subscriptions with healthy models allow inference + - Degraded subscriptions with unhealthy models block inference + - Failed subscriptions block inference + - Endpoints (/v1/models, /v1/subscriptions) report health correctly + + Strategy: Let controller naturally set phase based on model health + (valid + missing models → Degraded, all missing → Failed). + """ + + def test_degraded_healthy_model_allows_inference(self): + """ + Test: Inference to healthy model in Degraded subscription succeeds. + + Setup: + 1. Create subscription with 1 valid + 1 missing model + 2. 
Controller sets phase=Degraded, modelRefStatuses shows mixed health + + Verify: + - Subscription is Degraded with one ready=true, one ready=false + - Inference to the valid model succeeds (200) + """ + ns = _ns() + subscription_name = "e2e-degraded-healthy-inf" + auth_name = "e2e-degraded-healthy-inf-auth" + sa_name = "e2e-degraded-healthy-inf-sa" + missing_model = "nonexistent-model-inf" + + try: + oc_token = _create_sa_token(sa_name, namespace="default") + sa_user = f"system:serviceaccount:default:{sa_name}" + + # Create auth policy for valid model only + _create_test_auth_policy(auth_name, MODEL_REF, users=[sa_user]) + + # Create subscription with valid + missing → auto-Degraded + _create_test_subscription( + subscription_name, + [MODEL_REF, missing_model], + users=[sa_user] + ) + + _wait_reconcile(seconds=10) + + # Verify Degraded with mixed health + cr = _get_cr("maassubscription", subscription_name, namespace=ns) + status = cr.get("status", {}) + phase = status.get("phase") + model_statuses = status.get("modelRefStatuses", []) + + log.info(f"Phase: {phase}, modelRefStatuses: {model_statuses}") + + assert phase == "Degraded", f"Expected Degraded, got {phase}" + assert len(model_statuses) == 2, f"Expected 2 statuses, got {len(model_statuses)}" + + # Find our valid model status + valid_status = next( + (s for s in model_statuses if s.get("name") == MODEL_REF), + None + ) + assert valid_status is not None, f"Missing status for {MODEL_REF}" + assert valid_status.get("ready") is True, \ + f"Expected {MODEL_REF} ready=true, got {valid_status}" + + log.info(f"✅ Subscription Degraded with {MODEL_REF} healthy") + + # Create API key + # oc_token already set from _create_sa_token above + api_key = _create_api_key( + oc_token, + name="degraded-healthy", + subscription=subscription_name + ) + + # Inference to healthy model should work + log.info(f"Testing inference to healthy {MODEL_REF}...") + r = _inference(api_key, path=MODEL_PATH, model_name=MODEL_NAME) + + assert 
r.status_code == 200, \ + f"Expected 200 for healthy model in Degraded subscription, got {r.status_code}: {r.text[:500]}" + + log.info("✅ Inference to healthy model in Degraded subscription succeeded") + + finally: + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + + def test_failed_subscription_blocks_inference(self): + """ + Test: Failed subscription blocks inference via OPA rule. + + Setup: + 1. Create subscription with valid model (starts Active) + 2. Create API key + 3. Manually patch subscription to Failed phase + 4. Verify inference is rejected by OPA (403) + + Note: We use manual patching because naturally creating a Failed subscription + requires only invalid models, which don't have routes (404 before OPA runs). + """ + ns = _ns() + subscription_name = "e2e-failed-sub-inf" + auth_name = "e2e-failed-sub-inf-auth" + sa_name = "e2e-failed-sub-inf-sa" + + try: + oc_token = _create_sa_token(sa_name, namespace="default") + sa_user = f"system:serviceaccount:default:{sa_name}" + + # Create auth policy for valid model + _create_test_auth_policy(auth_name, MODEL_REF, users=[sa_user]) + + # Create subscription with valid model (will be Active) + _create_test_subscription(subscription_name, MODEL_REF, users=[sa_user]) + + _wait_reconcile(seconds=10) + + # Verify it starts as Active + cr = _get_cr("maassubscription", subscription_name, namespace=ns) + phase = cr.get("status", {}).get("phase") + log.info(f"Initial phase: {phase}") + assert phase == "Active", f"Expected Active initially, got {phase}" + + # Create API key while Active + api_key = _create_api_key( + oc_token, + name="failed-sub-test", + subscription=subscription_name + ) + + # Verify inference works while Active + log.info("Testing inference while Active...") + r = _inference(api_key, path=MODEL_PATH, model_name=MODEL_NAME) + assert r.status_code == 200, f"Expected 200 
# Manually patch subscription to Failed phase. NOTE(review): datetime.utcnow() used below is deprecated since Python 3.12 and returns a naive timestamp — prefer datetime.now(timezone.utc).
Verify that listing models with an API key bound to a Degraded +        subscription succeeds (HTTP 200) and returns at least the +        subscription's valid model.
{r.text[:500]}" + + data = r.json() + models = data.get("data", []) + log.info(f"✅ /v1/models succeeded, returned {len(models)} models") + + # At least the valid model should be present + assert len(models) > 0, \ + "Expected at least one model from Degraded subscription with valid model" + + finally: + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() + + def test_models_endpoint_with_degraded_subscription_kube_token(self): + """ + Test: /v1/models with Kube token includes models from Degraded subscriptions. + + Kube tokens should return models from all accessible subscriptions, + including Degraded ones. + """ + ns = _ns() + subscription_name = "e2e-degraded-models-kube" + auth_name = "e2e-degraded-models-kube-auth" + sa_name = "e2e-degraded-models-kube-sa" + missing_model = "nonexistent-model-kube" + + try: + oc_token = _create_sa_token(sa_name, namespace="default") + sa_user = f"system:serviceaccount:default:{sa_name}" + + # Create auth policy + _create_test_auth_policy(auth_name, MODEL_REF, users=[sa_user]) + + # Create subscription + _create_test_subscription( + subscription_name, + [MODEL_REF, missing_model], + users=[sa_user] + ) + + _wait_reconcile(seconds=10) + + # Verify Degraded + cr = _get_cr("maassubscription", subscription_name, namespace=ns) + phase = cr.get("status", {}).get("phase") + assert phase == "Degraded", f"Expected Degraded, got {phase}" + + # Call /v1/models with Kube token + url = f"{_maas_api_url()}/v1/models" + headers = { + "Authorization": f"Bearer {oc_token}", + "Content-Type": "application/json" + } + + log.info(f"GET {url} with Kube token") + r = requests.get(url, headers=headers, timeout=TIMEOUT, verify=TLS_VERIFY) + + assert r.status_code == 200, \ + f"Expected 200 with Kube token, got {r.status_code}: {r.text[:500]}" + + data = r.json() + models = data.get("data", []) + log.info(f"Returned 
{len(models)} models") + + # Verify the Degraded subscription is included in model subscriptions + found_degraded_sub = False + for model in models: + subs = model.get("subscriptions", []) + sub_names = [s.get("name") for s in subs] + if subscription_name in sub_names: + log.info(f"✅ Model {model.get('id')} includes Degraded subscription {subscription_name}") + found_degraded_sub = True + break + + assert found_degraded_sub, \ + f"Expected Degraded subscription '{subscription_name}' to be included in /v1/models response, but not found in any model's subscriptions" + + log.info("✅ /v1/models with Kube token includes Degraded subscription") + + finally: + _delete_cr("maassubscription", subscription_name, namespace=ns) + _delete_cr("maasauthpolicy", auth_name, namespace=ns) + _delete_sa(sa_name, namespace="default") + _wait_reconcile() diff --git a/test/e2e/tests/test_subscription_list_endpoints.py b/test/e2e/tests/test_subscription_list_endpoints.py index c5fc8550e..1d0d6d95b 100644 --- a/test/e2e/tests/test_subscription_list_endpoints.py +++ b/test/e2e/tests/test_subscription_list_endpoints.py @@ -7,7 +7,15 @@ subscription_id_header, subscription_description, display_name, priority, model_refs, organization_id, cost_center, labels -Requires same environment setup as test_subscription.py. +Requires: + - GATEWAY_HOST env var + - MAAS_API_BASE_URL env var + - maas-controller deployed with example CRs applied + - oc/kubectl access to create service account tokens + +Environment variables: + See test_helper.py module docstring for shared environment variables. + This file uses no additional file-specific environment variables. 
""" import json @@ -17,7 +25,14 @@ import pytest import requests -from test_subscription import ( +from test_helper import ( + DISTINCT_MODEL_2_REF, + DISTINCT_MODEL_REF, + MODEL_NAMESPACE, + MODEL_REF, + SIMULATOR_SUBSCRIPTION, + TIMEOUT, + TLS_VERIFY, _create_api_key, _create_sa_token, _create_test_auth_policy, @@ -28,13 +43,6 @@ _ns, _sa_to_user, _wait_reconcile, - MODEL_NAMESPACE, - MODEL_REF, - DISTINCT_MODEL_REF, - DISTINCT_MODEL_2_REF, - SIMULATOR_SUBSCRIPTION, - TIMEOUT, - TLS_VERIFY, ) log = logging.getLogger(__name__)