From 340c0052ac04af0517aad1554f4b533c49608e3d Mon Sep 17 00:00:00 2001 From: Radovan Fuchs Date: Fri, 27 Mar 2026 13:38:00 +0100 Subject: [PATCH 01/18] bump rhoai image version in prow tests --- tests/e2e-prow/rhoai/pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 73585cb82..0a6d2c6c0 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -13,7 +13,7 @@ MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # RHOAI llama-stack image -LLAMA_STACK_IMAGE="${LLAMA_STACK_IMAGE:-quay.io/rhoai/odh-llama-stack-core-rhel9:rhoai-3.3}" +LLAMA_STACK_IMAGE="${LLAMA_STACK_IMAGE:-quay.io/rhoai/odh-llama-stack-core-rhel9:rhoai-3.4-ea.2" echo "Using llama-stack image: $LLAMA_STACK_IMAGE" export LLAMA_STACK_IMAGE From a53c2e1b45db8860b34c46f0ebc3ce6584792150 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Sat, 28 Mar 2026 15:55:09 +0100 Subject: [PATCH 02/18] Fix missing closing brace in LLAMA_STACK_IMAGE parameter expansion Co-Authored-By: Claude Opus 4.6 --- tests/e2e-prow/rhoai/pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 0a6d2c6c0..567d07c86 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -13,7 +13,7 @@ MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # RHOAI llama-stack image -LLAMA_STACK_IMAGE="${LLAMA_STACK_IMAGE:-quay.io/rhoai/odh-llama-stack-core-rhel9:rhoai-3.4-ea.2" +LLAMA_STACK_IMAGE="${LLAMA_STACK_IMAGE:-quay.io/rhoai/odh-llama-stack-core-rhel9:rhoai-3.4-ea.2}" echo "Using llama-stack image: $LLAMA_STACK_IMAGE" export LLAMA_STACK_IMAGE From 9105daf5c6ed2bf86f9aeb0cf3f69bdbbe3d93e2 Mon Sep 17 00:00:00 2001 From: are-ces 
<195810094+are-ces@users.noreply.github.com> Date: Sun, 29 Mar 2026 20:01:47 +0200 Subject: [PATCH 03/18] Fix prow pipeline: in-cluster image build, RAG config, port-forward fix - Build llama-stack image in OpenShift internal registry via oc new-build/start-build - Add image-puller role for default SA to pull from internal registry - Add FAISS_VECTOR_STORE_ID and KV_RAG_PATH env vars to lightspeed-stack pod - Add inference, byok_rag, and rag sections to prow lightspeed-stack configs - Use envsubst with specific variable scoping in pipeline-services.sh - Fix free_local_tcp_port to only kill LISTEN sockets (was killing behave process) - Add MCP token secrets and empty OpenAI secret to pipeline.sh - Add rlsapi_v1_infer action to prow RBAC config - Simplify llama-stack.yaml to use pre-built image Co-Authored-By: Claude Opus 4.6 --- .../lightspeed/lightspeed-stack.yaml | 3 +- .../manifests/lightspeed/llama-stack.yaml | 78 +++++++++++++++--- tests/e2e-prow/rhoai/pipeline-services.sh | 21 ++--- tests/e2e-prow/rhoai/pipeline.sh | 80 ++++++++++++++++--- 4 files changed, 150 insertions(+), 32 deletions(-) diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/lightspeed-stack.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/lightspeed-stack.yaml index 55f9a9310..b10da3c5c 100644 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/lightspeed-stack.yaml +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/lightspeed-stack.yaml @@ -22,13 +22,14 @@ spec: secretKeyRef: name: llama-stack-ip-secret key: key - # Same vars as docker-compose / server-mode YAML (${env.FAISS_VECTOR_STORE_ID} in byok_rag). 
- name: FAISS_VECTOR_STORE_ID valueFrom: secretKeyRef: name: faiss-vector-store-secret key: id optional: true + - name: KV_RAG_PATH + value: "/app-root/src/.llama/storage/rag/kv_store.db" image: ${LIGHTSPEED_STACK_IMAGE} ports: - containerPort: 8080 diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml index de22831f6..b04c43063 100644 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml @@ -1,30 +1,65 @@ +# Llama Stack pod: uses pre-built image from in-cluster build. +# +# The image is built by pipeline.sh using oc new-build + test.containerfile. +# Only a small init container extracts the RAG DB into the shared volume. +# +# Requires: ConfigMap llama-stack-config (run.yaml), ConfigMap rag-data (kv_store.db.gz). +# Requires: Image built as ${LLAMA_STACK_IMAGE} (set by pipeline.sh). +# apiVersion: v1 kind: Pod metadata: name: llama-stack-service + labels: + pod: llama-stack-service spec: - imagePullSecrets: - - name: quay-lightspeed-pull-secret + securityContext: + seccompProfile: + type: RuntimeDefault initContainers: - name: setup-rag-data image: busybox:latest + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault command: - /bin/sh - -c - | - mkdir -p /data/storage/rag - gunzip -c /rag-data/kv_store.db.gz > /data/storage/rag/kv_store.db + mkdir -p /data/src/.llama/storage/rag /data/src/.llama/storage/files + chmod -R 777 /data + gunzip -c /rag-data/kv_store.db.gz > /data/src/.llama/storage/rag/kv_store.db + chmod -R 777 /data echo "RAG data extracted successfully" - ls -la /data/storage/rag/ volumeMounts: - - name: app-root + - name: rag-storage mountPath: /data - name: rag-data mountPath: /rag-data containers: - name: llama-stack-container - command: ["llama", "stack", "run", 
"/opt/app-root/run.yaml"] + image: ${LLAMA_STACK_IMAGE} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault + workingDir: /opt/app-root env: + - name: KV_STORE_PATH + value: "/opt/app-root/src/.llama/storage/kv_store.db" + - name: KV_RAG_PATH + value: "/opt/app-root/src/.llama/storage/rag/kv_store.db" + - name: SQL_STORE_PATH + value: "/opt/app-root/src/.llama/storage/sql_store.db" - name: KSVC_URL valueFrom: secretKeyRef: @@ -37,22 +72,43 @@ spec: key: key - name: INFERENCE_MODEL value: "meta-llama/Llama-3.1-8B-Instruct" + - name: OPENAI_API_KEY + valueFrom: + secretKeyRef: + name: openai-api-key-secret + key: key + optional: true + - name: E2E_OPENAI_MODEL + value: "gpt-4o-mini" - name: FAISS_VECTOR_STORE_ID valueFrom: secretKeyRef: name: faiss-vector-store-secret key: id - image: ${LLAMA_STACK_IMAGE} ports: - containerPort: 8321 + readinessProbe: + httpGet: + path: /v1/health + port: 8321 + initialDelaySeconds: 20 + periodSeconds: 5 + failureThreshold: 36 + livenessProbe: + httpGet: + path: /v1/health + port: 8321 + initialDelaySeconds: 120 + periodSeconds: 20 + failureThreshold: 3 volumeMounts: - - name: app-root - mountPath: /opt/app-root/src/.llama + - name: rag-storage + mountPath: /opt/app-root/src/.llama/storage/rag - name: config mountPath: /opt/app-root/run.yaml subPath: run.yaml volumes: - - name: app-root + - name: rag-storage emptyDir: {} - name: config configMap: diff --git a/tests/e2e-prow/rhoai/pipeline-services.sh b/tests/e2e-prow/rhoai/pipeline-services.sh index cd33ab9d5..73aed5671 100755 --- a/tests/e2e-prow/rhoai/pipeline-services.sh +++ b/tests/e2e-prow/rhoai/pipeline-services.sh @@ -1,27 +1,30 @@ #!/bin/bash BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +NAMESPACE="${NAMESPACE:-e2e-rhoai-dsc}" -# Deploy llama-stack -envsubst < "$BASE_DIR/manifests/lightspeed/llama-stack.yaml" | oc apply -f - +# Deploy llama-stack 
(substitute only LLAMA_STACK_IMAGE, leave other ${} intact) +envsubst '${LLAMA_STACK_IMAGE}' < "$BASE_DIR/manifests/lightspeed/llama-stack.yaml" | oc apply -n "$NAMESPACE" -f - oc wait pod/llama-stack-service \ - -n e2e-rhoai-dsc --for=condition=Ready --timeout=600s + -n "$NAMESPACE" --for=condition=Ready --timeout=600s # Get url address of llama-stack pod -oc label pod llama-stack-service pod=llama-stack-service -n e2e-rhoai-dsc +oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" oc expose pod llama-stack-service \ --name=llama-stack-service-svc \ --port=8321 \ --type=ClusterIP \ - -n e2e-rhoai-dsc + -n "$NAMESPACE" -export E2E_LLAMA_HOSTNAME="llama-stack-service-svc.e2e-rhoai-dsc.svc.cluster.local" +export E2E_LLAMA_HOSTNAME="llama-stack-service-svc.${NAMESPACE}.svc.cluster.local" oc create secret generic llama-stack-ip-secret \ --from-literal=key="$E2E_LLAMA_HOSTNAME" \ - -n e2e-rhoai-dsc || echo "Secret exists" + -n "$NAMESPACE" || echo "Secret exists" -# Deploy lightspeed-stack -oc apply -f "$BASE_DIR/manifests/lightspeed/lightspeed-stack.yaml" +# Deploy lightspeed-stack (substitute only LIGHTSPEED_STACK_IMAGE, leave other ${} intact) +LIGHTSPEED_STACK_IMAGE="${LIGHTSPEED_STACK_IMAGE:-quay.io/lightspeed-core/lightspeed-stack:dev-latest}" +export LIGHTSPEED_STACK_IMAGE +envsubst '${LIGHTSPEED_STACK_IMAGE}' < "$BASE_DIR/manifests/lightspeed/lightspeed-stack.yaml" | oc apply -n "$NAMESPACE" -f - diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 567d07c86..59bbe1a1b 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -8,14 +8,15 @@ export RUNNING_PROW=true #======================================== # 1. 
GLOBAL CONFIG #======================================== -NAMESPACE="e2e-rhoai-dsc" +NAMESPACE="${NAMESPACE:-e2e-rhoai-dsc}" +export NAMESPACE MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# RHOAI llama-stack image -LLAMA_STACK_IMAGE="${LLAMA_STACK_IMAGE:-quay.io/rhoai/odh-llama-stack-core-rhel9:rhoai-3.4-ea.2}" -echo "Using llama-stack image: $LLAMA_STACK_IMAGE" -export LLAMA_STACK_IMAGE +# RHOAI llama-stack image (unused when building from source via llama-stack-openai.yaml) +# LLAMA_STACK_IMAGE="${LLAMA_STACK_IMAGE:-quay.io/rhoai/odh-llama-stack-core-rhel9:rhoai-3.4-ea.2}" +# echo "Using llama-stack image: $LLAMA_STACK_IMAGE" +# export LLAMA_STACK_IMAGE #======================================== # 2. ENVIRONMENT SETUP @@ -56,6 +57,22 @@ create_secret() { create_secret hf-token-secret --from-literal=token="$HUGGING_FACE_HUB_TOKEN" create_secret vllm-api-key-secret --from-literal=key="$VLLM_API_KEY" +create_secret openai-api-key-secret --from-literal=key="" + +# MCP token secrets for lightspeed-stack +REPO_ROOT="$(cd "$PIPELINE_DIR/../../.." && pwd)" +if [ -f "$REPO_ROOT/tests/e2e/secrets/mcp-token" ]; then + oc create secret generic mcp-file-auth-token -n "$NAMESPACE" \ + --from-file=token="$REPO_ROOT/tests/e2e/secrets/mcp-token" \ + --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - + echo "✅ mcp-file-auth-token secret applied" +fi +if [ -f "$REPO_ROOT/tests/e2e/secrets/invalid-mcp-token" ]; then + oc create secret generic mcp-invalid-file-auth-token -n "$NAMESPACE" \ + --from-file=token="$REPO_ROOT/tests/e2e/secrets/invalid-mcp-token" \ + --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - + echo "✅ mcp-invalid-file-auth-token secret applied" +fi # Create Quay pull secret for llama-stack images echo "Creating Quay pull secret..." 
@@ -79,7 +96,7 @@ curl -sL -o tool_chat_template_llama3.1_json.jinja \ || { echo "❌ Failed to download jinja template"; exit 1; } oc create configmap vllm-chat-template -n "$NAMESPACE" \ - --from-file=tool_chat_template_llama3.1_json.jinja --dry-run=client -o yaml | oc apply -f - + --from-file=tool_chat_template_llama3.1_json.jinja --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - #======================================== @@ -162,18 +179,18 @@ REPO_ROOT="$(cd "$PIPELINE_DIR/../../.." && pwd)" echo "Creating mock server ConfigMaps..." oc create configmap mock-jwks-script -n "$NAMESPACE" \ --from-file=server.py="$REPO_ROOT/tests/e2e/mock_jwks_server/server.py" \ - --dry-run=client -o yaml | oc apply -f - + --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - oc create configmap mock-mcp-script -n "$NAMESPACE" \ --from-file=server.py="$REPO_ROOT/tests/e2e/mock_mcp_server/server.py" \ - --dry-run=client -o yaml | oc apply -f - + --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - # Deploy mock server pods and services echo "Deploying mock-jwks..." -oc apply -f "$PIPELINE_DIR/manifests/lightspeed/mock-jwks.yaml" +oc apply -n "$NAMESPACE" -f "$PIPELINE_DIR/manifests/lightspeed/mock-jwks.yaml" echo "Deploying mock-mcp..." -oc apply -f "$PIPELINE_DIR/manifests/lightspeed/mock-mcp.yaml" +oc apply -n "$NAMESPACE" -f "$PIPELINE_DIR/manifests/lightspeed/mock-mcp.yaml" # Wait for mock servers to be ready echo "Waiting for mock servers to be ready..." @@ -189,7 +206,39 @@ oc wait pod/mock-jwks pod/mock-mcp \ echo "✅ Mock servers deployed" #======================================== -# 8. DEPLOY LIGHTSPEED STACK AND LLAMA STACK +# 8. 
BUILD LLAMA STACK IMAGE +#======================================== +echo "===== Building llama-stack image =====" +LLAMA_STACK_IMAGE="image-registry.openshift-image-registry.svc:5000/${NAMESPACE}/llama-stack-e2e:latest" +export LLAMA_STACK_IMAGE + +# Create BuildConfig (idempotent) +oc new-build --name=llama-stack-e2e \ + --binary \ + --strategy=docker \ + --image="registry.access.redhat.com/ubi9/ubi-minimal" \ + --to="llama-stack-e2e:latest" \ + -n "$NAMESPACE" 2>/dev/null || echo "BuildConfig llama-stack-e2e already exists" + +# Patch BuildConfig to use test.containerfile instead of Dockerfile +oc patch bc llama-stack-e2e -n "$NAMESPACE" --type=json \ + -p '[{"op":"replace","path":"/spec/strategy/dockerStrategy/dockerfilePath","value":"test.containerfile"}]' 2>/dev/null || true + +# Build from repo root +oc start-build llama-stack-e2e \ + --from-dir="$REPO_ROOT" \ + --follow \ + -n "$NAMESPACE" || { echo "❌ llama-stack image build failed"; exit 1; } + +echo "✅ llama-stack image built: $LLAMA_STACK_IMAGE" + +# Allow default SA to pull from the internal registry +oc policy add-role-to-user system:image-puller \ + system:serviceaccount:${NAMESPACE}:default \ + -n "$NAMESPACE" 2>/dev/null || true + +#======================================== +# 9. 
DEPLOY LIGHTSPEED STACK AND LLAMA STACK #======================================== echo "===== Deploying Services =====" @@ -313,6 +362,15 @@ for i in $(seq 1 36); do fi if [ $i -eq 36 ]; then echo "❌ Port-forward to lightspeed-stack never became ready (3 min)" + echo "" + echo "DEBUG: lightspeed-stack-service logs:" + oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=100 || true + echo "" + echo "DEBUG: llama-stack-service logs:" + oc logs llama-stack-service -n "$NAMESPACE" --tail=100 || true + echo "" + echo "DEBUG: Pod status:" + oc get pods -n "$NAMESPACE" -o wide || true kill $PF_LCS_PID 2>/dev/null || true kill $PF_JWKS_PID 2>/dev/null || true exit 1 From dc88586dc4e6c99404a362470e017631714693f7 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Mon, 30 Mar 2026 08:54:54 +0200 Subject: [PATCH 04/18] Add namespace diagnostic logging to prow pipeline Add DEBUG NS checkpoints to trace when e2e-rhoai-dsc namespace disappears during operator bootstrapping. Co-Authored-By: Claude Opus 4.6 --- tests/e2e-prow/rhoai/pipeline.sh | 2 ++ tests/e2e-prow/rhoai/scripts/bootstrap.sh | 1 + tests/e2e-prow/rhoai/scripts/deploy-vllm.sh | 1 + tests/e2e-prow/rhoai/scripts/gpu-setup.sh | 1 + 4 files changed, 5 insertions(+) diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 59bbe1a1b..13ff0bbd4 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -43,6 +43,7 @@ oc whoami #======================================== echo "===== Creating namespace & secrets =====" oc get ns "$NAMESPACE" >/dev/null 2>&1 || oc create namespace "$NAMESPACE" +echo "DEBUG NS: after create -> $(oc get ns $NAMESPACE -o jsonpath='{.status.phase}' 2>&1)" # Create NFD and NVIDIA namespaces oc apply -f "$PIPELINE_DIR/manifests/namespaces/nfd.yaml" @@ -102,6 +103,7 @@ oc create configmap vllm-chat-template -n "$NAMESPACE" \ #======================================== # 5. 
DEPLOY vLLM #======================================== +echo "DEBUG NS: before pipeline-vllm -> $(oc get ns $NAMESPACE -o jsonpath='{.status.phase}' 2>&1)" echo "===== Deploying vLLM =====" ./pipeline-vllm.sh oc get pods -n "$NAMESPACE" diff --git a/tests/e2e-prow/rhoai/scripts/bootstrap.sh b/tests/e2e-prow/rhoai/scripts/bootstrap.sh index 1718b70e5..8b7c063c5 100755 --- a/tests/e2e-prow/rhoai/scripts/bootstrap.sh +++ b/tests/e2e-prow/rhoai/scripts/bootstrap.sh @@ -94,3 +94,4 @@ echo "--> Applying DataScienceCluster from ds-cluster.yaml..." oc apply -f "$BASE_DIR/manifests/operators/ds-cluster.yaml" echo "All files applied successfully. The DataScienceCluster is now provisioning." +echo "DEBUG NS: after bootstrap -> $(oc get ns e2e-rhoai-dsc -o jsonpath='{.status.phase}' 2>&1)" diff --git a/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh b/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh index 5c3201fa5..ec3292d0b 100755 --- a/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh +++ b/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh @@ -67,6 +67,7 @@ echo "✅ GPU capacity available." echo "GPU nodes ready:" oc get nodes -l nvidia.com/gpu.present=true -o custom-columns=NAME:.metadata.name,GPU:.status.capacity.nvidia\\.com/gpu,INSTANCE:.metadata.labels.node\\.kubernetes\\.io/instance-type +echo "DEBUG NS: before vLLM deploy -> $(oc get ns e2e-rhoai-dsc -o jsonpath='{.status.phase}' 2>&1)" echo "Applying vLLM manifests..." 
envsubst < "$BASE_DIR/manifests/vllm/vllm-runtime-gpu.yaml" | oc apply -f - diff --git a/tests/e2e-prow/rhoai/scripts/gpu-setup.sh b/tests/e2e-prow/rhoai/scripts/gpu-setup.sh index d72d744bb..ab009a1a3 100755 --- a/tests/e2e-prow/rhoai/scripts/gpu-setup.sh +++ b/tests/e2e-prow/rhoai/scripts/gpu-setup.sh @@ -206,3 +206,4 @@ echo "" echo "ClusterPolicy Status:" oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.status.state}' echo "" +echo "DEBUG NS: after gpu-setup -> $(oc get ns e2e-rhoai-dsc -o jsonpath='{.status.phase}' 2>&1)" From 8aaee739e127067ba3d95ac467623fc2cb007256 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Thu, 9 Apr 2026 13:11:14 +0200 Subject: [PATCH 05/18] Remove namespace from cluster-scoped DataScienceCluster CR Co-Authored-By: Claude Opus 4.6 --- tests/e2e-prow/rhoai/manifests/operators/ds-cluster.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/e2e-prow/rhoai/manifests/operators/ds-cluster.yaml b/tests/e2e-prow/rhoai/manifests/operators/ds-cluster.yaml index e9b619726..d57226cc1 100644 --- a/tests/e2e-prow/rhoai/manifests/operators/ds-cluster.yaml +++ b/tests/e2e-prow/rhoai/manifests/operators/ds-cluster.yaml @@ -2,7 +2,6 @@ apiVersion: datasciencecluster.opendatahub.io/v1 kind: DataScienceCluster metadata: name: default-dsc - namespace: e2e-rhoai-dsc spec: serviceMesh: managementState: Managed From 6bc89989258c0459c620a41921197b5b5ed96e87 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Thu, 9 Apr 2026 15:33:12 +0200 Subject: [PATCH 06/18] Add model/provider override env vars to prow pipeline Co-Authored-By: Claude Opus 4.6 --- tests/e2e-prow/rhoai/pipeline.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 13ff0bbd4..b8b7a03a6 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -388,6 +388,8 @@ done export 
E2E_LSC_HOSTNAME="localhost" export E2E_JWKS_HOSTNAME="localhost" +export E2E_DEFAULT_MODEL_OVERRIDE="$MODEL_NAME" +export E2E_DEFAULT_PROVIDER_OVERRIDE="vllm" echo "LCS accessible at: http://$E2E_LSC_HOSTNAME:8080" echo "Mock JWKS accessible at: http://$E2E_JWKS_HOSTNAME:8000" From 1f2bebb2c2fc9e47cf60f1aeac8b41703458d7ad Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Fri, 10 Apr 2026 09:23:49 +0200 Subject: [PATCH 07/18] Fix prow pipeline: run bootstrap before namespace creation The RHOAI operator deletes the e2e-rhoai-dsc namespace during DSC reconciliation. Reorder pipeline to run operator bootstrap first, then create namespace and secrets after DSC settles. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/e2e-prow/rhoai/pipeline.sh | 28 +++++++++++++++-------- tests/e2e-prow/rhoai/scripts/bootstrap.sh | 26 ++++++++++++++++++++- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index b8b7a03a6..3c38ccf2c 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -39,16 +39,22 @@ oc version oc whoami #======================================== -# 3. CREATE NAMESPACE & SECRETS +# 3. 
BOOTSTRAP OPERATORS & DSC (before namespace — DSC operator may delete it) #======================================== -echo "===== Creating namespace & secrets =====" -oc get ns "$NAMESPACE" >/dev/null 2>&1 || oc create namespace "$NAMESPACE" -echo "DEBUG NS: after create -> $(oc get ns $NAMESPACE -o jsonpath='{.status.phase}' 2>&1)" - -# Create NFD and NVIDIA namespaces +echo "===== Bootstrapping operators =====" +# Create NFD and NVIDIA namespaces (needed by operator subscriptions) oc apply -f "$PIPELINE_DIR/manifests/namespaces/nfd.yaml" oc apply -f "$PIPELINE_DIR/manifests/namespaces/nvidia-operator.yaml" +# Install operators and apply DataScienceCluster (this may delete/recreate namespaces) +"$PIPELINE_DIR/scripts/bootstrap.sh" "$PIPELINE_DIR" + +#======================================== +# 4. CREATE NAMESPACE & SECRETS (after DSC settles) +#======================================== +echo "===== Creating namespace & secrets =====" +oc get ns "$NAMESPACE" >/dev/null 2>&1 || oc create namespace "$NAMESPACE" +echo "DEBUG NS: after create -> $(oc get ns $NAMESPACE -o jsonpath='{.status.phase}' 2>&1)" create_secret() { local name=$1; shift @@ -88,7 +94,7 @@ oc secrets link default quay-lightspeed-pull-secret --for=pull -n "$NAMESPACE" 2 #======================================== -# 4. CONFIGMAPS +# 5. CONFIGMAPS #======================================== echo "===== Setting up configmaps =====" @@ -101,11 +107,13 @@ oc create configmap vllm-chat-template -n "$NAMESPACE" \ #======================================== -# 5. DEPLOY vLLM +# 6. 
DEPLOY vLLM (GPU setup + deploy, bootstrap already done) #======================================== -echo "DEBUG NS: before pipeline-vllm -> $(oc get ns $NAMESPACE -o jsonpath='{.status.phase}' 2>&1)" echo "===== Deploying vLLM =====" -./pipeline-vllm.sh +"$PIPELINE_DIR/scripts/gpu-setup.sh" "$PIPELINE_DIR" +source "$PIPELINE_DIR/scripts/fetch-vllm-image.sh" +"$PIPELINE_DIR/scripts/deploy-vllm.sh" "$PIPELINE_DIR" +"$PIPELINE_DIR/scripts/get-vllm-pod-info.sh" oc get pods -n "$NAMESPACE" diff --git a/tests/e2e-prow/rhoai/scripts/bootstrap.sh b/tests/e2e-prow/rhoai/scripts/bootstrap.sh index 8b7c063c5..d5ff94f3d 100755 --- a/tests/e2e-prow/rhoai/scripts/bootstrap.sh +++ b/tests/e2e-prow/rhoai/scripts/bootstrap.sh @@ -20,15 +20,23 @@ wait_for_operator() { } # APPLY OPERATOR SUBSCRIPTIONS +NAMESPACE="${NAMESPACE:-e2e-rhoai-dsc}" +ns_check() { echo "DEBUG NS ($1): $(oc get ns $NAMESPACE -o jsonpath='{.status.phase}' 2>&1)"; } + +ns_check "before operatorgroups" echo "--> Applying OperatorGroups from operatorgroup.yaml..." oc apply -f "$BASE_DIR/manifests/operators/operatorgroup.yaml" +ns_check "after operatorgroups" sleep 10 +ns_check "after 10s sleep (post operatorgroups)" echo "--> Applying Operator Subscriptions from operators.yaml..." oc apply -f "$BASE_DIR/manifests/operators/operators.yaml" +ns_check "after operator subscriptions" sleep 10 +ns_check "after 10s sleep (post subscriptions)" # WAIT FOR GPU OPERATOR NAMESPACE AND OPERATORGROUP echo "--> Ensuring GPU Operator namespace and OperatorGroup are ready..." @@ -50,8 +58,11 @@ echo "--> Waiting for Operators to be installed. This can take several minutes.. 
oc wait --for=condition=established --timeout=300s crd/clusterserviceversions.operators.coreos.com wait_for_operator "operators.coreos.com/servicemeshoperator.openshift-operators" "openshift-operators" "Service Mesh Operator" +ns_check "after Service Mesh Operator ready" wait_for_operator "operators.coreos.com/serverless-operator.openshift-operators" "openshift-operators" "Serverless Operator" +ns_check "after Serverless Operator ready" wait_for_operator "operators.coreos.com/rhods-operator.openshift-operators" "openshift-operators" "RHODS Operator" +ns_check "after RHODS Operator ready" # Verify GPU operator InstallPlan was created before waiting for CSV echo " -> Verifying GPU Operator InstallPlan was created..." @@ -79,19 +90,32 @@ done echo " -> InstallPlan created successfully" wait_for_operator "operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator" "nvidia-gpu-operator" "GPU Operator" +ns_check "after GPU Operator ready" wait_for_operator "operators.coreos.com/nfd.openshift-nfd" "openshift-nfd" "NFD Operator" +ns_check "after NFD Operator ready" echo " -> Waiting for NFD CRD to be established..." oc wait --for=condition=established --timeout=300s crd/nodefeaturediscoveries.nfd.openshift.io echo "--> All operators are ready." +ns_check "after all operators ready" oc get csv -n openshift-operators oc get csv -n nvidia-gpu-operator oc get csv -n openshift-nfd +ns_check "before DSC apply" echo "--> Applying DataScienceCluster from ds-cluster.yaml..." oc apply -f "$BASE_DIR/manifests/operators/ds-cluster.yaml" +ns_check "immediately after DSC apply" +sleep 5 +ns_check "5s after DSC apply" +sleep 10 +ns_check "15s after DSC apply" + +echo "--> Checking DSCInitialization and DSC status..." 
+oc get dsci -A -o jsonpath='{range .items[*]}DSCI: {.metadata.name} applicationsNS: {.spec.applicationsNamespace}{"\n"}{end}' 2>/dev/null || echo "No DSCInitialization found" +oc get dsc -A -o jsonpath='{range .items[*]}DSC: {.metadata.name} phase: {.status.phase}{"\n"}{end}' 2>/dev/null || echo "No DSC status yet" echo "All files applied successfully. The DataScienceCluster is now provisioning." -echo "DEBUG NS: after bootstrap -> $(oc get ns e2e-rhoai-dsc -o jsonpath='{.status.phase}' 2>&1)" +ns_check "end of bootstrap" From c3174992f75cd67e75a9dde0e8e9a51e8ac28d5f Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Fri, 10 Apr 2026 12:36:31 +0200 Subject: [PATCH 08/18] Hardcode NAMESPACE to avoid Prow env override Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/e2e-prow/rhoai/pipeline.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 3c38ccf2c..696c04277 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -8,7 +8,7 @@ export RUNNING_PROW=true #======================================== # 1. GLOBAL CONFIG #======================================== -NAMESPACE="${NAMESPACE:-e2e-rhoai-dsc}" +NAMESPACE="e2e-rhoai-dsc" export NAMESPACE MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -22,10 +22,10 @@ PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # 2. 
ENVIRONMENT SETUP #======================================== echo "===== Setting up environment variables =====" -export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true) -export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true) -export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true) -export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true) +# export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true) +# export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true) +# export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true) +# export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true) [[ -n "$HUGGING_FACE_HUB_TOKEN" ]] && echo "✅ HUGGING_FACE_HUB_TOKEN is set" || { echo "❌ Missing HUGGING_FACE_HUB_TOKEN"; exit 1; } From 4e2a4bdffaf511c67d3ca06f3afd7578b481a2d2 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Fri, 10 Apr 2026 16:11:33 +0200 Subject: [PATCH 09/18] Re-enable secret exports in prow pipeline Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/e2e-prow/rhoai/pipeline.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 696c04277..2f15b2c93 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -22,10 +22,10 @@ PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # 2. 
ENVIRONMENT SETUP #======================================== echo "===== Setting up environment variables =====" -# export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true) -# export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true) -# export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true) -# export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true) +export HUGGING_FACE_HUB_TOKEN=$(cat /var/run/huggingface/hf-token-ces-lcore-test || true) +export VLLM_API_KEY=$(cat /var/run/vllm/vllm-api-key-lcore-test || true) +export QUAY_ROBOT_NAME=$(cat /var/run/quay-aipcc-name/lcore-quay-name-lcore-test || true) +export QUAY_ROBOT_PASSWORD=$(cat /var/run/quay-aipcc-password/lcore-quay-password-lcore-test || true) [[ -n "$HUGGING_FACE_HUB_TOKEN" ]] && echo "✅ HUGGING_FACE_HUB_TOKEN is set" || { echo "❌ Missing HUGGING_FACE_HUB_TOKEN"; exit 1; } From f032f4abef364c37f7dbe1b15fe90b64cd0afff8 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Fri, 17 Apr 2026 12:39:08 +0200 Subject: [PATCH 10/18] Add enrichment and RAG restore to prow llama-stack manifest Rename llama-stack.yaml to llama-stack-prow.yaml and add: - Config enrichment via llama_stack_configuration.py - restore_rag_seed() to re-inflate RAG db after enrichment - PYTHONPATH, lightspeed-stack.yaml mount, rag-data mount - materialize-run-yaml init container - Model/provider overrides in inline_rag e2e tests Co-Authored-By: Claude Opus 4.6 (1M context) --- .../lightspeed/llama-stack-prow.yaml | 203 ++++++++++++++++++ .../manifests/lightspeed/llama-stack.yaml | 118 ---------- tests/e2e-prow/rhoai/pipeline-services.sh | 2 +- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 4 +- 4 files changed, 206 insertions(+), 121 deletions(-) create mode 100644 tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-prow.yaml delete mode 100644 
tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-prow.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-prow.yaml new file mode 100644 index 000000000..bf03f2871 --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-prow.yaml @@ -0,0 +1,203 @@ +# Llama Stack pod for Prow: uses pre-built image with enrichment + RAG restore. +# +# Requires: ConfigMap llama-stack-config (run.yaml), ConfigMap rag-data (kv_store.db.gz), +# ConfigMap lightspeed-stack-config (lightspeed-stack.yaml). +# Requires: Image built as ${LLAMA_STACK_IMAGE} (set by pipeline.sh). +# +apiVersion: v1 +kind: Pod +metadata: + name: llama-stack-service + labels: + pod: llama-stack-service +spec: + securityContext: + seccompProfile: + type: RuntimeDefault + initContainers: + - name: setup-rag-data + image: busybox:latest + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + command: + - /bin/sh + - -c + - | + set -e + mkdir -p /data/src/.llama/storage/rag /data/src/.llama/storage/files /data/.e2e-rag-seed + if [ ! 
-f /rag-data/kv_store.db.gz ]; then + echo "FATAL: missing /rag-data/kv_store.db.gz" + ls -la /rag-data || true + exit 1 + fi + gunzip -c /rag-data/kv_store.db.gz > /data/.e2e-rag-seed/kv_store.db + cp -f /data/.e2e-rag-seed/kv_store.db /data/src/.llama/storage/rag/kv_store.db + chmod -R 777 /data + echo "RAG data extracted successfully" + volumeMounts: + - name: rag-storage + mountPath: /data + - name: rag-data + mountPath: /rag-data + - name: materialize-run-yaml + image: busybox:latest + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + command: + - /bin/sh + - -c + - | + set -e + cp /cm/run.yaml /work/run.yaml + chmod 664 /work/run.yaml + volumeMounts: + - name: config-cm + mountPath: /cm + readOnly: true + - name: rag-storage + mountPath: /work + containers: + - name: llama-stack-container + image: ${LLAMA_STACK_IMAGE} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault + workingDir: /opt/app-root + env: + - name: PYTHONPATH + value: "/opt/app-root/src" + - name: HOME + value: "/opt/app-root/src" + - name: KV_STORE_PATH + value: "/opt/app-root/src/.llama/storage/kv_store.db" + - name: KV_RAG_PATH + value: "/opt/app-root/src/.llama/storage/rag/kv_store.db" + - name: SQL_STORE_PATH + value: "/opt/app-root/src/.llama/storage/sql_store.db" + - name: KSVC_URL + valueFrom: + secretKeyRef: + name: api-url-secret + key: key + - name: VLLM_API_KEY + valueFrom: + secretKeyRef: + name: vllm-api-key-secret + key: key + - name: INFERENCE_MODEL + value: "meta-llama/Llama-3.1-8B-Instruct" + - name: OPENAI_API_KEY + valueFrom: + secretKeyRef: + name: openai-api-key-secret + key: key + optional: true + - name: E2E_OPENAI_MODEL + value: "gpt-4o-mini" + - name: FAISS_VECTOR_STORE_ID + valueFrom: + secretKeyRef: + name: faiss-vector-store-secret + key: id 
+ - name: E2E_LLAMA_HOSTNAME + valueFrom: + secretKeyRef: + name: llama-stack-ip-secret + key: key + command: + - /bin/bash + - -c + - | + set -e + RAG_SEED="/opt/app-root/src/.llama/storage/.e2e-rag-seed/kv_store.db" + RAG_CM_GZ="/opt/app-root/rag-data-cm/kv_store.db.gz" + RAG_WORK="${KV_RAG_PATH:-/opt/app-root/src/.llama/storage/rag/kv_store.db}" + restore_rag_seed() { + mkdir -p "$(dirname "$RAG_WORK")" + if [[ -f "$RAG_CM_GZ" ]]; then + RAG_WORK="$RAG_WORK" RAG_CM_GZ="$RAG_CM_GZ" python3 -c 'import gzip, os, shutil, sys; r, g = os.environ["RAG_WORK"], os.environ["RAG_CM_GZ"]; t = r + ".tmp"; i = gzip.open(g, "rb"); o = open(t, "wb"); shutil.copyfileobj(i, o); i.close(); o.close(); sz = os.path.getsize(t); (sz >= 1048576) or (print("FATAL: RAG from ConfigMap too small:", sz, file=sys.stderr) or sys.exit(1)); os.replace(t, r); os.chmod(r, 0o664)' || exit 1 + elif [[ -f "$RAG_SEED" ]]; then + cp -f "$RAG_SEED" "$RAG_WORK" + chmod 664 "$RAG_WORK" 2>/dev/null || true + fi + } + restore_rag_seed + INPUT_CONFIG="${LLAMA_STACK_CONFIG:-/opt/app-root/run.yaml}" + ENRICHED_CONFIG="/opt/app-root/run.yaml" + LIGHTSPEED_CONFIG="${LIGHTSPEED_CONFIG:-/opt/app-root/lightspeed-stack.yaml}" + ENV_FILE="/opt/app-root/.env" + if [[ -f "$LIGHTSPEED_CONFIG" ]]; then + echo "Enriching llama-stack config..." + ENRICHMENT_FAILED=0 + python3 /opt/app-root/src/llama_stack_configuration.py \ + -c "$LIGHTSPEED_CONFIG" \ + -i "$INPUT_CONFIG" \ + -o "$ENRICHED_CONFIG" \ + -e "$ENV_FILE" 2>&1 || ENRICHMENT_FAILED=1 + if [[ -f "$ENV_FILE" ]]; then + set -a && . 
"$ENV_FILE" && set +a + fi + if [[ -f "$ENRICHED_CONFIG" ]] && [[ "$ENRICHMENT_FAILED" -eq 0 ]]; then + echo "Using enriched config: $ENRICHED_CONFIG" + restore_rag_seed + exec llama stack run "$ENRICHED_CONFIG" + fi + fi + echo "Using original config: $INPUT_CONFIG" + restore_rag_seed + exec llama stack run "$INPUT_CONFIG" + ports: + - containerPort: 8321 + readinessProbe: + httpGet: + path: /v1/health + port: 8321 + initialDelaySeconds: 20 + periodSeconds: 5 + failureThreshold: 36 + livenessProbe: + httpGet: + path: /v1/health + port: 8321 + initialDelaySeconds: 120 + periodSeconds: 20 + failureThreshold: 3 + volumeMounts: + - name: rag-storage + mountPath: /opt/app-root/src/.llama/storage + - name: lightspeed-config + mountPath: /opt/app-root/lightspeed-stack.yaml + subPath: lightspeed-stack.yaml + readOnly: true + - name: rag-data + mountPath: /opt/app-root/rag-data-cm + readOnly: true + volumes: + - name: rag-storage + emptyDir: {} + - name: config-cm + configMap: + name: llama-stack-config + - name: lightspeed-config + configMap: + name: lightspeed-stack-config + - name: rag-data + configMap: + name: rag-data diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml deleted file mode 100644 index b04c43063..000000000 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml +++ /dev/null @@ -1,118 +0,0 @@ -# Llama Stack pod: uses pre-built image from in-cluster build. -# -# The image is built by pipeline.sh using oc new-build + test.containerfile. -# Only a small init container extracts the RAG DB into the shared volume. -# -# Requires: ConfigMap llama-stack-config (run.yaml), ConfigMap rag-data (kv_store.db.gz). -# Requires: Image built as ${LLAMA_STACK_IMAGE} (set by pipeline.sh). 
-# -apiVersion: v1 -kind: Pod -metadata: - name: llama-stack-service - labels: - pod: llama-stack-service -spec: - securityContext: - seccompProfile: - type: RuntimeDefault - initContainers: - - name: setup-rag-data - image: busybox:latest - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: ["ALL"] - runAsNonRoot: true - runAsUser: 65534 - seccompProfile: - type: RuntimeDefault - command: - - /bin/sh - - -c - - | - mkdir -p /data/src/.llama/storage/rag /data/src/.llama/storage/files - chmod -R 777 /data - gunzip -c /rag-data/kv_store.db.gz > /data/src/.llama/storage/rag/kv_store.db - chmod -R 777 /data - echo "RAG data extracted successfully" - volumeMounts: - - name: rag-storage - mountPath: /data - - name: rag-data - mountPath: /rag-data - containers: - - name: llama-stack-container - image: ${LLAMA_STACK_IMAGE} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: ["ALL"] - runAsNonRoot: true - runAsUser: 1001 - seccompProfile: - type: RuntimeDefault - workingDir: /opt/app-root - env: - - name: KV_STORE_PATH - value: "/opt/app-root/src/.llama/storage/kv_store.db" - - name: KV_RAG_PATH - value: "/opt/app-root/src/.llama/storage/rag/kv_store.db" - - name: SQL_STORE_PATH - value: "/opt/app-root/src/.llama/storage/sql_store.db" - - name: KSVC_URL - valueFrom: - secretKeyRef: - name: api-url-secret - key: key - - name: VLLM_API_KEY - valueFrom: - secretKeyRef: - name: vllm-api-key-secret - key: key - - name: INFERENCE_MODEL - value: "meta-llama/Llama-3.1-8B-Instruct" - - name: OPENAI_API_KEY - valueFrom: - secretKeyRef: - name: openai-api-key-secret - key: key - optional: true - - name: E2E_OPENAI_MODEL - value: "gpt-4o-mini" - - name: FAISS_VECTOR_STORE_ID - valueFrom: - secretKeyRef: - name: faiss-vector-store-secret - key: id - ports: - - containerPort: 8321 - readinessProbe: - httpGet: - path: /v1/health - port: 8321 - initialDelaySeconds: 20 - periodSeconds: 5 - failureThreshold: 36 - livenessProbe: - httpGet: - 
path: /v1/health - port: 8321 - initialDelaySeconds: 120 - periodSeconds: 20 - failureThreshold: 3 - volumeMounts: - - name: rag-storage - mountPath: /opt/app-root/src/.llama/storage/rag - - name: config - mountPath: /opt/app-root/run.yaml - subPath: run.yaml - volumes: - - name: rag-storage - emptyDir: {} - - name: config - configMap: - name: llama-stack-config - - name: rag-data - configMap: - name: rag-data diff --git a/tests/e2e-prow/rhoai/pipeline-services.sh b/tests/e2e-prow/rhoai/pipeline-services.sh index 73aed5671..5f3c9a1ae 100755 --- a/tests/e2e-prow/rhoai/pipeline-services.sh +++ b/tests/e2e-prow/rhoai/pipeline-services.sh @@ -4,7 +4,7 @@ BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" NAMESPACE="${NAMESPACE:-e2e-rhoai-dsc}" # Deploy llama-stack (substitute only LLAMA_STACK_IMAGE, leave other ${} intact) -envsubst '${LLAMA_STACK_IMAGE}' < "$BASE_DIR/manifests/lightspeed/llama-stack.yaml" | oc apply -n "$NAMESPACE" -f - +envsubst '${LLAMA_STACK_IMAGE}' < "$BASE_DIR/manifests/lightspeed/llama-stack-prow.yaml" | oc apply -n "$NAMESPACE" -f - oc wait pod/llama-stack-service \ -n "$NAMESPACE" --for=condition=Ready --timeout=600s diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 540e2aab2..372b1f72c 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -292,9 +292,9 @@ cmd_restart_llama_stack() { else # Prow: vLLM Llama Stack image (matches pipeline.sh / pipeline-services.sh) if command -v envsubst >/dev/null 2>&1; then - envsubst < "$MANIFEST_DIR/llama-stack.yaml" | oc apply -n "$NAMESPACE" -f - + envsubst < "$MANIFEST_DIR/llama-stack-prow.yaml" | oc apply -n "$NAMESPACE" -f - else - sed "s|\${LLAMA_STACK_IMAGE}|${LLAMA_STACK_IMAGE:-}|g" "$MANIFEST_DIR/llama-stack.yaml" | + sed "s|\${LLAMA_STACK_IMAGE}|${LLAMA_STACK_IMAGE:-}|g" "$MANIFEST_DIR/llama-stack-prow.yaml" | oc apply -n "$NAMESPACE" -f - fi wait_for_pod "llama-stack-service" 24 From 
a33aff69804b7c564530484587caaea0023105ad Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Thu, 23 Apr 2026 16:34:44 +0200 Subject: [PATCH 11/18] Fix prow e2e pipeline: secret ordering, config path, and test runner - Create llama-stack-ip-secret before deploying the pod to fix chicken-and-egg dependency where the pod requires the secret as a non-optional env var - Add LLAMA_STACK_CONFIG env var pointing to the correct emptyDir mount path where materialize-run-yaml init container places run.yaml - Use make test-e2e-local instead of test-e2e to avoid macOS-incompatible script -c flag - Remove DEBUG NS echo lines from pipeline scripts Co-Authored-By: Claude Opus 4.6 --- .../lightspeed/llama-stack-prow.yaml | 4 +++- tests/e2e-prow/rhoai/pipeline-services.sh | 14 +++++++------- tests/e2e-prow/rhoai/pipeline.sh | 1 - tests/e2e-prow/rhoai/scripts/bootstrap.sh | 19 ------------------- tests/e2e-prow/rhoai/scripts/deploy-vllm.sh | 1 - tests/e2e-prow/rhoai/scripts/gpu-setup.sh | 1 - 6 files changed, 10 insertions(+), 30 deletions(-) diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-prow.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-prow.yaml index bf03f2871..757933c3d 100644 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-prow.yaml +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-prow.yaml @@ -38,7 +38,7 @@ spec: fi gunzip -c /rag-data/kv_store.db.gz > /data/.e2e-rag-seed/kv_store.db cp -f /data/.e2e-rag-seed/kv_store.db /data/src/.llama/storage/rag/kv_store.db - chmod -R 777 /data + chmod -R 777 /data/src /data/.e2e-rag-seed echo "RAG data extracted successfully" volumeMounts: - name: rag-storage @@ -111,6 +111,8 @@ spec: optional: true - name: E2E_OPENAI_MODEL value: "gpt-4o-mini" + - name: LLAMA_STACK_CONFIG + value: "/opt/app-root/src/.llama/storage/run.yaml" - name: FAISS_VECTOR_STORE_ID valueFrom: secretKeyRef: diff --git a/tests/e2e-prow/rhoai/pipeline-services.sh 
b/tests/e2e-prow/rhoai/pipeline-services.sh index 5f3c9a1ae..1db04b6ea 100755 --- a/tests/e2e-prow/rhoai/pipeline-services.sh +++ b/tests/e2e-prow/rhoai/pipeline-services.sh @@ -3,13 +3,19 @@ BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" NAMESPACE="${NAMESPACE:-e2e-rhoai-dsc}" +# Create llama-stack-ip-secret before deploying the pod (it references the secret as an env var) +export E2E_LLAMA_HOSTNAME="llama-stack-service-svc.${NAMESPACE}.svc.cluster.local" +oc create secret generic llama-stack-ip-secret \ + --from-literal=key="$E2E_LLAMA_HOSTNAME" \ + -n "$NAMESPACE" 2>/dev/null || echo "Secret llama-stack-ip-secret exists" + # Deploy llama-stack (substitute only LLAMA_STACK_IMAGE, leave other ${} intact) envsubst '${LLAMA_STACK_IMAGE}' < "$BASE_DIR/manifests/lightspeed/llama-stack-prow.yaml" | oc apply -n "$NAMESPACE" -f - oc wait pod/llama-stack-service \ -n "$NAMESPACE" --for=condition=Ready --timeout=600s -# Get url address of llama-stack pod +# Expose llama-stack service oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" oc expose pod llama-stack-service \ @@ -18,12 +24,6 @@ oc expose pod llama-stack-service \ --type=ClusterIP \ -n "$NAMESPACE" -export E2E_LLAMA_HOSTNAME="llama-stack-service-svc.${NAMESPACE}.svc.cluster.local" - -oc create secret generic llama-stack-ip-secret \ - --from-literal=key="$E2E_LLAMA_HOSTNAME" \ - -n "$NAMESPACE" || echo "Secret exists" - # Deploy lightspeed-stack (substitute only LIGHTSPEED_STACK_IMAGE, leave other ${} intact) LIGHTSPEED_STACK_IMAGE="${LIGHTSPEED_STACK_IMAGE:-quay.io/lightspeed-core/lightspeed-stack:dev-latest}" export LIGHTSPEED_STACK_IMAGE diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 2f15b2c93..505c07815 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -54,7 +54,6 @@ oc apply -f "$PIPELINE_DIR/manifests/namespaces/nvidia-operator.yaml" #======================================== echo "===== Creating 
namespace & secrets =====" oc get ns "$NAMESPACE" >/dev/null 2>&1 || oc create namespace "$NAMESPACE" -echo "DEBUG NS: after create -> $(oc get ns $NAMESPACE -o jsonpath='{.status.phase}' 2>&1)" create_secret() { local name=$1; shift diff --git a/tests/e2e-prow/rhoai/scripts/bootstrap.sh b/tests/e2e-prow/rhoai/scripts/bootstrap.sh index d5ff94f3d..ae8444ca8 100755 --- a/tests/e2e-prow/rhoai/scripts/bootstrap.sh +++ b/tests/e2e-prow/rhoai/scripts/bootstrap.sh @@ -20,23 +20,15 @@ wait_for_operator() { } # APPLY OPERATOR SUBSCRIPTIONS -NAMESPACE="${NAMESPACE:-e2e-rhoai-dsc}" -ns_check() { echo "DEBUG NS ($1): $(oc get ns $NAMESPACE -o jsonpath='{.status.phase}' 2>&1)"; } - -ns_check "before operatorgroups" echo "--> Applying OperatorGroups from operatorgroup.yaml..." oc apply -f "$BASE_DIR/manifests/operators/operatorgroup.yaml" -ns_check "after operatorgroups" sleep 10 -ns_check "after 10s sleep (post operatorgroups)" echo "--> Applying Operator Subscriptions from operators.yaml..." oc apply -f "$BASE_DIR/manifests/operators/operators.yaml" -ns_check "after operator subscriptions" sleep 10 -ns_check "after 10s sleep (post subscriptions)" # WAIT FOR GPU OPERATOR NAMESPACE AND OPERATORGROUP echo "--> Ensuring GPU Operator namespace and OperatorGroup are ready..." @@ -58,11 +50,8 @@ echo "--> Waiting for Operators to be installed. This can take several minutes.. 
oc wait --for=condition=established --timeout=300s crd/clusterserviceversions.operators.coreos.com wait_for_operator "operators.coreos.com/servicemeshoperator.openshift-operators" "openshift-operators" "Service Mesh Operator" -ns_check "after Service Mesh Operator ready" wait_for_operator "operators.coreos.com/serverless-operator.openshift-operators" "openshift-operators" "Serverless Operator" -ns_check "after Serverless Operator ready" wait_for_operator "operators.coreos.com/rhods-operator.openshift-operators" "openshift-operators" "RHODS Operator" -ns_check "after RHODS Operator ready" # Verify GPU operator InstallPlan was created before waiting for CSV echo " -> Verifying GPU Operator InstallPlan was created..." @@ -90,32 +79,24 @@ done echo " -> InstallPlan created successfully" wait_for_operator "operators.coreos.com/gpu-operator-certified.nvidia-gpu-operator" "nvidia-gpu-operator" "GPU Operator" -ns_check "after GPU Operator ready" wait_for_operator "operators.coreos.com/nfd.openshift-nfd" "openshift-nfd" "NFD Operator" -ns_check "after NFD Operator ready" echo " -> Waiting for NFD CRD to be established..." oc wait --for=condition=established --timeout=300s crd/nodefeaturediscoveries.nfd.openshift.io echo "--> All operators are ready." -ns_check "after all operators ready" oc get csv -n openshift-operators oc get csv -n nvidia-gpu-operator oc get csv -n openshift-nfd -ns_check "before DSC apply" echo "--> Applying DataScienceCluster from ds-cluster.yaml..." oc apply -f "$BASE_DIR/manifests/operators/ds-cluster.yaml" -ns_check "immediately after DSC apply" sleep 5 -ns_check "5s after DSC apply" sleep 10 -ns_check "15s after DSC apply" echo "--> Checking DSCInitialization and DSC status..." 
oc get dsci -A -o jsonpath='{range .items[*]}DSCI: {.metadata.name} applicationsNS: {.spec.applicationsNamespace}{"\n"}{end}' 2>/dev/null || echo "No DSCInitialization found" oc get dsc -A -o jsonpath='{range .items[*]}DSC: {.metadata.name} phase: {.status.phase}{"\n"}{end}' 2>/dev/null || echo "No DSC status yet" echo "All files applied successfully. The DataScienceCluster is now provisioning." -ns_check "end of bootstrap" diff --git a/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh b/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh index ec3292d0b..5c3201fa5 100755 --- a/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh +++ b/tests/e2e-prow/rhoai/scripts/deploy-vllm.sh @@ -67,7 +67,6 @@ echo "✅ GPU capacity available." echo "GPU nodes ready:" oc get nodes -l nvidia.com/gpu.present=true -o custom-columns=NAME:.metadata.name,GPU:.status.capacity.nvidia\\.com/gpu,INSTANCE:.metadata.labels.node\\.kubernetes\\.io/instance-type -echo "DEBUG NS: before vLLM deploy -> $(oc get ns e2e-rhoai-dsc -o jsonpath='{.status.phase}' 2>&1)" echo "Applying vLLM manifests..." 
envsubst < "$BASE_DIR/manifests/vllm/vllm-runtime-gpu.yaml" | oc apply -f - diff --git a/tests/e2e-prow/rhoai/scripts/gpu-setup.sh b/tests/e2e-prow/rhoai/scripts/gpu-setup.sh index ab009a1a3..d72d744bb 100755 --- a/tests/e2e-prow/rhoai/scripts/gpu-setup.sh +++ b/tests/e2e-prow/rhoai/scripts/gpu-setup.sh @@ -206,4 +206,3 @@ echo "" echo "ClusterPolicy Status:" oc get clusterpolicy gpu-cluster-policy -o jsonpath='{.status.state}' echo "" -echo "DEBUG NS: after gpu-setup -> $(oc get ns e2e-rhoai-dsc -o jsonpath='{.status.phase}' 2>&1)" From 3422717b748c2f1ef9c20e32388a8bf286e38586 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:10:12 +0200 Subject: [PATCH 12/18] Fix e2e-ops restart failures and mock-jwks port-forward - Replace unfiltered envsubst with sed in e2e-ops.sh restart commands to prevent blanking $VAR references in embedded bash scripts - Add mock-jwks port-forward management (kill/restart/health check) so RBAC and MCP tests don't fail with connection refused on :8000 - Restart mock-jwks port-forward as part of lightspeed restart - Increase vLLM max-model-len from 2048 to 32768 to avoid context length errors with RAG queries Co-Authored-By: Claude Opus 4.6 --- .../manifests/vllm/vllm-runtime-cpu.yaml | 2 +- .../manifests/vllm/vllm-runtime-gpu.yaml | 2 +- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 105 +++++++++++++++--- 3 files changed, 92 insertions(+), 17 deletions(-) diff --git a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml index 4c3f5e7bd..990dc2df3 100644 --- a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml +++ b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml @@ -24,7 +24,7 @@ spec: - --port - "8080" - --max-model-len - - "2048" + - "32768" image: quay.io/rh-ee-cpompeia/vllm-cpu:latest name: kserve-container env: diff --git a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml 
b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml index b7597991c..e925890d2 100644 --- a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml +++ b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml @@ -24,7 +24,7 @@ spec: - --port - "8080" - --max-model-len - - "2048" + - "32768" - --gpu-memory-utilization - "0.9" image: ${VLLM_IMAGE} diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 372b1f72c..278d5f8f8 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -32,6 +32,7 @@ MANIFEST_DIR="$SCRIPT_DIR/../manifests/lightspeed" # Written by pipeline.sh when it starts LCS port-forward; e2e-ops kills this PID before rebinding 8080. E2E_LSC_PORT_FORWARD_PID_FILE="${E2E_LSC_PORT_FORWARD_PID_FILE:-/tmp/e2e-lightspeed-port-forward.pid}" E2E_LLAMA_PORT_FORWARD_PID_FILE="${E2E_LLAMA_PORT_FORWARD_PID_FILE:-/tmp/e2e-llama-port-forward.pid}" +E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks-port-forward.pid}" # ============================================================================ # Helper functions @@ -148,6 +149,23 @@ kill_stale_llama_forward() { free_local_tcp_port "$port" } +# Kill anything likely to hold the mock-jwks local forward (localhost:8000). +kill_stale_jwks_forward() { + local port="${1:-8000}" + local saved_pf + if [[ -f "$E2E_JWKS_PORT_FORWARD_PID_FILE" ]]; then + read -r saved_pf <"$E2E_JWKS_PORT_FORWARD_PID_FILE" 2>/dev/null || true + if [[ "$saved_pf" =~ ^[0-9]+$ ]]; then + kill -9 "$saved_pf" 2>/dev/null || true + fi + fi + pkill -9 -f "port-forward.*mock-jwks.*${port}:${port}" 2>/dev/null || true + pkill -9 -f "oc port-forward svc/mock-jwks ${port}:${port}" 2>/dev/null || true + free_local_tcp_port "$port" + sleep 1 + free_local_tcp_port "$port" +} + # After oc port-forward dies in <2s, show recent oc stderr from the log file. 
e2e_ops_emit_port_forward_immediate_failure_diag() { echo "[e2e-ops] /tmp/port-forward.log (tail 25):" @@ -242,16 +260,12 @@ cmd_restart_lightspeed() { sleep 2 } - # Apply manifest (expand LIGHTSPEED_STACK_IMAGE) + # Apply manifest (expand LIGHTSPEED_STACK_IMAGE only; filter prevents blanking other $VAR refs) LIGHTSPEED_STACK_IMAGE="${LIGHTSPEED_STACK_IMAGE:-quay.io/lightspeed-core/lightspeed-stack:dev-latest}" export LIGHTSPEED_STACK_IMAGE _ls_manifest="$MANIFEST_DIR/lightspeed-stack.yaml" - if command -v envsubst >/dev/null 2>&1; then - envsubst < "$_ls_manifest" | oc apply -n "$NAMESPACE" -f - - else - sed "s|\${LIGHTSPEED_STACK_IMAGE}|${LIGHTSPEED_STACK_IMAGE}|g" "$_ls_manifest" | - oc apply -n "$NAMESPACE" -f - - fi + sed "s|\${LIGHTSPEED_STACK_IMAGE}|${LIGHTSPEED_STACK_IMAGE}|g" "$_ls_manifest" | + oc apply -n "$NAMESPACE" -f - # Wait for pod to be ready (TCP probe passes when app listens on 8080) wait_for_pod "lightspeed-stack-service" 40 @@ -259,9 +273,10 @@ cmd_restart_lightspeed() { # Re-label pod for service discovery oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite - # Re-establish port-forward + # Re-establish port-forwards cmd_restart_port_forward - + cmd_restart_jwks_port_forward || echo "⚠️ Mock JWKS port-forward failed (RBAC tests may fail)" + echo "✓ Lightspeed restart complete" } @@ -291,12 +306,9 @@ cmd_restart_llama_stack() { fi else # Prow: vLLM Llama Stack image (matches pipeline.sh / pipeline-services.sh) - if command -v envsubst >/dev/null 2>&1; then - envsubst < "$MANIFEST_DIR/llama-stack-prow.yaml" | oc apply -n "$NAMESPACE" -f - - else - sed "s|\${LLAMA_STACK_IMAGE}|${LLAMA_STACK_IMAGE:-}|g" "$MANIFEST_DIR/llama-stack-prow.yaml" | - oc apply -n "$NAMESPACE" -f - - fi + # Use sed instead of envsubst to avoid blanking $VAR references in embedded bash scripts + sed "s|\${LLAMA_STACK_IMAGE}|${LLAMA_STACK_IMAGE:-}|g" "$MANIFEST_DIR/llama-stack-prow.yaml" | + oc apply -n "$NAMESPACE" -f - 
     wait_for_pod "llama-stack-service" 24
     echo "Labeling pod for service..."
     oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite
@@ -453,6 +465,66 @@ cmd_restart_llama_port_forward() {
     return 1
 }
 
+cmd_restart_jwks_port_forward() {
+    local local_port="${LOCAL_JWKS_PORT:-8000}"
+    local remote_port="${REMOTE_JWKS_PORT:-8000}"
+    local max_attempts=4
+    local pf_pid
+    local jwks_pf_log="/tmp/port-forward-jwks.log"
+
+    # Check if existing forward is still alive
+    if [[ -f "$E2E_JWKS_PORT_FORWARD_PID_FILE" ]]; then
+        local saved_pf
+        read -r saved_pf <"$E2E_JWKS_PORT_FORWARD_PID_FILE" 2>/dev/null || true
+        if [[ "$saved_pf" =~ ^[0-9]+$ ]] && kill -0 "$saved_pf" 2>/dev/null; then
+            local http_code
+            http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 "http://127.0.0.1:$local_port/tokens" 2>/dev/null) || http_code="000"
+            if [[ "$http_code" != "000" ]]; then
+                echo "✓ Mock JWKS port-forward already healthy (PID: $saved_pf)"
+                return 0
+            fi
+        fi
+    fi
+
+    echo "Re-establishing mock-jwks port-forward on $local_port:$remote_port..."
+
+    for ((attempt=1; attempt<=max_attempts; attempt++)); do
+        kill_stale_jwks_forward "$local_port"
+        sleep 2
+
+        echo "JWKS port-forward attempt $attempt/$max_attempts"
+
+        : >"$jwks_pf_log"
+        nohup oc port-forward svc/mock-jwks "$local_port:$remote_port" -n "$NAMESPACE" \
+            >"$jwks_pf_log" 2>&1 &
+        pf_pid=$!
+        disown "$pf_pid" 2>/dev/null || true
+        sleep 3
+
+        if ! kill -0 "$pf_pid" 2>/dev/null; then
+            echo "JWKS port-forward process exited immediately"
+            continue
+        fi
+
+        local http_code
+        http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:$local_port/tokens" 2>/dev/null) || http_code="000"
+        if [[ "$http_code" != "000" ]]; then
+            echo "$pf_pid" >"$E2E_JWKS_PORT_FORWARD_PID_FILE"
+            echo "✓ Mock JWKS port-forward established (PID: $pf_pid)"
+            return 0
+        fi
+
+        if [[ $attempt -lt $max_attempts ]]; then
+            echo "JWKS forward attempt $attempt failed, retrying..."
+ kill -9 "$pf_pid" 2>/dev/null || true + sleep 2 + fi + done + + echo "Failed to establish mock-jwks port-forward on :$local_port" + return 1 +} + cmd_wait_for_pod() { local pod_name="${1:?Pod name required}" local max_attempts="${2:-24}" @@ -515,6 +587,9 @@ case "$COMMAND" in restart-llama-port-forward) cmd_restart_llama_port_forward ;; + restart-jwks-port-forward) + cmd_restart_jwks_port_forward + ;; restart-port-forward) cmd_restart_port_forward ;; From 56c70f5d011a05c6cb856cff4c14e02b316aebd1 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:32:06 +0200 Subject: [PATCH 13/18] Improve port-forward resilience for Prow E2E tests verify_connectivity now checks /v1/models returns 200 (not just /readiness) to ensure the app is fully initialized before declaring success. before_scenario in the test framework probes the port-forward before each scenario and auto-restarts it via e2e-ops if dead. Co-Authored-By: Claude Opus 4.6 --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 23 ++++++++++------ tests/e2e/features/environment.py | 35 +++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index 278d5f8f8..c3a3a7e05 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -190,21 +190,28 @@ verify_connectivity() { local max_attempts="${1:-6}" local local_port="${LOCAL_PORT:-8080}" local http_code="" - + for ((attempt=1; attempt<=max_attempts; attempt++)); do - # Check readiness endpoint - accept 200 or 401 (auth required but service is up) + # First check /readiness to see if port-forward is alive (accept 200 or 401) http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://localhost:$local_port/readiness" 2>/dev/null) || http_code="000" - + if [[ "$http_code" == "200" || "$http_code" == "401" ]]; then - return 0 + # Port-forward works; now 
verify the app is fully initialized by hitting + # a real endpoint. /v1/models requires the Llama Stack handshake to complete. + local models_code + models_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 "http://localhost:$local_port/v1/models" 2>/dev/null) || models_code="000" + if [[ "$models_code" == "200" ]]; then + return 0 + fi + echo "[e2e-ops] /readiness=$http_code but /v1/models=$models_code (app still initializing, attempt $attempt/$max_attempts)" fi - + if [[ $attempt -lt $max_attempts ]]; then - sleep 2 + sleep 5 fi done - - echo "Connectivity check failed (HTTP: ${http_code:-unknown})" + + echo "Connectivity check failed (readiness: ${http_code:-unknown})" return 1 } diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index e519217e0..42d9bd0dd 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -24,6 +24,7 @@ from tests.e2e.utils.prow_utils import ( restart_pod, restore_llama_stack_pod, + run_e2e_ops, ) from tests.e2e.utils.utils import ( is_prow_environment, @@ -133,6 +134,35 @@ def before_all(context: Context) -> None: ) +def _ensure_prow_port_forward(context: Context) -> None: + """Check that the lightspeed port-forward is alive; restart it if dead. + + Probes localhost:{E2E_LSC_PORT}/readiness — if it fails, calls e2e-ops + restart-port-forward to re-establish the tunnel before the scenario runs. 
+ """ + host = os.getenv("E2E_LSC_HOSTNAME", "localhost") + port = os.getenv("E2E_LSC_PORT", "8080") + url = f"http://{host}:{port}/readiness" + try: + resp = requests.get(url, timeout=5) + if resp.status_code in (200, 401): + return + except requests.RequestException: + pass + + print("[before_scenario] Port-forward appears dead, restarting...") + try: + result = run_e2e_ops("restart-port-forward", timeout=60) + print(result.stdout, end="") + if result.returncode != 0: + print(result.stderr, end="") + print("[before_scenario] Warning: port-forward restart failed") + else: + print("[before_scenario] Port-forward re-established") + except subprocess.TimeoutExpired: + print("[before_scenario] Warning: port-forward restart timed out") + + def before_scenario(context: Context, scenario: Scenario) -> None: """Run before each scenario is run. @@ -157,6 +187,11 @@ def before_scenario(context: Context, scenario: Scenario) -> None: scenario.skip("Skipped in library mode (no separate llama-stack container)") return + # In Prow, verify the lightspeed port-forward is alive before each scenario. + # Port-forwards can silently die between scenarios (e.g. pod restart, TCP reset). + if is_prow_environment(): + _ensure_prow_port_forward(context) + context.scenario_lightspeed_override_active = False context.lightspeed_stack_skip_restart = False From 986c6f36e2396f5d37834c02788d2f1ad950b39c Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Fri, 24 Apr 2026 16:43:42 +0200 Subject: [PATCH 14/18] Fix update-configmap cascade failure and surface oc errors Replace fragile oc delete + oc create with oc create --dry-run | oc apply so a failed update leaves the ConfigMap intact instead of deleted. The old approach caused 156 errored scenarios: if create failed after delete succeeded, the ConfigMap was gone and every subsequent update also failed. Also print stdout/stderr from e2e-ops on failure so the actual oc error is visible in test logs. 
Co-Authored-By: Claude Opus 4.6 --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 26 ++++++++++++++++--------- tests/e2e/utils/prow_utils.py | 4 +++- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index c3a3a7e05..e06ceac2c 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -541,16 +541,24 @@ cmd_wait_for_pod() { cmd_update_configmap() { local configmap_name="${1:?ConfigMap name required}" local source_file="${2:?Source file required}" - + echo "Updating ConfigMap $configmap_name from $source_file..." - - # Delete existing configmap - oc delete configmap "$configmap_name" -n "$NAMESPACE" --ignore-not-found=true - - # Create new configmap from the source file - oc create configmap "$configmap_name" -n "$NAMESPACE" \ - --from-file="lightspeed-stack.yaml=$source_file" - + + if [[ ! -f "$source_file" ]]; then + echo "ERROR: source file does not exist: $source_file" >&2 + return 1 + fi + + # Use dry-run + apply to avoid the delete-then-create race. + # If delete succeeds but create fails the ConfigMap is gone and every + # subsequent attempt cascades into failure. + if ! oc create configmap "$configmap_name" -n "$NAMESPACE" \ + --from-file="lightspeed-stack.yaml=$source_file" \ + --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f -; then + echo "ERROR: oc apply for ConfigMap $configmap_name failed" >&2 + return 1 + fi + echo "✓ ConfigMap $configmap_name updated successfully" } diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index 60e9a7a71..ca06727ea 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -197,7 +197,7 @@ def remove_configmap_backup(backup_key: str) -> None: def _recreate_configmap(configmap_name: str, source_file: str) -> None: - """Delete and recreate a ConfigMap from a file. + """Update a ConfigMap from a file via oc apply. 
Args: configmap_name: Name of the ConfigMap. @@ -205,6 +205,8 @@ def _recreate_configmap(configmap_name: str, source_file: str) -> None: """ result = run_e2e_ops("update-configmap", [configmap_name, source_file], timeout=60) if result.returncode != 0: + print(f"update-configmap stdout: {result.stdout}") + print(f"update-configmap stderr: {result.stderr}") raise subprocess.CalledProcessError( result.returncode, "update-configmap", result.stderr ) From 54c289be83180578eea318c4805b0b542a93a1fa Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:09:44 +0200 Subject: [PATCH 15/18] Fix verify_connectivity for auth-enabled Prow environments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On Prow, both /readiness and /v1/models return 401 when auth is enabled. The previous fix only accepted 200 from /v1/models, causing connectivity checks to always fail and port-forward to be declared dead. Accept 401 as valid — it proves the full app stack is running, not just the socket. Co-Authored-By: Claude Opus 4.6 --- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index e06ceac2c..c083beefb 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -198,9 +198,11 @@ verify_connectivity() { if [[ "$http_code" == "200" || "$http_code" == "401" ]]; then # Port-forward works; now verify the app is fully initialized by hitting # a real endpoint. /v1/models requires the Llama Stack handshake to complete. + # Accept 200 (no auth) or 401 (auth enabled) — both prove the full app + # stack is up, not just the TCP socket. 
local models_code models_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 "http://localhost:$local_port/v1/models" 2>/dev/null) || models_code="000" - if [[ "$models_code" == "200" ]]; then + if [[ "$models_code" == "200" || "$models_code" == "401" ]]; then return 0 fi echo "[e2e-ops] /readiness=$http_code but /v1/models=$models_code (app still initializing, attempt $attempt/$max_attempts)" From 90da0d1b1105d77f41935d7bb346c37122e5947c Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Mon, 27 Apr 2026 15:57:09 +0200 Subject: [PATCH 16/18] Fix Llama Stack disruption cascade and pipeline port-forward coordination Llama Stack disruption tests left the pod dead after the feature because Behave clears custom context attributes between scenarios, so after_feature never saw llama_stack_was_running=True. This caused 59+ subsequent scenarios to cascade-fail with Connection refused. Three fixes: - Store was_running in module-level state (survives Behave context resets) so after_feature reliably triggers _restore_llama_stack - Add restart-lightspeed fallback in before_scenario when port-forward alone fails (recovers from dead pods, not just dead tunnels) - Align pipeline.sh with pipeline-konflux.sh: export PID file paths for e2e-ops.sh, start Llama Stack port-forward on :8321, and use lsof/fuser fallback for port cleanup in minimal images Co-Authored-By: Claude Opus 4.6 --- tests/e2e-prow/rhoai/pipeline.sh | 63 +++++++++++++++++++++++-- tests/e2e-prow/rhoai/scripts/e2e-ops.sh | 32 ++++++++++--- tests/e2e/features/environment.py | 35 ++++++++++---- tests/e2e/features/steps/health.py | 25 +++++++++- 4 files changed, 135 insertions(+), 20 deletions(-) diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 505c07815..39f6fcdba 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -339,6 +339,15 @@ oc describe pod llama-stack-service -n "$NAMESPACE" || true 
#======================================== # 9. EXPOSE SERVICE & START PORT-FORWARD #======================================== +# Export PID file paths so e2e-ops.sh can find and kill stale port-forwards +# during test-triggered pod restarts (matches pipeline-konflux.sh). +export E2E_LSC_PORT_FORWARD_PID_FILE="${E2E_LSC_PORT_FORWARD_PID_FILE:-/tmp/e2e-lightspeed-port-forward.pid}" +export E2E_LLAMA_PORT_FORWARD_PID_FILE="${E2E_LLAMA_PORT_FORWARD_PID_FILE:-/tmp/e2e-llama-port-forward.pid}" +export E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks-port-forward.pid}" +rm -f "$E2E_LSC_PORT_FORWARD_PID_FILE" +rm -f "$E2E_LLAMA_PORT_FORWARD_PID_FILE" +rm -f "$E2E_JWKS_PORT_FORWARD_PID_FILE" + oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n $NAMESPACE oc expose pod lightspeed-stack-service \ @@ -347,20 +356,36 @@ oc expose pod lightspeed-stack-service \ --type=ClusterIP \ -n $NAMESPACE -# Kill any existing processes on ports 8080 and 8000 -echo "Checking for existing processes on ports 8080 and 8000..." -lsof -ti:8080 | xargs kill -9 2>/dev/null || true -lsof -ti:8000 | xargs kill -9 2>/dev/null || true +# Kill any existing processes on ports 8080, 8000, and 8321 (lsof may be missing in minimal images) +echo "Checking for existing processes on ports 8080, 8000, and 8321..." +if command -v lsof >/dev/null 2>&1; then + lsof -ti:8080 | xargs kill -9 2>/dev/null || true + lsof -ti:8000 | xargs kill -9 2>/dev/null || true + lsof -ti:8321 | xargs kill -9 2>/dev/null || true +elif command -v fuser >/dev/null 2>&1; then + fuser -k 8080/tcp 2>/dev/null || true + fuser -k 8000/tcp 2>/dev/null || true + fuser -k 8321/tcp 2>/dev/null || true +fi # Start port-forward for lightspeed-stack echo "Starting port-forward for lightspeed-stack..." oc port-forward svc/lightspeed-stack-service-svc 8080:8080 -n $NAMESPACE & PF_LCS_PID=$! 
+echo "$PF_LCS_PID" >"$E2E_LSC_PORT_FORWARD_PID_FILE" # Start port-forward for mock-jwks (needed for RBAC tests to get tokens) echo "Starting port-forward for mock-jwks..." oc port-forward svc/mock-jwks 8000:8000 -n $NAMESPACE & PF_JWKS_PID=$! +echo "$PF_JWKS_PID" >"$E2E_JWKS_PORT_FORWARD_PID_FILE" + +# Behave steps that call Llama Stack directly (MCP toolgroups, shields, disrupt/restore) +# need localhost:8321. Without this forward those tests hit "Connection refused". +echo "Starting port-forward for llama-stack..." +oc port-forward svc/llama-stack-service-svc 8321:8321 -n $NAMESPACE & +PF_LLAMA_PID=$! +echo "$PF_LLAMA_PID" >"$E2E_LLAMA_PORT_FORWARD_PID_FILE" # Wait for port-forward to be usable (app may not be listening immediately; port-forward can drop) echo "Waiting for port-forward to lightspeed-stack to be ready..." @@ -382,6 +407,7 @@ for i in $(seq 1 36); do oc get pods -n "$NAMESPACE" -o wide || true kill $PF_LCS_PID 2>/dev/null || true kill $PF_JWKS_PID 2>/dev/null || true + kill $PF_LLAMA_PID 2>/dev/null || true exit 1 fi # If port-forward process died, restart it (e.g. "connection refused" / "lost connection to pod") @@ -389,6 +415,31 @@ for i in $(seq 1 36); do echo "Port-forward died, restarting (attempt $i)..." oc port-forward svc/lightspeed-stack-service-svc 8080:8080 -n $NAMESPACE & PF_LCS_PID=$! + echo "$PF_LCS_PID" >"$E2E_LSC_PORT_FORWARD_PID_FILE" + fi + sleep 5 +done + +# Wait for Llama Stack port-forward to be usable +echo "Waiting for Llama Stack port-forward (localhost:8321 /v1/health)..." 
+for i in $(seq 1 36); do + if curl -sf http://localhost:8321/v1/health > /dev/null 2>&1; then + echo "✅ Llama Stack port-forward ready after $(( i * 5 ))s" + break + fi + if [ $i -eq 36 ]; then + echo "❌ Port-forward to llama-stack never became healthy (3 min)" + oc logs llama-stack-service -n "$NAMESPACE" --tail=100 || true + kill $PF_LCS_PID 2>/dev/null || true + kill $PF_JWKS_PID 2>/dev/null || true + kill $PF_LLAMA_PID 2>/dev/null || true + exit 1 + fi + if ! kill -0 $PF_LLAMA_PID 2>/dev/null; then + echo "Llama port-forward died, restarting (attempt $i)..." + oc port-forward svc/llama-stack-service-svc 8321:8321 -n $NAMESPACE & + PF_LLAMA_PID=$! + echo "$PF_LLAMA_PID" >"$E2E_LLAMA_PORT_FORWARD_PID_FILE" fi sleep 5 done @@ -399,6 +450,7 @@ export E2E_DEFAULT_MODEL_OVERRIDE="$MODEL_NAME" export E2E_DEFAULT_PROVIDER_OVERRIDE="vllm" echo "LCS accessible at: http://$E2E_LSC_HOSTNAME:8080" echo "Mock JWKS accessible at: http://$E2E_JWKS_HOSTNAME:8000" +echo "Llama Stack accessible at: http://localhost:8321" @@ -421,8 +473,11 @@ TEST_EXIT_CODE=$(cat "$E2E_EXIT_CODE_FILE" 2>/dev/null || echo 1) # Kill first so wait doesn't block (if a port-forward is still running, wait would hang) kill $PF_LCS_PID 2>/dev/null || true kill $PF_JWKS_PID 2>/dev/null || true +kill $PF_LLAMA_PID 2>/dev/null || true wait $PF_LCS_PID 2>/dev/null || true wait $PF_JWKS_PID 2>/dev/null || true +wait $PF_LLAMA_PID 2>/dev/null || true +rm -f "$E2E_LSC_PORT_FORWARD_PID_FILE" "$E2E_LLAMA_PORT_FORWARD_PID_FILE" "$E2E_JWKS_PORT_FORWARD_PID_FILE" set -e trap 'echo "❌ Pipeline failed at line $LINENO"; exit 1' ERR diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh index c083beefb..b98eafab3 100755 --- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh +++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh @@ -262,7 +262,15 @@ wait_for_llama_stack_http_health() { cmd_restart_lightspeed() { echo "Restarting lightspeed-stack service..." 
- + + # LCS hangs at startup if Llama Stack is unreachable (blocks Llama handshake, + # never opens port 8080, readiness probe never passes). Ensure Llama Stack + # is healthy before recreating the LCS pod. + if ! _llama_stack_http_health_once 2>/dev/null; then + echo "⚠️ Llama Stack not healthy — restoring before LCS restart..." + cmd_restart_llama_stack || echo "⚠️ Llama Stack restore failed; LCS may be slow to start" + fi + # Delete existing pod (short wait so hook stays within timeout; force if needed) timeout 20 oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --force --grace-period=0 2>/dev/null || true @@ -276,16 +284,28 @@ cmd_restart_lightspeed() { sed "s|\${LIGHTSPEED_STACK_IMAGE}|${LIGHTSPEED_STACK_IMAGE}|g" "$_ls_manifest" | oc apply -n "$NAMESPACE" -f - - # Wait for pod to be ready (TCP probe passes when app listens on 8080) - wait_for_pod "lightspeed-stack-service" 40 - + # Wait for pod to be ready (TCP probe passes when app listens on 8080). + # Don't let a timeout here abort the function — still attempt port-forward + # and diagnostics so later scenarios have a chance to recover. + local pod_ready=true + if ! 
wait_for_pod "lightspeed-stack-service" 40; then + pod_ready=false + echo "⚠️ Pod not ready within 120s — dumping diagnostics:" + oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -30 || true + oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=40 2>&1 || true + fi + # Re-label pod for service discovery oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite - - # Re-establish port-forwards + + # Re-establish port-forwards (may succeed even if readiness was slow) cmd_restart_port_forward cmd_restart_jwks_port_forward || echo "⚠️ Mock JWKS port-forward failed (RBAC tests may fail)" + if [[ "$pod_ready" == "false" ]]; then + echo "⚠️ Lightspeed restart completed but pod was slow to become ready" + return 1 + fi echo "✓ Lightspeed restart complete" } diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index 42d9bd0dd..b0f98bed5 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -19,7 +19,11 @@ from tests.e2e.features.steps.common import ( reset_active_lightspeed_stack_config_basename, ) -from tests.e2e.features.steps.health import reset_llama_stack_disrupt_once_tracking +from tests.e2e.features.steps.health import ( + get_llama_stack_was_running, + reset_llama_stack_disrupt_once_tracking, + reset_llama_stack_was_running, +) from tests.e2e.utils.llama_stack_utils import register_shield from tests.e2e.utils.prow_utils import ( restart_pod, @@ -154,13 +158,27 @@ def _ensure_prow_port_forward(context: Context) -> None: try: result = run_e2e_ops("restart-port-forward", timeout=60) print(result.stdout, end="") + if result.returncode == 0: + print("[before_scenario] Port-forward re-established") + return + print(result.stderr, end="") + except subprocess.TimeoutExpired: + pass + + # Port-forward alone failed — the pod itself may be dead (e.g. Llama Stack + # was never restored after a disruption feature). 
Attempt a full restart, + # which also checks Llama health before recreating LCS. + print("[before_scenario] Port-forward failed; attempting full pod restart...") + try: + result = run_e2e_ops("restart-lightspeed", timeout=200) + print(result.stdout, end="") if result.returncode != 0: print(result.stderr, end="") - print("[before_scenario] Warning: port-forward restart failed") + print("[before_scenario] Warning: full pod restart failed") else: - print("[before_scenario] Port-forward re-established") + print("[before_scenario] Pod restart + port-forward re-established") except subprocess.TimeoutExpired: - print("[before_scenario] Warning: port-forward restart timed out") + print("[before_scenario] Warning: full pod restart timed out") def before_scenario(context: Context, scenario: Scenario) -> None: @@ -417,11 +435,12 @@ def after_feature(context: Context, feature: Feature) -> None: when ``context.feedback_e2e_conversation_cleanup`` is set by feedback steps, delete tracked feedback test conversations. """ - # Restore Llama Stack FIRST (before any lightspeed-stack restart) - llama_was_running = getattr(context, "llama_stack_was_running", False) - if llama_was_running: + # Restore Llama Stack FIRST (before any lightspeed-stack restart). + # Read from module-level state — Behave clears custom context attributes + # between scenarios, so context.llama_stack_was_running is unreliable here. 
+ if get_llama_stack_was_running(): _restore_llama_stack(context) - context.llama_stack_was_running = False + reset_llama_stack_was_running() if getattr(context, "feedback_e2e_conversation_cleanup", False): token = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva" diff --git a/tests/e2e/features/steps/health.py b/tests/e2e/features/steps/health.py index b42ffe859..dd5243c5a 100644 --- a/tests/e2e/features/steps/health.py +++ b/tests/e2e/features/steps/health.py @@ -13,10 +13,25 @@ # Mutate one dict entry so we need not reassign a module-level bool (no global). _llama_stack_disrupt_once: dict[str, bool] = {"applied": False} +# Behave clears user attributes on ``context`` between scenarios; store +# ``was_running`` at module level so ``after_feature`` can still see it. +_llama_stack_was_running: dict[str, bool] = {"value": False} + + +def get_llama_stack_was_running() -> bool: + """Return whether Llama Stack was running before the disruption step.""" + return _llama_stack_was_running["value"] + + +def reset_llama_stack_was_running() -> None: + """Clear the module-level was_running flag after restoration.""" + _llama_stack_was_running["value"] = False + def reset_llama_stack_disrupt_once_tracking() -> None: """Reset before each feature; see ``environment.before_feature``.""" _llama_stack_disrupt_once["applied"] = False + _llama_stack_was_running["value"] = False @given("The llama-stack connection is disrupted") @@ -50,13 +65,18 @@ def llama_stack_connection_broken(context: Context) -> None: print("Llama Stack disruption skipped (already applied once this feature)") return - # Store original state for restoration (only on the real disruption path) + # Store original state for restoration (only on the real disruption path). + # Write to both context (backward compat) and module-level dict (survives + # Behave's per-scenario context clearing). 
context.llama_stack_was_running = False + _llama_stack_was_running["value"] = False if is_prow_environment(): from tests.e2e.utils.prow_utils import disrupt_llama_stack_pod - context.llama_stack_was_running = disrupt_llama_stack_pod() + was_running = disrupt_llama_stack_pod() + context.llama_stack_was_running = was_running + _llama_stack_was_running["value"] = was_running _llama_stack_disrupt_once["applied"] = True return @@ -71,6 +91,7 @@ def llama_stack_connection_broken(context: Context) -> None: if result.stdout.strip(): context.llama_stack_was_running = True + _llama_stack_was_running["value"] = True subprocess.run( ["docker", "stop", "llama-stack"], check=True, capture_output=True ) From b033f92b59fa3569c41aede619fc891706e6dd37 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Tue, 28 Apr 2026 09:33:46 +0200 Subject: [PATCH 17/18] Skip TLS and proxy e2e tests in Prow (no Docker Compose services) TLS and proxy features depend on mock-tls-inference and proxy sidecars that are only deployed via Docker Compose, not in the OpenShift cluster. Every TLS scenario burned 200s waiting for a provider that never exists, consuming ~63 min of the 4h Prow timeout for guaranteed failures. Co-Authored-By: Claude Opus 4.6 --- tests/e2e/features/environment.py | 6 ++++++ tests/e2e/features/proxy.feature | 2 +- tests/e2e/features/tls.feature | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index b0f98bed5..77b8bd1f7 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -205,6 +205,12 @@ def before_scenario(context: Context, scenario: Scenario) -> None: scenario.skip("Skipped in library mode (no separate llama-stack container)") return + # Skip scenarios that depend on services not deployed in Prow/OpenShift + # (e.g. 
mock-tls-inference, proxy sidecars only available in Docker Compose) + if is_prow_environment() and "skip-in-prow" in scenario.effective_tags: + scenario.skip("Skipped in Prow (requires Docker Compose services)") + return + # In Prow, verify the lightspeed port-forward is alive before each scenario. # Port-forwards can silently die between scenarios (e.g. pod restart, TCP reset). if is_prow_environment(): diff --git a/tests/e2e/features/proxy.feature b/tests/e2e/features/proxy.feature index aaab54f4e..d4d9a49f0 100644 --- a/tests/e2e/features/proxy.feature +++ b/tests/e2e/features/proxy.feature @@ -1,4 +1,4 @@ -@e2e_group_3 @skip-in-library-mode +@e2e_group_3 @skip-in-library-mode @skip-in-prow Feature: Proxy and TLS networking tests for Llama Stack providers Verify that the Lightspeed Stack works correctly when Llama Stack's diff --git a/tests/e2e/features/tls.feature b/tests/e2e/features/tls.feature index 5a2d77338..a900b1c0f 100644 --- a/tests/e2e/features/tls.feature +++ b/tests/e2e/features/tls.feature @@ -1,4 +1,4 @@ -@e2e_group_1 @skip-in-library-mode +@e2e_group_1 @skip-in-library-mode @skip-in-prow Feature: TLS configuration for remote inference providers Validate that Llama Stack's NetworkConfig.tls settings are applied correctly when connecting to a remote inference provider over HTTPS. From aa6ab2e0abbe2a6722e8b559ad8e1ca27a55e2e8 Mon Sep 17 00:00:00 2001 From: are-ces <195810094+are-ces@users.noreply.github.com> Date: Tue, 28 Apr 2026 15:51:00 +0200 Subject: [PATCH 18/18] Fix after_feature AttributeError on hostname_llama Behave clears custom context attributes between scenarios, so hostname_llama/port_llama are gone by the time after_feature runs. Store them in module-level state (same pattern as llama_stack_was_running). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/e2e/features/environment.py | 8 +++++--- tests/e2e/features/steps/common.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index 77b8bd1f7..ca2474578 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -17,6 +17,8 @@ from behave.runner import Context from tests.e2e.features.steps.common import ( + get_llama_stack_hostname, + get_llama_stack_port, reset_active_lightspeed_stack_config_basename, ) from tests.e2e.features.steps.health import ( @@ -312,7 +314,7 @@ def _print_llama_stack_diagnostics() -> None: print("--- end diagnostics ---") -def _restore_llama_stack(context: Context) -> None: +def _restore_llama_stack() -> None: """Restore Llama Stack connection after disruption.""" if is_prow_environment(): # Recreate llama pod, then restart LCS so in-process clients reconnect (Llama IP/pod changed). @@ -365,7 +367,7 @@ def _restore_llama_stack(context: Context) -> None: "llama-stack", "curl", "-sf", - f"http://{context.hostname_llama}:{context.port_llama}/v1/health", + f"http://{get_llama_stack_hostname()}:{get_llama_stack_port()}/v1/health", ], capture_output=True, timeout=5, @@ -445,7 +447,7 @@ def after_feature(context: Context, feature: Feature) -> None: # Read from module-level state — Behave clears custom context attributes # between scenarios, so context.llama_stack_was_running is unreliable here. if get_llama_stack_was_running(): - _restore_llama_stack(context) + _restore_llama_stack() reset_llama_stack_was_running() if getattr(context, "feedback_e2e_conversation_cleanup", False): diff --git a/tests/e2e/features/steps/common.py b/tests/e2e/features/steps/common.py index d90e8084e..5b7b3e715 100644 --- a/tests/e2e/features/steps/common.py +++ b/tests/e2e/features/steps/common.py @@ -21,12 +21,26 @@ # YAML across scenarios in one feature. 
Mutate the dict entry (no global). _active_lightspeed_stack_config_basename: dict[str, Optional[str]] = {"basename": None} +# Behave clears user attributes on ``context`` between scenarios; store +# Llama Stack endpoint info at module level so ``after_feature`` can see it. +_llama_stack_endpoint: dict[str, str] = {"hostname": "localhost", "port": "8321"} + def reset_active_lightspeed_stack_config_basename() -> None: """Reset before each feature; see ``environment.before_feature``.""" _active_lightspeed_stack_config_basename["basename"] = None +def get_llama_stack_hostname() -> str: + """Return the Llama Stack hostname surviving per-scenario context clearing.""" + return _llama_stack_endpoint["hostname"] + + +def get_llama_stack_port() -> str: + """Return the Llama Stack port surviving per-scenario context clearing.""" + return _llama_stack_endpoint["port"] + + @given("The service is started locally") def service_is_started_locally(context: Context) -> None: """Check the service status. @@ -46,6 +60,8 @@ def service_is_started_locally(context: Context) -> None: else: context.hostname_llama = "localhost" context.port_llama = os.getenv("E2E_LLAMA_PORT", "8321") + _llama_stack_endpoint["hostname"] = context.hostname_llama + _llama_stack_endpoint["port"] = context.port_llama @given('the Lightspeed stack configuration directory is "{directory}"')