diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/lightspeed-stack.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/lightspeed-stack.yaml index 55f9a9310..b10da3c5c 100644 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/lightspeed-stack.yaml +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/lightspeed-stack.yaml @@ -22,13 +22,14 @@ spec: secretKeyRef: name: llama-stack-ip-secret key: key - # Same vars as docker-compose / server-mode YAML (${env.FAISS_VECTOR_STORE_ID} in byok_rag). - name: FAISS_VECTOR_STORE_ID valueFrom: secretKeyRef: name: faiss-vector-store-secret key: id optional: true + - name: KV_RAG_PATH + value: "/app-root/src/.llama/storage/rag/kv_store.db" image: ${LIGHTSPEED_STACK_IMAGE} ports: - containerPort: 8080 diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-prow.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-prow.yaml new file mode 100644 index 000000000..757933c3d --- /dev/null +++ b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack-prow.yaml @@ -0,0 +1,205 @@ +# Llama Stack pod for Prow: uses pre-built image with enrichment + RAG restore. +# +# Requires: ConfigMap llama-stack-config (run.yaml), ConfigMap rag-data (kv_store.db.gz), +# ConfigMap lightspeed-stack-config (lightspeed-stack.yaml). +# Requires: Image built as ${LLAMA_STACK_IMAGE} (set by pipeline.sh). +# +apiVersion: v1 +kind: Pod +metadata: + name: llama-stack-service + labels: + pod: llama-stack-service +spec: + securityContext: + seccompProfile: + type: RuntimeDefault + initContainers: + - name: setup-rag-data + image: busybox:latest + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + command: + - /bin/sh + - -c + - | + set -e + mkdir -p /data/src/.llama/storage/rag /data/src/.llama/storage/files /data/.e2e-rag-seed + if [ ! 
-f /rag-data/kv_store.db.gz ]; then + echo "FATAL: missing /rag-data/kv_store.db.gz" + ls -la /rag-data || true + exit 1 + fi + gunzip -c /rag-data/kv_store.db.gz > /data/.e2e-rag-seed/kv_store.db + cp -f /data/.e2e-rag-seed/kv_store.db /data/src/.llama/storage/rag/kv_store.db + chmod -R 777 /data/src /data/.e2e-rag-seed + echo "RAG data extracted successfully" + volumeMounts: + - name: rag-storage + mountPath: /data + - name: rag-data + mountPath: /rag-data + - name: materialize-run-yaml + image: busybox:latest + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + runAsUser: 65534 + seccompProfile: + type: RuntimeDefault + command: + - /bin/sh + - -c + - | + set -e + cp /cm/run.yaml /work/run.yaml + chmod 664 /work/run.yaml + volumeMounts: + - name: config-cm + mountPath: /cm + readOnly: true + - name: rag-storage + mountPath: /work + containers: + - name: llama-stack-container + image: ${LLAMA_STACK_IMAGE} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsNonRoot: true + runAsUser: 1001 + seccompProfile: + type: RuntimeDefault + workingDir: /opt/app-root + env: + - name: PYTHONPATH + value: "/opt/app-root/src" + - name: HOME + value: "/opt/app-root/src" + - name: KV_STORE_PATH + value: "/opt/app-root/src/.llama/storage/kv_store.db" + - name: KV_RAG_PATH + value: "/opt/app-root/src/.llama/storage/rag/kv_store.db" + - name: SQL_STORE_PATH + value: "/opt/app-root/src/.llama/storage/sql_store.db" + - name: KSVC_URL + valueFrom: + secretKeyRef: + name: api-url-secret + key: key + - name: VLLM_API_KEY + valueFrom: + secretKeyRef: + name: vllm-api-key-secret + key: key + - name: INFERENCE_MODEL + value: "meta-llama/Llama-3.1-8B-Instruct" + - name: OPENAI_API_KEY + valueFrom: + secretKeyRef: + name: openai-api-key-secret + key: key + optional: true + - name: E2E_OPENAI_MODEL + value: "gpt-4o-mini" + - name: LLAMA_STACK_CONFIG + value: 
"/opt/app-root/src/.llama/storage/run.yaml" + - name: FAISS_VECTOR_STORE_ID + valueFrom: + secretKeyRef: + name: faiss-vector-store-secret + key: id + - name: E2E_LLAMA_HOSTNAME + valueFrom: + secretKeyRef: + name: llama-stack-ip-secret + key: key + command: + - /bin/bash + - -c + - | + set -e + RAG_SEED="/opt/app-root/src/.llama/storage/.e2e-rag-seed/kv_store.db" + RAG_CM_GZ="/opt/app-root/rag-data-cm/kv_store.db.gz" + RAG_WORK="${KV_RAG_PATH:-/opt/app-root/src/.llama/storage/rag/kv_store.db}" + restore_rag_seed() { + mkdir -p "$(dirname "$RAG_WORK")" + if [[ -f "$RAG_CM_GZ" ]]; then + RAG_WORK="$RAG_WORK" RAG_CM_GZ="$RAG_CM_GZ" python3 -c 'import gzip, os, shutil, sys; r, g = os.environ["RAG_WORK"], os.environ["RAG_CM_GZ"]; t = r + ".tmp"; i = gzip.open(g, "rb"); o = open(t, "wb"); shutil.copyfileobj(i, o); i.close(); o.close(); sz = os.path.getsize(t); (sz >= 1048576) or (print("FATAL: RAG from ConfigMap too small:", sz, file=sys.stderr) or sys.exit(1)); os.replace(t, r); os.chmod(r, 0o664)' || exit 1 + elif [[ -f "$RAG_SEED" ]]; then + cp -f "$RAG_SEED" "$RAG_WORK" + chmod 664 "$RAG_WORK" 2>/dev/null || true + fi + } + restore_rag_seed + INPUT_CONFIG="${LLAMA_STACK_CONFIG:-/opt/app-root/run.yaml}" + ENRICHED_CONFIG="/opt/app-root/run.yaml" + LIGHTSPEED_CONFIG="${LIGHTSPEED_CONFIG:-/opt/app-root/lightspeed-stack.yaml}" + ENV_FILE="/opt/app-root/.env" + if [[ -f "$LIGHTSPEED_CONFIG" ]]; then + echo "Enriching llama-stack config..." + ENRICHMENT_FAILED=0 + python3 /opt/app-root/src/llama_stack_configuration.py \ + -c "$LIGHTSPEED_CONFIG" \ + -i "$INPUT_CONFIG" \ + -o "$ENRICHED_CONFIG" \ + -e "$ENV_FILE" 2>&1 || ENRICHMENT_FAILED=1 + if [[ -f "$ENV_FILE" ]]; then + set -a && . 
"$ENV_FILE" && set +a + fi + if [[ -f "$ENRICHED_CONFIG" ]] && [[ "$ENRICHMENT_FAILED" -eq 0 ]]; then + echo "Using enriched config: $ENRICHED_CONFIG" + restore_rag_seed + exec llama stack run "$ENRICHED_CONFIG" + fi + fi + echo "Using original config: $INPUT_CONFIG" + restore_rag_seed + exec llama stack run "$INPUT_CONFIG" + ports: + - containerPort: 8321 + readinessProbe: + httpGet: + path: /v1/health + port: 8321 + initialDelaySeconds: 20 + periodSeconds: 5 + failureThreshold: 36 + livenessProbe: + httpGet: + path: /v1/health + port: 8321 + initialDelaySeconds: 120 + periodSeconds: 20 + failureThreshold: 3 + volumeMounts: + - name: rag-storage + mountPath: /opt/app-root/src/.llama/storage + - name: lightspeed-config + mountPath: /opt/app-root/lightspeed-stack.yaml + subPath: lightspeed-stack.yaml + readOnly: true + - name: rag-data + mountPath: /opt/app-root/rag-data-cm + readOnly: true + volumes: + - name: rag-storage + emptyDir: {} + - name: config-cm + configMap: + name: llama-stack-config + - name: lightspeed-config + configMap: + name: lightspeed-stack-config + - name: rag-data + configMap: + name: rag-data diff --git a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml b/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml deleted file mode 100644 index de22831f6..000000000 --- a/tests/e2e-prow/rhoai/manifests/lightspeed/llama-stack.yaml +++ /dev/null @@ -1,62 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: llama-stack-service -spec: - imagePullSecrets: - - name: quay-lightspeed-pull-secret - initContainers: - - name: setup-rag-data - image: busybox:latest - command: - - /bin/sh - - -c - - | - mkdir -p /data/storage/rag - gunzip -c /rag-data/kv_store.db.gz > /data/storage/rag/kv_store.db - echo "RAG data extracted successfully" - ls -la /data/storage/rag/ - volumeMounts: - - name: app-root - mountPath: /data - - name: rag-data - mountPath: /rag-data - containers: - - name: llama-stack-container - command: ["llama", "stack", "run", 
"/opt/app-root/run.yaml"] - env: - - name: KSVC_URL - valueFrom: - secretKeyRef: - name: api-url-secret - key: key - - name: VLLM_API_KEY - valueFrom: - secretKeyRef: - name: vllm-api-key-secret - key: key - - name: INFERENCE_MODEL - value: "meta-llama/Llama-3.1-8B-Instruct" - - name: FAISS_VECTOR_STORE_ID - valueFrom: - secretKeyRef: - name: faiss-vector-store-secret - key: id - image: ${LLAMA_STACK_IMAGE} - ports: - - containerPort: 8321 - volumeMounts: - - name: app-root - mountPath: /opt/app-root/src/.llama - - name: config - mountPath: /opt/app-root/run.yaml - subPath: run.yaml - volumes: - - name: app-root - emptyDir: {} - - name: config - configMap: - name: llama-stack-config - - name: rag-data - configMap: - name: rag-data diff --git a/tests/e2e-prow/rhoai/manifests/operators/ds-cluster.yaml b/tests/e2e-prow/rhoai/manifests/operators/ds-cluster.yaml index e9b619726..d57226cc1 100644 --- a/tests/e2e-prow/rhoai/manifests/operators/ds-cluster.yaml +++ b/tests/e2e-prow/rhoai/manifests/operators/ds-cluster.yaml @@ -2,7 +2,6 @@ apiVersion: datasciencecluster.opendatahub.io/v1 kind: DataScienceCluster metadata: name: default-dsc - namespace: e2e-rhoai-dsc spec: serviceMesh: managementState: Managed diff --git a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml index 4c3f5e7bd..990dc2df3 100644 --- a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml +++ b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml @@ -24,7 +24,7 @@ spec: - --port - "8080" - --max-model-len - - "2048" + - "32768" image: quay.io/rh-ee-cpompeia/vllm-cpu:latest name: kserve-container env: diff --git a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml index b7597991c..e925890d2 100644 --- a/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml +++ b/tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml @@ -24,7 +24,7 @@ spec: - --port - 
"8080" - --max-model-len - - "2048" + - "32768" - --gpu-memory-utilization - "0.9" image: ${VLLM_IMAGE} diff --git a/tests/e2e-prow/rhoai/pipeline-services.sh b/tests/e2e-prow/rhoai/pipeline-services.sh index cd33ab9d5..1db04b6ea 100755 --- a/tests/e2e-prow/rhoai/pipeline-services.sh +++ b/tests/e2e-prow/rhoai/pipeline-services.sh @@ -1,27 +1,30 @@ #!/bin/bash BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +NAMESPACE="${NAMESPACE:-e2e-rhoai-dsc}" -# Deploy llama-stack -envsubst < "$BASE_DIR/manifests/lightspeed/llama-stack.yaml" | oc apply -f - +# Create llama-stack-ip-secret before deploying the pod (it references the secret as an env var) +export E2E_LLAMA_HOSTNAME="llama-stack-service-svc.${NAMESPACE}.svc.cluster.local" +oc create secret generic llama-stack-ip-secret \ + --from-literal=key="$E2E_LLAMA_HOSTNAME" \ + -n "$NAMESPACE" 2>/dev/null || echo "Secret llama-stack-ip-secret exists" + +# Deploy llama-stack (substitute only LLAMA_STACK_IMAGE, leave other ${} intact) +envsubst '${LLAMA_STACK_IMAGE}' < "$BASE_DIR/manifests/lightspeed/llama-stack-prow.yaml" | oc apply -n "$NAMESPACE" -f - oc wait pod/llama-stack-service \ - -n e2e-rhoai-dsc --for=condition=Ready --timeout=600s + -n "$NAMESPACE" --for=condition=Ready --timeout=600s -# Get url address of llama-stack pod -oc label pod llama-stack-service pod=llama-stack-service -n e2e-rhoai-dsc +# Expose llama-stack service +oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" oc expose pod llama-stack-service \ --name=llama-stack-service-svc \ --port=8321 \ --type=ClusterIP \ - -n e2e-rhoai-dsc - -export E2E_LLAMA_HOSTNAME="llama-stack-service-svc.e2e-rhoai-dsc.svc.cluster.local" - -oc create secret generic llama-stack-ip-secret \ - --from-literal=key="$E2E_LLAMA_HOSTNAME" \ - -n e2e-rhoai-dsc || echo "Secret exists" + -n "$NAMESPACE" -# Deploy lightspeed-stack -oc apply -f "$BASE_DIR/manifests/lightspeed/lightspeed-stack.yaml" +# Deploy lightspeed-stack (substitute only 
LIGHTSPEED_STACK_IMAGE, leave other ${} intact) +LIGHTSPEED_STACK_IMAGE="${LIGHTSPEED_STACK_IMAGE:-quay.io/lightspeed-core/lightspeed-stack:dev-latest}" +export LIGHTSPEED_STACK_IMAGE +envsubst '${LIGHTSPEED_STACK_IMAGE}' < "$BASE_DIR/manifests/lightspeed/lightspeed-stack.yaml" | oc apply -n "$NAMESPACE" -f - diff --git a/tests/e2e-prow/rhoai/pipeline.sh b/tests/e2e-prow/rhoai/pipeline.sh index 73585cb82..39f6fcdba 100755 --- a/tests/e2e-prow/rhoai/pipeline.sh +++ b/tests/e2e-prow/rhoai/pipeline.sh @@ -9,13 +9,14 @@ export RUNNING_PROW=true # 1. GLOBAL CONFIG #======================================== NAMESPACE="e2e-rhoai-dsc" +export NAMESPACE MODEL_NAME="meta-llama/Llama-3.1-8B-Instruct" PIPELINE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -# RHOAI llama-stack image -LLAMA_STACK_IMAGE="${LLAMA_STACK_IMAGE:-quay.io/rhoai/odh-llama-stack-core-rhel9:rhoai-3.3}" -echo "Using llama-stack image: $LLAMA_STACK_IMAGE" -export LLAMA_STACK_IMAGE +# RHOAI llama-stack image (unused when building from source via llama-stack-openai.yaml) +# LLAMA_STACK_IMAGE="${LLAMA_STACK_IMAGE:-quay.io/rhoai/odh-llama-stack-core-rhel9:rhoai-3.4-ea.2}" +# echo "Using llama-stack image: $LLAMA_STACK_IMAGE" +# export LLAMA_STACK_IMAGE #======================================== # 2. ENVIRONMENT SETUP @@ -38,15 +39,21 @@ oc version oc whoami #======================================== -# 3. CREATE NAMESPACE & SECRETS +# 3. 
BOOTSTRAP OPERATORS & DSC (before namespace — DSC operator may delete it) #======================================== -echo "===== Creating namespace & secrets =====" -oc get ns "$NAMESPACE" >/dev/null 2>&1 || oc create namespace "$NAMESPACE" - -# Create NFD and NVIDIA namespaces +echo "===== Bootstrapping operators =====" +# Create NFD and NVIDIA namespaces (needed by operator subscriptions) oc apply -f "$PIPELINE_DIR/manifests/namespaces/nfd.yaml" oc apply -f "$PIPELINE_DIR/manifests/namespaces/nvidia-operator.yaml" +# Install operators and apply DataScienceCluster (this may delete/recreate namespaces) +"$PIPELINE_DIR/scripts/bootstrap.sh" "$PIPELINE_DIR" + +#======================================== +# 4. CREATE NAMESPACE & SECRETS (after DSC settles) +#======================================== +echo "===== Creating namespace & secrets =====" +oc get ns "$NAMESPACE" >/dev/null 2>&1 || oc create namespace "$NAMESPACE" create_secret() { local name=$1; shift @@ -56,6 +63,22 @@ create_secret() { create_secret hf-token-secret --from-literal=token="$HUGGING_FACE_HUB_TOKEN" create_secret vllm-api-key-secret --from-literal=key="$VLLM_API_KEY" +create_secret openai-api-key-secret --from-literal=key="" + +# MCP token secrets for lightspeed-stack +REPO_ROOT="$(cd "$PIPELINE_DIR/../../.." 
&& pwd)" +if [ -f "$REPO_ROOT/tests/e2e/secrets/mcp-token" ]; then + oc create secret generic mcp-file-auth-token -n "$NAMESPACE" \ + --from-file=token="$REPO_ROOT/tests/e2e/secrets/mcp-token" \ + --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - + echo "✅ mcp-file-auth-token secret applied" +fi +if [ -f "$REPO_ROOT/tests/e2e/secrets/invalid-mcp-token" ]; then + oc create secret generic mcp-invalid-file-auth-token -n "$NAMESPACE" \ + --from-file=token="$REPO_ROOT/tests/e2e/secrets/invalid-mcp-token" \ + --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - + echo "✅ mcp-invalid-file-auth-token secret applied" +fi # Create Quay pull secret for llama-stack images echo "Creating Quay pull secret..." @@ -70,7 +93,7 @@ oc secrets link default quay-lightspeed-pull-secret --for=pull -n "$NAMESPACE" 2 #======================================== -# 4. CONFIGMAPS +# 5. CONFIGMAPS #======================================== echo "===== Setting up configmaps =====" @@ -79,14 +102,17 @@ curl -sL -o tool_chat_template_llama3.1_json.jinja \ || { echo "❌ Failed to download jinja template"; exit 1; } oc create configmap vllm-chat-template -n "$NAMESPACE" \ - --from-file=tool_chat_template_llama3.1_json.jinja --dry-run=client -o yaml | oc apply -f - + --from-file=tool_chat_template_llama3.1_json.jinja --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - #======================================== -# 5. DEPLOY vLLM +# 6. DEPLOY vLLM (GPU setup + deploy, bootstrap already done) #======================================== echo "===== Deploying vLLM =====" -./pipeline-vllm.sh +"$PIPELINE_DIR/scripts/gpu-setup.sh" "$PIPELINE_DIR" +source "$PIPELINE_DIR/scripts/fetch-vllm-image.sh" +"$PIPELINE_DIR/scripts/deploy-vllm.sh" "$PIPELINE_DIR" +"$PIPELINE_DIR/scripts/get-vllm-pod-info.sh" oc get pods -n "$NAMESPACE" @@ -162,18 +188,18 @@ REPO_ROOT="$(cd "$PIPELINE_DIR/../../.." && pwd)" echo "Creating mock server ConfigMaps..." 
oc create configmap mock-jwks-script -n "$NAMESPACE" \ --from-file=server.py="$REPO_ROOT/tests/e2e/mock_jwks_server/server.py" \ - --dry-run=client -o yaml | oc apply -f - + --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - oc create configmap mock-mcp-script -n "$NAMESPACE" \ --from-file=server.py="$REPO_ROOT/tests/e2e/mock_mcp_server/server.py" \ - --dry-run=client -o yaml | oc apply -f - + --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f - # Deploy mock server pods and services echo "Deploying mock-jwks..." -oc apply -f "$PIPELINE_DIR/manifests/lightspeed/mock-jwks.yaml" +oc apply -n "$NAMESPACE" -f "$PIPELINE_DIR/manifests/lightspeed/mock-jwks.yaml" echo "Deploying mock-mcp..." -oc apply -f "$PIPELINE_DIR/manifests/lightspeed/mock-mcp.yaml" +oc apply -n "$NAMESPACE" -f "$PIPELINE_DIR/manifests/lightspeed/mock-mcp.yaml" # Wait for mock servers to be ready echo "Waiting for mock servers to be ready..." @@ -189,7 +215,39 @@ oc wait pod/mock-jwks pod/mock-mcp \ echo "✅ Mock servers deployed" #======================================== -# 8. DEPLOY LIGHTSPEED STACK AND LLAMA STACK +# 8. 
BUILD LLAMA STACK IMAGE +#======================================== +echo "===== Building llama-stack image =====" +LLAMA_STACK_IMAGE="image-registry.openshift-image-registry.svc:5000/${NAMESPACE}/llama-stack-e2e:latest" +export LLAMA_STACK_IMAGE + +# Create BuildConfig (idempotent) +oc new-build --name=llama-stack-e2e \ + --binary \ + --strategy=docker \ + --image="registry.access.redhat.com/ubi9/ubi-minimal" \ + --to="llama-stack-e2e:latest" \ + -n "$NAMESPACE" 2>/dev/null || echo "BuildConfig llama-stack-e2e already exists" + +# Patch BuildConfig to use test.containerfile instead of Dockerfile +oc patch bc llama-stack-e2e -n "$NAMESPACE" --type=json \ + -p '[{"op":"replace","path":"/spec/strategy/dockerStrategy/dockerfilePath","value":"test.containerfile"}]' 2>/dev/null || true + +# Build from repo root +oc start-build llama-stack-e2e \ + --from-dir="$REPO_ROOT" \ + --follow \ + -n "$NAMESPACE" || { echo "❌ llama-stack image build failed"; exit 1; } + +echo "✅ llama-stack image built: $LLAMA_STACK_IMAGE" + +# Allow default SA to pull from the internal registry +oc policy add-role-to-user system:image-puller \ + system:serviceaccount:${NAMESPACE}:default \ + -n "$NAMESPACE" 2>/dev/null || true + +#======================================== +# 9. DEPLOY LIGHTSPEED STACK AND LLAMA STACK #======================================== echo "===== Deploying Services =====" @@ -281,6 +339,15 @@ oc describe pod llama-stack-service -n "$NAMESPACE" || true #======================================== # 9. EXPOSE SERVICE & START PORT-FORWARD #======================================== +# Export PID file paths so e2e-ops.sh can find and kill stale port-forwards +# during test-triggered pod restarts (matches pipeline-konflux.sh). 
+export E2E_LSC_PORT_FORWARD_PID_FILE="${E2E_LSC_PORT_FORWARD_PID_FILE:-/tmp/e2e-lightspeed-port-forward.pid}" +export E2E_LLAMA_PORT_FORWARD_PID_FILE="${E2E_LLAMA_PORT_FORWARD_PID_FILE:-/tmp/e2e-llama-port-forward.pid}" +export E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks-port-forward.pid}" +rm -f "$E2E_LSC_PORT_FORWARD_PID_FILE" +rm -f "$E2E_LLAMA_PORT_FORWARD_PID_FILE" +rm -f "$E2E_JWKS_PORT_FORWARD_PID_FILE" + oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n $NAMESPACE oc expose pod lightspeed-stack-service \ @@ -289,20 +356,36 @@ oc expose pod lightspeed-stack-service \ --type=ClusterIP \ -n $NAMESPACE -# Kill any existing processes on ports 8080 and 8000 -echo "Checking for existing processes on ports 8080 and 8000..." -lsof -ti:8080 | xargs kill -9 2>/dev/null || true -lsof -ti:8000 | xargs kill -9 2>/dev/null || true +# Kill any existing processes on ports 8080, 8000, and 8321 (lsof may be missing in minimal images) +echo "Checking for existing processes on ports 8080, 8000, and 8321..." +if command -v lsof >/dev/null 2>&1; then + lsof -ti:8080 | xargs kill -9 2>/dev/null || true + lsof -ti:8000 | xargs kill -9 2>/dev/null || true + lsof -ti:8321 | xargs kill -9 2>/dev/null || true +elif command -v fuser >/dev/null 2>&1; then + fuser -k 8080/tcp 2>/dev/null || true + fuser -k 8000/tcp 2>/dev/null || true + fuser -k 8321/tcp 2>/dev/null || true +fi # Start port-forward for lightspeed-stack echo "Starting port-forward for lightspeed-stack..." oc port-forward svc/lightspeed-stack-service-svc 8080:8080 -n $NAMESPACE & PF_LCS_PID=$! +echo "$PF_LCS_PID" >"$E2E_LSC_PORT_FORWARD_PID_FILE" # Start port-forward for mock-jwks (needed for RBAC tests to get tokens) echo "Starting port-forward for mock-jwks..." oc port-forward svc/mock-jwks 8000:8000 -n $NAMESPACE & PF_JWKS_PID=$! 
+echo "$PF_JWKS_PID" >"$E2E_JWKS_PORT_FORWARD_PID_FILE" + +# Behave steps that call Llama Stack directly (MCP toolgroups, shields, disrupt/restore) +# need localhost:8321. Without this forward those tests hit "Connection refused". +echo "Starting port-forward for llama-stack..." +oc port-forward svc/llama-stack-service-svc 8321:8321 -n $NAMESPACE & +PF_LLAMA_PID=$! +echo "$PF_LLAMA_PID" >"$E2E_LLAMA_PORT_FORWARD_PID_FILE" # Wait for port-forward to be usable (app may not be listening immediately; port-forward can drop) echo "Waiting for port-forward to lightspeed-stack to be ready..." @@ -313,8 +396,18 @@ for i in $(seq 1 36); do fi if [ $i -eq 36 ]; then echo "❌ Port-forward to lightspeed-stack never became ready (3 min)" + echo "" + echo "DEBUG: lightspeed-stack-service logs:" + oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=100 || true + echo "" + echo "DEBUG: llama-stack-service logs:" + oc logs llama-stack-service -n "$NAMESPACE" --tail=100 || true + echo "" + echo "DEBUG: Pod status:" + oc get pods -n "$NAMESPACE" -o wide || true kill $PF_LCS_PID 2>/dev/null || true kill $PF_JWKS_PID 2>/dev/null || true + kill $PF_LLAMA_PID 2>/dev/null || true exit 1 fi # If port-forward process died, restart it (e.g. "connection refused" / "lost connection to pod") @@ -322,14 +415,42 @@ for i in $(seq 1 36); do echo "Port-forward died, restarting (attempt $i)..." oc port-forward svc/lightspeed-stack-service-svc 8080:8080 -n $NAMESPACE & PF_LCS_PID=$! + echo "$PF_LCS_PID" >"$E2E_LSC_PORT_FORWARD_PID_FILE" + fi + sleep 5 +done + +# Wait for Llama Stack port-forward to be usable +echo "Waiting for Llama Stack port-forward (localhost:8321 /v1/health)..." 
+for i in $(seq 1 36); do + if curl -sf http://localhost:8321/v1/health > /dev/null 2>&1; then + echo "✅ Llama Stack port-forward ready after $(( i * 5 ))s" + break + fi + if [ $i -eq 36 ]; then + echo "❌ Port-forward to llama-stack never became healthy (3 min)" + oc logs llama-stack-service -n "$NAMESPACE" --tail=100 || true + kill $PF_LCS_PID 2>/dev/null || true + kill $PF_JWKS_PID 2>/dev/null || true + kill $PF_LLAMA_PID 2>/dev/null || true + exit 1 + fi + if ! kill -0 $PF_LLAMA_PID 2>/dev/null; then + echo "Llama port-forward died, restarting (attempt $i)..." + oc port-forward svc/llama-stack-service-svc 8321:8321 -n $NAMESPACE & + PF_LLAMA_PID=$! + echo "$PF_LLAMA_PID" >"$E2E_LLAMA_PORT_FORWARD_PID_FILE" fi sleep 5 done export E2E_LSC_HOSTNAME="localhost" export E2E_JWKS_HOSTNAME="localhost" +export E2E_DEFAULT_MODEL_OVERRIDE="$MODEL_NAME" +export E2E_DEFAULT_PROVIDER_OVERRIDE="vllm" echo "LCS accessible at: http://$E2E_LSC_HOSTNAME:8080" echo "Mock JWKS accessible at: http://$E2E_JWKS_HOSTNAME:8000" +echo "Llama Stack accessible at: http://localhost:8321" @@ -352,8 +473,11 @@ TEST_EXIT_CODE=$(cat "$E2E_EXIT_CODE_FILE" 2>/dev/null || echo 1) # Kill first so wait doesn't block (if a port-forward is still running, wait would hang) kill $PF_LCS_PID 2>/dev/null || true kill $PF_JWKS_PID 2>/dev/null || true +kill $PF_LLAMA_PID 2>/dev/null || true wait $PF_LCS_PID 2>/dev/null || true wait $PF_JWKS_PID 2>/dev/null || true +wait $PF_LLAMA_PID 2>/dev/null || true +rm -f "$E2E_LSC_PORT_FORWARD_PID_FILE" "$E2E_LLAMA_PORT_FORWARD_PID_FILE" "$E2E_JWKS_PORT_FORWARD_PID_FILE" set -e trap 'echo "❌ Pipeline failed at line $LINENO"; exit 1' ERR diff --git a/tests/e2e-prow/rhoai/scripts/bootstrap.sh b/tests/e2e-prow/rhoai/scripts/bootstrap.sh index 1718b70e5..ae8444ca8 100755 --- a/tests/e2e-prow/rhoai/scripts/bootstrap.sh +++ b/tests/e2e-prow/rhoai/scripts/bootstrap.sh @@ -92,5 +92,11 @@ oc get csv -n openshift-nfd echo "--> Applying DataScienceCluster from ds-cluster.yaml..." 
 oc apply -f "$BASE_DIR/manifests/operators/ds-cluster.yaml"
+# Give the DSC operator a moment to begin reconciling before the status checks below
+sleep 15
+
+echo "--> Checking DSCInitialization and DSC status..."
+oc get dsci -A -o jsonpath='{range .items[*]}DSCI: {.metadata.name} applicationsNS: {.spec.applicationsNamespace}{"\n"}{end}' 2>/dev/null || echo "No DSCInitialization found"
+oc get dsc -A -o jsonpath='{range .items[*]}DSC: {.metadata.name} phase: {.status.phase}{"\n"}{end}' 2>/dev/null || echo "No DSC status yet"
 
 echo "All files applied successfully. The DataScienceCluster is now provisioning."
diff --git a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
index 540e2aab2..b98eafab3 100755
--- a/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
+++ b/tests/e2e-prow/rhoai/scripts/e2e-ops.sh
@@ -32,6 +32,7 @@ MANIFEST_DIR="$SCRIPT_DIR/../manifests/lightspeed"
 # Written by pipeline.sh when it starts LCS port-forward; e2e-ops kills this PID before rebinding 8080.
 E2E_LSC_PORT_FORWARD_PID_FILE="${E2E_LSC_PORT_FORWARD_PID_FILE:-/tmp/e2e-lightspeed-port-forward.pid}"
 E2E_LLAMA_PORT_FORWARD_PID_FILE="${E2E_LLAMA_PORT_FORWARD_PID_FILE:-/tmp/e2e-llama-port-forward.pid}"
+E2E_JWKS_PORT_FORWARD_PID_FILE="${E2E_JWKS_PORT_FORWARD_PID_FILE:-/tmp/e2e-jwks-port-forward.pid}"
 
 # ============================================================================
 # Helper functions
@@ -148,6 +149,23 @@ kill_stale_llama_forward() {
     free_local_tcp_port "$port"
 }
 
+# Kill anything likely to hold the mock-jwks local forward (localhost:8000).
+kill_stale_jwks_forward() { + local port="${1:-8000}" + local saved_pf + if [[ -f "$E2E_JWKS_PORT_FORWARD_PID_FILE" ]]; then + read -r saved_pf <"$E2E_JWKS_PORT_FORWARD_PID_FILE" 2>/dev/null || true + if [[ "$saved_pf" =~ ^[0-9]+$ ]]; then + kill -9 "$saved_pf" 2>/dev/null || true + fi + fi + pkill -9 -f "port-forward.*mock-jwks.*${port}:${port}" 2>/dev/null || true + pkill -9 -f "oc port-forward svc/mock-jwks ${port}:${port}" 2>/dev/null || true + free_local_tcp_port "$port" + sleep 1 + free_local_tcp_port "$port" +} + # After oc port-forward dies in <2s, show recent oc stderr from the log file. e2e_ops_emit_port_forward_immediate_failure_diag() { echo "[e2e-ops] /tmp/port-forward.log (tail 25):" @@ -172,21 +190,30 @@ verify_connectivity() { local max_attempts="${1:-6}" local local_port="${LOCAL_PORT:-8080}" local http_code="" - + for ((attempt=1; attempt<=max_attempts; attempt++)); do - # Check readiness endpoint - accept 200 or 401 (auth required but service is up) + # First check /readiness to see if port-forward is alive (accept 200 or 401) http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://localhost:$local_port/readiness" 2>/dev/null) || http_code="000" - + if [[ "$http_code" == "200" || "$http_code" == "401" ]]; then - return 0 + # Port-forward works; now verify the app is fully initialized by hitting + # a real endpoint. /v1/models requires the Llama Stack handshake to complete. + # Accept 200 (no auth) or 401 (auth enabled) — both prove the full app + # stack is up, not just the TCP socket. 
+ local models_code + models_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 10 "http://localhost:$local_port/v1/models" 2>/dev/null) || models_code="000" + if [[ "$models_code" == "200" || "$models_code" == "401" ]]; then + return 0 + fi + echo "[e2e-ops] /readiness=$http_code but /v1/models=$models_code (app still initializing, attempt $attempt/$max_attempts)" fi - + if [[ $attempt -lt $max_attempts ]]; then - sleep 2 + sleep 5 fi done - - echo "Connectivity check failed (HTTP: ${http_code:-unknown})" + + echo "Connectivity check failed (readiness: ${http_code:-unknown})" return 1 } @@ -235,33 +262,50 @@ wait_for_llama_stack_http_health() { cmd_restart_lightspeed() { echo "Restarting lightspeed-stack service..." - + + # LCS hangs at startup if Llama Stack is unreachable (blocks Llama handshake, + # never opens port 8080, readiness probe never passes). Ensure Llama Stack + # is healthy before recreating the LCS pod. + if ! _llama_stack_http_health_once 2>/dev/null; then + echo "⚠️ Llama Stack not healthy — restoring before LCS restart..." 
+ cmd_restart_llama_stack || echo "⚠️ Llama Stack restore failed; LCS may be slow to start" + fi + # Delete existing pod (short wait so hook stays within timeout; force if needed) timeout 20 oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --wait=true 2>/dev/null || { oc delete pod lightspeed-stack-service -n "$NAMESPACE" --ignore-not-found=true --force --grace-period=0 2>/dev/null || true sleep 2 } - # Apply manifest (expand LIGHTSPEED_STACK_IMAGE) + # Apply manifest (expand LIGHTSPEED_STACK_IMAGE only; filter prevents blanking other $VAR refs) LIGHTSPEED_STACK_IMAGE="${LIGHTSPEED_STACK_IMAGE:-quay.io/lightspeed-core/lightspeed-stack:dev-latest}" export LIGHTSPEED_STACK_IMAGE _ls_manifest="$MANIFEST_DIR/lightspeed-stack.yaml" - if command -v envsubst >/dev/null 2>&1; then - envsubst < "$_ls_manifest" | oc apply -n "$NAMESPACE" -f - - else - sed "s|\${LIGHTSPEED_STACK_IMAGE}|${LIGHTSPEED_STACK_IMAGE}|g" "$_ls_manifest" | - oc apply -n "$NAMESPACE" -f - - fi - - # Wait for pod to be ready (TCP probe passes when app listens on 8080) - wait_for_pod "lightspeed-stack-service" 40 + sed "s|\${LIGHTSPEED_STACK_IMAGE}|${LIGHTSPEED_STACK_IMAGE}|g" "$_ls_manifest" | + oc apply -n "$NAMESPACE" -f - + # Wait for pod to be ready (TCP probe passes when app listens on 8080). + # Don't let a timeout here abort the function — still attempt port-forward + # and diagnostics so later scenarios have a chance to recover. + local pod_ready=true + if ! 
wait_for_pod "lightspeed-stack-service" 40; then + pod_ready=false + echo "⚠️ Pod not ready within 120s — dumping diagnostics:" + oc describe pod lightspeed-stack-service -n "$NAMESPACE" 2>&1 | tail -30 || true + oc logs lightspeed-stack-service -n "$NAMESPACE" --tail=40 2>&1 || true + fi + # Re-label pod for service discovery oc label pod lightspeed-stack-service pod=lightspeed-stack-service -n "$NAMESPACE" --overwrite - - # Re-establish port-forward + + # Re-establish port-forwards (may succeed even if readiness was slow) cmd_restart_port_forward - + cmd_restart_jwks_port_forward || echo "⚠️ Mock JWKS port-forward failed (RBAC tests may fail)" + + if [[ "$pod_ready" == "false" ]]; then + echo "⚠️ Lightspeed restart completed but pod was slow to become ready" + return 1 + fi echo "✓ Lightspeed restart complete" } @@ -291,12 +335,9 @@ cmd_restart_llama_stack() { fi else # Prow: vLLM Llama Stack image (matches pipeline.sh / pipeline-services.sh) - if command -v envsubst >/dev/null 2>&1; then - envsubst < "$MANIFEST_DIR/llama-stack.yaml" | oc apply -n "$NAMESPACE" -f - - else - sed "s|\${LLAMA_STACK_IMAGE}|${LLAMA_STACK_IMAGE:-}|g" "$MANIFEST_DIR/llama-stack.yaml" | - oc apply -n "$NAMESPACE" -f - - fi + # Use sed instead of envsubst to avoid blanking $VAR references in embedded bash scripts + sed "s|\${LLAMA_STACK_IMAGE}|${LLAMA_STACK_IMAGE:-}|g" "$MANIFEST_DIR/llama-stack-prow.yaml" | + oc apply -n "$NAMESPACE" -f - wait_for_pod "llama-stack-service" 24 echo "Labeling pod for service..." 
 oc label pod llama-stack-service pod=llama-stack-service -n "$NAMESPACE" --overwrite
@@ -453,6 +494,66 @@ cmd_restart_llama_port_forward() {
     return 1
 }
 
+cmd_restart_jwks_port_forward() {
+    local local_port="${LOCAL_JWKS_PORT:-8000}"
+    local remote_port="${REMOTE_JWKS_PORT:-8000}"
+    local max_attempts=4
+    local pf_pid
+    local jwks_pf_log="/tmp/port-forward-jwks.log"
+
+    # Check if existing forward is still alive
+    if [[ -f "$E2E_JWKS_PORT_FORWARD_PID_FILE" ]]; then
+        local saved_pf
+        read -r saved_pf <"$E2E_JWKS_PORT_FORWARD_PID_FILE" 2>/dev/null || true
+        if [[ "$saved_pf" =~ ^[0-9]+$ ]] && kill -0 "$saved_pf" 2>/dev/null; then
+            local http_code
+            http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 3 "http://127.0.0.1:$local_port/tokens" 2>/dev/null) || http_code="000"
+            if [[ "$http_code" != "000" ]]; then
+                echo "✓ Mock JWKS port-forward already healthy (PID: $saved_pf)"
+                return 0
+            fi
+        fi
+    fi
+
+    echo "Re-establishing mock-jwks port-forward on $local_port:$remote_port..."
+
+    for ((attempt=1; attempt<=max_attempts; attempt++)); do
+        kill_stale_jwks_forward "$local_port"
+        sleep 2
+
+        echo "JWKS port-forward attempt $attempt/$max_attempts"
+
+        : >"$jwks_pf_log"
+        nohup oc port-forward svc/mock-jwks "$local_port:$remote_port" -n "$NAMESPACE" \
+            >"$jwks_pf_log" 2>&1 &
+        pf_pid=$!
+        disown "$pf_pid" 2>/dev/null || true
+        sleep 3
+
+        if ! kill -0 "$pf_pid" 2>/dev/null; then
+            echo "JWKS port-forward process exited immediately"
+            continue
+        fi
+
+        local http_code
+        http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://127.0.0.1:$local_port/tokens" 2>/dev/null) || http_code="000"
+        if [[ "$http_code" != "000" ]]; then
+            echo "$pf_pid" >"$E2E_JWKS_PORT_FORWARD_PID_FILE"
+            echo "✓ Mock JWKS port-forward established (PID: $pf_pid)"
+            return 0
+        fi
+
+        if [[ $attempt -lt $max_attempts ]]; then
+            echo "JWKS forward attempt $attempt failed, retrying..."
+ kill -9 "$pf_pid" 2>/dev/null || true + sleep 2 + fi + done + + echo "Failed to establish mock-jwks port-forward on :$local_port" + return 1 +} + cmd_wait_for_pod() { local pod_name="${1:?Pod name required}" local max_attempts="${2:-24}" @@ -462,16 +563,24 @@ cmd_wait_for_pod() { cmd_update_configmap() { local configmap_name="${1:?ConfigMap name required}" local source_file="${2:?Source file required}" - + echo "Updating ConfigMap $configmap_name from $source_file..." - - # Delete existing configmap - oc delete configmap "$configmap_name" -n "$NAMESPACE" --ignore-not-found=true - - # Create new configmap from the source file - oc create configmap "$configmap_name" -n "$NAMESPACE" \ - --from-file="lightspeed-stack.yaml=$source_file" - + + if [[ ! -f "$source_file" ]]; then + echo "ERROR: source file does not exist: $source_file" >&2 + return 1 + fi + + # Use dry-run + apply to avoid the delete-then-create race. + # If delete succeeds but create fails the ConfigMap is gone and every + # subsequent attempt cascades into failure. + if ! 
oc create configmap "$configmap_name" -n "$NAMESPACE" \ + --from-file="lightspeed-stack.yaml=$source_file" \ + --dry-run=client -o yaml | oc apply -n "$NAMESPACE" -f -; then + echo "ERROR: oc apply for ConfigMap $configmap_name failed" >&2 + return 1 + fi + echo "✓ ConfigMap $configmap_name updated successfully" } @@ -515,6 +624,9 @@ case "$COMMAND" in restart-llama-port-forward) cmd_restart_llama_port_forward ;; + restart-jwks-port-forward) + cmd_restart_jwks_port_forward + ;; restart-port-forward) cmd_restart_port_forward ;; diff --git a/tests/e2e/features/environment.py b/tests/e2e/features/environment.py index e519217e0..ca2474578 100644 --- a/tests/e2e/features/environment.py +++ b/tests/e2e/features/environment.py @@ -17,13 +17,20 @@ from behave.runner import Context from tests.e2e.features.steps.common import ( + get_llama_stack_hostname, + get_llama_stack_port, reset_active_lightspeed_stack_config_basename, ) -from tests.e2e.features.steps.health import reset_llama_stack_disrupt_once_tracking +from tests.e2e.features.steps.health import ( + get_llama_stack_was_running, + reset_llama_stack_disrupt_once_tracking, + reset_llama_stack_was_running, +) from tests.e2e.utils.llama_stack_utils import register_shield from tests.e2e.utils.prow_utils import ( restart_pod, restore_llama_stack_pod, + run_e2e_ops, ) from tests.e2e.utils.utils import ( is_prow_environment, @@ -133,6 +140,49 @@ def before_all(context: Context) -> None: ) +def _ensure_prow_port_forward(context: Context) -> None: + """Check that the lightspeed port-forward is alive; restart it if dead. + + Probes localhost:{E2E_LSC_PORT}/readiness — if it fails, calls e2e-ops + restart-port-forward to re-establish the tunnel before the scenario runs. 
+ """ + host = os.getenv("E2E_LSC_HOSTNAME", "localhost") + port = os.getenv("E2E_LSC_PORT", "8080") + url = f"http://{host}:{port}/readiness" + try: + resp = requests.get(url, timeout=5) + if resp.status_code in (200, 401): + return + except requests.RequestException: + pass + + print("[before_scenario] Port-forward appears dead, restarting...") + try: + result = run_e2e_ops("restart-port-forward", timeout=60) + print(result.stdout, end="") + if result.returncode == 0: + print("[before_scenario] Port-forward re-established") + return + print(result.stderr, end="") + except subprocess.TimeoutExpired: + pass + + # Port-forward alone failed — the pod itself may be dead (e.g. Llama Stack + # was never restored after a disruption feature). Attempt a full restart, + # which also checks Llama health before recreating LCS. + print("[before_scenario] Port-forward failed; attempting full pod restart...") + try: + result = run_e2e_ops("restart-lightspeed", timeout=200) + print(result.stdout, end="") + if result.returncode != 0: + print(result.stderr, end="") + print("[before_scenario] Warning: full pod restart failed") + else: + print("[before_scenario] Pod restart + port-forward re-established") + except subprocess.TimeoutExpired: + print("[before_scenario] Warning: full pod restart timed out") + + def before_scenario(context: Context, scenario: Scenario) -> None: """Run before each scenario is run. @@ -157,6 +207,17 @@ def before_scenario(context: Context, scenario: Scenario) -> None: scenario.skip("Skipped in library mode (no separate llama-stack container)") return + # Skip scenarios that depend on services not deployed in Prow/OpenShift + # (e.g. mock-tls-inference, proxy sidecars only available in Docker Compose) + if is_prow_environment() and "skip-in-prow" in scenario.effective_tags: + scenario.skip("Skipped in Prow (requires Docker Compose services)") + return + + # In Prow, verify the lightspeed port-forward is alive before each scenario. 
+ # Port-forwards can silently die between scenarios (e.g. pod restart, TCP reset). + if is_prow_environment(): + _ensure_prow_port_forward(context) + context.scenario_lightspeed_override_active = False context.lightspeed_stack_skip_restart = False @@ -253,7 +314,7 @@ def _print_llama_stack_diagnostics() -> None: print("--- end diagnostics ---") -def _restore_llama_stack(context: Context) -> None: +def _restore_llama_stack() -> None: """Restore Llama Stack connection after disruption.""" if is_prow_environment(): # Recreate llama pod, then restart LCS so in-process clients reconnect (Llama IP/pod changed). @@ -306,7 +367,7 @@ def _restore_llama_stack(context: Context) -> None: "llama-stack", "curl", "-sf", - f"http://{context.hostname_llama}:{context.port_llama}/v1/health", + f"http://{get_llama_stack_hostname()}:{get_llama_stack_port()}/v1/health", ], capture_output=True, timeout=5, @@ -382,11 +443,12 @@ def after_feature(context: Context, feature: Feature) -> None: when ``context.feedback_e2e_conversation_cleanup`` is set by feedback steps, delete tracked feedback test conversations. """ - # Restore Llama Stack FIRST (before any lightspeed-stack restart) - llama_was_running = getattr(context, "llama_stack_was_running", False) - if llama_was_running: - _restore_llama_stack(context) - context.llama_stack_was_running = False + # Restore Llama Stack FIRST (before any lightspeed-stack restart). + # Read from module-level state — Behave clears custom context attributes + # between scenarios, so context.llama_stack_was_running is unreliable here. 
+ if get_llama_stack_was_running(): + _restore_llama_stack() + reset_llama_stack_was_running() if getattr(context, "feedback_e2e_conversation_cleanup", False): token = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6Ikpva" diff --git a/tests/e2e/features/proxy.feature b/tests/e2e/features/proxy.feature index aaab54f4e..d4d9a49f0 100644 --- a/tests/e2e/features/proxy.feature +++ b/tests/e2e/features/proxy.feature @@ -1,4 +1,4 @@ -@e2e_group_3 @skip-in-library-mode +@e2e_group_3 @skip-in-library-mode @skip-in-prow Feature: Proxy and TLS networking tests for Llama Stack providers Verify that the Lightspeed Stack works correctly when Llama Stack's diff --git a/tests/e2e/features/steps/common.py b/tests/e2e/features/steps/common.py index d90e8084e..5b7b3e715 100644 --- a/tests/e2e/features/steps/common.py +++ b/tests/e2e/features/steps/common.py @@ -21,12 +21,26 @@ # YAML across scenarios in one feature. Mutate the dict entry (no global). _active_lightspeed_stack_config_basename: dict[str, Optional[str]] = {"basename": None} +# Behave clears user attributes on ``context`` between scenarios; store +# Llama Stack endpoint info at module level so ``after_feature`` can see it. +_llama_stack_endpoint: dict[str, str] = {"hostname": "localhost", "port": "8321"} + def reset_active_lightspeed_stack_config_basename() -> None: """Reset before each feature; see ``environment.before_feature``.""" _active_lightspeed_stack_config_basename["basename"] = None +def get_llama_stack_hostname() -> str: + """Return the Llama Stack hostname surviving per-scenario context clearing.""" + return _llama_stack_endpoint["hostname"] + + +def get_llama_stack_port() -> str: + """Return the Llama Stack port surviving per-scenario context clearing.""" + return _llama_stack_endpoint["port"] + + @given("The service is started locally") def service_is_started_locally(context: Context) -> None: """Check the service status. 
@@ -46,6 +60,8 @@ def service_is_started_locally(context: Context) -> None: else: context.hostname_llama = "localhost" context.port_llama = os.getenv("E2E_LLAMA_PORT", "8321") + _llama_stack_endpoint["hostname"] = context.hostname_llama + _llama_stack_endpoint["port"] = context.port_llama @given('the Lightspeed stack configuration directory is "{directory}"') diff --git a/tests/e2e/features/steps/health.py b/tests/e2e/features/steps/health.py index b42ffe859..dd5243c5a 100644 --- a/tests/e2e/features/steps/health.py +++ b/tests/e2e/features/steps/health.py @@ -13,10 +13,25 @@ # Mutate one dict entry so we need not reassign a module-level bool (no global). _llama_stack_disrupt_once: dict[str, bool] = {"applied": False} +# Behave clears user attributes on ``context`` between scenarios; store +# ``was_running`` at module level so ``after_feature`` can still see it. +_llama_stack_was_running: dict[str, bool] = {"value": False} + + +def get_llama_stack_was_running() -> bool: + """Return whether Llama Stack was running before the disruption step.""" + return _llama_stack_was_running["value"] + + +def reset_llama_stack_was_running() -> None: + """Clear the module-level was_running flag after restoration.""" + _llama_stack_was_running["value"] = False + def reset_llama_stack_disrupt_once_tracking() -> None: """Reset before each feature; see ``environment.before_feature``.""" _llama_stack_disrupt_once["applied"] = False + _llama_stack_was_running["value"] = False @given("The llama-stack connection is disrupted") @@ -50,13 +65,18 @@ def llama_stack_connection_broken(context: Context) -> None: print("Llama Stack disruption skipped (already applied once this feature)") return - # Store original state for restoration (only on the real disruption path) + # Store original state for restoration (only on the real disruption path). + # Write to both context (backward compat) and module-level dict (survives + # Behave's per-scenario context clearing). 
context.llama_stack_was_running = False + _llama_stack_was_running["value"] = False if is_prow_environment(): from tests.e2e.utils.prow_utils import disrupt_llama_stack_pod - context.llama_stack_was_running = disrupt_llama_stack_pod() + was_running = disrupt_llama_stack_pod() + context.llama_stack_was_running = was_running + _llama_stack_was_running["value"] = was_running _llama_stack_disrupt_once["applied"] = True return @@ -71,6 +91,7 @@ def llama_stack_connection_broken(context: Context) -> None: if result.stdout.strip(): context.llama_stack_was_running = True + _llama_stack_was_running["value"] = True subprocess.run( ["docker", "stop", "llama-stack"], check=True, capture_output=True ) diff --git a/tests/e2e/features/tls.feature b/tests/e2e/features/tls.feature index 5a2d77338..a900b1c0f 100644 --- a/tests/e2e/features/tls.feature +++ b/tests/e2e/features/tls.feature @@ -1,4 +1,4 @@ -@e2e_group_1 @skip-in-library-mode +@e2e_group_1 @skip-in-library-mode @skip-in-prow Feature: TLS configuration for remote inference providers Validate that Llama Stack's NetworkConfig.tls settings are applied correctly when connecting to a remote inference provider over HTTPS. diff --git a/tests/e2e/utils/prow_utils.py b/tests/e2e/utils/prow_utils.py index 60e9a7a71..ca06727ea 100644 --- a/tests/e2e/utils/prow_utils.py +++ b/tests/e2e/utils/prow_utils.py @@ -197,7 +197,7 @@ def remove_configmap_backup(backup_key: str) -> None: def _recreate_configmap(configmap_name: str, source_file: str) -> None: - """Delete and recreate a ConfigMap from a file. + """Update a ConfigMap from a file via oc apply. Args: configmap_name: Name of the ConfigMap. 
@@ -205,6 +205,8 @@ def _recreate_configmap(configmap_name: str, source_file: str) -> None: """ result = run_e2e_ops("update-configmap", [configmap_name, source_file], timeout=60) if result.returncode != 0: + print(f"update-configmap stdout: {result.stdout}") + print(f"update-configmap stderr: {result.stderr}") raise subprocess.CalledProcessError( result.returncode, "update-configmap", result.stderr )