From 896b3384bf2782d39c8076c741b87250a2b7ced8 Mon Sep 17 00:00:00 2001 From: Tirso Garcia Date: Wed, 18 Mar 2026 21:47:15 +0100 Subject: [PATCH] feat: docs, delivery RBAC, and session/policy metrics (gaps 6-9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gap 6 — docs/CONFIGURATION.md: Complete env var reference (80+ vars across 13 sections) Gap 7 — docs/DEPLOYMENT-TLS.md: Step-by-step TLS setup guide (cert generation, Helm values, verification, troubleshooting) for all 5 transports Gap 8 — Helm delivery RBAC: Separate ServiceAccount + Role + RoleBinding for K8s delivery tools (gated by kubernetesBackend.deliveryTools.enabled) Gap 9 — Session/policy Prometheus metrics: workspace_sessions_created_total, workspace_sessions_closed_total, workspace_discovery_requests_total, workspace_invocations_denied_total{reason} Added to existing KPIMetrics system, emitted on /metrics endpoint Co-Authored-By: Claude Opus 4.6 (1M context) --- .../templates/rbac-delivery.yaml | 47 ++ charts/underpass-runtime/templates/rbac.yaml | 45 -- charts/underpass-runtime/values.yaml | 4 + docs/CONFIGURATION.md | 226 ++++++ docs/DEPLOYMENT-TLS.md | 668 ++++++++++++++++++ internal/app/discovery.go | 4 + internal/app/kpi_metrics.go | 67 +- internal/app/kpi_metrics_test.go | 94 +++ internal/app/service.go | 23 +- 9 files changed, 1129 insertions(+), 49 deletions(-) create mode 100644 charts/underpass-runtime/templates/rbac-delivery.yaml create mode 100644 docs/CONFIGURATION.md create mode 100644 docs/DEPLOYMENT-TLS.md diff --git a/charts/underpass-runtime/templates/rbac-delivery.yaml b/charts/underpass-runtime/templates/rbac-delivery.yaml new file mode 100644 index 0000000..60bc416 --- /dev/null +++ b/charts/underpass-runtime/templates/rbac-delivery.yaml @@ -0,0 +1,47 @@ +{{- if .Values.kubernetesBackend.deliveryTools.enabled }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ default (printf "%s-delivery" (include "underpass-runtime.fullname" 
.)) .Values.kubernetesBackend.deliveryTools.serviceAccount }} + namespace: {{ .Values.kubernetesBackend.deliveryTools.namespace | default .Values.kubernetesBackend.namespace | default .Release.Namespace }} + labels: + {{- include "underpass-runtime.labels" . | nindent 4 }} +automountServiceAccountToken: true +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "underpass-runtime.fullname" . }}-delivery + namespace: {{ .Values.kubernetesBackend.deliveryTools.namespace | default .Values.kubernetesBackend.namespace | default .Release.Namespace }} + labels: + {{- include "underpass-runtime.labels" . | nindent 4 }} +rules: + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch", "patch", "update"] + - apiGroups: [""] + resources: ["services", "configmaps"] + verbs: ["get", "list", "watch", "create", "update", "patch"] + - apiGroups: ["apps"] + resources: ["deployments", "replicasets"] + verbs: ["get", "list", "watch", "create", "update", "patch"] + - apiGroups: ["apps"] + resources: ["deployments/rollout"] + verbs: ["update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "underpass-runtime.fullname" . }}-delivery + namespace: {{ .Values.kubernetesBackend.deliveryTools.namespace | default .Values.kubernetesBackend.namespace | default .Release.Namespace }} + labels: + {{- include "underpass-runtime.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "underpass-runtime.fullname" . 
}}-delivery +subjects: + - kind: ServiceAccount + name: {{ default (printf "%s-delivery" (include "underpass-runtime.fullname" .)) .Values.kubernetesBackend.deliveryTools.serviceAccount }} + namespace: {{ .Values.kubernetesBackend.deliveryTools.namespace | default .Values.kubernetesBackend.namespace | default .Release.Namespace }} +{{- end }} diff --git a/charts/underpass-runtime/templates/rbac.yaml b/charts/underpass-runtime/templates/rbac.yaml index 9a7d908..d477945 100644 --- a/charts/underpass-runtime/templates/rbac.yaml +++ b/charts/underpass-runtime/templates/rbac.yaml @@ -39,49 +39,4 @@ subjects: - kind: ServiceAccount name: {{ include "underpass-runtime.serviceAccountName" . }} namespace: {{ .Release.Namespace }} -{{- if .Values.kubernetesBackend.deliveryTools.enabled }} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: {{ include "underpass-runtime.fullname" . }}-delivery - namespace: {{ .Values.kubernetesBackend.namespace | default .Release.Namespace }} - labels: - {{- include "underpass-runtime.labels" . | nindent 4 }} -rules: - - apiGroups: [""] - resources: ["services", "configmaps"] - verbs: ["get", "list", "watch", "create", "update", "patch"] - - apiGroups: ["apps"] - resources: ["deployments"] - verbs: ["get", "list", "watch", "create", "update", "patch"] - - apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "underpass-runtime.fullname" . }}-delivery - namespace: {{ .Values.kubernetesBackend.namespace | default .Release.Namespace }} - labels: - {{- include "underpass-runtime.labels" . | nindent 4 }} -automountServiceAccountToken: true ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: {{ include "underpass-runtime.fullname" . }}-delivery - namespace: {{ .Values.kubernetesBackend.namespace | default .Release.Namespace }} - labels: - {{- include "underpass-runtime.labels" . 
| nindent 4 }} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: {{ include "underpass-runtime.fullname" . }}-delivery -subjects: - - kind: ServiceAccount - name: {{ include "underpass-runtime.fullname" . }}-delivery - namespace: {{ .Values.kubernetesBackend.namespace | default .Release.Namespace }} -{{- end }} {{- end }} diff --git a/charts/underpass-runtime/values.yaml b/charts/underpass-runtime/values.yaml index dca135f..5dfca9f 100644 --- a/charts/underpass-runtime/values.yaml +++ b/charts/underpass-runtime/values.yaml @@ -288,6 +288,10 @@ kubernetesBackend: deliveryTools: # -- Create delivery ServiceAccount + Role for K8s delivery tools enabled: false + # -- Target namespace for delivery RBAC (defaults to kubernetesBackend.namespace, then the release namespace) + namespace: "" + # -- ServiceAccount name for delivery tools (auto-generated if empty) + serviceAccount: "" # -- RBAC for runner pod management rbac: diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md new file mode 100644 index 0000000..6f85090 --- /dev/null +++ b/docs/CONFIGURATION.md @@ -0,0 +1,226 @@ +# Workspace Service -- Environment Variable Reference + +Complete reference for all environment variables consumed by the workspace service +(`cmd/workspace/main.go` and `cmd/workspace/main_k8s.go`). + +--- + +## Core + +| Variable | Default | Description | +|---|---|---| +| `PORT` | `50053` | HTTP(S) listen port. | +| `LOG_LEVEL` | `info` | Log verbosity. Accepted values: `debug`, `info`, `warn`, `error` (case-insensitive). | +| `WORKSPACE_ROOT` | `/tmp/underpass-workspaces` | Filesystem root for local workspaces. Only used when `WORKSPACE_BACKEND=local`. | +| `ARTIFACT_ROOT` | `/tmp/underpass-artifacts` | Filesystem root for the local artifact store. Created automatically on startup. | +| `WORKSPACE_BACKEND` | `local` | Workspace lifecycle backend. Values: `local`, `docker`, `kubernetes`. The `kubernetes` variant requires the `k8s` build tag.
| +| `WORKSPACE_DISABLED_BUNDLES` | _(empty -- all enabled)_ | Comma-separated list of tool bundle names to disable (e.g. `messaging,data`). | + +--- + +## TLS -- HTTP Server + +Controls TLS on the workspace HTTP server itself. + +| Variable | Default | Description | +|---|---|---| +| `WORKSPACE_TLS_MODE` | `disabled` | TLS mode for the HTTP listener. Values: `disabled` / `plaintext` (plain HTTP), `server` / `tls` (server-side TLS), `mutual` / `mtls` (mutual TLS -- requires client CA). | +| `WORKSPACE_TLS_CERT_PATH` | _(none)_ | Path to the PEM-encoded server certificate. Required when mode is `server` or `mutual`. | +| `WORKSPACE_TLS_KEY_PATH` | _(none)_ | Path to the PEM-encoded server private key. Required when mode is `server` or `mutual`. | +| `WORKSPACE_TLS_CLIENT_CA_PATH` | _(none)_ | Path to a PEM CA bundle used to verify client certificates. Required when mode is `mutual`. | + +--- + +## TLS -- Valkey + +Controls TLS for all Valkey connections (session store, invocation store, telemetry, outbox). +The Go runtime uses explicit env vars (not the `rediss://` URI scheme used by the Rust kernel). + +| Variable | Default | Description | +|---|---|---| +| `VALKEY_TLS_ENABLED` | `false` | Enable TLS for Valkey connections. Accepts `true`/`false`/`1`/`0`/`yes`/`no`. | +| `VALKEY_TLS_CA_PATH` | _(none)_ | Path to PEM CA bundle for verifying the Valkey server certificate. | +| `VALKEY_TLS_CERT_PATH` | _(none)_ | Path to PEM client certificate for mutual TLS. Optional -- when both cert and key are set, mode upgrades from `server` to `mutual`. | +| `VALKEY_TLS_KEY_PATH` | _(none)_ | Path to PEM client private key for mutual TLS. | + +--- + +## TLS -- NATS + +Controls TLS for the NATS event-bus connection. + +| Variable | Default | Description | +|---|---|---| +| `NATS_TLS_MODE` | `disabled` | TLS mode for the NATS client. Values: `disabled` / `plaintext`, `server` / `tls`, `mutual` / `mtls`. 
| +| `NATS_TLS_CA_PATH` | _(none)_ | Path to PEM CA bundle for verifying the NATS server certificate. | +| `NATS_TLS_CERT_PATH` | _(none)_ | Path to PEM client certificate (mutual TLS). | +| `NATS_TLS_KEY_PATH` | _(none)_ | Path to PEM client private key (mutual TLS). | +| `NATS_TLS_FIRST` | `false` | Env-var parity with the Rust kernel. The Go `nats.go` client does **not** support TLS-first handshake; if set to `true`, a warning is logged and the flag is ignored. | + +--- + +## Session Store + +| Variable | Default | Description | +|---|---|---| +| `SESSION_STORE_BACKEND` | `memory` | Session persistence backend. Values: `memory`, `valkey`. | +| `SESSION_STORE_KEY_PREFIX` | `workspace:session` | Redis/Valkey key prefix for session records. Only used when backend is `valkey`. | +| `SESSION_STORE_TTL_SECONDS` | `86400` | Session TTL in seconds (default 24 h). Only used when backend is `valkey`. | + +--- + +## Invocation Store + +| Variable | Default | Description | +|---|---|---| +| `INVOCATION_STORE_BACKEND` | `memory` | Invocation persistence backend. Values: `memory`, `valkey`. | +| `INVOCATION_STORE_KEY_PREFIX` | `workspace:invocation` | Redis/Valkey key prefix for invocation records. Only used when backend is `valkey`. | +| `INVOCATION_STORE_TTL_SECONDS` | `86400` | Invocation TTL in seconds (default 24 h). Only used when backend is `valkey`. | + +--- + +## Valkey Connection + +Shared by all Valkey-backed subsystems (session store, invocation store, telemetry, outbox). + +| Variable | Default | Description | +|---|---|---| +| `VALKEY_ADDR` | _(none)_ | Full `host:port` address. When set, takes precedence over `VALKEY_HOST` + `VALKEY_PORT`. | +| `VALKEY_HOST` | `localhost` | Valkey hostname. Ignored when `VALKEY_ADDR` is set. | +| `VALKEY_PORT` | `6379` | Valkey port. Ignored when `VALKEY_ADDR` is set. | +| `VALKEY_PASSWORD` | _(empty)_ | Valkey AUTH password. | +| `VALKEY_DB` | `0` | Valkey database index (integer). 
| + +--- + +## Event Bus + +| Variable | Default | Description | +|---|---|---| +| `EVENT_BUS` | `none` | Event publisher backend. Values: `none` (noop), `nats`. | +| `EVENT_BUS_NATS_URL` | `nats://localhost:4222` | NATS server URL. Only used when `EVENT_BUS=nats`. | +| `EVENT_BUS_NATS_STREAM` | _(empty)_ | JetStream stream name. Empty string disables JetStream (uses core NATS publish). | +| `EVENT_BUS_OUTBOX` | `false` | Enable the Valkey-backed outbox relay between the service and the NATS publisher. Requires a working Valkey connection. | +| `EVENT_BUS_OUTBOX_KEY_PREFIX` | `workspace:outbox` | Redis/Valkey key prefix for outbox entries. | + +--- + +## Artifact Store + +| Variable | Default | Description | +|---|---|---| +| `ARTIFACT_BACKEND` | `local` | Artifact persistence backend. Values: `local`, `s3`. | +| `ARTIFACT_S3_BUCKET` | `workspace-artifacts` | S3 bucket name. | +| `ARTIFACT_S3_PREFIX` | _(empty)_ | Optional key prefix inside the bucket. | +| `ARTIFACT_S3_ENDPOINT` | _(empty)_ | Custom S3-compatible endpoint (e.g. MinIO). Leave empty for AWS S3 default. | +| `ARTIFACT_S3_REGION` | `us-east-1` | AWS region for the S3 bucket. | +| `ARTIFACT_S3_ACCESS_KEY` | _(empty)_ | S3 access key ID. | +| `ARTIFACT_S3_SECRET_KEY` | _(empty)_ | S3 secret access key. | +| `ARTIFACT_S3_PATH_STYLE` | `true` | Use path-style addressing (`true` for MinIO, `false` for AWS virtual-hosted). | +| `ARTIFACT_S3_USE_SSL` | `false` | Enable HTTPS for the S3 connection. | +| `ARTIFACT_S3_CA_PATH` | _(none)_ | Path to PEM CA bundle for verifying the S3 endpoint certificate. | + +--- + +## Telemetry + +### Internal telemetry (Valkey aggregator) + +| Variable | Default | Description | +|---|---|---| +| `TELEMETRY_BACKEND` | `none` | Telemetry recorder backend. Values: `none` (noop recorder + in-memory querier), `memory` (in-memory recorder + querier), `valkey` (persistent). | +| `TELEMETRY_KEY_PREFIX` | `workspace:telemetry` | Redis/Valkey key prefix for telemetry data. 
Only used when backend is `valkey`. | +| `TELEMETRY_TTL_SECONDS` | `604800` | Telemetry record TTL in seconds (default 7 days). Only used when backend is `valkey`. | +| `TELEMETRY_AGGREGATION_INTERVAL_SECONDS` | `300` | Background aggregation loop interval in seconds (default 5 min). Only used when backend is `valkey`. | + +### OpenTelemetry (OTLP traces) + +| Variable | Default | Description | +|---|---|---| +| `WORKSPACE_OTEL_ENABLED` | `false` | Enable the OTLP trace exporter. | +| `WORKSPACE_OTEL_EXPORTER_OTLP_ENDPOINT` | _(empty -- SDK default)_ | OTLP HTTP endpoint (e.g. `otel-collector:4318`). | +| `WORKSPACE_OTEL_EXPORTER_OTLP_INSECURE` | `false` | Disable TLS verification for the OTLP exporter (dev/test only). | +| `WORKSPACE_OTEL_TLS_CA_PATH` | _(none)_ | Path to PEM CA bundle for verifying the OTLP collector certificate. | +| `WORKSPACE_VERSION` | `unknown` | Reported as `service.version` in OTLP resource attributes. | +| `WORKSPACE_ENV` | `unknown` | Reported as `deployment.environment` in OTLP resource attributes. | + +--- + +## Kubernetes Backend + +All variables below apply only when `WORKSPACE_BACKEND=kubernetes` and the binary is built with the `k8s` build tag. + +### Workspace pods + +| Variable | Default | Description | +|---|---|---| +| `WORKSPACE_K8S_NAMESPACE` | `underpass-runtime` | Kubernetes namespace for workspace pods. | +| `WORKSPACE_K8S_SERVICE_ACCOUNT` | _(empty)_ | ServiceAccount assigned to workspace pods. Empty means the namespace default. | +| `WORKSPACE_K8S_RUNNER_IMAGE` | _(empty)_ | Default container image for workspace runner pods. | +| `WORKSPACE_K8S_RUNNER_IMAGE_BUNDLES_JSON` | _(empty)_ | JSON object mapping runner profile names to container images (e.g. `{"python":"img:py","node":"img:node"}`). | +| `WORKSPACE_K8S_RUNNER_PROFILE_METADATA_KEY` | `runner_profile` | Session metadata key used to select a runner image bundle. | +| `WORKSPACE_K8S_INIT_IMAGE` | _(empty)_ | Init container image (repo clone, workspace setup). 
| +| `WORKSPACE_K8S_WORKDIR` | `/workspace/repo` | Working directory inside the runner container. | +| `WORKSPACE_K8S_CONTAINER` | `runner` | Name of the runner container inside the pod spec. | +| `WORKSPACE_K8S_POD_PREFIX` | `ws` | Prefix for generated pod names (e.g. `ws-`). | +| `WORKSPACE_K8S_READY_TIMEOUT_SECONDS` | `120` | Maximum wait time (seconds) for a workspace pod to reach Ready. | +| `WORKSPACE_K8S_GIT_AUTH_SECRET` | _(empty)_ | Name of the Kubernetes Secret containing Git credentials, mounted into the init container. | +| `WORKSPACE_K8S_GIT_AUTH_METADATA_KEY` | `git_auth_secret` | Session metadata key that overrides the default Git auth secret name. | + +### Pod security context + +| Variable | Default | Description | +|---|---|---| +| `WORKSPACE_K8S_RUN_AS_USER` | `1000` | UID for the pod security context `runAsUser`. | +| `WORKSPACE_K8S_RUN_AS_GROUP` | `1000` | GID for the pod security context `runAsGroup`. | +| `WORKSPACE_K8S_FS_GROUP` | `1000` | GID for the pod security context `fsGroup`. | +| `WORKSPACE_K8S_READ_ONLY_ROOT_FS` | `false` | Set `readOnlyRootFilesystem` on the runner container. | +| `WORKSPACE_K8S_AUTOMOUNT_SA_TOKEN` | `false` | Set `automountServiceAccountToken` on the pod spec. | + +### Pod janitor (garbage collection) + +| Variable | Default | Description | +|---|---|---| +| `WORKSPACE_K8S_POD_JANITOR_ENABLED` | `true` | Enable the background pod janitor loop. | +| `WORKSPACE_K8S_POD_JANITOR_INTERVAL_SECONDS` | `60` | Interval (seconds) between janitor sweeps. | +| `WORKSPACE_K8S_SESSION_POD_TERMINAL_TTL_SECONDS` | `300` | Grace period (seconds) before deleting a pod whose session has terminated. | +| `WORKSPACE_K8S_CONTAINER_POD_TERMINAL_TTL_SECONDS` | `300` | Grace period (seconds) before deleting a pod whose containers have terminated. | +| `WORKSPACE_K8S_MISSING_SESSION_GRACE_SECONDS` | `120` | Grace period (seconds) before deleting a pod whose session record no longer exists. 
| + +### Kubernetes client + +| Variable | Default | Description | +|---|---|---| +| `KUBECONFIG` | _(none)_ | Path to a kubeconfig file. Falls back to `~/.kube/config`, then in-cluster config. | + +--- + +## Authentication + +| Variable | Default | Description | +|---|---|---| +| `WORKSPACE_AUTH_MODE` | `payload` | Authentication mode. Values: `payload` (identity extracted from request body -- no token check), `trusted_headers` (identity from HTTP headers, validated with a shared token). | +| `WORKSPACE_AUTH_SHARED_TOKEN` | _(empty)_ | Shared secret token. Required when mode is `trusted_headers`. Compared in constant time. | +| `WORKSPACE_AUTH_TENANT_HEADER` | `X-Workspace-Tenant-Id` | HTTP header carrying the tenant identifier. | +| `WORKSPACE_AUTH_ACTOR_HEADER` | `X-Workspace-Actor-Id` | HTTP header carrying the actor (user/agent) identifier. | +| `WORKSPACE_AUTH_ROLES_HEADER` | `X-Workspace-Roles` | HTTP header carrying a comma-separated list of roles. | +| `WORKSPACE_AUTH_TOKEN_HEADER` | `X-Workspace-Auth-Token` | HTTP header carrying the shared authentication token. | + +--- + +## Docker Backend + +All variables below apply only when `WORKSPACE_BACKEND=docker`. + +| Variable | Default | Description | +|---|---|---| +| `WORKSPACE_DOCKER_SOCKET` | _(empty -- Docker default)_ | Path to the Docker daemon socket (e.g. `/var/run/docker.sock`). Empty uses the client library default from `DOCKER_HOST`. | +| `WORKSPACE_DOCKER_IMAGE` | `alpine:3.20` | Default container image for workspace containers. | +| `WORKSPACE_DOCKER_IMAGE_BUNDLES_JSON` | _(empty)_ | JSON object mapping runner profile names to container images. Same semantics as the K8s equivalent. | +| `WORKSPACE_DOCKER_RUNNER_PROFILE_KEY` | `runner_profile` | Session metadata key used to select an image bundle. | +| `WORKSPACE_DOCKER_WORKDIR` | `/workspace/repo` | Working directory inside the workspace container. | +| `WORKSPACE_DOCKER_CONTAINER_PREFIX` | `ws` | Prefix for generated container names. 
| +| `WORKSPACE_DOCKER_NETWORK` | _(empty)_ | Docker network to attach workspace containers to. Empty means the default bridge. | +| `WORKSPACE_DOCKER_CPU_LIMIT` | `2` | CPU core limit for workspace containers (integer). | +| `WORKSPACE_DOCKER_MEMORY_LIMIT_MB` | `2048` | Memory limit in MiB for workspace containers. | +| `WORKSPACE_DOCKER_TTL_SECONDS` | `3600` | Container TTL in seconds (default 1 h). Containers exceeding this age may be reaped. | diff --git a/docs/DEPLOYMENT-TLS.md b/docs/DEPLOYMENT-TLS.md new file mode 100644 index 0000000..fb3ca17 --- /dev/null +++ b/docs/DEPLOYMENT-TLS.md @@ -0,0 +1,668 @@ +# TLS Deployment Guide for underpass-runtime + +This guide covers enabling TLS across all five transports in underpass-runtime: +HTTP server, NATS event bus, Valkey session/invocation store, S3/MinIO artifact +storage, and OTLP telemetry export. + +## Table of Contents + +1. [Overview](#1-overview) +2. [Generate Self-Signed Certificates](#2-generate-self-signed-certificates) +3. [Create Kubernetes Secrets](#3-create-kubernetes-secrets) +4. [Deploy with Server TLS (tls.mode=server)](#4-deploy-with-server-tls) +5. [Deploy with Mutual TLS (tls.mode=mutual)](#5-deploy-with-mutual-tls) +6. [Valkey TLS Setup](#6-valkey-tls-setup) +7. [NATS TLS Setup](#7-nats-tls-setup) +8. [S3/MinIO TLS](#8-s3minio-tls) +9. [OTLP TLS](#9-otlp-tls) +10. [Verification](#10-verification) +11. [Troubleshooting](#11-troubleshooting) + +--- + +## 1. Overview + +### TLS Modes + +underpass-runtime supports three TLS modes, controlled by the `tls.mode` Helm +value (or the `WORKSPACE_TLS_MODE` environment variable): + +| Mode | Aliases | Behaviour | +|------------|--------------------|------------------------------------------------------------| +| `disabled` | `plaintext`, `""` | No TLS. Plain HTTP on the configured port. | +| `server` | `tls` | Server presents a certificate. Clients verify it. | +| `mutual` | `mtls` | Server presents a certificate AND requires a client cert. 
| + +### TLS 1.3 Minimum + +All TLS configurations enforce **TLS 1.3 as the minimum version**. This is +hard-coded in `internal/tlsutil/tls.go` and applies to every transport (HTTP +server, NATS client, Valkey client, S3 client, OTLP client). Connections from +clients that do not support TLS 1.3 will be rejected. + +### Environment Variables Reference + +| Transport | Variables | +|-------------|-----------------------------------------------------------------------------------------------| +| HTTP server | `WORKSPACE_TLS_MODE`, `WORKSPACE_TLS_CERT_PATH`, `WORKSPACE_TLS_KEY_PATH`, `WORKSPACE_TLS_CLIENT_CA_PATH` | +| Valkey | `VALKEY_TLS_ENABLED`, `VALKEY_TLS_CA_PATH`, `VALKEY_TLS_CERT_PATH`, `VALKEY_TLS_KEY_PATH` | +| NATS | `NATS_TLS_MODE`, `NATS_TLS_CA_PATH`, `NATS_TLS_CERT_PATH`, `NATS_TLS_KEY_PATH`, `NATS_TLS_FIRST` | +| S3/MinIO | `ARTIFACT_S3_USE_SSL`, `ARTIFACT_S3_CA_PATH` | +| OTLP | `WORKSPACE_OTEL_TLS_CA_PATH` | + +--- + +## 2. Generate Self-Signed Certificates + +The commands below create a self-signed CA and a server certificate with a +Subject Alternative Name (SAN). Adjust the SAN values to match your cluster's +service DNS names. + +### 2a. Create a Certificate Authority (CA) + +```bash +# Generate CA private key +openssl genrsa -out ca.key 4096 + +# Generate CA certificate (valid 10 years) +openssl req -new -x509 -days 3650 -key ca.key \ + -out ca.crt \ + -subj "/CN=underpass-runtime-ca/O=Underpass" +``` + +### 2b. Create a Server Certificate + +```bash +# Generate server private key +openssl genrsa -out tls.key 4096 + +# Create a CSR config with SANs +cat > server-csr.conf <-underpass-runtime..svc.cluster.local`. + +### 2c. Create a Client Certificate (for mutual TLS) + +Only needed when `tls.mode=mutual`. + +```bash +openssl genrsa -out client.key 4096 + +cat > client-csr.conf <:/healthz` | +| `server` | `curl --cacert https://:/healthz` | +| `mutual` | `nc -z -w5 ` (TCP connectivity only) | + +### 10c. 
Manual Verification with curl + +From inside the cluster (e.g., a debug pod): + +**Server TLS:** + +```bash +# Copy ca.crt into the debug pod, then: +curl --cacert ca.crt \ + https://underpass-runtime.default.svc.cluster.local:50053/healthz +``` + +**Mutual TLS:** + +```bash +curl --cacert ca.crt \ + --cert client.crt \ + --key client.key \ + https://underpass-runtime.default.svc.cluster.local:50053/healthz +``` + +### 10d. Verify TLS 1.3 + +```bash +openssl s_client \ + -connect underpass-runtime.default.svc.cluster.local:50053 \ + -tls1_3 \ + -CAfile ca.crt \ + </dev/null 2>&1 | grep "Protocol" +``` + +Expected output: + +``` +Protocol : TLSv1.3 +``` + +Attempting TLS 1.2 should fail: + +```bash +openssl s_client \ + -connect underpass-runtime.default.svc.cluster.local:50053 \ + -tls1_2 \ + -CAfile ca.crt \ + </dev/null 2>&1 | grep -i "error\|alert" +``` + +--- + +## 11. Troubleshooting + +### "tls.existingSecret is required when tls.mode is server or mutual" + +The Helm chart requires `tls.existingSecret` whenever TLS is enabled. Create +the Kubernetes secret first (see section 3), then reference it in your values. + +### "no valid certificates in CA file /var/run/..." + +The CA file is not valid PEM or is empty. Verify the secret contents: + +```bash +kubectl get secret underpass-runtime-tls -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -text +``` + +### "load server cert/key: tls: private key does not match public key" + +The certificate and key in the secret do not form a valid pair. Verify: + +```bash +openssl x509 -noout -modulus -in tls.crt | md5sum +openssl rsa -noout -modulus -in tls.key | md5sum +``` + +Both checksums must match. + +### "remote error: tls: bad certificate" (mutual TLS) + +The client is either not presenting a certificate or presenting one that is not +signed by the CA specified in `tls.keys.clientCa`.
Verify the client cert: + +```bash +openssl verify -CAfile ca.crt client.crt +``` + +### Liveness/readiness probes fail with TLS enabled + +In `server` mode, the probes use `scheme: HTTPS`. The kubelet skips certificate +verification for HTTPS probes, so a SAN mismatch does not cause probe failures; +check instead that the certificate and key loaded and the server is listening. + +In `mutual` mode, probes fall back to `tcpSocket` because the kubelet cannot +present a client certificate. This is expected and handled automatically by the +chart. + +### NATS connection fails with "tls: first record does not look like a TLS handshake" + +The NATS server expects a NATS INFO handshake before TLS upgrade, but the +client is attempting TLS immediately. Use the `tls://` URL scheme in +`eventBus.nats.url` and do **not** rely on `NATS_TLS_FIRST` (not supported in +the Go client). + +### Valkey connection times out with TLS enabled + +Valkey in TLS mode typically listens on port **6380**, not 6379. Verify +`valkey.port` matches the Valkey server's TLS port. Also confirm that Valkey +was started with TLS enabled (`--tls-port 6380 --tls-cert-file ... --tls-key-file ... --tls-ca-cert-file ...`). + +### S3/MinIO "x509: certificate signed by unknown authority" + +Set `artifacts.s3.caPath` to the path of the CA that signed the MinIO server +certificate. If reusing the HTTP server TLS mount, point to +`/var/run/underpass-runtime/tls/ca.crt`. Otherwise, mount a separate secret. + +### OTLP export fails with "transport: authentication handshake failed" + +Set `telemetry.otel.caPath` to the CA certificate path for the OTLP collector. +If the collector uses a publicly-trusted cert, ensure the container image +includes an up-to-date CA bundle. Setting `telemetry.otel.insecure: true` will +bypass TLS entirely (not recommended for production). + +### Permission denied reading certificate files + +The container runs as non-root user 65532 (see `podSecurityContext` in +values.yaml).
Kubernetes secret volume mounts default to mode 0644, which is +readable by all users. If you override the secret volume `defaultMode`, ensure +the files remain readable by UID 65532. + +--- + +## Full Example: All Transports with TLS + +A single values file enabling TLS on every transport: + +```yaml +tls: + mode: server + existingSecret: underpass-runtime-tls + mountPath: /var/run/underpass-runtime/tls + keys: + cert: tls.crt + key: tls.key + clientCa: ca.crt + +natsTls: + mode: server + existingSecret: underpass-runtime-nats-tls + mountPath: /var/run/underpass-runtime/nats-tls + keys: + ca: ca.crt + cert: "" + key: "" + +valkeyTls: + enabled: true + existingSecret: underpass-runtime-valkey-tls + mountPath: /var/run/underpass-runtime/valkey-tls + keys: + ca: ca.crt + cert: "" + key: "" + +eventBus: + type: nats + nats: + url: tls://nats:4222 + +valkey: + enabled: true + host: valkey + port: 6380 + existingSecret: valkey-password + +stores: + backend: valkey + +artifacts: + backend: s3 + s3: + bucket: workspace-artifacts + endpoint: minio:9000 + useSSL: true + caPath: /var/run/underpass-runtime/tls/ca.crt + existingSecret: minio-credentials + +telemetry: + backend: valkey + otel: + enabled: true + endpoint: otel-collector:4317 + insecure: false + caPath: /var/run/underpass-runtime/tls/ca.crt +``` + +```bash +helm upgrade --install underpass-runtime \ + charts/underpass-runtime \ + -f values-tls-full.yaml +``` diff --git a/internal/app/discovery.go b/internal/app/discovery.go index 56cd5c9..8cba72b 100644 --- a/internal/app/discovery.go +++ b/internal/app/discovery.go @@ -83,6 +83,10 @@ func (s *Service) DiscoverTools(ctx context.Context, sessionID string, detail Di total := len(s.catalog.List()) + if s.kpiMetrics != nil { + s.kpiMetrics.ObserveDiscoveryRequest() + } + if detail == DiscoveryDetailFull { return s.discoverFull(tools, total, filter), nil } diff --git a/internal/app/kpi_metrics.go b/internal/app/kpi_metrics.go index fe65579..4415a61 100644 --- 
a/internal/app/kpi_metrics.go +++ b/internal/app/kpi_metrics.go @@ -28,12 +28,25 @@ type KPIMetrics struct { // workspace_context_bytes_saved: bytes saved by compact discovery contextBytesSaved int64 + + // workspace_sessions_created_total: sessions successfully created (HTTP 201) + sessionsCreated uint64 + + // workspace_sessions_closed_total: sessions successfully closed (HTTP 200) + sessionsClosed uint64 + + // workspace_discovery_requests_total: discovery endpoint served + discoveryRequests uint64 + + // workspace_invocations_denied_total{reason}: denied invocations by reason + invocationsDenied map[string]uint64 } // NewKPIMetrics creates a new KPI metrics tracker. func NewKPIMetrics() *KPIMetrics { return &KPIMetrics{ - toolCallsPerTask: map[string]uint64{}, + toolCallsPerTask: map[string]uint64{}, + invocationsDenied: map[string]uint64{}, } } @@ -85,6 +98,37 @@ func (k *KPIMetrics) ObserveContextBytesSaved(bytes int64) { k.contextBytesSaved += bytes } +// ObserveSessionCreated increments the sessions-created counter. +func (k *KPIMetrics) ObserveSessionCreated() { + k.mu.Lock() + defer k.mu.Unlock() + k.sessionsCreated++ +} + +// ObserveSessionClosed increments the sessions-closed counter. +func (k *KPIMetrics) ObserveSessionClosed() { + k.mu.Lock() + defer k.mu.Unlock() + k.sessionsClosed++ +} + +// ObserveDiscoveryRequest increments the discovery-requests counter. +func (k *KPIMetrics) ObserveDiscoveryRequest() { + k.mu.Lock() + defer k.mu.Unlock() + k.discoveryRequests++ +} + +// ObserveInvocationDenied increments the denied-invocations counter for the given reason. +func (k *KPIMetrics) ObserveInvocationDenied(reason string) { + k.mu.Lock() + defer k.mu.Unlock() + if reason == "" { + reason = "unspecified" + } + k.invocationsDenied[reason]++ +} + // PrometheusText returns Prometheus exposition format text for all KPI metrics. 
func (k *KPIMetrics) PrometheusText() string { k.mu.RLock() @@ -132,6 +176,27 @@ func (k *KPIMetrics) PrometheusText() string { b.WriteString("# TYPE workspace_context_bytes_saved counter\n") fmt.Fprintf(&b, "workspace_context_bytes_saved %d\n", k.contextBytesSaved) + b.WriteString("# HELP workspace_sessions_created_total Total sessions successfully created.\n") + b.WriteString("# TYPE workspace_sessions_created_total counter\n") + fmt.Fprintf(&b, "workspace_sessions_created_total %d\n", k.sessionsCreated) + + b.WriteString("# HELP workspace_sessions_closed_total Total sessions successfully closed.\n") + b.WriteString("# TYPE workspace_sessions_closed_total counter\n") + fmt.Fprintf(&b, "workspace_sessions_closed_total %d\n", k.sessionsClosed) + + b.WriteString("# HELP workspace_discovery_requests_total Total tool discovery requests served.\n") + b.WriteString("# TYPE workspace_discovery_requests_total counter\n") + fmt.Fprintf(&b, "workspace_discovery_requests_total %d\n", k.discoveryRequests) + + b.WriteString("# HELP workspace_invocations_denied_total Total denied invocations by denial reason.\n") + b.WriteString("# TYPE workspace_invocations_denied_total counter\n") + for _, reason := range sortedInnerKeys(k.invocationsDenied) { + fmt.Fprintf(&b, "workspace_invocations_denied_total{reason=\"%s\"} %d\n", + escapePrometheusLabelValue(reason), + k.invocationsDenied[reason], + ) + } + return b.String() } diff --git a/internal/app/kpi_metrics_test.go b/internal/app/kpi_metrics_test.go index 53b7c45..ec1f97e 100644 --- a/internal/app/kpi_metrics_test.go +++ b/internal/app/kpi_metrics_test.go @@ -93,6 +93,88 @@ func TestKPIMetrics_ContextBytesSaved(t *testing.T) { } } +func TestKPIMetrics_SessionsCreated(t *testing.T) { + kpi := NewKPIMetrics() + kpi.ObserveSessionCreated() + kpi.ObserveSessionCreated() + kpi.ObserveSessionCreated() + + text := kpi.PrometheusText() + if !strings.Contains(text, "workspace_sessions_created_total 3") { + t.Fatalf("expected 
sessions_created=3 in output:\n%s", text) + } +} + +func TestKPIMetrics_SessionsClosed(t *testing.T) { + kpi := NewKPIMetrics() + kpi.ObserveSessionClosed() + kpi.ObserveSessionClosed() + + text := kpi.PrometheusText() + if !strings.Contains(text, "workspace_sessions_closed_total 2") { + t.Fatalf("expected sessions_closed=2 in output:\n%s", text) + } +} + +func TestKPIMetrics_DiscoveryRequests(t *testing.T) { + kpi := NewKPIMetrics() + kpi.ObserveDiscoveryRequest() + kpi.ObserveDiscoveryRequest() + kpi.ObserveDiscoveryRequest() + kpi.ObserveDiscoveryRequest() + + text := kpi.PrometheusText() + if !strings.Contains(text, "workspace_discovery_requests_total 4") { + t.Fatalf("expected discovery_requests=4 in output:\n%s", text) + } +} + +func TestKPIMetrics_InvocationsDenied(t *testing.T) { + kpi := NewKPIMetrics() + kpi.ObserveInvocationDenied("policy_denied") + kpi.ObserveInvocationDenied("policy_denied") + kpi.ObserveInvocationDenied("approval_required") + kpi.ObserveInvocationDenied("") // defaults to "unspecified" + + text := kpi.PrometheusText() + if !strings.Contains(text, `workspace_invocations_denied_total{reason="policy_denied"} 2`) { + t.Fatalf("expected policy_denied=2 in output:\n%s", text) + } + if !strings.Contains(text, `workspace_invocations_denied_total{reason="approval_required"} 1`) { + t.Fatalf("expected approval_required=1 in output:\n%s", text) + } + if !strings.Contains(text, `workspace_invocations_denied_total{reason="unspecified"} 1`) { + t.Fatalf("expected unspecified=1 in output:\n%s", text) + } +} + +func TestKPIMetrics_InvocationsDenied_Empty(t *testing.T) { + kpi := NewKPIMetrics() + text := kpi.PrometheusText() + // No denied_total lines should appear when there are no denials. + if strings.Contains(text, "workspace_invocations_denied_total{") { + t.Fatalf("expected no denied lines when empty:\n%s", text) + } + // But the HELP/TYPE header should still be present. 
+ if !strings.Contains(text, "# HELP workspace_invocations_denied_total") { + t.Fatalf("expected HELP header for denied_total:\n%s", text) + } +} + +func TestKPIMetrics_SessionCounters_ZeroDefault(t *testing.T) { + kpi := NewKPIMetrics() + text := kpi.PrometheusText() + if !strings.Contains(text, "workspace_sessions_created_total 0") { + t.Fatalf("expected sessions_created=0 by default:\n%s", text) + } + if !strings.Contains(text, "workspace_sessions_closed_total 0") { + t.Fatalf("expected sessions_closed=0 by default:\n%s", text) + } + if !strings.Contains(text, "workspace_discovery_requests_total 0") { + t.Fatalf("expected discovery_requests=0 by default:\n%s", text) + } +} + func TestKPIMetrics_PrometheusText_AllSections(t *testing.T) { kpi := NewKPIMetrics() kpi.ObserveToolCall("build") @@ -100,6 +182,10 @@ func TestKPIMetrics_PrometheusText_AllSections(t *testing.T) { kpi.ObserveRecommendationUsed(true) kpi.ObservePolicyDenialAfterRecommendation(false) kpi.ObserveContextBytesSaved(512) + kpi.ObserveSessionCreated() + kpi.ObserveSessionClosed() + kpi.ObserveDiscoveryRequest() + kpi.ObserveInvocationDenied("policy_denied") text := kpi.PrometheusText() @@ -109,6 +195,10 @@ func TestKPIMetrics_PrometheusText_AllSections(t *testing.T) { "workspace_recommendation_acceptance_rate", "workspace_policy_denial_rate_bad_recommendation", "workspace_context_bytes_saved", + "workspace_sessions_created_total", + "workspace_sessions_closed_total", + "workspace_discovery_requests_total", + "workspace_invocations_denied_total", } for _, section := range expectedSections { if !strings.Contains(text, section) { @@ -130,6 +220,10 @@ func TestKPIMetrics_ConcurrentAccess(t *testing.T) { kpi.ObserveRecommendationUsed(true) kpi.ObservePolicyDenialAfterRecommendation(false) kpi.ObserveContextBytesSaved(10) + kpi.ObserveSessionCreated() + kpi.ObserveSessionClosed() + kpi.ObserveDiscoveryRequest() + kpi.ObserveInvocationDenied("policy_denied") _ = kpi.PrometheusText() } }() diff --git 
a/internal/app/service.go b/internal/app/service.go index cf7119b..7cb9f10 100644 --- a/internal/app/service.go +++ b/internal/app/service.go @@ -113,10 +113,14 @@ func (noopTelemetryQuerier) AllToolStats(context.Context) (map[string]ToolStats, } func (s *Service) PrometheusMetrics() string { - if s.metrics == nil { - return "" + var b strings.Builder + if s.metrics != nil { + b.WriteString(s.metrics.PrometheusText()) + } + if s.kpiMetrics != nil { + b.WriteString(s.kpiMetrics.PrometheusText()) } - return s.metrics.PrometheusText() + return b.String() } func (s *Service) CreateSession(ctx context.Context, req CreateSessionRequest) (domain.Session, *ServiceError) { @@ -141,6 +145,9 @@ func (s *Service) CreateSession(ctx context.Context, req CreateSessionRequest) ( ExpiresAt: session.ExpiresAt, WorkspaceDir: session.WorkspacePath, }) + if s.kpiMetrics != nil { + s.kpiMetrics.ObserveSessionCreated() + } return session, nil } @@ -159,6 +166,9 @@ func (s *Service) CloseSession(ctx context.Context, sessionID string) *ServiceEr DurationSec: durationSec, }) } + if s.kpiMetrics != nil { + s.kpiMetrics.ObserveSessionClosed() + } return nil } @@ -465,6 +475,13 @@ func (s *Service) denyInvocation(ctx context.Context, invocation domain.Invocati CorrelationID: invocation.CorrelationID, Reason: domErr.Message, }) + if s.kpiMetrics != nil { + reason := domErr.Code + if reason == "" { + reason = domErr.Message + } + s.kpiMetrics.ObserveInvocationDenied(reason) + } return invocation }