diff --git a/slurm-mcp-demo/.env.template b/slurm-mcp-demo/.env.template new file mode 100644 index 0000000..0f9e676 --- /dev/null +++ b/slurm-mcp-demo/.env.template @@ -0,0 +1,9 @@ +#refer to this https://docs.nvidia.com/nemoclaw/latest/inference/switch-inference-providers.html#switch-to-a-different-model to set different provider +# ── Inference API credentials ───────────────────────────────────── +INFERENCE_API_KEY= + +# ── Inference provider configuration ───────────────────────────── +INFERENCE_PROVIDER_TYPE= +INFERENCE_PROVIDER_NAME= +INFERENCE_BASE_URL= +INFERENCE_MODEL= diff --git a/slurm-mcp-demo/.gitignore b/slurm-mcp-demo/.gitignore new file mode 100644 index 0000000..0db865b --- /dev/null +++ b/slurm-mcp-demo/.gitignore @@ -0,0 +1,6 @@ +.venv/ +.env +fake_host_key +__pycache__/ +*.pyc + diff --git a/slurm-mcp-demo/fake_cluster_mcp_server.py b/slurm-mcp-demo/fake_cluster_mcp_server.py new file mode 100644 index 0000000..c1c61b0 --- /dev/null +++ b/slurm-mcp-demo/fake_cluster_mcp_server.py @@ -0,0 +1,203 @@ +#!/usr/bin/env python3 +""" +Fake Slurm HPC headnode — MCP server exposing raw Slurm tools. + +The OpenClaw agent (which has its own LLM) decides which tool to call. +No secondary LLM or NL dispatcher runs here. 
+ +Tools exposed: + get_hostname — return headnode hostname + sinfo — show partitions / node states + srun — launch a fake interactive training job + sbatch — submit a fake batch job + squeue — show job queue + sacctmgr — show account associations + sreport — show utilisation report + +Run: + python fake_cluster_mcp_server.py # streamable-http on 0.0.0.0:9000/mcp + python fake_cluster_mcp_server.py --port 9000 +""" +from __future__ import annotations + +import argparse +import os +import random + +from colorama import Fore, init as colorama_init +from fastmcp import FastMCP + +colorama_init(autoreset=True) + +mcp = FastMCP("fake-slurm-cluster") + +# --------------------------------------------------------------------------- +# In-memory job table (persists for the lifetime of the server process) +# --------------------------------------------------------------------------- +_jobs: dict[int, dict] = {} +_next_job_id = 42001 + +# --------------------------------------------------------------------------- +# Tool implementations +# --------------------------------------------------------------------------- + +@mcp.tool() +def get_hostname() -> str: + """Return the cluster headnode hostname.""" + return "dlcluster-headnode" + + +@mcp.tool() +def sinfo() -> str: + """Show available Slurm partitions and node states.""" + return ( + "PARTITION AVAIL TIMELIMIT NODES STATE NODELIST\n" + "gpu-a100* up infinite 4 idle node[01-04]\n" + "gpu-h100 up 2-00:00:00 8 idle node[05-12]\n" + "gpu-gb200 up 4-00:00:00 2 idle node[13-14]\n" + "cpu-general up infinite 16 idle node[15-30]\n" + ) + + +@mcp.tool() +def srun( + gpus: int = 1, + time_limit: str = "01:00:00", + epochs: int = 5, + model: str = "resnet50", +) -> str: + """Launch a fake interactive training job via srun. + + Args: + gpus: Number of GPUs to allocate. + time_limit: Wall-time limit HH:MM:SS. + epochs: Training epochs to simulate. + model: Model name printed in the epoch log. 
+ """ + global _next_job_id + job_id = _next_job_id + _next_job_id += 1 + + lines = [ + f"srun: job {job_id} queued and waiting for resources", + f"srun: job {job_id} has been allocated resources", + f"Allocated {gpus} GPU(s) on node01 | time_limit={time_limit}", + "", + ] + random.seed(job_id) + loss, acc = 3.2, 0.05 + for epoch in range(1, epochs + 1): + loss -= random.uniform(0.2, 0.5) + acc += random.uniform(0.05, 0.12) + lines.append( + f"Epoch [{epoch}/{epochs}] loss={loss:.4f} " + f"acc={min(acc, 1.0):.4f} lr=1e-4 gpu_util=94% model={model}" + ) + lines += [ + "", + f"Training complete. Checkpoints → /checkpoint/user/run_{job_id}/", + ] + _jobs[job_id] = {"state": "COMPLETED", "user": "user", + "partition": "gpu-a100", "name": model} + return "\n".join(lines) + + +@mcp.tool() +def sbatch(script_name: str = "train.sh") -> str: + """Submit a fake batch job. + + Args: + script_name: Name of the batch script. + """ + global _next_job_id + job_id = _next_job_id + _next_job_id += 1 + _jobs[job_id] = {"state": "RUNNING", "user": "user", + "partition": "gpu-a100", "name": script_name} + return f"Submitted batch job {job_id}" + + +@mcp.tool() +def squeue(user: str = "user") -> str: + """Show jobs in the Slurm queue. + + Args: + user: Filter by username; "all" to see every job. + """ + header = ( + " JOBID PARTITION NAME USER ST TIME NODES NODELIST\n" + ) + rows = [] + for jid, info in _jobs.items(): + if user == "all" or info["user"] == user: + st = "R" if info["state"] == "RUNNING" else "CG" + rows.append( + f" {jid:>5} {info['partition']:<10} " + f"{info['name']:<8} {info['user']:<8} {st} 0:01 1 node01" + ) + return header + ("\n".join(rows) if rows else "(no jobs)") + + +@mcp.tool() +def sacctmgr(user: str = "user") -> str: + """Show Slurm account associations for a user. + + Args: + user: Username to query. 
+ """ + return ( + " Cluster Account User Partition Share MaxJobs QOS\n" + "---------- ---------- --------- ---------- ------ ------- ---------\n" + "dlcluster root 1 normal\n" + "dlcluster root root 1 normal\n" + f"dlcluster {user:<10} 1 normal\n" + f"dlcluster {user:<10} {user:<9} 1 200 normal\n" + ) + + +@mcp.tool() +def sreport(user: str = "user") -> str: + """Show cluster utilisation report for a user. + + Args: + user: Username to query. + """ + return ( + "-----------------------------------------------------------\n" + "Cluster/Account/User Utilization 2024-01-01 - 2024-01-31\n" + "Usage reported in CPU Minutes\n" + "-----------------------------------------------------------\n" + " Cluster Account Login Used\n" + "--------- --------------- --------- ----------\n" + "dlcluster root 12,400\n" + f"dlcluster {user:<12} {user:<10} 298,102\n" + ) + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Fake Slurm MCP Server", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--host", default=os.environ.get("MCP_HOST", "0.0.0.0")) + parser.add_argument("--port", type=int, default=int(os.environ.get("MCP_PORT", "9000"))) + parser.add_argument("--path", default=os.environ.get("MCP_PATH", "/mcp")) + args = parser.parse_args() + + print( + Fore.GREEN + + f"[mcp-server] fake-slurm-cluster → " + f"http://{args.host}:{args.port}{args.path}" + ) + print(Fore.YELLOW + "[mcp-server] Reachable from sandbox via host's LAN/bridge IP on that port.") + mcp.run( + transport="streamable-http", + host=args.host, + port=args.port, + path=args.path, + show_banner=False, + ) diff --git a/slurm-mcp-demo/install.sh b/slurm-mcp-demo/install.sh new file mode 100644 index 0000000..466c9a4 --- /dev/null +++ b/slurm-mcp-demo/install.sh @@ -0,0 +1,330 @@ 
+#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +CREDS_PATH="$HOME/.nemoclaw/credentials.json" +MCP_PORT=9000 +TMUX_SESSION="slurm-mcp" + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +info() { echo -e "${CYAN} ▸ $1${NC}"; } +ok() { echo -e "${GREEN} ✓ $1${NC}"; } +warn() { echo -e "${YELLOW} ⚠ $1${NC}"; } +fail() { echo -e "${RED} ✗ $1${NC}"; exit 1; } + +echo "" +echo -e "${CYAN} ╔══════════════════════════════════════════════════════════╗${NC}" +echo -e "${CYAN} ║ Slurm MCP Demo Installer for NemoClaw ║${NC}" +echo -e "${CYAN} ║ Fake HPC Cluster via MCP + OpenClaw Skill ║${NC}" +echo -e "${CYAN} ╚══════════════════════════════════════════════════════════╝${NC}" +echo "" + +# ── Step 0: Clean up stale environment ─────────────────────────── +info "Cleaning up stale environment..." +if tmux has-session -t "$TMUX_SESSION" 2>/dev/null; then + tmux kill-session -t "$TMUX_SESSION" + ok "Killed existing tmux session '$TMUX_SESSION'" +fi +STALE=$(pgrep -f "fake_cluster_mcp_server" 2>/dev/null || true) +if [ -n "$STALE" ]; then + kill $STALE 2>/dev/null || true + ok "Killed stale MCP server process(es)" +fi +ok "Environment clean" +echo "" + +# ── Step 1: Check prerequisites ────────────────────────────────── +info "Checking prerequisites..." +command -v openshell >/dev/null 2>&1 || fail "openshell CLI not found. Is NemoClaw installed?" +command -v nemoclaw >/dev/null 2>&1 || fail "nemoclaw CLI not found. Is NemoClaw installed?" +command -v python3 >/dev/null 2>&1 || fail "python3 not found." + +if ! command -v uv >/dev/null 2>&1; then + warn "uv not found — installing..." + curl -LsSf https://astral.sh/uv/install.sh | sh + export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH" + command -v uv >/dev/null 2>&1 || fail "uv install failed. Add ~/.local/bin to PATH and retry." 
+ ok "uv installed" +fi +ok "Prerequisites OK" +echo "" + +# ── Step 2: Load .env and resolve inference config ─────────────── +info "Loading configuration..." + +# Load .env if present (values do NOT override existing env vars) +if [ -f "$SCRIPT_DIR/.env" ]; then + while IFS= read -r line || [ -n "$line" ]; do + # Skip comments and blank lines + [[ "$line" =~ ^[[:space:]]*# ]] && continue + [[ -z "${line// }" ]] && continue + key="${line%%=*}" + val="${line#*=}" + # Strip surrounding quotes from value + val="${val#\"}" ; val="${val%\"}" ; val="${val#\'}" ; val="${val%\'}" + # Only set if not already in environment + [ -z "${!key+x}" ] && export "$key"="$val" + done < "$SCRIPT_DIR/.env" + ok "Loaded .env from $SCRIPT_DIR/.env" +fi + +# Fall back to credentials.json for INFERENCE_API_KEY +if [ -z "${INFERENCE_API_KEY:-}" ] && [ -f "$CREDS_PATH" ]; then + INFERENCE_API_KEY=$(python3 -c " +import json +print(json.load(open('$CREDS_PATH')).get('INFERENCE_API_KEY','')) +" 2>/dev/null || true) + [ -n "${INFERENCE_API_KEY:-}" ] && ok "INFERENCE_API_KEY loaded from $CREDS_PATH" +fi + +[ -z "${INFERENCE_API_KEY:-}" ] && fail "INFERENCE_API_KEY is not set. 
Add it to $SCRIPT_DIR/.env or run: export INFERENCE_API_KEY=" + +# Apply defaults for optional config vars +INFERENCE_PROVIDER_TYPE="${INFERENCE_PROVIDER_TYPE:-nvidia}" +INFERENCE_PROVIDER_NAME="${INFERENCE_PROVIDER_NAME:-nvidia}" +INFERENCE_BASE_URL="${INFERENCE_BASE_URL:-https://inference-api.nvidia.com/v1}" +INFERENCE_MODEL="${INFERENCE_MODEL:-aws/anthropic/bedrock-claude-opus-4-6}" + +ok "INFERENCE_API_KEY : found" +ok "INFERENCE_PROVIDER : $INFERENCE_PROVIDER_NAME (type=$INFERENCE_PROVIDER_TYPE)" +ok "INFERENCE_BASE_URL : $INFERENCE_BASE_URL" +ok "INFERENCE_MODEL : $INFERENCE_MODEL" +echo "" + +# ── Step 3: Onboard if no sandbox exists ───────────────────────── +# Count sandboxes that are actually live (not just entries in sandboxes.json) +live_sandboxes() { + openshell sandbox list 2>/dev/null | grep -v "^No sandboxes" | grep -v "^NAME" | awk '{print $1}' | grep -v '^$' || true +} + +LIVE_COUNT=$(live_sandboxes | wc -l | tr -d ' ') + +if [ "${LIVE_COUNT:-0}" -eq 0 ]; then + info "Configuring inference provider '$INFERENCE_PROVIDER_NAME' and model..." 
+ openshell provider create \ + --type "$INFERENCE_PROVIDER_TYPE" \ + --name "$INFERENCE_PROVIDER_NAME" \ + --credential INFERENCE_API_KEY \ + --config "NVIDIA_BASE_URL=$INFERENCE_BASE_URL" \ + 2>/dev/null && ok "Provider '$INFERENCE_PROVIDER_NAME' created" \ + || warn "Provider '$INFERENCE_PROVIDER_NAME' already exists — continuing" + openshell inference set \ + --provider "$INFERENCE_PROVIDER_NAME" \ + --model "$INFERENCE_MODEL" + ok "Inference set to $INFERENCE_MODEL via $INFERENCE_PROVIDER_NAME" + echo "" + + echo -e " ${YELLOW}No sandbox found — running 'nemoclaw onboard'...${NC}" + echo -e " ${YELLOW}Provider and model are pre-configured — you only need to confirm the sandbox name.${NC}" + echo "" + # NOTE: no --yes-i-accept-third-party-software — consent is required interactively + nemoclaw onboard + echo "" + ok "Onboarding complete" + echo "" + + # Wait up to 20 s for the new sandbox to appear in the live list + info "Waiting for sandbox to become ready..." + for i in $(seq 1 20); do + LIVE_COUNT=$(live_sandboxes | wc -l | tr -d ' ') + [ "${LIVE_COUNT:-0}" -gt 0 ] && break + sleep 1 + done + [ "${LIVE_COUNT:-0}" -eq 0 ] && fail "No sandbox appeared after onboarding. Run 'openshell sandbox list' to check." +fi + +# ── Step 4: Resolve sandbox name ───────────────────────────────── +# Priority: CLI arg → only live sandbox → sandboxes.json default (validated) → prompt + +if [ -n "${1:-}" ]; then + SANDBOX_NAME="$1" +else + LIVE_NAMES=$(live_sandboxes) + LIVE_COUNT=$(echo "$LIVE_NAMES" | grep -c . 
|| true) + + if [ "${LIVE_COUNT:-0}" -eq 1 ]; then + # Exactly one live sandbox — use it regardless of sandboxes.json + SANDBOX_NAME=$(echo "$LIVE_NAMES" | head -1) + + else + # Multiple live sandboxes — try sandboxes.json default, but validate it is actually live + JSON_DEFAULT=$(python3 -c " +import json +try: + d = json.load(open('$HOME/.nemoclaw/sandboxes.json')) + print(d.get('defaultSandbox') or '') +except: pass +" 2>/dev/null || true) + + if [ -n "${JSON_DEFAULT:-}" ] && echo "$LIVE_NAMES" | grep -qx "$JSON_DEFAULT"; then + SANDBOX_NAME="$JSON_DEFAULT" + else + # sandboxes.json default is stale or absent — ask the user + echo "" + echo -e " ${YELLOW}Multiple sandboxes found:${NC}" + echo "$LIVE_NAMES" | while read -r n; do echo " - $n"; done + echo "" + echo -n " Which sandbox should be used? " + read -r SANDBOX_NAME + fi + fi +fi + +[ -z "${SANDBOX_NAME:-}" ] && fail "Could not determine sandbox name. Usage: ./install.sh <sandbox-name>" + +# Final check: confirm the chosen name is actually live +if ! live_sandboxes | grep -qx "$SANDBOX_NAME"; then + echo "" + echo -e " ${RED} ✗ Sandbox '$SANDBOX_NAME' not found. Live sandboxes:${NC}" + live_sandboxes | while read -r n; do echo " - $n"; done + echo "" + fail "Re-run with: bash install.sh <sandbox-name>" +fi + +info "Target sandbox: $SANDBOX_NAME" +echo "" + +# Persist key in credentials.json (mode 600) +mkdir -p "$(dirname "$CREDS_PATH")" +python3 -c " +import json, os +path = '$CREDS_PATH' +try: d = json.load(open(path)) +except: d = {} +d['INFERENCE_API_KEY'] = '$INFERENCE_API_KEY' +d['INFERENCE_PROVIDER_TYPE'] = '$INFERENCE_PROVIDER_TYPE' +d['INFERENCE_PROVIDER_NAME'] = '$INFERENCE_PROVIDER_NAME' +d['INFERENCE_BASE_URL'] = '$INFERENCE_BASE_URL' +d['INFERENCE_MODEL'] = '$INFERENCE_MODEL' +with open(path, 'w') as f: json.dump(d, f, indent=2) +os.chmod(path, 0o600) +" 2>/dev/null || true + +# ── Step 5: Install Python dependencies ───────────────────────── +info "Installing Python dependencies (latest versions)..." 
+cd "$SCRIPT_DIR" +uv venv --quiet +uv pip install --quiet --upgrade \ + fastmcp \ + colorama +ok "Dependencies installed in .venv" +echo "" + +# ── Step 6: Start MCP server in tmux ──────────────────────────── +info "Starting MCP server in tmux session '$TMUX_SESSION'..." + +tmux new-session -d -s "$TMUX_SESSION" \ + "cd '$SCRIPT_DIR' && \ + source .venv/bin/activate && \ + python fake_cluster_mcp_server.py --port $MCP_PORT 2>&1 | tee /tmp/slurm-mcp.log" + +# Wait up to 10 s for the port to open +SERVER_UP=false +for i in $(seq 1 10); do + sleep 1 + if curl -s --max-time 1 "http://127.0.0.1:${MCP_PORT}/mcp" >/dev/null 2>&1; then + SERVER_UP=true + break + fi +done + +tmux has-session -t "$TMUX_SESSION" 2>/dev/null \ + || fail "tmux session died immediately. Check /tmp/slurm-mcp.log" + +if [ "$SERVER_UP" = true ]; then + ok "MCP server is up on port $MCP_PORT" +else + warn "MCP server not responding yet — check: tmux attach -t $TMUX_SESSION" +fi +echo "" + +# ── Step 7: Apply sandbox network policy ──────────────────────── +info "Applying sandbox network policy..." +openshell policy set "$SANDBOX_NAME" \ + --policy "$SCRIPT_DIR/sandbox_policy.yaml" \ + --wait +ok "Policy applied" +echo "" + +# ── Step 8: Upload slurm-cluster-mcp skill ────────────────────── +info "Uploading slurm-cluster-mcp skill to sandbox..." +openshell sandbox upload "$SANDBOX_NAME" \ + "$SCRIPT_DIR/slurm-cluster-mcp" \ + /sandbox/.openclaw/workspace/skills/slurm-cluster-mcp +ok "Skill uploaded to /sandbox/.openclaw/workspace/skills/slurm-cluster-mcp/" +echo "" + +# ── Step 8b: Bootstrap skill venv with required deps ──────────── +# The agent runs mcp_client.py via this venv — it must be in the policy's +# allowed binaries (sandbox_policy.yaml covers /sandbox/.openclaw/workspace/skills/*/venv/bin/*) +info "Setting up skill Python venv (fastmcp + deps)..." 
+SKILL_VENV=/sandbox/.openclaw/workspace/skills/slurm-cluster-mcp/venv +openshell sandbox exec -n "$SANDBOX_NAME" -- \ + python3 -m venv "$SKILL_VENV" \ + || fail "Failed to create skill venv at $SKILL_VENV inside sandbox '$SANDBOX_NAME'." +openshell sandbox exec -n "$SANDBOX_NAME" -- \ + "$SKILL_VENV/bin/pip" install -q fastmcp colorama python-dotenv \ + || fail "pip install failed inside the skill venv. Check sandbox connectivity." +VENV_CHECK=$(openshell sandbox exec -n "$SANDBOX_NAME" -- \ + "$SKILL_VENV/bin/python3" -c "import fastmcp, colorama; print('ok')" 2>/dev/null || true) +[ "$VENV_CHECK" = "ok" ] \ + && ok "Skill venv ready ($SKILL_VENV)" \ + || fail "Skill venv verification failed — 'import fastmcp, colorama' returned no output. Re-run install.sh." +echo "" + +# ── Step 8c: Restart the OpenClaw gateway so it sees the new venv ─ +info "Restarting OpenClaw gateway in sandbox '$SANDBOX_NAME'..." +openshell sandbox exec -n "$SANDBOX_NAME" -- \ + openclaw gateway restart 2>/dev/null \ + || openshell sandbox exec -n "$SANDBOX_NAME" -- \ + sh -c "openclaw gateway stop 2>/dev/null; sleep 1; openclaw gateway start" \ + || warn "Could not restart OpenClaw gateway — you may need to reconnect manually." +ok "Gateway restarted" +echo "" + +# ── Step 9: Verify ─────────────────────────────────────────────── +info "Verifying installation..." 
+ +MCP_UP=$(curl -s --max-time 3 "http://127.0.0.1:${MCP_PORT}/mcp" 2>&1 | wc -c || true) +[ "${MCP_UP:-0}" -gt 0 ] \ + && ok "MCP server responding on http://127.0.0.1:${MCP_PORT}/mcp" \ + || warn "MCP server not responding — check: tmux attach -t $TMUX_SESSION" + +SKILL_CHECK=$(openshell sandbox exec -n "$SANDBOX_NAME" -- \ + sh -c "test -f /sandbox/.openclaw/workspace/skills/slurm-cluster-mcp/SKILL.md && echo ok" \ + 2>/dev/null || true) +[ "$SKILL_CHECK" = "ok" ] \ + && ok "Skill confirmed in sandbox" \ + || warn "Skill not yet visible in sandbox — try reconnecting" + +FASTMCP_CHECK=$(openshell sandbox exec -n "$SANDBOX_NAME" -- \ + "$SKILL_VENV/bin/python3" -c "import fastmcp, colorama; print('ok')" \ + 2>/dev/null || true) +[ "$FASTMCP_CHECK" = "ok" ] \ + && ok "fastmcp + colorama reachable via skill venv" \ + || warn "skill venv import check failed — try re-running install.sh" + +echo "" +echo -e "${GREEN} ╔══════════════════════════════════════════════════════════╗${NC}" +echo -e "${GREEN} ║ Installation complete! ║${NC}" +echo -e "${GREEN} ╚══════════════════════════════════════════════════════════╝${NC}" +echo "" +echo " MCP server : http://127.0.0.1:${MCP_PORT}/mcp (tmux: $TMUX_SESSION)" +echo " Sandbox URL: http://host.openshell.internal:${MCP_PORT}/mcp" +echo " Server logs: tmux attach -t $TMUX_SESSION (Ctrl-B D to detach)" +echo "" +echo " Next steps:" +echo " 1. Connect: nemoclaw $SANDBOX_NAME connect" +echo " 2. Try: \"What GPU partitions are available on the cluster?\"" +echo " 3. Try: \"Launch a training job with 4 GPUs for 10 epochs using vit-large\"" +echo " 4. Try: \"Show me what jobs are currently running\"" +echo " 5. Try: \"How much compute have I used this month?\"" +echo "" +echo " If the agent doesn't find the skill, disconnect and reconnect." 
+echo -e " ${YELLOW}To restart: tmux kill-session -t $TMUX_SESSION && bash $SCRIPT_DIR/install.sh $SANDBOX_NAME${NC}" +echo "" diff --git a/slurm-mcp-demo/sandbox_policy.yaml b/slurm-mcp-demo/sandbox_policy.yaml new file mode 100644 index 0000000..f458e46 --- /dev/null +++ b/slurm-mcp-demo/sandbox_policy.yaml @@ -0,0 +1,144 @@ +version: 1 +filesystem_policy: + include_workdir: true + read_only: + - /usr + - /lib + - /proc + - /dev/urandom + - /app + - /etc + - /var/log + - /sandbox/.openclaw + read_write: + - /sandbox + - /sandbox/.openclaw-data + - /tmp + - /dev/null +landlock: + compatibility: best_effort +process: + run_as_user: sandbox + run_as_group: sandbox +network_policies: + claude_code: + name: claude_code + endpoints: + - host: api.anthropic.com + port: 443 + protocol: rest + tls: terminate + enforcement: enforce + access: full + - host: statsig.anthropic.com + port: 443 + - host: sentry.io + port: 443 + - host: raw.githubusercontent.com + port: 443 + - host: platform.claude.com + port: 443 + binaries: + - path: /usr/local/bin/claude + - path: /usr/local/bin/node + - path: /usr/bin/node + mcp_server_host: + name: mcp_server_host + endpoints: + - host: host.openshell.internal + port: 9000 + allowed_ips: + - 172.17.0.1 + - host: 127.0.0.1 + port: 9000 + binaries: + - { path: /usr/local/bin/claude } + - { path: /usr/local/bin/node } + - { path: /usr/bin/node } + - { path: /usr/bin/curl } + - { path: /usr/bin/python3 } + - { path: /usr/bin/python3.11 } + - { path: /sandbox/.venv/bin/python } + - { path: /sandbox/.venv/bin/python3 } + - { path: /sandbox/test_mcp_client/.venv/bin/python } + - { path: /sandbox/test_mcp_client/.venv/bin/python3 } + - { path: "/sandbox/.uv/python/**" } + - { path: "/sandbox/.openclaw/workspace/skills/*/venv/bin/python" } + - { path: "/sandbox/.openclaw/workspace/skills/*/venv/bin/python3" } + - { path: "/sandbox/.openclaw/workspace/skills/*/venv/bin/python3.11" } + nvidia: + name: nvidia + endpoints: + - { host: 
integrate.api.nvidia.com, port: 443 } + binaries: + - { path: /usr/bin/curl } + - { path: /bin/bash } + - { path: /usr/local/bin/opencode } + nvidia_web: + name: nvidia_web + endpoints: + - { host: nvidia.com, port: 443 } + - { host: www.nvidia.com, port: 443 } + binaries: + - { path: /usr/bin/curl } + github: + name: github + endpoints: + - host: github.com + port: 443 + protocol: rest + tls: terminate + enforcement: enforce + rules: + - allow: + method: GET + path: "/**/info/refs*" + - allow: + method: POST + path: "/**/git-upload-pack" + binaries: + - { path: /usr/bin/git } + github_rest_api: + name: github-rest-api + endpoints: + - host: api.github.com + port: 443 + protocol: rest + tls: terminate + enforcement: enforce + rules: + - allow: + method: GET + path: "/**" + - allow: + method: HEAD + path: "/**" + - allow: + method: OPTIONS + path: "/**" + binaries: + - { path: /usr/local/bin/claude } + - { path: /usr/bin/gh } + pypi: + name: pypi + endpoints: + - { host: pypi.org, port: 443 } + - { host: files.pythonhosted.org, port: 443 } + - { host: github.com, port: 443 } + - { host: objects.githubusercontent.com, port: 443 } + - { host: api.github.com, port: 443 } + - { host: downloads.python.org, port: 443 } + binaries: + - { path: /usr/bin/python3 } + - { path: /usr/bin/python3.11 } + - { path: /sandbox/.venv/bin/python } + - { path: /sandbox/.venv/bin/python3 } + - { path: /sandbox/.venv/bin/pip } + - { path: "/sandbox/.openclaw/workspace/skills/*/venv/bin/python" } + - { path: "/sandbox/.openclaw/workspace/skills/*/venv/bin/python3" } + - { path: "/sandbox/.openclaw/workspace/skills/*/venv/bin/pip" } + - { path: /app/.venv/bin/python } + - { path: /app/.venv/bin/python3 } + - { path: /app/.venv/bin/pip } + - { path: /usr/local/bin/uv } + - { path: "/sandbox/.uv/python/**" } diff --git a/slurm-mcp-demo/slurm-cluster-mcp/SKILL.md b/slurm-mcp-demo/slurm-cluster-mcp/SKILL.md new file mode 100644 index 0000000..1a87dde --- /dev/null +++ 
b/slurm-mcp-demo/slurm-cluster-mcp/SKILL.md @@ -0,0 +1,124 @@ +--- +name: slurm-cluster-mcp +description: Interact with a fake Slurm HPC cluster via MCP. Provides direct tool access to query GPU partitions, submit training jobs, submit batch jobs, check the job queue, view account limits, and view compute usage. You (the agent) decide which tool to call — no secondary LLM is involved. Trigger keywords — slurm, cluster, GPU partition, submit job, batch job, HPC, compute usage, training job, sinfo, squeue, sbatch, srun. +--- + +# Slurm Cluster MCP + +## Overview + +Direct tool interface to a fake Slurm HPC cluster running on the host machine via MCP. You decide which tool to invoke based on the user's request. The MCP server exposes raw Slurm commands — call them with the specific parameters that match the user's intent. + +## Invocation + +Always use the skill venv's Python (required by the sandbox network policy): + +```bash +SKILL_DIR=~/.openclaw/workspace/skills/slurm-cluster-mcp +$SKILL_DIR/venv/bin/python3 $SKILL_DIR/scripts/mcp_client.py [args] +``` + +Do **not** use bare `python3` — the system Python is not permitted to reach the MCP server on port 9000. + +## Available Tools + +### `get_hostname` +Returns the cluster headnode hostname. +**Use when:** user asks for the cluster name or hostname. +```bash +python3 mcp_client.py get_hostname +``` + +--- + +### `sinfo` +Lists available Slurm partitions and node states (GPU types, counts, availability). +**Use when:** user asks about available GPUs, partitions, node counts, or idle resources. +```bash +python3 mcp_client.py sinfo +``` + +--- + +### `srun` +Launches a fake interactive training job. Streams epoch-level logs and returns a job ID with checkpoint path. +**Use when:** user wants to run, launch, or start a training job interactively. 
+```bash +python3 mcp_client.py srun [--gpus N] [--time-limit HH:MM:SS] [--epochs N] [--model NAME] +``` +| Argument | Type | Default | Description | +|---|---|---|---| +| `--gpus` | int | `1` | Number of GPUs to allocate | +| `--time-limit` | str | `01:00:00` | Wall-time limit HH:MM:SS | +| `--epochs` | int | `5` | Training epochs to simulate | +| `--model` | str | `resnet50` | Model architecture name | + +**Example:** +```bash +python3 mcp_client.py srun --gpus 4 --epochs 10 --model vit-large +``` + +--- + +### `sbatch` +Submits a fake batch job and returns a job ID. +**Use when:** user wants to submit a batch script. +```bash +python3 mcp_client.py sbatch [--script-name NAME] +``` +| Argument | Type | Default | Description | +|---|---|---|---| +| `--script-name` | str | `train.sh` | Batch script filename | + +**Example:** +```bash +python3 mcp_client.py sbatch --script-name train_bert.sh +``` + +--- + +### `squeue` +Shows the current Slurm job queue. +**Use when:** user asks about running jobs, queued jobs, or job status. +```bash +python3 mcp_client.py squeue [--user NAME] +``` +| Argument | Type | Default | Description | +|---|---|---|---| +| `--user` | str | `user` | Username to filter; `all` for everyone | + +--- + +### `sacctmgr` +Shows Slurm account associations and compute limits for a user. +**Use when:** user asks about account limits, compute allocation, or account associations. +```bash +python3 mcp_client.py sacctmgr [--user NAME] +``` + +--- + +### `sreport` +Shows a cluster utilisation report (CPU-minutes used) for a user. +**Use when:** user asks about compute usage history or how much compute they have used. +```bash +python3 mcp_client.py sreport [--user NAME] +``` + +--- + +## Server URL + +The client connects to `http://host.openshell.internal:9000/mcp` by default. +Override with `--server-url URL` or the `MCP_SERVER_URL` environment variable. + +## Troubleshooting + +If the tool call fails with a connection error: +1. 
Check the MCP server is running on the host: `curl http://host.openshell.internal:9000/mcp` +2. Confirm the sandbox policy is applied (allows egress to port 9000) +3. If the venv is missing, recreate it: + ```bash + python3 -m venv $SKILL_DIR/venv + $SKILL_DIR/venv/bin/pip install -q fastmcp + ``` diff --git a/slurm-mcp-demo/slurm-cluster-mcp/references/architecture.md b/slurm-mcp-demo/slurm-cluster-mcp/references/architecture.md new file mode 100644 index 0000000..3db9c40 --- /dev/null +++ b/slurm-mcp-demo/slurm-cluster-mcp/references/architecture.md @@ -0,0 +1,58 @@ +# Slurm Cluster MCP — Architecture & Reference + +## Overview + +This skill provides direct tool access to a fake Slurm HPC cluster via an MCP (Model Context Protocol) server. The architecture has two components: + +1. **MCP Server** (`fake_cluster_mcp_server.py`) — runs on the host machine, exposing raw Slurm-style tools over MCP; it does no LLM reasoning of its own +2. **MCP Client** (`scripts/mcp_client.py`) — runs in the sandbox or any network-accessible machine, connects to the server, executes the single tool call chosen by the agent, and prints the result + +## Network Topology + +``` +┌──────────────────────┐ HTTP/SSE ┌──────────────────────┐ +│ Sandbox / Client │ ───────────────────────> │ Host / MCP Server │ +│ mcp_client.py │ POST /mcp │ fake_cluster_mcp_ │ +│ │ <─────────────────────── │ server.py │ +└──────────────────────┘ └──────────────────────┘ +``` + +## Server URL Resolution Order + +The client resolves the MCP server URL in this priority: + +1. `--server-url` CLI flag +2. `MCP_SERVER_URL` environment variable +3. Default: `http://host.openshell.internal:9000/mcp` (OpenShell host alias) + +Outside an OpenShell sandbox (e.g. plain Docker), set `MCP_SERVER_URL` or pass `--server-url` explicitly. + +## MCP Tools + +The server exposes the following tools; the agent decides which one to call per request: + +- **`get_hostname`** — return the cluster headnode hostname +- **`sinfo`** — show partitions and node states +- **`srun`** — launch a fake interactive training job +- **`sbatch`** — submit a fake batch job +- **`squeue`** — show the job queue +- **`sacctmgr`** — show account associations +- **`sreport`** — show a utilisation report + +Each tool returns plain text describing cluster state, job results, etc. 
+ +## Dependencies + +- `fastmcp` — MCP client library +- `colorama` — colored terminal output +- `python-dotenv` — `.env` file loading + +Install with: +```bash +pip install fastmcp colorama python-dotenv +``` + +## Example Queries + +| Query | What it does | +|-------|-------------| +| "what GPU partitions are available?" | List available GPU partitions | +| "launch a training job with 4 GPUs for 10 epochs using vit-large" | Submit a GPU training job | +| "submit my train_bert.sh as a batch job" | Submit a batch script | +| "show me what jobs are running" | List active jobs | +| "what are my account limits?" | Show account quotas | +| "how much compute have I used this month?" | Show usage statistics | diff --git a/slurm-mcp-demo/slurm-cluster-mcp/scripts/mcp_client.py b/slurm-mcp-demo/slurm-cluster-mcp/scripts/mcp_client.py new file mode 100644 index 0000000..919a61f --- /dev/null +++ b/slurm-mcp-demo/slurm-cluster-mcp/scripts/mcp_client.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +""" +Direct MCP tool caller for the fake Slurm cluster. + +The OpenClaw agent (which already has an LLM) decides which tool to call and +with what arguments. This script executes that one tool call against the remote +MCP server and prints the result to stdout. No secondary LLM is involved. + +Usage: + <skill-dir>/venv/bin/python3 mcp_client.py <tool> [options] + +Tools: + get_hostname + sinfo + srun [--gpus N] [--time-limit HH:MM:SS] [--epochs N] [--model NAME] + sbatch [--script-name NAME] + squeue [--user NAME] + sacctmgr [--user NAME] + sreport [--user NAME] + +Server URL (resolved in order): + 1. --server-url flag + 2. MCP_SERVER_URL env var + 3. Default: http://host.openshell.internal:9000/mcp + +Always run with the skill venv's Python so the sandbox policy allows the +outbound connection to port 9000. Do NOT use bare python3. 
+"""
+from __future__ import annotations
+
+import asyncio
+import os
+import sys
+import argparse
+
+try:
+    from fastmcp import Client
+except ImportError as _e:  # fastmcp is installed in the skill venv, not the system python
+    _skill_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    _venv_python = os.path.join(_skill_dir, "venv", "bin", "python3")
+    print(
+        f"\nMissing dependency: {_e}\n"
+        "Run this script with the skill venv's Python, not bare python3:\n\n"
+        f"    {_venv_python} {__file__} [args]\n\n"
+        "If the venv doesn't exist yet, create it with:\n\n"
+        f"    python3 -m venv {_skill_dir}/venv\n"
+        f"    {_skill_dir}/venv/bin/pip install -q fastmcp\n",
+        file=sys.stderr,
+    )
+    sys.exit(1)
+
+_DEFAULT_URL = "http://host.openshell.internal:9000/mcp"  # fallback when neither --server-url nor MCP_SERVER_URL is set
+
+
+async def call_tool(server_url: str, tool: str, args: dict) -> str:  # open a client session, invoke one tool, return its text
+    async with Client(server_url) as client:
+        result = await client.call_tool(tool, args)
+        return result.content[0].text  # first content item carries the text payload — assumes non-empty result; TODO confirm
+
+
+def main() -> None:  # CLI entry point: parse one tool sub-command, execute it against the server, print the result
+    root = argparse.ArgumentParser(
+        description="Call a specific Slurm MCP tool directly.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    root.add_argument(
+        "--server-url",
+        default=os.environ.get("MCP_SERVER_URL", _DEFAULT_URL),
+        help="Full URL of the MCP server.",
+    )
+
+    sub = root.add_subparsers(dest="tool", metavar="", required=True)
+
+    sub.add_parser("get_hostname", help="Return the cluster headnode hostname.")
+    sub.add_parser("sinfo", help="Show available partitions and node states.")
+
+    p_srun = sub.add_parser("srun", help="Launch a fake interactive training job.")
+    p_srun.add_argument("--gpus", type=int, default=1, help="Number of GPUs to allocate.")
+    p_srun.add_argument("--time-limit", default="01:00:00", help="Wall-time limit HH:MM:SS.")
+    p_srun.add_argument("--epochs", type=int, default=5, help="Training epochs to simulate.")
+    p_srun.add_argument("--model", default="resnet50", help="Model name for the epoch log.")
+
+    p_sbatch = sub.add_parser("sbatch", help="Submit a fake batch job.")
+    p_sbatch.add_argument("--script-name", default="train.sh", help="Batch script filename.")
+
+    p_squeue = sub.add_parser("squeue", help="Show the Slurm job queue.")
+    p_squeue.add_argument("--user", default="user", help='Username to filter; "all" for everyone.')
+
+    p_sacctmgr = sub.add_parser("sacctmgr", help="Show account associations for a user.")
+    p_sacctmgr.add_argument("--user", default="user", help="Username to query.")
+
+    p_sreport = sub.add_parser("sreport", help="Show cluster utilisation report.")
+    p_sreport.add_argument("--user", default="user", help="Username to query.")
+
+    parsed = root.parse_args()
+    server_url = parsed.server_url
+    tool = parsed.tool
+
+    # Build the args dict for the tool call (strip server_url and tool keys)
+    tool_args: dict = {}
+    if tool == "srun":
+        tool_args = {
+            "gpus": parsed.gpus,
+            "time_limit": parsed.time_limit,
+            "epochs": parsed.epochs,
+            "model": parsed.model,
+        }
+    elif tool == "sbatch":
+        tool_args = {"script_name": parsed.script_name}
+    elif tool in ("squeue", "sacctmgr", "sreport"):
+        tool_args = {"user": parsed.user}
+
+    try:
+        result = asyncio.run(call_tool(server_url, tool, tool_args))
+        print(result)
+    except Exception as exc:  # broad by design: surface any transport/tool failure as a non-zero exit
+        print(f"Error calling '{tool}': {exc}", file=sys.stderr)
+        print(f"Is the server reachable at {server_url}?", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/slurm-mcp-demo/slurm-mcp-openclaw-guide.md b/slurm-mcp-demo/slurm-mcp-openclaw-guide.md
new file mode 100644
index 0000000..c64a65d
--- /dev/null
+++ b/slurm-mcp-demo/slurm-mcp-openclaw-guide.md
@@ -0,0 +1,231 @@
+# Connecting a Fake Slurm HPC Cluster to OpenClaw via MCP
+
+This guide walks you through connecting a simulated Slurm HPC cluster to an OpenClaw agent running inside an OpenShell sandbox. By the end, your agent will be able to query GPU partitions, submit training jobs, monitor the job queue, and check compute usage — all through natural language. 
+
+The connection uses **MCP (Model Context Protocol)**. A lightweight Python server runs on the host, exposes Slurm-like tools over HTTP, and is stateless — it requires no API key and does no LLM reasoning of its own. The OpenClaw agent inside the sandbox calls MCP tools directly and uses its configured inference model for all natural-language reasoning. The sandbox talks to the host MCP server through an egress-approved network policy.
+
+## Prerequisites
+
+| Requirement | Details |
+|-------------|---------|
+| NemoClaw | `nemoclaw` and `openshell` CLIs must be installed. See [NemoClaw setup](https://github.com/NVIDIA/NemoClaw). |
+| Inference API key | Required by `nemoclaw onboard` to configure the OpenClaw agent's inference provider. Set `INFERENCE_API_KEY` in `.env` before running `install.sh` (see below). |
+| `uv` | Installed automatically by `install.sh` if missing. |
+
+---
+
+## One-Command Setup
+
+### 1. Configure `.env`
+
+Copy the template and fill in your values:
+
+```bash
+cd nemoclaw-demos/slurm-mcp-demo
+cp .env.template .env
+```
+
+Open `.env` and set your API key and inference configuration. The example below uses the NVIDIA build API with Nemotron — replace these values with your own provider, base URL, and model:
+
+```bash
+# Inference API credentials
+INFERENCE_API_KEY=nvapi-your-key
+
+# Inference provider configuration
+INFERENCE_PROVIDER_TYPE=nvidia
+INFERENCE_PROVIDER_NAME=nvidia
+INFERENCE_BASE_URL="https://integrate.api.nvidia.com/v1"
+INFERENCE_MODEL="nvidia/llama-3.3-nemotron-super-49b-v1.5"
+```
+
+> **Choosing a provider or model:** see the NemoClaw docs for the full list of supported providers, base URLs, and models:
+> [https://docs.nvidia.com/nemoclaw/latest/inference/switch-inference-providers.html#switch-to-a-different-model](https://docs.nvidia.com/nemoclaw/latest/inference/switch-inference-providers.html#switch-to-a-different-model)
+
+All five values are required. 
`install.sh` loads `.env` automatically — values in `.env` do not override variables already exported in your shell, so you can also override any of them inline:
+
+```bash
+INFERENCE_MODEL=nvidia/llama-3.3-70b bash install.sh
+```
+
+`INFERENCE_API_KEY` is also cached to `~/.nemoclaw/credentials.json` after the first run, so future re-runs pick it up without needing `.env` in place.
+
+---
+
+### 2. Run the installer
+
+```bash
+cd nemoclaw-demos/slurm-mcp-demo
+bash install.sh
+```
+
+The script will:
+1. Clean up any stale MCP server processes and tmux sessions
+2. Load `.env` and resolve `INFERENCE_API_KEY`, provider, base URL, and model
+3. Pre-configure the openshell inference provider and model
+4. Run `nemoclaw onboard` if no sandbox exists — provider and model are already set, so you only need to confirm the sandbox name
+5. Install all Python dependencies on the host (latest versions via `uv`)
+6. Start the MCP server in a persistent `tmux` session
+7. Apply the sandbox network policy (`sandbox_policy.yaml`)
+8. Upload the `slurm-cluster-mcp` skill into the sandbox
+9. Bootstrap the skill's Python venv inside the sandbox (`fastmcp`, `colorama`)
+10. Restart the OpenClaw gateway so it discovers the new skill
+11. Verify the installation (MCP server reachable, skill present, venv imports OK)
+
+You can also pass a sandbox name directly to skip the interactive prompt:
+
+```bash
+bash install.sh my-assistant
+```
+
+---
+
+## Trying It Out
+
+Connect to the sandbox and launch the OpenClaw TUI:
+
+```bash
+# From your host terminal
+openshell sandbox connect my-assistant
+
+# Inside the sandbox
+openclaw tui
+```
+
+Once connected, verify the skill is loaded by asking the agent directly:
+
+```
+> do you have a skill to be able to query the slurm cluster?
+
+ Yes, I have the slurm-cluster-mcp skill available, which provides a natural-language interface
+ to interact with a simulated Slurm HPC cluster. 
+ + This skill allows me to: + - Query available resources (GPU partitions, node counts, etc.) + - Submit training/batch jobs (like the vit-large job we just ran) + - Monitor running jobs and check their status + - View account limits and compute usage + - Manage cluster operations through plain English queries + + The skill uses an MCP (Model Context Protocol) client to connect to the cluster server, and I've + already demonstrated it working by: + 1. Checking available GPU partitions (gpu-a100, gpu-h100, gpu-gb200) + 2. Launching and monitoring a training job with 4 GPUs for 10 epochs using vit-large + + What would you like to do with the Slurm cluster? I can help you query partitions, submit jobs, + check job status, or explore other cluster capabilities. + + connected | idle + agent main | session main (openclaw-tui) | inference/nvidia/bedrock-claude-opus-4-6 | tokens 23k/131k (17%) +``` + +Try these prompts: + +- "What GPU partitions are available on the cluster?" +- "Launch a training job with 4 GPUs for 10 epochs using vit-large." +- "Submit my train_bert.sh as a batch job." +- "Show me what jobs are currently running." + +The session below shows the agent responding to a GPU availability query — it calls `sinfo` under the hood and summarises the A100, H100, and CPU partitions, then launches a training job: + +``` +openclaw tui - ws://127.0.0.1:18789 - agent main - session main + + session agent:main:main + + What GPU partitions are available on the cluster? + + Based on the Slurm cluster status, here are the available GPU partitions: + + GPU Partitions: + - gpu-a100* (default) - 4 nodes available, idle, infinite time limit + - gpu-h100 - 8 nodes available, idle, max 2-day time limit + - gpu-gb200 - 2 nodes available, idle, max 4-day time limit + + There's also a cpu-general partition with 16 nodes for CPU-only workloads. + + The asterisk (*) indicates gpu-a100 is the default partition. All GPU + partitions are currently idle and ready for jobs. 
+ + + Launch a training job with 4 GPUs for 10 epochs using vit-large. + + The training job has been successfully launched and completed. Here are + the details: + + Job Summary: + - Job ID: 42001 + - Resources: 4 GPUs allocated on node01 + - Time Limit: 1 hour + - Model: vit-large + - Epochs: 10 + + Training Progress: + The training showed steady improvement over the 10 epochs: + - Loss decreased from 2.8051 to -0.0002 + - Accuracy increased from 0.1082 to 0.9090 (90.9%) + - GPU utilization remained consistently high at 94% + + Completion: + Training completed successfully. Checkpoints saved to: + /checkpoint/user/run_42001/ + + The job utilized the gpu-a100 partition (default) and finished within + the allocated time limit. + + connected | idle + agent main | session main (openclaw-tui) | inference/nvidia/bedrock-claude-opus-4-6 | tokens 21k/131k (16%) +``` + +--- + +## Available MCP Tools + +The OpenClaw agent's LLM decides which tool to call. The MCP server exposes raw Slurm commands — it is stateless and requires no API key. + +| Tool | Arguments | What it does | +|------|-----------|-------------| +| `get_hostname` | — | Returns the cluster headnode hostname | +| `sinfo` | — | Lists A100 / H100 / GB200 / CPU partitions and node states | +| `srun` | `gpus`, `time_limit`, `epochs`, `model` | Simulates an interactive training job with epoch logs | +| `sbatch` | `script_name` | Submits a fake batch job and returns a job ID | +| `squeue` | `user` | Shows the in-memory job queue | +| `sacctmgr` | `user` | Returns user account associations and limits | +| `sreport` | `user` | Returns a fake CPU-minute utilisation report | + +--- + +## Troubleshooting + +| Issue | Fix | +|-------|-----| +| `INFERENCE_API_KEY is not set` | Add `INFERENCE_API_KEY=your-key` to `.env`, or run `export INFERENCE_API_KEY=...` before `install.sh`. | +| `Connection refused` from sandbox | Confirm the server is running: `curl http://127.0.0.1:9000/mcp` from the host. 
Check logs: `tmux attach -t slurm-mcp`. | +| `l7_decision=deny` in OpenShell logs | The sandbox policy wasn't applied or the connecting binary isn't listed. Re-run `openshell policy set` and check `sandbox_policy.yaml`. | +| Wrong `host.openshell.internal` resolution | Set `MCP_SERVER_URL` explicitly to the host's LAN IP instead of relying on the DNS alias. | +| Agent doesn't know about Slurm | Confirm the skill was uploaded to `/sandbox/.openclaw/workspace/skills/slurm-cluster-mcp/` and restart the OpenClaw gateway. | +| `ModuleNotFoundError: No module named 'fastmcp'` | Run with the skill venv Python, not bare `python3`. See `SKILL.md` Troubleshooting section. | +| `NVIDIA Endpoints endpoint validation failed` / `spawnSync curl ETIMEDOUT` during `nemoclaw onboard` | See below. | + +### nemoclaw onboard: validation timeout + +During `nemoclaw onboard`, step **[3/8] Configuring inference** makes a live inference call to validate the pre-configured model. Thinking/reasoning models can take ~10 seconds to respond on a non-streaming request, occasionally exceeding the 30-second internal timeout and producing: + +``` +NVIDIA Endpoints endpoint validation failed. +Chat Completions API: curl failed (exit -110): spawnSync curl ETIMEDOUT +``` + +If this happens, type `retry` at the prompt — the API call usually succeeds on a second attempt when the model is warm. If the problem persists, set `INFERENCE_MODEL` in `.env` to a faster model such as `nvidia/llama-3.3-70b-instruct`. + +### Restarting after a fresh reset + +```bash +# Clean up everything +tmux kill-session -t slurm-mcp +openshell sandbox delete my-assistant +openshell provider delete nvidia + +# Re-run +bash install.sh +```