From edef9d2019d0c564f80e62395285ceea737c6656 Mon Sep 17 00:00:00 2001 From: William Zujkowski Date: Mon, 2 Feb 2026 09:28:47 -0500 Subject: [PATCH] enhance: Add resource planning for parallel AI agent deployments - Increase default RAM from 8GB to 16GB (Claude CLI memory leak mitigation) - Add --memory and --vcpus flags to setup_cloud.sh - Increase default swap from 4GB to 8GB (configurable via SWAP_SIZE) - Enhance vm-health-check with Claude CLI memory tracking and warnings - Add RESOURCES.md with parallel deployment guide and memory calculator - Update CLAUDE.md quick reference with new options Claude CLI has documented memory leaks reaching 13-120GB+ in extended sessions. These changes ensure VMs can handle memory pressure and provide operators with tools to plan multi-clone deployments on hosts with limited RAM. Co-Authored-By: Claude Opus 4.5 --- CLAUDE.md | 27 ++-- RESOURCES.md | 296 ++++++++++++++++++++++++++++++++++++ guest/bootstrap_agent_vm.sh | 67 ++++++-- setup_cloud.sh | 35 ++++- virt_install_agent_vm.sh | 2 +- 5 files changed, 398 insertions(+), 29 deletions(-) create mode 100644 RESOURCES.md diff --git a/CLAUDE.md b/CLAUDE.md index 9b90104..cc99e41 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -10,8 +10,10 @@ ```bash # Setup & Installation (Cloud Image - Recommended) -./setup_cloud.sh # One-command setup using cloud images (fast!) 
-./setup_cloud.sh --vm-name my-agent # Custom VM name +./setup_cloud.sh # Default: 16GB RAM, 4 vCPUs +./setup_cloud.sh --memory 8192 --vcpus 4 # Lightweight (for multiple clones) +./setup_cloud.sh --memory 24576 --vcpus 8 # Heavy workload +./setup_cloud.sh --vm-name my-agent # Custom VM name # Setup & Installation (ISO - Alternative) ./setup.sh # One-command setup using ISO installer (slower) @@ -32,13 +34,18 @@ make install-deps # Install host dependencies only ./snapshot_manager.sh golden # Interactive golden image creation # Clone Management (Parallel Workflows) -./clone_manager.sh create --linked # Create linked clone (instant) -./clone_manager.sh create # Create full clone -./clone_manager.sh list # List all clones -./clone_manager.sh start # Start a clone -./clone_manager.sh stop # Stop a clone -./clone_manager.sh delete # Delete a clone -./clone_manager.sh cleanup # Delete all clones of VM +./clone_manager.sh create --linked # Linked clone (instant) +./clone_manager.sh create --linked --memory 8192 # Linked clone with 8GB +./clone_manager.sh create --linked --vcpus 2 # Linked clone with 2 vCPUs +./clone_manager.sh list # List all clones +./clone_manager.sh start # Start a clone +./clone_manager.sh stop # Stop a clone +./clone_manager.sh delete # Delete a clone +./clone_manager.sh cleanup # Delete all clones of VM + +# Resource Planning (see RESOURCES.md for details) +# 64GB host: 2-4 clones @ 12-16GB each +# Claude CLI can leak to 13GB+, plan accordingly # Quality make lint # Run shellcheck + yamllint @@ -496,4 +503,4 @@ When working with this codebase, ensure: --- -_Last updated: 2026-02-01 (ET)_ +_Last updated: 2026-02-02 (ET)_ diff --git a/RESOURCES.md b/RESOURCES.md new file mode 100644 index 0000000..0e22f05 --- /dev/null +++ b/RESOURCES.md @@ -0,0 +1,296 @@ +# Resource Planning for Parallel AI Agents + +This guide covers memory, CPU, and disk requirements for running multiple AI CLI agents (Claude, Codex, Gemini) simultaneously in moltdown VM clones. 
+ +--- + +## TL;DR - Quick Reference + +| Host RAM | Recommended Clones | RAM per Clone | Config | +|----------|-------------------|---------------|--------| +| 32GB | 1-2 | 12-16GB | Conservative | +| 64GB | 2-4 | 12-16GB | Comfortable | +| 128GB | 4-8 | 12-16GB | Production | + +**Critical**: Claude CLI has known memory leaks reaching 13-120GB+ in extended sessions. Plan accordingly. + +--- + +## Why 16GB Default? + +The moltdown default of 16GB RAM per VM exists because: + +1. **Claude CLI Memory Leaks**: Documented issues show Claude Code consuming 13GB+ during extended sessions, sometimes reaching 120GB before OOM kill +2. **Desktop Overhead**: Ubuntu 24.04 + GNOME requires 400-600MB baseline +3. **Toolchain**: Node.js + Docker + Chrome can consume 1-2GB when active +4. **Swap Buffer**: 8GB swap provides overflow protection, but shouldn't be relied upon + +### Memory Breakdown (Active AI Session) + +| Component | Idle | Active | Peak | +|-----------|------|--------|------| +| Ubuntu + GNOME | 500MB | 700MB | 1GB | +| Claude CLI | 150MB | 500MB | **13GB+** | +| Chrome (Playwright) | 0 | 300MB | 1GB | +| Docker daemon | 100MB | 200MB | varies | +| Node.js runtime | 100MB | 300MB | 500MB | +| System buffers | 500MB | 1GB | 2GB | +| **Comfortable Total** | 1.5GB | 3GB | **16GB+** | + +--- + +## Clone Memory Architecture + +**Each clone allocates independent RAM** - there is no memory sharing between VMs: + +``` +Host RAM (64GB) +├── Host OS + KVM: 8-10GB reserved +├── Clone 1: 16GB (independent allocation) +├── Clone 2: 16GB (independent allocation) +├── Clone 3: 16GB (independent allocation) +└── Available: 6-8GB buffer +``` + +Linked clones (`--linked`) only share **disk blocks** via copy-on-write. Memory is never shared. 
+ +--- + +## Deployment Scenarios + +### Scenario A: Single Dedicated Agent (Development) + +**Host**: Any with 32GB+ RAM + +```bash +./setup_cloud.sh --memory 16384 --vcpus 8 +``` + +- Full 16GB for handling memory leaks +- 8 vCPUs for fast compilation/analysis +- No clones needed + +### Scenario B: Two Parallel Agents (64GB Host) + +**Use Case**: Run different agents on different tasks simultaneously + +```bash +# Create golden image with 16GB +./setup_cloud.sh --memory 16384 --vcpus 4 + +# Create lightweight worker clones +./clone_manager.sh create ubuntu2404-agent --linked --memory 12288 --vcpus 4 +./clone_manager.sh create ubuntu2404-agent --linked --memory 12288 --vcpus 4 + +# Start both +./clone_manager.sh start moltdown-clone-ubuntu2404-agent-* +``` + +**Memory allocation**: +- Host overhead: 10GB +- Golden image (stopped): 0GB (not running) +- Clone 1: 12GB +- Clone 2: 12GB +- Buffer: 30GB for memory spikes + +### Scenario C: Maximum Density (64GB Host, 4 Agents) + +**Use Case**: Many short-lived tasks, aggressive snapshotting + +```bash +# Create clones with reduced memory +./clone_manager.sh create ubuntu2404-agent --linked --memory 8192 --vcpus 2 +./clone_manager.sh create ubuntu2404-agent --linked --memory 8192 --vcpus 2 +./clone_manager.sh create ubuntu2404-agent --linked --memory 8192 --vcpus 2 +./clone_manager.sh create ubuntu2404-agent --linked --memory 8192 --vcpus 2 +``` + +**Memory allocation**: +- Host overhead: 10GB +- 4 clones × 8GB: 32GB +- Buffer: 22GB + +**Mitigation required**: +- Revert clones frequently (`./snapshot_manager.sh post-run`) +- Monitor memory with `vm-health-check --watch` +- Accept OOM risk for long sessions + +### Scenario D: Production (128GB+ Host) + +```bash +# High-memory golden image +./setup_cloud.sh --memory 24576 --vcpus 8 # 24GB + +# Multiple comfortable clones +for i in {1..6}; do + ./clone_manager.sh create ubuntu2404-agent worker-$i --linked --memory 16384 --vcpus 4 +done +``` + +--- + +## Claude CLI Memory 
Leak Mitigation + +Known issue: Claude CLI can consume 13-120GB+ during extended sessions. + +### Strategy 1: Frequent Snapshots (Recommended) + +```bash +# Before agent run +./snapshot_manager.sh pre-run ubuntu2404-agent + +# After completion (or every few hours) +./snapshot_manager.sh post-run ubuntu2404-agent +``` + +The "molt" workflow releases all leaked memory by reverting to clean state. + +### Strategy 2: Memory Limits with cgroups + +Inside the VM, limit Claude CLI memory: + +```bash +# Create memory-limited slice +sudo mkdir -p /sys/fs/cgroup/claude-agent +echo "8G" | sudo tee /sys/fs/cgroup/claude-agent/memory.max + +# Run claude under limit +sudo cgexec -g memory:claude-agent claude +``` + +### Strategy 3: Watchdog Script + +Add to VM's crontab: + +```bash +*/15 * * * * /home/agent/bin/claude-memory-watchdog.sh +``` + +```bash +#!/bin/bash +# claude-memory-watchdog.sh +THRESHOLD_MB=10000 # 10GB + +claude_mem=$(ps aux | grep -E 'claude|node.*claude' | awk '{sum+=$6} END {print sum/1024}') +if (( $(echo "$claude_mem > $THRESHOLD_MB" | bc -l) )); then + logger "Claude CLI exceeded ${THRESHOLD_MB}MB, restarting..." + pkill -f claude + notify-send "Claude CLI restarted due to memory pressure" +fi +``` + +### Strategy 4: Swap as Emergency Buffer + +The previous 4GB default swap was insufficient, so the bootstrap default is now 8GB. 
Override `SWAP_SIZE` to choose a different size: + +```bash +# In bootstrap_local.sh or manually +SWAP_SIZE="8G" # Set before running bootstrap +``` + +--- + +## Monitoring Commands + +### From Host + +```bash +# Check all VM memory allocation +for vm in $(sudo virsh list --name); do + mem=$(sudo virsh dominfo "$vm" | grep "Used memory" | awk '{print $3/1024 "MB"}') + echo "$vm: $mem" +done + +# Watch total memory pressure +watch -n 5 'free -h; echo "---"; sudo virsh list' +``` + +### Inside VM + +```bash +# Quick health check +vm-health-check + +# Continuous monitoring +vm-health-check --watch + +# Check Claude CLI specifically +ps aux | grep -E 'claude|node' | awk '{printf "%s: %.1fMB\n", $11, $6/1024}' +``` + +--- + +## Resource Calculator + +Use this formula to plan deployments: + +``` +Available_for_VMs = Host_RAM - 10GB (host overhead) +Max_Clones = floor(Available_for_VMs / RAM_per_Clone) +Safe_Clones = Max_Clones - 1 (leave buffer for spikes) +``` + +**Example (64GB host, 12GB per clone)**: +``` +Available = 64 - 10 = 54GB +Max = 54 / 12 = 4 clones +Safe = 4 - 1 = 3 clones recommended +``` + +--- + +## vCPU Guidelines + +| Workload | vCPUs per Clone | Notes | +|----------|-----------------|-------| +| Light (CLI only) | 2 | Text analysis, simple queries | +| Standard | 4 | Code generation, file operations | +| Heavy (builds) | 6-8 | Compilation, Docker builds | +| Browser automation | 4+ | Playwright needs headroom | + +**Oversubscription**: KVM handles CPU overcommit well. 4 clones × 4 vCPUs on an 8-core host works fine for non-CPU-bound tasks. + +--- + +## Disk Space + +Linked clones are extremely efficient: + +| Item | Size | Notes | +|------|------|-------| +| Golden image | 15-25GB | After bootstrap | +| Linked clone (initial) | 1-5MB | Just metadata | +| Linked clone (active) | 1-10GB | Grows with changes | +| Full clone | 15-25GB | Complete copy | + +**Recommendation**: Use `--linked` for all parallel workflows. 
Only use full clones when you need complete isolation or plan to delete the golden image. + +--- + +## Quick Commands + +```bash +# Create memory-optimized clone +./clone_manager.sh create ubuntu2404-agent --linked --memory 12288 --vcpus 4 + +# Check clone resource usage +./clone_manager.sh status + +# Stop running clones, then delete every clone of the VM (releases memory) +for clone in $(./clone_manager.sh list | grep running | awk '{print $1}'); do + ./clone_manager.sh stop "$clone" +done +./clone_manager.sh cleanup ubuntu2404-agent +``` + +--- + +## References + +- [Claude Code Memory Leak Issues](https://github.com/anthropics/claude-code/issues/4953) +- [Claude Code Memory Management Best Practices](https://medium.com/@codecentrevibe/claude-code-best-practices-memory-management-7bc291a87215) +- [libvirt Memory Management](https://libvirt.org/formatdomain.html#memory-allocation) + +--- + +_Last updated: 2026-02-02 (ET)_ diff --git a/guest/bootstrap_agent_vm.sh b/guest/bootstrap_agent_vm.sh index 86069aa..0867685 100755 --- a/guest/bootstrap_agent_vm.sh +++ b/guest/bootstrap_agent_vm.sh @@ -44,6 +44,10 @@ INSTALL_CLAUDE_CLI="true" REMOVE_DESKTOP_FLUFF="true" ENABLE_UNATTENDED_UPGRADES="true" +# Resource settings (can be overridden in bootstrap_local.sh) +# 8GB swap recommended for Claude CLI memory leak protection +SWAP_SIZE="${SWAP_SIZE:-8G}" + # Source local customizations if present readonly LOCAL_CONFIG="$HOME/bootstrap_local.sh" if [[ -f "$LOCAL_CONFIG" ]]; then @@ -405,14 +409,15 @@ phase_longrun_hardening() { sudo touch /etc/cloud/cloud-init.disabled # Create swap file if not present (important for memory pressure during long runs) + # Claude CLI memory leaks can consume 13GB+, so larger swap is recommended if [[ ! -f /swapfile ]]; then - log_info "Creating 4GB swap file..." - sudo fallocate -l 4G /swapfile + log_info "Creating ${SWAP_SIZE} swap file (Claude CLI memory leak protection)..." 
+ sudo fallocate -l "$SWAP_SIZE" /swapfile sudo chmod 600 /swapfile sudo mkswap /swapfile sudo swapon /swapfile echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab - log_info "Swap file created and enabled" + log_info "Swap file created and enabled (${SWAP_SIZE})" else log_info "Swap file already exists" fi @@ -423,14 +428,54 @@ phase_longrun_hardening() { #!/bin/bash # vm-health-check - Quick VM health status for long-running sessions # Part of moltdown 🦀 -echo "=== VM Health Check $(date '+%Y-%m-%d %H:%M:%S') ===" -echo "Uptime: $(uptime -p)" -echo "Memory: $(free -h | awk '/Mem:/{print $3 "/" $2 " (" int($3/$2*100) "% used)"}')" -echo "Swap: $(free -h | awk '/Swap:/{if($2!="0B") print $3 "/" $2; else print "not configured"}')" -echo "Disk: $(df -h / | awk 'NR==2{print $3 "/" $2 " (" $5 " used)"}')" -echo "Load: $(cat /proc/loadavg | cut -d' ' -f1-3)" -echo "Procs: $(ps aux --no-headers | wc -l)" -echo "Journal: $(journalctl --disk-usage 2>/dev/null | grep -oP '\d+\.\d+[MG]' || echo 'unknown')" + +show_help() { + echo "Usage: vm-health-check [--watch]" + echo " --watch Continuous monitoring (updates every 30s)" + exit 0 +} + +check_health() { + echo "=== VM Health Check $(date '+%Y-%m-%d %H:%M:%S') ===" + echo "Uptime: $(uptime -p)" + echo "" + echo "--- Memory ---" + echo "RAM: $(free -h | awk '/Mem:/{print $3 "/" $2 " (" int($3/$2*100) "% used)"}')" + echo "Swap: $(free -h | awk '/Swap:/{if($2!="0B") print $3 "/" $2 " (" int($3/$2*100) "%)"; else print "not configured"}')" + + # Claude CLI memory tracking (critical for leak detection) + local claude_mem + claude_mem=$(ps aux 2>/dev/null | grep -E 'claude|node.*claude-code' | grep -v grep | awk '{sum+=$6} END {if(sum>0) printf "%.1fMB", sum/1024; else print "not running"}') + echo "Claude: $claude_mem" + + # Warn if Claude is consuming excessive memory + local claude_mb + claude_mb=$(ps aux 2>/dev/null | grep -E 'claude|node.*claude-code' | grep -v grep | awk '{sum+=$6} END {print sum/1024}') + if [[ -n 
"$claude_mb" ]] && (( $(echo "$claude_mb > 4000" | bc -l 2>/dev/null || echo 0) )); then + echo " ⚠️ WARNING: Claude CLI using >4GB - consider restarting or snapshotting" + fi + + echo "" + echo "--- System ---" + echo "Disk: $(df -h / | awk 'NR==2{print $3 "/" $2 " (" $5 " used)"}')" + echo "Load: $(cat /proc/loadavg | cut -d' ' -f1-3)" + echo "Procs: $(ps aux --no-headers | wc -l)" + echo "Journal: $(journalctl --disk-usage 2>/dev/null | grep -oP '\d+\.\d+[MG]' || echo 'unknown')" +} + +case "${1:-}" in + --help|-h) show_help ;; + --watch|-w) + while true; do + clear + check_health + echo "" + echo "[Ctrl+C to exit, refreshing in 30s...]" + sleep 30 + done + ;; + *) check_health ;; +esac HEALTHEOF sudo chmod +x /usr/local/bin/vm-health-check diff --git a/setup_cloud.sh b/setup_cloud.sh index 5fe192a..ac344da 100755 --- a/setup_cloud.sh +++ b/setup_cloud.sh @@ -16,7 +16,7 @@ readonly CLOUD_IMG_URL="https://cloud-images.ubuntu.com/noble/current/noble-serv readonly CLOUD_IMG_PATH="/var/lib/libvirt/images/ubuntu-noble-cloudimg.img" readonly DEFAULT_VM_NAME="ubuntu2404-agent" readonly DEFAULT_DISK_SIZE="50G" -readonly DEFAULT_MEMORY="8192" +readonly DEFAULT_MEMORY="16384" # 16GB - needed for Claude CLI memory leaks readonly DEFAULT_VCPUS="4" log_info() { echo -e "\033[32m[INFO]\033[0m $*"; } @@ -103,6 +103,8 @@ EOF create_vm() { local vm_name="$1" + local memory="$2" + local vcpus="$3" local disk_path="/var/lib/libvirt/images/${vm_name}.qcow2" local seed_path="/var/lib/libvirt/images/${vm_name}-seed.iso" @@ -124,11 +126,11 @@ create_vm() { seed_path="/var/lib/libvirt/images/${vm_name}-seed.iso" # Create VM - log_info "Creating VM: $vm_name" + log_info "Creating VM: $vm_name (${memory}MB RAM, ${vcpus} vCPUs)" sudo virt-install \ --name "$vm_name" \ - --vcpus "$DEFAULT_VCPUS" \ - --memory "$DEFAULT_MEMORY" \ + --vcpus "$vcpus" \ + --memory "$memory" \ --disk "path=$disk_path" \ --disk "path=$seed_path,device=cdrom" \ --os-variant ubuntu24.04 \ @@ -177,14 +179,33 @@ 
wait_for_ready() { main() { local vm_name="$DEFAULT_VM_NAME" + local memory="$DEFAULT_MEMORY" + local vcpus="$DEFAULT_VCPUS" local skip_download=false - + while [[ $# -gt 0 ]]; do case "$1" in --vm-name) vm_name="$2"; shift 2 ;; + --memory) memory="$2"; shift 2 ;; + --vcpus) vcpus="$2"; shift 2 ;; --skip-download) skip_download=true; shift ;; -h|--help) - echo "Usage: $0 [--vm-name NAME] [--skip-download]" + cat << EOF +Usage: $0 [options] + +Options: + --vm-name NAME VM name (default: $DEFAULT_VM_NAME) + --memory MB RAM in MB (default: $DEFAULT_MEMORY) + --vcpus N Number of vCPUs (default: $DEFAULT_VCPUS) + --skip-download Skip cloud image download if already present + -h, --help Show this help + +Examples: + $0 # Default: 16GB RAM, 4 vCPUs + $0 --memory 8192 --vcpus 4 # Lightweight: 8GB RAM (multiple clones) + $0 --memory 24576 --vcpus 8 # Heavy: 24GB RAM, 8 vCPUs + $0 --vm-name my-agent --memory 12288 # Custom name with 12GB RAM +EOF exit 0 ;; *) shift ;; @@ -203,7 +224,7 @@ main() { download_cloud_image fi - create_vm "$vm_name" + create_vm "$vm_name" "$memory" "$vcpus" wait_for_ready "$vm_name" echo "" diff --git a/virt_install_agent_vm.sh b/virt_install_agent_vm.sh index b793dd9..e823d36 100755 --- a/virt_install_agent_vm.sh +++ b/virt_install_agent_vm.sh @@ -22,7 +22,7 @@ readonly SCRIPT_VERSION="1.0.0" # VM defaults DEFAULT_VM_NAME="ubuntu2404-agent" DEFAULT_VCPUS="4" -DEFAULT_MEMORY="8192" # MB +DEFAULT_MEMORY="16384" # MB - 16GB needed for Claude CLI memory leaks DEFAULT_DISK_SIZE="50" # GB DEFAULT_DISK_PATH="/var/lib/libvirt/images"