Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@

# ShellForge

**Governed local AI agents — one Go binary, zero cloud.**
**Governed AI agent runtime — one Go binary, local or cloud.**

[![Go](https://img.shields.io/badge/Go-1.18+-00ADD8?style=for-the-badge&logo=go&logoColor=white)](https://go.dev)
[![GitHub Pages](https://img.shields.io/badge/Live_Site-agentguardhq.github.io/shellforge-ff6b2b?style=for-the-badge)](https://agentguardhq.github.io/shellforge)
[![License: MIT](https://img.shields.io/badge/License-MIT-blue?style=for-the-badge)](LICENSE)
[![AgentGuard](https://img.shields.io/badge/Governed_by-AgentGuard-green?style=for-the-badge)](https://github.com/AgentGuardHQ/agentguard)

*Run autonomous AI agents on your machine with policy enforcement on every tool call. No cloud. No API keys. No data leaves your laptop.*
*Run autonomous AI agents with policy enforcement on every tool call. Local via Ollama or cloud via Anthropic API — your choice.*

[Website](https://agentguardhq.github.io/shellforge) · [Docs](docs/architecture.md) · [Roadmap](docs/roadmap.md) · [AgentGuard](https://github.com/AgentGuardHQ/agentguard)

Expand Down Expand Up @@ -112,9 +112,11 @@ shellforge status

| Command | Description |
|---------|-------------|
| `shellforge run <driver> "prompt"` | Run a governed agent (goose, claude, copilot, codex, gemini) |
| `shellforge agent "prompt"` | Run a governed agent (Ollama, default) |
| `shellforge agent --provider anthropic "prompt"` | Run via Anthropic API (Haiku/Sonnet/Opus, prompt caching) |
| `shellforge agent --thinking-budget 8000 "prompt"` | Enable extended thinking (Sonnet/Opus) |
| `shellforge run <driver> "prompt"` | Run a governed CLI driver (goose, claude, copilot, codex, gemini) |
| `shellforge setup` | Install Ollama, create governance config, verify stack |
| `shellforge agent "prompt"` | Run a governed agent — every tool call checked |
| `shellforge qa [dir]` | QA analysis — find test gaps and issues |
| `shellforge report [repo]` | Generate a status report from git + logs |
| `shellforge serve agents.yaml` | Daemon mode — run a 24/7 agent swarm |
Expand Down
119 changes: 119 additions & 0 deletions cmd/shellforge/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/AgentGuardHQ/shellforge/internal/llm"
"github.com/AgentGuardHQ/shellforge/internal/logger"
"github.com/AgentGuardHQ/shellforge/internal/ollama"
"github.com/AgentGuardHQ/shellforge/internal/ralph"
"github.com/AgentGuardHQ/shellforge/internal/scheduler"
)

Expand Down Expand Up @@ -84,6 +85,8 @@ os.Exit(1)
}
cmdAgent(strings.Join(filtered, " "), providerName, thinkingBudget)
}
case "ralph":
cmdRalph()
case "swarm":
cmdSwarm()
case "serve":
Expand Down Expand Up @@ -120,6 +123,7 @@ Usage:
shellforge scan [dir] DefenseClaw supply chain scan
shellforge version Print version

shellforge ralph [flags] Run Ralph Loop (stateless-iterative task execution)
shellforge serve [config] Simple daemon mode (built-in scheduler)
shellforge swarm Setup Dagu orchestration (DAG workflows + web UI)

Expand Down Expand Up @@ -724,6 +728,121 @@ printResult("prototype-agent", result)
saveReport("outputs/logs", "prototype", result)
}

// cmdRalph implements the `shellforge ralph` subcommand. It parses the
// ralph-specific flags from os.Args[2:], selects an LLM provider, builds a
// ralph.RalphConfig and runs the Ralph Loop, then prints a per-task summary.
//
// Recognised flags:
//
//	--tasks file     task file to read (default "tasks.json")
//	--log file       JSONL log file (default "ralph-log.jsonl")
//	--provider name  "anthropic", or omitted for the Ollama path
//	--validate cmd   validation command; may be repeated
//	--dry-run        passed through as RalphConfig.DryRun
//	--auto-commit    passed through as RalphConfig.AutoCommit
//	--max-tasks N    passed through as RalphConfig.MaxTasks (default 0)
//
// An unknown flag, a flag that is missing its value, or a non-numeric
// --max-tasks value prints an error plus the usage line to stderr and exits
// with status 1. A failure returned by ralph.RunRalph also exits with status 1.
func cmdRalph() {
	const usage = "Usage: shellforge ralph [--tasks file] [--log file] [--provider name] [--validate cmd] [--dry-run] [--auto-commit] [--max-tasks N]"

	// NOTE(review): presumably exits the process when governance cannot be
	// loaded (must- prefix) — confirm against mustGovernance.
	engine := mustGovernance()

	// Flag defaults.
	taskFile := "tasks.json"
	logFile := "ralph-log.jsonl"
	providerName := ""
	dryRun := false
	autoCommit := false
	maxTasks := 0
	var validate []string

	// args is consumed front to back; current is the flag being processed.
	args := os.Args[2:]
	current := ""

	// fail reports a command-line error followed by the usage line and exits.
	fail := func(msg string) {
		fmt.Fprintln(os.Stderr, msg)
		fmt.Fprintln(os.Stderr, usage)
		os.Exit(1)
	}

	// value consumes and returns the argument following the current flag.
	// A value-taking flag with nothing after it is an error rather than
	// being silently dropped.
	value := func() string {
		if len(args) == 0 {
			fail("Missing value for ralph flag: " + current)
		}
		v := args[0]
		args = args[1:]
		return v
	}

	for len(args) > 0 {
		current, args = args[0], args[1:]
		switch current {
		case "--tasks":
			taskFile = value()
		case "--log":
			logFile = value()
		case "--provider":
			providerName = value()
		case "--validate":
			validate = append(validate, value())
		case "--dry-run":
			dryRun = true
		case "--auto-commit":
			autoCommit = true
		case "--max-tasks":
			raw := value()
			// Reject non-numeric input instead of silently falling back to 0.
			if n, err := fmt.Sscanf(raw, "%d", &maxTasks); err != nil || n != 1 {
				fail("Invalid --max-tasks value: " + raw)
			}
		default:
			fail("Unknown ralph flag: " + current)
		}
	}

	// provider stays nil unless an explicit provider was requested.
	var provider llm.Provider
	switch providerName {
	case "anthropic":
		apiKey := os.Getenv("ANTHROPIC_API_KEY")
		if apiKey == "" {
			fmt.Fprintln(os.Stderr, "Error: ANTHROPIC_API_KEY environment variable not set")
			os.Exit(1)
		}
		// ANTHROPIC_MODEL overrides the built-in default model.
		model := os.Getenv("ANTHROPIC_MODEL")
		if model == "" {
			model = "claude-haiku-4-5-20251001"
		}
		provider = llm.NewAnthropicProvider(apiKey, model)
		fmt.Fprintf(os.Stderr, "[ralph] Using Anthropic API (model: %s)\n", model)
	case "":
		// Legacy Ollama path.
		// NOTE(review): presumably verifies Ollama is available — confirm.
		mustOllama()
	default:
		fmt.Fprintf(os.Stderr, "Unknown provider: %s\n", providerName)
		os.Exit(1)
	}

	cfg := ralph.RalphConfig{
		TaskSource: ralph.SourceFile,
		TaskFile:   taskFile,
		LogFile:    logFile,
		Validate:   validate,
		AutoCommit: autoCommit,
		MaxTasks:   maxTasks,
		DryRun:     dryRun,
		LoopConfig: agent.LoopConfig{
			Agent:       "ralph-agent",
			System:      "You are a senior engineer. Complete the requested task using available tools. Read files, write files, run commands. Be precise and thorough.",
			Model:       ollama.Model,
			MaxTurns:    15,
			TimeoutMs:   180_000,
			OutputDir:   "outputs/logs",
			TokenBudget: 3000,
			Provider:    provider,
		},
	}

	fmt.Printf("[ralph] Starting Ralph Loop — tasks: %s, dry-run: %v\n", taskFile, dryRun)

	result, err := ralph.RunRalph(cfg, engine)
	if err != nil {
		fmt.Fprintf(os.Stderr, "ERROR: ralph loop: %s\n", err)
		os.Exit(1)
	}

	fmt.Println()
	fmt.Printf("[ralph] Complete — %d completed, %d failed, %d skipped (of %d total)\n",
		result.Completed, result.Failed, result.Skipped, result.Total)

	// Every entry that is not explicitly failed is labelled "completed".
	for _, entry := range result.Entries {
		status := "completed"
		if entry.Status == ralph.StatusFailed {
			status = "FAILED"
		}
		fmt.Printf("  [%s] task %s: %s (%d turns, %dms)\n",
			status, entry.TaskID, entry.Description, entry.Turns, entry.DurationMs)
	}
}

func cmdSwarm() {
fmt.Println("=== ShellForge Swarm Setup (Dagu) ===")
fmt.Println()
Expand Down
7 changes: 6 additions & 1 deletion docs/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,14 @@ cmd/shellforge/
└── status.go # Ecosystem health check

internal/
├── llm/ # LLM provider interface
│ ├── provider.go # Provider interface (Chat, Name) + Message/Response types
│ └── anthropic.go # Anthropic API adapter (stdlib HTTP, prompt caching, tool_use)
├── agent/ # Agentic loop
│ ├── loop.go # runProviderLoop (Anthropic) + runOllamaLoop, drift detection wiring
│ └── drift.go # Drift detector — self-score every 5 calls, steer/kill on low scores
├── governance/ # agentguard.yaml parser + policy engine
├── ollama/ # Ollama HTTP client (chat, generate)
├── agent/ # Native fallback agentic loop
├── tools/ # 5 tool implementations + RTK wrapper
├── engine/ # Pluggable engine interface (Goose, OpenClaw, OpenCode)
├── logger/ # Structured JSON logging
Expand Down
20 changes: 14 additions & 6 deletions docs/roadmap.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,22 @@
- [x] Tested Aider (file editing only, no shell execution)
- [x] Evaluated Goose (Block) — native Ollama, actually executes tools

### v0.6.0 — Goose + Governed Shell ← CURRENT
### v0.6.0 — Goose + Governed Shell
- [x] Goose as local model driver (`shellforge run goose`)
- [x] `govern-shell.sh` — shell wrapper that evaluates every command through AgentGuard
- [x] `shellforge run goose` sets SHELL to governed wrapper automatically
- [x] Fixed catch-all deny bug (bounded-execution policy was denying everything)
- [x] Dagu DAG templates (sdlc-swarm, studio-swarm, workspace-swarm, multi-driver)

### v0.7.0 — Anthropic API Provider ← CURRENT
- [x] LLM provider interface (`llm.Provider`) — pluggable Ollama vs Anthropic backends
- [x] Anthropic API adapter — stdlib HTTP, structured `tool_use` blocks, multi-turn history
- [x] Prompt caching — `cache_control: ephemeral` on system + tools, ~90% savings on cached tokens
- [x] Extended thinking budget (`--thinking-budget` flag)
- [x] Model cascading via Octi Pulpo (Haiku→Sonnet→Opus by `TaskComplexity` score)
- [x] Drift detection — self-score every 5 tool calls, steer below 7, kill below 5 twice
- [x] RTK token compression wired into `runShellWithRTK()` (70-90% savings on shell output)

---

## In Progress
Expand Down Expand Up @@ -110,11 +119,10 @@ ShellForge orchestrates, Octi Pulpo coordinates, AgentGuard governs. This phase
- [ ] Move `internal/` types to `pkg/` for external import
- [ ] Cloud telemetry opt-in (AgentGuard Cloud)

### Phase 11 — Replace Workspace Bash Swarm
- [ ] Dagu replaces `server/deploy.sh` + cron + queue.txt
- [ ] Multi-driver DAGs: Claude Code + Copilot + Codex on Linux box
- [ ] Same governance policy across all drivers
- [ ] ShellForge as the runtime for agentguard-workspace swarm
### Phase 11 — Replace Workspace Bash Swarm ✅ DONE
- [x] Migrated to API-driven dispatch: Octi Pulpo → ShellForge → Anthropic API
- [x] GH Actions Copilot Agent workflow (`dispatch-agent.yml`) for free-tier automation
- [x] ShellForge is now the execution harness for the agentguard-workspace swarm

---

Expand Down
54 changes: 54 additions & 0 deletions internal/orchestrator/compress.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package orchestrator

// tokenThreshold is the maximum output size (in estimated tokens) that
// passes through without compression. Outputs at or below this are returned as-is.
const tokenThreshold = 750

// CompressResult compresses a sub-agent output if it exceeds the token threshold.
// Strategy:
//  1. If output is at or below tokenThreshold tokens (estimated), return as-is
//  2. Otherwise truncate to roughly the threshold and append a marker that
//     reports how many estimated tokens were omitted
//
// The cut never lands inside a multi-byte UTF-8 sequence, so valid UTF-8
// input always yields valid UTF-8 output.
//
// A future version will use LLM summarization when a Provider is available.
func CompressResult(output string) string {
	estimated := estimateTokens(output)
	if estimated <= tokenThreshold {
		return output
	}

	// Truncate to approximately tokenThreshold tokens (4 chars per token estimate).
	// estimated > tokenThreshold implies len(output) > tokenThreshold*4, so
	// cut is always a valid index into output.
	cut := tokenThreshold * 4

	// Back up to a rune boundary: bytes of the form 10xxxxxx are UTF-8
	// continuation bytes. A rune has at most three of them, so the back-off
	// is bounded even for input that is not valid UTF-8.
	for back := 0; back < 3 && cut > 0 && output[cut]&0xC0 == 0x80; back++ {
		cut--
	}

	return output[:cut] + "\n\n[... output truncated — " + itoa(estimated-tokenThreshold) + " tokens omitted]"
}

// estimateTokens returns an approximate token count for s, assuming one
// token per four bytes and rounding down.
func estimateTokens(s string) int {
	const bytesPerToken = 4
	byteLen := len(s)
	return byteLen / bytesPerToken
}

// itoa returns the base-10 representation of n. It is hand-rolled so that
// this file needs no imports.
//
// Every int value is handled, including the minimum int, whose magnitude
// cannot be represented as a positive int.
func itoa(n int) string {
	// Work on the unsigned magnitude. Negating the minimum int overflows and
	// stays negative, whereas unsigned negation wraps to the correct value.
	mag := uint(n)
	if n < 0 {
		mag = -mag
	}

	// 20 bytes holds the longest 64-bit result: "-9223372036854775808".
	var buf [20]byte
	i := len(buf)

	// Emit digits least-significant first, filling the buffer from the end.
	// The loop body runs at least once, so zero produces "0".
	for {
		i--
		buf[i] = byte('0' + mag%10)
		mag /= 10
		if mag == 0 {
			break
		}
	}

	if n < 0 {
		i--
		buf[i] = '-'
	}
	return string(buf[i:])
}
83 changes: 83 additions & 0 deletions internal/orchestrator/compress_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
package orchestrator

import (
"strings"
"testing"
)

// TestCompressResult_ShortOutput verifies that an input far below the token
// threshold is returned unchanged.
func TestCompressResult_ShortOutput(t *testing.T) {
	const input = "This is a short output."
	if got := CompressResult(input); got != input {
		t.Errorf("short output should pass through unchanged, got %q", got)
	}
}

// TestCompressResult_ExactThreshold verifies that an input estimated at
// exactly the threshold (750 tokens, i.e. 3000 bytes) is not truncated.
func TestCompressResult_ExactThreshold(t *testing.T) {
	const (
		tokens        = 750
		bytesPerToken = 4
	)
	input := strings.Repeat("a", tokens*bytesPerToken)
	if CompressResult(input) != input {
		t.Error("output at exact threshold should pass through unchanged")
	}
}

// TestCompressResult_OverThreshold verifies that a 16000-byte input (about
// 4000 estimated tokens) comes back shorter and carries the truncation marker.
func TestCompressResult_OverThreshold(t *testing.T) {
	input := strings.Repeat("x", 4000*4)
	got := CompressResult(input)

	if inLen, outLen := len(input), len(got); outLen >= inLen {
		t.Errorf("compressed result should be shorter than original (%d >= %d)", outLen, inLen)
	}

	// Each marker word must appear in the truncated output.
	markers := []struct {
		word    string
		failure string
	}{
		{"truncated", "compressed result should contain truncation marker"},
		{"omitted", "compressed result should indicate omitted tokens"},
	}
	for _, m := range markers {
		if !strings.Contains(got, m.word) {
			t.Error(m.failure)
		}
	}
}

// TestCompressResult_Empty verifies that the empty string is returned as the
// empty string.
func TestCompressResult_Empty(t *testing.T) {
	if got := CompressResult(""); got != "" {
		t.Errorf("empty input should return empty, got %q", got)
	}
}

// TestEstimateTokens checks the four-bytes-per-token estimate against a
// small table of inputs.
func TestEstimateTokens(t *testing.T) {
	type testCase struct {
		text string
		want int
	}
	cases := []testCase{
		{text: "", want: 0},
		{text: "abcd", want: 1},
		{text: "12345678", want: 2},
		{text: strings.Repeat("a", 100), want: 25},
	}
	for i := range cases {
		tc := cases[i]
		if got := estimateTokens(tc.text); got != tc.want {
			t.Errorf("estimateTokens(%d chars): expected %d, got %d", len(tc.text), tc.want, got)
		}
	}
}

// TestItoa checks the hand-rolled integer formatter against zero, positive
// and negative values.
func TestItoa(t *testing.T) {
	type testCase struct {
		value int
		want  string
	}
	cases := []testCase{
		{value: 0, want: "0"},
		{value: 1, want: "1"},
		{value: 42, want: "42"},
		{value: -5, want: "-5"},
		{value: 1000, want: "1000"},
	}
	for i := range cases {
		tc := cases[i]
		if got := itoa(tc.value); got != tc.want {
			t.Errorf("itoa(%d): expected %q, got %q", tc.value, tc.want, got)
		}
	}
}
Loading
Loading