vals-ai · OrestesK · Mar 20, 2026 · Mar 16, 2026 · Mar 16, 2026 · Mar 20, 2026
diff --git a/.gitignore b/.gitignore
@@ -14,4 +14,6 @@ data/logs/
 *.log
 
 *aristotle_solns*
-*solve_all.sh*
+*solve_all.sh*
+/proof_bench.egg-info
+/.env
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
diff --git a/.python_version b/.python_version
@@ -0,0 +1 @@
+3.11
diff --git a/Makefile b/Makefile
@@ -0,0 +1,38 @@
+.PHONY: help install venv_check format lint style test
+
+# Default goal (first rule in the file): print the supported targets.
+help:
+	@echo "Makefile for proof-bench"
+	@echo "Usage:"
+	@echo "  make install    Install dependencies"
+	@echo "  make format     Format code"
+	@echo "  make lint       Lint code (with auto-fix)"
+	@echo "  make style      Lint & format"
+	@echo "  make test       Run unit tests"
+
+# Create the virtualenv with Python 3.11, then sync dependencies (passing --dev).
+install:
+	uv venv --python 3.11
+	uv sync --dev
+
+# Guard prerequisite: fail with a hint when .venv/bin/activate does not exist.
+venv_check:
+	@if [ ! -f .venv/bin/activate ]; then \
+		echo "Virtualenv not found. Run 'make install' first."; \
+		exit 1; \
+	fi
+
+# Run the ruff formatter over the whole repository.
+format: venv_check
+	uv run ruff format .
+
+# Run the ruff linter over the whole repository, applying its fixes (--fix).
+lint: venv_check
+	uv run ruff check --fix .
+
+# Aggregate target; prerequisites are listed format-first, so a serial make formats before linting.
+style: format lint
+
+# Run pytest on tests/, stopping at the first failure (-x) with quiet output (-q).
+test: venv_check
+	uv run pytest tests/ -x -q
diff --git a/README.md b/README.md
@@ -17,17 +17,19 @@ Setup, local Loogle configuration, development commands, and platform run instru
 - `problems/`: Lean theorem files plus informal statements/proofs under `problems/informal/`. The folder only has sample problems; the rest of the benchmark is private.
 - `data/proof-bench.jsonl`: exported metadata used at runtime
 - `proof_bench/agent.py`: agent loop and tool orchestration
-- `proof_bench/tools.py`: `lean_run_code`, `lean_loogle`, and `submit_proof`
+- `proof_bench/tools.py`: `lean_run_code`, `lean_loogle`, and `submit_proof` tool subclasses
+- `proof_bench/mcp_client.py`: MCP client infrastructure and Lean execution
 - `proof_bench/prover.py`: attempt execution, aggregation, and logging
 - `proof_bench/load_problems.py`: exported dataset loader
 - `proof_bench/validate_and_export.py`: metadata validation and JSONL export
 - `main.py`: CLI entrypoint
 
 ## Quick Start
 
-After following `SETUP.md`, a basic run looks like:
+After following `SETUP.md`, export the dataset, then run:
 
 ```bash
+python proof_bench/validate_and_export.py
 python main.py --dataset exported --model openai/gpt-4o --k 3
 ```
 
@@ -42,6 +44,7 @@ python main.py --dataset exported --domains logic number_theory --model openai/g
 ## Where To Look Next
 
 - `SETUP.md`: installation, MCP/Loogle setup, local cache generation, development workflow, CI, and platform runs
-- `proof_bench/tools.py`: tool definitions and MCP plumbing
+- `proof_bench/tools.py`: tool subclasses (`lean_run_code`, `lean_loogle`, `submit_proof`)
+- `proof_bench/mcp_client.py`: MCP client infrastructure and Lean execution
 - `proof_bench/agent.py`: agent loop
 - `proof_bench/prover.py`: attempt execution and result aggregation
diff --git a/SETUP.md b/SETUP.md
@@ -6,19 +6,18 @@ This document contains the engineering and environment setup details for working
 
 ```bash
 curl -LsSf https://astral.sh/uv/install.sh | sh
-uv venv
+make install
 source .venv/bin/activate
-uv pip install -e ".[llm]"
 ```
 
-For development without LLM integrations:
+Create a `.env` file with your API keys:
 
 ```bash
-uv pip install -e .
+OPENAI_API_KEY='sk-...'
+ANTHROPIC_API_KEY='sk-ant-...'
+# add other provider keys as needed
 ```
 
-`[llm]` includes the public `model-library` dependency for model access.
-
 ## Lean 4
 
 ```bash
@@ -77,11 +76,10 @@ Install it with:
 uv tool install lean-lsp-mcp
 ```
 
-Run with Loogle enabled:
+Loogle is enabled by default. Use `--no-loogle` to turn it off. For local search:
 
 ```bash
-python main.py --dataset exported --model openai/gpt-4o --k 3 \
-  --enable-loogle --loogle-local
+python main.py --dataset exported --model openai/gpt-4o --k 3 --loogle-local
 ```
 
 Modes:
@@ -142,7 +140,7 @@ python -m proof_bench.loogle_daemon --port 8765
 
 # Terminal 2+
 export LOOGLE_DAEMON_URL=http://127.0.0.1:8765
-python main.py --dataset exported --model openai/gpt-4o --k 8 --enable-loogle
+python main.py --dataset exported --model openai/gpt-4o --k 8
 ```
 
 Programmatic config:
@@ -157,12 +155,9 @@ loogle_config = {
 ## Development
 
 ```bash
-pre-commit install
-pre-commit run --all-files
-ruff check .
-ruff format .
+make style
+make test
 lake build
-pytest tests/
 ```
 
 ## Upgrading Lean Or Mathlib