Merged
57 changes: 57 additions & 0 deletions .claude/skills/benchmark/README.md
@@ -0,0 +1,57 @@
# Benchmark Skill

Run the unified benchmark suite for GEMM, GEMV, and attention kernels.

## Commands

```bash
# Quick benchmark (default: GEMM + GEMV)
python -m pygpukit.benchmark --quick

# Full benchmark with all sizes
python -m pygpukit.benchmark

# Save results to JSON
python -m pygpukit.benchmark --quick --save results.json

# Compare with baseline
python -m pygpukit.benchmark --compare baseline.json

# Fail on regression (for CI)
python -m pygpukit.benchmark --compare baseline.json --fail-on-regression

# Specific benchmarks
python -m pygpukit.benchmark --gemm --sizes 4096,8192
python -m pygpukit.benchmark --gemv --dtypes bf16,fp8
python -m pygpukit.benchmark --attention --seq-lens 512,1024,2048

# All benchmarks including FP8 (SM120+)
python -m pygpukit.benchmark --all --fp8

# Markdown output for README
python -m pygpukit.benchmark --quick --markdown
```

## Output

- Time in microseconds (us)
- TFLOPS for compute benchmarks
- Correctness verification
- JSON export for regression tracking
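The JSON export makes lightweight regression tracking possible even outside the suite's built-in `--compare` flow. A minimal sketch of such a check, assuming a hypothetical schema in which a results file maps benchmark names to a `{"time_us": ...}` record (the suite's actual JSON layout may differ):

```python
import json


def check_regression(baseline_path, current_path, threshold=0.05):
    """Return (name, slowdown) pairs for benchmarks that regressed.

    A benchmark counts as a regression when its time grew by more than
    `threshold` relative to the baseline. Assumes a hypothetical schema:
    each file maps benchmark names to {"time_us": <float>}.
    """
    with open(baseline_path) as f:
        baseline = json.load(f)
    with open(current_path) as f:
        current = json.load(f)

    regressions = []
    for name, entry in current.items():
        base = baseline.get(name)
        if base is None:
            continue  # new benchmark, nothing to compare against
        slowdown = entry["time_us"] / base["time_us"] - 1.0
        if slowdown > threshold:
            regressions.append((name, slowdown))
    return regressions
```

The 0.05 default mirrors the `threshold=0.05` used by the Python API's `has_regression` check.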

## Usage in Code

```python
from pygpukit.benchmark import BenchmarkSuite

suite = BenchmarkSuite(quick=True)
suite.add_gemm(sizes=[(4096, 4096, 4096)])
suite.add_gemv(dtypes=["bf16", "fp8"])
report = suite.run()
report.save("baseline.json")

# Compare
comparison = suite.compare("baseline.json")
if comparison.has_regression(threshold=0.05):
    print("Regression detected!")
```
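The correctness verification listed above is, at heart, a relative-error comparison against a reference result. A minimal illustrative sketch of one such metric (not the suite's actual implementation), taking the worst-case elementwise relative error over flat sequences of values:

```python
def max_relative_error(result, reference, eps=1e-12):
    """Worst-case elementwise relative error of `result` vs. `reference`.

    `eps` guards against division by zero for zero reference elements.
    """
    return max(abs(a - b) / (abs(b) + eps) for a, b in zip(result, reference))
```

A kernel output would then pass verification when this value stays below a dtype-appropriate tolerance (looser for bf16/fp8 than for fp32).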
37 changes: 23 additions & 14 deletions CLAUDE.md
@@ -633,7 +633,7 @@ python -m mypy src/ --ignore-missing-imports --disable-error-code=union-attr --d
python -m pytest tests/ -v

# 4. Benchmark runs (optional but recommended)
python benchmark.py --quick
python -m pygpukit.benchmark --quick
```

**DO NOT create PR until all checks pass locally.**
@@ -679,27 +679,36 @@ If performance or correctness degrades:

### Benchmarking

**Always use `benchmark.py` for performance measurement.**
**Use the unified benchmark suite: `python -m pygpukit.benchmark`**

```bash
# Full benchmark (all dtypes, all sizes)
python benchmark.py
# Quick benchmark (GEMM + GEMV)
python -m pygpukit.benchmark --quick

# Quick mode (fewer warmup/iterations)
python benchmark.py --quick
# Full benchmark
python -m pygpukit.benchmark

# Specific sizes
python benchmark.py --sizes 4096 8192
# Save results and compare with baseline
python -m pygpukit.benchmark --quick --save baseline.json
python -m pygpukit.benchmark --compare baseline.json --fail-on-regression

# TF32 kernel version selection
python benchmark.py --tf32-version v1 # WMMA API
python benchmark.py --tf32-version v2 # PTX mma.sync (default)
# Specific benchmarks
python -m pygpukit.benchmark --gemm --sizes 4096,8192
python -m pygpukit.benchmark --gemv --dtypes bf16,fp8
python -m pygpukit.benchmark --attention --seq-lens 512,1024

# All benchmarks including FP8 (SM120+)
python -m pygpukit.benchmark --all --fp8

# Markdown output for README
python -m pygpukit.benchmark --quick --markdown
```

**Output includes:**
- Kernel-only timing (no D2H copy overhead)
- Correctness verification (relative error)
- README.md-ready table format
- Time in microseconds (us)
- TFLOPS for compute benchmarks
- Correctness verification
- JSON export for regression tracking
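The TFLOPS figures follow the standard GEMM operation count of 2·M·N·K FLOPs (one multiply and one add per inner-product term). A small sketch of the conversion, assuming kernel-only timings in microseconds as reported above:

```python
def gemm_tflops(m, n, k, time_us):
    """TFLOPS for an M x N x K GEMM from kernel-only time in microseconds.

    Uses the conventional 2*M*N*K FLOP count (multiply + add per term).
    """
    flops = 2 * m * n * k
    return flops / (time_us * 1e-6) / 1e12
```

For example, a 4096×4096×4096 GEMM that finishes in 1000 us comes out to roughly 137.4 TFLOPS.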

**Environment Variables:**
- `PYGPUKIT_ALLOW_TF32=1` - Enable TF32 TensorCore
213 changes: 0 additions & 213 deletions benchmarks/benchmark_gemv_all.py

This file was deleted.
