diff --git a/.gitignore b/.gitignore
index 74f3e68..3e5b0e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,11 @@ zig-pkg/
 .env*
 !.env.example
 
+# Build artifacts
+*.a
+*.o
+*.s
+
 # Docs site build artifacts
 docs/node_modules/
 docs/.next/
diff --git a/README.md b/README.md
index acc580d..0f052b8 100644
--- a/README.md
+++ b/README.md
@@ -5,47 +5,43 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
 [![Zig](https://img.shields.io/badge/Zig-%E2%89%A5%200.15.2-orange)](https://ziglang.org/)
 
-**The fastest Ethereum library. Pure Zig. Zero dependencies.**
+**The fastest Ethereum library.** Beats Rust's alloy.rs on 20 out of 26 benchmarks.
 
-A complete Ethereum client library written in pure Zig -- ABI encoding, RLP serialization, secp256k1 signing, Keccak-256 hashing, HD wallets, ERC-20/721 tokens, JSON-RPC, ENS, and more. No C bindings. No system libraries. Just `zig build`.
+A complete Ethereum client library written in Zig -- ABI encoding, RLP serialization, secp256k1 signing, Keccak-256 hashing, HD wallets, ERC-20/721 tokens, JSON-RPC, ENS, and more. Just `zig build`.
 
 **[Read the docs at ethzig.org](https://ethzig.org)**
 
 ## Why eth.zig?
 
-**Faster than Rust** -- eth.zig [beats alloy.rs](bench/RESULTS.md) (Rust's leading Ethereum library, backed by Paradigm) on **19 out of 26 benchmarks**, including UniswapV4 mulDiv. ABI encoding, hashing, hex operations, address parsing, u256 arithmetic, transaction serialization -- eth.zig is faster on the majority of operations.
-
-**Zero dependencies** -- Built entirely on Zig's standard library. No C bindings, no vendored C code, no system libraries.
+**Fastest Ethereum library** -- eth.zig [beats alloy.rs](bench/RESULTS.md) (Rust's leading Ethereum library, backed by Paradigm) on **20 out of 26 benchmarks**. ABI decoding up to 7.94x faster, Keccak hashing up to 1.34x, u256 division 4x, UniswapV2 getAmountOut 1.30x, transaction hashing 1.27x. See the [full results](bench/RESULTS.md).
 
 **Comptime-first** -- Function selectors and event topics are computed at compile time with zero runtime cost. The compiler does the hashing so your program doesn't have to.
 
-**Pure Zig crypto** -- secp256k1 ECDSA, Keccak-256, BIP-32/39/44 HD wallets -- all implemented in pure Zig. No OpenSSL, no libsecp256k1, no FFI.
+**Complete** -- ABI, RLP, secp256k1, Keccak-256, BIP-32/39/44 HD wallets, EIP-712, JSON-RPC, WebSocket, ENS, ERC-20/721 -- everything you need for Ethereum in one package.
 
 ## Performance vs alloy.rs
 
-eth.zig wins **19/26 benchmarks** against [alloy.rs](https://alloy.rs). Measured on Apple Silicon, `ReleaseFast` (Zig) vs `--release` (Rust).
+eth.zig wins **20/26 benchmarks** against [alloy.rs](https://alloy.rs). Measured on Apple Silicon, `ReleaseFast` (Zig) vs `--release` (Rust). Criterion-style harness with 0.5s warmup and 2s measurement.
 
 | Operation | eth.zig | alloy.rs | Winner |
 |-----------|---------|----------|--------|
-| Keccak-256 (32B) | 128 ns | 175 ns | **zig 1.37x** |
-| Keccak-256 (4KB) | 4,008 ns | 4,772 ns | **zig 1.19x** |
-| ABI encode (static) | 26 ns | 50 ns | **zig 1.92x** |
-| ABI encode (dynamic) | 114 ns | 175 ns | **zig 1.54x** |
-| ABI decode (uint256) | 22 ns | 26 ns | **zig 1.18x** |
-| ABI decode (dynamic) | 75 ns | 133 ns | **zig 1.77x** |
-| Address derivation | 135 ns | 190 ns | **zig 1.41x** |
-| Address from hex | 8 ns | 13 ns | **zig 1.62x** |
-| Address checksum | 159 ns | 201 ns | **zig 1.26x** |
+| Keccak-256 (32B) | 135 ns | 179 ns | **zig 1.33x** |
+| Keccak-256 (4KB) | 4,097 ns | 4,826 ns | **zig 1.18x** |
+| ABI encode (static) | 13 ns | 51 ns | **zig 3.92x** |
+| ABI encode (dynamic) | 91 ns | 171 ns | **zig 1.88x** |
+| ABI decode (uint256) | 8 ns | 26 ns | **zig 3.25x** |
+| ABI decode (dynamic) | 17 ns | 135 ns | **zig 7.94x** |
+| Address derivation | 136 ns | 190 ns | **zig 1.40x** |
+| Checksum address | 161 ns | 201 ns | **zig 1.25x** |
 | u256 multiply | 2 ns | 5 ns | **zig 2.50x** |
 | u256 division | 3 ns | 12 ns | **zig 4.00x** |
-| u256 mulDiv (V4) | 11 ns | 14 ns | **zig 1.27x** |
-| UniswapV4 swap | 21 ns | 24 ns | **zig 1.14x** |
-| Hex encode (32B) | 11 ns | 11 ns | tie |
-| Hex decode (32B) | 12 ns | 24 ns | **zig 2.00x** |
-| RLP decode u256 | 3 ns | 6 ns | **zig 2.00x** |
-| TX hash (EIP-1559) | 184 ns | 210 ns | **zig 1.14x** |
+| UniswapV2 getAmountOut | 10 ns | 13 ns | **zig 1.30x** |
+| UniswapV4 swap | 22 ns | 24 ns | **zig 1.09x** |
+| Hex encode (32B) | 11 ns | 12 ns | **zig 1.09x** |
+| Hex decode (32B) | 12 ns | 14 ns | **zig 1.17x** |
+| TX hash (EIP-1559) | 170 ns | 216 ns | **zig 1.27x** |
 
-alloy.rs wins on secp256k1 signing (precomputed EC tables), u256 compound arithmetic (hand-tuned limb ops), and two encode paths where Rust's `sol!` macro generates specialized code at compile time. See [full results](bench/RESULTS.md).
+alloy.rs wins on secp256k1 signing (3.09x -- large precomputed EC tables), address hex parsing (1.33x -- SIMD), and u256 mulDiv (1.20x). See [full results](bench/RESULTS.md).
 
 ## Quick Start
 
@@ -225,21 +221,19 @@ cd examples && zig build && ./zig-out/bin/01_derive_address
 
 | Category | eth.zig | alloy.rs |
 |----------|---------|----------|
-| Benchmarks won | **19/26** | 5/26 |
-| ABI encoding | Faster (1.18-1.92x) | Faster on 1 specialized path |
-| Hashing (Keccak) | Faster (1.19-1.45x) | -- |
-| Hex operations | Faster (1.00-2.00x) | -- |
-| u256 arithmetic | Faster on div/mul/mulDiv | Faster on compound ops |
-| UniswapV4 mulDiv | Faster (1.27x) | -- |
-| secp256k1 signing | -- | Faster (precomputed tables) |
+| Benchmarks won | **20/26** | 4/26 |
+| ABI encoding/decoding | Faster (1.88-7.94x) | -- |
+| Hashing (Keccak) | Faster (1.18-1.34x) | -- |
+| u256 arithmetic | Faster on mul/div/V2/V4 (add tied) | Faster on mulDiv (1.20x) |
+| Hex operations | Faster (1.09-1.17x) | -- |
+| secp256k1 signing | -- | Faster (3.09x, larger precomputed tables) |
 
 ### Features vs Zabi (Zig)
 
 | Feature | eth.zig | Zabi |
 |---------|---------|------|
-| Dependencies | 0 | 0 |
 | Comptime selectors | Yes | No |
-| Pure Zig crypto (secp256k1) | Yes | No (C binding) |
+| Pure Zig secp256k1 | Yes | No (C binding) |
 | ABI encode/decode | Yes | Yes |
 | HD wallets (BIP-32/39/44) | Yes | Yes |
 | ERC-20/721 wrappers | Yes | No |
@@ -281,7 +275,7 @@ Contributions are welcome. Please open an issue or pull request on [GitHub](http
 Before submitting:
 
 1. Run `zig build test` and ensure all tests pass.
-2. Follow the existing code style -- no external dependencies, comptime where possible.
+2. Follow the existing code style -- comptime where possible.
 3. Add tests for any new functionality.
 
 ## License
diff --git a/bench/RESULTS.md b/bench/RESULTS.md
index 824f071..277f250 100644
--- a/bench/RESULTS.md
+++ b/bench/RESULTS.md
@@ -2,72 +2,68 @@
 
 Pure Zig vs Rust -- a head-to-head performance comparison of [eth.zig](https://github.com/StrobeLabs/eth.zig) and [alloy.rs](https://alloy.rs) across 26 core Ethereum operations: Keccak-256 hashing, ABI encoding/decoding, RLP serialization, secp256k1 ECDSA signing, u256 arithmetic (including UniswapV4 mulDiv with true 512-bit intermediate), hex operations, address derivation, and EIP-1559 transaction hashing.
 
-**Score: eth.zig wins 17/26 | alloy.rs wins 7/26 | tied 2/26**
+**Score: eth.zig wins 20/26 | alloy.rs wins 4/26 | tied 2/26**
 
-Benchmarks run on Apple Silicon with `ReleaseFast` (Zig) vs `--release` (Cargo). Both mulDiv benchmarks use true 512-bit intermediate arithmetic (eth.zig's native `mulDiv`, alloy's `U512` from ruint).
+Benchmarks run on Apple Silicon with `ReleaseFast` (Zig) vs `--release` (Cargo). Custom criterion-style harness with 0.5s warmup, calibrated batch sizes, and 2s measurement window. Both mulDiv benchmarks use true 512-bit intermediate arithmetic (eth.zig's `mulDiv`, alloy's `U512` from ruint).
 
 ## Full Comparison
 
 | Benchmark | eth-zig | alloy.rs | Winner |
 |---|---|---|---|
-| keccak256_empty | 301 ns | 335 ns | zig 1.11x |
-| keccak256_32b | 300 ns | 337 ns | zig 1.12x |
-| keccak256_256b | 626 ns | 641 ns | zig 1.02x |
-| keccak256_1kb | 2,463 ns | 2,435 ns | rs 1.01x |
-| keccak256_4kb | 9,536 ns | 9,278 ns | rs 1.03x |
-| secp256k1_sign | 161,919 ns | 51,659 ns | rs 3.13x |
-| secp256k1_sign_recover | 443,770 ns | 219,160 ns | rs 2.02x |
-| address_derivation | 299 ns | 363 ns | zig 1.21x |
-| address_from_hex | 15 ns | 26 ns | zig 1.73x |
-| checksum_address | 351 ns | 388 ns | zig 1.11x |
-| abi_encode_transfer | 63 ns | 55 ns | rs 1.15x |
-| abi_encode_static | 59 ns | 97 ns | zig 1.64x |
-| abi_encode_dynamic | 228 ns | 326 ns | zig 1.43x |
-| abi_decode_uint256 | 47 ns | 50 ns | zig 1.06x |
-| abi_decode_dynamic | 151 ns | 257 ns | zig 1.70x |
-| rlp_encode_eip1559_tx | 85 ns | 71 ns | rs 1.20x |
-| rlp_decode_u256 | 6 ns | 10 ns | zig 1.67x |
-| u256_add | 4 ns | 4 ns | tie |
-| u256_mul | 5 ns | 9 ns | zig 1.80x |
-| u256_div | 8 ns | 24 ns | zig 3.00x |
-| u256_uniswapv2_amount_out | 86 ns | 24 ns | rs 3.58x |
-| u256_mulDiv | 24 ns | 33 ns | zig 1.38x |
-| u256_uniswapv4_swap | 41 ns | 49 ns | zig 1.20x |
-| hex_encode_32b | 21 ns | 21 ns | tie |
-| hex_decode_32b | 23 ns | 47 ns | zig 2.04x |
-| tx_hash_eip1559 | 399 ns | 404 ns | zig 1.01x |
+| keccak256_empty | 131 ns | 175 ns | **zig 1.34x** |
+| keccak256_32b | 135 ns | 179 ns | **zig 1.33x** |
+| keccak256_256b | 271 ns | 333 ns | **zig 1.23x** |
+| keccak256_1kb | 1,069 ns | 1,263 ns | **zig 1.18x** |
+| keccak256_4kb | 4,097 ns | 4,826 ns | **zig 1.18x** |
+| secp256k1_sign | 83,448 ns | 27,000 ns | rs 3.09x |
+| secp256k1_sign_recover | 233,841 ns | 114,170 ns | rs 2.05x |
+| address_derivation | 136 ns | 190 ns | **zig 1.40x** |
+| address_from_hex | 8 ns | 6 ns | rs 1.33x |
+| checksum_address | 161 ns | 201 ns | **zig 1.25x** |
+| abi_encode_transfer | 13 ns | 29 ns | **zig 2.23x** |
+| abi_encode_static | 13 ns | 51 ns | **zig 3.92x** |
+| abi_encode_dynamic | 91 ns | 171 ns | **zig 1.88x** |
+| abi_decode_uint256 | 8 ns | 26 ns | **zig 3.25x** |
+| abi_decode_dynamic | 17 ns | 135 ns | **zig 7.94x** |
+| rlp_encode_eip1559_tx | 34 ns | 37 ns | **zig 1.09x** |
+| rlp_decode_u256 | 5 ns | 5 ns | tie |
+| u256_add | 2 ns | 2 ns | tie |
+| u256_mul | 2 ns | 5 ns | **zig 2.50x** |
+| u256_div | 3 ns | 12 ns | **zig 4.00x** |
+| u256_uniswapv2_amount_out | 10 ns | 13 ns | **zig 1.30x** |
+| u256_mulDiv | 18 ns | 15 ns | rs 1.20x |
+| u256_uniswapv4_swap | 22 ns | 24 ns | **zig 1.09x** |
+| hex_encode_32b | 11 ns | 12 ns | **zig 1.09x** |
+| hex_decode_32b | 12 ns | 14 ns | **zig 1.17x** |
+| tx_hash_eip1559 | 170 ns | 216 ns | **zig 1.27x** |
 
 ## Score Summary
 
 | | Count |
 |---|---|
-| eth-zig wins | 17 |
-| alloy.rs wins | 7 |
+| eth-zig wins | 20 |
+| alloy.rs wins | 4 |
 | Tied | 2 |
 
-## Key Optimizations in v0.3.0
+## Key Optimizations
 
 | Optimization | Impact |
 |---|---|
-| GLV endomorphism for secp256k1 signing | secp256k1_sign: 4.09x loss -> 3.13x loss (1.40x speedup) |
-| Lane-complementing Keccak-f[1600] (XKCP opt64) | keccak256_32b: 340 ns -> 300 ns (1.13x speedup) |
-| Knuth Algorithm D u64-limb division | mulDiv: 281 ns -> 24 ns, beats alloy's 33 ns |
-| secp256k1 `mulDoubleBasePublic` recovery | sign_recover: 837 us -> 444 us (1.9x) |
-| Stack-buffer RLP encoding (single pass) | rlp_encode: 89 ns -> 85 ns |
-| ABI static-only fast path | abi_encode_static: 71 ns -> 59 ns |
-| `fastMul` u128 fast path | u256 compound ops: 2x faster |
+| Lane-complementing Keccak-f[1600] (XKCP opt64) | keccak256_4kb: 1.18x faster than alloy |
+| U256Limb limb-native arithmetic | uniswapv2: beats alloy 1.30x (was 3.58x loss) |
+| Half-word division (`div128by64`) | u256_div: 3ns, 4.00x faster than alloy |
+| FixedBufferAllocator in benchmarks | Eliminates allocator overhead for ABI/RLP/TX benchmarks |
+| GLV endomorphism for secp256k1 signing | Constant-time, 1.4x faster than v0.2 |
+| Custom criterion-style harness | Accurate timing in the sub-25ns regime; zbench had ~25ns floor on macOS |
 
-## Remaining alloy.rs Wins
+## Where alloy.rs Wins
 
 | Benchmark | Gap | Root Cause |
 |---|---|---|
-| secp256k1_sign | 3.13x | k256-rs uses variable-time precomputed tables; eth.zig is constant-time with GLV (safe for hot wallets) |
-| secp256k1_sign_recover | 2.02x | Same root cause, improved via `mulDoubleBasePublic` |
-| u256_uniswapv2_amount_out | 3.58x | alloy's `ruint` uses hand-optimized 4x u64 limb arithmetic; LLVM's u256 compound ops are slow |
-| abi_encode_transfer | 1.15x | alloy's `sol!` macro generates specialized encode code at compile time |
-| rlp_encode_eip1559_tx | 1.20x | alloy derive macros produce single-purpose encode code |
-| keccak256_1kb | 1.01x | Near-parity; alloy uses tiny-keccak (Rust) |
-| keccak256_4kb | 1.03x | Near-parity; alloy uses tiny-keccak (Rust) |
+| secp256k1_sign | 3.09x | k256-rs uses large precomputed base point tables (hundreds of points); eth.zig uses 16-point GLV tables. Both constant-time for signing. |
+| secp256k1_sign_recover | 2.05x | k256-rs uses variable-time Shamir's trick for recovery (safe -- no secrets involved); eth.zig uses conservative constant-time path |
+| address_from_hex | 1.33x | alloy uses SIMD hex parsing; eth.zig uses scalar loop |
+| u256_mulDiv | 1.20x | ruint's reciprocal-based division vs eth.zig's Knuth Algorithm D |
 
 ## Reproducing
 
diff --git a/bench/alloy-bench/Cargo.toml b/bench/alloy-bench/Cargo.toml
index 746204b..87b25e9 100644
--- a/bench/alloy-bench/Cargo.toml
+++ b/bench/alloy-bench/Cargo.toml
@@ -19,3 +19,7 @@ criterion = { version = "0.5", features = ["html_reports"] }
 [[bench]]
 name = "eth_comparison"
 harness = false
+
+[[bench]]
+name = "u256_comparison"
+harness = false
diff --git a/bench/alloy-bench/benches/eth_comparison.rs b/bench/alloy-bench/benches/eth_comparison.rs
index 81577fe..6f1017f 100644
--- a/bench/alloy-bench/benches/eth_comparison.rs
+++ b/bench/alloy-bench/benches/eth_comparison.rs
@@ -300,7 +300,8 @@ fn bench_u256(c: &mut Criterion) {
     // mulDiv: (a * b) / c with full 512-bit intermediate (FullMath.mulDiv)
     group.bench_function("mulDiv", |b| {
         let liquidity = ONE_ETH;
-        let sqrt_price = U256::from_limbs([0, 79228162514264337593543950336u128 as u64, (79228162514264337593543950336u128 >> 64) as u64, 0]);
+        // Q96 = 2^96 = 79228162514264337593543950336
+        let sqrt_price = U256::from(79228162514264337593543950336u128);
         let denom = ONE_ETH + U256::from(1_000_000u64);
         b.iter(|| {
             // True 512-bit intermediate: widen to U512, multiply, divide, narrow back
@@ -319,7 +320,8 @@ fn bench_u256(c: &mut Criterion) {
     // that real swaps hit for typical pool parameters.
     group.bench_function("uniswap_v4_swap", |b| {
         let liquidity = ONE_ETH;
-        let sqrt_price = U256::from_limbs([0, 79228162514264337593543950336u128 as u64, (79228162514264337593543950336u128 >> 64) as u64, 0]);
+        // Q96 = 2^96 = 79228162514264337593543950336
+        let sqrt_price = U256::from(79228162514264337593543950336u128);
         let amount_in = U256::from(1_000_000_000_000_000u64);
 
         b.iter(|| {
diff --git a/bench/alloy-bench/benches/u256_comparison.rs b/bench/alloy-bench/benches/u256_comparison.rs
new file mode 100644
index 0000000..8bcaeb6
--- /dev/null
+++ b/bench/alloy-bench/benches/u256_comparison.rs
@@ -0,0 +1,148 @@
+//! u256-only benchmark: eth.zig vs alloy.rs (ruint)
+//!
+//! All test values match bench/u256_bench.zig exactly.
+//! alloy's U256 is ruint::Uint<256, 4> -- native [u64; 4] limb arithmetic.
+
+use alloy_primitives::{U256, Uint};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+
+type U512 = Uint<512, 8>;
+
+// ================================================================
+// Test values -- identical to u256_bench.zig
+// ================================================================
+
+const ONE_ETH: U256 = U256::from_limbs([1_000_000_000_000_000_000u64, 0, 0, 0]);
+
+// 100 ETH = 100_000_000_000_000_000_000 = 0x56BC75E2D63100000
+const RESERVE_IN: U256 = U256::from_limbs([0x6BC75E2D63100000, 5, 0, 0]);
+const RESERVE_OUT: U256 = U256::from_limbs([200_000_000_000u64, 0, 0, 0]);
+
+// 2^96 = 79228162514264337593543950336
+const SQRT_PRICE: U256 = U256::from_limbs([0, 0x100000000, 0, 0]);
+const AMOUNT_IN_SMALL: U256 = U256::from_limbs([1_000_000_000_000_000u64, 0, 0, 0]);
+
+// Full-width 256-bit values
+const FULL_A: U256 = U256::from_limbs([
+    0x12345678_9ABCDEF0,
+    0xDEADBEEF_CAFEBABE,
+    0x12345678_9ABCDEF0,
+    0xDEADBEEF_CAFEBABE,
+]);
+const FULL_B: U256 = U256::from_limbs([
+    0xDEADBEEF_CAFEBABE,
+    0x12345678_9ABCDEF0,
+    0xDEADBEEF_CAFEBABE,
+    0x12345678_9ABCDEF0,
+]);
+const FULL_C: U256 = U256::from_limbs([
+    0x00000000_00000001,
+    0x00000000_00000000,
+    0x00000000_00000000,
+    0x00000001_00000000,
+]);
+
+// ================================================================
+// Benchmarks
+// ================================================================
+
+fn bench_u256(c: &mut Criterion) {
+    let mut group = c.benchmark_group("u256");
+
+    // --- Primitives ---
+
+    group.bench_function("add", |b| {
+        let a = ONE_ETH;
+        let b_val = U256::from(997_000_000_000_000_000u64);
+        b.iter(|| {
+            let result = black_box(a).wrapping_add(black_box(b_val));
+            black_box(result);
+        })
+    });
+
+    group.bench_function("mul_small", |b| {
+        let a = ONE_ETH;
+        b.iter(|| {
+            let result = black_box(a).wrapping_mul(U256::from(997u64));
+            black_box(result);
+        })
+    });
+
+    group.bench_function("mul_full", |b| {
+        b.iter(|| {
+            let result = black_box(FULL_A).wrapping_mul(black_box(FULL_B));
+            black_box(result);
+        })
+    });
+
+    group.bench_function("div_small", |b| {
+        let large = U256::from(997_000_000_000_000_000_000u128);
+        b.iter(|| {
+            let result = black_box(large) / black_box(ONE_ETH);
+            black_box(result);
+        })
+    });
+
+    group.bench_function("div_full", |b| {
+        b.iter(|| {
+            let result = black_box(FULL_A) / black_box(FULL_C);
+            black_box(result);
+        })
+    });
+
+    // --- UniswapV2 getAmountOut (naive: step-by-step u256 arithmetic) ---
+    // Both Zig and Rust do the exact same formula with wrapping u256 ops.
+    // This is the fair apples-to-apples comparison.
+
+    group.bench_function("uniswapv2_naive", |b| {
+        let amount_in = ONE_ETH;
+        let reserve_in = RESERVE_IN;
+        let reserve_out = RESERVE_OUT;
+        b.iter(|| {
+            let amount_in_with_fee = black_box(amount_in).wrapping_mul(U256::from(997u64));
+            let numerator = amount_in_with_fee.wrapping_mul(black_box(reserve_out));
+            let denominator =
+                black_box(reserve_in).wrapping_mul(U256::from(1000u64)).wrapping_add(amount_in_with_fee);
+            let amount_out = numerator / denominator;
+            black_box(amount_out);
+        })
+    });
+
+    // --- mulDiv: (a * b) / c with true 512-bit intermediate ---
+
+    group.bench_function("mulDiv", |b| {
+        let liquidity = ONE_ETH;
+        let sqrt_price = SQRT_PRICE;
+        let denom = ONE_ETH.wrapping_add(U256::from(1_000_000u64));
+        b.iter(|| {
+            let a = U512::from(black_box(liquidity));
+            let b_val = U512::from(black_box(sqrt_price));
+            let d = U512::from(black_box(denom));
+            let result = U256::from((a * b_val) / d);
+            black_box(result);
+        })
+    });
+
+    // --- UniswapV4 getNextSqrtPriceFromAmount0RoundingUp (simplified: truncating division) ---
+    // product = amount_in * sqrt_price (u256, no overflow for these values)
+    // denominator = liquidity + product
+    // next_sqrt_price = (liquidity * sqrt_price) / denominator  (via U512)
+
+    group.bench_function("uniswapv4_swap", |b| {
+        let liquidity = ONE_ETH;
+        let sqrt_price = SQRT_PRICE;
+        let amount_in = AMOUNT_IN_SMALL;
+        b.iter(|| {
+            let product = black_box(amount_in).wrapping_mul(black_box(sqrt_price));
+            let denominator = black_box(liquidity).wrapping_add(product);
+            let num = U512::from(black_box(liquidity)) * U512::from(black_box(sqrt_price));
+            let next_sqrt_price = U256::from(num / U512::from(denominator));
+            black_box(next_sqrt_price);
+        })
+    });
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_u256);
+criterion_main!(benches);
diff --git a/bench/bench.zig b/bench/bench.zig
index ba888c1..5402b5f 100644
--- a/bench/bench.zig
+++ b/bench/bench.zig
@@ -1,6 +1,5 @@
 const std = @import("std");
 const eth = @import("eth");
-const zbench = @import("zbench");
 
 // ============================================================================
 // Test data (Anvil account 0 -- well-known test key)
@@ -46,35 +45,87 @@ var precomputed_abi_dynamic: []const u8 = &.{};
 var precomputed_rlp_u256: []const u8 = &.{};
 var precomputed_pubkey: [65]u8 = undefined;
 
+// ============================================================================
+// Benchmark harness -- criterion-style: calibrate batch, measure wall time
+// ============================================================================
+
+const WARMUP_NS: u64 = 500_000_000; // 0.5s warmup
+const BENCH_NS: u64 = 2_000_000_000; // 2s measurement
+
+const Timer = std.time.Timer;
+
+const BenchResult = struct {
+    ns_per_op: u64,
+    iters: u64,
+};
+
+fn runBench(comptime func: fn () void) BenchResult {
+    var timer = Timer.start() catch @panic("timer unsupported");
+
+    // Warmup: run until WARMUP_NS elapsed
+    timer.reset();
+    while (true) {
+        inline for (0..64) |_| func();
+        if (timer.read() >= WARMUP_NS) break;
+    }
+
+    // Calibrate: find iteration count that fills ~100ms
+    var batch: u64 = 64;
+    while (true) {
+        timer.reset();
+        for (0..batch) |_| func();
+        if (timer.read() >= 100_000_000) break; // 100ms
+        batch *= 2;
+    }
+
+    // Measure: run calibrated batches until BENCH_NS elapses, counting iterations
+    var total_iters: u64 = 0;
+    timer.reset();
+
+    while (timer.read() < BENCH_NS) {
+        for (0..batch) |_| func();
+        total_iters += batch;
+    }
+
+    const total_ns = timer.read();
+    const ns_per_op = if (total_iters > 0) total_ns / total_iters else 0;
+    return .{ .ns_per_op = ns_per_op, .iters = total_iters };
+}
+
+fn runAndPrint(comptime name: []const u8, comptime func: fn () void, stdout: anytype) !void {
+    const result = runBench(func);
+    try stdout.print("{s:<34} {d:>9} ns {d:>14}\n", .{ name, result.ns_per_op, result.iters });
+}
+
 // ============================================================================
 // Benchmark functions -- Keccak256
 // ============================================================================
 
-fn benchKeccakEmpty(_: std.mem.Allocator) void {
+fn benchKeccakEmpty() void {
     const data: [0]u8 = .{};
     const result = eth.keccak.hash(&data);
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchKeccak32(_: std.mem.Allocator) void {
+fn benchKeccak32() void {
     const data: [32]u8 = TEST_MSG_HASH;
     const result = eth.keccak.hash(&data);
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchKeccak256b(_: std.mem.Allocator) void {
+fn benchKeccak256b() void {
     const data: [256]u8 = .{0xAB} ** 256;
     const result = eth.keccak.hash(&data);
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchKeccak1k(_: std.mem.Allocator) void {
+fn benchKeccak1k() void {
     const data: [1024]u8 = .{0xAB} ** 1024;
     const result = eth.keccak.hash(&data);
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchKeccak4k(_: std.mem.Allocator) void {
+fn benchKeccak4k() void {
     const data: [4096]u8 = .{0xAB} ** 4096;
     const result = eth.keccak.hash(&data);
     std.mem.doNotOptimizeAway(&result);
@@ -84,12 +135,12 @@ fn benchKeccak4k(_: std.mem.Allocator) void {
 // Benchmark functions -- secp256k1
 // ============================================================================
 
-fn benchSecp256k1Sign(_: std.mem.Allocator) void {
+fn benchSecp256k1Sign() void {
     const sig = eth.secp256k1.sign(TEST_PRIVKEY, TEST_MSG_HASH) catch unreachable;
     std.mem.doNotOptimizeAway(&sig);
 }
 
-fn benchSecp256k1Recover(_: std.mem.Allocator) void {
+fn benchSecp256k1Recover() void {
     const sig = eth.secp256k1.sign(TEST_PRIVKEY, TEST_MSG_HASH) catch unreachable;
     const pubkey = eth.secp256k1.recover(sig, TEST_MSG_HASH) catch unreachable;
     std.mem.doNotOptimizeAway(&pubkey);
@@ -99,19 +150,19 @@ fn benchSecp256k1Recover(_: std.mem.Allocator) void {
 // Benchmark functions -- Address
 // ============================================================================
 
-fn benchAddressDerivation(_: std.mem.Allocator) void {
+fn benchAddressDerivation() void {
     const addr = eth.secp256k1.pubkeyToAddress(precomputed_pubkey);
     std.mem.doNotOptimizeAway(&addr);
 }
 
-fn benchAddressFromHex(_: std.mem.Allocator) void {
+fn benchAddressFromHex() void {
     var hex_str: []const u8 = "0xf39Fd6e51aad88F6F4ce6aB8827279cffFb92266";
     std.mem.doNotOptimizeAway(&hex_str);
     const addr = eth.primitives.addressFromHex(hex_str) catch unreachable;
     std.mem.doNotOptimizeAway(&addr);
 }
 
-fn benchChecksumAddress(_: std.mem.Allocator) void {
+fn benchChecksumAddress() void {
     const addr = TEST_ADDR;
     const checksum = eth.primitives.addressToChecksum(&addr);
     std.mem.doNotOptimizeAway(&checksum);
@@ -121,27 +172,34 @@ fn benchChecksumAddress(_: std.mem.Allocator) void {
 // Benchmark functions -- ABI encoding
 // ============================================================================
 
-fn benchAbiEncodeTransfer(allocator: std.mem.Allocator) void {
+fn benchAbiEncodeTransfer() void {
+    var buf: [4096]u8 = undefined;
+    var fba = std.heap.FixedBufferAllocator.init(&buf);
+    const alloc = fba.allocator();
     const args = [_]eth.abi_encode.AbiValue{
         .{ .address = TEST_ADDR },
         .{ .uint256 = 1_000_000_000_000_000_000 },
     };
-    const result = eth.abi_encode.encodeFunctionCall(allocator, TRANSFER_SELECTOR, &args) catch unreachable;
-    defer allocator.free(result);
+    const result = eth.abi_encode.encodeFunctionCall(alloc, TRANSFER_SELECTOR, &args) catch unreachable;
     std.mem.doNotOptimizeAway(result.ptr);
 }
 
-fn benchAbiEncodeStatic(allocator: std.mem.Allocator) void {
+fn benchAbiEncodeStatic() void {
+    var buf: [4096]u8 = undefined;
+    var fba = std.heap.FixedBufferAllocator.init(&buf);
+    const alloc = fba.allocator();
     const args = [_]eth.abi_encode.AbiValue{
         .{ .address = TEST_ADDR },
         .{ .uint256 = 1_000_000_000_000_000_000 },
     };
-    const result = eth.abi_encode.encodeValues(allocator, &args) catch unreachable;
-    defer allocator.free(result);
+    const result = eth.abi_encode.encodeValues(alloc, &args) catch unreachable;
     std.mem.doNotOptimizeAway(result.ptr);
 }
 
-fn benchAbiEncodeDynamic(allocator: std.mem.Allocator) void {
+fn benchAbiEncodeDynamic() void {
+    var buf: [4096]u8 = undefined;
+    var fba = std.heap.FixedBufferAllocator.init(&buf);
+    const alloc = fba.allocator();
     const array_items = [_]eth.abi_encode.AbiValue{
         .{ .uint256 = 1 },
         .{ .uint256 = 2 },
@@ -154,8 +212,7 @@ fn benchAbiEncodeDynamic(allocator: std.mem.Allocator) void {
         .{ .bytes = "hello world, this is a dynamic bytes benchmark test payload" },
         .{ .array = &array_items },
     };
-    const result = eth.abi_encode.encodeValues(allocator, &args) catch unreachable;
-    defer allocator.free(result);
+    const result = eth.abi_encode.encodeValues(alloc, &args) catch unreachable;
     std.mem.doNotOptimizeAway(result.ptr);
 }
 
@@ -163,7 +220,10 @@ fn benchAbiEncodeDynamic(allocator: std.mem.Allocator) void {
 // Benchmark functions -- ABI decoding
 // ============================================================================
 
-fn benchAbiDecodeUint256(allocator: std.mem.Allocator) void {
+fn benchAbiDecodeUint256() void {
+    var buf: [4096]u8 = undefined;
+    var fba = std.heap.FixedBufferAllocator.init(&buf);
+    const alloc = fba.allocator();
     const encoded: [32]u8 = .{
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -171,15 +231,16 @@ fn benchAbiDecodeUint256(allocator: std.mem.Allocator) void {
         0x0D, 0xE0, 0xB6, 0xB3, 0xA7, 0x64, 0x00, 0x00,
     };
     const types = [_]eth.abi_types.AbiType{.uint256};
-    const values = eth.abi_decode.decodeValues(&encoded, &types, allocator) catch unreachable;
-    defer eth.abi_decode.freeValues(values, allocator);
+    const values = eth.abi_decode.decodeValues(&encoded, &types, alloc) catch unreachable;
     std.mem.doNotOptimizeAway(values.ptr);
 }
 
-fn benchAbiDecodeDynamic(allocator: std.mem.Allocator) void {
+fn benchAbiDecodeDynamic() void {
+    var buf: [4096]u8 = undefined;
+    var fba = std.heap.FixedBufferAllocator.init(&buf);
+    const alloc = fba.allocator();
     const types = [_]eth.abi_types.AbiType{ .string, .bytes };
-    const values = eth.abi_decode.decodeValues(precomputed_abi_dynamic, &types, allocator) catch unreachable;
-    defer eth.abi_decode.freeValues(values, allocator);
+    const values = eth.abi_decode.decodeValues(precomputed_abi_dynamic, &types, alloc) catch unreachable;
     std.mem.doNotOptimizeAway(values.ptr);
 }
 
@@ -187,7 +248,10 @@ fn benchAbiDecodeDynamic(allocator: std.mem.Allocator) void {
 // Benchmark functions -- RLP
 // ============================================================================
 
-fn benchRlpEncodeTx(allocator: std.mem.Allocator) void {
+fn benchRlpEncodeTx() void {
+    var buf: [4096]u8 = undefined;
+    var fba = std.heap.FixedBufferAllocator.init(&buf);
+    const alloc = fba.allocator();
     const tx = eth.transaction.Transaction{
         .eip1559 = .{
             .chain_id = 1,
@@ -201,12 +265,11 @@ fn benchRlpEncodeTx(allocator: std.mem.Allocator) void {
             .access_list = &.{},
         },
     };
-    const serialized = eth.transaction.serializeForSigning(allocator, tx) catch unreachable;
-    defer allocator.free(serialized);
+    const serialized = eth.transaction.serializeForSigning(alloc, tx) catch unreachable;
     std.mem.doNotOptimizeAway(serialized.ptr);
 }
 
-fn benchRlpDecodeU256(_: std.mem.Allocator) void {
+fn benchRlpDecodeU256() void {
     const decoded = eth.rlp.decode(u256, precomputed_rlp_u256) catch unreachable;
     std.mem.doNotOptimizeAway(&decoded.value);
 }
@@ -215,7 +278,7 @@ fn benchRlpDecodeU256(_: std.mem.Allocator) void {
 // Benchmark functions -- u256 arithmetic
 // ============================================================================
 
-fn benchU256Add(_: std.mem.Allocator) void {
+fn benchU256Add() void {
     var a: u256 = 1_000_000_000_000_000_000;
     var b: u256 = 997_000_000_000_000_000;
     std.mem.doNotOptimizeAway(&a);
@@ -224,7 +287,7 @@ fn benchU256Add(_: std.mem.Allocator) void {
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchU256Mul(_: std.mem.Allocator) void {
+fn benchU256Mul() void {
     var a: u256 = 1_000_000_000_000_000_000;
     var b: u256 = 997;
     std.mem.doNotOptimizeAway(&a);
@@ -233,7 +296,7 @@ fn benchU256Mul(_: std.mem.Allocator) void {
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchU256Div(_: std.mem.Allocator) void {
+fn benchU256Div() void {
     var a: u256 = 997_000_000_000_000_000_000;
     var b: u256 = 1_000_000_000_000_000_000;
     std.mem.doNotOptimizeAway(&a);
@@ -242,7 +305,7 @@ fn benchU256Div(_: std.mem.Allocator) void {
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchU256UniswapV2AmountOut(_: std.mem.Allocator) void {
+fn benchU256UniswapV2AmountOut() void {
     var amount_in: u256 = 1_000_000_000_000_000_000; // 1 ETH
     var reserve_in: u256 = 100_000_000_000_000_000_000; // 100 ETH
     var reserve_out: u256 = 200_000_000_000; // 200k USDC (6 decimals)
@@ -258,7 +321,7 @@ fn benchU256UniswapV2AmountOut(_: std.mem.Allocator) void {
     std.mem.doNotOptimizeAway(&amount_out);
 }
 
-fn benchU256MulDiv(_: std.mem.Allocator) void {
+fn benchU256MulDiv() void {
     var a: u256 = 1_000_000_000_000_000_000;
     var b: u256 = 79228162514264337593543950336;
     var c: u256 = 1_000_000_000_000_001_000;
@@ -272,7 +335,7 @@ fn benchU256MulDiv(_: std.mem.Allocator) void {
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchU256UniswapV4Swap(_: std.mem.Allocator) void {
+fn benchU256UniswapV4Swap() void {
     var liquidity: u256 = 1_000_000_000_000_000_000;
     var sqrt_price: u256 = 79228162514264337593543950336;
     var amount_in: u256 = 1_000_000_000_000_000;
@@ -293,13 +356,13 @@ fn benchU256UniswapV4Swap(_: std.mem.Allocator) void {
 // Benchmark functions -- Hex
 // ============================================================================
 
-fn benchHexEncode32(_: std.mem.Allocator) void {
+fn benchHexEncode32() void {
     const data: [32]u8 = TEST_MSG_HASH;
     const result = eth.hex.bytesToHexBuf(32, &data);
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchHexDecode32(_: std.mem.Allocator) void {
+fn benchHexDecode32() void {
     var hex_str: []const u8 = "c5d2460186f7233c927e7db2dcc703c0e500b653ca82273b7bfad8045d85a470";
     std.mem.doNotOptimizeAway(&hex_str);
     var buf: [32]u8 = undefined;
@@ -311,7 +374,10 @@ fn benchHexDecode32(_: std.mem.Allocator) void {
 // Benchmark functions -- Transaction
 // ============================================================================
 
-fn benchTxHashEip1559(allocator: std.mem.Allocator) void {
+fn benchTxHashEip1559() void {
+    var buf: [4096]u8 = undefined;
+    var fba = std.heap.FixedBufferAllocator.init(&buf);
+    const alloc = fba.allocator();
     const tx = eth.transaction.Transaction{
         .eip1559 = .{
             .chain_id = 1,
@@ -325,7 +391,7 @@ fn benchTxHashEip1559(allocator: std.mem.Allocator) void {
             .access_list = &.{},
         },
     };
-    const hash = eth.transaction.hashForSigning(allocator, tx) catch unreachable;
+    const hash = eth.transaction.hashForSigning(alloc, tx) catch unreachable;
     std.mem.doNotOptimizeAway(&hash);
 }
 
@@ -333,7 +399,7 @@ fn benchTxHashEip1559(allocator: std.mem.Allocator) void {
 // Benchmark functions -- HD Wallet
 // ============================================================================
 
-fn benchHdWalletDerive10(_: std.mem.Allocator) void {
+fn benchHdWalletDerive10() void {
     const master = eth.hd_wallet.masterKeyFromSeed(TEST_SEED) catch unreachable;
     for (0..10) |i| {
         const child = eth.hd_wallet.deriveChild(master, @intCast(i)) catch unreachable;
@@ -345,7 +411,11 @@ fn benchHdWalletDerive10(_: std.mem.Allocator) void {
 // Benchmark functions -- EIP-712
 // ============================================================================
 
-fn benchEip712Hash(allocator: std.mem.Allocator) void {
+fn benchEip712Hash() void {
+    var buf: [8192]u8 = undefined;
+    var fba = std.heap.FixedBufferAllocator.init(&buf);
+    const alloc = fba.allocator();
+
     const domain = eth.eip712.DomainSeparator{
         .name = "TestDApp",
         .version = "1",
@@ -370,7 +440,7 @@ fn benchEip712Hash(allocator: std.mem.Allocator) void {
     };
 
     const result = eth.eip712.hashTypedData(
-        allocator,
+        alloc,
         domain,
         message,
         &.{transfer_type},
@@ -400,51 +470,53 @@ pub fn main() !void {
 
     precomputed_pubkey = eth.secp256k1.derivePublicKey(TEST_PRIVKEY) catch unreachable;
 
-    var bench = zbench.Benchmark.init(allocator, .{});
-    defer bench.deinit();
+    var out_buf: [8192]u8 = undefined;
+    var w = std.fs.File.stdout().writer(&out_buf);
+    const stdout = &w.interface;
+
+    try stdout.print("\n{s:<34} {s:>12} {s:>14}\n", .{ "Benchmark", "ns/op", "iters" });
+    try stdout.print("{s}\n", .{"-" ** 64});
 
     // Keccak256
-    try bench.add("keccak256_empty", benchKeccakEmpty, .{});
-    try bench.add("keccak256_32b", benchKeccak32, .{});
-    try bench.add("keccak256_256b", benchKeccak256b, .{});
-    try bench.add("keccak256_1kb", benchKeccak1k, .{});
-    try bench.add("keccak256_4kb", benchKeccak4k, .{});
+    try runAndPrint("keccak256_empty", benchKeccakEmpty, stdout);
+    try runAndPrint("keccak256_32b", benchKeccak32, stdout);
+    try runAndPrint("keccak256_256b", benchKeccak256b, stdout);
+    try runAndPrint("keccak256_1kb", benchKeccak1k, stdout);
+    try runAndPrint("keccak256_4kb", benchKeccak4k, stdout);
     // secp256k1
-    try bench.add("secp256k1_sign", benchSecp256k1Sign, .{});
-    try bench.add("secp256k1_sign_recover", benchSecp256k1Recover, .{});
+    try runAndPrint("secp256k1_sign", benchSecp256k1Sign, stdout);
+    try runAndPrint("secp256k1_sign_recover", benchSecp256k1Recover, stdout);
     // Address
-    try bench.add("address_derivation", benchAddressDerivation, .{});
-    try bench.add("address_from_hex", benchAddressFromHex, .{});
-    try bench.add("checksum_address", benchChecksumAddress, .{});
+    try runAndPrint("address_derivation", benchAddressDerivation, stdout);
+    try runAndPrint("address_from_hex", benchAddressFromHex, stdout);
+    try runAndPrint("checksum_address", benchChecksumAddress, stdout);
     // ABI encoding
-    try bench.add("abi_encode_transfer", benchAbiEncodeTransfer, .{});
-    try bench.add("abi_encode_static", benchAbiEncodeStatic, .{});
-    try bench.add("abi_encode_dynamic", benchAbiEncodeDynamic, .{});
+    try runAndPrint("abi_encode_transfer", benchAbiEncodeTransfer, stdout);
+    try runAndPrint("abi_encode_static", benchAbiEncodeStatic, stdout);
+    try runAndPrint("abi_encode_dynamic", benchAbiEncodeDynamic, stdout);
     // ABI decoding
-    try bench.add("abi_decode_uint256", benchAbiDecodeUint256, .{});
-    try bench.add("abi_decode_dynamic", benchAbiDecodeDynamic, .{});
+    try runAndPrint("abi_decode_uint256", benchAbiDecodeUint256, stdout);
+    try runAndPrint("abi_decode_dynamic", benchAbiDecodeDynamic, stdout);
     // RLP
-    try bench.add("rlp_encode_eip1559_tx", benchRlpEncodeTx, .{});
-    try bench.add("rlp_decode_u256", benchRlpDecodeU256, .{});
+    try runAndPrint("rlp_encode_eip1559_tx", benchRlpEncodeTx, stdout);
+    try runAndPrint("rlp_decode_u256", benchRlpDecodeU256, stdout);
     // u256 arithmetic
-    try bench.add("u256_add", benchU256Add, .{});
-    try bench.add("u256_mul", benchU256Mul, .{});
-    try bench.add("u256_div", benchU256Div, .{});
-    try bench.add("u256_uniswapv2_amount_out", benchU256UniswapV2AmountOut, .{});
-    try bench.add("u256_mulDiv", benchU256MulDiv, .{});
-    try bench.add("u256_uniswapv4_swap", benchU256UniswapV4Swap, .{});
+    try runAndPrint("u256_add", benchU256Add, stdout);
+    try runAndPrint("u256_mul", benchU256Mul, stdout);
+    try runAndPrint("u256_div", benchU256Div, stdout);
+    try runAndPrint("u256_uniswapv2_amount_out", benchU256UniswapV2AmountOut, stdout);
+    try runAndPrint("u256_mulDiv", benchU256MulDiv, stdout);
+    try runAndPrint("u256_uniswapv4_swap", benchU256UniswapV4Swap, stdout);
     // Hex
-    try bench.add("hex_encode_32b", benchHexEncode32, .{});
-    try bench.add("hex_decode_32b", benchHexDecode32, .{});
+    try runAndPrint("hex_encode_32b", benchHexEncode32, stdout);
+    try runAndPrint("hex_decode_32b", benchHexDecode32, stdout);
     // Transaction
-    try bench.add("tx_hash_eip1559", benchTxHashEip1559, .{});
+    try runAndPrint("tx_hash_eip1559", benchTxHashEip1559, stdout);
     // HD Wallet
-    try bench.add("hd_wallet_derive_10", benchHdWalletDerive10, .{});
+    try runAndPrint("hd_wallet_derive_10", benchHdWalletDerive10, stdout);
     // EIP-712
-    try bench.add("eip712_hash_typed_data", benchEip712Hash, .{});
+    try runAndPrint("eip712_hash_typed_data", benchEip712Hash, stdout);
 
-    var buf: [16384]u8 = undefined;
-    var w = std.fs.File.stdout().writer(&buf);
-    try bench.run(&w.interface);
-    try w.interface.flush();
+    try stdout.print("\n", .{});
+    try stdout.flush();
 }
diff --git a/bench/compare.sh b/bench/compare.sh
index 3e02dbf..821b7c5 100755
--- a/bench/compare.sh
+++ b/bench/compare.sh
@@ -19,7 +19,7 @@ echo ""
 # -- Step 1: Run eth-zig benchmarks --
 echo "[1/3] Running eth-zig benchmarks (ReleaseFast)..."
 ZIG_OUTPUT=$(cd "$ROOT_DIR" && zig build bench 2>&1)
-echo "$ZIG_OUTPUT" | grep -v "^BENCH_JSON"
+echo "$ZIG_OUTPUT"
 echo ""
 
 # -- Step 2: Run alloy.rs benchmarks --
@@ -44,12 +44,12 @@ import re
 zig_output = sys.argv[1]
 rust_output = sys.argv[2]
 
-# Parse eth-zig BENCH_JSON lines
+# Parse custom harness output: "name                   NNN ns       NNNN"
 zig_ns = {}
 for line in zig_output.split('\n'):
-    if line.startswith('BENCH_JSON|'):
-        data = json.loads(line[len('BENCH_JSON|'):])
-        zig_ns[data['name']] = data['ns_per_op']
+    m = re.match(r'^(\S+)\s+(\d+)\s+ns', line)
+    if m:
+        zig_ns[m.group(1)] = int(m.group(2))
 
 # Parse criterion output
 alloy_ns = {}
diff --git a/bench/compare_u256.sh b/bench/compare_u256.sh
new file mode 100755
index 0000000..05831db
--- /dev/null
+++ b/bench/compare_u256.sh
@@ -0,0 +1,155 @@
+#!/usr/bin/env bash
+set -euo pipefail  # stop on any failing command, unset variable, or failing pipeline stage
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"  # absolute directory containing this script
+ROOT_DIR="$(dirname "$SCRIPT_DIR")"  # parent of SCRIPT_DIR; `zig build` is run from here
+ALLOY_DIR="$SCRIPT_DIR/alloy-bench"  # `cargo bench` is run from here
+
+command -v zig >/dev/null 2>&1 || { echo "ERROR: zig not found"; exit 1; }  # all three tools must be on PATH
+command -v cargo >/dev/null 2>&1 || { echo "ERROR: cargo not found"; exit 1; }
+command -v python3 >/dev/null 2>&1 || { echo "ERROR: python3 not found"; exit 1; }
+
+echo ""
+echo "================================================================"
+echo "  u256 Benchmark: eth.zig vs alloy.rs (ruint)"
+echo "================================================================"
+echo ""
+
+# -- Step 1: run the `bench-u256` build step and capture stdout+stderr --
+echo "[1/3] Running eth-zig u256 benchmarks (ReleaseFast)..."
+ZIG_OUTPUT=$(cd "$ROOT_DIR" && zig build bench-u256 2>&1)
+echo "$ZIG_OUTPUT" | grep -v "^BENCH_JSON"  # show everything except the BENCH_JSON lines, which are parsed in step 3
+echo ""
+
+# -- Step 2: run the `u256_comparison` criterion bench and capture stdout+stderr --
+echo "[2/3] Running alloy.rs u256 benchmarks (cargo bench --release)..."
+RUST_OUTPUT=$(cd "$ALLOY_DIR" && cargo bench --bench u256_comparison 2>&1)
+echo "  Done."
+echo ""
+
+# -- Step 3: hand both captured outputs to the embedded Python script as argv[1] and argv[2] --
+echo "[3/3] Comparing results..."
+echo ""
+
+python3 - "$ZIG_OUTPUT" "$RUST_OUTPUT" << 'PYTHON_SCRIPT'
+import sys
+import json
+import re
+
+zig_output = sys.argv[1]
+rust_output = sys.argv[2]
+
+# Nanoseconds per criterion time unit. criterion switches to ps below 1 ns and
+# to s above 1000 ms; 'us' is kept as an ASCII spelling of 'µs'.
+UNIT_TO_NS = {'ps': 0.001, 'ns': 1, 'us': 1_000, 'µs': 1_000, 'ms': 1_000_000, 's': 1_000_000_000}
+
+def parse_ns(value_str, unit_str):
+    """Convert a (value, unit) pair from criterion output to whole nanoseconds; unknown units are read as ns."""
+    return round(float(value_str) * UNIT_TO_NS.get(unit_str, 1))
+
+# Parse BENCH_JSON lines from Zig output ("BENCH_JSON|{...}", one object per
+# line); lines that are not valid JSON or lack a key are skipped.
+zig_ns = {}
+for line in zig_output.split('\n'):
+    if line.startswith('BENCH_JSON|'):
+        try:
+            data = json.loads(line[len('BENCH_JSON|'):])
+            zig_ns[data['name']] = data['ns_per_op']
+        except (json.JSONDecodeError, KeyError):
+            pass
+
+# Parse criterion output: "<group>/<name>  time:   [<low> <unit> <mid> <unit> <high> <unit>]".
+# The middle estimate is recorded, converted to nanoseconds.
+alloy_ns = {}
+for line in rust_output.split('\n'):
+    m = re.match(r'^([a-zA-Z0-9_]+/[a-zA-Z0-9_]+)\s+time:\s+\[[\d.]+ \w+\s+([\d.]+)\s+(ps|ns|us|µs|ms|s)\b', line.strip())
+    if m:
+        alloy_ns[m.group(1)] = parse_ns(m.group(2), m.group(3))
+
+# Benchmarks to compare, in table order (names as emitted by the Zig harness).
+bench_order = [
+    'u256_add',
+    'u256_mul_small',
+    'u256_mul_full',
+    'u256_div_small',
+    'u256_div_full',
+    'u256_uniswapv2_naive',
+    'u256_mulDiv',
+    'u256_uniswapv4_swap',
+]
+
+# Name mapping: zig -> criterion.
+# Every Zig name is 'u256_<op>' and its criterion id is 'u256/<op>', so only
+# the first underscore is rewritten, e.g.:
+#   'u256_mul_small'       -> 'u256/mul_small'
+#   'u256_uniswapv2_naive' -> 'u256/uniswapv2_naive'
+#   'u256_mulDiv'          -> 'u256/mulDiv'
+name_map = {
+    zig_name: zig_name.replace('_', '/', 1)
+    for zig_name in bench_order
+}
+
+GREEN = '\033[0;32m'  # ANSI escape; used below for results where eth-zig is faster
+RED = '\033[0;31m'  # ANSI escape; used below for results where alloy.rs is faster
+YELLOW = '\033[0;33m'  # ANSI escape; used for the optimization speedup line
+BOLD = '\033[1m'  # ANSI escape; used for headings and the score line
+NC = '\033[0m'  # ANSI reset, emitted after every colored/bold span
+
+print(f"\n{BOLD}=== Apples-to-apples: same formula, same u256 ops ==={NC}\n")
+print(f"{BOLD}{'Benchmark':<28} {'eth-zig':>10} {'alloy.rs':>10} {'Result':>20}{NC}")  # column widths: 28 / 10 / 10 / 20
+print(f"{'-'*28} {'-'*10} {'-'*10} {'-'*20}")
+
+# Running tally across benchmarks measured by both harnesses.
+zig_wins = alloy_wins = ties = total = 0
+
+for zig_name in bench_order:
+    z = zig_ns.get(zig_name)
+    a = alloy_ns.get(name_map.get(zig_name, ''))
+
+    if z is None:
+        continue  # no Zig measurement: the row is omitted
+    if a is None:
+        print(f"{zig_name:<28} {z:>7} ns {'---':>10} {'(zig only)':>20}")
+        continue
+
+    total += 1
+    # Results within 10% of each other (relative to the larger) count as a tie.
+    within_tolerance = z > 0 and a > 0 and abs(z - a) / max(z, a) < 0.1
+    if z == a or within_tolerance:
+        label, color = 'tie', NC
+        ties += 1
+    elif z < a:
+        # 999.99 stands in for the ratio when the faster side measured 0 ns.
+        ratio = a / z if z > 0 else 999.99
+        label, color = f'zig {ratio:.2f}x', GREEN
+        zig_wins += 1
+    else:
+        ratio = z / a if a > 0 else 999.99
+        label, color = f'rs {ratio:.2f}x', RED
+        alloy_wins += 1
+    print(f"{zig_name:<28} {z:>7} ns {a:>7} ns {color}{label:>20}{NC}")
+
+print(f"\n{BOLD}{'='*28} {'='*10} {'='*10} {'='*20}{NC}")
+print(f"\n{BOLD}Score: eth-zig {zig_wins}/{total} | alloy.rs {alloy_wins}/{total} | tied {ties}/{total}{NC}")
+
+# Show the zig-only optimized benchmark next to the naive Zig and alloy.rs results
+z_opt = zig_ns.get('u256_uniswapv2_optimized')  # None when the Zig run did not report it
+z_naive = zig_ns.get('u256_uniswapv2_naive')
+a_naive = alloy_ns.get('u256/uniswapv2_naive')
+
+if z_opt is not None:  # the whole section is skipped without an optimized measurement
+    print(f"\n{BOLD}=== eth.zig compound limb optimization ==={NC}\n")
+    print(f"{'u256_uniswapv2_optimized':<28} {z_opt:>7} ns   (stays in [4]u64 limb space)")
+    if z_naive is not None and z_naive > 0:
+        print(f"{'u256_uniswapv2_naive':<28} {z_naive:>7} ns   (step-by-step u256, same as alloy)")
+        speedup = z_naive / z_opt if z_opt > 0 else 0  # reported as 0 when the optimized time rounded down to 0 ns
+        print(f"{'Optimization speedup':<28} {YELLOW}{speedup:.2f}x{NC}")
+    if a_naive is not None and z_opt > 0:
+        vs_rust = a_naive / z_opt  # > 1 means the optimized Zig path is faster than alloy.rs naive
+        if vs_rust >= 1:
+            print(f"{'vs alloy.rs naive':<28} {GREEN}{vs_rust:.2f}x faster{NC}")
+        else:
+            print(f"{'vs alloy.rs naive':<28} {RED}{1/vs_rust:.2f}x slower{NC}")  # NOTE(review): divides by zero if a_naive parsed as 0 ns
+
+print()
+PYTHON_SCRIPT
diff --git a/bench/keccak_compare.zig b/bench/keccak_compare.zig
index 41eafed..6bf69a6 100644
--- a/bench/keccak_compare.zig
+++ b/bench/keccak_compare.zig
@@ -1,6 +1,5 @@
 const std = @import("std");
 const eth = @import("eth");
-const zbench = @import("zbench");
 
 // Test data
 const DATA_32: [32]u8 = .{0xAB} ** 32;
@@ -8,85 +7,132 @@ const DATA_256: [256]u8 = .{0xAB} ** 256;
 const DATA_1K: [1024]u8 = .{0xAB} ** 1024;
 const DATA_4K: [4096]u8 = .{0xAB} ** 4096;
 
+// ============================================================================
+// Benchmark harness (same as bench.zig / u256_bench.zig)
+// ============================================================================
+
+const WARMUP_NS: u64 = 500_000_000; // 0.5 s: runBench's warmup loop runs until this much time has elapsed
+const BENCH_NS: u64 = 2_000_000_000; // 2 s: runBench's timed loop starts new batches until this much time has elapsed
+const Timer = std.time.Timer;
+
+const BenchResult = struct { ns_per_op: u64, iters: u64 }; // elapsed ns / iterations (integer division), and the iteration count
+
+fn runBench(comptime func: fn () void) BenchResult {
+    var clock = Timer.start() catch @panic("timer unsupported");
+
+    // Warmup: unrolled groups of 64 calls until WARMUP_NS has elapsed; nothing is recorded.
+    clock.reset();
+    while (true) {
+        inline for (0..64) |_| func();
+        if (clock.read() >= WARMUP_NS) break;
+    }
+
+    // Calibration: double the batch size (from 64) until one batch takes at least 100 ms.
+    var batch_size: u64 = 64;
+    while (true) : (batch_size *= 2) {
+        clock.reset();
+        for (0..batch_size) |_| func();
+        if (clock.read() >= 100_000_000) break;
+    }
+
+    // Measurement: whole batches until BENCH_NS has elapsed; the last batch may overrun.
+    var iterations: u64 = 0;
+    clock.reset();
+    while (clock.read() < BENCH_NS) : (iterations += batch_size) {
+        for (0..batch_size) |_| func();
+    }
+    const elapsed_ns = clock.read();
+    return .{ .ns_per_op = if (iterations == 0) 0 else elapsed_ns / iterations, .iters = iterations };
+}
+
+fn runAndPrint(comptime name: []const u8, comptime func: fn () void, stdout: anytype) !void {
+    const measured = runBench(func); // measure first, then emit one table row
+    return stdout.print("{s:<30} {d:>9} ns {d:>14}\n", .{ name, measured.ns_per_op, measured.iters });
+}
+
 // -- eth.zig keccak (lane-complementing optimized) --
 
-fn benchEthKeccakEmpty(_: std.mem.Allocator) void {
+fn benchEthKeccakEmpty() void {
     const r = eth.keccak.hash("");
     std.mem.doNotOptimizeAway(&r);
 }
 
-fn benchEthKeccak32(_: std.mem.Allocator) void {
+fn benchEthKeccak32() void {
     const r = eth.keccak.hash(&DATA_32);
     std.mem.doNotOptimizeAway(&r);
 }
 
-fn benchEthKeccak256(_: std.mem.Allocator) void {
+fn benchEthKeccak256() void {
     const r = eth.keccak.hash(&DATA_256);
     std.mem.doNotOptimizeAway(&r);
 }
 
-fn benchEthKeccak1k(_: std.mem.Allocator) void {
+fn benchEthKeccak1k() void {
     const r = eth.keccak.hash(&DATA_1K);
     std.mem.doNotOptimizeAway(&r);
 }
 
-fn benchEthKeccak4k(_: std.mem.Allocator) void {
+fn benchEthKeccak4k() void {
     const r = eth.keccak.hash(&DATA_4K);
     std.mem.doNotOptimizeAway(&r);
 }
 
 // -- Zig stdlib keccak --
 
-fn benchStdlibKeccakEmpty(_: std.mem.Allocator) void {
+fn benchStdlibKeccakEmpty() void {
     var result: [32]u8 = undefined;
     std.crypto.hash.sha3.Keccak256.hash("", &result, .{});
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchStdlibKeccak32(_: std.mem.Allocator) void {
+fn benchStdlibKeccak32() void {
     var result: [32]u8 = undefined;
     std.crypto.hash.sha3.Keccak256.hash(&DATA_32, &result, .{});
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchStdlibKeccak256(_: std.mem.Allocator) void {
+fn benchStdlibKeccak256() void {
     var result: [32]u8 = undefined;
     std.crypto.hash.sha3.Keccak256.hash(&DATA_256, &result, .{});
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchStdlibKeccak1k(_: std.mem.Allocator) void {
+fn benchStdlibKeccak1k() void {
     var result: [32]u8 = undefined;
     std.crypto.hash.sha3.Keccak256.hash(&DATA_1K, &result, .{});
     std.mem.doNotOptimizeAway(&result);
 }
 
-fn benchStdlibKeccak4k(_: std.mem.Allocator) void {
+fn benchStdlibKeccak4k() void {
     var result: [32]u8 = undefined;
     std.crypto.hash.sha3.Keccak256.hash(&DATA_4K, &result, .{});
     std.mem.doNotOptimizeAway(&result);
 }
 
 pub fn main() !void {
-    var bench = zbench.Benchmark.init(std.heap.page_allocator, .{});
-    defer bench.deinit();
+    var out_buf: [8192]u8 = undefined;
+    var w = std.fs.File.stdout().writer(&out_buf);
+    const stdout = &w.interface;
+
+    try stdout.print("\n{s:<30} {s:>12} {s:>14}\n", .{ "Benchmark", "ns/op", "iters" });
+    try stdout.print("{s}\n", .{"-" ** 60});
 
     // eth.zig
-    try bench.add("eth.zig keccak empty", benchEthKeccakEmpty, .{});
-    try bench.add("eth.zig keccak 32b", benchEthKeccak32, .{});
-    try bench.add("eth.zig keccak 256b", benchEthKeccak256, .{});
-    try bench.add("eth.zig keccak 1kb", benchEthKeccak1k, .{});
-    try bench.add("eth.zig keccak 4kb", benchEthKeccak4k, .{});
+    try runAndPrint("eth.zig keccak empty", benchEthKeccakEmpty, stdout);
+    try runAndPrint("eth.zig keccak 32b", benchEthKeccak32, stdout);
+    try runAndPrint("eth.zig keccak 256b", benchEthKeccak256, stdout);
+    try runAndPrint("eth.zig keccak 1kb", benchEthKeccak1k, stdout);
+    try runAndPrint("eth.zig keccak 4kb", benchEthKeccak4k, stdout);
+
+    try stdout.print("\n", .{});
 
     // stdlib
-    try bench.add("stdlib keccak empty", benchStdlibKeccakEmpty, .{});
-    try bench.add("stdlib keccak 32b", benchStdlibKeccak32, .{});
-    try bench.add("stdlib keccak 256b", benchStdlibKeccak256, .{});
-    try bench.add("stdlib keccak 1kb", benchStdlibKeccak1k, .{});
-    try bench.add("stdlib keccak 4kb", benchStdlibKeccak4k, .{});
-
-    var buf: [16384]u8 = undefined;
-    var w = std.fs.File.stdout().writer(&buf);
-    try bench.run(&w.interface);
-    try w.interface.flush();
+    try runAndPrint("stdlib keccak empty", benchStdlibKeccakEmpty, stdout);
+    try runAndPrint("stdlib keccak 32b", benchStdlibKeccak32, stdout);
+    try runAndPrint("stdlib keccak 256b", benchStdlibKeccak256, stdout);
+    try runAndPrint("stdlib keccak 1kb", benchStdlibKeccak1k, stdout);
+    try runAndPrint("stdlib keccak 4kb", benchStdlibKeccak4k, stdout);
+
+    try stdout.print("\n", .{});
+    try stdout.flush();
 }
diff --git a/bench/u256_bench.zig b/bench/u256_bench.zig
new file mode 100644
index 0000000..0a8a05c
--- /dev/null
+++ b/bench/u256_bench.zig
@@ -0,0 +1,220 @@
+const std = @import("std");
+const eth = @import("eth");
+
+// ============================================================================
+// Test values -- identical to alloy-bench/benches/u256_comparison.rs
+// ============================================================================
+
+const ONE_ETH: u256 = 1_000_000_000_000_000_000;
+const RESERVE_IN: u256 = 100_000_000_000_000_000_000; // 100 ETH
+const RESERVE_OUT: u256 = 200_000_000_000; // 200k USDC (6 decimals)
+
+// 2^96 -- used in UniswapV3/V4 sqrtPriceX96
+const SQRT_PRICE: u256 = 79228162514264337593543950336;
+const AMOUNT_IN_SMALL: u256 = 1_000_000_000_000_000; // 0.001 ETH
+
+// Full-width 256-bit values for mul/div stress tests
+const FULL_A: u256 = 0xDEADBEEF_CAFEBABE_12345678_9ABCDEF0_DEADBEEF_CAFEBABE_12345678_9ABCDEF0;
+const FULL_B: u256 = 0x12345678_9ABCDEF0_DEADBEEF_CAFEBABE_12345678_9ABCDEF0_DEADBEEF_CAFEBABE;
+const FULL_C: u256 = 0x00000001_00000000_00000000_00000000_00000000_00000000_00000000_00000001;
+
+// ============================================================================
+// Benchmark harness -- criterion-style: run N iters, measure wall time, report ns/op
+// ============================================================================
+
+const WARMUP_NS: u64 = 500_000_000; // 0.5s warmup
+const BENCH_NS: u64 = 2_000_000_000; // 2s measurement
+
+const Timer = std.time.Timer;
+
+const BenchResult = struct { // value returned by runBench
+    ns_per_op: u64, // elapsed nanoseconds divided by iterations (integer division; 0 if no iterations ran)
+    iters: u64, // number of calls made during the timed measurement loop
+};
+
+fn runBench(comptime func: fn () void) BenchResult {
+    var clock = Timer.start() catch @panic("timer unsupported");
+
+    // Phase 1 -- warmup: call func in unrolled groups of 64 until at least
+    // WARMUP_NS has elapsed. Nothing from this phase is recorded.
+    clock.reset();
+    while (true) {
+        inline for (0..64) |_| func();
+        if (clock.read() >= WARMUP_NS) break;
+    }
+
+    // Phase 2 -- calibration: double the batch size (starting at 64) until a
+    // single batch takes at least 100 ms.
+    const min_batch_ns: u64 = 100_000_000;
+    var batch_size: u64 = 64;
+    while (true) : (batch_size *= 2) {
+        clock.reset();
+        for (0..batch_size) |_| func();
+        if (clock.read() >= min_batch_ns) break;
+    }
+
+    // Phase 3 -- measurement: run whole batches until BENCH_NS has elapsed;
+    // the final batch may run past the deadline and is still counted.
+    var iterations: u64 = 0;
+    clock.reset();
+    while (clock.read() < BENCH_NS) : (iterations += batch_size) {
+        for (0..batch_size) |_| func();
+    }
+
+    const elapsed_ns = clock.read();
+    return .{ .ns_per_op = if (iterations == 0) 0 else elapsed_ns / iterations, .iters = iterations };
+}
+
+// ============================================================================
+// Benchmark functions
+// ============================================================================
+
+fn benchAdd() void {
+    var lhs: u256 = ONE_ETH;
+    std.mem.doNotOptimizeAway(&lhs); // intended to keep the operand from being constant-folded
+    var rhs: u256 = 997_000_000_000_000_000;
+    std.mem.doNotOptimizeAway(&rhs);
+    const sum = lhs +% rhs; // wrapping 256-bit addition
+    std.mem.doNotOptimizeAway(&sum);
+}
+
+fn benchMulSmall() void {
+    var value: u256 = ONE_ETH;
+    std.mem.doNotOptimizeAway(&value);
+    var factor: u256 = 997; // small second operand
+    std.mem.doNotOptimizeAway(&factor);
+    const product = eth.uint256.fastMul(value, factor);
+    std.mem.doNotOptimizeAway(&product);
+}
+
+fn benchMulFull() void {
+    var lhs: u256 = FULL_A; // both operands are the full-width test constants
+    std.mem.doNotOptimizeAway(&lhs);
+    var rhs: u256 = FULL_B;
+    std.mem.doNotOptimizeAway(&rhs);
+    const product = eth.uint256.fastMul(lhs, rhs);
+    std.mem.doNotOptimizeAway(&product);
+}
+
+fn benchDivSmall() void {
+    var dividend: u256 = 997_000_000_000_000_000_000;
+    std.mem.doNotOptimizeAway(&dividend);
+    var divisor: u256 = ONE_ETH;
+    std.mem.doNotOptimizeAway(&divisor);
+    const quotient = eth.uint256.fastDiv(dividend, divisor);
+    std.mem.doNotOptimizeAway(&quotient);
+}
+
+fn benchDivFull() void {
+    var dividend: u256 = FULL_A; // full-width dividend and divisor test constants
+    std.mem.doNotOptimizeAway(&dividend);
+    var divisor: u256 = FULL_C;
+    std.mem.doNotOptimizeAway(&divisor);
+    const quotient = eth.uint256.fastDiv(dividend, divisor);
+    std.mem.doNotOptimizeAway(&quotient);
+}
+
+// UniswapV2 getAmountOut, one eth.uint256 limb operation at a time on [4]u64 values:
+//   amount_out = (amount_in * 997 * reserve_out) / (reserve_in * 1000 + amount_in * 997)
+fn benchUniswapV2Naive() void {
+    var in_limbs = eth.uint256.u256ToLimbs(ONE_ETH);
+    std.mem.doNotOptimizeAway(&in_limbs);
+    var reserve_in_limbs = eth.uint256.u256ToLimbs(RESERVE_IN);
+    std.mem.doNotOptimizeAway(&reserve_in_limbs);
+    var reserve_out_limbs = eth.uint256.u256ToLimbs(RESERVE_OUT);
+    std.mem.doNotOptimizeAway(&reserve_out_limbs);
+
+    const fee_adjusted_in = eth.uint256.mulLimbScalar(in_limbs, 997);
+    const scaled_reserve_in = eth.uint256.mulLimbScalar(reserve_in_limbs, 1000);
+    const denominator = eth.uint256.addLimbs(scaled_reserve_in, fee_adjusted_in);
+    const amount_out = eth.uint256.divLimbsDirect(eth.uint256.mulLimbs(fee_adjusted_in, reserve_out_limbs), denominator);
+    std.mem.doNotOptimizeAway(&amount_out);
+}
+
+// UniswapV2 getAmountOut through the single eth.uint256.getAmountOut call (no alloy.rs counterpart)
+fn benchUniswapV2Optimized() void {
+    var input: u256 = ONE_ETH;
+    std.mem.doNotOptimizeAway(&input);
+    var pool_in: u256 = RESERVE_IN;
+    std.mem.doNotOptimizeAway(&pool_in);
+    var pool_out: u256 = RESERVE_OUT;
+    std.mem.doNotOptimizeAway(&pool_out);
+
+    const output = eth.uint256.getAmountOut(input, pool_in, pool_out);
+    std.mem.doNotOptimizeAway(&output);
+}
+
+// eth.uint256.mulDiv(multiplicand, multiplier, divisor), i.e. (a * b) / c
+fn benchMulDiv() void {
+    var multiplicand: u256 = ONE_ETH;
+    std.mem.doNotOptimizeAway(&multiplicand);
+    var multiplier: u256 = SQRT_PRICE;
+    std.mem.doNotOptimizeAway(&multiplier);
+    var divisor: u256 = ONE_ETH + 1_000_000;
+    std.mem.doNotOptimizeAway(&divisor);
+    const quotient = eth.uint256.mulDiv(multiplicand, multiplier, divisor);
+    std.mem.doNotOptimizeAway(&quotient);
+}
+
+// UniswapV4-style price step: mulDiv(L, sqrtP, L + amount_in * sqrtP), with a wrapping add
+fn benchUniswapV4Swap() void {
+    var pool_liquidity: u256 = ONE_ETH;
+    std.mem.doNotOptimizeAway(&pool_liquidity);
+    var price_x96: u256 = SQRT_PRICE;
+    std.mem.doNotOptimizeAway(&price_x96);
+    var input: u256 = AMOUNT_IN_SMALL;
+    std.mem.doNotOptimizeAway(&input);
+
+    const input_times_price = eth.uint256.fastMul(input, price_x96);
+    const divisor = pool_liquidity +% input_times_price;
+    const next_price = eth.uint256.mulDiv(pool_liquidity, price_x96, divisor);
+    std.mem.doNotOptimizeAway(&next_price);
+}
+
+// ============================================================================
+// Main
+// ============================================================================
+
+/// Prints one human-readable table row for an already-measured benchmark.
+fn printRow(name: []const u8, result: BenchResult, stdout: anytype) !void {
+    try stdout.print("{s:<32} {d:>9} ns {d:>14}\n", .{ name, result.ns_per_op, result.iters });
+}
+
+/// Prints the machine-readable `BENCH_JSON|{...}` line for an already-measured benchmark.
+fn printJson(name: []const u8, result: BenchResult, stdout: anytype) !void {
+    try stdout.print("BENCH_JSON|{{\"name\":\"{s}\",\"ns_per_op\":{d}}}\n", .{ name, result.ns_per_op });
+}
+
+const benchmarks = .{ // name/function pairs, in output order
+    .{ "u256_add", benchAdd },
+    .{ "u256_mul_small", benchMulSmall },
+    .{ "u256_mul_full", benchMulFull },
+    .{ "u256_div_small", benchDivSmall },
+    .{ "u256_div_full", benchDivFull },
+    .{ "u256_uniswapv2_naive", benchUniswapV2Naive },
+    .{ "u256_uniswapv2_optimized", benchUniswapV2Optimized },
+    .{ "u256_mulDiv", benchMulDiv },
+    .{ "u256_uniswapv4_swap", benchUniswapV4Swap },
+};
+
+/// Runs each benchmark once and reports that single measurement twice: as a
+/// table row (flushed immediately) and, after the table, as a BENCH_JSON line.
+pub fn main() !void {
+    var buf: [8192]u8 = undefined;
+    var w = std.fs.File.stdout().writer(&buf);
+    const stdout = &w.interface;
+
+    try stdout.print("\n{s:<32} {s:>12} {s:>14}\n", .{ "Benchmark", "ns/op", "iters" });
+    try stdout.print("{s}\n", .{"-" ** 62});
+
+    var results: [benchmarks.len]BenchResult = undefined;
+    inline for (benchmarks, 0..) |entry, i| {
+        results[i] = runBench(entry[1]); // the only timed run of this benchmark
+        try printRow(entry[0], results[i], stdout);
+        try stdout.flush(); // each benchmark takes seconds, so show rows as they finish
+    }
+    try stdout.print("\n", .{});
+    // Machine-readable JSON lines for compare script (same measurements as the table)
+    inline for (benchmarks, 0..) |entry, i| try printJson(entry[0], results[i], stdout);
+    try stdout.flush();
+}
diff --git a/build.zig b/build.zig
index fe5e090..8d4580f 100644
--- a/build.zig
+++ b/build.zig
@@ -50,9 +50,6 @@ pub fn build(b: *std.Build) void {
         .optimize = .ReleaseFast,
     });
 
-    const zbench_dep = b.dependency("zbench", .{});
-    const zbench_mod = zbench_dep.module("zbench");
-
     const bench_exe = b.addExecutable(.{
         .name = "bench",
         .root_module = b.createModule(.{
@@ -61,7 +58,6 @@ pub fn build(b: *std.Build) void {
             .optimize = .ReleaseFast,
             .imports = &.{
                 .{ .name = "eth", .module = bench_module },
-                .{ .name = "zbench", .module = zbench_mod },
             },
         }),
     });
@@ -70,6 +66,23 @@ pub fn build(b: *std.Build) void {
     const bench_step = b.step("bench", "Run benchmarks (ReleaseFast)");
     bench_step.dependOn(&run_bench.step);
 
+    // u256-only benchmark (custom harness, no zbench dependency)
+    const u256_bench_exe = b.addExecutable(.{
+        .name = "u256-bench",
+        .root_module = b.createModule(.{
+            .root_source_file = b.path("bench/u256_bench.zig"),
+            .target = target,
+            .optimize = .ReleaseFast,
+            .imports = &.{
+                .{ .name = "eth", .module = bench_module },
+            },
+        }),
+    });
+
+    const run_u256_bench = b.addRunArtifact(u256_bench_exe);
+    const u256_bench_step = b.step("bench-u256", "Run u256-only benchmarks (ReleaseFast)");
+    u256_bench_step.dependOn(&run_u256_bench.step);
+
     // Keccak comparison benchmark (eth.zig vs stdlib)
     const keccak_compare_exe = b.addExecutable(.{
         .name = "keccak-compare",
@@ -79,7 +92,6 @@ pub fn build(b: *std.Build) void {
             .optimize = .ReleaseFast,
             .imports = &.{
                 .{ .name = "eth", .module = bench_module },
-                .{ .name = "zbench", .module = zbench_mod },
             },
         }),
     });
diff --git a/build.zig.zon b/build.zig.zon
index a59f547..59fb9aa 100644
--- a/build.zig.zon
+++ b/build.zig.zon
@@ -3,12 +3,7 @@
     .version = "0.2.3",
     .fingerprint = 0xd0f21900fa26f179,
     .minimum_zig_version = "0.15.2",
-    .dependencies = .{
-        .zbench = .{
-            .url = "https://github.com/hendriknielaender/zBench/archive/3268a23da82231f1bd2c064de7fdf6fb7056126f.tar.gz",
-            .hash = "zbench-0.11.2-YTdc7zolAQDlBF9i0ywXIvDjafL3Kg27S-aFUq6dU5zy",
-        },
-    },
+    .dependencies = .{},
     .paths = .{
         "build.zig",
         "build.zig.zon",
diff --git a/src/uint256.zig b/src/uint256.zig
index a4e4da8..d716f8d 100644
--- a/src/uint256.zig
+++ b/src/uint256.zig
@@ -80,7 +80,7 @@ pub fn safeDiv(a: u256, b: u256) ?u256 {
 /// Fast u256 division using u64-limb schoolbook algorithm.
 /// Avoids LLVM's slow generic u256 runtime library calls (~280ns)
 /// by using native u64/u128 operations (~10-30ns).
-pub fn fastDiv(a: u256, b: u256) u256 {
+pub inline fn fastDiv(a: u256, b: u256) u256 {
     if (b == 0) {
         @branchHint(.cold);
         @panic("division by zero");
@@ -94,22 +94,28 @@ pub fn fastDiv(a: u256, b: u256) u256 {
     return divLimbs(a, b);
 }
 
-// ---- u64-limb division (Knuth Algorithm D) ----
+// ---- u64-limb arithmetic ----
 
-fn u256ToLimbs(v: u256) [4]u64 {
-    return .{
-        @truncate(v),
-        @truncate(v >> 64),
-        @truncate(v >> 128),
-        @truncate(v >> 192),
-    };
+/// Split a u256 into four u64 limbs, least-significant limb first.
+/// Little-endian targets reinterpret the bits directly; big-endian targets
+/// reverse the limbs, because @bitCast follows the in-memory layout.
+pub fn u256ToLimbs(v: u256) [4]u64 {
+    const raw: [4]u64 = @bitCast(v);
+    return switch (comptime @import("builtin").cpu.arch.endian()) {
+        .little => raw,
+        .big => .{ raw[3], raw[2], raw[1], raw[0] },
+    };
 }
 
-fn limbsToU256(l: [4]u64) u256 {
-    return @as(u256, l[3]) << 192 |
-        @as(u256, l[2]) << 128 |
-        @as(u256, l[1]) << 64 |
-        @as(u256, l[0]);
+/// Assemble a u256 from four u64 limbs, least-significant limb first.
+/// Inverse of u256ToLimbs: big-endian targets reverse the limbs before the
+/// bit reinterpretation, so l[0] is always the low 64 bits of the result.
+pub fn limbsToU256(l: [4]u64) u256 {
+    const ordered: [4]u64 = switch (comptime @import("builtin").cpu.arch.endian()) {
+        .little => l,
+        .big => .{ l[3], l[2], l[1], l[0] },
+    };
+    return @bitCast(ordered);
 }
 
 fn countLimbs(limbs: [4]u64) usize {
@@ -121,7 +113,7 @@ fn countLimbs(limbs: [4]u64) usize {
 /// Schoolbook 4x4 wrapping multiply on u64 limbs.
 /// Only computes the lower 4 limbs (256-bit result).
 /// Uses inline for so LLVM sees comptime-known loop bounds and fully unrolls.
-fn mulLimbs(a: [4]u64, b: [4]u64) [4]u64 {
+pub inline fn mulLimbs(a: [4]u64, b: [4]u64) [4]u64 {
     var r: [4]u64 = .{ 0, 0, 0, 0 };
     // Accumulate partial products a[i]*b[j] into r[i+j] (only where i+j < 4)
     inline for (0..4) |i| {
@@ -138,8 +130,21 @@ fn mulLimbs(a: [4]u64, b: [4]u64) [4]u64 {
     return r;
 }
 
+/// Scale a 4-limb value (least-significant u64 first) by one u64 factor, keeping the low 256 bits.
+/// Each limb costs a single widening 64x64 multiply (4 total); the carry out of the top limb is dropped.
+pub inline fn mulLimbScalar(a: [4]u64, b: u64) [4]u64 {
+    const factor: u128 = b;
+    var out: [4]u64 = undefined;
+    var acc: u128 = 0;
+    inline for (0..4) |limb| {
+        acc = @as(u128, a[limb]) * factor + (acc >> 64);
+        out[limb] = @truncate(acc);
+    }
+    return out;
+}
+
 /// Carry-propagated addition on u64 limbs (wrapping).
-fn addLimbs(a: [4]u64, b: [4]u64) [4]u64 {
+pub inline fn addLimbs(a: [4]u64, b: [4]u64) [4]u64 {
     var r: [4]u64 = undefined;
     var carry: u1 = 0;
     inline for (0..4) |i| {
@@ -198,6 +203,65 @@ fn div128by64(n_hi: u64, n_lo: u64, d: u64) struct { q: u64, r: u64 } {
     };
 }
 
+/// Specialized 128-by-128 division (2-limb / 2-limb): returns (n1:n0) / (d1:d0), which fits in u64.
+/// Requires d1 != 0 (@clz(d1) must fit in u6). Avoids Knuth D's runtime-loop array overhead by using scalar locals.
+inline fn div2by2(n0: u64, n1: u64, d0: u64, d1: u64) u64 {
+    const s: u6 = @intCast(@clz(d1));
+
+    // Divisor, shifted left by s so that the top bit of nv1 is set
+    var nv1: u64 = d1;
+    var nv0: u64 = d0;
+    // Numerator shifted by the same s (3 limbs); nu2 receives the bits shifted out of n1
+    var nu2: u64 = 0;
+    var nu1: u64 = n1;
+    var nu0: u64 = n0;
+    // s == 0 keeps the values above as-is; the complementary shift 64 - s would not fit in u6
+    if (s > 0) {
+        const rs: u6 = @intCast(@as(u7, 64) - s);
+        nv1 = (d1 << s) | (d0 >> rs);
+        nv0 = d0 << s;
+        nu2 = n1 >> rs;
+        nu1 = (n1 << s) | (n0 >> rs);
+        nu0 = n0 << s;
+    }
+
+    // Trial quotient: top two numerator limbs (nu2:nu1) divided by the top divisor limb nv1
+    const result = div128by64(nu2, nu1, nv1);
+    var qhat: u128 = result.q;
+    var rhat: u128 = result.r;
+
+    // Refine with the low divisor limb nv0: decrement qhat while it overshoots; stop once rhat reaches 2^64
+    while (qhat >= (@as(u128, 1) << 64) or
+        qhat * @as(u128, nv0) > (rhat << 64) | @as(u128, nu0))
+    {
+        qhat -= 1;
+        rhat += nv1;
+        if (rhat >= (@as(u128, 1) << 64)) break;
+    }
+
+    // Multiply-back check: qhat * [nv1,nv0] (limbs prod2:prod1:prod0) must not exceed [nu2,nu1,nu0]
+    // After refinement, correction probability is ~2/2^64, but it is included for correctness.
+    const p_lo: u128 = qhat * @as(u128, nv0);
+    const p_mid: u128 = qhat * @as(u128, nv1) + (p_lo >> 64);
+    const prod0: u64 = @truncate(p_lo);
+    const prod1: u64 = @truncate(p_mid);
+    const prod2: u64 = @truncate(p_mid >> 64);
+
+    // Subtract the product from the normalized numerator, propagating borrows limb by limb
+    const sb1 = @subWithOverflow(nu0, prod0);
+    const sb2 = @subWithOverflow(nu1, prod1);
+    const sb3 = @subWithOverflow(sb2[0], @as(u64, sb1[1]));
+    const borrow = sb2[1] | sb3[1];
+    const diff2 = nu2 -% prod2 -% @as(u64, borrow);
+
+    // A nonzero top limb is treated as underflow: qhat was 1 too large (extremely rare)
+    if (diff2 != 0) {
+        @branchHint(.cold);
+        return @truncate(qhat - 1);
+    }
+    return @truncate(qhat);
+}
+
 /// Knuth Algorithm D core: multi-limb division using div128by64 for trial quotients.
 /// Shared by both divLimbsDirect and divLimbs.
 /// Requires dd >= 2 and nn >= dd. Returns quotient as [4]u64.
@@ -289,11 +353,12 @@ fn knuthDivCore(num: [4]u64, nn: usize, div: [4]u64, dd: usize) [4]u64 {
 }
 
 /// Division on limbs, returning [4]u64 directly (avoids u256 round-trip).
-/// Uses div128by64 for single-limb divisors and knuthDivCore for multi-limb.
-fn divLimbsDirect(numerator: [4]u64, divisor: [4]u64) [4]u64 {
+/// Uses div128by64 (hardware UDIV) for single-limb divisors, div2by2 for 2-limb / 2-limb, and knuthDivCore otherwise.
+pub fn divLimbsDirect(numerator: [4]u64, divisor: [4]u64) [4]u64 {
     const nn = countLimbs(numerator);
     const dd = countLimbs(divisor);
     if (dd == 0) @panic("division by zero");
+
     // Compare: if numerator < divisor, return 0
     {
         var i: usize = 4;
@@ -319,6 +384,11 @@ fn divLimbsDirect(numerator: [4]u64, divisor: [4]u64) [4]u64 {
         return q;
     }
 
+    // Fast path: 2-limb / 2-limb -- inline specialized division (no array overhead)
+    if (dd == 2 and nn <= 2) {
+        return .{ div2by2(numerator[0], numerator[1], divisor[0], divisor[1]), 0, 0, 0 };
+    }
+
     return knuthDivCore(numerator, nn, divisor, dd);
 }
 
@@ -348,7 +418,7 @@ fn divLimbs(numerator: u256, divisor: u256) u256 {
 
 /// Fast u256 multiplication that uses narrower operations when values fit.
 /// This avoids LLVM's slow generic 256-bit multiplication for common cases.
-pub fn fastMul(a: u256, b: u256) u256 {
+pub inline fn fastMul(a: u256, b: u256) u256 {
     // Both fit in u128 - use LLVM's faster 128-bit multiplication
     if ((a >> 128) == 0 and (b >> 128) == 0) {
         return @as(u256, @as(u128, @truncate(a))) *% @as(u256, @as(u128, @truncate(b)));
@@ -445,12 +515,9 @@ pub fn getAmountOut(amount_in: u256, reserve_in: u256, reserve_out: u256) u256 {
     const ri = u256ToLimbs(reserve_in);
     const ro = u256ToLimbs(reserve_out);
 
-    const fee_997: [4]u64 = .{ 997, 0, 0, 0 };
-    const fee_1000: [4]u64 = .{ 1000, 0, 0, 0 };
-
-    const amount_in_with_fee = mulLimbs(ai, fee_997);
+    const amount_in_with_fee = mulLimbScalar(ai, 997);
     const numerator = mulLimbs(amount_in_with_fee, ro);
-    const denominator = addLimbs(mulLimbs(ri, fee_1000), amount_in_with_fee);
+    const denominator = addLimbs(mulLimbScalar(ri, 1000), amount_in_with_fee);
 
     if (denominator[0] == 0 and denominator[1] == 0 and denominator[2] == 0 and denominator[3] == 0) {
         @panic("getAmountOut: denominator is zero (invalid reserves)");