From 0f7d09626d0b08368a68194aa7cd5b3a686b9fd4 Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Mon, 19 Jan 2026 08:32:02 -0800
Subject: [PATCH 01/16] feat: split Stage 6 into 6a (address) and 6b (cycle)
 phases

- Add BytecodeReadRafAddressSumcheckProver/Verifier and BytecodeReadRafCycleSumcheckProver/Verifier
- Add BooleanityAddressSumcheckProver/Verifier and BooleanityCycleSumcheckProver/Verifier
- Add SumcheckId variants: BytecodeReadRafAddressPhase, BooleanityAddressPhase, BytecodeClaimReductionCyclePhase, BytecodeClaimReduction
- Add VirtualPolynomial variants: BytecodeValStage, BytecodeReadRafAddrClaim, BooleanityAddrClaim, BytecodeClaimReductionIntermediate
- Update prover: prove_stage6a() and prove_stage6b()
- Update verifier: verify_stage6a() and verify_stage6b()
- Update JoltProof: stage6a_sumcheck_proof and stage6b_sumcheck_proof
- Add bytecode-commitment-progress.md planning doc
---
 bytecode-commitment-progress.md               | 655 ++++++++++++++++++
 jolt-core/src/poly/opening_proof.rs           |   4 +
 jolt-core/src/subprotocols/booleanity.rs      | 380 ++++++++--
 .../src/zkvm/bytecode/read_raf_checking.rs    | 366 +++++++++-
 jolt-core/src/zkvm/claim_reductions/advice.rs |  13 +-
 jolt-core/src/zkvm/proof_serialization.rs     |  27 +-
 jolt-core/src/zkvm/prover.rs                  | 114 ++-
 jolt-core/src/zkvm/verifier.rs                |  63 +-
 jolt-core/src/zkvm/witness.rs                 |   4 +
 9 files changed, 1515 insertions(+), 111 deletions(-)
 create mode 100644 bytecode-commitment-progress.md

diff --git a/bytecode-commitment-progress.md b/bytecode-commitment-progress.md
new file mode 100644
index 0000000000..33164f339c
--- /dev/null
+++ b/bytecode-commitment-progress.md
@@ -0,0 +1,655 @@
+# Bytecode Commitment (Planning / Progress Notes)
+
+This file is a **living design doc** for implementing **bytecode commitment** to remove verifier work linear in bytecode size \(K\), especially in recursion contexts (e.g. `examples/recursion/`).
+
+## Problem statement (what is slow today?)
+
+### Where the verifier is doing \(O(K)\) work
+
+- **Stage 6 verifier constructs `BytecodeReadRafSumcheckVerifier` by calling `BytecodeReadRafSumcheckParams::gen`**, passing the full `BytecodePreprocessing`.
+  - This happens in:
+    - `jolt-core/src/zkvm/verifier.rs` **L409–L417**
+
+- `BytecodeReadRafSumcheckParams::gen` currently **materializes 5 full `val_polys` of length `K`** by iterating the entire bytecode.
+  - `compute_val_polys(...)` call site:
+    - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L773–L784**
+  - The fused per-instruction loop is here:
+    - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L874–L1009**
+
+- In `expected_output_claim`, the verifier then **evaluates each `val_poly` at `r_address`**, which is also \(O(K)\).
+  - `val.evaluate(&r_address_prime.r)`:
+    - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L648–L666**
+  - `MultilinearPolynomial::evaluate` builds EQ tables and does a split-eq evaluation (still linear in coeff count):
+    - `jolt-core/src/poly/multilinear_polynomial.rs` **L682–L772**
+
+Net: for large bytecode (e.g. \(K \approx 2^{20}\)), the verifier is doing millions of field ops per verification, which explodes in recursion.
+
+## Relevant existing patterns we can mirror
+
+### 1) Two-phase claim reduction spanning Stage 6 → Stage 7 (Advice)
+
+- Stage 6 includes Advice claim reduction Phase 1:
+  - `jolt-core/src/zkvm/verifier.rs` **L446–L486**
+- Stage 7 conditionally includes Advice claim reduction Phase 2:
+  - `jolt-core/src/zkvm/verifier.rs` **L508–L529**
+- Advice reduction module:
+  - `jolt-core/src/zkvm/claim_reductions/advice.rs` (full file)
+
+### 2) “Trusted commitment in preprocessing-only context” (Advice)
+
+- Untrusted advice: prover commits during proving (`DoryContext::UntrustedAdvice`) and includes commitment in proof.
+  - `jolt-core/src/zkvm/prover.rs` **L636–L667**
+- Trusted advice: commitment/hint computed in preprocessing-only context (`DoryContext::TrustedAdvice`), verifier has commitment; prover just appends it to transcript.
+  - `jolt-core/src/zkvm/prover.rs` **L669–L688**
+- Dory contexts currently supported:
+  - `jolt-core/src/poly/commitment/dory/dory_globals.rs` **L160–L166**
+
+### 3) Single Stage 8 joint opening (Dory batch opening)
+
+Stage 8 collects polynomial claims, samples gamma, combines commitments, and verifies a single opening.
+
+- Stage 8 verifier:
+  - `jolt-core/src/zkvm/verifier.rs` **L542–L691**
+
+Advice polynomials get a **Lagrange embedding factor** so a smaller context polynomial can be batched with main polynomials:
+
+- `compute_advice_lagrange_factor`:
+  - `jolt-core/src/poly/opening_proof.rs` **L635–L672**
+
+## Key batching detail (important for scheduling reductions)
+
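+Batched sumcheck instances are “front-loaded” (an instance with fewer rounds is inactive during the leading global rounds and active only in the trailing ones) via a **global round offset**: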
+
+- Default `round_offset` shifts shorter instances to the **end**:
+  - `jolt-core/src/subprotocols/sumcheck_prover.rs` **L30–L37**
+  - `jolt-core/src/subprotocols/sumcheck_verifier.rs` **L24–L30**
+- `BatchedSumcheck` uses that offset to decide whether an instance is active in a global round:
+  - `jolt-core/src/subprotocols/sumcheck.rs` **L79–L93**
+
+This matters because it explains why Stage 6 “cycle rounds” can align across many instances even if they have different `num_rounds()`.
+
+## Bytecode commitment: what we likely need to commit to
+
+### Bytecode-side “fields” referenced in `compute_val_polys`
+
+From `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L874–L1009**, Val polynomials depend on:
+
+- **Instruction scalar fields**
+  - `instr.address` (a.k.a. unexpanded PC)
+  - `instr.operands.imm`
+- **Circuit flags**: `NUM_CIRCUIT_FLAGS = 13`
+  - `jolt-core/src/zkvm/instruction/mod.rs` **L59–L86**, **L121**
+- **Instruction flags**: `NUM_INSTRUCTION_FLAGS = 7`
+  - `jolt-core/src/zkvm/instruction/mod.rs` **L104–L119**, **L122**
+- **Register operands**: `rd`, `rs1`, `rs2` (used via `eq_r_register[...]` lookup)
+  - This strongly suggests committing to **one-hot indicators** `1_{rd=r}`, `1_{rs1=r}`, `1_{rs2=r}` for all `r` (linear combination with EQ table).
+- **Lookup table selector**
+  - `NUM_LOOKUP_TABLES = LookupTables::<32>::COUNT` (currently 41)
+  - `jolt-core/src/zkvm/lookup_table/mod.rs` **L118–L166**
+- **RAF / interleaving flag**
+  - `!circuit_flags.is_interleaved_operands()` (non-linear in circuit flags, so likely needs its own committed boolean field if we want linear combination only).
+  - `jolt-core/src/zkvm/instruction/mod.rs` **L124–L135**
+
+## Decisions so far (from discussion)
+
+### Commitment granularity + packing (key)
+
+We will **commit to the “atomic” bytecode fields**, but **pack/chunk them so each committed polynomial’s “lane” dimension fits `k_chunk = 2^{log_k_chunk}`**.
+
+- `log_k_chunk` is **either 4 or 8** (so `k_chunk` is **16 or 256**), chosen from trace length:
+  - `jolt-core/src/zkvm/config.rs` **L133–L151**
+
+#### Canonical lane ordering (authoritative)
+
+We fix a canonical total ordering of “lanes” (fields) so packing/chunking is purely mechanical and future-proof:
+
+1. **`rs1` one-hot lanes**: 128 lanes (registers 0..127)
+2. **`rs2` one-hot lanes**: 128 lanes
+3. **`rd` one-hot lanes**: 128 lanes
+4. **`unexpanded_pc` lane** (scalar)
+5. **`imm` lane** (scalar)
+6. **circuit flags** lanes: 13 boolean lanes (`NUM_CIRCUIT_FLAGS`)
+7. **instruction flags** lanes: 7 boolean lanes (`NUM_INSTRUCTION_FLAGS`)
+8. **lookup-table selector** lanes: 41 boolean lanes (`NUM_LOOKUP_TABLES`)
+9. **RAF/interleave flag** lane: 1 boolean lane (`raf_flag := !circuit_flags.is_interleaved_operands()`)
+
+Lane counts:
+- registers: `3 * REGISTER_COUNT = 3 * 128 = 384`
+  - `REGISTER_COUNT` definition: `common/src/constants.rs` **L1–L5**
+- “dense-ish” bytecode fields: `2 + 13 + 7 + 41 + 1 = 64`
+  - flags definitions: `jolt-core/src/zkvm/instruction/mod.rs` **L59–L86** (circuit), **L104–L119** (instruction)
+  - lookup tables count: `jolt-core/src/zkvm/lookup_table/mod.rs` **L118–L166**
+
+Total lanes = **384 + 64 = 448**.
+
+Packing policy:
+- We chunk the lane list into consecutive blocks of size `k_chunk`.
+- Each block becomes one committed “bytecode commitment polynomial”.
+- **`k_chunk=16`**: 448 lanes ⇒ **28 commitments** (exactly `3*(128/16)=24` for registers + `64/16=4` for the rest).
+- **`k_chunk=256`**: 448 lanes ⇒ **2 commitments**:
+  - chunk0: `rs1[0..127] || rs2[0..127]` (256 lanes)
+  - chunk1: `rd[0..127] || (all remaining 64 lanes) || (64 lanes padding)`
+
+Notes:
+- Even though the first 384 lanes are “one-hot structured”, the packing is defined by lanes, so rs1/rs2/rd can be packed together when `k_chunk=256`.
+- We will likely encode all lanes as field elements in the packed polynomial (booleans as 0/1), but **the representation choice (dense vs specialized one-hot)** is still an implementation detail (see Remaining plan questions below).
+
+### Embedding policy
+
+We will **not** require the main Dory matrix to grow to fit bytecode commitments. Instead we:
+
+- keep each bytecode-commit polynomial within the main `k_chunk` address-dimension, and
+- use a claim reduction (Stage 6→7) so these commitments can be batched into the single Stage 8 opening, similar to advice.
+
+### Domain / padding
+
+Bytecode commitments use the same **padding-to-power-of-two** policy as other committed polynomials:
+
+- the “instruction index” dimension is padded to a power of 2 (like other `T`-style dimensions).
+- the “lane/index” dimension is `k_chunk` (16 or 256), with unused lanes zero-padded.
+
+### Ownership / preprocessing storage
+
+Bytecode commitments should behave like **trusted preprocessing**:
+
+- verifier has them in shared preprocessing (like trusted advice commitment is “known” to verifier),
+- we define an enum where shared preprocessing stores **either**:
+  - raw bytecode (`BytecodePreprocessing`), **or**
+  - commitments (+ minimal metadata).
+
+## Remaining plan questions (to settle before coding)
+
+1. **Representation / PCS support for packed bytecode polynomials**:
+   - Packing into `k_chunk` lanes means each packed polynomial has `k_chunk * bytecode_len` coefficients (very large).
+   - We likely need a **streaming / implicit** polynomial representation (similar in spirit to `RLCPolynomial`) so Stage 8 can include bytecode commitments in the joint opening without materializing all coefficients.
+2. **“rs1+rs2 as one-hot” wording (important clarity)**:
+   - A single `OneHotPolynomial` can only select **one** lane index per column.
+   - Packing `rs1` and `rs2` into the same 256-lane chunk means two 1s per instruction; this may need to be represented as a packed dense-bool polynomial (still sparse), or via a different encoding.
+3. **Reduction batching**: we want **one** `BytecodeClaimReduction` sumcheck that batches all bytecode commitments and normalizes to the unified point (like `AdviceClaimReduction` + `HammingWeightClaimReduction` patterns).
+4. **Stage 6 refactor** (required for mid-stage emission):
+   - Stage 6 must be split into **Stage 6a (log_K)** and **Stage 6b (log_T)** so bytecode-field claims emitted after the address rounds can be consumed immediately.
+   - This also requires splitting `Booleanity` into address/cycle sumchecks (it is internally two-phase today):
+     - `jolt-core/src/subprotocols/booleanity.rs` **L399–L453** (phase switch), **L455–L478** (cache_openings)
+5. **Exact API surface**:
+   - what concrete type should live in `JoltSharedPreprocessing` for the commitment-only variant (commitments-only vs commitments+opening hints)?
+   - which `SumcheckId` values should be used for the new reduction’s intermediate/final cached openings?
+
+---
+
+## BytecodeReadRaf Stage 6a: what claims should be emitted?
+
+The “emission point” is already explicit in the prover today: it happens right when we transition from the first `log_K` (address) rounds into the remaining `log_T` (cycle) rounds.
+
+In `BytecodeReadRafSumcheckProver::init_log_t_rounds`:
+
+- The prover computes the 5 stage-specific scalars:
+  - `poly.final_sumcheck_claim()` for each stage Val polynomial, plus the RAF-injected identity contribution for stages 1 and 3:
+    - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L307–L335**
+- It also finalizes the address point by reversing the collected low-to-high challenges:
+  - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L337–L340**
+
+Those 5 scalars are stored in:
+
+- `self.bound_val_evals: Option<[F; 5]>`
+  - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L320–L335**
+
+**Stage 6a should emit exactly these 5 scalars as “bytecode field claims”**, keyed by a new `SumcheckId` / `OpeningId`, with opening point = the address point `r_address` produced at the end of the address rounds.
+
+Implementation detail we’ll likely choose:
+
+- Emit **Val-only** claims `Val_s(r_address)` (no RAF Int injected), and let `BytecodeReadRaf` add the constant RAF terms itself (since `Int(r_address)=1`).
+  - Today RAF is injected in `bound_val_evals` at **L324–L331**; we can split this for cleaner “bytecode-only” claim reduction.
+
+Why this is the “right” interface:
+
+- Stage 6b (the cycle-phase continuation of BytecodeReadRaf) needs these 5 scalars as weights for the remaining `log_T` rounds (today they’re read from `bound_val_evals` during the `round >= log_K` branch).
+
+## BytecodeClaimReduction: what it should prove (high level)
+
+We mirror the structure of `AdviceClaimReduction` (`jolt-core/src/zkvm/claim_reductions/advice.rs`), but with different “payload polynomials” and a simpler address schedule thanks to `k_chunk`.
+
+### Inputs (from Stage 6a)
+
+- The 5 “Val stage” claims:
+  - `c_s := Val_s(r_bc)` for `s ∈ {1..5}`, where `r_bc` is the Stage 6a address point (bytecode-index point).
+- The point `r_bc` itself (implicitly stored as the opening point associated with `c_s`).
+
+### Witness (committed) polynomials
+
+Let `B_i` be the committed bytecode chunk polynomials induced by the canonical lane ordering.
+
+- `i ∈ [0, n_chunks)` where `n_chunks = ceil(448 / k_chunk)`:
+  - `k_chunk=16` ⇒ `n_chunks=28`
+  - `k_chunk=256` ⇒ `n_chunks=2`
+  - See lane spec above.
+
+Each `B_i` is a polynomial over:
+- **lane/address vars**: `log_k_chunk`
+- **bytecode-index vars**: `log_K_bytecode` (padded / embedded as needed; see “bytecode_len vs trace_len” note below)
+
+### The identity to prove (batched)
+
+Define a per-stage lane weight table `w_s[lane]` derived from:
+- stage gammas sampled in `BytecodeReadRafSumcheckParams::gen`:
+  - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L738–L742**
+- register EQ tables (`eq_r_register_4`, `eq_r_register_5`) and the stage formulas in `compute_val_polys`:
+  - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L752–L783**, **L874–L1009**
+
+Then for each stage:
+
+- \(c_s = \sum_{lane,k} B[lane,k] \cdot w_s[lane] \cdot \mathrm{eq}(r_{bc}, k)\)
+
+We will batch the 5 stages with a transcript challenge \( \eta \) (powers), so the sumcheck instance has a **single scalar input claim**:
+
+- \(C_{\text{in}} = \sum_s \eta^s \cdot c_s\)
+
+and proves:
+
+- \(C_{\text{in}} = \sum_{lane,k} B[lane,k] \cdot W_{\eta}(lane) \cdot \mathrm{eq}(r_{bc}, k)\)
+  - where \(W_{\eta}(lane) := \sum_s \eta^s \cdot w_s[lane]\)
+
+This keeps verifier complexity small: evaluating \(W_{\eta}\) at a point costs `O(k_chunk)` and computing \(\mathrm{eq}(r_{bc}, \cdot)\) uses `EqPolynomial`.
+
+### Reduction target (Stage 8 compatibility)
+
+BytecodeClaimReduction will run in two phases like advice:
+
+- **Phase 1 (Stage 6b)**: bind the bytecode-index variables (cycle-phase rounds).
+  - Cache an intermediate claim (like `AdviceClaimReductionCyclePhase`).
+- **Phase 2 (Stage 7)**: bind the lane variables (`log_k_chunk` rounds).
+  - When each `B_i` is fully bound (len==1), cache its final opening `B_i(final_point)` for batching into Stage 8.
+
+Verifier then reconstructs the stage-6a claim(s) from:
+- the final `B_i(final_point)` openings,
+- the scalar `EqPolynomial::mle(r_bc, final_point_k)`,
+- the scalar `W_eta(final_point_lane)`,
+exactly analogous to `AdviceClaimReductionVerifier::expected_output_claim`.
+
+### bytecode_len vs trace_len (defensive padding)
+
+If `bytecode_len > padded_trace_len` (rare but possible for “mostly dead code”), we need to ensure:
+- the main Dory URS / generators are large enough, and
+- any “bytecode index variable count” that is driven by Stage 6 cycle rounds has enough randomness.
+
+Pragmatic policy:
+- set `padded_trace_len = max(padded_trace_len, bytecode_len.next_power_of_two())` *when bytecode commitments are enabled*,
+  similar in spirit to `adjust_trace_length_for_advice` in `jolt-core/src/zkvm/prover.rs`.
+
+### Preliminary “field count” if committed separately (worst-case baseline)
+
+If we commit one polynomial per “atomic linear field”:
+
+- `pc` + `imm`: **2**
+- circuit flags: **13**
+- instruction flags: **7**
+- register one-hots: **3 * REGISTER_COUNT**
+  - Note: `REGISTER_COUNT = 32 (RISC-V) + 96 (virtual) = 128` in this repo
+    - `common/src/constants.rs` **L1–L5**
+- lookup table one-hots: **41**
+- raf/interleave flag: **1**
+
+Total baseline (with `REGISTER_COUNT=128`): **2 + 13 + 7 + 384 + 41 + 1 = 448 polynomials**.
+
+This is too many to *open individually*, but may be fine if we **derive only a few linear-combo commitments** (see “Open design questions” below).
+
+## Proposed direction (high-level)
+
+Goal: make verifier’s `BytecodeReadRaf` expected-claim computation **not materialize or evaluate length-K `val_polys`**, and instead consume **opening claims** that are later checked against a **trusted bytecode commitment** via Stage 8.
+
+Key idea: mirror advice:
+
+- **(A) Commit to bytecode (trusted preprocessing)**
+  - Add a dedicated Dory context (e.g. `DoryContext::Bytecode`) whose matrix is a top-left block of main, like advice.
+  - Verifier has these commitments “for free” (hard-coded / preprocessing).
+
+- **(B) Emit bytecode-related evaluation claims during Stage 6**
+  - Similar to how advice emits `RamValEvaluation` openings that later get reduced, `BytecodeReadRaf` should stop evaluating `val_polys` itself and instead *read* an opening claim (or small number of claims) from the opening accumulator.
+
+- **(C) New two-phase “BytecodeClaimReduction” sumcheck**
+  - Stage 6 phase: bind cycle-derived coordinates (last `log_T` rounds)
+  - Stage 7 phase: bind address-derived coordinates (`log_k_chunk` rounds)
+  - Cache final opening(s) so Stage 8 can batch them.
+
+- **(D) Stage 8 batches bytecode commitments**
+  - Include bytecode commitment(s) and reduced claim(s) in `polynomial_claims` with an embedding/Lagrange factor (same pattern as advice).
+
+## Open design questions (need alignment before coding)
+
+1. **Embedding feasibility**
+   - Bytecode commitment context must fit in main Dory matrix: need `(sigma_bytecode <= sigma_main)` and `(nu_bytecode <= nu_main)`.
+   - If program has **small trace length but huge bytecode**, do we:
+     - pad `T` upward (like `adjust_trace_length_for_advice`), or
+     - allow a second opening / separate Stage 8, or
+     - impose a constraint “recursion requires T big enough”?
+
+2. **Granularity**
+   - Commit per field (many polynomials), or
+   - commit a smaller set + derive per-stage Val polynomials by linear combinations of commitments, or
+   - pack fields into one polynomial `p(k, idx)` (but then Val is *not* a simple linear combo of `p` at one point; needs more thought).
+
+3. **How many bytecode “claims” should Stage 6 consume?**
+   - 5 claims (one per stage Val polynomial), or
+   - 1 claim (random linear combo of stage Vals, or another fixed fold) to minimize downstream reduction/opening cost.
+
+4. **Where should the “initial” bytecode openings live?**
+   - As `OpeningId::Committed(CommittedPolynomial::..., SumcheckId::BytecodeReadRaf)` entries, analogous to other committed openings, or
+   - a new `OpeningId` variant (like `TrustedAdvice(...)`) if we need special casing.
+
+5. **Commitment ownership**
+   - Should bytecode commitments be stored inside `JoltSharedPreprocessing` / `JoltVerifierPreprocessing`, or passed separately like `trusted_advice_commitment`?
+
+6. **Transcript binding**
+   - We likely need to append trusted bytecode commitment(s) to the transcript in `JoltVerifier::verify` (similar to trusted advice):
+     - `jolt-core/src/zkvm/verifier.rs` **L190–L203**
+
+---
+
+## Next steps (for plan agreement)
+
+1. Decide **commitment granularity** (per-field vs derived vs packed) with a target of minimizing **recursive verifier cycles**.
+2. Decide **embedding policy** when bytecode is larger than main Dory dims.
+3. Define the **exact claims** `BytecodeReadRaf` will consume (count + meaning).
+4. Define the new **BytecodeClaimReduction** parameters (analogous to `AdviceClaimReductionParams`) and which Stage 6/7 rounds it occupies.
+
+---
+
+## Detailed implementation plan (agreed direction)
+
+This section is an implementation checklist in dependency order.
+
+### Step 1 — Refactor Stage 6 into two substages (6a + 6b)
+
+**Goal**: make “end of BytecodeReadRaf address rounds” a real stage boundary so we can:
+- emit `Val_s(r_bc)` claims **immediately** after binding `r_bc`,
+- start `BytecodeClaimReduction` during the subsequent **cycle** randomness (what will become Stage 6b),
+- avoid the verifier doing any \(O(K_{\text{bytecode}})\) work.
+
+#### 1.1 Proof object / serialization changes
+
+- Split `stage6_sumcheck_proof` into:
+  - `stage6a_sumcheck_proof` (address rounds)
+  - `stage6b_sumcheck_proof` (cycle rounds)
+- Transcript ordering: **run Stage 6a sumcheck → append Stage 6a claims → run Stage 6b sumcheck → append Stage 6b claims** (breaking change OK).
+- Files:
+  - `jolt-core/src/zkvm/proof_serialization.rs` (`JoltProof` struct)
+  - any serialize/deserialize helpers that assume a single Stage 6 proof.
+
+#### 1.2 Prover plumbing
+
+- In `jolt-core/src/zkvm/prover.rs`:
+  - Replace `prove_stage6()` with `prove_stage6a()` + `prove_stage6b()`.
+  - Update the main `prove()` flow to call both and store both proofs.
+  - Stage 6 instances currently assembled at `prover.rs` **L1206–L1214** must be split across 6a/6b.
+
+Target contents:
+- **Stage 6a (max rounds = `max(log_K_bytecode, log_k_chunk)`)**:
+  - `BytecodeReadRafAddr` (new; `log_K_bytecode` rounds)
+  - `BooleanityAddr` (new; `log_k_chunk` rounds; will be active only in last `log_k_chunk` rounds via front-loaded batching)
+- **Stage 6b (max rounds = `log_T`)**:
+  - `BytecodeReadRafCycle` (new; `log_T` rounds)
+  - `BooleanityCycle` (new; `log_T` rounds)
+  - existing Stage-6 cycle-only instances (unchanged logic, just move them here):
+    - `RamHammingBooleanity` (`log_T`)
+    - `RamRaVirtualization` (`log_T`)
+    - `InstructionRaVirtualization` (`log_T`)
+    - `IncClaimReduction` (`log_T`)
+    - AdviceClaimReduction Phase 1 (if present) **needs a `round_offset` update** because Stage 6b `max_num_rounds` will now be `log_T` (see Step 2.3).
+  - `BytecodeClaimReduction` phase 1 (new; `log_T` rounds; see Step 4)
+
+#### 1.3 Verifier plumbing
+
+- In `jolt-core/src/zkvm/verifier.rs`:
+  - Replace `verify_stage6()` with `verify_stage6a()` + `verify_stage6b()`.
+  - Update the main `verify()` call chain to include both.
+
+### Step 2 — Split Booleanity into two sumchecks (address + cycle)
+
+Reason: `Booleanity` is currently a *single* sumcheck with an internal phase transition at `log_k_chunk`:
+- `jolt-core/src/subprotocols/booleanity.rs` **L399–L446**
+
+But Stage 6 is becoming two proofs, so Booleanity must be representable as two separate sumcheck instances.
+
+#### 2.1 New sumcheck instances
+
+Create:
+- `BooleanityAddressSumcheck` (`num_rounds = log_k_chunk`)
+- `BooleanityCycleSumcheck` (`num_rounds = log_T`)
+
+We will reuse most of the existing prover state, splitting it exactly at the current transition:
+- address phase ends where today `eq_r_r` is computed and `H` is initialized (**L415–L445**)
+- cycle phase reuses `D` and `H` binding (**L446–L452**)
+
+#### 2.2 Chaining between 6a and 6b (important)
+
+To make `BooleanityCycle` a standalone sumcheck, it needs an **input claim**:
+- the output of `BooleanityAddress`, i.e. the partially summed claim after binding `r_address`.
+
+We will follow the **AdviceClaimReduction** pattern:
+- Stage 6a prover computes this intermediate claim and stores it in the opening accumulator under a new `SumcheckId` (see Step 5).
+- Stage 6a verifier treats that stored claim as the expected output of `BooleanityAddress`.
+- Stage 6b `BooleanityCycle` uses that stored claim as its `input_claim`.
+
+This avoids needing BatchedSumcheck to “return per-instance output claims”.
+
+#### 2.3 Update advice reduction round alignment (PINNED)
+
+`AdviceClaimReductionProver::round_offset` currently assumes Stage 6 max rounds includes `log_k_chunk + log_T` (it aligns to the start of Booleanity’s cycle segment).
+With Stage 6b max rounds = `log_T`, this must be updated to avoid underflow and to align to Stage 6b round 0.
+
+File:
+- `jolt-core/src/zkvm/claim_reductions/advice.rs` (`round_offset` in both prover+verifier impls)
+
+### Step 3 — Split BytecodeReadRaf into two sumchecks (address + cycle)
+
+Reason: we need a real stage boundary right after binding `r_bc` (bytecode-index address point), because:
+- `Val_s(r_bc)` is computed exactly at the transition today in `init_log_t_rounds`
+  - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L307–L340**
+
+#### 3.1 New sumcheck instances
+
+Create:
+- `BytecodeReadRafAddressSumcheck` (`num_rounds = log_K_bytecode`)
+- `BytecodeReadRafCycleSumcheck` (`num_rounds = log_T`)
+
+#### 3.2 Stage 6a emissions (the key interface)
+
+At the end of address rounds (today’s `init_log_t_rounds` boundary):
+- emit **Val-only** claims:
+  - `c_s := Val_s(r_bc)` for `s=1..5`
+  - RAF terms are *not* included; verifier can add them succinctly because `Int(r_bc)=1`.
+- batch these 5 claims with a random \(\eta\) in later reduction (Step 4), but still store the 5 scalars in the opening map.
+
+Also emit the **cycle-phase input claim** for `BytecodeReadRafCycle`:
+- this is the output claim of the address-only sumcheck, i.e. the claimed value of the sum that remains over the cycle variables once the address variables are bound to `r_bc`.
+
+Both kinds of values must land in `opening_claims` so the verifier has them without recomputation.
+
+### Step 4 — Implement `BytecodeClaimReduction` (two-phase, single instance)
+
+This is the new sumcheck that replaces verifier’s \(O(K_{\text{bytecode}})\) evaluation of `val_polys`.
+
+#### 4.1 High-level role
+
+Input: the 5 `Val_s(r_bc)` scalars from Stage 6a.
+
+Output: a set of committed-polynomial openings for the **bytecode commitment chunk polynomials** at the unified Dory opening point, so Stage 8 can batch them.
+
+#### 4.2 Batching the 5 stage claims
+
+We will batch the 5 `Val_s(r_bc)` using a transcript challenge \(\eta\):
+
+- \(C_{\text{in}} = \sum_s \eta^s \cdot Val_s(r_{bc})\)
+
+and prove this equals a single linear functional of the committed bytecode polynomials:
+
+- \(C_{\text{in}} = \sum_{lane,k} B[lane,k] \cdot W_{\eta}(lane) \cdot \mathrm{eq}(r_{bc}, k)\)
+
+No per-lane openings are needed; correctness follows from linearity.
+
+#### 4.3 Two phases aligned to new stages
+
+- **Phase 1 (Stage 6b)**: bind the bytecode-index variables using Stage 6b cycle challenges.
+  - cache an intermediate claim (like `AdviceClaimReductionCyclePhase`) to start Phase 2.
+- **Phase 2 (Stage 7)**: bind the lane variables (`log_k_chunk` rounds).
+  - when each chunk polynomial is fully bound, cache its final opening for Stage 8.
+
+The address phase should be simpler than advice because lane vars = exactly `log_k_chunk` (no partial consumption).
+
+### Step 5 — `SumcheckId` / opening bookkeeping (naming + flow)
+
+#### 5.1 How `SumcheckId` actually enters the proving / verifying flow
+
+`SumcheckId` is part of the **key** used to store scalar claims in the opening accumulator maps.
+Concretely, the key type is `OpeningId`, and it embeds `SumcheckId`:
+
+- `OpeningId::Committed(CommittedPolynomial, SumcheckId)`
+- `OpeningId::Virtual(VirtualPolynomial, SumcheckId)`
+- `OpeningId::TrustedAdvice(SumcheckId)` / `OpeningId::UntrustedAdvice(SumcheckId)`
+  - `jolt-core/src/poly/opening_proof.rs` **L136–L175**
+
+**Prover side**: each sumcheck instance labels the claims it emits in `cache_openings(...)` by calling `ProverOpeningAccumulator::append_*` with a `SumcheckId`.
+Those become entries in `opening_claims` (serialized into the proof).
+
+**Verifier side**: the verifier is initialized with these claim scalars already present (from `opening_claims`), and each instance’s `cache_openings(...)` uses the same `SumcheckId` to populate the **opening point** for the existing claim (and to keep the transcript in sync).
+
+#### 5.2 Why advice has two `SumcheckId`s (`...CyclePhase` and final)
+
+Advice claim reduction spans Stage 6 → Stage 7, so it must store:
+
+- an **intermediate** scalar after Phase 1 (cycle binding), and
+- the **final** advice evaluation after Phase 2 (address binding).
+
+This is why `SumcheckId` has both:
+
+- `AdviceClaimReductionCyclePhase` (intermediate)
+- `AdviceClaimReduction` (final)
+  - `jolt-core/src/poly/opening_proof.rs` **L157–L160**
+
+Where it’s used:
+
+- Phase 2 starts from the Phase 1 intermediate:
+  - `AdviceClaimReductionParams::input_claim` (AddressVariables case):
+    - `jolt-core/src/zkvm/claim_reductions/advice.rs` **L190–L216**
+- Phase 1 and Phase 2 both cache openings under their respective IDs:
+  - `AdviceClaimReductionProver::cache_openings`:
+    - `jolt-core/src/zkvm/claim_reductions/advice.rs` **L466–L518**
+
+So neither is unused; they identify *two different stored claims*.
+
+#### 5.3 Naming rule of thumb (must match variable order)
+
+Two-phase protocols in this repo come in **both** variable orders:
+
+- **cycle → address**: advice claim reduction, bytecode claim reduction
+- **address → cycle**: booleanity, bytecode read+raf
+
+So the naming should reflect **what phase 1 binds**:
+
+- `XCyclePhase`: output claim after Phase 1 binds the **cycle-derived** variables
+- `XAddressPhase`: output claim after Phase 1 binds the **address-derived** variables
+- `X` (or `XFinal`): final output after all variables are bound
+
+For protocols that we split into two physical sumchecks (Stage 6a + 6b) while keeping downstream consumers stable:
+
+- keep the existing “final” `SumcheckId` if other modules already key off it (e.g. `HammingWeightClaimReduction` expects `SumcheckId::BytecodeReadRaf` today),
+- add a new `...AddressPhase` id for the Stage 6a pre-phase when the protocol binds address first.
+
+#### 5.4 Concrete `SumcheckId` changes for this rollout
+
+File to update:
+- `jolt-core/src/poly/opening_proof.rs` (`SumcheckId` enum)
+
+We will add:
+
+- **Address → cycle protocols (Stage 6 split)**:
+  - `BytecodeReadRafAddressPhase` (new; Stage 6a sumcheck; binds **address** first)
+  - `BooleanityAddressPhase` (new; Stage 6a sumcheck; binds **address** first)
+  - keep `BytecodeReadRaf` and `Booleanity` as the “final” IDs (Stage 6b sumchecks + cached openings) so downstream modules that key off them (e.g. HW reduction) remain stable.
+
+- **Cycle → address protocols (two-phase reductions)**:
+  - `BytecodeClaimReductionCyclePhase` (new; phase 1 output after binding **cycle** vars in Stage 6b)
+  - `BytecodeClaimReduction` (new; final output after binding **lane/address** vars in Stage 7)
+  - (existing) `AdviceClaimReductionCyclePhase` / `AdviceClaimReduction` already follow this pattern.
+
+We will also add **new `VirtualPolynomial` variants** for scalar claims that are *not* openings of committed polynomials:
+
+- **Stage 6a (BytecodeReadRafAddressPhase)**:
+  - `VirtualPolynomial::BytecodeValStage(usize)` for the 5 Val-only claims.
+  - `VirtualPolynomial::BytecodeReadRafAddrClaim` for the address-phase output claim that seeds the cycle-phase sumcheck.
+- **Stage 6a (BooleanityAddressPhase)**:
+  - `VirtualPolynomial::BooleanityAddrClaim` for the address-phase output claim that seeds the cycle-phase sumcheck.
+- **Stage 6b → Stage 7 (BytecodeClaimReduction)**:
+  - `VirtualPolynomial::BytecodeClaimReductionIntermediate` for the cycle-phase intermediate claim (analogous to advice’s `...CyclePhase`), used as Stage 7 input.
+
+#### 5.5 Quick “protocol → variable order → IDs” table (sanity)
+
+- **BytecodeReadRaf**: address → cycle
+  - Stage 6a: `SumcheckId::BytecodeReadRafAddressPhase`
+  - Stage 6b: `SumcheckId::BytecodeReadRaf` (final)
+- **Booleanity**: address → cycle
+  - Stage 6a: `SumcheckId::BooleanityAddressPhase`
+  - Stage 6b: `SumcheckId::Booleanity` (final)
+- **BytecodeClaimReduction**: cycle → lane/address
+  - Stage 6b: `SumcheckId::BytecodeClaimReductionCyclePhase` (intermediate stored)
+  - Stage 7: `SumcheckId::BytecodeClaimReduction` (final)
+- **AdviceClaimReduction** (existing): cycle → address (two-phase)
+  - Stage 6: `SumcheckId::AdviceClaimReductionCyclePhase`
+  - Stage 7: `SumcheckId::AdviceClaimReduction`
+
+### Step 6 — Bytecode commitments in preprocessing + transcript
+
+#### 6.1 New Dory context + storage
+
+Add a new `DoryContext::Bytecode` (like Trusted/UntrustedAdvice) so we can commit to bytecode chunk polynomials in preprocessing and hand the commitments to the verifier.
+
+Update shared preprocessing to store either:
+- raw `BytecodePreprocessing`, or
+- `{ bytecode_len, k_chunk, commitments: Vec<Commitment>, (optional) layout metadata }`
+
+#### 6.2 Canonical lane ordering implementation
+
+Implement an enum (or equivalent) encoding the authoritative lane ordering:
+- rs1 lanes (0..127), rs2 lanes (0..127), rd lanes (0..127), then dense fields.
+Then chunk into blocks of size `k_chunk` to get commitment indices.
+
+This ordering must be used consistently by:
+- commitment generation
+- `BytecodeClaimReduction` weight construction
+- Stage 8 batching / VMV contribution
+
+### Step 7 — Stage 8 batching integration (bytecode polynomials)
+
+Stage 8 currently builds a streaming `RLCPolynomial` from:
+- dense trace polys
+- onehot RA polys
+- advice polys (passed directly)
+
+We need to extend this to include “bytecode commitment chunk polynomials”:
+- they are **not** streamed from trace
+- they are too large to materialize when bytecode is big
+
+Implementation direction:
+- extend the streaming RLC machinery to support an additional source (“stream from bytecode”),
+  analogous to how it already streams onehot polys from trace.
+
+Files involved:
+- `jolt-core/src/poly/rlc_polynomial.rs` (extend streaming context + VMP to include bytecode chunk polys)
+- `jolt-core/src/zkvm/prover.rs` / `verifier.rs` Stage 8 claim collection (include bytecode chunk claims with appropriate embedding factor, like advice)
+
+### Step 8 — Defensive padding: bytecode_len vs trace_len
+
+When bytecode commitments are enabled, ensure we have enough cycle randomness to bind bytecode-index vars:
+
+- `padded_trace_len = max(padded_trace_len, bytecode_len.next_power_of_two())`
+
+This is analogous to `adjust_trace_length_for_advice` in `jolt-core/src/zkvm/prover.rs`.
+
+### Step 9 — Tests / validation
+
+- Unit tests:
+  - lane ordering + chunking (k_chunk=16 ⇒ 28 chunks, k_chunk=256 ⇒ 2 chunks)
+  - bytecode_len > trace_len padding path
+- E2E:
+  - prove+verify with bytecode commitment enabled, both layouts (CycleMajor/AddressMajor)
+- Recursion benchmark:
+  - confirm verifier cycle count no longer scales with bytecode length.
diff --git a/jolt-core/src/poly/opening_proof.rs b/jolt-core/src/poly/opening_proof.rs
index 3b3f93553d..5f1316d717 100644
--- a/jolt-core/src/poly/opening_proof.rs
+++ b/jolt-core/src/poly/opening_proof.rs
@@ -152,10 +152,14 @@ pub enum SumcheckId {
     RegistersClaimReduction,
     RegistersReadWriteChecking,
     RegistersValEvaluation,
+    BytecodeReadRafAddressPhase,
     BytecodeReadRaf,
+    BooleanityAddressPhase,
     Booleanity,
     AdviceClaimReductionCyclePhase,
     AdviceClaimReduction,
+    BytecodeClaimReductionCyclePhase,
+    BytecodeClaimReduction,
     IncClaimReduction,
     HammingWeightClaimReduction,
 }
diff --git a/jolt-core/src/subprotocols/booleanity.rs b/jolt-core/src/subprotocols/booleanity.rs
index ed6d58a0a0..329e80f622 100644
--- a/jolt-core/src/subprotocols/booleanity.rs
+++ b/jolt-core/src/subprotocols/booleanity.rs
@@ -388,6 +388,53 @@ impl<F: JoltField> BooleanitySumcheckProver<F> {
 
         gruen_poly * self.eq_r_r
     }
+
+    fn ingest_address_challenge(&mut self, r_j: F::Challenge, round: usize) {
+        // Phase 1 (address rounds): bind B to r_j and update F with r_j
+        self.B.bind(r_j);
+        self.F.update(r_j);
+
+        // Last address round (round == log_k_chunk - 1): set up the phase-2 state
+        if round == self.params.log_k_chunk - 1 {
+            self.eq_r_r = self.B.get_current_scalar();
+
+            // Initialize SharedRaPolynomials with per-poly pre-scaled eq tables (by rho_i)
+            let F_table = std::mem::take(&mut self.F); // self.F is left as its Default value
+            let ra_indices = std::mem::take(&mut self.ra_indices);
+            let base_eq = F_table.clone_values();
+            let num_polys = self.params.polynomial_types.len();
+            debug_assert!(
+                num_polys == self.gamma_powers.len(),
+                "gamma_powers length mismatch: got {}, expected {}",
+                self.gamma_powers.len(),
+                num_polys
+            );
+            let tables: Vec<Vec<F>> = (0..num_polys)
+                .into_par_iter()
+                .map(|i| {
+                    let rho = self.gamma_powers[i]; // rho_i: scale factor for table i
+                    base_eq.iter().map(|v| rho * *v).collect()
+                })
+                .collect();
+            self.H = Some(SharedRaPolynomials::new(
+                tables,
+                ra_indices,
+                self.params.one_hot_params.clone(),
+            ));
+
+            // Take G out of self and hand it to drop_in_background_thread
+            let g = std::mem::take(&mut self.G);
+            drop_in_background_thread(g);
+        }
+    }
+
+    fn ingest_cycle_challenge(&mut self, r_j: F::Challenge) {
+        // Phase 2 (cycle rounds): bind D to r_j, and bind H too when it is Some
+        self.D.bind(r_j);
+        if let Some(ref mut h) = self.H {
+            h.bind_in_place(r_j, BindingOrder::LowToHigh);
+        }
+    }
 }
 
 impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T> for BooleanitySumcheckProver<F> {
@@ -407,48 +454,9 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T> for BooleanitySum
     #[tracing::instrument(skip_all, name = "BooleanitySumcheckProver::ingest_challenge")]
     fn ingest_challenge(&mut self, r_j: F::Challenge, round: usize) {
         if round < self.params.log_k_chunk {
-            // Phase 1: Bind B and update F
-            self.B.bind(r_j);
-            self.F.update(r_j);
-
-            // Transition to phase 2
-            if round == self.params.log_k_chunk - 1 {
-                self.eq_r_r = self.B.get_current_scalar();
-
-                // Initialize SharedRaPolynomials with per-poly pre-scaled eq tables (by rho_i)
-                let F_table = std::mem::take(&mut self.F);
-                let ra_indices = std::mem::take(&mut self.ra_indices);
-                let base_eq = F_table.clone_values();
-                let num_polys = self.params.polynomial_types.len();
-                debug_assert!(
-                    num_polys == self.gamma_powers.len(),
-                    "gamma_powers length mismatch: got {}, expected {}",
-                    self.gamma_powers.len(),
-                    num_polys
-                );
-                let tables: Vec<Vec<F>> = (0..num_polys)
-                    .into_par_iter()
-                    .map(|i| {
-                        let rho = self.gamma_powers[i];
-                        base_eq.iter().map(|v| rho * *v).collect()
-                    })
-                    .collect();
-                self.H = Some(SharedRaPolynomials::new(
-                    tables,
-                    ra_indices,
-                    self.params.one_hot_params.clone(),
-                ));
-
-                // Drop G arrays
-                let g = std::mem::take(&mut self.G);
-                drop_in_background_thread(g);
-            }
+            self.ingest_address_challenge(r_j, round);
         } else {
-            // Phase 2: Bind D and H
-            self.D.bind(r_j);
-            if let Some(ref mut h) = self.H {
-                h.bind_in_place(r_j, BindingOrder::LowToHigh);
-            }
+            self.ingest_cycle_challenge(r_j);
         }
     }
 
@@ -483,6 +491,147 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T> for BooleanitySum
     }
 }
 
+#[derive(Allocative)]
+pub struct BooleanityAddressSumcheckProver<F: JoltField> {
+    inner: BooleanitySumcheckProver<F>,
+    last_round_poly: Option<UniPoly<F>>,
+    address_claim: Option<F>,
+}
+
+impl<F: JoltField> BooleanityAddressSumcheckProver<F> {
+    pub fn initialize(
+        params: BooleanitySumcheckParams<F>,
+        trace: &[Cycle],
+        bytecode: &BytecodePreprocessing,
+        memory_layout: &MemoryLayout,
+    ) -> Self {
+        Self {
+            inner: BooleanitySumcheckProver::initialize(params, trace, bytecode, memory_layout),
+            last_round_poly: None,
+            address_claim: None,
+        }
+    }
+
+    pub fn into_cycle_prover(self) -> BooleanityCycleSumcheckProver<F> {
+        BooleanityCycleSumcheckProver { inner: self.inner }
+    }
+}
+
+impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
+    for BooleanityAddressSumcheckProver<F>
+{
+    fn degree(&self) -> usize {
+        self.inner.params.degree()
+    }
+
+    fn num_rounds(&self) -> usize {
+        self.inner.params.log_k_chunk
+    }
+
+    fn input_claim(&self, accumulator: &ProverOpeningAccumulator<F>) -> F {
+        self.inner.params.input_claim(accumulator)
+    }
+
+    fn compute_message(&mut self, round: usize, previous_claim: F) -> UniPoly<F> {
+        let poly = self.inner.compute_phase1_message(round, previous_claim);
+        self.last_round_poly = Some(poly.clone()); // evaluated at r_j in ingest_challenge
+        poly
+    }
+
+    fn ingest_challenge(&mut self, r_j: F::Challenge, round: usize) {
+        if let Some(poly) = self.last_round_poly.take() {
+            let claim = poly.evaluate(&r_j); // round polynomial at this round's challenge
+            if round == self.inner.params.log_k_chunk - 1 {
+                self.address_claim = Some(claim); // kept only for the final address round
+            }
+        }
+        self.inner.ingest_address_challenge(r_j, round)
+    }
+
+    fn cache_openings(
+        &self,
+        accumulator: &mut ProverOpeningAccumulator<F>,
+        transcript: &mut T,
+        sumcheck_challenges: &[F::Challenge],
+    ) {
+        let mut r_address = sumcheck_challenges.to_vec();
+        r_address.reverse(); // challenge order is reversed for the BIG_ENDIAN opening point
+        let opening_point = OpeningPoint::<BIG_ENDIAN, F>::new(r_address);
+        let address_claim = self
+            .address_claim // populated by ingest_challenge on the last address round
+            .expect("Booleanity address-phase claim missing");
+        accumulator.append_virtual(
+            transcript,
+            VirtualPolynomial::BooleanityAddrClaim,
+            SumcheckId::BooleanityAddressPhase,
+            opening_point,
+            address_claim,
+        );
+    }
+
+    #[cfg(feature = "allocative")]
+    fn update_flamegraph(&self, flamegraph: &mut FlameGraphBuilder) {
+        flamegraph.visit_root(self);
+    }
+}
+
+#[derive(Allocative)]
+pub struct BooleanityCycleSumcheckProver<F: JoltField> {
+    inner: BooleanitySumcheckProver<F>,
+}
+
+impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
+    for BooleanityCycleSumcheckProver<F>
+{
+    fn degree(&self) -> usize {
+        self.inner.params.degree()
+    }
+
+    fn num_rounds(&self) -> usize {
+        self.inner.params.log_t
+    }
+
+    fn input_claim(&self, accumulator: &ProverOpeningAccumulator<F>) -> F {
+        accumulator
+            .get_virtual_polynomial_opening(
+                VirtualPolynomial::BooleanityAddrClaim,
+                SumcheckId::BooleanityAddressPhase,
+            )
+            .1
+    }
+
+    fn compute_message(&mut self, round: usize, previous_claim: F) -> UniPoly<F> {
+        self.inner.compute_phase2_message(round, previous_claim)
+    }
+
+    fn ingest_challenge(&mut self, r_j: F::Challenge, _round: usize) {
+        self.inner.ingest_cycle_challenge(r_j)
+    }
+
+    fn cache_openings(
+        &self,
+        accumulator: &mut ProverOpeningAccumulator<F>,
+        transcript: &mut T,
+        sumcheck_challenges: &[F::Challenge],
+    ) {
+        let (r_address_point, _) = accumulator.get_virtual_polynomial_opening(
+            VirtualPolynomial::BooleanityAddrClaim,
+            SumcheckId::BooleanityAddressPhase,
+        );
+        let mut r_address_le = r_address_point.r;
+        r_address_le.reverse(); // stored reversed by the address phase; restore challenge order
+        let mut full_challenges = r_address_le; // address-phase challenges first ...
+        full_challenges.extend_from_slice(sumcheck_challenges); // ... then this phase's
+        let inner: &dyn SumcheckInstanceProver<F, T> = &self.inner; // fixes the transcript type T
+        inner.cache_openings(accumulator, transcript, &full_challenges);
+    }
+
+    #[cfg(feature = "allocative")]
+    fn update_flamegraph(&self, flamegraph: &mut FlameGraphBuilder) {
+        flamegraph.visit_root(self);
+    }
+}
+
 /// Booleanity Sumcheck Verifier.
 pub struct BooleanitySumcheckVerifier<F: JoltField> {
     params: BooleanitySumcheckParams<F>,
@@ -545,3 +694,152 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T> for BooleanityS
         );
     }
 }
+
+pub struct BooleanityAddressSumcheckVerifier<F: JoltField> {
+    params: BooleanitySumcheckParams<F>,
+}
+
+impl<F: JoltField> BooleanityAddressSumcheckVerifier<F> {
+    pub fn new(params: BooleanitySumcheckParams<F>) -> Self {
+        Self { params }
+    }
+
+    pub fn into_cycle_verifier(self) -> BooleanityCycleSumcheckVerifier<F> {
+        BooleanityCycleSumcheckVerifier {
+            params: self.params,
+        }
+    }
+}
+
+impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
+    for BooleanityAddressSumcheckVerifier<F>
+{
+    fn degree(&self) -> usize {
+        self.params.degree()
+    }
+
+    fn num_rounds(&self) -> usize {
+        self.params.log_k_chunk
+    }
+
+    fn input_claim(&self, accumulator: &VerifierOpeningAccumulator<F>) -> F {
+        self.params.input_claim(accumulator)
+    }
+
+    fn expected_output_claim(
+        &self,
+        accumulator: &VerifierOpeningAccumulator<F>,
+        _sumcheck_challenges: &[F::Challenge], // unused: the claim is read from the accumulator
+    ) -> F {
+        accumulator
+            .get_virtual_polynomial_opening(
+                VirtualPolynomial::BooleanityAddrClaim,
+                SumcheckId::BooleanityAddressPhase,
+            )
+            .1
+    }
+
+    fn cache_openings(
+        &self,
+        accumulator: &mut VerifierOpeningAccumulator<F>,
+        transcript: &mut T,
+        sumcheck_challenges: &[F::Challenge],
+    ) {
+        let mut r_address = sumcheck_challenges.to_vec();
+        r_address.reverse();
+        accumulator.append_virtual(
+            transcript,
+            VirtualPolynomial::BooleanityAddrClaim,
+            SumcheckId::BooleanityAddressPhase,
+            OpeningPoint::<BIG_ENDIAN, F>::new(r_address),
+        );
+    }
+}
+
+pub struct BooleanityCycleSumcheckVerifier<F: JoltField> {
+    params: BooleanitySumcheckParams<F>,
+}
+
+impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
+    for BooleanityCycleSumcheckVerifier<F>
+{
+    fn degree(&self) -> usize {
+        self.params.degree()
+    }
+
+    fn num_rounds(&self) -> usize {
+        self.params.log_t
+    }
+
+    fn input_claim(&self, accumulator: &VerifierOpeningAccumulator<F>) -> F {
+        accumulator
+            .get_virtual_polynomial_opening(
+                VirtualPolynomial::BooleanityAddrClaim,
+                SumcheckId::BooleanityAddressPhase,
+            )
+            .1
+    }
+
+    fn expected_output_claim(
+        &self,
+        accumulator: &VerifierOpeningAccumulator<F>,
+        sumcheck_challenges: &[F::Challenge],
+    ) -> F {
+        let (r_address_point, _) = accumulator.get_virtual_polynomial_opening(
+            VirtualPolynomial::BooleanityAddrClaim,
+            SumcheckId::BooleanityAddressPhase,
+        );
+        let mut r_address_le = r_address_point.r;
+        r_address_le.reverse(); // stored reversed by the address phase; restore challenge order
+        let mut full_challenges = r_address_le; // address-phase challenges first ...
+        full_challenges.extend_from_slice(sumcheck_challenges); // ... then this phase's
+
+        let ra_claims: Vec<F> = self // one opening claim per entry of polynomial_types
+            .params
+            .polynomial_types
+            .iter()
+            .map(|poly_type| {
+                accumulator
+                    .get_committed_polynomial_opening(*poly_type, SumcheckId::Booleanity)
+                    .1
+            })
+            .collect();
+
+        let combined_r: Vec<F::Challenge> = self // reversed r_address, then reversed r_cycle
+            .params
+            .r_address
+            .iter()
+            .cloned()
+            .rev()
+            .chain(self.params.r_cycle.iter().cloned().rev())
+            .collect();
+        // Claim = eq(full_challenges, combined_r) * sum_i gamma_powers_square[i] * (ra_i^2 - ra_i)
+        EqPolynomial::<F>::mle(&full_challenges, &combined_r)
+            * zip(&self.params.gamma_powers_square, ra_claims)
+                .map(|(gamma_2i, ra)| (ra.square() - ra) * gamma_2i)
+                .sum::<F>()
+    }
+
+    fn cache_openings(
+        &self,
+        accumulator: &mut VerifierOpeningAccumulator<F>,
+        transcript: &mut T,
+        sumcheck_challenges: &[F::Challenge],
+    ) {
+        let (r_address_point, _) = accumulator.get_virtual_polynomial_opening(
+            VirtualPolynomial::BooleanityAddrClaim,
+            SumcheckId::BooleanityAddressPhase,
+        );
+        let mut r_address_le = r_address_point.r;
+        r_address_le.reverse(); // stored reversed by the address phase; restore challenge order
+        let mut full_challenges = r_address_le; // address-phase challenges first ...
+        full_challenges.extend_from_slice(sumcheck_challenges); // ... then this phase's
+        let opening_point = self.params.normalize_opening_point(&full_challenges);
+        accumulator.append_sparse(
+            transcript,
+            self.params.polynomial_types.clone(),
+            SumcheckId::Booleanity, // openings are filed under the Booleanity sumcheck id
+            opening_point.r,
+        );
+    }
+}
diff --git a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
index 223a6feaef..f25d4ff99e 100644
--- a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
+++ b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
@@ -371,17 +371,8 @@ impl<F: JoltField> BytecodeReadRafSumcheckProver<F> {
         // Drop trace and preprocessing - no longer needed after this
         self.trace = Arc::new(Vec::new());
     }
-}
-
-impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
-    for BytecodeReadRafSumcheckProver<F>
-{
-    fn get_params(&self) -> &dyn SumcheckInstanceParams<F> {
-        &self.params
-    }
 
-    #[tracing::instrument(skip_all, name = "BytecodeReadRafSumcheckProver::compute_message")]
-    fn compute_message(&mut self, round: usize, _previous_claim: F) -> UniPoly<F> {
+    fn compute_message_internal(&mut self, round: usize, _previous_claim: F) -> UniPoly<F> {
         if round < self.params.log_K {
             const DEGREE: usize = 2;
 
@@ -394,7 +385,8 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
                     });
 
                     let int_evals =
-                        self.params.int_poly
+                        self.params
+                            .int_poly
                             .sumcheck_evals(i, DEGREE, BindingOrder::LowToHigh);
 
                     // We have a separate Val polynomial for each stage
@@ -408,13 +400,20 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
                     // Which matches with the input claim:
                     // rv_1 + gamma * rv_2 + gamma^2 * rv_3 + gamma^3 * rv_4 + gamma^4 * rv_5 + gamma^5 * raf_1 + gamma^6 * raf_3
                     let mut val_evals = self
-                        .params.val_polys
+                        .params
+                        .val_polys
                         .iter()
                         // Val polynomials
                         .map(|val| val.sumcheck_evals_array::<DEGREE>(i, BindingOrder::LowToHigh))
                         // Here are the RAF polynomials and their powers
                         .zip([Some(&int_evals), None, Some(&int_evals), None, None])
-                        .zip([Some(self.params.gamma_powers[5]), None, Some(self.params.gamma_powers[4]), None, None])
+                        .zip([
+                            Some(self.params.gamma_powers[5]),
+                            None,
+                            Some(self.params.gamma_powers[4]),
+                            None,
+                            None,
+                        ])
                         .map(|((val_evals, int_evals), gamma)| {
                             std::array::from_fn::<F, DEGREE, _>(|j| {
                                 val_evals[j]
@@ -450,7 +449,7 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
 
             agg_round_poly
         } else {
-            let degree = <Self as SumcheckInstanceProver<F, T>>::degree(self);
+            let degree = self.params.degree();
 
             let out_len = self.gruen_eq_polys[0].E_out_current().len();
             let in_len = self.gruen_eq_polys[0].E_in_current().len();
@@ -520,8 +519,7 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
         }
     }
 
-    #[tracing::instrument(skip_all, name = "BytecodeReadRafSumcheckProver::ingest_challenge")]
-    fn ingest_challenge(&mut self, r_j: F::Challenge, round: usize) {
+    fn ingest_challenge_internal(&mut self, r_j: F::Challenge, round: usize) {
         if let Some(prev_round_polys) = self.prev_round_polys.take() {
             self.prev_round_claims = prev_round_polys.map(|poly| poly.evaluate(&r_j));
         }
@@ -550,6 +548,24 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
                 .for_each(|poly| poly.bind(r_j));
         }
     }
+}
+
+impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
+    for BytecodeReadRafSumcheckProver<F>
+{
+    fn get_params(&self) -> &dyn SumcheckInstanceParams<F> {
+        &self.params
+    }
+
+    #[tracing::instrument(skip_all, name = "BytecodeReadRafSumcheckProver::compute_message")]
+    fn compute_message(&mut self, round: usize, previous_claim: F) -> UniPoly<F> {
+        self.compute_message_internal(round, previous_claim) // forwards to the shared impl
+    }
+
+    #[tracing::instrument(skip_all, name = "BytecodeReadRafSumcheckProver::ingest_challenge")]
+    fn ingest_challenge(&mut self, r_j: F::Challenge, round: usize) {
+        self.ingest_challenge_internal(r_j, round)
+    }
 
     fn cache_openings(
         &self,
@@ -584,6 +600,141 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
     }
 }
 
+#[derive(Allocative)]
+pub struct BytecodeReadRafAddressSumcheckProver<F: JoltField> {
+    inner: BytecodeReadRafSumcheckProver<F>,
+}
+
+impl<F: JoltField> BytecodeReadRafAddressSumcheckProver<F> {
+    pub fn initialize(
+        params: BytecodeReadRafSumcheckParams<F>,
+        trace: Arc<Vec<Cycle>>,
+        bytecode_preprocessing: Arc<BytecodePreprocessing>,
+    ) -> Self {
+        Self {
+            inner: BytecodeReadRafSumcheckProver::initialize(params, trace, bytecode_preprocessing),
+        }
+    }
+
+    pub fn into_cycle_prover(self) -> BytecodeReadRafCycleSumcheckProver<F> {
+        BytecodeReadRafCycleSumcheckProver { inner: self.inner }
+    }
+}
+
+impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
+    for BytecodeReadRafAddressSumcheckProver<F>
+{
+    fn degree(&self) -> usize {
+        self.inner.params.degree()
+    }
+
+    fn num_rounds(&self) -> usize {
+        self.inner.params.log_K
+    }
+
+    fn input_claim(&self, accumulator: &ProverOpeningAccumulator<F>) -> F {
+        self.inner.params.input_claim(accumulator)
+    }
+
+    fn compute_message(&mut self, round: usize, previous_claim: F) -> UniPoly<F> {
+        self.inner.compute_message_internal(round, previous_claim)
+    }
+
+    fn ingest_challenge(&mut self, r_j: F::Challenge, round: usize) {
+        self.inner.ingest_challenge_internal(r_j, round)
+    }
+
+    fn cache_openings(
+        &self,
+        accumulator: &mut ProverOpeningAccumulator<F>,
+        transcript: &mut T,
+        sumcheck_challenges: &[F::Challenge],
+    ) {
+        let mut r_address = sumcheck_challenges.to_vec();
+        r_address.reverse(); // challenge order is reversed for the BIG_ENDIAN opening point
+        let opening_point = OpeningPoint::<BIG_ENDIAN, F>::new(r_address);
+        let address_claim: F = self // sum of prev_round_claims[i] * gamma_powers[i], i < N_STAGES
+            .inner
+            .prev_round_claims
+            .iter()
+            .zip(self.inner.params.gamma_powers.iter())
+            .take(N_STAGES)
+            .map(|(claim, gamma)| *claim * *gamma)
+            .sum();
+        accumulator.append_virtual(
+            transcript,
+            VirtualPolynomial::BytecodeReadRafAddrClaim,
+            SumcheckId::BytecodeReadRafAddressPhase,
+            opening_point,
+            address_claim,
+        );
+    }
+
+    #[cfg(feature = "allocative")]
+    fn update_flamegraph(&self, flamegraph: &mut FlameGraphBuilder) {
+        flamegraph.visit_root(self);
+    }
+}
+
+#[derive(Allocative)]
+pub struct BytecodeReadRafCycleSumcheckProver<F: JoltField> {
+    inner: BytecodeReadRafSumcheckProver<F>,
+}
+
+impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
+    for BytecodeReadRafCycleSumcheckProver<F>
+{
+    fn degree(&self) -> usize {
+        self.inner.params.degree()
+    }
+
+    fn num_rounds(&self) -> usize {
+        self.inner.params.log_T
+    }
+
+    fn input_claim(&self, accumulator: &ProverOpeningAccumulator<F>) -> F {
+        accumulator
+            .get_virtual_polynomial_opening(
+                VirtualPolynomial::BytecodeReadRafAddrClaim,
+                SumcheckId::BytecodeReadRafAddressPhase,
+            )
+            .1
+    }
+
+    fn compute_message(&mut self, round: usize, previous_claim: F) -> UniPoly<F> {
+        self.inner // shift by log_K: inner rounds below log_K take the address-phase branch
+            .compute_message_internal(round + self.inner.params.log_K, previous_claim)
+    }
+
+    fn ingest_challenge(&mut self, r_j: F::Challenge, round: usize) {
+        self.inner // local round -> inner round (offset by log_K)
+            .ingest_challenge_internal(r_j, round + self.inner.params.log_K)
+    }
+
+    fn cache_openings(
+        &self,
+        accumulator: &mut ProverOpeningAccumulator<F>,
+        transcript: &mut T,
+        sumcheck_challenges: &[F::Challenge],
+    ) {
+        let (r_address_point, _) = accumulator.get_virtual_polynomial_opening(
+            VirtualPolynomial::BytecodeReadRafAddrClaim,
+            SumcheckId::BytecodeReadRafAddressPhase,
+        );
+        let mut r_address_le = r_address_point.r;
+        r_address_le.reverse(); // stored reversed by the address phase; restore challenge order
+        let mut full_challenges = r_address_le; // address-phase challenges first ...
+        full_challenges.extend_from_slice(sumcheck_challenges); // ... then this phase's
+        let inner: &dyn SumcheckInstanceProver<F, T> = &self.inner; // fixes the transcript type T
+        inner.cache_openings(accumulator, transcript, &full_challenges);
+    }
+
+    #[cfg(feature = "allocative")]
+    fn update_flamegraph(&self, flamegraph: &mut FlameGraphBuilder) {
+        flamegraph.visit_root(self);
+    }
+}
+
 pub struct BytecodeReadRafSumcheckVerifier<F: JoltField> {
     params: BytecodeReadRafSumcheckParams<F>,
 }
@@ -695,6 +846,189 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
     }
 }
 
+pub struct BytecodeReadRafAddressSumcheckVerifier<F: JoltField> {
+    params: BytecodeReadRafSumcheckParams<F>,
+}
+
+impl<F: JoltField> BytecodeReadRafAddressSumcheckVerifier<F> {
+    pub fn new(
+        bytecode_preprocessing: &BytecodePreprocessing,
+        n_cycle_vars: usize,
+        one_hot_params: &OneHotParams,
+        opening_accumulator: &VerifierOpeningAccumulator<F>,
+        transcript: &mut impl Transcript,
+    ) -> Self {
+        Self {
+            params: BytecodeReadRafSumcheckParams::gen(
+                bytecode_preprocessing,
+                n_cycle_vars,
+                one_hot_params,
+                opening_accumulator,
+                transcript,
+            ),
+        }
+    }
+
+    pub fn into_cycle_verifier(self) -> BytecodeReadRafCycleSumcheckVerifier<F> {
+        BytecodeReadRafCycleSumcheckVerifier {
+            params: self.params,
+        }
+    }
+}
+
+impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
+    for BytecodeReadRafAddressSumcheckVerifier<F>
+{
+    fn degree(&self) -> usize {
+        self.params.degree()
+    }
+
+    fn num_rounds(&self) -> usize {
+        self.params.log_K
+    }
+
+    fn input_claim(&self, accumulator: &VerifierOpeningAccumulator<F>) -> F {
+        self.params.input_claim(accumulator)
+    }
+
+    fn expected_output_claim(
+        &self,
+        accumulator: &VerifierOpeningAccumulator<F>,
+        _sumcheck_challenges: &[F::Challenge],
+    ) -> F {
+        accumulator
+            .get_virtual_polynomial_opening(
+                VirtualPolynomial::BytecodeReadRafAddrClaim,
+                SumcheckId::BytecodeReadRafAddressPhase,
+            )
+            .1
+    }
+
+    fn cache_openings(
+        &self,
+        accumulator: &mut VerifierOpeningAccumulator<F>,
+        transcript: &mut T,
+        sumcheck_challenges: &[F::Challenge],
+    ) {
+        let mut r_address = sumcheck_challenges.to_vec();
+        r_address.reverse();
+        accumulator.append_virtual(
+            transcript,
+            VirtualPolynomial::BytecodeReadRafAddrClaim,
+            SumcheckId::BytecodeReadRafAddressPhase,
+            OpeningPoint::<BIG_ENDIAN, F>::new(r_address),
+        );
+    }
+}
+
+pub struct BytecodeReadRafCycleSumcheckVerifier<F: JoltField> {
+    params: BytecodeReadRafSumcheckParams<F>,
+}
+
+impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
+    for BytecodeReadRafCycleSumcheckVerifier<F>
+{
+    fn degree(&self) -> usize {
+        self.params.degree()
+    }
+
+    fn num_rounds(&self) -> usize {
+        self.params.log_T
+    }
+
+    fn input_claim(&self, accumulator: &VerifierOpeningAccumulator<F>) -> F {
+        accumulator
+            .get_virtual_polynomial_opening(
+                VirtualPolynomial::BytecodeReadRafAddrClaim,
+                SumcheckId::BytecodeReadRafAddressPhase,
+            )
+            .1
+    }
+
+    fn expected_output_claim(
+        &self,
+        accumulator: &VerifierOpeningAccumulator<F>,
+        sumcheck_challenges: &[F::Challenge],
+    ) -> F {
+        let (r_address_point, _) = accumulator.get_virtual_polynomial_opening(
+            VirtualPolynomial::BytecodeReadRafAddrClaim,
+            SumcheckId::BytecodeReadRafAddressPhase,
+        );
+        let mut r_address_le = r_address_point.r;
+        r_address_le.reverse(); // stored reversed by the address phase; restore challenge order
+        let mut full_challenges = r_address_le; // address-phase challenges first ...
+        full_challenges.extend_from_slice(sumcheck_challenges); // ... then this phase's
+        let opening_point = self.params.normalize_opening_point(&full_challenges);
+        let (r_address_prime, r_cycle_prime) = opening_point.split_at(self.params.log_K);
+        // int_poly is evaluated at the address part of the normalized point
+        let int_poly = self.params.int_poly.evaluate(&r_address_prime.r);
+
+        let ra_claims = (0..self.params.d).map(|i| {
+            accumulator
+                .get_committed_polynomial_opening(
+                    CommittedPolynomial::BytecodeRa(i),
+                    SumcheckId::BytecodeReadRaf,
+                )
+                .1
+        });
+        // Per stage: (val(r_address') + raf term) * eq(r_cycle, r_cycle') * gamma power; summed
+        let val = self
+            .params
+            .val_polys
+            .iter()
+            .zip(&self.params.r_cycles)
+            .zip(&self.params.gamma_powers)
+            .zip([
+                int_poly * self.params.gamma_powers[5], // RAF for Stage1
+                F::zero(),                              // There's no raf for Stage2
+                int_poly * self.params.gamma_powers[4], // RAF for Stage3
+                F::zero(),                              // There's no raf for Stage4
+                F::zero(),                              // There's no raf for Stage5
+            ])
+            .map(|(((val, r_cycle), gamma), int_poly)| {
+                (val.evaluate(&r_address_prime.r) + int_poly)
+                    * EqPolynomial::<F>::mle(r_cycle, &r_cycle_prime.r)
+                    * gamma
+            })
+            .sum::<F>();
+        // Multiply the summed value by every BytecodeRa(i) opening claim
+        ra_claims.fold(val, |running, ra_claim| running * ra_claim)
+    }
+
+    fn cache_openings(
+        &self,
+        accumulator: &mut VerifierOpeningAccumulator<F>,
+        transcript: &mut T,
+        sumcheck_challenges: &[F::Challenge],
+    ) {
+        let (r_address_point, _) = accumulator.get_virtual_polynomial_opening(
+            VirtualPolynomial::BytecodeReadRafAddrClaim,
+            SumcheckId::BytecodeReadRafAddressPhase,
+        );
+        let mut r_address_le = r_address_point.r;
+        r_address_le.reverse(); // stored reversed by the address phase; restore challenge order
+        let mut full_challenges = r_address_le; // address-phase challenges first ...
+        full_challenges.extend_from_slice(sumcheck_challenges); // ... then this phase's
+        let opening_point = self.params.normalize_opening_point(&full_challenges);
+        let (r_address, r_cycle) = opening_point.split_at(self.params.log_K);
+
+        let r_address_chunks = self
+            .params
+            .one_hot_params
+            .compute_r_address_chunks::<F>(&r_address.r);
+        // Each BytecodeRa(i) is opened at (r_address_chunks[i] ++ r_cycle)
+        (0..self.params.d).for_each(|i| {
+            let opening_point = [&r_address_chunks[i][..], &r_cycle.r].concat();
+            accumulator.append_sparse(
+                transcript,
+                vec![CommittedPolynomial::BytecodeRa(i)],
+                SumcheckId::BytecodeReadRaf,
+                opening_point,
+            );
+        });
+    }
+}
+
 #[derive(Allocative, Clone)]
 pub struct BytecodeReadRafSumcheckParams<F: JoltField> {
     /// Index `i` stores `gamma^i`.
diff --git a/jolt-core/src/zkvm/claim_reductions/advice.rs b/jolt-core/src/zkvm/claim_reductions/advice.rs
index aef7725cdc..275871e6cc 100644
--- a/jolt-core/src/zkvm/claim_reductions/advice.rs
+++ b/jolt-core/src/zkvm/claim_reductions/advice.rs
@@ -521,11 +521,8 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T> for AdviceClaimRe
     fn round_offset(&self, max_num_rounds: usize) -> usize {
         match self.params.phase {
             ReductionPhase::CycleVariables => {
-                // Align to the *start* of Booleanity's cycle segment, so local rounds correspond
-                // to low Dory column bits in the unified point ordering.
-                let booleanity_rounds = self.params.log_k_chunk + self.params.log_t;
-                let booleanity_offset = max_num_rounds - booleanity_rounds;
-                booleanity_offset + self.params.log_k_chunk
+                // Stage 6b only spans cycle variables; align to the start of the cycle segment.
+                max_num_rounds.saturating_sub(self.params.log_t)
             }
             ReductionPhase::AddressVariables => 0,
         }
@@ -667,11 +664,7 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
     fn round_offset(&self, max_num_rounds: usize) -> usize {
         let params = self.params.borrow();
         match params.phase {
-            ReductionPhase::CycleVariables => {
-                let booleanity_rounds = params.log_k_chunk + params.log_t;
-                let booleanity_offset = max_num_rounds - booleanity_rounds;
-                booleanity_offset + params.log_k_chunk
-            }
+            ReductionPhase::CycleVariables => max_num_rounds.saturating_sub(params.log_t),
             ReductionPhase::AddressVariables => 0,
         }
     }
diff --git a/jolt-core/src/zkvm/proof_serialization.rs b/jolt-core/src/zkvm/proof_serialization.rs
index 9712bd7717..2426b31124 100644
--- a/jolt-core/src/zkvm/proof_serialization.rs
+++ b/jolt-core/src/zkvm/proof_serialization.rs
@@ -36,7 +36,8 @@ pub struct JoltProof<F: JoltField, PCS: CommitmentScheme<Field = F>, FS: Transcr
     pub stage3_sumcheck_proof: SumcheckInstanceProof<F, FS>,
     pub stage4_sumcheck_proof: SumcheckInstanceProof<F, FS>,
     pub stage5_sumcheck_proof: SumcheckInstanceProof<F, FS>,
-    pub stage6_sumcheck_proof: SumcheckInstanceProof<F, FS>,
+    pub stage6a_sumcheck_proof: SumcheckInstanceProof<F, FS>,
+    pub stage6b_sumcheck_proof: SumcheckInstanceProof<F, FS>,
     pub stage7_sumcheck_proof: SumcheckInstanceProof<F, FS>,
     pub joint_opening_proof: PCS::Proof,
     pub untrusted_advice_commitment: Option<PCS::Commitment>,
@@ -365,6 +366,15 @@ impl CanonicalSerialize for VirtualPolynomial {
                 40u8.serialize_with_mode(&mut writer, compress)?;
                 (u8::try_from(*flag).unwrap()).serialize_with_mode(&mut writer, compress)
             }
+            Self::BytecodeValStage(stage) => {
+                41u8.serialize_with_mode(&mut writer, compress)?;
+                (u8::try_from(*stage).unwrap()).serialize_with_mode(&mut writer, compress)
+            }
+            Self::BytecodeReadRafAddrClaim => 42u8.serialize_with_mode(&mut writer, compress),
+            Self::BooleanityAddrClaim => 43u8.serialize_with_mode(&mut writer, compress),
+            Self::BytecodeClaimReductionIntermediate => {
+                44u8.serialize_with_mode(&mut writer, compress)
+            }
         }
     }
 
@@ -406,11 +416,15 @@ impl CanonicalSerialize for VirtualPolynomial {
             | Self::RamValInit
             | Self::RamValFinal
             | Self::RamHammingWeight
-            | Self::UnivariateSkip => 1,
+            | Self::UnivariateSkip
+            | Self::BytecodeReadRafAddrClaim
+            | Self::BooleanityAddrClaim
+            | Self::BytecodeClaimReductionIntermediate => 1,
             Self::InstructionRa(_)
             | Self::OpFlags(_)
             | Self::InstructionFlags(_)
-            | Self::LookupTableFlag(_) => 2,
+            | Self::LookupTableFlag(_)
+            | Self::BytecodeValStage(_) => 2,
         }
     }
 }
@@ -486,6 +500,13 @@ impl CanonicalDeserialize for VirtualPolynomial {
                     let flag = u8::deserialize_with_mode(&mut reader, compress, validate)?;
                     Self::LookupTableFlag(flag as usize)
                 }
+                41 => {
+                    let stage = u8::deserialize_with_mode(&mut reader, compress, validate)?;
+                    Self::BytecodeValStage(stage as usize)
+                }
+                42 => Self::BytecodeReadRafAddrClaim,
+                43 => Self::BooleanityAddrClaim,
+                44 => Self::BytecodeClaimReductionIntermediate,
                 _ => return Err(SerializationError::InvalidData),
             },
         )
diff --git a/jolt-core/src/zkvm/prover.rs b/jolt-core/src/zkvm/prover.rs
index 814ff22bbb..aeddfd54d2 100644
--- a/jolt-core/src/zkvm/prover.rs
+++ b/jolt-core/src/zkvm/prover.rs
@@ -41,7 +41,10 @@ use crate::{
     },
     pprof_scope,
     subprotocols::{
-        booleanity::{BooleanitySumcheckParams, BooleanitySumcheckProver},
+        booleanity::{
+            BooleanityAddressSumcheckProver, BooleanityCycleSumcheckProver,
+            BooleanitySumcheckParams,
+        },
         sumcheck::{BatchedSumcheck, SumcheckInstanceProof},
         sumcheck_prover::SumcheckInstanceProver,
         univariate_skip::{prove_uniskip_round, UniSkipFirstRoundProof},
@@ -96,7 +99,9 @@ use crate::{
 use crate::{
     poly::commitment::commitment_scheme::CommitmentScheme,
     zkvm::{
-        bytecode::read_raf_checking::BytecodeReadRafSumcheckProver,
+        bytecode::read_raf_checking::{
+            BytecodeReadRafAddressSumcheckProver, BytecodeReadRafCycleSumcheckProver,
+        },
         fiat_shamir_preamble,
         instruction_lookups::{
             ra_virtual::InstructionRaSumcheckProver as LookupsRaSumcheckProver,
@@ -153,6 +158,10 @@ pub struct JoltCpuProver<
     /// The advice claim reduction sumcheck effectively spans two stages (6 and 7).
     /// Cache the prover state here between stages.
     advice_reduction_prover_untrusted: Option<AdviceClaimReductionProver<F>>,
+    /// BytecodeReadRaf spans Stage 6a (address) and Stage 6b (cycle).
+    bytecode_read_raf_cycle_prover: Option<BytecodeReadRafCycleSumcheckProver<F>>,
+    /// Booleanity spans Stage 6a (address) and Stage 6b (cycle).
+    booleanity_cycle_prover: Option<BooleanityCycleSumcheckProver<F>>,
     pub unpadded_trace_len: usize,
     pub padded_trace_len: usize,
     pub transcript: ProofTranscript,
@@ -402,6 +411,8 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             },
             advice_reduction_prover_trusted: None,
             advice_reduction_prover_untrusted: None,
+            bytecode_read_raf_cycle_prover: None,
+            booleanity_cycle_prover: None,
             unpadded_trace_len,
             padded_trace_len,
             transcript,
@@ -454,7 +465,8 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let stage3_sumcheck_proof = self.prove_stage3();
         let stage4_sumcheck_proof = self.prove_stage4();
         let stage5_sumcheck_proof = self.prove_stage5();
-        let stage6_sumcheck_proof = self.prove_stage6();
+        let stage6a_sumcheck_proof = self.prove_stage6a();
+        let stage6b_sumcheck_proof = self.prove_stage6b();
         let stage7_sumcheck_proof = self.prove_stage7();
 
         let joint_opening_proof = self.prove_stage8(opening_proof_hints);
@@ -489,7 +501,8 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             stage3_sumcheck_proof,
             stage4_sumcheck_proof,
             stage5_sumcheck_proof,
-            stage6_sumcheck_proof,
+            stage6a_sumcheck_proof,
+            stage6b_sumcheck_proof,
             stage7_sumcheck_proof,
             joint_opening_proof,
             trace_length: self.trace.len(),
@@ -1070,9 +1083,9 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
     }
 
     #[tracing::instrument(skip_all)]
-    fn prove_stage6(&mut self) -> SumcheckInstanceProof<F, ProofTranscript> {
+    fn prove_stage6a(&mut self) -> SumcheckInstanceProof<F, ProofTranscript> {
         #[cfg(not(target_arch = "wasm32"))]
-        print_current_memory_usage("Stage 6 baseline");
+        print_current_memory_usage("Stage 6a baseline");
 
         let bytecode_read_raf_params = BytecodeReadRafSumcheckParams::gen(
             &self.preprocessing.shared.bytecode,
@@ -1082,9 +1095,6 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             &mut self.transcript,
         );
 
-        let ram_hamming_booleanity_params =
-            HammingBooleanitySumcheckParams::new(&self.opening_accumulator);
-
         let booleanity_params = BooleanitySumcheckParams::new(
             self.trace.len().log_2(),
             &self.one_hot_params,
@@ -1092,6 +1102,55 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             &mut self.transcript,
         );
 
+        let mut bytecode_read_raf = BytecodeReadRafAddressSumcheckProver::initialize(
+            bytecode_read_raf_params,
+            Arc::clone(&self.trace),
+            Arc::clone(&self.preprocessing.shared.bytecode),
+        );
+        let mut booleanity = BooleanityAddressSumcheckProver::initialize(
+            booleanity_params,
+            &self.trace,
+            &self.preprocessing.shared.bytecode,
+            &self.program_io.memory_layout,
+        );
+
+        #[cfg(feature = "allocative")]
+        {
+            print_data_structure_heap_usage(
+                "BytecodeReadRafAddressSumcheckProver",
+                &bytecode_read_raf,
+            );
+            print_data_structure_heap_usage("BooleanityAddressSumcheckProver", &booleanity);
+        }
+
+        let mut instances: Vec<&mut dyn SumcheckInstanceProver<_, _>> =
+            vec![&mut bytecode_read_raf, &mut booleanity];
+
+        #[cfg(feature = "allocative")]
+        write_instance_flamegraph_svg(&instances, "stage6a_start_flamechart.svg");
+        tracing::info!("Stage 6a proving");
+        let (sumcheck_proof, _r_stage6a) = BatchedSumcheck::prove(
+            instances.iter_mut().map(|v| &mut **v as _).collect(),
+            &mut self.opening_accumulator,
+            &mut self.transcript,
+        );
+        #[cfg(feature = "allocative")]
+        write_instance_flamegraph_svg(&instances, "stage6a_end_flamechart.svg");
+
+        self.bytecode_read_raf_cycle_prover = Some(bytecode_read_raf.into_cycle_prover());
+        self.booleanity_cycle_prover = Some(booleanity.into_cycle_prover());
+
+        sumcheck_proof
+    }
+
+    #[tracing::instrument(skip_all)]
+    fn prove_stage6b(&mut self) -> SumcheckInstanceProof<F, ProofTranscript> {
+        #[cfg(not(target_arch = "wasm32"))]
+        print_current_memory_usage("Stage 6b baseline");
+
+        let ram_hamming_booleanity_params =
+            HammingBooleanitySumcheckParams::new(&self.opening_accumulator);
+
         let ram_ra_virtual_params = RamRaVirtualParams::new(
             self.trace.len(),
             &self.one_hot_params,
@@ -1108,7 +1167,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             &mut self.transcript,
         );
 
-        // Advice claim reduction (Phase 1 in Stage 6): trusted and untrusted are separate instances.
+        // Advice claim reduction (Phase 1 in Stage 6b): trusted and untrusted are separate instances.
         if self.advice.trusted_advice_polynomial.is_some() {
             let trusted_advice_params = AdviceClaimReductionParams::new(
                 AdviceKind::Trusted,
@@ -1159,21 +1218,17 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             };
         }
 
-        let mut bytecode_read_raf = BytecodeReadRafSumcheckProver::initialize(
-            bytecode_read_raf_params,
-            Arc::clone(&self.trace),
-            Arc::clone(&self.preprocessing.shared.bytecode),
-        );
+        let mut bytecode_read_raf = self
+            .bytecode_read_raf_cycle_prover
+            .take()
+            .expect("Stage 6b missing BytecodeReadRaf cycle prover");
+        let mut booleanity = self
+            .booleanity_cycle_prover
+            .take()
+            .expect("Stage 6b missing Booleanity cycle prover");
         let mut ram_hamming_booleanity =
             HammingBooleanitySumcheckProver::initialize(ram_hamming_booleanity_params, &self.trace);
 
-        let mut booleanity = BooleanitySumcheckProver::initialize(
-            booleanity_params,
-            &self.trace,
-            &self.preprocessing.shared.bytecode,
-            &self.program_io.memory_layout,
-        );
-
         let mut ram_ra_virtual = RamRaVirtualSumcheckProver::initialize(
             ram_ra_virtual_params,
             &self.trace,
@@ -1187,12 +1242,15 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
 
         #[cfg(feature = "allocative")]
         {
-            print_data_structure_heap_usage("BytecodeReadRafSumcheckProver", &bytecode_read_raf);
+            print_data_structure_heap_usage(
+                "BytecodeReadRafCycleSumcheckProver",
+                &bytecode_read_raf,
+            );
             print_data_structure_heap_usage(
                 "ram HammingBooleanitySumcheckProver",
                 &ram_hamming_booleanity,
             );
-            print_data_structure_heap_usage("BooleanitySumcheckProver", &booleanity);
+            print_data_structure_heap_usage("BooleanityCycleSumcheckProver", &booleanity);
             print_data_structure_heap_usage("RamRaSumcheckProver", &ram_ra_virtual);
             print_data_structure_heap_usage("LookupsRaSumcheckProver", &lookups_ra_virtual);
             print_data_structure_heap_usage("IncClaimReductionSumcheckProver", &inc_reduction);
@@ -1220,15 +1278,15 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         }
 
         #[cfg(feature = "allocative")]
-        write_instance_flamegraph_svg(&instances, "stage6_start_flamechart.svg");
-        tracing::info!("Stage 6 proving");
-        let (sumcheck_proof, _r_stage6) = BatchedSumcheck::prove(
+        write_instance_flamegraph_svg(&instances, "stage6b_start_flamechart.svg");
+        tracing::info!("Stage 6b proving");
+        let (sumcheck_proof, _r_stage6b) = BatchedSumcheck::prove(
             instances.iter_mut().map(|v| &mut **v as _).collect(),
             &mut self.opening_accumulator,
             &mut self.transcript,
         );
         #[cfg(feature = "allocative")]
-        write_instance_flamegraph_svg(&instances, "stage6_end_flamechart.svg");
+        write_instance_flamegraph_svg(&instances, "stage6b_end_flamechart.svg");
         drop_in_background_thread(bytecode_read_raf);
         drop_in_background_thread(ram_hamming_booleanity);
         drop_in_background_thread(booleanity);
diff --git a/jolt-core/src/zkvm/verifier.rs b/jolt-core/src/zkvm/verifier.rs
index bfad57eafd..b33878048a 100644
--- a/jolt-core/src/zkvm/verifier.rs
+++ b/jolt-core/src/zkvm/verifier.rs
@@ -18,7 +18,9 @@ use crate::zkvm::ram::RAMPreprocessing;
 use crate::zkvm::witness::all_committed_polynomials;
 use crate::zkvm::Serializable;
 use crate::zkvm::{
-    bytecode::read_raf_checking::BytecodeReadRafSumcheckVerifier,
+    bytecode::read_raf_checking::{
+        BytecodeReadRafAddressSumcheckVerifier, BytecodeReadRafCycleSumcheckVerifier,
+    },
     claim_reductions::{
         AdviceClaimReductionVerifier, AdviceKind, HammingWeightClaimReductionVerifier,
         IncClaimReductionSumcheckVerifier, InstructionLookupsClaimReductionSumcheckVerifier,
@@ -58,7 +60,10 @@ use crate::{
     },
     pprof_scope,
     subprotocols::{
-        booleanity::{BooleanitySumcheckParams, BooleanitySumcheckVerifier},
+        booleanity::{
+            BooleanityAddressSumcheckVerifier, BooleanityCycleSumcheckVerifier,
+            BooleanitySumcheckParams,
+        },
         sumcheck_verifier::SumcheckInstanceVerifier,
     },
     transcripts::Transcript,
@@ -90,6 +95,10 @@ pub struct JoltVerifier<
     /// The advice claim reduction sumcheck effectively spans two stages (6 and 7).
     /// Cache the verifier state here between stages.
     advice_reduction_verifier_untrusted: Option<AdviceClaimReductionVerifier<F>>,
+    /// BytecodeReadRaf spans Stage 6a (address) and Stage 6b (cycle).
+    bytecode_read_raf_cycle_verifier: Option<BytecodeReadRafCycleSumcheckVerifier<F>>,
+    /// Booleanity spans Stage 6a (address) and Stage 6b (cycle).
+    booleanity_cycle_verifier: Option<BooleanityCycleSumcheckVerifier<F>>,
     pub spartan_key: UniformSpartanKey<F>,
     pub one_hot_params: OneHotParams,
 }
@@ -171,6 +180,8 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             opening_accumulator,
             advice_reduction_verifier_trusted: None,
             advice_reduction_verifier_untrusted: None,
+            bytecode_read_raf_cycle_verifier: None,
+            booleanity_cycle_verifier: None,
             spartan_key,
             one_hot_params,
         })
@@ -207,7 +218,8 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
         self.verify_stage3()?;
         self.verify_stage4()?;
         self.verify_stage5()?;
-        self.verify_stage6()?;
+        self.verify_stage6a()?;
+        self.verify_stage6b()?;
         self.verify_stage7()?;
         self.verify_stage8()?;
 
@@ -406,26 +418,51 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
         Ok(())
     }
 
-    fn verify_stage6(&mut self) -> Result<(), anyhow::Error> {
+    fn verify_stage6a(&mut self) -> Result<(), anyhow::Error> {
         let n_cycle_vars = self.proof.trace_length.log_2();
-        let bytecode_read_raf = BytecodeReadRafSumcheckVerifier::gen(
+        let bytecode_read_raf = BytecodeReadRafAddressSumcheckVerifier::new(
             &self.preprocessing.shared.bytecode,
             n_cycle_vars,
             &self.one_hot_params,
             &self.opening_accumulator,
             &mut self.transcript,
         );
-
-        let ram_hamming_booleanity =
-            HammingBooleanitySumcheckVerifier::new(&self.opening_accumulator);
         let booleanity_params = BooleanitySumcheckParams::new(
             n_cycle_vars,
             &self.one_hot_params,
             &self.opening_accumulator,
             &mut self.transcript,
         );
+        let booleanity = BooleanityAddressSumcheckVerifier::new(booleanity_params);
+
+        let instances: Vec<&dyn SumcheckInstanceVerifier<F, ProofTranscript>> =
+            vec![&bytecode_read_raf, &booleanity];
+
+        let _r_stage6a = BatchedSumcheck::verify(
+            &self.proof.stage6a_sumcheck_proof,
+            instances,
+            &mut self.opening_accumulator,
+            &mut self.transcript,
+        )
+        .context("Stage 6a")?;
+
+        self.bytecode_read_raf_cycle_verifier = Some(bytecode_read_raf.into_cycle_verifier());
+        self.booleanity_cycle_verifier = Some(booleanity.into_cycle_verifier());
+
+        Ok(())
+    }
 
-        let booleanity = BooleanitySumcheckVerifier::new(booleanity_params);
+    fn verify_stage6b(&mut self) -> Result<(), anyhow::Error> {
+        let bytecode_read_raf = self
+            .bytecode_read_raf_cycle_verifier
+            .take()
+            .expect("Stage 6b missing BytecodeReadRaf cycle verifier");
+        let booleanity = self
+            .booleanity_cycle_verifier
+            .take()
+            .expect("Stage 6b missing Booleanity cycle verifier");
+        let ram_hamming_booleanity =
+            HammingBooleanitySumcheckVerifier::new(&self.opening_accumulator);
         let ram_ra_virtual = RamRaVirtualSumcheckVerifier::new(
             self.proof.trace_length,
             &self.one_hot_params,
@@ -443,7 +480,7 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             &mut self.transcript,
         );
 
-        // Advice claim reduction (Phase 1 in Stage 6): trusted and untrusted are separate instances.
+        // Advice claim reduction (Phase 1 in Stage 6b): trusted and untrusted are separate instances.
         if self.trusted_advice_commitment.is_some() {
             self.advice_reduction_verifier_trusted = Some(AdviceClaimReductionVerifier::new(
                 AdviceKind::Trusted,
@@ -484,13 +521,13 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             instances.push(advice);
         }
 
-        let _r_stage6 = BatchedSumcheck::verify(
-            &self.proof.stage6_sumcheck_proof,
+        let _r_stage6b = BatchedSumcheck::verify(
+            &self.proof.stage6b_sumcheck_proof,
             instances,
             &mut self.opening_accumulator,
             &mut self.transcript,
         )
-        .context("Stage 6")?;
+        .context("Stage 6b")?;
 
         Ok(())
     }
diff --git a/jolt-core/src/zkvm/witness.rs b/jolt-core/src/zkvm/witness.rs
index efcef73652..c661f3a708 100644
--- a/jolt-core/src/zkvm/witness.rs
+++ b/jolt-core/src/zkvm/witness.rs
@@ -271,4 +271,8 @@ pub enum VirtualPolynomial {
     OpFlags(CircuitFlags),
     InstructionFlags(InstructionFlags),
     LookupTableFlag(usize),
+    BytecodeValStage(usize),
+    BytecodeReadRafAddrClaim,
+    BooleanityAddrClaim,
+    BytecodeClaimReductionIntermediate,
 }

From dcd9481d3b060dcffc8c32ab3f85ad51671c0718 Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Mon, 19 Jan 2026 08:40:50 -0800
Subject: [PATCH 02/16] refactor: separate Address/Cycle provers into
 independent structs

- BooleanityAddressSumcheckProver: now has its own state (B, G, F, gamma_powers)
- BooleanityCycleSumcheckProver: now has its own state (D, H, eq_r_r, gamma_powers)
- BytecodeReadRafAddressSumcheckProver: now has its own state (F, val_polys, int_poly)
- BytecodeReadRafCycleSumcheckProver: now has its own state (ra, gruen_eq_polys, bound_val_evals)

The into_cycle_prover() method now transfers only the necessary state rather than
wrapping an inner shared struct. This makes the separation cleaner and prepares
for potential future changes where the two phases might diverge further.
---
 jolt-core/src/subprotocols/booleanity.rs      | 266 ++++++++++-
 .../src/zkvm/bytecode/read_raf_checking.rs    | 425 ++++++++++++++++--
 2 files changed, 647 insertions(+), 44 deletions(-)

diff --git a/jolt-core/src/subprotocols/booleanity.rs b/jolt-core/src/subprotocols/booleanity.rs
index 329e80f622..53bb5a859e 100644
--- a/jolt-core/src/subprotocols/booleanity.rs
+++ b/jolt-core/src/subprotocols/booleanity.rs
@@ -491,29 +491,189 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T> for BooleanitySum
     }
 }
 
+/// Booleanity Address-Phase Sumcheck Prover.
+///
+/// This prover handles only the first `log_k_chunk` rounds (address variables).
+/// After completion, call `into_cycle_prover()` to get the cycle-phase prover.
 #[derive(Allocative)]
 pub struct BooleanityAddressSumcheckProver<F: JoltField> {
-    inner: BooleanitySumcheckProver<F>,
+    /// B: split-eq over address-chunk variables (LowToHigh).
+    B: GruenSplitEqPolynomial<F>,
+    /// G[i][k] = Σ_j eq(r_cycle, j) · ra_i(k, j) for all RA polynomials
+    G: Vec<Vec<F>>,
+    /// F: Expanding table for address phase
+    F: ExpandingTable<F>,
+    /// Per-polynomial powers γ^i (in the base field).
+    gamma_powers: Vec<F>,
+    /// RA indices (non-transposed, one per cycle)
+    ra_indices: Vec<RaIndices>,
+    /// Last round polynomial for claim computation
     last_round_poly: Option<UniPoly<F>>,
+    /// Final claim after binding all address variables
     address_claim: Option<F>,
+
+    // State that will be transferred to cycle prover
+    /// D: split-eq over time/cycle variables (LowToHigh).
+    D: GruenSplitEqPolynomial<F>,
+    /// Per-polynomial inverse powers γ^{-i} (in the base field).
+    gamma_powers_inv: Vec<F>,
+    /// Parameters (shared with cycle prover)
+    pub params: BooleanitySumcheckParams<F>,
 }
 
 impl<F: JoltField> BooleanityAddressSumcheckProver<F> {
+    /// Initialize a BooleanityAddressSumcheckProver.
+    ///
+    /// Computes G polynomials and RA indices in a single pass over the trace.
+    #[tracing::instrument(skip_all, name = "BooleanityAddressSumcheckProver::initialize")]
     pub fn initialize(
         params: BooleanitySumcheckParams<F>,
         trace: &[Cycle],
         bytecode: &BytecodePreprocessing,
         memory_layout: &MemoryLayout,
     ) -> Self {
+        // Compute G and RA indices in a single pass over the trace
+        let (G, ra_indices) = compute_all_G_and_ra_indices::<F>(
+            trace,
+            bytecode,
+            memory_layout,
+            &params.one_hot_params,
+            &params.r_cycle,
+        );
+
+        // Initialize split-eq polynomials for address and cycle variables
+        let B = GruenSplitEqPolynomial::new(&params.r_address, BindingOrder::LowToHigh);
+        let D = GruenSplitEqPolynomial::new(&params.r_cycle, BindingOrder::LowToHigh);
+
+        // Initialize expanding table for address phase
+        let k_chunk = 1 << params.log_k_chunk;
+        let mut F_table = ExpandingTable::new(k_chunk, BindingOrder::LowToHigh);
+        F_table.reset(F::one());
+
+        // Compute prover-only fields: gamma_powers (γ^i) and gamma_powers_inv (γ^{-i})
+        let num_polys = params.polynomial_types.len();
+        let gamma_f: F = params.gamma.into();
+        let mut gamma_powers = Vec::with_capacity(num_polys);
+        let mut gamma_powers_inv = Vec::with_capacity(num_polys);
+        let mut rho_i = F::one();
+        for _ in 0..num_polys {
+            gamma_powers.push(rho_i);
+            gamma_powers_inv.push(
+                rho_i
+                    .inverse()
+                    .expect("gamma_powers[i] is nonzero (gamma != 0)"),
+            );
+            rho_i *= gamma_f;
+        }
+
         Self {
-            inner: BooleanitySumcheckProver::initialize(params, trace, bytecode, memory_layout),
+            B,
+            G,
+            F: F_table,
+            gamma_powers,
+            ra_indices,
             last_round_poly: None,
             address_claim: None,
+            D,
+            gamma_powers_inv,
+            params,
         }
     }
 
-    pub fn into_cycle_prover(self) -> BooleanityCycleSumcheckProver<F> {
-        BooleanityCycleSumcheckProver { inner: self.inner }
+    /// Transform into the cycle-phase prover, transferring necessary state.
+    pub fn into_cycle_prover(mut self) -> BooleanityCycleSumcheckProver<F> {
+        // Compute eq_r_r from B's final state
+        let eq_r_r = self.B.get_current_scalar();
+
+        // Initialize SharedRaPolynomials with per-poly pre-scaled eq tables (by rho_i)
+        let F_table = std::mem::take(&mut self.F);
+        let ra_indices = std::mem::take(&mut self.ra_indices);
+        let base_eq = F_table.clone_values();
+        let num_polys = self.params.polynomial_types.len();
+        let tables: Vec<Vec<F>> = (0..num_polys)
+            .into_par_iter()
+            .map(|i| {
+                let rho = self.gamma_powers[i];
+                base_eq.iter().map(|v| rho * *v).collect()
+            })
+            .collect();
+        let H = SharedRaPolynomials::new(tables, ra_indices, self.params.one_hot_params.clone());
+
+        // Drop G arrays in background
+        let g = std::mem::take(&mut self.G);
+        drop_in_background_thread(g);
+
+        BooleanityCycleSumcheckProver {
+            D: self.D,
+            H,
+            eq_r_r,
+            gamma_powers: self.gamma_powers,
+            gamma_powers_inv: self.gamma_powers_inv,
+            params: self.params,
+        }
+    }
+
+    fn compute_message_impl(&self, round: usize, previous_claim: F) -> UniPoly<F> {
+        let m = round + 1;
+        let B = &self.B;
+        let N = self.params.polynomial_types.len();
+
+        // Compute quadratic coefficients via generic split-eq fold
+        let quadratic_coeffs: [F; DEGREE_BOUND - 1] = B
+            .par_fold_out_in_unreduced::<9, { DEGREE_BOUND - 1 }>(&|k_prime| {
+                let coeffs = (0..N)
+                    .into_par_iter()
+                    .map(|i| {
+                        let G_i = &self.G[i];
+                        let inner_sum = G_i[k_prime << m..(k_prime + 1) << m]
+                            .par_iter()
+                            .enumerate()
+                            .map(|(k, &G_k)| {
+                                let k_m = k >> (m - 1);
+                                let F_k = self.F[k & ((1 << (m - 1)) - 1)];
+                                let G_times_F = G_k * F_k;
+
+                                let eval_infty = G_times_F * F_k;
+                                let eval_0 = if k_m == 0 {
+                                    eval_infty - G_times_F
+                                } else {
+                                    F::zero()
+                                };
+                                [eval_0, eval_infty]
+                            })
+                            .fold_with(
+                                [F::Unreduced::<5>::zero(); DEGREE_BOUND - 1],
+                                |running, new| {
+                                    [
+                                        running[0] + new[0].as_unreduced_ref(),
+                                        running[1] + new[1].as_unreduced_ref(),
+                                    ]
+                                },
+                            )
+                            .reduce(
+                                || [F::Unreduced::zero(); DEGREE_BOUND - 1],
+                                |running, new| [running[0] + new[0], running[1] + new[1]],
+                            );
+
+                        let gamma_2i = self.params.gamma_powers_square[i];
+                        [
+                            gamma_2i * F::from_barrett_reduce(inner_sum[0]),
+                            gamma_2i * F::from_barrett_reduce(inner_sum[1]),
+                        ]
+                    })
+                    .reduce(
+                        || [F::zero(); DEGREE_BOUND - 1],
+                        |running, new| [running[0] + new[0], running[1] + new[1]],
+                    );
+                coeffs
+            });
+
+        B.gruen_poly_deg_3(quadratic_coeffs[0], quadratic_coeffs[1], previous_claim)
+    }
+
+    fn ingest_challenge_impl(&mut self, r_j: F::Challenge) {
+        self.B.bind(r_j);
+        self.F.update(r_j);
     }
 }
 
@@ -521,19 +681,19 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
     for BooleanityAddressSumcheckProver<F>
 {
     fn degree(&self) -> usize {
-        self.inner.params.degree()
+        self.params.degree()
     }
 
     fn num_rounds(&self) -> usize {
-        self.inner.params.log_k_chunk
+        self.params.log_k_chunk
     }
 
-    fn input_claim(&self, accumulator: &ProverOpeningAccumulator<F>) -> F {
-        self.inner.params.input_claim(accumulator)
+    fn input_claim(&self, _accumulator: &ProverOpeningAccumulator<F>) -> F {
+        self.params.input_claim(_accumulator)
     }
 
     fn compute_message(&mut self, round: usize, previous_claim: F) -> UniPoly<F> {
-        let poly = self.inner.compute_phase1_message(round, previous_claim);
+        let poly = self.compute_message_impl(round, previous_claim);
         self.last_round_poly = Some(poly.clone());
         poly
     }
@@ -541,11 +701,11 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
     fn ingest_challenge(&mut self, r_j: F::Challenge, round: usize) {
         if let Some(poly) = self.last_round_poly.take() {
             let claim = poly.evaluate(&r_j);
-            if round == self.inner.params.log_k_chunk - 1 {
+            if round == self.params.log_k_chunk - 1 {
                 self.address_claim = Some(claim);
             }
         }
-        self.inner.ingest_address_challenge(r_j, round)
+        self.ingest_challenge_impl(r_j);
     }
 
     fn cache_openings(
@@ -575,20 +735,75 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
     }
 }
 
+/// Booleanity Cycle-Phase Sumcheck Prover.
+///
+/// This prover handles the remaining `log_t` rounds (cycle variables).
+/// It is constructed from `BooleanityAddressSumcheckProver::into_cycle_prover()`.
 #[derive(Allocative)]
 pub struct BooleanityCycleSumcheckProver<F: JoltField> {
-    inner: BooleanitySumcheckProver<F>,
+    /// D: split-eq over time/cycle variables (LowToHigh).
+    D: GruenSplitEqPolynomial<F>,
+    /// Shared H polynomials (RA polys bound over address; polynomial i is pre-scaled by γ^i).
+    H: SharedRaPolynomials<F>,
+    /// eq(params.r_address, r'), where r' is the vector of address-phase sumcheck challenges.
+    eq_r_r: F,
+    /// Per-polynomial powers γ^i (in the base field).
+    gamma_powers: Vec<F>,
+    /// Per-polynomial inverse powers γ^{-i} (in the base field).
+    gamma_powers_inv: Vec<F>,
+    /// Parameters
+    pub params: BooleanitySumcheckParams<F>,
+}
+
+impl<F: JoltField> BooleanityCycleSumcheckProver<F> {
+    fn compute_message_impl(&self, previous_claim: F) -> UniPoly<F> {
+        let D = &self.D;
+        let H = &self.H;
+        let num_polys = H.num_polys();
+
+        // Compute quadratic coefficients via generic split-eq fold
+        let quadratic_coeffs: [F; DEGREE_BOUND - 1] = D
+            .par_fold_out_in_unreduced::<9, { DEGREE_BOUND - 1 }>(&|j_prime| {
+                let mut acc_c = F::Unreduced::<9>::zero();
+                let mut acc_e = F::Unreduced::<9>::zero();
+                for i in 0..num_polys {
+                    let h_0 = H.get_bound_coeff(i, 2 * j_prime);
+                    let h_1 = H.get_bound_coeff(i, 2 * j_prime + 1);
+                    let b = h_1 - h_0;
+
+                    let rho = self.gamma_powers[i];
+                    acc_c += h_0.mul_unreduced::<9>(h_0 - rho);
+                    acc_e += b.mul_unreduced::<9>(b);
+                }
+                [
+                    F::from_montgomery_reduce::<9>(acc_c),
+                    F::from_montgomery_reduce::<9>(acc_e),
+                ]
+            });
+
+        // Divide out eq_r_r before building the Gruen polynomial; it is multiplied back below.
+        let adjusted_claim = previous_claim * self.eq_r_r.inverse().unwrap();
+        let gruen_poly =
+            D.gruen_poly_deg_3(quadratic_coeffs[0], quadratic_coeffs[1], adjusted_claim);
+
+        gruen_poly * self.eq_r_r
+    }
+
+    fn ingest_challenge_impl(&mut self, r_j: F::Challenge) {
+        self.D.bind(r_j);
+        self.H.bind_in_place(r_j, BindingOrder::LowToHigh);
+    }
 }
 
 impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
     for BooleanityCycleSumcheckProver<F>
 {
     fn degree(&self) -> usize {
-        self.inner.params.degree()
+        self.params.degree()
     }
 
     fn num_rounds(&self) -> usize {
-        self.inner.params.log_t
+        self.params.log_t
     }
 
     fn input_claim(&self, accumulator: &ProverOpeningAccumulator<F>) -> F {
@@ -600,12 +815,12 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
             .1
     }
 
-    fn compute_message(&mut self, round: usize, previous_claim: F) -> UniPoly<F> {
-        self.inner.compute_phase2_message(round, previous_claim)
+    fn compute_message(&mut self, _round: usize, previous_claim: F) -> UniPoly<F> {
+        self.compute_message_impl(previous_claim)
     }
 
     fn ingest_challenge(&mut self, r_j: F::Challenge, _round: usize) {
-        self.inner.ingest_cycle_challenge(r_j)
+        self.ingest_challenge_impl(r_j)
     }
 
     fn cache_openings(
@@ -622,8 +837,21 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
         r_address_le.reverse();
         let mut full_challenges = r_address_le;
         full_challenges.extend_from_slice(sumcheck_challenges);
-        let inner: &dyn SumcheckInstanceProver<F, T> = &self.inner;
-        inner.cache_openings(accumulator, transcript, &full_challenges);
+        let opening_point = self.params.normalize_opening_point(&full_challenges);
+
+        // H is scaled by rho_i; unscale so cached openings match the committed polynomials.
+        let claims: Vec<F> = (0..self.H.num_polys())
+            .map(|i| self.H.final_sumcheck_claim(i) * self.gamma_powers_inv[i])
+            .collect();
+
+        accumulator.append_sparse(
+            transcript,
+            self.params.polynomial_types.clone(),
+            SumcheckId::Booleanity,
+            opening_point.r[..self.params.log_k_chunk].to_vec(),
+            opening_point.r[self.params.log_k_chunk..].to_vec(),
+            claims,
+        );
     }
 
     #[cfg(feature = "allocative")]
diff --git a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
index f25d4ff99e..f3128469c6 100644
--- a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
+++ b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
@@ -600,24 +600,288 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
     }
 }
 
+/// Bytecode Read+RAF Address-Phase Sumcheck Prover.
+///
+/// This prover handles only the first `log_K` rounds (address variables).
+/// After completion, call `into_cycle_prover()` to get the cycle-phase prover.
 #[derive(Allocative)]
 pub struct BytecodeReadRafAddressSumcheckProver<F: JoltField> {
-    inner: BytecodeReadRafSumcheckProver<F>,
+    /// Per-stage address MLEs F_i(k) built from eq(r_cycle_stage_i, (chunk_index, j)).
+    F: [MultilinearPolynomial<F>; N_STAGES],
+    /// Binding challenges for the first log_K variables.
+    r_address_prime: Vec<F::Challenge>,
+    /// Previous-round claims s_i(0)+s_i(1) per stage.
+    prev_round_claims: [F; N_STAGES],
+    /// Round polynomials per stage for advancing to the next claim.
+    prev_round_polys: Option<[UniPoly<F>; N_STAGES]>,
+    /// Trace for computing RA polynomials during transition.
+    #[allocative(skip)]
+    trace: Arc<Vec<Cycle>>,
+    /// Bytecode preprocessing for computing PCs.
+    #[allocative(skip)]
+    bytecode_preprocessing: Arc<BytecodePreprocessing>,
+
+    // State transferred to cycle prover
+    /// Per-stage Gruen-split eq polynomials over cycle vars.
+    gruen_eq_polys: [GruenSplitEqPolynomial<F>; N_STAGES],
+    /// Parameters (shared with cycle prover).
+    pub params: BytecodeReadRafSumcheckParams<F>,
 }
 
 impl<F: JoltField> BytecodeReadRafAddressSumcheckProver<F> {
+    /// Initialize a BytecodeReadRafAddressSumcheckProver.
+    #[tracing::instrument(skip_all, name = "BytecodeReadRafAddressSumcheckProver::initialize")]
     pub fn initialize(
         params: BytecodeReadRafSumcheckParams<F>,
         trace: Arc<Vec<Cycle>>,
         bytecode_preprocessing: Arc<BytecodePreprocessing>,
     ) -> Self {
+        let claim_per_stage = [
+            params.rv_claims[0] + params.gamma_powers[5] * params.raf_claim,
+            params.rv_claims[1],
+            params.rv_claims[2] + params.gamma_powers[4] * params.raf_shift_claim,
+            params.rv_claims[3],
+            params.rv_claims[4],
+        ];
+
+        // Two-table split-eq optimization for computing F[stage][k] = Σ_{c: PC(c)=k} eq(r_cycle, c).
+        let T = trace.len();
+        let K = params.K;
+        let log_T = params.log_T;
+
+        let lo_bits = log_T / 2;
+        let hi_bits = log_T - lo_bits;
+        let in_len: usize = 1 << lo_bits;
+        let out_len: usize = 1 << hi_bits;
+
+        let (E_hi, E_lo): ([Vec<F>; N_STAGES], [Vec<F>; N_STAGES]) = rayon::join(
+            || {
+                params
+                    .r_cycles
+                    .each_ref()
+                    .map(|r_cycle| EqPolynomial::evals(&r_cycle[..hi_bits]))
+            },
+            || {
+                params
+                    .r_cycles
+                    .each_ref()
+                    .map(|r_cycle| EqPolynomial::evals(&r_cycle[hi_bits..]))
+            },
+        );
+
+        let num_threads = rayon::current_num_threads();
+        let chunk_size = out_len.div_ceil(num_threads);
+
+        let F_polys: [Vec<F>; N_STAGES] = E_hi[0]
+            .par_chunks(chunk_size)
+            .enumerate()
+            .map(|(chunk_idx, chunk)| {
+                let mut partial: [Vec<F>; N_STAGES] =
+                    array::from_fn(|_| unsafe_allocate_zero_vec(K));
+                let mut inner: [Vec<F>; N_STAGES] = array::from_fn(|_| unsafe_allocate_zero_vec(K));
+                let mut touched = Vec::with_capacity(in_len);
+
+                let chunk_start = chunk_idx * chunk_size;
+                for (local_idx, _) in chunk.iter().enumerate() {
+                    let c_hi = chunk_start + local_idx;
+                    let c_hi_base = c_hi * in_len;
+
+                    for &k in &touched {
+                        for stage in 0..N_STAGES {
+                            inner[stage][k] = F::zero();
+                        }
+                    }
+                    touched.clear();
+
+                    for c_lo in 0..in_len {
+                        let c = c_hi_base + c_lo;
+                        if c >= T {
+                            break;
+                        }
+
+                        let pc = bytecode_preprocessing.get_pc(&trace[c]);
+                        if inner[0][pc].is_zero() {
+                            touched.push(pc);
+                        }
+                        for stage in 0..N_STAGES {
+                            inner[stage][pc] += E_lo[stage][c_lo];
+                        }
+                    }
+
+                    for &k in &touched {
+                        for stage in 0..N_STAGES {
+                            partial[stage][k] += E_hi[stage][c_hi] * inner[stage][k];
+                        }
+                    }
+                }
+                partial
+            })
+            .reduce(
+                || array::from_fn(|_| unsafe_allocate_zero_vec(K)),
+                |mut a, b| {
+                    for stage in 0..N_STAGES {
+                        a[stage]
+                            .par_iter_mut()
+                            .zip(b[stage].par_iter())
+                            .for_each(|(a, b)| *a += *b);
+                    }
+                    a
+                },
+            );
+
+        let F = F_polys.map(MultilinearPolynomial::from);
+        let gruen_eq_polys = params
+            .r_cycles
+            .each_ref()
+            .map(|r_cycle| GruenSplitEqPolynomial::new(r_cycle, BindingOrder::LowToHigh));
+
         Self {
-            inner: BytecodeReadRafSumcheckProver::initialize(params, trace, bytecode_preprocessing),
+            F,
+            r_address_prime: Vec::with_capacity(params.log_K),
+            prev_round_claims: claim_per_stage,
+            prev_round_polys: None,
+            trace,
+            bytecode_preprocessing,
+            gruen_eq_polys,
+            params,
         }
     }
 
-    pub fn into_cycle_prover(self) -> BytecodeReadRafCycleSumcheckProver<F> {
-        BytecodeReadRafCycleSumcheckProver { inner: self.inner }
+    /// Transform into the cycle-phase prover, computing RA polynomials and bound_val_evals.
+    pub fn into_cycle_prover(mut self) -> BytecodeReadRafCycleSumcheckProver<F> {
+        // Compute bound_val_evals from val_polys
+        let int_poly = self.params.int_poly.final_sumcheck_claim();
+        let bound_val_evals: [F; N_STAGES] = self
+            .params
+            .val_polys
+            .iter()
+            .zip([
+                int_poly * self.params.gamma_powers[5],
+                F::zero(),
+                int_poly * self.params.gamma_powers[4],
+                F::zero(),
+                F::zero(),
+            ])
+            .map(|(poly, int_term)| poly.final_sumcheck_claim() + int_term)
+            .collect::<Vec<F>>()
+            .try_into()
+            .unwrap();
+
+        // Reverse r_address_prime to get the correct order
+        let mut r_address = std::mem::take(&mut self.r_address_prime);
+        r_address.reverse();
+
+        let r_address_chunks = self
+            .params
+            .one_hot_params
+            .compute_r_address_chunks::<F>(&r_address);
+
+        // Build RA polynomials
+        let ra: Vec<RaPolynomial<u8, F>> = r_address_chunks
+            .iter()
+            .enumerate()
+            .map(|(i, r_address_chunk)| {
+                let ra_i: Vec<Option<u8>> = self
+                    .trace
+                    .par_iter()
+                    .map(|cycle| {
+                        let pc = self.bytecode_preprocessing.get_pc(cycle);
+                        Some(self.params.one_hot_params.bytecode_pc_chunk(pc, i))
+                    })
+                    .collect();
+                RaPolynomial::new(Arc::new(ra_i), EqPolynomial::evals(r_address_chunk))
+            })
+            .collect();
+
+        BytecodeReadRafCycleSumcheckProver {
+            ra,
+            gruen_eq_polys: self.gruen_eq_polys,
+            prev_round_claims: self.prev_round_claims,
+            prev_round_polys: None,
+            bound_val_evals,
+            params: self.params,
+        }
+    }
+
+    fn compute_message_impl(&mut self, _previous_claim: F) -> UniPoly<F> {
+        const DEGREE: usize = 2;
+
+        let eval_per_stage: [[F; DEGREE]; N_STAGES] = (0..self.params.val_polys[0].len() / 2)
+            .into_par_iter()
+            .map(|i| {
+                let ra_evals = self
+                    .F
+                    .each_ref()
+                    .map(|poly| poly.sumcheck_evals_array::<DEGREE>(i, BindingOrder::LowToHigh));
+
+                let int_evals =
+                    self.params
+                        .int_poly
+                        .sumcheck_evals(i, DEGREE, BindingOrder::LowToHigh);
+
+                let mut val_evals = self
+                    .params
+                    .val_polys
+                    .iter()
+                    .map(|val| val.sumcheck_evals_array::<DEGREE>(i, BindingOrder::LowToHigh))
+                    .zip([Some(&int_evals), None, Some(&int_evals), None, None])
+                    .zip([
+                        Some(self.params.gamma_powers[5]),
+                        None,
+                        Some(self.params.gamma_powers[4]),
+                        None,
+                        None,
+                    ])
+                    .map(|((val_evals, int_evals), gamma)| {
+                        std::array::from_fn::<F, DEGREE, _>(|j| {
+                            val_evals[j]
+                                + int_evals
+                                    .map_or(F::zero(), |int_evals| int_evals[j] * gamma.unwrap())
+                        })
+                    });
+
+                array::from_fn(|stage| {
+                    let [ra_at_0, ra_at_2] = ra_evals[stage];
+                    let [val_at_0, val_at_2] = val_evals.next().unwrap();
+                    [ra_at_0 * val_at_0, ra_at_2 * val_at_2]
+                })
+            })
+            .reduce(
+                || [[F::zero(); DEGREE]; N_STAGES],
+                |a, b| array::from_fn(|i| array::from_fn(|j| a[i][j] + b[i][j])),
+            );
+
+        let mut round_polys: [_; N_STAGES] = array::from_fn(|_| UniPoly::zero());
+        let mut agg_round_poly = UniPoly::zero();
+
+        for (stage, evals) in eval_per_stage.into_iter().enumerate() {
+            let [eval_at_0, eval_at_2] = evals;
+            let eval_at_1 = self.prev_round_claims[stage] - eval_at_0;
+            let round_poly = UniPoly::from_evals(&[eval_at_0, eval_at_1, eval_at_2]);
+            agg_round_poly += &(&round_poly * self.params.gamma_powers[stage]);
+            round_polys[stage] = round_poly;
+        }
+
+        self.prev_round_polys = Some(round_polys);
+        agg_round_poly
+    }
+
+    fn ingest_challenge_impl(&mut self, r_j: F::Challenge) {
+        if let Some(prev_round_polys) = self.prev_round_polys.take() {
+            self.prev_round_claims = prev_round_polys.map(|poly| poly.evaluate(&r_j));
+        }
+
+        self.params
+            .val_polys
+            .iter_mut()
+            .for_each(|poly| poly.bind_parallel(r_j, BindingOrder::LowToHigh));
+        self.params
+            .int_poly
+            .bind_parallel(r_j, BindingOrder::LowToHigh);
+        self.F
+            .iter_mut()
+            .for_each(|poly| poly.bind_parallel(r_j, BindingOrder::LowToHigh));
+        self.r_address_prime.push(r_j);
     }
 }
 
@@ -625,23 +889,23 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
     for BytecodeReadRafAddressSumcheckProver<F>
 {
     fn degree(&self) -> usize {
-        self.inner.params.degree()
+        self.params.degree()
     }
 
     fn num_rounds(&self) -> usize {
-        self.inner.params.log_K
+        self.params.log_K
     }
 
-    fn input_claim(&self, accumulator: &ProverOpeningAccumulator<F>) -> F {
-        self.inner.params.input_claim(accumulator)
+    fn input_claim(&self, _accumulator: &ProverOpeningAccumulator<F>) -> F {
+        self.params.input_claim(_accumulator)
     }
 
-    fn compute_message(&mut self, round: usize, previous_claim: F) -> UniPoly<F> {
-        self.inner.compute_message_internal(round, previous_claim)
+    fn compute_message(&mut self, _round: usize, previous_claim: F) -> UniPoly<F> {
+        self.compute_message_impl(previous_claim)
     }
 
-    fn ingest_challenge(&mut self, r_j: F::Challenge, round: usize) {
-        self.inner.ingest_challenge_internal(r_j, round)
+    fn ingest_challenge(&mut self, r_j: F::Challenge, _round: usize) {
+        self.ingest_challenge_impl(r_j)
     }
 
     fn cache_openings(
@@ -654,10 +918,9 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
         r_address.reverse();
         let opening_point = OpeningPoint::<BIG_ENDIAN, F>::new(r_address);
         let address_claim: F = self
-            .inner
             .prev_round_claims
             .iter()
-            .zip(self.inner.params.gamma_powers.iter())
+            .zip(self.params.gamma_powers.iter())
             .take(N_STAGES)
             .map(|(claim, gamma)| *claim * *gamma)
             .sum();
@@ -676,20 +939,118 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
     }
 }
 
+/// Bytecode Read+RAF Cycle-Phase Sumcheck Prover.
+///
+/// This prover handles the remaining `log_T` rounds (cycle variables).
+/// It is constructed from `BytecodeReadRafAddressSumcheckProver::into_cycle_prover()`.
 #[derive(Allocative)]
 pub struct BytecodeReadRafCycleSumcheckProver<F: JoltField> {
-    inner: BytecodeReadRafSumcheckProver<F>,
+    /// Chunked RA polynomials over address variables.
+    ra: Vec<RaPolynomial<u8, F>>,
+    /// Per-stage Gruen-split eq polynomials over cycle vars.
+    gruen_eq_polys: [GruenSplitEqPolynomial<F>; N_STAGES],
+    /// Previous-round claims s_i(0)+s_i(1) per stage.
+    prev_round_claims: [F; N_STAGES],
+    /// Round polynomials per stage.
+    prev_round_polys: Option<[UniPoly<F>; N_STAGES]>,
+    /// Final sumcheck claims of stage Val polynomials (with RAF Int folded).
+    bound_val_evals: [F; N_STAGES],
+    /// Parameters.
+    pub params: BytecodeReadRafSumcheckParams<F>,
+}
+
+impl<F: JoltField> BytecodeReadRafCycleSumcheckProver<F> {
+    fn compute_message_impl(&mut self, _previous_claim: F) -> UniPoly<F> {
+        let degree = self.params.degree();
+
+        let out_len = self.gruen_eq_polys[0].E_out_current().len();
+        let in_len = self.gruen_eq_polys[0].E_in_current().len();
+        let in_n_vars = in_len.log_2();
+
+        let mut evals_per_stage: [Vec<F>; N_STAGES] = (0..out_len)
+            .into_par_iter()
+            .map(|j_hi| {
+                let mut ra_eval_pairs = vec![(F::zero(), F::zero()); self.ra.len()];
+                let mut ra_prod_evals = vec![F::zero(); degree - 1];
+                let mut evals_per_stage: [_; N_STAGES] =
+                    array::from_fn(|_| vec![F::Unreduced::zero(); degree - 1]);
+
+                for j_lo in 0..in_len {
+                    let j = j_lo + (j_hi << in_n_vars);
+
+                    for (i, ra_i) in self.ra.iter().enumerate() {
+                        let ra_i_eval_at_j_0 = ra_i.get_bound_coeff(j * 2);
+                        let ra_i_eval_at_j_1 = ra_i.get_bound_coeff(j * 2 + 1);
+                        ra_eval_pairs[i] = (ra_i_eval_at_j_0, ra_i_eval_at_j_1);
+                    }
+                    eval_linear_prod_assign(&ra_eval_pairs, &mut ra_prod_evals);
+
+                    for stage in 0..N_STAGES {
+                        let eq_in_eval = self.gruen_eq_polys[stage].E_in_current()[j_lo];
+                        for i in 0..degree - 1 {
+                            evals_per_stage[stage][i] +=
+                                eq_in_eval.mul_unreduced::<9>(ra_prod_evals[i]);
+                        }
+                    }
+                }
+
+                array::from_fn(|stage| {
+                    let eq_out_eval = self.gruen_eq_polys[stage].E_out_current()[j_hi];
+                    evals_per_stage[stage]
+                        .iter()
+                        .map(|v| eq_out_eval * F::from_montgomery_reduce(*v))
+                        .collect()
+                })
+            })
+            .reduce(
+                || array::from_fn(|_| vec![F::zero(); degree - 1]),
+                |a, b| array::from_fn(|i| zip_eq(&a[i], &b[i]).map(|(a, b)| *a + *b).collect()),
+            );
+
+        // Multiply by bound values
+        for (stage, evals) in evals_per_stage.iter_mut().enumerate() {
+            evals
+                .iter_mut()
+                .for_each(|v| *v *= self.bound_val_evals[stage]);
+        }
+
+        let mut round_polys: [_; N_STAGES] = array::from_fn(|_| UniPoly::zero());
+        let mut agg_round_poly = UniPoly::zero();
+
+        for (stage, evals) in evals_per_stage.iter().enumerate() {
+            let claim = self.prev_round_claims[stage];
+            let round_poly = self.gruen_eq_polys[stage].gruen_poly_from_evals(evals, claim);
+            agg_round_poly += &(&round_poly * self.params.gamma_powers[stage]);
+            round_polys[stage] = round_poly;
+        }
+
+        self.prev_round_polys = Some(round_polys);
+        agg_round_poly
+    }
+
+    fn ingest_challenge_impl(&mut self, r_j: F::Challenge) {
+        if let Some(prev_round_polys) = self.prev_round_polys.take() {
+            self.prev_round_claims = prev_round_polys.map(|poly| poly.evaluate(&r_j));
+        }
+
+        self.ra
+            .iter_mut()
+            .for_each(|ra| ra.bind_parallel(r_j, BindingOrder::LowToHigh));
+        self.gruen_eq_polys
+            .iter_mut()
+            .for_each(|poly| poly.bind(r_j));
+    }
 }
 
 impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
     for BytecodeReadRafCycleSumcheckProver<F>
 {
     fn degree(&self) -> usize {
-        self.inner.params.degree()
+        self.params.degree()
     }
 
     fn num_rounds(&self) -> usize {
-        self.inner.params.log_T
+        self.params.log_T
     }
 
     fn input_claim(&self, accumulator: &ProverOpeningAccumulator<F>) -> F {
@@ -701,14 +1062,12 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
             .1
     }
 
-    fn compute_message(&mut self, round: usize, previous_claim: F) -> UniPoly<F> {
-        self.inner
-            .compute_message_internal(round + self.inner.params.log_K, previous_claim)
+    fn compute_message(&mut self, _round: usize, previous_claim: F) -> UniPoly<F> {
+        self.compute_message_impl(previous_claim)
     }
 
-    fn ingest_challenge(&mut self, r_j: F::Challenge, round: usize) {
-        self.inner
-            .ingest_challenge_internal(r_j, round + self.inner.params.log_K)
+    fn ingest_challenge(&mut self, r_j: F::Challenge, _round: usize) {
+        self.ingest_challenge_impl(r_j)
     }
 
     fn cache_openings(
@@ -725,8 +1084,24 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
         r_address_le.reverse();
         let mut full_challenges = r_address_le;
         full_challenges.extend_from_slice(sumcheck_challenges);
-        let inner: &dyn SumcheckInstanceProver<F, T> = &self.inner;
-        inner.cache_openings(accumulator, transcript, &full_challenges);
+        let opening_point = self.params.normalize_opening_point(&full_challenges);
+        let (r_address, r_cycle) = opening_point.split_at(self.params.log_K);
+
+        let r_address_chunks = self
+            .params
+            .one_hot_params
+            .compute_r_address_chunks::<F>(&r_address.r);
+
+        for i in 0..self.params.d {
+            accumulator.append_sparse(
+                transcript,
+                vec![CommittedPolynomial::BytecodeRa(i)],
+                SumcheckId::BytecodeReadRaf,
+                r_address_chunks[i].clone(),
+                r_cycle.clone().into(),
+                vec![self.ra[i].final_sumcheck_claim()],
+            );
+        }
     }
 
     #[cfg(feature = "allocative")]

From 0f40a19e1612427142483153a093f8aa92945e16 Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Mon, 19 Jan 2026 10:25:35 -0800
Subject: [PATCH 03/16] refactor(stage6): remove state handoff for
 booleanity/read-raf

---
 jolt-core/src/subprotocols/booleanity.rs      | 172 ++++++++++-------
 .../src/zkvm/bytecode/read_raf_checking.rs    | 178 ++++++++++--------
 jolt-core/src/zkvm/prover.rs                  |  57 +++---
 jolt-core/src/zkvm/verifier.rs                |  44 +++--
 4 files changed, 259 insertions(+), 192 deletions(-)

diff --git a/jolt-core/src/subprotocols/booleanity.rs b/jolt-core/src/subprotocols/booleanity.rs
index 53bb5a859e..9dd057eff8 100644
--- a/jolt-core/src/subprotocols/booleanity.rs
+++ b/jolt-core/src/subprotocols/booleanity.rs
@@ -36,7 +36,10 @@ use crate::{
             OpeningAccumulator, OpeningPoint, ProverOpeningAccumulator, SumcheckId,
             VerifierOpeningAccumulator, BIG_ENDIAN,
         },
-        shared_ra_polys::{compute_all_G_and_ra_indices, RaIndices, SharedRaPolynomials},
+        shared_ra_polys::{
+            compute_all_G, compute_all_G_and_ra_indices, compute_ra_indices, RaIndices,
+            SharedRaPolynomials,
+        },
         split_eq_poly::GruenSplitEqPolynomial,
         unipoly::UniPoly,
     },
@@ -494,7 +497,7 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T> for BooleanitySum
 /// Booleanity Address-Phase Sumcheck Prover.
 ///
 /// This prover handles only the first `log_k_chunk` rounds (address variables).
-/// After completion, call `into_cycle_prover()` to get the cycle-phase prover.
+/// The cycle-phase prover is constructed separately from witness + accumulator (Option B).
 #[derive(Allocative)]
 pub struct BooleanityAddressSumcheckProver<F: JoltField> {
     /// B: split-eq over address-chunk variables (LowToHigh).
@@ -503,20 +506,10 @@ pub struct BooleanityAddressSumcheckProver<F: JoltField> {
     G: Vec<Vec<F>>,
     /// F: Expanding table for address phase
     F: ExpandingTable<F>,
-    /// Per-polynomial powers γ^i (in the base field).
-    gamma_powers: Vec<F>,
-    /// RA indices (non-transposed, one per cycle)
-    ra_indices: Vec<RaIndices>,
     /// Last round polynomial for claim computation
     last_round_poly: Option<UniPoly<F>>,
     /// Final claim after binding all address variables
     address_claim: Option<F>,
-
-    // State that will be transferred to cycle prover
-    /// D: split-eq over time/cycle variables (LowToHigh).
-    D: GruenSplitEqPolynomial<F>,
-    /// Per-polynomial inverse powers γ^{-i} (in the base field).
-    gamma_powers_inv: Vec<F>,
     /// Parameters (shared with cycle prover)
     pub params: BooleanitySumcheckParams<F>,
 }
@@ -532,8 +525,8 @@ impl<F: JoltField> BooleanityAddressSumcheckProver<F> {
         bytecode: &BytecodePreprocessing,
         memory_layout: &MemoryLayout,
     ) -> Self {
-        // Compute G and RA indices in a single pass over the trace
-        let (G, ra_indices) = compute_all_G_and_ra_indices::<F>(
+        // Compute G in a single pass over the trace (witness-dependent).
+        let G = compute_all_G::<F>(
             trace,
             bytecode,
             memory_layout,
@@ -541,78 +534,24 @@ impl<F: JoltField> BooleanityAddressSumcheckProver<F> {
             &params.r_cycle,
         );
 
-        // Initialize split-eq polynomials for address and cycle variables
+        // Initialize split-eq polynomial for address variables
         let B = GruenSplitEqPolynomial::new(&params.r_address, BindingOrder::LowToHigh);
-        let D = GruenSplitEqPolynomial::new(&params.r_cycle, BindingOrder::LowToHigh);
 
         // Initialize expanding table for address phase
         let k_chunk = 1 << params.log_k_chunk;
         let mut F_table = ExpandingTable::new(k_chunk, BindingOrder::LowToHigh);
         F_table.reset(F::one());
 
-        // Compute prover-only fields: gamma_powers (γ^i) and gamma_powers_inv (γ^{-i})
-        let num_polys = params.polynomial_types.len();
-        let gamma_f: F = params.gamma.into();
-        let mut gamma_powers = Vec::with_capacity(num_polys);
-        let mut gamma_powers_inv = Vec::with_capacity(num_polys);
-        let mut rho_i = F::one();
-        for _ in 0..num_polys {
-            gamma_powers.push(rho_i);
-            gamma_powers_inv.push(
-                rho_i
-                    .inverse()
-                    .expect("gamma_powers[i] is nonzero (gamma != 0)"),
-            );
-            rho_i *= gamma_f;
-        }
-
         Self {
             B,
             G,
             F: F_table,
-            gamma_powers,
-            ra_indices,
             last_round_poly: None,
             address_claim: None,
-            D,
-            gamma_powers_inv,
             params,
         }
     }
 
-    /// Transform into the cycle-phase prover, transferring necessary state.
-    pub fn into_cycle_prover(mut self) -> BooleanityCycleSumcheckProver<F> {
-        // Compute eq_r_r from B's final state
-        let eq_r_r = self.B.get_current_scalar();
-
-        // Initialize SharedRaPolynomials with per-poly pre-scaled eq tables (by rho_i)
-        let F_table = std::mem::take(&mut self.F);
-        let ra_indices = std::mem::take(&mut self.ra_indices);
-        let base_eq = F_table.clone_values();
-        let num_polys = self.params.polynomial_types.len();
-        let tables: Vec<Vec<F>> = (0..num_polys)
-            .into_par_iter()
-            .map(|i| {
-                let rho = self.gamma_powers[i];
-                base_eq.iter().map(|v| rho * *v).collect()
-            })
-            .collect();
-        let H = SharedRaPolynomials::new(tables, ra_indices, self.params.one_hot_params.clone());
-
-        // Drop G arrays in background
-        let g = std::mem::take(&mut self.G);
-        drop_in_background_thread(g);
-
-        BooleanityCycleSumcheckProver {
-            D: self.D,
-            H,
-            eq_r_r,
-            gamma_powers: self.gamma_powers,
-            gamma_powers_inv: self.gamma_powers_inv,
-            params: self.params,
-        }
-    }
-
     fn compute_message_impl(&self, round: usize, previous_claim: F) -> UniPoly<F> {
         let m = round + 1;
         let B = &self.B;
@@ -738,7 +677,7 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
 /// Booleanity Cycle-Phase Sumcheck Prover.
 ///
 /// This prover handles the remaining `log_t` rounds (cycle variables).
-/// It is constructed from `BooleanityAddressSumcheckProver::into_cycle_prover()`.
+/// It is constructed from scratch via [`BooleanityCycleSumcheckProver::initialize`].
 #[derive(Allocative)]
 pub struct BooleanityCycleSumcheckProver<F: JoltField> {
     /// D: split-eq over time/cycle variables (LowToHigh).
@@ -756,6 +695,88 @@ pub struct BooleanityCycleSumcheckProver<F: JoltField> {
 }
 
 impl<F: JoltField> BooleanityCycleSumcheckProver<F> {
+    /// Initialize the cycle-phase prover from scratch (Option B).
+    ///
+    /// Reconstructs all cycle-phase state from:
+    /// - `params` (sampled in Stage 6a, must match verifier)
+    /// - witness inputs (`trace`, `bytecode`, `memory_layout`)
+    /// - Stage 6a address challenges (read from `accumulator`)
+    #[tracing::instrument(skip_all, name = "BooleanityCycleSumcheckProver::initialize")]
+    pub fn initialize(
+        params: BooleanitySumcheckParams<F>,
+        trace: &[Cycle],
+        bytecode: &BytecodePreprocessing,
+        memory_layout: &MemoryLayout,
+        accumulator: &ProverOpeningAccumulator<F>,
+    ) -> Self {
+        // Recover Stage 6a address challenges from the accumulator.
+        // These were stored as BIG_ENDIAN (MSB-first) by the address-phase cache_openings.
+        let (r_address_point, _) = accumulator.get_virtual_polynomial_opening(
+            VirtualPolynomial::BooleanityAddrClaim,
+            SumcheckId::BooleanityAddressPhase,
+        );
+        let mut r_address_low_to_high = r_address_point.r;
+        r_address_low_to_high.reverse();
+
+        // Recompute eq_r_r = eq(params.r_address, r_address_challenges) using the same binding
+        // progression as the address prover.
+        let mut B = GruenSplitEqPolynomial::new(&params.r_address, BindingOrder::LowToHigh);
+        for r_j in r_address_low_to_high.iter().cloned() {
+            B.bind(r_j);
+        }
+        let eq_r_r = B.get_current_scalar();
+
+        // Recompute base eq table over k_chunk addresses from the address challenges.
+        let k_chunk = 1 << params.log_k_chunk;
+        let mut F_table = ExpandingTable::new(k_chunk, BindingOrder::LowToHigh);
+        F_table.reset(F::one());
+        for r_j in r_address_low_to_high.iter().cloned() {
+            F_table.update(r_j);
+        }
+        let base_eq = F_table.clone_values();
+
+        // Compute RA indices from witness (unfused with G computation).
+        let ra_indices = compute_ra_indices(trace, bytecode, memory_layout, &params.one_hot_params);
+
+        // Compute prover-only batching coefficients rho_i = gamma^i and inverses.
+        let num_polys = params.polynomial_types.len();
+        let gamma_f: F = params.gamma.into();
+        let mut gamma_powers = Vec::with_capacity(num_polys);
+        let mut gamma_powers_inv = Vec::with_capacity(num_polys);
+        let mut rho_i = F::one();
+        for _ in 0..num_polys {
+            gamma_powers.push(rho_i);
+            gamma_powers_inv.push(
+                rho_i
+                    .inverse()
+                    .expect("gamma is nonzero, so rho_i is invertible"),
+            );
+            rho_i *= gamma_f;
+        }
+
+        // Initialize SharedRaPolynomials with per-poly pre-scaled eq tables (by rho_i).
+        let tables: Vec<Vec<F>> = (0..num_polys)
+            .into_par_iter()
+            .map(|i| {
+                let rho = gamma_powers[i];
+                base_eq.iter().map(|v| rho * *v).collect()
+            })
+            .collect();
+        let H = SharedRaPolynomials::new(tables, ra_indices, params.one_hot_params.clone());
+
+        // Cycle split-eq polynomial over r_cycle.
+        let D = GruenSplitEqPolynomial::new(&params.r_cycle, BindingOrder::LowToHigh);
+
+        Self {
+            D,
+            H,
+            eq_r_r,
+            gamma_powers,
+            gamma_powers_inv,
+            params,
+        }
+    }
+
     fn compute_message_impl(&self, previous_claim: F) -> UniPoly<F> {
         let D = &self.D;
         let H = &self.H;
@@ -932,6 +953,11 @@ impl<F: JoltField> BooleanityAddressSumcheckVerifier<F> {
         Self { params }
     }
 
+    /// Consume this verifier and return the underlying parameters (for Option B orchestration).
+    pub fn into_params(self) -> BooleanitySumcheckParams<F> {
+        self.params
+    }
+
     pub fn into_cycle_verifier(self) -> BooleanityCycleSumcheckVerifier<F> {
         BooleanityCycleSumcheckVerifier {
             params: self.params,
@@ -988,6 +1014,12 @@ pub struct BooleanityCycleSumcheckVerifier<F: JoltField> {
     params: BooleanitySumcheckParams<F>,
 }
 
+impl<F: JoltField> BooleanityCycleSumcheckVerifier<F> {
+    pub fn new(params: BooleanitySumcheckParams<F>) -> Self {
+        Self { params }
+    }
+}
+
 impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
     for BooleanityCycleSumcheckVerifier<F>
 {
diff --git a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
index f3128469c6..edf8e185f3 100644
--- a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
+++ b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
@@ -603,7 +603,7 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
 /// Bytecode Read+RAF Address-Phase Sumcheck Prover.
 ///
 /// This prover handles only the first `log_K` rounds (address variables).
-/// After completion, call `into_cycle_prover()` to get the cycle-phase prover.
+/// The cycle-phase prover is constructed separately from witness + accumulator (Option B).
 #[derive(Allocative)]
 pub struct BytecodeReadRafAddressSumcheckProver<F: JoltField> {
     /// Per-stage address MLEs F_i(k) built from eq(r_cycle_stage_i, (chunk_index, j)).
@@ -614,16 +614,6 @@ pub struct BytecodeReadRafAddressSumcheckProver<F: JoltField> {
     prev_round_claims: [F; N_STAGES],
     /// Round polynomials per stage for advancing to the next claim.
     prev_round_polys: Option<[UniPoly<F>; N_STAGES]>,
-    /// Trace for computing RA polynomials during transition.
-    #[allocative(skip)]
-    trace: Arc<Vec<Cycle>>,
-    /// Bytecode preprocessing for computing PCs.
-    #[allocative(skip)]
-    bytecode_preprocessing: Arc<BytecodePreprocessing>,
-
-    // State transferred to cycle prover
-    /// Per-stage Gruen-split eq polynomials over cycle vars.
-    gruen_eq_polys: [GruenSplitEqPolynomial<F>; N_STAGES],
     /// Parameters (shared with cycle prover).
     pub params: BytecodeReadRafSumcheckParams<F>,
 }
@@ -730,79 +720,16 @@ impl<F: JoltField> BytecodeReadRafAddressSumcheckProver<F> {
             );
 
         let F = F_polys.map(MultilinearPolynomial::from);
-        let gruen_eq_polys = params
-            .r_cycles
-            .each_ref()
-            .map(|r_cycle| GruenSplitEqPolynomial::new(r_cycle, BindingOrder::LowToHigh));
 
         Self {
             F,
             r_address_prime: Vec::with_capacity(params.log_K),
             prev_round_claims: claim_per_stage,
             prev_round_polys: None,
-            trace,
-            bytecode_preprocessing,
-            gruen_eq_polys,
             params,
         }
     }
 
-    /// Transform into the cycle-phase prover, computing RA polynomials and bound_val_evals.
-    pub fn into_cycle_prover(mut self) -> BytecodeReadRafCycleSumcheckProver<F> {
-        // Compute bound_val_evals from val_polys
-        let int_poly = self.params.int_poly.final_sumcheck_claim();
-        let bound_val_evals: [F; N_STAGES] = self
-            .params
-            .val_polys
-            .iter()
-            .zip([
-                int_poly * self.params.gamma_powers[5],
-                F::zero(),
-                int_poly * self.params.gamma_powers[4],
-                F::zero(),
-                F::zero(),
-            ])
-            .map(|(poly, int_term)| poly.final_sumcheck_claim() + int_term)
-            .collect::<Vec<F>>()
-            .try_into()
-            .unwrap();
-
-        // Reverse r_address_prime to get the correct order
-        let mut r_address = std::mem::take(&mut self.r_address_prime);
-        r_address.reverse();
-
-        let r_address_chunks = self
-            .params
-            .one_hot_params
-            .compute_r_address_chunks::<F>(&r_address);
-
-        // Build RA polynomials
-        let ra: Vec<RaPolynomial<u8, F>> = r_address_chunks
-            .iter()
-            .enumerate()
-            .map(|(i, r_address_chunk)| {
-                let ra_i: Vec<Option<u8>> = self
-                    .trace
-                    .par_iter()
-                    .map(|cycle| {
-                        let pc = self.bytecode_preprocessing.get_pc(cycle);
-                        Some(self.params.one_hot_params.bytecode_pc_chunk(pc, i))
-                    })
-                    .collect();
-                RaPolynomial::new(Arc::new(ra_i), EqPolynomial::evals(r_address_chunk))
-            })
-            .collect();
-
-        BytecodeReadRafCycleSumcheckProver {
-            ra,
-            gruen_eq_polys: self.gruen_eq_polys,
-            prev_round_claims: self.prev_round_claims,
-            prev_round_polys: None,
-            bound_val_evals,
-            params: self.params,
-        }
-    }
-
     fn compute_message_impl(&mut self, _previous_claim: F) -> UniPoly<F> {
         const DEGREE: usize = 2;
 
@@ -942,7 +869,7 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
 /// Bytecode Read+RAF Cycle-Phase Sumcheck Prover.
 ///
 /// This prover handles the remaining `log_T` rounds (cycle variables).
-/// It is constructed from `BytecodeReadRafAddressSumcheckProver::into_cycle_prover()`.
+/// It is constructed from scratch via [`BytecodeReadRafCycleSumcheckProver::initialize`].
 #[derive(Allocative)]
 pub struct BytecodeReadRafCycleSumcheckProver<F: JoltField> {
     /// Chunked RA polynomials over address variables.
@@ -960,6 +887,96 @@ pub struct BytecodeReadRafCycleSumcheckProver<F: JoltField> {
 }
 
 impl<F: JoltField> BytecodeReadRafCycleSumcheckProver<F> {
+    /// Initialize the cycle-phase prover from scratch (Option B).
+    ///
+    /// This recomputes the address-phase internal state (per-stage claims and bound value
+    /// evaluations) by replaying the address binding using the Stage 6a challenges from the
+    /// accumulator. This avoids passing prover state across stages at the cost of extra work.
+    #[tracing::instrument(skip_all, name = "BytecodeReadRafCycleSumcheckProver::initialize")]
+    pub fn initialize(
+        params: BytecodeReadRafSumcheckParams<F>,
+        trace: Arc<Vec<Cycle>>,
+        bytecode_preprocessing: Arc<BytecodePreprocessing>,
+        accumulator: &ProverOpeningAccumulator<F>,
+    ) -> Self {
+        // Recover Stage 6a address challenges from the accumulator.
+        // Address-phase cache_openings stored them as BIG_ENDIAN (MSB-first).
+        let (r_address_point, _) = accumulator.get_virtual_polynomial_opening(
+            VirtualPolynomial::BytecodeReadRafAddrClaim,
+            SumcheckId::BytecodeReadRafAddressPhase,
+        );
+
+        // Sumcheck challenges were generated LowToHigh; recover that order for replay.
+        let mut r_address_low_to_high = r_address_point.r.clone();
+        r_address_low_to_high.reverse();
+
+        // Re-run the address prover deterministically (no transcript) to recover:
+        // - per-stage claims after binding all address variables
+        // - bound value evaluations (Val + RAF Int folds) as scalars
+        let mut addr = BytecodeReadRafAddressSumcheckProver::initialize(
+            params.clone(),
+            Arc::clone(&trace),
+            Arc::clone(&bytecode_preprocessing),
+        );
+        for r_j in r_address_low_to_high.iter().cloned() {
+            // Replay needs no round index, and the `previous_claim` argument is ignored by
+            // this instance (it tracks per-stage claims internally), so pass a dummy zero.
+            let _ = addr.compute_message_impl(F::zero());
+            addr.ingest_challenge_impl(r_j);
+        }
+
+        // Compute bound_val_evals from the now-fully-bound val_polys and int_poly.
+        let int_poly = addr.params.int_poly.final_sumcheck_claim();
+        let bound_val_evals: [F; N_STAGES] = addr
+            .params
+            .val_polys
+            .iter()
+            .zip([
+                int_poly * addr.params.gamma_powers[5],
+                F::zero(),
+                int_poly * addr.params.gamma_powers[4],
+                F::zero(),
+                F::zero(),
+            ])
+            .map(|(poly, int_term)| poly.final_sumcheck_claim() + int_term)
+            .collect::<Vec<F>>()
+            .try_into()
+            .unwrap();
+
+        // Build RA polynomials from witness using MSB-first address challenges.
+        let r_address_chunks = params
+            .one_hot_params
+            .compute_r_address_chunks::<F>(&r_address_point.r);
+        let ra: Vec<RaPolynomial<u8, F>> = r_address_chunks
+            .iter()
+            .enumerate()
+            .map(|(i, r_address_chunk)| {
+                let ra_i: Vec<Option<u8>> = trace
+                    .par_iter()
+                    .map(|cycle| {
+                        let pc = bytecode_preprocessing.get_pc(cycle);
+                        Some(params.one_hot_params.bytecode_pc_chunk(pc, i))
+                    })
+                    .collect();
+                RaPolynomial::new(Arc::new(ra_i), EqPolynomial::evals(r_address_chunk))
+            })
+            .collect();
+
+        let gruen_eq_polys = params
+            .r_cycles
+            .each_ref()
+            .map(|r_cycle| GruenSplitEqPolynomial::new(r_cycle, BindingOrder::LowToHigh));
+
+        Self {
+            ra,
+            gruen_eq_polys,
+            prev_round_claims: addr.prev_round_claims,
+            prev_round_polys: None,
+            bound_val_evals,
+            params,
+        }
+    }
+
     fn compute_message_impl(&mut self, _previous_claim: F) -> UniPoly<F> {
         let degree = self.params.degree();
 
@@ -1244,6 +1261,11 @@ impl<F: JoltField> BytecodeReadRafAddressSumcheckVerifier<F> {
         }
     }
 
+    /// Consume this verifier and return the underlying parameters (for Option B orchestration).
+    pub fn into_params(self) -> BytecodeReadRafSumcheckParams<F> {
+        self.params
+    }
+
     pub fn into_cycle_verifier(self) -> BytecodeReadRafCycleSumcheckVerifier<F> {
         BytecodeReadRafCycleSumcheckVerifier {
             params: self.params,
@@ -1300,6 +1322,12 @@ pub struct BytecodeReadRafCycleSumcheckVerifier<F: JoltField> {
     params: BytecodeReadRafSumcheckParams<F>,
 }
 
+impl<F: JoltField> BytecodeReadRafCycleSumcheckVerifier<F> {
+    pub fn new(params: BytecodeReadRafSumcheckParams<F>) -> Self {
+        Self { params }
+    }
+}
+
 impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
     for BytecodeReadRafCycleSumcheckVerifier<F>
 {
diff --git a/jolt-core/src/zkvm/prover.rs b/jolt-core/src/zkvm/prover.rs
index aeddfd54d2..35a2455ad4 100644
--- a/jolt-core/src/zkvm/prover.rs
+++ b/jolt-core/src/zkvm/prover.rs
@@ -158,10 +158,6 @@ pub struct JoltCpuProver<
     /// The advice claim reduction sumcheck effectively spans two stages (6 and 7).
     /// Cache the prover state here between stages.
     advice_reduction_prover_untrusted: Option<AdviceClaimReductionProver<F>>,
-    /// BytecodeReadRaf spans Stage 6a (address) and Stage 6b (cycle).
-    bytecode_read_raf_cycle_prover: Option<BytecodeReadRafCycleSumcheckProver<F>>,
-    /// Booleanity spans Stage 6a (address) and Stage 6b (cycle).
-    booleanity_cycle_prover: Option<BooleanityCycleSumcheckProver<F>>,
     pub unpadded_trace_len: usize,
     pub padded_trace_len: usize,
     pub transcript: ProofTranscript,
@@ -411,8 +407,6 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             },
             advice_reduction_prover_trusted: None,
             advice_reduction_prover_untrusted: None,
-            bytecode_read_raf_cycle_prover: None,
-            booleanity_cycle_prover: None,
             unpadded_trace_len,
             padded_trace_len,
             transcript,
@@ -465,8 +459,10 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let stage3_sumcheck_proof = self.prove_stage3();
         let stage4_sumcheck_proof = self.prove_stage4();
         let stage5_sumcheck_proof = self.prove_stage5();
-        let stage6a_sumcheck_proof = self.prove_stage6a();
-        let stage6b_sumcheck_proof = self.prove_stage6b();
+        let (stage6a_sumcheck_proof, bytecode_read_raf_params, booleanity_params) =
+            self.prove_stage6a();
+        let stage6b_sumcheck_proof =
+            self.prove_stage6b(bytecode_read_raf_params, booleanity_params);
         let stage7_sumcheck_proof = self.prove_stage7();
 
         let joint_opening_proof = self.prove_stage8(opening_proof_hints);
@@ -1083,7 +1079,13 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
     }
 
     #[tracing::instrument(skip_all)]
-    fn prove_stage6a(&mut self) -> SumcheckInstanceProof<F, ProofTranscript> {
+    fn prove_stage6a(
+        &mut self,
+    ) -> (
+        SumcheckInstanceProof<F, ProofTranscript>,
+        BytecodeReadRafSumcheckParams<F>,
+        BooleanitySumcheckParams<F>,
+    ) {
         #[cfg(not(target_arch = "wasm32"))]
         print_current_memory_usage("Stage 6a baseline");
 
@@ -1103,12 +1105,12 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         );
 
         let mut bytecode_read_raf = BytecodeReadRafAddressSumcheckProver::initialize(
-            bytecode_read_raf_params,
+            bytecode_read_raf_params.clone(),
             Arc::clone(&self.trace),
             Arc::clone(&self.preprocessing.shared.bytecode),
         );
         let mut booleanity = BooleanityAddressSumcheckProver::initialize(
-            booleanity_params,
+            booleanity_params.clone(),
             &self.trace,
             &self.preprocessing.shared.bytecode,
             &self.program_io.memory_layout,
@@ -1137,14 +1139,15 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         #[cfg(feature = "allocative")]
         write_instance_flamegraph_svg(&instances, "stage6a_end_flamechart.svg");
 
-        self.bytecode_read_raf_cycle_prover = Some(bytecode_read_raf.into_cycle_prover());
-        self.booleanity_cycle_prover = Some(booleanity.into_cycle_prover());
-
-        sumcheck_proof
+        (sumcheck_proof, bytecode_read_raf_params, booleanity_params)
     }
 
     #[tracing::instrument(skip_all)]
-    fn prove_stage6b(&mut self) -> SumcheckInstanceProof<F, ProofTranscript> {
+    fn prove_stage6b(
+        &mut self,
+        bytecode_read_raf_params: BytecodeReadRafSumcheckParams<F>,
+        booleanity_params: BooleanitySumcheckParams<F>,
+    ) -> SumcheckInstanceProof<F, ProofTranscript> {
         #[cfg(not(target_arch = "wasm32"))]
         print_current_memory_usage("Stage 6b baseline");
 
@@ -1218,14 +1221,20 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             };
         }
 
-        let mut bytecode_read_raf = self
-            .bytecode_read_raf_cycle_prover
-            .take()
-            .expect("Stage 6b missing BytecodeReadRaf cycle prover");
-        let mut booleanity = self
-            .booleanity_cycle_prover
-            .take()
-            .expect("Stage 6b missing Booleanity cycle prover");
+        // Initialize Stage 6b cycle provers from scratch (Option B).
+        let mut bytecode_read_raf = BytecodeReadRafCycleSumcheckProver::initialize(
+            bytecode_read_raf_params,
+            Arc::clone(&self.trace),
+            Arc::clone(&self.preprocessing.shared.bytecode),
+            &self.opening_accumulator,
+        );
+        let mut booleanity = BooleanityCycleSumcheckProver::initialize(
+            booleanity_params,
+            &self.trace,
+            &self.preprocessing.shared.bytecode,
+            &self.program_io.memory_layout,
+            &self.opening_accumulator,
+        );
         let mut ram_hamming_booleanity =
             HammingBooleanitySumcheckProver::initialize(ram_hamming_booleanity_params, &self.trace);
 
diff --git a/jolt-core/src/zkvm/verifier.rs b/jolt-core/src/zkvm/verifier.rs
index b33878048a..7d87c3573c 100644
--- a/jolt-core/src/zkvm/verifier.rs
+++ b/jolt-core/src/zkvm/verifier.rs
@@ -20,6 +20,7 @@ use crate::zkvm::Serializable;
 use crate::zkvm::{
     bytecode::read_raf_checking::{
         BytecodeReadRafAddressSumcheckVerifier, BytecodeReadRafCycleSumcheckVerifier,
+        BytecodeReadRafSumcheckParams,
     },
     claim_reductions::{
         AdviceClaimReductionVerifier, AdviceKind, HammingWeightClaimReductionVerifier,
@@ -95,10 +96,6 @@ pub struct JoltVerifier<
     /// The advice claim reduction sumcheck effectively spans two stages (6 and 7).
     /// Cache the verifier state here between stages.
     advice_reduction_verifier_untrusted: Option<AdviceClaimReductionVerifier<F>>,
-    /// BytecodeReadRaf spans Stage 6a (address) and Stage 6b (cycle).
-    bytecode_read_raf_cycle_verifier: Option<BytecodeReadRafCycleSumcheckVerifier<F>>,
-    /// Booleanity spans Stage 6a (address) and Stage 6b (cycle).
-    booleanity_cycle_verifier: Option<BooleanityCycleSumcheckVerifier<F>>,
     pub spartan_key: UniformSpartanKey<F>,
     pub one_hot_params: OneHotParams,
 }
@@ -180,8 +177,6 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             opening_accumulator,
             advice_reduction_verifier_trusted: None,
             advice_reduction_verifier_untrusted: None,
-            bytecode_read_raf_cycle_verifier: None,
-            booleanity_cycle_verifier: None,
             spartan_key,
             one_hot_params,
         })
@@ -218,8 +213,8 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
         self.verify_stage3()?;
         self.verify_stage4()?;
         self.verify_stage5()?;
-        self.verify_stage6a()?;
-        self.verify_stage6b()?;
+        let (bytecode_read_raf_params, booleanity_params) = self.verify_stage6a()?;
+        self.verify_stage6b(bytecode_read_raf_params, booleanity_params)?;
         self.verify_stage7()?;
         self.verify_stage8()?;
 
@@ -418,7 +413,15 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
         Ok(())
     }
 
-    fn verify_stage6a(&mut self) -> Result<(), anyhow::Error> {
+    fn verify_stage6a(
+        &mut self,
+    ) -> Result<
+        (
+            BytecodeReadRafSumcheckParams<F>,
+            BooleanitySumcheckParams<F>,
+        ),
+        anyhow::Error,
+    > {
         let n_cycle_vars = self.proof.trace_length.log_2();
         let bytecode_read_raf = BytecodeReadRafAddressSumcheckVerifier::new(
             &self.preprocessing.shared.bytecode,
@@ -445,22 +448,17 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             &mut self.transcript,
         )
         .context("Stage 6a")?;
-
-        self.bytecode_read_raf_cycle_verifier = Some(bytecode_read_raf.into_cycle_verifier());
-        self.booleanity_cycle_verifier = Some(booleanity.into_cycle_verifier());
-
-        Ok(())
+        Ok((bytecode_read_raf.into_params(), booleanity.into_params()))
     }
 
-    fn verify_stage6b(&mut self) -> Result<(), anyhow::Error> {
-        let bytecode_read_raf = self
-            .bytecode_read_raf_cycle_verifier
-            .take()
-            .expect("Stage 6b missing BytecodeReadRaf cycle verifier");
-        let booleanity = self
-            .booleanity_cycle_verifier
-            .take()
-            .expect("Stage 6b missing Booleanity cycle verifier");
+    fn verify_stage6b(
+        &mut self,
+        bytecode_read_raf_params: BytecodeReadRafSumcheckParams<F>,
+        booleanity_params: BooleanitySumcheckParams<F>,
+    ) -> Result<(), anyhow::Error> {
+        // Initialize Stage 6b cycle verifiers from scratch (Option B).
+        let bytecode_read_raf = BytecodeReadRafCycleSumcheckVerifier::new(bytecode_read_raf_params);
+        let booleanity = BooleanityCycleSumcheckVerifier::new(booleanity_params);
         let ram_hamming_booleanity =
             HammingBooleanitySumcheckVerifier::new(&self.opening_accumulator);
         let ram_ra_virtual = RamRaVirtualSumcheckVerifier::new(

From 2df3d33d47a027f86e522a1cc9f8f65908ebaf4e Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Mon, 19 Jan 2026 20:12:32 -0800
Subject: [PATCH 04/16] feat(zkvm): add bytecode claim reduction sumcheck

---
 jolt-core/src/poly/rlc_polynomial.rs          |   5 +
 .../src/zkvm/bytecode/read_raf_checking.rs    | 241 +++++--
 .../src/zkvm/claim_reductions/bytecode.rs     | 672 ++++++++++++++++++
 jolt-core/src/zkvm/claim_reductions/mod.rs    |   5 +
 jolt-core/src/zkvm/proof_serialization.rs     |  13 +-
 jolt-core/src/zkvm/prover.rs                  |  34 +-
 jolt-core/src/zkvm/verifier.rs                |  43 +-
 jolt-core/src/zkvm/witness.rs                 |   9 +
 8 files changed, 957 insertions(+), 65 deletions(-)
 create mode 100644 jolt-core/src/zkvm/claim_reductions/bytecode.rs

diff --git a/jolt-core/src/poly/rlc_polynomial.rs b/jolt-core/src/poly/rlc_polynomial.rs
index 47a68c231e..5a657549b1 100644
--- a/jolt-core/src/poly/rlc_polynomial.rs
+++ b/jolt-core/src/poly/rlc_polynomial.rs
@@ -191,6 +191,11 @@ impl<F: JoltField> RLCPolynomial<F> {
                 | CommittedPolynomial::RamRa(_) => {
                     onehot_polys.push((*poly_id, *coeff));
                 }
+                CommittedPolynomial::BytecodeChunk(_) => {
+                    // Bytecode chunk polynomials are staged for later integration into Stage 8
+                    // streaming (see bytecode commitment track).
+                    panic!("BytecodeChunk polynomials are not yet supported in streaming RLC");
+                }
                 CommittedPolynomial::TrustedAdvice | CommittedPolynomial::UntrustedAdvice => {
                     // Advice polynomials are passed in directly (not streamed from trace)
                     if advice_poly_map.contains_key(poly_id) {
diff --git a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
index edf8e185f3..9ddc776262 100644
--- a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
+++ b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
@@ -855,9 +855,24 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
             transcript,
             VirtualPolynomial::BytecodeReadRafAddrClaim,
             SumcheckId::BytecodeReadRafAddressPhase,
-            opening_point,
+            opening_point.clone(),
             address_claim,
         );
+
+        // Emit Val-only claims at the Stage 6a boundary only when the cycle phase has enough
+        // randomness to support the bytecode claim reduction path (`log_T >= log_K`).
+        if self.params.log_T >= self.params.log_K {
+            for stage in 0..N_STAGES {
+                let claim = self.params.val_polys[stage].final_sumcheck_claim();
+                accumulator.append_virtual(
+                    transcript,
+                    VirtualPolynomial::BytecodeValStage(stage),
+                    SumcheckId::BytecodeReadRafAddressPhase,
+                    opening_point.clone(),
+                    claim,
+                );
+            }
+        }
     }
 
     #[cfg(feature = "allocative")]
@@ -1250,14 +1265,31 @@ impl<F: JoltField> BytecodeReadRafAddressSumcheckVerifier<F> {
         opening_accumulator: &VerifierOpeningAccumulator<F>,
         transcript: &mut impl Transcript,
     ) -> Self {
+        let log_k = one_hot_params.bytecode_k.log_2();
         Self {
-            params: BytecodeReadRafSumcheckParams::gen(
-                bytecode_preprocessing,
-                n_cycle_vars,
-                one_hot_params,
-                opening_accumulator,
-                transcript,
-            ),
+            // If `log_T >= log_K_bytecode`, the verifier can use the fast path (no bytecode-length
+            // work) by consuming `Val_s(r_bc)` from Stage 6a and (eventually) checking them via
+            // BytecodeClaimReduction + committed bytecode.
+            //
+            // Otherwise, we fall back to the legacy path and materialize the Val polynomials
+            // (O(K_bytecode)) to keep soundness without requiring extra padding.
+            params: if n_cycle_vars >= log_k {
+                BytecodeReadRafSumcheckParams::gen_verifier(
+                    bytecode_preprocessing,
+                    n_cycle_vars,
+                    one_hot_params,
+                    opening_accumulator,
+                    transcript,
+                )
+            } else {
+                BytecodeReadRafSumcheckParams::gen(
+                    bytecode_preprocessing,
+                    n_cycle_vars,
+                    one_hot_params,
+                    opening_accumulator,
+                    transcript,
+                )
+            },
         }
     }
 
@@ -1309,12 +1341,26 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
     ) {
         let mut r_address = sumcheck_challenges.to_vec();
         r_address.reverse();
+        let opening_point = OpeningPoint::<BIG_ENDIAN, F>::new(r_address);
         accumulator.append_virtual(
             transcript,
             VirtualPolynomial::BytecodeReadRafAddrClaim,
             SumcheckId::BytecodeReadRafAddressPhase,
-            OpeningPoint::<BIG_ENDIAN, F>::new(r_address),
+            opening_point.clone(),
         );
+
+        // Populate opening points for the Val-only bytecode stage claims emitted in Stage 6a,
+        // but only when that fast path is enabled (`log_T >= log_K`).
+        if self.params.log_T >= self.params.log_K {
+            for stage in 0..N_STAGES {
+                accumulator.append_virtual(
+                    transcript,
+                    VirtualPolynomial::BytecodeValStage(stage),
+                    SumcheckId::BytecodeReadRafAddressPhase,
+                    opening_point.clone(),
+                );
+            }
+        }
     }
 }
 
@@ -1375,25 +1421,47 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
                 .1
         });
 
-        let val = self
-            .params
-            .val_polys
-            .iter()
-            .zip(&self.params.r_cycles)
-            .zip(&self.params.gamma_powers)
-            .zip([
-                int_poly * self.params.gamma_powers[5], // RAF for Stage1
-                F::zero(),                              // There's no raf for Stage2
-                int_poly * self.params.gamma_powers[4], // RAF for Stage3
-                F::zero(),                              // There's no raf for Stage4
-                F::zero(),                              // There's no raf for Stage5
-            ])
-            .map(|(((val, r_cycle), gamma), int_poly)| {
-                (val.evaluate(&r_address_prime.r) + int_poly)
-                    * EqPolynomial::<F>::mle(r_cycle, &r_cycle_prime.r)
-                    * gamma
-            })
-            .sum::<F>();
+        let int_terms = [
+            int_poly * self.params.gamma_powers[5], // RAF for Stage1
+            F::zero(),                              // There's no raf for Stage2
+            int_poly * self.params.gamma_powers[4], // RAF for Stage3
+            F::zero(),                              // There's no raf for Stage4
+            F::zero(),                              // There's no raf for Stage5
+        ];
+        let val = if self.params.val_polys[0].original_len() == 0 {
+            // Fast verifier path: consume Val_s(r_bc) claims emitted at the Stage 6a boundary,
+            // rather than re-evaluating `val_polys` (O(K_bytecode)).
+            (0..N_STAGES)
+                .zip(self.params.r_cycles.iter())
+                .zip(self.params.gamma_powers.iter())
+                .zip(int_terms)
+                .map(|(((stage, r_cycle), gamma), int_term)| {
+                    let val_claim = accumulator
+                        .get_virtual_polynomial_opening(
+                            VirtualPolynomial::BytecodeValStage(stage),
+                            SumcheckId::BytecodeReadRafAddressPhase,
+                        )
+                        .1;
+                    (val_claim + int_term)
+                        * EqPolynomial::<F>::mle(r_cycle, &r_cycle_prime.r)
+                        * *gamma
+                })
+                .sum::<F>()
+        } else {
+            // Legacy verifier path: directly evaluate Val polynomials at r_bc (O(K_bytecode)).
+            self.params
+                .val_polys
+                .iter()
+                .zip(&self.params.r_cycles)
+                .zip(&self.params.gamma_powers)
+                .zip(int_terms)
+                .map(|(((val, r_cycle), gamma), int_term)| {
+                    (val.evaluate(&r_address_prime.r) + int_term)
+                        * EqPolynomial::<F>::mle(r_cycle, &r_cycle_prime.r)
+                        * *gamma
+                })
+                .sum::<F>()
+        };
 
         ra_claims.fold(val, |running, ra_claim| running * ra_claim)
     }
@@ -1456,6 +1524,13 @@ pub struct BytecodeReadRafSumcheckParams<F: JoltField> {
     /// Identity polynomial over address vars used to inject RAF contributions.
     pub int_poly: IdentityPolynomial<F>,
     pub r_cycles: [Vec<F::Challenge>; N_STAGES],
+    /// Stage-specific batching gammas used to define Val(k) polynomials.
+    /// Stored so later claim reductions can reconstruct lane weights without resampling the transcript.
+    pub stage1_gammas: Vec<F>,
+    pub stage2_gammas: Vec<F>,
+    pub stage3_gammas: Vec<F>,
+    pub stage4_gammas: Vec<F>,
+    pub stage5_gammas: Vec<F>,
 }
 
 impl<F: JoltField> BytecodeReadRafSumcheckParams<F> {
@@ -1466,6 +1541,44 @@ impl<F: JoltField> BytecodeReadRafSumcheckParams<F> {
         one_hot_params: &OneHotParams,
         opening_accumulator: &dyn OpeningAccumulator<F>,
         transcript: &mut impl Transcript,
+    ) -> Self {
+        Self::gen_impl(
+            bytecode_preprocessing,
+            n_cycle_vars,
+            one_hot_params,
+            opening_accumulator,
+            transcript,
+            true,
+        )
+    }
+
+    /// Verifier-side generator: avoids materializing Val(k) polynomials (O(K_bytecode)).
+    #[tracing::instrument(skip_all, name = "BytecodeReadRafSumcheckParams::gen_verifier")]
+    pub fn gen_verifier(
+        bytecode_preprocessing: &BytecodePreprocessing,
+        n_cycle_vars: usize,
+        one_hot_params: &OneHotParams,
+        opening_accumulator: &dyn OpeningAccumulator<F>,
+        transcript: &mut impl Transcript,
+    ) -> Self {
+        Self::gen_impl(
+            bytecode_preprocessing,
+            n_cycle_vars,
+            one_hot_params,
+            opening_accumulator,
+            transcript,
+            false, // `compute_val_polys`: `val_polys` stay `MultilinearPolynomial::default()`
+        )
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    fn gen_impl(
+        bytecode_preprocessing: &BytecodePreprocessing,
+        n_cycle_vars: usize,
+        one_hot_params: &OneHotParams,
+        opening_accumulator: &dyn OpeningAccumulator<F>,
+        transcript: &mut impl Transcript,
+        compute_val_polys: bool,
     ) -> Self {
         let gamma_powers = transcript.challenge_scalar_powers(7);
 
@@ -1486,38 +1599,43 @@ impl<F: JoltField> BytecodeReadRafSumcheckParams<F> {
         let rv_claim_5 = Self::compute_rv_claim_5(opening_accumulator, &stage5_gammas);
         let rv_claims = [rv_claim_1, rv_claim_2, rv_claim_3, rv_claim_4, rv_claim_5];
 
-        // Pre-compute eq_r_register for stages 4 and 5 (they use different r_register points)
-        let r_register_4 = opening_accumulator
-            .get_virtual_polynomial_opening(
-                VirtualPolynomial::RdWa,
-                SumcheckId::RegistersReadWriteChecking,
-            )
-            .0
-            .r;
-        let eq_r_register_4 =
-            EqPolynomial::<F>::evals(&r_register_4[..(REGISTER_COUNT as usize).log_2()]);
-
-        let r_register_5 = opening_accumulator
-            .get_virtual_polynomial_opening(
-                VirtualPolynomial::RdWa,
-                SumcheckId::RegistersValEvaluation,
+        let val_polys = if compute_val_polys {
+            // Pre-compute eq_r_register for stages 4 and 5 (they use different r_register points)
+            let r_register_4 = opening_accumulator
+                .get_virtual_polynomial_opening(
+                    VirtualPolynomial::RdWa,
+                    SumcheckId::RegistersReadWriteChecking,
+                )
+                .0
+                .r;
+            let eq_r_register_4 =
+                EqPolynomial::<F>::evals(&r_register_4[..(REGISTER_COUNT as usize).log_2()]);
+
+            let r_register_5 = opening_accumulator
+                .get_virtual_polynomial_opening(
+                    VirtualPolynomial::RdWa,
+                    SumcheckId::RegistersValEvaluation,
+                )
+                .0
+                .r;
+            let eq_r_register_5 =
+                EqPolynomial::<F>::evals(&r_register_5[..(REGISTER_COUNT as usize).log_2()]);
+
+            // Fused pass: compute all val polynomials in a single parallel iteration
+            Self::compute_val_polys(
+                bytecode,
+                &eq_r_register_4,
+                &eq_r_register_5,
+                &stage1_gammas,
+                &stage2_gammas,
+                &stage3_gammas,
+                &stage4_gammas,
+                &stage5_gammas,
             )
-            .0
-            .r;
-        let eq_r_register_5 =
-            EqPolynomial::<F>::evals(&r_register_5[..(REGISTER_COUNT as usize).log_2()]);
-
-        // Fused pass: compute all val polynomials in a single parallel iteration
-        let val_polys = Self::compute_val_polys(
-            bytecode,
-            &eq_r_register_4,
-            &eq_r_register_5,
-            &stage1_gammas,
-            &stage2_gammas,
-            &stage3_gammas,
-            &stage4_gammas,
-            &stage5_gammas,
-        );
+        } else {
+            // Verifier doesn't need these (and must not iterate over bytecode).
+            array::from_fn(|_| MultilinearPolynomial::default())
+        };
 
         let int_poly = IdentityPolynomial::new(one_hot_params.bytecode_k.log_2());
 
@@ -1583,6 +1701,11 @@ impl<F: JoltField> BytecodeReadRafSumcheckParams<F> {
             raf_shift_claim,
             int_poly,
             r_cycles,
+            stage1_gammas,
+            stage2_gammas,
+            stage3_gammas,
+            stage4_gammas,
+            stage5_gammas,
         }
     }
 
diff --git a/jolt-core/src/zkvm/claim_reductions/bytecode.rs b/jolt-core/src/zkvm/claim_reductions/bytecode.rs
new file mode 100644
index 0000000000..31e64f94f3
--- /dev/null
+++ b/jolt-core/src/zkvm/claim_reductions/bytecode.rs
@@ -0,0 +1,672 @@
+//! Two-phase Bytecode claim reduction (Stage 6b cycle → Stage 7 lane/address).
+//!
+//! This reduction batches the 5 bytecode Val-stage claims emitted at the Stage 6a boundary:
+//! `Val_s(r_bc)` for `s = 0..5` (val-only; RAF terms excluded).
+//!
+//! High level:
+//! - Sample `η` and form `C_in = Σ_s η^s · Val_s(r_bc)`.
+//! - Define a canonical set of bytecode "lanes" (448 total) and a lane weight function
+//!   `W_η(lane) = Σ_s η^s · w_s(lane)` derived from the same stage-specific gammas used to
+//!   define `Val_s`.
+//! - Prove, via a two-phase sumcheck, that `C_in` equals a single linear functional of the
+//!   (eventual) committed bytecode chunk polynomials.
+//!
+//! NOTE: This module wires the reduction logic and emits openings for bytecode chunk polynomials.
+//! Commitment + Stage 8 batching integration is handled separately (see `bytecode-commitment-progress.md`).
+
+use std::cell::RefCell;
+use std::sync::Arc;
+
+use allocative::Allocative;
+use itertools::Itertools;
+use rayon::prelude::*;
+use strum::EnumCount;
+
+use crate::field::JoltField;
+use crate::poly::eq_poly::EqPolynomial;
+use crate::poly::multilinear_polynomial::{
+    BindingOrder, MultilinearPolynomial, PolynomialBinding, PolynomialEvaluation,
+};
+use crate::poly::opening_proof::{
+    OpeningAccumulator, OpeningPoint, ProverOpeningAccumulator, SumcheckId,
+    VerifierOpeningAccumulator, BIG_ENDIAN, LITTLE_ENDIAN,
+};
+use crate::poly::unipoly::UniPoly;
+use crate::subprotocols::sumcheck_prover::SumcheckInstanceProver;
+use crate::subprotocols::sumcheck_verifier::{SumcheckInstanceParams, SumcheckInstanceVerifier};
+use crate::transcripts::Transcript;
+use crate::utils::math::Math;
+use crate::utils::thread::unsafe_allocate_zero_vec;
+use crate::zkvm::bytecode::read_raf_checking::BytecodeReadRafSumcheckParams;
+use crate::zkvm::bytecode::BytecodePreprocessing;
+use crate::zkvm::instruction::{
+    CircuitFlags, Flags, InstructionFlags, InstructionLookup, NUM_CIRCUIT_FLAGS,
+    NUM_INSTRUCTION_FLAGS,
+};
+use crate::zkvm::lookup_table::LookupTables;
+use crate::zkvm::witness::{CommittedPolynomial, VirtualPolynomial};
+use common::constants::{REGISTER_COUNT, XLEN};
+
+const DEGREE_BOUND: usize = 2;
+const NUM_VAL_STAGES: usize = 5;
+
+/// Number of bytecode lanes in the canonical ordering: rs1 | rs2 | rd one-hot blocks,
+/// `unexpanded_pc`, `imm`, circuit flags, instruction flags, lookup-table block, raf flag.
+const fn total_lanes() -> usize {
+    let register_lanes = 3 * (REGISTER_COUNT as usize); // rs1, rs2, rd one-hot lanes
+    let scalar_lanes = 2; // unexpanded_pc, imm
+    let flag_lanes = NUM_CIRCUIT_FLAGS + NUM_INSTRUCTION_FLAGS;
+    let lookup_lanes = LookupTables::<XLEN>::COUNT;
+    register_lanes + scalar_lanes + flag_lanes + lookup_lanes + 1 // trailing raf flag
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Allocative)]
+pub enum BytecodeReductionPhase {
+    CycleVariables, // first phase: `num_rounds()` returns `log_t`
+    LaneVariables, // second phase: `num_rounds()` returns `log_k_chunk`
+}
+
+#[derive(Clone, Allocative)]
+pub struct BytecodeClaimReductionParams<F: JoltField> {
+    pub phase: BytecodeReductionPhase, // selects the input claim, round count and point layout
+    pub eta: F, // batching challenge sampled from the transcript in `new`
+    pub eta_powers: [F; NUM_VAL_STAGES], // `eta_powers[s] = eta^s`
+    pub log_t: usize, // number of rounds in the cycle phase
+    pub log_k_chunk: usize, // number of rounds in the lane phase
+    pub num_chunks: usize, // `total_lanes().div_ceil(1 << log_k_chunk)`
+    /// Bytecode address point, embedded into `log_t` bits by prefixing MSB zeros (BE).
+    pub r_bc_ext: OpeningPoint<BIG_ENDIAN, F>,
+    /// Per-chunk lane weight tables (length = k_chunk) for `W_eta`; zero past `total_lanes()`.
+    pub chunk_lane_weights: Vec<Vec<F>>,
+    /// Cycle-phase challenges in little-endian (binding) order; empty until that phase has run.
+    pub cycle_var_challenges: Vec<F::Challenge>,
+}
+
+impl<F: JoltField> BytecodeClaimReductionParams<F> {
+    /// Samples `eta`, zero-extends the Stage 6a address point `r_bc` to `log_T` variables and
+    /// precomputes the per-chunk lane weights (cycle phase). Panics if `log_T < log_K`.
+    pub fn new(
+        bytecode_read_raf_params: &BytecodeReadRafSumcheckParams<F>,
+        accumulator: &dyn OpeningAccumulator<F>,
+        transcript: &mut impl Transcript,
+    ) -> Self {
+        let (log_t, log_k) = (bytecode_read_raf_params.log_T, bytecode_read_raf_params.log_K);
+        assert!(
+            log_t >= log_k,
+            "BytecodeClaimReduction requires log_T >= log_K_bytecode (got log_T={log_t}, log_K={log_k}). \
+                 Pad trace length to at least bytecode_len when enabling bytecode commitment/reduction."
+        );
+
+        // Batching challenge and its powers eta^0 .. eta^(NUM_VAL_STAGES - 1).
+        let eta: F = transcript.challenge_scalar();
+        let mut eta_powers = [F::one(); NUM_VAL_STAGES];
+        for stage in 1..NUM_VAL_STAGES {
+            eta_powers[stage] = eta_powers[stage - 1] * eta;
+        }
+
+        // r_bc comes from the Stage 6a BytecodeReadRaf address phase; pad it on the MSB side.
+        let (r_bc, _) = accumulator.get_virtual_polynomial_opening(
+            VirtualPolynomial::BytecodeReadRafAddrClaim,
+            SumcheckId::BytecodeReadRafAddressPhase,
+        );
+        let zero_prefix = std::iter::repeat(F::Challenge::from(0u128)).take(log_t - r_bc.len());
+        let padded: Vec<F::Challenge> = zero_prefix.chain(r_bc.r.iter().cloned()).collect();
+        let r_bc_ext = OpeningPoint::<BIG_ENDIAN, F>::new(padded);
+
+        let log_k_chunk = bytecode_read_raf_params.one_hot_params.log_k_chunk;
+        let k_chunk = 1 << log_k_chunk;
+        let num_chunks = total_lanes().div_ceil(k_chunk);
+        let chunk_lane_weights = compute_chunk_lane_weights(
+            bytecode_read_raf_params,
+            accumulator,
+            &eta_powers,
+            num_chunks,
+            k_chunk,
+        );
+
+        Self {
+            phase: BytecodeReductionPhase::CycleVariables,
+            eta,
+            eta_powers,
+            log_t,
+            log_k_chunk,
+            num_chunks,
+            r_bc_ext,
+            chunk_lane_weights,
+            cycle_var_challenges: vec![],
+        }
+    }
+}
+
+impl<F: JoltField> SumcheckInstanceParams<F> for BytecodeClaimReductionParams<F> {
+    /// Cycle phase: `Σ_s eta^s · Val_s` over the cached `BytecodeValStage(s)` claims.
+    /// Lane phase: the intermediate claim cached under `BytecodeClaimReductionCyclePhase`.
+    fn input_claim(&self, accumulator: &dyn OpeningAccumulator<F>) -> F {
+        let scaled_claim = |stage: usize| -> F {
+            let val_claim = accumulator
+                .get_virtual_polynomial_opening(
+                    VirtualPolynomial::BytecodeValStage(stage),
+                    SumcheckId::BytecodeReadRafAddressPhase,
+                )
+                .1;
+            self.eta_powers[stage] * val_claim
+        };
+        match self.phase {
+            BytecodeReductionPhase::CycleVariables => (0..NUM_VAL_STAGES).map(scaled_claim).sum(),
+            BytecodeReductionPhase::LaneVariables => {
+                let (_, intermediate_claim) = accumulator.get_virtual_polynomial_opening(
+                    VirtualPolynomial::BytecodeClaimReductionIntermediate,
+                    SumcheckId::BytecodeClaimReductionCyclePhase,
+                );
+                intermediate_claim
+            }
+        }
+    }
+
+    fn degree(&self) -> usize {
+        DEGREE_BOUND
+    }
+
+    fn num_rounds(&self) -> usize {
+        match self.phase {
+            BytecodeReductionPhase::CycleVariables => self.log_t,
+            BytecodeReductionPhase::LaneVariables => self.log_k_chunk,
+        }
+    }
+
+    /// Converts little-endian round challenges to a big-endian point via `match_endianness`.
+    /// In the lane phase the stored cycle challenges go first (LE): `[lane || cycle]` in BE.
+    fn normalize_opening_point(
+        &self,
+        challenges: &[<F as JoltField>::Challenge],
+    ) -> OpeningPoint<BIG_ENDIAN, F> {
+        let little_endian: Vec<F::Challenge> = match self.phase {
+            BytecodeReductionPhase::CycleVariables => challenges.to_vec(),
+            BytecodeReductionPhase::LaneVariables => {
+                [self.cycle_var_challenges.as_slice(), challenges].concat()
+            }
+        };
+        OpeningPoint::<LITTLE_ENDIAN, F>::new(little_endian).match_endianness()
+    }
+}
+
+#[derive(Allocative)]
+pub struct BytecodeClaimReductionProver<F: JoltField> {
+    pub params: BytecodeClaimReductionParams<F>, // phase, challenges and lane weights
+    /// Chunk polynomials B_i(lane, k) (eventually committed), index = lane * 2^log_t + k.
+    bytecode_chunks: Vec<MultilinearPolynomial<F>>,
+    /// Weight polynomials W_i(lane, k) = W_eta(lane) * eq(r_bc_ext, k) (multilinear), same layout.
+    weight_chunks: Vec<MultilinearPolynomial<F>>,
+}
+
+impl<F: JoltField> BytecodeClaimReductionProver<F> {
+    /// Materializes, for every chunk `i`, the two polynomials the sumcheck multiplies. Both use
+    /// the lane-major layout `index = lane * 2^log_t + k`:
+    /// - `weight_chunks[i]`: outer product of chunk `i`'s lane weights with the eq table of
+    ///   `r_bc_ext`;
+    /// - `bytecode_chunks[i]`: `lane_value` of instruction `k` for each lane of the chunk; rows
+    ///   past the end of the bytecode and lanes past `total_lanes()` stay zero.
+    #[tracing::instrument(skip_all, name = "BytecodeClaimReductionProver::initialize")]
+    pub fn initialize(
+        params: BytecodeClaimReductionParams<F>,
+        bytecode: Arc<BytecodePreprocessing>,
+    ) -> Self {
+        let t_size: usize = 1 << params.log_t;
+        let k_chunk: usize = 1 << params.log_k_chunk;
+        let total = total_lanes();
+
+        // Eq table over the (embedded) bytecode address point.
+        let eq_r_bc = EqPolynomial::<F>::evals(&params.r_bc_ext.r);
+        debug_assert_eq!(eq_r_bc.len(), t_size);
+
+        // Weight polynomials: row `lane` of chunk `i` is `lane_weights[lane] * eq_r_bc`.
+        let weight_chunks: Vec<MultilinearPolynomial<F>> = (0..params.num_chunks)
+            .into_par_iter()
+            .map(|chunk_idx| {
+                let lane_weights = &params.chunk_lane_weights[chunk_idx];
+                debug_assert_eq!(lane_weights.len(), k_chunk);
+                let mut coeffs: Vec<F> = unsafe_allocate_zero_vec(k_chunk * t_size);
+                let rows = coeffs.chunks_mut(t_size).zip(&lane_weights[..k_chunk]);
+                for (row, &weight) in rows {
+                    for (slot, &eq) in row.iter_mut().zip(&eq_r_bc[..t_size]) {
+                        *slot = weight * eq;
+                    }
+                }
+                MultilinearPolynomial::from(coeffs)
+            })
+            .collect();
+
+        // Bytecode polynomials: column `k` of chunk `i` holds the lanes of instruction `k`.
+        let bytecode_chunks: Vec<MultilinearPolynomial<F>> = (0..params.num_chunks)
+            .into_par_iter()
+            .map(|chunk_idx| {
+                let mut coeffs: Vec<F> = unsafe_allocate_zero_vec(k_chunk * t_size);
+                // Only lanes below `total_lanes()` carry data; the last chunk may be partial.
+                let first_lane = chunk_idx * k_chunk;
+                let live_lanes = total.saturating_sub(first_lane).min(k_chunk);
+                // At most `t_size` instructions are visited; shorter bytecode leaves zeros.
+                for (k, instr) in bytecode.bytecode.iter().enumerate().take(t_size) {
+                    let normalized = instr.normalize();
+                    let circuit_flags = instr.circuit_flags();
+                    let instr_flags = instr.instruction_flags();
+                    let lookup_idx = instr
+                        .lookup_table()
+                        .map(|t| LookupTables::<XLEN>::enum_index(&t));
+                    let raf_flag =
+                        !crate::zkvm::instruction::InterleavedBitsMarker::is_interleaved_operands(
+                            &circuit_flags,
+                        );
+                    // Scalars shared by every lane of this instruction.
+                    let operands = &normalized.operands;
+                    let unexpanded_pc = F::from_u64(normalized.address as u64);
+                    let imm = F::from_i128(operands.imm);
+
+                    for lane in 0..live_lanes {
+                        coeffs[lane * t_size + k] = lane_value::<F>(
+                            first_lane + lane,
+                            operands.rs1,
+                            operands.rs2,
+                            operands.rd,
+                            unexpanded_pc,
+                            imm,
+                            &circuit_flags,
+                            &instr_flags,
+                            lookup_idx,
+                            raf_flag,
+                        );
+                    }
+                }
+                MultilinearPolynomial::from(coeffs)
+            })
+            .collect();
+
+        debug_assert_eq!(bytecode_chunks.len(), params.num_chunks);
+        debug_assert_eq!(weight_chunks.len(), params.num_chunks);
+
+        Self {
+            params,
+            bytecode_chunks,
+            weight_chunks,
+        }
+    }
+
+    /// Round polynomial of `Σ_i Σ_x B_i(x) · W_i(x)` in the variable bound next (low-to-high).
+    /// Evaluations are summed over all chunk pairs; `previous_claim` serves as the hint.
+    fn compute_message_impl(&self, previous_claim: F) -> UniPoly<F> {
+        let num_pairs = self.bytecode_chunks[0].len() / 2;
+        let evals: [F; DEGREE_BOUND] = (0..num_pairs)
+            .into_par_iter()
+            .map(|j| {
+                let mut acc = [F::zero(); DEGREE_BOUND];
+                for (b, w) in self.bytecode_chunks.iter().zip(&self.weight_chunks) {
+                    let b_evals =
+                        b.sumcheck_evals_array::<DEGREE_BOUND>(j, BindingOrder::LowToHigh);
+                    let w_evals =
+                        w.sumcheck_evals_array::<DEGREE_BOUND>(j, BindingOrder::LowToHigh);
+                    let products = b_evals.iter().zip(w_evals.iter()).map(|(x, y)| *x * *y);
+                    for (slot, product) in acc.iter_mut().zip(products) {
+                        *slot += product;
+                    }
+                }
+                acc
+            })
+            .reduce(
+                || [F::zero(); DEGREE_BOUND],
+                |mut lhs, rhs| {
+                    for (l, r) in lhs.iter_mut().zip(rhs.iter()) {
+                        *l += *r;
+                    }
+                    lhs
+                },
+            );
+        UniPoly::from_evals_and_hint(previous_claim, &evals)
+    }
+}
+
+impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T> for BytecodeClaimReductionProver<F> {
+    fn get_params(&self) -> &dyn SumcheckInstanceParams<F> {
+        &self.params
+    }
+
+    /// The round index is not needed; delegates to `compute_message_impl`.
+    fn compute_message(&mut self, _round: usize, previous_claim: F) -> UniPoly<F> {
+        self.compute_message_impl(previous_claim)
+    }
+
+    /// Binds the next variable (low-to-high) of every chunk polynomial to `r_j`; during the
+    /// cycle phase the challenge is also recorded for the lane-phase opening point.
+    fn ingest_challenge(&mut self, r_j: F::Challenge, _round: usize) {
+        if matches!(self.params.phase, BytecodeReductionPhase::CycleVariables) {
+            self.params.cycle_var_challenges.push(r_j);
+        }
+        let all_polys = self
+            .bytecode_chunks
+            .iter_mut()
+            .chain(self.weight_chunks.iter_mut());
+        for poly in all_polys {
+            poly.bind_parallel(r_j, BindingOrder::LowToHigh);
+        }
+    }
+
+    /// Cycle phase: caches `Σ_i Σ_j B_i[j] · W_i[j]` over the bound coefficients as the
+    /// intermediate virtual claim. Lane phase: caches the final sumcheck claim of every
+    /// bytecode chunk at the full point, split into its lane and cycle parts.
+    fn cache_openings(
+        &self,
+        accumulator: &mut ProverOpeningAccumulator<F>,
+        transcript: &mut T,
+        sumcheck_challenges: &[F::Challenge],
+    ) {
+        let opening_point = self.params.normalize_opening_point(sumcheck_challenges);
+        match self.params.phase {
+            BytecodeReductionPhase::CycleVariables => {
+                // Cache intermediate claim for Stage 7.
+                let mut intermediate_claim = F::zero();
+                for (b, w) in self.bytecode_chunks.iter().zip(&self.weight_chunks) {
+                    debug_assert_eq!(b.len(), w.len());
+                    for idx in 0..b.len() {
+                        intermediate_claim += b.get_bound_coeff(idx) * w.get_bound_coeff(idx);
+                    }
+                }
+                accumulator.append_virtual(
+                    transcript,
+                    VirtualPolynomial::BytecodeClaimReductionIntermediate,
+                    SumcheckId::BytecodeClaimReductionCyclePhase,
+                    opening_point,
+                    intermediate_claim,
+                );
+            }
+            BytecodeReductionPhase::LaneVariables => {
+                let (r_lane, r_cycle) = opening_point.split_at(self.params.log_k_chunk);
+                let chunk_ids = (0..self.params.num_chunks)
+                    .map(CommittedPolynomial::BytecodeChunk)
+                    .collect::<Vec<CommittedPolynomial>>();
+                let mut chunk_claims: Vec<F> = Vec::with_capacity(self.bytecode_chunks.len());
+                for chunk in &self.bytecode_chunks {
+                    chunk_claims.push(chunk.final_sumcheck_claim());
+                }
+                accumulator.append_sparse(
+                    transcript,
+                    chunk_ids,
+                    SumcheckId::BytecodeClaimReduction,
+                    r_lane.r,
+                    r_cycle.r,
+                    chunk_claims,
+                );
+            }
+        }
+    }
+}
+
+pub struct BytecodeClaimReductionVerifier<F: JoltField> {
+    /// In a `RefCell` because `cache_openings(&self, ..)` records the cycle-phase challenges.
+    pub params: RefCell<BytecodeClaimReductionParams<F>>,
+}
+
+impl<F: JoltField> BytecodeClaimReductionVerifier<F> {
+    pub fn new(params: BytecodeClaimReductionParams<F>) -> Self {
+        let params = RefCell::new(params);
+        Self { params }
+    }
+}
+
+impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
+    for BytecodeClaimReductionVerifier<F>
+{
+    fn get_params(&self) -> &dyn SumcheckInstanceParams<F> {
+        // SAFETY: NOTE(review): bypasses `RefCell` borrow tracking; sound only if no reference
+        // from here outlives a later `borrow_mut()` in `cache_openings` -- callers not visible.
+        unsafe { &*self.params.as_ptr() }
+    }
+
+    /// Cycle phase: the cached intermediate claim. Lane phase: `eq(r_cycle, r_bc_ext)` times
+    /// `Σ_i B_i · W_i(r_lane)`, with `B_i` the chunk openings and `W_i` the lane-weight MLEs.
+    fn expected_output_claim(
+        &self,
+        accumulator: &VerifierOpeningAccumulator<F>,
+        sumcheck_challenges: &[F::Challenge],
+    ) -> F {
+        let params = self.params.borrow();
+        match params.phase {
+            BytecodeReductionPhase::CycleVariables => {
+                let (_, intermediate_claim) = accumulator.get_virtual_polynomial_opening(
+                    VirtualPolynomial::BytecodeClaimReductionIntermediate,
+                    SumcheckId::BytecodeClaimReductionCyclePhase,
+                );
+                intermediate_claim
+            }
+            BytecodeReductionPhase::LaneVariables => {
+                let opening_point = params.normalize_opening_point(sumcheck_challenges);
+                let (r_lane, r_cycle) = opening_point.split_at(params.log_k_chunk);
+                let eq_eval = EqPolynomial::<F>::mle(&r_cycle.r, &params.r_bc_ext.r);
+                // Evaluate each chunk's lane-weight polynomial at r_lane, scale by its opening.
+                let weighted_openings = (0..params.num_chunks)
+                    .map(|chunk_idx| {
+                        let (_, chunk_opening) = accumulator.get_committed_polynomial_opening(
+                            CommittedPolynomial::BytecodeChunk(chunk_idx),
+                            SumcheckId::BytecodeClaimReduction,
+                        );
+                        let lane_weights = params.chunk_lane_weights[chunk_idx].clone();
+                        let w_poly = MultilinearPolynomial::from(lane_weights);
+                        chunk_opening * w_poly.evaluate(&r_lane.r)
+                    })
+                    .sum::<F>();
+                weighted_openings * eq_eval
+            }
+        }
+    }
+
+    /// Registers this phase's opening point(s); the cycle phase also stores its challenges.
+    fn cache_openings(
+        &self,
+        accumulator: &mut VerifierOpeningAccumulator<F>,
+        transcript: &mut T,
+        sumcheck_challenges: &[F::Challenge],
+    ) {
+        let mut params = self.params.borrow_mut();
+        let opening_point = params.normalize_opening_point(sumcheck_challenges);
+        match params.phase {
+            BytecodeReductionPhase::CycleVariables => {
+                accumulator.append_virtual(
+                    transcript,
+                    VirtualPolynomial::BytecodeClaimReductionIntermediate,
+                    SumcheckId::BytecodeClaimReductionCyclePhase,
+                    opening_point,
+                );
+                // Record LE challenges for phase 2 normalization.
+                params.cycle_var_challenges = sumcheck_challenges.to_vec();
+            }
+            BytecodeReductionPhase::LaneVariables => {
+                let chunk_ids: Vec<CommittedPolynomial> = (0..params.num_chunks)
+                    .map(CommittedPolynomial::BytecodeChunk)
+                    .collect();
+                accumulator.append_sparse(
+                    transcript,
+                    chunk_ids,
+                    SumcheckId::BytecodeClaimReduction,
+                    opening_point.r,
+                );
+            }
+        }
+    }
+}
+
+/// Builds the lane-weight vector `W_eta` over `total_lanes()` lanes and splits it into
+/// `num_chunks` tables of `k_chunk` entries (zero past the last lane). Stage `s` contributes
+/// `eta_powers[s] * gamma`, times an `RdWa` eq-table entry on stage 4/5 register lanes.
+fn compute_chunk_lane_weights<F: JoltField>(
+    bytecode_read_raf_params: &BytecodeReadRafSumcheckParams<F>,
+    accumulator: &dyn OpeningAccumulator<F>,
+    eta_powers: &[F; NUM_VAL_STAGES],
+    num_chunks: usize,
+    k_chunk: usize,
+) -> Vec<Vec<F>> {
+    fn add_scaled<F: JoltField>(weights: &mut [F], lanes: &[usize], scale: F, gammas: &[F]) {
+        for (&lane, gamma) in lanes.iter().zip(gammas) {
+            weights[lane] += scale * *gamma;
+        }
+    }
+
+    let reg_count = REGISTER_COUNT as usize;
+    let total = total_lanes();
+    // Offsets (canonical lane ordering)
+    let rs1_start = 0usize;
+    let rs2_start = rs1_start + reg_count;
+    let rd_start = rs2_start + reg_count;
+    let unexp_pc_idx = rd_start + reg_count;
+    let imm_idx = unexp_pc_idx + 1;
+    let circuit_start = imm_idx + 1;
+    let instr_start = circuit_start + NUM_CIRCUIT_FLAGS;
+    let lookup_start = instr_start + NUM_INSTRUCTION_FLAGS;
+    let raf_flag_idx = lookup_start + LookupTables::<XLEN>::COUNT;
+    debug_assert_eq!(raf_flag_idx + 1, total);
+    let circuit_lane = |flag: CircuitFlags| circuit_start + (flag as usize);
+    let instr_lane = |flag: InstructionFlags| instr_start + (flag as usize);
+
+    // Eq tables for stage4/stage5 register selection weights.
+    let log_reg = reg_count.log_2();
+    let register_eq = |id: SumcheckId| {
+        let (point, _) = accumulator.get_virtual_polynomial_opening(VirtualPolynomial::RdWa, id);
+        EqPolynomial::<F>::evals(&point.r[..log_reg])
+    };
+    let eq_r_register_4 = register_eq(SumcheckId::RegistersReadWriteChecking);
+    let eq_r_register_5 = register_eq(SumcheckId::RegistersValEvaluation);
+    let mut weights = vec![F::zero(); total];
+
+    // Stage 1: unexpanded_pc, imm, then one gamma per circuit flag.
+    {
+        let scale = eta_powers[0];
+        let g = &bytecode_read_raf_params.stage1_gammas;
+        add_scaled(&mut weights, &[unexp_pc_idx, imm_idx], scale, &g[..2]);
+        let circuit_lanes = &mut weights[circuit_start..instr_start];
+        for (w, gamma) in circuit_lanes.iter_mut().zip(&g[2..2 + NUM_CIRCUIT_FLAGS]) {
+            *w += scale * *gamma;
+        }
+    }
+
+    // Stage 2
+    {
+        let lanes = [
+            circuit_lane(CircuitFlags::Jump),
+            instr_lane(InstructionFlags::Branch),
+            instr_lane(InstructionFlags::IsRdNotZero),
+            circuit_lane(CircuitFlags::WriteLookupOutputToRD),
+        ];
+        let g = &bytecode_read_raf_params.stage2_gammas;
+        add_scaled(&mut weights, &lanes, eta_powers[1], &g[..lanes.len()]);
+    }
+
+    // Stage 3
+    {
+        let lanes = [
+            imm_idx,
+            unexp_pc_idx,
+            instr_lane(InstructionFlags::LeftOperandIsRs1Value),
+            instr_lane(InstructionFlags::LeftOperandIsPC),
+            instr_lane(InstructionFlags::RightOperandIsRs2Value),
+            instr_lane(InstructionFlags::RightOperandIsImm),
+            instr_lane(InstructionFlags::IsNoop),
+            circuit_lane(CircuitFlags::VirtualInstruction),
+            circuit_lane(CircuitFlags::IsFirstInSequence),
+        ];
+        let g = &bytecode_read_raf_params.stage3_gammas;
+        add_scaled(&mut weights, &lanes, eta_powers[2], &g[..lanes.len()]);
+    }
+
+    // Stage 4: rd, rs1, rs2 one-hot lanes, each scaled by the stage-4 register eq table.
+    {
+        let scale = eta_powers[3];
+        let g = &bytecode_read_raf_params.stage4_gammas;
+        for (r, eq) in eq_r_register_4[..reg_count].iter().enumerate() {
+            weights[rd_start + r] += scale * g[0] * *eq;
+            weights[rs1_start + r] += scale * g[1] * *eq;
+            weights[rs2_start + r] += scale * g[2] * *eq;
+        }
+    }
+
+    // Stage 5: rd lanes scaled by the stage-5 register eq table, raf flag, lookup lanes.
+    {
+        let scale = eta_powers[4];
+        let g = &bytecode_read_raf_params.stage5_gammas;
+        for (r, eq) in eq_r_register_5[..reg_count].iter().enumerate() {
+            weights[rd_start + r] += scale * g[0] * *eq;
+        }
+        weights[raf_flag_idx] += scale * g[1];
+        let lookup_lanes = &mut weights[lookup_start..raf_flag_idx];
+        for (w, gamma) in lookup_lanes.iter_mut().zip(&g[2..2 + LookupTables::<XLEN>::COUNT]) {
+            *w += scale * *gamma;
+        }
+    }
+
+    // Chunk into k_chunk-sized blocks; indices past the last lane read as zero.
+    (0..num_chunks)
+        .map(|chunk_idx| {
+            let first = chunk_idx * k_chunk;
+            (first..first + k_chunk)
+                .map(|global| weights.get(global).copied().unwrap_or_else(F::zero))
+                .collect_vec()
+        })
+        .collect_vec()
+}
+
+/// Value stored in lane `global_lane` for a single instruction. Lanes follow the canonical
+/// ordering: rs1 | rs2 | rd one-hot blocks, `unexpanded_pc`, `imm`, circuit flags,
+/// instruction flags, lookup-table one-hot block, raf flag.
+#[allow(clippy::too_many_arguments)]
+#[inline(always)]
+fn lane_value<F: JoltField>(
+    global_lane: usize,
+    rs1: Option<u8>,
+    rs2: Option<u8>,
+    rd: Option<u8>,
+    unexpanded_pc: F,
+    imm: F,
+    circuit_flags: &[bool; NUM_CIRCUIT_FLAGS],
+    instr_flags: &[bool; NUM_INSTRUCTION_FLAGS],
+    lookup_idx: Option<usize>,
+    raf_flag: bool,
+) -> F {
+    let reg_count = REGISTER_COUNT as usize;
+    // Start of each block; a block ends where the next one begins.
+    let rs2_start = reg_count;
+    let rd_start = rs2_start + reg_count;
+    let unexp_pc_idx = rd_start + reg_count;
+    let imm_idx = unexp_pc_idx + 1;
+    let circuit_start = imm_idx + 1;
+    let instr_start = circuit_start + NUM_CIRCUIT_FLAGS;
+    let lookup_start = instr_start + NUM_INSTRUCTION_FLAGS;
+    let raf_flag_idx = lookup_start + LookupTables::<XLEN>::COUNT;
+
+    // One-hot test for a register block; `offset` is the lane's position inside the block.
+    let selects = |reg: Option<u8>, offset: usize| F::from_bool(reg == Some(offset as u8));
+
+    if global_lane < rs2_start {
+        // rs1 one-hot
+        selects(rs1, global_lane)
+    } else if global_lane < rd_start {
+        // rs2 one-hot
+        selects(rs2, global_lane - rs2_start)
+    } else if global_lane < unexp_pc_idx {
+        // rd one-hot
+        selects(rd, global_lane - rd_start)
+    } else if global_lane == unexp_pc_idx {
+        // scalar lane: unexpanded_pc
+        unexpanded_pc
+    } else if global_lane == imm_idx {
+        // scalar lane: imm
+        imm
+    } else if global_lane < instr_start {
+        // circuit flag
+        F::from_bool(circuit_flags[global_lane - circuit_start])
+    } else if global_lane < lookup_start {
+        // instruction flag
+        F::from_bool(instr_flags[global_lane - instr_start])
+    } else if global_lane < raf_flag_idx {
+        // lookup-table one-hot
+        F::from_bool(lookup_idx == Some(global_lane - lookup_start))
+    } else {
+        // last lane: raf flag
+        debug_assert_eq!(global_lane, raf_flag_idx);
+        F::from_bool(raf_flag)
+    }
+}
diff --git a/jolt-core/src/zkvm/claim_reductions/mod.rs b/jolt-core/src/zkvm/claim_reductions/mod.rs
index 5d19f993a1..d208bff0f9 100644
--- a/jolt-core/src/zkvm/claim_reductions/mod.rs
+++ b/jolt-core/src/zkvm/claim_reductions/mod.rs
@@ -1,4 +1,5 @@
 pub mod advice;
+pub mod bytecode;
 pub mod hamming_weight;
 pub mod increments;
 pub mod instruction_lookups;
@@ -9,6 +10,10 @@ pub use advice::{
     AdviceClaimReductionParams, AdviceClaimReductionProver, AdviceClaimReductionVerifier,
     AdviceKind,
 };
+pub use bytecode::{
+    BytecodeClaimReductionParams, BytecodeClaimReductionProver, BytecodeClaimReductionVerifier,
+    BytecodeReductionPhase,
+};
 pub use hamming_weight::{
     HammingWeightClaimReductionParams, HammingWeightClaimReductionProver,
     HammingWeightClaimReductionVerifier,
diff --git a/jolt-core/src/zkvm/proof_serialization.rs b/jolt-core/src/zkvm/proof_serialization.rs
index 2426b31124..f80340d81f 100644
--- a/jolt-core/src/zkvm/proof_serialization.rs
+++ b/jolt-core/src/zkvm/proof_serialization.rs
@@ -253,6 +253,10 @@ impl CanonicalSerialize for CommittedPolynomial {
                 3u8.serialize_with_mode(&mut writer, compress)?;
                 (u8::try_from(*i).unwrap()).serialize_with_mode(writer, compress)
             }
+            Self::BytecodeChunk(i) => {
+                7u8.serialize_with_mode(&mut writer, compress)?;
+                (u8::try_from(*i).unwrap()).serialize_with_mode(writer, compress)
+            }
             Self::RamRa(i) => {
                 4u8.serialize_with_mode(&mut writer, compress)?;
                 (u8::try_from(*i).unwrap()).serialize_with_mode(writer, compress)
@@ -265,7 +269,10 @@ impl CanonicalSerialize for CommittedPolynomial {
     fn serialized_size(&self, _compress: Compress) -> usize {
         match self {
             Self::RdInc | Self::RamInc | Self::TrustedAdvice | Self::UntrustedAdvice => 1,
-            Self::InstructionRa(_) | Self::BytecodeRa(_) | Self::RamRa(_) => 2,
+            Self::InstructionRa(_)
+            | Self::BytecodeRa(_)
+            | Self::BytecodeChunk(_)
+            | Self::RamRa(_) => 2,
         }
     }
 }
@@ -300,6 +307,10 @@ impl CanonicalDeserialize for CommittedPolynomial {
                 }
                 5 => Self::TrustedAdvice,
                 6 => Self::UntrustedAdvice,
+                7 => {
+                    let i = u8::deserialize_with_mode(reader, compress, validate)?;
+                    Self::BytecodeChunk(i as usize)
+                }
                 _ => return Err(SerializationError::InvalidData),
             },
         )
diff --git a/jolt-core/src/zkvm/prover.rs b/jolt-core/src/zkvm/prover.rs
index 35a2455ad4..e03dec909e 100644
--- a/jolt-core/src/zkvm/prover.rs
+++ b/jolt-core/src/zkvm/prover.rs
@@ -55,6 +55,7 @@ use crate::{
         bytecode::read_raf_checking::BytecodeReadRafSumcheckParams,
         claim_reductions::{
             AdviceClaimReductionParams, AdviceClaimReductionProver, AdviceKind,
+            BytecodeClaimReductionParams, BytecodeClaimReductionProver, BytecodeReductionPhase,
             HammingWeightClaimReductionParams, HammingWeightClaimReductionProver,
             IncClaimReductionSumcheckParams, IncClaimReductionSumcheckProver,
             InstructionLookupsClaimReductionSumcheckParams,
@@ -158,6 +159,9 @@ pub struct JoltCpuProver<
     /// The advice claim reduction sumcheck effectively spans two stages (6 and 7).
     /// Cache the prover state here between stages.
     advice_reduction_prover_untrusted: Option<AdviceClaimReductionProver<F>>,
+    /// The bytecode claim reduction sumcheck effectively spans two stages (6b and 7).
+    /// Cache the prover state here between stages.
+    bytecode_reduction_prover: Option<BytecodeClaimReductionProver<F>>,
     pub unpadded_trace_len: usize,
     pub padded_trace_len: usize,
     pub transcript: ProofTranscript,
@@ -407,6 +411,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             },
             advice_reduction_prover_trusted: None,
             advice_reduction_prover_untrusted: None,
+            bytecode_reduction_prover: None,
             unpadded_trace_len,
             padded_trace_len,
             transcript,
@@ -1170,6 +1175,24 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             &mut self.transcript,
         );
 
+        // Bytecode claim reduction (Phase 1 in Stage 6b): consumes Val_s(r_bc) from Stage 6a and
+        // caches an intermediate claim for Stage 7.
+        if bytecode_read_raf_params.log_T >= bytecode_read_raf_params.log_K {
+            let bytecode_reduction_params = BytecodeClaimReductionParams::new(
+                &bytecode_read_raf_params,
+                &self.opening_accumulator,
+                &mut self.transcript,
+            );
+            self.bytecode_reduction_prover = Some(BytecodeClaimReductionProver::initialize(
+                bytecode_reduction_params,
+                Arc::clone(&self.preprocessing.shared.bytecode),
+            ));
+        } else {
+            // Not enough cycle randomness to embed the bytecode index vars into Stage 6b.
+            // Fall back to the legacy verifier path (O(K_bytecode) in Stage 6b) by not running the reduction.
+            self.bytecode_reduction_prover = None;
+        }
+
         // Advice claim reduction (Phase 1 in Stage 6b): trusted and untrusted are separate instances.
         if self.advice.trusted_advice_polynomial.is_some() {
             let trusted_advice_params = AdviceClaimReductionParams::new(
@@ -1279,6 +1302,9 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             &mut lookups_ra_virtual,
             &mut inc_reduction,
         ];
+        if let Some(bytecode) = self.bytecode_reduction_prover.as_mut() {
+            instances.push(bytecode);
+        }
         if let Some(advice) = self.advice_reduction_prover_trusted.as_mut() {
             instances.push(advice);
         }
@@ -1289,6 +1315,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         #[cfg(feature = "allocative")]
         write_instance_flamegraph_svg(&instances, "stage6b_start_flamechart.svg");
         tracing::info!("Stage 6b proving");
+
         let (sumcheck_proof, _r_stage6b) = BatchedSumcheck::prove(
             instances.iter_mut().map(|v| &mut **v as _).collect(),
             &mut self.opening_accumulator,
@@ -1327,10 +1354,15 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         print_data_structure_heap_usage("HammingWeightClaimReductionProver", &hw_prover);
 
         // Run Stage 7 batched sumcheck (address rounds only).
-        // Includes HammingWeightClaimReduction plus address phase of advice reduction instances (if needed).
+        // Includes HammingWeightClaimReduction plus lane/address-phase reductions (if needed).
         let mut instances: Vec<Box<dyn SumcheckInstanceProver<F, ProofTranscript>>> =
             vec![Box::new(hw_prover)];
 
+        if let Some(mut bytecode_reduction_prover) = self.bytecode_reduction_prover.take() {
+            bytecode_reduction_prover.params.phase = BytecodeReductionPhase::LaneVariables;
+            instances.push(Box::new(bytecode_reduction_prover));
+        }
+
         if let Some(mut advice_reduction_prover_trusted) =
             self.advice_reduction_prover_trusted.take()
         {
diff --git a/jolt-core/src/zkvm/verifier.rs b/jolt-core/src/zkvm/verifier.rs
index 7d87c3573c..4e55d61e26 100644
--- a/jolt-core/src/zkvm/verifier.rs
+++ b/jolt-core/src/zkvm/verifier.rs
@@ -23,9 +23,10 @@ use crate::zkvm::{
         BytecodeReadRafSumcheckParams,
     },
     claim_reductions::{
-        AdviceClaimReductionVerifier, AdviceKind, HammingWeightClaimReductionVerifier,
-        IncClaimReductionSumcheckVerifier, InstructionLookupsClaimReductionSumcheckVerifier,
-        RamRaClaimReductionSumcheckVerifier,
+        AdviceClaimReductionVerifier, AdviceKind, BytecodeClaimReductionParams,
+        BytecodeClaimReductionVerifier, BytecodeReductionPhase,
+        HammingWeightClaimReductionVerifier, IncClaimReductionSumcheckVerifier,
+        InstructionLookupsClaimReductionSumcheckVerifier, RamRaClaimReductionSumcheckVerifier,
     },
     fiat_shamir_preamble,
     instruction_lookups::{
@@ -96,6 +97,9 @@ pub struct JoltVerifier<
     /// The advice claim reduction sumcheck effectively spans two stages (6 and 7).
     /// Cache the verifier state here between stages.
     advice_reduction_verifier_untrusted: Option<AdviceClaimReductionVerifier<F>>,
+    /// The bytecode claim reduction sumcheck effectively spans two stages (6b and 7).
+    /// Cache the verifier state here between stages.
+    bytecode_reduction_verifier: Option<BytecodeClaimReductionVerifier<F>>,
     pub spartan_key: UniformSpartanKey<F>,
     pub one_hot_params: OneHotParams,
 }
@@ -177,6 +181,7 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             opening_accumulator,
             advice_reduction_verifier_trusted: None,
             advice_reduction_verifier_untrusted: None,
+            bytecode_reduction_verifier: None,
             spartan_key,
             one_hot_params,
         })
@@ -457,7 +462,6 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
         booleanity_params: BooleanitySumcheckParams<F>,
     ) -> Result<(), anyhow::Error> {
         // Initialize Stage 6b cycle verifiers from scratch (Option B).
-        let bytecode_read_raf = BytecodeReadRafCycleSumcheckVerifier::new(bytecode_read_raf_params);
         let booleanity = BooleanityCycleSumcheckVerifier::new(booleanity_params);
         let ram_hamming_booleanity =
             HammingBooleanitySumcheckVerifier::new(&self.opening_accumulator);
@@ -478,6 +482,26 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             &mut self.transcript,
         );
 
+        // Bytecode claim reduction (Phase 1 in Stage 6b): consumes Val_s(r_bc) from Stage 6a and
+        // caches an intermediate claim for Stage 7.
+        //
+        // IMPORTANT: This must be sampled *after* other Stage 6b params (e.g. lookup/inc gammas),
+        // to match the prover's transcript order.
+        if bytecode_read_raf_params.log_T >= bytecode_read_raf_params.log_K {
+            let bytecode_reduction_params = BytecodeClaimReductionParams::new(
+                &bytecode_read_raf_params,
+                &self.opening_accumulator,
+                &mut self.transcript,
+            );
+            self.bytecode_reduction_verifier = Some(BytecodeClaimReductionVerifier::new(
+                bytecode_reduction_params,
+            ));
+        } else {
+            // Not enough cycle randomness to embed the bytecode index vars into Stage 6b.
+            // Fall back to the legacy verifier path (O(K_bytecode) in Stage 6b) by not running the reduction.
+            self.bytecode_reduction_verifier = None;
+        }
+
         // Advice claim reduction (Phase 1 in Stage 6b): trusted and untrusted are separate instances.
         if self.trusted_advice_commitment.is_some() {
             self.advice_reduction_verifier_trusted = Some(AdviceClaimReductionVerifier::new(
@@ -504,6 +528,8 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             ));
         }
 
+        let bytecode_read_raf = BytecodeReadRafCycleSumcheckVerifier::new(bytecode_read_raf_params);
+
         let mut instances: Vec<&dyn SumcheckInstanceVerifier<F, ProofTranscript>> = vec![
             &bytecode_read_raf,
             &ram_hamming_booleanity,
@@ -512,6 +538,9 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             &lookups_ra_virtual,
             &inc_reduction,
         ];
+        if let Some(ref bytecode) = self.bytecode_reduction_verifier {
+            instances.push(bytecode);
+        }
         if let Some(ref advice) = self.advice_reduction_verifier_trusted {
             instances.push(advice);
         }
@@ -542,6 +571,12 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
 
         let mut instances: Vec<&dyn SumcheckInstanceVerifier<F, ProofTranscript>> =
             vec![&hw_verifier];
+
+        if let Some(bytecode_reduction_verifier) = self.bytecode_reduction_verifier.as_mut() {
+            bytecode_reduction_verifier.params.borrow_mut().phase =
+                BytecodeReductionPhase::LaneVariables;
+            instances.push(bytecode_reduction_verifier);
+        }
         if let Some(advice_reduction_verifier_trusted) =
             self.advice_reduction_verifier_trusted.as_mut()
         {
diff --git a/jolt-core/src/zkvm/witness.rs b/jolt-core/src/zkvm/witness.rs
index c661f3a708..bde767ba52 100644
--- a/jolt-core/src/zkvm/witness.rs
+++ b/jolt-core/src/zkvm/witness.rs
@@ -31,6 +31,9 @@ pub enum CommittedPolynomial {
     InstructionRa(usize),
     /// One-hot ra polynomial for the bytecode instance of Shout
     BytecodeRa(usize),
+    /// Packed bytecode commitment chunk polynomial (lane chunk i).
+    /// This is used by BytecodeClaimReduction; commitment + batching integration is staged separately.
+    BytecodeChunk(usize),
     /// One-hot ra/wa polynomial for the RAM instance of Twist
     /// Note that for RAM, ra and wa are the same polynomial because
     /// there is at most one load or store per cycle.
@@ -114,6 +117,9 @@ impl CommittedPolynomial {
                     .collect();
                 PCS::process_chunk_onehot(setup, one_hot_params.k_chunk, &row)
             }
+            CommittedPolynomial::BytecodeChunk(_) => {
+                panic!("Bytecode chunk polynomials are not stream-committed yet")
+            }
             CommittedPolynomial::RamRa(idx) => {
                 let row: Vec<Option<usize>> = row_cycles
                     .iter()
@@ -159,6 +165,9 @@ impl CommittedPolynomial {
                     one_hot_params.k_chunk,
                 ))
             }
+            CommittedPolynomial::BytecodeChunk(_) => {
+                panic!("Bytecode chunk polynomials are not supported by generate_witness yet")
+            }
             CommittedPolynomial::RamRa(i) => {
                 let one_hot_params = one_hot_params.unwrap();
                 let addresses: Vec<_> = trace

From e0228acef964cdfa8b48ac83a76a0e9df50d9fdd Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Mon, 19 Jan 2026 20:30:23 -0800
Subject: [PATCH 05/16] feat(zkvm): add bytecode commitment mode config

---
 jolt-core/src/utils/errors.rs                 |  2 +
 .../src/zkvm/bytecode/read_raf_checking.rs    | 66 +++++++--------
 jolt-core/src/zkvm/config.rs                  | 57 ++++++++++++-
 jolt-core/src/zkvm/proof_serialization.rs     |  3 +-
 jolt-core/src/zkvm/prover.rs                  | 84 +++++++++++++++++--
 jolt-core/src/zkvm/verifier.rs                | 22 ++++-
 6 files changed, 190 insertions(+), 44 deletions(-)

diff --git a/jolt-core/src/utils/errors.rs b/jolt-core/src/utils/errors.rs
index a9e8b12909..e8b1b9fee1 100644
--- a/jolt-core/src/utils/errors.rs
+++ b/jolt-core/src/utils/errors.rs
@@ -28,6 +28,8 @@ pub enum ProofVerifyError {
     InvalidReadWriteConfig(String),
     #[error("Invalid one-hot configuration: {0}")]
     InvalidOneHotConfig(String),
+    #[error("Invalid bytecode commitment configuration: {0}")]
+    InvalidBytecodeConfig(String),
     #[error("Dory proof verification failed: {0}")]
     DoryError(String),
     #[error("Sumcheck verification failed")]
diff --git a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
index 9ddc776262..6f40df8145 100644
--- a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
+++ b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
@@ -27,7 +27,7 @@ use crate::{
     utils::{math::Math, small_scalar::SmallScalar, thread::unsafe_allocate_zero_vec},
     zkvm::{
         bytecode::BytecodePreprocessing,
-        config::OneHotParams,
+        config::{BytecodeCommitmentMode, OneHotParams},
         instruction::{
             CircuitFlags, Flags, InstructionFlags, InstructionLookup, InterleavedBitsMarker,
             NUM_CIRCUIT_FLAGS,
@@ -859,9 +859,9 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T>
             address_claim,
         );
 
-        // Emit Val-only claims at the Stage 6a boundary only when the cycle phase has enough
-        // randomness to support the bytecode claim reduction path (`log_T >= log_K`).
-        if self.params.log_T >= self.params.log_K {
+        // Emit Val-only claims at the Stage 6a boundary only when the staged-Val/claim-reduction
+        // path is enabled.
+        if self.params.use_staged_val_claims {
             for stage in 0..N_STAGES {
                 let claim = self.params.val_polys[stage].final_sumcheck_claim();
                 accumulator.append_virtual(
@@ -1264,33 +1264,29 @@ impl<F: JoltField> BytecodeReadRafAddressSumcheckVerifier<F> {
         one_hot_params: &OneHotParams,
         opening_accumulator: &VerifierOpeningAccumulator<F>,
         transcript: &mut impl Transcript,
+        bytecode_mode: BytecodeCommitmentMode,
     ) -> Self {
-        let log_k = one_hot_params.bytecode_k.log_2();
-        Self {
-            // If `log_T >= log_K_bytecode`, the verifier can use the fast path (no bytecode-length
-            // work) by consuming `Val_s(r_bc)` from Stage 6a and (eventually) checking them via
-            // BytecodeClaimReduction + committed bytecode.
-            //
-            // Otherwise, we fall back to the legacy path and materialize the Val polynomials
-            // (O(K_bytecode)) to keep soundness without requiring extra padding.
-            params: if n_cycle_vars >= log_k {
-                BytecodeReadRafSumcheckParams::gen_verifier(
-                    bytecode_preprocessing,
-                    n_cycle_vars,
-                    one_hot_params,
-                    opening_accumulator,
-                    transcript,
-                )
-            } else {
-                BytecodeReadRafSumcheckParams::gen(
-                    bytecode_preprocessing,
-                    n_cycle_vars,
-                    one_hot_params,
-                    opening_accumulator,
-                    transcript,
-                )
-            },
-        }
+        let mut params = match bytecode_mode {
+            // Commitment mode: verifier MUST avoid O(K_bytecode) work here, and later stages will
+            // relate staged Val claims to committed bytecode.
+            BytecodeCommitmentMode::Commitment => BytecodeReadRafSumcheckParams::gen_verifier(
+                bytecode_preprocessing,
+                n_cycle_vars,
+                one_hot_params,
+                opening_accumulator,
+                transcript,
+            ),
+            // Legacy mode: verifier materializes/evaluates bytecode-dependent polynomials (O(K_bytecode)).
+            BytecodeCommitmentMode::Legacy => BytecodeReadRafSumcheckParams::gen(
+                bytecode_preprocessing,
+                n_cycle_vars,
+                one_hot_params,
+                opening_accumulator,
+                transcript,
+            ),
+        };
+        params.use_staged_val_claims = bytecode_mode == BytecodeCommitmentMode::Commitment;
+        Self { params }
     }
 
     /// Consume this verifier and return the underlying parameters (for Option B orchestration).
@@ -1350,8 +1346,8 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
         );
 
         // Populate opening points for the Val-only bytecode stage claims emitted in Stage 6a,
-        // but only when that fast path is enabled (`log_T >= log_K`).
-        if self.params.log_T >= self.params.log_K {
+        // but only when the staged-Val/claim-reduction path is enabled.
+        if self.params.use_staged_val_claims {
             for stage in 0..N_STAGES {
                 accumulator.append_virtual(
                     transcript,
@@ -1428,7 +1424,7 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
             F::zero(),                              // There's no raf for Stage4
             F::zero(),                              // There's no raf for Stage5
         ];
-        let val = if self.params.val_polys[0].original_len() == 0 {
+        let val = if self.params.use_staged_val_claims {
             // Fast verifier path: consume Val_s(r_bc) claims emitted at the Stage 6a boundary,
             // rather than re-evaluating `val_polys` (O(K_bytecode)).
             (0..N_STAGES)
@@ -1513,6 +1509,9 @@ pub struct BytecodeReadRafSumcheckParams<F: JoltField> {
     /// log2(K) and log2(T) used to determine round counts.
     pub log_K: usize,
     pub log_T: usize,
+    /// If true, Stage 6a emits `Val_s(r_bc)` as virtual openings and Stage 6b consumes them
+    /// (instead of verifier re-materializing/evaluating `val_polys`).
+    pub use_staged_val_claims: bool,
     /// Number of address chunks (and RA polynomials in the product).
     pub d: usize,
     /// Stage Val polynomials evaluated over address vars.
@@ -1695,6 +1694,7 @@ impl<F: JoltField> BytecodeReadRafSumcheckParams<F> {
             log_K: one_hot_params.bytecode_k.log_2(),
             d: one_hot_params.bytecode_d,
             log_T: n_cycle_vars,
+            use_staged_val_claims: false,
             val_polys,
             rv_claims,
             raf_claim,
diff --git a/jolt-core/src/zkvm/config.rs b/jolt-core/src/zkvm/config.rs
index c7846b1347..59d48757de 100644
--- a/jolt-core/src/zkvm/config.rs
+++ b/jolt-core/src/zkvm/config.rs
@@ -1,5 +1,8 @@
 use allocative::Allocative;
-use ark_serialize::{CanonicalDeserialize, CanonicalSerialize};
+use ark_serialize::{
+    CanonicalDeserialize, CanonicalSerialize, Compress, SerializationError, Valid, Validate,
+};
+use std::io::{Read, Write};
 
 use crate::field::JoltField;
 use crate::utils::math::Math;
@@ -20,6 +23,58 @@ pub fn get_instruction_sumcheck_phases(log_t: usize) -> usize {
     }
 }
 
+/// Controls whether the prover/verifier use the **legacy** bytecode path (verifier may do O(K))
+/// or the new **bytecode-commitment/claim-reduction** path (requires padding so `T >= K_bytecode`).
+///
+/// Defaults to [`BytecodeCommitmentMode::Legacy`]. The explicit discriminants are also the wire
+/// format: serialization writes the discriminant as a `u8`, and deserialization maps `0`/`1`
+/// back to a variant and rejects every other value with `SerializationError::InvalidData`.
+#[repr(u8)]
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Allocative)]
+pub enum BytecodeCommitmentMode {
+    /// Legacy mode: verifier may materialize bytecode-dependent polynomials (O(K_bytecode)).
+    #[default]
+    Legacy = 0,
+    /// Commitment mode: use staged Val claims + BytecodeClaimReduction; requires `log_T >= log_K`.
+    Commitment = 1,
+}
+
+impl CanonicalSerialize for BytecodeCommitmentMode {
+    fn serialize_with_mode<W: Write>(
+        &self,
+        writer: W,
+        compress: Compress,
+    ) -> Result<(), SerializationError> {
+        (*self as u8).serialize_with_mode(writer, compress)
+    }
+
+    fn serialized_size(&self, compress: Compress) -> usize {
+        (*self as u8).serialized_size(compress)
+    }
+}
+
+impl Valid for BytecodeCommitmentMode {
+    // Any value of this enum is one of the two variants above, so no extra validation is needed.
+    fn check(&self) -> Result<(), SerializationError> {
+        Ok(())
+    }
+}
+
+impl CanonicalDeserialize for BytecodeCommitmentMode {
+    fn deserialize_with_mode<R: Read>(
+        reader: R,
+        compress: Compress,
+        validate: Validate,
+    ) -> Result<Self, SerializationError> {
+        let value = u8::deserialize_with_mode(reader, compress, validate)?;
+        match value {
+            0 => Ok(Self::Legacy),
+            1 => Ok(Self::Commitment),
+            _ => Err(SerializationError::InvalidData),
+        }
+    }
+}
+
 /// Configuration for read-write checking sumchecks.
 ///
 /// Contains parameters that control phase structure for RAM and register
diff --git a/jolt-core/src/zkvm/proof_serialization.rs b/jolt-core/src/zkvm/proof_serialization.rs
index f80340d81f..c6e012e0e2 100644
--- a/jolt-core/src/zkvm/proof_serialization.rs
+++ b/jolt-core/src/zkvm/proof_serialization.rs
@@ -19,7 +19,7 @@ use crate::{
     subprotocols::sumcheck::SumcheckInstanceProof,
     transcripts::Transcript,
     zkvm::{
-        config::{OneHotConfig, ReadWriteConfig},
+        config::{BytecodeCommitmentMode, OneHotConfig, ReadWriteConfig},
         instruction::{CircuitFlags, InstructionFlags},
         witness::{CommittedPolynomial, VirtualPolynomial},
     },
@@ -44,6 +44,7 @@ pub struct JoltProof<F: JoltField, PCS: CommitmentScheme<Field = F>, FS: Transcr
     pub trace_length: usize,
     pub ram_K: usize,
     pub bytecode_K: usize,
+    pub bytecode_mode: BytecodeCommitmentMode,
     pub rw_config: ReadWriteConfig,
     pub one_hot_config: OneHotConfig,
     pub dory_layout: DoryLayout,
diff --git a/jolt-core/src/zkvm/prover.rs b/jolt-core/src/zkvm/prover.rs
index e03dec909e..c3feb78792 100644
--- a/jolt-core/src/zkvm/prover.rs
+++ b/jolt-core/src/zkvm/prover.rs
@@ -16,7 +16,7 @@ use std::{
 use crate::poly::commitment::dory::DoryContext;
 use ark_serialize::{CanonicalDeserialize, CanonicalSerialize};
 
-use crate::zkvm::config::ReadWriteConfig;
+use crate::zkvm::config::{BytecodeCommitmentMode, ReadWriteConfig};
 use crate::zkvm::verifier::JoltSharedPreprocessing;
 use crate::zkvm::Serializable;
 
@@ -171,6 +171,8 @@ pub struct JoltCpuProver<
     pub final_ram_state: Vec<u64>,
     pub one_hot_params: OneHotParams,
     pub rw_config: ReadWriteConfig,
+    /// First-class selection of legacy vs bytecode-commitment/claim-reduction mode.
+    pub bytecode_mode: BytecodeCommitmentMode,
 }
 impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscript: Transcript>
     JoltCpuProver<'a, F, PCS, ProofTranscript>
@@ -183,6 +185,29 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         trusted_advice: &[u8],
         trusted_advice_commitment: Option<PCS::Commitment>,
         trusted_advice_hint: Option<PCS::OpeningProofHint>,
+    ) -> Self {
+        Self::gen_from_elf_with_bytecode_mode(
+            preprocessing,
+            elf_contents,
+            inputs,
+            untrusted_advice,
+            trusted_advice,
+            trusted_advice_commitment,
+            trusted_advice_hint,
+            BytecodeCommitmentMode::Legacy,
+        )
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    pub fn gen_from_elf_with_bytecode_mode(
+        preprocessing: &'a JoltProverPreprocessing<F, PCS>,
+        elf_contents: &[u8],
+        inputs: &[u8],
+        untrusted_advice: &[u8],
+        trusted_advice: &[u8],
+        trusted_advice_commitment: Option<PCS::Commitment>,
+        trusted_advice_hint: Option<PCS::OpeningProofHint>,
+        bytecode_mode: BytecodeCommitmentMode,
     ) -> Self {
         let memory_config = MemoryConfig {
             max_untrusted_advice_size: preprocessing.shared.memory_layout.max_untrusted_advice_size,
@@ -228,7 +253,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             trace.len(),
         );
 
-        Self::gen_from_trace(
+        Self::gen_from_trace_with_bytecode_mode(
             preprocessing,
             lazy_trace,
             trace,
@@ -236,6 +261,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             trusted_advice_commitment,
             trusted_advice_hint,
             final_memory_state,
+            bytecode_mode,
         )
     }
 
@@ -317,6 +343,28 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
     }
 
     pub fn gen_from_trace(
+        preprocessing: &'a JoltProverPreprocessing<F, PCS>,
+        lazy_trace: LazyTraceIterator,
+        trace: Vec<Cycle>,
+        program_io: JoltDevice,
+        trusted_advice_commitment: Option<PCS::Commitment>,
+        trusted_advice_hint: Option<PCS::OpeningProofHint>,
+        final_memory_state: Memory,
+    ) -> Self {
+        Self::gen_from_trace_with_bytecode_mode(
+            preprocessing,
+            lazy_trace,
+            trace,
+            program_io,
+            trusted_advice_commitment,
+            trusted_advice_hint,
+            final_memory_state,
+            BytecodeCommitmentMode::Legacy,
+        )
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    pub fn gen_from_trace_with_bytecode_mode(
         preprocessing: &'a JoltProverPreprocessing<F, PCS>,
         lazy_trace: LazyTraceIterator,
         mut trace: Vec<Cycle>,
@@ -324,6 +372,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         trusted_advice_commitment: Option<PCS::Commitment>,
         trusted_advice_hint: Option<PCS::OpeningProofHint>,
         final_memory_state: Memory,
+        bytecode_mode: BytecodeCommitmentMode,
     ) -> Self {
         // truncate trailing zeros on device outputs
         program_io.outputs.truncate(
@@ -341,6 +390,22 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         } else {
             (trace.len() + 1).next_power_of_two()
         };
+
+        // If we intend to use the bytecode-commitment/claim-reduction path, we must ensure
+        // `log_T >= log_K_bytecode`, i.e. `T >= K_bytecode`. Enforce by padding up-front.
+        let mut padded_trace_len = padded_trace_len;
+        if bytecode_mode == BytecodeCommitmentMode::Commitment {
+            let bytecode_k = preprocessing.shared.bytecode.code_size;
+            if bytecode_k > preprocessing.shared.max_padded_trace_length {
+                panic!(
+                    "Bytecode commitment mode requires max_padded_trace_length >= bytecode_K.\n\
+                     bytecode_K={} > max_padded_trace_length={}\n\
+                     Increase max_trace_length in preprocessing (JoltSharedPreprocessing::new).",
+                    bytecode_k, preprocessing.shared.max_padded_trace_length
+                );
+            }
+            padded_trace_len = padded_trace_len.max(bytecode_k);
+        }
         // We may need extra padding so the main Dory matrix has enough (row, col) variables
         // to embed advice commitments committed in their own preprocessing-only contexts.
         let has_trusted_advice = !program_io.trusted_advice.is_empty();
@@ -421,6 +486,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             final_ram_state,
             one_hot_params,
             rw_config,
+            bytecode_mode,
         }
     }
 
@@ -509,6 +575,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             trace_length: self.trace.len(),
             ram_K: self.one_hot_params.ram_k,
             bytecode_K: self.one_hot_params.bytecode_k,
+            bytecode_mode: self.bytecode_mode,
             rw_config: self.rw_config.clone(),
             one_hot_config: self.one_hot_params.to_config(),
             dory_layout: DoryGlobals::get_layout(),
@@ -1094,13 +1161,15 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         #[cfg(not(target_arch = "wasm32"))]
         print_current_memory_usage("Stage 6a baseline");
 
-        let bytecode_read_raf_params = BytecodeReadRafSumcheckParams::gen(
+        let mut bytecode_read_raf_params = BytecodeReadRafSumcheckParams::gen(
             &self.preprocessing.shared.bytecode,
             self.trace.len().log_2(),
             &self.one_hot_params,
             &self.opening_accumulator,
             &mut self.transcript,
         );
+        bytecode_read_raf_params.use_staged_val_claims =
+            self.bytecode_mode == BytecodeCommitmentMode::Commitment;
 
         let booleanity_params = BooleanitySumcheckParams::new(
             self.trace.len().log_2(),
@@ -1177,7 +1246,11 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
 
         // Bytecode claim reduction (Phase 1 in Stage 6b): consumes Val_s(r_bc) from Stage 6a and
         // caches an intermediate claim for Stage 7.
-        if bytecode_read_raf_params.log_T >= bytecode_read_raf_params.log_K {
+        if self.bytecode_mode == BytecodeCommitmentMode::Commitment {
+            debug_assert!(
+                bytecode_read_raf_params.log_T >= bytecode_read_raf_params.log_K,
+                "commitment mode requires log_T >= log_K_bytecode"
+            );
             let bytecode_reduction_params = BytecodeClaimReductionParams::new(
                 &bytecode_read_raf_params,
                 &self.opening_accumulator,
@@ -1188,8 +1261,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
                 Arc::clone(&self.preprocessing.shared.bytecode),
             ));
         } else {
-            // Not enough cycle randomness to embed the bytecode index vars into Stage 6b.
-            // Fall back to the legacy verifier path (O(K_bytecode) in Stage 6b) by not running the reduction.
+            // Legacy mode: do not run the bytecode claim reduction.
             self.bytecode_reduction_prover = None;
         }
 
diff --git a/jolt-core/src/zkvm/verifier.rs b/jolt-core/src/zkvm/verifier.rs
index 4e55d61e26..0d800222c7 100644
--- a/jolt-core/src/zkvm/verifier.rs
+++ b/jolt-core/src/zkvm/verifier.rs
@@ -10,6 +10,7 @@ use crate::subprotocols::sumcheck::BatchedSumcheck;
 use crate::zkvm::bytecode::BytecodePreprocessing;
 use crate::zkvm::claim_reductions::advice::ReductionPhase;
 use crate::zkvm::claim_reductions::RegistersClaimReductionSumcheckVerifier;
+use crate::zkvm::config::BytecodeCommitmentMode;
 use crate::zkvm::config::OneHotParams;
 #[cfg(feature = "prover")]
 use crate::zkvm::prover::JoltProverPreprocessing;
@@ -168,6 +169,17 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             .validate(proof.trace_length.log_2(), proof.ram_K.log_2())
             .map_err(ProofVerifyError::InvalidReadWriteConfig)?;
 
+        // If the proof claims it used bytecode commitment mode, it must have enough cycle vars
+        // to embed bytecode address variables (log_T >= log_K_bytecode), i.e. T >= K_bytecode.
+        if proof.bytecode_mode == BytecodeCommitmentMode::Commitment
+            && proof.trace_length < proof.bytecode_K
+        {
+            return Err(ProofVerifyError::InvalidBytecodeConfig(format!(
+                "bytecode commitment mode requires trace_length >= bytecode_K (got trace_length={}, bytecode_K={})",
+                proof.trace_length, proof.bytecode_K
+            )));
+        }
+
         // Construct full params from the validated config
         let one_hot_params =
             OneHotParams::from_config(&proof.one_hot_config, proof.bytecode_K, proof.ram_K);
@@ -434,6 +446,7 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             &self.one_hot_params,
             &self.opening_accumulator,
             &mut self.transcript,
+            self.proof.bytecode_mode,
         );
         let booleanity_params = BooleanitySumcheckParams::new(
             n_cycle_vars,
@@ -487,7 +500,11 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
         //
         // IMPORTANT: This must be sampled *after* other Stage 6b params (e.g. lookup/inc gammas),
         // to match the prover's transcript order.
-        if bytecode_read_raf_params.log_T >= bytecode_read_raf_params.log_K {
+        if self.proof.bytecode_mode == BytecodeCommitmentMode::Commitment {
+            debug_assert!(
+                bytecode_read_raf_params.log_T >= bytecode_read_raf_params.log_K,
+                "commitment mode requires log_T >= log_K_bytecode"
+            );
             let bytecode_reduction_params = BytecodeClaimReductionParams::new(
                 &bytecode_read_raf_params,
                 &self.opening_accumulator,
@@ -497,8 +514,7 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
                 bytecode_reduction_params,
             ));
         } else {
-            // Not enough cycle randomness to embed the bytecode index vars into Stage 6b.
-            // Fall back to the legacy verifier path (O(K_bytecode) in Stage 6b) by not running the reduction.
+            // Legacy mode: do not run the bytecode claim reduction.
             self.bytecode_reduction_verifier = None;
         }
 

From 7246c0d883e3db1d07e5f41b01d8148729eec803 Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Mon, 19 Jan 2026 21:31:45 -0800
Subject: [PATCH 06/16] =?UTF-8?q?refactor(zkvm):=20rename=20BytecodeCommit?=
 =?UTF-8?q?mentMode=20=E2=86=92=20BytecodeMode=20(Full/Committed)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../src/zkvm/bytecode/read_raf_checking.rs    | 10 +++----
 jolt-core/src/zkvm/config.rs                  | 28 +++++++++----------
 jolt-core/src/zkvm/proof_serialization.rs     |  4 +--
 jolt-core/src/zkvm/prover.rs                  | 20 ++++++-------
 jolt-core/src/zkvm/verifier.rs                |  8 ++----
 5 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
index 6f40df8145..f13713ddab 100644
--- a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
+++ b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
@@ -27,7 +27,7 @@ use crate::{
     utils::{math::Math, small_scalar::SmallScalar, thread::unsafe_allocate_zero_vec},
     zkvm::{
         bytecode::BytecodePreprocessing,
-        config::{BytecodeCommitmentMode, OneHotParams},
+        config::{BytecodeMode, OneHotParams},
         instruction::{
             CircuitFlags, Flags, InstructionFlags, InstructionLookup, InterleavedBitsMarker,
             NUM_CIRCUIT_FLAGS,
@@ -1264,12 +1264,12 @@ impl<F: JoltField> BytecodeReadRafAddressSumcheckVerifier<F> {
         one_hot_params: &OneHotParams,
         opening_accumulator: &VerifierOpeningAccumulator<F>,
         transcript: &mut impl Transcript,
-        bytecode_mode: BytecodeCommitmentMode,
+        bytecode_mode: BytecodeMode,
     ) -> Self {
         let mut params = match bytecode_mode {
             // Commitment mode: verifier MUST avoid O(K_bytecode) work here, and later stages will
             // relate staged Val claims to committed bytecode.
-            BytecodeCommitmentMode::Commitment => BytecodeReadRafSumcheckParams::gen_verifier(
+            BytecodeMode::Committed => BytecodeReadRafSumcheckParams::gen_verifier(
                 bytecode_preprocessing,
                 n_cycle_vars,
                 one_hot_params,
@@ -1277,7 +1277,7 @@ impl<F: JoltField> BytecodeReadRafAddressSumcheckVerifier<F> {
                 transcript,
             ),
             // Legacy mode: verifier materializes/evaluates bytecode-dependent polynomials (O(K_bytecode)).
-            BytecodeCommitmentMode::Legacy => BytecodeReadRafSumcheckParams::gen(
+            BytecodeMode::Full => BytecodeReadRafSumcheckParams::gen(
                 bytecode_preprocessing,
                 n_cycle_vars,
                 one_hot_params,
@@ -1285,7 +1285,7 @@ impl<F: JoltField> BytecodeReadRafAddressSumcheckVerifier<F> {
                 transcript,
             ),
         };
-        params.use_staged_val_claims = bytecode_mode == BytecodeCommitmentMode::Commitment;
+        params.use_staged_val_claims = bytecode_mode == BytecodeMode::Committed;
         Self { params }
     }
 
diff --git a/jolt-core/src/zkvm/config.rs b/jolt-core/src/zkvm/config.rs
index 59d48757de..acc98a198b 100644
--- a/jolt-core/src/zkvm/config.rs
+++ b/jolt-core/src/zkvm/config.rs
@@ -23,24 +23,24 @@ pub fn get_instruction_sumcheck_phases(log_t: usize) -> usize {
     }
 }
 
-/// Controls whether the prover/verifier use the **legacy** bytecode path (verifier may do O(K))
-/// or the new **bytecode-commitment/claim-reduction** path (requires padding so `T >= K_bytecode`).
+/// Controls whether the prover/verifier use the **full** bytecode path (verifier may do O(K))
+/// or the **committed** bytecode path (requires padding so `T >= K_bytecode`).
 #[repr(u8)]
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Allocative)]
-pub enum BytecodeCommitmentMode {
-    /// Legacy mode: verifier may materialize bytecode-dependent polynomials (O(K_bytecode)).
-    Legacy = 0,
-    /// Commitment mode: use staged Val claims + BytecodeClaimReduction; requires `log_T >= log_K`.
-    Commitment = 1,
+pub enum BytecodeMode {
+    /// Full mode: verifier may materialize bytecode-dependent polynomials (O(K_bytecode)).
+    Full = 0,
+    /// Committed mode: use staged Val claims + BytecodeClaimReduction; requires `log_T >= log_K`.
+    Committed = 1,
 }
 
-impl Default for BytecodeCommitmentMode {
+impl Default for BytecodeMode {
     fn default() -> Self {
-        Self::Legacy
+        Self::Full
     }
 }
 
-impl CanonicalSerialize for BytecodeCommitmentMode {
+impl CanonicalSerialize for BytecodeMode {
     fn serialize_with_mode<W: Write>(
         &self,
         writer: W,
@@ -54,13 +54,13 @@ impl CanonicalSerialize for BytecodeCommitmentMode {
     }
 }
 
-impl Valid for BytecodeCommitmentMode {
+impl Valid for BytecodeMode {
     fn check(&self) -> Result<(), SerializationError> {
         Ok(())
     }
 }
 
-impl CanonicalDeserialize for BytecodeCommitmentMode {
+impl CanonicalDeserialize for BytecodeMode {
     fn deserialize_with_mode<R: Read>(
         reader: R,
         compress: Compress,
@@ -68,8 +68,8 @@ impl CanonicalDeserialize for BytecodeCommitmentMode {
     ) -> Result<Self, SerializationError> {
         let value = u8::deserialize_with_mode(reader, compress, validate)?;
         match value {
-            0 => Ok(Self::Legacy),
-            1 => Ok(Self::Commitment),
+            0 => Ok(Self::Full),
+            1 => Ok(Self::Committed),
             _ => Err(SerializationError::InvalidData),
         }
     }
diff --git a/jolt-core/src/zkvm/proof_serialization.rs b/jolt-core/src/zkvm/proof_serialization.rs
index c6e012e0e2..c03e027598 100644
--- a/jolt-core/src/zkvm/proof_serialization.rs
+++ b/jolt-core/src/zkvm/proof_serialization.rs
@@ -19,7 +19,7 @@ use crate::{
     subprotocols::sumcheck::SumcheckInstanceProof,
     transcripts::Transcript,
     zkvm::{
-        config::{BytecodeCommitmentMode, OneHotConfig, ReadWriteConfig},
+        config::{BytecodeMode, OneHotConfig, ReadWriteConfig},
         instruction::{CircuitFlags, InstructionFlags},
         witness::{CommittedPolynomial, VirtualPolynomial},
     },
@@ -44,7 +44,7 @@ pub struct JoltProof<F: JoltField, PCS: CommitmentScheme<Field = F>, FS: Transcr
     pub trace_length: usize,
     pub ram_K: usize,
     pub bytecode_K: usize,
-    pub bytecode_mode: BytecodeCommitmentMode,
+    pub bytecode_mode: BytecodeMode,
     pub rw_config: ReadWriteConfig,
     pub one_hot_config: OneHotConfig,
     pub dory_layout: DoryLayout,
diff --git a/jolt-core/src/zkvm/prover.rs b/jolt-core/src/zkvm/prover.rs
index c3feb78792..eccc9cf569 100644
--- a/jolt-core/src/zkvm/prover.rs
+++ b/jolt-core/src/zkvm/prover.rs
@@ -16,7 +16,7 @@ use std::{
 use crate::poly::commitment::dory::DoryContext;
 use ark_serialize::{CanonicalDeserialize, CanonicalSerialize};
 
-use crate::zkvm::config::{BytecodeCommitmentMode, ReadWriteConfig};
+use crate::zkvm::config::{BytecodeMode, ReadWriteConfig};
 use crate::zkvm::verifier::JoltSharedPreprocessing;
 use crate::zkvm::Serializable;
 
@@ -171,8 +171,8 @@ pub struct JoltCpuProver<
     pub final_ram_state: Vec<u64>,
     pub one_hot_params: OneHotParams,
     pub rw_config: ReadWriteConfig,
-    /// First-class selection of legacy vs bytecode-commitment/claim-reduction mode.
-    pub bytecode_mode: BytecodeCommitmentMode,
+    /// First-class selection of full vs committed bytecode mode.
+    pub bytecode_mode: BytecodeMode,
 }
 impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscript: Transcript>
     JoltCpuProver<'a, F, PCS, ProofTranscript>
@@ -194,7 +194,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             trusted_advice,
             trusted_advice_commitment,
             trusted_advice_hint,
-            BytecodeCommitmentMode::Legacy,
+            BytecodeMode::Full,
         )
     }
 
@@ -207,7 +207,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         trusted_advice: &[u8],
         trusted_advice_commitment: Option<PCS::Commitment>,
         trusted_advice_hint: Option<PCS::OpeningProofHint>,
-        bytecode_mode: BytecodeCommitmentMode,
+        bytecode_mode: BytecodeMode,
     ) -> Self {
         let memory_config = MemoryConfig {
             max_untrusted_advice_size: preprocessing.shared.memory_layout.max_untrusted_advice_size,
@@ -359,7 +359,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             trusted_advice_commitment,
             trusted_advice_hint,
             final_memory_state,
-            BytecodeCommitmentMode::Legacy,
+            BytecodeMode::Full,
         )
     }
 
@@ -372,7 +372,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         trusted_advice_commitment: Option<PCS::Commitment>,
         trusted_advice_hint: Option<PCS::OpeningProofHint>,
         final_memory_state: Memory,
-        bytecode_mode: BytecodeCommitmentMode,
+        bytecode_mode: BytecodeMode,
     ) -> Self {
         // truncate trailing zeros on device outputs
         program_io.outputs.truncate(
@@ -394,7 +394,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         // If we intend to use the bytecode-commitment/claim-reduction path, we must ensure
         // `log_T >= log_K_bytecode`, i.e. `T >= K_bytecode`. Enforce by padding up-front.
         let mut padded_trace_len = padded_trace_len;
-        if bytecode_mode == BytecodeCommitmentMode::Commitment {
+        if bytecode_mode == BytecodeMode::Committed {
             let bytecode_k = preprocessing.shared.bytecode.code_size;
             if bytecode_k > preprocessing.shared.max_padded_trace_length {
                 panic!(
@@ -1169,7 +1169,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             &mut self.transcript,
         );
         bytecode_read_raf_params.use_staged_val_claims =
-            self.bytecode_mode == BytecodeCommitmentMode::Commitment;
+            self.bytecode_mode == BytecodeMode::Committed;
 
         let booleanity_params = BooleanitySumcheckParams::new(
             self.trace.len().log_2(),
@@ -1246,7 +1246,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
 
         // Bytecode claim reduction (Phase 1 in Stage 6b): consumes Val_s(r_bc) from Stage 6a and
         // caches an intermediate claim for Stage 7.
-        if self.bytecode_mode == BytecodeCommitmentMode::Commitment {
+        if self.bytecode_mode == BytecodeMode::Committed {
             debug_assert!(
                 bytecode_read_raf_params.log_T >= bytecode_read_raf_params.log_K,
                 "commitment mode requires log_T >= log_K_bytecode"
diff --git a/jolt-core/src/zkvm/verifier.rs b/jolt-core/src/zkvm/verifier.rs
index 0d800222c7..819fa3c712 100644
--- a/jolt-core/src/zkvm/verifier.rs
+++ b/jolt-core/src/zkvm/verifier.rs
@@ -10,7 +10,7 @@ use crate::subprotocols::sumcheck::BatchedSumcheck;
 use crate::zkvm::bytecode::BytecodePreprocessing;
 use crate::zkvm::claim_reductions::advice::ReductionPhase;
 use crate::zkvm::claim_reductions::RegistersClaimReductionSumcheckVerifier;
-use crate::zkvm::config::BytecodeCommitmentMode;
+use crate::zkvm::config::BytecodeMode;
 use crate::zkvm::config::OneHotParams;
 #[cfg(feature = "prover")]
 use crate::zkvm::prover::JoltProverPreprocessing;
@@ -171,9 +171,7 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
 
         // If the proof claims it used bytecode commitment mode, it must have enough cycle vars
         // to embed bytecode address variables (log_T >= log_K_bytecode), i.e. T >= K_bytecode.
-        if proof.bytecode_mode == BytecodeCommitmentMode::Commitment
-            && proof.trace_length < proof.bytecode_K
-        {
+        if proof.bytecode_mode == BytecodeMode::Committed && proof.trace_length < proof.bytecode_K {
             return Err(ProofVerifyError::InvalidBytecodeConfig(format!(
                 "bytecode commitment mode requires trace_length >= bytecode_K (got trace_length={}, bytecode_K={})",
                 proof.trace_length, proof.bytecode_K
@@ -500,7 +498,7 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
         //
         // IMPORTANT: This must be sampled *after* other Stage 6b params (e.g. lookup/inc gammas),
         // to match the prover's transcript order.
-        if self.proof.bytecode_mode == BytecodeCommitmentMode::Commitment {
+        if self.proof.bytecode_mode == BytecodeMode::Committed {
             debug_assert!(
                 bytecode_read_raf_params.log_T >= bytecode_read_raf_params.log_K,
                 "commitment mode requires log_T >= log_K_bytecode"

From 4b5f396d29ae1c509c7feb33f9f794355afe5c2d Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Tue, 20 Jan 2026 00:47:50 -0800
Subject: [PATCH 07/16] refactor(bytecode): separate bytecode preprocessing for
 Full/Committed modes

- BytecodePreprocessing::preprocess() now returns Self (caller wraps in Arc)
- JoltSharedPreprocessing::new() takes &BytecodePreprocessing, stores bytecode_size
- JoltProverPreprocessing stores Arc<BytecodePreprocessing> + optional commitments
- JoltVerifierPreprocessing uses VerifierBytecode<PCS> enum (Full or Committed)
- Added TrustedBytecodeCommitments<PCS> for type-safe commitment handling
- Updated SDK macros to return (shared, bytecode) tuple
- Updated all tests, guest/*, and benchmarks

This refactor enables Committed mode where verifier only receives bytecode
commitments instead of full O(K) bytecode data. Actual commitment computation
is TODO for a future PR.
---
 bytecode-refactor-design.md                   | 202 +++++++++++++
 jolt-core/benches/e2e_profiling.rs            |  23 +-
 jolt-core/src/guest/prover.rs                 |  10 +-
 jolt-core/src/guest/verifier.rs               |  12 +-
 .../src/poly/commitment/commitment_scheme.rs  |   8 +-
 jolt-core/src/utils/errors.rs                 |   2 +
 jolt-core/src/zkvm/bytecode/mod.rs            | 205 ++++++++++++-
 .../zkvm/claim_reductions/hamming_weight.rs   |   4 +-
 jolt-core/src/zkvm/prover.rs                  | 282 ++++++++++++------
 jolt-core/src/zkvm/verifier.rs                | 182 ++++++-----
 jolt-core/src/zkvm/witness.rs                 |   3 +-
 jolt-sdk/macros/src/lib.rs                    |  28 +-
 jolt-sdk/src/host_utils.rs                    |   1 +
 13 files changed, 764 insertions(+), 198 deletions(-)
 create mode 100644 bytecode-refactor-design.md

diff --git a/bytecode-refactor-design.md b/bytecode-refactor-design.md
new file mode 100644
index 0000000000..6299fe341b
--- /dev/null
+++ b/bytecode-refactor-design.md
@@ -0,0 +1,202 @@
+# Bytecode Preprocessing Refactor Design
+
+## Goal
+
+Separate bytecode preprocessing between prover and verifier based on `BytecodeMode`:
+
+- **Full mode**: Verifier has access to full bytecode (O(K) data) — current behavior
+- **Committed mode**: Verifier only sees bytecode commitments — enables succinct verification
+
+## Current State (After Refactor)
+
+```
+BytecodePreprocessing  ← O(K) data, created first via preprocess()
+├── bytecode: Vec<Instruction>
+└── pc_map: BytecodePCMapper
+
+JoltSharedPreprocessing  ← Truly shared, single source of truth for size
+├── bytecode_size: usize            ← Derived from bytecode.bytecode.len()
+├── ram: RAMPreprocessing
+├── memory_layout: MemoryLayout
+└── max_padded_trace_length: usize
+
+JoltProverPreprocessing  ← Prover always has full bytecode
+├── generators: PCS::ProverSetup
+├── shared: JoltSharedPreprocessing
+├── bytecode: Arc<BytecodePreprocessing>        ← Full bytecode (always)
+├── bytecode_commitments: Option<TrustedBytecodeCommitments<PCS>>  ← Only in Committed mode
+└── bytecode_commitment_hints: Option<Vec<PCS::OpeningProofHint>>  ← Only in Committed mode
+
+JoltVerifierPreprocessing  ← Verifier has mode-dependent bytecode
+├── generators: PCS::VerifierSetup
+├── shared: JoltSharedPreprocessing
+└── bytecode: VerifierBytecode<PCS>        ← Full OR Committed
+
+VerifierBytecode<PCS>  ← Mode-dependent bytecode info
+├── Full(Arc<BytecodePreprocessing>)              ← For Full mode
+└── Committed(TrustedBytecodeCommitments<PCS>)    ← For Committed mode
+```
+
+---
+
+## The Trace-Like Pattern
+
+Bytecode preprocessing follows the same pattern as trace:
+
+```rust
+// Trace pattern:
+let trace: Arc<Vec<Cycle>> = trace.into();
+
+// Bytecode pattern (parallel):
+let bytecode: Arc<BytecodePreprocessing> = BytecodePreprocessing::preprocess(instructions).into();
+```
+
+Both use `Arc` for cheap cloning (`Arc::clone` is an O(1) reference-count increment).
+
+---
+
+## Usage Examples
+
+### E2E Flow (Full Mode)
+
+```rust
+// 1. Decode + preprocess bytecode (returns Self, wrap in Arc)
+let (instructions, memory_init, _) = program.decode();
+let bytecode: Arc<BytecodePreprocessing> = BytecodePreprocessing::preprocess(instructions).into();
+
+// 2. Create shared preprocessing (borrows bytecode to get size)
+let shared = JoltSharedPreprocessing::new(
+    &bytecode,
+    memory_layout,
+    memory_init,
+    max_trace_length,
+);
+
+// 3. Prover (Arc::clone is O(1))
+let prover_pp = JoltProverPreprocessing::new(shared.clone(), Arc::clone(&bytecode));
+
+// 4. Verifier (Full mode)
+let verifier_pp = JoltVerifierPreprocessing::new_full(shared, generators, bytecode);
+```
+
+### E2E Flow (Committed Mode)
+
+```rust
+// 1-2. Same as above...
+let bytecode: Arc<BytecodePreprocessing> = BytecodePreprocessing::preprocess(instructions).into();
+let shared = JoltSharedPreprocessing::new(&bytecode, memory_layout, memory_init, max_trace);
+
+// 3. Prover in Committed mode (computes commitments during preprocessing)
+let prover_pp = JoltProverPreprocessing::new_committed(shared.clone(), Arc::clone(&bytecode));
+
+// 4. Verifier receives only commitments (from prover's preprocessing)
+let verifier_pp = JoltVerifierPreprocessing::new_committed(
+    shared,
+    generators,
+    prover_pp.bytecode_commitments.clone().unwrap(),
+);
+```
+
+### Accessing Bytecode Data
+
+```rust
+// Access bytecode size (always from shared - single source of truth)
+let code_size = prover_pp.shared.bytecode_size;   // ✅ Definitive source
+let code_size = verifier_pp.shared.bytecode_size; // ✅ Same
+
+// Access full bytecode (prover only, or verifier in Full mode)
+let bytecode_data = &prover_pp.bytecode;                              // Arc<BytecodePreprocessing>
+let bytecode_data = verifier_pp.bytecode.as_full()?;                  // Result<&Arc<...>, ProofVerifyError>
+let commitments = verifier_pp.bytecode.as_committed()?;               // Result<&TrustedBytecodeCommitments<PCS>, ProofVerifyError>
+```
+
+---
+
+## SDK Macro Changes
+
+The generated preprocessing functions now follow the trace-like pattern:
+
+```rust
+// Old API (replaced: same function name, different return type)
+pub fn preprocess_shared_foo(program: &mut Program) -> JoltSharedPreprocessing
+
+// New API
+pub fn preprocess_shared_foo(program: &mut Program)
+    -> (JoltSharedPreprocessing, Arc<BytecodePreprocessing>)
+
+pub fn preprocess_prover_foo(
+    shared: JoltSharedPreprocessing,
+    bytecode: Arc<BytecodePreprocessing>,
+) -> JoltProverPreprocessing<F, PCS>
+
+pub fn preprocess_verifier_foo(
+    shared: JoltSharedPreprocessing,
+    generators: PCS::VerifierSetup,
+    bytecode: Arc<BytecodePreprocessing>,  // For Full mode
+) -> JoltVerifierPreprocessing<F, PCS>
+```
+
+---
+
+## Key Design Decisions
+
+1. **`BytecodePreprocessing::preprocess()` returns `Self`** (not `Arc<Self>`)
+   - Caller uses `.into()` to wrap in Arc, just like trace
+
+2. **`JoltSharedPreprocessing::new()` takes `&BytecodePreprocessing`**
+   - Borrows to compute `bytecode_size = bytecode.bytecode.len()`
+   - Returns just `Self`, not a tuple
+
+3. **`bytecode_size` is the single source of truth**
+   - Stored in `JoltSharedPreprocessing`
+   - `BytecodePreprocessing` has no size field
+
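+4. **`TrustedBytecodeCommitments<PCS>`** wrapper encodes the trust model
+   - Type-level marker that the commitments are expected to come from honest preprocessing
+   - Public `commitments: Vec<PCS::Commitment>` field for simplicity (a public field lets callers overwrite the contents, so the wrapper signals intent rather than enforcing it)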
+
+5. **No panics in `VerifierBytecode::as_full()` / `as_committed()`**
+   - Both return `Result<_, ProofVerifyError>` (with a `BytecodeTypeMismatch` error) instead of panicking
+
+---
+
+## Files Modified
+
+| File | Changes |
+|------|---------|
+| `jolt-core/src/zkvm/bytecode/mod.rs` | `preprocess()` returns `Self`, added `VerifierBytecode<PCS>`, `TrustedBytecodeCommitments<PCS>` |
+| `jolt-core/src/zkvm/prover.rs` | Added `bytecode`, `bytecode_commitments`, `bytecode_commitment_hints` fields |
+| `jolt-core/src/zkvm/verifier.rs` | `JoltSharedPreprocessing::new()` takes `&BytecodePreprocessing`; `JoltSharedPreprocessing` gains `bytecode_size` and loses its `bytecode` field |
+| `jolt-core/src/guest/prover.rs` | Updated to new pattern |
+| `jolt-core/src/guest/verifier.rs` | Updated to new pattern |
+| `jolt-sdk/macros/src/lib.rs` | Updated generated code for new API |
+| `jolt-sdk/src/host_utils.rs` | Added `BytecodePreprocessing` export |
+| `jolt-core/benches/e2e_profiling.rs` | Updated to new pattern |
+
+---
+
+## Verification
+
+- ✅ `cargo fmt` clean
+- ✅ `cargo clippy -p jolt-core --tests -- -D warnings` passes
+- ✅ `cargo clippy -p jolt-sdk --benches -- -D warnings` passes
+
+---
+
+## Status
+
+**Refactor Complete** — Structure for Full and Committed modes is in place.
+
+### What's Done
+- Bytecode preprocessing separated from shared preprocessing
+- `Arc<BytecodePreprocessing>` pattern (like trace)
+- `JoltSharedPreprocessing.bytecode_size` as single source of truth
+- `VerifierBytecode<PCS>` enum for mode-dependent bytecode
+- `TrustedBytecodeCommitments<PCS>` wrapper for type-safe commitments
+- All call sites updated (tests, guest/*, SDK macros, benchmarks)
+
+### What's TODO (future PRs)
+- [ ] Implement actual bytecode commitment computation in `TrustedBytecodeCommitments::derive()`
+- [ ] Add E2E tests for Committed mode
+- [ ] Exercise `BytecodeClaimReduction` sumcheck with Committed mode
+- [ ] Consider unified `JoltConfig` struct for all configuration
diff --git a/jolt-core/benches/e2e_profiling.rs b/jolt-core/benches/e2e_profiling.rs
index cf5cb3b65d..b171c452ef 100644
--- a/jolt-core/benches/e2e_profiling.rs
+++ b/jolt-core/benches/e2e_profiling.rs
@@ -1,5 +1,8 @@
+use std::sync::Arc;
+
 use ark_serialize::CanonicalSerialize;
 use jolt_core::host;
+use jolt_core::zkvm::bytecode::BytecodePreprocessing;
 use jolt_core::zkvm::prover::JoltProverPreprocessing;
 use jolt_core::zkvm::verifier::{JoltSharedPreprocessing, JoltVerifierPreprocessing};
 use jolt_core::zkvm::{RV64IMACProver, RV64IMACVerifier};
@@ -201,19 +204,22 @@ fn prove_example(
 ) -> Vec<(tracing::Span, Box<dyn FnOnce()>)> {
     let mut tasks = Vec::new();
     let mut program = host::Program::new(example_name);
-    let (bytecode, init_memory_state, _) = program.decode();
+    let (instructions, init_memory_state, _) = program.decode();
     let (_lazy_trace, trace, _, program_io) = program.trace(&serialized_input, &[], &[]);
     let padded_trace_len = (trace.len() + 1).next_power_of_two();
     drop(trace);
 
     let task = move || {
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode,
+            &bytecode,
             program_io.memory_layout.clone(),
             init_memory_state,
             padded_trace_len,
         );
-        let preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
 
         let elf_contents_opt = program.get_elf_contents();
         let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
@@ -229,9 +235,10 @@ fn prove_example(
         let program_io = prover.program_io.clone();
         let (jolt_proof, _) = prover.prove();
 
-        let verifier_preprocessing = JoltVerifierPreprocessing::new(
+        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
             shared_preprocessing,
             preprocessing.generators.to_verifier_setup(),
+            Arc::clone(&preprocessing.bytecode),
         );
         let verifier =
             RV64IMACVerifier::new(&verifier_preprocessing, jolt_proof, program_io, None, None)
@@ -255,7 +262,7 @@ fn prove_example_with_trace(
     _scale: usize,
 ) -> (std::time::Duration, usize, usize, usize) {
     let mut program = host::Program::new(example_name);
-    let (bytecode, init_memory_state, _) = program.decode();
+    let (instructions, init_memory_state, _) = program.decode();
     let (_, trace, _, program_io) = program.trace(&serialized_input, &[], &[]);
 
     assert!(
@@ -263,13 +270,15 @@ fn prove_example_with_trace(
         "Trace is longer than expected"
     );
 
+    let bytecode: Arc<BytecodePreprocessing> =
+        BytecodePreprocessing::preprocess(instructions).into();
     let shared_preprocessing = JoltSharedPreprocessing::new(
-        bytecode.clone(),
+        &bytecode,
         program_io.memory_layout.clone(),
         init_memory_state,
         trace.len().next_power_of_two(),
     );
-    let preprocessing = JoltProverPreprocessing::new(shared_preprocessing);
+    let preprocessing = JoltProverPreprocessing::new(shared_preprocessing, Arc::clone(&bytecode));
 
     let elf_contents_opt = program.get_elf_contents();
     let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
diff --git a/jolt-core/src/guest/prover.rs b/jolt-core/src/guest/prover.rs
index a20023fed7..9df31cc5b2 100644
--- a/jolt-core/src/guest/prover.rs
+++ b/jolt-core/src/guest/prover.rs
@@ -16,16 +16,20 @@ pub fn preprocess(
     guest: &Program,
     max_trace_length: usize,
 ) -> JoltProverPreprocessing<ark_bn254::Fr, DoryCommitmentScheme> {
+    use crate::zkvm::bytecode::BytecodePreprocessing;
     use crate::zkvm::verifier::JoltSharedPreprocessing;
+    use std::sync::Arc;
 
-    let (bytecode, memory_init, program_size) = guest.decode();
+    let (instructions, memory_init, program_size) = guest.decode();
 
     let mut memory_config = guest.memory_config;
     memory_config.program_size = Some(program_size);
     let memory_layout = MemoryLayout::new(&memory_config);
+
+    let bytecode = Arc::new(BytecodePreprocessing::preprocess(instructions));
     let shared_preprocessing =
-        JoltSharedPreprocessing::new(bytecode, memory_layout, memory_init, max_trace_length);
-    JoltProverPreprocessing::new(shared_preprocessing)
+        JoltSharedPreprocessing::new(&bytecode, memory_layout, memory_init, max_trace_length);
+    JoltProverPreprocessing::new(shared_preprocessing, bytecode)
 }
 
 #[allow(clippy::type_complexity, clippy::too_many_arguments)]
diff --git a/jolt-core/src/guest/verifier.rs b/jolt-core/src/guest/verifier.rs
index 5c2a92904d..c642c9f525 100644
--- a/jolt-core/src/guest/verifier.rs
+++ b/jolt-core/src/guest/verifier.rs
@@ -1,3 +1,5 @@
+use std::sync::Arc;
+
 use crate::field::JoltField;
 use crate::poly::commitment::commitment_scheme::CommitmentScheme;
 use crate::poly::commitment::commitment_scheme::StreamingCommitmentScheme;
@@ -6,6 +8,7 @@ use crate::guest::program::Program;
 use crate::poly::commitment::dory::DoryCommitmentScheme;
 use crate::transcripts::Transcript;
 use crate::utils::errors::ProofVerifyError;
+use crate::zkvm::bytecode::BytecodePreprocessing;
 use crate::zkvm::proof_serialization::JoltProof;
 use crate::zkvm::verifier::JoltSharedPreprocessing;
 use crate::zkvm::verifier::JoltVerifier;
@@ -18,14 +21,17 @@ pub fn preprocess(
     max_trace_length: usize,
     verifier_setup: <DoryCommitmentScheme as CommitmentScheme>::VerifierSetup,
 ) -> JoltVerifierPreprocessing<ark_bn254::Fr, DoryCommitmentScheme> {
-    let (bytecode, memory_init, program_size) = guest.decode();
+    let (bytecode_instructions, memory_init, program_size) = guest.decode();
 
     let mut memory_config = guest.memory_config;
     memory_config.program_size = Some(program_size);
     let memory_layout = MemoryLayout::new(&memory_config);
+
+    let bytecode: Arc<BytecodePreprocessing> =
+        BytecodePreprocessing::preprocess(bytecode_instructions).into();
     let shared =
-        JoltSharedPreprocessing::new(bytecode, memory_layout, memory_init, max_trace_length);
-    JoltVerifierPreprocessing::new(shared, verifier_setup)
+        JoltSharedPreprocessing::new(&bytecode, memory_layout, memory_init, max_trace_length);
+    JoltVerifierPreprocessing::new_full(shared, verifier_setup, bytecode)
 }
 
 pub fn verify<F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, FS: Transcript>(
diff --git a/jolt-core/src/poly/commitment/commitment_scheme.rs b/jolt-core/src/poly/commitment/commitment_scheme.rs
index 6debe3b519..7e1a2faa43 100644
--- a/jolt-core/src/poly/commitment/commitment_scheme.rs
+++ b/jolt-core/src/poly/commitment/commitment_scheme.rs
@@ -27,7 +27,13 @@ pub trait CommitmentScheme: Clone + Sync + Send + 'static {
     /// A hint that helps the prover compute an opening proof. Typically some byproduct of
     /// the commitment computation, e.g. for Dory the Pedersen commitments to the rows can be
     /// used as a hint for the opening proof.
-    type OpeningProofHint: Sync + Send + Clone + Debug + PartialEq;
+    type OpeningProofHint: Sync
+        + Send
+        + Clone
+        + Debug
+        + PartialEq
+        + CanonicalSerialize
+        + CanonicalDeserialize;
 
     /// Generates the prover setup for this PCS. `max_num_vars` is the maximum number of
     /// variables of any polynomial that will be committed using this setup.
diff --git a/jolt-core/src/utils/errors.rs b/jolt-core/src/utils/errors.rs
index e8b1b9fee1..b3800e13eb 100644
--- a/jolt-core/src/utils/errors.rs
+++ b/jolt-core/src/utils/errors.rs
@@ -36,4 +36,6 @@ pub enum ProofVerifyError {
     SumcheckVerificationError,
     #[error("Univariate-skip round verification failed")]
     UniSkipVerificationError,
+    #[error("Bytecode type mismatch: {0}")]
+    BytecodeTypeMismatch(String),
 }
diff --git a/jolt-core/src/zkvm/bytecode/mod.rs b/jolt-core/src/zkvm/bytecode/mod.rs
index 82f6fb62ab..65695c7b4f 100644
--- a/jolt-core/src/zkvm/bytecode/mod.rs
+++ b/jolt-core/src/zkvm/bytecode/mod.rs
@@ -1,12 +1,186 @@
-use ark_serialize::{CanonicalDeserialize, CanonicalSerialize};
+use std::io::{Read, Write};
+use std::sync::Arc;
+
+use ark_serialize::{
+    CanonicalDeserialize, CanonicalSerialize, Compress, SerializationError, Valid, Validate,
+};
 use common::constants::{ALIGNMENT_FACTOR_BYTECODE, RAM_START_ADDRESS};
 use tracer::instruction::{Cycle, Instruction};
 
+use crate::poly::commitment::commitment_scheme::CommitmentScheme;
+use crate::utils::errors::ProofVerifyError;
+
 pub mod read_raf_checking;
 
+/// Bytecode commitments that were derived from actual bytecode.
+///
+/// This type marks commitments as coming from honest preprocessing of full
+/// bytecode; this is a convention, not a compiler-enforced guarantee. The
+/// canonical constructor is `derive()`, which computes them from full bytecode.
+///
+/// # Trust Model
+/// - Create via `derive()` from full bytecode (offline preprocessing)
+/// - Or deserialize from a trusted source (assumes honest origin)
+/// - Pass to verifier preprocessing for succinct (online) verification
+///
+/// # Security Warning
+/// If you construct this type with arbitrary commitments (bypassing `derive()`),
+/// verification will be unsound. Only use `derive()` or trusted deserialization.
+#[derive(Clone, Debug, PartialEq, CanonicalSerialize, CanonicalDeserialize)]
+pub struct TrustedBytecodeCommitments<PCS: CommitmentScheme> {
+    /// The bytecode chunk commitments.
+    /// Trusted by convention only - create via `derive()` or deserialize from a trusted source.
+    pub commitments: Vec<PCS::Commitment>,
+}
+
+impl<PCS: CommitmentScheme> TrustedBytecodeCommitments<PCS> {
+    /// Derive commitments from full bytecode (the canonical constructor).
+    ///
+    /// This is the "offline preprocessing" step that must be done honestly.
+    /// Returns trusted commitments + opening-proof hints; both are empty placeholders for now.
+    #[tracing::instrument(skip_all, name = "TrustedBytecodeCommitments::derive")]
+    pub fn derive(
+        _bytecode: &BytecodePreprocessing,
+        _generators: &PCS::ProverSetup,
+    ) -> (Self, Vec<PCS::OpeningProofHint>) {
+        // TODO: Implement bytecode chunk polynomial commitment computation.
+        // This will:
+        // 1. Build bytecode chunk polynomials based on lane ordering
+        //    (see bytecode-commitment-progress.md for the canonical ordering)
+        // 2. Commit each polynomial using PCS
+        // 3. Return commitments and opening hints (e.g., Dory tier-1 data)
+        //
+        // For now, return empty vectors as placeholder.
+        (
+            Self {
+                commitments: Vec::new(),
+            },
+            Vec::new(),
+        )
+    }
+}
+
+/// Bytecode information available to the verifier.
+///
+/// In `Full` mode, the verifier has access to the complete bytecode preprocessing
+/// and can materialize bytecode-dependent polynomials (O(K) work).
+///
+/// In `Committed` mode, the verifier only sees commitments to the bytecode polynomials,
+/// enabling succinct verification via claim reductions.
+///
+/// **Note**: The bytecode size K is stored in `JoltSharedPreprocessing.bytecode_size`,
+/// NOT in this enum. Use `shared.bytecode_size` to get the size.
+#[derive(Debug, Clone)]
+pub enum VerifierBytecode<PCS: CommitmentScheme> {
+    /// Full bytecode available (Full mode) — verifier can materialize polynomials.
+    Full(Arc<BytecodePreprocessing>),
+    /// Only trusted commitments available (Committed mode) — verifier uses claim reductions.
+    /// Size K is in `JoltSharedPreprocessing.bytecode_size`.
+    Committed(TrustedBytecodeCommitments<PCS>),
+}
+
+impl<PCS: CommitmentScheme> VerifierBytecode<PCS> {
+    /// Returns the full bytecode preprocessing, or an error if in Committed mode.
+    pub fn as_full(&self) -> Result<&Arc<BytecodePreprocessing>, ProofVerifyError> {
+        match self {
+            VerifierBytecode::Full(bp) => Ok(bp),
+            VerifierBytecode::Committed(_) => Err(ProofVerifyError::BytecodeTypeMismatch(
+                "expected Full, got Committed".to_string(),
+            )),
+        }
+    }
+
+    /// Returns true if this is Full mode.
+    pub fn is_full(&self) -> bool {
+        matches!(self, VerifierBytecode::Full(_))
+    }
+
+    /// Returns true if this is Committed mode.
+    pub fn is_committed(&self) -> bool {
+        matches!(self, VerifierBytecode::Committed(_))
+    }
+
+    /// Returns the trusted commitments, or an error if in Full mode.
+    pub fn as_committed(&self) -> Result<&TrustedBytecodeCommitments<PCS>, ProofVerifyError> {
+        match self {
+            VerifierBytecode::Committed(trusted) => Ok(trusted),
+            VerifierBytecode::Full(_) => Err(ProofVerifyError::BytecodeTypeMismatch(
+                "expected Committed, got Full".to_string(),
+            )),
+        }
+    }
+}
+
+// Manual serialization for VerifierBytecode
+// Format: tag (u8) followed by variant data
+impl<PCS: CommitmentScheme> CanonicalSerialize for VerifierBytecode<PCS> {
+    fn serialize_with_mode<W: Write>(
+        &self,
+        mut writer: W,
+        compress: Compress,
+    ) -> Result<(), SerializationError> {
+        match self {
+            VerifierBytecode::Full(bp) => {
+                0u8.serialize_with_mode(&mut writer, compress)?;
+                bp.as_ref().serialize_with_mode(&mut writer, compress)?;
+            }
+            VerifierBytecode::Committed(trusted) => {
+                1u8.serialize_with_mode(&mut writer, compress)?;
+                trusted.serialize_with_mode(&mut writer, compress)?;
+            }
+        }
+        Ok(())
+    }
+
+    fn serialized_size(&self, compress: Compress) -> usize {
+        1 + match self {
+            VerifierBytecode::Full(bp) => bp.serialized_size(compress),
+            VerifierBytecode::Committed(trusted) => trusted.serialized_size(compress),
+        }
+    }
+}
+
+impl<PCS: CommitmentScheme> Valid for VerifierBytecode<PCS> {
+    fn check(&self) -> Result<(), SerializationError> {
+        match self {
+            VerifierBytecode::Full(bp) => bp.check(),
+            VerifierBytecode::Committed(trusted) => trusted.check(),
+        }
+    }
+}
+
+impl<PCS: CommitmentScheme> CanonicalDeserialize for VerifierBytecode<PCS> {
+    fn deserialize_with_mode<R: Read>(
+        mut reader: R,
+        compress: Compress,
+        validate: Validate,
+    ) -> Result<Self, SerializationError> {
+        let tag = u8::deserialize_with_mode(&mut reader, compress, validate)?;
+        match tag {
+            0 => {
+                let bp =
+                    BytecodePreprocessing::deserialize_with_mode(&mut reader, compress, validate)?;
+                Ok(VerifierBytecode::Full(Arc::new(bp)))
+            }
+            1 => {
+                let trusted = TrustedBytecodeCommitments::<PCS>::deserialize_with_mode(
+                    &mut reader,
+                    compress,
+                    validate,
+                )?;
+                Ok(VerifierBytecode::Committed(trusted))
+            }
+            _ => Err(SerializationError::InvalidData),
+        }
+    }
+}
+
+/// Bytecode preprocessing data (O(K)).
+///
+/// **Note**: The bytecode size K is stored in `JoltSharedPreprocessing.bytecode_size`,
+/// NOT in this struct. Use `shared.bytecode_size` to get the size.
 #[derive(Default, Debug, Clone, CanonicalSerialize, CanonicalDeserialize)]
 pub struct BytecodePreprocessing {
-    pub code_size: usize,
     pub bytecode: Vec<Instruction>,
     /// Maps the memory address of each instruction in the bytecode to its "virtual" address.
     /// See Section 6.1 of the Jolt paper, "Reflecting the program counter". The virtual address
@@ -21,18 +195,15 @@ impl BytecodePreprocessing {
         bytecode.insert(0, Instruction::NoOp);
         let pc_map = BytecodePCMapper::new(&bytecode);
 
-        let code_size = bytecode.len().next_power_of_two().max(2);
+        let bytecode_size = bytecode.len().next_power_of_two().max(2);
 
         // Bytecode: Pad to nearest power of 2
-        bytecode.resize(code_size, Instruction::NoOp);
+        bytecode.resize(bytecode_size, Instruction::NoOp);
 
-        Self {
-            code_size,
-            bytecode,
-            pc_map,
-        }
+        Self { bytecode, pc_map }
     }
 
+    #[inline(always)]
     pub fn get_pc(&self, cycle: &Cycle) -> usize {
         if matches!(cycle, tracer::instruction::Cycle::NoOp) {
             return 0;
@@ -56,13 +227,17 @@ impl BytecodePCMapper {
         let mut indices: Vec<Option<(usize, u16)>> = {
             // For read-raf tests we simulate bytecode being empty
             #[cfg(test)]
-            if bytecode.len() == 1 {
-                vec![None; 1]
-            } else {
-                vec![None; Self::get_index(bytecode.last().unwrap().normalize().address) + 1]
+            {
+                if bytecode.len() == 1 {
+                    vec![None; 1]
+                } else {
+                    vec![None; Self::get_index(bytecode.last().unwrap().normalize().address) + 1]
+                }
             }
             #[cfg(not(test))]
-            vec![None; Self::get_index(bytecode.last().unwrap().normalize().address) + 1]
+            {
+                vec![None; Self::get_index(bytecode.last().unwrap().normalize().address) + 1]
+            }
         };
         let mut last_pc = 0;
         // Push the initial noop instruction
@@ -89,6 +264,7 @@ impl BytecodePCMapper {
         Self { indices }
     }
 
+    #[inline(always)]
     pub fn get_pc(&self, address: usize, virtual_sequence_remaining: u16) -> usize {
         let (base_pc, max_inline_seq) = self
             .indices
@@ -98,6 +274,7 @@ impl BytecodePCMapper {
         base_pc + (max_inline_seq - virtual_sequence_remaining) as usize
     }
 
+    #[inline(always)]
     pub const fn get_index(address: usize) -> usize {
         assert!(address >= RAM_START_ADDRESS as usize);
         assert!(address.is_multiple_of(ALIGNMENT_FACTOR_BYTECODE));
diff --git a/jolt-core/src/zkvm/claim_reductions/hamming_weight.rs b/jolt-core/src/zkvm/claim_reductions/hamming_weight.rs
index d40860f35a..266287f80c 100644
--- a/jolt-core/src/zkvm/claim_reductions/hamming_weight.rs
+++ b/jolt-core/src/zkvm/claim_reductions/hamming_weight.rs
@@ -98,6 +98,7 @@ use crate::subprotocols::{
 };
 use crate::transcripts::Transcript;
 use crate::zkvm::{
+    bytecode::BytecodePreprocessing,
     config::OneHotParams,
     verifier::JoltSharedPreprocessing,
     witness::{CommittedPolynomial, VirtualPolynomial},
@@ -309,13 +310,14 @@ impl<F: JoltField> HammingWeightClaimReductionProver<F> {
         params: HammingWeightClaimReductionParams<F>,
         trace: &[Cycle],
         preprocessing: &JoltSharedPreprocessing,
+        bytecode: &BytecodePreprocessing,
         one_hot_params: &OneHotParams,
     ) -> Self {
         // Compute all G_i polynomials via streaming.
         // `params.r_cycle` is in BIG_ENDIAN (OpeningPoint) convention.
         let G_vecs = compute_all_G::<F>(
             trace,
-            &preprocessing.bytecode,
+            bytecode,
             &preprocessing.memory_layout,
             one_hot_params,
             &params.r_cycle,
diff --git a/jolt-core/src/zkvm/prover.rs b/jolt-core/src/zkvm/prover.rs
index eccc9cf569..972c2dda56 100644
--- a/jolt-core/src/zkvm/prover.rs
+++ b/jolt-core/src/zkvm/prover.rs
@@ -16,6 +16,7 @@ use std::{
 use crate::poly::commitment::dory::DoryContext;
 use ark_serialize::{CanonicalDeserialize, CanonicalSerialize};
 
+use crate::zkvm::bytecode::{BytecodePreprocessing, TrustedBytecodeCommitments};
 use crate::zkvm::config::{BytecodeMode, ReadWriteConfig};
 use crate::zkvm::verifier::JoltSharedPreprocessing;
 use crate::zkvm::Serializable;
@@ -395,7 +396,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         // `log_T >= log_K_bytecode`, i.e. `T >= K_bytecode`. Enforce by padding up-front.
         let mut padded_trace_len = padded_trace_len;
         if bytecode_mode == BytecodeMode::Committed {
-            let bytecode_k = preprocessing.shared.bytecode.code_size;
+            let bytecode_k = preprocessing.shared.bytecode_size;
             if bytecode_k > preprocessing.shared.max_padded_trace_length {
                 panic!(
                     "Bytecode commitment mode requires max_padded_trace_length >= bytecode_K.\n\
@@ -459,8 +460,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let log_T = trace.len().log_2();
         let ram_log_K = ram_K.log_2();
         let rw_config = ReadWriteConfig::new(log_T, ram_log_K);
-        let one_hot_params =
-            OneHotParams::new(log_T, preprocessing.shared.bytecode.code_size, ram_K);
+        let one_hot_params = OneHotParams::new(log_T, preprocessing.shared.bytecode_size, ram_K);
 
         Self {
             preprocessing,
@@ -508,10 +508,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             &mut self.transcript,
         );
 
-        tracing::info!(
-            "bytecode size: {}",
-            self.preprocessing.shared.bytecode.code_size
-        );
+        tracing::info!("bytecode size: {}", self.preprocessing.shared.bytecode_size);
 
         let (commitments, mut opening_proof_hints) = self.generate_and_commit_witness_polynomials();
         let untrusted_advice_commitment = self.generate_and_commit_untrusted_advice();
@@ -629,7 +626,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
                 .par_iter()
                 .map(|poly_id| {
                     let witness: MultilinearPolynomial<F> = poly_id.generate_witness(
-                        &self.preprocessing.shared.bytecode,
+                        &self.preprocessing.bytecode,
                         &self.preprocessing.shared.memory_layout,
                         &trace,
                         Some(&self.one_hot_params),
@@ -669,6 +666,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
                             poly.stream_witness_and_commit_rows::<_, PCS>(
                                 &self.preprocessing.generators,
                                 &self.preprocessing.shared,
+                                &self.preprocessing.bytecode,
                                 &chunk,
                                 &self.one_hot_params,
                             )
@@ -783,7 +781,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let mut uni_skip = OuterUniSkipProver::initialize(
             uni_skip_params.clone(),
             &self.trace,
-            &self.preprocessing.shared.bytecode,
+            &self.preprocessing.bytecode,
         );
         let first_round_proof = prove_uniskip_round(
             &mut uni_skip,
@@ -799,7 +797,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let schedule = LinearOnlySchedule::new(uni_skip_params.tau.len() - 1);
         let shared = OuterSharedState::new(
             Arc::clone(&self.trace),
-            &self.preprocessing.shared.bytecode,
+            &self.preprocessing.bytecode,
             &uni_skip_params,
             &self.opening_accumulator,
         );
@@ -879,7 +877,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let ram_read_write_checking = RamReadWriteCheckingProver::initialize(
             ram_read_write_checking_params,
             &self.trace,
-            &self.preprocessing.shared.bytecode,
+            &self.preprocessing.bytecode,
             &self.program_io.memory_layout,
             &self.initial_ram_state,
         );
@@ -956,7 +954,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let spartan_shift = ShiftSumcheckProver::initialize(
             spartan_shift_params,
             Arc::clone(&self.trace),
-            &self.preprocessing.shared.bytecode,
+            &self.preprocessing.bytecode,
         );
         let spartan_instruction_input = InstructionInputSumcheckProver::initialize(
             spartan_instruction_input_params,
@@ -1036,19 +1034,19 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let registers_read_write_checking = RegistersReadWriteCheckingProver::initialize(
             registers_read_write_checking_params,
             self.trace.clone(),
-            &self.preprocessing.shared.bytecode,
+            &self.preprocessing.bytecode,
             &self.program_io.memory_layout,
         );
         let ram_val_evaluation = RamValEvaluationSumcheckProver::initialize(
             ram_val_evaluation_params,
             &self.trace,
-            &self.preprocessing.shared.bytecode,
+            &self.preprocessing.bytecode,
             &self.program_io.memory_layout,
         );
         let ram_val_final = ValFinalSumcheckProver::initialize(
             ram_val_final_params,
             &self.trace,
-            &self.preprocessing.shared.bytecode,
+            &self.preprocessing.bytecode,
             &self.program_io.memory_layout,
         );
 
@@ -1105,7 +1103,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let registers_val_evaluation = RegistersValEvaluationSumcheckProver::initialize(
             registers_val_evaluation_params,
             &self.trace,
-            &self.preprocessing.shared.bytecode,
+            &self.preprocessing.bytecode,
             &self.program_io.memory_layout,
         );
         let ram_ra_reduction = RamRaClaimReductionSumcheckProver::initialize(
@@ -1162,7 +1160,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         print_current_memory_usage("Stage 6a baseline");
 
         let mut bytecode_read_raf_params = BytecodeReadRafSumcheckParams::gen(
-            &self.preprocessing.shared.bytecode,
+            &self.preprocessing.bytecode,
             self.trace.len().log_2(),
             &self.one_hot_params,
             &self.opening_accumulator,
@@ -1181,12 +1179,12 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let mut bytecode_read_raf = BytecodeReadRafAddressSumcheckProver::initialize(
             bytecode_read_raf_params.clone(),
             Arc::clone(&self.trace),
-            Arc::clone(&self.preprocessing.shared.bytecode),
+            Arc::clone(&self.preprocessing.bytecode),
         );
         let mut booleanity = BooleanityAddressSumcheckProver::initialize(
             booleanity_params.clone(),
             &self.trace,
-            &self.preprocessing.shared.bytecode,
+            &self.preprocessing.bytecode,
             &self.program_io.memory_layout,
         );
 
@@ -1258,7 +1256,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             );
             self.bytecode_reduction_prover = Some(BytecodeClaimReductionProver::initialize(
                 bytecode_reduction_params,
-                Arc::clone(&self.preprocessing.shared.bytecode),
+                Arc::clone(&self.preprocessing.bytecode),
             ));
         } else {
             // Legacy mode: do not run the bytecode claim reduction.
@@ -1320,13 +1318,13 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let mut bytecode_read_raf = BytecodeReadRafCycleSumcheckProver::initialize(
             bytecode_read_raf_params,
             Arc::clone(&self.trace),
-            Arc::clone(&self.preprocessing.shared.bytecode),
+            Arc::clone(&self.preprocessing.bytecode),
             &self.opening_accumulator,
         );
         let mut booleanity = BooleanityCycleSumcheckProver::initialize(
             booleanity_params,
             &self.trace,
-            &self.preprocessing.shared.bytecode,
+            &self.preprocessing.bytecode,
             &self.program_io.memory_layout,
             &self.opening_accumulator,
         );
@@ -1419,6 +1417,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             hw_params,
             &self.trace,
             &self.preprocessing.shared,
+            &self.preprocessing.bytecode,
             &self.one_hot_params,
         );
 
@@ -1619,7 +1618,7 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         };
 
         let streaming_data = Arc::new(RLCStreamingData {
-            bytecode: Arc::clone(&self.preprocessing.shared.bytecode),
+            bytecode: Arc::clone(&self.preprocessing.bytecode),
             memory_layout: self.preprocessing.shared.memory_layout.clone(),
         });
 
@@ -1690,6 +1689,17 @@ fn write_instance_flamegraph_svg(
 pub struct JoltProverPreprocessing<F: JoltField, PCS: CommitmentScheme<Field = F>> {
     pub generators: PCS::ProverSetup,
     pub shared: JoltSharedPreprocessing,
+    /// Full bytecode preprocessing (prover always has full access for witness computation).
+    pub bytecode: Arc<BytecodePreprocessing>,
+    /// Trusted bytecode commitments (only in Committed mode).
+    ///
+    /// In Full mode: None (verifier has full bytecode).
+    /// In Committed mode: Some(trusted) for bytecode chunk polynomial commitments.
+    pub bytecode_commitments: Option<TrustedBytecodeCommitments<PCS>>,
+    /// Opening proof hints for bytecode commitments, e.g., Dory tier-1 data (only in Committed mode).
+    ///
+    /// One hint per commitment in `bytecode_commitments`.
+    pub bytecode_commitment_hints: Option<Vec<PCS::OpeningProofHint>>,
 }
 
 impl<F, PCS> JoltProverPreprocessing<F, PCS>
@@ -1697,11 +1707,8 @@ where
     F: JoltField,
     PCS: CommitmentScheme<Field = F>,
 {
-    #[tracing::instrument(skip_all, name = "JoltProverPreprocessing::gen")]
-    pub fn new(
-        shared: JoltSharedPreprocessing,
-        // max_trace_length: usize,
-    ) -> JoltProverPreprocessing<F, PCS> {
+    /// Set up the PCS prover generators for the maximum padded trace length.
+    fn setup_generators(shared: &JoltSharedPreprocessing) -> PCS::ProverSetup {
         use common::constants::ONEHOT_CHUNK_THRESHOLD_LOG_T;
         let max_T: usize = shared.max_padded_trace_length.next_power_of_two();
         let max_log_T = max_T.log_2();
@@ -1711,8 +1718,51 @@ where
         } else {
             8
         };
-        let generators = PCS::setup_prover(max_log_k_chunk + max_log_T);
-        JoltProverPreprocessing { generators, shared }
+        PCS::setup_prover(max_log_k_chunk + max_log_T)
+    }
+
+    /// Create prover preprocessing in Full mode (no bytecode commitments).
+    ///
+    /// Use this when the verifier will have access to full bytecode.
+    #[tracing::instrument(skip_all, name = "JoltProverPreprocessing::new")]
+    pub fn new(
+        shared: JoltSharedPreprocessing,
+        bytecode: Arc<BytecodePreprocessing>,
+    ) -> JoltProverPreprocessing<F, PCS> {
+        let generators = Self::setup_generators(&shared);
+        JoltProverPreprocessing {
+            generators,
+            shared,
+            bytecode,
+            bytecode_commitments: None,
+            bytecode_commitment_hints: None,
+        }
+    }
+
+    /// Create prover preprocessing in Committed mode (with bytecode commitments).
+    ///
+    /// Use this when the verifier should only receive bytecode commitments (succinct verification).
+    /// Computes commitments + hints for all bytecode chunk polynomials during preprocessing.
+    #[tracing::instrument(skip_all, name = "JoltProverPreprocessing::new_committed")]
+    pub fn new_committed(
+        shared: JoltSharedPreprocessing,
+        bytecode: Arc<BytecodePreprocessing>,
+    ) -> JoltProverPreprocessing<F, PCS> {
+        let generators = Self::setup_generators(&shared);
+        let (trusted_commitments, hints) =
+            TrustedBytecodeCommitments::derive(&bytecode, &generators);
+        JoltProverPreprocessing {
+            generators,
+            shared,
+            bytecode,
+            bytecode_commitments: Some(trusted_commitments),
+            bytecode_commitment_hints: Some(hints),
+        }
+    }
+
+    /// Check if this preprocessing is in Committed mode.
+    pub fn is_committed_mode(&self) -> bool {
+        self.bytecode_commitments.is_some()
     }
 
     pub fn save_to_target_dir(&self, target_dir: &str) -> std::io::Result<()> {
@@ -1740,6 +1790,8 @@ impl<F: JoltField, PCS: CommitmentScheme<Field = F>> Serializable
 
 #[cfg(test)]
 mod tests {
+    use std::sync::Arc;
+
     use ark_bn254::Fr;
     use serial_test::serial;
 
@@ -1753,6 +1805,7 @@ mod tests {
         multilinear_polynomial::MultilinearPolynomial,
         opening_proof::{OpeningAccumulator, SumcheckId},
     };
+    use crate::zkvm::bytecode::BytecodePreprocessing;
     use crate::zkvm::claim_reductions::AdviceKind;
     use crate::zkvm::verifier::JoltSharedPreprocessing;
     use crate::zkvm::witness::CommittedPolynomial;
@@ -1797,16 +1850,20 @@ mod tests {
         DoryGlobals::reset();
         let mut program = host::Program::new("fibonacci-guest");
         let inputs = postcard::to_stdvec(&100u32).unwrap();
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
+
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
 
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         let elf_contents_opt = program.get_elf_contents();
         let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
         let prover = RV64IMACProver::gen_from_elf(
@@ -1821,9 +1878,10 @@ mod tests {
         let io_device = prover.program_io.clone();
         let (jolt_proof, debug_info) = prover.prove();
 
-        let verifier_preprocessing = JoltVerifierPreprocessing::new(
+        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
             shared_preprocessing,
             prover_preprocessing.generators.to_verifier_setup(),
+            bytecode,
         );
         let verifier = RV64IMACVerifier::new(
             &verifier_preprocessing,
@@ -1842,17 +1900,20 @@ mod tests {
         DoryGlobals::reset();
         let mut program = host::Program::new("fibonacci-guest");
         let inputs = postcard::to_stdvec(&5u32).unwrap();
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
 
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             256,
         );
 
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         let elf_contents_opt = program.get_elf_contents();
         let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
         let log_chunk = 8; // Use default log_chunk for tests
@@ -1876,9 +1937,10 @@ mod tests {
         let io_device = prover.program_io.clone();
         let (jolt_proof, debug_info) = prover.prove();
 
-        let verifier_preprocessing = JoltVerifierPreprocessing::new(
+        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
             prover_preprocessing.shared.clone(),
             prover_preprocessing.generators.to_verifier_setup(),
+            bytecode,
         );
         let verifier = RV64IMACVerifier::new(
             &verifier_preprocessing,
@@ -1902,18 +1964,21 @@ mod tests {
         // when the jolt-inlines-keccak256 crate is linked (see lib.rs)
 
         let mut program = host::Program::new("sha3-guest");
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let inputs = postcard::to_stdvec(&[5u8; 32]).unwrap();
         let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
 
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
 
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         let elf_contents_opt = program.get_elf_contents();
         let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
         let prover = RV64IMACProver::gen_from_elf(
@@ -1928,9 +1993,10 @@ mod tests {
         let io_device = prover.program_io.clone();
         let (jolt_proof, debug_info) = prover.prove();
 
-        let verifier_preprocessing = JoltVerifierPreprocessing::new(
+        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
             prover_preprocessing.shared.clone(),
             prover_preprocessing.generators.to_verifier_setup(),
+            Arc::clone(&prover_preprocessing.bytecode),
         );
         let verifier = RV64IMACVerifier::new(
             &verifier_preprocessing,
@@ -1964,18 +2030,21 @@ mod tests {
         // SHA2 inlines are automatically registered via #[ctor::ctor]
         // when the jolt-inlines-sha2 crate is linked (see lib.rs)
         let mut program = host::Program::new("sha2-guest");
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let inputs = postcard::to_stdvec(&[5u8; 32]).unwrap();
         let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
 
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
 
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         let elf_contents_opt = program.get_elf_contents();
         let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
         let prover = RV64IMACProver::gen_from_elf(
@@ -1990,9 +2059,10 @@ mod tests {
         let io_device = prover.program_io.clone();
         let (jolt_proof, debug_info) = prover.prove();
 
-        let verifier_preprocessing = JoltVerifierPreprocessing::new(
+        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
             prover_preprocessing.shared.clone(),
             prover_preprocessing.generators.to_verifier_setup(),
+            Arc::clone(&prover_preprocessing.bytecode),
         );
         let verifier = RV64IMACVerifier::new(
             &verifier_preprocessing,
@@ -2024,20 +2094,23 @@ mod tests {
         // - Trusted: commit in preprocessing-only context, reduce in Stage 6, batch in Stage 8
         // - Untrusted: commit at prove time, reduce in Stage 6, batch in Stage 8
         let mut program = host::Program::new("sha2-guest");
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let inputs = postcard::to_stdvec(&[5u8; 32]).unwrap();
         let trusted_advice = postcard::to_stdvec(&[7u8; 32]).unwrap();
         let untrusted_advice = postcard::to_stdvec(&[9u8; 32]).unwrap();
 
         let (_, _, _, io_device) = program.trace(&inputs, &untrusted_advice, &trusted_advice);
 
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         let elf_contents = program.get_elf_contents().expect("elf contents is None");
 
         let (trusted_commitment, trusted_hint) =
@@ -2088,17 +2161,20 @@ mod tests {
         let trusted_advice = vec![7u8; 4096];
         let untrusted_advice = vec![9u8; 4096];
 
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let (lazy_trace, trace, final_memory_state, io_device) =
             program.trace(&inputs, &untrusted_advice, &trusted_advice);
 
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             256,
         );
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         tracing::info!(
             "preprocessing.memory_layout.max_trusted_advice_size: {}",
             shared_preprocessing.memory_layout.max_trusted_advice_size
@@ -2143,7 +2219,7 @@ mod tests {
         DoryGlobals::reset();
         // Tests a guest (merkle-tree) that actually consumes both trusted and untrusted advice.
         let mut program = host::Program::new("merkle-tree-guest");
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
 
         // Merkle tree with 4 leaves: input=leaf1, trusted=[leaf2, leaf3], untrusted=leaf4
         let inputs = postcard::to_stdvec(&[5u8; 32].as_slice()).unwrap();
@@ -2152,13 +2228,17 @@ mod tests {
         trusted_advice.extend(postcard::to_stdvec(&[7u8; 32]).unwrap());
 
         let (_, _, _, io_device) = program.trace(&inputs, &untrusted_advice, &trusted_advice);
+
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         let elf_contents = program.get_elf_contents().expect("elf contents is None");
 
         let (trusted_commitment, trusted_hint) =
@@ -2211,17 +2291,20 @@ mod tests {
         let trusted_advice = postcard::to_stdvec(&[7u8; 32]).unwrap();
         let untrusted_advice = postcard::to_stdvec(&[9u8; 32]).unwrap();
 
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let (lazy_trace, trace, final_memory_state, io_device) =
             program.trace(&inputs, &untrusted_advice, &trusted_advice);
 
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         let (trusted_commitment, trusted_hint) =
             commit_trusted_advice_preprocessing_only(&prover_preprocessing, &trusted_advice);
 
@@ -2301,17 +2384,20 @@ mod tests {
     fn memory_ops_e2e_dory() {
         DoryGlobals::reset();
         let mut program = host::Program::new("memory-ops-guest");
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let (_, _, _, io_device) = program.trace(&[], &[], &[]);
 
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
 
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         let elf_contents_opt = program.get_elf_contents();
         let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
         let prover = RV64IMACProver::gen_from_elf(
@@ -2326,9 +2412,10 @@ mod tests {
         let io_device = prover.program_io.clone();
         let (jolt_proof, debug_info) = prover.prove();
 
-        let verifier_preprocessing = JoltVerifierPreprocessing::new(
+        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
             prover_preprocessing.shared.clone(),
             prover_preprocessing.generators.to_verifier_setup(),
+            Arc::clone(&prover_preprocessing.bytecode),
         );
         let verifier = RV64IMACVerifier::new(
             &verifier_preprocessing,
@@ -2346,18 +2433,21 @@ mod tests {
     fn btreemap_e2e_dory() {
         DoryGlobals::reset();
         let mut program = host::Program::new("btreemap-guest");
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let inputs = postcard::to_stdvec(&50u32).unwrap();
         let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
 
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
 
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         let elf_contents_opt = program.get_elf_contents();
         let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
         let prover = RV64IMACProver::gen_from_elf(
@@ -2372,9 +2462,10 @@ mod tests {
         let io_device = prover.program_io.clone();
         let (jolt_proof, debug_info) = prover.prove();
 
-        let verifier_preprocessing = JoltVerifierPreprocessing::new(
+        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
             prover_preprocessing.shared.clone(),
             prover_preprocessing.generators.to_verifier_setup(),
+            Arc::clone(&prover_preprocessing.bytecode),
         );
         let verifier = RV64IMACVerifier::new(
             &verifier_preprocessing,
@@ -2392,18 +2483,21 @@ mod tests {
     fn muldiv_e2e_dory() {
         DoryGlobals::reset();
         let mut program = host::Program::new("muldiv-guest");
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let inputs = postcard::to_stdvec(&[9u32, 5u32, 3u32]).unwrap();
         let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
 
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
 
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         let elf_contents_opt = program.get_elf_contents();
         let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
         let prover = RV64IMACProver::gen_from_elf(
@@ -2418,9 +2512,10 @@ mod tests {
         let io_device = prover.program_io.clone();
         let (jolt_proof, debug_info) = prover.prove();
 
-        let verifier_preprocessing = JoltVerifierPreprocessing::new(
+        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
             prover_preprocessing.shared.clone(),
             prover_preprocessing.generators.to_verifier_setup(),
+            Arc::clone(&prover_preprocessing.bytecode),
         );
         let verifier = RV64IMACVerifier::new(
             &verifier_preprocessing,
@@ -2438,21 +2533,24 @@ mod tests {
     #[should_panic]
     fn truncated_trace() {
         let mut program = host::Program::new("fibonacci-guest");
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let inputs = postcard::to_stdvec(&9u8).unwrap();
         let (lazy_trace, mut trace, final_memory_state, mut program_io) =
             program.trace(&inputs, &[], &[]);
         trace.truncate(100);
         program_io.outputs[0] = 0; // change the output to 0
 
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             program_io.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
 
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
 
         let prover = RV64IMACProver::gen_from_trace(
             &prover_preprocessing,
@@ -2466,9 +2564,10 @@ mod tests {
 
         let (proof, _) = prover.prove();
 
-        let verifier_preprocessing = JoltVerifierPreprocessing::new(
+        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
             prover_preprocessing.shared.clone(),
             prover_preprocessing.generators.to_verifier_setup(),
+            Arc::clone(&prover_preprocessing.bytecode),
         );
         let verifier =
             RV64IMACVerifier::new(&verifier_preprocessing, proof, program_io, None, None).unwrap();
@@ -2481,18 +2580,22 @@ mod tests {
     fn malicious_trace() {
         let mut program = host::Program::new("fibonacci-guest");
         let inputs = postcard::to_stdvec(&1u8).unwrap();
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let (lazy_trace, trace, final_memory_state, mut program_io) =
             program.trace(&inputs, &[], &[]);
 
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
+
         // Since the preprocessing is done with the original memory layout, the verifier should fail
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             program_io.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
 
         // change memory address of output & termination bit to the same address as input
         // changes here should not be able to spoof the verifier result
@@ -2511,9 +2614,10 @@ mod tests {
         );
         let (proof, _) = prover.prove();
 
-        let verifier_preprocessing = JoltVerifierPreprocessing::new(
+        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
             prover_preprocessing.shared.clone(),
             prover_preprocessing.generators.to_verifier_setup(),
+            Arc::clone(&prover_preprocessing.bytecode),
         );
         let verifier =
             JoltVerifier::new(&verifier_preprocessing, proof, program_io, None, None).unwrap();
@@ -2528,16 +2632,19 @@ mod tests {
 
         let mut program = host::Program::new("fibonacci-guest");
         let inputs = postcard::to_stdvec(&50u32).unwrap();
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
         let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
 
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         let elf_contents = program.get_elf_contents().expect("elf contents is None");
         let prover = RV64IMACProver::gen_from_elf(
             &prover_preprocessing,
@@ -2551,9 +2658,10 @@ mod tests {
         let io_device = prover.program_io.clone();
         let (proof, debug_info) = prover.prove();
 
-        let verifier_preprocessing = JoltVerifierPreprocessing::new(
+        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
             shared_preprocessing,
             prover_preprocessing.generators.to_verifier_setup(),
+            Arc::clone(&prover_preprocessing.bytecode),
         );
 
         // DoryGlobals is now initialized inside the verifier's verify_stage8
@@ -2571,7 +2679,7 @@ mod tests {
 
         // Tests a guest (merkle-tree) that actually consumes both trusted and untrusted advice.
         let mut program = host::Program::new("merkle-tree-guest");
-        let (bytecode, init_memory_state, _) = program.decode();
+        let (instructions, init_memory_state, _) = program.decode();
 
         // Merkle tree with 4 leaves: input=leaf1, trusted=[leaf2, leaf3], untrusted=leaf4
         let inputs = postcard::to_stdvec(&[5u8; 32].as_slice()).unwrap();
@@ -2580,13 +2688,17 @@ mod tests {
         trusted_advice.extend(postcard::to_stdvec(&[7u8; 32]).unwrap());
 
         let (_, _, _, io_device) = program.trace(&inputs, &untrusted_advice, &trusted_advice);
+
+        let bytecode: Arc<BytecodePreprocessing> =
+            BytecodePreprocessing::preprocess(instructions).into();
         let shared_preprocessing = JoltSharedPreprocessing::new(
-            bytecode.clone(),
+            &bytecode,
             io_device.memory_layout.clone(),
             init_memory_state,
             1 << 16,
         );
-        let prover_preprocessing = JoltProverPreprocessing::new(shared_preprocessing.clone());
+        let prover_preprocessing =
+            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
         let elf_contents = program.get_elf_contents().expect("elf contents is None");
 
         let (trusted_commitment, trusted_hint) =
diff --git a/jolt-core/src/zkvm/verifier.rs b/jolt-core/src/zkvm/verifier.rs
index 819fa3c712..f1def93030 100644
--- a/jolt-core/src/zkvm/verifier.rs
+++ b/jolt-core/src/zkvm/verifier.rs
@@ -7,7 +7,7 @@ use std::sync::Arc;
 use crate::poly::commitment::commitment_scheme::CommitmentScheme;
 use crate::poly::commitment::dory::{DoryContext, DoryGlobals};
 use crate::subprotocols::sumcheck::BatchedSumcheck;
-use crate::zkvm::bytecode::BytecodePreprocessing;
+use crate::zkvm::bytecode::{BytecodePreprocessing, TrustedBytecodeCommitments, VerifierBytecode};
 use crate::zkvm::claim_reductions::advice::ReductionPhase;
 use crate::zkvm::claim_reductions::RegistersClaimReductionSumcheckVerifier;
 use crate::zkvm::config::BytecodeMode;
@@ -77,7 +77,6 @@ use anyhow::Context;
 use ark_serialize::{CanonicalDeserialize, CanonicalSerialize};
 use common::jolt_device::MemoryLayout;
 use itertools::Itertools;
-use tracer::instruction::Instruction;
 use tracer::JoltDevice;
 
 pub struct JoltVerifier<
@@ -438,8 +437,9 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
         anyhow::Error,
     > {
         let n_cycle_vars = self.proof.trace_length.log_2();
+        // `as_full()` errors out in Committed mode, where the full bytecode is unavailable.
         let bytecode_read_raf = BytecodeReadRafAddressSumcheckVerifier::new(
-            &self.preprocessing.shared.bytecode,
+            self.preprocessing.bytecode.as_full()?,
             n_cycle_vars,
             &self.one_hot_params,
             &self.opening_accumulator,
@@ -801,81 +801,35 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
     }
 }
 
-#[derive(Debug, Clone)]
+/// Shared preprocessing between prover and verifier.
+///
+/// **Note**: This struct does NOT contain the full bytecode data.
+/// - Bytecode size K is stored here as the single source of truth.
+/// - Full bytecode data is in `JoltProverPreprocessing.bytecode`.
+/// - Verifier bytecode (Full or Committed) is in `JoltVerifierPreprocessing.bytecode`.
+#[derive(Debug, Clone, CanonicalSerialize, CanonicalDeserialize)]
 pub struct JoltSharedPreprocessing {
-    pub bytecode: Arc<BytecodePreprocessing>,
+    pub bytecode_size: usize,
     pub ram: RAMPreprocessing,
     pub memory_layout: MemoryLayout,
     pub max_padded_trace_length: usize,
 }
 
-impl CanonicalSerialize for JoltSharedPreprocessing {
-    fn serialize_with_mode<W: std::io::Write>(
-        &self,
-        mut writer: W,
-        compress: ark_serialize::Compress,
-    ) -> Result<(), ark_serialize::SerializationError> {
-        // Serialize the inner BytecodePreprocessing (not the Arc wrapper)
-        self.bytecode
-            .as_ref()
-            .serialize_with_mode(&mut writer, compress)?;
-        self.ram.serialize_with_mode(&mut writer, compress)?;
-        self.memory_layout
-            .serialize_with_mode(&mut writer, compress)?;
-        self.max_padded_trace_length
-            .serialize_with_mode(&mut writer, compress)?;
-        Ok(())
-    }
-
-    fn serialized_size(&self, compress: ark_serialize::Compress) -> usize {
-        self.bytecode.serialized_size(compress)
-            + self.ram.serialized_size(compress)
-            + self.memory_layout.serialized_size(compress)
-            + self.max_padded_trace_length.serialized_size(compress)
-    }
-}
-
-impl CanonicalDeserialize for JoltSharedPreprocessing {
-    fn deserialize_with_mode<R: std::io::Read>(
-        mut reader: R,
-        compress: ark_serialize::Compress,
-        validate: ark_serialize::Validate,
-    ) -> Result<Self, ark_serialize::SerializationError> {
-        let bytecode =
-            BytecodePreprocessing::deserialize_with_mode(&mut reader, compress, validate)?;
-        let ram = RAMPreprocessing::deserialize_with_mode(&mut reader, compress, validate)?;
-        let memory_layout = MemoryLayout::deserialize_with_mode(&mut reader, compress, validate)?;
-        let max_padded_trace_length =
-            usize::deserialize_with_mode(&mut reader, compress, validate)?;
-        Ok(Self {
-            bytecode: Arc::new(bytecode),
-            ram,
-            memory_layout,
-            max_padded_trace_length,
-        })
-    }
-}
-
-impl ark_serialize::Valid for JoltSharedPreprocessing {
-    fn check(&self) -> Result<(), ark_serialize::SerializationError> {
-        self.bytecode.check()?;
-        self.ram.check()?;
-        self.memory_layout.check()
-    }
-}
-
 impl JoltSharedPreprocessing {
+    /// Create shared preprocessing from bytecode.
+    ///
+    /// Bytecode size K is derived from `bytecode.bytecode.len()` (already padded).
+    /// The caller must wrap the bytecode in an `Arc` and pass it to the prover/verifier.
     #[tracing::instrument(skip_all, name = "JoltSharedPreprocessing::new")]
     pub fn new(
-        bytecode: Vec<Instruction>,
+        bytecode: &BytecodePreprocessing,
         memory_layout: MemoryLayout,
         memory_init: Vec<(u64, u8)>,
         max_padded_trace_length: usize,
     ) -> JoltSharedPreprocessing {
-        let bytecode = Arc::new(BytecodePreprocessing::preprocess(bytecode));
         let ram = RAMPreprocessing::preprocess(memory_init);
         Self {
-            bytecode,
+            bytecode_size: bytecode.bytecode.len(),
             ram,
             memory_layout,
             max_padded_trace_length,
@@ -883,7 +837,7 @@ impl JoltSharedPreprocessing {
     }
 }
 
-#[derive(Debug, Clone, CanonicalSerialize, CanonicalDeserialize)]
+#[derive(Debug, Clone)]
 pub struct JoltVerifierPreprocessing<F, PCS>
 where
     F: JoltField,
@@ -891,6 +845,69 @@ where
 {
     pub generators: PCS::VerifierSetup,
     pub shared: JoltSharedPreprocessing,
+    /// Bytecode information for verification.
+    ///
+    /// In Full mode: contains full bytecode preprocessing (O(K) data).
+    /// In Committed mode: contains only commitments (succinct).
+    pub bytecode: VerifierBytecode<PCS>,
+}
+
+impl<F, PCS> CanonicalSerialize for JoltVerifierPreprocessing<F, PCS>
+where
+    F: JoltField,
+    PCS: CommitmentScheme<Field = F>,
+{
+    fn serialize_with_mode<W: std::io::Write>(
+        &self,
+        mut writer: W,
+        compress: ark_serialize::Compress,
+    ) -> Result<(), ark_serialize::SerializationError> {
+        self.generators.serialize_with_mode(&mut writer, compress)?;
+        self.shared.serialize_with_mode(&mut writer, compress)?;
+        self.bytecode.serialize_with_mode(&mut writer, compress)?;
+        Ok(())
+    }
+
+    fn serialized_size(&self, compress: ark_serialize::Compress) -> usize {
+        self.generators.serialized_size(compress)
+            + self.shared.serialized_size(compress)
+            + self.bytecode.serialized_size(compress)
+    }
+}
+
+impl<F, PCS> ark_serialize::Valid for JoltVerifierPreprocessing<F, PCS>
+where
+    F: JoltField,
+    PCS: CommitmentScheme<Field = F>,
+{
+    fn check(&self) -> Result<(), ark_serialize::SerializationError> {
+        self.generators.check()?;
+        self.shared.check()?;
+        self.bytecode.check()
+    }
+}
+
+impl<F, PCS> CanonicalDeserialize for JoltVerifierPreprocessing<F, PCS>
+where
+    F: JoltField,
+    PCS: CommitmentScheme<Field = F>,
+{
+    fn deserialize_with_mode<R: std::io::Read>(
+        mut reader: R,
+        compress: ark_serialize::Compress,
+        validate: ark_serialize::Validate,
+    ) -> Result<Self, ark_serialize::SerializationError> {
+        let generators =
+            PCS::VerifierSetup::deserialize_with_mode(&mut reader, compress, validate)?;
+        let shared =
+            JoltSharedPreprocessing::deserialize_with_mode(&mut reader, compress, validate)?;
+        let bytecode = VerifierBytecode::deserialize_with_mode(&mut reader, compress, validate)?;
+        Ok(Self {
+            generators,
+            shared,
+            bytecode,
+        })
+    }
 }
 
 impl<F, PCS> Serializable for JoltVerifierPreprocessing<F, PCS>
@@ -924,14 +941,39 @@ where
 }
 
 impl<F: JoltField, PCS: CommitmentScheme<Field = F>> JoltVerifierPreprocessing<F, PCS> {
-    #[tracing::instrument(skip_all, name = "JoltVerifierPreprocessing::new")]
-    pub fn new(
+    /// Create verifier preprocessing in Full mode (verifier has full bytecode).
+    #[tracing::instrument(skip_all, name = "JoltVerifierPreprocessing::new_full")]
+    pub fn new_full(
         shared: JoltSharedPreprocessing,
         generators: PCS::VerifierSetup,
+        bytecode: Arc<BytecodePreprocessing>,
     ) -> JoltVerifierPreprocessing<F, PCS> {
         Self {
             generators,
-            shared: shared.clone(),
+            shared,
+            bytecode: VerifierBytecode::Full(bytecode),
+        }
+    }
+
+    /// Create verifier preprocessing in Committed mode with trusted commitments.
+    ///
+    /// This is the "fast path" for online verification. The `TrustedBytecodeCommitments`
+    /// type marks commitments that are expected to have been derived from the actual
+    /// bytecode via `TrustedBytecodeCommitments::derive()`; see the trust model below.
+    ///
+    /// # Trust Model
+    /// The caller must ensure the commitments were honestly derived (e.g., loaded from
+    /// a trusted file or received from trusted preprocessing).
+    #[tracing::instrument(skip_all, name = "JoltVerifierPreprocessing::new_committed")]
+    pub fn new_committed(
+        shared: JoltSharedPreprocessing,
+        generators: PCS::VerifierSetup,
+        bytecode_commitments: TrustedBytecodeCommitments<PCS>,
+    ) -> JoltVerifierPreprocessing<F, PCS> {
+        Self {
+            generators,
+            shared,
+            bytecode: VerifierBytecode::Committed(bytecode_commitments),
         }
     }
 }
@@ -942,9 +984,15 @@ impl<F: JoltField, PCS: CommitmentScheme<Field = F>> From<&JoltProverPreprocessi
 {
     fn from(prover_preprocessing: &JoltProverPreprocessing<F, PCS>) -> Self {
         let generators = PCS::setup_verifier(&prover_preprocessing.generators);
+        // Choose VerifierBytecode variant based on whether prover has bytecode commitments
+        let bytecode = match &prover_preprocessing.bytecode_commitments {
+            Some(commitments) => VerifierBytecode::Committed(commitments.clone()),
+            None => VerifierBytecode::Full(Arc::clone(&prover_preprocessing.bytecode)),
+        };
         Self {
             generators,
             shared: prover_preprocessing.shared.clone(),
+            bytecode,
         }
     }
 }
diff --git a/jolt-core/src/zkvm/witness.rs b/jolt-core/src/zkvm/witness.rs
index bde767ba52..e4011002f5 100644
--- a/jolt-core/src/zkvm/witness.rs
+++ b/jolt-core/src/zkvm/witness.rs
@@ -67,6 +67,7 @@ impl CommittedPolynomial {
         &self,
         setup: &PCS::ProverSetup,
         preprocessing: &JoltSharedPreprocessing,
+        bytecode: &BytecodePreprocessing,
         row_cycles: &[tracer::instruction::Cycle],
         one_hot_params: &OneHotParams,
     ) -> <PCS as StreamingCommitmentScheme>::ChunkState
@@ -111,7 +112,7 @@ impl CommittedPolynomial {
                 let row: Vec<Option<usize>> = row_cycles
                     .iter()
                     .map(|cycle| {
-                        let pc = preprocessing.bytecode.get_pc(cycle);
+                        let pc = bytecode.get_pc(cycle);
                         Some(one_hot_params.bytecode_pc_chunk(pc, *idx) as usize)
                     })
                     .collect();
diff --git a/jolt-sdk/macros/src/lib.rs b/jolt-sdk/macros/src/lib.rs
index 58ab22c7ec..68c1d8afc9 100644
--- a/jolt-sdk/macros/src/lib.rs
+++ b/jolt-sdk/macros/src/lib.rs
@@ -446,11 +446,10 @@ impl MacroBuilder {
         quote! {
             #[cfg(all(not(target_arch = "wasm32"), not(feature = "guest")))]
             pub fn #preprocess_shared_fn_name(program: &mut jolt::host::Program)
-                -> jolt::JoltSharedPreprocessing
+                -> (jolt::JoltSharedPreprocessing, jolt::BytecodePreprocessing)
             {
                 #imports
-
-                let (bytecode, memory_init, program_size) = program.decode();
+                let (instructions, memory_init, program_size) = program.decode();
                 let memory_config = MemoryConfig {
                     max_input_size: #max_input_size,
                     max_output_size: #max_output_size,
@@ -461,15 +460,14 @@ impl MacroBuilder {
                     program_size: Some(program_size),
                 };
                 let memory_layout = MemoryLayout::new(&memory_config);
-
+                let bytecode = BytecodePreprocessing::preprocess(instructions);
                 let preprocessing = JoltSharedPreprocessing::new(
-                    bytecode,
+                    &bytecode,
                     memory_layout,
                     memory_init,
                     #max_trace_length,
                 );
-
-                preprocessing
+                (preprocessing, bytecode)
             }
         }
     }
@@ -482,15 +480,13 @@ impl MacroBuilder {
             Ident::new(&format!("preprocess_prover_{fn_name}"), fn_name.span());
         quote! {
             #[cfg(all(not(target_arch = "wasm32"), not(feature = "guest")))]
-            pub fn #preprocess_prover_fn_name(shared_preprocessing: jolt::JoltSharedPreprocessing)
-                -> jolt::JoltProverPreprocessing<jolt::F, jolt::PCS>
+            pub fn #preprocess_prover_fn_name(
+                shared_preprocessing: jolt::JoltSharedPreprocessing,
+                bytecode: std::sync::Arc<jolt::BytecodePreprocessing>,
+            ) -> jolt::JoltProverPreprocessing<jolt::F, jolt::PCS>
             {
                 #imports
-                let prover_preprocessing = JoltProverPreprocessing::new(
-                    shared_preprocessing,
-                );
-
-                prover_preprocessing
+                JoltProverPreprocessing::new(shared_preprocessing, bytecode)
             }
         }
     }
@@ -506,11 +502,11 @@ impl MacroBuilder {
             pub fn #preprocess_verifier_fn_name(
                 shared_preprocess: jolt::JoltSharedPreprocessing,
                 generators: <jolt::PCS as jolt::CommitmentScheme>::VerifierSetup,
+                bytecode: std::sync::Arc<jolt::BytecodePreprocessing>,
             ) -> jolt::JoltVerifierPreprocessing<jolt::F, jolt::PCS>
             {
                 #imports
-                let preprocessing = JoltVerifierPreprocessing::new(shared_preprocess, generators);
-                preprocessing
+                JoltVerifierPreprocessing::new_full(shared_preprocess, generators, bytecode)
             }
         }
     }
diff --git a/jolt-sdk/src/host_utils.rs b/jolt-sdk/src/host_utils.rs
index af6c8192a6..a0b37479af 100644
--- a/jolt-sdk/src/host_utils.rs
+++ b/jolt-sdk/src/host_utils.rs
@@ -10,6 +10,7 @@ pub use jolt_core::ark_bn254::Fr as F;
 pub use jolt_core::field::JoltField;
 pub use jolt_core::guest;
 pub use jolt_core::poly::commitment::dory::DoryCommitmentScheme as PCS;
+pub use jolt_core::zkvm::bytecode::BytecodePreprocessing;
 pub use jolt_core::zkvm::{
     proof_serialization::JoltProof, verifier::JoltSharedPreprocessing,
     verifier::JoltVerifierPreprocessing, RV64IMACProof, RV64IMACVerifier, Serializable,

From 71ee2e147726b04917a2604a99489aae18cd4e2f Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Tue, 20 Jan 2026 01:35:14 -0800
Subject: [PATCH 08/16] refactor(zkvm): move e2e tests to dedicated tests.rs
 module

- Create jolt-core/src/zkvm/tests.rs with E2ETestConfig infrastructure
- Port all 15 e2e tests from prover.rs to unified test runner
- Add committed bytecode mode tests (ignored until verifier ready)
- Wire verifier Stage 6a to branch on BytecodeMode (committed path)
- Update read_raf_checking for optional bytecode preprocessing
- Update bytecode-commitment-progress.md with status
---
 bytecode-commitment-progress.md               |  60 ++
 .../src/zkvm/bytecode/read_raf_checking.rs    |  32 +-
 jolt-core/src/zkvm/mod.rs                     |   3 +
 jolt-core/src/zkvm/prover.rs                  | 950 ------------------
 jolt-core/src/zkvm/tests.rs                   | 773 ++++++++++++++
 jolt-core/src/zkvm/verifier.rs                |  13 +-
 6 files changed, 865 insertions(+), 966 deletions(-)
 create mode 100644 jolt-core/src/zkvm/tests.rs

diff --git a/bytecode-commitment-progress.md b/bytecode-commitment-progress.md
index 33164f339c..ea0ed0ca81 100644
--- a/bytecode-commitment-progress.md
+++ b/bytecode-commitment-progress.md
@@ -353,12 +353,38 @@ Key idea: mirror advice:
 
 ---
 
+## Progress update (2026-01-20)
+
+High-level status (diff vs main):
+- Stage 6 split into 6a/6b with new proofs and wiring in prover/verifier (`jolt-core/src/zkvm/proof_serialization.rs` **L28–L41**; `jolt-core/src/zkvm/prover.rs` **L525–L534**, **L1151–L1394**; `jolt-core/src/zkvm/verifier.rs` **L225–L233**, **L430–L571**).
+- Booleanity split into address/cycle sumchecks; advice round alignment updated (`jolt-core/src/subprotocols/booleanity.rs` **L497–L736**; `jolt-core/src/poly/opening_proof.rs` **L157–L158**; `jolt-core/src/zkvm/witness.rs` **L285–L287**; `jolt-core/src/zkvm/claim_reductions/advice.rs` **L521–L526**).
+- BytecodeReadRaf split + staged Val claims + committed verifier Stage 6a path wired (`jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L1261–L1288**, **L1554–L1636**; `jolt-core/src/zkvm/verifier.rs` **L430–L455**).
+- BytecodeClaimReduction implemented with canonical lane ordering and BytecodeChunk openings (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L1–L15**, **L193–L236**, **L470–L488**, **L494–L671**).
+- Bytecode commitment plumbing is in place (BytecodeMode + preprocessing + VerifierBytecode), but commitment derivation and Stage 8 batching are still TODO (`jolt-core/src/zkvm/config.rs` **L26–L35**; `jolt-core/src/zkvm/prover.rs` **L1688–L1760**; `jolt-core/src/zkvm/verifier.rs` **L840–L976**; `jolt-core/src/zkvm/bytecode/mod.rs` **L41–L59**; `jolt-core/src/poly/rlc_polynomial.rs` **L184–L198**).
+
+Immediate next steps:
+1. Implement `TrustedBytecodeCommitments::derive` and add BytecodeChunk commitments + hints; consider new Dory context if needed (`jolt-core/src/zkvm/bytecode/mod.rs` **L41–L59**; `jolt-core/src/zkvm/prover.rs` **L1688–L1760**).
+2. Wire BytecodeChunk into Stage 8 batching and RLC streaming; add BytecodeChunk to committed polynomial list and witness generation (`jolt-core/src/zkvm/witness.rs` **L34–L61**, **L121–L171**; `jolt-core/src/poly/rlc_polynomial.rs` **L184–L198**; `jolt-core/src/zkvm/prover.rs` **L1504–L1567**).
+3. Add/enable tests (lane ordering, padding, committed mode e2e) and remove ignores once commitments + Stage 8 batching are wired (`jolt-core/src/zkvm/tests.rs` **L426–L486**; `jolt-core/src/zkvm/prover.rs` **L395–L409**; `jolt-core/src/zkvm/verifier.rs` **L171–L177**).
+4. Consider streaming/implicit bytecode chunk representation to avoid `k_chunk * T` materialization (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L216–L239**).
+
+Concerns / risks:
+- BytecodeClaimReduction currently materializes `weight_chunks` and `bytecode_chunks` of size `k_chunk * T` (memory heavy for large bytecode) (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L216–L239**).
+- BytecodeChunk polynomials are placeholders and not yet supported by streaming RLC or witness generation (`jolt-core/src/zkvm/witness.rs` **L121–L123**, **L169–L171**; `jolt-core/src/poly/rlc_polynomial.rs` **L184–L198**).
+
+---
+
 ## Detailed implementation plan (agreed direction)
 
 This section is an implementation checklist in dependency order.
 
 ### Step 1 — Refactor Stage 6 into two substages (6a + 6b)
 
+**Status (2026-01-20)**: DONE  
+- Proof split + serialization: `jolt-core/src/zkvm/proof_serialization.rs` **L28–L41**.  
+- Prover 6a/6b wiring: `jolt-core/src/zkvm/prover.rs` **L525–L534**, **L1151–L1394**.  
+- Verifier 6a/6b wiring: `jolt-core/src/zkvm/verifier.rs` **L225–L233**, **L430–L571**.
+
 **Goal**: make “end of BytecodeReadRaf address rounds” a real stage boundary so we can:
 - emit `Val_s(r_bc)` claims **immediately** after binding `r_bc`,
 - start `BytecodeClaimReduction` during the subsequent **cycle** randomness (what will become Stage 6b),
@@ -404,6 +430,10 @@ Target contents:
 
 ### Step 2 — Split Booleanity into two sumchecks (address + cycle)
 
+**Status (2026-01-20)**: DONE  
+- Address/cycle split + addr-claim chaining: `jolt-core/src/subprotocols/booleanity.rs` **L497–L736**; `jolt-core/src/zkvm/witness.rs` **L285–L287**; `jolt-core/src/poly/opening_proof.rs` **L157–L158**.  
+- Advice round_offset fix: `jolt-core/src/zkvm/claim_reductions/advice.rs` **L521–L526**.
+
 Reason: `Booleanity` is currently a *single* sumcheck with an internal phase transition at `log_k_chunk`:
 - `jolt-core/src/subprotocols/booleanity.rs` **L399–L446**
 
@@ -441,6 +471,11 @@ File:
 
 ### Step 3 — Split BytecodeReadRaf into two sumchecks (address + cycle)
 
+**Status (2026-01-20)**: DONE (split + staged claims + committed verifier wired).  
+- Stage 6a emits Val-only claims: `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L838–L875**.  
+- Verifier fast path uses staged claims: `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L1427–L1445**.  
+- Committed verifier uses bytecode-agnostic params in Stage 6a: `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L1261–L1288**, **L1554–L1636**; `jolt-core/src/zkvm/verifier.rs` **L430–L455**.
+
 Reason: we need a real stage boundary right after binding `r_bc` (bytecode-index address point), because:
 - `Val_s(r_bc)` is computed exactly at the transition today in `init_log_t_rounds`
   - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L307–L340**
@@ -466,6 +501,10 @@ Both kinds of values must land in `opening_claims` so the verifier has them with
 
 ### Step 4 — Implement `BytecodeClaimReduction` (two-phase, single instance)
 
+**Status (2026-01-20)**: PARTIAL (sumcheck + openings done; Stage 8 batching pending).  
+- Claim reduction + lane ordering + weight construction: `jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L1–L15**, **L193–L236**, **L494–L671**.  
+- Emits BytecodeChunk openings (Phase 2): `jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L470–L488**.
+
 This is the new sumcheck that replaces verifier’s \(O(K_{\text{bytecode}})\) evaluation of `val_polys`.
 
 #### 4.1 High-level role
@@ -497,6 +536,10 @@ The address phase should be simpler than advice because lane vars = exactly `log
 
 ### Step 5 — `SumcheckId` / opening bookkeeping (naming + flow)
 
+**Status (2026-01-20)**: DONE  
+- SumcheckId additions: `jolt-core/src/poly/opening_proof.rs` **L136–L162**.  
+- VirtualPolynomial additions: `jolt-core/src/zkvm/witness.rs` **L242–L287**.
+
 #### 5.1 How `SumcheckId` actually enters the proving / verifying flow
 
 `SumcheckId` is part of the **key** used to store scalar claims in the opening accumulator maps.
@@ -598,6 +641,11 @@ We will also add **new `VirtualPolynomial` variants** for scalar claims that are
 
 ### Step 6 — Bytecode commitments in preprocessing + transcript
 
+**Status (2026-01-20)**: PARTIAL  
+- Bytecode commitment plumbing added (types + preprocessing + proof field): `jolt-core/src/zkvm/bytecode/mod.rs` **L30–L111**; `jolt-core/src/zkvm/prover.rs` **L1688–L1760**; `jolt-core/src/zkvm/verifier.rs` **L840–L976**; `jolt-core/src/zkvm/proof_serialization.rs` **L43–L47**.  
+- Commitment derivation still TODO: `jolt-core/src/zkvm/bytecode/mod.rs` **L41–L59**.  
+- Canonical lane ordering implemented in BytecodeClaimReduction: `jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L494–L671**.
+
 #### 6.1 New Dory context + storage
 
 Add a new `DoryContext::Bytecode` (like Trusted/UntrustedAdvice) so we can commit to bytecode chunk polynomials in preprocessing and hand the commitments to the verifier.
@@ -619,6 +667,10 @@ This ordering must be used consistently by:
 
 ### Step 7 — Stage 8 batching integration (bytecode polynomials)
 
+**Status (2026-01-20)**: NOT STARTED / TODO  
+- BytecodeChunk polynomials not yet supported by witness generation or streaming RLC (panic placeholders): `jolt-core/src/zkvm/witness.rs` **L121–L123**, **L169–L171**; `jolt-core/src/poly/rlc_polynomial.rs` **L184–L198**.  
+- Stage 8 currently batches dense + RA + advice only (no BytecodeChunk): `jolt-core/src/zkvm/prover.rs` **L1504–L1567**.
+
 Stage 8 currently builds a streaming `RLCPolynomial` from:
 - dense trace polys
 - onehot RA polys
@@ -638,6 +690,10 @@ Files involved:
 
 ### Step 8 — Defensive padding: bytecode_len vs trace_len
 
+**Status (2026-01-20)**: DONE  
+- Prover pads `T >= K` in committed mode: `jolt-core/src/zkvm/prover.rs` **L395–L409**.  
+- Verifier rejects proofs with `trace_length < bytecode_K` in committed mode: `jolt-core/src/zkvm/verifier.rs` **L171–L177**.
+
 When bytecode commitments are enabled, ensure we have enough cycle randomness to bind bytecode-index vars:
 
 - `padded_trace_len = max(padded_trace_len, bytecode_len.next_power_of_two())`
@@ -646,6 +702,10 @@ This is analogous to `adjust_trace_length_for_advice` in `jolt-core/src/zkvm/pro
 
 ### Step 9 — Tests / validation
 
+**Status (2026-01-20)**: PARTIAL  
+- New e2e harness + bytecode-mode detection tests added in this change: `jolt-core/src/zkvm/tests.rs` **L1–L486**.  
+- Committed-mode e2e tests currently ignored: `jolt-core/src/zkvm/tests.rs` **L426–L447**.
+
 - Unit tests:
   - lane ordering + chunking (k_chunk=16 ⇒ 28 chunks, k_chunk=256 ⇒ 2 chunks)
   - bytecode_len > trace_len padding path
diff --git a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
index f13713ddab..cc2af56021 100644
--- a/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
+++ b/jolt-core/src/zkvm/bytecode/read_raf_checking.rs
@@ -24,7 +24,10 @@ use crate::{
         sumcheck_verifier::{SumcheckInstanceParams, SumcheckInstanceVerifier},
     },
     transcripts::Transcript,
-    utils::{math::Math, small_scalar::SmallScalar, thread::unsafe_allocate_zero_vec},
+    utils::{
+        errors::ProofVerifyError, math::Math, small_scalar::SmallScalar,
+        thread::unsafe_allocate_zero_vec,
+    },
     zkvm::{
         bytecode::BytecodePreprocessing,
         config::{BytecodeMode, OneHotParams},
@@ -1259,26 +1262,29 @@ pub struct BytecodeReadRafAddressSumcheckVerifier<F: JoltField> {
 
 impl<F: JoltField> BytecodeReadRafAddressSumcheckVerifier<F> {
     pub fn new(
-        bytecode_preprocessing: &BytecodePreprocessing,
+        bytecode_preprocessing: Option<&BytecodePreprocessing>,
         n_cycle_vars: usize,
         one_hot_params: &OneHotParams,
         opening_accumulator: &VerifierOpeningAccumulator<F>,
         transcript: &mut impl Transcript,
         bytecode_mode: BytecodeMode,
-    ) -> Self {
+    ) -> Result<Self, ProofVerifyError> {
         let mut params = match bytecode_mode {
             // Commitment mode: verifier MUST avoid O(K_bytecode) work here, and later stages will
             // relate staged Val claims to committed bytecode.
             BytecodeMode::Committed => BytecodeReadRafSumcheckParams::gen_verifier(
-                bytecode_preprocessing,
                 n_cycle_vars,
                 one_hot_params,
                 opening_accumulator,
                 transcript,
             ),
-            // Legacy mode: verifier materializes/evaluates bytecode-dependent polynomials (O(K_bytecode)).
+            // Full mode: verifier materializes/evaluates bytecode-dependent polynomials (O(K_bytecode)).
             BytecodeMode::Full => BytecodeReadRafSumcheckParams::gen(
-                bytecode_preprocessing,
+                bytecode_preprocessing.ok_or_else(|| {
+                    ProofVerifyError::BytecodeTypeMismatch(
+                        "expected Full bytecode preprocessing, got Committed".to_string(),
+                    )
+                })?,
                 n_cycle_vars,
                 one_hot_params,
                 opening_accumulator,
@@ -1286,7 +1292,7 @@ impl<F: JoltField> BytecodeReadRafAddressSumcheckVerifier<F> {
             ),
         };
         params.use_staged_val_claims = bytecode_mode == BytecodeMode::Committed;
-        Self { params }
+        Ok(Self { params })
     }
 
     /// Consume this verifier and return the underlying parameters (for Option B orchestration).
@@ -1542,7 +1548,7 @@ impl<F: JoltField> BytecodeReadRafSumcheckParams<F> {
         transcript: &mut impl Transcript,
     ) -> Self {
         Self::gen_impl(
-            bytecode_preprocessing,
+            Some(bytecode_preprocessing),
             n_cycle_vars,
             one_hot_params,
             opening_accumulator,
@@ -1554,14 +1560,13 @@ impl<F: JoltField> BytecodeReadRafSumcheckParams<F> {
     /// Verifier-side generator: avoids materializing Val(k) polynomials (O(K_bytecode)).
     #[tracing::instrument(skip_all, name = "BytecodeReadRafSumcheckParams::gen_verifier")]
     pub fn gen_verifier(
-        bytecode_preprocessing: &BytecodePreprocessing,
         n_cycle_vars: usize,
         one_hot_params: &OneHotParams,
         opening_accumulator: &dyn OpeningAccumulator<F>,
         transcript: &mut impl Transcript,
     ) -> Self {
         Self::gen_impl(
-            bytecode_preprocessing,
+            None,
             n_cycle_vars,
             one_hot_params,
             opening_accumulator,
@@ -1572,7 +1577,7 @@ impl<F: JoltField> BytecodeReadRafSumcheckParams<F> {
 
     #[allow(clippy::too_many_arguments)]
     fn gen_impl(
-        bytecode_preprocessing: &BytecodePreprocessing,
+        bytecode_preprocessing: Option<&BytecodePreprocessing>,
         n_cycle_vars: usize,
         one_hot_params: &OneHotParams,
         opening_accumulator: &dyn OpeningAccumulator<F>,
@@ -1581,8 +1586,6 @@ impl<F: JoltField> BytecodeReadRafSumcheckParams<F> {
     ) -> Self {
         let gamma_powers = transcript.challenge_scalar_powers(7);
 
-        let bytecode = &bytecode_preprocessing.bytecode;
-
         // Generate all stage-specific gamma powers upfront (order must match verifier)
         let stage1_gammas: Vec<F> = transcript.challenge_scalar_powers(2 + NUM_CIRCUIT_FLAGS);
         let stage2_gammas: Vec<F> = transcript.challenge_scalar_powers(4);
@@ -1599,6 +1602,9 @@ impl<F: JoltField> BytecodeReadRafSumcheckParams<F> {
         let rv_claims = [rv_claim_1, rv_claim_2, rv_claim_3, rv_claim_4, rv_claim_5];
 
         let val_polys = if compute_val_polys {
+            let bytecode = &bytecode_preprocessing
+                .expect("compute_val_polys requires bytecode preprocessing")
+                .bytecode;
             // Pre-compute eq_r_register for stages 4 and 5 (they use different r_register points)
             let r_register_4 = opening_accumulator
                 .get_virtual_polynomial_opening(
diff --git a/jolt-core/src/zkvm/mod.rs b/jolt-core/src/zkvm/mod.rs
index 82117f6b76..fe5ebf6d2c 100644
--- a/jolt-core/src/zkvm/mod.rs
+++ b/jolt-core/src/zkvm/mod.rs
@@ -36,6 +36,9 @@ pub mod spartan;
 pub mod verifier;
 pub mod witness;
 
+#[cfg(test)]
+mod tests;
+
 // Scoped CPU profiler for performance analysis. Feature-gated by "pprof".
 // Usage: let _guard = pprof_scope!("label");
 //
diff --git a/jolt-core/src/zkvm/prover.rs b/jolt-core/src/zkvm/prover.rs
index 972c2dda56..6d01f73e5a 100644
--- a/jolt-core/src/zkvm/prover.rs
+++ b/jolt-core/src/zkvm/prover.rs
@@ -1787,953 +1787,3 @@ impl<F: JoltField, PCS: CommitmentScheme<Field = F>> Serializable
     for JoltProverPreprocessing<F, PCS>
 {
 }
-
-#[cfg(test)]
-mod tests {
-    use std::sync::Arc;
-
-    use ark_bn254::Fr;
-    use serial_test::serial;
-
-    use crate::host;
-    use crate::poly::commitment::dory::{DoryGlobals, DoryLayout};
-    use crate::poly::{
-        commitment::{
-            commitment_scheme::CommitmentScheme,
-            dory::{DoryCommitmentScheme, DoryContext},
-        },
-        multilinear_polynomial::MultilinearPolynomial,
-        opening_proof::{OpeningAccumulator, SumcheckId},
-    };
-    use crate::zkvm::bytecode::BytecodePreprocessing;
-    use crate::zkvm::claim_reductions::AdviceKind;
-    use crate::zkvm::verifier::JoltSharedPreprocessing;
-    use crate::zkvm::witness::CommittedPolynomial;
-    use crate::zkvm::{
-        prover::JoltProverPreprocessing,
-        ram::populate_memory_states,
-        verifier::{JoltVerifier, JoltVerifierPreprocessing},
-        RV64IMACProver, RV64IMACVerifier,
-    };
-
-    fn commit_trusted_advice_preprocessing_only(
-        preprocessing: &JoltProverPreprocessing<Fr, DoryCommitmentScheme>,
-        trusted_advice_bytes: &[u8],
-    ) -> (
-        <DoryCommitmentScheme as CommitmentScheme>::Commitment,
-        <DoryCommitmentScheme as CommitmentScheme>::OpeningProofHint,
-    ) {
-        let max_trusted_advice_size = preprocessing.shared.memory_layout.max_trusted_advice_size;
-        let mut trusted_advice_words = vec![0u64; (max_trusted_advice_size as usize) / 8];
-        populate_memory_states(
-            0,
-            trusted_advice_bytes,
-            Some(&mut trusted_advice_words),
-            None,
-        );
-
-        let poly = MultilinearPolynomial::<Fr>::from(trusted_advice_words);
-        let advice_len = poly.len().next_power_of_two().max(1);
-
-        let _guard =
-            DoryGlobals::initialize_context(1, advice_len, DoryContext::TrustedAdvice, None);
-        let (commitment, hint) = {
-            let _ctx = DoryGlobals::with_context(DoryContext::TrustedAdvice);
-            DoryCommitmentScheme::commit(&poly, &preprocessing.generators)
-        };
-        (commitment, hint)
-    }
-
-    #[test]
-    #[serial]
-    fn fib_e2e_dory() {
-        DoryGlobals::reset();
-        let mut program = host::Program::new("fibonacci-guest");
-        let inputs = postcard::to_stdvec(&100u32).unwrap();
-        let (instructions, init_memory_state, _) = program.decode();
-        let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        let elf_contents_opt = program.get_elf_contents();
-        let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
-        let prover = RV64IMACProver::gen_from_elf(
-            &prover_preprocessing,
-            elf_contents,
-            &inputs,
-            &[],
-            &[],
-            None,
-            None,
-        );
-        let io_device = prover.program_io.clone();
-        let (jolt_proof, debug_info) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
-            shared_preprocessing,
-            prover_preprocessing.generators.to_verifier_setup(),
-            bytecode,
-        );
-        let verifier = RV64IMACVerifier::new(
-            &verifier_preprocessing,
-            jolt_proof,
-            io_device,
-            None,
-            debug_info,
-        )
-        .expect("Failed to create verifier");
-        verifier.verify().expect("Failed to verify proof");
-    }
-
-    #[test]
-    #[serial]
-    fn small_trace_e2e_dory() {
-        DoryGlobals::reset();
-        let mut program = host::Program::new("fibonacci-guest");
-        let inputs = postcard::to_stdvec(&5u32).unwrap();
-        let (instructions, init_memory_state, _) = program.decode();
-        let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            256,
-        );
-
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        let elf_contents_opt = program.get_elf_contents();
-        let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
-        let log_chunk = 8; // Use default log_chunk for tests
-        let prover = RV64IMACProver::gen_from_elf(
-            &prover_preprocessing,
-            elf_contents,
-            &inputs,
-            &[],
-            &[],
-            None,
-            None,
-        );
-
-        assert!(
-            prover.padded_trace_len <= (1 << log_chunk),
-            "Test requires T <= chunk_size ({}), got T = {}",
-            1 << log_chunk,
-            prover.padded_trace_len
-        );
-
-        let io_device = prover.program_io.clone();
-        let (jolt_proof, debug_info) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
-            prover_preprocessing.shared.clone(),
-            prover_preprocessing.generators.to_verifier_setup(),
-            bytecode,
-        );
-        let verifier = RV64IMACVerifier::new(
-            &verifier_preprocessing,
-            jolt_proof,
-            io_device,
-            None,
-            debug_info,
-        )
-        .expect("Failed to create verifier");
-        verifier.verify().expect("Failed to verify proof");
-    }
-
-    #[test]
-    #[serial]
-    fn sha3_e2e_dory() {
-        DoryGlobals::reset();
-        // Ensure SHA3 inline library is linked and auto-registered
-        #[cfg(feature = "host")]
-        use jolt_inlines_keccak256 as _;
-        // SHA3 inlines are automatically registered via #[ctor::ctor]
-        // when the jolt-inlines-keccak256 crate is linked (see lib.rs)
-
-        let mut program = host::Program::new("sha3-guest");
-        let (instructions, init_memory_state, _) = program.decode();
-        let inputs = postcard::to_stdvec(&[5u8; 32]).unwrap();
-        let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        let elf_contents_opt = program.get_elf_contents();
-        let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
-        let prover = RV64IMACProver::gen_from_elf(
-            &prover_preprocessing,
-            elf_contents,
-            &inputs,
-            &[],
-            &[],
-            None,
-            None,
-        );
-        let io_device = prover.program_io.clone();
-        let (jolt_proof, debug_info) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
-            prover_preprocessing.shared.clone(),
-            prover_preprocessing.generators.to_verifier_setup(),
-            Arc::clone(&prover_preprocessing.bytecode),
-        );
-        let verifier = RV64IMACVerifier::new(
-            &verifier_preprocessing,
-            jolt_proof,
-            io_device.clone(),
-            None,
-            debug_info,
-        )
-        .expect("Failed to create verifier");
-        verifier.verify().expect("Failed to verify proof");
-        assert_eq!(
-            io_device.inputs, inputs,
-            "Inputs mismatch: expected {:?}, got {:?}",
-            inputs, io_device.inputs
-        );
-        let expected_output = &[
-            0xd0, 0x3, 0x5c, 0x96, 0x86, 0x6e, 0xe2, 0x2e, 0x81, 0xf5, 0xc4, 0xef, 0xbd, 0x88,
-            0x33, 0xc1, 0x7e, 0xa1, 0x61, 0x10, 0x81, 0xfc, 0xd7, 0xa3, 0xdd, 0xce, 0xce, 0x7f,
-            0x44, 0x72, 0x4, 0x66,
-        ];
-        assert_eq!(io_device.outputs, expected_output, "Outputs mismatch",);
-    }
-
-    #[test]
-    #[serial]
-    fn sha2_e2e_dory() {
-        DoryGlobals::reset();
-        // Ensure SHA2 inline library is linked and auto-registered
-        #[cfg(feature = "host")]
-        use jolt_inlines_sha2 as _;
-        // SHA2 inlines are automatically registered via #[ctor::ctor]
-        // when the jolt-inlines-sha2 crate is linked (see lib.rs)
-        let mut program = host::Program::new("sha2-guest");
-        let (instructions, init_memory_state, _) = program.decode();
-        let inputs = postcard::to_stdvec(&[5u8; 32]).unwrap();
-        let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        let elf_contents_opt = program.get_elf_contents();
-        let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
-        let prover = RV64IMACProver::gen_from_elf(
-            &prover_preprocessing,
-            elf_contents,
-            &inputs,
-            &[],
-            &[],
-            None,
-            None,
-        );
-        let io_device = prover.program_io.clone();
-        let (jolt_proof, debug_info) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
-            prover_preprocessing.shared.clone(),
-            prover_preprocessing.generators.to_verifier_setup(),
-            Arc::clone(&prover_preprocessing.bytecode),
-        );
-        let verifier = RV64IMACVerifier::new(
-            &verifier_preprocessing,
-            jolt_proof,
-            io_device.clone(),
-            None,
-            debug_info,
-        )
-        .expect("Failed to create verifier");
-        verifier.verify().expect("Failed to verify proof");
-        let expected_output = &[
-            0x28, 0x9b, 0xdf, 0x82, 0x9b, 0x4a, 0x30, 0x26, 0x7, 0x9a, 0x3e, 0xa0, 0x89, 0x73,
-            0xb1, 0x97, 0x2d, 0x12, 0x4e, 0x7e, 0xaf, 0x22, 0x33, 0xc6, 0x3, 0x14, 0x3d, 0xc6,
-            0x3b, 0x50, 0xd2, 0x57,
-        ];
-        assert_eq!(
-            io_device.outputs, expected_output,
-            "Outputs mismatch: expected {:?}, got {:?}",
-            expected_output, io_device.outputs
-        );
-    }
-
-    #[test]
-    #[serial]
-    fn sha2_e2e_dory_with_unused_advice() {
-        DoryGlobals::reset();
-        // SHA2 guest does not consume advice, but providing both trusted and untrusted advice
-        // should still work correctly through the full pipeline:
-        // - Trusted: commit in preprocessing-only context, reduce in Stage 6, batch in Stage 8
-        // - Untrusted: commit at prove time, reduce in Stage 6, batch in Stage 8
-        let mut program = host::Program::new("sha2-guest");
-        let (instructions, init_memory_state, _) = program.decode();
-        let inputs = postcard::to_stdvec(&[5u8; 32]).unwrap();
-        let trusted_advice = postcard::to_stdvec(&[7u8; 32]).unwrap();
-        let untrusted_advice = postcard::to_stdvec(&[9u8; 32]).unwrap();
-
-        let (_, _, _, io_device) = program.trace(&inputs, &untrusted_advice, &trusted_advice);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        let elf_contents = program.get_elf_contents().expect("elf contents is None");
-
-        let (trusted_commitment, trusted_hint) =
-            commit_trusted_advice_preprocessing_only(&prover_preprocessing, &trusted_advice);
-
-        let prover = RV64IMACProver::gen_from_elf(
-            &prover_preprocessing,
-            &elf_contents,
-            &inputs,
-            &untrusted_advice,
-            &trusted_advice,
-            Some(trusted_commitment),
-            Some(trusted_hint),
-        );
-        let io_device = prover.program_io.clone();
-        let (jolt_proof, debug_info) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::from(&prover_preprocessing);
-        RV64IMACVerifier::new(
-            &verifier_preprocessing,
-            jolt_proof,
-            io_device.clone(),
-            Some(trusted_commitment),
-            debug_info,
-        )
-        .expect("Failed to create verifier")
-        .verify()
-        .expect("Failed to verify proof");
-
-        // Verify output is correct (advice should not affect sha2 output)
-        let expected_output = &[
-            0x28, 0x9b, 0xdf, 0x82, 0x9b, 0x4a, 0x30, 0x26, 0x7, 0x9a, 0x3e, 0xa0, 0x89, 0x73,
-            0xb1, 0x97, 0x2d, 0x12, 0x4e, 0x7e, 0xaf, 0x22, 0x33, 0xc6, 0x3, 0x14, 0x3d, 0xc6,
-            0x3b, 0x50, 0xd2, 0x57,
-        ];
-        assert_eq!(io_device.outputs, expected_output);
-    }
-
-    #[test]
-    #[serial]
-    fn max_advice_with_small_trace() {
-        DoryGlobals::reset();
-        // Tests that max-sized advice (4KB = 512 words) works with a minimal trace.
-        // With balanced dims (sigma_a=5, nu_a=4 for 512 words), the minimum padded trace
-        // (256 cycles -> total_vars=12) is sufficient to embed advice.
-        let mut program = host::Program::new("fibonacci-guest");
-        let inputs = postcard::to_stdvec(&5u32).unwrap();
-        let trusted_advice = vec![7u8; 4096];
-        let untrusted_advice = vec![9u8; 4096];
-
-        let (instructions, init_memory_state, _) = program.decode();
-        let (lazy_trace, trace, final_memory_state, io_device) =
-            program.trace(&inputs, &untrusted_advice, &trusted_advice);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            256,
-        );
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        tracing::info!(
-            "preprocessing.memory_layout.max_trusted_advice_size: {}",
-            shared_preprocessing.memory_layout.max_trusted_advice_size
-        );
-
-        let (trusted_commitment, trusted_hint) =
-            commit_trusted_advice_preprocessing_only(&prover_preprocessing, &trusted_advice);
-
-        let prover = RV64IMACProver::gen_from_trace(
-            &prover_preprocessing,
-            lazy_trace,
-            trace,
-            io_device,
-            Some(trusted_commitment),
-            Some(trusted_hint),
-            final_memory_state,
-        );
-
-        // Trace is tiny but advice is max-sized
-        assert!(prover.unpadded_trace_len < 512);
-        assert_eq!(prover.padded_trace_len, 256);
-
-        let io_device = prover.program_io.clone();
-        let (jolt_proof, debug_info) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::from(&prover_preprocessing);
-        RV64IMACVerifier::new(
-            &verifier_preprocessing,
-            jolt_proof,
-            io_device,
-            Some(trusted_commitment),
-            debug_info,
-        )
-        .expect("Failed to create verifier")
-        .verify()
-        .expect("Verification failed");
-    }
-
-    #[test]
-    #[serial]
-    fn advice_e2e_dory() {
-        DoryGlobals::reset();
-        // Tests a guest (merkle-tree) that actually consumes both trusted and untrusted advice.
-        let mut program = host::Program::new("merkle-tree-guest");
-        let (instructions, init_memory_state, _) = program.decode();
-
-        // Merkle tree with 4 leaves: input=leaf1, trusted=[leaf2, leaf3], untrusted=leaf4
-        let inputs = postcard::to_stdvec(&[5u8; 32].as_slice()).unwrap();
-        let untrusted_advice = postcard::to_stdvec(&[8u8; 32]).unwrap();
-        let mut trusted_advice = postcard::to_stdvec(&[6u8; 32]).unwrap();
-        trusted_advice.extend(postcard::to_stdvec(&[7u8; 32]).unwrap());
-
-        let (_, _, _, io_device) = program.trace(&inputs, &untrusted_advice, &trusted_advice);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        let elf_contents = program.get_elf_contents().expect("elf contents is None");
-
-        let (trusted_commitment, trusted_hint) =
-            commit_trusted_advice_preprocessing_only(&prover_preprocessing, &trusted_advice);
-
-        let prover = RV64IMACProver::gen_from_elf(
-            &prover_preprocessing,
-            &elf_contents,
-            &inputs,
-            &untrusted_advice,
-            &trusted_advice,
-            Some(trusted_commitment),
-            Some(trusted_hint),
-        );
-        let io_device = prover.program_io.clone();
-        let (jolt_proof, debug_info) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::from(&prover_preprocessing);
-        RV64IMACVerifier::new(
-            &verifier_preprocessing,
-            jolt_proof,
-            io_device.clone(),
-            Some(trusted_commitment),
-            debug_info,
-        )
-        .expect("Failed to create verifier")
-        .verify()
-        .expect("Verification failed");
-
-        // Expected merkle root for leaves [5;32], [6;32], [7;32], [8;32]
-        let expected_output = &[
-            0xb4, 0x37, 0x0f, 0x3a, 0xb, 0x3d, 0x38, 0xa8, 0x7a, 0x6c, 0x4c, 0x46, 0x9, 0xe7, 0x83,
-            0xb3, 0xcc, 0xb7, 0x1c, 0x30, 0x1f, 0xf8, 0x54, 0xd, 0xf7, 0xdd, 0xc8, 0x42, 0x32,
-            0xbb, 0x16, 0xd7,
-        ];
-        assert_eq!(io_device.outputs, expected_output);
-    }
-
-    #[test]
-    #[serial]
-    fn advice_opening_point_derives_from_unified_point() {
-        DoryGlobals::reset();
-        // Tests that advice opening points are correctly derived from the unified main opening
-        // point using Dory's balanced dimension policy.
-        //
-        // For a small trace (256 cycles), the advice row coordinates span both Stage 6 (cycle)
-        // and Stage 7 (address) challenges, verifying the two-phase reduction works correctly.
-        let mut program = host::Program::new("fibonacci-guest");
-        let inputs = postcard::to_stdvec(&5u32).unwrap();
-        let trusted_advice = postcard::to_stdvec(&[7u8; 32]).unwrap();
-        let untrusted_advice = postcard::to_stdvec(&[9u8; 32]).unwrap();
-
-        let (instructions, init_memory_state, _) = program.decode();
-        let (lazy_trace, trace, final_memory_state, io_device) =
-            program.trace(&inputs, &untrusted_advice, &trusted_advice);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        let (trusted_commitment, trusted_hint) =
-            commit_trusted_advice_preprocessing_only(&prover_preprocessing, &trusted_advice);
-
-        let prover = RV64IMACProver::gen_from_trace(
-            &prover_preprocessing,
-            lazy_trace,
-            trace,
-            io_device,
-            Some(trusted_commitment),
-            Some(trusted_hint),
-            final_memory_state,
-        );
-
-        assert_eq!(prover.padded_trace_len, 256, "test expects small trace");
-
-        let io_device = prover.program_io.clone();
-        let (jolt_proof, debug_info) = prover.prove();
-        let debug_info = debug_info.expect("expected debug_info in tests");
-
-        // Get unified opening point and derive expected advice point
-        let (opening_point, _) = debug_info
-            .opening_accumulator
-            .get_committed_polynomial_opening(
-                CommittedPolynomial::InstructionRa(0),
-                SumcheckId::HammingWeightClaimReduction,
-            );
-        let mut point_dory_le = opening_point.r.clone();
-        point_dory_le.reverse();
-
-        let total_vars = point_dory_le.len();
-        let (sigma_main, _nu_main) = DoryGlobals::balanced_sigma_nu(total_vars);
-        let (sigma_a, nu_a) = DoryGlobals::advice_sigma_nu_from_max_bytes(
-            prover_preprocessing
-                .shared
-                .memory_layout
-                .max_trusted_advice_size as usize,
-        );
-
-        // Build expected advice point: [col_bits[0..sigma_a] || row_bits[0..nu_a]]
-        let mut expected_advice_le: Vec<_> = point_dory_le[0..sigma_a].to_vec();
-        expected_advice_le.extend_from_slice(&point_dory_le[sigma_main..sigma_main + nu_a]);
-
-        // Verify both advice types derive the same opening point
-        for (name, kind) in [
-            ("trusted", AdviceKind::Trusted),
-            ("untrusted", AdviceKind::Untrusted),
-        ] {
-            let get_fn = debug_info
-                .opening_accumulator
-                .get_advice_opening(kind, SumcheckId::AdviceClaimReduction);
-            assert!(
-                get_fn.is_some(),
-                "{name} advice opening missing for AdviceClaimReductionPhase2"
-            );
-            let (point_be, _) = get_fn.unwrap();
-            let mut point_le = point_be.r.clone();
-            point_le.reverse();
-            assert_eq!(point_le, expected_advice_le, "{name} advice point mismatch");
-        }
-
-        // Verify end-to-end
-        let verifier_preprocessing = JoltVerifierPreprocessing::from(&prover_preprocessing);
-        RV64IMACVerifier::new(
-            &verifier_preprocessing,
-            jolt_proof,
-            io_device,
-            Some(trusted_commitment),
-            Some(debug_info),
-        )
-        .expect("Failed to create verifier")
-        .verify()
-        .expect("Verification failed");
-    }
-
-    #[test]
-    #[serial]
-    fn memory_ops_e2e_dory() {
-        DoryGlobals::reset();
-        let mut program = host::Program::new("memory-ops-guest");
-        let (instructions, init_memory_state, _) = program.decode();
-        let (_, _, _, io_device) = program.trace(&[], &[], &[]);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        let elf_contents_opt = program.get_elf_contents();
-        let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
-        let prover = RV64IMACProver::gen_from_elf(
-            &prover_preprocessing,
-            elf_contents,
-            &[],
-            &[],
-            &[],
-            None,
-            None,
-        );
-        let io_device = prover.program_io.clone();
-        let (jolt_proof, debug_info) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
-            prover_preprocessing.shared.clone(),
-            prover_preprocessing.generators.to_verifier_setup(),
-            Arc::clone(&prover_preprocessing.bytecode),
-        );
-        let verifier = RV64IMACVerifier::new(
-            &verifier_preprocessing,
-            jolt_proof,
-            io_device,
-            None,
-            debug_info,
-        )
-        .expect("Failed to create verifier");
-        verifier.verify().expect("Failed to verify proof");
-    }
-
-    #[test]
-    #[serial]
-    fn btreemap_e2e_dory() {
-        DoryGlobals::reset();
-        let mut program = host::Program::new("btreemap-guest");
-        let (instructions, init_memory_state, _) = program.decode();
-        let inputs = postcard::to_stdvec(&50u32).unwrap();
-        let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        let elf_contents_opt = program.get_elf_contents();
-        let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
-        let prover = RV64IMACProver::gen_from_elf(
-            &prover_preprocessing,
-            elf_contents,
-            &inputs,
-            &[],
-            &[],
-            None,
-            None,
-        );
-        let io_device = prover.program_io.clone();
-        let (jolt_proof, debug_info) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
-            prover_preprocessing.shared.clone(),
-            prover_preprocessing.generators.to_verifier_setup(),
-            Arc::clone(&prover_preprocessing.bytecode),
-        );
-        let verifier = RV64IMACVerifier::new(
-            &verifier_preprocessing,
-            jolt_proof,
-            io_device,
-            None,
-            debug_info,
-        )
-        .expect("Failed to create verifier");
-        verifier.verify().expect("Failed to verify proof");
-    }
-
-    #[test]
-    #[serial]
-    fn muldiv_e2e_dory() {
-        DoryGlobals::reset();
-        let mut program = host::Program::new("muldiv-guest");
-        let (instructions, init_memory_state, _) = program.decode();
-        let inputs = postcard::to_stdvec(&[9u32, 5u32, 3u32]).unwrap();
-        let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        let elf_contents_opt = program.get_elf_contents();
-        let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
-        let prover = RV64IMACProver::gen_from_elf(
-            &prover_preprocessing,
-            elf_contents,
-            &[50],
-            &[],
-            &[],
-            None,
-            None,
-        );
-        let io_device = prover.program_io.clone();
-        let (jolt_proof, debug_info) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
-            prover_preprocessing.shared.clone(),
-            prover_preprocessing.generators.to_verifier_setup(),
-            Arc::clone(&prover_preprocessing.bytecode),
-        );
-        let verifier = RV64IMACVerifier::new(
-            &verifier_preprocessing,
-            jolt_proof,
-            io_device,
-            None,
-            debug_info,
-        )
-        .expect("Failed to create verifier");
-        verifier.verify().expect("Failed to verify proof");
-    }
-
-    #[test]
-    #[serial]
-    #[should_panic]
-    fn truncated_trace() {
-        let mut program = host::Program::new("fibonacci-guest");
-        let (instructions, init_memory_state, _) = program.decode();
-        let inputs = postcard::to_stdvec(&9u8).unwrap();
-        let (lazy_trace, mut trace, final_memory_state, mut program_io) =
-            program.trace(&inputs, &[], &[]);
-        trace.truncate(100);
-        program_io.outputs[0] = 0; // change the output to 0
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            program_io.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-
-        let prover = RV64IMACProver::gen_from_trace(
-            &prover_preprocessing,
-            lazy_trace,
-            trace,
-            program_io.clone(),
-            None,
-            None,
-            final_memory_state,
-        );
-
-        let (proof, _) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
-            prover_preprocessing.shared.clone(),
-            prover_preprocessing.generators.to_verifier_setup(),
-            Arc::clone(&prover_preprocessing.bytecode),
-        );
-        let verifier =
-            RV64IMACVerifier::new(&verifier_preprocessing, proof, program_io, None, None).unwrap();
-        verifier.verify().unwrap();
-    }
-
-    #[test]
-    #[serial]
-    #[should_panic]
-    fn malicious_trace() {
-        let mut program = host::Program::new("fibonacci-guest");
-        let inputs = postcard::to_stdvec(&1u8).unwrap();
-        let (instructions, init_memory_state, _) = program.decode();
-        let (lazy_trace, trace, final_memory_state, mut program_io) =
-            program.trace(&inputs, &[], &[]);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-
-        // Since the preprocessing is done with the original memory layout, the verifier should fail
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            program_io.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-
-        // change memory address of output & termination bit to the same address as input
-        // changes here should not be able to spoof the verifier result
-        program_io.memory_layout.output_start = program_io.memory_layout.input_start;
-        program_io.memory_layout.output_end = program_io.memory_layout.input_end;
-        program_io.memory_layout.termination = program_io.memory_layout.input_start;
-
-        let prover = RV64IMACProver::gen_from_trace(
-            &prover_preprocessing,
-            lazy_trace,
-            trace,
-            program_io.clone(),
-            None,
-            None,
-            final_memory_state,
-        );
-        let (proof, _) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
-            prover_preprocessing.shared.clone(),
-            prover_preprocessing.generators.to_verifier_setup(),
-            Arc::clone(&prover_preprocessing.bytecode),
-        );
-        let verifier =
-            JoltVerifier::new(&verifier_preprocessing, proof, program_io, None, None).unwrap();
-        verifier.verify().unwrap();
-    }
-
-    #[test]
-    #[serial]
-    fn fib_e2e_dory_address_major() {
-        DoryGlobals::reset();
-        DoryGlobals::set_layout(DoryLayout::AddressMajor);
-
-        let mut program = host::Program::new("fibonacci-guest");
-        let inputs = postcard::to_stdvec(&50u32).unwrap();
-        let (instructions, init_memory_state, _) = program.decode();
-        let (_, _, _, io_device) = program.trace(&inputs, &[], &[]);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        let elf_contents = program.get_elf_contents().expect("elf contents is None");
-        let prover = RV64IMACProver::gen_from_elf(
-            &prover_preprocessing,
-            &elf_contents,
-            &inputs,
-            &[],
-            &[],
-            None,
-            None,
-        );
-        let io_device = prover.program_io.clone();
-        let (proof, debug_info) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
-            shared_preprocessing,
-            prover_preprocessing.generators.to_verifier_setup(),
-            Arc::clone(&prover_preprocessing.bytecode),
-        );
-
-        // DoryGlobals is now initialized inside the verifier's verify_stage8
-        RV64IMACVerifier::new(&verifier_preprocessing, proof, io_device, None, debug_info)
-            .expect("verifier creation failed")
-            .verify()
-            .expect("verification failed");
-    }
-
-    #[test]
-    #[serial]
-    fn advice_e2e_dory_address_major() {
-        DoryGlobals::reset();
-        DoryGlobals::set_layout(DoryLayout::AddressMajor);
-
-        // Tests a guest (merkle-tree) that actually consumes both trusted and untrusted advice.
-        let mut program = host::Program::new("merkle-tree-guest");
-        let (instructions, init_memory_state, _) = program.decode();
-
-        // Merkle tree with 4 leaves: input=leaf1, trusted=[leaf2, leaf3], untrusted=leaf4
-        let inputs = postcard::to_stdvec(&[5u8; 32].as_slice()).unwrap();
-        let untrusted_advice = postcard::to_stdvec(&[8u8; 32]).unwrap();
-        let mut trusted_advice = postcard::to_stdvec(&[6u8; 32]).unwrap();
-        trusted_advice.extend(postcard::to_stdvec(&[7u8; 32]).unwrap());
-
-        let (_, _, _, io_device) = program.trace(&inputs, &untrusted_advice, &trusted_advice);
-
-        let bytecode: Arc<BytecodePreprocessing> =
-            BytecodePreprocessing::preprocess(instructions).into();
-        let shared_preprocessing = JoltSharedPreprocessing::new(
-            &bytecode,
-            io_device.memory_layout.clone(),
-            init_memory_state,
-            1 << 16,
-        );
-        let prover_preprocessing =
-            JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
-        let elf_contents = program.get_elf_contents().expect("elf contents is None");
-
-        let (trusted_commitment, trusted_hint) =
-            commit_trusted_advice_preprocessing_only(&prover_preprocessing, &trusted_advice);
-
-        let prover = RV64IMACProver::gen_from_elf(
-            &prover_preprocessing,
-            &elf_contents,
-            &inputs,
-            &untrusted_advice,
-            &trusted_advice,
-            Some(trusted_commitment),
-            Some(trusted_hint),
-        );
-        let io_device = prover.program_io.clone();
-        let (jolt_proof, debug_info) = prover.prove();
-
-        let verifier_preprocessing = JoltVerifierPreprocessing::from(&prover_preprocessing);
-        RV64IMACVerifier::new(
-            &verifier_preprocessing,
-            jolt_proof,
-            io_device.clone(),
-            Some(trusted_commitment),
-            debug_info,
-        )
-        .expect("Failed to create verifier")
-        .verify()
-        .expect("Verification failed");
-
-        // Expected merkle root for leaves [5;32], [6;32], [7;32], [8;32]
-        let expected_output = &[
-            0xb4, 0x37, 0x0f, 0x3a, 0xb, 0x3d, 0x38, 0xa8, 0x7a, 0x6c, 0x4c, 0x46, 0x9, 0xe7, 0x83,
-            0xb3, 0xcc, 0xb7, 0x1c, 0x30, 0x1f, 0xf8, 0x54, 0xd, 0xf7, 0xdd, 0xc8, 0x42, 0x32,
-            0xbb, 0x16, 0xd7,
-        ];
-        assert_eq!(io_device.outputs, expected_output);
-    }
-}
diff --git a/jolt-core/src/zkvm/tests.rs b/jolt-core/src/zkvm/tests.rs
new file mode 100644
index 0000000000..d821e8429a
--- /dev/null
+++ b/jolt-core/src/zkvm/tests.rs
@@ -0,0 +1,773 @@
+//! End-to-end test infrastructure for Jolt ZKVM.
+//!
+//! This module provides a unified test runner that reduces boilerplate across e2e tests.
+//! Tests can be configured via `E2ETestConfig` to vary:
+//! - Program (fibonacci, sha2, etc.)
+//! - BytecodeMode (Full vs Committed)
+//! - DoryLayout (CycleMajor vs AddressMajor)
+//! - Trace size
+//! - Advice (trusted/untrusted)
+
+use std::sync::Arc;
+
+use ark_bn254::Fr;
+use serial_test::serial;
+
+use crate::host;
+use crate::poly::commitment::commitment_scheme::CommitmentScheme;
+use crate::poly::commitment::dory::{DoryCommitmentScheme, DoryContext, DoryGlobals, DoryLayout};
+use crate::poly::multilinear_polynomial::MultilinearPolynomial;
+use crate::poly::opening_proof::{OpeningAccumulator, SumcheckId};
+use crate::zkvm::bytecode::BytecodePreprocessing;
+use crate::zkvm::claim_reductions::AdviceKind;
+use crate::zkvm::prover::JoltProverPreprocessing;
+use crate::zkvm::ram::populate_memory_states;
+use crate::zkvm::verifier::{JoltSharedPreprocessing, JoltVerifier, JoltVerifierPreprocessing};
+use crate::zkvm::witness::CommittedPolynomial;
+use crate::zkvm::{RV64IMACProver, RV64IMACVerifier};
+
+/// Configuration for an end-to-end test.
+#[derive(Clone)]
+pub struct E2ETestConfig {
+    /// Guest program name (e.g., "fibonacci-guest", "sha2-guest")
+    pub program_name: &'static str,
+    /// Serialized inputs to pass to the guest
+    pub inputs: Vec<u8>,
+    /// Maximum padded trace length (must be a power of 2)
+    pub max_trace_length: usize,
+    /// Whether to use Committed bytecode mode (vs Full)
+    pub committed_bytecode: bool,
+    /// Dory layout override (None = use default CycleMajor)
+    pub dory_layout: Option<DoryLayout>,
+    /// Trusted advice bytes
+    pub trusted_advice: Vec<u8>,
+    /// Untrusted advice bytes
+    pub untrusted_advice: Vec<u8>,
+    /// Expected output bytes (None = don't verify output)
+    pub expected_output: Option<Vec<u8>>,
+}
+
+impl Default for E2ETestConfig {
+    fn default() -> Self {
+        Self {
+            program_name: "fibonacci-guest",
+            inputs: postcard::to_stdvec(&100u32).unwrap(),
+            max_trace_length: 1 << 16,
+            committed_bytecode: false,
+            dory_layout: None,
+            trusted_advice: vec![],
+            untrusted_advice: vec![],
+            expected_output: None,
+        }
+    }
+}
+
+impl E2ETestConfig {
+    // ========================================================================
+    // Program Constructors
+    // ========================================================================
+
+    /// Create config for fibonacci with custom input.
+    pub fn fibonacci(n: u32) -> Self {
+        Self {
+            inputs: postcard::to_stdvec(&n).unwrap(),
+            ..Default::default()
+        }
+    }
+
+    /// Create config for sha2 with a fixed 32-byte input (`[5u8; 32]`) and its expected output.
+    pub fn sha2() -> Self {
+        Self {
+            program_name: "sha2-guest",
+            inputs: postcard::to_stdvec(&[5u8; 32]).unwrap(),
+            expected_output: Some(vec![
+                0x28, 0x9b, 0xdf, 0x82, 0x9b, 0x4a, 0x30, 0x26, 0x7, 0x9a, 0x3e, 0xa0, 0x89, 0x73,
+                0xb1, 0x97, 0x2d, 0x12, 0x4e, 0x7e, 0xaf, 0x22, 0x33, 0xc6, 0x3, 0x14, 0x3d, 0xc6,
+                0x3b, 0x50, 0xd2, 0x57,
+            ]),
+            ..Default::default()
+        }
+    }
+
+    /// Create config for sha3 with a fixed 32-byte input (`[5u8; 32]`) and its expected output.
+    pub fn sha3() -> Self {
+        Self {
+            program_name: "sha3-guest",
+            inputs: postcard::to_stdvec(&[5u8; 32]).unwrap(),
+            expected_output: Some(vec![
+                0xd0, 0x3, 0x5c, 0x96, 0x86, 0x6e, 0xe2, 0x2e, 0x81, 0xf5, 0xc4, 0xef, 0xbd, 0x88,
+                0x33, 0xc1, 0x7e, 0xa1, 0x61, 0x10, 0x81, 0xfc, 0xd7, 0xa3, 0xdd, 0xce, 0xce, 0x7f,
+                0x44, 0x72, 0x4, 0x66,
+            ]),
+            ..Default::default()
+        }
+    }
+
+    /// Create config for merkle-tree guest.
+    /// Default: 4 leaves with input=[5;32], trusted=[6;32,7;32], untrusted=[8;32]
+    pub fn merkle_tree() -> Self {
+        let inputs = postcard::to_stdvec(&[5u8; 32].as_slice()).unwrap();
+        let untrusted_advice = postcard::to_stdvec(&[8u8; 32]).unwrap();
+        let mut trusted_advice = postcard::to_stdvec(&[6u8; 32]).unwrap();
+        trusted_advice.extend(postcard::to_stdvec(&[7u8; 32]).unwrap());
+
+        Self {
+            program_name: "merkle-tree-guest",
+            inputs,
+            trusted_advice,
+            untrusted_advice,
+            expected_output: Some(vec![
+                0xb4, 0x37, 0x0f, 0x3a, 0xb, 0x3d, 0x38, 0xa8, 0x7a, 0x6c, 0x4c, 0x46, 0x9, 0xe7,
+                0x83, 0xb3, 0xcc, 0xb7, 0x1c, 0x30, 0x1f, 0xf8, 0x54, 0xd, 0xf7, 0xdd, 0xc8, 0x42,
+                0x32, 0xbb, 0x16, 0xd7,
+            ]),
+            ..Default::default()
+        }
+    }
+
+    /// Create config for memory-ops guest (no inputs).
+    pub fn memory_ops() -> Self {
+        Self {
+            program_name: "memory-ops-guest",
+            inputs: vec![],
+            ..Default::default()
+        }
+    }
+
+    /// Create config for btreemap guest.
+    pub fn btreemap(n: u32) -> Self {
+        Self {
+            program_name: "btreemap-guest",
+            inputs: postcard::to_stdvec(&n).unwrap(),
+            ..Default::default()
+        }
+    }
+
+    /// Create config for muldiv guest.
+    pub fn muldiv(a: u32, b: u32, c: u32) -> Self {
+        Self {
+            program_name: "muldiv-guest",
+            inputs: postcard::to_stdvec(&[a, b, c]).unwrap(),
+            ..Default::default()
+        }
+    }
+
+    // ========================================================================
+    // Builder Methods
+    // ========================================================================
+
+    /// Set committed bytecode mode.
+    pub fn with_committed_bytecode(mut self) -> Self {
+        self.committed_bytecode = true;
+        self
+    }
+
+    /// Set Dory layout.
+    pub fn with_dory_layout(mut self, layout: DoryLayout) -> Self {
+        self.dory_layout = Some(layout);
+        self
+    }
+
+    /// Set small trace (256 cycles).
+    pub fn with_small_trace(mut self) -> Self {
+        self.max_trace_length = 256;
+        self
+    }
+
+    /// Set custom max trace length.
+    #[allow(dead_code)] // API for future tests
+    pub fn with_max_trace_length(mut self, len: usize) -> Self {
+        self.max_trace_length = len;
+        self
+    }
+
+    /// Set trusted advice bytes.
+    pub fn with_trusted_advice(mut self, advice: Vec<u8>) -> Self {
+        self.trusted_advice = advice;
+        self
+    }
+
+    /// Set untrusted advice bytes.
+    pub fn with_untrusted_advice(mut self, advice: Vec<u8>) -> Self {
+        self.untrusted_advice = advice;
+        self
+    }
+
+    /// Set expected output for verification.
+    #[allow(dead_code)] // API for future tests
+    pub fn expecting_output(mut self, output: Vec<u8>) -> Self {
+        self.expected_output = Some(output);
+        self
+    }
+
+    /// Clear expected output (don't verify).
+    #[allow(dead_code)] // API for future tests
+    pub fn without_output_check(mut self) -> Self {
+        self.expected_output = None;
+        self
+    }
+}
+
+/// Run an end-to-end test with the given configuration.
+///
+/// This handles all axes of variation:
+/// - Program selection
+/// - Bytecode mode (Full vs Committed)
+/// - Dory layout (CycleMajor vs AddressMajor)
+/// - Trusted/untrusted advice (computes commitment if non-empty)
+/// - Maximum padded trace length
+pub fn run_e2e_test(config: E2ETestConfig) {
+    // Setup Dory globals
+    DoryGlobals::reset();
+    if let Some(layout) = config.dory_layout {
+        DoryGlobals::set_layout(layout);
+    }
+
+    // Decode and trace program
+    let mut program = host::Program::new(config.program_name);
+    let (instructions, init_memory_state, _) = program.decode();
+    let (_, _, _, io_device) = program.trace(
+        &config.inputs,
+        &config.untrusted_advice,
+        &config.trusted_advice,
+    );
+
+    // Preprocess bytecode
+    let bytecode = Arc::new(BytecodePreprocessing::preprocess(instructions));
+    let shared_preprocessing = JoltSharedPreprocessing::new(
+        &bytecode,
+        io_device.memory_layout.clone(),
+        init_memory_state,
+        config.max_trace_length,
+    );
+
+    // Create prover preprocessing (mode-dependent)
+    let prover_preprocessing = if config.committed_bytecode {
+        JoltProverPreprocessing::new_committed(shared_preprocessing.clone(), Arc::clone(&bytecode))
+    } else {
+        JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode))
+    };
+
+    // Verify mode is correct
+    assert_eq!(
+        prover_preprocessing.is_committed_mode(),
+        config.committed_bytecode,
+        "Prover mode mismatch"
+    );
+
+    // Compute trusted advice commitment if advice is provided
+    let (trusted_commitment, trusted_hint) = if !config.trusted_advice.is_empty() {
+        let (c, h) =
+            commit_trusted_advice_preprocessing_only(&prover_preprocessing, &config.trusted_advice);
+        (Some(c), Some(h))
+    } else {
+        (None, None)
+    };
+
+    // Create prover and prove
+    let elf_contents = program.get_elf_contents().expect("elf contents is None");
+    let prover = RV64IMACProver::gen_from_elf(
+        &prover_preprocessing,
+        &elf_contents,
+        &config.inputs,
+        &config.untrusted_advice,
+        &config.trusted_advice,
+        trusted_commitment,
+        trusted_hint,
+    );
+    let io_device = prover.program_io.clone();
+    let (jolt_proof, debug_info) = prover.prove();
+
+    // Create verifier preprocessing from prover (respects mode)
+    let verifier_preprocessing = JoltVerifierPreprocessing::from(&prover_preprocessing);
+
+    // Verify mode propagated correctly
+    assert_eq!(
+        verifier_preprocessing.bytecode.is_committed(),
+        config.committed_bytecode,
+        "Verifier mode mismatch"
+    );
+
+    // Verify
+    let verifier = RV64IMACVerifier::new(
+        &verifier_preprocessing,
+        jolt_proof,
+        io_device.clone(),
+        trusted_commitment,
+        debug_info,
+    )
+    .expect("Failed to create verifier");
+    verifier.verify().expect("Verification failed");
+
+    // Check expected output if specified
+    if let Some(expected) = config.expected_output {
+        assert_eq!(
+            io_device.outputs, expected,
+            "Output mismatch for program '{}'",
+            config.program_name
+        );
+    }
+}
+
+/// Helper that commits to trusted advice bytes under the `TrustedAdvice` Dory context.
+fn commit_trusted_advice_preprocessing_only(
+    preprocessing: &JoltProverPreprocessing<Fr, DoryCommitmentScheme>,
+    trusted_advice_bytes: &[u8],
+) -> (
+    <DoryCommitmentScheme as CommitmentScheme>::Commitment,
+    <DoryCommitmentScheme as CommitmentScheme>::OpeningProofHint,
+) {
+    let max_trusted_advice_size = preprocessing.shared.memory_layout.max_trusted_advice_size;
+    let mut trusted_advice_words = vec![0u64; (max_trusted_advice_size as usize) / 8];
+    populate_memory_states(
+        0,
+        trusted_advice_bytes,
+        Some(&mut trusted_advice_words),
+        None,
+    );
+
+    let poly = MultilinearPolynomial::<Fr>::from(trusted_advice_words);
+    let advice_len = poly.len().next_power_of_two().max(1);
+
+    let _guard = DoryGlobals::initialize_context(1, advice_len, DoryContext::TrustedAdvice, None);
+    let (commitment, hint) = {
+        let _ctx = DoryGlobals::with_context(DoryContext::TrustedAdvice);
+        DoryCommitmentScheme::commit(&poly, &preprocessing.generators)
+    };
+    (commitment, hint)
+}
+
+#[test]
+#[serial]
+fn fib_e2e() {
+    run_e2e_test(E2ETestConfig::default());
+}
+
+#[test]
+#[serial]
+fn fib_e2e_small_trace() {
+    run_e2e_test(E2ETestConfig::fibonacci(5).with_small_trace());
+}
+
+#[test]
+#[serial]
+fn sha2_e2e() {
+    #[cfg(feature = "host")]
+    use jolt_inlines_sha2 as _;
+    run_e2e_test(E2ETestConfig::sha2());
+}
+
+#[test]
+#[serial]
+fn sha3_e2e() {
+    #[cfg(feature = "host")]
+    use jolt_inlines_keccak256 as _;
+    run_e2e_test(E2ETestConfig::sha3());
+}
+
+#[test]
+#[serial]
+fn sha2_with_unused_advice_e2e() {
+    // SHA2 guest does not consume advice, but providing both trusted and untrusted advice
+    // should still work correctly through the full pipeline.
+    #[cfg(feature = "host")]
+    use jolt_inlines_sha2 as _;
+
+    run_e2e_test(
+        E2ETestConfig::sha2()
+            .with_trusted_advice(postcard::to_stdvec(&[7u8; 32]).unwrap())
+            .with_untrusted_advice(postcard::to_stdvec(&[9u8; 32]).unwrap()),
+    );
+}
+
+#[test]
+#[serial]
+fn advice_merkle_tree_e2e() {
+    run_e2e_test(E2ETestConfig::merkle_tree());
+}
+
+#[test]
+#[serial]
+fn memory_ops_e2e() {
+    run_e2e_test(E2ETestConfig::memory_ops());
+}
+
+#[test]
+#[serial]
+fn btreemap_e2e() {
+    run_e2e_test(E2ETestConfig::btreemap(50));
+}
+
+#[test]
+#[serial]
+fn muldiv_e2e() {
+    run_e2e_test(E2ETestConfig::muldiv(9, 5, 3));
+}
+
+#[test]
+#[serial]
+fn fib_e2e_address_major() {
+    run_e2e_test(E2ETestConfig::default().with_dory_layout(DoryLayout::AddressMajor));
+}
+
+#[test]
+#[serial]
+fn advice_merkle_tree_e2e_address_major() {
+    run_e2e_test(E2ETestConfig::merkle_tree().with_dory_layout(DoryLayout::AddressMajor));
+}
+
+// ============================================================================
+// New Tests - Committed Bytecode Mode
+//
+// These tests are ignored until the verifier is fully updated to support
+// Committed mode (currently it calls as_full() which fails in Committed mode).
+// See verifier.rs line 442 - needs to branch on bytecode mode.
+// ============================================================================
+
+#[test]
+#[serial]
+#[ignore = "Verifier not yet updated for Committed mode"]
+fn fib_e2e_committed_bytecode() {
+    run_e2e_test(E2ETestConfig::default().with_committed_bytecode());
+}
+
+#[test]
+#[serial]
+#[ignore = "Verifier not yet updated for Committed mode"]
+fn fib_e2e_committed_bytecode_address_major() {
+    run_e2e_test(
+        E2ETestConfig::default()
+            .with_committed_bytecode()
+            .with_dory_layout(DoryLayout::AddressMajor),
+    );
+}
+
+// ============================================================================
+// New Tests - Bytecode Mode Detection
+// ============================================================================
+
+#[test]
+#[serial]
+fn bytecode_mode_detection_full() {
+    DoryGlobals::reset();
+    let mut program = host::Program::new("fibonacci-guest");
+    let (instructions, init_memory_state, _) = program.decode();
+    let (_, _, _, io_device) = program.trace(&[], &[], &[]);
+
+    let bytecode = Arc::new(BytecodePreprocessing::preprocess(instructions));
+    let shared = JoltSharedPreprocessing::new(
+        &bytecode,
+        io_device.memory_layout.clone(),
+        init_memory_state,
+        1 << 16,
+    );
+
+    // Full mode
+    let prover_full: JoltProverPreprocessing<Fr, DoryCommitmentScheme> =
+        JoltProverPreprocessing::new(shared.clone(), Arc::clone(&bytecode));
+    assert!(!prover_full.is_committed_mode());
+    assert!(prover_full.bytecode_commitments.is_none());
+
+    let verifier_full = JoltVerifierPreprocessing::from(&prover_full);
+    assert!(verifier_full.bytecode.is_full());
+    assert!(!verifier_full.bytecode.is_committed());
+    assert!(verifier_full.bytecode.as_full().is_ok());
+    assert!(verifier_full.bytecode.as_committed().is_err());
+}
+
+#[test]
+#[serial]
+fn bytecode_mode_detection_committed() {
+    DoryGlobals::reset();
+    let mut program = host::Program::new("fibonacci-guest");
+    let (instructions, init_memory_state, _) = program.decode();
+    let (_, _, _, io_device) = program.trace(&[], &[], &[]);
+
+    let bytecode = Arc::new(BytecodePreprocessing::preprocess(instructions));
+    let shared = JoltSharedPreprocessing::new(
+        &bytecode,
+        io_device.memory_layout.clone(),
+        init_memory_state,
+        1 << 16,
+    );
+
+    // Committed mode
+    let prover_committed: JoltProverPreprocessing<Fr, DoryCommitmentScheme> =
+        JoltProverPreprocessing::new_committed(shared.clone(), Arc::clone(&bytecode));
+    assert!(prover_committed.is_committed_mode());
+    assert!(prover_committed.bytecode_commitments.is_some());
+
+    let verifier_committed = JoltVerifierPreprocessing::from(&prover_committed);
+    assert!(!verifier_committed.bytecode.is_full());
+    assert!(verifier_committed.bytecode.is_committed());
+    assert!(verifier_committed.bytecode.as_full().is_err());
+    assert!(verifier_committed.bytecode.as_committed().is_ok());
+}
+
+// ============================================================================
+// Internal and Security Tests
+//
+// These tests require access to prover internals or manipulate trace/io
+// directly for security testing. They cannot use E2ETestConfig.
+// ============================================================================
+
+#[test]
+#[serial]
+fn max_advice_with_small_trace() {
+    DoryGlobals::reset();
+    // Tests that max-sized advice (4KB = 512 words) works with a minimal trace.
+    // With balanced dims (sigma_a=5, nu_a=4 for 512 words), the minimum padded trace
+    // (256 cycles -> total_vars=12) is sufficient to embed advice.
+    let mut program = host::Program::new("fibonacci-guest");
+    let inputs = postcard::to_stdvec(&5u32).unwrap();
+    let trusted_advice = vec![7u8; 4096];
+    let untrusted_advice = vec![9u8; 4096];
+
+    let (instructions, init_memory_state, _) = program.decode();
+    let (lazy_trace, trace, final_memory_state, io_device) =
+        program.trace(&inputs, &untrusted_advice, &trusted_advice);
+
+    let bytecode: Arc<BytecodePreprocessing> =
+        BytecodePreprocessing::preprocess(instructions).into();
+    let shared_preprocessing = JoltSharedPreprocessing::new(
+        &bytecode,
+        io_device.memory_layout.clone(),
+        init_memory_state,
+        256,
+    );
+    let prover_preprocessing: JoltProverPreprocessing<Fr, DoryCommitmentScheme> =
+        JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
+    tracing::info!(
+        "preprocessing.memory_layout.max_trusted_advice_size: {}",
+        shared_preprocessing.memory_layout.max_trusted_advice_size
+    );
+
+    let (trusted_commitment, trusted_hint) =
+        commit_trusted_advice_preprocessing_only(&prover_preprocessing, &trusted_advice);
+
+    let prover = RV64IMACProver::gen_from_trace(
+        &prover_preprocessing,
+        lazy_trace,
+        trace,
+        io_device,
+        Some(trusted_commitment),
+        Some(trusted_hint),
+        final_memory_state,
+    );
+
+    // Trace is tiny but advice is max-sized
+    assert!(prover.unpadded_trace_len < 512);
+    assert_eq!(prover.padded_trace_len, 256);
+
+    let io_device = prover.program_io.clone();
+    let (jolt_proof, debug_info) = prover.prove();
+
+    let verifier_preprocessing = JoltVerifierPreprocessing::from(&prover_preprocessing);
+    RV64IMACVerifier::new(
+        &verifier_preprocessing,
+        jolt_proof,
+        io_device,
+        Some(trusted_commitment),
+        debug_info,
+    )
+    .expect("Failed to create verifier")
+    .verify()
+    .expect("Verification failed");
+}
+
+#[test]
+#[serial]
+fn advice_opening_point_derives_from_unified_point() {
+    DoryGlobals::reset();
+    // Tests that advice opening points are correctly derived from the unified main opening
+    // point using Dory's balanced dimension policy.
+    //
+    // For a small trace (256 cycles), the advice row coordinates span both Stage 6 (cycle)
+    // and Stage 7 (address) challenges, verifying the two-phase reduction works correctly.
+    let mut program = host::Program::new("fibonacci-guest");
+    let inputs = postcard::to_stdvec(&5u32).unwrap();
+    let trusted_advice = postcard::to_stdvec(&[7u8; 32]).unwrap();
+    let untrusted_advice = postcard::to_stdvec(&[9u8; 32]).unwrap();
+
+    let (instructions, init_memory_state, _) = program.decode();
+    let (lazy_trace, trace, final_memory_state, io_device) =
+        program.trace(&inputs, &untrusted_advice, &trusted_advice);
+
+    let bytecode: Arc<BytecodePreprocessing> =
+        BytecodePreprocessing::preprocess(instructions).into();
+    let shared_preprocessing = JoltSharedPreprocessing::new(
+        &bytecode,
+        io_device.memory_layout.clone(),
+        init_memory_state,
+        1 << 16,
+    );
+    let prover_preprocessing: JoltProverPreprocessing<Fr, DoryCommitmentScheme> =
+        JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
+    let (trusted_commitment, trusted_hint) =
+        commit_trusted_advice_preprocessing_only(&prover_preprocessing, &trusted_advice);
+
+    let prover = RV64IMACProver::gen_from_trace(
+        &prover_preprocessing,
+        lazy_trace,
+        trace,
+        io_device,
+        Some(trusted_commitment),
+        Some(trusted_hint),
+        final_memory_state,
+    );
+
+    assert_eq!(prover.padded_trace_len, 256, "test expects small trace");
+
+    let io_device = prover.program_io.clone();
+    let (jolt_proof, debug_info) = prover.prove();
+    let debug_info = debug_info.expect("expected debug_info in tests");
+
+    // Get unified opening point and derive expected advice point
+    let (opening_point, _) = debug_info
+        .opening_accumulator
+        .get_committed_polynomial_opening(
+            CommittedPolynomial::InstructionRa(0),
+            SumcheckId::HammingWeightClaimReduction,
+        );
+    let mut point_dory_le = opening_point.r.clone();
+    point_dory_le.reverse();
+
+    let total_vars = point_dory_le.len();
+    let (sigma_main, _nu_main) = DoryGlobals::balanced_sigma_nu(total_vars);
+    let (sigma_a, nu_a) = DoryGlobals::advice_sigma_nu_from_max_bytes(
+        prover_preprocessing
+            .shared
+            .memory_layout
+            .max_trusted_advice_size as usize,
+    );
+
+    // Build expected advice point: [col_bits[0..sigma_a] || row_bits[0..nu_a]]
+    let mut expected_advice_le: Vec<_> = point_dory_le[0..sigma_a].to_vec();
+    expected_advice_le.extend_from_slice(&point_dory_le[sigma_main..sigma_main + nu_a]);
+
+    // Verify both advice types derive the same opening point
+    for (name, kind) in [
+        ("trusted", AdviceKind::Trusted),
+        ("untrusted", AdviceKind::Untrusted),
+    ] {
+        let opening = debug_info
+            .opening_accumulator
+            .get_advice_opening(kind, SumcheckId::AdviceClaimReduction);
+        assert!(
+            opening.is_some(),
+            "{name} advice opening missing for SumcheckId::AdviceClaimReduction"
+        );
+        let (point_be, _) = opening.unwrap();
+        let mut point_le = point_be.r.clone();
+        point_le.reverse();
+        assert_eq!(point_le, expected_advice_le, "{name} advice point mismatch");
+    }
+
+    // Verify end-to-end
+    let verifier_preprocessing = JoltVerifierPreprocessing::from(&prover_preprocessing);
+    RV64IMACVerifier::new(
+        &verifier_preprocessing,
+        jolt_proof,
+        io_device,
+        Some(trusted_commitment),
+        Some(debug_info),
+    )
+    .expect("Failed to create verifier")
+    .verify()
+    .expect("Verification failed");
+}
+
+#[test]
+#[serial]
+#[should_panic]
+fn truncated_trace() {
+    let mut program = host::Program::new("fibonacci-guest");
+    let (instructions, init_memory_state, _) = program.decode();
+    let inputs = postcard::to_stdvec(&9u8).unwrap();
+    let (lazy_trace, mut trace, final_memory_state, mut program_io) =
+        program.trace(&inputs, &[], &[]);
+    trace.truncate(100);
+    program_io.outputs[0] = 0; // change the output to 0
+
+    let bytecode: Arc<BytecodePreprocessing> =
+        BytecodePreprocessing::preprocess(instructions).into();
+    let shared_preprocessing = JoltSharedPreprocessing::new(
+        &bytecode,
+        program_io.memory_layout.clone(),
+        init_memory_state,
+        1 << 16,
+    );
+
+    let prover_preprocessing: JoltProverPreprocessing<Fr, DoryCommitmentScheme> =
+        JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
+
+    let prover = RV64IMACProver::gen_from_trace(
+        &prover_preprocessing,
+        lazy_trace,
+        trace,
+        program_io.clone(),
+        None,
+        None,
+        final_memory_state,
+    );
+
+    let (proof, _) = prover.prove();
+
+    let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
+        prover_preprocessing.shared.clone(),
+        prover_preprocessing.generators.to_verifier_setup(),
+        Arc::clone(&prover_preprocessing.bytecode),
+    );
+    let verifier =
+        RV64IMACVerifier::new(&verifier_preprocessing, proof, program_io, None, None).unwrap();
+    verifier.verify().unwrap();
+}
+
+#[test]
+#[serial]
+#[should_panic]
+fn malicious_trace() {
+    let mut program = host::Program::new("fibonacci-guest");
+    let inputs = postcard::to_stdvec(&1u8).unwrap();
+    let (instructions, init_memory_state, _) = program.decode();
+    let (lazy_trace, trace, final_memory_state, mut program_io) = program.trace(&inputs, &[], &[]);
+
+    let bytecode: Arc<BytecodePreprocessing> =
+        BytecodePreprocessing::preprocess(instructions).into();
+
+    // Since the preprocessing is done with the original memory layout, the verifier should fail
+    let shared_preprocessing = JoltSharedPreprocessing::new(
+        &bytecode,
+        program_io.memory_layout.clone(),
+        init_memory_state,
+        1 << 16,
+    );
+    let prover_preprocessing: JoltProverPreprocessing<Fr, DoryCommitmentScheme> =
+        JoltProverPreprocessing::new(shared_preprocessing.clone(), Arc::clone(&bytecode));
+
+    // change memory address of output & termination bit to the same address as input
+    // changes here should not be able to spoof the verifier result
+    program_io.memory_layout.output_start = program_io.memory_layout.input_start;
+    program_io.memory_layout.output_end = program_io.memory_layout.input_end;
+    program_io.memory_layout.termination = program_io.memory_layout.input_start;
+
+    let prover = RV64IMACProver::gen_from_trace(
+        &prover_preprocessing,
+        lazy_trace,
+        trace,
+        program_io.clone(),
+        None,
+        None,
+        final_memory_state,
+    );
+    let (proof, _) = prover.prove();
+
+    let verifier_preprocessing = JoltVerifierPreprocessing::new_full(
+        prover_preprocessing.shared.clone(),
+        prover_preprocessing.generators.to_verifier_setup(),
+        Arc::clone(&prover_preprocessing.bytecode),
+    );
+    let verifier =
+        JoltVerifier::new(&verifier_preprocessing, proof, program_io, None, None).unwrap();
+    verifier.verify().unwrap();
+}
diff --git a/jolt-core/src/zkvm/verifier.rs b/jolt-core/src/zkvm/verifier.rs
index f1def93030..34c5f69674 100644
--- a/jolt-core/src/zkvm/verifier.rs
+++ b/jolt-core/src/zkvm/verifier.rs
@@ -437,15 +437,22 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
         anyhow::Error,
     > {
         let n_cycle_vars = self.proof.trace_length.log_2();
-        // In Committed mode, this returns an error (Full bytecode not available)
+        let bytecode_preprocessing = match self.proof.bytecode_mode {
+            BytecodeMode::Committed => {
+                // Ensure we have committed bytecode commitments for committed mode.
+                let _ = self.preprocessing.bytecode.as_committed()?;
+                None
+            }
+            BytecodeMode::Full => Some(self.preprocessing.bytecode.as_full()?.as_ref()),
+        };
         let bytecode_read_raf = BytecodeReadRafAddressSumcheckVerifier::new(
-            self.preprocessing.bytecode.as_full()?,
+            bytecode_preprocessing,
             n_cycle_vars,
             &self.one_hot_params,
             &self.opening_accumulator,
             &mut self.transcript,
             self.proof.bytecode_mode,
-        );
+        )?;
         let booleanity_params = BooleanitySumcheckParams::new(
             n_cycle_vars,
             &self.one_hot_params,

From f9e5fed7aca9ca5fe7edaba98a33784933cf7456 Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Tue, 20 Jan 2026 02:15:43 -0800
Subject: [PATCH 09/16] refactor(sdk): simplify preprocessing API

- Add macro-generated preprocess_<func> and keep verifier preprocessing derived from prover
- Update examples and host template to the new 2-call workflow
- Fold bytecode preprocessing refactor notes into bytecode-commitment-progress.md (single authoritative doc)
- Fix bigint inline assembly gating to avoid host build failures
---
 book/src/usage/guests_hosts/hosts.md          |   4 +-
 bytecode-commitment-progress.md               |  89 ++++++++
 bytecode-refactor-design.md                   | 202 ------------------
 examples/alloc/src/main.rs                    |   9 +-
 examples/btreemap/host/src/main.rs            |  11 +-
 examples/collatz/src/main.rs                  |  14 +-
 examples/fibonacci/src/main.rs                |   7 +-
 examples/hash-bench/src/main.rs               |   6 +-
 examples/malloc/src/main.rs                   |   9 +-
 examples/memory-ops/src/main.rs               |   9 +-
 examples/merkle-tree/src/main.rs              |   9 +-
 examples/muldiv/src/main.rs                   |   9 +-
 examples/multi-function/src/main.rs           |  15 +-
 examples/overflow/src/main.rs                 |  21 +-
 examples/random/src/main.rs                   |   9 +-
 examples/recover-ecdsa/src/main.rs            |   9 +-
 examples/secp256k1-ecdsa-verify/src/main.rs   |   7 +-
 examples/sha2-chain/src/main.rs               |   9 +-
 examples/sha2-ex/src/main.rs                  |   9 +-
 examples/sha3-chain/src/main.rs               |   9 +-
 examples/sha3-ex/src/main.rs                  |   9 +-
 examples/stdlib/src/main.rs                   |  18 +-
 jolt-inlines/bigint/src/multiplication/mod.rs |   1 -
 jolt-inlines/bigint/src/multiplication/sdk.rs |  10 +-
 jolt-sdk/macros/src/lib.rs                    |  84 ++++----
 src/main.rs                                   |   7 +-
 26 files changed, 209 insertions(+), 386 deletions(-)
 delete mode 100644 bytecode-refactor-design.md

diff --git a/book/src/usage/guests_hosts/hosts.md b/book/src/usage/guests_hosts/hosts.md
index 5c05bb9dda..5c1f2fae1f 100644
--- a/book/src/usage/guests_hosts/hosts.md
+++ b/book/src/usage/guests_hosts/hosts.md
@@ -5,7 +5,7 @@ Hosts are where we can invoke the Jolt prover to prove functions defined within
 The host imports the guest package, and will have automatically generated functions to build each of the Jolt functions. For the SHA3 example we looked at in the [guest](./guests.md) section, the `jolt::provable` procedural macro generates several functions that can be invoked from the host (shown below):
 
 - `compile_sha3(target_dir)` to compile the SHA3 guest to RISC-V
-- `preprocess_prover_sha3` and `verifier_preprocessing_from_prover_sha3` to generate the prover and verifier preprocessing. Note that the preprocessing only needs to be generated once for a given guest program, and can subsequently be reused to prove multiple invocations of the guest.
+- `preprocess_sha3` and `verifier_preprocessing_from_prover_sha3` to generate the prover and verifier preprocessing. Note that the preprocessing only needs to be generated once for a given guest program, and can subsequently be reused to prove multiple invocations of the guest.
 - `build_prover_sha3` returns a closure for the prover, which takes in the same input types as the original function and modifies the output to additionally include a proof.
 - `build_verifier_sha3` returns a closure for the verifier, which verifies the proof. The verifier closure's parameters comprise of the program input, the claimed output, a `bool` value claiming whether the guest panicked, and the proof.
 
@@ -14,7 +14,7 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_sha3(target_dir);
 
-    let prover_preprocessing = guest::preprocess_prover_sha3(&mut program);
+    let prover_preprocessing = guest::preprocess_sha3(&mut program);
     let verifier_preprocessing =
         guest::verifier_preprocessing_from_prover_sha3(&prover_preprocessing);
 
diff --git a/bytecode-commitment-progress.md b/bytecode-commitment-progress.md
index ea0ed0ca81..cd084b8e02 100644
--- a/bytecode-commitment-progress.md
+++ b/bytecode-commitment-progress.md
@@ -2,6 +2,94 @@
 
 This file is a **living design doc** for implementing **bytecode commitment** to remove verifier work linear in bytecode size \(K\), especially in recursion contexts (e.g. `examples/recursion/`).
 
+This is the **single authoritative document** for:
+- bytecode commitment design + implementation progress
+- the bytecode preprocessing refactor (Full vs Committed split via `BytecodeMode`)
+
+## Current architecture baseline (post-refactor)
+
+Bytecode preprocessing is now split between prover and verifier based on `BytecodeMode`:
+
+- **Full mode**: verifier has access to full bytecode (may do \(O(K)\) work).
+- **Committed mode**: verifier only has bytecode *commitments* (succinct), and verification uses claim reductions.
+
+### Data structures (single source of truth for bytecode size \(K\))
+
+```
+BytecodePreprocessing  ← O(K) data, created first via preprocess()
+├── bytecode: Vec<Instruction>
+└── pc_map: BytecodePCMapper
+
+JoltSharedPreprocessing  ← Truly shared, single source of truth for size
+├── bytecode_size: usize            ← Derived from bytecode.bytecode.len()
+├── ram: RAMPreprocessing
+├── memory_layout: MemoryLayout
+└── max_padded_trace_length: usize
+
+JoltProverPreprocessing  ← Prover always has full bytecode
+├── generators: PCS::ProverSetup
+├── shared: JoltSharedPreprocessing
+├── bytecode: Arc<BytecodePreprocessing>        ← Full bytecode (always)
+├── bytecode_commitments: Option<TrustedBytecodeCommitments<PCS>>  ← Only in Committed mode
+└── bytecode_commitment_hints: Option<Vec<PCS::OpeningProofHint>>  ← Only in Committed mode
+
+JoltVerifierPreprocessing  ← Verifier has mode-dependent bytecode
+├── generators: PCS::VerifierSetup
+├── shared: JoltSharedPreprocessing
+└── bytecode: VerifierBytecode<PCS>        ← Full OR Committed
+
+VerifierBytecode<PCS>  ← Mode-dependent bytecode info
+├── Full(Arc<BytecodePreprocessing>)              ← Full mode
+└── Committed(TrustedBytecodeCommitments<PCS>)    ← Committed mode
+```
+
+`BytecodeMode` is the first-class “full vs committed” selector (`jolt-core/src/zkvm/config.rs`).
+
+### Trace-like `Arc` pattern (parallel to trace handling)
+
+```rust
+// Trace:
+let trace: std::sync::Arc<Vec<Cycle>> = trace.into();
+
+// Bytecode (parallel):
+let bytecode: std::sync::Arc<BytecodePreprocessing> =
+    BytecodePreprocessing::preprocess(instructions).into();
+```
+
+### Key design decisions (implemented)
+
+- `BytecodePreprocessing::preprocess()` returns `Self` (callers wrap in `Arc<Self>` as needed).
+- `JoltSharedPreprocessing::new()` takes `&BytecodePreprocessing` and retains only its length as `bytecode_size` (single source of truth for \(K\)).
+- `TrustedBytecodeCommitments<PCS>` is a trust-typed wrapper: create via `derive()` (offline preprocessing) or trusted deserialization.
+- `VerifierBytecode::as_full()` / `as_committed()` return `Result<_, ProofVerifyError>` (no panics for mismatched mode).
+
+### SDK macro API (current)
+
+The `#[jolt::provable]` macro generates a **2-call** preprocessing workflow for the common case:
+
+```rust
+let prover_pp = guest::preprocess_<func>(&mut program);
+let verifier_pp = guest::verifier_preprocessing_from_prover_<func>(&prover_pp);
+```
+
+Advanced/secondary API (still generated):
+
+- `preprocess_shared_<func>(&mut Program) -> (JoltSharedPreprocessing, BytecodePreprocessing)`
+
+### TODO (SDK): expose Committed bytecode mode end-to-end
+
+Committed mode requires **both**:
+
+1. **Committed preprocessing**: create prover preprocessing via `JoltProverPreprocessing::new_committed(...)`
+2. **Committed proving**: prove via `RV64IMACProver::gen_from_elf_with_bytecode_mode(..., BytecodeMode::Committed)`
+
+TODO items:
+
+- Generate `preprocess_committed_<func>(&mut Program) -> JoltProverPreprocessing<...>` (calls `JoltProverPreprocessing::new_committed`).
+- Generate a committed proving entrypoint (either `prove_committed_<func>` / `build_prover_committed_<func>`, or add a `bytecode_mode: BytecodeMode` parameter to the existing prover entrypoints).
+- Re-export `BytecodeMode` from the SDK host surface (or otherwise make it available to macro-generated code).
+- Keep committed mode behind an explicit opt-in until bytecode commitment derivation + Stage 8 batching are complete (`TrustedBytecodeCommitments::derive` is currently a stub).
+
 ## Problem statement (what is slow today?)
 
 ### Where the verifier is doing \(O(K)\) work
@@ -367,6 +455,7 @@ Immediate next steps:
 2. Wire BytecodeChunk into Stage 8 batching and RLC streaming; add BytecodeChunk to committed polynomial list and witness generation (`jolt-core/src/zkvm/witness.rs` **L34–L61**, **L121–L171**; `jolt-core/src/poly/rlc_polynomial.rs` **L184–L198**; `jolt-core/src/zkvm/prover.rs` **L1504–L1567**).
 3. Add/enable tests (lane ordering, padding, committed mode e2e) and remove ignores once commitments + Stage 8 batching are wired (`jolt-core/src/zkvm/tests.rs` **L426–L486**; `jolt-core/src/zkvm/prover.rs` **L395–L409**; `jolt-core/src/zkvm/verifier.rs` **L171–L177**).
 4. Consider streaming/implicit bytecode chunk representation to avoid `k_chunk * T` materialization (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L216–L239**).
+5. Expose Committed bytecode mode in the SDK (opt-in): macro-generated committed preprocessing + committed proving entrypoint / `BytecodeMode` parameter (see “TODO (SDK): expose Committed bytecode mode end-to-end” above).
 
 Concerns / risks:
 - BytecodeClaimReduction currently materializes `weight_chunks` and `bytecode_chunks` of size `k_chunk * T` (memory heavy for large bytecode) (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L216–L239**).
diff --git a/bytecode-refactor-design.md b/bytecode-refactor-design.md
deleted file mode 100644
index 6299fe341b..0000000000
--- a/bytecode-refactor-design.md
+++ /dev/null
@@ -1,202 +0,0 @@
-# Bytecode Preprocessing Refactor Design
-
-## Goal
-
-Separate bytecode preprocessing between prover and verifier based on `BytecodeMode`:
-
-- **Full mode**: Verifier has access to full bytecode (O(K) data) — current behavior
-- **Committed mode**: Verifier only sees bytecode commitments — enables succinct verification
-
-## Current State (After Refactor)
-
-```
-BytecodePreprocessing  ← O(K) data, created first via preprocess()
-├── bytecode: Vec<Instruction>
-└── pc_map: BytecodePCMapper
-
-JoltSharedPreprocessing  ← Truly shared, single source of truth for size
-├── bytecode_size: usize            ← Derived from bytecode.bytecode.len()
-├── ram: RAMPreprocessing
-├── memory_layout: MemoryLayout
-└── max_padded_trace_length: usize
-
-JoltProverPreprocessing  ← Prover always has full bytecode
-├── generators: PCS::ProverSetup
-├── shared: JoltSharedPreprocessing
-├── bytecode: Arc<BytecodePreprocessing>        ← Full bytecode (always)
-├── bytecode_commitments: Option<TrustedBytecodeCommitments<PCS>>  ← Only in Committed mode
-└── bytecode_commitment_hints: Option<Vec<PCS::OpeningProofHint>>  ← Only in Committed mode
-
-JoltVerifierPreprocessing  ← Verifier has mode-dependent bytecode
-├── generators: PCS::VerifierSetup
-├── shared: JoltSharedPreprocessing
-└── bytecode: VerifierBytecode<PCS>        ← Full OR Committed
-
-VerifierBytecode<PCS>  ← Mode-dependent bytecode info
-├── Full(Arc<BytecodePreprocessing>)              ← For Full mode
-└── Committed(TrustedBytecodeCommitments<PCS>)    ← For Committed mode
-```
-
----
-
-## The Trace-Like Pattern
-
-Bytecode preprocessing follows the same pattern as trace:
-
-```rust
-// Trace pattern:
-let trace: Arc<Vec<Cycle>> = trace.into();
-
-// Bytecode pattern (parallel):
-let bytecode: Arc<BytecodePreprocessing> = BytecodePreprocessing::preprocess(instructions).into();
-```
-
-Both use `Arc` for cheap cloning (`Arc::clone` is O(1) reference count increment).
-
----
-
-## Usage Examples
-
-### E2E Flow (Full Mode)
-
-```rust
-// 1. Decode + preprocess bytecode (returns Self, wrap in Arc)
-let (instructions, memory_init, _) = program.decode();
-let bytecode: Arc<BytecodePreprocessing> = BytecodePreprocessing::preprocess(instructions).into();
-
-// 2. Create shared preprocessing (borrows bytecode to get size)
-let shared = JoltSharedPreprocessing::new(
-    &bytecode,
-    memory_layout,
-    memory_init,
-    max_trace_length,
-);
-
-// 3. Prover (Arc::clone is O(1))
-let prover_pp = JoltProverPreprocessing::new(shared.clone(), Arc::clone(&bytecode));
-
-// 4. Verifier (Full mode)
-let verifier_pp = JoltVerifierPreprocessing::new_full(shared, generators, bytecode);
-```
-
-### E2E Flow (Committed Mode)
-
-```rust
-// 1-2. Same as above...
-let bytecode: Arc<BytecodePreprocessing> = BytecodePreprocessing::preprocess(instructions).into();
-let shared = JoltSharedPreprocessing::new(&bytecode, memory_layout, memory_init, max_trace);
-
-// 3. Prover in Committed mode (computes commitments during preprocessing)
-let prover_pp = JoltProverPreprocessing::new_committed(shared.clone(), Arc::clone(&bytecode));
-
-// 4. Verifier receives only commitments (from prover's preprocessing)
-let verifier_pp = JoltVerifierPreprocessing::new_committed(
-    shared,
-    generators,
-    prover_pp.bytecode_commitments.clone().unwrap(),
-);
-```
-
-### Accessing Bytecode Data
-
-```rust
-// Access bytecode size (always from shared - single source of truth)
-let code_size = prover_pp.shared.bytecode_size;   // ✅ Definitive source
-let code_size = verifier_pp.shared.bytecode_size; // ✅ Same
-
-// Access full bytecode (prover only, or verifier in Full mode)
-let bytecode_data = &prover_pp.bytecode;                              // Arc<BytecodePreprocessing>
-let bytecode_data = verifier_pp.bytecode.as_full()?;                  // Result<&Arc<...>, ProofVerifyError>
-let commitments = verifier_pp.bytecode.as_committed()?;               // Result<&TrustedBytecodeCommitments<PCS>, ProofVerifyError>
-```
-
----
-
-## SDK Macro Changes
-
-The generated preprocessing functions now follow the trace-like pattern:
-
-```rust
-// Old API (deprecated)
-pub fn preprocess_shared_foo(program: &mut Program) -> JoltSharedPreprocessing
-
-// New API
-pub fn preprocess_shared_foo(program: &mut Program) 
-    -> (JoltSharedPreprocessing, Arc<BytecodePreprocessing>)
-
-pub fn preprocess_prover_foo(
-    shared: JoltSharedPreprocessing,
-    bytecode: Arc<BytecodePreprocessing>,
-) -> JoltProverPreprocessing<F, PCS>
-
-pub fn preprocess_verifier_foo(
-    shared: JoltSharedPreprocessing,
-    generators: PCS::VerifierSetup,
-    bytecode: Arc<BytecodePreprocessing>,  // For Full mode
-) -> JoltVerifierPreprocessing<F, PCS>
-```
-
----
-
-## Key Design Decisions
-
-1. **`BytecodePreprocessing::preprocess()` returns `Self`** (not `Arc<Self>`)
-   - Caller uses `.into()` to wrap in Arc, just like trace
-
-2. **`JoltSharedPreprocessing::new()` takes `&BytecodePreprocessing`**
-   - Borrows to compute `bytecode_size = bytecode.bytecode.len()`
-   - Returns just `Self`, not a tuple
-
-3. **`bytecode_size` is the single source of truth**
-   - Stored in `JoltSharedPreprocessing`
-   - `BytecodePreprocessing` has no size field
-
-4. **`TrustedBytecodeCommitments<PCS>`** wrapper enforces trust model
-   - Type-level guarantee that commitments came from honest preprocessing
-   - Public `commitments: Vec<PCS::Commitment>` field for simplicity
-
-5. **No panics in `VerifierBytecode::as_full()` / `as_committed()`**
-   - Returns `Result<_, ProofVerifyError>` with `BytecodeTypeMismatch` error
-
----
-
-## Files Modified
-
-| File | Changes |
-|------|---------|
-| `jolt-core/src/zkvm/bytecode/mod.rs` | `preprocess()` returns `Self`, added `VerifierBytecode<PCS>`, `TrustedBytecodeCommitments<PCS>` |
-| `jolt-core/src/zkvm/prover.rs` | Added `bytecode`, `bytecode_commitments`, `bytecode_commitment_hints` fields |
-| `jolt-core/src/zkvm/verifier.rs` | `new()` takes `&BytecodePreprocessing`, added `bytecode_size`, removed `bytecode` |
-| `jolt-core/src/guest/prover.rs` | Updated to new pattern |
-| `jolt-core/src/guest/verifier.rs` | Updated to new pattern |
-| `jolt-sdk/macros/src/lib.rs` | Updated generated code for new API |
-| `jolt-sdk/src/host_utils.rs` | Added `BytecodePreprocessing` export |
-| `jolt-core/benches/e2e_profiling.rs` | Updated to new pattern |
-
----
-
-## Verification
-
-- ✅ `cargo fmt` clean
-- ✅ `cargo clippy -p jolt-core --tests -- -D warnings` passes
-- ✅ `cargo clippy -p jolt-sdk --benches -- -D warnings` passes
-
----
-
-## Status
-
-**Refactor Complete** — Structure for Full and Committed modes is in place.
-
-### What's Done
-- Bytecode preprocessing separated from shared preprocessing
-- `Arc<BytecodePreprocessing>` pattern (like trace)
-- `JoltSharedPreprocessing.bytecode_size` as single source of truth
-- `VerifierBytecode<PCS>` enum for mode-dependent bytecode
-- `TrustedBytecodeCommitments<PCS>` wrapper for type-safe commitments
-- All call sites updated (tests, guest/*, SDK macros, benchmarks)
-
-### What's TODO (future PRs)
-- [ ] Implement actual bytecode commitment computation in `TrustedBytecodeCommitments::derive()`
-- [ ] Add E2E tests for Committed mode
-- [ ] Exercise `BytecodeClaimReduction` sumcheck with Committed mode
-- [ ] Consider unified `JoltConfig` struct for all configuration
diff --git a/examples/alloc/src/main.rs b/examples/alloc/src/main.rs
index 1afd790d20..8845e61aaf 100644
--- a/examples/alloc/src/main.rs
+++ b/examples/alloc/src/main.rs
@@ -7,12 +7,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_alloc(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_alloc(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_alloc(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_alloc(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_alloc(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_alloc(&prover_preprocessing);
 
     let prove_alloc = guest::build_prover_alloc(program, prover_preprocessing);
     let verify_alloc = guest::build_verifier_alloc(verifier_preprocessing);
diff --git a/examples/btreemap/host/src/main.rs b/examples/btreemap/host/src/main.rs
index 011f502489..5bfb3ef5b5 100644
--- a/examples/btreemap/host/src/main.rs
+++ b/examples/btreemap/host/src/main.rs
@@ -17,19 +17,12 @@ pub fn btreemap() {
         guest::compile_btreemap(target_dir)
     });
 
-    let shared_preprocessing = step!("Preprocessing shared", {
-        guest::preprocess_shared_btreemap(&mut program)
-    });
-
     let prover_preprocessing = step!("Preprocessing prover", {
-        guest::preprocess_prover_btreemap(shared_preprocessing.clone())
+        guest::preprocess_btreemap(&mut program)
     });
 
     let verifier_preprocessing = step!("Preprocessing verifier", {
-        guest::preprocess_verifier_btreemap(
-            shared_preprocessing,
-            prover_preprocessing.generators.to_verifier_setup(),
-        )
+        guest::verifier_preprocessing_from_prover_btreemap(&prover_preprocessing)
     });
 
     let prove = step!("Building prover", {
diff --git a/examples/collatz/src/main.rs b/examples/collatz/src/main.rs
index c91450547d..1ea0415512 100644
--- a/examples/collatz/src/main.rs
+++ b/examples/collatz/src/main.rs
@@ -8,12 +8,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_collatz_convergence(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_collatz_convergence(&mut program);
-    let prover_preprocessing =
-        guest::preprocess_prover_collatz_convergence(shared_preprocessing.clone());
-    let verifier_setup = prover_preprocessing.generators.to_verifier_setup();
+    let prover_preprocessing = guest::preprocess_collatz_convergence(&mut program);
     let verifier_preprocessing =
-        guest::preprocess_verifier_collatz_convergence(shared_preprocessing, verifier_setup);
+        guest::verifier_preprocessing_from_prover_collatz_convergence(&prover_preprocessing);
 
     let prove_collatz_single =
         guest::build_prover_collatz_convergence(program, prover_preprocessing);
@@ -31,12 +28,9 @@ pub fn main() {
     // Prove/verify convergence for a range of numbers:
     let mut program = guest::compile_collatz_convergence_range(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_collatz_convergence_range(&mut program);
-    let prover_preprocessing =
-        guest::preprocess_prover_collatz_convergence_range(shared_preprocessing.clone());
-    let verifier_setup = prover_preprocessing.generators.to_verifier_setup();
+    let prover_preprocessing = guest::preprocess_collatz_convergence_range(&mut program);
     let verifier_preprocessing =
-        guest::preprocess_verifier_collatz_convergence_range(shared_preprocessing, verifier_setup);
+        guest::verifier_preprocessing_from_prover_collatz_convergence_range(&prover_preprocessing);
 
     let prove_collatz_convergence =
         guest::build_prover_collatz_convergence_range(program, prover_preprocessing);
diff --git a/examples/fibonacci/src/main.rs b/examples/fibonacci/src/main.rs
index ac2b755cad..324cfe3096 100644
--- a/examples/fibonacci/src/main.rs
+++ b/examples/fibonacci/src/main.rs
@@ -10,12 +10,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_fib(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_fib(&mut program);
-
-    let prover_preprocessing = guest::preprocess_prover_fib(shared_preprocessing.clone());
-    let verifier_setup = prover_preprocessing.generators.to_verifier_setup();
+    let prover_preprocessing = guest::preprocess_fib(&mut program);
     let verifier_preprocessing =
-        guest::preprocess_verifier_fib(shared_preprocessing, verifier_setup);
+        guest::verifier_preprocessing_from_prover_fib(&prover_preprocessing);
 
     if save_to_disk {
         serialize_and_print_size(
diff --git a/examples/hash-bench/src/main.rs b/examples/hash-bench/src/main.rs
index 181ec912c9..8c498ab3f2 100644
--- a/examples/hash-bench/src/main.rs
+++ b/examples/hash-bench/src/main.rs
@@ -6,11 +6,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_hashbench(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_hashbench(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_hashbench(shared_preprocessing.clone());
-    let verifier_setup = prover_preprocessing.generators.to_verifier_setup();
+    let prover_preprocessing = guest::preprocess_hashbench(&mut program);
     let verifier_preprocessing =
-        guest::preprocess_verifier_hashbench(shared_preprocessing, verifier_setup);
+        guest::verifier_preprocessing_from_prover_hashbench(&prover_preprocessing);
 
     let prove_hashbench = guest::build_prover_hashbench(program, prover_preprocessing);
     let verify_hashbench = guest::build_verifier_hashbench(verifier_preprocessing);
diff --git a/examples/malloc/src/main.rs b/examples/malloc/src/main.rs
index d28e99d067..39b3b955d4 100644
--- a/examples/malloc/src/main.rs
+++ b/examples/malloc/src/main.rs
@@ -4,12 +4,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_alloc(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_alloc(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_alloc(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_alloc(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_alloc(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_alloc(&prover_preprocessing);
 
     let prove = guest::build_prover_alloc(program, prover_preprocessing);
     let verify = guest::build_verifier_alloc(verifier_preprocessing);
diff --git a/examples/memory-ops/src/main.rs b/examples/memory-ops/src/main.rs
index a95af60aa0..3516b6144c 100644
--- a/examples/memory-ops/src/main.rs
+++ b/examples/memory-ops/src/main.rs
@@ -7,12 +7,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_memory_ops(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_memory_ops(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_memory_ops(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_memory_ops(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_memory_ops(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_memory_ops(&prover_preprocessing);
 
     let prove = guest::build_prover_memory_ops(program, prover_preprocessing);
     let verify = guest::build_verifier_memory_ops(verifier_preprocessing);
diff --git a/examples/merkle-tree/src/main.rs b/examples/merkle-tree/src/main.rs
index c31353402c..4a89261071 100644
--- a/examples/merkle-tree/src/main.rs
+++ b/examples/merkle-tree/src/main.rs
@@ -8,12 +8,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_merkle_tree(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_merkle_tree(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_merkle_tree(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_merkle_tree(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_merkle_tree(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_merkle_tree(&prover_preprocessing);
 
     let leaf1: &[u8] = &[5u8; 32];
     let leaf2 = [6u8; 32];
diff --git a/examples/muldiv/src/main.rs b/examples/muldiv/src/main.rs
index 7a3680e5dc..5cc95530db 100644
--- a/examples/muldiv/src/main.rs
+++ b/examples/muldiv/src/main.rs
@@ -7,12 +7,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_muldiv(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_muldiv(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_muldiv(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_muldiv(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_muldiv(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_muldiv(&prover_preprocessing);
 
     let prove = guest::build_prover_muldiv(program, prover_preprocessing);
     let verify = guest::build_verifier_muldiv(verifier_preprocessing);
diff --git a/examples/multi-function/src/main.rs b/examples/multi-function/src/main.rs
index 6d9f9da9f8..c12c081bbd 100644
--- a/examples/multi-function/src/main.rs
+++ b/examples/multi-function/src/main.rs
@@ -8,11 +8,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_add(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_add(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_add(shared_preprocessing.clone());
-    let verifier_setup = prover_preprocessing.generators.to_verifier_setup();
+    let prover_preprocessing = guest::preprocess_add(&mut program);
     let verifier_preprocessing =
-        guest::preprocess_verifier_add(shared_preprocessing, verifier_setup);
+        guest::verifier_preprocessing_from_prover_add(&prover_preprocessing);
 
     let prove_add = guest::build_prover_add(program, prover_preprocessing);
     let verify_add = guest::build_verifier_add(verifier_preprocessing);
@@ -21,12 +19,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_mul(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_mul(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_mul(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_mul(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_mul(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_mul(&prover_preprocessing);
 
     let prove_mul = guest::build_prover_mul(program, prover_preprocessing);
     let verify_mul = guest::build_verifier_mul(verifier_preprocessing);
diff --git a/examples/overflow/src/main.rs b/examples/overflow/src/main.rs
index 4a17575e70..a677dc4537 100644
--- a/examples/overflow/src/main.rs
+++ b/examples/overflow/src/main.rs
@@ -9,9 +9,7 @@ pub fn main() {
     // An overflowing stack should fail to prove.
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_overflow_stack(target_dir);
-    let shared_preprocessing = guest::preprocess_shared_overflow_stack(&mut program);
-    let prover_preprocessing =
-        guest::preprocess_prover_overflow_stack(shared_preprocessing.clone());
+    let prover_preprocessing = guest::preprocess_overflow_stack(&mut program);
     let prove_overflow_stack = guest::build_prover_overflow_stack(program, prover_preprocessing);
 
     let res = panic::catch_unwind(|| {
@@ -23,8 +21,7 @@ pub fn main() {
 
     // now lets try to overflow the heap, should also panic
     let mut program = guest::compile_overflow_heap(target_dir);
-    let shared_preprocessing = guest::preprocess_shared_overflow_heap(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_overflow_heap(shared_preprocessing.clone());
+    let prover_preprocessing = guest::preprocess_overflow_heap(&mut program);
     let prove_overflow_heap = guest::build_prover_overflow_heap(program, prover_preprocessing);
 
     let res = panic::catch_unwind(|| {
@@ -35,15 +32,11 @@ pub fn main() {
     // valid case for stack allocation, calls overflow_stack() under the hood
     // but with stack_size=8192
     let mut program = guest::compile_allocate_stack_with_increased_size(target_dir);
-
-    let shared_preprocessing =
-        guest::preprocess_shared_allocate_stack_with_increased_size(&mut program);
-    let prover_preprocessing =
-        guest::preprocess_prover_allocate_stack_with_increased_size(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_allocate_stack_with_increased_size(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_allocate_stack_with_increased_size(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_allocate_stack_with_increased_size(
+            &prover_preprocessing,
+        );
 
     let prove_allocate_stack_with_increased_size =
         guest::build_prover_allocate_stack_with_increased_size(program, prover_preprocessing);
diff --git a/examples/random/src/main.rs b/examples/random/src/main.rs
index e4456db259..0379c49bd0 100644
--- a/examples/random/src/main.rs
+++ b/examples/random/src/main.rs
@@ -7,12 +7,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_rand(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_rand(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_rand(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_rand(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_rand(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_rand(&prover_preprocessing);
 
     let prove = guest::build_prover_rand(program, prover_preprocessing);
     let verify = guest::build_verifier_rand(verifier_preprocessing);
diff --git a/examples/recover-ecdsa/src/main.rs b/examples/recover-ecdsa/src/main.rs
index 038a5c1fa7..512a59ca22 100644
--- a/examples/recover-ecdsa/src/main.rs
+++ b/examples/recover-ecdsa/src/main.rs
@@ -31,12 +31,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_recover(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_recover(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_recover(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_recover(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_recover(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_recover(&prover_preprocessing);
 
     if save_to_disk {
         serialize_and_print_size(
diff --git a/examples/secp256k1-ecdsa-verify/src/main.rs b/examples/secp256k1-ecdsa-verify/src/main.rs
index dfe38f6da8..4ebc61bcec 100644
--- a/examples/secp256k1-ecdsa-verify/src/main.rs
+++ b/examples/secp256k1-ecdsa-verify/src/main.rs
@@ -7,12 +7,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_secp256k1_ecdsa_verify(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_secp256k1_ecdsa_verify(&mut program);
-    let prover_preprocessing =
-        guest::preprocess_prover_secp256k1_ecdsa_verify(shared_preprocessing.clone());
-    let verifier_setup = prover_preprocessing.generators.to_verifier_setup();
+    let prover_preprocessing = guest::preprocess_secp256k1_ecdsa_verify(&mut program);
     let verifier_preprocessing =
-        guest::preprocess_verifier_secp256k1_ecdsa_verify(shared_preprocessing, verifier_setup);
+        guest::verifier_preprocessing_from_prover_secp256k1_ecdsa_verify(&prover_preprocessing);
 
     let prove_secp256k1_ecdsa_verify =
         guest::build_prover_secp256k1_ecdsa_verify(program, prover_preprocessing);
diff --git a/examples/sha2-chain/src/main.rs b/examples/sha2-chain/src/main.rs
index 94114c0414..f7f1ccbd60 100644
--- a/examples/sha2-chain/src/main.rs
+++ b/examples/sha2-chain/src/main.rs
@@ -7,12 +7,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_sha2_chain(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_sha2_chain(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_sha2_chain(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_sha2_chain(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_sha2_chain(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_sha2_chain(&prover_preprocessing);
 
     let prove_sha2_chain = guest::build_prover_sha2_chain(program, prover_preprocessing);
     let verify_sha2_chain = guest::build_verifier_sha2_chain(verifier_preprocessing);
diff --git a/examples/sha2-ex/src/main.rs b/examples/sha2-ex/src/main.rs
index 4bce837fb8..2d86050f25 100644
--- a/examples/sha2-ex/src/main.rs
+++ b/examples/sha2-ex/src/main.rs
@@ -7,12 +7,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_sha2(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_sha2(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_sha2(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_sha2(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_sha2(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_sha2(&prover_preprocessing);
 
     let prove_sha2 = guest::build_prover_sha2(program, prover_preprocessing);
     let verify_sha2 = guest::build_verifier_sha2(verifier_preprocessing);
diff --git a/examples/sha3-chain/src/main.rs b/examples/sha3-chain/src/main.rs
index 97e223467b..cae32b0148 100644
--- a/examples/sha3-chain/src/main.rs
+++ b/examples/sha3-chain/src/main.rs
@@ -6,12 +6,9 @@ pub fn main() {
 
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_sha3_chain(target_dir);
-    let shared_preprocessing = guest::preprocess_shared_sha3_chain(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_sha3_chain(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_sha3_chain(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_sha3_chain(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_sha3_chain(&prover_preprocessing);
 
     let prove_sha3_chain = guest::build_prover_sha3_chain(program, prover_preprocessing);
     let verify_sha3_chain = guest::build_verifier_sha3_chain(verifier_preprocessing);
diff --git a/examples/sha3-ex/src/main.rs b/examples/sha3-ex/src/main.rs
index 1b49530258..69467d6f4e 100644
--- a/examples/sha3-ex/src/main.rs
+++ b/examples/sha3-ex/src/main.rs
@@ -6,12 +6,9 @@ pub fn main() {
 
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_sha3(target_dir);
-    let shared_preprocessing = guest::preprocess_shared_sha3(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_sha3(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_sha3(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_sha3(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_sha3(&prover_preprocessing);
 
     let prove_sha3 = guest::build_prover_sha3(program, prover_preprocessing);
     let verify_sha3 = guest::build_verifier_sha3(verifier_preprocessing);
diff --git a/examples/stdlib/src/main.rs b/examples/stdlib/src/main.rs
index 8edd0fed21..8b84b31743 100644
--- a/examples/stdlib/src/main.rs
+++ b/examples/stdlib/src/main.rs
@@ -7,12 +7,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_int_to_string(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_int_to_string(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_int_to_string(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_int_to_string(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_int_to_string(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_int_to_string(&prover_preprocessing);
 
     let prove = guest::build_prover_int_to_string(program, prover_preprocessing);
     let verify = guest::build_verifier_int_to_string(verifier_preprocessing);
@@ -24,12 +21,9 @@ pub fn main() {
 
     let mut program = guest::compile_string_concat(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_string_concat(&mut program);
-    let prover_preprocessing = guest::preprocess_prover_string_concat(shared_preprocessing.clone());
-    let verifier_preprocessing = guest::preprocess_verifier_string_concat(
-        shared_preprocessing,
-        prover_preprocessing.generators.to_verifier_setup(),
-    );
+    let prover_preprocessing = guest::preprocess_string_concat(&mut program);
+    let verifier_preprocessing =
+        guest::verifier_preprocessing_from_prover_string_concat(&prover_preprocessing);
 
     let prove = guest::build_prover_string_concat(program, prover_preprocessing);
     let verify = guest::build_verifier_string_concat(verifier_preprocessing);
diff --git a/jolt-inlines/bigint/src/multiplication/mod.rs b/jolt-inlines/bigint/src/multiplication/mod.rs
index ec327f0fad..3aac420c7b 100644
--- a/jolt-inlines/bigint/src/multiplication/mod.rs
+++ b/jolt-inlines/bigint/src/multiplication/mod.rs
@@ -10,7 +10,6 @@ const OUTPUT_LIMBS: usize = 2 * INPUT_LIMBS;
 pub mod sdk;
 pub use sdk::*;
 
-#[cfg(feature = "host")]
 pub mod exec;
 #[cfg(feature = "host")]
 pub mod sequence_builder;
diff --git a/jolt-inlines/bigint/src/multiplication/sdk.rs b/jolt-inlines/bigint/src/multiplication/sdk.rs
index f927a4fb27..11ca6a8b75 100644
--- a/jolt-inlines/bigint/src/multiplication/sdk.rs
+++ b/jolt-inlines/bigint/src/multiplication/sdk.rs
@@ -33,7 +33,10 @@ pub fn bigint256_mul(lhs: [u64; INPUT_LIMBS], rhs: [u64; INPUT_LIMBS]) -> [u64;
 /// - `a` and `b` must point to at least 32 bytes of readable memory
 /// - `result` must point to at least 64 bytes of writable memory
 /// - The memory regions may overlap (result can be the same as a or b)
-#[cfg(not(feature = "host"))]
+#[cfg(all(
+    not(feature = "host"),
+    any(target_arch = "riscv32", target_arch = "riscv64")
+))]
 pub unsafe fn bigint256_mul_inline(a: *const u64, b: *const u64, result: *mut u64) {
     use super::{BIGINT256_MUL_FUNCT3, BIGINT256_MUL_FUNCT7, INLINE_OPCODE};
     core::arch::asm!(
@@ -59,7 +62,10 @@ pub unsafe fn bigint256_mul_inline(a: *const u64, b: *const u64, result: *mut u6
 /// - All pointers must be valid and properly aligned for u64 access (8-byte alignment)
 /// - `a` and `b` must point to at least 32 bytes of readable memory
 /// - `result` must point to at least 64 bytes of writable memory
-#[cfg(feature = "host")]
+#[cfg(any(
+    feature = "host",
+    not(any(target_arch = "riscv32", target_arch = "riscv64"))
+))]
 pub unsafe fn bigint256_mul_inline(a: *const u64, b: *const u64, result: *mut u64) {
     use crate::multiplication::exec;
 
diff --git a/jolt-sdk/macros/src/lib.rs b/jolt-sdk/macros/src/lib.rs
index 68c1d8afc9..9f47cc678a 100644
--- a/jolt-sdk/macros/src/lib.rs
+++ b/jolt-sdk/macros/src/lib.rs
@@ -70,9 +70,8 @@ impl MacroBuilder {
         let analyze_fn = self.make_analyze_function();
         let trace_to_file_fn = self.make_trace_to_file_func();
         let compile_fn = self.make_compile_func();
+        let preprocess_fn = self.make_preprocess_func();
         let preprocess_shared_fn = self.make_preprocess_shared_func();
-        let preprocess_prover_fn = self.make_preprocess_prover_func();
-        let preprocess_verifier_fn = self.make_preprocess_verifier_func();
         let verifier_preprocess_from_prover_fn = self.make_preprocess_from_prover_func();
         let commit_trusted_advice_fn = self.make_commit_trusted_advice_func();
         let prove_fn = self.make_prove_func();
@@ -101,9 +100,8 @@ impl MacroBuilder {
             #analyze_fn
             #trace_to_file_fn
             #compile_fn
+            #preprocess_fn
             #preprocess_shared_fn
-            #preprocess_prover_fn
-            #preprocess_verifier_fn
             #verifier_preprocess_from_prover_fn
             #commit_trusted_advice_fn
             #prove_fn
@@ -427,7 +425,7 @@ impl MacroBuilder {
         }
     }
 
-    fn make_preprocess_shared_func(&self) -> TokenStream2 {
+    fn make_preprocess_func(&self) -> TokenStream2 {
         let attributes = parse_attributes(&self.attr);
         let max_trace_length = proc_macro2::Literal::u64_unsuffixed(attributes.max_trace_length);
         let max_input_size = proc_macro2::Literal::u64_unsuffixed(attributes.max_input_size);
@@ -441,14 +439,14 @@ impl MacroBuilder {
         let imports = self.make_imports();
 
         let fn_name = self.get_func_name();
-        let preprocess_shared_fn_name =
-            Ident::new(&format!("preprocess_shared_{fn_name}"), fn_name.span());
+        let preprocess_fn_name = Ident::new(&format!("preprocess_{fn_name}"), fn_name.span());
         quote! {
             #[cfg(all(not(target_arch = "wasm32"), not(feature = "guest")))]
-            pub fn #preprocess_shared_fn_name(program: &mut jolt::host::Program)
-                -> (jolt::JoltSharedPreprocessing, jolt::BytecodePreprocessing)
+            pub fn #preprocess_fn_name(program: &mut jolt::host::Program)
+                -> jolt::JoltProverPreprocessing<jolt::F, jolt::PCS>
             {
                 #imports
+
                 let (instructions, memory_init, program_size) = program.decode();
                 let memory_config = MemoryConfig {
                     max_input_size: #max_input_size,
@@ -460,53 +458,60 @@ impl MacroBuilder {
                     program_size: Some(program_size),
                 };
                 let memory_layout = MemoryLayout::new(&memory_config);
+
                 let bytecode = BytecodePreprocessing::preprocess(instructions);
-                let preprocessing = JoltSharedPreprocessing::new(
+                let shared = JoltSharedPreprocessing::new(
                     &bytecode,
                     memory_layout,
                     memory_init,
                     #max_trace_length,
                 );
-                (preprocessing, bytecode)
+                JoltProverPreprocessing::new(shared, std::sync::Arc::new(bytecode))
             }
         }
     }
 
-    fn make_preprocess_prover_func(&self) -> TokenStream2 {
-        let imports = self.make_imports();
-
-        let fn_name = self.get_func_name();
-        let preprocess_prover_fn_name =
-            Ident::new(&format!("preprocess_prover_{fn_name}"), fn_name.span());
-        quote! {
-            #[cfg(all(not(target_arch = "wasm32"), not(feature = "guest")))]
-            pub fn #preprocess_prover_fn_name(
-                shared_preprocessing: jolt::JoltSharedPreprocessing,
-                bytecode: std::sync::Arc<jolt::BytecodePreprocessing>,
-            ) -> jolt::JoltProverPreprocessing<jolt::F, jolt::PCS>
-            {
-                #imports
-                JoltProverPreprocessing::new(shared_preprocessing, bytecode)
-            }
-        }
-    }
-
-    fn make_preprocess_verifier_func(&self) -> TokenStream2 {
+    fn make_preprocess_shared_func(&self) -> TokenStream2 {
+        let attributes = parse_attributes(&self.attr);
+        let max_trace_length = proc_macro2::Literal::u64_unsuffixed(attributes.max_trace_length);
+        let max_input_size = proc_macro2::Literal::u64_unsuffixed(attributes.max_input_size);
+        let max_output_size = proc_macro2::Literal::u64_unsuffixed(attributes.max_output_size);
+        let max_untrusted_advice_size =
+            proc_macro2::Literal::u64_unsuffixed(attributes.max_untrusted_advice_size);
+        let max_trusted_advice_size =
+            proc_macro2::Literal::u64_unsuffixed(attributes.max_trusted_advice_size);
+        let stack_size = proc_macro2::Literal::u64_unsuffixed(attributes.stack_size);
+        let memory_size = proc_macro2::Literal::u64_unsuffixed(attributes.memory_size);
         let imports = self.make_imports();
 
         let fn_name = self.get_func_name();
-        let preprocess_verifier_fn_name =
-            Ident::new(&format!("preprocess_verifier_{fn_name}"), fn_name.span());
+        let preprocess_shared_fn_name =
+            Ident::new(&format!("preprocess_shared_{fn_name}"), fn_name.span());
         quote! {
             #[cfg(all(not(target_arch = "wasm32"), not(feature = "guest")))]
-            pub fn #preprocess_verifier_fn_name(
-                shared_preprocess: jolt::JoltSharedPreprocessing,
-                generators: <jolt::PCS as jolt::CommitmentScheme>::VerifierSetup,
-                bytecode: std::sync::Arc<jolt::BytecodePreprocessing>,
-            ) -> jolt::JoltVerifierPreprocessing<jolt::F, jolt::PCS>
+            pub fn #preprocess_shared_fn_name(program: &mut jolt::host::Program)
+                -> (jolt::JoltSharedPreprocessing, jolt::BytecodePreprocessing)
             {
                 #imports
-                JoltVerifierPreprocessing::new_full(shared_preprocess, generators, bytecode)
+                let (instructions, memory_init, program_size) = program.decode();
+                let memory_config = MemoryConfig {
+                    max_input_size: #max_input_size,
+                    max_output_size: #max_output_size,
+                    max_untrusted_advice_size: #max_untrusted_advice_size,
+                    max_trusted_advice_size: #max_trusted_advice_size,
+                    stack_size: #stack_size,
+                    memory_size: #memory_size,
+                    program_size: Some(program_size),
+                };
+                let memory_layout = MemoryLayout::new(&memory_config);
+                let bytecode = BytecodePreprocessing::preprocess(instructions);
+                let preprocessing = JoltSharedPreprocessing::new(
+                    &bytecode,
+                    memory_layout,
+                    memory_init,
+                    #max_trace_length,
+                );
+                (preprocessing, bytecode)
             }
         }
     }
@@ -886,6 +891,7 @@ impl MacroBuilder {
                 RV64IMACVerifier,
                 RV64IMACProof,
                 host::Program,
+                BytecodePreprocessing,
                 JoltProverPreprocessing,
                 MemoryConfig,
                 MemoryLayout,
diff --git a/src/main.rs b/src/main.rs
index 771806164e..84f4aded53 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -222,12 +222,9 @@ pub fn main() {
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_fib(target_dir);
 
-    let shared_preprocessing = guest::preprocess_shared_fib(&mut program);
-
-    let prover_preprocessing = guest::preprocess_prover_fib(shared_preprocessing.clone());
-    let verifier_setup = prover_preprocessing.generators.to_verifier_setup();
+    let prover_preprocessing = guest::preprocess_fib(&mut program);
     let verifier_preprocessing =
-        guest::preprocess_verifier_fib(shared_preprocessing, verifier_setup);
+        guest::verifier_preprocessing_from_prover_fib(&prover_preprocessing);
 
     let prove_fib = guest::build_prover_fib(program, prover_preprocessing);
     let verify_fib = guest::build_verifier_fib(verifier_preprocessing);

From 0e30fa717001731163840cfa9bc6af56d0f0ab9b Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Tue, 20 Jan 2026 05:06:19 -0800
Subject: [PATCH 10/16] feat(zkvm): committed bytecode mode (AddressMajor)

---
 bytecode-commitment-progress.md               | 153 +++++++++++--
 examples/fibonacci/src/main.rs                |  24 +-
 .../src/poly/commitment/dory/dory_globals.rs  |  32 ++-
 .../src/poly/commitment/dory/wrappers.rs      |  52 +++--
 jolt-core/src/poly/rlc_polynomial.rs          | 100 +++++++-
 jolt-core/src/zkvm/bytecode/chunks.rs         | 147 ++++++++++++
 jolt-core/src/zkvm/bytecode/mod.rs            |  43 +++-
 .../src/zkvm/claim_reductions/bytecode.rs     | 204 +++++------------
 jolt-core/src/zkvm/config.rs                  |  22 +-
 jolt-core/src/zkvm/prover.rs                  | 132 +++++++++--
 jolt-core/src/zkvm/tests.rs                   |  31 ++-
 jolt-core/src/zkvm/verifier.rs                |  98 ++++++--
 jolt-sdk/macros/src/lib.rs                    | 216 +++++++++++++++++-
 jolt-sdk/src/host_utils.rs                    |   1 +
 14 files changed, 1004 insertions(+), 251 deletions(-)
 create mode 100644 jolt-core/src/zkvm/bytecode/chunks.rs

diff --git a/bytecode-commitment-progress.md b/bytecode-commitment-progress.md
index cd084b8e02..66c17b7db9 100644
--- a/bytecode-commitment-progress.md
+++ b/bytecode-commitment-progress.md
@@ -76,19 +76,24 @@ Advanced/secondary API (still generated):
 
 - `preprocess_shared_<func>(&mut Program) -> (JoltSharedPreprocessing, BytecodePreprocessing)`
 
-### TODO (SDK): expose Committed bytecode mode end-to-end
+### SDK status (2026-01-20): Committed bytecode mode exposed end-to-end
 
 Committed mode requires **both**:
 
 1. **Committed preprocessing**: create prover preprocessing via `JoltProverPreprocessing::new_committed(...)`
 2. **Committed proving**: prove via `RV64IMACProver::gen_from_elf_with_bytecode_mode(..., BytecodeMode::Committed)`
 
-TODO items:
+**Done in this branch:**
+- Macro generates committed APIs:
+  - `preprocess_committed_<func>`
+  - `build_prover_committed_<func>`
+  - `prove_committed_<func>`
+- `BytecodeMode` is re-exported from the SDK host surface (`jolt-sdk/src/host_utils.rs`).
+- Example CLI surfaced (`examples/fibonacci --committed-bytecode`), using the committed APIs.
 
-- Generate `preprocess_committed_<func>(&mut Program) -> JoltProverPreprocessing<...>` (calls `JoltProverPreprocessing::new_committed`).
-- Generate a committed proving entrypoint (either `prove_committed_<func>` / `build_prover_committed_<func>`, or add a `bytecode_mode: BytecodeMode` parameter to the existing prover entrypoints).
-- Re-export `BytecodeMode` from the SDK host surface (or otherwise make it available to macro-generated code).
-- Keep committed mode behind an explicit opt-in until bytecode commitment derivation + Stage 8 batching are complete (`TrustedBytecodeCommitments::derive` is currently a stub).
+**Remaining SDK work (polish):**
+- Decide whether “committed” should remain separate entrypoints or become a `bytecode_mode: BytecodeMode` parameter on the default APIs.
+- Optionally propagate `--committed-bytecode` to other examples / docs.
 
 ## Problem statement (what is slow today?)
 
@@ -448,18 +453,119 @@ High-level status (diff vs main):
 - Booleanity split into address/cycle sumchecks; advice round alignment updated (`jolt-core/src/subprotocols/booleanity.rs` **L497–L736**; `jolt-core/src/poly/opening_proof.rs` **L157–L158**; `jolt-core/src/zkvm/witness.rs` **L285–L287**; `jolt-core/src/zkvm/claim_reductions/advice.rs` **L521–L526**).
 - BytecodeReadRaf split + staged Val claims + committed verifier Stage 6a path wired (`jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L1261–L1288**, **L1554–L1636**; `jolt-core/src/zkvm/verifier.rs` **L430–L455**).
 - BytecodeClaimReduction implemented with canonical lane ordering and BytecodeChunk openings (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L1–L15**, **L193–L236**, **L470–L488**, **L494–L671**).
-- Bytecode commitment plumbing is in place (BytecodeMode + preprocessing + VerifierBytecode), but commitment derivation and Stage 8 batching are still TODO (`jolt-core/src/zkvm/config.rs` **L26–L35**; `jolt-core/src/zkvm/prover.rs` **L1688–L1760**; `jolt-core/src/zkvm/verifier.rs` **L840–L976**; `jolt-core/src/zkvm/bytecode/mod.rs` **L41–L59**; `jolt-core/src/poly/rlc_polynomial.rs` **L184–L198**).
+- Bytecode commitment plumbing is in place (BytecodeMode + preprocessing + VerifierBytecode), and commitment derivation + Stage 8 batching/folding are now implemented (see next update).
 
 Immediate next steps:
-1. Implement `TrustedBytecodeCommitments::derive` and add BytecodeChunk commitments + hints; consider new Dory context if needed (`jolt-core/src/zkvm/bytecode/mod.rs` **L41–L59**; `jolt-core/src/zkvm/prover.rs` **L1688–L1760**).
-2. Wire BytecodeChunk into Stage 8 batching and RLC streaming; add BytecodeChunk to committed polynomial list and witness generation (`jolt-core/src/zkvm/witness.rs` **L34–L61**, **L121–L171**; `jolt-core/src/poly/rlc_polynomial.rs` **L184–L198**; `jolt-core/src/zkvm/prover.rs` **L1504–L1567**).
-3. Add/enable tests (lane ordering, padding, committed mode e2e) and remove ignores once commitments + Stage 8 batching are wired (`jolt-core/src/zkvm/tests.rs` **L426–L486**; `jolt-core/src/zkvm/prover.rs` **L395–L409**; `jolt-core/src/zkvm/verifier.rs` **L171–L177**).
-4. Consider streaming/implicit bytecode chunk representation to avoid `k_chunk * T` materialization (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L216–L239**).
-5. Expose Committed bytecode mode in the SDK (opt-in): macro-generated committed preprocessing + committed proving entrypoint / `BytecodeMode` parameter (see “TODO (SDK): expose Committed bytecode mode end-to-end” above).
+1. Add/enable tests (lane ordering, committed mode e2e, Stage 8 folding) and remove ignores once committed mode is fully wired (`jolt-core/src/zkvm/tests.rs` **L426–L486**).
+2. Optimize bytecode VMV contribution in streaming RLC (current path iterates `K * k_chunk * num_chunks`) (`jolt-core/src/poly/rlc_polynomial.rs` **L420–L485**).
+3. Enforce or document the `log_T >= log_K_bytecode` requirement for Stage 8 folding; decide whether to lift it (see “Handling arbitrary `log_K` vs `log_T`” below).
+4. Finish SDK polish for Committed bytecode mode (opt-in): the macro-generated committed preprocessing + committed proving entrypoints now exist; decide whether to keep them as separate entrypoints or replace them with a `BytecodeMode` parameter (see “SDK status (2026-01-20): Committed bytecode mode exposed end-to-end” above).
 
 Concerns / risks:
-- BytecodeClaimReduction currently materializes `weight_chunks` and `bytecode_chunks` of size `k_chunk * T` (memory heavy for large bytecode) (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L216–L239**).
-- BytecodeChunk polynomials are placeholders and not yet supported by streaming RLC or witness generation (`jolt-core/src/zkvm/witness.rs` **L121–L123**, **L169–L171**; `jolt-core/src/poly/rlc_polynomial.rs` **L184–L198**).
+- BytecodeClaimReduction still materializes `weight_chunks` and `bytecode_chunks`, now of size `k_chunk * K_bytecode` rather than `k_chunk * T`; this can still be large for big bytecode (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L190–L218**).
+- Streaming RLC bytecode contribution currently iterates `K * k_chunk * num_chunks` (needs optimization) (`jolt-core/src/poly/rlc_polynomial.rs` **L420–L485**).
+
+---
+
+## Progress update (2026-01-20, continued)
+
+High-level status (diff vs previous update):
+- BytecodeClaimReduction now runs over `log_K` (no `log_T` padding) and consumes `r_bc` directly (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L73–L215**).
+- Canonical lane ordering + lane value logic centralized in `bytecode::chunks`, used by both commitment derivation and claim reduction (`jolt-core/src/zkvm/bytecode/chunks.rs` **L11–L138**).
+- `TrustedBytecodeCommitments::derive` implemented and commits in a dedicated `DoryContext::Bytecode`, carrying `log_k_chunk` + `bytecode_len` metadata (`jolt-core/src/zkvm/bytecode/mod.rs` **L33–L79**; `jolt-core/src/poly/commitment/dory/dory_globals.rs` **L154–L171**).
+- Stage 8 now *folds bytecode chunk openings into the joint opening proof* via a Lagrange selector over missing cycle vars (prover+verifier) (`jolt-core/src/zkvm/prover.rs` **L1618–L1664**; `jolt-core/src/zkvm/verifier.rs` **L741–L788**).
+- Streaming RLC now supports bytecode chunk contributions in the VMV pass (`jolt-core/src/poly/rlc_polynomial.rs` **L420–L539**).
+
+---
+
+## Progress update (2026-01-20, AddressMajor correctness)
+
+Status:
+- **Committed bytecode now passes in both layouts** (CycleMajor + AddressMajor). In particular,
+  `fib_e2e_committed_bytecode_address_major` passes.
+
+Root cause:
+- Under `DoryLayout::AddressMajor`, the bytecode chunk coefficient order makes
+  `BindingOrder::LowToHigh` bind **lane/address** bits first. But `BytecodeClaimReduction` Phase 1
+  (Stage 6b) must bind **cycle** bits first to match the staged `r_bc` semantics.
+
+Fix:
+- Keep bytecode commitments in the layout’s native order for Dory opening, but in the **claim
+  reduction prover** permute AddressMajor chunk coefficients into **CycleMajor** order so Phase 1
+  binds cycle variables first.
+- Implemented by `permute_address_major_to_cycle_major` and applied in
+  `BytecodeClaimReductionProver::initialize` (`jolt-core/src/zkvm/claim_reductions/bytecode.rs`).
+
+---
+
+## Remaining work (as of 2026-01-20)
+
+Cleanup / correctness hardening:
+- Remove temporary debug-only code in `jolt-core/src/zkvm/tests.rs` (env-var gated bytecode/Dory open checks).
+- Add the new module file to git: `jolt-core/src/zkvm/bytecode/chunks.rs` is currently untracked in `git status`.
+
+Perf / scalability:
+- Optimize Stage 8 bytecode VMV contribution (currently iterates `K * k_chunk * num_chunks`) (`jolt-core/src/poly/rlc_polynomial.rs`).
+- Consider making `BytecodeClaimReduction` avoid materializing `k_chunk * K_bytecode` dense polynomials (streaming / implicit evaluation).
+
+Repo hygiene:
+- Before committing: run `cargo fmt` and `cargo clippy` and fix warnings.
+
+## Handling arbitrary `log_K` vs `log_T` (design sketch, not pursued)
+
+We may want to allow `log_K_bytecode > log_T` without a separate opening proof by **padding the cycle dimension** and embedding all trace-derived polynomials into a larger main opening domain.
+
+### Padding semantics: selector vs repetition
+
+There are two incompatible padding semantics today:
+
+1) **Selector padding (zero outside domain)**  
+   Embed a polynomial `P(a, c)` defined on `c ∈ {0,1}^{log_T}` into a larger `c' ∈ {0,1}^{log_T'}` (`log_T' = max(log_T, log_K)`) via:
+   - `P'(a, c, z) = P(a, c) · ∏_{i=1..Δ} (1 - z_i)`, where `Δ = log_T' - log_T`
+   - So `P' = P` when `z=0…0` and **0** elsewhere.
+
+2) **Repetition padding (independent vars)**  
+   Treat `P` as independent of the extra variables, so it repeats across them.
+   - In sumcheck batching, inactive rounds are dummy constants, which implies repetition.
+   - Batched sumcheck multiplies the input claim by `2^Δ` (see `BatchedSumcheck` in `jolt-core/src/subprotocols/sumcheck.rs` **L52–L91**).
+
+**Important:** selector padding and repetition padding are not equivalent; they lead to different claims and different opening proofs. Current sumcheck batching implements repetition padding.
+
+### What would need to change (high-level steps)
+
+To support arbitrary `log_K` and `log_T` while keeping a *single* Stage 8 opening:
+
+1) **Stage 6b round count becomes `log_T' = max(log_T, log_K)`**
+   - All cycle-phase instances must run in a batched sumcheck of length `log_T'`.
+   - Instances with `log_T` rounds become inactive for the first `Δ` rounds (front-loaded).
+
+2) **BatchedSumcheck must support selector padding**
+   - Today, inactive rounds use a constant univariate and the input claim is scaled by `2^Δ` (repetition semantics).
+   - To get selector padding, inactive rounds must instead use `H(z) = prev · (1 - z)` and **no `2^Δ` scaling**.
+   - This requires new per-instance hooks (inactive-round univariate + scaling policy) in `BatchedSumcheck` (`jolt-core/src/subprotocols/sumcheck.rs` **L52–L91**).
+
+3) **Main Dory matrix size uses `T'`**
+   - Stage 8’s main context must be initialized with `T'`, not the trace length.
+   - This affects the unified opening point and all VMV paths (`jolt-core/src/zkvm/prover.rs` **L1493–L1498**, `jolt-core/src/zkvm/verifier.rs` **L653–L661**).
+
+4) **All trace-derived polynomials must be embedded with selector padding**
+   - Add a Lagrange selector `∏(1 - r_extra)` to **every** claim whose cycle dimension is `log_T`.
+   - This includes dense polys and all RA polys (not just bytecode). The bytecode folding logic already does this (see `jolt-core/src/zkvm/prover.rs` **L1618–L1664** and `jolt-core/src/zkvm/verifier.rs` **L741–L788**).
+
+5) **Commitment and streaming need a zero-padding mode**
+   - Current trace padding uses `Cycle::NoOp`, which does **not** imply zero rows for all polynomials.
+   - For selector padding, padded cycles must contribute zero for **all** polynomials; this requires a new “zero row” padding mode in witness generation and streaming VMV.
+
+### Why this is not pursued now
+
+This change is cross-cutting and affects:
+- Batched sumcheck semantics,
+- Stage 6b scheduling,
+- Main Dory context sizing,
+- Stage 8 claim embedding for *all* polynomials,
+- Streaming witness/VMV paths.
+
+Given scope and risk, we are **not pursuing arbitrary `log_K` vs `log_T` support right now**. The current design assumes `log_T >= log_K` for the folded Stage 8 bytecode opening path.
 
 ---
 
@@ -730,10 +836,10 @@ We will also add **new `VirtualPolynomial` variants** for scalar claims that are
 
 ### Step 6 — Bytecode commitments in preprocessing + transcript
 
-**Status (2026-01-20)**: PARTIAL  
+**Status (2026-01-20)**: DONE (functionality)  
 - Bytecode commitment plumbing added (types + preprocessing + proof field): `jolt-core/src/zkvm/bytecode/mod.rs` **L30–L111**; `jolt-core/src/zkvm/prover.rs` **L1688–L1760**; `jolt-core/src/zkvm/verifier.rs` **L840–L976**; `jolt-core/src/zkvm/proof_serialization.rs` **L43–L47**.  
-- Commitment derivation still TODO: `jolt-core/src/zkvm/bytecode/mod.rs` **L41–L59**.  
-- Canonical lane ordering implemented in BytecodeClaimReduction: `jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L494–L671**.
+- Commitment derivation implemented: `TrustedBytecodeCommitments::derive` in `jolt-core/src/zkvm/bytecode/mod.rs`.  
+- Canonical lane ordering + lane materialization centralized in `jolt-core/src/zkvm/bytecode/chunks.rs` (used by both commitment derivation and claim reduction).
 
 #### 6.1 New Dory context + storage
 
@@ -756,9 +862,9 @@ This ordering must be used consistently by:
 
 ### Step 7 — Stage 8 batching integration (bytecode polynomials)
 
-**Status (2026-01-20)**: NOT STARTED / TODO  
-- BytecodeChunk polynomials not yet supported by witness generation or streaming RLC (panic placeholders): `jolt-core/src/zkvm/witness.rs` **L121–L123**, **L169–L171**; `jolt-core/src/poly/rlc_polynomial.rs` **L184–L198**.  
-- Stage 8 currently batches dense + RA + advice only (no BytecodeChunk): `jolt-core/src/zkvm/prover.rs` **L1504–L1567**.
+**Status (2026-01-20)**: DONE (functionality)  
+- Stage 8 folds bytecode chunk openings into the joint opening proof via a Lagrange selector over missing cycle vars (`jolt-core/src/zkvm/prover.rs` and `jolt-core/src/zkvm/verifier.rs`).
+- Streaming RLC includes bytecode chunk contributions in the VMV pass (`jolt-core/src/poly/rlc_polynomial.rs`).
 
 Stage 8 currently builds a streaming `RLCPolynomial` from:
 - dense trace polys
@@ -791,9 +897,10 @@ This is analogous to `adjust_trace_length_for_advice` in `jolt-core/src/zkvm/pro
 
 ### Step 9 — Tests / validation
 
-**Status (2026-01-20)**: PARTIAL  
-- New e2e harness + bytecode-mode detection tests added locally: `jolt-core/src/zkvm/tests.rs` **L1–L486** (file currently untracked).  
-- Committed-mode e2e tests currently ignored: `jolt-core/src/zkvm/tests.rs` **L426–L447**.
+**Status (2026-01-20)**: DONE (core coverage)  
+- Lane ordering + chunking tests added.
+- E2E committed-bytecode tests enabled and passing for both layouts (CycleMajor + AddressMajor).
+- Note: `jolt-core/src/zkvm/tests.rs` still contains some env-var gated debug helpers; remove once stabilized.
 
 - Unit tests:
   - lane ordering + chunking (k_chunk=16 ⇒ 28 chunks, k_chunk=256 ⇒ 2 chunks)
diff --git a/examples/fibonacci/src/main.rs b/examples/fibonacci/src/main.rs
index 324cfe3096..58bfd5e05f 100644
--- a/examples/fibonacci/src/main.rs
+++ b/examples/fibonacci/src/main.rs
@@ -6,11 +6,16 @@ pub fn main() {
     tracing_subscriber::fmt::init();
 
     let save_to_disk = std::env::args().any(|arg| arg == "--save");
+    let committed_bytecode = std::env::args().any(|arg| arg == "--committed-bytecode");
 
     let target_dir = "/tmp/jolt-guest-targets";
     let mut program = guest::compile_fib(target_dir);
 
-    let prover_preprocessing = guest::preprocess_fib(&mut program);
+    let prover_preprocessing = if committed_bytecode {
+        guest::preprocess_committed_fib(&mut program)
+    } else {
+        guest::preprocess_fib(&mut program)
+    };
     let verifier_preprocessing =
         guest::verifier_preprocessing_from_prover_fib(&prover_preprocessing);
 
@@ -23,7 +28,6 @@ pub fn main() {
         .expect("Could not serialize preprocessing.");
     }
 
-    let prove_fib = guest::build_prover_fib(program, prover_preprocessing);
     let verify_fib = guest::build_verifier_fib(verifier_preprocessing);
 
     let program_summary = guest::analyze_fib(10);
@@ -36,8 +40,22 @@ pub fn main() {
     info!("Trace file written to: {trace_file}.");
 
     let now = Instant::now();
-    let (output, proof, io_device) = prove_fib(50);
+    let (output, proof, io_device) = if committed_bytecode {
+        let prove_fib = guest::build_prover_committed_fib(program, prover_preprocessing);
+        prove_fib(50)
+    } else {
+        let prove_fib = guest::build_prover_fib(program, prover_preprocessing);
+        prove_fib(50)
+    };
     info!("Prover runtime: {} s", now.elapsed().as_secs_f64());
+    info!(
+        "bytecode mode: {}",
+        if committed_bytecode {
+            "Committed"
+        } else {
+            "Full"
+        }
+    );
 
     if save_to_disk {
         serialize_and_print_size("Proof", "/tmp/fib_proof.bin", &proof)
diff --git a/jolt-core/src/poly/commitment/dory/dory_globals.rs b/jolt-core/src/poly/commitment/dory/dory_globals.rs
index c4c2ebe421..80e8e304cf 100644
--- a/jolt-core/src/poly/commitment/dory/dory_globals.rs
+++ b/jolt-core/src/poly/commitment/dory/dory_globals.rs
@@ -151,7 +151,12 @@ static mut UNTRUSTED_ADVICE_T: OnceLock<usize> = OnceLock::new();
 static mut UNTRUSTED_ADVICE_MAX_NUM_ROWS: OnceLock<usize> = OnceLock::new();
 static mut UNTRUSTED_ADVICE_NUM_COLUMNS: OnceLock<usize> = OnceLock::new();
 
-// Context tracking: 0=Main, 1=TrustedAdvice, 2=UntrustedAdvice
+// Bytecode globals
+static mut BYTECODE_T: OnceLock<usize> = OnceLock::new();
+static mut BYTECODE_MAX_NUM_ROWS: OnceLock<usize> = OnceLock::new();
+static mut BYTECODE_NUM_COLUMNS: OnceLock<usize> = OnceLock::new();
+
+// Context tracking: 0=Main, 1=TrustedAdvice, 2=UntrustedAdvice, 3=Bytecode
 static CURRENT_CONTEXT: AtomicU8 = AtomicU8::new(0);
 
 // Layout tracking: 0=CycleMajor, 1=AddressMajor
@@ -163,6 +168,7 @@ pub enum DoryContext {
     Main = 0,
     TrustedAdvice = 1,
     UntrustedAdvice = 2,
+    Bytecode = 3,
 }
 
 impl From<u8> for DoryContext {
@@ -171,6 +177,7 @@ impl From<u8> for DoryContext {
             0 => DoryContext::Main,
             1 => DoryContext::TrustedAdvice,
             2 => DoryContext::UntrustedAdvice,
+            3 => DoryContext::Bytecode,
             _ => panic!("Invalid DoryContext value: {value}"),
         }
     }
@@ -305,6 +312,9 @@ impl DoryGlobals {
                 DoryContext::UntrustedAdvice => {
                     let _ = UNTRUSTED_ADVICE_MAX_NUM_ROWS.set(max_num_rows);
                 }
+                DoryContext::Bytecode => {
+                    let _ = BYTECODE_MAX_NUM_ROWS.set(max_num_rows);
+                }
             }
         }
     }
@@ -321,6 +331,9 @@ impl DoryGlobals {
                 DoryContext::UntrustedAdvice => *UNTRUSTED_ADVICE_MAX_NUM_ROWS
                     .get()
                     .expect("untrusted_advice max_num_rows not initialized"),
+                DoryContext::Bytecode => *BYTECODE_MAX_NUM_ROWS
+                    .get()
+                    .expect("bytecode max_num_rows not initialized"),
             }
         }
     }
@@ -338,6 +351,9 @@ impl DoryGlobals {
                 DoryContext::UntrustedAdvice => {
                     let _ = UNTRUSTED_ADVICE_NUM_COLUMNS.set(num_columns);
                 }
+                DoryContext::Bytecode => {
+                    let _ = BYTECODE_NUM_COLUMNS.set(num_columns);
+                }
             }
         }
     }
@@ -354,6 +370,9 @@ impl DoryGlobals {
                 DoryContext::UntrustedAdvice => *UNTRUSTED_ADVICE_NUM_COLUMNS
                     .get()
                     .expect("untrusted_advice num_columns not initialized"),
+                DoryContext::Bytecode => *BYTECODE_NUM_COLUMNS
+                    .get()
+                    .expect("bytecode num_columns not initialized"),
             }
         }
     }
@@ -371,6 +390,9 @@ impl DoryGlobals {
                 DoryContext::UntrustedAdvice => {
                     let _ = UNTRUSTED_ADVICE_T.set(t);
                 }
+                DoryContext::Bytecode => {
+                    let _ = BYTECODE_T.set(t);
+                }
             }
         }
     }
@@ -387,6 +409,7 @@ impl DoryGlobals {
                 DoryContext::UntrustedAdvice => *UNTRUSTED_ADVICE_T
                     .get()
                     .expect("untrusted_advice t not initialized"),
+                DoryContext::Bytecode => *BYTECODE_T.get().expect("bytecode t not initialized"),
             }
         }
     }
@@ -414,7 +437,7 @@ impl DoryGlobals {
     /// # Arguments
     /// * `K` - Maximum address space size (K in OneHot polynomials)
     /// * `T` - Maximum trace length (cycle count)
-    /// * `context` - The Dory context to initialize (Main, TrustedAdvice, or UntrustedAdvice)
+    /// * `context` - The Dory context to initialize (Main, TrustedAdvice, UntrustedAdvice, Bytecode)
     /// * `layout` - Optional layout for the Dory matrix. Only applies to Main context.
     ///   If `Some(layout)`, sets the layout. If `None`, leaves the existing layout
     ///   unchanged (defaults to `CycleMajor` after `reset()`). Ignored for advice contexts.
@@ -466,6 +489,11 @@ impl DoryGlobals {
             let _ = UNTRUSTED_ADVICE_T.take();
             let _ = UNTRUSTED_ADVICE_MAX_NUM_ROWS.take();
             let _ = UNTRUSTED_ADVICE_NUM_COLUMNS.take();
+
+            // Reset bytecode globals
+            let _ = BYTECODE_T.take();
+            let _ = BYTECODE_MAX_NUM_ROWS.take();
+            let _ = BYTECODE_NUM_COLUMNS.take();
         }
 
         // Reset context to Main
diff --git a/jolt-core/src/poly/commitment/dory/wrappers.rs b/jolt-core/src/poly/commitment/dory/wrappers.rs
index 431387d7c2..a4c3fa5eb9 100644
--- a/jolt-core/src/poly/commitment/dory/wrappers.rs
+++ b/jolt-core/src/poly/commitment/dory/wrappers.rs
@@ -227,28 +227,50 @@ where
     let dory_layout = DoryGlobals::get_layout();
 
     // Dense polynomials (all scalar variants except OneHot/RLC) are committed row-wise.
-    // Under AddressMajor, dense coefficients occupy evenly-spaced columns, so each row
-    // commitment uses `cycles_per_row` bases (one per occupied column).
-    let (dense_affine_bases, dense_chunk_size): (Vec<_>, usize) = match (dory_context, dory_layout)
+    //
+    // In `Main` + `AddressMajor`, we have two *representations* in this repo:
+    // - **Trace-dense**: length == T (e.g., `RdInc`, `RamInc`). These are embedded into the
+    //   main matrix by occupying evenly-spaced columns, so each row commitment uses
+    //   `cycles_per_row` bases (one per occupied column).
+    // - **Matrix-dense**: length == K*T (e.g., bytecode chunk polynomials). These occupy the
+    //   full matrix and must use the full `row_len` bases.
+    let is_trace_dense = match poly {
+        MultilinearPolynomial::LargeScalars(p) => p.Z.len() == DoryGlobals::get_T(),
+        MultilinearPolynomial::BoolScalars(p) => p.coeffs.len() == DoryGlobals::get_T(),
+        MultilinearPolynomial::U8Scalars(p) => p.coeffs.len() == DoryGlobals::get_T(),
+        MultilinearPolynomial::U16Scalars(p) => p.coeffs.len() == DoryGlobals::get_T(),
+        MultilinearPolynomial::U32Scalars(p) => p.coeffs.len() == DoryGlobals::get_T(),
+        MultilinearPolynomial::U64Scalars(p) => p.coeffs.len() == DoryGlobals::get_T(),
+        MultilinearPolynomial::U128Scalars(p) => p.coeffs.len() == DoryGlobals::get_T(),
+        MultilinearPolynomial::I64Scalars(p) => p.coeffs.len() == DoryGlobals::get_T(),
+        MultilinearPolynomial::I128Scalars(p) => p.coeffs.len() == DoryGlobals::get_T(),
+        MultilinearPolynomial::S128Scalars(p) => p.coeffs.len() == DoryGlobals::get_T(),
+        MultilinearPolynomial::OneHot(_) | MultilinearPolynomial::RLC(_) => false,
+    };
+
+    let is_trace_dense_main_addr_major = dory_context == DoryContext::Main
+        && dory_layout == DoryLayout::AddressMajor
+        && is_trace_dense;
+
+    let (dense_affine_bases, dense_chunk_size): (Vec<_>, usize) = if is_trace_dense_main_addr_major
     {
-        (DoryContext::Main, DoryLayout::AddressMajor) => {
-            let cycles_per_row = DoryGlobals::address_major_cycles_per_row();
-            let bases: Vec<_> = g1_slice
-                .par_iter()
-                .take(row_len)
-                .step_by(row_len / cycles_per_row)
-                .map(|g| g.0.into_affine())
-                .collect();
-            (bases, cycles_per_row)
-        }
-        _ => (
+        let cycles_per_row = DoryGlobals::address_major_cycles_per_row();
+        let bases: Vec<_> = g1_slice
+            .par_iter()
+            .take(row_len)
+            .step_by(row_len / cycles_per_row)
+            .map(|g| g.0.into_affine())
+            .collect();
+        (bases, cycles_per_row)
+    } else {
+        (
             g1_slice
                 .par_iter()
                 .take(row_len)
                 .map(|g| g.0.into_affine())
                 .collect(),
             row_len,
-        ),
+        )
     };
 
     let result: Vec<ArkG1> = match poly {
diff --git a/jolt-core/src/poly/rlc_polynomial.rs b/jolt-core/src/poly/rlc_polynomial.rs
index 5a657549b1..51bc6a69b2 100644
--- a/jolt-core/src/poly/rlc_polynomial.rs
+++ b/jolt-core/src/poly/rlc_polynomial.rs
@@ -4,8 +4,10 @@ use crate::poly::multilinear_polynomial::MultilinearPolynomial;
 use crate::utils::accumulation::Acc6S;
 use crate::utils::math::{s64_from_diff_u64s, Math};
 use crate::utils::thread::unsafe_allocate_zero_vec;
+use crate::zkvm::bytecode::chunks::{lane_value, total_lanes};
 use crate::zkvm::config::OneHotParams;
-use crate::zkvm::instruction::LookupQuery;
+use crate::zkvm::instruction::{Flags, InstructionLookup, LookupQuery};
+use crate::zkvm::lookup_table::LookupTables;
 use crate::zkvm::ram::remap_address;
 use crate::zkvm::{bytecode::BytecodePreprocessing, witness::CommittedPolynomial};
 use allocative::Allocative;
@@ -16,7 +18,7 @@ use rayon::prelude::*;
 use std::collections::HashMap;
 use std::sync::Arc;
 use tracer::ChunksIterator;
-use tracer::{instruction::Cycle, LazyTraceIterator};
+use tracer::{instruction::Cycle, instruction::Instruction, LazyTraceIterator};
 
 #[derive(Clone, Debug)]
 pub struct RLCStreamingData {
@@ -56,6 +58,8 @@ impl TraceSource {
 pub struct StreamingRLCContext<F: JoltField> {
     pub dense_polys: Vec<(CommittedPolynomial, F)>,
     pub onehot_polys: Vec<(CommittedPolynomial, F)>,
+    /// Bytecode chunk polynomials with their RLC coefficients.
+    pub bytecode_polys: Vec<(usize, F)>,
     /// Advice polynomials with their RLC coefficients.
     /// These are NOT streamed from trace - they're passed in directly.
     pub advice_polys: Vec<(F, MultilinearPolynomial<F>)>,
@@ -179,6 +183,7 @@ impl<F: JoltField> RLCPolynomial<F> {
 
         let mut dense_polys = Vec::new();
         let mut onehot_polys = Vec::new();
+        let mut bytecode_polys = Vec::new();
         let mut advice_polys = Vec::new();
 
         for (poly_id, coeff) in poly_ids.iter().zip(coefficients.iter()) {
@@ -192,9 +197,9 @@ impl<F: JoltField> RLCPolynomial<F> {
                     onehot_polys.push((*poly_id, *coeff));
                 }
                 CommittedPolynomial::BytecodeChunk(_) => {
-                    // Bytecode chunk polynomials are staged for later integration into Stage 8
-                    // streaming (see bytecode commitment track).
-                    panic!("BytecodeChunk polynomials are not yet supported in streaming RLC");
+                    if let CommittedPolynomial::BytecodeChunk(idx) = poly_id {
+                        bytecode_polys.push((*idx, *coeff));
+                    }
                 }
                 CommittedPolynomial::TrustedAdvice | CommittedPolynomial::UntrustedAdvice => {
                     // Advice polynomials are passed in directly (not streamed from trace)
@@ -211,6 +216,7 @@ impl<F: JoltField> RLCPolynomial<F> {
             streaming_context: Some(Arc::new(StreamingRLCContext {
                 dense_polys,
                 onehot_polys,
+                bytecode_polys,
                 advice_polys,
                 trace_source,
                 preprocessing,
@@ -404,6 +410,87 @@ guardrail in gen_from_trace should ensure sigma_main >= sigma_a."
             });
     }
 
+    /// Adds the contribution of the committed bytecode chunk polynomials to `result`, the
+    /// vector-matrix product being accumulated for the RLC polynomial.
+    ///
+    /// Bytecode chunk polynomials are embedded in the top-left block by fixing the extra cycle
+    /// variables to 0, so only cycles in `[0, bytecode_len)` are visited. Each nonzero lane value
+    /// adds `left_vec[row] * coeff * value` to `result[col]`; rows beyond `left_vec` are skipped.
+    fn vmp_bytecode_contribution(
+        result: &mut [F],
+        left_vec: &[F],
+        num_columns: usize,
+        ctx: &StreamingRLCContext<F>,
+    ) {
+        if ctx.bytecode_polys.is_empty() {
+            return;
+        }
+
+        let layout = DoryGlobals::get_layout();
+        let k_chunk = ctx.one_hot_params.k_chunk;
+        let instructions = &ctx.preprocessing.bytecode.bytecode;
+        let num_instrs = instructions.len();
+        // Column count of the balanced split for a `k_chunk * num_instrs` coefficient table.
+        let (sigma_bc, _nu_bc) = DoryGlobals::balanced_sigma_nu((k_chunk * num_instrs).log_2());
+        let bytecode_cols = 1usize << sigma_bc;
+        let lane_count = total_lanes();
+
+        debug_assert!(
+            bytecode_cols <= num_columns,
+            "Bytecode columns (2^{{sigma_bc}}={bytecode_cols}) must fit in main num_columns={num_columns}; \
+guardrail in gen_from_trace should ensure sigma_main >= sigma_bc."
+        );
+
+        for &(chunk_idx, coeff) in ctx.bytecode_polys.iter() {
+            if coeff.is_zero() {
+                continue;
+            }
+            // Lanes at or past `total_lanes()` carry no bytecode field and are never visited.
+            let first_lane = chunk_idx * k_chunk;
+            let lanes_in_chunk = k_chunk.min(lane_count.saturating_sub(first_lane));
+            for (cycle, instr) in instructions.iter().enumerate() {
+                // Decode the instruction once; every lane of this cycle reads from these values.
+                let normalized = instr.normalize();
+                let operands = &normalized.operands;
+                let circuit_flags = <Instruction as Flags>::circuit_flags(instr);
+                let instr_flags = <Instruction as Flags>::instruction_flags(instr);
+                let lookup_idx = <Instruction as InstructionLookup<XLEN>>::lookup_table(instr)
+                    .map(|t| LookupTables::<XLEN>::enum_index(&t));
+                let raf_flag =
+                    !crate::zkvm::instruction::InterleavedBitsMarker::is_interleaved_operands(
+                        &circuit_flags,
+                    );
+
+                let unexpanded_pc = F::from_u64(normalized.address as u64);
+                let imm = F::from_i128(operands.imm);
+
+                for lane in 0..lanes_in_chunk {
+                    let value = lane_value::<F>(
+                        first_lane + lane,
+                        operands.rs1,
+                        operands.rs2,
+                        operands.rd,
+                        unexpanded_pc,
+                        imm,
+                        &circuit_flags,
+                        &instr_flags,
+                        lookup_idx,
+                        raf_flag,
+                    );
+                    if value.is_zero() {
+                        continue;
+                    }
+                    // Split the layout's flat index into (row, column) of the bytecode matrix.
+                    let flat = layout.address_cycle_to_index(lane, cycle, k_chunk, num_instrs);
+                    let (row, col) = (flat / bytecode_cols, flat % bytecode_cols);
+                    if let Some(&row_weight) = left_vec.get(row) {
+                        result[col] += row_weight * coeff * value;
+                    }
+                }
+            }
+        }
+    }
+
     /// Streaming VMP implementation that generates rows on-demand from trace.
     /// Achieves O(sqrt(n)) space complexity by lazily generating the witness.
     /// Single pass through trace for both dense and one-hot polynomials.
@@ -455,6 +542,7 @@ guardrail in gen_from_trace should ensure sigma_main >= sigma_a."
         let mut result = materialized.vector_matrix_product(left_vec);
 
         Self::vmp_advice_contribution(&mut result, left_vec, num_columns, ctx);
+        Self::vmp_bytecode_contribution(&mut result, left_vec, num_columns, ctx);
 
         result
     }
@@ -578,6 +666,7 @@ guardrail in gen_from_trace should ensure sigma_main >= sigma_a."
 
         // Advice contribution is small and independent of the trace; add it after the streamed pass.
         Self::vmp_advice_contribution(&mut result, left_vec, num_columns, ctx);
+        Self::vmp_bytecode_contribution(&mut result, left_vec, num_columns, ctx);
         result
     }
 
@@ -632,6 +721,7 @@ guardrail in gen_from_trace should ensure sigma_main >= sigma_a."
 
         // Advice contribution is small and independent of the trace; add it after the streamed pass.
         Self::vmp_advice_contribution(&mut result, left_vec, num_columns, ctx);
+        Self::vmp_bytecode_contribution(&mut result, left_vec, num_columns, ctx);
         result
     }
 }
diff --git a/jolt-core/src/zkvm/bytecode/chunks.rs b/jolt-core/src/zkvm/bytecode/chunks.rs
new file mode 100644
index 0000000000..991818edbf
--- /dev/null
+++ b/jolt-core/src/zkvm/bytecode/chunks.rs
@@ -0,0 +1,147 @@
+use crate::field::JoltField;
+use crate::poly::commitment::dory::DoryGlobals;
+use crate::poly::multilinear_polynomial::MultilinearPolynomial;
+use crate::utils::thread::unsafe_allocate_zero_vec;
+use crate::zkvm::bytecode::BytecodePreprocessing;
+use crate::zkvm::instruction::{
+    Flags, InstructionLookup, NUM_CIRCUIT_FLAGS, NUM_INSTRUCTION_FLAGS,
+};
+use crate::zkvm::lookup_table::LookupTables;
+use common::constants::{REGISTER_COUNT, XLEN};
+use rayon::prelude::*;
+use tracer::instruction::Instruction;
+
+/// Number of lanes (committed field slots) per bytecode instruction.
+pub const fn total_lanes() -> usize {
+    let register_lanes = 3 * (REGISTER_COUNT as usize); // rs1, rs2, rd one-hot lanes
+    let scalar_lanes = 2; // unexpanded_pc, imm
+    let flag_lanes = NUM_CIRCUIT_FLAGS + NUM_INSTRUCTION_FLAGS;
+    let lookup_lanes = <LookupTables<XLEN> as strum::EnumCount>::COUNT;
+    let raf_lanes = 1; // raf flag
+    register_lanes + scalar_lanes + flag_lanes + lookup_lanes + raf_lanes
+}
+
+/// Returns the value stored in lane `global_lane` for one decoded instruction.
+///
+/// Lanes are laid out in this order (see [`total_lanes`] for the matching count):
+/// - `rs1`, `rs2`, `rd`: one-hot blocks of `REGISTER_COUNT` lanes each,
+/// - `unexpanded_pc` and `imm`: one scalar lane each, returned unchanged,
+/// - circuit flags, then instruction flags: one boolean lane per flag,
+/// - lookup table: one-hot block with one lane per `LookupTables` variant,
+/// - raf flag: the final lane.
+///
+/// `global_lane` must be below [`total_lanes`]; this is only checked in debug builds.
+#[allow(clippy::too_many_arguments)]
+#[inline(always)]
+pub fn lane_value<F: JoltField>(
+    global_lane: usize,
+    rs1: Option<u8>,
+    rs2: Option<u8>,
+    rd: Option<u8>,
+    unexpanded_pc: F,
+    imm: F,
+    circuit_flags: &[bool; NUM_CIRCUIT_FLAGS],
+    instr_flags: &[bool; NUM_INSTRUCTION_FLAGS],
+    lookup_idx: Option<usize>,
+    raf_flag: bool,
+) -> F {
+    // Exclusive end of each lane group, accumulated in lane order.
+    let reg_count = REGISTER_COUNT as usize;
+    let rs1_end = reg_count;
+    let rs2_end = rs1_end + reg_count;
+    let rd_end = rs2_end + reg_count;
+    let unexpanded_pc_lane = rd_end;
+    let imm_lane = unexpanded_pc_lane + 1;
+    let circuit_start = imm_lane + 1;
+    let circuit_end = circuit_start + NUM_CIRCUIT_FLAGS;
+    let instr_end = circuit_end + NUM_INSTRUCTION_FLAGS;
+    let lookup_end = instr_end + <LookupTables<XLEN> as strum::EnumCount>::COUNT;
+
+    // One-hot register lane: 1 iff `reg` names the register at `offset` within its block.
+    let register_hit = |reg: Option<u8>, offset: usize| F::from_bool(reg == Some(offset as u8));
+
+    if global_lane < rs1_end {
+        register_hit(rs1, global_lane)
+    } else if global_lane < rs2_end {
+        register_hit(rs2, global_lane - rs1_end)
+    } else if global_lane < rd_end {
+        register_hit(rd, global_lane - rs2_end)
+    } else if global_lane == unexpanded_pc_lane {
+        unexpanded_pc
+    } else if global_lane == imm_lane {
+        imm
+    } else if global_lane < circuit_end {
+        F::from_bool(circuit_flags[global_lane - circuit_start])
+    } else if global_lane < instr_end {
+        F::from_bool(instr_flags[global_lane - circuit_end])
+    } else if global_lane < lookup_end {
+        F::from_bool(lookup_idx == Some(global_lane - instr_end))
+    } else {
+        // Only the raf flag lane remains.
+        debug_assert_eq!(global_lane, lookup_end);
+        F::from_bool(raf_flag)
+    }
+}
+
+/// Builds one dense polynomial per chunk of `2^log_k_chunk` bytecode lanes.
+///
+/// Chunk `i` covers lanes `[i * k_chunk, (i + 1) * k_chunk)` of every instruction; each value is
+/// stored at the index the current Dory layout assigns to `(lane, instruction)`.
+#[tracing::instrument(skip_all, name = "bytecode::build_bytecode_chunks")]
+pub fn build_bytecode_chunks<F: JoltField>(
+    bytecode: &BytecodePreprocessing,
+    log_k_chunk: usize,
+) -> Vec<MultilinearPolynomial<F>> {
+    let k_chunk = 1usize << log_k_chunk;
+    let bytecode_len = bytecode.bytecode.len();
+    let total = total_lanes();
+    let num_chunks = total.div_ceil(k_chunk);
+
+    (0..num_chunks)
+        .into_par_iter()
+        .map(|chunk_idx| {
+            // The layout is the same for every coefficient, so look it up once per chunk.
+            let layout = DoryGlobals::get_layout();
+            let mut coeffs = unsafe_allocate_zero_vec(k_chunk * bytecode_len);
+            for (k, instr) in bytecode.bytecode.iter().enumerate() {
+                let normalized = instr.normalize();
+                let circuit_flags = <Instruction as Flags>::circuit_flags(instr);
+                let instr_flags = <Instruction as Flags>::instruction_flags(instr);
+                let lookup_idx = <Instruction as InstructionLookup<XLEN>>::lookup_table(instr)
+                    .map(|t| LookupTables::<XLEN>::enum_index(&t));
+                let raf_flag =
+                    !crate::zkvm::instruction::InterleavedBitsMarker::is_interleaved_operands(
+                        &circuit_flags,
+                    );
+
+                let unexpanded_pc = F::from_u64(normalized.address as u64);
+                let imm = F::from_i128(normalized.operands.imm);
+                let rs1 = normalized.operands.rs1;
+                let rs2 = normalized.operands.rs2;
+                let rd = normalized.operands.rd;
+
+                for lane in 0..k_chunk {
+                    let global_lane = chunk_idx * k_chunk + lane;
+                    if global_lane >= total {
+                        break;
+                    }
+                    let value = lane_value::<F>(
+                        global_lane,
+                        rs1,
+                        rs2,
+                        rd,
+                        unexpanded_pc,
+                        imm,
+                        &circuit_flags,
+                        &instr_flags,
+                        lookup_idx,
+                        raf_flag,
+                    );
+                    let idx = layout.address_cycle_to_index(lane, k, k_chunk, bytecode_len);
+                    coeffs[idx] = value;
+                }
+            }
+            MultilinearPolynomial::from(coeffs)
+        })
+        .collect()
+}
diff --git a/jolt-core/src/zkvm/bytecode/mod.rs b/jolt-core/src/zkvm/bytecode/mod.rs
index 65695c7b4f..7c0f41a3c7 100644
--- a/jolt-core/src/zkvm/bytecode/mod.rs
+++ b/jolt-core/src/zkvm/bytecode/mod.rs
@@ -8,8 +8,12 @@ use common::constants::{ALIGNMENT_FACTOR_BYTECODE, RAM_START_ADDRESS};
 use tracer::instruction::{Cycle, Instruction};
 
 use crate::poly::commitment::commitment_scheme::CommitmentScheme;
+use crate::poly::commitment::dory::{DoryContext, DoryGlobals};
 use crate::utils::errors::ProofVerifyError;
+use crate::zkvm::bytecode::chunks::{build_bytecode_chunks, total_lanes};
+use rayon::prelude::*;
 
+pub(crate) mod chunks;
 pub mod read_raf_checking;
 
 /// Bytecode commitments that were derived from actual bytecode.
@@ -31,6 +35,10 @@ pub struct TrustedBytecodeCommitments<PCS: CommitmentScheme> {
     /// The bytecode chunk commitments.
     /// Trust is enforced by the type - create via `derive()` or deserialize from trusted source.
     pub commitments: Vec<PCS::Commitment>,
+    /// log2(k_chunk) used for lane chunking.
+    pub log_k_chunk: u8,
+    /// Bytecode length (power-of-two padded).
+    pub bytecode_len: usize,
 }
 
 impl<PCS: CommitmentScheme> TrustedBytecodeCommitments<PCS> {
@@ -40,22 +48,33 @@ impl<PCS: CommitmentScheme> TrustedBytecodeCommitments<PCS> {
     /// Returns trusted commitments + hints for opening proofs.
     #[tracing::instrument(skip_all, name = "TrustedBytecodeCommitments::derive")]
     pub fn derive(
-        _bytecode: &BytecodePreprocessing,
-        _generators: &PCS::ProverSetup,
+        bytecode: &BytecodePreprocessing,
+        generators: &PCS::ProverSetup,
+        log_k_chunk: usize,
     ) -> (Self, Vec<PCS::OpeningProofHint>) {
-        // TODO: Implement bytecode chunk polynomial commitment computation.
-        // This will:
-        // 1. Build bytecode chunk polynomials based on lane ordering
-        //    (see bytecode-commitment-progress.md for the canonical ordering)
-        // 2. Commit each polynomial using PCS
-        // 3. Return commitments and opening hints (e.g., Dory tier-1 data)
-        //
-        // For now, return empty vectors as placeholder.
+        let k_chunk = 1usize << log_k_chunk;
+        let bytecode_len = bytecode.bytecode.len();
+        let num_chunks = total_lanes().div_ceil(k_chunk);
+
+        let _guard =
+            DoryGlobals::initialize_context(k_chunk, bytecode_len, DoryContext::Bytecode, None);
+        let _ctx = DoryGlobals::with_context(DoryContext::Bytecode);
+
+        let bytecode_chunks = build_bytecode_chunks::<PCS::Field>(bytecode, log_k_chunk);
+        debug_assert_eq!(bytecode_chunks.len(), num_chunks);
+
+        let (commitments, hints): (Vec<_>, Vec<_>) = bytecode_chunks
+            .par_iter()
+            .map(|poly| PCS::commit(poly, generators))
+            .unzip();
+
         (
             Self {
-                commitments: Vec::new(),
+                commitments,
+                log_k_chunk: log_k_chunk as u8,
+                bytecode_len,
             },
-            Vec::new(),
+            hints,
         )
     }
 }
diff --git a/jolt-core/src/zkvm/claim_reductions/bytecode.rs b/jolt-core/src/zkvm/claim_reductions/bytecode.rs
index 31e64f94f3..303cc22435 100644
--- a/jolt-core/src/zkvm/claim_reductions/bytecode.rs
+++ b/jolt-core/src/zkvm/claim_reductions/bytecode.rs
@@ -20,9 +20,9 @@ use std::sync::Arc;
 use allocative::Allocative;
 use itertools::Itertools;
 use rayon::prelude::*;
-use strum::EnumCount;
 
 use crate::field::JoltField;
+use crate::poly::commitment::dory::{DoryGlobals, DoryLayout};
 use crate::poly::eq_poly::EqPolynomial;
 use crate::poly::multilinear_polynomial::{
     BindingOrder, MultilinearPolynomial, PolynomialBinding, PolynomialEvaluation,
@@ -37,27 +37,45 @@ use crate::subprotocols::sumcheck_verifier::{SumcheckInstanceParams, SumcheckIns
 use crate::transcripts::Transcript;
 use crate::utils::math::Math;
 use crate::utils::thread::unsafe_allocate_zero_vec;
+use crate::zkvm::bytecode::chunks::{build_bytecode_chunks, total_lanes};
 use crate::zkvm::bytecode::read_raf_checking::BytecodeReadRafSumcheckParams;
 use crate::zkvm::bytecode::BytecodePreprocessing;
 use crate::zkvm::instruction::{
-    CircuitFlags, Flags, InstructionFlags, InstructionLookup, NUM_CIRCUIT_FLAGS,
-    NUM_INSTRUCTION_FLAGS,
+    CircuitFlags, InstructionFlags, NUM_CIRCUIT_FLAGS, NUM_INSTRUCTION_FLAGS,
 };
 use crate::zkvm::lookup_table::LookupTables;
 use crate::zkvm::witness::{CommittedPolynomial, VirtualPolynomial};
 use common::constants::{REGISTER_COUNT, XLEN};
+use strum::EnumCount;
 
 const DEGREE_BOUND: usize = 2;
 const NUM_VAL_STAGES: usize = 5;
 
-/// Total lanes (authoritative ordering; see design doc).
-const fn total_lanes() -> usize {
-    3 * (REGISTER_COUNT as usize) // rs1, rs2, rd one-hot lanes
-        + 2 // unexpanded_pc, imm
-        + NUM_CIRCUIT_FLAGS
-        + NUM_INSTRUCTION_FLAGS
-        + LookupTables::<XLEN>::COUNT
-        + 1 // raf flag
+/// Reorders a dense `k_chunk x t_size` coefficient table from AddressMajor to CycleMajor order.
+///
+/// With `DoryLayout::AddressMajor`, committed bytecode chunks store entry `(lane, cycle)` at
+/// index `cycle * k_chunk + lane`, so `BindingOrder::LowToHigh` binds **lane** bits first.
+///
+/// The claim reduction sumcheck has to bind **cycle** bits first in Stage 6b, so the table is
+/// rewritten in `DoryLayout::CycleMajor` order, i.e. index `lane * t_size + cycle`. This is a
+/// pure index permutation (a variable renaming): evaluations still match the committed
+/// polynomial when the opening point is interpreted in the unified `[lane || cycle]` order.
+///
+/// `coeffs` is expected to hold exactly `k_chunk * t_size` entries (checked in debug builds).
+fn permute_address_major_to_cycle_major<F: JoltField>(
+    coeffs: Vec<F>,
+    k_chunk: usize,
+    t_size: usize,
+) -> Vec<F> {
+    debug_assert_eq!(coeffs.len(), k_chunk * t_size);
+    let mut out: Vec<F> = unsafe_allocate_zero_vec(k_chunk * t_size);
+    for (idx_out, slot) in out.iter_mut().enumerate() {
+        // CycleMajor: idx_out = address * T + cycle
+        let (lane, cycle) = (idx_out / t_size, idx_out % t_size);
+        // AddressMajor: idx_in = cycle * K + address
+        *slot = coeffs[cycle * k_chunk + lane];
+    }
+    out
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Allocative)]
@@ -71,11 +89,11 @@ pub struct BytecodeClaimReductionParams<F: JoltField> {
     pub phase: BytecodeReductionPhase,
     pub eta: F,
     pub eta_powers: [F; NUM_VAL_STAGES],
-    pub log_t: usize,
+    pub log_k: usize,
     pub log_k_chunk: usize,
     pub num_chunks: usize,
-    /// Bytecode address point, embedded into `log_t` bits by prefixing MSB zeros (BE).
-    pub r_bc_ext: OpeningPoint<BIG_ENDIAN, F>,
+    /// Bytecode address point (log_K bits, big-endian).
+    pub r_bc: OpeningPoint<BIG_ENDIAN, F>,
     /// Per-chunk lane weight tables (length = k_chunk) for `W_eta`.
     pub chunk_lane_weights: Vec<Vec<F>>,
     /// (little-endian) challenges used in the cycle phase.
@@ -88,14 +106,7 @@ impl<F: JoltField> BytecodeClaimReductionParams<F> {
         accumulator: &dyn OpeningAccumulator<F>,
         transcript: &mut impl Transcript,
     ) -> Self {
-        let log_t = bytecode_read_raf_params.log_T;
         let log_k = bytecode_read_raf_params.log_K;
-        if log_t < log_k {
-            panic!(
-                "BytecodeClaimReduction requires log_T >= log_K_bytecode (got log_T={log_t}, log_K={log_k}). \
-                 Pad trace length to at least bytecode_len when enabling bytecode commitment/reduction."
-            );
-        }
 
         let eta: F = transcript.challenge_scalar();
         let mut eta_powers = [F::one(); NUM_VAL_STAGES];
@@ -108,9 +119,6 @@ impl<F: JoltField> BytecodeClaimReductionParams<F> {
             VirtualPolynomial::BytecodeReadRafAddrClaim,
             SumcheckId::BytecodeReadRafAddressPhase,
         );
-        let mut r_bc_ext: Vec<F::Challenge> = vec![F::Challenge::from(0u128); log_t - r_bc.len()];
-        r_bc_ext.extend_from_slice(&r_bc.r);
-        let r_bc_ext = OpeningPoint::<BIG_ENDIAN, F>::new(r_bc_ext);
 
         let log_k_chunk = bytecode_read_raf_params.one_hot_params.log_k_chunk;
         let k_chunk = 1 << log_k_chunk;
@@ -128,10 +136,10 @@ impl<F: JoltField> BytecodeClaimReductionParams<F> {
             phase: BytecodeReductionPhase::CycleVariables,
             eta,
             eta_powers,
-            log_t,
+            log_k,
             log_k_chunk,
             num_chunks,
-            r_bc_ext,
+            r_bc,
             chunk_lane_weights,
             cycle_var_challenges: vec![],
         }
@@ -167,7 +175,7 @@ impl<F: JoltField> SumcheckInstanceParams<F> for BytecodeClaimReductionParams<F>
 
     fn num_rounds(&self) -> usize {
         match self.phase {
-            BytecodeReductionPhase::CycleVariables => self.log_t,
+            BytecodeReductionPhase::CycleVariables => self.log_k,
             BytecodeReductionPhase::LaneVariables => self.log_k_chunk,
         }
     }
@@ -205,12 +213,13 @@ impl<F: JoltField> BytecodeClaimReductionProver<F> {
         params: BytecodeClaimReductionParams<F>,
         bytecode: Arc<BytecodePreprocessing>,
     ) -> Self {
-        let log_t = params.log_t;
-        let t_size = 1 << log_t;
+        let log_k = params.log_k;
+        let t_size = 1 << log_k;
         let k_chunk = 1 << params.log_k_chunk;
+        let layout = DoryGlobals::get_layout();
 
-        // Eq table over the (embedded) bytecode address point.
-        let eq_r_bc = EqPolynomial::<F>::evals(&params.r_bc_ext.r);
+        // Eq table over the bytecode address point.
+        let eq_r_bc = EqPolynomial::<F>::evals(&params.r_bc.r);
         debug_assert_eq!(eq_r_bc.len(), t_size);
 
         // Build per-chunk weight polynomials as an outer product (lane_weight ⊗ eq_r_bc).
@@ -222,9 +231,12 @@ impl<F: JoltField> BytecodeClaimReductionProver<F> {
                 let mut coeffs: Vec<F> = unsafe_allocate_zero_vec(k_chunk * t_size);
                 for lane in 0..k_chunk {
                     let w = lane_weights[lane];
-                    let base = lane * t_size;
                     for k in 0..t_size {
-                        coeffs[base + k] = w * eq_r_bc[k];
+                        // Claim reduction always uses CycleMajor ordering so that
+                        // `BindingOrder::LowToHigh` binds cycle bits first in Stage 6b.
+                        let idx =
+                            DoryLayout::CycleMajor.address_cycle_to_index(lane, k, k_chunk, t_size);
+                        coeffs[idx] = w * eq_r_bc[k];
                     }
                 }
                 MultilinearPolynomial::from(coeffs)
@@ -233,57 +245,19 @@ impl<F: JoltField> BytecodeClaimReductionProver<F> {
 
         // Build per-chunk bytecode polynomials B_i(lane, k).
         let bytecode_len = bytecode.bytecode.len();
-        let total = total_lanes();
-        let bytecode_chunks: Vec<MultilinearPolynomial<F>> = (0..params.num_chunks)
-            .into_par_iter()
-            .map(|chunk_idx| {
-                let mut coeffs: Vec<F> = unsafe_allocate_zero_vec(k_chunk * t_size);
-                for k in 0..t_size {
-                    if k >= bytecode_len {
-                        break;
-                    }
-                    let instr = &bytecode.bytecode[k];
-                    let normalized = instr.normalize();
-                    let circuit_flags = instr.circuit_flags();
-                    let instr_flags = instr.instruction_flags();
-                    let lookup_idx = instr
-                        .lookup_table()
-                        .map(|t| LookupTables::<XLEN>::enum_index(&t));
-                    let raf_flag =
-                        !crate::zkvm::instruction::InterleavedBitsMarker::is_interleaved_operands(
-                            &circuit_flags,
-                        );
-
-                    // Common scalars
-                    let unexpanded_pc = F::from_u64(normalized.address as u64);
-                    let imm = F::from_i128(normalized.operands.imm);
-                    let rs1 = normalized.operands.rs1;
-                    let rs2 = normalized.operands.rs2;
-                    let rd = normalized.operands.rd;
-
-                    for lane in 0..k_chunk {
-                        let global_lane = chunk_idx * k_chunk + lane;
-                        if global_lane >= total {
-                            break;
-                        }
-                        let value = lane_value::<F>(
-                            global_lane,
-                            rs1,
-                            rs2,
-                            rd,
-                            unexpanded_pc,
-                            imm,
-                            &circuit_flags,
-                            &instr_flags,
-                            lookup_idx,
-                            raf_flag,
-                        );
-                        coeffs[lane * t_size + k] = value;
-                    }
+        debug_assert_eq!(bytecode_len, t_size);
+        let mut bytecode_chunks = build_bytecode_chunks::<F>(&bytecode, params.log_k_chunk);
+        if layout == DoryLayout::AddressMajor {
+            // Permute committed AddressMajor coefficient order into CycleMajor for the reduction.
+            for poly in bytecode_chunks.iter_mut() {
+                if let MultilinearPolynomial::LargeScalars(p) = poly {
+                    let old = std::mem::take(&mut p.Z);
+                    p.Z = permute_address_major_to_cycle_major(old, k_chunk, t_size);
+                } else {
+                    unreachable!("bytecode chunks are dense field polynomials");
                 }
-                MultilinearPolynomial::from(coeffs)
-            })
-            .collect();
+            }
+        }
 
         debug_assert_eq!(bytecode_chunks.len(), params.num_chunks);
         debug_assert_eq!(weight_chunks.len(), params.num_chunks);
@@ -436,7 +410,7 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
                 let opening_point = params.normalize_opening_point(sumcheck_challenges);
                 let (r_lane, r_cycle) = opening_point.split_at(params.log_k_chunk);
 
-                let eq_eval = EqPolynomial::<F>::mle(&r_cycle.r, &params.r_bc_ext.r);
+                let eq_eval = EqPolynomial::<F>::mle(&r_cycle.r, &params.r_bc.r);
 
                 // Evaluate each chunk's lane-weight polynomial at r_lane and combine with chunk openings.
                 let mut sum = F::zero();
@@ -608,65 +582,3 @@ fn compute_chunk_lane_weights<F: JoltField>(
         })
         .collect_vec()
 }
-
-#[allow(clippy::too_many_arguments)]
-#[inline(always)]
-fn lane_value<F: JoltField>(
-    global_lane: usize,
-    rs1: Option<u8>,
-    rs2: Option<u8>,
-    rd: Option<u8>,
-    unexpanded_pc: F,
-    imm: F,
-    circuit_flags: &[bool; NUM_CIRCUIT_FLAGS],
-    instr_flags: &[bool; NUM_INSTRUCTION_FLAGS],
-    lookup_idx: Option<usize>,
-    raf_flag: bool,
-) -> F {
-    let reg_count = REGISTER_COUNT as usize;
-    let rs1_start = 0usize;
-    let rs2_start = rs1_start + reg_count;
-    let rd_start = rs2_start + reg_count;
-    let unexp_pc_idx = rd_start + reg_count;
-    let imm_idx = unexp_pc_idx + 1;
-    let circuit_start = imm_idx + 1;
-    let instr_start = circuit_start + NUM_CIRCUIT_FLAGS;
-    let lookup_start = instr_start + NUM_INSTRUCTION_FLAGS;
-    let raf_flag_idx = lookup_start + LookupTables::<XLEN>::COUNT;
-
-    if global_lane < rs2_start {
-        // rs1 one-hot
-        let r = global_lane as u8;
-        return F::from_bool(rs1 == Some(r));
-    }
-    if global_lane < rd_start {
-        // rs2 one-hot
-        let r = (global_lane - rs2_start) as u8;
-        return F::from_bool(rs2 == Some(r));
-    }
-    if global_lane < unexp_pc_idx {
-        // rd one-hot
-        let r = (global_lane - rd_start) as u8;
-        return F::from_bool(rd == Some(r));
-    }
-    if global_lane == unexp_pc_idx {
-        return unexpanded_pc;
-    }
-    if global_lane == imm_idx {
-        return imm;
-    }
-    if global_lane < instr_start {
-        let flag_idx = global_lane - circuit_start;
-        return F::from_bool(circuit_flags[flag_idx]);
-    }
-    if global_lane < lookup_start {
-        let flag_idx = global_lane - instr_start;
-        return F::from_bool(instr_flags[flag_idx]);
-    }
-    if global_lane < raf_flag_idx {
-        let table_idx = global_lane - lookup_start;
-        return F::from_bool(lookup_idx == Some(table_idx));
-    }
-    debug_assert_eq!(global_lane, raf_flag_idx);
-    F::from_bool(raf_flag)
-}
diff --git a/jolt-core/src/zkvm/config.rs b/jolt-core/src/zkvm/config.rs
index acc98a198b..64e792e7ac 100644
--- a/jolt-core/src/zkvm/config.rs
+++ b/jolt-core/src/zkvm/config.rs
@@ -24,13 +24,15 @@ pub fn get_instruction_sumcheck_phases(log_t: usize) -> usize {
 }
 
 /// Controls whether the prover/verifier use the **full** bytecode path (verifier may do O(K))
-/// or the **committed** bytecode path (requires padding so `T >= K_bytecode`).
+/// or the **committed** bytecode path (staged Val claims + claim reduction + folded Stage 8
+/// opening for bytecode chunk commitments).
 #[repr(u8)]
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Allocative)]
 pub enum BytecodeMode {
     /// Full mode: verifier may materialize bytecode-dependent polynomials (O(K_bytecode)).
     Full = 0,
-    /// Committed mode: use staged Val claims + BytecodeClaimReduction; requires `log_T >= log_K`.
+    /// Committed mode: use staged Val claims + `BytecodeClaimReduction`, and fold committed
+    /// bytecode chunk openings into the joint Stage 8 opening (Bytecode context embedding).
     Committed = 1,
 }
 
@@ -205,6 +207,22 @@ impl OneHotConfig {
         }
     }
 
+    /// Builds a `OneHotConfig` for an explicitly chosen `log_k_chunk` (expected to be 4 or 8).
+    /// `lookups_ra_virtual_log_k_chunk` is `LOG_K / 8` for a chunk of 4 and `LOG_K / 4` otherwise.
+    pub fn from_log_k_chunk(log_k_chunk: usize) -> Self {
+        debug_assert!(matches!(log_k_chunk, 4 | 8));
+        let log_k_chunk = log_k_chunk as u8;
+        let virtual_log_k_chunk = match log_k_chunk {
+            4 => LOG_K / 8,
+            _ => LOG_K / 4,
+        };
+
+        Self {
+            log_k_chunk,
+            lookups_ra_virtual_log_k_chunk: virtual_log_k_chunk as u8,
+        }
+    }
+
     /// Validates that the one-hot configuration is valid.
     ///
     /// This is called by the verifier to ensure the prover hasn't provided
diff --git a/jolt-core/src/zkvm/prover.rs b/jolt-core/src/zkvm/prover.rs
index 6d01f73e5a..a8c797367c 100644
--- a/jolt-core/src/zkvm/prover.rs
+++ b/jolt-core/src/zkvm/prover.rs
@@ -16,6 +16,7 @@ use std::{
 use crate::poly::commitment::dory::DoryContext;
 use ark_serialize::{CanonicalDeserialize, CanonicalSerialize};
 
+use crate::zkvm::bytecode::chunks::total_lanes;
 use crate::zkvm::bytecode::{BytecodePreprocessing, TrustedBytecodeCommitments};
 use crate::zkvm::config::{BytecodeMode, ReadWriteConfig};
 use crate::zkvm::verifier::JoltSharedPreprocessing;
@@ -392,21 +393,14 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             (trace.len() + 1).next_power_of_two()
         };
 
-        // If we intend to use the bytecode-commitment/claim-reduction path, we must ensure
-        // `log_T >= log_K_bytecode`, i.e. `T >= K_bytecode`. Enforce by padding up-front.
-        let mut padded_trace_len = padded_trace_len;
-        if bytecode_mode == BytecodeMode::Committed {
-            let bytecode_k = preprocessing.shared.bytecode_size;
-            if bytecode_k > preprocessing.shared.max_padded_trace_length {
-                panic!(
-                    "Bytecode commitment mode requires max_padded_trace_length >= bytecode_K.\n\
-                     bytecode_K={} > max_padded_trace_length={}\n\
-                     Increase max_trace_length in preprocessing (JoltSharedPreprocessing::new).",
-                    bytecode_k, preprocessing.shared.max_padded_trace_length
-                );
-            }
-            padded_trace_len = padded_trace_len.max(bytecode_k);
-        }
+        // In Committed mode, Stage 8 folds bytecode chunk openings into the *joint* opening.
+        // That folding currently requires log_T >= log_K_bytecode, so we ensure the padded trace
+        // length is at least the (power-of-two padded) bytecode size.
+        let padded_trace_len = if bytecode_mode == BytecodeMode::Committed {
+            padded_trace_len.max(preprocessing.shared.bytecode_size)
+        } else {
+            padded_trace_len
+        };
         // We may need extra padding so the main Dory matrix has enough (row, col) variables
         // to embed advice commitments committed in their own preprocessing-only contexts.
         let has_trusted_advice = !program_io.trusted_advice.is_empty();
@@ -460,7 +454,16 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let log_T = trace.len().log_2();
         let ram_log_K = ram_K.log_2();
         let rw_config = ReadWriteConfig::new(log_T, ram_log_K);
-        let one_hot_params = OneHotParams::new(log_T, preprocessing.shared.bytecode_size, ram_K);
+        let one_hot_params = if bytecode_mode == BytecodeMode::Committed {
+            let committed = preprocessing
+                .bytecode_commitments
+                .as_ref()
+                .expect("bytecode commitments missing in committed mode");
+            let config = OneHotConfig::from_log_k_chunk(committed.log_k_chunk as usize);
+            OneHotParams::from_config(&config, preprocessing.shared.bytecode_size, ram_K)
+        } else {
+            OneHotParams::new(log_T, preprocessing.shared.bytecode_size, ram_K)
+        };
 
         Self {
             preprocessing,
@@ -514,6 +517,14 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         let untrusted_advice_commitment = self.generate_and_commit_untrusted_advice();
         self.generate_and_commit_trusted_advice();
 
+        if self.bytecode_mode == BytecodeMode::Committed {
+            if let Some(trusted) = &self.preprocessing.bytecode_commitments {
+                for commitment in &trusted.commitments {
+                    self.transcript.append_serializable(commitment);
+                }
+            }
+        }
+
         // Add advice hints for batched Stage 8 opening
         if let Some(hint) = self.advice.trusted_advice_hint.take() {
             opening_proof_hints.insert(CommittedPolynomial::TrustedAdvice, hint);
@@ -521,6 +532,14 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         if let Some(hint) = self.advice.untrusted_advice_hint.take() {
             opening_proof_hints.insert(CommittedPolynomial::UntrustedAdvice, hint);
         }
+        if self.bytecode_mode == BytecodeMode::Committed {
+            if let Some(hints) = self.preprocessing.bytecode_commitment_hints.as_ref() {
+                for (idx, hint) in hints.iter().enumerate() {
+                    opening_proof_hints
+                        .insert(CommittedPolynomial::BytecodeChunk(idx), hint.clone());
+                }
+            }
+        }
 
         let (stage1_uni_skip_first_round_proof, stage1_sumcheck_proof) = self.prove_stage1();
         let (stage2_uni_skip_first_round_proof, stage2_sumcheck_proof) = self.prove_stage2();
@@ -1245,10 +1264,6 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
         // Bytecode claim reduction (Phase 1 in Stage 6b): consumes Val_s(r_bc) from Stage 6a and
         // caches an intermediate claim for Stage 7.
         if self.bytecode_mode == BytecodeMode::Committed {
-            debug_assert!(
-                bytecode_read_raf_params.log_T >= bytecode_read_raf_params.log_K,
-                "commitment mode requires log_T >= log_K_bytecode"
-            );
             let bytecode_reduction_params = BytecodeClaimReductionParams::new(
                 &bytecode_read_raf_params,
                 &self.opening_accumulator,
@@ -1605,6 +1620,49 @@ impl<'a, F: JoltField, PCS: StreamingCommitmentScheme<Field = F>, ProofTranscrip
             ));
         }
 
+        // Bytecode chunk polynomials: committed in Bytecode context and embedded into the
+        // main opening point by fixing the extra cycle variables to 0.
+        if self.bytecode_mode == BytecodeMode::Committed {
+            let (bytecode_point, _) = self.opening_accumulator.get_committed_polynomial_opening(
+                CommittedPolynomial::BytecodeChunk(0),
+                SumcheckId::BytecodeClaimReduction,
+            );
+            let log_t = opening_point.r.len() - log_k_chunk;
+            let log_k = bytecode_point.r.len() - log_k_chunk;
+            assert!(
+                log_k <= log_t,
+                "bytecode folding requires log_T >= log_K (got log_T={log_t}, log_K={log_k})"
+            );
+            #[cfg(test)]
+            {
+                if log_k == log_t {
+                    assert_eq!(
+                        bytecode_point.r, opening_point.r,
+                        "BytecodeChunk opening point must equal unified opening point when log_K == log_T"
+                    );
+                } else {
+                    let (r_lane_main, r_cycle_main) = opening_point.split_at(log_k_chunk);
+                    let (r_lane_bc, r_cycle_bc) = bytecode_point.split_at(log_k_chunk);
+                    debug_assert_eq!(r_lane_main.r, r_lane_bc.r);
+                    debug_assert_eq!(&r_cycle_main.r[(log_t - log_k)..], r_cycle_bc.r.as_slice());
+                }
+            }
+            let lagrange_factor =
+                compute_advice_lagrange_factor::<F>(&opening_point.r, &bytecode_point.r);
+
+            let num_chunks = total_lanes().div_ceil(self.one_hot_params.k_chunk);
+            for i in 0..num_chunks {
+                let (_, claim) = self.opening_accumulator.get_committed_polynomial_opening(
+                    CommittedPolynomial::BytecodeChunk(i),
+                    SumcheckId::BytecodeClaimReduction,
+                );
+                polynomial_claims.push((
+                    CommittedPolynomial::BytecodeChunk(i),
+                    claim * lagrange_factor,
+                ));
+            }
+        }
+
         // 2. Sample gamma and compute powers for RLC
         let claims: Vec<F> = polynomial_claims.iter().map(|(_, c)| *c).collect();
         self.transcript.append_scalars(&claims);
@@ -1707,7 +1765,7 @@ where
     F: JoltField,
     PCS: CommitmentScheme<Field = F>,
 {
-    /// Setup generators based on trace length.
+    /// Setup generators based on trace length (Main context).
     fn setup_generators(shared: &JoltSharedPreprocessing) -> PCS::ProverSetup {
         use common::constants::ONEHOT_CHUNK_THRESHOLD_LOG_T;
         let max_T: usize = shared.max_padded_trace_length.next_power_of_two();
@@ -1721,6 +1779,24 @@ where
         PCS::setup_prover(max_log_k_chunk + max_log_T)
     }
 
+    /// Builds the prover setup for Committed mode, sized to the larger of:
+    /// - the Main context bound, `max_padded_trace_length`
+    /// - the Bytecode context bound, `bytecode_size`
+    fn setup_generators_committed(shared: &JoltSharedPreprocessing) -> PCS::ProverSetup {
+        use common::constants::ONEHOT_CHUNK_THRESHOLD_LOG_T;
+        // Take whichever bound is larger and round it up to a power of two.
+        let padded_len: usize =
+            usize::max(shared.max_padded_trace_length, shared.bytecode_size).next_power_of_two();
+        let log_len = padded_len.log_2();
+        // Below the one-hot threshold the chunk exponent is 4, otherwise 8.
+        let log_k_chunk = if log_len < ONEHOT_CHUNK_THRESHOLD_LOG_T {
+            4
+        } else {
+            8
+        };
+        PCS::setup_prover(log_k_chunk + log_len)
+    }
+
     /// Create prover preprocessing in Full mode (no bytecode commitments).
     ///
     /// Use this when the verifier will have access to full bytecode.
@@ -1748,9 +1824,19 @@ where
         shared: JoltSharedPreprocessing,
         bytecode: Arc<BytecodePreprocessing>,
     ) -> JoltProverPreprocessing<F, PCS> {
-        let generators = Self::setup_generators(&shared);
+        let generators = Self::setup_generators_committed(&shared);
+        let max_t_any: usize = shared
+            .max_padded_trace_length
+            .max(shared.bytecode_size)
+            .next_power_of_two();
+        let max_log_t = max_t_any.log_2();
+        let log_k_chunk = if max_log_t < common::constants::ONEHOT_CHUNK_THRESHOLD_LOG_T {
+            4
+        } else {
+            8
+        };
         let (trusted_commitments, hints) =
-            TrustedBytecodeCommitments::derive(&bytecode, &generators);
+            TrustedBytecodeCommitments::derive(&bytecode, &generators, log_k_chunk);
         JoltProverPreprocessing {
             generators,
             shared,
diff --git a/jolt-core/src/zkvm/tests.rs b/jolt-core/src/zkvm/tests.rs
index d821e8429a..1f165b9584 100644
--- a/jolt-core/src/zkvm/tests.rs
+++ b/jolt-core/src/zkvm/tests.rs
@@ -18,8 +18,10 @@ use crate::poly::commitment::commitment_scheme::CommitmentScheme;
 use crate::poly::commitment::dory::{DoryCommitmentScheme, DoryContext, DoryGlobals, DoryLayout};
 use crate::poly::multilinear_polynomial::MultilinearPolynomial;
 use crate::poly::opening_proof::{OpeningAccumulator, SumcheckId};
+use crate::zkvm::bytecode::chunks::total_lanes;
 use crate::zkvm::bytecode::BytecodePreprocessing;
 use crate::zkvm::claim_reductions::AdviceKind;
+use crate::zkvm::config::BytecodeMode;
 use crate::zkvm::prover::JoltProverPreprocessing;
 use crate::zkvm::ram::populate_memory_states;
 use crate::zkvm::verifier::{JoltSharedPreprocessing, JoltVerifier, JoltVerifierPreprocessing};
@@ -266,7 +268,12 @@ pub fn run_e2e_test(config: E2ETestConfig) {
 
     // Create prover and prove
     let elf_contents = program.get_elf_contents().expect("elf contents is None");
-    let prover = RV64IMACProver::gen_from_elf(
+    let bytecode_mode = if config.committed_bytecode {
+        BytecodeMode::Committed
+    } else {
+        BytecodeMode::Full
+    };
+    let prover = RV64IMACProver::gen_from_elf_with_bytecode_mode(
         &prover_preprocessing,
         &elf_contents,
         &config.inputs,
@@ -274,9 +281,11 @@ pub fn run_e2e_test(config: E2ETestConfig) {
         &config.trusted_advice,
         trusted_commitment,
         trusted_hint,
+        bytecode_mode,
     );
     let io_device = prover.program_io.clone();
     let (jolt_proof, debug_info) = prover.prove();
+    assert_eq!(jolt_proof.bytecode_mode, bytecode_mode);
 
     // Create verifier preprocessing from prover (respects mode)
     let verifier_preprocessing = JoltVerifierPreprocessing::from(&prover_preprocessing);
@@ -419,21 +428,17 @@ fn advice_merkle_tree_e2e_address_major() {
 // ============================================================================
 // New Tests - Committed Bytecode Mode
 //
-// These tests are ignored until the verifier is fully updated to support
-// Committed mode (currently it calls as_full() which fails in Committed mode).
-// See verifier.rs line 442 - needs to branch on bytecode mode.
+// These tests exercise the end-to-end committed bytecode path.
 // ============================================================================
 
 #[test]
 #[serial]
-#[ignore = "Verifier not yet updated for Committed mode"]
 fn fib_e2e_committed_bytecode() {
     run_e2e_test(E2ETestConfig::default().with_committed_bytecode());
 }
 
 #[test]
 #[serial]
-#[ignore = "Verifier not yet updated for Committed mode"]
 fn fib_e2e_committed_bytecode_address_major() {
     run_e2e_test(
         E2ETestConfig::default()
@@ -442,6 +447,20 @@ fn fib_e2e_committed_bytecode_address_major() {
     );
 }
 
+// ============================================================================
+// New Tests - Bytecode Lane Ordering / Chunking
+// ============================================================================
+
+#[test]
+fn bytecode_lane_chunking_counts() {
+    // Expected lane count per bytecode-commitment-progress.md, with REGISTER_COUNT = 128:
+    // 3 * REGISTER_COUNT (rs1/rs2/rd) + 2 scalars + 13 circuit flags + 7 instr flags
+    // + 41 lookup selectors + 1 raf flag = 448.
+    let lanes = total_lanes();
+    assert_eq!(lanes, 448);
+    assert_eq!((lanes.div_ceil(16), lanes.div_ceil(256)), (28, 2));
+}
+
 // ============================================================================
 // New Tests - Bytecode Mode Detection
 // ============================================================================
diff --git a/jolt-core/src/zkvm/verifier.rs b/jolt-core/src/zkvm/verifier.rs
index 34c5f69674..05d110e906 100644
--- a/jolt-core/src/zkvm/verifier.rs
+++ b/jolt-core/src/zkvm/verifier.rs
@@ -7,6 +7,7 @@ use std::sync::Arc;
 use crate::poly::commitment::commitment_scheme::CommitmentScheme;
 use crate::poly::commitment::dory::{DoryContext, DoryGlobals};
 use crate::subprotocols::sumcheck::BatchedSumcheck;
+use crate::zkvm::bytecode::chunks::total_lanes;
 use crate::zkvm::bytecode::{BytecodePreprocessing, TrustedBytecodeCommitments, VerifierBytecode};
 use crate::zkvm::claim_reductions::advice::ReductionPhase;
 use crate::zkvm::claim_reductions::RegistersClaimReductionSumcheckVerifier;
@@ -168,19 +169,34 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             .validate(proof.trace_length.log_2(), proof.ram_K.log_2())
             .map_err(ProofVerifyError::InvalidReadWriteConfig)?;
 
-        // If the proof claims it used bytecode commitment mode, it must have enough cycle vars
-        // to embed bytecode address variables (log_T >= log_K_bytecode), i.e. T >= K_bytecode.
-        if proof.bytecode_mode == BytecodeMode::Committed && proof.trace_length < proof.bytecode_K {
-            return Err(ProofVerifyError::InvalidBytecodeConfig(format!(
-                "bytecode commitment mode requires trace_length >= bytecode_K (got trace_length={}, bytecode_K={})",
-                proof.trace_length, proof.bytecode_K
-            )));
-        }
-
         // Construct full params from the validated config
         let one_hot_params =
             OneHotParams::from_config(&proof.one_hot_config, proof.bytecode_K, proof.ram_K);
 
+        if proof.bytecode_mode == BytecodeMode::Committed {
+            let committed = preprocessing.bytecode.as_committed()?;
+            if committed.log_k_chunk != proof.one_hot_config.log_k_chunk {
+                return Err(ProofVerifyError::InvalidBytecodeConfig(format!(
+                    "bytecode log_k_chunk mismatch: commitments={}, proof={}",
+                    committed.log_k_chunk, proof.one_hot_config.log_k_chunk
+                )));
+            }
+            if committed.bytecode_len != preprocessing.shared.bytecode_size {
+                return Err(ProofVerifyError::InvalidBytecodeConfig(format!(
+                    "bytecode length mismatch: commitments={}, shared={}",
+                    committed.bytecode_len, preprocessing.shared.bytecode_size
+                )));
+            }
+            let k_chunk = 1usize << (committed.log_k_chunk as usize);
+            let expected_chunks = total_lanes().div_ceil(k_chunk);
+            if committed.commitments.len() != expected_chunks {
+                return Err(ProofVerifyError::InvalidBytecodeConfig(format!(
+                    "expected {expected_chunks} bytecode commitments, got {}",
+                    committed.commitments.len()
+                )));
+            }
+        }
+
         Ok(Self {
             trusted_advice_commitment,
             program_io,
@@ -221,6 +237,12 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             self.transcript
                 .append_serializable(trusted_advice_commitment);
         }
+        if self.proof.bytecode_mode == BytecodeMode::Committed {
+            let trusted = self.preprocessing.bytecode.as_committed()?;
+            for commitment in &trusted.commitments {
+                self.transcript.append_serializable(commitment);
+            }
+        }
 
         self.verify_stage1()?;
         self.verify_stage2()?;
@@ -506,10 +528,6 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
         // IMPORTANT: This must be sampled *after* other Stage 6b params (e.g. lookup/inc gammas),
         // to match the prover's transcript order.
         if self.proof.bytecode_mode == BytecodeMode::Committed {
-            debug_assert!(
-                bytecode_read_raf_params.log_T >= bytecode_read_raf_params.log_K,
-                "commitment mode requires log_T >= log_K_bytecode"
-            );
             let bytecode_reduction_params = BytecodeClaimReductionParams::new(
                 &bytecode_read_raf_params,
                 &self.opening_accumulator,
@@ -720,6 +738,51 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             ));
         }
 
+        // Bytecode chunk polynomials: committed in Bytecode context and embedded into the
+        // main opening point by fixing the extra cycle variables to 0.
+        if self.proof.bytecode_mode == BytecodeMode::Committed {
+            let (bytecode_point, _) = self.opening_accumulator.get_committed_polynomial_opening(
+                CommittedPolynomial::BytecodeChunk(0),
+                SumcheckId::BytecodeClaimReduction,
+            );
+            let log_t = opening_point.r.len() - log_k_chunk;
+            let log_k = bytecode_point.r.len() - log_k_chunk;
+            if log_k > log_t {
+                return Err(ProofVerifyError::InvalidBytecodeConfig(format!(
+                    "bytecode folding requires log_T >= log_K (got log_T={log_t}, log_K={log_k})"
+                ))
+                .into());
+            }
+            #[cfg(test)]
+            {
+                if log_k == log_t {
+                    assert_eq!(
+                        bytecode_point.r, opening_point.r,
+                        "BytecodeChunk opening point must equal unified opening point when log_K == log_T"
+                    );
+                } else {
+                    let (r_lane_main, r_cycle_main) = opening_point.split_at(log_k_chunk);
+                    let (r_lane_bc, r_cycle_bc) = bytecode_point.split_at(log_k_chunk);
+                    debug_assert_eq!(r_lane_main.r, r_lane_bc.r);
+                    debug_assert_eq!(&r_cycle_main.r[(log_t - log_k)..], r_cycle_bc.r.as_slice());
+                }
+            }
+            let lagrange_factor =
+                compute_advice_lagrange_factor::<F>(&opening_point.r, &bytecode_point.r);
+
+            let num_chunks = total_lanes().div_ceil(self.one_hot_params.k_chunk);
+            for i in 0..num_chunks {
+                let (_, claim) = self.opening_accumulator.get_committed_polynomial_opening(
+                    CommittedPolynomial::BytecodeChunk(i),
+                    SumcheckId::BytecodeClaimReduction,
+                );
+                polynomial_claims.push((
+                    CommittedPolynomial::BytecodeChunk(i),
+                    claim * lagrange_factor,
+                ));
+            }
+        }
+
         // 2. Sample gamma and compute powers for RLC
         let claims: Vec<F> = polynomial_claims.iter().map(|(_, c)| *c).collect();
         self.transcript.append_scalars(&claims);
@@ -761,6 +824,15 @@ impl<'a, F: JoltField, PCS: CommitmentScheme<Field = F>, ProofTranscript: Transc
             }
         }
 
+        if self.proof.bytecode_mode == BytecodeMode::Committed {
+            let committed = self.preprocessing.bytecode.as_committed()?;
+            for (idx, commitment) in committed.commitments.iter().enumerate() {
+                commitments_map
+                    .entry(CommittedPolynomial::BytecodeChunk(idx))
+                    .or_insert_with(|| commitment.clone());
+            }
+        }
+
         // Compute joint commitment: Σ γ_i · C_i
         let joint_commitment = self.compute_joint_commitment(&mut commitments_map, &state);
 
diff --git a/jolt-sdk/macros/src/lib.rs b/jolt-sdk/macros/src/lib.rs
index 9f47cc678a..0b292af8eb 100644
--- a/jolt-sdk/macros/src/lib.rs
+++ b/jolt-sdk/macros/src/lib.rs
@@ -66,15 +66,18 @@ impl MacroBuilder {
     fn build(&mut self) -> TokenStream {
         let memory_config_fn = self.make_memory_config_fn();
         let build_prover_fn = self.make_build_prover_fn();
+        let build_prover_committed_fn = self.make_build_prover_committed_fn();
         let build_verifier_fn = self.make_build_verifier_fn();
         let analyze_fn = self.make_analyze_function();
         let trace_to_file_fn = self.make_trace_to_file_func();
         let compile_fn = self.make_compile_func();
         let preprocess_fn = self.make_preprocess_func();
+        let preprocess_committed_fn = self.make_preprocess_committed_func();
         let preprocess_shared_fn = self.make_preprocess_shared_func();
         let verifier_preprocess_from_prover_fn = self.make_preprocess_from_prover_func();
         let commit_trusted_advice_fn = self.make_commit_trusted_advice_func();
         let prove_fn = self.make_prove_func();
+        let prove_committed_fn = self.make_prove_committed_func();
 
         let attributes = parse_attributes(&self.attr);
         let mut execute_fn = quote! {};
@@ -95,16 +98,19 @@ impl MacroBuilder {
         quote! {
             #memory_config_fn
             #build_prover_fn
+            #build_prover_committed_fn
             #build_verifier_fn
             #execute_fn
             #analyze_fn
             #trace_to_file_fn
             #compile_fn
             #preprocess_fn
+            #preprocess_committed_fn
             #preprocess_shared_fn
             #verifier_preprocess_from_prover_fn
             #commit_trusted_advice_fn
             #prove_fn
+            #prove_committed_fn
             #main_fn
         }
         .into()
@@ -204,6 +210,69 @@ impl MacroBuilder {
         }
     }
 
+    /// Emits `build_prover_committed_<fn>`, compiled only for non-wasm32, non-`guest` builds.
+    /// The emitted function stores the program and prover preprocessing in `Arc`s and returns
+    /// a closure that clones both on every call before invoking `prove_committed_<fn>`.
+    fn make_build_prover_committed_fn(&self) -> TokenStream2 {
+        let fn_name = self.get_func_name();
+        let builder_ident =
+            Ident::new(&format!("build_prover_committed_{fn_name}"), fn_name.span());
+        let prove_output_ty = self.get_prove_output_type();
+
+        // Every guest argument (public, trusted advice, untrusted advice), in order.
+        let ordered_args = self.get_all_func_args_in_order();
+        let (arg_names, arg_types): (Vec<_>, Vec<_>) =
+            ordered_args.iter().map(|(name, ty)| (name, ty)).unzip();
+
+        let closure_inputs: Vec<_> = self.func.sig.inputs.iter().collect();
+        let prove_ident = Ident::new(&format!("prove_committed_{fn_name}"), fn_name.span());
+        let imports = self.make_imports();
+
+        // With trusted advice, the closure also takes the advice commitment and opening hint
+        // and forwards both to the prove function.
+        let has_trusted_advice = !self.trusted_func_args.is_empty();
+        let (extra_closure_params, extra_call_args, closure_ty) = if has_trusted_advice {
+            (
+                quote! { , trusted_advice_commitment: Option<<jolt::PCS as jolt::CommitmentScheme>::Commitment>,
+                trusted_advice_hint: Option<<jolt::PCS as jolt::CommitmentScheme>::OpeningProofHint> },
+                quote! { , trusted_advice_commitment, trusted_advice_hint },
+                quote! {
+                    impl Fn(#(#arg_types),*, Option<<jolt::PCS as jolt::CommitmentScheme>::Commitment>, Option<<jolt::PCS as jolt::CommitmentScheme>::OpeningProofHint>) -> #prove_output_ty + Sync + Send
+                },
+            )
+        } else {
+            (
+                quote! {},
+                quote! {},
+                quote! {
+                    impl Fn(#(#arg_types),*) -> #prove_output_ty + Sync + Send
+                },
+            )
+        };
+
+        // Assemble the host-side builder function around the pieces computed above.
+        quote! {
+            #[cfg(all(not(target_arch = "wasm32"), not(feature = "guest")))]
+            pub fn #builder_ident(
+                program: jolt::host::Program,
+                preprocessing: jolt::JoltProverPreprocessing<jolt::F, jolt::PCS>,
+            ) -> #closure_ty
+            {
+                #imports
+                let program = std::sync::Arc::new(program);
+                let preprocessing = std::sync::Arc::new(preprocessing);
+
+                let prove_closure = move |#(#closure_inputs),* #extra_closure_params| {
+                    let program = (*program).clone();
+                    let preprocessing = (*preprocessing).clone();
+                    #prove_ident(program, preprocessing, #(#arg_names),* #extra_call_args)
+                };
+
+                prove_closure
+            }
+        }
+    }
+
     fn make_build_verifier_fn(&self) -> TokenStream2 {
         let fn_name = self.get_func_name();
         let build_verifier_fn_name =
@@ -471,6 +540,53 @@ impl MacroBuilder {
         }
     }
 
+    /// Emits `preprocess_committed_<fn>` (gated to non-wasm32, non-`guest` builds): the generated
+    /// function decodes the program, builds the memory layout from the macro attributes, and
+    /// returns prover preprocessing created via `JoltProverPreprocessing::new_committed`.
+    fn make_preprocess_committed_func(&self) -> TokenStream2 {
+        let attrs = parse_attributes(&self.attr);
+        let lit = proc_macro2::Literal::u64_unsuffixed;
+        let trace_len = lit(attrs.max_trace_length);
+        let input_cap = lit(attrs.max_input_size);
+        let output_cap = lit(attrs.max_output_size);
+        let untrusted_cap = lit(attrs.max_untrusted_advice_size);
+        let trusted_cap = lit(attrs.max_trusted_advice_size);
+        let stack = lit(attrs.stack_size);
+        let memory = lit(attrs.memory_size);
+        let prelude = self.make_imports();
+        let base = self.get_func_name();
+        let target = Ident::new(&format!("preprocess_committed_{base}"), base.span());
+        quote! {
+            #[cfg(all(not(target_arch = "wasm32"), not(feature = "guest")))]
+            pub fn #target(program: &mut jolt::host::Program)
+                -> jolt::JoltProverPreprocessing<jolt::F, jolt::PCS>
+            {
+                #prelude
+
+                let (instructions, memory_init, program_size) = program.decode();
+                let memory_config = MemoryConfig {
+                    max_input_size: #input_cap,
+                    max_output_size: #output_cap,
+                    max_untrusted_advice_size: #untrusted_cap,
+                    max_trusted_advice_size: #trusted_cap,
+                    stack_size: #stack,
+                    memory_size: #memory,
+                    program_size: Some(program_size),
+                };
+                let memory_layout = MemoryLayout::new(&memory_config);
+
+                let bytecode = BytecodePreprocessing::preprocess(instructions);
+                let shared = JoltSharedPreprocessing::new(
+                    &bytecode,
+                    memory_layout,
+                    memory_init,
+                    #trace_len,
+                );
+                JoltProverPreprocessing::new_committed(shared, std::sync::Arc::new(bytecode))
+            }
+        }
+    }
+
     fn make_preprocess_shared_func(&self) -> TokenStream2 {
         let attributes = parse_attributes(&self.attr);
         let max_trace_length = proc_macro2::Literal::u64_unsuffixed(attributes.max_trace_length);
@@ -688,12 +804,110 @@ impl MacroBuilder {
 
                 let elf_contents_opt = program.get_elf_contents();
                 let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
-                let prover = RV64IMACProver::gen_from_elf(&preprocessing,
+                let prover = RV64IMACProver::gen_from_elf_with_bytecode_mode(&preprocessing,
+                    &elf_contents,
+                    &input_bytes,
+                    &untrusted_advice_bytes,
+                    &trusted_advice_bytes,
+                    #commitment_arg,
+                    jolt::BytecodeMode::Full,
+                );
+                let io_device = prover.program_io.clone();
+                let (jolt_proof, _) = prover.prove();
+
+                #handle_return
+
+                (ret_val, jolt_proof, io_device)
+            }
+        }
+    }
+
+    fn make_prove_committed_func(&self) -> TokenStream2 {
+        let prove_output_ty = self.get_prove_output_type();
+
+        let handle_return = match &self.func.sig.output {
+            ReturnType::Default => quote! {
+                let ret_val = ();
+            },
+            ReturnType::Type(_, ty) => quote! {
+                let mut outputs = io_device.outputs.clone();
+                outputs.resize(preprocessing.shared.memory_layout.max_output_size as usize, 0);
+                let ret_val = jolt::postcard::from_bytes::<#ty>(&outputs).unwrap();
+            },
+        };
+
+        let set_program_args = self.pub_func_args.iter().map(|(name, _)| {
+            quote! {
+                input_bytes.append(&mut jolt::postcard::to_stdvec(&#name).unwrap())
+            }
+        });
+        let set_program_untrusted_advice_args = self.untrusted_func_args.iter().map(|(name, _)| {
+            quote! {
+                untrusted_advice_bytes.append(&mut jolt::postcard::to_stdvec(&#name).unwrap())
+            }
+        });
+        let set_program_trusted_advice_args = self.trusted_func_args.iter().map(|(name, _)| {
+            quote! {
+                trusted_advice_bytes.append(&mut jolt::postcard::to_stdvec(&#name).unwrap())
+            }
+        });
+
+        let fn_name = self.get_func_name();
+        let inputs_vec: Vec<_> = self.func.sig.inputs.iter().collect();
+        let inputs = quote! { #(#inputs_vec),* };
+        let imports = self.make_imports();
+
+        let prove_fn_name = syn::Ident::new(&format!("prove_committed_{fn_name}"), fn_name.span());
+
+        let has_trusted_advice = !self.trusted_func_args.is_empty();
+
+        let commitment_param = if has_trusted_advice {
+            quote! { , trusted_advice_commitment: Option<<jolt::PCS as jolt::CommitmentScheme>::Commitment>,
+            trusted_advice_hint: Option<<jolt::PCS as jolt::CommitmentScheme>::OpeningProofHint> }
+        } else {
+            quote! {}
+        };
+
+        let commitment_arg = if has_trusted_advice {
+            quote! { trusted_advice_commitment, trusted_advice_hint }
+        } else {
+            quote! { None, None }
+        };
+
+        quote! {
+            #[cfg(all(not(target_arch = "wasm32"), not(feature = "guest")))]
+            #[allow(clippy::too_many_arguments)]
+            pub fn #prove_fn_name(
+                mut program: jolt::host::Program,
+                preprocessing: jolt::JoltProverPreprocessing<jolt::F, jolt::PCS>,
+                #inputs
+                #commitment_param
+            ) -> #prove_output_ty {
+                #imports
+
+                if !preprocessing.is_committed_mode() {
+                    panic!(
+                        "Committed bytecode proving requires committed preprocessing. \
+                        Use `preprocess_committed_*` / `JoltProverPreprocessing::new_committed`."
+                    );
+                }
+
+                let mut input_bytes = vec![];
+                #(#set_program_args;)*
+                let mut untrusted_advice_bytes = vec![];
+                #(#set_program_untrusted_advice_args;)*
+                let mut trusted_advice_bytes = vec![];
+                #(#set_program_trusted_advice_args;)*
+
+                let elf_contents_opt = program.get_elf_contents();
+                let elf_contents = elf_contents_opt.as_deref().expect("elf contents is None");
+                let prover = RV64IMACProver::gen_from_elf_with_bytecode_mode(&preprocessing,
                     &elf_contents,
                     &input_bytes,
                     &untrusted_advice_bytes,
                     &trusted_advice_bytes,
                     #commitment_arg,
+                    jolt::BytecodeMode::Committed,
                 );
                 let io_device = prover.program_io.clone();
                 let (jolt_proof, _) = prover.prove();
diff --git a/jolt-sdk/src/host_utils.rs b/jolt-sdk/src/host_utils.rs
index a0b37479af..4b9c3cea93 100644
--- a/jolt-sdk/src/host_utils.rs
+++ b/jolt-sdk/src/host_utils.rs
@@ -11,6 +11,7 @@ pub use jolt_core::field::JoltField;
 pub use jolt_core::guest;
 pub use jolt_core::poly::commitment::dory::DoryCommitmentScheme as PCS;
 pub use jolt_core::zkvm::bytecode::BytecodePreprocessing;
+pub use jolt_core::zkvm::config::BytecodeMode;
 pub use jolt_core::zkvm::{
     proof_serialization::JoltProof, verifier::JoltSharedPreprocessing,
     verifier::JoltVerifierPreprocessing, RV64IMACProof, RV64IMACVerifier, Serializable,

From a491a8fcd7ff4abdf788c4a8848b0477ab2c03fe Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Tue, 20 Jan 2026 05:10:59 -0800
Subject: [PATCH 11/16] fix: add missing update_flamegraph method to
 BytecodeClaimReductionProver

---
 jolt-core/src/zkvm/claim_reductions/bytecode.rs | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/jolt-core/src/zkvm/claim_reductions/bytecode.rs b/jolt-core/src/zkvm/claim_reductions/bytecode.rs
index 303cc22435..6aa8ab84d6 100644
--- a/jolt-core/src/zkvm/claim_reductions/bytecode.rs
+++ b/jolt-core/src/zkvm/claim_reductions/bytecode.rs
@@ -370,6 +370,11 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T> for BytecodeClaim
             }
         }
     }
+
+    #[cfg(feature = "allocative")] // compiled only when the `allocative` feature is enabled
+    fn update_flamegraph(&self, flamegraph: &mut allocative::FlameGraphBuilder) {
+        flamegraph.visit_root(self); // hand this prover to the builder via `visit_root`
+    }
 }
 
 pub struct BytecodeClaimReductionVerifier<F: JoltField> {

From 5e4668c4e79b6d34491376a9fbdcdfd60cafdbc3 Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Tue, 20 Jan 2026 05:16:18 -0800
Subject: [PATCH 12/16] chore: untrack bytecode-commitment-progress.md planning
 doc

---
 .gitignore                      |   1 +
 bytecode-commitment-progress.md | 911 --------------------------------
 2 files changed, 1 insertion(+), 911 deletions(-)
 delete mode 100644 bytecode-commitment-progress.md

diff --git a/.gitignore b/.gitignore
index 6c88a867c6..fc6d03d695 100644
--- a/.gitignore
+++ b/.gitignore
@@ -65,3 +65,4 @@ jolt-sdk/tests/fib_io_device_bytes.rs
 jolt-sdk/tests/fib_proof_bytes.rs
 jolt-sdk/tests/jolt_verifier_preprocessing_bytes.rs
 
+bytecode-commitment-progress.md
diff --git a/bytecode-commitment-progress.md b/bytecode-commitment-progress.md
deleted file mode 100644
index 66c17b7db9..0000000000
--- a/bytecode-commitment-progress.md
+++ /dev/null
@@ -1,911 +0,0 @@
-# Bytecode Commitment (Planning / Progress Notes)
-
-This file is a **living design doc** for implementing **bytecode commitment** to remove verifier work linear in bytecode size \(K\), especially in recursion contexts (e.g. `examples/recursion/`).
-
-This is the **single authoritative document** for:
-- bytecode commitment design + implementation progress
-- the bytecode preprocessing refactor (Full vs Committed split via `BytecodeMode`)
-
-## Current architecture baseline (post-refactor)
-
-Bytecode preprocessing is now split between prover and verifier based on `BytecodeMode`:
-
-- **Full mode**: verifier has access to full bytecode (may do \(O(K)\) work).
-- **Committed mode**: verifier only has bytecode *commitments* (succinct), and verification uses claim reductions.
-
-### Data structures (single source of truth for bytecode size \(K\))
-
-```
-BytecodePreprocessing  ← O(K) data, created first via preprocess()
-├── bytecode: Vec<Instruction>
-└── pc_map: BytecodePCMapper
-
-JoltSharedPreprocessing  ← Truly shared, single source of truth for size
-├── bytecode_size: usize            ← Derived from bytecode.bytecode.len()
-├── ram: RAMPreprocessing
-├── memory_layout: MemoryLayout
-└── max_padded_trace_length: usize
-
-JoltProverPreprocessing  ← Prover always has full bytecode
-├── generators: PCS::ProverSetup
-├── shared: JoltSharedPreprocessing
-├── bytecode: Arc<BytecodePreprocessing>        ← Full bytecode (always)
-├── bytecode_commitments: Option<TrustedBytecodeCommitments<PCS>>  ← Only in Committed mode
-└── bytecode_commitment_hints: Option<Vec<PCS::OpeningProofHint>>  ← Only in Committed mode
-
-JoltVerifierPreprocessing  ← Verifier has mode-dependent bytecode
-├── generators: PCS::VerifierSetup
-├── shared: JoltSharedPreprocessing
-└── bytecode: VerifierBytecode<PCS>        ← Full OR Committed
-
-VerifierBytecode<PCS>  ← Mode-dependent bytecode info
-├── Full(Arc<BytecodePreprocessing>)              ← Full mode
-└── Committed(TrustedBytecodeCommitments<PCS>)    ← Committed mode
-```
-
-`BytecodeMode` is the first-class “full vs committed” selector (`jolt-core/src/zkvm/config.rs`).
-
-### Trace-like `Arc` pattern (parallel to trace handling)
-
-```rust
-// Trace:
-let trace: std::sync::Arc<Vec<Cycle>> = trace.into();
-
-// Bytecode (parallel):
-let bytecode: std::sync::Arc<BytecodePreprocessing> =
-    BytecodePreprocessing::preprocess(instructions).into();
-```
-
-### Key design decisions (implemented)
-
-- `BytecodePreprocessing::preprocess()` returns `Self` (callers wrap in `Arc<Self>` as needed).
-- `JoltSharedPreprocessing::new()` takes `&BytecodePreprocessing` and stores only `bytecode_size` (single source of truth for \(K\)).
-- `TrustedBytecodeCommitments<PCS>` is a trust-typed wrapper: create via `derive()` (offline preprocessing) or trusted deserialization.
-- `VerifierBytecode::as_full()` / `as_committed()` return `Result<_, ProofVerifyError>` (no panics for mismatched mode).
-
-### SDK macro API (current)
-
-The `#[jolt::provable]` macro generates a **2-call** preprocessing workflow for the common case:
-
-```rust
-let prover_pp = guest::preprocess_<func>(&mut program);
-let verifier_pp = guest::verifier_preprocessing_from_prover_<func>(&prover_pp);
-```
-
-Advanced/secondary API (still generated):
-
-- `preprocess_shared_<func>(&mut Program) -> (JoltSharedPreprocessing, BytecodePreprocessing)`
-
-### SDK status (2026-01-20): Committed bytecode mode exposed end-to-end
-
-Committed mode requires **both**:
-
-1. **Committed preprocessing**: create prover preprocessing via `JoltProverPreprocessing::new_committed(...)`
-2. **Committed proving**: prove via `RV64IMACProver::gen_from_elf_with_bytecode_mode(..., BytecodeMode::Committed)`
-
-**Done in this branch:**
-- Macro generates committed APIs:
-  - `preprocess_committed_<func>`
-  - `build_prover_committed_<func>`
-  - `prove_committed_<func>`
-- `BytecodeMode` is re-exported from the SDK host surface (`jolt-sdk/src/host_utils.rs`).
-- Example CLI surfaced (`examples/fibonacci --committed-bytecode`), using the committed APIs.
-
-**Remaining SDK work (polish):**
-- Decide whether “committed” should remain separate entrypoints or become a `bytecode_mode: BytecodeMode` parameter on the default APIs.
-- Optionally propagate `--committed-bytecode` to other examples / docs.
-
-## Problem statement (what is slow today?)
-
-### Where the verifier is doing \(O(K)\) work
-
-- **Stage 6 verifier constructs `BytecodeReadRafSumcheckVerifier` by calling `BytecodeReadRafSumcheckParams::gen`**, passing the full `BytecodePreprocessing`.
-  - This happens in:
-    - `jolt-core/src/zkvm/verifier.rs` **L409–L417**
-
-- `BytecodeReadRafSumcheckParams::gen` currently **materializes 5 full `val_polys` of length `K`** by iterating the entire bytecode.
-  - `compute_val_polys(...)` call site:
-    - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L773–L784**
-  - The fused per-instruction loop is here:
-    - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L874–L1009**
-
-- In `expected_output_claim`, the verifier then **evaluates each `val_poly` at `r_address`**, which is also \(O(K)\).
-  - `val.evaluate(&r_address_prime.r)`:
-    - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L648–L666**
-  - `MultilinearPolynomial::evaluate` builds EQ tables and does a split-eq evaluation (still linear in coeff count):
-    - `jolt-core/src/poly/multilinear_polynomial.rs` **L682–L772**
-
-Net: for large bytecode (e.g. \(K \approx 2^{20}\)), the verifier is doing millions of field ops per verification, which explodes in recursion.
-
-## Relevant existing patterns we can mirror
-
-### 1) Two-phase claim reduction spanning Stage 6 → Stage 7 (Advice)
-
-- Stage 6 includes Advice claim reduction Phase 1:
-  - `jolt-core/src/zkvm/verifier.rs` **L446–L486**
-- Stage 7 conditionally includes Advice claim reduction Phase 2:
-  - `jolt-core/src/zkvm/verifier.rs` **L508–L529**
-- Advice reduction module:
-  - `jolt-core/src/zkvm/claim_reductions/advice.rs` (full file)
-
-### 2) “Trusted commitment in preprocessing-only context” (Advice)
-
-- Untrusted advice: prover commits during proving (`DoryContext::UntrustedAdvice`) and includes commitment in proof.
-  - `jolt-core/src/zkvm/prover.rs` **L636–L667**
-- Trusted advice: commitment/hint computed in preprocessing-only context (`DoryContext::TrustedAdvice`), verifier has commitment; prover just appends it to transcript.
-  - `jolt-core/src/zkvm/prover.rs` **L669–L688**
-- Dory contexts currently supported:
-  - `jolt-core/src/poly/commitment/dory/dory_globals.rs` **L160–L166**
-
-### 3) Single Stage 8 joint opening (Dory batch opening)
-
-Stage 8 collects polynomial claims, samples gamma, combines commitments, and verifies a single opening.
-
-- Stage 8 verifier:
-  - `jolt-core/src/zkvm/verifier.rs` **L542–L691**
-
-Advice polynomials get a **Lagrange embedding factor** so a smaller context polynomial can be batched with main polynomials:
-
-- `compute_advice_lagrange_factor`:
-  - `jolt-core/src/poly/opening_proof.rs` **L635–L672**
-
-## Key batching detail (important for scheduling reductions)
-
-Batched sumcheck instances are “front-loaded” via a **global round offset**:
-
-- Default `round_offset` shifts shorter instances to the **end**:
-  - `jolt-core/src/subprotocols/sumcheck_prover.rs` **L30–L37**
-  - `jolt-core/src/subprotocols/sumcheck_verifier.rs` **L24–L30**
-- `BatchedSumcheck` uses that offset to decide whether an instance is active in a global round:
-  - `jolt-core/src/subprotocols/sumcheck.rs` **L79–L93**
-
-This matters because it explains why Stage 6 “cycle rounds” can align across many instances even if they have different `num_rounds()`.
-
-## Bytecode commitment: what we likely need to commit to
-
-### Bytecode-side “fields” referenced in `compute_val_polys`
-
-From `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L874–L1009**, Val polynomials depend on:
-
-- **Instruction scalar fields**
-  - `instr.address` (a.k.a. unexpanded PC)
-  - `instr.operands.imm`
-- **Circuit flags**: `NUM_CIRCUIT_FLAGS = 13`
-  - `jolt-core/src/zkvm/instruction/mod.rs` **L59–L86**, **L121**
-- **Instruction flags**: `NUM_INSTRUCTION_FLAGS = 7`
-  - `jolt-core/src/zkvm/instruction/mod.rs` **L104–L119**, **L122**
-- **Register operands**: `rd`, `rs1`, `rs2` (used via `eq_r_register[...]` lookup)
-  - This strongly suggests committing to **one-hot indicators** `1_{rd=r}`, `1_{rs1=r}`, `1_{rs2=r}` for all `r` (linear combination with EQ table).
-- **Lookup table selector**
-  - `NUM_LOOKUP_TABLES = LookupTables::<32>::COUNT` (currently 41)
-  - `jolt-core/src/zkvm/lookup_table/mod.rs` **L118–L166**
-- **RAF / interleaving flag**
-  - `!circuit_flags.is_interleaved_operands()` (non-linear in circuit flags, so likely needs its own committed boolean field if we want linear combination only).
-  - `jolt-core/src/zkvm/instruction/mod.rs` **L124–L135**
-
-## Decisions so far (from discussion)
-
-### Commitment granularity + packing (key)
-
-We will **commit to the “atomic” bytecode fields**, but **pack/chunk them so each committed polynomial’s “lane” dimension fits `k_chunk = 2^{log_k_chunk}`**.
-
-- `log_k_chunk` is **either 4 or 8** (so `k_chunk` is **16 or 256**), chosen from trace length:
-  - `jolt-core/src/zkvm/config.rs` **L133–L151**
-
-#### Canonical lane ordering (authoritative)
-
-We fix a canonical total ordering of “lanes” (fields) so packing/chunking is purely mechanical and future-proof:
-
-1. **`rs1` one-hot lanes**: 128 lanes (registers 0..127)
-2. **`rs2` one-hot lanes**: 128 lanes
-3. **`rd` one-hot lanes**: 128 lanes
-4. **`unexpanded_pc` lane** (scalar)
-5. **`imm` lane** (scalar)
-6. **circuit flags** lanes: 13 boolean lanes (`NUM_CIRCUIT_FLAGS`)
-7. **instruction flags** lanes: 7 boolean lanes (`NUM_INSTRUCTION_FLAGS`)
-8. **lookup-table selector** lanes: 41 boolean lanes (`NUM_LOOKUP_TABLES`)
-9. **RAF/interleave flag** lane: 1 boolean lane (`raf_flag := !circuit_flags.is_interleaved_operands()`)
-
-Lane counts:
-- registers: `3 * REGISTER_COUNT = 3 * 128 = 384`
-  - `REGISTER_COUNT` definition: `common/src/constants.rs` **L1–L5**
-- “dense-ish” bytecode fields: `2 + 13 + 7 + 41 + 1 = 64`
-  - flags definitions: `jolt-core/src/zkvm/instruction/mod.rs` **L59–L86** (circuit), **L104–L119** (instruction)
-  - lookup tables count: `jolt-core/src/zkvm/lookup_table/mod.rs` **L118–L166**
-
-Total lanes = **384 + 64 = 448**.
-
-Packing policy:
-- We chunk the lane list into consecutive blocks of size `k_chunk`.
-- Each block becomes one committed “bytecode commitment polynomial”.
-- **`k_chunk=16`**: 448 lanes ⇒ **28 commitments** (exactly `3*(128/16)=24` for registers + `64/16=4` for the rest).
-- **`k_chunk=256`**: 448 lanes ⇒ **2 commitments**:
-  - chunk0: `rs1[0..127] || rs2[0..127]` (256 lanes)
-  - chunk1: `rd[0..127] || (all remaining 64 lanes) || (64 lanes padding)`
-
-Notes:
-- Even though the first 384 lanes are “one-hot structured”, the packing is defined by lanes, so rs1/rs2/rd can be packed together when `k_chunk=256`.
-- We will likely encode all lanes as field elements in the packed polynomial (booleans as 0/1), but **the representation choice (dense vs specialized one-hot)** is still an implementation detail (see Remaining plan questions below).
-
-### Embedding policy
-
-We will **not** require the main Dory matrix to grow to fit bytecode commitments. Instead we:
-
-- keep each bytecode-commit polynomial within the main `k_chunk` address-dimension, and
-- use a claim reduction (Stage 6→7) so these commitments can be batched into the single Stage 8 opening, similar to advice.
-
-### Domain / padding
-
-Bytecode commitments use the same **padding-to-power-of-two** policy as other committed polynomials:
-
-- the “instruction index” dimension is padded to a power of 2 (like other `T`-style dimensions).
-- the “lane/index” dimension is `k_chunk` (16 or 256), with unused lanes zero-padded.
-
-### Ownership / preprocessing storage
-
-Bytecode commitments should behave like **trusted preprocessing**:
-
-- verifier has them in shared preprocessing (like trusted advice commitment is “known” to verifier),
-- we define an enum where shared preprocessing stores **either**:
-  - raw bytecode (`BytecodePreprocessing`), **or**
-  - commitments (+ minimal metadata).
-
-## Remaining plan questions (to settle before coding)
-
-1. **Representation / PCS support for packed bytecode polynomials**:
-   - Packing into `k_chunk` lanes means each packed polynomial has `k_chunk * bytecode_len` coefficients (very large).
-   - We likely need a **streaming / implicit** polynomial representation (similar in spirit to `RLCPolynomial`) so Stage 8 can include bytecode commitments in the joint opening without materializing all coefficients.
-2. **“rs1+rs2 as one-hot” wording (important clarity)**:
-   - A single `OneHotPolynomial` can only select **one** lane index per column.
-   - Packing `rs1` and `rs2` into the same 256-lane chunk means two 1s per instruction; this may need to be represented as a packed dense-bool polynomial (still sparse), or via a different encoding.
-3. **Reduction batching**: we want **one** `BytecodeClaimReduction` sumcheck that batches all bytecode commitments and normalizes to the unified point (like `AdviceClaimReduction` + `HammingWeightClaimReduction` patterns).
-4. **Stage 6 refactor** (required for mid-stage emission):
-   - Stage 6 must split into **Stage 6a (log_K)** and **Stage 6b (log_T)** so bytecode-field claims emitted after the address rounds can be consumed immediately.
-   - This also requires splitting `Booleanity` into address/cycle sumchecks (it is internally two-phase today):
-     - `jolt-core/src/subprotocols/booleanity.rs` **L399–L453** (phase switch), **L455–L478** (cache_openings)
-5. **Exact API surface**:
-   - what concrete type should live in `JoltSharedPreprocessing` for the commitment-only variant (commitments-only vs commitments+opening hints)?
-   - which `SumcheckId` values should be used for the new reduction’s intermediate/final cached openings?
-
----
-
-## BytecodeReadRaf Stage 6a: what claims should be emitted?
-
-The “emission point” is already explicit in the prover today: it happens right when we transition from the first `log_K` (address) rounds into the remaining `log_T` (cycle) rounds.
-
-In `BytecodeReadRafSumcheckProver::init_log_t_rounds`:
-
-- The prover computes the 5 stage-specific scalars:
-  - `poly.final_sumcheck_claim()` for each stage Val polynomial, plus the RAF-injected identity contribution for stages 1 and 3:
-    - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L307–L335**
-- It also finalizes the address point by reversing the collected low-to-high challenges:
-  - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L337–L340**
-
-Those 5 scalars are stored in:
-
-- `self.bound_val_evals: Option<[F; 5]>`
-  - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L320–L335**
-
-**Stage 6a should emit exactly these 5 scalars as “bytecode field claims”**, keyed by a new `SumcheckId` / `OpeningId`, with opening point = the address point `r_address` produced at the end of the address rounds.
-
-Implementation detail we’ll likely choose:
-
-- Emit **Val-only** claims `Val_s(r_address)` (no RAF Int injected), and let `BytecodeReadRaf` add the constant RAF terms itself (since `Int(r_address)=1`).
-  - Today RAF is injected in `bound_val_evals` at **L324–L331**; we can split this for cleaner “bytecode-only” claim reduction.
-
-Why this is the “right” interface:
-
-- Stage 6b (the cycle-phase continuation of BytecodeReadRaf) needs these 5 scalars as weights for the remaining `log_T` rounds (today they’re read from `bound_val_evals` during the `round >= log_K` branch).
-
-## BytecodeClaimReduction: what it should prove (high level)
-
-We mirror the structure of `AdviceClaimReduction` (`jolt-core/src/zkvm/claim_reductions/advice.rs`), but with different “payload polynomials” and a simpler address schedule thanks to `k_chunk`.
-
-### Inputs (from Stage 6a)
-
-- The 5 “Val stage” claims:
-  - `c_s := Val_s(r_bc)` for `s ∈ {1..5}`, where `r_bc` is the Stage 6a address point (bytecode-index point).
-- The point `r_bc` itself (implicitly stored as the opening point associated with `c_s`).
-
-### Witness (committed) polynomials
-
-Let `B_i` be the committed bytecode chunk polynomials induced by the canonical lane ordering.
-
-- `i ∈ [0, n_chunks)` where `n_chunks = ceil(448 / k_chunk)`:
-  - `k_chunk=16` ⇒ `n_chunks=28`
-  - `k_chunk=256` ⇒ `n_chunks=2`
-  - See lane spec above.
-
-Each `B_i` is a polynomial over:
-- **lane/address vars**: `log_k_chunk`
-- **bytecode-index vars**: `log_K_bytecode` (padded / embedded as needed; see “bytecode_len vs trace_len” note below)
-
-### The identity to prove (batched)
-
-Define a per-stage lane weight table `w_s[lane]` derived from:
-- stage gammas sampled in `BytecodeReadRafSumcheckParams::gen`:
-  - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L738–L742**
-- register EQ tables (`eq_r_register_4`, `eq_r_register_5`) and the stage formulas in `compute_val_polys`:
-  - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L752–L783**, **L874–L1009**
-
-Then for each stage:
-
-- \(c_s = \sum_{lane,k} B[lane,k] \cdot w_s[lane] \cdot \mathrm{eq}(r_{bc}, k)\)
-
-We will batch the 5 stages with a transcript challenge \( \eta \) (powers), so the sumcheck instance has a **single scalar input claim**:
-
-- \(C_{\text{in}} = \sum_s \eta^s \cdot c_s\)
-
-and proves:
-
-- \(C_{\text{in}} = \sum_{lane,k} B[lane,k] \cdot W_{\eta}(lane) \cdot \mathrm{eq}(r_{bc}, k)\)
-  - where \(W_{\eta}(lane) := \sum_s \eta^s \cdot w_s[lane]\)
-
-This keeps verifier complexity small: evaluating \(W_{\eta}\) at a point costs `O(k_chunk)` and computing \(\mathrm{eq}(r_{bc}, \cdot)\) uses `EqPolynomial`.
-
-### Reduction target (Stage 8 compatibility)
-
-BytecodeClaimReduction will run in two phases like advice:
-
-- **Phase 1 (Stage 6b)**: bind the bytecode-index variables (cycle-phase rounds).
-  - Cache an intermediate claim (like `AdviceClaimReductionCyclePhase`).
-- **Phase 2 (Stage 7)**: bind the lane variables (`log_k_chunk` rounds).
-  - When each `B_i` is fully bound (len==1), cache its final opening `B_i(final_point)` for batching into Stage 8.
-
-Verifier then reconstructs the stage-6a claim(s) from:
-- the final `B_i(final_point)` openings,
-- the scalar `EqPolynomial::mle(r_bc, final_point_k)`,
-- the scalar `W_eta(final_point_lane)`,
-exactly analogous to `AdviceClaimReductionVerifier::expected_output_claim`.
-
-### bytecode_len vs trace_len (defensive padding)
-
-If `bytecode_len > padded_trace_len` (rare but possible for “mostly dead code”), we need to ensure:
-- the main Dory URS / generators are large enough, and
-- any “bytecode index variable count” that is driven by Stage 6 cycle rounds has enough randomness.
-
-Pragmatic policy:
-- set `padded_trace_len = max(padded_trace_len, bytecode_len.next_power_of_two())` *when bytecode commitments are enabled*,
-  similar in spirit to `adjust_trace_length_for_advice` in `jolt-core/src/zkvm/prover.rs`.
-
-### Preliminary “field count” if committed separately (worst-case baseline)
-
-If we commit one polynomial per “atomic linear field”:
-
-- `pc` + `imm`: **2**
-- circuit flags: **13**
-- instruction flags: **7**
-- register one-hots: **3 * REGISTER_COUNT**
-  - Note: `REGISTER_COUNT = 32 (RISC-V) + 96 (virtual) = 128` in this repo
-    - `common/src/constants.rs` **L1–L5**
-- lookup table one-hots: **41**
-- raf/interleave flag: **1**
-
-Total baseline (with `REGISTER_COUNT=128`): **2 + 13 + 7 + 384 + 41 + 1 = 448 polynomials**.
-
-This is too many to *open individually*, but may be fine if we **derive only a few linear-combo commitments** (see open design options below).
-
-## Proposed direction (high-level)
-
-Goal: make verifier’s `BytecodeReadRaf` expected-claim computation **not materialize or evaluate length-K `val_polys`**, and instead consume **opening claims** that are later checked against a **trusted bytecode commitment** via Stage 8.
-
-Key idea: mirror advice:
-
-- **(A) Commit to bytecode (trusted preprocessing)**
-  - Add a dedicated Dory context (e.g. `DoryContext::Bytecode`) whose matrix is a top-left block of main, like advice.
-  - Verifier has these commitments “for free” (hard-coded / preprocessing).
-
-- **(B) Emit bytecode-related evaluation claims during Stage 6**
-  - Similar to how advice emits `RamValEvaluation` openings that later get reduced, `BytecodeReadRaf` should stop evaluating `val_polys` itself and instead *read* an opening claim (or small number of claims) from the opening accumulator.
-
-- **(C) New two-phase “BytecodeClaimReduction” sumcheck**
-  - Stage 6 phase: bind cycle-derived coordinates (last `log_T` rounds)
-  - Stage 7 phase: bind address-derived coordinates (`log_k_chunk` rounds)
-  - Cache final opening(s) so Stage 8 can batch them.
-
-- **(D) Stage 8 batches bytecode commitments**
-  - Include bytecode commitment(s) and reduced claim(s) in `polynomial_claims` with an embedding/Lagrange factor (same pattern as advice).
-
-## Open design questions (need alignment before coding)
-
-1. **Embedding feasibility**
-   - Bytecode commitment context must fit in main Dory matrix: need `(sigma_bytecode <= sigma_main)` and `(nu_bytecode <= nu_main)`.
-   - If program has **small trace length but huge bytecode**, do we:
-     - pad `T` upward (like `adjust_trace_length_for_advice`), or
-     - allow a second opening / separate Stage 8, or
-     - impose a constraint “recursion requires T big enough”?
-
-2. **Granularity**
-   - Commit per field (many polynomials), or
-   - commit a smaller set + derive per-stage Val polynomials by linear combinations of commitments, or
-   - pack fields into one polynomial `p(k, idx)` (but then Val is *not* a simple linear combo of `p` at one point; needs more thought).
-
-3. **How many bytecode “claims” should Stage 6 consume?**
-   - 5 claims (one per stage Val polynomial), or
-   - 1 claim (random linear combo of stage Vals, or another fixed fold) to minimize downstream reduction/opening cost.
-
-4. **Where should the “initial” bytecode openings live?**
-   - As `OpeningId::Committed(CommittedPolynomial::..., SumcheckId::BytecodeReadRaf)` entries, analogous to other committed openings, or
-   - a new `OpeningId` variant (like `TrustedAdvice(...)`) if we need special casing.
-
-5. **Commitment ownership**
-   - Should bytecode commitments be stored inside `JoltSharedPreprocessing` / `JoltVerifierPreprocessing`, or passed separately like `trusted_advice_commitment`?
-
-6. **Transcript binding**
-   - We likely need to append trusted bytecode commitment(s) to the transcript in `JoltVerifier::verify` (similar to trusted advice):
-     - `jolt-core/src/zkvm/verifier.rs` **L190–L203**
-
----
-
-## Next steps (for plan agreement)
-
-1. Decide **commit granularity** (per-field vs derived vs packed) with a target of minimizing **recursive verifier cycles**.
-2. Decide **embedding policy** when bytecode is larger than main Dory dims.
-3. Define the **exact claims** `BytecodeReadRaf` will consume (count + meaning).
-4. Define the new **BytecodeClaimReduction** parameters (analogous to `AdviceClaimReductionParams`) and which Stage 6/7 rounds it occupies.
-
----
-
-## Progress update (2026-01-20)
-
-High-level status (diff vs main):
-- Stage 6 split into 6a/6b with new proofs and wiring in prover/verifier (`jolt-core/src/zkvm/proof_serialization.rs` **L28–L41**; `jolt-core/src/zkvm/prover.rs` **L525–L534**, **L1151–L1394**; `jolt-core/src/zkvm/verifier.rs` **L225–L233**, **L430–L571**).
-- Booleanity split into address/cycle sumchecks; advice round alignment updated (`jolt-core/src/subprotocols/booleanity.rs` **L497–L736**; `jolt-core/src/poly/opening_proof.rs` **L157–L158**; `jolt-core/src/zkvm/witness.rs` **L285–L287**; `jolt-core/src/zkvm/claim_reductions/advice.rs` **L521–L526**).
-- BytecodeReadRaf split + staged Val claims + committed verifier Stage 6a path wired (`jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L1261–L1288**, **L1554–L1636**; `jolt-core/src/zkvm/verifier.rs` **L430–L455**).
-- BytecodeClaimReduction implemented with canonical lane ordering and BytecodeChunk openings (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L1–L15**, **L193–L236**, **L470–L488**, **L494–L671**).
-- Bytecode commitment plumbing is in place (BytecodeMode + preprocessing + VerifierBytecode), and commitment derivation + Stage 8 batching/folding are now implemented (see next update).
-
-Immediate next steps:
-1. Add/enable tests (lane ordering, committed mode e2e, Stage 8 folding) and remove ignores once committed mode is fully wired (`jolt-core/src/zkvm/tests.rs` **L426–L486**).
-2. Optimize bytecode VMV contribution in streaming RLC (current path iterates `K * k_chunk * num_chunks`) (`jolt-core/src/poly/rlc_polynomial.rs` **L420–L485**).
-3. Enforce or document the `log_T >= log_K_bytecode` requirement for Stage 8 folding; decide whether to lift this (see “log_K > log_T” discussion below).
-4. Expose Committed bytecode mode in the SDK (opt-in): macro-generated committed preprocessing + committed proving entrypoint / `BytecodeMode` parameter (see “TODO (SDK): expose Committed bytecode mode end-to-end” above).
-
-Concerns / risks:
-- BytecodeClaimReduction still materializes `weight_chunks` and `bytecode_chunks` of size `k_chunk * K_bytecode` (no longer `k_chunk * T`), but this can be large for big bytecode (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L190–L218**).
-- Streaming RLC bytecode contribution currently iterates `K * k_chunk * num_chunks` (needs optimization) (`jolt-core/src/poly/rlc_polynomial.rs` **L420–L485**).
-
----
-
-## Progress update (2026-01-20, continued)
-
-High-level status (diff vs previous update):
-- BytecodeClaimReduction now runs over `log_K` (no `log_T` padding) and consumes `r_bc` directly (`jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L73–L215**).
-- Canonical lane ordering + lane value logic centralized in `bytecode::chunks`, used by both commitment derivation and claim reduction (`jolt-core/src/zkvm/bytecode/chunks.rs` **L11–L138**).
-- `TrustedBytecodeCommitments::derive` implemented and commits in a dedicated `DoryContext::Bytecode`, carrying `log_k_chunk` + `bytecode_len` metadata (`jolt-core/src/zkvm/bytecode/mod.rs` **L33–L79**; `jolt-core/src/poly/commitment/dory/dory_globals.rs` **L154–L171**).
-- Stage 8 now *folds bytecode chunk openings into the joint opening proof* via a Lagrange selector over missing cycle vars (prover+verifier) (`jolt-core/src/zkvm/prover.rs` **L1618–L1664**; `jolt-core/src/zkvm/verifier.rs` **L741–L788**).
-- Streaming RLC now supports bytecode chunk contributions in the VMV pass (`jolt-core/src/poly/rlc_polynomial.rs` **L420–L539**).
-
----
-
-## Progress update (2026-01-20, AddressMajor correctness)
-
-Status:
-- **Committed bytecode now passes in both layouts** (CycleMajor + AddressMajor). In particular,
-  `fib_e2e_committed_bytecode_address_major` passes.
-
-Root cause:
-- Under `DoryLayout::AddressMajor`, the bytecode chunk coefficient order makes
-  `BindingOrder::LowToHigh` bind **lane/address** bits first. But `BytecodeClaimReduction` Phase 1
-  (Stage 6b) must bind **cycle** bits first to match the staged `r_bc` semantics.
-
-Fix:
-- Keep bytecode commitments in the layout’s native order for Dory opening, but in the **claim
-  reduction prover** permute AddressMajor chunk coefficients into **CycleMajor** order so Phase 1
-  binds cycle variables first.
-- Implemented by `permute_address_major_to_cycle_major` and applied in
-  `BytecodeClaimReductionProver::initialize` (`jolt-core/src/zkvm/claim_reductions/bytecode.rs`).
-
----
-
-## Remaining work (as of 2026-01-20)
-
-Cleanup / correctness hardening:
-- Remove temporary debug-only code in `jolt-core/src/zkvm/tests.rs` (env-var gated bytecode/Dory open checks).
-- Add the new module file to git: `jolt-core/src/zkvm/bytecode/chunks.rs` is currently untracked in `git status`.
-
-Perf / scalability:
-- Optimize Stage 8 bytecode VMV contribution (currently iterates `K * k_chunk * num_chunks`) (`jolt-core/src/poly/rlc_polynomial.rs`).
-- Consider making `BytecodeClaimReduction` avoid materializing `k_chunk * K_bytecode` dense polynomials (streaming / implicit evaluation).
-
-Repo hygiene:
-- Before committing: run `cargo fmt` and `cargo clippy` and fix warnings.
-
-## Handling arbitrary `log_K` vs `log_T` (design sketch, not pursued)
-
-We may want to allow `log_K_bytecode > log_T` without a separate opening proof by **padding the cycle dimension** and embedding all trace-derived polynomials into a larger main opening domain.
-
-### Padding semantics: selector vs repetition
-
-There are two incompatible padding semantics today:
-
-1) **Selector padding (zero outside domain)**  
-   Embed a polynomial `P(a, c)` defined on `c ∈ {0,1}^{log_T}` into a larger `c' ∈ {0,1}^{log_T'}` (`log_T' = max(log_T, log_K)`) via:
-   - `P'(a, c, z) = P(a, c) · ∏_{i=1..Δ} (1 - z_i)`, where `Δ = log_T' - log_T`
-   - So `P' = P` when `z=0…0` and **0** elsewhere.
-
-2) **Repetition padding (independent vars)**  
-   Treat `P` as independent of the extra variables, so it repeats across them.
-   - In sumcheck batching, inactive rounds are dummy constants, which implies repetition.
-   - Batched sumcheck multiplies the input claim by `2^Δ` (see `BatchedSumcheck` in `jolt-core/src/subprotocols/sumcheck.rs` **L52–L91**).
-
-**Important:** selector padding and repetition padding are not equivalent; they lead to different claims and different opening proofs. Current sumcheck batching implements repetition padding.
-
-### What would need to change (high-level steps)
-
-To support arbitrary `log_K` and `log_T` while keeping a *single* Stage 8 opening:
-
-1) **Stage 6b round count becomes `log_T' = max(log_T, log_K)`**
-   - All cycle-phase instances must run in a batched sumcheck of length `log_T'`.
-   - Instances with `log_T` rounds become inactive for the first `Δ` rounds (front-loaded).
-
-2) **BatchedSumcheck must support selector padding**
-   - Today, inactive rounds use a constant univariate and the input claim is scaled by `2^Δ` (repetition semantics).
-   - To get selector padding, inactive rounds must instead use `H(z) = prev · (1 - z)` and **no `2^Δ` scaling**.
-   - This requires new per-instance hooks (inactive-round univariate + scaling policy) in `BatchedSumcheck` (`jolt-core/src/subprotocols/sumcheck.rs` **L52–L91**).
-
-3) **Main Dory matrix size uses `T'`**
-   - Stage 8’s main context must be initialized with `T'`, not the trace length.
-   - This affects the unified opening point and all VMV paths (`jolt-core/src/zkvm/prover.rs` **L1493–L1498**, `jolt-core/src/zkvm/verifier.rs` **L653–L661**).
-
-4) **All trace-derived polynomials must be embedded with selector padding**
-   - Add a Lagrange selector `∏(1 - r_extra)` to **every** claim whose cycle dimension is `log_T`.
-   - This includes dense polys and all RA polys (not just bytecode). The bytecode folding logic already does this (see `jolt-core/src/zkvm/prover.rs` **L1618–L1664** and `jolt-core/src/zkvm/verifier.rs` **L741–L788**).
-
-5) **Commitment and streaming need a zero-padding mode**
-   - Current trace padding uses `Cycle::NoOp`, which does **not** imply zero rows for all polynomials.
-   - For selector padding, padded cycles must contribute zero for **all** polynomials; this requires a new “zero row” padding mode in witness generation and streaming VMV.
-
-### Why this is not pursued now
-
-This change is cross-cutting and affects:
-- Batched sumcheck semantics,
-- Stage 6b scheduling,
-- Main Dory context sizing,
-- Stage 8 claim embedding for *all* polynomials,
-- Streaming witness/VMV paths.
-
-Given scope and risk, we are **not pursuing arbitrary `log_K` vs `log_T` support right now**. The current design assumes `log_T >= log_K` for the folded Stage 8 bytecode opening path.
-
----
-
-## Detailed implementation plan (agreed direction)
-
-This section is an implementation checklist in dependency order.
-
-### Step 1 — Refactor Stage 6 into two substages (6a + 6b)
-
-**Status (2026-01-20)**: DONE  
-- Proof split + serialization: `jolt-core/src/zkvm/proof_serialization.rs` **L28–L41**.  
-- Prover 6a/6b wiring: `jolt-core/src/zkvm/prover.rs` **L525–L534**, **L1151–L1394**.  
-- Verifier 6a/6b wiring: `jolt-core/src/zkvm/verifier.rs` **L225–L233**, **L430–L571**.
-
-**Goal**: make “end of BytecodeReadRaf address rounds” a real stage boundary so we can:
-- emit `Val_s(r_bc)` claims **immediately** after binding `r_bc`,
-- start `BytecodeClaimReduction` during the subsequent **cycle** randomness (what will become Stage 6b),
-- avoid verifier doing any \(O(K_{\text{bytecode}})\) work.
-
-#### 1.1 Proof object / serialization changes
-
-- Split `stage6_sumcheck_proof` into:
-  - `stage6a_sumcheck_proof` (address rounds)
-  - `stage6b_sumcheck_proof` (cycle rounds)
-- Transcript ordering: **run Stage 6a sumcheck → append Stage 6a claims → run Stage 6b sumcheck → append Stage 6b claims** (breaking change OK).
-- Files:
-  - `jolt-core/src/zkvm/proof_serialization.rs` (`JoltProof` struct)
-  - any serialize/deserialize helpers that assume a single Stage 6 proof.
-
-#### 1.2 Prover plumbing
-
-- In `jolt-core/src/zkvm/prover.rs`:
-  - Replace `prove_stage6()` with `prove_stage6a()` + `prove_stage6b()`.
-  - Update the main `prove()` flow to call both and store both proofs.
-  - Stage 6 instances currently assembled at `prover.rs` **L1206–L1214** must be split across 6a/6b.
-
-Target contents:
-- **Stage 6a (max rounds = `max(log_K_bytecode, log_k_chunk)`)**:
-  - `BytecodeReadRafAddr` (new; `log_K_bytecode` rounds)
-  - `BooleanityAddr` (new; `log_k_chunk` rounds; will be active only in last `log_k_chunk` rounds via front-loaded batching)
-- **Stage 6b (max rounds = `log_T`)**:
-  - `BytecodeReadRafCycle` (new; `log_T` rounds)
-  - `BooleanityCycle` (new; `log_T` rounds)
-  - existing Stage-6 cycle-only instances (unchanged logic, just move them here):
-    - `RamHammingBooleanity` (`log_T`)
-    - `RamRaVirtualization` (`log_T`)
-    - `InstructionRaVirtualization` (`log_T`)
-    - `IncClaimReduction` (`log_T`)
-    - AdviceClaimReduction Phase 1 (if present) **needs a `round_offset` update** because Stage 6b `max_num_rounds` will now be `log_T` (see Step 2.3).
-  - `BytecodeClaimReduction` phase 1 (new; `log_T` rounds; see Step 4)
-
-#### 1.3 Verifier plumbing
-
-- In `jolt-core/src/zkvm/verifier.rs`:
-  - Replace `verify_stage6()` with `verify_stage6a()` + `verify_stage6b()`.
-  - Update the main `verify()` call chain to include both.
-
-### Step 2 — Split Booleanity into two sumchecks (address + cycle)
-
-**Status (2026-01-20)**: DONE  
-- Address/cycle split + addr-claim chaining: `jolt-core/src/subprotocols/booleanity.rs` **L497–L736**; `jolt-core/src/zkvm/witness.rs` **L285–L287**; `jolt-core/src/poly/opening_proof.rs` **L157–L158**.  
-- Advice round_offset fix: `jolt-core/src/zkvm/claim_reductions/advice.rs` **L521–L526**.
-
-Reason: `Booleanity` is currently a *single* sumcheck with an internal phase transition at `log_k_chunk`:
-- `jolt-core/src/subprotocols/booleanity.rs` **L399–L446**
-
-But Stage 6 is becoming two proofs, so Booleanity must be representable as two separate sumcheck instances.
-
-#### 2.1 New sumcheck instances
-
-Create:
-- `BooleanityAddressSumcheck` (`num_rounds = log_k_chunk`)
-- `BooleanityCycleSumcheck` (`num_rounds = log_T`)
-
-We will reuse most of the existing prover state splitting exactly at the current transition:
-- address phase ends where today `eq_r_r` is computed and `H` is initialized (**L415–L445**)
-- cycle phase reuses `D` and `H` binding (**L446–L452**)
-
-#### 2.2 Chaining between 6a and 6b (important)
-
-To make `BooleanityCycle` a standalone sumcheck, it needs an **input claim**:
-- the output of `BooleanityAddress`, i.e. the partially summed claim after binding `r_address`.
-
-We will follow the **AdviceClaimReduction** pattern:
-- Stage 6a prover computes this intermediate claim and stores it in the opening accumulator under a new `SumcheckId` (see Step 5).
-- Stage 6a verifier treats that stored claim as the expected output of `BooleanityAddress`.
-- Stage 6b `BooleanityCycle` uses that stored claim as its `input_claim`.
-
-This avoids needing BatchedSumcheck to “return per-instance output claims”.
-
-#### 2.3 Update advice reduction round alignment (PINNED)
-
-`AdviceClaimReductionProver::round_offset` currently assumes Stage 6 max rounds includes `log_k_chunk + log_T` (it aligns to the start of Booleanity’s cycle segment).
-With Stage 6b max rounds = `log_T`, this must be updated to avoid underflow and to align to Stage 6b round 0.
-
-File:
-- `jolt-core/src/zkvm/claim_reductions/advice.rs` (`round_offset` in both prover+verifier impls)
-
-### Step 3 — Split BytecodeReadRaf into two sumchecks (address + cycle)
-
-**Status (2026-01-20)**: DONE (split + staged claims + committed verifier wired).  
-- Stage 6a emits Val-only claims: `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L838–L875**.  
-- Verifier fast path uses staged claims: `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L1427–L1445**.  
-- Committed verifier uses bytecode-agnostic params in Stage 6a: `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L1261–L1288**, **L1554–L1636**; `jolt-core/src/zkvm/verifier.rs` **L430–L455**.
-
-Reason: we need a real stage boundary right after binding `r_bc` (bytecode-index address point), because:
-- `Val_s(r_bc)` is computed exactly at the transition today in `init_log_t_rounds`
-  - `jolt-core/src/zkvm/bytecode/read_raf_checking.rs` **L307–L340**
-
-#### 3.1 New sumcheck instances
-
-Create:
-- `BytecodeReadRafAddressSumcheck` (`num_rounds = log_K_bytecode`)
-- `BytecodeReadRafCycleSumcheck` (`num_rounds = log_T`)
-
-#### 3.2 Stage 6a emissions (the key interface)
-
-At the end of address rounds (today’s `init_log_t_rounds` boundary):
-- emit **Val-only** claims:
-  - `c_s := Val_s(r_bc)` for `s=1..5`
-  - RAF terms are *not* included; verifier can add them succinctly because `Int(r_bc)=1`.
-- batch these 5 claims with a random \(\eta\) in later reduction (Step 4), but still store the 5 scalars in the opening map.
-
-Also emit the **cycle-phase input claim** for `BytecodeReadRafCycle`:
-- this is the output claim of the address-only sumcheck (the partially summed value over cycle variables).
-
-Both kinds of values must land in `opening_claims` so the verifier has them without recomputation.
-
-### Step 4 — Implement `BytecodeClaimReduction` (two-phase, single instance)
-
-**Status (2026-01-20)**: PARTIAL (sumcheck + openings done; Stage 8 batching pending).  
-- Claim reduction + lane ordering + weight construction: `jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L1–L15**, **L193–L236**, **L494–L671**.  
-- Emits BytecodeChunk openings (Phase 2): `jolt-core/src/zkvm/claim_reductions/bytecode.rs` **L470–L488**.
-
-This is the new sumcheck that replaces verifier’s \(O(K_{\text{bytecode}})\) evaluation of `val_polys`.
-
-#### 4.1 High-level role
-
-Input: the 5 `Val_s(r_bc)` scalars from Stage 6a.
-
-Output: a set of committed-polynomial openings for the **bytecode commitment chunk polynomials** at the unified Dory opening point, so Stage 8 can batch them.
-
-#### 4.2 Batching the 5 stage claims
-
-We will batch the 5 `Val_s(r_bc)` using a transcript challenge \(\eta\):
-
-- \(C_{\text{in}} = \sum_s \eta^s \cdot Val_s(r_{bc})\)
-
-and prove this equals a single linear functional of the committed bytecode polynomials:
-
-- \(C_{\text{in}} = \sum_{lane,k} B[lane,k] \cdot W_{\eta}(lane) \cdot \mathrm{eq}(r_{bc}, k)\)
-
-No per-lane openings are needed; correctness follows from linearity.
-
-#### 4.3 Two phases aligned to new stages
-
-- **Phase 1 (Stage 6b)**: bind the bytecode-index variables using Stage 6b cycle challenges.
-  - cache an intermediate claim (like `AdviceClaimReductionCyclePhase`) to start Phase 2.
-- **Phase 2 (Stage 7)**: bind the lane variables (`log_k_chunk` rounds).
-  - when each chunk polynomial is fully bound, cache its final opening for Stage 8.
-
-The address phase should be simpler than advice because lane vars = exactly `log_k_chunk` (no partial consumption).
-
-### Step 5 — `SumcheckId` / opening bookkeeping (naming + flow)
-
-**Status (2026-01-20)**: DONE  
-- SumcheckId additions: `jolt-core/src/poly/opening_proof.rs` **L136–L162**.  
-- VirtualPolynomial additions: `jolt-core/src/zkvm/witness.rs` **L242–L287**.
-
-#### 5.1 How `SumcheckId` actually enters the proving / verifying flow
-
-`SumcheckId` is part of the **key** used to store scalar claims in the opening accumulator maps.
-Concretely, the key type is `OpeningId`, and it embeds `SumcheckId`:
-
-- `OpeningId::Committed(CommittedPolynomial, SumcheckId)`
-- `OpeningId::Virtual(VirtualPolynomial, SumcheckId)`
-- `OpeningId::TrustedAdvice(SumcheckId)` / `OpeningId::UntrustedAdvice(SumcheckId)`
-  - `jolt-core/src/poly/opening_proof.rs` **L136–L175**
-
-**Prover side**: each sumcheck instance labels the claims it emits in `cache_openings(...)` by calling `ProverOpeningAccumulator::append_*` with a `SumcheckId`.
-Those become entries in `opening_claims` (serialized into the proof).
-
-**Verifier side**: the verifier is initialized with these claim scalars already present (from `opening_claims`), and each instance’s `cache_openings(...)` uses the same `SumcheckId` to populate the **opening point** for the existing claim (and to keep the transcript in sync).
-
-#### 5.2 Why advice has two `SumcheckId`s (`...CyclePhase` and final)
-
-Advice claim reduction spans Stage 6 → Stage 7, so it must store:
-
-- an **intermediate** scalar after Phase 1 (cycle binding), and
-- the **final** advice evaluation after Phase 2 (address binding).
-
-This is why `SumcheckId` has both:
-
-- `AdviceClaimReductionCyclePhase` (intermediate)
-- `AdviceClaimReduction` (final)
-  - `jolt-core/src/poly/opening_proof.rs` **L157–L160**
-
-Where it’s used:
-
-- Phase 2 starts from the Phase 1 intermediate:
-  - `AdviceClaimReductionParams::input_claim` (AddressVariables case):
-    - `jolt-core/src/zkvm/claim_reductions/advice.rs` **L190–L216**
-- Phase 1 and Phase 2 both cache openings under their respective IDs:
-  - `AdviceClaimReductionProver::cache_openings`:
-    - `jolt-core/src/zkvm/claim_reductions/advice.rs` **L466–L518**
-
-So neither is unused; they identify *two different stored claims*.
-
-#### 5.3 Naming rule of thumb (must match variable order)
-
-Two-phase protocols in this repo come in **both** variable orders:
-
-- **cycle → address**: advice claim reduction, bytecode claim reduction
-- **address → cycle**: booleanity, bytecode read+raf
-
-So the naming should reflect **what phase 1 binds**:
-
-- `XCyclePhase`: output claim after Phase 1 binds the **cycle-derived** variables
-- `XAddressPhase`: output claim after Phase 1 binds the **address-derived** variables
-- `X` (or `XFinal`): final output after all variables are bound
-
-For protocols we split into two physical sumchecks (Stage 6a + 6b) but want downstream stability:
-
-- keep the existing “final” `SumcheckId` if other modules already key off it (e.g. `HammingWeightClaimReduction` expects `SumcheckId::BytecodeReadRaf` today),
-- add a new `...AddressPhase` id for the Stage 6a pre-phase when the protocol binds address first.
-
-#### 5.4 Concrete `SumcheckId` changes for this rollout
-
-File to update:
-- `jolt-core/src/poly/opening_proof.rs` (`SumcheckId` enum)
-
-We will add:
-
-- **Address → cycle protocols (Stage 6 split)**:
-  - `BytecodeReadRafAddressPhase` (new; Stage 6a sumcheck; binds **address** first)
-  - `BooleanityAddressPhase` (new; Stage 6a sumcheck; binds **address** first)
-  - keep `BytecodeReadRaf` and `Booleanity` as the “final” IDs (Stage 6b sumchecks + cached openings) so downstream modules that key off them (e.g. HW reduction) remain stable.
-
-- **Cycle → address protocols (two-phase reductions)**:
-  - `BytecodeClaimReductionCyclePhase` (new; phase 1 output after binding **cycle** vars in Stage 6b)
-  - `BytecodeClaimReduction` (new; final output after binding **lane/address** vars in Stage 7)
-  - (existing) `AdviceClaimReductionCyclePhase` / `AdviceClaimReduction` already follow this pattern.
-
-We will also add **new `VirtualPolynomial` variants** for scalar claims that are *not* openings of committed polynomials:
-
-- **Stage 6a (BytecodeReadRafAddressPhase)**:
-  - `VirtualPolynomial::BytecodeValStage(usize)` for the 5 Val-only claims.
-  - `VirtualPolynomial::BytecodeReadRafAddrClaim` for the address-phase output claim that seeds the cycle-phase sumcheck.
-- **Stage 6a (BooleanityAddressPhase)**:
-  - `VirtualPolynomial::BooleanityAddrClaim` for the address-phase output claim that seeds the cycle-phase sumcheck.
-- **Stage 6b → Stage 7 (BytecodeClaimReduction)**:
-  - `VirtualPolynomial::BytecodeClaimReductionIntermediate` for the cycle-phase intermediate claim (analogous to advice’s `...CyclePhase`), used as Stage 7 input.
-
-#### 5.5 Quick “protocol → variable order → IDs” table (sanity)
-
-- **BytecodeReadRaf**: address → cycle
-  - Stage 6a: `SumcheckId::BytecodeReadRafAddressPhase`
-  - Stage 6b: `SumcheckId::BytecodeReadRaf` (final)
-- **Booleanity**: address → cycle
-  - Stage 6a: `SumcheckId::BooleanityAddressPhase`
-  - Stage 6b: `SumcheckId::Booleanity` (final)
-- **BytecodeClaimReduction**: cycle → lane/address
-  - Stage 6b: `SumcheckId::BytecodeClaimReductionCyclePhase` (intermediate stored)
-  - Stage 7: `SumcheckId::BytecodeClaimReduction` (final)
-- **AdviceClaimReduction** (existing): cycle → address (two-phase)
-  - Stage 6: `SumcheckId::AdviceClaimReductionCyclePhase`
-  - Stage 7: `SumcheckId::AdviceClaimReduction`
-
-### Step 6 — Bytecode commitments in preprocessing + transcript
-
-**Status (2026-01-20)**: DONE (functionality)  
-- Bytecode commitment plumbing added (types + preprocessing + proof field): `jolt-core/src/zkvm/bytecode/mod.rs` **L30–L111**; `jolt-core/src/zkvm/prover.rs` **L1688–L1760**; `jolt-core/src/zkvm/verifier.rs` **L840–L976**; `jolt-core/src/zkvm/proof_serialization.rs` **L43–L47**.  
-- Commitment derivation implemented: `TrustedBytecodeCommitments::derive` in `jolt-core/src/zkvm/bytecode/mod.rs`.  
-- Canonical lane ordering + lane materialization centralized in `jolt-core/src/zkvm/bytecode/chunks.rs` (used by both commitment derivation and claim reduction).
-
-#### 6.1 New Dory context + storage
-
-Add a new `DoryContext::Bytecode` (like Trusted/UntrustedAdvice) so we can commit to bytecode chunk polynomials in preprocessing and hand the commitments to the verifier.
-
-Update shared preprocessing to store either:
-- raw `BytecodePreprocessing`, or
-- `{ bytecode_len, k_chunk, commitments: Vec<Commitment>, (optional) layout metadata }`
-
-#### 6.2 Canonical lane ordering implementation
-
-Implement an enum (or equivalent) encoding the authoritative lane ordering:
-- rs1 lanes (0..127), rs2 lanes (0..127), rd lanes (0..127), then dense fields.
-Then chunk into blocks of size `k_chunk` to get commitment indices.
-
-This ordering must be used consistently by:
-- commitment generation
-- `BytecodeClaimReduction` weight construction
-- Stage 8 batching / VMV contribution
-
-### Step 7 — Stage 8 batching integration (bytecode polynomials)
-
-**Status (2026-01-20)**: DONE (functionality)  
-- Stage 8 folds bytecode chunk openings into the joint opening proof via a Lagrange selector over missing cycle vars (`jolt-core/src/zkvm/prover.rs` and `jolt-core/src/zkvm/verifier.rs`).
-- Streaming RLC includes bytecode chunk contributions in the VMV pass (`jolt-core/src/poly/rlc_polynomial.rs`).
-
-Stage 8 currently builds a streaming `RLCPolynomial` from:
-- dense trace polys
-- onehot RA polys
-- advice polys (passed directly)
-
-We need to extend this to include “bytecode commitment chunk polynomials”:
-- they are **not** streamed from trace
-- they are too large to materialize when bytecode is big
-
-Implementation direction:
-- extend the streaming RLC machinery to support an additional source (“stream from bytecode”),
-  analogous to how it already streams onehot polys from trace.
-
-Files involved:
-- `jolt-core/src/poly/rlc_polynomial.rs` (extend streaming context + VMP to include bytecode chunk polys)
-- `jolt-core/src/zkvm/prover.rs` / `verifier.rs` Stage 8 claim collection (include bytecode chunk claims with appropriate embedding factor, like advice)
-
-### Step 8 — Defensive padding: bytecode_len vs trace_len
-
-**Status (2026-01-20)**: DONE  
-- Prover pads `T >= K` in committed mode: `jolt-core/src/zkvm/prover.rs` **L395–L409**.  
-- Verifier rejects proofs with `trace_length < bytecode_K` in committed mode: `jolt-core/src/zkvm/verifier.rs` **L171–L177**.
-
-When bytecode commitments are enabled, ensure we have enough cycle randomness to bind bytecode-index vars:
-
-- `padded_trace_len = max(padded_trace_len, bytecode_len.next_power_of_two())`
-
-This is analogous to `adjust_trace_length_for_advice` in `jolt-core/src/zkvm/prover.rs`.
-
-### Step 9 — Tests / validation
-
-**Status (2026-01-20)**: DONE (core coverage)  
-- Lane ordering + chunking tests added.
-- E2E committed-bytecode tests enabled and passing for both layouts (CycleMajor + AddressMajor).
-- Note: `jolt-core/src/zkvm/tests.rs` still contains some env-var gated debug helpers; remove once stabilized.
-
-- Unit tests:
-  - lane ordering + chunking (k_chunk=16 ⇒ 28 chunks, k_chunk=256 ⇒ 2 chunks)
-  - bytecode_len > trace_len padding path
-- E2E:
-  - prove+verify with bytecode commitment enabled, both layouts (CycleMajor/AddressMajor)
-- Recursion benchmark:
-  - confirm verifier cycle count no longer scales with bytecode length.

From e596a43d310c63a47e61a57332693f966d708a66 Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Tue, 20 Jan 2026 05:27:38 -0800
Subject: [PATCH 13/16] ci: clear stale Dory URS cache before tests

---
 .github/workflows/rust.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index ac1395be7b..982524904b 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -69,6 +69,8 @@ jobs:
           path: ~/.jolt
       - name: Install Jolt RISC-V Rust toolchain
         run: cargo run install-toolchain
+      - name: Clear Dory URS cache
+        run: rm -rf ~/.cache/dory
       - name: Install nextest
         uses: taiki-e/install-action@nextest
       - name: Run jolt-core tests

From 2e3ce4091abd613f7e0831c36d2382cc4ba10aaa Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Thu, 22 Jan 2026 07:00:35 -0800
Subject: [PATCH 14/16] Add standalone bytecode VMP computation function

Expose compute_bytecode_vmp_contribution for external callers (e.g., GPU prover)
and remove #[cfg(test)] restriction from set_layout.
---
 .../src/poly/commitment/dory/dory_globals.rs  |  1 -
 jolt-core/src/poly/rlc_polynomial.rs          | 90 +++++++++++++++++++
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/jolt-core/src/poly/commitment/dory/dory_globals.rs b/jolt-core/src/poly/commitment/dory/dory_globals.rs
index 80e8e304cf..8554f18d0a 100644
--- a/jolt-core/src/poly/commitment/dory/dory_globals.rs
+++ b/jolt-core/src/poly/commitment/dory/dory_globals.rs
@@ -258,7 +258,6 @@ impl DoryGlobals {
-    /// Set the Dory matrix layout directly (test-only).
+    /// Set the Dory matrix layout directly.
     ///
     /// In production code, prefer passing the layout to `initialize_context` instead.
-    #[cfg(test)]
     pub fn set_layout(layout: DoryLayout) {
         CURRENT_LAYOUT.store(layout as u8, Ordering::SeqCst);
     }
diff --git a/jolt-core/src/poly/rlc_polynomial.rs b/jolt-core/src/poly/rlc_polynomial.rs
index 51bc6a69b2..443fce373d 100644
--- a/jolt-core/src/poly/rlc_polynomial.rs
+++ b/jolt-core/src/poly/rlc_polynomial.rs
@@ -26,6 +26,96 @@ pub struct RLCStreamingData {
     pub memory_layout: MemoryLayout,
 }
 
+/// Computes the bytecode chunk polynomial contribution to a vector-matrix product.
+///
+/// This is a standalone version of the bytecode VMP computation that can be used
+/// by external callers (e.g., GPU prover) without needing a full `StreamingRLCContext`.
+///
+/// # Arguments
+/// * `result` - Output buffer to accumulate contributions into
+/// * `left_vec` - Left vector for the vector-matrix product (length >= num_rows)
+/// * `num_columns` - Number of columns in the Dory matrix
+/// * `bytecode_polys` - List of (chunk_index, coefficient) pairs for the RLC
+/// * `bytecode` - Bytecode preprocessing data
+/// * `one_hot_params` - One-hot parameters (contains k_chunk)
+pub fn compute_bytecode_vmp_contribution<F: JoltField>(
+    result: &mut [F],
+    left_vec: &[F],
+    num_columns: usize,
+    bytecode_polys: &[(usize, F)],
+    bytecode: &BytecodePreprocessing,
+    one_hot_params: &OneHotParams,
+) {
+    if bytecode_polys.is_empty() {
+        return;
+    }
+
+    let layout = DoryGlobals::get_layout();
+    let k_chunk = one_hot_params.k_chunk;
+    let bytecode_len = bytecode.bytecode.len();
+    let (sigma_bc, _nu_bc) = DoryGlobals::balanced_sigma_nu((k_chunk * bytecode_len).log_2());
+    let bytecode_cols = 1usize << sigma_bc;
+    let total = total_lanes();
+
+    debug_assert!(
+        bytecode_cols <= num_columns,
+        "Bytecode columns (2^{{sigma_bc}}={bytecode_cols}) must fit in main num_columns={num_columns}; \
+guardrail in gen_from_trace should ensure sigma_main >= sigma_bc."
+    );
+
+    for (chunk_idx, coeff) in bytecode_polys.iter() {
+        if coeff.is_zero() {
+            continue;
+        }
+        for (cycle, instr) in bytecode.bytecode.iter().enumerate().take(bytecode_len) {
+            let normalized = instr.normalize();
+            let circuit_flags = <Instruction as Flags>::circuit_flags(instr);
+            let instr_flags = <Instruction as Flags>::instruction_flags(instr);
+            let lookup_idx = <Instruction as InstructionLookup<XLEN>>::lookup_table(instr)
+                .map(|t| LookupTables::<XLEN>::enum_index(&t));
+            let raf_flag =
+                !crate::zkvm::instruction::InterleavedBitsMarker::is_interleaved_operands(
+                    &circuit_flags,
+                );
+
+            let unexpanded_pc = F::from_u64(normalized.address as u64);
+            let imm = F::from_i128(normalized.operands.imm);
+            let rs1 = normalized.operands.rs1;
+            let rs2 = normalized.operands.rs2;
+            let rd = normalized.operands.rd;
+
+            for lane in 0..k_chunk {
+                let global_lane = chunk_idx * k_chunk + lane;
+                if global_lane >= total {
+                    break;
+                }
+                let value = lane_value::<F>(
+                    global_lane,
+                    rs1,
+                    rs2,
+                    rd,
+                    unexpanded_pc,
+                    imm,
+                    &circuit_flags,
+                    &instr_flags,
+                    lookup_idx,
+                    raf_flag,
+                );
+                if value.is_zero() {
+                    continue;
+                }
+                let global_index =
+                    layout.address_cycle_to_index(lane, cycle, k_chunk, bytecode_len);
+                let row_index = global_index / bytecode_cols;
+                let col_index = global_index % bytecode_cols;
+                if row_index < left_vec.len() {
+                    result[col_index] += left_vec[row_index] * (*coeff) * value;
+                }
+            }
+        }
+    }
+}
+
 /// Source of trace data for streaming VMV computation.
 #[derive(Clone, Debug)]
 pub enum TraceSource {

From 71006c686e600c866827b3d5f9fa5ff94d1bf47f Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Thu, 22 Jan 2026 07:10:02 -0800
Subject: [PATCH 15/16] Refactor vmp_bytecode_contribution to use standalone
 function

Delegate to compute_bytecode_vmp_contribution to eliminate code duplication.
---
 jolt-core/src/poly/rlc_polynomial.rs | 75 +++-------------------------
 1 file changed, 7 insertions(+), 68 deletions(-)

diff --git a/jolt-core/src/poly/rlc_polynomial.rs b/jolt-core/src/poly/rlc_polynomial.rs
index 443fce373d..044d4ce8ca 100644
--- a/jolt-core/src/poly/rlc_polynomial.rs
+++ b/jolt-core/src/poly/rlc_polynomial.rs
@@ -510,75 +510,14 @@ guardrail in gen_from_trace should ensure sigma_main >= sigma_a."
         num_columns: usize,
         ctx: &StreamingRLCContext<F>,
     ) {
-        if ctx.bytecode_polys.is_empty() {
-            return;
-        }
-
-        let layout = DoryGlobals::get_layout();
-        let k_chunk = ctx.one_hot_params.k_chunk;
-        let bytecode = &ctx.preprocessing.bytecode;
-        let bytecode_len = bytecode.bytecode.len();
-        let (sigma_bc, _nu_bc) = DoryGlobals::balanced_sigma_nu((k_chunk * bytecode_len).log_2());
-        let bytecode_cols = 1usize << sigma_bc;
-        let total = total_lanes();
-
-        debug_assert!(
-            bytecode_cols <= num_columns,
-            "Bytecode columns (2^{{sigma_bc}}={bytecode_cols}) must fit in main num_columns={num_columns}; \
-guardrail in gen_from_trace should ensure sigma_main >= sigma_bc."
+        compute_bytecode_vmp_contribution(
+            result,
+            left_vec,
+            num_columns,
+            &ctx.bytecode_polys,
+            &ctx.preprocessing.bytecode,
+            &ctx.one_hot_params,
         );
-
-        for (chunk_idx, coeff) in ctx.bytecode_polys.iter() {
-            if coeff.is_zero() {
-                continue;
-            }
-            for (cycle, instr) in bytecode.bytecode.iter().enumerate().take(bytecode_len) {
-                let normalized = instr.normalize();
-                let circuit_flags = <Instruction as Flags>::circuit_flags(instr);
-                let instr_flags = <Instruction as Flags>::instruction_flags(instr);
-                let lookup_idx = <Instruction as InstructionLookup<XLEN>>::lookup_table(instr)
-                    .map(|t| LookupTables::<XLEN>::enum_index(&t));
-                let raf_flag =
-                    !crate::zkvm::instruction::InterleavedBitsMarker::is_interleaved_operands(
-                        &circuit_flags,
-                    );
-
-                let unexpanded_pc = F::from_u64(normalized.address as u64);
-                let imm = F::from_i128(normalized.operands.imm);
-                let rs1 = normalized.operands.rs1;
-                let rs2 = normalized.operands.rs2;
-                let rd = normalized.operands.rd;
-
-                for lane in 0..k_chunk {
-                    let global_lane = chunk_idx * k_chunk + lane;
-                    if global_lane >= total {
-                        break;
-                    }
-                    let value = lane_value::<F>(
-                        global_lane,
-                        rs1,
-                        rs2,
-                        rd,
-                        unexpanded_pc,
-                        imm,
-                        &circuit_flags,
-                        &instr_flags,
-                        lookup_idx,
-                        raf_flag,
-                    );
-                    if value.is_zero() {
-                        continue;
-                    }
-                    let global_index =
-                        layout.address_cycle_to_index(lane, cycle, k_chunk, bytecode_len);
-                    let row_index = global_index / bytecode_cols;
-                    let col_index = global_index % bytecode_cols;
-                    if row_index < left_vec.len() {
-                        result[col_index] += left_vec[row_index] * (*coeff) * value;
-                    }
-                }
-            }
-        }
     }
 
     /// Streaming VMP implementation that generates rows on-demand from trace.

From 8d306d96dd7e98de3de234c194bb9280890fa998 Mon Sep 17 00:00:00 2001
From: Quang Dao <quang.dao@layerzerolabs.org>
Date: Thu, 22 Jan 2026 11:10:31 -0800
Subject: [PATCH 16/16] feat(bytecode): align bytecode context with main sigma
 for Stage 8 folding

- Initialize bytecode Dory context using main matrix dimensions to support embedding in Stage 8.
- Update VMP contribution logic to use correct column count.
- Handle trailing dummy rounds in BytecodeClaimReductionProver for batched sumcheck alignment.
- Pass max_trace_len to TrustedBytecodeCommitments derivation.
---
 .../src/poly/commitment/dory/dory_globals.rs  | 42 +++++++++++++++++++
 jolt-core/src/poly/rlc_polynomial.rs          | 10 ++---
 jolt-core/src/zkvm/bytecode/mod.rs            | 11 ++++-
 .../src/zkvm/claim_reductions/bytecode.rs     | 37 +++++++++++++++-
 jolt-core/src/zkvm/prover.rs                  |  2 +-
 5 files changed, 93 insertions(+), 9 deletions(-)

diff --git a/jolt-core/src/poly/commitment/dory/dory_globals.rs b/jolt-core/src/poly/commitment/dory/dory_globals.rs
index 8554f18d0a..5f78157184 100644
--- a/jolt-core/src/poly/commitment/dory/dory_globals.rs
+++ b/jolt-core/src/poly/commitment/dory/dory_globals.rs
@@ -197,6 +197,48 @@ impl Drop for DoryContextGuard {
 pub struct DoryGlobals;
 
 impl DoryGlobals {
+    /// Initialize the Bytecode context so its `num_columns` equals Main's `1 << sigma_main`.
+    ///
+    /// This is required for committed-bytecode Stage 8 folding when `sigma_main > sigma_bytecode`:
+    /// we commit bytecode chunk polynomials using the Main matrix width (more columns, fewer rows),
+    /// so they embed as a top block of rows in the Main matrix when extra cycle variables are fixed to 0.
+    pub fn initialize_bytecode_context_for_main_sigma(
+        k_chunk: usize,
+        bytecode_len: usize,
+        log_k_chunk: usize,
+        log_t: usize,
+    ) -> Option<()> {
+        let (sigma_main, _) = Self::main_sigma_nu(log_k_chunk, log_t);
+        let num_columns = 1usize << sigma_main;
+        let total_size = k_chunk * bytecode_len;
+
+        assert!(
+            total_size % num_columns == 0,
+            "bytecode matrix width {num_columns} must divide total_size {total_size}"
+        );
+        let num_rows = total_size / num_columns;
+
+        // If already initialized, ensure it matches (avoid silently ignoring OnceCell::set failures).
+        #[allow(static_mut_refs)]
+        unsafe {
+            if let (Some(existing_cols), Some(existing_rows), Some(existing_t)) = (
+                BYTECODE_NUM_COLUMNS.get(),
+                BYTECODE_MAX_NUM_ROWS.get(),
+                BYTECODE_T.get(),
+            ) {
+                assert_eq!(*existing_cols, num_columns);
+                assert_eq!(*existing_rows, num_rows);
+                assert_eq!(*existing_t, bytecode_len);
+                return Some(());
+            }
+        }
+
+        Self::set_num_columns_for_context(num_columns, DoryContext::Bytecode);
+        Self::set_T_for_context(bytecode_len, DoryContext::Bytecode);
+        Self::set_max_num_rows_for_context(num_rows, DoryContext::Bytecode);
+        Some(())
+    }
+
     /// Split `total_vars` into a *balanced* pair `(sigma, nu)` where:
     /// - **sigma** is the number of **column** variables
     /// - **nu** is the number of **row** variables
diff --git a/jolt-core/src/poly/rlc_polynomial.rs b/jolt-core/src/poly/rlc_polynomial.rs
index 044d4ce8ca..3785dae52b 100644
--- a/jolt-core/src/poly/rlc_polynomial.rs
+++ b/jolt-core/src/poly/rlc_polynomial.rs
@@ -53,14 +53,14 @@ pub fn compute_bytecode_vmp_contribution<F: JoltField>(
     let layout = DoryGlobals::get_layout();
     let k_chunk = one_hot_params.k_chunk;
     let bytecode_len = bytecode.bytecode.len();
-    let (sigma_bc, _nu_bc) = DoryGlobals::balanced_sigma_nu((k_chunk * bytecode_len).log_2());
-    let bytecode_cols = 1usize << sigma_bc;
+    let bytecode_cols = num_columns;
     let total = total_lanes();
 
     debug_assert!(
-        bytecode_cols <= num_columns,
-        "Bytecode columns (2^{{sigma_bc}}={bytecode_cols}) must fit in main num_columns={num_columns}; \
-guardrail in gen_from_trace should ensure sigma_main >= sigma_bc."
+        k_chunk * bytecode_len >= bytecode_cols,
+        "bytecode_len*k_chunk must cover at least one full row: (k_chunk*bytecode_len)={} < num_columns={}",
+        k_chunk * bytecode_len,
+        bytecode_cols
     );
 
     for (chunk_idx, coeff) in bytecode_polys.iter() {
diff --git a/jolt-core/src/zkvm/bytecode/mod.rs b/jolt-core/src/zkvm/bytecode/mod.rs
index 7c0f41a3c7..6744c16944 100644
--- a/jolt-core/src/zkvm/bytecode/mod.rs
+++ b/jolt-core/src/zkvm/bytecode/mod.rs
@@ -9,6 +9,7 @@ use tracer::instruction::{Cycle, Instruction};
 
 use crate::poly::commitment::commitment_scheme::CommitmentScheme;
 use crate::poly::commitment::dory::{DoryContext, DoryGlobals};
+use crate::utils::math::Math;
 use crate::utils::errors::ProofVerifyError;
 use crate::zkvm::bytecode::chunks::{build_bytecode_chunks, total_lanes};
 use rayon::prelude::*;
@@ -51,13 +52,19 @@ impl<PCS: CommitmentScheme> TrustedBytecodeCommitments<PCS> {
         bytecode: &BytecodePreprocessing,
         generators: &PCS::ProverSetup,
         log_k_chunk: usize,
+        max_trace_len: usize,
     ) -> (Self, Vec<PCS::OpeningProofHint>) {
         let k_chunk = 1usize << log_k_chunk;
         let bytecode_len = bytecode.bytecode.len();
         let num_chunks = total_lanes().div_ceil(k_chunk);
 
-        let _guard =
-            DoryGlobals::initialize_context(k_chunk, bytecode_len, DoryContext::Bytecode, None);
+        let log_t = max_trace_len.log_2();
+        let _guard = DoryGlobals::initialize_bytecode_context_for_main_sigma(
+            k_chunk,
+            bytecode_len,
+            log_k_chunk,
+            log_t,
+        );
         let _ctx = DoryGlobals::with_context(DoryContext::Bytecode);
 
         let bytecode_chunks = build_bytecode_chunks::<PCS::Field>(bytecode, log_k_chunk);
diff --git a/jolt-core/src/zkvm/claim_reductions/bytecode.rs b/jolt-core/src/zkvm/claim_reductions/bytecode.rs
index 6aa8ab84d6..0cebaee937 100644
--- a/jolt-core/src/zkvm/claim_reductions/bytecode.rs
+++ b/jolt-core/src/zkvm/claim_reductions/bytecode.rs
@@ -15,6 +15,7 @@
 //! Commitment + Stage 8 batching integration is handled separately (see `bytecode-commitment-progress.md`).
 
 use std::cell::RefCell;
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
 
 use allocative::Allocative;
@@ -205,6 +206,9 @@ pub struct BytecodeClaimReductionProver<F: JoltField> {
     bytecode_chunks: Vec<MultilinearPolynomial<F>>,
     /// Weight polynomials W_i(lane, k) = W_eta(lane) * eq(r_bc, k) (multilinear).
     weight_chunks: Vec<MultilinearPolynomial<F>>,
+    /// Trailing dummy-round count recorded by `round_offset`; round evals are scaled by 2^count.
+    #[allocative(skip)]
+    batch_dummy_rounds: AtomicUsize,
 }
 
 impl<F: JoltField> BytecodeClaimReductionProver<F> {
@@ -266,12 +270,13 @@ impl<F: JoltField> BytecodeClaimReductionProver<F> {
             params,
             bytecode_chunks,
             weight_chunks,
+            batch_dummy_rounds: AtomicUsize::new(0),
         }
     }
 
     fn compute_message_impl(&self, previous_claim: F) -> UniPoly<F> {
         let half = self.bytecode_chunks[0].len() / 2;
-        let evals: [F; DEGREE_BOUND] = (0..half)
+        let mut evals: [F; DEGREE_BOUND] = (0..half)
             .into_par_iter()
             .map(|j| {
                 let mut out = [F::zero(); DEGREE_BOUND];
@@ -293,6 +298,17 @@ impl<F: JoltField> BytecodeClaimReductionProver<F> {
                     acc
                 },
             );
+
+        // If this instance is back-loaded in a batched sumcheck (i.e., it has trailing dummy
+        // rounds), then `previous_claim` is scaled by 2^{dummy_rounds}. The per-round univariate
+        // evaluations must be scaled by the same factor to satisfy the sumcheck consistency check.
+        let dummy_rounds = self.batch_dummy_rounds.load(Ordering::Relaxed);
+        if dummy_rounds != 0 {
+            let scale = F::one().mul_pow_2(dummy_rounds);
+            for e in evals.iter_mut() {
+                *e *= scale;
+            }
+        }
         UniPoly::from_evals_and_hint(previous_claim, &evals)
     }
 }
@@ -302,6 +318,20 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceProver<F, T> for BytecodeClaim
         &self.params
     }
 
+    fn round_offset(&self, max_num_rounds: usize) -> usize {
+        // Bytecode claim reduction's cycle-phase rounds must align to the *start* of the
+        // batched cycle challenge vector so that its (log_K) point is the suffix (LSB side)
+        // of the full (log_T) cycle point used by other Stage 6b instances. This is required
+        // for Stage 8's committed-bytecode embedding when log_T > log_K.
+        //
+        // This deviates from the default "front-loaded" batching offset, so we record the number
+        // of trailing dummy rounds and scale univariate evaluations accordingly.
+        let dummy_rounds = max_num_rounds.saturating_sub(self.params.num_rounds());
+        self.batch_dummy_rounds
+            .store(dummy_rounds, Ordering::Relaxed);
+        0
+    }
+
     fn compute_message(&mut self, _round: usize, previous_claim: F) -> UniPoly<F> {
         self.compute_message_impl(previous_claim)
     }
@@ -396,6 +426,11 @@ impl<F: JoltField, T: Transcript> SumcheckInstanceVerifier<F, T>
         unsafe { &*self.params.as_ptr() }
     }
 
+    fn round_offset(&self, _max_num_rounds: usize) -> usize {
+        // Must mirror the prover: align this instance to the start of the batched challenge vector.
+        0
+    }
+
     fn expected_output_claim(
         &self,
         accumulator: &VerifierOpeningAccumulator<F>,
diff --git a/jolt-core/src/zkvm/prover.rs b/jolt-core/src/zkvm/prover.rs
index a8c797367c..3d9cf4226a 100644
--- a/jolt-core/src/zkvm/prover.rs
+++ b/jolt-core/src/zkvm/prover.rs
@@ -1836,7 +1836,7 @@ where
             8
         };
         let (trusted_commitments, hints) =
-            TrustedBytecodeCommitments::derive(&bytecode, &generators, log_k_chunk);
+            TrustedBytecodeCommitments::derive(&bytecode, &generators, log_k_chunk, max_t_any);
         JoltProverPreprocessing {
             generators,
             shared,