diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
deleted file mode 100644
index d854df3..0000000
--- a/.github/workflows/benchmark.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: Workflow for leaderboard submission
-
-on:
-  # push:
-    # branches: [ main ]
-  # pull_request:
-    # branches: [ main ]
-  workflow_dispatch:
-jobs:
-  leaderboard:
-    name: leaderboard
-    runs-on:
-      group: benchmark
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v5
-
-      - name: Configure CMake
-        run: |
-          cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -Wno-dev
-
-      - name: Build project
-        run: |
-          cmake --build build -- -j$(nproc) leaderboard
-          
-      - name: Run for leaderboard
-        run: |
-          leaderboard.sh
-
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9ccf48b..b66ff8f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -7,24 +7,40 @@ on:
   pull_request:
     branches:
       - main
-
-  workflow_dispatch:
-
 jobs:
   build_and_test:
-    runs-on: 
-      group: k23a
+    runs-on: self-hosted
+
+    env:
+      CCACHE_DIR: ${{ github.workspace }}/.ccache
 
     steps:
     - name: Checkout repository code
       uses: actions/checkout@v4
+
+    - name: Setup cache
+      uses: actions/cache@v4
+      with:
+        path: .ccache  # same directory CCACHE_DIR points at in the job env
+        key: ${{ runner.os }}-ccache-${{ github.sha }}  # one entry per commit
+        restore-keys: |
+          ${{ runner.os }}-ccache-
       
     - name: Configure initial build with CMake
       run: |
-        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -Wno-dev
+        nix develop -c \
+        cmake -S . -B build -Wno-dev \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
 
     - name: Build all targets
-      run: cmake --build build -- -j $(nproc) unit_tests
+      run: |
+        nix develop -c \
+        cmake --build build -- -j $(nproc) unit_tests
 
     - name: Run unit tests
       run: ./build/unit_tests
+
+    - name: Cache stats
+      run: nix develop -c ccache -s
diff --git a/.github/workflows/notifier.yaml b/.github/workflows/notifier.yaml
deleted file mode 100644
index 5f9b68b..0000000
--- a/.github/workflows/notifier.yaml
+++ /dev/null
@@ -1,48 +0,0 @@
-name: GitHub Push Notifier
-
-on:
-  push:
-    branches:
-      - '**'
-
-jobs:
-  notify:
-    runs-on: 
-      group: k23a
-    
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-
-      - name: Set up Node.js
-        uses: actions/setup-node@v3
-        with:
-          node-version: '18'
-
-      - name: Install dependencies
-        run: npm install discord.js node-fetch dotenv
-
-      - name: Send Discord notification
-        env:
-          DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
-          CHANNEL_ID: ${{ secrets.CHANNEL_ID }}
-          GITHUB_TOKEN: ${{ vars.GH_TOKEN }}
-          GITHUB_OWNER: ${{ github.repository_owner }}
-          GITHUB_REPO: ${{ github.event.repository.name }}
-        run: |
-          node -e "
-          require('dotenv').config();
-          const { Client, GatewayIntentBits } = require('discord.js');
-          const client = new Client({ intents: [GatewayIntentBits.Guilds] });
-          
-          client.once('ready', async () => {
-            const channel = client.channels.cache.get(process.env.CHANNEL_ID);
-            const message = \`🚨 New Push to \\\`${{ github.repository }}\\\` Branch \\\`${{ github.ref_name }}\\\`!\n👤 Author: \\\`${{ github.event.pusher.name }}\\\`\n📝 Commit: \\\`${{ github.event.head_commit.message }}\\\`\n🔗 View: ${{ github.event.head_commit.url }}\`;
-            await channel.send(message);
-            process.exit(0);
-          });
-          
-          client.login(process.env.DISCORD_TOKEN);
-          "
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 12c22e9..c21caa5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,7 +12,12 @@
 /docs/html
 /docs/xml
 .clangd
+.cache
+.ccache
 compile_commands.json
 /env/
 script.py
 *.md
+/build_deferred
+/build_debug
+/build_eager
diff --git a/CMakeLists.txt b/CMakeLists.txt
index dc0739a..ab5f3ae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,40 +54,11 @@ FetchContent_Declare(
 
 FetchContent_MakeAvailable(fmtlib)
 
-set(ENABLE_SANITIZER OFF)
-set(ENABLE_UBSAN OFF)
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|powerpc|ppc64|ppc64le")
     message("Disabling jemalloc extension of DuckDB on Power.")
     set(SKIP_EXTENSIONS jemalloc)
 endif()
 
-# Detect Xeon E5-2680 v3 CPU for benchmark VM hardware configuration
-# Requires both: correct CPU AND at least 32GB RAM (benchmark VM has 64GB, CI has 4GB)
-set(IS_BENCHMARK_VM_HARDWARE OFF)
-if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND EXISTS "/proc/cpuinfo")
-    file(READ "/proc/cpuinfo" CPUINFO_CONTENT)
-    if(CPUINFO_CONTENT MATCHES "E5-2680 v3")
-        # Check available memory to distinguish benchmark VM from CI VM
-        if(EXISTS "/proc/meminfo")
-            file(READ "/proc/meminfo" MEMINFO_CONTENT)
-            string(REGEX MATCH "MemTotal:[ \t]+([0-9]+)" MEM_MATCH "${MEMINFO_CONTENT}")
-            if(MEM_MATCH)
-                set(MEM_TOTAL_KB "${CMAKE_MATCH_1}")
-                math(EXPR MEM_TOTAL_GB "${MEM_TOTAL_KB} / 1024 / 1024")
-                if(MEM_TOTAL_GB GREATER_EQUAL 32)
-                    message(STATUS "Detected Intel Xeon E5-2680 v3 CPU with ${MEM_TOTAL_GB}GB RAM - using benchmark VM hardware configuration")
-                    add_compile_definitions(SPC__USE_BENCHMARKVM_HARDWARE)
-                    set(IS_BENCHMARK_VM_HARDWARE ON)
-                else()
-                    message(STATUS "Detected Intel Xeon E5-2680 v3 CPU but only ${MEM_TOTAL_GB}GB RAM (need >=32GB) - using generic hardware configuration")
-                endif()
-            endif()
-        endif()
-    endif()
-endif()
-
-# Include all sources from /src directory. CONFIGURE_DEPENDS can be unreliable.
-# Try re-running cmake in case changes are not recognized.
 file(GLOB ALL_SRC
     CONFIGURE_DEPENDS
     "src/*.cpp"
@@ -100,7 +71,7 @@ set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64")
     add_compile_options(-O3 -mcpu=apple-m1 -flto)
 else()
-    add_compile_options(-O3 -march=native -m64 -mcrc32 -fpermissive -flto)
+    add_compile_options(-O3 -march=skylake -m64 -mcrc32 -fpermissive) # -O0 here would override the Release -O level
 endif()
 
 add_link_options(-flto)
@@ -112,19 +83,12 @@ if(NOT CMAKE_SYSTEM_NAME STREQUAL "Windows")
     target_compile_definitions(faster PRIVATE)
     target_link_libraries(faster PRIVATE re2 fmt range-v3 nlohmann_json::nlohmann_json sqlparser)
     target_include_directories(faster PRIVATE include)
-
-    if(IS_BENCHMARK_VM_HARDWARE)
-        add_executable(leaderboard ${MANOLATES_SRC} tests/read_sql.cpp)
-        target_compile_definitions(leaderboard PRIVATE)
-        target_link_libraries(leaderboard PRIVATE re2 fmt range-v3 nlohmann_json::nlohmann_json sqlparser)
-        target_include_directories(leaderboard PRIVATE include)
-    endif()
 endif()
 
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64")
     target_compile_options(unit_tests PRIVATE -O3 -mcpu=apple-m1 -flto)
 else()
-    target_compile_options(unit_tests PRIVATE -O3 -march=native -m64 -fpermissive -flto)
+    target_compile_options(unit_tests PRIVATE -O3 -march=skylake -m64 -fpermissive) # keep parity with the arm branch (-O3)
 endif()
 
 target_compile_definitions(unit_tests PRIVATE)
diff --git a/flake.nix b/flake.nix
index cd6af67..1f459a3 100644
--- a/flake.nix
+++ b/flake.nix
@@ -23,19 +23,19 @@
           buildInputs = with pkgs; [
             llvmPackages.libcxxClang
             llvmPackages.libllvm
+            ccache
             doxygen
             curl
             git
             cmake
             typst
           ] ++ lib.optionals (system == "x86_64-linux") [
-            linuxPackages.perf
+            perf
             gef
           ];
           shellHook = ''
             CLANGD_FILE=".clangd"
             CPP_STANDARD="c++20"
-
             echo "Generating $CLANGD_FILE from \$ clang++ -v output..."
 
             INCLUDE_PATHS=$(
@@ -57,9 +57,10 @@
                 echo "    - -I$CLEAN_PATH" >> $CLANGD_FILE
             done <<< "$INCLUDE_PATHS"
 
-            echo "    - -O2" >> $CLANGD_FILE
-
-            echo "Generation of $CLANGD_FILE complete."                
+            echo "exporting ccache paths..."
+            export CCACHE_DIR="$PWD/.ccache"
+            export PATH="${pkgs.ccache}/bin:$PATH"
+            echo "done."
 
             if command -v fish &> /dev/null; then
                 exec fish
diff --git a/include/data_access/columnar_reader.h b/include/data_access/columnar_reader.h
index 2074498..39348d0 100644
--- a/include/data_access/columnar_reader.h
+++ b/include/data_access/columnar_reader.h
@@ -8,6 +8,7 @@
 #pragma once
 
 #include <algorithm>
+#include <array>
 #include <atomic>
 #include <cstdint>
 #include <cstring>
@@ -266,6 +267,127 @@ class ColumnarReader {
         global_probe_version.fetch_add(1, std::memory_order_relaxed);
     }
 
+    // ========================================================================
+    // Base Table Page Index Methods (indexed deferred column resolution)
+    // ========================================================================
+
+    /** @brief Clear prepared flags and bump the version for a new query. */
+    inline void reset_base_tables() {
+        base_table_prepared_.fill(false); // indices are rebuilt on next prepare
+        base_table_version_++; // cursors cached under the old version now miss
+    }
+
+    /**
+     * @brief Prepare page index for a base table column.
+     *
+     * Called once per unique (table_id, col_idx) before deferred resolution.
+     * Enables O(log P) page lookup instead of O(P) linear scan per read.
+     * IDs outside the documented ranges are ignored; no index is built.
+     * @param table_id Base table ID (0-15).
+     * @param col_idx Column index within base table (0-15).
+     * @param column The Column to build page index for.
+     */
+    inline void prepare_base_column(uint8_t table_id, uint8_t col_idx,
+                                    const Column &column) {
+        // col_idx >= 16 would spill into the table_id bits and alias a slot.
+        if ((col_idx >> BASE_TABLE_SHIFT) != 0)
+            return;
+        size_t idx = (static_cast<size_t>(table_id) << BASE_TABLE_SHIFT) |
+                     static_cast<size_t>(col_idx);
+        if (idx >= MAX_BASE_TABLE_INDICES || base_table_prepared_[idx])
+            return;
+        auto &arena = platform::get_arena(0);
+        base_table_indices_[idx] = PageIndex(arena);
+        base_table_indices_[idx].build(column);
+        base_table_prepared_[idx] = true;
+    }
+
+    /** @brief Is the index prepared? col_idx >= 16 never aliases a slot. */
+    inline bool is_base_column_prepared(uint8_t table_id,
+                                        uint8_t col_idx) const {
+        size_t idx = (size_t{table_id} << BASE_TABLE_SHIFT) | col_idx;
+        return (col_idx >> BASE_TABLE_SHIFT) == 0 &&
+               idx < MAX_BASE_TABLE_INDICES && base_table_prepared_[idx];
+    }
+
+    /**
+     * @brief Read value from base table using prepared page index.
+     *
+     * O(1) with cursor caching for sequential access, O(log P) on cache miss.
+     * Falls back to O(P) linear scan if the index is missing or IDs are >= 16.
+     *
+     * @param column The base table column.
+     * @param table_id Base table ID.
+     * @param col_idx Column index within base table.
+     * @param row_id Row ID within the column.
+     * @param data_type Data type of the column.
+     * @param cursor Thread-local cursor for caching.
+     * @return The value at the specified row.
+     */
+    inline mema::value_t read_base_table_value(const Column &column,
+                                               uint8_t table_id,
+                                               uint8_t col_idx, uint32_t row_id,
+                                               DataType data_type,
+                                               Cursor &cursor) const {
+        size_t idx = (static_cast<size_t>(table_id) << BASE_TABLE_SHIFT) |
+                     static_cast<size_t>(col_idx);
+        // col_idx >= 16 would alias another table's slot; use the linear scan.
+        if ((col_idx >> BASE_TABLE_SHIFT) != 0 ||
+            idx >= MAX_BASE_TABLE_INDICES || !base_table_prepared_[idx]) {
+            return read_value_direct(column, row_id, data_type);
+        }
+
+        const PageIndex &page_index = base_table_indices_[idx];
+
+        // Dense INT32 fast path: O(1) arithmetic lookup
+        if (data_type == DataType::INT32 && page_index.is_dense_int32) {
+            return mema::value_t{read_dense_int32(page_index, row_id)};
+        }
+
+        // Check cursor cache (version uses base_table_version_ + idx for
+        // uniqueness)
+        uint64_t effective_version = base_table_version_ + idx;
+        bool cache_hit =
+            cursor.version == effective_version && cursor.cached_col == idx &&
+            row_id >= cursor.cached_start && row_id < cursor.cached_end;
+
+        if (!cache_hit) {
+            // Check sequential access optimization
+            if (cursor.version == effective_version &&
+                cursor.cached_col == idx && row_id == cursor.cached_end) {
+                size_t next_page = cursor.cached_page + 1;
+                if (next_page < page_index.cumulative_rows.size()) {
+                    load_page_into_cursor_base(column, page_index, next_page,
+                                               idx, effective_version, cursor);
+                } else {
+                    // Past end of data
+                    return mema::value_t{mema::value_t::NULL_VALUE};
+                }
+            } else {
+                // Binary search for page (NOTE: row_id range unchecked here)
+                size_t page_num = page_index.find_page(row_id);
+                load_page_into_cursor_base(column, page_index, page_num, idx,
+                                           effective_version, cursor);
+            }
+        }
+
+        // Now cursor is loaded for the correct page
+        uint32_t local_row = row_id - cursor.cached_start;
+        if (SPC_LIKELY(cursor.is_dense)) {
+            if (data_type == DataType::INT32) {
+                return mema::value_t{cursor.data_ptr[local_row]};
+            } else {
+                return mema::value_t::encode_string(
+                    cursor.page_idx_val, static_cast<int32_t>(local_row));
+            }
+        }
+        if (SPC_UNLIKELY(cursor.is_special)) {
+            return mema::value_t::encode_string(
+                cursor.page_idx_val, mema::value_t::LONG_STRING_OFFSET);
+        }
+        return read_sparse(local_row, data_type, cursor);
+    }
+
     /** @brief Fast path: check cursor cache, dispatch to appropriate handler.
      */
     template <bool IsBuild>
@@ -275,10 +397,15 @@ class ColumnarReader {
 
         /* Dense INT32 fast path: O(1) arithmetic lookup, bypasses cursor */
         if (data_type == DataType::INT32) {
-            const PageIndex &page_index = IsBuild ? build_page_indices[col_idx]
+            size_t pidx_size =
+                IsBuild ? build_page_indices.size() : probe_page_indices.size();
+            if (SPC_LIKELY(col_idx < pidx_size)) {
+                const PageIndex &page_index = IsBuild
+                                                  ? build_page_indices[col_idx]
                                                   : probe_page_indices[col_idx];
-            if (SPC_LIKELY(page_index.is_dense_int32)) {
-                return mema::value_t{read_dense_int32(page_index, row_id)};
+                if (SPC_LIKELY(page_index.is_dense_int32)) {
+                    return mema::value_t{read_dense_int32(page_index, row_id)};
+                }
             }
         }
 
@@ -291,10 +418,10 @@ class ColumnarReader {
                 global_probe_version.load(std::memory_order_relaxed);
         }
 
-        if (SPC_LIKELY(cursor.version == current_version &&
-                       col_idx == cursor.cached_col &&
-                       row_id >= cursor.cached_start &&
-                       row_id < cursor.cached_end)) {
+        bool cache_hit =
+            cursor.version == current_version && col_idx == cursor.cached_col &&
+            row_id >= cursor.cached_start && row_id < cursor.cached_end;
+        if (SPC_LIKELY(cache_hit)) {
             uint32_t local_row = row_id - cursor.cached_start;
             if (SPC_LIKELY(cursor.is_dense)) {
                 if (data_type == DataType::INT32) {
@@ -313,9 +440,11 @@ class ColumnarReader {
         }
 
         /* sequential access optimization: skip binary search for next page */
+        size_t pidx_count =
+            IsBuild ? build_page_indices.size() : probe_page_indices.size();
         if (SPC_LIKELY(cursor.version == current_version &&
                        col_idx == cursor.cached_col &&
-                       row_id == cursor.cached_end)) {
+                       row_id == cursor.cached_end && col_idx < pidx_count)) {
             const PageIndex &page_index = IsBuild ? build_page_indices[col_idx]
                                                   : probe_page_indices[col_idx];
             size_t next_page = cursor.cached_page + 1;
@@ -384,6 +513,12 @@ class ColumnarReader {
                                          Cursor &cursor,
                                          uint64_t current_version) const {
 
+        size_t pidx_size =
+            IsBuild ? build_page_indices.size() : probe_page_indices.size();
+        if (SPC_UNLIKELY(col_idx >= pidx_size)) {
+            // No page index prepared - use direct page read
+            return read_value_direct(column, row_id, data_type);
+        }
         const PageIndex &page_index =
             IsBuild ? build_page_indices[col_idx] : probe_page_indices[col_idx];
         size_t page_num = page_index.find_page(row_id);
@@ -407,6 +542,18 @@ class ColumnarReader {
         }
     }
 
+    /**
+     * @brief Direct value read bypassing page index cache.
+     *
+     * Forwards to read_value_direct(). Intended for deferred column resolution
+     * on base tables without a prepared page index; each call scans the pages.
+     */
+    inline mema::value_t read_value_direct_public(const Column &column,
+                                                  uint32_t row_id,
+                                                  DataType data_type) const {
+        return read_value_direct(column, row_id, data_type);
+    }
+
     inline const PageIndex &get_build_page_index(size_t col_idx) const {
         return build_page_indices[col_idx];
     }
@@ -428,6 +575,89 @@ class ColumnarReader {
         return reinterpret_cast<const int32_t *>(page_data + 4)[local_row];
     }
 
+    /**
+     * @brief Direct value read without prepared page index.
+     *
+     * Scans column.pages from the first page, accumulating row counts, until
+     * row_id is covered. Returns NULL_VALUE for a null or out-of-range row.
+     */
+    inline mema::value_t read_value_direct(const Column &column,
+                                           uint32_t row_id,
+                                           DataType data_type) const {
+        // cumulative = number of rows contained in the pages visited so far
+        uint32_t cumulative = 0;
+        for (size_t page_num = 0; page_num < column.pages.size(); ++page_num) {
+            auto *page_data = column.pages[page_num]->data;
+            auto num_rows = *reinterpret_cast<const uint16_t *>(page_data);
+            auto num_values =
+                *reinterpret_cast<const uint16_t *>(page_data + 2);
+
+            // Special pages are flagged by sentinel values in the row count
+            if (num_rows == 0xffff) {
+                // Long string page - counts as exactly one row
+                if (row_id == cumulative) {
+                    return mema::value_t::encode_string(
+                        static_cast<int32_t>(page_num),
+                        mema::value_t::LONG_STRING_OFFSET);
+                }
+                cumulative += 1;
+                continue;
+            }
+            if (num_rows == 0xfffe) {
+                // 0xfffe marker page: adds no rows to the running count
+                continue;
+            }
+
+            if (row_id < cumulative + num_rows) {
+                // Found the page; local_row is the row's offset inside it
+                uint32_t local_row = row_id - cumulative;
+                bool is_dense = (num_rows == num_values);
+                const auto *data_ptr =
+                    reinterpret_cast<const int32_t *>(page_data + 4);
+
+                if (is_dense) {
+                    if (data_type == DataType::INT32) {
+                        return mema::value_t{data_ptr[local_row]};
+                    } else {
+                        return mema::value_t::encode_string(
+                            static_cast<int32_t>(page_num),
+                            static_cast<int32_t>(local_row));
+                    }
+                } else {
+                    // Sparse page - validity bitmap sits in the page's last bytes
+                    size_t bitmap_size = (num_rows + 7) / 8;
+                    const auto *bitmap_ptr = reinterpret_cast<const uint8_t *>(
+                        page_data + PAGE_SIZE - bitmap_size);
+
+                    bool is_valid =
+                        bitmap_ptr[local_row >> 3] & (1u << (local_row & 7));
+                    if (!is_valid) {
+                        return mema::value_t{mema::value_t::NULL_VALUE};
+                    }
+
+                    // Data index = count of valid rows before local_row (bit loop)
+                    uint32_t data_idx = 0;
+                    for (uint32_t i = 0; i < local_row; ++i) {
+                        if (bitmap_ptr[i >> 3] & (1u << (i & 7))) {
+                            data_idx++;
+                        }
+                    }
+
+                    if (data_type == DataType::INT32) {
+                        return mema::value_t{data_ptr[data_idx]};
+                    } else {
+                        return mema::value_t::encode_string(
+                            static_cast<int32_t>(page_num),
+                            static_cast<int32_t>(data_idx));
+                    }
+                }
+            }
+            cumulative += num_rows;
+        }
+        // row_id lies beyond the last page - return NULL
+        return mema::value_t{mema::value_t::NULL_VALUE};
+    }
+
     /** @brief Reads from sparse pages using bitmap and popcount. */
     inline mema::value_t read_sparse(uint32_t local_row, DataType data_type,
                                      const Cursor &cursor) const {
@@ -452,8 +682,48 @@ class ColumnarReader {
                                                 static_cast<int32_t>(data_idx));
         }
     }
+
+    /** @brief Point the cursor at one page of a base table column. */
+    inline void load_page_into_cursor_base(const Column &column,
+                                           const PageIndex &page_index,
+                                           size_t page_num, size_t col_idx,
+                                           uint64_t version,
+                                           Cursor &cursor) const {
+        // Page header: uint16 row count at byte 0, uint16 value count at 2.
+        auto *raw = column.pages[page_num]->data;
+        const auto row_count = *reinterpret_cast<const uint16_t *>(raw);
+        const auto value_count = *reinterpret_cast<const uint16_t *>(raw + 2);
+
+        cursor.version = version;
+        cursor.cached_col = col_idx;
+        cursor.cached_page = page_num;
+        cursor.cached_start = page_index.page_start_row(page_num);
+        cursor.cached_end = page_index.cumulative_rows[page_num];
+        cursor.page_idx_val = static_cast<int32_t>(page_num);
+        cursor.col_all_dense = page_index.all_dense;
+        cursor.is_special = (row_count == 0xffff);
+        cursor.is_dense = (row_count == value_count);
+        cursor.data_ptr = reinterpret_cast<const int32_t *>(raw + 4);
+
+        if (cursor.is_dense || cursor.is_special)
+            return;
+        // Sparse page: the validity bitmap occupies the page's final bytes.
+        const size_t bitmap_bytes = (row_count + 7) / 8;
+        cursor.bitmap_ptr = reinterpret_cast<const uint8_t *>(
+            raw + PAGE_SIZE - bitmap_bytes);
+        cursor.prefix_sum_ptr = page_index.page_prefix_sums[page_num].data();
+    }
+
     std::vector<PageIndex> build_page_indices;
     std::vector<PageIndex> probe_page_indices;
+
+    // Base table page indices for deferred column resolution.
+    // Slot = (table_id << BASE_TABLE_SHIFT) | col_idx: 16 tables x 16 columns.
+    static constexpr size_t BASE_TABLE_SHIFT = 4; // bits reserved for col_idx
+    static constexpr size_t MAX_BASE_TABLE_INDICES = 256;
+    std::array<PageIndex, MAX_BASE_TABLE_INDICES> base_table_indices_;
+    std::array<bool, MAX_BASE_TABLE_INDICES> base_table_prepared_{};
+    uint64_t base_table_version_ = 0; // incremented by reset_base_tables()
 };
 } // namespace Contest::io
 
diff --git a/include/data_model/deferred_plan.h b/include/data_model/deferred_plan.h
new file mode 100644
index 0000000..13be4dd
--- /dev/null
+++ b/include/data_model/deferred_plan.h
@@ -0,0 +1,142 @@
+/**
+ * @file deferred_plan.h
+ * @brief Analyzed plan with materialization decisions for execution.
+ *
+ * AnalyzedPlan mirrors the original Plan structure but includes pre-computed
+ * decisions about which columns to materialize eagerly (join keys) vs defer
+ * until final output. Each AnalyzedJoinNode tracks column provenance back to
+ * base tables for efficient deferred resolution.
+ *
+ * @see analyze_plan.cpp for the analysis algorithm.
+ * @see intermediate.h for the runtime result format.
+ */
+#pragma once
+
+#include <cstdint>
+#include <optional>
+#include <variant>
+#include <vector>
+
+#include <data_model/plan.h>
+#include <join_execution/match_collector.h>
+
+namespace Contest {
+
+/**
+ * @brief Per-column choice between eager and deferred materialization.
+ *
+ * MATERIALIZE: Column is needed as a join key by parent - materialize eagerly.
+ * DEFER: Column only needed at final output - defer until root materialization.
+ */
+enum class ColumnResolution : uint8_t { MATERIALIZE, DEFER };
+
+/**
+ * @brief Tracks the base table origin of a column for deferred resolution.
+ *
+ * Used to resolve deferred columns at final materialization by looking up
+ * the original value in the base table using row ID provenance.
+ */
+struct ColumnProvenance {
+    uint8_t base_table_id;   ///< Index into Plan::inputs (one byte: 0-255).
+    uint8_t base_column_idx; ///< Column index in the base table (0-255).
+};
+
+/**
+ * @brief Complete metadata for an output column in a join.
+ *
+ * Combines materialization decision, provenance tracking, and child source
+ * information for efficient intermediate construction and final resolution.
+ */
+struct AnalyzedColumnInfo {
+    size_t original_idx; ///< Position in the join node's output_attrs.
+    DataType type;       ///< INT32 or VARCHAR.
+
+    ColumnResolution resolution; ///< MATERIALIZE or DEFER.
+    ColumnProvenance provenance; ///< Base table source for deferred resolution.
+
+    bool from_left;          ///< True if from left child, false if right.
+    size_t child_output_idx; ///< Position in that child's output_attrs.
+};
+
+/**
+ * @brief Analyzed scan node for execution.
+ *
+ * Wraps a ScanNode; output_attrs holds one (size_t, DataType) tuple per column.
+ */
+struct AnalyzedScanNode {
+    size_t node_idx;       ///< Index in original Plan::nodes.
+    uint8_t base_table_id; ///< Index into Plan::inputs.
+    std::vector<std::tuple<size_t, DataType>> output_attrs; ///< Projected cols.
+};
+
+/**
+ * @brief Analyzed join node with pre-computed materialization decisions.
+ *
+ * Contains all information needed for execution:
+ * - Which columns to materialize eagerly (join keys for parent)
+ * - Column provenance for deferred resolution
+ * - Pre-computed match collection mode
+ * - Number of deferred columns for allocation
+ */
+struct AnalyzedJoinNode {
+    size_t node_idx = 0; ///< Index in original Plan::nodes.
+
+    size_t left_child_idx = 0;  ///< Left child index in Plan::nodes.
+    size_t right_child_idx = 0; ///< Right child index in Plan::nodes.
+    size_t left_join_attr = 0;  ///< Join key index in left child's output.
+    size_t right_join_attr = 0; ///< Join key index in right child's output.
+
+    /// Original output attributes (global indexing).
+    std::vector<std::tuple<size_t, DataType>> output_attrs;
+
+    /// Per-column materialization decisions and provenance.
+    std::vector<AnalyzedColumnInfo> columns;
+
+    /// Pre-computed collection mode (assumes build=left; flip if build=right).
+    join::MatchCollectionMode base_collection_mode{};
+
+    /// Number of deferred columns (for pre-allocation).
+    size_t num_deferred_columns = 0;
+
+    /// Column index that parent needs as join key (nullopt if root).
+    std::optional<size_t> parent_join_key_idx;
+
+    /// True if this is the root node (defaults to false, never indeterminate).
+    bool is_root = false;
+};
+
+/**
+ * @brief Plan node variant for execution.
+ */
+using AnalyzedNode = std::variant<AnalyzedScanNode, AnalyzedJoinNode>;
+
+/**
+ * @brief Analyzed plan with materialization decisions.
+ *
+ * Mirrors Plan structure but includes pre-computed decisions for deferred
+ * materialization. The original_plan pointer provides access to base tables
+ * for value resolution.
+ */
+struct AnalyzedPlan {
+    std::vector<AnalyzedNode> nodes; ///< Analyzed nodes (same indices as Plan).
+    size_t root = 0;                 ///< Root node index.
+    const Plan *original_plan = nullptr; ///< Non-owning; null until assigned.
+    /// Unchecked access to the analyzed node at @p idx.
+    const AnalyzedNode &operator[](size_t idx) const { return nodes[idx]; }
+};
+
+/**
+ * @brief Analyze plan and compute materialization decisions.
+ *
+ * Walks the plan tree in post-order, determining for each join node:
+ * 1. Which column the parent needs as join key (MATERIALIZE)
+ * 2. All other columns (DEFER)
+ * 3. Provenance for each column back to base table
+ * 4. Pre-computed collection mode based on output columns
+ *
+ * @param plan Original query plan.
+ * @return AnalyzedPlan with materialization decisions.
+ */
+AnalyzedPlan analyze_plan(const Plan &plan);
+
+} // namespace Contest
diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h
index e0e2667..e8cefe7 100644
--- a/include/data_model/intermediate.h
+++ b/include/data_model/intermediate.h
@@ -1,32 +1,174 @@
 /**
  * @file intermediate.h
- * @brief Intermediate join format: VARCHAR as page/offset refs (no string
- * copy).
+ * @brief Intermediate join result types and input abstraction.
  *
- * Base tables must outlive execution. @see plan.h ColumnarTable,
- * construct_intermediate.h
+ * Provides:
+ * - mema::value_t: 4-byte value encoding (INT32 direct, VARCHAR as page/offset)
+ * - mema::column_t: 16KB-paged column for materialized values
+ * - mema::DeferredTable: 16KB-paged 32-bit row ID storage per base table
+ * - IntermediateResult: Lightweight result with selective materialization
+ * - JoinInput: Unified abstraction over columnar tables and intermediate
+ * results
+ *
+ * Base tables must outlive execution.
+ *
+ * @see plan.h for ColumnarTable, construct_intermediate.h for building results.
+ * @see deferred_plan.h for AnalyzedJoinNode with column decisions.
  */
 #pragma once
 
 #include <cstdint>
 #include <data_access/table.h>
+#include <data_model/deferred_plan.h>
 #include <data_model/plan.h>
 #include <foundation/common.h>
+#include <optional>
 #include <platform/arena.h>
+#include <variant>
 #include <vector>
 
 /**
  * @namespace mema
- * @brief Compact join intermediate: value_t (4B) + column_t (16KB pages).
+ * @brief Compact join intermediate: value_t (4B) + column_t (16KB pages) +
+ * DeferredTable (32-bit row IDs) + key_row_column_t (8B tuples).
  *
  * value_t: INT32 direct or VARCHAR page/offset ref. column_t: arena-allocated
- * pages with write_at(). @see Contest::ExecuteResult, plan.h ColumnarTable.
+ * pages with write_at(). DeferredTable: 32-bit row ID storage per base table.
+ * key_row_column_t: (key, row_id) tuples for join key propagation.
+ *
+ * @see Contest::IntermediateResult, plan.h ColumnarTable.
  */
 namespace mema {
 
+/**
+ * @brief Join key with associated row ID for tuple-based storage.
+ *
+ * - LEFT_ONLY/RIGHT_ONLY modes: row_id is a base table row ID (zero
+ *   indirection).
+ * - BOTH mode: row_id may be an IR index (requires deferred table lookup).
+ *
+ * 8-byte aligned for efficient memory access and potential SIMD operations.
+ */
+struct alignas(8) KeyRowPair {
+    int32_t key;     ///< Join key value
+    uint32_t row_id; ///< Row ID (base table or IR index depending on mode)
+};
+
+/**
+ * @brief Column of (key, row_id) tuples for join key storage.
+ *
+ * Enables accelerated hashtable build (tuples match internal format) and
+ * zero-indirection row ID propagation through join chains. Used instead of
+ * separate column_t for join key columns.
+ *
+ * Memory layout: 16KB pages containing 2048 KeyRowPair entries each.
+ * Move-only. Pages are held as raw pointers (see pre_allocate_from_arena)
+ * and are not freed by this type's destructor.
+ */
+struct key_row_column_t {
+    static constexpr size_t PAGE_SIZE = 1 << 14; // 16KB
+    static constexpr size_t PAIRS_PER_PAGE = PAGE_SIZE / sizeof(KeyRowPair);
+    static constexpr size_t ENTRY_SHIFT = 11; // log2(PAIRS_PER_PAGE)
+    static constexpr size_t ENTRY_MASK = PAIRS_PER_PAGE - 1;
+    static_assert((size_t{1} << ENTRY_SHIFT) == PAIRS_PER_PAGE);
+
+    struct alignas(PAGE_SIZE) Page {
+        KeyRowPair data[PAIRS_PER_PAGE];
+    };
+
+    std::vector<Page *> pages; ///< Page pointers; idx >> ENTRY_SHIFT selects
+    size_t num_values = 0;     ///< Logical number of tuples
+
+    /// Base table ID for row_id component (valid when stores_base_row_ids=true)
+    uint8_t base_table_id = 0;
+
+    /// Source column in base table (for VARCHAR provenance)
+    uint8_t source_column = 0;
+
+    /// True if row_id contains base table row IDs, false if IR indices
+    bool stores_base_row_ids = false;
+
+    key_row_column_t() = default;
+
+    key_row_column_t(key_row_column_t &&other) noexcept
+        : pages(std::move(other.pages)), num_values(other.num_values),
+          base_table_id(other.base_table_id),
+          source_column(other.source_column),
+          stores_base_row_ids(other.stores_base_row_ids) {
+        other.pages.clear(); // leave the source explicitly empty
+        other.num_values = 0;
+    }
+
+    key_row_column_t &operator=(key_row_column_t &&other) noexcept {
+        if (this != &other) {
+            pages = std::move(other.pages);
+            num_values = other.num_values;
+            base_table_id = other.base_table_id;
+            source_column = other.source_column;
+            stores_base_row_ids = other.stores_base_row_ids;
+            other.pages.clear(); // leave the source explicitly empty
+            other.num_values = 0;
+        }
+        return *this;
+    }
+
+    key_row_column_t(const key_row_column_t &) = delete;
+    key_row_column_t &operator=(const key_row_column_t &) = delete;
+
+    ~key_row_column_t() = default; // does not free pages
+
+    /// O(1) unchecked read: idx >> ENTRY_SHIFT is the page, idx & ENTRY_MASK
+    inline KeyRowPair operator[](size_t idx) const {
+        return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK];
+    }
+
+    /// Write pair at idx (pages must exist); distinct idx hit distinct slots
+    inline void write_at(size_t idx, KeyRowPair pair) {
+        pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK] = pair;
+    }
+
+    /// Read only the key at index (no bounds check)
+    inline int32_t key_at(size_t idx) const {
+        return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK].key;
+    }
+
+    /// Read only the row_id at index (no bounds check)
+    inline uint32_t row_id_at(size_t idx) const {
+        return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK].row_id;
+    }
+
+    size_t row_count() const { return num_values; }
+    void set_row_count(size_t count) { num_values = count; }
+
+    /// Release ownership of pages for zero-copy transfer to hashtable.
+    /// After this call, the column is empty (pages cleared, num_values = 0).
+    /// @return Vector of page pointers (caller takes ownership).
+    std::vector<Page *> release_pages() && {
+        std::vector<Page *> released = std::move(pages);
+        pages.clear();
+        num_values = 0;
+        return released;
+    }
+
+    /// Append ceil(count / PAIRS_PER_PAGE) arena pages; set num_values = count
+    inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena,
+                                        size_t count) {
+        size_t pages_needed = (count + PAIRS_PER_PAGE - 1) / PAIRS_PER_PAGE;
+        pages.reserve(pages.size() + pages_needed);
+        for (size_t i = 0; i < pages_needed; ++i) {
+            void *chunk =
+                arena.alloc_chunk<Contest::platform::ChunkType::IR_PAGE>();
+            pages.push_back(static_cast<Page *>(chunk));
+        }
+        num_values = count;
+    }
+};
+
 /**
  * @brief 4-byte value: INT32 direct, VARCHAR packed (19-bit page + 13-bit
- * offset), NULL = INT32_MIN, long string offset = 0x1FFF. Refs valid only while
+ * offset).
+ *
+ * NULL = INT32_MIN, long string offset = 0x1FFF. Refs valid only while
  * source exists.
  */
 struct alignas(4) value_t {
@@ -45,16 +187,18 @@ struct alignas(4) value_t {
         offset_idx = (static_cast<uint32_t>(encoded) >> 19) & 0x1FFF;
     }
 
-    static constexpr int32_t LONG_STRING_OFFSET =
-        0x1FFF; /**< Sentinel for long strings. */
-    static constexpr int32_t NULL_VALUE =
-        INT32_MIN; /**< NULL sentinel for both types. */
+    /** @brief Sentinel for long strings. */
+    static constexpr int32_t LONG_STRING_OFFSET = 0x1FFF;
+
+    /** @brief NULL sentinel for both types. */
+    static constexpr int32_t NULL_VALUE = INT32_MIN;
 
     /** @brief Check if this value represents NULL. */
     inline bool is_null() const { return value == NULL_VALUE; }
 };
 
-/** @brief Page size for intermediate results (16KB, larger than ColumnarTable).
+/**
+ * @brief Page size for intermediate results (16KB, larger than ColumnarTable).
  */
 constexpr size_t IR_PAGE_SIZE = 1 << 14;
 
@@ -82,9 +226,12 @@ struct column_t {
 
   public:
     std::vector<Page *> pages; /**< Pointers to arena-allocated pages. */
-    uint8_t source_table =
-        0; /**< Base table index for VARCHAR dereferencing. */
-    uint8_t source_column = 0; /**< Column index within source table. */
+
+    /** @brief Base table index for VARCHAR dereferencing. */
+    uint8_t source_table = 0;
+
+    /** @brief Column index within source table. */
+    uint8_t source_column = 0;
 
   public:
     column_t() = default;
@@ -114,8 +261,10 @@ struct column_t {
 
     ~column_t() = default;
 
-    /** @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. No bounds
-     * check. */
+    /**
+     * @brief O(1) read: idx>>12 for page, idx&0xFFF for offset.
+     * @note No bounds check.
+     */
     inline const value_t &operator[](size_t idx) const {
         return pages[idx >> 12]->data[idx & 0xFFF];
     }
@@ -153,14 +302,314 @@ struct column_t {
 using Columnar = std::vector<column_t>;
 
 /**
- * @brief Convert column_t vector to ColumnarTable. Dereferences VARCHAR refs.
- * @see materialize.h
+ * @brief Per-base-table deferred row ID storage with multi-column tracking.
+ *
+ * Stores 32-bit row IDs for a single base table. All columns from this
+ * base table share the same row ID lookup, reducing memory from 8 bytes
+ * per column to 4 bytes per table.
+ *
+ * Uses 16KB pages (reuses IR_PAGE arena chunk) with 4096 uint32_t entries.
  */
-ColumnarTable to_columnar(const Columnar &table, const Plan &plan);
-} /* namespace mema */
+struct DeferredTable {
+    static constexpr size_t PAGE_SIZE = 1 << 14; // 16KB
+    static constexpr size_t ENTRIES_PER_PAGE = PAGE_SIZE / sizeof(uint32_t);
+    static constexpr size_t ENTRY_SHIFT = 12; // log2(ENTRIES_PER_PAGE)
+    static constexpr size_t ENTRY_MASK = ENTRIES_PER_PAGE - 1;
+    static_assert((size_t{1} << ENTRY_SHIFT) == ENTRIES_PER_PAGE);
+
+    struct alignas(PAGE_SIZE) Page {
+        uint32_t data[ENTRIES_PER_PAGE];
+    };
+
+    std::vector<Page *> pages; ///< Page pointers; not freed by this type
+    size_t num_values = 0;     ///< Logical number of row IDs
+
+    /// Base table ID this deferred table references
+    uint8_t base_table_id = 0;
+
+    /// True if this deferred table comes from build side (vs probe)
+    bool from_build = false;
+
+    /// Column indices from this base table that need deferred resolution
+    std::vector<uint8_t> column_indices;
+
+    DeferredTable() = default;
+
+    DeferredTable(DeferredTable &&other) noexcept
+        : pages(std::move(other.pages)), num_values(other.num_values),
+          base_table_id(other.base_table_id), from_build(other.from_build),
+          column_indices(std::move(other.column_indices)) {
+        other.pages.clear(); // leave the source explicitly empty
+        other.num_values = 0;
+    }
+
+    DeferredTable &operator=(DeferredTable &&other) noexcept {
+        if (this != &other) {
+            pages = std::move(other.pages);
+            num_values = other.num_values;
+            base_table_id = other.base_table_id;
+            from_build = other.from_build;
+            column_indices = std::move(other.column_indices);
+            other.pages.clear(); // leave the source explicitly empty
+            other.num_values = 0;
+        }
+        return *this;
+    }
+
+    DeferredTable(const DeferredTable &) = delete;
+    DeferredTable &operator=(const DeferredTable &) = delete;
+
+    ~DeferredTable() = default;
+
+    /// O(1) unchecked read: idx >> ENTRY_SHIFT is the page, idx & ENTRY_MASK
+    inline uint32_t operator[](size_t idx) const {
+        return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK];
+    }
+
+    /// Write row_id at idx (pages must exist); distinct idx hit distinct slots
+    inline void write_at(size_t idx, uint32_t row_id) {
+        pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK] = row_id;
+    }
+
+    size_t row_count() const { return num_values; }
+    void set_row_count(size_t count) { num_values = count; }
+
+    /// Linear scan of column_indices for col_idx
+    bool has_column(uint8_t col_idx) const {
+        for (uint8_t c : column_indices) {
+            if (c == col_idx)
+                return true;
+        }
+        return false;
+    }
+};
+
+} // namespace mema
 
-/** @namespace Contest @brief Contest API. @see Plan, execute.cpp */
 namespace Contest {
-/** @brief Result type for non-root joins (intermediate format). */
-using ExecuteResult = std::vector<mema::column_t>;
-} /* namespace Contest */
+
+/**
+ * @brief Locates a deferred column: its deferred table and its base column.
+ */
+struct DeferredColumnRef {
+    uint8_t table_idx; ///< Index into IntermediateResult::deferred_tables
+    uint8_t base_col;  ///< Base column index in Plan::inputs[base_table_id]
+};
+
+/**
+ * @brief Intermediate join result with selective (partial) materialization.
+ *
+ * The join key is kept as (value, row_id) tuples for accelerated hashtable
+ * build and zero-indirection row ID propagation. Other columns rely on
+ * per-table 32-bit row ID storage and are resolved later (deferred).
+ *
+ * - LEFT_ONLY/RIGHT_ONLY modes: join_key_tuples stores base table row IDs.
+ * - BOTH mode: join_key_tuples may store IR indices, plus a DeferredTable
+ *   for the other side.
+ *
+ * @see AnalyzedColumnInfo for materialization decisions.
+ * @see key_row_column_t for tuple storage.
+ * @see DeferredTable for 32-bit row ID storage.
+ */
+struct IntermediateResult {
+    /// Join key as (value, row_id) tuples for accelerated propagation.
+    /// When present, it replaces a materialized column for the join key.
+    std::optional<mema::key_row_column_t> join_key_tuples;
+
+    /// Output index of the join key column (nullopt if root or no tuples).
+    std::optional<size_t> join_key_idx;
+
+    /// Materialized columns other than the join key (marked MATERIALIZE).
+    std::vector<mema::column_t> materialized;
+
+    /// Original column index -> slot in materialized (nullopt when the
+    /// column is deferred or is the join key).
+    std::vector<std::optional<size_t>> materialized_map;
+
+    /// Deferred row ID storage, one DeferredTable per unique
+    /// (from_build, base_table_id) pair; columns of the same base table
+    /// share one row ID lookup. Used for BOTH mode's non-tracked side.
+    std::vector<mema::DeferredTable> deferred_tables;
+
+    /// Original column index -> DeferredColumnRef (nullopt when the column
+    /// is materialized). The ref carries table_idx (into deferred_tables)
+    /// and base_col for resolution.
+    std::vector<std::optional<DeferredColumnRef>> deferred_map;
+
+    /// Non-owning node info used for column provenance resolution.
+    const AnalyzedJoinNode *node_info = nullptr;
+
+    /// Total row count.
+    size_t num_rows = 0;
+
+    IntermediateResult() = default;
+    IntermediateResult(const IntermediateResult &) = delete;
+    IntermediateResult &operator=(const IntermediateResult &) = delete;
+    IntermediateResult(IntermediateResult &&) = default;
+    IntermediateResult &operator=(IntermediateResult &&) = default;
+
+    /** @brief Number of rows in this result. */
+    size_t row_count() const { return num_rows; }
+
+    /** @brief True when the join key is held as tuples. */
+    bool has_join_key_tuples() const { return join_key_tuples.has_value(); }
+
+    /** @brief True when the tuples carry base row IDs (not IR indices). */
+    bool join_key_has_base_rows() const {
+        return has_join_key_tuples() && join_key_tuples->stores_base_row_ids;
+    }
+
+    /** @brief Join key tuple at idx; {0, 0} when no tuples are stored. */
+    mema::KeyRowPair get_join_key_tuple(size_t idx) const {
+        if (!join_key_tuples)
+            return mema::KeyRowPair{0, 0};
+        return (*join_key_tuples)[idx];
+    }
+
+    /** @brief True when the column has a materialized slot. */
+    bool is_materialized(size_t orig_idx) const {
+        if (orig_idx >= materialized_map.size())
+            return false;
+        return materialized_map[orig_idx].has_value();
+    }
+
+    /** @brief True when the column is the tuple-stored join key. */
+    bool is_join_key(size_t orig_idx) const {
+        return join_key_idx && *join_key_idx == orig_idx;
+    }
+
+    /** @brief True when the column has a deferred reference. */
+    bool is_deferred(size_t orig_idx) const {
+        if (orig_idx >= deferred_map.size())
+            return false;
+        return deferred_map[orig_idx].has_value();
+    }
+
+    /** @brief Materialized column, or nullptr if deferred/join key. */
+    const mema::column_t *get_materialized(size_t orig_idx) const {
+        return is_materialized(orig_idx)
+                   ? &materialized[*materialized_map[orig_idx]]
+                   : nullptr;
+    }
+
+    /** @brief Deferred table for a column, or nullptr if not deferred. */
+    const mema::DeferredTable *get_deferred_table(size_t orig_idx) const {
+        const DeferredColumnRef *ref = get_deferred_ref(orig_idx);
+        return ref ? &deferred_tables[ref->table_idx] : nullptr;
+    }
+
+    /** @brief Mutable deferred table for a column, or nullptr. */
+    mema::DeferredTable *get_deferred_table_mut(size_t orig_idx) {
+        const DeferredColumnRef *ref = get_deferred_ref(orig_idx);
+        return ref ? &deferred_tables[ref->table_idx] : nullptr;
+    }
+
+    /** @brief Base column index of a deferred column; 0 if not deferred. */
+    uint8_t get_deferred_base_col(size_t orig_idx) const {
+        const DeferredColumnRef *ref = get_deferred_ref(orig_idx);
+        return ref ? ref->base_col : uint8_t{0};
+    }
+
+    /** @brief DeferredColumnRef of a column, or nullptr if not deferred. */
+    const DeferredColumnRef *get_deferred_ref(size_t orig_idx) const {
+        if (is_deferred(orig_idx))
+            return &*deferred_map[orig_idx];
+        return nullptr;
+    }
+
+    /** @brief Number of deferred tables (unique base tables). */
+    size_t num_deferred_tables() const { return deferred_tables.size(); }
+};
+
+/**
+ * @brief Unified abstraction over columnar tables and intermediate results.
+ *
+ * Stores ColumnarTable* (base scans) or IntermediateResult (child joins).
+ * Provides uniform interface for columnar (base table) and intermediate
+ * data sources.
+ *
+ * @see IntermediateResult for intermediate join results.
+ * @see ColumnarTable for base table storage.
+ */
+struct JoinInput {
+    /// Either base table pointer or owned IntermediateResult.
+    std::variant<const ColumnarTable *, IntermediateResult> data;
+
+    /// Original plan node for output_attrs mapping.
+    const PlanNode *node = nullptr;
+
+    /// Analyzed plan node for materialization decisions.
+    const AnalyzedNode *analyzed_node = nullptr;
+
+    /// Base table ID (for columnar inputs).
+    uint8_t table_id = 0;
+
+    /** @brief True if data is columnar (base table). */
+    bool is_columnar() const {
+        return std::holds_alternative<const ColumnarTable *>(data);
+    }
+
+    /**
+     * @brief Row count for join key column.
+     * @note col_idx is ignored (kept for existing call sites); the result
+     *       is always row_count().
+     */
+    size_t row_count([[maybe_unused]] size_t col_idx) const {
+        return row_count();
+    }
+
+    /** @brief Total row count. */
+    size_t row_count() const {
+        if (is_columnar()) {
+            const auto *table = std::get<const ColumnarTable *>(data);
+            return table->num_rows;
+        }
+        return std::get<IntermediateResult>(data).row_count();
+    }
+
+    /** @brief Number of output columns. */
+    size_t output_size() const {
+        if (node)
+            return node->output_attrs.size();
+        return 0;
+    }
+
+    /**
+     * @brief Get deferred table for a column index.
+     *
+     * For columnar inputs, returns nullptr (caller must encode fresh).
+     * For IntermediateResult inputs, returns existing deferred table.
+     */
+    const mema::DeferredTable *get_deferred_table(size_t col_idx) const {
+        if (is_columnar())
+            return nullptr;
+        return std::get<IntermediateResult>(data).get_deferred_table(col_idx);
+    }
+
+    /**
+     * @brief Get base column index for a deferred column.
+     *
+     * For columnar inputs, returns 0 (caller must use column metadata).
+     * For IntermediateResult inputs, returns stored base column index.
+     */
+    uint8_t get_deferred_base_col(size_t col_idx) const {
+        if (is_columnar())
+            return 0;
+        return std::get<IntermediateResult>(data).get_deferred_base_col(
+            col_idx);
+    }
+
+    /**
+     * @brief Get full DeferredColumnRef for a column index.
+     *
+     * For columnar inputs, returns nullptr (caller must encode fresh).
+     * For IntermediateResult inputs, returns pointer to DeferredColumnRef.
+     */
+    const DeferredColumnRef *get_deferred_ref(size_t col_idx) const {
+        if (is_columnar())
+            return nullptr;
+        return std::get<IntermediateResult>(data).get_deferred_ref(col_idx);
+    }
+};
+
+} // namespace Contest
diff --git a/include/data_model/plan.h b/include/data_model/plan.h
index 99c623e..897a8e2 100644
--- a/include/data_model/plan.h
+++ b/include/data_model/plan.h
@@ -33,7 +33,8 @@
 #endif
 
 /**
- * @brief RAII mmap wrapper with refcount. munmap on last ref release. Move-only.
+ * @brief RAII mmap wrapper with refcount. munmap on last ref release.
+ * Move-only.
  */
 class MappedMemory {
   public:
@@ -127,8 +128,8 @@ constexpr size_t PAGE_SIZE = 8192;
  * @brief 8-byte aligned page (8KB) for columnar data.
  *
  * INT32: [num_rows:u16][num_values:u16][values...][bitmap at end]
- * VARCHAR: [num_rows:u16][num_offsets:u16][offsets:u16...][string bytes][bitmap]
- * Long string markers: 0xFFFF (first), 0xFFFE (continuation).
+ * VARCHAR: [num_rows:u16][num_offsets:u16][offsets:u16...][str bytes][bitmap]
+ * Long string markers: 0xFFFF (first), 0xFFFE (continuation).
  * Dense page (no NULLs): num_rows == num_values → fast path.
  */
 struct alignas(8) Page {
@@ -219,7 +220,8 @@ struct Plan {
     size_t root;                       /**< Index of root node in nodes. */
 
     /**
-     * @brief Create JoinNode. @return node index. Execution may override build_left.
+     * @brief Create JoinNode. @return node index. Execution may override
+     * build_left.
      */
     size_t
     new_join_node(bool build_left, size_t left, size_t right, size_t left_attr,
@@ -282,7 +284,8 @@ template <class T> struct ColumnInserter {
         bitmap.resize(PAGE_SIZE);
     }
 
-    /** @brief Get current page, allocating if needed. Does not advance index. */
+    /** @brief Get current page, allocating if needed. Does not advance index.
+     */
     std::byte *get_page() {
         if (last_page_idx == column.pages.size()) [[unlikely]] {
             column.new_page();
@@ -369,7 +372,8 @@ template <> struct ColumnInserter<std::string> {
         bitmap.resize(PAGE_SIZE);
     }
 
-    /** @brief Get current page, allocating if needed. Does not advance index. */
+    /** @brief Get current page, allocating if needed. Does not advance index.
+     */
     std::byte *get_page() {
         if (last_page_idx == column.pages.size()) [[unlikely]] {
             column.new_page();
@@ -378,7 +382,8 @@ template <> struct ColumnInserter<std::string> {
         return page->data;
     }
 
-    /** @brief Write long string (>PAGE_SIZE-7) across pages. 0xFFFF/0xFFFE markers. */
+    /** @brief Write long string (>PAGE_SIZE-7) across pages. 0xFFFF/0xFFFE
+     * markers. */
     void save_long_string(std::string_view value) {
         size_t offset = 0;
         auto first_page = true;
@@ -484,6 +489,8 @@ struct TimingStats {
     int64_t setup_ms = 0;            /**< JoinSetup + build/probe selection. */
     int64_t total_execution_ms = 0;  /**< Wall-clock total for execute(). */
     int64_t intermediate_ms = 0; /**< construct_intermediate for non-root. */
+    int64_t analyze_plan_ms = 0; /**< Deferred: plan analysis time. */
+    int64_t deferred_resolve_ms = 0; /**< Deferred: column resolution time. */
 };
 
 /** @brief Allocate execution context (worker pool, shared state). */
diff --git a/include/foundation/common.h b/include/foundation/common.h
index 16c8aa7..49967cd 100644
--- a/include/foundation/common.h
+++ b/include/foundation/common.h
@@ -125,7 +125,8 @@ class File {
     }
 };
 
-/** @brief Read entire file into string. @throws std::runtime_error on failure. */
+/** @brief Read entire file into string. @throws std::runtime_error on failure.
+ */
 inline std::string read_file(const std::filesystem::path &path) {
     File f(path, "rb");
     ::fseek(f, 0, SEEK_END);
@@ -154,7 +155,8 @@ struct DSU {
     void unite(size_t x, size_t y) { pa[find(x)] = find(y); }
 };
 
-/** @brief Mark unreachable code path for compiler optimization (UB if reached). */
+/** @brief Mark unreachable code path for compiler optimization (UB if reached).
+ */
 [[noreturn]] inline void unreachable() {
     // Uses compiler specific extensions if possible.
     // Even if no extension is used, undefined behavior is still raised by
@@ -164,4 +166,90 @@ struct DSU {
 #else // GCC, Clang
     __builtin_unreachable();
 #endif
-}
\ No newline at end of file
+}
+
+namespace Contest {
+
+/**
+ * @brief Encoded global row ID: 5-bit table_id + 27-bit row_id.
+ *
+ * Tracks original scan table rows through recursive joins. Supports up to
+ * 32 tables and 134M rows per table; larger inputs are silently truncated.
+ * Encoding: [table_id: bits 27-31 (5 bits)][row_id: bits 0-26 (27 bits)]
+ */
+struct GlobalRowId {
+    static constexpr uint32_t TABLE_BITS = 5;
+    static constexpr uint32_t ROW_BITS = 27;
+    static constexpr uint32_t TABLE_SHIFT = ROW_BITS;
+    static constexpr uint32_t ROW_MASK = (1u << ROW_BITS) - 1;
+    static constexpr uint32_t MAX_TABLES = 1u << TABLE_BITS; // 32
+    static constexpr uint32_t MAX_ROWS = 1u << ROW_BITS;     // 134,217,728
+    static_assert(TABLE_BITS + ROW_BITS == 32, "fields must fill 32 bits");
+
+    /** @brief Encode table_id and row_id into a single uint32_t. */
+    static constexpr uint32_t encode(uint8_t table_id, uint32_t row_id) {
+        return (static_cast<uint32_t>(table_id) << TABLE_SHIFT) |
+               (row_id & ROW_MASK);
+    }
+
+    /** @brief Extract table_id from encoded global row ID. */
+    static constexpr uint8_t table(uint32_t encoded) {
+        return static_cast<uint8_t>(encoded >> TABLE_SHIFT);
+    }
+
+    /** @brief Extract row_id from encoded global row ID. */
+    static constexpr uint32_t row(uint32_t encoded) {
+        return encoded & ROW_MASK;
+    }
+};
+
+/**
+ * @brief 64-bit encoding for deferred column provenance.
+ *
+ * Encodes table_id, column_idx, and row_id into a single 64-bit value
+ * for efficient storage and resolution of deferred columns. Supports up to
+ * 256 tables, 256 columns per table, and 2^48 (~281 trillion) rows.
+ *
+ * Encoding: [table_id (8 bits)][column_idx (8 bits)][row_id (48 bits)]
+ *   - table_id: bits 56-63, column_idx: bits 48-55, row_id: bits 0-47
+ */
+struct DeferredProvenance {
+    static constexpr uint64_t ROW_BITS = 48;
+    static constexpr uint64_t COLUMN_BITS = 8;
+    static constexpr uint64_t TABLE_BITS = 8;
+    static_assert(ROW_BITS + COLUMN_BITS + TABLE_BITS == 64, "must fill 64 bits");
+
+    static constexpr uint64_t ROW_MASK = (1ULL << ROW_BITS) - 1;
+    static constexpr uint64_t COLUMN_MASK = (1ULL << COLUMN_BITS) - 1;
+    static constexpr uint64_t COLUMN_SHIFT = ROW_BITS;
+    static constexpr uint64_t TABLE_SHIFT = ROW_BITS + COLUMN_BITS;
+
+    static constexpr uint64_t MAX_TABLES = 1ULL << TABLE_BITS;   // 256
+    static constexpr uint64_t MAX_COLUMNS = 1ULL << COLUMN_BITS; // 256
+    static constexpr uint64_t MAX_ROWS = 1ULL << ROW_BITS;       // 281 trillion
+
+    /** @brief Encode table_id, column_idx, row_id (low 48 bits) into uint64_t. */
+    static constexpr uint64_t encode(uint8_t table_id, uint8_t column_idx,
+                                     uint64_t row_id) {
+        return (static_cast<uint64_t>(table_id) << TABLE_SHIFT) |
+               (static_cast<uint64_t>(column_idx) << COLUMN_SHIFT) |
+               (row_id & ROW_MASK);
+    }
+
+    /** @brief Extract table_id from encoded provenance. */
+    static constexpr uint8_t table(uint64_t encoded) {
+        return static_cast<uint8_t>(encoded >> TABLE_SHIFT);
+    }
+
+    /** @brief Extract column_idx from encoded provenance. */
+    static constexpr uint8_t column(uint64_t encoded) {
+        return static_cast<uint8_t>((encoded >> COLUMN_SHIFT) & COLUMN_MASK);
+    }
+
+    /** @brief Extract row_id from encoded provenance. */
+    static constexpr uint64_t row(uint64_t encoded) {
+        return encoded & ROW_MASK;
+    }
+};
+
+} // namespace Contest
\ No newline at end of file
diff --git a/include/join_execution/hash_join.h b/include/join_execution/hash_join.h
index 0e2b777..f518df2 100644
--- a/include/join_execution/hash_join.h
+++ b/include/join_execution/hash_join.h
@@ -1,12 +1,3 @@
-#pragma once
-
-#include <data_model/intermediate.h>
-#include <join_execution/hashtable.h>
-#include <join_execution/join_setup.h>
-#include <join_execution/match_collector.h>
-#include <platform/arena_vector.h>
-#include <platform/worker_pool.h>
-
 /**
  * @file hash_join.h
  * @brief Hash join build and probe operations.
@@ -19,6 +10,13 @@
  *
  * @see hashtable.h, match_collector.h
  */
+#pragma once
+
+#include <data_model/intermediate.h>
+#include <join_execution/hashtable.h>
+#include <join_execution/match_collector.h>
+#include <platform/arena_vector.h>
+#include <platform/worker_pool.h>
 
 /**
  * @namespace Contest::join
@@ -27,7 +25,6 @@
  */
 namespace Contest::join {
 
-using Contest::ExecuteResult;
 using Contest::platform::THREAD_COUNT;
 using Contest::platform::worker_pool;
 
@@ -52,16 +49,22 @@ inline UnchainedHashtable build_from_columnar(const JoinInput &input,
 /**
  * @brief Build hash table from intermediate results (column_t).
  *
- * Uses join key column from ExecuteResult produced by prior pipeline stages.
+ * Uses join key column from IntermediateResult produced by prior pipeline
+ * stages.
  */
 inline UnchainedHashtable build_from_intermediate(const JoinInput &input,
                                                   size_t attr_idx) {
-    const auto &result = std::get<ExecuteResult>(input.data);
-    const auto &column = result[attr_idx];
+    const auto &result = std::get<IntermediateResult>(input.data);
+    // Get the materialized column for the join key
+    const auto *column = result.get_materialized(attr_idx);
+    if (!column) {
+        // This should never happen - join keys must be materialized
+        std::abort();
+    }
 
     size_t row_count = input.row_count(attr_idx);
     UnchainedHashtable hash_table(row_count);
-    hash_table.build_intermediate(column, 8);
+    hash_table.build_intermediate(*column, 8);
 
     return hash_table;
 }
@@ -79,8 +82,7 @@ template <MatchCollectionMode Mode>
 inline std::vector<ThreadLocalMatchBuffer<Mode>>
 probe_intermediate(const UnchainedHashtable &hash_table,
                    const mema::column_t &probe_column) {
-    const auto *keys = hash_table.keys();
-    const auto *row_ids = hash_table.row_ids();
+    const auto *entries = hash_table.entries();
 
     size_t pool_size = THREAD_COUNT;
     std::vector<ThreadLocalMatchBuffer<Mode>> local_buffers(pool_size);
@@ -121,8 +123,8 @@ probe_intermediate(const UnchainedHashtable &hash_table,
                         hash_table.find_indices(key_val);
 
                     for (uint64_t i = start_idx; i < end_idx; ++i) {
-                        if (keys[i] == key_val) {
-                            local_buf.add_match(row_ids[i],
+                        if (entries[i].key == key_val) {
+                            local_buf.add_match(entries[i].row_id,
                                                 static_cast<uint32_t>(idx));
                         }
                     }
@@ -148,8 +150,7 @@ inline std::vector<ThreadLocalMatchBuffer<Mode>>
 probe_columnar(const UnchainedHashtable &hash_table,
                const JoinInput &probe_input, size_t probe_attr) {
 
-    const auto *keys = hash_table.keys();
-    const auto *row_ids = hash_table.row_ids();
+    const auto *entries = hash_table.entries();
 
     auto *table = std::get<const ColumnarTable *>(probe_input.data);
     auto [actual_idx_col, _] = probe_input.node->output_attrs[probe_attr];
@@ -197,8 +198,9 @@ probe_columnar(const UnchainedHashtable &hash_table,
                         hash_table.find_indices(key_val);
 
                     for (uint64_t j = start_idx; j < end_idx; ++j) {
-                        if (keys[j] == key_val) {
-                            local_buf.add_match(row_ids[j], probe_row_id);
+                        if (entries[j].key == key_val) {
+                            local_buf.add_match(entries[j].row_id,
+                                                probe_row_id);
                         }
                     }
                     probe_row_id++;
@@ -219,8 +221,9 @@ probe_columnar(const UnchainedHashtable &hash_table,
                             hash_table.find_indices(key_val);
 
                         for (uint64_t j = start_idx; j < end_idx; ++j) {
-                            if (keys[j] == key_val) {
-                                local_buf.add_match(row_ids[j], probe_row_id);
+                            if (entries[j].key == key_val) {
+                                local_buf.add_match(entries[j].row_id,
+                                                    probe_row_id);
                             }
                         }
                     }
@@ -233,4 +236,74 @@ probe_columnar(const UnchainedHashtable &hash_table,
     return local_buffers;
 }
 
+/**
+ * @brief Probe hash table with tuple column, returning thread-local buffers.
+ *
+ * Worker threads claim whole pages of (key, row_id) tuples via a shared
+ * atomic counter. Each tuple's row_id is passed on as the probe-side ID,
+ * enabling zero-indirection resolution when it is a base table row ID.
+ *
+ * @tparam Mode Collection mode (BOTH, LEFT_ONLY, RIGHT_ONLY) for compile-time
+ *              specialization of match buffer operations.
+ * @param hash_table Hash table to probe against.
+ * @param probe_tuples Tuple column containing (key, row_id) pairs.
+ * @return Vector of THREAD_COUNT match buffers, one per worker thread.
+ */
+template <MatchCollectionMode Mode>
+inline std::vector<ThreadLocalMatchBuffer<Mode>>
+probe_tuples(const UnchainedHashtable &hash_table,
+             const mema::key_row_column_t &probe_tuples) {
+
+    const auto *entries = hash_table.entries();
+    const size_t probe_count = probe_tuples.row_count();
+    const size_t num_pages = probe_tuples.pages.size();
+
+    std::vector<ThreadLocalMatchBuffer<Mode>> local_buffers(THREAD_COUNT);
+    std::atomic<size_t> page_counter(0);
+
+    worker_pool().execute([&](size_t thread_id) {
+        local_buffers[thread_id] = ThreadLocalMatchBuffer<Mode>(
+            Contest::platform::get_arena(thread_id));
+        auto &local_buf = local_buffers[thread_id];
+
+        while (true) {
+            size_t page_idx = page_counter.fetch_add(1);
+            if (page_idx >= num_pages)
+                break;
+
+            size_t base = page_idx * mema::key_row_column_t::PAIRS_PER_PAGE;
+            size_t end = std::min(base + mema::key_row_column_t::PAIRS_PER_PAGE,
+                                  probe_count);
+
+            constexpr size_t PREFETCH_DIST = 8;
+            for (size_t idx = base; idx < end; ++idx) {
+                // Prefetch the slot PREFETCH_DIST tuples ahead (same page)
+                if (idx + PREFETCH_DIST < end) {
+                    hash_table.prefetch_slot(
+                        probe_tuples.key_at(idx + PREFETCH_DIST));
+                }
+
+                mema::KeyRowPair pair = probe_tuples[idx];
+
+                // Skip NULL keys (no lookup is done for them)
+                if (pair.key != mema::value_t::NULL_VALUE) {
+                    auto [start_idx, end_idx] =
+                        hash_table.find_indices(pair.key);
+
+                    for (uint64_t i = start_idx; i < end_idx; ++i) {
+                        if (entries[i].key == pair.key) {
+                            // entries[i].row_id: build side's row ID,
+                            // pair.row_id: probe side's row ID
+                            // (each is either a base-table or an IR row ID).
+                            local_buf.add_match(entries[i].row_id, pair.row_id);
+                        }
+                    }
+                }
+            }
+        }
+    });
+
+    return local_buffers;
+}
+
 } // namespace Contest::join
diff --git a/include/join_execution/hashtable.h b/include/join_execution/hashtable.h
index f98ea18..a39c3aa 100644
--- a/include/join_execution/hashtable.h
+++ b/include/join_execution/hashtable.h
@@ -57,12 +57,18 @@ using Contest::join::BLOOM_TAGS;
  */
 class UnchainedHashtable {
   public:
-    /** @brief Key-rowid pair for hash table entries. */
+    /** @brief Key-rowid pair for hash table entries (build phase). */
     struct alignas(4) Tuple {
         int32_t key;     /**< Join key value. */
         uint32_t row_id; /**< Row index in source table. */
     };
 
+    /** @brief Key and row_id fused into one 8-byte-aligned probe entry. */
+    struct alignas(8) Entry {
+        int32_t key;     /**< Join key value. */
+        uint32_t row_id; /**< Row ID copied from the build tuple. */
+    };
+
     /** @brief L2-sized chunk for partition buffers. */
     static constexpr size_t CHUNK_SIZE = 4096;
     static constexpr size_t CHUNK_HEADER = 16;
@@ -126,28 +132,11 @@ class UnchainedHashtable {
         nullptr; /**< Arena for hash table allocations. */
     Contest::platform::ArenaVector<uint64_t>
         directory; /**< Slot entries: (end_offset << 16) | bloom_tag. */
-    Contest::platform::ArenaVector<int32_t>
-        keys_; /**< Contiguous key storage, indexed by directory. */
-    Contest::platform::ArenaVector<uint32_t>
-        row_ids_; /**< Parallel row_id storage, same indexing. */
+    Contest::platform::ArenaVector<Entry>
+        entries_; /**< Fused key+row_id storage, indexed by directory. */
     int shift =
         0; /**< Bit shift for slot calculation: slot = hash >> (64-shift). */
 
-    /**
-     * @brief CRC32-based hash with multiplicative mixing.
-     * @param key INT32 join key.
-     * @return 64-bit hash (upper bits index directory slot).
-     */
-    static uint64_t hash_key(int32_t key) noexcept {
-        constexpr uint64_t k = 0x8648DBDB;
-#if defined(__aarch64__)
-        uint32_t crc = __crc32w(0, static_cast<uint32_t>(key));
-#else
-        uint32_t crc = _mm_crc32_u32(0, static_cast<uint32_t>(key));
-#endif
-        return crc * ((k << 32) + 1);
-    }
-
     /**
      * @brief Returns bloom tag from hash. Uses bits 32-42 to index BLOOM_TAGS.
      * @see bloom_tags.h
@@ -231,8 +220,7 @@ class UnchainedHashtable {
                     uint64_t h = hash_key(tup.key);
                     size_t local_slot = slot_for(h) - slot_start;
                     uint32_t idx = offsets[local_slot] + counts[local_slot]++;
-                    keys_[idx] = tup.key;
-                    row_ids_[idx] = tup.row_id;
+                    entries_[idx] = {tup.key, tup.row_id};
                     directory[slot_start + local_slot] |= bloom_tag(h);
                 }
             }
@@ -253,7 +241,7 @@ class UnchainedHashtable {
      */
     explicit UnchainedHashtable(size_t build_size)
         : arena_(&Contest::platform::get_arena(0)), directory(*arena_),
-          keys_(*arena_), row_ids_(*arena_) {
+          entries_(*arena_) {
         size_t pow2 = 2048;
         while (pow2 < build_size)
             pow2 <<= 1;
@@ -262,17 +250,29 @@ class UnchainedHashtable {
         shift = __builtin_ctzll(pow2);
     }
 
-    /** @brief Number of keys in the hash table. */
-    size_t size() const noexcept { return keys_.size(); }
+    /** @brief Number of entries in the hash table. */
+    size_t size() const noexcept { return entries_.size(); }
 
     /** @brief True if hash table is empty. */
-    bool empty() const noexcept { return keys_.empty(); }
+    bool empty() const noexcept { return entries_.empty(); }
 
-    /** @brief Direct access to key array for probe. */
-    const int32_t *keys() const noexcept { return keys_.data(); }
+    /** @brief Direct access to fused entry array for probe. */
+    const Entry *entries() const noexcept { return entries_.data(); }
 
-    /** @brief Direct access to row_id array for probe. */
-    const uint32_t *row_ids() const noexcept { return row_ids_.data(); }
+    /**
+     * @brief CRC32 of the key, mixed by a 64-bit multiply. Public for pre-hash.
+     * @param key INT32 join key.
+     * @return 64-bit hash (upper bits index directory slot).
+     */
+    static uint64_t hash_key(int32_t key) noexcept {
+        constexpr uint64_t k = 0x8648DBDB; // mixing multiplier (high word)
+#if defined(__aarch64__)
+        uint32_t crc = __crc32w(0, static_cast<uint32_t>(key));
+#else
+        uint32_t crc = _mm_crc32_u32(0, static_cast<uint32_t>(key));
+#endif
+        return crc * ((k << 32) + 1); // crc is promoted to uint64_t here
+    }
 
     /**
      * @brief Prefetch directory slot for a key to hide memory latency.
@@ -286,13 +286,24 @@ class UnchainedHashtable {
         __builtin_prefetch(&directory[slot], 0, 2);
     }
 
+    /**
+     * @brief Prefetch directory slot using pre-computed hash.
+     *
+     * Same prefetch as the key-based variant, minus the hash computation.
+     * @param h Pre-computed hash from hash_key().
+     */
+    void prefetch_slot_prehashed(uint64_t h) const noexcept {
+        size_t slot = slot_for(h);
+        __builtin_prefetch(&directory[slot], 0, 2); // read-only hint
+    }
+
     /**
      * @brief Find index range for keys matching probe key.
      *
-     * @return [start, end) into keys_/row_ids_; (0,0) if bloom rejects.
+     * @return [start, end) into entries_; (0,0) if bloom rejects.
      */
     std::pair<uint64_t, uint64_t> find_indices(int32_t key) const noexcept {
-        if (keys_.empty())
+        if (entries_.empty())
             return {0, 0};
 
         uint64_t h = hash_key(key);
@@ -308,6 +319,32 @@ class UnchainedHashtable {
         return {start, end};
     }
 
+    /**
+     * @brief Find index range using pre-computed hash (avoids rehashing).
+     *
+     * Use when hash was already computed for prefetch or bloom filter check.
+     * @param key Unused here; callers compare it against entries in the range.
+     * @param h Pre-computed hash from hash_key(key).
+     * @return [start, end) into entries_; (0,0) if empty or bloom rejects.
+     */
+    std::pair<uint64_t, uint64_t>
+    find_indices_prehashed(int32_t key, uint64_t h) const noexcept {
+        (void)key; // Kept for signature symmetry; silences unused warning
+        if (entries_.empty())
+            return {0, 0};
+
+        size_t slot = slot_for(h);
+        uint64_t entry = directory[slot]; // (end_offset << 16) | bloom_tag
+        uint16_t tag = bloom_tag(h);
+
+        if ((entry & tag) != tag) // a tag bit is missing: reject
+            return {0, 0};
+
+        uint64_t end = entry >> 16; // drop the 16 tag bits
+        uint64_t start = (slot == 0) ? 0 : (directory[slot - 1] >> 16);
+        return {start, end};
+    }
+
     /**
      * @brief Build hash table from intermediate column_t.
      *
@@ -376,8 +413,7 @@ class UnchainedHashtable {
         size_t total = global_offsets[num_partitions];
         if (total == 0)
             return;
-        keys_.resize(total);
-        row_ids_.resize(total);
+        entries_.resize(total);
 
         // Build partitions in parallel
         const int nt = num_threads;
@@ -390,6 +426,92 @@ class UnchainedHashtable {
         });
     }
 
+    /**
+     * @brief Build hash table from (key, row_id) tuple column.
+     *
+     * Radix-partitioned parallel build from key_row_column_t. Pages are
+     * dealt to threads round-robin; each thread partitions whole pages.
+     * Pages starting at or beyond row_count contribute no rows.
+     *
+     * @param tuples Key-row tuple column from IntermediateResult.
+     * @param num_threads Thread count hint (unused, uses pool size).
+     */
+    void build_from_tuples(const mema::key_row_column_t &tuples,
+                           int /*num_threads*/ = 4) {
+        const size_t row_count = tuples.row_count();
+        if (row_count == 0)
+            return;
+
+        const int pool_threads = Contest::platform::worker_pool().thread_count();
+        const size_t num_slots = directory.size();
+        const size_t num_partitions =
+            compute_num_partitions(row_count, pool_threads);
+        const int partition_bits = __builtin_ctzll(num_partitions);
+        const size_t slots_per_partition = num_slots / num_partitions;
+
+        // Thread-local partitions for lock-free parallel partitioning
+        std::vector<ChunkAllocator> allocators(pool_threads);
+        for (int t = 0; t < pool_threads; ++t)
+            allocators[t].set_arena(Contest::platform::get_arena(t));
+        std::vector<std::vector<Partition>> thread_parts(pool_threads);
+        for (auto &tp : thread_parts)
+            tp.resize(num_partitions);
+
+        // Page-based partition phase - each thread processes whole pages
+        constexpr size_t PAIRS_PER_PAGE = mema::key_row_column_t::PAIRS_PER_PAGE;
+        const size_t num_pages = tuples.pages.size();
+
+        Contest::platform::worker_pool().execute(
+            [&, partition_bits, pool_threads](size_t t) {
+                const int part_shift = 64 - partition_bits;
+                const size_t stride = static_cast<size_t>(pool_threads);
+                for (size_t pg = t; pg < num_pages; pg += stride) {
+                    // Prefetch this thread's next page
+                    if (pg + stride < num_pages)
+                        __builtin_prefetch(tuples.pages[pg + stride]->data, 0, 3);
+                    const auto *page_data = tuples.pages[pg]->data;
+                    size_t base = pg * PAIRS_PER_PAGE;
+                    size_t left = base < row_count ? row_count - base : 0;
+                    size_t count = std::min(PAIRS_PER_PAGE, left);
+                    for (size_t i = 0; i < count; ++i) {
+                        const auto &pair = page_data[i];
+                        uint64_t h = hash_key(pair.key);
+                        size_t p = partition_bits ? (h >> part_shift) : 0;
+                        thread_parts[t][p].append(allocators[t],
+                                                  {pair.key, pair.row_id});
+                    }
+                }
+            });
+
+        // Prefix-sum per-thread partition counts into global offsets
+        Contest::platform::ArenaVector<size_t> global_offsets(*arena_);
+        global_offsets.resize(num_partitions + 1);
+        std::memset(global_offsets.data(), 0,
+                    (num_partitions + 1) * sizeof(size_t));
+        for (size_t p = 0; p < num_partitions; ++p) {
+            for (int t = 0; t < pool_threads; ++t) {
+                global_offsets[p + 1] += thread_parts[t][p].total_count;
+            }
+            global_offsets[p + 1] += global_offsets[p];
+        }
+
+        size_t total = global_offsets[num_partitions];
+        if (total == 0)
+            return;
+        entries_.resize(total);
+
+        // Build partitions in parallel
+        Contest::platform::worker_pool().execute([&, pool_threads](size_t t) {
+            for (size_t p = t; p < num_partitions;
+                 p += static_cast<size_t>(pool_threads)) {
+                build_partition(thread_parts, p, slots_per_partition,
+                                global_offsets[p],
+                                global_offsets[p + 1] - global_offsets[p],
+                                pool_threads, t);
+            }
+        });
+    }
+
     /**
      * @brief Build hash table from ColumnarTable Column.
      *
@@ -489,8 +611,7 @@ class UnchainedHashtable {
         size_t total = global_offsets[num_partitions];
         if (total == 0)
             return;
-        keys_.resize(total);
-        row_ids_.resize(total);
+        entries_.resize(total);
 
         const int nt = num_threads;
         Contest::platform::worker_pool().execute([&, nt](size_t t) {
diff --git a/include/join_execution/join_setup.h b/include/join_execution/join_setup.h
index 299dd65..f2917f0 100644
--- a/include/join_execution/join_setup.h
+++ b/include/join_execution/join_setup.h
@@ -1,63 +1,24 @@
 /**
  * @file join_setup.h
- * @brief Join configuration and input abstraction.
+ * @brief Join configuration and build/probe side selection.
  *
- * Provides JoinInput to abstract over columnar and intermediate data sources,
- * and utilities for selecting build/probe sides and preparing output columns.
+ * Provides utilities for selecting build/probe sides and determining
+ * which row IDs to collect based on output columns.
  */
 #pragma once
 
-#include <data_access/columnar_reader.h>
 #include <data_model/intermediate.h>
 #include <data_model/plan.h>
 #include <join_execution/match_collector.h>
 #include <tuple>
-#include <variant>
 #include <vector>
 
 /**
  * @namespace Contest::join
- * @brief JoinInput abstraction, build/probe selection, output column setup.
+ * @brief Build/probe selection and collection mode determination.
  */
 namespace Contest::join {
 
-using Contest::ExecuteResult;
-using Contest::io::ColumnarReader;
-
-/**
- * @brief Unified abstraction over columnar tables and intermediate results.
- *
- * Stores ColumnarTable* (base scans) or ExecuteResult (child joins). Node
- * provides output_attrs mapping for column resolution.
- */
-struct JoinInput {
-    std::variant<ExecuteResult, const ColumnarTable *> data;
-    const PlanNode *node; /**< Provides output_attrs for column mapping. */
-    uint8_t table_id;     /**< Source table ID for provenance tracking. */
-
-    /** @brief True if data is columnar (base table), false if intermediate. */
-    bool is_columnar() const {
-        return std::holds_alternative<const ColumnarTable *>(data);
-    }
-
-    /**
-     * @brief Row count for a given output column.
-     * @param col_idx Index into node->output_attrs.
-     */
-    size_t row_count(size_t col_idx) const {
-        if (is_columnar()) {
-            auto *table = std::get<const ColumnarTable *>(data);
-            auto [actual_col_idx, _] = node->output_attrs[col_idx];
-            return table->num_rows;
-        } else {
-            return std::get<ExecuteResult>(data)[col_idx].row_count();
-        }
-    }
-
-    /** @brief Number of output columns. */
-    size_t output_size() const { return node->output_attrs.size(); }
-};
-
 /**
  * @brief Configuration for build/probe side assignment.
  *
@@ -75,17 +36,6 @@ struct BuildProbeConfig {
     size_t probe_attr; /**< Join key index in probe's output_attrs. */
 };
 
-/** @brief Resolves global output column index to source input. */
-inline std::tuple<const JoinInput &, const PlanNode &, size_t>
-resolve_input_source(size_t global_idx, size_t split_point,
-                     const JoinInput &input_a, const PlanNode &node_a,
-                     const JoinInput &input_b, const PlanNode &node_b) {
-    if (global_idx < split_point) {
-        return {input_a, node_a, global_idx};
-    }
-    return {input_b, node_b, global_idx - split_point};
-}
-
 /**
  * @brief Chooses build/probe sides based on cardinality.
  *
@@ -156,153 +106,63 @@ inline MatchCollectionMode determine_collection_mode(
     return MatchCollectionMode::BOTH;
 }
 
-/**
- * @brief Creates output columns with provenance metadata from inputs.
- */
-inline ExecuteResult initialize_output_columns(
-    const std::vector<std::tuple<size_t, DataType>> &output_attrs,
-    const PlanNode &left_node, const PlanNode &right_node,
-    const JoinInput &left_input, const JoinInput &right_input,
-    size_t estimated_rows) {
-    ExecuteResult results;
-    results.reserve(output_attrs.size());
-    size_t left_size = left_input.output_size();
-
-    auto set_column_metadata = [](mema::column_t &col, const JoinInput &input,
-                                  const PlanNode &node, size_t col_idx) {
-        auto [actual_col_idx, _] = node.output_attrs[col_idx];
-        if (input.is_columnar()) {
-            col.source_table = input.table_id;
-            col.source_column = actual_col_idx;
-        } else {
-            const auto &result = std::get<ExecuteResult>(input.data);
-            col.source_table = result[col_idx].source_table;
-            col.source_column = result[col_idx].source_column;
-        }
-    };
-
-    for (size_t i = 0; i < output_attrs.size(); ++i) {
-        auto [col_idx, _] = output_attrs[i];
-        auto [input, node, local_idx] = resolve_input_source(
-            col_idx, left_size, left_input, left_node, right_input, right_node);
+} // namespace Contest::join
 
-        mema::column_t col;
-        set_column_metadata(col, input, node, local_idx);
-        results.push_back(std::move(col));
-    }
+namespace Contest {
 
-    return results;
-}
+// Forward declare AnalyzedJoinNode
+struct AnalyzedJoinNode;
 
 /**
- * @brief Join output state and columnar reader.
+ * @brief Tracking info for one side of a join (build or probe).
  *
- * prepared flag implements lazy PageIndex construction.
+ * Determines whether to embed base table row IDs or IR indices in the
+ * output tuples for this side.
  */
-struct JoinSetup {
-    ExecuteResult results; /**< Output columns being populated. */
-    ColumnarReader
-        columnar_reader; /**< Page cursor caching for columnar access. */
-    /**
-     * True after prepare_output_columns called.
-     */
-    bool prepared;
-
-    JoinSetup() : prepared(false) {}
+struct SideTrackingInfo {
+    bool track_base_rows =
+        false; ///< True to embed base row IDs, false for IR indices
+    uint8_t base_table_id = 0; ///< Base table to track (if track_base_rows)
 };
 
 /**
- * @brief Initializes JoinSetup with output columns; call before join execution.
+ * @brief Tracking configuration for intermediate construction.
  *
- * PageIndex construction deferred to prepare_output_columns().
+ * Determines what row IDs to embed in join key tuples and whether
+ * DeferredTables are needed for non-tracked sides.
  */
-inline JoinSetup
-setup_join(const JoinInput &build_input, const JoinInput &probe_input,
-           const PlanNode &build_node, const PlanNode &probe_node,
-           const PlanNode &left_node, const PlanNode &right_node,
-           const JoinInput &left_input, const JoinInput &right_input,
-           const std::vector<std::tuple<size_t, DataType>> &output_attrs,
-           size_t estimated_rows) {
-    JoinSetup setup;
-
-    setup.results =
-        initialize_output_columns(output_attrs, left_node, right_node,
-                                  left_input, right_input, estimated_rows);
-
-    setup.prepared = false;
-
-    return setup;
-}
+struct TupleTrackingInfo {
+    SideTrackingInfo build_tracking; ///< Tracking info for build side
+    SideTrackingInfo probe_tracking; ///< Tracking info for probe side
+    bool key_from_build =
+        true; ///< True if parent join key comes from build side
+};
 
 /**
- * @brief Collects Column pointers for needed output columns from columnar
- * input.
+ * @brief Result of a join execution before intermediate construction.
  *
- * Unused columns get nullptr to skip PageIndex construction.
- */
-inline platform::ArenaVector<const Column *>
-collect_needed_columns(const JoinInput &input, const PlanNode &node,
-                       const platform::ArenaVector<uint8_t> &needed,
-                       platform::ThreadArena &arena) {
-    platform::ArenaVector<const Column *> columns(arena);
-    columns.resize(node.output_attrs.size());
-    std::memset(columns.data(), 0, columns.size() * sizeof(const Column *));
-    auto *table = std::get<const ColumnarTable *>(input.data);
-
-    for (size_t i = 0; i < node.output_attrs.size(); ++i) {
-        auto [actual_col_idx, _] = node.output_attrs[i];
-        columns[i] = needed[i] ? &table->columns[actual_col_idx] : nullptr;
-    }
-    return columns;
-}
-
-/**
- * @brief Prepares ColumnarReader with columns needed for materialization.
+ * Contains match buffers and metadata needed for deferred IR construction.
+ * Allows parent join to decide row ID format based on its cardinality
+ * requirements before constructing the intermediate result.
  *
- * Triggers lazy PageIndex construction only for projected columns.
+ * @tparam Mode Match collection mode for this join's buffers.
  */
-inline void prepare_output_columns(
-    ColumnarReader &reader, const JoinInput &build_input,
-    const JoinInput &probe_input, const PlanNode &build_node,
-    const PlanNode &probe_node,
-    const std::vector<std::tuple<size_t, DataType>> &remapped_attrs,
-    size_t build_size) {
-
-    bool build_is_columnar = build_input.is_columnar();
-    bool probe_is_columnar = probe_input.is_columnar();
-
-    if (!build_is_columnar && !probe_is_columnar)
-        return;
-
-    auto &arena = Contest::platform::get_arena(0);
-
-    platform::ArenaVector<uint8_t> build_needed(arena);
-    build_needed.resize(build_node.output_attrs.size());
-    std::memset(build_needed.data(), 0, build_needed.size());
-
-    platform::ArenaVector<uint8_t> probe_needed(arena);
-    probe_needed.resize(probe_node.output_attrs.size());
-    std::memset(probe_needed.data(), 0, probe_needed.size());
-
-    for (const auto &[col_idx, dtype] : remapped_attrs) {
-        if (col_idx < build_size) {
-            if (build_is_columnar) {
-                build_needed[col_idx] = 1;
-            }
-        } else if (probe_is_columnar) {
-            probe_needed[col_idx - build_size] = 1;
-        }
-    }
-
-    if (build_is_columnar) {
-        reader.prepare_build(collect_needed_columns(build_input, build_node,
-                                                    build_needed, arena));
-    }
-
-    if (probe_is_columnar) {
-        reader.prepare_probe(collect_needed_columns(probe_input, probe_node,
-                                                    probe_needed, arena));
-    }
-}
+template <join::MatchCollectionMode Mode> struct MatchResult {
+    std::vector<join::ThreadLocalMatchBuffer<Mode>> buffers; ///< Per thread.
+    size_t total_count = 0; ///< Total match count, stored by the producer.
+
+    /// The inputs that were joined (for resolving row IDs during IR
+    /// construction)
+    JoinInput build_input;
+    JoinInput probe_input;
+
+    /// Join node and build/probe side assignment
+    const AnalyzedJoinNode *join_node = nullptr; ///< Null until assigned.
+    join::BuildProbeConfig config;
+
+    /// Convenience accessors over total_count (buffers are not inspected)
+    size_t count() const { return total_count; }
+    bool empty() const { return total_count == 0; }
+};
 
-} // namespace Contest::join
+} // namespace Contest
diff --git a/include/join_execution/match_collector.h b/include/join_execution/match_collector.h
index 78657b7..a4136cb 100644
--- a/include/join_execution/match_collector.h
+++ b/include/join_execution/match_collector.h
@@ -160,6 +160,60 @@ class ThreadLocalMatchBuffer {
         ChainIterator end() const { return ChainIterator(nullptr, 0); }
     };
 
+    /**
+     * @brief Batch reader for efficient SIMD access to chunk chains.
+     *
+     * Unlike ChainIterator which reads one element at a time, this reader
+     * provides direct pointer access to contiguous batches within chunks.
+     * Essential for SIMD provenance encoding in deferred materialization.
+     */
+    class ChunkBatchReader {
+        IndexChunk *current_chunk;
+        uint32_t offset;
+        size_t remaining;
+
+      public:
+        ChunkBatchReader(IndexChunk *chunk, size_t count)
+            : current_chunk(chunk), offset(0), remaining(count) {}
+
+        /** @brief True while elements remain and the chain has not ended. */
+        inline bool has_more() const { return remaining > 0 && current_chunk; }
+
+        /**
+         * @brief Get pointer to contiguous batch of row IDs.
+         *
+         * Returns up to max_batch contiguous elements of the current chunk
+         * (fewer at a chunk boundary), then steps to the next chunk. Past the
+         * last chunk the reader is exhausted even if remaining is non-zero.
+         * @param max_batch Maximum elements to return.
+         * @param actual_count Output: actual number of elements available.
+         * @return Pointer to contiguous row IDs, or nullptr if exhausted.
+         */
+        inline const uint32_t *get_batch(size_t max_batch,
+                                         size_t &actual_count) {
+            if (!current_chunk || remaining == 0) {
+                actual_count = 0;
+                return nullptr;
+            }
+
+            size_t available = current_chunk->count - offset;
+            actual_count = std::min({max_batch, remaining, available});
+            const uint32_t *ptr = &current_chunk->ids[offset];
+
+            offset += static_cast<uint32_t>(actual_count);
+            remaining -= actual_count;
+
+            if (offset >= current_chunk->count) { // drained; next may be null
+                current_chunk = current_chunk->next;
+                offset = 0;
+            }
+            return ptr;
+        }
+
+        /** @brief Remaining element count. */
+        inline size_t count() const { return remaining; }
+    };
+
     /** @brief Returns range for iterating left (build) row IDs. */
     inline ChainRange left_range() const {
         return ChainRange(left_head, total_count);
@@ -170,6 +224,16 @@ class ThreadLocalMatchBuffer {
         return ChainRange(right_head, total_count);
     }
 
+    /** @brief Batch reader over the left (build) chain, total_count IDs. */
+    inline ChunkBatchReader left_batch_reader() const {
+        return ChunkBatchReader(left_head, total_count);
+    }
+
+    /** @brief Batch reader over the right (probe) chain, total_count IDs. */
+    inline ChunkBatchReader right_batch_reader() const {
+        return ChunkBatchReader(right_head, total_count);
+    }
+
     /** @brief Returns match count in this buffer. */
     size_t count() const { return total_count; }
 
diff --git a/include/join_execution/nested_loop.h b/include/join_execution/nested_loop.h
index e1086d0..d836409 100644
--- a/include/join_execution/nested_loop.h
+++ b/include/join_execution/nested_loop.h
@@ -15,7 +15,6 @@
 #include <cstdint>
 #include <data_model/intermediate.h>
 #include <data_model/plan.h>
-#include <join_execution/join_setup.h>
 #include <join_execution/match_collector.h>
 #include <join_execution/simd_compare.h>
 #include <platform/arena_vector.h>
@@ -28,7 +27,6 @@
  */
 namespace Contest::join {
 
-using Contest::ExecuteResult;
 using Contest::platform::THREAD_COUNT;
 using Contest::platform::worker_pool;
 
@@ -36,6 +34,8 @@ using Contest::platform::worker_pool;
  * @brief Iterates over non-NULL values in a join input column.
  *
  * Abstracts columnar vs intermediate input. Handles NULL bitmaps.
+ * For IntermediateResult, reads from join_key_tuples if available,
+ * otherwise from materialized columns (join keys are always available).
  *
  * @tparam Func void(uint32_t row_id, int32_t value).
  */
@@ -69,11 +69,29 @@ inline void visit_rows(const JoinInput &input, size_t attr_idx,
             }
         }
     } else {
-        const auto &res = std::get<ExecuteResult>(input.data);
-        const mema::column_t &col = res[attr_idx];
-        size_t count = col.row_count();
+        const auto &res = std::get<IntermediateResult>(input.data);
+
+        // Check if join key is stored as tuples
+        if (res.has_join_key_tuples() && res.join_key_idx.has_value() &&
+            *res.join_key_idx == attr_idx) {
+            const auto &tuples = *res.join_key_tuples;
+            size_t count = tuples.row_count();
+            for (size_t i = 0; i < count; i++) {
+                mema::KeyRowPair pair = tuples[i];
+                if (pair.key != mema::value_t::NULL_VALUE) {
+                    visitor(static_cast<uint32_t>(i), pair.key);
+                }
+            }
+            return;
+        }
+
+        // Fall back to materialized column
+        const mema::column_t *col = res.get_materialized(attr_idx);
+        if (!col)
+            return; // Should not happen - join keys are always available
+        size_t count = col->row_count();
         for (size_t i = 0; i < count; i++) {
-            const mema::value_t &val = col[i];
+            const mema::value_t &val = (*col)[i];
             if (!val.is_null()) {
                 visitor(static_cast<uint32_t>(i), val.value);
             }
@@ -124,6 +142,7 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input,
         b_vals[i] = INT32_MIN;
     }
 
+    // Setup for columnar probe (page-based parallel processing)
     const Column *probe_col = nullptr;
     platform::ArenaVector<uint32_t> page_offsets(
         Contest::platform::get_arena(0));
@@ -140,6 +159,24 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input,
         }
         page_offsets.push_back(current);
     }
+
+    // Setup for IntermediateResult probe - check tuples first, then
+    // materialized
+    const mema::column_t *probe_mat_col = nullptr;
+    const mema::key_row_column_t *probe_tuples = nullptr;
+    if (!probe_input.is_columnar()) {
+        const auto &res = std::get<IntermediateResult>(probe_input.data);
+        // Check if join key is stored as tuples
+        if (res.has_join_key_tuples() && res.join_key_idx.has_value() &&
+            *res.join_key_idx == probe_attr) {
+            probe_tuples = &(*res.join_key_tuples);
+        } else {
+            probe_mat_col = res.get_materialized(probe_attr);
+            if (!probe_mat_col)
+                return {}; // Join key not available - should not happen
+        }
+    }
+
     std::atomic<size_t> probe_page_counter{0};
 
     worker_pool().execute([&](size_t t_id) {
@@ -189,9 +226,22 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input,
                     }
                 }
             }
+        } else if (probe_tuples) {
+            // IntermediateResult probe - use tuple column
+            const mema::key_row_column_t &tuples = *probe_tuples;
+            size_t count = tuples.row_count();
+            size_t start = (t_id * count) / THREAD_COUNT;
+            size_t end = ((t_id + 1) * count) / THREAD_COUNT;
+
+            for (size_t i = start; i < end; i++) {
+                mema::KeyRowPair pair = tuples[i];
+                if (pair.key != mema::value_t::NULL_VALUE) {
+                    process_value(static_cast<uint32_t>(i), pair.key);
+                }
+            }
         } else {
-            const auto &res = std::get<ExecuteResult>(probe_input.data);
-            const mema::column_t &col = res[probe_attr];
+            // IntermediateResult probe - use materialized column
+            const mema::column_t &col = *probe_mat_col;
             size_t count = col.row_count();
             size_t start = (t_id * count) / THREAD_COUNT;
             size_t end = ((t_id + 1) * count) / THREAD_COUNT;
diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h
index 45a4386..ec3db86 100644
--- a/include/materialization/construct_intermediate.h
+++ b/include/materialization/construct_intermediate.h
@@ -2,122 +2,502 @@
  * @file construct_intermediate.h
  * @brief Constructs intermediate results for multi-way joins.
  *
- * Allocates and populates ExecuteResult (column_t) from match collectors.
- * Templated on MatchCollectionMode for zero-overhead mode selection.
+ * Allocates and populates IntermediateResult with only MATERIALIZE columns
+ * (typically just the parent's join key). Deferred columns use per-table
+ * 32-bit row ID storage for memory efficiency.
+ *
+ * Optimized with:
+ * - Column-major iteration for cache locality
+ * - Precomputed source metadata to avoid per-row variant access
+ * - Per-table 32-bit row ID storage (vs per-column 64-bit provenance)
+ * - Batch access to match collector chunks
+ *
+ * @see materialize.h for final resolution of deferred columns.
  */
 #pragma once
 
+#include <cstring>
+#include <unordered_map>
+#include <vector>
+
 #include <data_access/columnar_reader.h>
+#include <data_model/deferred_plan.h>
 #include <data_model/intermediate.h>
-#include <data_model/plan.h>
+#include <foundation/common.h>
 #include <join_execution/join_setup.h>
 #include <join_execution/match_collector.h>
 #include <platform/arena.h>
 #include <platform/worker_pool.h>
-#include <vector>
-/**
- * @namespace Contest::materialize
- * @brief Materialization of join results into columnar format.
- *
- * @see intermediate.h for column_t/value_t format details.
- */
-namespace Contest::materialize {
 
-using Contest::ExecuteResult;
+namespace Contest {
+namespace materialize {
+
 using Contest::io::ColumnarReader;
-using Contest::join::JoinInput;
 using Contest::join::MatchCollectionMode;
 using Contest::join::ThreadLocalMatchBuffer;
 using Contest::platform::THREAD_COUNT;
 using Contest::platform::worker_pool;
 
+// ============================================================================
+// Row ID Batch Operations (for 32-bit per-table deferred)
+// ============================================================================
+
+namespace row_id_ops {
+
 /**
- * @brief Precomputed metadata for resolving an output column's source.
+ * @brief Write row IDs directly from columnar input.
  *
- * Avoids per-value std::variant accesses and tuple lookups in hot loop.
- * 8-byte alignment optimizes struct packing for vector iteration.
+ * For columnar inputs, we just write the row_id directly (it's already
+ * the base table row ID). Optimized with memcpy when batch fits in one page.
+ */
+inline size_t write_row_ids_direct(mema::DeferredTable &dest, size_t start_idx,
+                                   const uint32_t *row_ids, size_t count) {
+    // Empty batch: dereference no page and hand memcpy no null source.
+    if (count == 0)
+        return 0;
+
+    // Locate the first destination entry in DeferredTable's paged layout.
+    const size_t page_idx = start_idx >> mema::DeferredTable::ENTRY_SHIFT;
+    const size_t offset = start_idx & mema::DeferredTable::ENTRY_MASK;
+
+    // Fast path: the whole batch lands in one page, so copy it in one go.
+    if (offset + count <= mema::DeferredTable::ENTRIES_PER_PAGE) {
+        std::memcpy(&dest.pages[page_idx]->data[offset], row_ids,
+                    count * sizeof(uint32_t));
+        return count;
+    }
+
+    // Slow path: the batch crosses a page boundary; write entry by entry.
+    for (size_t i = 0; i < count; ++i)
+        dest.write_at(start_idx + i, row_ids[i]);
+    return count;
+}
+
+/**
+ * @brief Copy row IDs from child deferred table.
+ *
+ * For intermediate inputs, we look up the base table row ID from the
+ * child's deferred table and copy it to the parent's deferred table.
+ */
+inline size_t copy_row_ids_from_child(mema::DeferredTable &dest,
+                                      size_t start_idx,
+                                      const mema::DeferredTable &src,
+                                      const uint32_t *row_ids, size_t count) {
+    // Gather pass: slot (start_idx + k) receives src[row_ids[k]].
+    for (size_t k = 0, out = start_idx; k != count; ++k, ++out)
+        dest.write_at(out, src[row_ids[k]]);
+    return count;
+}
+
+} // namespace row_id_ops
+
+// ============================================================================
+// Source Precomputation Structures
+// ============================================================================
+
+/**
+ * @brief Precomputed metadata for a deferred table source.
  *
- * @see prepare_sources() for precomputation logic.
+ * Groups columns by (from_build, base_table_id) so we only store 32-bit
+ * row IDs once per unique base table instead of 64-bit provenance per column.
  */
-struct alignas(8) SourceInfo {
+struct DeferredTableSource {
+    /// Child's deferred table to copy row IDs from; null for direct writes.
+    const mema::DeferredTable *child_table{nullptr};
+    uint8_t base_table_id{0};  ///< Base table the row IDs refer to.
+    uint8_t dest_table_idx{0}; ///< Slot in result.deferred_tables[].
+    bool from_build{false};    ///< Build side (true) or probe side (false).
+    bool needs_direct{false};  ///< Write match row IDs as-is (no child copy).
+};
+
+/**
+ * @brief Precomputed metadata for materialized column sources.
+ *
+ * Eliminates per-row std::variant access and conditional checks in hot loop.
+ */
+struct alignas(8) MaterializedColumnSource {
     const mema::column_t *intermediate_col =
-        nullptr;                          /**< Source if intermediate. */
-    const Column *columnar_col = nullptr; /**< Source if columnar. */
-    size_t remapped_col_idx = 0; /**< Local index within source side. */
-    bool is_columnar = false;    /**< True if source is columnar table. */
-    bool from_build = false; /**< True if from build side, false if probe. */
+        nullptr; ///< Source if from IntermediateResult materialized
+    const Column *columnar_col = nullptr; ///< Source if from ColumnarTable
+    const mema::DeferredTable *deferred_table =
+        nullptr; ///< Source deferred table if needs resolution
+    const mema::key_row_column_t *tuple_col =
+        nullptr;                 ///< Source if from child's join_key_tuples
+    size_t child_output_idx = 0; ///< Index in child's output
+    size_t mat_col_idx = 0;      ///< Index in result.materialized[]
+    DataType type = DataType::INT32;
+    uint8_t base_table_id = 0;           ///< For VARCHAR source tracking
+    uint8_t base_column_idx = 0;         ///< For VARCHAR source tracking
+    bool is_columnar = false;            ///< True if source is ColumnarTable
+    bool from_build = false;             ///< True if from build side
+    bool needs_deferred_resolve = false; ///< True if child deferred this column
+    bool needs_tuple_key_read = false;   ///< True if reading key from tuples
 };
 
+// ============================================================================
+// Helper Functions
+// ============================================================================
+
+/**
+ * @brief Collect columns needed from a JoinInput for page index building.
+ */
+inline platform::ArenaVector<const Column *>
+collect_input_columns(const JoinInput &input,
+                      const platform::ArenaVector<uint8_t> &needed,
+                      platform::ThreadArena &arena) {
+    // One slot per output attribute of the input's node, zeroed up front.
+    platform::ArenaVector<const Column *> columns(arena);
+    if (input.node) {
+        const auto &attrs = input.node->output_attrs;
+        columns.resize(attrs.size());
+        std::memset(columns.data(), 0, columns.size() * sizeof(const Column *));
+        // Only a columnar input has table columns to point the slots at.
+        if (input.is_columnar()) {
+            auto *table = std::get<const ColumnarTable *>(input.data);
+            for (size_t slot = 0; slot < attrs.size(); ++slot) {
+                if (slot >= needed.size() || !needed[slot])
+                    continue;
+                auto [actual_col_idx, _] = attrs[slot];
+                columns[slot] = &table->columns[actual_col_idx];
+            }
+        }
+    }
+    return columns;
+}
+
+/**
+ * @brief Prepare ColumnarReader for intermediate construction.
+ *
+ * Sets up page indices for columns that need to be read from columnar inputs.
+ * If parent_key_idx is provided, also prepares the join key column for tuple
+ * population.
+ */
+inline void prepare_intermediate_columns(
+    ColumnarReader &reader, const JoinInput &build_input,
+    const JoinInput &probe_input, const AnalyzedJoinNode &join_node,
+    const std::vector<std::tuple<size_t, DataType>> &remapped_attrs,
+    size_t build_size, bool build_is_left,
+    std::optional<size_t> parent_key_idx = std::nullopt) {
+
+    const bool build_columnar = build_input.is_columnar();
+    const bool probe_columnar = probe_input.is_columnar();
+
+    // The reader is only prepared for columnar sides; none means no work.
+    if (!(build_columnar || probe_columnar))
+        return;
+
+    auto &arena = Contest::platform::get_arena(0);
+
+    // Per-side flags, one byte per child output attribute (1 = needed).
+    // A side without a plan node keeps an empty flag vector.
+    platform::ArenaVector<uint8_t> build_flags(arena);
+    if (build_input.node) {
+        build_flags.resize(build_input.node->output_attrs.size());
+        std::memset(build_flags.data(), 0, build_flags.size());
+    }
+
+    platform::ArenaVector<uint8_t> probe_flags(arena);
+    if (probe_input.node) {
+        probe_flags.resize(probe_input.node->output_attrs.size());
+        std::memset(probe_flags.data(), 0, probe_flags.size());
+    }
+
+    // Flags one column on the side it comes from; a child index beyond
+    // that side's flag vector is ignored.
+    auto flag_column = [&](const auto &col) {
+        auto &flags =
+            (col.from_left == build_is_left) ? build_flags : probe_flags;
+        if (col.child_output_idx < flags.size())
+            flags[col.child_output_idx] = 1;
+    };
+
+    // Every column this join materializes has to be readable.
+    for (const auto &col : join_node.columns) {
+        if (col.resolution == ColumnResolution::MATERIALIZE)
+            flag_column(col);
+    }
+
+    // The parent's join key, when given, is flagged as well so its page
+    // index is ready for tuple population. Only the first column whose
+    // original_idx matches is considered.
+    if (parent_key_idx.has_value()) {
+        for (const auto &col : join_node.columns) {
+            if (col.original_idx != *parent_key_idx)
+                continue;
+            flag_column(col);
+            break;
+        }
+    }
+
+    // Hand each columnar side the columns selected above.
+    if (build_columnar) {
+        reader.prepare_build(
+            collect_input_columns(build_input, build_flags, arena));
+    }
+
+    if (probe_columnar) {
+        reader.prepare_probe(
+            collect_input_columns(probe_input, probe_flags, arena));
+    }
+}
+
+/**
+ * @brief Prepare page indices for base table columns used in deferred
+ * resolution.
+ *
+ * Called before constructing intermediate results to enable O(log P) page
+ * lookup instead of O(P) linear scan when resolving deferred columns that need
+ * to materialize values from base tables.
+ *
+ * @param reader ColumnarReader to prepare page indices in.
+ * @param mat_sources Precomputed materialized column sources.
+ * @param analyzed_plan Full analyzed plan containing base tables.
+ */
+inline void prepare_deferred_base_tables(
+    ColumnarReader &reader,
+    const std::vector<MaterializedColumnSource> &mat_sources,
+    const AnalyzedPlan &analyzed_plan) {
+    if (!analyzed_plan.original_plan)
+        return;
+    const auto &base_tables = analyzed_plan.original_plan->inputs;
+
+    // Base-table state is deliberately NOT reset here: base tables do not
+    // change within a query, so prepared columns stay valid across joins.
+    // reset_base_tables() is the caller's job, once per query.
+    for (const auto &src : mat_sources) {
+        // Only sources resolved through a deferred table read base columns.
+        if (!src.needs_deferred_resolve)
+            continue;
+
+        const uint8_t table_id = src.base_table_id;
+        const uint8_t col_idx = src.base_column_idx;
+        if (reader.is_base_column_prepared(table_id, col_idx))
+            continue; // Already prepared; nothing to redo.
+
+        // Provenance pointing outside the plan's inputs is skipped silently.
+        if (table_id >= base_tables.size())
+            continue;
+        const auto &base_table = base_tables[table_id];
+        if (col_idx >= base_table.columns.size())
+            continue;
+        reader.prepare_base_column(table_id, col_idx,
+                                   base_table.columns[col_idx]);
+    }
+}
+
+/**
+ * @brief Create empty intermediate result with proper schema.
+ */
+inline IntermediateResult
+create_empty_intermediate_result(const AnalyzedJoinNode &node) {
+    IntermediateResult empty;
+    empty.node_info = &node;
+    empty.num_rows = 0;
+    empty.materialized_map.resize(node.columns.size(), std::nullopt);
+    empty.deferred_map.resize(node.columns.size(), std::nullopt);
+
+    // Number the MATERIALIZE columns in node.columns order. Deferred tables
+    // are intentionally left unset: a zero-row result has nothing to resolve.
+    size_t next_slot = 0;
+    for (const auto &col : node.columns) {
+        if (col.resolution != ColumnResolution::MATERIALIZE)
+            continue;
+        empty.materialized_map[col.original_idx] = next_slot++;
+    }
+    empty.materialized.resize(next_slot);
+    return empty;
+}
+
+/**
+ * @brief Set up out_result's deferred tables and map; list their sources.
+ *
+ * DEFER columns are grouped by (from_build, base_table_id); each group shares
+ * one DeferredTable. Returns one source per newly created table.
+ */
+inline std::vector<DeferredTableSource>
+prepare_deferred_table_sources(const AnalyzedJoinNode &join_node,
+                               const JoinInput &build_input,
+                               const JoinInput &probe_input, bool build_is_left,
+                               IntermediateResult &out_result) {
+    // (from_build << 8 | base_table_id) -> index into deferred_tables
+    std::unordered_map<uint16_t, uint8_t> table_key_to_idx;
+    std::vector<DeferredTableSource> sources;
+
+    for (const auto &col : join_node.columns) {
+        if (col.resolution != ColumnResolution::DEFER)
+            continue;
+
+        bool from_build = (col.from_left == build_is_left);
+        uint16_t key = (static_cast<uint16_t>(from_build) << 8) |
+                       col.provenance.base_table_id;
+
+        auto it = table_key_to_idx.find(key);
+        uint8_t dest_idx;
+
+        if (it == table_key_to_idx.end()) {
+            // First DEFER column of this group: add its table (index is uint8_t)
+            dest_idx = static_cast<uint8_t>(out_result.deferred_tables.size());
+            table_key_to_idx[key] = dest_idx;
+
+            mema::DeferredTable dt;
+            dt.base_table_id = col.provenance.base_table_id;
+            dt.from_build = from_build;
+            out_result.deferred_tables.push_back(std::move(dt));
+
+            // Source is decided once, from this first column of the group
+            DeferredTableSource src;
+            src.base_table_id = col.provenance.base_table_id;
+            src.dest_table_idx = dest_idx;
+            src.from_build = from_build;
+
+            const auto &src_input = from_build ? build_input : probe_input;
+            if (src_input.is_columnar()) {
+                src.needs_direct = true;
+                src.child_table = nullptr;
+            } else {
+                const auto &child_ir =
+                    std::get<IntermediateResult>(src_input.data);
+                // Does the child hold this column in a deferred table?
+                const auto *child_ref =
+                    src_input.get_deferred_ref(col.child_output_idx);
+                if (child_ref) {
+                    src.needs_direct = false;
+                    src.child_table =
+                        src_input.get_deferred_table(col.child_output_idx);
+                } else if (child_ir.is_join_key(col.child_output_idx)) {
+                    // The child holds this column only as join-key tuples,
+                    // whose row IDs index the child IR, not the base table.
+                    // Unexpected: join keys should never be marked DEFER.
+                    // Debug builds report it; the direct path is used anyway.
+#ifndef NDEBUG
+                    std::fprintf(stderr,
+                                 "[BUG] DEFER column %zu is child's "
+                                 "join key - this is unexpected!\n",
+                                 col.child_output_idx);
+#endif
+                    src.needs_direct = true;
+                    src.child_table = nullptr;
+                } else {
+                    // Neither deferred nor join key: unexpected for DEFER cols
+                    src.needs_direct = true;
+                    src.child_table = nullptr;
+                }
+            }
+            sources.push_back(src);
+        } else {
+            dest_idx = it->second;
+        }
+
+        // Every DEFER column registers its base column with the shared table
+        out_result.deferred_tables[dest_idx].column_indices.push_back(
+            col.provenance.base_column_idx);
+
+        // Map the output column to its (table slot, base column) pair
+        DeferredColumnRef ref;
+        ref.table_idx = dest_idx;
+        ref.base_col = col.provenance.base_column_idx;
+        out_result.deferred_map[col.original_idx] = ref;
+    }
+
+    return sources;
+}
+
 /**
- * @brief Builds SourceInfo for each output column for fast hot-loop lookup.
- *
- * @param remapped_attrs Output column specifications (global indexing).
- * @param build_input    Build side data (ColumnarTable* or ExecuteResult).
- * @param probe_input    Probe side data (ColumnarTable* or ExecuteResult).
- * @param build_node     PlanNode for build side (contains output_attrs).
- * @param probe_node     PlanNode for probe side (contains output_attrs).
- * @param build_size     Number of columns from build side.
- * @return Vector of SourceInfo, one per output column.
- *
- * @see SourceInfo for field documentation.
- * @see construct_intermediate() for consumption in hot loop.
+ * @brief Precompute materialized column sources for column-major iteration.
+ *
+ * For each MATERIALIZE column, determines source type and caches pointers
+ * to avoid per-row std::variant access in the hot loop.
  */
-inline std::vector<SourceInfo>
-prepare_sources(const std::vector<std::tuple<size_t, DataType>> &remapped_attrs,
-                const JoinInput &build_input, const JoinInput &probe_input,
-                const PlanNode &build_node, const PlanNode &probe_node,
-                size_t build_size) {
-    std::vector<SourceInfo> sources;
-    sources.reserve(remapped_attrs.size());
-    for (const auto &[col_idx, _] : remapped_attrs) {
-        SourceInfo info;
-        info.from_build = (col_idx < build_size);
-        size_t local_idx = info.from_build ? col_idx : col_idx - build_size;
-        info.remapped_col_idx = local_idx;
-        const JoinInput &input = info.from_build ? build_input : probe_input;
-        const PlanNode &node = info.from_build ? build_node : probe_node;
-        if (input.is_columnar()) {
-            info.is_columnar = true;
-            auto *table = std::get<const ColumnarTable *>(input.data);
-            auto [actual_idx, _] = node.output_attrs[local_idx];
-            info.columnar_col = &table->columns[actual_idx];
+inline std::vector<MaterializedColumnSource>
+prepare_materialized_sources(const AnalyzedJoinNode &join_node,
+                             const JoinInput &build_input,
+                             const JoinInput &probe_input, bool build_is_left) {
+    std::vector<MaterializedColumnSource> sources;
+    sources.reserve(join_node.columns.size());
+
+    size_t mat_idx = 0;
+    for (const auto &col : join_node.columns) {
+        if (col.resolution != ColumnResolution::MATERIALIZE)
+            continue;
+
+        MaterializedColumnSource src;
+        src.mat_col_idx = mat_idx++;
+        src.child_output_idx = col.child_output_idx;
+        src.type = col.type;
+        src.base_table_id = col.provenance.base_table_id;
+        src.base_column_idx = col.provenance.base_column_idx;
+        src.from_build = (col.from_left == build_is_left);
+
+        const auto &src_input = src.from_build ? build_input : probe_input;
+
+        if (src_input.is_columnar()) {
+            src.is_columnar = true;
+            const auto *table = std::get<const ColumnarTable *>(src_input.data);
+            auto [actual_idx, _] =
+                src_input.node->output_attrs[col.child_output_idx];
+            src.columnar_col = &table->columns[actual_idx];
         } else {
-            info.is_columnar = false;
-            const auto &res = std::get<ExecuteResult>(input.data);
-            info.intermediate_col = &res[local_idx];
+            src.is_columnar = false;
+            const auto &ir = std::get<IntermediateResult>(src_input.data);
+
+            // Check source type in priority order:
+            // 1. Tuples (join key stored as key-row pairs)
+            // 2. Materialized column
+            // 3. Deferred table
+            if (ir.is_join_key(col.child_output_idx)) {
+                // Child stored this column as tuples - read key from there
+                src.needs_tuple_key_read = true;
+                src.tuple_col = &(*ir.join_key_tuples);
+            } else if (ir.is_materialized(col.child_output_idx)) {
+                src.intermediate_col =
+                    ir.get_materialized(col.child_output_idx);
+            } else if (ir.is_deferred(col.child_output_idx)) {
+                src.needs_deferred_resolve = true;
+                src.deferred_table =
+                    ir.get_deferred_table(col.child_output_idx);
+                // base_column_idx is already set from col.provenance
+            }
         }
-        sources.push_back(info);
+        sources.push_back(src);
     }
+
     return sources;
 }
 
+// ============================================================================
+// Main Construction Function
+// ============================================================================
+
 /**
- * @brief Constructs intermediate results directly from thread-local buffers.
+ * @brief Constructs intermediate result from thread-local buffers.
  *
- * Each thread iterates its own buffer, avoiding the merge step. Total matches
- * computed by summing buffer counts. Each thread writes its contiguous portion
- * of output pages.
+ * Optimized with column-major iteration and per-table 32-bit row ID storage.
+ * Only materializes columns marked MATERIALIZE in the AnalyzedJoinNode.
+ * Deferred columns share row ID storage per unique base table.
  *
  * @tparam Mode            Collection mode for compile-time specialization.
- * @param buffers          Vector of ThreadLocalMatchBuffer from probe.
- * @param build_input      Build side data (ColumnarTable* or ExecuteResult).
- * @param probe_input      Probe side data (ColumnarTable* or ExecuteResult).
- * @param remapped_attrs   Output column specifications (global indexing).
- * @param build_node       PlanNode for build side output_attrs mapping.
- * @param probe_node       PlanNode for probe side output_attrs mapping.
- * @param build_size       Number of output columns from build side.
- * @param columnar_reader  ColumnarReader with Cursor caching for page access.
- * @param results          Pre-initialized ExecuteResult, populated in-place.
+ * @param buffers          Thread-local match buffers from probe.
+ * @param build_input      Build side data source.
+ * @param probe_input      Probe side data source.
+ * @param join_node        Analyzed join node with materialization decisions.
+ * @param remapped_attrs   Output attributes (after build/probe remapping).
+ * @param build_output_size Number of columns from build side.
+ * @param build_is_left    True if build side is the original left child.
+ * @param columnar_reader  Reader for columnar data access.
+ * @param out_result       Output IntermediateResult (populated in-place).
+ * @param analyzed_plan    Full analyzed plan for base table access.
  */
 template <MatchCollectionMode Mode>
-inline void construct_intermediate_from_buffers(
+void construct_intermediate_from_buffers(
     std::vector<ThreadLocalMatchBuffer<Mode>> &buffers,
     const JoinInput &build_input, const JoinInput &probe_input,
+    const AnalyzedJoinNode &join_node,
     const std::vector<std::tuple<size_t, DataType>> &remapped_attrs,
-    const PlanNode &build_node, const PlanNode &probe_node, size_t build_size,
-    ColumnarReader &columnar_reader, ExecuteResult &results) {
+    size_t build_output_size, bool build_is_left,
+    ColumnarReader &columnar_reader, IntermediateResult &out_result,
+    const AnalyzedPlan &analyzed_plan) {
 
-    // Compute total matches and per-buffer start offsets
+    // Count total matches and compute buffer start offsets
     size_t total_matches = 0;
     std::vector<size_t> buffer_starts(buffers.size());
     for (size_t i = 0; i < buffers.size(); ++i) {
@@ -125,40 +505,94 @@ inline void construct_intermediate_from_buffers(
         total_matches += buffers[i].count();
     }
 
-    if (total_matches == 0)
+    if (total_matches == 0) {
+        out_result = create_empty_intermediate_result(join_node);
         return;
+    }
 
-    auto sources = prepare_sources(remapped_attrs, build_input, probe_input,
-                                   build_node, probe_node, build_size);
+    // Initialize result metadata
+    out_result.node_info = &join_node;
+    out_result.num_rows = total_matches;
+    out_result.materialized_map.resize(join_node.columns.size(), std::nullopt);
+    out_result.deferred_map.resize(join_node.columns.size(), std::nullopt);
 
-    const size_t num_threads = THREAD_COUNT;
-    const size_t num_cols = sources.size();
+    // Count materialized columns and set up maps
+    size_t mat_count = 0;
+    for (const auto &col : join_node.columns) {
+        if (col.resolution == ColumnResolution::MATERIALIZE) {
+            out_result.materialized_map[col.original_idx] = mat_count++;
+        }
+    }
+
+    // Prepare deferred table sources (this populates deferred_tables and
+    // deferred_map)
+    auto deferred_sources = prepare_deferred_table_sources(
+        join_node, build_input, probe_input, build_is_left, out_result);
 
-    // Pre-size page vectors for each column
+    // Precompute materialized sources
+    auto mat_sources = prepare_materialized_sources(join_node, build_input,
+                                                    probe_input, build_is_left);
+
+    // Prepare page indices for base tables used in deferred resolution
+    prepare_deferred_base_tables(columnar_reader, mat_sources, analyzed_plan);
+
+    // Pre-allocate pages
     using Page = mema::column_t::Page;
-    size_t total_pages_needed =
+    using DeferredPage = mema::DeferredTable::Page;
+    size_t mat_pages_needed =
         (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE;
+    size_t def_pages_needed =
+        (total_matches + mema::DeferredTable::ENTRIES_PER_PAGE - 1) /
+        mema::DeferredTable::ENTRIES_PER_PAGE;
 
-    for (size_t c = 0; c < num_cols; ++c) {
-        auto &col = results[c];
-        col.pages.resize(total_pages_needed);
-        col.set_row_count(total_matches);
+    out_result.materialized.resize(mat_count);
+    for (size_t c = 0; c < mat_count; ++c) {
+        out_result.materialized[c].pages.resize(mat_pages_needed);
+        out_result.materialized[c].set_row_count(total_matches);
     }
 
-    // Parallel page allocation - each thread allocates its own pages
+    for (auto &dt : out_result.deferred_tables) {
+        dt.pages.resize(def_pages_needed);
+        dt.set_row_count(total_matches);
+    }
+
+    // Set source metadata for materialized columns
+    for (const auto &src : mat_sources) {
+        out_result.materialized[src.mat_col_idx].source_table =
+            src.base_table_id;
+        out_result.materialized[src.mat_col_idx].source_column =
+            src.base_column_idx;
+    }
+
+    const size_t num_threads = THREAD_COUNT;
+    const size_t num_deferred_tables = out_result.deferred_tables.size();
+
+    // Parallel page allocation
     worker_pool().execute([&](size_t t) {
-        for (size_t c = 0; c < num_cols; ++c) {
-            auto &col = results[c];
-            for (size_t p = t; p < total_pages_needed; p += num_threads) {
+        for (size_t c = 0; c < mat_count; ++c) {
+            auto &col = out_result.materialized[c];
+            for (size_t p = t; p < mat_pages_needed; p += num_threads) {
                 void *ptr =
                     Contest::platform::get_arena(t)
                         .alloc_chunk<Contest::platform::ChunkType::IR_PAGE>();
                 col.pages[p] = reinterpret_cast<Page *>(ptr);
             }
         }
+        for (size_t d = 0; d < num_deferred_tables; ++d) {
+            auto &dt = out_result.deferred_tables[d];
+            for (size_t p = t; p < def_pages_needed; p += num_threads) {
+                // Use IR_PAGE (16KB) for DeferredTable pages
+                void *ptr =
+                    Contest::platform::get_arena(t)
+                        .alloc_chunk<Contest::platform::ChunkType::IR_PAGE>();
+                dt.pages[p] = reinterpret_cast<DeferredPage *>(ptr);
+            }
+        }
     });
 
-    // Parallel: each thread processes its own buffer
+    // ========================================================================
+    // COLUMN-MAJOR PARALLEL POPULATION
+    // ========================================================================
     worker_pool().execute([&](size_t t) {
         if (t >= buffers.size())
             return;
@@ -168,50 +602,540 @@ inline void construct_intermediate_from_buffers(
             return;
 
         size_t start = buffer_starts[t];
-        Contest::ColumnarReader::Cursor cursor;
+        ColumnarReader::Cursor cursor;
+        ColumnarReader::Cursor base_cursor; // For deferred resolution reads
 
-        for (size_t c = 0; c < num_cols; ++c) {
-            const auto &src = sources[c];
-            auto &dest_col = results[c];
+        // ====================================================================
+        // Process MATERIALIZED columns (column-major for cache locality)
+        // ====================================================================
+        for (const auto &src : mat_sources) {
+            auto &dest_col = out_result.materialized[src.mat_col_idx];
 
-            auto left_range = buf.left_range();
-            auto right_range = buf.right_range();
+            // Get appropriate range based on which side this column comes from
+            auto range = src.from_build ? buf.left_range() : buf.right_range();
 
             if (src.is_columnar) {
+                // Columnar source - use ColumnarReader with cursor caching
                 const auto &col = *src.columnar_col;
-                if (src.from_build) {
-                    size_t k = start;
-                    for (uint32_t rid : left_range) {
-                        dest_col.write_at(k++,
-                                          columnar_reader.read_value(
-                                              col, src.remapped_col_idx, rid,
-                                              col.type, cursor, true));
+                size_t k = start;
+                for (uint32_t rid : range) {
+                    dest_col.write_at(k++,
+                                      columnar_reader.read_value(
+                                          col, src.child_output_idx, rid,
+                                          src.type, cursor, src.from_build));
+                }
+            } else if (src.needs_tuple_key_read && src.tuple_col) {
+                // Child stored this column as tuples - read key from there
+                const auto &tuples = *src.tuple_col;
+                size_t k = start;
+                for (uint32_t rid : range) {
+                    int32_t key = tuples.key_at(rid);
+                    dest_col.write_at(k++, mema::value_t{key});
+                }
+            } else if (src.intermediate_col) {
+                // Intermediate materialized source - direct copy
+                const auto &vec = *src.intermediate_col;
+                size_t k = start;
+                for (uint32_t rid : range) {
+                    dest_col.write_at(k++, vec[rid]);
+                }
+            } else if (src.needs_deferred_resolve && src.deferred_table) {
+                // Deferred in child - resolve via deferred table + base table
+                const auto &def_table = *src.deferred_table;
+                size_t k = start;
+                for (uint32_t rid : range) {
+                    uint32_t base_row = def_table[rid];
+
+                    if (analyzed_plan.original_plan) [[likely]] {
+                        const auto &base_table =
+                            analyzed_plan.original_plan
+                                ->inputs[src.base_table_id];
+                        mema::value_t val =
+                            columnar_reader.read_base_table_value(
+                                base_table.columns[src.base_column_idx],
+                                src.base_table_id, src.base_column_idx,
+                                base_row, src.type, base_cursor);
+                        dest_col.write_at(k++, val);
+                    } else {
+                        dest_col.write_at(
+                            k++, mema::value_t{mema::value_t::NULL_VALUE});
                     }
-                } else {
-                    size_t k = start;
-                    for (uint32_t rid : right_range) {
-                        dest_col.write_at(k++,
-                                          columnar_reader.read_value(
-                                              col, src.remapped_col_idx, rid,
-                                              col.type, cursor, false));
+                }
+            }
+        }
+
+        // ====================================================================
+        // Process DEFERRED tables (one pass per unique base table)
+        // ====================================================================
+        for (const auto &def_src : deferred_sources) {
+            auto &dest_table =
+                out_result.deferred_tables[def_src.dest_table_idx];
+
+            auto batch_reader = def_src.from_build ? buf.left_batch_reader()
+                                                   : buf.right_batch_reader();
+
+            size_t k = start;
+            while (batch_reader.has_more()) {
+                size_t batch_count;
+                const uint32_t *row_ids =
+                    batch_reader.get_batch(256, batch_count);
+
+                if (batch_count > 0) {
+                    if (def_src.needs_direct) {
+                        // Columnar input: write row IDs directly
+                        row_id_ops::write_row_ids_direct(dest_table, k, row_ids,
+                                                         batch_count);
+                    } else if (def_src.child_table) {
+                        // Intermediate input: copy from child's deferred table
+                        row_id_ops::copy_row_ids_from_child(
+                            dest_table, k, *def_src.child_table, row_ids,
+                            batch_count);
                     }
+                    k += batch_count;
+                }
+            }
+        }
+    });
+}
+
+// ============================================================================
+// Tuple-Based Intermediate Construction
+// ============================================================================
+
+/**
+ * @brief Maps a match-buffer row ID onto a base table row ID when it can.
+ *
+ * Rules are tried in order: a columnar input returns row_id unchanged; an
+ * IR whose join key tuples hold base rows answers via row_id_at(); an IR
+ * with a deferred table for key_col_idx answers from that table; anything
+ * else returns row_id unchanged.
+ *
+ * @param input The JoinInput to resolve from.
+ * @param row_id The row ID from match buffer.
+ * @param key_col_idx The join key column index in input's output.
+ * @return Resolved row ID, or row_id itself when no rule applied.
+ */
+inline uint32_t resolve_to_base_row(const JoinInput &input, uint32_t row_id,
+                                    size_t key_col_idx) {
+    // Columnar rows are already addressed by their base table row.
+    if (input.is_columnar())
+        return row_id;
+
+    const auto &intermediate = std::get<IntermediateResult>(input.data);
+
+    // Tuples that carry base row IDs resolve with a single lookup.
+    const bool tuples_hold_base_rows = intermediate.has_join_key_tuples() &&
+                                       intermediate.join_key_has_base_rows();
+    if (tuples_hold_base_rows)
+        return intermediate.join_key_tuples->row_id_at(row_id);
+
+    // Otherwise consult the deferred table registered for the key column.
+    const auto *lookup = intermediate.get_deferred_table(key_col_idx);
+    if (lookup != nullptr)
+        return (*lookup)[row_id];
+
+    // No table either: pass the ID through (not expected for correct
+    // plans).
+    return row_id;
+}
+
+/**
+ * @brief Populates join key tuples column from match buffers.
+ *
+ * Writes one (key, output IR index) pair per match, starting at each
+ * buffer's write offset. The stored index is always the row's position in
+ * the output IR (write_pos); base table row IDs are never stored here.
+ *
+ * @tparam Mode Match collection mode.
+ * @param buffers Thread-local match buffers.
+ * @param buffer_starts Per-buffer write offsets.
+ * @param build_input Build side input.
+ * @param probe_input Probe side input.
+ * @param key_from_build True if parent's join key comes from build side.
+ * @param key_child_output_idx Column index in the key input's output.
+ * @param out_tuples Output tuple column (pre-allocated).
+ * @param columnar_reader Reader for columnar access.
+ */
+template <MatchCollectionMode Mode>
+void populate_join_key_tuples(
+    std::vector<ThreadLocalMatchBuffer<Mode>> &buffers,
+    const std::vector<size_t> &buffer_starts, const JoinInput &build_input,
+    const JoinInput &probe_input, bool key_from_build,
+    size_t key_child_output_idx, mema::key_row_column_t &out_tuples,
+    ColumnarReader &columnar_reader) {
+
+    const JoinInput &key_input = key_from_build ? build_input : probe_input;
+    size_t key_attr = key_child_output_idx;
+
+    worker_pool().execute([&](size_t t) {
+        if (t >= buffers.size())
+            return;
+        auto &buf = buffers[t];
+        size_t my_count = buf.count();
+        if (my_count == 0)
+            return;
+
+        size_t write_pos = buffer_starts[t];
+
+        // Get the appropriate range based on which side provides the key
+        auto range = key_from_build ? buf.left_range() : buf.right_range();
+
+        if (key_input.is_columnar()) {
+            // Columnar source - read key from base table using the prepared
+            // page index. Store OUTPUT IR index (write_pos) so parent can use
+            // it to index into this IR
+            auto *table = std::get<const ColumnarTable *>(key_input.data);
+            auto [actual_col_idx, _] = key_input.node->output_attrs[key_attr];
+            const Column &col = table->columns[actual_col_idx];
+
+            // Use cursor for efficient sequential/near-sequential access
+            ColumnarReader::Cursor cursor;
+            for (uint32_t row_id : range) {
+                // Use read_value with prepared page index (O(1) amortized)
+                // instead of read_value_direct_public (O(n) per read)
+                int32_t key =
+                    columnar_reader
+                        .read_value(col, key_attr, row_id, DataType::INT32,
+                                    cursor, key_from_build)
+                        .value;
+                // Store OUTPUT IR index (write_pos), not base table row_id
+                // Parent needs IR index to access other columns in this IR
+                uint32_t output_ir_idx = static_cast<uint32_t>(write_pos);
+                out_tuples.write_at(write_pos++, {key, output_ir_idx});
+            }
+        } else {
+            // Intermediate source - store OUTPUT IR index
+            const auto &ir = std::get<IntermediateResult>(key_input.data);
+
+            // Only propagate existing tuples if they contain the column we need
+            // Otherwise, read from materialized column
+            if (ir.has_join_key_tuples() && ir.join_key_idx.has_value() &&
+                *ir.join_key_idx == key_attr) {
+                // IR's tuples contain the column we need - propagate directly
+                const auto &src_tuples = *ir.join_key_tuples;
+
+                for (uint32_t ir_idx : range) {
+                    mema::KeyRowPair src = src_tuples[ir_idx];
+                    // Store OUTPUT IR index for parent to index into this IR
+                    uint32_t output_ir_idx = static_cast<uint32_t>(write_pos);
+                    out_tuples.write_at(write_pos++, {src.key, output_ir_idx});
                 }
             } else {
+                // IR's tuples hold a different column, or no tuples exist:
+                // read the materialized column (nothing is written if none)
+                const auto *mat_col = ir.get_materialized(key_attr);
+                if (mat_col) {
+                    for (uint32_t ir_idx : range) {
+                        int32_t key = (*mat_col)[ir_idx].value;
+                        // Store OUTPUT IR index for parent to index into this
+                        // IR
+                        uint32_t output_ir_idx =
+                            static_cast<uint32_t>(write_pos);
+                        out_tuples.write_at(write_pos++, {key, output_ir_idx});
+                    }
+                }
+            }
+        }
+    });
+}
+
+/**
+ * @brief Constructs intermediate result with tuple-based join key storage.
+ *
+ * Stores the parent's join key as (key, output IR index) tuples rather than
+ * as a materialized column (stores_base_row_ids is set to false). Other
+ * columns are handled normally via deferred tables or materialization.
+ *
+ * @tparam Mode Collection mode for compile-time specialization.
+ * @param buffers Thread-local match buffers from probe.
+ * @param build_input Build side data source.
+ * @param probe_input Probe side data source.
+ * @param join_node Analyzed join node with materialization decisions.
+ * @param config Build/probe configuration (not referenced in the body).
+ * @param build_is_left True if build side is the original left child.
+ * @param parent_key_idx Index of column that will be parent's join key.
+ * @param columnar_reader Reader for columnar data access.
+ * @param out_result Output IntermediateResult (populated in-place).
+ * @param analyzed_plan Full analyzed plan for base table access.
+ */
+template <MatchCollectionMode Mode>
+void construct_intermediate_with_tuples(
+    std::vector<ThreadLocalMatchBuffer<Mode>> &buffers,
+    const JoinInput &build_input, const JoinInput &probe_input,
+    const AnalyzedJoinNode &join_node, const join::BuildProbeConfig &config,
+    bool build_is_left, size_t parent_key_idx, ColumnarReader &columnar_reader,
+    IntermediateResult &out_result, const AnalyzedPlan &analyzed_plan) {
+
+    // Count total matches and compute buffer start offsets
+    size_t total_matches = 0;
+    std::vector<size_t> buffer_starts(buffers.size());
+    for (size_t i = 0; i < buffers.size(); ++i) {
+        buffer_starts[i] = total_matches;
+        total_matches += buffers[i].count();
+    }
+
+    if (total_matches == 0) {
+        out_result = create_empty_intermediate_result(join_node);
+        return;
+    }
+
+    // Initialize result metadata
+    out_result.node_info = &join_node;
+    out_result.num_rows = total_matches;
+    out_result.materialized_map.resize(join_node.columns.size(), std::nullopt);
+    out_result.deferred_map.resize(join_node.columns.size(), std::nullopt);
+
+    // Determine if parent's join key comes from build or probe side and its
+    // base table provenance (the defaults below stay if no column matches)
+    bool key_from_build = true;
+    size_t key_child_output_idx = 0; // Column index in child's output
+    uint8_t key_base_table_id = 0;
+    uint8_t key_base_column = 0;
+
+    for (const auto &col : join_node.columns) {
+        if (col.original_idx == parent_key_idx) {
+            key_from_build = (col.from_left == build_is_left);
+            key_child_output_idx = col.child_output_idx;
+            key_base_table_id = col.provenance.base_table_id;
+            key_base_column = col.provenance.base_column_idx;
+            break;
+        }
+    }
+
+    // Allocate join key tuples column
+    out_result.join_key_tuples.emplace();
+    out_result.join_key_tuples->pre_allocate_from_arena(
+        Contest::platform::get_arena(0), total_matches);
+    out_result.join_key_tuples->base_table_id = key_base_table_id;
+    out_result.join_key_tuples->source_column = key_base_column;
+    // Always store OUTPUT IR indices (not base row IDs) so parent can
+    // index into this IR to access deferred columns
+    out_result.join_key_tuples->stores_base_row_ids = false;
+    out_result.join_key_idx = parent_key_idx;
+    const JoinInput &key_input = key_from_build ? build_input : probe_input;
+    (void)key_input; // Unused here; populate_join_key_tuples recomputes it
+
+    // Count non-join-key materialized columns and set up maps
+    size_t mat_count = 0;
+    for (const auto &col : join_node.columns) {
+        if (col.resolution == ColumnResolution::MATERIALIZE &&
+            col.original_idx != parent_key_idx) {
+            out_result.materialized_map[col.original_idx] = mat_count++;
+        }
+    }
+
+    // Prepare deferred table sources (unchanged from non-tuple version)
+    auto deferred_sources = prepare_deferred_table_sources(
+        join_node, build_input, probe_input, build_is_left, out_result);
+
+    // Precompute materialized sources (excluding join key)
+    std::vector<MaterializedColumnSource> mat_sources;
+    mat_sources.reserve(join_node.columns.size());
+    size_t mat_idx = 0;
+    for (const auto &col : join_node.columns) {
+        if (col.resolution != ColumnResolution::MATERIALIZE)
+            continue;
+        if (col.original_idx == parent_key_idx)
+            continue; // Skip join key - handled via tuples
+
+        MaterializedColumnSource src;
+        src.mat_col_idx = mat_idx++;
+        src.child_output_idx = col.child_output_idx;
+        src.type = col.type;
+        src.base_table_id = col.provenance.base_table_id;
+        src.base_column_idx = col.provenance.base_column_idx;
+        src.from_build = (col.from_left == build_is_left);
+
+        const auto &src_input = src.from_build ? build_input : probe_input;
+
+        if (src_input.is_columnar()) {
+            src.is_columnar = true;
+            const auto *table = std::get<const ColumnarTable *>(src_input.data);
+            auto [actual_idx, _] =
+                src_input.node->output_attrs[col.child_output_idx];
+            src.columnar_col = &table->columns[actual_idx];
+        } else {
+            src.is_columnar = false;
+            const auto &ir = std::get<IntermediateResult>(src_input.data);
+
+            // Check source type in priority order:
+            // 1. Tuples (join key stored as key-row pairs)
+            // 2. Materialized column
+            // 3. Deferred table
+            if (ir.is_join_key(col.child_output_idx)) {
+                // Child stored this column as tuples - read key from there
+                src.needs_tuple_key_read = true;
+                src.tuple_col = &(*ir.join_key_tuples);
+            } else if (ir.is_materialized(col.child_output_idx)) {
+                src.intermediate_col =
+                    ir.get_materialized(col.child_output_idx);
+            } else if (ir.is_deferred(col.child_output_idx)) {
+                src.needs_deferred_resolve = true;
+                src.deferred_table =
+                    ir.get_deferred_table(col.child_output_idx);
+            }
+        }
+        mat_sources.push_back(src);
+    }
+
+    // Prepare page indices for base tables used in deferred resolution
+    prepare_deferred_base_tables(columnar_reader, mat_sources, analyzed_plan);
+
+    // Pre-allocate pages
+    using Page = mema::column_t::Page;
+    using DeferredPage = mema::DeferredTable::Page;
+    size_t mat_pages_needed =
+        (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE;
+    size_t def_pages_needed =
+        (total_matches + mema::DeferredTable::ENTRIES_PER_PAGE - 1) /
+        mema::DeferredTable::ENTRIES_PER_PAGE;
+
+    out_result.materialized.resize(mat_count);
+    for (size_t c = 0; c < mat_count; ++c) {
+        out_result.materialized[c].pages.resize(mat_pages_needed);
+        out_result.materialized[c].set_row_count(total_matches);
+    }
+
+    for (auto &dt : out_result.deferred_tables) {
+        dt.pages.resize(def_pages_needed);
+        dt.set_row_count(total_matches);
+    }
+
+    // Set source metadata for materialized columns
+    for (const auto &src : mat_sources) {
+        out_result.materialized[src.mat_col_idx].source_table =
+            src.base_table_id;
+        out_result.materialized[src.mat_col_idx].source_column =
+            src.base_column_idx;
+    }
+
+    const size_t num_threads = THREAD_COUNT;
+    const size_t num_deferred_tables = out_result.deferred_tables.size();
+
+    // Parallel page allocation
+    worker_pool().execute([&](size_t t) {
+        for (size_t c = 0; c < mat_count; ++c) {
+            auto &col = out_result.materialized[c];
+            for (size_t p = t; p < mat_pages_needed; p += num_threads) {
+                void *ptr =
+                    Contest::platform::get_arena(t)
+                        .alloc_chunk<Contest::platform::ChunkType::IR_PAGE>();
+                col.pages[p] = reinterpret_cast<Page *>(ptr);
+            }
+        }
+        for (size_t d = 0; d < num_deferred_tables; ++d) {
+            auto &dt = out_result.deferred_tables[d];
+            for (size_t p = t; p < def_pages_needed; p += num_threads) {
+                void *ptr =
+                    Contest::platform::get_arena(t)
+                        .alloc_chunk<Contest::platform::ChunkType::IR_PAGE>();
+                dt.pages[p] = reinterpret_cast<DeferredPage *>(ptr);
+            }
+        }
+    });
+
+    // Populate join key tuples
+    populate_join_key_tuples<Mode>(
+        buffers, buffer_starts, build_input, probe_input, key_from_build,
+        key_child_output_idx, *out_result.join_key_tuples, columnar_reader);
+
+    // Populate other materialized columns and deferred tables
+    // (same logic as construct_intermediate_from_buffers)
+    worker_pool().execute([&](size_t t) {
+        if (t >= buffers.size())
+            return;
+        auto &buf = buffers[t];
+        size_t my_count = buf.count();
+        if (my_count == 0)
+            return;
+
+        size_t start = buffer_starts[t];
+        ColumnarReader::Cursor cursor;
+        ColumnarReader::Cursor base_cursor; // For deferred resolution reads
+
+        // Process MATERIALIZED columns (excluding join key)
+        for (const auto &src : mat_sources) {
+            auto &dest_col = out_result.materialized[src.mat_col_idx];
+
+            auto range = src.from_build ? buf.left_range() : buf.right_range();
+
+            if (src.is_columnar) {
+                const auto &col = *src.columnar_col;
+                size_t k = start;
+                for (uint32_t rid : range) {
+                    mema::value_t val = columnar_reader.read_value(
+                        col, src.child_output_idx, rid, src.type, cursor,
+                        src.from_build);
+                    dest_col.write_at(k++, val);
+                }
+            } else if (src.needs_tuple_key_read && src.tuple_col) {
+                // Child stored this column as tuples - read key from there
+                const auto &tuples = *src.tuple_col;
+                size_t k = start;
+                for (uint32_t rid : range) {
+                    int32_t key = tuples.key_at(rid);
+                    dest_col.write_at(k++, mema::value_t{key});
+                }
+            } else if (src.intermediate_col) {
                 const auto &vec = *src.intermediate_col;
-                if (src.from_build) {
-                    size_t k = start;
-                    for (uint32_t rid : left_range) {
-                        dest_col.write_at(k++, vec[rid]);
+                size_t k = start;
+                for (uint32_t rid : range) {
+                    mema::value_t val = vec[rid];
+                    dest_col.write_at(k++, val);
+                }
+            } else if (src.needs_deferred_resolve && src.deferred_table) {
+                const auto &def_table = *src.deferred_table;
+                size_t k = start;
+                for (uint32_t rid : range) {
+                    uint32_t base_row = def_table[rid];
+
+                    if (analyzed_plan.original_plan) [[likely]] {
+                        const auto &base_table =
+                            analyzed_plan.original_plan
+                                ->inputs[src.base_table_id];
+                        mema::value_t val =
+                            columnar_reader.read_base_table_value(
+                                base_table.columns[src.base_column_idx],
+                                src.base_table_id, src.base_column_idx,
+                                base_row, src.type, base_cursor);
+                        dest_col.write_at(k++, val);
+                    } else {
+                        dest_col.write_at(
+                            k++, mema::value_t{mema::value_t::NULL_VALUE});
                     }
-                } else {
-                    size_t k = start;
-                    for (uint32_t rid : right_range) {
-                        dest_col.write_at(k++, vec[rid]);
+                }
+            }
+        }
+
+        // Process DEFERRED tables
+        for (const auto &def_src : deferred_sources) {
+            auto &dest_table =
+                out_result.deferred_tables[def_src.dest_table_idx];
+
+            auto batch_reader = def_src.from_build ? buf.left_batch_reader()
+                                                   : buf.right_batch_reader();
+
+            size_t k = start;
+            while (batch_reader.has_more()) {
+                size_t batch_count;
+                const uint32_t *row_ids =
+                    batch_reader.get_batch(256, batch_count);
+
+                if (batch_count > 0) {
+                    if (def_src.needs_direct) {
+                        row_id_ops::write_row_ids_direct(dest_table, k, row_ids,
+                                                         batch_count);
+                    } else if (def_src.child_table) {
+                        row_id_ops::copy_row_ids_from_child(
+                            dest_table, k, *def_src.child_table, row_ids,
+                            batch_count);
                     }
+                    k += batch_count;
                 }
             }
         }
     });
 }
 
-} // namespace Contest::materialize
+} // namespace materialize
+} // namespace Contest
diff --git a/include/materialization/materialize.h b/include/materialization/materialize.h
index e154e93..4cff7ab 100644
--- a/include/materialization/materialize.h
+++ b/include/materialization/materialize.h
@@ -1,66 +1,149 @@
 /**
  * @file materialize.h
- * @brief Materialization of join results into ColumnarTable format.
+ * @brief Final materialization for execution path.
  *
- * Parallel materialization using per-thread page builders and mmap allocation.
- * Templated on MatchCollectionMode for zero-overhead mode selection.
+ * Materializes all output columns at the root join, resolving deferred
+ * columns by looking up 32-bit row IDs in DeferredTable back to base tables.
+ *
+ * @see construct_intermediate.h for building IntermediateResult intermediates.
  */
 #pragma once
 
-#include <algorithm>
 #include <cstring>
+#include <functional>
+#include <sys/mman.h>
+#include <vector>
+
 #include <data_access/columnar_reader.h>
+#include <data_model/deferred_plan.h>
 #include <data_model/intermediate.h>
-#include <data_model/plan.h>
-#include <functional>
-#include <join_execution/join_setup.h>
+#include <foundation/common.h>
 #include <join_execution/match_collector.h>
-#include <materialization/construct_intermediate.h>
 #include <materialization/page_builders.h>
+#include <platform/arena.h>
 #include <platform/worker_pool.h>
-#include <sys/mman.h>
-#include <vector>
 
-/** @namespace Contest::materialize @brief Join result materialization. */
-namespace Contest::materialize {
+namespace Contest {
+namespace materialize {
 
-using Contest::ExecuteResult;
 using Contest::io::ColumnarReader;
-using Contest::join::JoinInput;
 using Contest::join::MatchCollectionMode;
-using Contest::join::resolve_input_source;
 using Contest::join::ThreadLocalMatchBuffer;
 using Contest::platform::THREAD_COUNT;
 using Contest::platform::worker_pool;
 
-/** @brief Creates empty ColumnarTable with correct column types for zero-match
- * case. */
+/**
+ * @brief Builds the column pointer table of an input for the final pass.
+ *
+ * A slot is non-null only if the input is columnar and `needed` flags it.
+ */
+inline platform::ArenaVector<const Column *>
+collect_final_columns(const JoinInput &input,
+                      const platform::ArenaVector<uint8_t> &needed,
+                      platform::ThreadArena &arena) {
+    platform::ArenaVector<const Column *> selected(arena);
+    if (!input.node)
+        return selected; // no plan node: hand back an empty table
+    const auto &attrs = input.node->output_attrs;
+    selected.resize(attrs.size());
+    std::memset(selected.data(), 0, selected.size() * sizeof(const Column *));
+    if (!input.is_columnar())
+        return selected; // non-columnar input: every slot stays null
+    auto *table = std::get<const ColumnarTable *>(input.data);
+    for (size_t slot = 0; slot < attrs.size(); ++slot) {
+        if (slot >= needed.size() || !needed[slot])
+            continue; // not flagged, or beyond the flags provided
+        auto [actual_col_idx, _] = attrs[slot];
+        selected[slot] = &table->columns[actual_col_idx];
+    }
+    return selected;
+}
+
+/**
+ * @brief Readies the ColumnarReader for root-level materialization.
+ *
+ * Every column listed in join_node is flagged as needed on the side it
+ * comes from; each columnar side then has its columns handed to the reader.
+ *
+ * remapped_attrs and build_size are accepted but not read here.
+ */
+inline void prepare_final_columns(
+    ColumnarReader &reader, const JoinInput &build_input,
+    const JoinInput &probe_input, const AnalyzedJoinNode &join_node,
+    const std::vector<std::tuple<size_t, DataType>> &remapped_attrs,
+    size_t build_size, bool build_is_left) {
+
+    const bool columnar_build = build_input.is_columnar();
+    const bool columnar_probe = probe_input.is_columnar();
+    // Nothing to prepare when neither side reads from a columnar table.
+    if (!(columnar_build || columnar_probe))
+        return;
+
+    auto &arena = Contest::platform::get_arena(0);
+
+    // One zeroed flag per output attribute; stays empty without a node.
+    const auto reset_flags = [](platform::ArenaVector<uint8_t> &flags,
+                                const JoinInput &side) {
+        if (!side.node)
+            return;
+        flags.resize(side.node->output_attrs.size());
+        std::memset(flags.data(), 0, flags.size());
+    };
+
+    platform::ArenaVector<uint8_t> build_flags(arena);
+    reset_flags(build_flags, build_input);
+    platform::ArenaVector<uint8_t> probe_flags(arena);
+    reset_flags(probe_flags, probe_input);
+
+    // from_left refers to the original left child; build_is_left tells if
+    // that child is the build side. Out-of-range indices are skipped.
+    for (const auto &col : join_node.columns) {
+        const bool on_build_side = (col.from_left == build_is_left);
+        auto &flags = on_build_side ? build_flags : probe_flags;
+        if (col.child_output_idx < flags.size())
+            flags[col.child_output_idx] = 1;
+    }
+
+    if (columnar_build) {
+        reader.prepare_build(
+            collect_final_columns(build_input, build_flags, arena));
+    }
+
+    if (columnar_probe) {
+        reader.prepare_probe(
+            collect_final_columns(probe_input, probe_flags, arena));
+    }
+}
+
+/**
+ * @brief Builds a zero-row ColumnarTable with one column per output attr.
+ */
 inline ColumnarTable create_empty_result(
-    const std::vector<std::tuple<size_t, DataType>> &remapped_attrs) {
+    const std::vector<std::tuple<size_t, DataType>> &output_attrs) {
     ColumnarTable empty_result;
     empty_result.num_rows = 0;
-    for (auto [_, data_type] : remapped_attrs) {
+    for (auto [_, data_type] : output_attrs) {
         empty_result.columns.emplace_back(data_type);
     }
     return empty_result;
 }
 
 /**
- * @brief Parallel materialization of a single output column from thread-local
- * buffers.
+ * @brief Materialize a single column from sources.
  *
- * Each thread processes its own buffer directly without merge overhead.
+ * Handles three cases:
+ * 1. COLUMNAR_DIRECT: Input is columnar, read directly via row index
+ * 2. MATERIALIZED: Column was materialized in IntermediateResult
+ * 3. DEFERRED: Resolve via DeferredTable (32-bit row ID) to base table
  *
- * @tparam Mode            Collection mode for compile-time specialization.
- * @tparam BuilderType     Int32PageBuilder or VarcharPageBuilder.
- * @tparam ReaderFunc      Callable: (row_id, cursor) -> value_t.
+ * @tparam Mode Collection mode for compile-time specialization.
+ * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder.
+ * @tparam ReaderFunc Callable: (row_idx, cursor) -> value_t.
  * @tparam InitBuilderFunc Callable: (page_allocator) -> BuilderType.
- * @param est_bytes_per_row Average bytes per row (4 for INT32, ~35 for
- * VARCHAR).
  */
 template <MatchCollectionMode Mode, typename BuilderType, typename ReaderFunc,
           typename InitBuilderFunc>
-inline void materialize_column_from_buffers(
+inline void materialize_column(
     Column &dest_col, std::vector<ThreadLocalMatchBuffer<Mode>> &buffers,
     size_t total_matches, ReaderFunc &&read_value,
     InitBuilderFunc &&init_builder, bool from_build, size_t est_bytes_per_row) {
@@ -162,133 +245,203 @@ inline void materialize_column_from_buffers(
 }
 
 /**
- * @brief Materializes a single output column from thread-local buffers.
+ * @brief Materialize single output column handling deferred resolution.
  *
- * Dispatcher that determines source location (columnar/intermediate,
- * build/probe), selects page builder type, and invokes
- * materialize_column_from_buffers<>. VARCHAR handling requires source Column
- * pointer for string dereferencing.
+ * For deferred columns, resolves via DeferredTable (32-bit row ID) back to
+ * base table.
  *
  * @tparam Mode Collection mode for compile-time specialization.
  */
 template <MatchCollectionMode Mode>
-inline void materialize_single_column_from_buffers(
-    Column &dest_col, size_t col_idx, size_t build_size,
+inline void materialize_single_column(
+    Column &dest_col, size_t col_idx, size_t build_size, bool build_is_left,
     std::vector<ThreadLocalMatchBuffer<Mode>> &buffers, size_t total_matches,
     const JoinInput &build_input, const JoinInput &probe_input,
-    const PlanNode &build_node, const PlanNode &probe_node,
-    ColumnarReader &columnar_reader, const Plan &plan) {
-
-    auto [input, node, local_idx] = resolve_input_source(
-        col_idx, build_size, build_input, build_node, probe_input, probe_node);
-    bool from_build = col_idx < build_size;
+    const AnalyzedJoinNode &join_node, ColumnarReader &columnar_reader,
+    const AnalyzedPlan &analyzed_plan) {
+
+    // Find column info
+    const AnalyzedColumnInfo *col_info = nullptr;
+    for (const auto &col : join_node.columns) {
+        if (col.original_idx == col_idx) {
+            col_info = &col;
+            break;
+        }
+    }
 
-    const Column *col_source = nullptr;
-    const mema::column_t *inter_source = nullptr;
+    if (!col_info) {
+        // Fallback - shouldn't happen
+        return;
+    }
 
-    if (input.is_columnar()) {
-        auto *table = std::get<const ColumnarTable *>(input.data);
-        auto [actual_idx, _] = node.output_attrs[local_idx];
-        col_source = &table->columns[actual_idx];
+    // Determine if this column comes from build or probe side at runtime
+    bool from_build = (col_info->from_left == build_is_left);
+    const JoinInput &src_input = from_build ? build_input : probe_input;
+
+    // Determine how to read the value
+    const Column *columnar_source = nullptr;
+    const mema::column_t *materialized_source = nullptr;
+    const mema::key_row_column_t *tuple_source = nullptr;
+    const mema::DeferredTable *deferred_table = nullptr;
+    uint8_t deferred_base_col = 0;
+    uint8_t deferred_base_table = 0;
+
+    if (src_input.is_columnar()) {
+        // Direct columnar read
+        const auto *table = std::get<const ColumnarTable *>(src_input.data);
+        auto [actual_idx, _] =
+            src_input.node->output_attrs[col_info->child_output_idx];
+        columnar_source = &table->columns[actual_idx];
     } else {
-        const auto &res = std::get<ExecuteResult>(input.data);
-        inter_source = &res[local_idx];
+        const auto &ir = std::get<IntermediateResult>(src_input.data);
+        // Check if column is stored as join key tuples
+        if (ir.is_join_key(col_info->child_output_idx)) {
+            tuple_source = &(*ir.join_key_tuples);
+        } else if (ir.is_materialized(col_info->child_output_idx)) {
+            // Read from materialized column
+            materialized_source =
+                ir.get_materialized(col_info->child_output_idx);
+        } else if (ir.is_deferred(col_info->child_output_idx)) {
+            // Deferred - need to resolve via deferred table + base table
+            deferred_table = ir.get_deferred_table(col_info->child_output_idx);
+            deferred_base_col =
+                ir.get_deferred_base_col(col_info->child_output_idx);
+            // Get base table ID from the deferred table itself
+            if (deferred_table) {
+                deferred_base_table = deferred_table->base_table_id;
+            }
+        }
     }
 
-    auto reader = [&](uint32_t rid, ColumnarReader::Cursor &cursor,
-                      DataType type) {
-        if (col_source) {
-            return columnar_reader.read_value(*col_source, local_idx, rid, type,
-                                              cursor, from_build);
+    // Create reader lambda
+    auto reader = [&](uint32_t local_row_id,
+                      ColumnarReader::Cursor &cursor) -> mema::value_t {
+        mema::value_t result;
+        if (columnar_source) {
+            result = columnar_reader.read_value(
+                *columnar_source, col_info->child_output_idx, local_row_id,
+                col_info->type, cursor, from_build);
+        } else if (tuple_source) {
+            // Read key value from tuple column
+            result = mema::value_t{tuple_source->key_at(local_row_id)};
+        } else if (materialized_source) {
+            result = (*materialized_source)[local_row_id];
+        } else if (deferred_table && analyzed_plan.original_plan) {
+            // Deferred resolution: look up base table row ID from deferred
+            // table
+            uint32_t base_row = (*deferred_table)[local_row_id];
+            const auto &base_table =
+                analyzed_plan.original_plan->inputs[deferred_base_table];
+            result = columnar_reader.read_value(
+                base_table.columns[deferred_base_col], deferred_base_col,
+                base_row, col_info->type, cursor, true);
+        } else {
+            result = mema::value_t{mema::value_t::NULL_VALUE};
         }
-        return (*inter_source)[rid];
+        return result;
     };
 
+    // Materialize based on type
     if (dest_col.type == DataType::INT32) {
         auto init = [](std::function<Page *()> alloc) {
             return Int32PageBuilder(std::move(alloc));
         };
-        materialize_column_from_buffers<Mode, Int32PageBuilder>(
+        materialize_column<Mode, Int32PageBuilder>(
             dest_col, buffers, total_matches,
             [&](uint32_t rid, ColumnarReader::Cursor &cursor) {
-                return reader(rid, cursor, DataType::INT32);
+                return reader(rid, cursor);
             },
             init, from_build, 4);
         return;
     }
 
-    const Column *str_src_ptr = col_source;
-    if (!str_src_ptr && inter_source) {
-        str_src_ptr = &plan.inputs[inter_source->source_table]
-                           .columns[inter_source->source_column];
+    // VARCHAR
+    const Column *str_src_ptr = columnar_source;
+    if (!str_src_ptr) {
+        if (materialized_source) {
+            str_src_ptr = &analyzed_plan.original_plan
+                               ->inputs[materialized_source->source_table]
+                               .columns[materialized_source->source_column];
+        } else if (deferred_table && analyzed_plan.original_plan) {
+            // For deferred VARCHAR, get source from provenance metadata
+            str_src_ptr =
+                &analyzed_plan.original_plan->inputs[deferred_base_table]
+                     .columns[deferred_base_col];
+        }
+    }
+
+    if (!str_src_ptr) {
+        // Shouldn't happen, but handle gracefully
+        return;
     }
 
     auto init = [str_src_ptr](std::function<Page *()> alloc) {
         return VarcharPageBuilder(*str_src_ptr, std::move(alloc));
     };
 
-    materialize_column_from_buffers<Mode, VarcharPageBuilder>(
+    materialize_column<Mode, VarcharPageBuilder>(
         dest_col, buffers, total_matches,
         [&](uint32_t rid, ColumnarReader::Cursor &cursor) {
-            return reader(rid, cursor, DataType::VARCHAR);
+            return reader(rid, cursor);
         },
         init, from_build, 35);
 }
 
 /**
- * @brief Materializes all output columns from thread-local buffers into
- * ColumnarTable.
- *
- * Dereferences VARCHAR value_t references into actual string bytes.
+ * @brief Materialize all output columns from intermediate result.
  *
- * @tparam Mode            Collection mode for compile-time specialization.
- * @param buffers          Thread-local match buffers from probe.
- * @param build_input      Build side data source.
- * @param probe_input      Probe side data source.
- * @param remapped_attrs   Output projection: (col_idx, DataType) pairs.
- * @param build_node       Metadata for build side output_attrs mapping.
- * @param probe_node       Metadata for probe side output_attrs mapping.
- * @param build_size       Number of columns from build side.
- * @param columnar_reader  PageIndex-accelerated reader for Column page access.
- * @param plan             Full query plan for VARCHAR dereferencing.
- * @return ColumnarTable with self-contained page data.
+ * For root join. Resolves all deferred columns by looking up 32-bit row IDs
+ * in DeferredTable back to base tables.
  *
- * @see construct_intermediate.h for creating intermediate ExecuteResult.
- * @see page_builders.h for Int32PageBuilder and VarcharPageBuilder.
+ * @tparam Mode Collection mode for compile-time specialization.
+ * @param buffers Thread-local match buffers from probe.
+ * @param build_input Build side input.
+ * @param probe_input Probe side input.
+ * @param join_node Analyzed join node with column info.
+ * @param remapped_attrs Output projection after build/probe remapping.
+ * @param build_size Number of columns from build side.
+ * @param columnar_reader Reader for columnar data.
+ * @param analyzed_plan Full analyzed plan for base table access.
+ * @return ColumnarTable with final output.
  */
 template <MatchCollectionMode Mode>
 inline ColumnarTable materialize_from_buffers(
     std::vector<ThreadLocalMatchBuffer<Mode>> &buffers,
     const JoinInput &build_input, const JoinInput &probe_input,
+    const AnalyzedJoinNode &join_node,
     const std::vector<std::tuple<size_t, DataType>> &remapped_attrs,
-    const PlanNode &build_node, const PlanNode &probe_node, size_t build_size,
-    ColumnarReader &columnar_reader, const Plan &plan) {
+    size_t build_size, bool build_is_left, ColumnarReader &columnar_reader,
+    const AnalyzedPlan &analyzed_plan) {
 
-    // Compute total_matches
+    // Compute total matches
     size_t total_matches = 0;
     for (const auto &buf : buffers) {
         total_matches += buf.count();
     }
 
-    ColumnarTable result;
-    result.num_rows = total_matches;
-
     if (total_matches == 0) {
-        for (auto [_, dtype] : remapped_attrs) {
-            result.columns.emplace_back(dtype);
-        }
-        return result;
+        return create_empty_result(remapped_attrs);
     }
 
+    ColumnarTable result;
+    result.num_rows = total_matches;
+
     for (size_t out_idx = 0; out_idx < remapped_attrs.size(); ++out_idx) {
         auto [col_idx, data_type] = remapped_attrs[out_idx];
         result.columns.emplace_back(data_type);
         Column &dest_col = result.columns.back();
-        materialize_single_column_from_buffers<Mode>(
-            dest_col, col_idx, build_size, buffers, total_matches, build_input,
-            probe_input, build_node, probe_node, columnar_reader, plan);
+
+        // Pass out_idx (output position) not col_idx (global column index)
+        // because materialize_single_column searches by original_idx
+        // which is the output position in join_node.columns
+        materialize_single_column<Mode>(dest_col, out_idx, build_size,
+                                        build_is_left, buffers, total_matches,
+                                        build_input, probe_input, join_node,
+                                        columnar_reader, analyzed_plan);
     }
+
     return result;
 }
 
-} // namespace Contest::materialize
+} // namespace materialize
+} // namespace Contest
diff --git a/include/platform/arena.h b/include/platform/arena.h
index f1aa32e..59d3442 100644
--- a/include/platform/arena.h
+++ b/include/platform/arena.h
@@ -41,12 +41,13 @@ static constexpr size_t PAGE_2MB = 2 * 1024 * 1024;
  * @brief Chunk type enumeration for arena regions.
  */
 enum class ChunkType : uint8_t {
-    HASH_CHUNK = 0,  ///< 4KB  - hash table partition chunks
-    IR_PAGE = 1,     ///< 16KB - intermediate result pages
-    INDEX_CHUNK = 2, ///< 32KB - match collector index chunks
-    GENERAL = 3,     ///< Variable - misc allocations
+    HASH_CHUNK = 0,    ///< 4KB  - hash table partition chunks
+    IR_PAGE = 1,       ///< 16KB - intermediate result pages (32-bit values)
+    INDEX_CHUNK = 2,   ///< 32KB - match collector index chunks
+    DEFERRED_PAGE = 3, ///< 32KB - deferred provenance pages (64-bit values)
+    GENERAL = 4,       ///< Variable - misc allocations
 
-    NUM_TYPES = 4
+    NUM_TYPES = 5
 };
 
 // ============================================================================
@@ -67,12 +68,15 @@ template <> struct ChunkSize<ChunkType::IR_PAGE> {
 template <> struct ChunkSize<ChunkType::INDEX_CHUNK> {
     static constexpr size_t value = 32768;
 };
+template <> struct ChunkSize<ChunkType::DEFERRED_PAGE> {
+    static constexpr size_t value = 32768;
+};
 template <> struct ChunkSize<ChunkType::GENERAL> {
     static constexpr size_t value = 0;
 };
 
 /// Runtime chunk size array indexed by ChunkType.
-inline constexpr size_t CHUNK_SIZES[] = {4096, 16384, 32768, 0};
+inline constexpr size_t CHUNK_SIZES[] = {4096, 16384, 32768, 32768, 0};
 
 // ============================================================================
 // Page Policies
@@ -92,6 +96,7 @@ inline constexpr PagePolicy REGION_PAGE_POLICY[] = {
     PagePolicy::SMALL_PAGES, // HASH_CHUNK
     PagePolicy::HUGE_PAGES,  // IR_PAGE
     PagePolicy::HUGE_PAGES,  // INDEX_CHUNK
+    PagePolicy::HUGE_PAGES,  // DEFERRED_PAGE
     PagePolicy::HUGE_PAGES,  // GENERAL
 };
 
@@ -102,7 +107,7 @@ inline constexpr PagePolicy REGION_PAGE_POLICY[] = {
 /**
  * @brief Region size configuration based on available DRAM.
  *
- * Uses 75% of SPC__NUMA_NODE_DRAM_MB, divided equally (25%) among 4 regions.
+ * Uses 75% of SPC__NUMA_NODE_DRAM_MB, divided equally (20%) among 5 regions.
  */
 struct RegionConfig {
     size_t total_arena_bytes;
@@ -113,8 +118,8 @@ struct RegionConfig {
                             1024ULL * 1024ULL * 3ULL / 4ULL;
     }
 
-    /// Get total size for a region (25% each).
-    size_t get(ChunkType /*ct*/) const { return total_arena_bytes / 4; }
+    /// Get total size for a region (20% each).
+    size_t get(ChunkType /*ct*/) const { return total_arena_bytes / 5; }
 
     /// Get total arena size.
     size_t total() const { return total_arena_bytes; }
@@ -450,7 +455,8 @@ class ArenaManager {
 // Global Instance and Helper
 // ============================================================================
 
-/// Global arena manager instance (inline global, constructed at program startup).
+/// Global arena manager instance (inline global, constructed at program
+/// startup).
 inline ArenaManager g_arena_manager{};
 
 /// Get thread arena by thread ID.
diff --git a/include/platform/hardware.h b/include/platform/hardware.h
index 83ef443..0cbb011 100644
--- a/include/platform/hardware.h
+++ b/include/platform/hardware.h
@@ -10,8 +10,8 @@
  */
 #pragma once
 
-#define SPC__CORE_COUNT 8
-#define SPC__THREAD_COUNT 16
+#define SPC__CORE_COUNT 6
+#define SPC__THREAD_COUNT 6
 #define SPC__LEVEL1_DCACHE_SIZE 32768
 #define SPC__LEVEL2_CACHE_SIZE 1048576
 #define SPC__LEVEL3_CACHE_SIZE 33554432
diff --git a/src/analyze_plan.cpp b/src/analyze_plan.cpp
new file mode 100644
index 0000000..3c7fc83
--- /dev/null
+++ b/src/analyze_plan.cpp
@@ -0,0 +1,302 @@
+/**
+ * @file analyze_plan.cpp
+ * @brief Analyzes query plan and computes materialization decisions.
+ *
+ * Walks the plan tree in post-order to determine which columns should be
+ * materialized eagerly (join keys needed by parent) vs deferred until final
+ * output. Traces column provenance back to base tables for deferred resolution.
+ *
+ * @see deferred_plan.h for AnalyzedPlan structure.
+ */
+#include <functional>
+#include <unordered_map>
+
+#include <data_model/deferred_plan.h>
+
+namespace Contest {
+
+namespace {
+
+/**
+ * @brief Which join node consumes a plan node, and on which side.
+ */
+struct ParentInfo {
+    size_t parent_idx;  ///< Index of the parent join node in Plan::nodes.
+    bool is_left_child; ///< True if this node is the parent's left input.
+};
+
+/**
+ * @brief Map every child node index to its parent join and side.
+ *
+ * Only children of join nodes get an entry, so the root is absent.
+ */
+std::unordered_map<size_t, ParentInfo> build_parent_map(const Plan &plan) {
+    std::unordered_map<size_t, ParentInfo> child_to_parent;
+    size_t parent_idx = 0;
+    for (const auto &candidate : plan.nodes) {
+        if (const auto *join = std::get_if<JoinNode>(&candidate.data)) {
+            child_to_parent[join->left] = ParentInfo{parent_idx, true};
+            child_to_parent[join->right] = ParentInfo{parent_idx, false};
+        }
+        ++parent_idx;
+    }
+    return child_to_parent;
+}
+
+/**
+ * @brief Resolve an output column of a plan node to its base-table origin.
+ *
+ * Descends through join nodes, following whichever child supplies the
+ * column, and stops at the first scan node.
+ *
+ * @param plan Original query plan.
+ * @param node_idx Index of the node to start from.
+ * @param column_idx Position within that node's output_attrs.
+ * @return ColumnProvenance built from the scan's base table ID and the
+ *         scan's source column index, each cast to uint8_t.
+ */
+ColumnProvenance trace_provenance(const Plan &plan, size_t node_idx,
+                                  size_t column_idx) {
+    const auto &current = plan.nodes[node_idx];
+
+    // Both node kinds store the source index as the first tuple element.
+    auto [source_idx, _] = current.output_attrs[column_idx];
+
+    // Scan: recursion bottoms out here.
+    if (const auto *scan = std::get_if<ScanNode>(&current.data)) {
+        return ColumnProvenance{static_cast<uint8_t>(scan->base_table_id),
+                                static_cast<uint8_t>(source_idx)};
+    }
+
+    // Join: source_idx addresses the left child's outputs followed by the
+    // right child's, so indices past the left width are rebased.
+    const auto &join = std::get<JoinNode>(current.data);
+    const size_t left_width = plan.nodes[join.left].output_attrs.size();
+    const bool from_left = source_idx < left_width;
+
+    const size_t child_node = from_left ? join.left : join.right;
+    const size_t child_column =
+        from_left ? source_idx : source_idx - left_width;
+    return trace_provenance(plan, child_node, child_column);
+}
+
+/**
+ * @brief Look up the column of this node that its parent joins on.
+ *
+ * @param plan Original query plan.
+ * @param node_idx Index of the node being queried.
+ * @param parent_map Child index -> parent info, as built for this plan.
+ * @return The parent's left_attr or right_attr (matching the side this
+ *         node sits on), or nullopt when the node has no parent entry.
+ */
+std::optional<size_t>
+find_parent_join_key(const Plan &plan, size_t node_idx,
+                     const std::unordered_map<size_t, ParentInfo> &parent_map) {
+    const auto entry = parent_map.find(node_idx);
+    if (entry == parent_map.end())
+        return std::nullopt; // No parent recorded: root node
+
+    const ParentInfo &link = entry->second;
+    const auto &parent_join =
+        std::get<JoinNode>(plan.nodes[link.parent_idx].data);
+    if (link.is_left_child)
+        return parent_join.left_attr;
+    return parent_join.right_attr;
+}
+
+/**
+ * @brief Compute base collection mode based on which sides have output columns.
+ *
+ * Assumes build=left; if build=right at runtime, the caller flips
+ * LEFT_ONLY/RIGHT_ONLY. A node with no output columns yields BOTH.
+ * The second parameter (left child's output size) is never read because
+ * from_left is already set per column; it is left unnamed so the signature
+ * stays stable without triggering -Wunused-parameter.
+ *
+ * @param columns Analyzed output columns of the join node.
+ * @return BOTH, LEFT_ONLY, or RIGHT_ONLY as described above.
+ */
+join::MatchCollectionMode
+compute_base_collection_mode(const std::vector<AnalyzedColumnInfo> &columns,
+                             size_t /*left_output_size*/) {
+    bool needs_left = false;
+    bool needs_right = false;
+    for (const auto &col : columns) {
+        needs_left = needs_left || col.from_left;
+        needs_right = needs_right || !col.from_left;
+        if (needs_left && needs_right)
+            return join::MatchCollectionMode::BOTH;
+    }
+    if (needs_left)
+        return join::MatchCollectionMode::LEFT_ONLY;
+    if (needs_right)
+        return join::MatchCollectionMode::RIGHT_ONLY;
+    return join::MatchCollectionMode::BOTH;
+}
+
+} // anonymous namespace
+
+/**
+ * @brief Build an AnalyzedPlan mirroring @p plan with per-column resolution.
+ *
+ * @param plan Query plan; the result keeps a pointer to it (original_plan).
+ * @return AnalyzedPlan whose nodes are indexed like plan.nodes.
+ */
+AnalyzedPlan analyze_plan(const Plan &plan) {
+    AnalyzedPlan analyzed;
+    analyzed.original_plan = &plan;
+    analyzed.nodes.resize(plan.nodes.size());
+    analyzed.root = plan.root;
+
+    auto parent_map = build_parent_map(plan);
+
+    // Build post-order traversal from the root (children before parents)
+    std::vector<size_t> post_order;
+    post_order.reserve(plan.nodes.size());
+    std::vector<bool> visited(plan.nodes.size(), false);
+
+    std::function<void(size_t)> visit = [&](size_t idx) {
+        if (visited[idx])
+            return;
+        visited[idx] = true;
+
+        const auto &node = plan.nodes[idx];
+        if (const auto *join = std::get_if<JoinNode>(&node.data)) {
+            visit(join->left);
+            visit(join->right);
+        }
+        post_order.push_back(idx);
+    };
+    visit(plan.root);
+
+    // PASS 1: Build structure and initial materialization decisions
+    for (size_t node_idx : post_order) {
+        const auto &node = plan.nodes[node_idx];
+
+        if (const auto *scan = std::get_if<ScanNode>(&node.data)) {
+            // Scan node: copy its identity and output_attrs unchanged
+            AnalyzedScanNode ascan;
+            ascan.node_idx = node_idx;
+            ascan.base_table_id = scan->base_table_id;
+            ascan.output_attrs = node.output_attrs;
+            analyzed.nodes[node_idx] = std::move(ascan);
+
+        } else {
+            // Join node: compute materialization decisions
+            const auto &join = std::get<JoinNode>(node.data);
+            AnalyzedJoinNode ajoin;
+            ajoin.node_idx = node_idx;
+            ajoin.left_child_idx = join.left;
+            ajoin.right_child_idx = join.right;
+            ajoin.left_join_attr = join.left_attr;
+            ajoin.right_join_attr = join.right_attr;
+            ajoin.output_attrs = node.output_attrs;
+            ajoin.is_root = (node_idx == plan.root);
+
+            // Column of this node that the parent joins on (nullopt at root)
+            ajoin.parent_join_key_idx =
+                find_parent_join_key(plan, node_idx, parent_map);
+
+            // Left child's output width separates left from right columns
+            const auto &left_node = plan.nodes[join.left];
+            size_t left_size = left_node.output_attrs.size();
+
+            // Build column info for each output column
+            for (size_t i = 0; i < node.output_attrs.size(); ++i) {
+                auto [col_idx, col_type] = node.output_attrs[i];
+
+                AnalyzedColumnInfo info;
+                info.original_idx = i;
+                info.type = col_type;
+
+                // col_idx indexes the concatenated child outputs:
+                // [0, left_size) is a position in the left child's output;
+                // anything above is a right-child position plus left_size
+                if (col_idx < left_size) {
+                    info.from_left = true;
+                    info.child_output_idx = col_idx;
+                } else {
+                    info.from_left = false;
+                    info.child_output_idx = col_idx - left_size;
+                }
+
+                // Materialization decision: the root materializes every
+                // column (final output); other joins only the parent's key
+                if (ajoin.is_root) {
+                    info.resolution = ColumnResolution::MATERIALIZE;
+                } else if (ajoin.parent_join_key_idx.has_value() &&
+                           i == *ajoin.parent_join_key_idx) {
+                    info.resolution = ColumnResolution::MATERIALIZE;
+                } else {
+                    info.resolution = ColumnResolution::DEFER;
+                }
+
+                // Trace provenance to base table
+                info.provenance = trace_provenance(plan, node_idx, i);
+
+                ajoin.columns.push_back(std::move(info));
+            }
+
+            // Compute collection mode from which sides supply columns
+            ajoin.base_collection_mode =
+                compute_base_collection_mode(ajoin.columns, left_size);
+
+            // Provisional count; PASS 3 recomputes it after propagation
+            ajoin.num_deferred_columns = 0;
+            for (const auto &col : ajoin.columns) {
+                if (col.resolution == ColumnResolution::DEFER) {
+                    ++ajoin.num_deferred_columns;
+                }
+            }
+
+            analyzed.nodes[node_idx] = std::move(ajoin);
+        }
+    }
+
+    // PASS 2: Propagate materialization requirements to children
+    // Process in reverse post-order (parents before children)
+    for (auto it = post_order.rbegin(); it != post_order.rend(); ++it) {
+        size_t node_idx = *it;
+        auto *ajoin = std::get_if<AnalyzedJoinNode>(&analyzed.nodes[node_idx]);
+        if (!ajoin)
+            continue;
+
+        // A MATERIALIZE column must also be materialized by its child join
+        for (const auto &col : ajoin->columns) {
+            if (col.resolution != ColumnResolution::MATERIALIZE)
+                continue;
+
+            size_t child_idx =
+                col.from_left ? ajoin->left_child_idx : ajoin->right_child_idx;
+
+            auto *child_ajoin =
+                std::get_if<AnalyzedJoinNode>(&analyzed.nodes[child_idx]);
+            if (!child_ajoin)
+                continue; // Child is a scan - always has data
+
+            // Mark child's column as MATERIALIZE (out-of-range is skipped)
+            if (col.child_output_idx < child_ajoin->columns.size()) {
+                child_ajoin->columns[col.child_output_idx].resolution =
+                    ColumnResolution::MATERIALIZE;
+            }
+        }
+    }
+
+    // PASS 3: Recount num_deferred_columns after propagation
+    for (size_t node_idx : post_order) {
+        auto *ajoin = std::get_if<AnalyzedJoinNode>(&analyzed.nodes[node_idx]);
+        if (!ajoin)
+            continue;
+
+        ajoin->num_deferred_columns = 0;
+        for (const auto &col : ajoin->columns) {
+            if (col.resolution == ColumnResolution::DEFER) {
+                ++ajoin->num_deferred_columns;
+            }
+        }
+    }
+
+    return analyzed;
+}
+
+} // namespace Contest
diff --git a/src/execute.cpp b/src/execute.cpp
index c5a3eed..ce81a31 100644
--- a/src/execute.cpp
+++ b/src/execute.cpp
@@ -5,18 +5,25 @@
  * Traverses plan tree: resolve inputs -> select build/probe -> algorithm
  * selection -> match collection -> output construction.
  *
- * Flow: execute() -> execute_impl() recursively -> resolve_join_input() for
- * ScanNode (ColumnarTable*) or JoinNode (ExecuteResult). Root produces
- * ColumnarTable; non-root produces ExecuteResult.
+ * Flow: execute() -> execute_impl() recursively -> resolve_input() for
+ * ScanNode (ColumnarTable*) or JoinNode (IntermediateResult). Root produces
+ * ColumnarTable; non-root produces IntermediateResult.
  *
- * Lifetimes: base tables live for query duration; ExecuteResult held on stack
- * until parent completes; VARCHAR refs valid via base table lifetime.
+ * Lifetimes: base tables live for query duration; IntermediateResult held on
+ * stack until parent completes; VARCHAR refs valid via base table lifetime.
  *
  * Row order non-deterministic (work-stealing); semantically correct per SQL.
  *
  * @see plan.h, match_collector.h, materialize.h, construct_intermediate.h
  */
+#include "data_model/plan.h"
+#include <cassert>
 #include <foundation/attribute.h>
+#include <functional>
+#include <ostream>
+#include <queue>
+#include <string>
+#include <utility>
 #if defined(__APPLE__) && defined(__aarch64__)
 #include <platform/hardware_darwin.h>
 #elif defined(SPC__USE_BENCHMARKVM_HARDWARE)
@@ -27,6 +34,7 @@
 
 #include <chrono>
 #include <data_access/columnar_reader.h>
+#include <data_model/deferred_plan.h>
 #include <data_model/intermediate.h>
 #include <iostream>
 #include <join_execution/hash_join.h>
@@ -44,71 +52,87 @@ namespace Contest {
 
 using namespace join;
 
-using materialize::construct_intermediate_from_buffers;
-using materialize::create_empty_result;
+using materialize::construct_intermediate_with_tuples;
+using materialize::create_empty_intermediate_result;
 using materialize::materialize_from_buffers;
 
 /**
- * @brief Result variant: ExecuteResult (intermediate, value_t columns) or
- * ColumnarTable (final output per contest API).
+ * @brief Result variant: IntermediateResult (non-root) or ColumnarTable (root).
  */
-using JoinResult = std::variant<ExecuteResult, ColumnarTable>;
+using JoinResult = std::variant<IntermediateResult, ColumnarTable>;
 
-/**
- * @brief Recursive join execution with timing.
- * @param plan Query plan with nodes and base tables.
- * @param node_idx Current node index in plan.nodes.
- * @param is_root True -> ColumnarTable output; false -> ExecuteResult.
- * @param stats Timing accumulator.
- * @return JoinResult (intermediate or final).
- */
-JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root,
+// Forward declaration
+JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root,
                         TimingStats &stats);
 
 /**
  * @brief Resolve plan node to JoinInput.
  *
  * ScanNode -> non-owning ColumnarTable*; JoinNode -> recursive execution
- * returning owned ExecuteResult. Implements depth-first traversal.
- *
- * @param plan Query plan.
- * @param node_idx Node index to resolve.
- * @param stats Timing accumulator.
- * @return JoinInput with data variant and metadata.
+ * returning owned IntermediateResult.
  */
-JoinInput resolve_join_input(const Plan &plan, size_t node_idx,
-                             TimingStats &stats) {
+JoinInput resolve_input(const AnalyzedPlan &plan, size_t node_idx,
+                        TimingStats &stats) {
     JoinInput input;
-    const auto &node = plan.nodes[node_idx];
-    input.node = &node;
+    const auto &anode = plan[node_idx];
+    const auto &pnode = plan.original_plan->nodes[node_idx];
+    input.node = &pnode;
+    input.analyzed_node = &anode;
 
-    if (const auto *scan = std::get_if<ScanNode>(&node.data)) {
-        input.data = &plan.inputs[scan->base_table_id];
+    if (const auto *scan = std::get_if<AnalyzedScanNode>(&anode)) {
+        input.data = &plan.original_plan->inputs[scan->base_table_id];
         input.table_id = scan->base_table_id;
     } else {
         auto result = execute_impl(plan, node_idx, false, stats);
-        input.data = std::get<ExecuteResult>(std::move(result));
+        input.data = std::get<IntermediateResult>(std::move(result));
         input.table_id = 0;
     }
     return input;
 }
 
+/**
+ * @brief Decide which join input builds the hash table and which probes it.
+ *
+ * The input reporting fewer rows (left on ties) becomes the build side.
+ * When the right input builds, output column indices are rewritten from
+ * left-then-right order to build-then-probe order.
+ */
+BuildProbeConfig select_join_build_probe_side(
+    const JoinNode &join, const JoinInput &left_input,
+    const JoinInput &right_input,
+    const std::vector<std::tuple<size_t, DataType>> &output_attrs) {
+    BuildProbeConfig config;
+    const size_t left_rows = left_input.row_count(join.left_attr);
+    const size_t right_rows = right_input.row_count(join.right_attr);
+    config.build_left = left_rows <= right_rows;
+    config.build_attr = config.build_left ? join.left_attr : join.right_attr;
+    config.probe_attr = config.build_left ? join.right_attr : join.left_attr;
+    config.remapped_attrs = output_attrs;
+
+    const size_t left_width = left_input.output_size();
+    if (config.build_left)
+        return config; // Left-then-right already equals build-then-probe
+
+    // Right builds: left columns move behind the right (build) columns,
+    // right columns move to the front.
+    const size_t right_width = right_input.output_size();
+    for (auto &[col_idx, dtype] : config.remapped_attrs) {
+        col_idx = col_idx < left_width ? right_width + col_idx
+                                       : col_idx - left_width;
+    }
+    return config;
+}
+
 /**
  * @brief Unified probe + materialize helper templated on collection mode.
- *
- * Executes probe (nested loop or hash join) and materialization/intermediate
- * construction in a single function. Template parameter eliminates runtime
- * branching in hot loops.
- *
- * @tparam Mode Collection mode (BOTH, LEFT_ONLY, RIGHT_ONLY).
  */
 template <MatchCollectionMode Mode>
 JoinResult execute_join_with_mode(
     bool use_nested_loop, bool probe_is_columnar, bool is_root,
     const UnchainedHashtable *hash_table, const JoinInput &build_input,
     const JoinInput &probe_input, const BuildProbeConfig &config,
-    const PlanNode &build_node, const PlanNode &probe_node, JoinSetup &setup,
-    io::ColumnarReader &columnar_reader, const Plan &plan, TimingStats &stats) {
+    const AnalyzedJoinNode &join_node, io::ColumnarReader &columnar_reader,
+    const AnalyzedPlan &plan, TimingStats &stats) {
 
     std::vector<ThreadLocalMatchBuffer<Mode>> match_buffers;
 
@@ -128,9 +152,31 @@ JoinResult execute_join_with_mode(
                                                  config.probe_attr);
         } else {
             const auto &probe_result =
-                std::get<ExecuteResult>(probe_input.data);
-            match_buffers = probe_intermediate<Mode>(
-                *hash_table, probe_result[config.probe_attr]);
+                std::get<IntermediateResult>(probe_input.data);
+
+            // Use tuple-based probe if available
+            if (probe_result.has_join_key_tuples() &&
+                probe_result.join_key_idx.has_value() &&
+                *probe_result.join_key_idx == config.probe_attr) {
+                match_buffers = probe_tuples<Mode>(
+                    *hash_table, *probe_result.join_key_tuples);
+            } else {
+                // Fall back to materialized column probe
+                const auto *mat_col =
+                    probe_result.get_materialized(config.probe_attr);
+                if (!mat_col) {
+                    std::fprintf(
+                        stderr,
+                        "ERROR: probe join key not materialized! "
+                        "probe_attr=%zu "
+                        "mat_map_size=%zu num_rows=%zu has_tuples=%d\n",
+                        config.probe_attr, probe_result.materialized_map.size(),
+                        probe_result.num_rows,
+                        probe_result.has_join_key_tuples() ? 1 : 0);
+                    std::abort();
+                }
+                match_buffers = probe_intermediate<Mode>(*hash_table, *mat_col);
+            }
         }
         auto probe_end = std::chrono::high_resolution_clock::now();
         stats.hash_join_probe_ms +=
@@ -148,16 +194,19 @@ JoinResult execute_join_with_mode(
         auto mat_start = std::chrono::high_resolution_clock::now();
         JoinResult final_result;
         if (total_matches == 0) {
-            final_result = create_empty_result(config.remapped_attrs);
+            final_result =
+                materialize::create_empty_result(config.remapped_attrs);
         } else {
-            prepare_output_columns(
-                columnar_reader, build_input, probe_input, build_node,
-                probe_node, config.remapped_attrs, build_input.output_size());
+            // Prepare page indices for final materialization
+            materialize::prepare_final_columns(
+                columnar_reader, build_input, probe_input, join_node,
+                config.remapped_attrs, build_input.output_size(),
+                config.build_left);
 
             final_result = materialize_from_buffers<Mode>(
-                match_buffers, build_input, probe_input, config.remapped_attrs,
-                build_node, probe_node, build_input.output_size(),
-                columnar_reader, plan);
+                match_buffers, build_input, probe_input, join_node,
+                config.remapped_attrs, build_input.output_size(),
+                config.build_left, columnar_reader, plan);
         }
         auto mat_end = std::chrono::high_resolution_clock::now();
         stats.materialize_ms +=
@@ -167,93 +216,110 @@ JoinResult execute_join_with_mode(
         return final_result;
     } else {
         auto inter_start = std::chrono::high_resolution_clock::now();
+        IntermediateResult result;
         if (total_matches > 0) {
-            prepare_output_columns(
-                columnar_reader, build_input, probe_input, build_node,
-                probe_node, config.remapped_attrs, build_input.output_size());
-
-            construct_intermediate_from_buffers<Mode>(
-                match_buffers, build_input, probe_input, config.remapped_attrs,
-                build_node, probe_node, build_input.output_size(),
-                columnar_reader, setup.results);
+            materialize::prepare_intermediate_columns(
+                columnar_reader, build_input, probe_input, join_node,
+                config.remapped_attrs, build_input.output_size(),
+                config.build_left, join_node.parent_join_key_idx);
+
+            construct_intermediate_with_tuples<Mode>(
+                match_buffers, build_input, probe_input, join_node, config,
+                config.build_left, *join_node.parent_join_key_idx,
+                columnar_reader, result, plan);
+        } else {
+            result = create_empty_intermediate_result(join_node);
         }
         auto inter_end = std::chrono::high_resolution_clock::now();
         stats.intermediate_ms +=
             std::chrono::duration_cast<std::chrono::milliseconds>(inter_end -
                                                                   inter_start)
                 .count();
-        return std::move(setup.results);
+        return std::move(result);
     }
 }
 
 /**
- * @brief Core recursive join execution.
- *
- * Phases: resolve L/R inputs -> select build/probe (smaller=build) -> algorithm
- * choice -> build/probe -> output construction.
- *
- * Algorithm: nested loop if build_rows < HASH_TABLE_THRESHOLD (8); else radix-
- * partitioned hash join.
- *
- * Memory: hash table and MatchCollector local (freed on return); child
- * ExecuteResults on stack until materialization; setup.results pre-allocated.
+ * @brief Recursive join execution.
  */
-JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root,
+JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root,
                         TimingStats &stats) {
-    auto &node = plan.nodes[node_idx];
+    const auto &anode = plan[node_idx];
 
-    if (!std::holds_alternative<JoinNode>(node.data)) {
-        return ExecuteResult{};
+    if (std::holds_alternative<AnalyzedScanNode>(anode)) {
+        return IntermediateResult{};
     }
 
-    const auto &join = std::get<JoinNode>(node.data);
-    const auto &output_attrs = node.output_attrs;
-    const auto &left_node = plan.nodes[join.left];
-    const auto &right_node = plan.nodes[join.right];
+    const auto &ajoin = std::get<AnalyzedJoinNode>(anode);
+    const auto &original_plan = *plan.original_plan;
+    const auto &pnode = original_plan.nodes[node_idx];
+    const auto &join = std::get<JoinNode>(pnode.data);
 
-    JoinInput left_input = resolve_join_input(plan, join.left, stats);
-    JoinInput right_input = resolve_join_input(plan, join.right, stats);
+    // Resolve inputs
+    JoinInput left_input = resolve_input(plan, ajoin.left_child_idx, stats);
+    JoinInput right_input = resolve_input(plan, ajoin.right_child_idx, stats);
 
-    /* Build/probe selection: smaller input = build side; remaps output_attrs.
-     */
+    // Build/probe selection
     auto setup_start = std::chrono::high_resolution_clock::now();
-    auto config =
-        select_build_probe_side(join, left_input, right_input, output_attrs);
+    auto config = select_join_build_probe_side(join, left_input, right_input,
+                                               ajoin.output_attrs);
     const JoinInput &build_input = config.build_left ? left_input : right_input;
     const JoinInput &probe_input = config.build_left ? right_input : left_input;
-    const auto &build_node = config.build_left ? left_node : right_node;
-    const auto &probe_node = config.build_left ? right_node : left_node;
 
     bool build_is_columnar = build_input.is_columnar();
     bool probe_is_columnar = probe_input.is_columnar();
 
-    /* Nested loop for <8 rows (L1-resident, no hash overhead, SIMD). */
     const size_t HASH_TABLE_THRESHOLD = 8;
     size_t build_rows = build_input.row_count(config.build_attr);
     bool use_nested_loop = (build_rows < HASH_TABLE_THRESHOLD);
 
-    /* Pre-allocate ExecuteResult; ColumnarReader PageIndex built lazily. */
-    JoinSetup setup = setup_join(build_input, probe_input, build_node,
-                                 probe_node, left_node, right_node, left_input,
-                                 right_input, output_attrs, build_rows);
+    io::ColumnarReader columnar_reader;
     auto setup_end = std::chrono::high_resolution_clock::now();
-    auto setup_elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
-        setup_end - setup_start);
-    stats.setup_ms += setup_elapsed.count();
-
-    /* Skip unused-side row IDs if output needs only one side (50% savings). */
-    MatchCollectionMode collection_mode = determine_collection_mode(
-        config.remapped_attrs, config.build_left ? left_input.output_size()
-                                                 : right_input.output_size());
+    stats.setup_ms += std::chrono::duration_cast<std::chrono::milliseconds>(
+                          setup_end - setup_start)
+                          .count();
+
+    // Use pre-computed collection mode from plan analysis.
+    // base_collection_mode assumes build=left; flip if build=right at runtime.
+    MatchCollectionMode mode = ajoin.base_collection_mode;
+    if (!config.build_left) {
+        if (mode == MatchCollectionMode::LEFT_ONLY)
+            mode = MatchCollectionMode::RIGHT_ONLY;
+        else if (mode == MatchCollectionMode::RIGHT_ONLY)
+            mode = MatchCollectionMode::LEFT_ONLY;
+    }
 
-    /* Build hash table if needed (before mode dispatch). */
+    // Build hash table if needed
     std::optional<UnchainedHashtable> hash_table;
     if (!use_nested_loop) {
         auto build_start = std::chrono::high_resolution_clock::now();
-        hash_table =
-            build_is_columnar
-                ? build_from_columnar(build_input, config.build_attr)
-                : build_from_intermediate(build_input, config.build_attr);
+        if (build_is_columnar) {
+            hash_table = build_from_columnar(build_input, config.build_attr);
+        } else {
+            const auto &ir = std::get<IntermediateResult>(build_input.data);
+
+            // Use tuple-based build if available and matches build_attr
+            if (ir.has_join_key_tuples() && ir.join_key_idx.has_value() &&
+                *ir.join_key_idx == config.build_attr) {
+                hash_table.emplace(ir.join_key_tuples->row_count());
+                hash_table->build_from_tuples(*ir.join_key_tuples);
+            } else {
+                // Fall back to materialized column build
+                const auto *mat_col = ir.get_materialized(config.build_attr);
+                if (!mat_col) {
+                    std::fprintf(
+                        stderr,
+                        "ERROR: build join key not materialized! "
+                        "build_attr=%zu "
+                        "mat_map_size=%zu num_rows=%zu has_tuples=%d\n",
+                        config.build_attr, ir.materialized_map.size(),
+                        ir.num_rows, ir.has_join_key_tuples() ? 1 : 0);
+                    std::abort();
+                }
+                hash_table.emplace(mat_col->row_count());
+                hash_table->build_intermediate(*mat_col);
+            }
+        }
         auto build_end = std::chrono::high_resolution_clock::now();
         stats.hashtable_build_ms +=
             std::chrono::duration_cast<std::chrono::milliseconds>(build_end -
@@ -261,36 +327,91 @@ JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root,
                 .count();
     }
 
-    /* Dispatch based on collection mode - single runtime branch, then
-     * fully specialized template instantiation with zero branching in hot
-     * loops. */
-    switch (collection_mode) {
+    // Dispatch based on collection mode
+    switch (mode) {
     case MatchCollectionMode::BOTH:
         return execute_join_with_mode<MatchCollectionMode::BOTH>(
             use_nested_loop, probe_is_columnar, is_root,
             use_nested_loop ? nullptr : &(*hash_table), build_input,
-            probe_input, config, build_node, probe_node, setup,
-            setup.columnar_reader, plan, stats);
+            probe_input, config, ajoin, columnar_reader, plan, stats);
 
     case MatchCollectionMode::LEFT_ONLY:
         return execute_join_with_mode<MatchCollectionMode::LEFT_ONLY>(
             use_nested_loop, probe_is_columnar, is_root,
             use_nested_loop ? nullptr : &(*hash_table), build_input,
-            probe_input, config, build_node, probe_node, setup,
-            setup.columnar_reader, plan, stats);
+            probe_input, config, ajoin, columnar_reader, plan, stats);
 
     case MatchCollectionMode::RIGHT_ONLY:
         return execute_join_with_mode<MatchCollectionMode::RIGHT_ONLY>(
             use_nested_loop, probe_is_columnar, is_root,
             use_nested_loop ? nullptr : &(*hash_table), build_input,
-            probe_input, config, build_node, probe_node, setup,
-            setup.columnar_reader, plan, stats);
+            probe_input, config, ajoin, columnar_reader, plan, stats);
     }
 
-    // Should never reach here, but satisfy compiler
-    return ExecuteResult{};
+    return IntermediateResult{};
 }
 
+
+/**
+ * @brief Prints every join node of the plan tree in breadth-first order.
+ *
+ * Debugging aid that writes to stdout; scan nodes are skipped.
+ *
+ * @param plan The query plan to print.
+ * @param q Work queue of (node index, parent's join-key attribute index);
+ *          the caller seeds it with the root node.
+ **/
+static std::function<void(const Plan &, std::queue<std::tuple<int, int>>)>
+    print_plan = [](const Plan &plan, std::queue<std::tuple<int, int>> q) {
+    // Plain FIFO loop: visits nodes in the same order as per-level recursion.
+    while (!q.empty()) {
+        auto [node_idx, parent_attr] = q.front();
+        q.pop();
+        const auto &node = plan.nodes[node_idx];
+        if (std::holds_alternative<ScanNode>(node.data)) {
+            continue;
+        }
+        const auto &data = std::get<JoinNode>(node.data);
+        const size_t left_width = plan.nodes[data.left].output_attrs.size();
+        std::cout << " node: " << node_idx << " size: "
+                  << node.output_attrs.size() << std::endl;
+
+        bool match_left = false;
+        bool match_right = false;
+        for (size_t attr = 0; attr < node.output_attrs.size(); ++attr) {
+            auto [col, type] = node.output_attrs[attr];
+            // The root is seeded with a placeholder attribute: no tag for it.
+            if (node_idx != plan.root) {
+                if (attr == static_cast<size_t>(parent_attr)) std::cout << "build->";
+                else std::cout << "defer->";
+            }
+            // Columns below the left child's width are tagged as left-side.
+            if (col < left_width) {
+                std::cout << "left->";
+                match_left = true;
+            } else {
+                std::cout << "right->";
+                match_right = true;
+            }
+            if (DataType::INT32 == type) std::cout << "(" << col << ", INT32)";
+            else std::cout << "(" << col << ", STR)";
+            std::cout << std::endl;
+        }
+        std::cout << "====";
+        if (match_left && match_right) std::cout << "Match both";
+        else if (match_left) std::cout << "Match left";
+        else std::cout << "Match right";
+        std::cout << "====" << std::endl;
+
+        std::cout << "left_key: " << data.left_attr << " left child: " << data.left;
+        std::cout << "\nright_key: " << data.right_attr << " right child: " << data.right;
+        q.emplace(data.left, data.left_attr);
+        q.emplace(data.right, data.right_attr);
+        std::cout << "\n\n\n\n\n";
+    }
+};
+
+
 /**
  * @brief Public entry point: execute plan from root, return ColumnarTable.
  * @param plan Query plan with nodes and base tables.
@@ -307,7 +428,22 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out,
     auto total_start = std::chrono::high_resolution_clock::now();
 
     TimingStats stats;
-    auto result = execute_impl(plan, plan.root, true, stats);
+
+    // Analyze plan and execute with deferred intermediate construction
+    auto analyze_start = std::chrono::high_resolution_clock::now();
+    AnalyzedPlan analyzed_plan = analyze_plan(plan);
+    auto analyze_end = std::chrono::high_resolution_clock::now();
+    stats.analyze_plan_ms =
+        std::chrono::duration_cast<std::chrono::milliseconds>(analyze_end -
+                                                              analyze_start)
+            .count();
+    /*
+    auto result = execute_impl(analyzed_plan, plan.root, true, stats);
+    ColumnarTable final_result = std::get<ColumnarTable>(std::move(result));
+    */
+    std::queue<std::tuple<int, int>> q;
+    q.emplace(plan.root, 0);
+    print_plan(plan, q);
 
     auto total_end = std::chrono::high_resolution_clock::now();
     auto total_elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
@@ -317,9 +453,11 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out,
     if (show_detailed_timing) {
         int64_t accounted =
             stats.hashtable_build_ms + stats.hash_join_probe_ms +
-            stats.nested_loop_join_ms + stats.materialize_ms + stats.setup_ms;
+            stats.nested_loop_join_ms + stats.materialize_ms + stats.setup_ms +
+            stats.intermediate_ms + stats.analyze_plan_ms;
         int64_t other = stats.total_execution_ms - accounted;
 
+        std::cout << "Plan Analysis Time: " << stats.analyze_plan_ms << " ms\n";
         std::cout << "Hashtable Build Time: " << stats.hashtable_build_ms
                   << " ms\n";
         std::cout << "Hash Join Probe Time: " << stats.hash_join_probe_ms
@@ -339,7 +477,7 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out,
         *stats_out = stats;
     }
 
-    return std::move(std::get<ColumnarTable>(result));
+    return ColumnarTable(); 
 }
 
 void *build_context() { return nullptr; }