From 24ad6d6cbeed8428b3a18dcdaa308acdcb0e41c5 Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Wed, 21 Jan 2026 03:44:41 +0200 Subject: [PATCH 01/13] feat: extended columns --- include/data_model/intermediate.h | 130 ++++++++++++++ include/foundation/common.h | 45 ++++- include/join_execution/hash_join.h | 5 +- include/join_execution/join_setup.h | 83 +++++++-- include/join_execution/nested_loop.h | 10 +- .../materialization/construct_intermediate.h | 163 ++++++++++++++++-- include/materialization/materialize.h | 5 +- src/execute.cpp | 18 +- 8 files changed, 415 insertions(+), 44 deletions(-) diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index e0e2667..5f693a7 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -152,6 +152,84 @@ struct column_t { /** @brief Alias for a collection of intermediate columns. */ using Columnar = std::vector; +/** + * @brief Row ID column storing encoded global row IDs. + * + * Parallel structure to column_t but stores uint32_t (encoded table_id + + * row_id). One column per base table participating in joins up to this point. + * Uses same page size and arena allocation as column_t. + * + * @see GlobalRowId for encoding scheme, ExtendedResult for usage. + */ +struct rowid_column_t { + /** @brief Page for row ID storage: fixed array of uint32_t entries. */ + struct alignas(IR_PAGE_SIZE) Page { + uint32_t data[CAP_PER_PAGE]; + }; + + std::vector pages; ///< Pointers to arena-allocated pages. + size_t num_values = 0; ///< Total row ID count across all pages. + uint8_t table_id = 0; ///< Which base table this column tracks. 
+ + rowid_column_t() = default; + + rowid_column_t(rowid_column_t &&other) noexcept + : pages(std::move(other.pages)), num_values(other.num_values), + table_id(other.table_id) { + other.pages.clear(); + other.num_values = 0; + } + + rowid_column_t &operator=(rowid_column_t &&other) noexcept { + if (this != &other) { + pages = std::move(other.pages); + num_values = other.num_values; + table_id = other.table_id; + other.pages.clear(); + other.num_values = 0; + } + return *this; + } + + rowid_column_t(const rowid_column_t &) = delete; + rowid_column_t &operator=(const rowid_column_t &) = delete; + + ~rowid_column_t() = default; + + /** @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. */ + inline uint32_t operator[](size_t idx) const { + return pages[idx >> 12]->data[idx & 0xFFF]; + } + + /** @brief Thread-safe write at idx (requires pages to be set up first). */ + inline void write_at(size_t idx, uint32_t val) { + pages[idx >> 12]->data[idx & 0xFFF] = val; + } + + /** @brief Total row ID count. */ + size_t row_count() const { return num_values; } + + /** @brief Set row count without allocation (for assembly pattern). */ + inline void set_row_count(size_t count) { num_values = count; } + + /** @brief Pre-allocate pages from arena. */ + inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, + size_t count) { + static_assert(sizeof(Page) == + Contest::platform::ChunkSize< + Contest::platform::ChunkType::IR_PAGE>::value, + "Page size mismatch with IR_PAGE chunk size"); + size_t pages_needed = (count + CAP_PER_PAGE - 1) / CAP_PER_PAGE; + pages.reserve(pages_needed); + for (size_t i = 0; i < pages_needed; ++i) { + void *ptr = + arena.alloc_chunk(); + pages.push_back(reinterpret_cast(ptr)); + } + num_values = count; + } +}; + /** * @brief Convert column_t vector to ColumnarTable. Dereferences VARCHAR refs. 
* @see materialize.h @@ -163,4 +241,56 @@ ColumnarTable to_columnar(const Columnar &table, const Plan &plan); namespace Contest { /** @brief Result type for non-root joins (intermediate format). */ using ExecuteResult = std::vector; + +/** + * @brief Extended intermediate result with row ID tracking. + * + * Wraps ExecuteResult with parallel row ID columns that track + * which original scan rows contributed to each intermediate row. + * One rowid_column_t per base table participating in the join tree. + * + * @see GlobalRowId for encoding, construct_intermediate.h for population. + */ +struct ExtendedResult { + ExecuteResult columns; ///< Data columns (value_t). + std::vector row_ids; ///< One per participating table. + std::vector table_ids; ///< Which tables are tracked (sorted). + + ExtendedResult() = default; + + ExtendedResult(ExtendedResult &&) = default; + ExtendedResult &operator=(ExtendedResult &&) = default; + + ExtendedResult(const ExtendedResult &) = delete; + ExtendedResult &operator=(const ExtendedResult &) = delete; + + /** @brief Total row count (from first data column). */ + size_t row_count() const { + return columns.empty() ? 0 : columns[0].row_count(); + } + + /** @brief Find row ID column index for a specific table, or -1 if not + * found. */ + int find_rowid_index(uint8_t tid) const { + for (size_t i = 0; i < table_ids.size(); ++i) { + if (table_ids[i] == tid) + return static_cast(i); + } + return -1; + } + + /** @brief Get row ID column for a specific table, or nullptr if not found. + */ + const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { + int idx = find_rowid_index(tid); + return (idx >= 0) ? &row_ids[idx] : nullptr; + } + + /** @brief Get mutable row ID column for a specific table, or nullptr. */ + mema::rowid_column_t *get_rowid_column_mut(uint8_t tid) { + int idx = find_rowid_index(tid); + return (idx >= 0) ? 
&row_ids[idx] : nullptr; + } +}; + } /* namespace Contest */ diff --git a/include/foundation/common.h b/include/foundation/common.h index 16c8aa7..192fe08 100644 --- a/include/foundation/common.h +++ b/include/foundation/common.h @@ -125,7 +125,8 @@ class File { } }; -/** @brief Read entire file into string. @throws std::runtime_error on failure. */ +/** @brief Read entire file into string. @throws std::runtime_error on failure. + */ inline std::string read_file(const std::filesystem::path &path) { File f(path, "rb"); ::fseek(f, 0, SEEK_END); @@ -154,7 +155,8 @@ struct DSU { void unite(size_t x, size_t y) { pa[find(x)] = find(y); } }; -/** @brief Mark unreachable code path for compiler optimization (UB if reached). */ +/** @brief Mark unreachable code path for compiler optimization (UB if reached). + */ [[noreturn]] inline void unreachable() { // Uses compiler specific extensions if possible. // Even if no extension is used, undefined behavior is still raised by @@ -164,4 +166,41 @@ struct DSU { #else // GCC, Clang __builtin_unreachable(); #endif -} \ No newline at end of file +} + +namespace Contest { + +/** + * @brief Encoded global row ID: 5-bit table_id + 27-bit row_id. + * + * Supports up to 32 tables and 134M rows per table. + * Used to track original scan table rows through recursive joins. + * + * Encoding: [table_id (5 bits)][row_id (27 bits)] + * - table_id: bits 27-31 + * - row_id: bits 0-26 + */ +struct GlobalRowId { + static constexpr uint32_t TABLE_BITS = 5; + static constexpr uint32_t ROW_BITS = 27; + static constexpr uint32_t TABLE_SHIFT = ROW_BITS; + static constexpr uint32_t ROW_MASK = (1u << ROW_BITS) - 1; + static constexpr uint32_t MAX_TABLES = 1u << TABLE_BITS; // 32 + static constexpr uint32_t MAX_ROWS = 1u << ROW_BITS; // 134,217,728 + + /** @brief Encode table_id and row_id into a single uint32_t. 
*/ + static inline uint32_t encode(uint8_t table_id, uint32_t row_id) { + return (static_cast(table_id) << TABLE_SHIFT) | + (row_id & ROW_MASK); + } + + /** @brief Extract table_id from encoded global row ID. */ + static inline uint8_t table(uint32_t encoded) { + return static_cast(encoded >> TABLE_SHIFT); + } + + /** @brief Extract row_id from encoded global row ID. */ + static inline uint32_t row(uint32_t encoded) { return encoded & ROW_MASK; } +}; + +} // namespace Contest \ No newline at end of file diff --git a/include/join_execution/hash_join.h b/include/join_execution/hash_join.h index 0e2b777..b2f1f00 100644 --- a/include/join_execution/hash_join.h +++ b/include/join_execution/hash_join.h @@ -28,6 +28,7 @@ namespace Contest::join { using Contest::ExecuteResult; +using Contest::ExtendedResult; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; @@ -56,8 +57,8 @@ inline UnchainedHashtable build_from_columnar(const JoinInput &input, */ inline UnchainedHashtable build_from_intermediate(const JoinInput &input, size_t attr_idx) { - const auto &result = std::get(input.data); - const auto &column = result[attr_idx]; + const auto &result = std::get(input.data); + const auto &column = result.columns[attr_idx]; size_t row_count = input.row_count(attr_idx); UnchainedHashtable hash_table(row_count); diff --git a/include/join_execution/join_setup.h b/include/join_execution/join_setup.h index 299dd65..188873d 100644 --- a/include/join_execution/join_setup.h +++ b/include/join_execution/join_setup.h @@ -22,16 +22,17 @@ namespace Contest::join { using Contest::ExecuteResult; +using Contest::ExtendedResult; using Contest::io::ColumnarReader; /** * @brief Unified abstraction over columnar tables and intermediate results. * - * Stores ColumnarTable* (base scans) or ExecuteResult (child joins). Node + * Stores ColumnarTable* (base scans) or ExtendedResult (child joins). Node * provides output_attrs mapping for column resolution. 
*/ struct JoinInput { - std::variant data; + std::variant data; const PlanNode *node; /**< Provides output_attrs for column mapping. */ uint8_t table_id; /**< Source table ID for provenance tracking. */ @@ -50,12 +51,35 @@ struct JoinInput { auto [actual_col_idx, _] = node->output_attrs[col_idx]; return table->num_rows; } else { - return std::get(data)[col_idx].row_count(); + return std::get(data).columns[col_idx].row_count(); } } /** @brief Number of output columns. */ size_t output_size() const { return node->output_attrs.size(); } + + /** + * @brief Get list of tables whose row IDs are tracked in this input. + * + * For columnar input: returns {table_id}. + * For intermediate: returns the tracked table_ids from ExtendedResult. + */ + std::vector tracked_tables() const { + if (is_columnar()) { + return {table_id}; + } + return std::get(data).table_ids; + } + + /** + * @brief Get row ID column for a specific table. + * @return nullptr for columnar inputs (row IDs encoded on-the-fly). + */ + const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { + if (is_columnar()) + return nullptr; + return std::get(data).get_rowid_column(tid); + } }; /** @@ -159,13 +183,13 @@ inline MatchCollectionMode determine_collection_mode( /** * @brief Creates output columns with provenance metadata from inputs. 
*/ -inline ExecuteResult initialize_output_columns( +inline ExtendedResult initialize_output_columns( const std::vector> &output_attrs, const PlanNode &left_node, const PlanNode &right_node, const JoinInput &left_input, const JoinInput &right_input, size_t estimated_rows) { - ExecuteResult results; - results.reserve(output_attrs.size()); + ExtendedResult results; + results.columns.reserve(output_attrs.size()); size_t left_size = left_input.output_size(); auto set_column_metadata = [](mema::column_t &col, const JoinInput &input, @@ -175,9 +199,9 @@ inline ExecuteResult initialize_output_columns( col.source_table = input.table_id; col.source_column = actual_col_idx; } else { - const auto &result = std::get(input.data); - col.source_table = result[col_idx].source_table; - col.source_column = result[col_idx].source_column; + const auto &result = std::get(input.data); + col.source_table = result.columns[col_idx].source_table; + col.source_column = result.columns[col_idx].source_column; } }; @@ -188,7 +212,7 @@ inline ExecuteResult initialize_output_columns( mema::column_t col; set_column_metadata(col, input, node, local_idx); - results.push_back(std::move(col)); + results.columns.push_back(std::move(col)); } return results; @@ -200,9 +224,10 @@ inline ExecuteResult initialize_output_columns( * prepared flag implements lazy PageIndex construction. */ struct JoinSetup { - ExecuteResult results; /**< Output columns being populated. */ + ExtendedResult results; /**< Output columns + row ID columns. */ ColumnarReader columnar_reader; /**< Page cursor caching for columnar access. */ + std::vector merged_table_ids; /**< Tables tracked in output. */ /** * True after prepare_output_columns called. */ @@ -211,10 +236,41 @@ struct JoinSetup { JoinSetup() : prepared(false) {} }; +/** + * @brief Merge tracked table IDs from build and probe (sorted, unique). + * + * Both input vectors must be sorted. Output is sorted and deduplicated. 
+ */ +inline std::vector +merge_tracked_tables(const std::vector &build_tables, + const std::vector &probe_tables) { + std::vector merged; + merged.reserve(build_tables.size() + probe_tables.size()); + + size_t i = 0, j = 0; + while (i < build_tables.size() && j < probe_tables.size()) { + if (build_tables[i] < probe_tables[j]) { + merged.push_back(build_tables[i++]); + } else if (probe_tables[j] < build_tables[i]) { + merged.push_back(probe_tables[j++]); + } else { + merged.push_back(build_tables[i++]); + j++; // Skip duplicate + } + } + while (i < build_tables.size()) + merged.push_back(build_tables[i++]); + while (j < probe_tables.size()) + merged.push_back(probe_tables[j++]); + + return merged; +} + /** * @brief Initializes JoinSetup with output columns; call before join execution. * * PageIndex construction deferred to prepare_output_columns(). + * Computes merged table IDs from build and probe inputs. */ inline JoinSetup setup_join(const JoinInput &build_input, const JoinInput &probe_input, @@ -229,6 +285,11 @@ setup_join(const JoinInput &build_input, const JoinInput &probe_input, initialize_output_columns(output_attrs, left_node, right_node, left_input, right_input, estimated_rows); + // Compute merged table IDs from build and probe sides + auto build_tables = build_input.tracked_tables(); + auto probe_tables = probe_input.tracked_tables(); + setup.merged_table_ids = merge_tracked_tables(build_tables, probe_tables); + setup.prepared = false; return setup; diff --git a/include/join_execution/nested_loop.h b/include/join_execution/nested_loop.h index e1086d0..7646639 100644 --- a/include/join_execution/nested_loop.h +++ b/include/join_execution/nested_loop.h @@ -28,6 +28,8 @@ */ namespace Contest::join { +using Contest::ExtendedResult; + using Contest::ExecuteResult; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; @@ -69,8 +71,8 @@ inline void visit_rows(const JoinInput &input, size_t attr_idx, } } } else { - const auto &res = 
std::get(input.data); - const mema::column_t &col = res[attr_idx]; + const auto &res = std::get(input.data); + const mema::column_t &col = res.columns[attr_idx]; size_t count = col.row_count(); for (size_t i = 0; i < count; i++) { const mema::value_t &val = col[i]; @@ -190,8 +192,8 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, } } } else { - const auto &res = std::get(probe_input.data); - const mema::column_t &col = res[probe_attr]; + const auto &res = std::get(probe_input.data); + const mema::column_t &col = res.columns[probe_attr]; size_t count = col.row_count(); size_t start = (t_id * count) / THREAD_COUNT; size_t end = ((t_id + 1) * count) / THREAD_COUNT; diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index 45a4386..090863f 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -24,6 +24,8 @@ namespace Contest::materialize { using Contest::ExecuteResult; +using Contest::ExtendedResult; +using Contest::GlobalRowId; using Contest::io::ColumnarReader; using Contest::join::JoinInput; using Contest::join::MatchCollectionMode; @@ -52,8 +54,8 @@ struct alignas(8) SourceInfo { * @brief Builds SourceInfo for each output column for fast hot-loop lookup. * * @param remapped_attrs Output column specifications (global indexing). - * @param build_input Build side data (ColumnarTable* or ExecuteResult). - * @param probe_input Probe side data (ColumnarTable* or ExecuteResult). + * @param build_input Build side data (ColumnarTable* or ExtendedResult). + * @param probe_input Probe side data (ColumnarTable* or ExtendedResult). * @param build_node PlanNode for build side (contains output_attrs). * @param probe_node PlanNode for probe side (contains output_attrs). * @param build_size Number of columns from build side. 
@@ -83,31 +85,98 @@ prepare_sources(const std::vector> &remapped_attrs, info.columnar_col = &table->columns[actual_idx]; } else { info.is_columnar = false; - const auto &res = std::get(input.data); - info.intermediate_col = &res[local_idx]; + const auto &res = std::get(input.data); + info.intermediate_col = &res.columns[local_idx]; } sources.push_back(info); } return sources; } +/** + * @brief Precomputed metadata for resolving a row ID column's source. + * + * Determines how to populate each output row ID column: + * - For columnar input: encode GlobalRowId on-the-fly from local index + * - For intermediate input: copy from existing rowid_column_t + * + * @see prepare_rowid_sources() for precomputation logic. + */ +struct alignas(8) RowIdSource { + const mema::rowid_column_t *source_col = + nullptr; /**< Source if from intermediate (else encode). */ + uint8_t table_id = 0; /**< Table ID for encoding/lookup. */ + bool from_build = false; /**< True if from build side, false if probe. */ + bool needs_encode = + false; /**< True if columnar (needs GlobalRowId encode). */ +}; + +/** + * @brief Builds RowIdSource for each output row ID column. + * + * @param merged_table_ids Sorted, unique table IDs to track in output. + * @param build_input Build side data (ColumnarTable* or ExtendedResult). + * @param probe_input Probe side data (ColumnarTable* or ExtendedResult). + * @return Vector of RowIdSource, one per tracked table. 
+ */ +inline std::vector +prepare_rowid_sources(const std::vector &merged_table_ids, + const JoinInput &build_input, + const JoinInput &probe_input) { + std::vector sources; + sources.reserve(merged_table_ids.size()); + + for (uint8_t tid : merged_table_ids) { + RowIdSource src; + src.table_id = tid; + + // Check build side first + auto build_tables = build_input.tracked_tables(); + bool in_build = std::find(build_tables.begin(), build_tables.end(), + tid) != build_tables.end(); + if (in_build) { + src.from_build = true; + if (build_input.is_columnar()) { + src.needs_encode = true; + src.source_col = nullptr; + } else { + src.needs_encode = false; + src.source_col = build_input.get_rowid_column(tid); + } + } else { + // Must be from probe side + src.from_build = false; + if (probe_input.is_columnar()) { + src.needs_encode = true; + src.source_col = nullptr; + } else { + src.needs_encode = false; + src.source_col = probe_input.get_rowid_column(tid); + } + } + sources.push_back(src); + } + return sources; +} + /** * @brief Constructs intermediate results directly from thread-local buffers. * * Each thread iterates its own buffer, avoiding the merge step. Total matches * computed by summing buffer counts. Each thread writes its contiguous portion - * of output pages. + * of output pages. Also populates row ID columns for provenance tracking. * * @tparam Mode Collection mode for compile-time specialization. * @param buffers Vector of ThreadLocalMatchBuffer from probe. - * @param build_input Build side data (ColumnarTable* or ExecuteResult). - * @param probe_input Probe side data (ColumnarTable* or ExecuteResult). + * @param build_input Build side data (ColumnarTable* or ExtendedResult). + * @param probe_input Probe side data (ColumnarTable* or ExtendedResult). * @param remapped_attrs Output column specifications (global indexing). * @param build_node PlanNode for build side output_attrs mapping. * @param probe_node PlanNode for probe side output_attrs mapping. 
* @param build_size Number of output columns from build side. * @param columnar_reader ColumnarReader with Cursor caching for page access. - * @param results Pre-initialized ExecuteResult, populated in-place. + * @param results Pre-initialized ExtendedResult, populated in-place. + * @param merged_table_ids Sorted, unique table IDs to track in output. */ template inline void construct_intermediate_from_buffers( @@ -115,7 +184,8 @@ inline void construct_intermediate_from_buffers( const JoinInput &build_input, const JoinInput &probe_input, const std::vector> &remapped_attrs, const PlanNode &build_node, const PlanNode &probe_node, size_t build_size, - ColumnarReader &columnar_reader, ExecuteResult &results) { + ColumnarReader &columnar_reader, ExtendedResult &results, + const std::vector &merged_table_ids) { // Compute total matches and per-buffer start offsets size_t total_matches = 0; @@ -130,25 +200,38 @@ inline void construct_intermediate_from_buffers( auto sources = prepare_sources(remapped_attrs, build_input, probe_input, build_node, probe_node, build_size); + auto rowid_sources = + prepare_rowid_sources(merged_table_ids, build_input, probe_input); const size_t num_threads = THREAD_COUNT; const size_t num_cols = sources.size(); + const size_t num_rowid_cols = rowid_sources.size(); - // Pre-size page vectors for each column + // Pre-size page vectors for each data column using Page = mema::column_t::Page; + using RowIdPage = mema::rowid_column_t::Page; size_t total_pages_needed = (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; for (size_t c = 0; c < num_cols; ++c) { - auto &col = results[c]; + auto &col = results.columns[c]; col.pages.resize(total_pages_needed); col.set_row_count(total_matches); } + // Setup row ID columns in results + results.table_ids = merged_table_ids; + results.row_ids.resize(num_rowid_cols); + for (size_t r = 0; r < num_rowid_cols; ++r) { + results.row_ids[r].table_id = merged_table_ids[r]; + 
results.row_ids[r].pages.resize(total_pages_needed); + results.row_ids[r].set_row_count(total_matches); + } + // Parallel page allocation - each thread allocates its own pages worker_pool().execute([&](size_t t) { for (size_t c = 0; c < num_cols; ++c) { - auto &col = results[c]; + auto &col = results.columns[c]; for (size_t p = t; p < total_pages_needed; p += num_threads) { void *ptr = Contest::platform::get_arena(t) @@ -156,6 +239,16 @@ inline void construct_intermediate_from_buffers( col.pages[p] = reinterpret_cast(ptr); } } + // Allocate row ID pages + for (size_t r = 0; r < num_rowid_cols; ++r) { + auto &rid_col = results.row_ids[r]; + for (size_t p = t; p < total_pages_needed; p += num_threads) { + void *ptr = + Contest::platform::get_arena(t) + .alloc_chunk(); + rid_col.pages[p] = reinterpret_cast(ptr); + } + } }); // Parallel: each thread processes its own buffer @@ -170,9 +263,10 @@ inline void construct_intermediate_from_buffers( size_t start = buffer_starts[t]; Contest::ColumnarReader::Cursor cursor; + // Process data columns for (size_t c = 0; c < num_cols; ++c) { const auto &src = sources[c]; - auto &dest_col = results[c]; + auto &dest_col = results.columns[c]; auto left_range = buf.left_range(); auto right_range = buf.right_range(); @@ -211,6 +305,49 @@ inline void construct_intermediate_from_buffers( } } } + + // Process row ID columns + for (size_t r = 0; r < num_rowid_cols; ++r) { + const auto &rid_src = rowid_sources[r]; + auto &dest_rid_col = results.row_ids[r]; + + auto left_range = buf.left_range(); + auto right_range = buf.right_range(); + + if (rid_src.from_build) { + size_t k = start; + if (rid_src.needs_encode) { + // Columnar build: encode GlobalRowId on-the-fly + for (uint32_t local_idx : left_range) { + dest_rid_col.write_at( + k++, + GlobalRowId::encode(rid_src.table_id, local_idx)); + } + } else { + // Intermediate build: copy from source row ID column + const auto &src_col = *rid_src.source_col; + for (uint32_t local_idx : left_range) 
{ + dest_rid_col.write_at(k++, src_col[local_idx]); + } + } + } else { + size_t k = start; + if (rid_src.needs_encode) { + // Columnar probe: encode GlobalRowId on-the-fly + for (uint32_t local_idx : right_range) { + dest_rid_col.write_at( + k++, + GlobalRowId::encode(rid_src.table_id, local_idx)); + } + } else { + // Intermediate probe: copy from source row ID column + const auto &src_col = *rid_src.source_col; + for (uint32_t local_idx : right_range) { + dest_rid_col.write_at(k++, src_col[local_idx]); + } + } + } + } }); } diff --git a/include/materialization/materialize.h b/include/materialization/materialize.h index e154e93..6d4a3be 100644 --- a/include/materialization/materialize.h +++ b/include/materialization/materialize.h @@ -25,6 +25,7 @@ namespace Contest::materialize { using Contest::ExecuteResult; +using Contest::ExtendedResult; using Contest::io::ColumnarReader; using Contest::join::JoinInput; using Contest::join::MatchCollectionMode; @@ -191,8 +192,8 @@ inline void materialize_single_column_from_buffers( auto [actual_idx, _] = node.output_attrs[local_idx]; col_source = &table->columns[actual_idx]; } else { - const auto &res = std::get(input.data); - inter_source = &res[local_idx]; + const auto &res = std::get(input.data); + inter_source = &res.columns[local_idx]; } auto reader = [&](uint32_t rid, ColumnarReader::Cursor &cursor, diff --git a/src/execute.cpp b/src/execute.cpp index c5a3eed..a9589ad 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -49,10 +49,10 @@ using materialize::create_empty_result; using materialize::materialize_from_buffers; /** - * @brief Result variant: ExecuteResult (intermediate, value_t columns) or + * @brief Result variant: ExtendedResult (intermediate, with row ID tracking) or * ColumnarTable (final output per contest API). */ -using JoinResult = std::variant; +using JoinResult = std::variant; /** * @brief Recursive join execution with timing. 
@@ -69,7 +69,7 @@ JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, * @brief Resolve plan node to JoinInput. * * ScanNode -> non-owning ColumnarTable*; JoinNode -> recursive execution - * returning owned ExecuteResult. Implements depth-first traversal. + * returning owned ExtendedResult. Implements depth-first traversal. * * @param plan Query plan. * @param node_idx Node index to resolve. @@ -87,7 +87,7 @@ JoinInput resolve_join_input(const Plan &plan, size_t node_idx, input.table_id = scan->base_table_id; } else { auto result = execute_impl(plan, node_idx, false, stats); - input.data = std::get(std::move(result)); + input.data = std::get(std::move(result)); input.table_id = 0; } return input; @@ -128,9 +128,9 @@ JoinResult execute_join_with_mode( config.probe_attr); } else { const auto &probe_result = - std::get(probe_input.data); + std::get(probe_input.data); match_buffers = probe_intermediate( - *hash_table, probe_result[config.probe_attr]); + *hash_table, probe_result.columns[config.probe_attr]); } auto probe_end = std::chrono::high_resolution_clock::now(); stats.hash_join_probe_ms += @@ -175,7 +175,7 @@ JoinResult execute_join_with_mode( construct_intermediate_from_buffers( match_buffers, build_input, probe_input, config.remapped_attrs, build_node, probe_node, build_input.output_size(), - columnar_reader, setup.results); + columnar_reader, setup.results, setup.merged_table_ids); } auto inter_end = std::chrono::high_resolution_clock::now(); stats.intermediate_ms += @@ -203,7 +203,7 @@ JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, auto &node = plan.nodes[node_idx]; if (!std::holds_alternative(node.data)) { - return ExecuteResult{}; + return ExtendedResult{}; } const auto &join = std::get(node.data); @@ -288,7 +288,7 @@ JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, } // Should never reach here, but satisfy compiler - return ExecuteResult{}; + return ExtendedResult{}; } /** From 
d1a6bfcd14de66aa3d29213f56f8b3a40bb9b83e Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Wed, 21 Jan 2026 23:51:01 +0200 Subject: [PATCH 02/13] feat: initial deferred materialization --- .gitignore | 3 + CMakeLists.txt | 7 + include/data_access/columnar_reader.h | 124 ++++- include/data_model/deferred_intermediate.h | 169 +++++++ include/data_model/deferred_plan.h | 142 ++++++ include/data_model/plan.h | 21 +- include/materialization/construct_deferred.h | 446 ++++++++++++++++++ .../materialization/materialize_deferred.h | 439 +++++++++++++++++ src/analyze_plan.cpp | 311 ++++++++++++ src/execute.cpp | 386 ++++++++++++++- 10 files changed, 2029 insertions(+), 19 deletions(-) create mode 100644 include/data_model/deferred_intermediate.h create mode 100644 include/data_model/deferred_plan.h create mode 100644 include/materialization/construct_deferred.h create mode 100644 include/materialization/materialize_deferred.h create mode 100644 src/analyze_plan.cpp diff --git a/.gitignore b/.gitignore index 12c22e9..f1c8719 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,6 @@ compile_commands.json /env/ script.py *.md +/build_deferred +/build_debug +/build_eager diff --git a/CMakeLists.txt b/CMakeLists.txt index dc0739a..2621d56 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,13 @@ FetchContent_MakeAvailable(fmtlib) set(ENABLE_SANITIZER OFF) set(ENABLE_UBSAN OFF) + +# Deferred materialization: only materialize join keys, defer other columns +option(USE_DEFERRED_MATERIALIZATION "Enable deferred column materialization" OFF) +if(USE_DEFERRED_MATERIALIZATION) + message(STATUS "Deferred materialization ENABLED") + add_compile_definitions(USE_DEFERRED_MATERIALIZATION) +endif() if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|powerpc|ppc64|ppc64le") message("Disabling jemalloc extension of DuckDB on Power.") set(SKIP_EXTENSIONS jemalloc) diff --git a/include/data_access/columnar_reader.h b/include/data_access/columnar_reader.h index 2074498..e143c95 100644 --- 
a/include/data_access/columnar_reader.h +++ b/include/data_access/columnar_reader.h @@ -275,10 +275,15 @@ class ColumnarReader { /* Dense INT32 fast path: O(1) arithmetic lookup, bypasses cursor */ if (data_type == DataType::INT32) { - const PageIndex &page_index = IsBuild ? build_page_indices[col_idx] + size_t pidx_size = + IsBuild ? build_page_indices.size() : probe_page_indices.size(); + if (SPC_LIKELY(col_idx < pidx_size)) { + const PageIndex &page_index = IsBuild + ? build_page_indices[col_idx] : probe_page_indices[col_idx]; - if (SPC_LIKELY(page_index.is_dense_int32)) { - return mema::value_t{read_dense_int32(page_index, row_id)}; + if (SPC_LIKELY(page_index.is_dense_int32)) { + return mema::value_t{read_dense_int32(page_index, row_id)}; + } } } @@ -291,10 +296,10 @@ class ColumnarReader { global_probe_version.load(std::memory_order_relaxed); } - if (SPC_LIKELY(cursor.version == current_version && - col_idx == cursor.cached_col && - row_id >= cursor.cached_start && - row_id < cursor.cached_end)) { + bool cache_hit = + cursor.version == current_version && col_idx == cursor.cached_col && + row_id >= cursor.cached_start && row_id < cursor.cached_end; + if (SPC_LIKELY(cache_hit)) { uint32_t local_row = row_id - cursor.cached_start; if (SPC_LIKELY(cursor.is_dense)) { if (data_type == DataType::INT32) { @@ -313,9 +318,11 @@ class ColumnarReader { } /* sequential access optimization: skip binary search for next page */ + size_t pidx_count = + IsBuild ? build_page_indices.size() : probe_page_indices.size(); if (SPC_LIKELY(cursor.version == current_version && col_idx == cursor.cached_col && - row_id == cursor.cached_end)) { + row_id == cursor.cached_end && col_idx < pidx_count)) { const PageIndex &page_index = IsBuild ? build_page_indices[col_idx] : probe_page_indices[col_idx]; size_t next_page = cursor.cached_page + 1; @@ -384,6 +391,12 @@ class ColumnarReader { Cursor &cursor, uint64_t current_version) const { + size_t pidx_size = + IsBuild ? 
build_page_indices.size() : probe_page_indices.size(); + if (SPC_UNLIKELY(col_idx >= pidx_size)) { + // No page index prepared - use direct page read + return read_value_direct(column, row_id, data_type); + } const PageIndex &page_index = IsBuild ? build_page_indices[col_idx] : probe_page_indices[col_idx]; size_t page_num = page_index.find_page(row_id); @@ -407,6 +420,18 @@ class ColumnarReader { } } + /** + * @brief Direct value read bypassing page index cache. + * + * Used for deferred column resolution when reading from base tables + * that don't have prepared page indices. O(n) page scan per read. + */ + inline mema::value_t read_value_direct_public(const Column &column, + uint32_t row_id, + DataType data_type) const { + return read_value_direct(column, row_id, data_type); + } + inline const PageIndex &get_build_page_index(size_t col_idx) const { return build_page_indices[col_idx]; } @@ -428,6 +453,89 @@ class ColumnarReader { return reinterpret_cast(page_data + 4)[local_row]; } + /** + * @brief Direct value read without prepared page index. + * + * Used when page indices aren't available (e.g., reading base tables + * during deferred resolution). O(n) page scan - slower than cached path. 
+ */ + inline mema::value_t read_value_direct(const Column &column, + uint32_t row_id, + DataType data_type) const { + // Linear scan to find page containing row_id + uint32_t cumulative = 0; + for (size_t page_num = 0; page_num < column.pages.size(); ++page_num) { + auto *page_data = column.pages[page_num]->data; + auto num_rows = *reinterpret_cast(page_data); + auto num_values = + *reinterpret_cast(page_data + 2); + + // Handle special pages + if (num_rows == 0xffff) { + // Long string page - single row + if (row_id == cumulative) { + return mema::value_t::encode_string( + static_cast(page_num), + mema::value_t::LONG_STRING_OFFSET); + } + cumulative += 1; + continue; + } + if (num_rows == 0xfffe) { + // Skip special marker pages + continue; + } + + if (row_id < cumulative + num_rows) { + // Found the page + uint32_t local_row = row_id - cumulative; + bool is_dense = (num_rows == num_values); + const auto *data_ptr = + reinterpret_cast(page_data + 4); + + if (is_dense) { + if (data_type == DataType::INT32) { + return mema::value_t{data_ptr[local_row]}; + } else { + return mema::value_t::encode_string( + static_cast(page_num), + static_cast(local_row)); + } + } else { + // Sparse page - check bitmap + size_t bitmap_size = (num_rows + 7) / 8; + const auto *bitmap_ptr = reinterpret_cast( + page_data + PAGE_SIZE - bitmap_size); + + bool is_valid = + bitmap_ptr[local_row >> 3] & (1u << (local_row & 7)); + if (!is_valid) { + return mema::value_t{mema::value_t::NULL_VALUE}; + } + + // Compute data index via popcount + uint32_t data_idx = 0; + for (uint32_t i = 0; i < local_row; ++i) { + if (bitmap_ptr[i >> 3] & (1u << (i & 7))) { + data_idx++; + } + } + + if (data_type == DataType::INT32) { + return mema::value_t{data_ptr[data_idx]}; + } else { + return mema::value_t::encode_string( + static_cast(page_num), + static_cast(data_idx)); + } + } + } + cumulative += num_rows; + } + // Row not found - return NULL + return mema::value_t{mema::value_t::NULL_VALUE}; + } + /** 
@brief Reads from sparse pages using bitmap and popcount. */ inline mema::value_t read_sparse(uint32_t local_row, DataType data_type, const Cursor &cursor) const { diff --git a/include/data_model/deferred_intermediate.h b/include/data_model/deferred_intermediate.h new file mode 100644 index 0000000..0c16a13 --- /dev/null +++ b/include/data_model/deferred_intermediate.h @@ -0,0 +1,169 @@ +/** + * @file deferred_intermediate.h + * @brief Lightweight intermediate result for deferred materialization. + * + * DeferredResult stores only materialized columns (join keys) plus row ID + * provenance columns. Deferred columns are resolved at final materialization + * by following row IDs back to base tables. + * + * @see deferred_plan.h for DeferredJoinNode with column decisions. + * @see construct_deferred.h for building DeferredResult. + * @see materialize_deferred.h for final resolution. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace Contest { + +/** + * @brief Lightweight intermediate result with only join keys materialized. + * + * Unlike ExtendedResult which stores all projected columns, DeferredResult + * stores only columns marked MATERIALIZE (typically just the parent's join + * key). All other columns are resolved at final materialization using row ID + * provenance. + * + * Memory savings: For a join projecting N columns where only 1 is a join key, + * DeferredResult uses ~1/N the memory of ExtendedResult for data columns. + * + * @see DeferredColumnInfo for materialization decisions. + * @see DeferredJoinNode for column provenance tracking. + */ +struct DeferredResult { + /// Only columns marked MATERIALIZE (typically 1 join key). + std::vector materialized; + + /// Map: original column index → index in materialized (nullopt if + /// deferred). + std::vector> materialized_map; + + /// Row ID tracking for provenance (same as ExtendedResult). + std::vector row_ids; + + /// Which base tables are tracked (sorted). 
+ std::vector table_ids; + + /// Reference to node info for column provenance resolution. + const DeferredJoinNode *node_info = nullptr; + + /// Total row count. + size_t num_rows = 0; + + DeferredResult() = default; + DeferredResult(DeferredResult &&) = default; + DeferredResult &operator=(DeferredResult &&) = default; + DeferredResult(const DeferredResult &) = delete; + DeferredResult &operator=(const DeferredResult &) = delete; + + /** @brief Total row count. */ + size_t row_count() const { return num_rows; } + + /** @brief Check if column was materialized (not deferred). */ + bool is_materialized(size_t orig_idx) const { + return orig_idx < materialized_map.size() && + materialized_map[orig_idx].has_value(); + } + + /** @brief Get materialized column, or nullptr if deferred. */ + const mema::column_t *get_materialized(size_t orig_idx) const { + if (!is_materialized(orig_idx)) + return nullptr; + return &materialized[*materialized_map[orig_idx]]; + } + + /** @brief Find row ID column index for a table, or -1 if not found. */ + int find_rowid_index(uint8_t tid) const { + for (size_t i = 0; i < table_ids.size(); ++i) { + if (table_ids[i] == tid) + return static_cast(i); + } + return -1; + } + + /** @brief Get row ID column for a table, or nullptr if not found. */ + const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { + int idx = find_rowid_index(tid); + return (idx >= 0) ? &row_ids[idx] : nullptr; + } + + /** @brief Get mutable row ID column for a table, or nullptr. */ + mema::rowid_column_t *get_rowid_column_mut(uint8_t tid) { + int idx = find_rowid_index(tid); + return (idx >= 0) ? &row_ids[idx] : nullptr; + } +}; + +/** + * @brief Input abstraction for deferred execution path. + * + * Similar to JoinInput but works with DeferredResult instead of ExtendedResult. + * Provides uniform interface for columnar (base table) and deferred + * intermediate data sources. + */ +struct DeferredInput { + /// Either base table pointer or owned DeferredResult. 
+ std::variant data; + + /// Original plan node for output_attrs mapping. + const PlanNode *node = nullptr; + + /// Deferred plan node for materialization decisions. + const DeferredNode *deferred_node = nullptr; + + /// Base table ID (for columnar inputs). + uint8_t table_id = 0; + + /** @brief True if data is columnar (base table). */ + bool is_columnar() const { + return std::holds_alternative(data); + } + + /** @brief Row count for join key column. */ + size_t row_count(size_t col_idx) const { + if (is_columnar()) { + const auto *table = std::get(data); + return table->num_rows; + } + return std::get(data).row_count(); + } + + /** @brief Total row count. */ + size_t row_count() const { + if (is_columnar()) { + const auto *table = std::get(data); + return table->num_rows; + } + return std::get(data).row_count(); + } + + /** @brief Number of output columns. */ + size_t output_size() const { + if (node) + return node->output_attrs.size(); + return 0; + } + + /** @brief Get list of tracked table IDs. */ + std::vector tracked_tables() const { + if (is_columnar()) { + return {table_id}; + } + return std::get(data).table_ids; + } + + /** @brief Get row ID column for a table. */ + const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { + if (is_columnar()) + return nullptr; + return std::get(data).get_rowid_column(tid); + } +}; + +} // namespace Contest diff --git a/include/data_model/deferred_plan.h b/include/data_model/deferred_plan.h new file mode 100644 index 0000000..46daa23 --- /dev/null +++ b/include/data_model/deferred_plan.h @@ -0,0 +1,142 @@ +/** + * @file deferred_plan.h + * @brief Analyzed plan with materialization decisions for deferred execution. + * + * DeferredPlan mirrors the original Plan structure but includes pre-computed + * decisions about which columns to materialize eagerly (join keys) vs defer + * until final output. Each DeferredJoinNode tracks column provenance back to + * base tables for efficient deferred resolution. 
+ * + * @see analyze_plan.cpp for the analysis algorithm. + * @see deferred_intermediate.h for the runtime result format. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace Contest { + +/** + * @brief Materialization decision for an output column. + * + * MATERIALIZE: Column is needed as a join key by parent - materialize eagerly. + * DEFER: Column only needed at final output - defer until root materialization. + */ +enum class ColumnResolution : uint8_t { MATERIALIZE, DEFER }; + +/** + * @brief Tracks the base table origin of a column for deferred resolution. + * + * Used to resolve deferred columns at final materialization by looking up + * the original value in the base table using row ID provenance. + */ +struct ColumnProvenance { + uint8_t base_table_id; ///< Index into Plan::inputs. + uint8_t base_column_idx; ///< Column index within the base table. +}; + +/** + * @brief Complete metadata for an output column in a deferred join. + * + * Combines materialization decision, provenance tracking, and child source + * information for efficient intermediate construction and final resolution. + */ +struct DeferredColumnInfo { + size_t original_idx; ///< Index in node's output_attrs. + DataType type; ///< INT32 or VARCHAR. + + ColumnResolution resolution; ///< MATERIALIZE or DEFER. + ColumnProvenance provenance; ///< Base table source for deferred resolution. + + bool from_left; ///< True if from left child, false if right. + size_t child_output_idx; ///< Index in child's output_attrs. +}; + +/** + * @brief Analyzed scan node for deferred execution. + * + * Wraps a ScanNode with output attribute information. + */ +struct DeferredScanNode { + size_t node_idx; ///< Index in original Plan::nodes. + uint8_t base_table_id; ///< Index into Plan::inputs. + std::vector> output_attrs; ///< Projected cols. +}; + +/** + * @brief Analyzed join node with pre-computed materialization decisions. 
+ * + * Contains all information needed for deferred execution: + * - Which columns to materialize eagerly (join keys for parent) + * - Column provenance for deferred resolution + * - Pre-computed match collection mode + * - Table IDs tracked through this node + */ +struct DeferredJoinNode { + size_t node_idx; ///< Index in original Plan::nodes. + + size_t left_child_idx; ///< Left child index in Plan::nodes. + size_t right_child_idx; ///< Right child index in Plan::nodes. + size_t left_join_attr; ///< Join key index in left child's output. + size_t right_join_attr; ///< Join key index in right child's output. + + /// Original output attributes (global indexing). + std::vector> output_attrs; + + /// Per-column materialization decisions and provenance. + std::vector columns; + + /// Pre-computed collection mode (assumes build=left; flip if build=right). + join::MatchCollectionMode base_collection_mode; + + /// Sorted table IDs tracked through this node (union of children). + std::vector tracked_table_ids; + + /// Column index that parent needs as join key (nullopt if root). + std::optional parent_join_key_idx; + + /// True if this is the root node. + bool is_root; +}; + +/** + * @brief Plan node variant for deferred execution. + */ +using DeferredNode = std::variant; + +/** + * @brief Analyzed plan with materialization decisions. + * + * Mirrors Plan structure but includes pre-computed decisions for deferred + * materialization. The original_plan pointer provides access to base tables + * for value resolution. + */ +struct DeferredPlan { + std::vector nodes; ///< Analyzed nodes (same indices as Plan). + size_t root; ///< Root node index. + const Plan *original_plan; ///< Non-owning reference to original plan. + + const DeferredNode &operator[](size_t idx) const { return nodes[idx]; } +}; + +/** + * @brief Analyze plan and compute materialization decisions. + * + * Walks the plan tree in post-order, determining for each join node: + * 1. 
Which column the parent needs as join key (MATERIALIZE) + * 2. All other columns (DEFER) + * 3. Provenance for each column back to base table + * 4. Pre-computed collection mode based on output columns + * + * @param plan Original query plan. + * @return DeferredPlan with materialization decisions. + */ +DeferredPlan analyze_plan(const Plan &plan); + +} // namespace Contest diff --git a/include/data_model/plan.h b/include/data_model/plan.h index 99c623e..897a8e2 100644 --- a/include/data_model/plan.h +++ b/include/data_model/plan.h @@ -33,7 +33,8 @@ #endif /** - * @brief RAII mmap wrapper with refcount. munmap on last ref release. Move-only. + * @brief RAII mmap wrapper with refcount. munmap on last ref release. + * Move-only. */ class MappedMemory { public: @@ -127,8 +128,8 @@ constexpr size_t PAGE_SIZE = 8192; * @brief 8-byte aligned page (8KB) for columnar data. * * INT32: [num_rows:u16][num_values:u16][values...][bitmap at end] - * VARCHAR: [num_rows:u16][num_offsets:u16][offsets:u16...][string bytes][bitmap] - * Long string markers: 0xFFFF (first), 0xFFFE (continuation). + * VARCHAR: [num_rows:u16][num_offsets:u16][offsets:u16...][string + * bytes][bitmap] Long string markers: 0xFFFF (first), 0xFFFE (continuation). * Dense page (no NULLs): num_rows == num_values → fast path. */ struct alignas(8) Page { @@ -219,7 +220,8 @@ struct Plan { size_t root; /**< Index of root node in nodes. */ /** - * @brief Create JoinNode. @return node index. Execution may override build_left. + * @brief Create JoinNode. @return node index. Execution may override + * build_left. */ size_t new_join_node(bool build_left, size_t left, size_t right, size_t left_attr, @@ -282,7 +284,8 @@ template struct ColumnInserter { bitmap.resize(PAGE_SIZE); } - /** @brief Get current page, allocating if needed. Does not advance index. */ + /** @brief Get current page, allocating if needed. Does not advance index. 
+ */ std::byte *get_page() { if (last_page_idx == column.pages.size()) [[unlikely]] { column.new_page(); @@ -369,7 +372,8 @@ template <> struct ColumnInserter { bitmap.resize(PAGE_SIZE); } - /** @brief Get current page, allocating if needed. Does not advance index. */ + /** @brief Get current page, allocating if needed. Does not advance index. + */ std::byte *get_page() { if (last_page_idx == column.pages.size()) [[unlikely]] { column.new_page(); @@ -378,7 +382,8 @@ template <> struct ColumnInserter { return page->data; } - /** @brief Write long string (>PAGE_SIZE-7) across pages. 0xFFFF/0xFFFE markers. */ + /** @brief Write long string (>PAGE_SIZE-7) across pages. 0xFFFF/0xFFFE + * markers. */ void save_long_string(std::string_view value) { size_t offset = 0; auto first_page = true; @@ -484,6 +489,8 @@ struct TimingStats { int64_t setup_ms = 0; /**< JoinSetup + build/probe selection. */ int64_t total_execution_ms = 0; /**< Wall-clock total for execute(). */ int64_t intermediate_ms = 0; /**< construct_intermediate for non-root. */ + int64_t analyze_plan_ms = 0; /**< Deferred: plan analysis time. */ + int64_t deferred_resolve_ms = 0; /**< Deferred: column resolution time. */ }; /** @brief Allocate execution context (worker pool, shared state). */ diff --git a/include/materialization/construct_deferred.h b/include/materialization/construct_deferred.h new file mode 100644 index 0000000..5ba8b3e --- /dev/null +++ b/include/materialization/construct_deferred.h @@ -0,0 +1,446 @@ +/** + * @file construct_deferred.h + * @brief Constructs deferred intermediate results for multi-way joins. + * + * Allocates and populates DeferredResult with only MATERIALIZE columns + * (typically just the parent's join key). Row ID columns are always + * populated for provenance tracking. + * + * @see construct_intermediate.h for the eager materialization equivalent. + * @see materialize_deferred.h for final resolution of deferred columns. 
+ */ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace Contest { +namespace materialize { + +using Contest::io::ColumnarReader; +using Contest::join::MatchCollectionMode; +using Contest::join::ThreadLocalMatchBuffer; +using Contest::platform::THREAD_COUNT; +using Contest::platform::worker_pool; + +/** + * @brief Collect columns needed from a DeferredInput for page index building. + */ +inline platform::ArenaVector +collect_deferred_columns(const DeferredInput &input, + const platform::ArenaVector &needed, + platform::ThreadArena &arena) { + platform::ArenaVector columns(arena); + if (!input.node) + return columns; + + columns.resize(input.node->output_attrs.size()); + std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); + + if (!input.is_columnar()) + return columns; + + auto *table = std::get(input.data); + for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { + if (i < needed.size() && needed[i]) { + auto [actual_col_idx, _] = input.node->output_attrs[i]; + columns[i] = &table->columns[actual_col_idx]; + } + } + return columns; +} + +/** + * @brief Prepare ColumnarReader for deferred materialization path. + * + * Sets up page indices for columns that need to be read from columnar inputs. 
+ */ +inline void prepare_deferred_columns( + ColumnarReader &reader, const DeferredInput &build_input, + const DeferredInput &probe_input, const DeferredJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_size, bool build_is_left) { + + bool build_is_columnar = build_input.is_columnar(); + bool probe_is_columnar = probe_input.is_columnar(); + + if (!build_is_columnar && !probe_is_columnar) + return; + + auto &arena = Contest::platform::get_arena(0); + + // Determine which columns from each side are needed + platform::ArenaVector build_needed(arena); + if (build_input.node) { + build_needed.resize(build_input.node->output_attrs.size()); + std::memset(build_needed.data(), 0, build_needed.size()); + } + + platform::ArenaVector probe_needed(arena); + if (probe_input.node) { + probe_needed.resize(probe_input.node->output_attrs.size()); + std::memset(probe_needed.data(), 0, probe_needed.size()); + } + + // Mark columns needed based on materialization decisions + // from_left refers to original left child + // build_is_left tells us if build side is the left child + for (const auto &col : join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && + col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; + } + } + } + + if (build_is_columnar) { + reader.prepare_build( + collect_deferred_columns(build_input, build_needed, arena)); + } + + if (probe_is_columnar) { + reader.prepare_probe( + collect_deferred_columns(probe_input, probe_needed, arena)); + } +} + +/** + * @brief Create empty deferred result with proper schema. + * + * Used when total_matches == 0. Creates empty materialized columns + * for columns marked MATERIALIZE so they can be used in subsequent joins. 
+ */ +inline DeferredResult +create_empty_deferred_result(const DeferredJoinNode &node) { + DeferredResult result; + result.node_info = &node; + result.num_rows = 0; + result.materialized_map.resize(node.columns.size(), std::nullopt); + result.table_ids = node.tracked_table_ids; + + // Count and allocate empty materialized columns + size_t mat_count = 0; + for (const auto &col : node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + result.materialized_map[col.original_idx] = mat_count++; + } + } + result.materialized.resize(mat_count); + // Each column has 0 rows, which is valid for empty result + + // Also create empty row ID columns + result.row_ids.resize(node.tracked_table_ids.size()); + for (size_t i = 0; i < node.tracked_table_ids.size(); ++i) { + result.row_ids[i].table_id = node.tracked_table_ids[i]; + } + + return result; +} + +/** + * @brief Precomputed metadata for row ID column sources. + * + * Mirrors RowIdSource from construct_intermediate.h but adapted for + * DeferredInput. + */ +struct DeferredRowIdSource { + const mema::rowid_column_t *source_col = + nullptr; ///< Source if from intermediate. + uint8_t table_id = 0; ///< Table ID for encoding. + bool from_build = false; ///< True if from build side. + bool needs_encode = false; ///< True if columnar (needs GlobalRowId encode). +}; + +/** + * @brief Prepare row ID sources for deferred intermediate construction. 
+ */ +inline std::vector +prepare_deferred_rowid_sources(const std::vector &merged_table_ids, + const DeferredInput &build_input, + const DeferredInput &probe_input) { + std::vector sources; + sources.reserve(merged_table_ids.size()); + + for (uint8_t tid : merged_table_ids) { + DeferredRowIdSource src; + src.table_id = tid; + + // Check build side first + auto build_tables = build_input.tracked_tables(); + bool in_build = std::find(build_tables.begin(), build_tables.end(), + tid) != build_tables.end(); + if (in_build) { + src.from_build = true; + if (build_input.is_columnar()) { + src.needs_encode = true; + src.source_col = nullptr; + } else { + src.needs_encode = false; + src.source_col = build_input.get_rowid_column(tid); + } + } else { + // Must be from probe side + src.from_build = false; + if (probe_input.is_columnar()) { + src.needs_encode = true; + src.source_col = nullptr; + } else { + src.needs_encode = false; + src.source_col = probe_input.get_rowid_column(tid); + } + } + sources.push_back(src); + } + return sources; +} + +/** + * @brief Constructs deferred intermediate result from thread-local buffers. + * + * Only materializes columns marked MATERIALIZE in the DeferredJoinNode. + * All row ID columns are populated for provenance tracking. + * + * @tparam Mode Collection mode for compile-time specialization. + * @param buffers Thread-local match buffers from probe. + * @param build_input Build side data source. + * @param probe_input Probe side data source. + * @param join_node Deferred join node with materialization decisions. + * @param remapped_attrs Output attributes (after build/probe remapping). + * @param build_output_size Number of columns from build side. + * @param columnar_reader Reader for columnar data access. + * @param out_result Output DeferredResult (populated in-place). + * @param merged_table_ids Sorted table IDs to track. + * @param deferred_plan Full deferred plan for base table access (deferred + * resolution). 
+ */ +template +void construct_deferred_from_buffers( + std::vector> &buffers, + const DeferredInput &build_input, const DeferredInput &probe_input, + const DeferredJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_output_size, bool build_is_left, + ColumnarReader &columnar_reader, DeferredResult &out_result, + const std::vector &merged_table_ids, + const DeferredPlan &deferred_plan) { + + // Count total matches + size_t total_matches = 0; + std::vector buffer_starts(buffers.size()); + for (size_t i = 0; i < buffers.size(); ++i) { + buffer_starts[i] = total_matches; + total_matches += buffers[i].count(); + } + + if (total_matches == 0) { + out_result = create_empty_deferred_result(join_node); + return; + } + + out_result.node_info = &join_node; + out_result.num_rows = total_matches; + out_result.table_ids = merged_table_ids; + + // Build materialized_map: count MATERIALIZE columns and create mapping + // materialized_map[original_idx] -> index into out_result.materialized + out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); + size_t mat_count = 0; + + // Iterate over join_node.columns (which uses original output order) + // and assign materialized indices to MATERIALIZE columns + for (const auto &col : join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + out_result.materialized_map[col.original_idx] = mat_count++; + } + } + + // Prepare row ID sources + auto rowid_sources = prepare_deferred_rowid_sources( + merged_table_ids, build_input, probe_input); + + const size_t num_rowid_cols = rowid_sources.size(); + + // Pre-allocate pages + using Page = mema::column_t::Page; + using RowIdPage = mema::rowid_column_t::Page; + size_t total_pages_needed = + (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; + + // Allocate materialized columns + out_result.materialized.resize(mat_count); + for (size_t c = 0; c < mat_count; ++c) { + 
out_result.materialized[c].pages.resize(total_pages_needed); + out_result.materialized[c].set_row_count(total_matches); + } + + // Allocate row ID columns + out_result.row_ids.resize(num_rowid_cols); + for (size_t r = 0; r < num_rowid_cols; ++r) { + out_result.row_ids[r].table_id = merged_table_ids[r]; + out_result.row_ids[r].pages.resize(total_pages_needed); + out_result.row_ids[r].set_row_count(total_matches); + } + + // Parallel page allocation + const size_t num_threads = THREAD_COUNT; + worker_pool().execute([&](size_t t) { + for (size_t c = 0; c < mat_count; ++c) { + auto &col = out_result.materialized[c]; + for (size_t p = t; p < total_pages_needed; p += num_threads) { + void *ptr = + Contest::platform::get_arena(t) + .alloc_chunk(); + col.pages[p] = reinterpret_cast(ptr); + } + } + for (size_t r = 0; r < num_rowid_cols; ++r) { + auto &rid_col = out_result.row_ids[r]; + for (size_t p = t; p < total_pages_needed; p += num_threads) { + void *ptr = + Contest::platform::get_arena(t) + .alloc_chunk(); + rid_col.pages[p] = reinterpret_cast(ptr); + } + } + }); + + // Set source metadata for materialized columns + for (const auto &col : join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + size_t mat_idx = *out_result.materialized_map[col.original_idx]; + out_result.materialized[mat_idx].source_table = + col.provenance.base_table_id; + out_result.materialized[mat_idx].source_column = + col.provenance.base_column_idx; + } + } + + // Parallel population: each thread processes its own buffer + worker_pool().execute([&](size_t t) { + if (t >= buffers.size()) + return; + auto &buf = buffers[t]; + size_t my_count = buf.count(); + if (my_count == 0) + return; + + size_t start = buffer_starts[t]; + ColumnarReader::Cursor cursor; + + // Iterate through matches + auto left_it = buf.left_range().begin(); + auto right_it = buf.right_range().begin(); + + for (size_t m = 0; m < my_count; ++m) { + uint32_t build_row = 0, probe_row = 0; + + if constexpr 
(Mode == MatchCollectionMode::BOTH) { + build_row = *left_it; + probe_row = *right_it; + ++left_it; + ++right_it; + } else if constexpr (Mode == MatchCollectionMode::LEFT_ONLY) { + build_row = *left_it; + ++left_it; + } else { + probe_row = *right_it; + ++right_it; + } + + size_t out_row = start + m; + + // Write materialized columns + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::MATERIALIZE) + continue; + + size_t mat_col_idx = + *out_result.materialized_map[col.original_idx]; + auto &out_col = out_result.materialized[mat_col_idx]; + + // Determine source based on from_left and build/probe mapping + // col.from_left refers to original left child + // build_is_left tells us if build side is the left child + // If from_left && build_is_left => from build + // If from_left && !build_is_left => from probe (left became + // probe) + bool from_build = (col.from_left == build_is_left); + uint32_t src_row = from_build ? build_row : probe_row; + const auto &src_input = from_build ? build_input : probe_input; + + mema::value_t val; + if (src_input.is_columnar()) { + const auto *table = + std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col.child_output_idx]; + val = columnar_reader.read_value( + table->columns[actual_idx], col.child_output_idx, + src_row, col.type, cursor, from_build); + } else { + const auto &ir = std::get(src_input.data); + // Check if materialized in child + const auto *src_col = + ir.get_materialized(col.child_output_idx); + if (src_col) { + val = (*src_col)[src_row]; + } else { + // Deferred - resolve via row ID to base table + // This should only happen if materialization wasn't + // propagated properly. Use direct read as fallback. 
+ const auto *rowid_col = + ir.get_rowid_column(col.provenance.base_table_id); + if (rowid_col && deferred_plan.original_plan) { + uint32_t encoded = (*rowid_col)[src_row]; + uint32_t base_row = GlobalRowId::row(encoded); + const auto &base_table = + deferred_plan.original_plan + ->inputs[col.provenance.base_table_id]; + val = columnar_reader.read_value_direct_public( + base_table + .columns[col.provenance.base_column_idx], + base_row, col.type); + } else { + val = mema::value_t{mema::value_t::NULL_VALUE}; + } + } + } + + out_col.write_at(out_row, val); + } + + // Write row ID columns + for (size_t r = 0; r < num_rowid_cols; ++r) { + const auto &rid_src = rowid_sources[r]; + auto &dest_rid_col = out_result.row_ids[r]; + + uint32_t local_idx = rid_src.from_build ? build_row : probe_row; + + if (rid_src.needs_encode) { + dest_rid_col.write_at( + out_row, + GlobalRowId::encode(rid_src.table_id, local_idx)); + } else if (rid_src.source_col) { + dest_rid_col.write_at(out_row, + (*rid_src.source_col)[local_idx]); + } + } + } + }); +} + +} // namespace materialize +} // namespace Contest diff --git a/include/materialization/materialize_deferred.h b/include/materialization/materialize_deferred.h new file mode 100644 index 0000000..8b548c0 --- /dev/null +++ b/include/materialization/materialize_deferred.h @@ -0,0 +1,439 @@ +/** + * @file materialize_deferred.h + * @brief Final materialization for deferred execution path. + * + * Materializes all output columns at the root join, resolving deferred + * columns by following row ID provenance back to base tables. + * + * @see construct_deferred.h for building DeferredResult intermediates. + * @see materialize.h for the eager materialization equivalent. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Contest { +namespace materialize { + +using Contest::io::ColumnarReader; +using Contest::join::MatchCollectionMode; +using Contest::join::ThreadLocalMatchBuffer; +using Contest::platform::THREAD_COUNT; +using Contest::platform::worker_pool; + +/** + * @brief Collect columns needed from a DeferredInput for final materialization. + */ +inline platform::ArenaVector +collect_final_columns(const DeferredInput &input, + const platform::ArenaVector &needed, + platform::ThreadArena &arena) { + platform::ArenaVector columns(arena); + if (!input.node) + return columns; + + columns.resize(input.node->output_attrs.size()); + std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); + + if (!input.is_columnar()) + return columns; + + auto *table = std::get(input.data); + for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { + if (i < needed.size() && needed[i]) { + auto [actual_col_idx, _] = input.node->output_attrs[i]; + columns[i] = &table->columns[actual_col_idx]; + } + } + return columns; +} + +/** + * @brief Prepare ColumnarReader for final deferred materialization at root. + * + * Sets up page indices for ALL output columns (since all need materialization + * at root). 
+ */ +inline void prepare_final_deferred_columns( + ColumnarReader &reader, const DeferredInput &build_input, + const DeferredInput &probe_input, const DeferredJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_size, bool build_is_left) { + + bool build_is_columnar = build_input.is_columnar(); + bool probe_is_columnar = probe_input.is_columnar(); + + if (!build_is_columnar && !probe_is_columnar) + return; + + auto &arena = Contest::platform::get_arena(0); + + // All output columns needed at root + platform::ArenaVector build_needed(arena); + if (build_input.node) { + build_needed.resize(build_input.node->output_attrs.size()); + std::memset(build_needed.data(), 0, build_needed.size()); + } + + platform::ArenaVector probe_needed(arena); + if (probe_input.node) { + probe_needed.resize(probe_input.node->output_attrs.size()); + std::memset(probe_needed.data(), 0, probe_needed.size()); + } + + // Mark ALL columns needed for final materialization + // from_left refers to original left child + // build_is_left tells us if build side is the left child + for (const auto &col : join_node.columns) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; + } + } + + if (build_is_columnar) { + reader.prepare_build( + collect_final_columns(build_input, build_needed, arena)); + } + + if (probe_is_columnar) { + reader.prepare_probe( + collect_final_columns(probe_input, probe_needed, arena)); + } +} + +/** + * @brief Create empty result for zero-match case in deferred path. 
+ */ +inline ColumnarTable create_empty_deferred_final( + const std::vector> &output_attrs) { + ColumnarTable empty_result; + empty_result.num_rows = 0; + for (auto [_, data_type] : output_attrs) { + empty_result.columns.emplace_back(data_type); + } + return empty_result; +} + +/** + * @brief Materialize a single column from deferred sources. + * + * Handles three cases: + * 1. COLUMNAR_DIRECT: Input is columnar, read directly via row index + * 2. MATERIALIZED: Column was materialized in DeferredResult + * 3. DEFERRED: Resolve via row ID lookup to base table + * + * @tparam Mode Collection mode for compile-time specialization. + * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder. + * @tparam ReaderFunc Callable: (row_idx, cursor) -> value_t. + * @tparam InitBuilderFunc Callable: (page_allocator) -> BuilderType. + */ +template +inline void materialize_deferred_column( + Column &dest_col, std::vector> &buffers, + size_t total_matches, ReaderFunc &&read_value, + InitBuilderFunc &&init_builder, bool from_build, size_t est_bytes_per_row) { + + if (total_matches == 0) + return; + + const int num_threads = THREAD_COUNT; + + size_t matches_per_thread = (total_matches + num_threads - 1) / num_threads; + size_t usable_per_page = PAGE_SIZE - 256; + size_t rows_per_page = std::max(1ul, usable_per_page / est_bytes_per_row); + size_t pages_per_thread = + (matches_per_thread + rows_per_page - 1) / rows_per_page + 10; + size_t total_pages = pages_per_thread * num_threads; + + void *page_memory = + mmap(nullptr, total_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (page_memory == MAP_FAILED) + throw std::bad_alloc(); + + std::vector thread_columns; + thread_columns.reserve(num_threads); + for (int i = 0; i < num_threads; ++i) { + thread_columns.emplace_back(dest_col.type); + } + + worker_pool().execute([&](size_t t) { + if (t >= buffers.size()) + return; + auto &buf = buffers[t]; + size_t my_count = buf.count(); + if (my_count == 0) 
+ return; + + Column &local_col = thread_columns[t]; + + size_t thread_page_start = t * pages_per_thread; + size_t thread_page_limit = pages_per_thread; + size_t used_pages = 0; + + ColumnarReader::Cursor cursor; + + auto page_allocator = [&]() -> Page * { + Page *p; + if (used_pages < thread_page_limit) { + p = reinterpret_cast(static_cast(page_memory) + + (thread_page_start + used_pages) * + PAGE_SIZE); + used_pages++; + } else { + p = new Page(); + } + local_col.pages.push_back(p); + return p; + }; + + BuilderType builder = init_builder(page_allocator); + builder.prepare(my_count); + + const size_t check_interval = BuilderType::MIN_ROWS_PER_PAGE_CHECK; + size_t rows_since_check = 0; + + auto range = from_build ? buf.left_range() : buf.right_range(); + + for (uint32_t row_id : range) { + bool flushed = builder.add(read_value(row_id, cursor)); + + if (flushed) { + rows_since_check = 0; + } else { + rows_since_check++; + if (rows_since_check >= check_interval) { + if (builder.should_check_overflow()) { + builder.save_to_page(builder.current_page); + rows_since_check = 0; + } + if (rows_since_check > check_interval * 2) + rows_since_check = 0; + } + } + } + + if (builder.num_rows != 0) { + builder.save_to_page(builder.current_page); + } + }); + + for (auto &thread_col : thread_columns) { + for (auto *page : thread_col.pages) { + dest_col.pages.push_back(page); + } + thread_col.pages.clear(); + } + + auto *mapped_mem = new MappedMemory(page_memory, total_pages * PAGE_SIZE); + dest_col.assign_mapped_memory(mapped_mem); +} + +/** + * @brief Materialize single output column handling deferred resolution. + * + * For deferred columns, resolves via row ID tracking back to base table. + * + * @tparam Mode Collection mode for compile-time specialization. 
+ */ +template +inline void materialize_single_deferred_column( + Column &dest_col, size_t col_idx, size_t build_size, bool build_is_left, + std::vector> &buffers, size_t total_matches, + const DeferredInput &build_input, const DeferredInput &probe_input, + const DeferredJoinNode &join_node, ColumnarReader &columnar_reader, + const DeferredPlan &deferred_plan) { + + // Find column info + const DeferredColumnInfo *col_info = nullptr; + for (const auto &col : join_node.columns) { + if (col.original_idx == col_idx) { + col_info = &col; + break; + } + } + + if (!col_info) { + // Fallback - shouldn't happen + return; + } + + // Determine if this column comes from build or probe side at runtime + // col_info->from_left refers to original left child + // build_is_left tells us if build side is the left child + // If from_left && build_is_left => from build + // If from_left && !build_is_left => from probe (left became probe) + // If !from_left && build_is_left => from probe (right is probe) + // If !from_left && !build_is_left => from build (right became build) + bool from_build = (col_info->from_left == build_is_left); + const DeferredInput &src_input = from_build ? 
build_input : probe_input; + + // Determine how to read the value + const Column *columnar_source = nullptr; + const mema::column_t *materialized_source = nullptr; + const mema::rowid_column_t *rowid_source = nullptr; + const Column *base_table_column = nullptr; + + if (src_input.is_columnar()) { + // Direct columnar read + const auto *table = std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col_info->child_output_idx]; + columnar_source = &table->columns[actual_idx]; + } else { + const auto &ir = std::get(src_input.data); + if (ir.is_materialized(col_info->child_output_idx)) { + // Read from materialized column + materialized_source = + ir.get_materialized(col_info->child_output_idx); + } else { + // Deferred - need to resolve via row ID + rowid_source = + ir.get_rowid_column(col_info->provenance.base_table_id); + if (deferred_plan.original_plan) { + base_table_column = + &deferred_plan.original_plan + ->inputs[col_info->provenance.base_table_id] + .columns[col_info->provenance.base_column_idx]; + } + } + } + + // Create reader lambda + auto reader = [&](uint32_t local_row_id, + ColumnarReader::Cursor &cursor) -> mema::value_t { + if (columnar_source) { + return columnar_reader.read_value( + *columnar_source, col_info->child_output_idx, local_row_id, + col_info->type, cursor, from_build); + } else if (materialized_source) { + return (*materialized_source)[local_row_id]; + } else if (rowid_source && base_table_column) { + // Deferred resolution: get base table row from encoded row ID + uint32_t encoded = (*rowid_source)[local_row_id]; + uint32_t base_row = GlobalRowId::row(encoded); + return columnar_reader.read_value( + *base_table_column, col_info->provenance.base_column_idx, + base_row, col_info->type, cursor, true); + } + return mema::value_t{mema::value_t::NULL_VALUE}; + }; + + // Materialize based on type + if (dest_col.type == DataType::INT32) { + auto init = [](std::function alloc) { + return 
Int32PageBuilder(std::move(alloc)); + }; + materialize_deferred_column( + dest_col, buffers, total_matches, + [&](uint32_t rid, ColumnarReader::Cursor &cursor) { + return reader(rid, cursor); + }, + init, from_build, 4); + return; + } + + // VARCHAR + const Column *str_src_ptr = columnar_source; + if (!str_src_ptr) { + if (materialized_source) { + str_src_ptr = &deferred_plan.original_plan + ->inputs[materialized_source->source_table] + .columns[materialized_source->source_column]; + } else if (base_table_column) { + str_src_ptr = base_table_column; + } + } + + if (!str_src_ptr) { + // Shouldn't happen, but handle gracefully + return; + } + + auto init = [str_src_ptr](std::function alloc) { + return VarcharPageBuilder(*str_src_ptr, std::move(alloc)); + }; + + materialize_deferred_column( + dest_col, buffers, total_matches, + [&](uint32_t rid, ColumnarReader::Cursor &cursor) { + return reader(rid, cursor); + }, + init, from_build, 35); +} + +/** + * @brief Materialize all output columns from deferred intermediate. + * + * For root join in deferred execution path. Resolves all deferred columns + * by following row ID provenance to base tables. + * + * @tparam Mode Collection mode for compile-time specialization. + * @param buffers Thread-local match buffers from probe. + * @param build_input Build side deferred input. + * @param probe_input Probe side deferred input. + * @param join_node Deferred join node with column info. + * @param remapped_attrs Output projection after build/probe remapping. + * @param build_size Number of columns from build side. + * @param columnar_reader Reader for columnar data. + * @param deferred_plan Full deferred plan for base table access. + * @return ColumnarTable with final output. 
+ */ +template +inline ColumnarTable materialize_deferred_from_buffers( + std::vector> &buffers, + const DeferredInput &build_input, const DeferredInput &probe_input, + const DeferredJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_size, bool build_is_left, ColumnarReader &columnar_reader, + const DeferredPlan &deferred_plan) { + + // Compute total matches + size_t total_matches = 0; + for (const auto &buf : buffers) { + total_matches += buf.count(); + } + + if (total_matches == 0) { + return create_empty_deferred_final(remapped_attrs); + } + + ColumnarTable result; + result.num_rows = total_matches; + + for (size_t out_idx = 0; out_idx < remapped_attrs.size(); ++out_idx) { + auto [col_idx, data_type] = remapped_attrs[out_idx]; + result.columns.emplace_back(data_type); + Column &dest_col = result.columns.back(); + + // Pass out_idx (output position) not col_idx (global column index) + // because materialize_single_deferred_column searches by original_idx + // which is the output position in join_node.columns + materialize_single_deferred_column( + dest_col, out_idx, build_size, build_is_left, buffers, + total_matches, build_input, probe_input, join_node, columnar_reader, + deferred_plan); + } + + return result; +} + +} // namespace materialize +} // namespace Contest diff --git a/src/analyze_plan.cpp b/src/analyze_plan.cpp new file mode 100644 index 0000000..f0ef0a8 --- /dev/null +++ b/src/analyze_plan.cpp @@ -0,0 +1,311 @@ +/** + * @file analyze_plan.cpp + * @brief Analyzes query plan and computes materialization decisions. + * + * Walks the plan tree in post-order to determine which columns should be + * materialized eagerly (join keys needed by parent) vs deferred until final + * output. Traces column provenance back to base tables for deferred resolution. + * + * @see deferred_plan.h for DeferredPlan structure. 
+ */ +#include +#include +#include + +#include + +namespace Contest { + +namespace { + +/** + * @brief Parent relationship info for a node. + */ +struct ParentInfo { + size_t parent_idx; ///< Parent node index in Plan::nodes. + bool is_left_child; ///< True if this node is parent's left child. +}; + +/** + * @brief Build map of node_idx → parent info. + * + * Root node will not have an entry in the map. + */ +std::unordered_map build_parent_map(const Plan &plan) { + std::unordered_map parent_map; + + for (size_t i = 0; i < plan.nodes.size(); ++i) { + const auto &node = plan.nodes[i]; + if (const auto *join = std::get_if(&node.data)) { + parent_map[join->left] = {i, true}; + parent_map[join->right] = {i, false}; + } + } + return parent_map; +} + +/** + * @brief Trace column provenance to base table. + * + * Recursively follows column through join nodes until reaching a scan node. + * + * @param plan Original query plan. + * @param node_idx Current node index. + * @param column_idx Column index in node's output_attrs. + * @return ColumnProvenance with base table ID and column index. 
+ */ +ColumnProvenance trace_provenance(const Plan &plan, size_t node_idx, + size_t column_idx) { + const auto &node = plan.nodes[node_idx]; + + if (const auto *scan = std::get_if(&node.data)) { + // Base case: column comes directly from scan + auto [actual_col_idx, _] = node.output_attrs[column_idx]; + return ColumnProvenance{static_cast(scan->base_table_id), + static_cast(actual_col_idx)}; + } + + // Join node: determine which child the column comes from + const auto &join = std::get(node.data); + const auto &left_node = plan.nodes[join.left]; + size_t left_size = left_node.output_attrs.size(); + + auto [col_idx, _] = node.output_attrs[column_idx]; + + if (col_idx < left_size) { + // Column from left child + return trace_provenance(plan, join.left, col_idx); + } else { + // Column from right child + return trace_provenance(plan, join.right, col_idx - left_size); + } +} + +/** + * @brief Find which column index in this node the parent needs as join key. + * + * @param plan Original query plan. + * @param node_idx Current node index. + * @param parent_map Map of node → parent relationship. + * @return Column index parent uses as join key, or nullopt if root. + */ +std::optional +find_parent_join_key(const Plan &plan, size_t node_idx, + const std::unordered_map &parent_map) { + auto it = parent_map.find(node_idx); + if (it == parent_map.end()) { + return std::nullopt; // Root node + } + + const auto &parent_node = plan.nodes[it->second.parent_idx]; + const auto &parent_join = std::get(parent_node.data); + + // Parent's join key for this child + return it->second.is_left_child ? parent_join.left_attr + : parent_join.right_attr; +} + +/** + * @brief Compute base collection mode based on which sides have output columns. + * + * Assumes build=left. If build=right at runtime, caller flips + * LEFT_ONLY/RIGHT_ONLY. 
+ */ +join::MatchCollectionMode +compute_base_collection_mode(const std::vector &columns, + size_t left_output_size) { + bool needs_left = false; + bool needs_right = false; + + for (const auto &col : columns) { + if (col.from_left) { + needs_left = true; + } else { + needs_right = true; + } + if (needs_left && needs_right) { + return join::MatchCollectionMode::BOTH; + } + } + + if (needs_left && !needs_right) + return join::MatchCollectionMode::LEFT_ONLY; + if (needs_right && !needs_left) + return join::MatchCollectionMode::RIGHT_ONLY; + return join::MatchCollectionMode::BOTH; +} + +/** + * @brief Collect tracked table IDs from a DeferredNode. + */ +std::vector get_tracked_tables(const DeferredNode &node) { + if (const auto *scan = std::get_if(&node)) { + return {scan->base_table_id}; + } + return std::get(node).tracked_table_ids; +} + +/** + * @brief Merge tracked table IDs from two children (sorted, unique). + */ +std::vector merge_table_ids(const DeferredNode &left, + const DeferredNode &right) { + auto left_ids = get_tracked_tables(left); + auto right_ids = get_tracked_tables(right); + + std::vector result; + result.reserve(left_ids.size() + right_ids.size()); + + std::merge(left_ids.begin(), left_ids.end(), right_ids.begin(), + right_ids.end(), std::back_inserter(result)); + + result.erase(std::unique(result.begin(), result.end()), result.end()); + return result; +} + +} // anonymous namespace + +DeferredPlan analyze_plan(const Plan &plan) { + DeferredPlan deferred; + deferred.original_plan = &plan; + deferred.nodes.resize(plan.nodes.size()); + deferred.root = plan.root; + + auto parent_map = build_parent_map(plan); + + // Build post-order traversal (children before parents) + std::vector post_order; + post_order.reserve(plan.nodes.size()); + std::vector visited(plan.nodes.size(), false); + + std::function visit = [&](size_t idx) { + if (visited[idx]) + return; + visited[idx] = true; + + const auto &node = plan.nodes[idx]; + if (const auto *join = 
std::get_if(&node.data)) { + visit(join->left); + visit(join->right); + } + post_order.push_back(idx); + }; + visit(plan.root); + + // PASS 1: Build structure and initial materialization decisions + for (size_t node_idx : post_order) { + const auto &node = plan.nodes[node_idx]; + + if (const auto *scan = std::get_if(&node.data)) { + // Scan node: simple wrapper + DeferredScanNode dscan; + dscan.node_idx = node_idx; + dscan.base_table_id = scan->base_table_id; + dscan.output_attrs = node.output_attrs; + deferred.nodes[node_idx] = std::move(dscan); + + } else { + // Join node: compute materialization decisions + const auto &join = std::get(node.data); + DeferredJoinNode djoin; + djoin.node_idx = node_idx; + djoin.left_child_idx = join.left; + djoin.right_child_idx = join.right; + djoin.left_join_attr = join.left_attr; + djoin.right_join_attr = join.right_attr; + djoin.output_attrs = node.output_attrs; + djoin.is_root = (node_idx == plan.root); + + // Find which column parent needs as join key + djoin.parent_join_key_idx = + find_parent_join_key(plan, node_idx, parent_map); + + // Get child sizes for determining column source + const auto &left_node = plan.nodes[join.left]; + size_t left_size = left_node.output_attrs.size(); + + // Build column info for each output column + for (size_t i = 0; i < node.output_attrs.size(); ++i) { + auto [col_idx, col_type] = node.output_attrs[i]; + + DeferredColumnInfo info; + info.original_idx = i; + info.type = col_type; + + // Determine if column is from left or right child + // col_idx is the combined L+R index: + // - [0, left_size) = position in left child's output + // - [left_size, ...) 
= position in right child's output + + // left_size + if (col_idx < left_size) { + info.from_left = true; + info.child_output_idx = col_idx; + } else { + info.from_left = false; + info.child_output_idx = col_idx - left_size; + } + + // Materialization decision: + // - At root: ALL columns must be materialized (final output) + // - At intermediate: only parent's join key is materialized + if (djoin.is_root) { + // Root node: materialize everything + info.resolution = ColumnResolution::MATERIALIZE; + } else if (djoin.parent_join_key_idx.has_value() && + i == *djoin.parent_join_key_idx) { + info.resolution = ColumnResolution::MATERIALIZE; + } else { + info.resolution = ColumnResolution::DEFER; + } + + // Trace provenance to base table + info.provenance = trace_provenance(plan, node_idx, i); + + djoin.columns.push_back(std::move(info)); + } + + // Compute collection mode and tracked tables + djoin.base_collection_mode = + compute_base_collection_mode(djoin.columns, left_size); + djoin.tracked_table_ids = merge_table_ids( + deferred.nodes[join.left], deferred.nodes[join.right]); + + deferred.nodes[node_idx] = std::move(djoin); + } + } + + // PASS 2: Propagate materialization requirements to children + // Process in reverse post-order (parents before children) + for (auto it = post_order.rbegin(); it != post_order.rend(); ++it) { + size_t node_idx = *it; + auto *djoin = std::get_if(&deferred.nodes[node_idx]); + if (!djoin) + continue; + + // For each column that must be MATERIALIZE, ensure the child also + // materializes it + for (const auto &col : djoin->columns) { + if (col.resolution != ColumnResolution::MATERIALIZE) + continue; + + // Find which child this column comes from + size_t child_idx = + col.from_left ? 
djoin->left_child_idx : djoin->right_child_idx; + + auto *child_djoin = + std::get_if(&deferred.nodes[child_idx]); + if (!child_djoin) + continue; // Child is a scan - always has data + + // Mark child's column as MATERIALIZE + if (col.child_output_idx < child_djoin->columns.size()) { + child_djoin->columns[col.child_output_idx].resolution = + ColumnResolution::MATERIALIZE; + } + } + } + + return deferred; +} + +} // namespace Contest diff --git a/src/execute.cpp b/src/execute.cpp index a9589ad..d8ef462 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -40,6 +40,13 @@ #include #include +#ifdef USE_DEFERRED_MATERIALIZATION +#include +#include +#include +#include +#endif + namespace Contest { using namespace join; @@ -291,6 +298,349 @@ JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, return ExtendedResult{}; } +#ifdef USE_DEFERRED_MATERIALIZATION +// ============================================================================ +// DEFERRED MATERIALIZATION PATH +// ============================================================================ + +using DeferredJoinResult = std::variant; + +using materialize::construct_deferred_from_buffers; +using materialize::create_empty_deferred_result; +using materialize::materialize_deferred_from_buffers; + +// Forward declaration +DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, + size_t node_idx, bool is_root, + TimingStats &stats); + +/** + * @brief Resolve deferred plan node to DeferredInput. 
+ */ +DeferredInput resolve_deferred_input(const DeferredPlan &deferred_plan, + size_t node_idx, TimingStats &stats) { + DeferredInput input; + const auto &dnode = deferred_plan[node_idx]; + const auto &pnode = deferred_plan.original_plan->nodes[node_idx]; + input.node = &pnode; + input.deferred_node = &dnode; + + if (const auto *dscan = std::get_if(&dnode)) { + input.data = &deferred_plan.original_plan->inputs[dscan->base_table_id]; + input.table_id = dscan->base_table_id; + } else { + auto result = + execute_deferred_impl(deferred_plan, node_idx, false, stats); + input.data = std::get(std::move(result)); + input.table_id = 0; + } + return input; +} + +/** + * @brief Select build/probe sides for deferred input. + */ +BuildProbeConfig select_deferred_build_probe_side( + const JoinNode &join, const DeferredInput &left_input, + const DeferredInput &right_input, + const std::vector> &output_attrs) { + BuildProbeConfig config; + + size_t left_rows = left_input.row_count(join.left_attr); + size_t right_rows = right_input.row_count(join.right_attr); + config.build_left = left_rows <= right_rows; + + config.build_attr = config.build_left ? join.left_attr : join.right_attr; + config.probe_attr = config.build_left ? join.right_attr : join.left_attr; + + config.remapped_attrs = output_attrs; + size_t left_size = left_input.output_size(); + size_t build_size = + config.build_left ? left_size : right_input.output_size(); + + if (!config.build_left) { + for (auto &[col_idx, dtype] : config.remapped_attrs) { + if (col_idx < left_size) { + col_idx = build_size + col_idx; + } else { + col_idx = col_idx - left_size; + } + } + } + return config; +} + +/** + * @brief Unified probe + materialize for deferred path. 
+ */ +template +DeferredJoinResult execute_deferred_join_with_mode( + bool use_nested_loop, bool probe_is_columnar, bool is_root, + const UnchainedHashtable *hash_table, const DeferredInput &build_input, + const DeferredInput &probe_input, const BuildProbeConfig &config, + const DeferredJoinNode &join_node, io::ColumnarReader &columnar_reader, + const DeferredPlan &deferred_plan, + const std::vector &merged_table_ids, TimingStats &stats) { + + std::vector> match_buffers; + + // Probe phase - need to convert DeferredInput to JoinInput for probing + // For now, handle columnar probe directly + if (use_nested_loop) { + auto nested_loop_start = std::chrono::high_resolution_clock::now(); + // Nested loop requires JoinInput - create adapter + JoinInput build_ji, probe_ji; + build_ji.node = build_input.node; + probe_ji.node = probe_input.node; + + if (build_input.is_columnar()) { + build_ji.data = std::get(build_input.data); + build_ji.table_id = build_input.table_id; + } else { + // Convert DeferredResult to ExtendedResult for compatibility + // This is a limitation - nested loop path falls back to eager + const auto &dr = std::get(build_input.data); + ExtendedResult er; + er.columns = std::move( + const_cast &>(dr.materialized)); + er.row_ids = std::move( + const_cast &>(dr.row_ids)); + er.table_ids = dr.table_ids; + build_ji.data = std::move(er); + build_ji.table_id = 0; + } + + if (probe_input.is_columnar()) { + probe_ji.data = std::get(probe_input.data); + probe_ji.table_id = probe_input.table_id; + } else { + const auto &dr = std::get(probe_input.data); + ExtendedResult er; + er.columns = std::move( + const_cast &>(dr.materialized)); + er.row_ids = std::move( + const_cast &>(dr.row_ids)); + er.table_ids = dr.table_ids; + probe_ji.data = std::move(er); + probe_ji.table_id = 0; + } + + match_buffers = nested_loop_join( + build_ji, probe_ji, config.build_attr, config.probe_attr); + auto nested_loop_end = std::chrono::high_resolution_clock::now(); + 
stats.nested_loop_join_ms += + std::chrono::duration_cast( + nested_loop_end - nested_loop_start) + .count(); + } else { + auto probe_start = std::chrono::high_resolution_clock::now(); + if (probe_is_columnar) { + // Create JoinInput for columnar probe + JoinInput probe_ji; + probe_ji.node = probe_input.node; + probe_ji.data = std::get(probe_input.data); + probe_ji.table_id = probe_input.table_id; + match_buffers = + probe_columnar(*hash_table, probe_ji, config.probe_attr); + } else { + const auto &probe_result = + std::get(probe_input.data); + // Probe using materialized column (should be the join key) + const auto *mat_col = + probe_result.get_materialized(config.probe_attr); + if (!mat_col) { + std::fprintf( + stderr, + "ERROR: probe join key not materialized! probe_attr=%zu " + "mat_map_size=%zu num_rows=%zu\n", + config.probe_attr, probe_result.materialized_map.size(), + probe_result.num_rows); + std::abort(); + } + match_buffers = probe_intermediate(*hash_table, *mat_col); + } + auto probe_end = std::chrono::high_resolution_clock::now(); + stats.hash_join_probe_ms += + std::chrono::duration_cast(probe_end - + probe_start) + .count(); + } + + size_t total_matches = 0; + for (const auto &buf : match_buffers) { + total_matches += buf.count(); + } + + if (is_root) { + auto mat_start = std::chrono::high_resolution_clock::now(); + DeferredJoinResult final_result; + if (total_matches == 0) { + final_result = + materialize::create_empty_deferred_final(config.remapped_attrs); + } else { + // Prepare page indices for final materialization + materialize::prepare_final_deferred_columns( + columnar_reader, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + config.build_left); + + final_result = materialize_deferred_from_buffers( + match_buffers, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + config.build_left, columnar_reader, deferred_plan); + } + auto mat_end = 
std::chrono::high_resolution_clock::now(); + stats.materialize_ms += + std::chrono::duration_cast(mat_end - + mat_start) + .count(); + return final_result; + } else { + auto inter_start = std::chrono::high_resolution_clock::now(); + DeferredResult result; + if (total_matches > 0) { + // Prepare page indices for intermediate construction + materialize::prepare_deferred_columns( + columnar_reader, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + config.build_left); + + construct_deferred_from_buffers( + match_buffers, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + config.build_left, columnar_reader, result, merged_table_ids, + deferred_plan); + } else { + result = create_empty_deferred_result(join_node); + } + auto inter_end = std::chrono::high_resolution_clock::now(); + stats.intermediate_ms += + std::chrono::duration_cast(inter_end - + inter_start) + .count(); + return std::move(result); + } +} + +/** + * @brief Recursive deferred join execution. + */ +DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, + size_t node_idx, bool is_root, + TimingStats &stats) { + const auto &dnode = deferred_plan[node_idx]; + + if (std::holds_alternative(dnode)) { + return DeferredResult{}; + } + + const auto &djoin = std::get(dnode); + const auto &plan = *deferred_plan.original_plan; + const auto &pnode = plan.nodes[node_idx]; + const auto &join = std::get(pnode.data); + + // Resolve inputs + DeferredInput left_input = + resolve_deferred_input(deferred_plan, djoin.left_child_idx, stats); + DeferredInput right_input = + resolve_deferred_input(deferred_plan, djoin.right_child_idx, stats); + + // Build/probe selection + auto setup_start = std::chrono::high_resolution_clock::now(); + auto config = select_deferred_build_probe_side( + join, left_input, right_input, djoin.output_attrs); + const DeferredInput &build_input = + config.build_left ? 
left_input : right_input; + const DeferredInput &probe_input = + config.build_left ? right_input : left_input; + + bool build_is_columnar = build_input.is_columnar(); + bool probe_is_columnar = probe_input.is_columnar(); + + const size_t HASH_TABLE_THRESHOLD = 8; + size_t build_rows = build_input.row_count(config.build_attr); + // Nested loop doesn't work with DeferredResult because it only has join + // keys materialized. Force hash join when either side is DeferredResult. + bool use_nested_loop = (build_rows < HASH_TABLE_THRESHOLD) && + build_is_columnar && probe_is_columnar; + + // Merge table IDs + auto build_tables = build_input.tracked_tables(); + auto probe_tables = probe_input.tracked_tables(); + auto merged_table_ids = merge_tracked_tables(build_tables, probe_tables); + + io::ColumnarReader columnar_reader; + auto setup_end = std::chrono::high_resolution_clock::now(); + stats.setup_ms += std::chrono::duration_cast( + setup_end - setup_start) + .count(); + + // For deferred materialization, we always need BOTH row indices because + // we track provenance from both sides for deferred column resolution. + // The optimization to collect only one side's indices is not safe here. + MatchCollectionMode mode = MatchCollectionMode::BOTH; + + // Build hash table if needed + std::optional hash_table; + if (!use_nested_loop) { + auto build_start = std::chrono::high_resolution_clock::now(); + if (build_is_columnar) { + JoinInput build_ji; + build_ji.node = build_input.node; + build_ji.data = std::get(build_input.data); + build_ji.table_id = build_input.table_id; + hash_table = build_from_columnar(build_ji, config.build_attr); + } else { + const auto &dr = std::get(build_input.data); + const auto *mat_col = dr.get_materialized(config.build_attr); + if (!mat_col) { + std::fprintf( + stderr, + "ERROR: build join key not materialized! 
build_attr=%zu " + "mat_map_size=%zu num_rows=%zu\n", + config.build_attr, dr.materialized_map.size(), dr.num_rows); + // Fatal - this should never happen + std::abort(); + } + hash_table.emplace(mat_col->row_count()); + hash_table->build_intermediate(*mat_col); + } + auto build_end = std::chrono::high_resolution_clock::now(); + stats.hashtable_build_ms += + std::chrono::duration_cast(build_end - + build_start) + .count(); + } + + // Dispatch based on collection mode + switch (mode) { + case MatchCollectionMode::BOTH: + return execute_deferred_join_with_mode( + use_nested_loop, probe_is_columnar, is_root, + use_nested_loop ? nullptr : &(*hash_table), build_input, + probe_input, config, djoin, columnar_reader, deferred_plan, + merged_table_ids, stats); + + case MatchCollectionMode::LEFT_ONLY: + return execute_deferred_join_with_mode( + use_nested_loop, probe_is_columnar, is_root, + use_nested_loop ? nullptr : &(*hash_table), build_input, + probe_input, config, djoin, columnar_reader, deferred_plan, + merged_table_ids, stats); + + case MatchCollectionMode::RIGHT_ONLY: + return execute_deferred_join_with_mode( + use_nested_loop, probe_is_columnar, is_root, + use_nested_loop ? nullptr : &(*hash_table), build_input, + probe_input, config, djoin, columnar_reader, deferred_plan, + merged_table_ids, stats); + } + + return DeferredResult{}; +} + +#endif // USE_DEFERRED_MATERIALIZATION + /** * @brief Public entry point: execute plan from root, return ColumnarTable. * @param plan Query plan with nodes and base tables. 
@@ -307,7 +657,27 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, auto total_start = std::chrono::high_resolution_clock::now(); TimingStats stats; + +#ifdef USE_DEFERRED_MATERIALIZATION + // Deferred materialization path: analyze plan, then execute with deferred + // intermediate construction + auto analyze_start = std::chrono::high_resolution_clock::now(); + DeferredPlan deferred_plan = analyze_plan(plan); + auto analyze_end = std::chrono::high_resolution_clock::now(); + stats.analyze_plan_ms = + std::chrono::duration_cast(analyze_end - + analyze_start) + .count(); + + auto deferred_result = + execute_deferred_impl(deferred_plan, plan.root, true, stats); + ColumnarTable final_result = + std::get(std::move(deferred_result)); +#else + // Eager materialization path (original) auto result = execute_impl(plan, plan.root, true, stats); + ColumnarTable final_result = std::get(std::move(result)); +#endif auto total_end = std::chrono::high_resolution_clock::now(); auto total_elapsed = std::chrono::duration_cast( @@ -315,11 +685,19 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, stats.total_execution_ms = total_elapsed.count(); if (show_detailed_timing) { - int64_t accounted = - stats.hashtable_build_ms + stats.hash_join_probe_ms + - stats.nested_loop_join_ms + stats.materialize_ms + stats.setup_ms; + int64_t accounted = stats.hashtable_build_ms + + stats.hash_join_probe_ms + + stats.nested_loop_join_ms + stats.materialize_ms + + stats.setup_ms + stats.intermediate_ms; +#ifdef USE_DEFERRED_MATERIALIZATION + accounted += stats.analyze_plan_ms; +#endif int64_t other = stats.total_execution_ms - accounted; +#ifdef USE_DEFERRED_MATERIALIZATION + std::cout << "[DEFERRED] Plan Analysis Time: " << stats.analyze_plan_ms + << " ms\n"; +#endif std::cout << "Hashtable Build Time: " << stats.hashtable_build_ms << " ms\n"; std::cout << "Hash Join Probe Time: " << stats.hash_join_probe_ms @@ -339,7 +717,7 @@ 
ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out,
         *stats_out = stats;
     }
 
-    return std::move(std::get(result));
+    return std::move(final_result);
 }
 
 void *build_context() { return nullptr; }

From 665968c6a27f73350f75d1496dd94bc92b898d33 Mon Sep 17 00:00:00 2001
From: Themos Papatheofanous
Date: Thu, 22 Jan 2026 01:47:27 +0200
Subject: [PATCH 03/13] feat: draft deferred materialization

---
 include/data_model/deferred_intermediate.h   |  77 +--
 include/data_model/deferred_plan.h           |   6 +-
 include/data_model/intermediate.h            |  82 +++
 include/foundation/common.h                  |  49 ++
 include/join_execution/match_collector.h     |  64 ++
 include/join_execution/nested_loop.h         | 220 +++++++
 include/materialization/construct_deferred.h | 597 ++++++++++++------
 .../materialization/materialize_deferred.h   |  56 +-
 include/platform/arena.h                     |  26 +-
 src/analyze_plan.cpp                         |  54 +-
 src/execute.cpp                              |  83 +--
 11 files changed, 942 insertions(+), 372 deletions(-)

diff --git a/include/data_model/deferred_intermediate.h b/include/data_model/deferred_intermediate.h
index 0c16a13..8e183d5 100644
--- a/include/data_model/deferred_intermediate.h
+++ b/include/data_model/deferred_intermediate.h
@@ -2,9 +2,10 @@
  * @file deferred_intermediate.h
  * @brief Lightweight intermediate result for deferred materialization.
  *
- * DeferredResult stores only materialized columns (join keys) plus row ID
- * provenance columns. Deferred columns are resolved at final materialization
- * by following row IDs back to base tables.
+ * DeferredResult stores only materialized columns (join keys) plus
+ * per-deferred-column provenance using 64-bit encoding (table_id, column_idx,
+ * row_id). Deferred columns are resolved at final materialization by decoding
+ * the provenance and reading directly from base tables.
  *
  * @see deferred_plan.h for DeferredJoinNode with column decisions.
  * @see construct_deferred.h for building DeferredResult.
@@ -27,14 +28,15 @@ namespace Contest { * * Unlike ExtendedResult which stores all projected columns, DeferredResult * stores only columns marked MATERIALIZE (typically just the parent's join - * key). All other columns are resolved at final materialization using row ID - * provenance. + * key). All other columns are resolved at final materialization using + * per-column 64-bit provenance (table_id, column_idx, row_id). * * Memory savings: For a join projecting N columns where only 1 is a join key, * DeferredResult uses ~1/N the memory of ExtendedResult for data columns. + * Additionally, we only track provenance for deferred columns (not all tables). * * @see DeferredColumnInfo for materialization decisions. - * @see DeferredJoinNode for column provenance tracking. + * @see DeferredProvenance for 64-bit encoding scheme. */ struct DeferredResult { /// Only columns marked MATERIALIZE (typically 1 join key). @@ -44,11 +46,13 @@ struct DeferredResult { /// deferred). std::vector> materialized_map; - /// Row ID tracking for provenance (same as ExtendedResult). - std::vector row_ids; + /// Per-deferred-column provenance (64-bit encoded table_id+column_idx+row). + /// One deferred_column_t per DEFER column, stores full provenance per row. + std::vector deferred_columns; - /// Which base tables are tracked (sorted). - std::vector table_ids; + /// Map: original column index → index in deferred_columns (nullopt if + /// materialized). + std::vector> deferred_map; /// Reference to node info for column provenance resolution. const DeferredJoinNode *node_info = nullptr; @@ -71,6 +75,12 @@ struct DeferredResult { materialized_map[orig_idx].has_value(); } + /** @brief Check if column is deferred. */ + bool is_deferred(size_t orig_idx) const { + return orig_idx < deferred_map.size() && + deferred_map[orig_idx].has_value(); + } + /** @brief Get materialized column, or nullptr if deferred. 
*/ const mema::column_t *get_materialized(size_t orig_idx) const { if (!is_materialized(orig_idx)) @@ -78,26 +88,22 @@ struct DeferredResult { return &materialized[*materialized_map[orig_idx]]; } - /** @brief Find row ID column index for a table, or -1 if not found. */ - int find_rowid_index(uint8_t tid) const { - for (size_t i = 0; i < table_ids.size(); ++i) { - if (table_ids[i] == tid) - return static_cast(i); - } - return -1; + /** @brief Get deferred column provenance, or nullptr if materialized. */ + const mema::deferred_column_t *get_deferred(size_t orig_idx) const { + if (!is_deferred(orig_idx)) + return nullptr; + return &deferred_columns[*deferred_map[orig_idx]]; } - /** @brief Get row ID column for a table, or nullptr if not found. */ - const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { - int idx = find_rowid_index(tid); - return (idx >= 0) ? &row_ids[idx] : nullptr; + /** @brief Get mutable deferred column provenance, or nullptr. */ + mema::deferred_column_t *get_deferred_mut(size_t orig_idx) { + if (!is_deferred(orig_idx)) + return nullptr; + return &deferred_columns[*deferred_map[orig_idx]]; } - /** @brief Get mutable row ID column for a table, or nullptr. */ - mema::rowid_column_t *get_rowid_column_mut(uint8_t tid) { - int idx = find_rowid_index(tid); - return (idx >= 0) ? &row_ids[idx] : nullptr; - } + /** @brief Number of deferred columns. */ + size_t num_deferred() const { return deferred_columns.size(); } }; /** @@ -150,19 +156,16 @@ struct DeferredInput { return 0; } - /** @brief Get list of tracked table IDs. */ - std::vector tracked_tables() const { - if (is_columnar()) { - return {table_id}; - } - return std::get(data).table_ids; - } - - /** @brief Get row ID column for a table. */ - const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { + /** + * @brief Get deferred column provenance for a column index. + * + * For columnar inputs, returns nullptr (caller must encode fresh). 
+ * For DeferredResult inputs, returns existing provenance column. + */ + const mema::deferred_column_t *get_deferred_column(size_t col_idx) const { if (is_columnar()) return nullptr; - return std::get(data).get_rowid_column(tid); + return std::get(data).get_deferred(col_idx); } }; diff --git a/include/data_model/deferred_plan.h b/include/data_model/deferred_plan.h index 46daa23..abb934f 100644 --- a/include/data_model/deferred_plan.h +++ b/include/data_model/deferred_plan.h @@ -76,7 +76,7 @@ struct DeferredScanNode { * - Which columns to materialize eagerly (join keys for parent) * - Column provenance for deferred resolution * - Pre-computed match collection mode - * - Table IDs tracked through this node + * - Number of deferred columns for allocation */ struct DeferredJoinNode { size_t node_idx; ///< Index in original Plan::nodes. @@ -95,8 +95,8 @@ struct DeferredJoinNode { /// Pre-computed collection mode (assumes build=left; flip if build=right). join::MatchCollectionMode base_collection_mode; - /// Sorted table IDs tracked through this node (union of children). - std::vector tracked_table_ids; + /// Number of deferred columns (for pre-allocation). + size_t num_deferred_columns = 0; /// Column index that parent needs as join key (nullopt if root). std::optional parent_join_key_idx; diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index 5f693a7..f4fa9c8 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -230,6 +230,88 @@ struct rowid_column_t { } }; +/** + * @brief 64-bit provenance column for deferred materialization. + * + * Stores encoded (table_id, column_idx, row_id) for each row using + * DeferredProvenance encoding. Uses 32KB pages with 4096 entries each. + * + * @see DeferredProvenance for encoding scheme. + * @see deferred_intermediate.h for DeferredResult usage. 
+ */ +struct deferred_column_t { + static constexpr size_t PAGE_SIZE = 1 << 15; // 32KB + static constexpr size_t ENTRIES_PER_PAGE = + PAGE_SIZE / sizeof(uint64_t); // 4096 + static constexpr size_t ENTRY_SHIFT = 12; // log2(4096) + static constexpr size_t ENTRY_MASK = ENTRIES_PER_PAGE - 1; + + struct alignas(PAGE_SIZE) Page { + uint64_t data[ENTRIES_PER_PAGE]; + }; + + std::vector pages; + size_t num_values = 0; + + deferred_column_t() = default; + + deferred_column_t(deferred_column_t &&other) noexcept + : pages(std::move(other.pages)), num_values(other.num_values) { + other.pages.clear(); + other.num_values = 0; + } + + deferred_column_t &operator=(deferred_column_t &&other) noexcept { + if (this != &other) { + pages = std::move(other.pages); + num_values = other.num_values; + other.pages.clear(); + other.num_values = 0; + } + return *this; + } + + deferred_column_t(const deferred_column_t &) = delete; + deferred_column_t &operator=(const deferred_column_t &) = delete; + + ~deferred_column_t() = default; + + /** @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. */ + inline uint64_t operator[](size_t idx) const { + return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK]; + } + + /** @brief Thread-safe write at idx (requires pages to be set up first). */ + inline void write_at(size_t idx, uint64_t val) { + pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK] = val; + } + + /** @brief Total value count. */ + size_t row_count() const { return num_values; } + + /** @brief Set row count without allocation (for assembly pattern). */ + inline void set_row_count(size_t count) { num_values = count; } + + /** @brief Pre-allocate pages from arena. 
*/ + inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, + size_t count) { + static_assert( + sizeof(Page) == + Contest::platform::ChunkSize< + Contest::platform::ChunkType::DEFERRED_PAGE>::value, + "Page size mismatch with DEFERRED_PAGE chunk size"); + size_t pages_needed = (count + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE; + pages.reserve(pages_needed); + for (size_t i = 0; i < pages_needed; ++i) { + void *ptr = + arena + .alloc_chunk(); + pages.push_back(reinterpret_cast(ptr)); + } + num_values = count; + } +}; + /** * @brief Convert column_t vector to ColumnarTable. Dereferences VARCHAR refs. * @see materialize.h diff --git a/include/foundation/common.h b/include/foundation/common.h index 192fe08..49967cd 100644 --- a/include/foundation/common.h +++ b/include/foundation/common.h @@ -203,4 +203,53 @@ struct GlobalRowId { static inline uint32_t row(uint32_t encoded) { return encoded & ROW_MASK; } }; +/** + * @brief 64-bit encoding for deferred column provenance. + * + * Encodes table_id, column_idx, and row_id into a single 64-bit value + * for efficient storage and resolution of deferred columns. + * + * Encoding: [table_id (8 bits)][column_idx (8 bits)][row_id (48 bits)] + * - table_id: bits 56-63 + * - column_idx: bits 48-55 + * - row_id: bits 0-47 + * + * Supports up to 256 tables, 256 columns per table, and 281 trillion rows. 
+ */ +struct DeferredProvenance { + static constexpr uint64_t ROW_BITS = 48; + static constexpr uint64_t COLUMN_BITS = 8; + static constexpr uint64_t TABLE_BITS = 8; + + static constexpr uint64_t ROW_MASK = (1ULL << ROW_BITS) - 1; + static constexpr uint64_t COLUMN_MASK = (1ULL << COLUMN_BITS) - 1; + static constexpr uint64_t COLUMN_SHIFT = ROW_BITS; + static constexpr uint64_t TABLE_SHIFT = ROW_BITS + COLUMN_BITS; + + static constexpr uint64_t MAX_TABLES = 1ULL << TABLE_BITS; // 256 + static constexpr uint64_t MAX_COLUMNS = 1ULL << COLUMN_BITS; // 256 + static constexpr uint64_t MAX_ROWS = 1ULL << ROW_BITS; // 281 trillion + + /** @brief Encode table_id, column_idx, row_id into single uint64_t. */ + static inline uint64_t encode(uint8_t table_id, uint8_t column_idx, + uint64_t row_id) { + return (static_cast(table_id) << TABLE_SHIFT) | + (static_cast(column_idx) << COLUMN_SHIFT) | + (row_id & ROW_MASK); + } + + /** @brief Extract table_id from encoded provenance. */ + static inline uint8_t table(uint64_t encoded) { + return static_cast(encoded >> TABLE_SHIFT); + } + + /** @brief Extract column_idx from encoded provenance. */ + static inline uint8_t column(uint64_t encoded) { + return static_cast((encoded >> COLUMN_SHIFT) & COLUMN_MASK); + } + + /** @brief Extract row_id from encoded provenance. */ + static inline uint64_t row(uint64_t encoded) { return encoded & ROW_MASK; } +}; + } // namespace Contest \ No newline at end of file diff --git a/include/join_execution/match_collector.h b/include/join_execution/match_collector.h index 78657b7..a4136cb 100644 --- a/include/join_execution/match_collector.h +++ b/include/join_execution/match_collector.h @@ -160,6 +160,60 @@ class ThreadLocalMatchBuffer { ChainIterator end() const { return ChainIterator(nullptr, 0); } }; + /** + * @brief Batch reader for efficient SIMD access to chunk chains. 
+ * + * Unlike ChainIterator which reads one element at a time, this reader + * provides direct pointer access to contiguous batches within chunks. + * Essential for SIMD provenance encoding in deferred materialization. + */ + class ChunkBatchReader { + IndexChunk *current_chunk; + uint32_t offset; + size_t remaining; + + public: + ChunkBatchReader(IndexChunk *chunk, size_t count) + : current_chunk(chunk), offset(0), remaining(count) {} + + /** @brief Returns true if more data is available. */ + inline bool has_more() const { return remaining > 0 && current_chunk; } + + /** + * @brief Get pointer to contiguous batch of row IDs. + * + * Returns pointer to up to max_batch contiguous elements within + * current chunk. Actual count may be less if chunk boundary reached. + * + * @param max_batch Maximum elements to return. + * @param actual_count Output: actual number of elements available. + * @return Pointer to contiguous row IDs, or nullptr if exhausted. + */ + inline const uint32_t *get_batch(size_t max_batch, + size_t &actual_count) { + if (!current_chunk || remaining == 0) { + actual_count = 0; + return nullptr; + } + + size_t available = current_chunk->count - offset; + actual_count = std::min({max_batch, remaining, available}); + const uint32_t *ptr = ¤t_chunk->ids[offset]; + + offset += static_cast(actual_count); + remaining -= actual_count; + + if (offset >= current_chunk->count && current_chunk->next) { + current_chunk = current_chunk->next; + offset = 0; + } + return ptr; + } + + /** @brief Remaining element count. */ + inline size_t count() const { return remaining; } + }; + /** @brief Returns range for iterating left (build) row IDs. */ inline ChainRange left_range() const { return ChainRange(left_head, total_count); @@ -170,6 +224,16 @@ class ThreadLocalMatchBuffer { return ChainRange(right_head, total_count); } + /** @brief Returns batch reader for left (build) row IDs. 
*/ + inline ChunkBatchReader left_batch_reader() const { + return ChunkBatchReader(left_head, total_count); + } + + /** @brief Returns batch reader for right (probe) row IDs. */ + inline ChunkBatchReader right_batch_reader() const { + return ChunkBatchReader(right_head, total_count); + } + /** @brief Returns match count in this buffer. */ size_t count() const { return total_count; } diff --git a/include/join_execution/nested_loop.h b/include/join_execution/nested_loop.h index 7646639..8546854 100644 --- a/include/join_execution/nested_loop.h +++ b/include/join_execution/nested_loop.h @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -83,6 +84,59 @@ inline void visit_rows(const JoinInput &input, size_t attr_idx, } } +/** + * @brief Iterates over non-NULL values in a deferred input column. + * + * Abstracts columnar vs DeferredResult input. For DeferredResult, reads from + * materialized columns (join keys are always materialized). + * + * @tparam Func void(uint32_t row_id, int32_t value). 
+ */ +template +inline void visit_deferred_rows(const DeferredInput &input, size_t attr_idx, + Func &&visitor) { + if (input.is_columnar()) { + auto *table = std::get(input.data); + auto [col_idx, _] = input.node->output_attrs[attr_idx]; + const Column &col = table->columns[col_idx]; + + uint32_t row_id = 0; + for (auto *page_obj : col.pages) { + auto *page = page_obj->data; + auto num_rows = *reinterpret_cast(page); + auto num_values = *reinterpret_cast(page + 2); + auto *data = reinterpret_cast(page + 4); + + uint16_t val_idx = 0; + for (uint16_t i = 0; i < num_rows; i++) { + if (num_rows == num_values) { + visitor(row_id++, data[i]); + } else { + auto *bitmap = reinterpret_cast( + page + PAGE_SIZE - (num_rows + 7) / 8); + if (bitmap[i / 8] & (1u << (i % 8))) { + visitor(row_id, data[val_idx++]); + } + row_id++; + } + } + } + } else { + const auto &res = std::get(input.data); + // Join key must be materialized + const mema::column_t *col = res.get_materialized(attr_idx); + if (!col) + return; // Should not happen - join keys are always materialized + size_t count = col->row_count(); + for (size_t i = 0; i < count; i++) { + const mema::value_t &val = (*col)[i]; + if (!val.is_null()) { + visitor(static_cast(i), val.value); + } + } + } +} + /** * @brief Nested loop join for small build tables (<=8 rows). * @@ -239,4 +293,170 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, return buffers; } +/** + * @brief Nested loop join for deferred execution path. + * + * Same algorithm as nested_loop_join but works with DeferredInput. + * Supports both columnar and DeferredResult inputs. + * + * @tparam Mode Collection mode (BOTH, LEFT_ONLY, RIGHT_ONLY). + * @return Thread-local match buffers for direct iteration. 
+ */ +template +inline std::vector> +nested_loop_join_deferred(const DeferredInput &build_input, + const DeferredInput &probe_input, size_t build_attr, + size_t probe_attr) { + size_t build_rows = build_input.row_count(build_attr); + size_t probe_rows = probe_input.row_count(probe_attr); + + if (build_rows == 0 || probe_rows == 0) + return {}; + + size_t num_threads = THREAD_COUNT; + std::vector> buffers(num_threads); + + constexpr size_t MAX_BUILD_SIZE = 8; + alignas(32) int32_t b_vals[MAX_BUILD_SIZE]; + alignas(16) uint32_t b_ids[MAX_BUILD_SIZE]; + size_t b_count = 0; + + auto collect_build = [&](uint32_t id, int32_t val) { + if (b_count < MAX_BUILD_SIZE) { + b_ids[b_count] = id; + b_vals[b_count] = val; + b_count++; + } + }; + + visit_deferred_rows(build_input, build_attr, collect_build); + + for (size_t i = b_count; i < MAX_BUILD_SIZE; ++i) { + b_vals[i] = INT32_MIN; + } + + // Setup for columnar probe (page-based parallel processing) + const Column *probe_col = nullptr; + platform::ArenaVector page_offsets( + Contest::platform::get_arena(0)); + if (probe_input.is_columnar()) { + auto *table = std::get(probe_input.data); + auto [col_idx, _] = probe_input.node->output_attrs[probe_attr]; + probe_col = &table->columns[col_idx]; + + page_offsets.reserve(probe_col->pages.size() + 1); + uint32_t current = 0; + for (auto *p : probe_col->pages) { + page_offsets.push_back(current); + current += *reinterpret_cast(p->data); + } + page_offsets.push_back(current); + } + + // Setup for DeferredResult probe + const mema::column_t *probe_mat_col = nullptr; + if (!probe_input.is_columnar()) { + const auto &res = std::get(probe_input.data); + probe_mat_col = res.get_materialized(probe_attr); + if (!probe_mat_col) + return {}; // Join key not materialized - should not happen + } + + std::atomic probe_page_counter{0}; + + worker_pool().execute([&](size_t t_id) { + buffers[t_id] = + ThreadLocalMatchBuffer(Contest::platform::get_arena(t_id)); + auto &local_buffer = buffers[t_id]; + 
+ auto process_value = [&](uint32_t p_id, int32_t p_val) { + simd::eq_scan_build(p_id, p_val, b_vals, b_ids, b_count, + local_buffer); + }; + + if (probe_input.is_columnar()) { + size_t num_pages = probe_col->pages.size(); + + while (true) { + size_t i = + probe_page_counter.fetch_add(1, std::memory_order_relaxed); + + if (i >= num_pages) + break; + auto *page = probe_col->pages[i]->data; + auto num_rows = *reinterpret_cast(page); + auto num_values = *reinterpret_cast(page + 2); + auto *data = reinterpret_cast(page + 4); + uint32_t row_id = page_offsets[i]; + + if (num_rows == num_values) { + // SIMD batch: process multiple probe values at a time + uint16_t j = simd::eq_batch_columnar( + data, num_rows, row_id, b_vals, b_ids, b_count, + local_buffer); + row_id += j; + // Handle remaining elements with scalar + for (; j < num_rows; j++) { + process_value(row_id++, data[j]); + } + } else { + auto *bitmap = reinterpret_cast( + page + PAGE_SIZE - (num_rows + 7) / 8); + uint16_t val_idx = 0; + for (uint16_t j = 0; j < num_rows; j++) { + if (bitmap[j / 8] & (1u << (j % 8))) { + process_value(row_id, data[val_idx++]); + } + row_id++; + } + } + } + } else { + // DeferredResult probe - use materialized column + const mema::column_t &col = *probe_mat_col; + size_t count = col.row_count(); + size_t start = (t_id * count) / THREAD_COUNT; + size_t end = ((t_id + 1) * count) / THREAD_COUNT; + + constexpr size_t BATCH_SIZE = simd::INTERMEDIATE_BATCH_SIZE; + size_t i = start; + + if constexpr (BATCH_SIZE > 0) { + // SIMD batch processing + for (; i + BATCH_SIZE <= end; i += BATCH_SIZE) { + size_t page_idx = i >> 12; + size_t offset = i & 0xFFF; + + // Only use SIMD if all values are on same page + if (offset + BATCH_SIZE <= mema::CAP_PER_PAGE) { + const int32_t *vals = reinterpret_cast( + &col.pages[page_idx]->data[offset]); + simd::eq_batch_intermediate( + vals, i, b_vals, b_ids, b_count, local_buffer); + } else { + // Cross-page boundary: fall back to scalar + for (size_t j = i; 
j < i + BATCH_SIZE; j++) { + const mema::value_t &val = col[j]; + if (!val.is_null()) { + process_value(static_cast(j), + val.value); + } + } + } + } + } + + // Handle remaining elements (or all elements if no SIMD) + for (; i < end; i++) { + const mema::value_t &val = col[i]; + if (!val.is_null()) { + process_value(static_cast(i), val.value); + } + } + } + }); + + return buffers; +} + } // namespace Contest::join diff --git a/include/materialization/construct_deferred.h b/include/materialization/construct_deferred.h index 5ba8b3e..bb9a425 100644 --- a/include/materialization/construct_deferred.h +++ b/include/materialization/construct_deferred.h @@ -3,8 +3,14 @@ * @brief Constructs deferred intermediate results for multi-way joins. * * Allocates and populates DeferredResult with only MATERIALIZE columns - * (typically just the parent's join key). Row ID columns are always - * populated for provenance tracking. + * (typically just the parent's join key). Deferred columns store 64-bit + * provenance (table_id, column_idx, row_id) for resolution at final output. + * + * Optimized with: + * - Column-major iteration for cache locality + * - Precomputed source metadata to avoid per-row variant access + * - SIMD provenance encoding (AVX2/NEON) for deferred columns + * - Batch access to match collector chunks * * @see construct_intermediate.h for the eager materialization equivalent. * @see materialize_deferred.h for final resolution of deferred columns. 
@@ -23,6 +29,12 @@ #include #include +#if defined(__x86_64__) +#include +#elif defined(__aarch64__) +#include +#endif + namespace Contest { namespace materialize { @@ -32,6 +44,164 @@ using Contest::join::ThreadLocalMatchBuffer; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; +// ============================================================================ +// SIMD Provenance Encoding +// ============================================================================ + +namespace simd_provenance { + +#if defined(__x86_64__) && defined(__AVX2__) +inline constexpr size_t BATCH_SIZE = 4; ///< 4 x uint64_t in AVX2 (256-bit) +#elif defined(__aarch64__) +inline constexpr size_t BATCH_SIZE = 2; ///< 2 x uint64_t in NEON (128-bit) +#else +inline constexpr size_t BATCH_SIZE = 0; ///< No SIMD available +#endif + +/** + * @brief Encode provenance for batch of row IDs using SIMD. + * + * Encodes (table_id << 56) | (column_idx << 48) | row_id for each row. + * Uses AVX2 on x86_64 or NEON on aarch64, with scalar fallback. 
+ * + * @param dest Destination deferred column + * @param start_idx Starting output index + * @param row_ids Pointer to row IDs (from IndexChunk, contiguous) + * @param count Number of row IDs to process + * @param table_id Base table ID (constant for all rows) + * @param column_idx Base column index (constant for all rows) + * @return Number of rows processed (always == count) + */ +inline size_t encode_provenance_batch(mema::deferred_column_t &dest, + size_t start_idx, const uint32_t *row_ids, + size_t count, uint8_t table_id, + uint8_t column_idx) { + // Precompute constant prefix: (table_id << 56) | (column_idx << 48) + const uint64_t prefix = DeferredProvenance::encode(table_id, column_idx, 0); + + size_t i = 0; + +#if defined(__x86_64__) && defined(__AVX2__) + // AVX2: Process 4 x uint64_t at a time + // Load 4 x uint32_t, zero-extend to 4 x uint64_t, OR with prefix + const __m256i prefix_vec = _mm256_set1_epi64x(static_cast(prefix)); + + for (; i + 4 <= count; i += 4) { + // Load 4 x uint32_t and zero-extend to 4 x uint64_t + __m128i rows_32 = + _mm_loadu_si128(reinterpret_cast(row_ids + i)); + __m256i rows_64 = _mm256_cvtepu32_epi64(rows_32); + + // OR with prefix to create provenance values + __m256i result = _mm256_or_si256(rows_64, prefix_vec); + + // Store to aligned buffer, then write individually (page-safe) + alignas(32) uint64_t out[4]; + _mm256_store_si256(reinterpret_cast<__m256i *>(out), result); + + dest.write_at(start_idx + i, out[0]); + dest.write_at(start_idx + i + 1, out[1]); + dest.write_at(start_idx + i + 2, out[2]); + dest.write_at(start_idx + i + 3, out[3]); + } +#elif defined(__aarch64__) + // NEON: Process 2 x uint64_t at a time + const uint64x2_t prefix_vec = vdupq_n_u64(prefix); + + for (; i + 2 <= count; i += 2) { + // Load 2 x uint32_t and zero-extend to 2 x uint64_t + uint32x2_t rows_32 = vld1_u32(row_ids + i); + uint64x2_t rows_64 = vmovl_u32(rows_32); + + // OR with prefix + uint64x2_t result = vorrq_u64(rows_64, prefix_vec); + 
+ // Store individually (page boundary safe) + dest.write_at(start_idx + i, vgetq_lane_u64(result, 0)); + dest.write_at(start_idx + i + 1, vgetq_lane_u64(result, 1)); + } +#endif + + // Scalar remainder + for (; i < count; ++i) { + dest.write_at(start_idx + i, + prefix | static_cast(row_ids[i])); + } + + return count; +} + +/** + * @brief Copy provenance from source column using batch reads. + * + * Copies existing 64-bit provenance values from child intermediate. + * Uses contiguous batch access for better cache behavior. + * + * @param dest Destination deferred column + * @param start_idx Starting output index + * @param src Source deferred column (from child) + * @param row_ids Row indices into source column + * @param count Number of rows to copy + * @return Number of rows processed (always == count) + */ +inline size_t copy_provenance_batch(mema::deferred_column_t &dest, + size_t start_idx, + const mema::deferred_column_t &src, + const uint32_t *row_ids, size_t count) { + for (size_t i = 0; i < count; ++i) { + dest.write_at(start_idx + i, src[row_ids[i]]); + } + return count; +} + +} // namespace simd_provenance + +// ============================================================================ +// Source Precomputation Structures +// ============================================================================ + +/** + * @brief Precomputed metadata for deferred column sources. + * + * Tracks where each deferred column's provenance comes from: + * - For columnar inputs: encode fresh (table_id, column_idx, row_id) + * - For DeferredResult inputs: copy existing provenance from child + */ +struct DeferredColumnSource { + const mema::deferred_column_t *source_col = + nullptr; ///< Source if from intermediate. + uint8_t base_table_id = 0; ///< Base table ID for encoding. + uint8_t base_column_idx = 0; ///< Base column index for encoding. + bool from_build = false; ///< True if from build side. + bool needs_encode = false; ///< True if columnar (needs fresh encode). 
+}; + +/** + * @brief Precomputed metadata for materialized column sources. + * + * Eliminates per-row std::variant access and conditional checks in hot loop. + * Mirrors SourceInfo from construct_intermediate.h but for deferred path. + */ +struct alignas(8) MaterializedColumnSource { + const mema::column_t *intermediate_col = + nullptr; ///< Source if from DeferredResult materialized + const Column *columnar_col = nullptr; ///< Source if from ColumnarTable + const mema::deferred_column_t *deferred_resolve_col = + nullptr; ///< Source if needs deferred resolution + size_t child_output_idx = 0; ///< Index in child's output + size_t mat_col_idx = 0; ///< Index in result.materialized[] + DataType type = DataType::INT32; + uint8_t base_table_id = 0; ///< For VARCHAR source tracking + uint8_t base_column_idx = 0; ///< For VARCHAR source tracking + bool is_columnar = false; ///< True if source is ColumnarTable + bool from_build = false; ///< True if from build side + bool needs_deferred_resolve = false; ///< True if child deferred this column +}; + +// ============================================================================ +// Helper Functions +// ============================================================================ + /** * @brief Collect columns needed from a DeferredInput for page index building. */ @@ -92,8 +262,6 @@ inline void prepare_deferred_columns( } // Mark columns needed based on materialization decisions - // from_left refers to original left child - // build_is_left tells us if build side is the left child for (const auto &col : join_node.columns) { if (col.resolution == ColumnResolution::MATERIALIZE) { bool from_build = (col.from_left == build_is_left); @@ -119,9 +287,6 @@ inline void prepare_deferred_columns( /** * @brief Create empty deferred result with proper schema. - * - * Used when total_matches == 0. Creates empty materialized columns - * for columns marked MATERIALIZE so they can be used in subsequent joins. 
*/ inline DeferredResult create_empty_deferred_result(const DeferredJoinNode &node) { @@ -129,89 +294,126 @@ create_empty_deferred_result(const DeferredJoinNode &node) { result.node_info = &node; result.num_rows = 0; result.materialized_map.resize(node.columns.size(), std::nullopt); - result.table_ids = node.tracked_table_ids; + result.deferred_map.resize(node.columns.size(), std::nullopt); - // Count and allocate empty materialized columns size_t mat_count = 0; + size_t def_count = 0; for (const auto &col : node.columns) { if (col.resolution == ColumnResolution::MATERIALIZE) { result.materialized_map[col.original_idx] = mat_count++; + } else { + result.deferred_map[col.original_idx] = def_count++; } } result.materialized.resize(mat_count); - // Each column has 0 rows, which is valid for empty result - - // Also create empty row ID columns - result.row_ids.resize(node.tracked_table_ids.size()); - for (size_t i = 0; i < node.tracked_table_ids.size(); ++i) { - result.row_ids[i].table_id = node.tracked_table_ids[i]; - } + result.deferred_columns.resize(def_count); return result; } /** - * @brief Precomputed metadata for row ID column sources. - * - * Mirrors RowIdSource from construct_intermediate.h but adapted for - * DeferredInput. + * @brief Prepare deferred column sources for intermediate construction. */ -struct DeferredRowIdSource { - const mema::rowid_column_t *source_col = - nullptr; ///< Source if from intermediate. - uint8_t table_id = 0; ///< Table ID for encoding. - bool from_build = false; ///< True if from build side. - bool needs_encode = false; ///< True if columnar (needs GlobalRowId encode). -}; +inline std::vector +prepare_deferred_sources(const DeferredJoinNode &join_node, + const DeferredInput &build_input, + const DeferredInput &probe_input, bool build_is_left) { + std::vector sources; + sources.reserve(join_node.num_deferred_columns); -/** - * @brief Prepare row ID sources for deferred intermediate construction. 
- */ -inline std::vector -prepare_deferred_rowid_sources(const std::vector &merged_table_ids, - const DeferredInput &build_input, - const DeferredInput &probe_input) { - std::vector sources; - sources.reserve(merged_table_ids.size()); - - for (uint8_t tid : merged_table_ids) { - DeferredRowIdSource src; - src.table_id = tid; - - // Check build side first - auto build_tables = build_input.tracked_tables(); - bool in_build = std::find(build_tables.begin(), build_tables.end(), - tid) != build_tables.end(); - if (in_build) { - src.from_build = true; - if (build_input.is_columnar()) { + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::DEFER) + continue; + + DeferredColumnSource src; + src.base_table_id = col.provenance.base_table_id; + src.base_column_idx = col.provenance.base_column_idx; + src.from_build = (col.from_left == build_is_left); + + const auto &src_input = src.from_build ? build_input : probe_input; + + if (src_input.is_columnar()) { + src.needs_encode = true; + src.source_col = nullptr; + } else { + const auto *child_def = + src_input.get_deferred_column(col.child_output_idx); + if (child_def) { + src.needs_encode = false; + src.source_col = child_def; + } else { src.needs_encode = true; src.source_col = nullptr; - } else { - src.needs_encode = false; - src.source_col = build_input.get_rowid_column(tid); } + } + sources.push_back(src); + } + return sources; +} + +/** + * @brief Precompute materialized column sources for column-major iteration. + * + * For each MATERIALIZE column, determines source type and caches pointers + * to avoid per-row std::variant access in the hot loop. 
+ */ +inline std::vector prepare_materialized_sources( + const DeferredJoinNode &join_node, const DeferredInput &build_input, + const DeferredInput &probe_input, bool build_is_left) { + std::vector sources; + sources.reserve(join_node.columns.size()); + + size_t mat_idx = 0; + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::MATERIALIZE) + continue; + + MaterializedColumnSource src; + src.mat_col_idx = mat_idx++; + src.child_output_idx = col.child_output_idx; + src.type = col.type; + src.base_table_id = col.provenance.base_table_id; + src.base_column_idx = col.provenance.base_column_idx; + src.from_build = (col.from_left == build_is_left); + + const auto &src_input = src.from_build ? build_input : probe_input; + + if (src_input.is_columnar()) { + src.is_columnar = true; + const auto *table = std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col.child_output_idx]; + src.columnar_col = &table->columns[actual_idx]; } else { - // Must be from probe side - src.from_build = false; - if (probe_input.is_columnar()) { - src.needs_encode = true; - src.source_col = nullptr; - } else { - src.needs_encode = false; - src.source_col = probe_input.get_rowid_column(tid); + src.is_columnar = false; + const auto &ir = std::get(src_input.data); + + if (ir.is_materialized(col.child_output_idx)) { + src.intermediate_col = + ir.get_materialized(col.child_output_idx); + } else if (ir.is_deferred(col.child_output_idx)) { + src.needs_deferred_resolve = true; + src.deferred_resolve_col = + ir.get_deferred(col.child_output_idx); } } sources.push_back(src); } + return sources; } +// ============================================================================ +// Main Construction Function +// ============================================================================ + /** * @brief Constructs deferred intermediate result from thread-local buffers. * + * Optimized with column-major iteration and SIMD provenance encoding. 
* Only materializes columns marked MATERIALIZE in the DeferredJoinNode. - * All row ID columns are populated for provenance tracking. + * Deferred columns store 64-bit provenance encoding for resolution at final + * output. * * @tparam Mode Collection mode for compile-time specialization. * @param buffers Thread-local match buffers from probe. @@ -220,11 +422,10 @@ prepare_deferred_rowid_sources(const std::vector &merged_table_ids, * @param join_node Deferred join node with materialization decisions. * @param remapped_attrs Output attributes (after build/probe remapping). * @param build_output_size Number of columns from build side. + * @param build_is_left True if build side is the original left child. * @param columnar_reader Reader for columnar data access. * @param out_result Output DeferredResult (populated in-place). - * @param merged_table_ids Sorted table IDs to track. - * @param deferred_plan Full deferred plan for base table access (deferred - * resolution). + * @param deferred_plan Full deferred plan for base table access. 
*/ template void construct_deferred_from_buffers( @@ -234,10 +435,9 @@ void construct_deferred_from_buffers( const std::vector> &remapped_attrs, size_t build_output_size, bool build_is_left, ColumnarReader &columnar_reader, DeferredResult &out_result, - const std::vector &merged_table_ids, const DeferredPlan &deferred_plan) { - // Count total matches + // Count total matches and compute buffer start offsets size_t total_matches = 0; std::vector buffer_starts(buffers.size()); for (size_t i = 0; i < buffers.size(); ++i) { @@ -250,85 +450,85 @@ void construct_deferred_from_buffers( return; } + // Initialize result metadata out_result.node_info = &join_node; out_result.num_rows = total_matches; - out_result.table_ids = merged_table_ids; - - // Build materialized_map: count MATERIALIZE columns and create mapping - // materialized_map[original_idx] -> index into out_result.materialized out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); - size_t mat_count = 0; + out_result.deferred_map.resize(join_node.columns.size(), std::nullopt); - // Iterate over join_node.columns (which uses original output order) - // and assign materialized indices to MATERIALIZE columns + size_t mat_count = 0; + size_t def_count = 0; for (const auto &col : join_node.columns) { if (col.resolution == ColumnResolution::MATERIALIZE) { out_result.materialized_map[col.original_idx] = mat_count++; + } else { + out_result.deferred_map[col.original_idx] = def_count++; } } - // Prepare row ID sources - auto rowid_sources = prepare_deferred_rowid_sources( - merged_table_ids, build_input, probe_input); - - const size_t num_rowid_cols = rowid_sources.size(); + // Precompute sources for column-major iteration + auto mat_sources = prepare_materialized_sources(join_node, build_input, + probe_input, build_is_left); + auto deferred_sources = prepare_deferred_sources( + join_node, build_input, probe_input, build_is_left); // Pre-allocate pages using Page = mema::column_t::Page; - using 
RowIdPage = mema::rowid_column_t::Page; - size_t total_pages_needed = + using DeferredPage = mema::deferred_column_t::Page; + size_t mat_pages_needed = (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; + size_t def_pages_needed = + (total_matches + mema::deferred_column_t::ENTRIES_PER_PAGE - 1) / + mema::deferred_column_t::ENTRIES_PER_PAGE; - // Allocate materialized columns out_result.materialized.resize(mat_count); for (size_t c = 0; c < mat_count; ++c) { - out_result.materialized[c].pages.resize(total_pages_needed); + out_result.materialized[c].pages.resize(mat_pages_needed); out_result.materialized[c].set_row_count(total_matches); } - // Allocate row ID columns - out_result.row_ids.resize(num_rowid_cols); - for (size_t r = 0; r < num_rowid_cols; ++r) { - out_result.row_ids[r].table_id = merged_table_ids[r]; - out_result.row_ids[r].pages.resize(total_pages_needed); - out_result.row_ids[r].set_row_count(total_matches); + out_result.deferred_columns.resize(def_count); + for (size_t d = 0; d < def_count; ++d) { + out_result.deferred_columns[d].pages.resize(def_pages_needed); + out_result.deferred_columns[d].set_row_count(total_matches); + } + + // Set source metadata for materialized columns + for (const auto &src : mat_sources) { + out_result.materialized[src.mat_col_idx].source_table = + src.base_table_id; + out_result.materialized[src.mat_col_idx].source_column = + src.base_column_idx; } - // Parallel page allocation const size_t num_threads = THREAD_COUNT; + + // Parallel page allocation worker_pool().execute([&](size_t t) { for (size_t c = 0; c < mat_count; ++c) { auto &col = out_result.materialized[c]; - for (size_t p = t; p < total_pages_needed; p += num_threads) { + for (size_t p = t; p < mat_pages_needed; p += num_threads) { void *ptr = Contest::platform::get_arena(t) .alloc_chunk(); col.pages[p] = reinterpret_cast(ptr); } } - for (size_t r = 0; r < num_rowid_cols; ++r) { - auto &rid_col = out_result.row_ids[r]; - for (size_t p = t; p < 
total_pages_needed; p += num_threads) { + for (size_t d = 0; d < def_count; ++d) { + auto &def_col = out_result.deferred_columns[d]; + for (size_t p = t; p < def_pages_needed; p += num_threads) { void *ptr = Contest::platform::get_arena(t) - .alloc_chunk(); - rid_col.pages[p] = reinterpret_cast(ptr); + .alloc_chunk< + Contest::platform::ChunkType::DEFERRED_PAGE>(); + def_col.pages[p] = reinterpret_cast(ptr); } } }); - // Set source metadata for materialized columns - for (const auto &col : join_node.columns) { - if (col.resolution == ColumnResolution::MATERIALIZE) { - size_t mat_idx = *out_result.materialized_map[col.original_idx]; - out_result.materialized[mat_idx].source_table = - col.provenance.base_table_id; - out_result.materialized[mat_idx].source_column = - col.provenance.base_column_idx; - } - } - - // Parallel population: each thread processes its own buffer + // ======================================================================== + // COLUMN-MAJOR PARALLEL POPULATION + // ======================================================================== worker_pool().execute([&](size_t t) { if (t >= buffers.size()) return; @@ -340,102 +540,105 @@ void construct_deferred_from_buffers( size_t start = buffer_starts[t]; ColumnarReader::Cursor cursor; - // Iterate through matches - auto left_it = buf.left_range().begin(); - auto right_it = buf.right_range().begin(); - - for (size_t m = 0; m < my_count; ++m) { - uint32_t build_row = 0, probe_row = 0; - - if constexpr (Mode == MatchCollectionMode::BOTH) { - build_row = *left_it; - probe_row = *right_it; - ++left_it; - ++right_it; - } else if constexpr (Mode == MatchCollectionMode::LEFT_ONLY) { - build_row = *left_it; - ++left_it; - } else { - probe_row = *right_it; - ++right_it; - } - - size_t out_row = start + m; - - // Write materialized columns - for (const auto &col : join_node.columns) { - if (col.resolution != ColumnResolution::MATERIALIZE) - continue; - - size_t mat_col_idx = - 
*out_result.materialized_map[col.original_idx]; - auto &out_col = out_result.materialized[mat_col_idx]; - - // Determine source based on from_left and build/probe mapping - // col.from_left refers to original left child - // build_is_left tells us if build side is the left child - // If from_left && build_is_left => from build - // If from_left && !build_is_left => from probe (left became - // probe) - bool from_build = (col.from_left == build_is_left); - uint32_t src_row = from_build ? build_row : probe_row; - const auto &src_input = from_build ? build_input : probe_input; - - mema::value_t val; - if (src_input.is_columnar()) { - const auto *table = - std::get(src_input.data); - auto [actual_idx, _] = - src_input.node->output_attrs[col.child_output_idx]; - val = columnar_reader.read_value( - table->columns[actual_idx], col.child_output_idx, - src_row, col.type, cursor, from_build); - } else { - const auto &ir = std::get(src_input.data); - // Check if materialized in child - const auto *src_col = - ir.get_materialized(col.child_output_idx); - if (src_col) { - val = (*src_col)[src_row]; + // ==================================================================== + // Process MATERIALIZED columns (column-major for cache locality) + // ==================================================================== + for (const auto &src : mat_sources) { + auto &dest_col = out_result.materialized[src.mat_col_idx]; + + // Get appropriate range based on which side this column comes from + auto range = src.from_build ? 
buf.left_range() : buf.right_range(); + + if (src.is_columnar) { + // Columnar source - use ColumnarReader with cursor caching + const auto &col = *src.columnar_col; + size_t k = start; + for (uint32_t rid : range) { + dest_col.write_at(k++, + columnar_reader.read_value( + col, src.child_output_idx, rid, + src.type, cursor, src.from_build)); + } + } else if (src.intermediate_col) { + // Intermediate materialized source - direct copy + const auto &vec = *src.intermediate_col; + size_t k = start; + for (uint32_t rid : range) { + dest_col.write_at(k++, vec[rid]); + } + } else if (src.needs_deferred_resolve && src.deferred_resolve_col) { + // Deferred in child - resolve via provenance + const auto &def_col = *src.deferred_resolve_col; + size_t k = start; + for (uint32_t rid : range) { + uint64_t prov = def_col[rid]; + uint8_t base_tid = DeferredProvenance::table(prov); + uint8_t base_col = DeferredProvenance::column(prov); + uint64_t base_row = DeferredProvenance::row(prov); + + if (deferred_plan.original_plan) [[likely]] { + const auto &base_table = + deferred_plan.original_plan->inputs[base_tid]; + mema::value_t val = + columnar_reader.read_value_direct_public( + base_table.columns[base_col], + static_cast(base_row), src.type); + dest_col.write_at(k++, val); } else { - // Deferred - resolve via row ID to base table - // This should only happen if materialization wasn't - // propagated properly. Use direct read as fallback. 
- const auto *rowid_col = - ir.get_rowid_column(col.provenance.base_table_id); - if (rowid_col && deferred_plan.original_plan) { - uint32_t encoded = (*rowid_col)[src_row]; - uint32_t base_row = GlobalRowId::row(encoded); - const auto &base_table = - deferred_plan.original_plan - ->inputs[col.provenance.base_table_id]; - val = columnar_reader.read_value_direct_public( - base_table - .columns[col.provenance.base_column_idx], - base_row, col.type); - } else { - val = mema::value_t{mema::value_t::NULL_VALUE}; - } + dest_col.write_at( + k++, mema::value_t{mema::value_t::NULL_VALUE}); } } - - out_col.write_at(out_row, val); } + } - // Write row ID columns - for (size_t r = 0; r < num_rowid_cols; ++r) { - const auto &rid_src = rowid_sources[r]; - auto &dest_rid_col = out_result.row_ids[r]; - - uint32_t local_idx = rid_src.from_build ? build_row : probe_row; - - if (rid_src.needs_encode) { - dest_rid_col.write_at( - out_row, - GlobalRowId::encode(rid_src.table_id, local_idx)); - } else if (rid_src.source_col) { - dest_rid_col.write_at(out_row, - (*rid_src.source_col)[local_idx]); + // ==================================================================== + // Process DEFERRED columns (column-major with SIMD batch encoding) + // ==================================================================== + for (size_t d = 0; d < deferred_sources.size(); ++d) { + const auto &def_src = deferred_sources[d]; + auto &dest_def_col = out_result.deferred_columns[d]; + + if (def_src.needs_encode) { + // Fresh encoding from columnar input - use SIMD batch + auto batch_reader = def_src.from_build + ? buf.left_batch_reader() + : buf.right_batch_reader(); + + size_t k = start; + while (batch_reader.has_more()) { + size_t batch_count; + // Request larger batches for SIMD efficiency + constexpr size_t MAX_BATCH = + simd_provenance::BATCH_SIZE > 0 ? 
64 : 256; + const uint32_t *row_ids = + batch_reader.get_batch(MAX_BATCH, batch_count); + + if (batch_count > 0) { + simd_provenance::encode_provenance_batch( + dest_def_col, k, row_ids, batch_count, + def_src.base_table_id, def_src.base_column_idx); + k += batch_count; + } + } + } else if (def_src.source_col) { + // Copy existing provenance from child intermediate + auto batch_reader = def_src.from_build + ? buf.left_batch_reader() + : buf.right_batch_reader(); + + size_t k = start; + while (batch_reader.has_more()) { + size_t batch_count; + const uint32_t *row_ids = + batch_reader.get_batch(256, batch_count); + + if (batch_count > 0) { + simd_provenance::copy_provenance_batch( + dest_def_col, k, *def_src.source_col, row_ids, + batch_count); + k += batch_count; + } } } } diff --git a/include/materialization/materialize_deferred.h b/include/materialization/materialize_deferred.h index 8b548c0..bd7a2af 100644 --- a/include/materialization/materialize_deferred.h +++ b/include/materialization/materialize_deferred.h @@ -3,7 +3,8 @@ * @brief Final materialization for deferred execution path. * * Materializes all output columns at the root join, resolving deferred - * columns by following row ID provenance back to base tables. + * columns by decoding 64-bit provenance (table_id, column_idx, row_id) back + * to base tables. * * @see construct_deferred.h for building DeferredResult intermediates. * @see materialize.h for the eager materialization equivalent. @@ -136,7 +137,7 @@ inline ColumnarTable create_empty_deferred_final( * Handles three cases: * 1. COLUMNAR_DIRECT: Input is columnar, read directly via row index * 2. MATERIALIZED: Column was materialized in DeferredResult - * 3. DEFERRED: Resolve via row ID lookup to base table + * 3. DEFERRED: Resolve via 64-bit provenance to base table * * @tparam Mode Collection mode for compile-time specialization. * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder. 
@@ -249,7 +250,8 @@ inline void materialize_deferred_column( /** * @brief Materialize single output column handling deferred resolution. * - * For deferred columns, resolves via row ID tracking back to base table. + * For deferred columns, resolves via 64-bit provenance encoding back to + * base table. * * @tparam Mode Collection mode for compile-time specialization. */ @@ -276,20 +278,13 @@ inline void materialize_single_deferred_column( } // Determine if this column comes from build or probe side at runtime - // col_info->from_left refers to original left child - // build_is_left tells us if build side is the left child - // If from_left && build_is_left => from build - // If from_left && !build_is_left => from probe (left became probe) - // If !from_left && build_is_left => from probe (right is probe) - // If !from_left && !build_is_left => from build (right became build) bool from_build = (col_info->from_left == build_is_left); const DeferredInput &src_input = from_build ? build_input : probe_input; // Determine how to read the value const Column *columnar_source = nullptr; const mema::column_t *materialized_source = nullptr; - const mema::rowid_column_t *rowid_source = nullptr; - const Column *base_table_column = nullptr; + const mema::deferred_column_t *deferred_source = nullptr; if (src_input.is_columnar()) { // Direct columnar read @@ -303,16 +298,9 @@ inline void materialize_single_deferred_column( // Read from materialized column materialized_source = ir.get_materialized(col_info->child_output_idx); - } else { - // Deferred - need to resolve via row ID - rowid_source = - ir.get_rowid_column(col_info->provenance.base_table_id); - if (deferred_plan.original_plan) { - base_table_column = - &deferred_plan.original_plan - ->inputs[col_info->provenance.base_table_id] - .columns[col_info->provenance.base_column_idx]; - } + } else if (ir.is_deferred(col_info->child_output_idx)) { + // Deferred - need to resolve via 64-bit provenance + deferred_source = 
ir.get_deferred(col_info->child_output_idx); } } @@ -325,13 +313,17 @@ inline void materialize_single_deferred_column( col_info->type, cursor, from_build); } else if (materialized_source) { return (*materialized_source)[local_row_id]; - } else if (rowid_source && base_table_column) { - // Deferred resolution: get base table row from encoded row ID - uint32_t encoded = (*rowid_source)[local_row_id]; - uint32_t base_row = GlobalRowId::row(encoded); + } else if (deferred_source && deferred_plan.original_plan) { + // Deferred resolution: decode 64-bit provenance + uint64_t prov = (*deferred_source)[local_row_id]; + uint8_t base_tid = DeferredProvenance::table(prov); + uint8_t base_col = DeferredProvenance::column(prov); + uint64_t base_row = DeferredProvenance::row(prov); + const auto &base_table = + deferred_plan.original_plan->inputs[base_tid]; return columnar_reader.read_value( - *base_table_column, col_info->provenance.base_column_idx, - base_row, col_info->type, cursor, true); + base_table.columns[base_col], base_col, + static_cast(base_row), col_info->type, cursor, true); } return mema::value_t{mema::value_t::NULL_VALUE}; }; @@ -357,8 +349,12 @@ inline void materialize_single_deferred_column( str_src_ptr = &deferred_plan.original_plan ->inputs[materialized_source->source_table] .columns[materialized_source->source_column]; - } else if (base_table_column) { - str_src_ptr = base_table_column; + } else if (deferred_source && deferred_plan.original_plan) { + // For deferred VARCHAR, get source from provenance of first row + // All rows in a deferred column share the same base table/column + str_src_ptr = &deferred_plan.original_plan + ->inputs[col_info->provenance.base_table_id] + .columns[col_info->provenance.base_column_idx]; } } @@ -383,7 +379,7 @@ inline void materialize_single_deferred_column( * @brief Materialize all output columns from deferred intermediate. * * For root join in deferred execution path. 
Resolves all deferred columns - * by following row ID provenance to base tables. + * by decoding 64-bit provenance to base tables. * * @tparam Mode Collection mode for compile-time specialization. * @param buffers Thread-local match buffers from probe. diff --git a/include/platform/arena.h b/include/platform/arena.h index f1aa32e..59d3442 100644 --- a/include/platform/arena.h +++ b/include/platform/arena.h @@ -41,12 +41,13 @@ static constexpr size_t PAGE_2MB = 2 * 1024 * 1024; * @brief Chunk type enumeration for arena regions. */ enum class ChunkType : uint8_t { - HASH_CHUNK = 0, ///< 4KB - hash table partition chunks - IR_PAGE = 1, ///< 16KB - intermediate result pages - INDEX_CHUNK = 2, ///< 32KB - match collector index chunks - GENERAL = 3, ///< Variable - misc allocations + HASH_CHUNK = 0, ///< 4KB - hash table partition chunks + IR_PAGE = 1, ///< 16KB - intermediate result pages (32-bit values) + INDEX_CHUNK = 2, ///< 32KB - match collector index chunks + DEFERRED_PAGE = 3, ///< 32KB - deferred provenance pages (64-bit values) + GENERAL = 4, ///< Variable - misc allocations - NUM_TYPES = 4 + NUM_TYPES = 5 }; // ============================================================================ @@ -67,12 +68,15 @@ template <> struct ChunkSize { template <> struct ChunkSize { static constexpr size_t value = 32768; }; +template <> struct ChunkSize { + static constexpr size_t value = 32768; +}; template <> struct ChunkSize { static constexpr size_t value = 0; }; /// Runtime chunk size array indexed by ChunkType. 
-inline constexpr size_t CHUNK_SIZES[] = {4096, 16384, 32768, 0}; +inline constexpr size_t CHUNK_SIZES[] = {4096, 16384, 32768, 32768, 0}; // ============================================================================ // Page Policies @@ -92,6 +96,7 @@ inline constexpr PagePolicy REGION_PAGE_POLICY[] = { PagePolicy::SMALL_PAGES, // HASH_CHUNK PagePolicy::HUGE_PAGES, // IR_PAGE PagePolicy::HUGE_PAGES, // INDEX_CHUNK + PagePolicy::HUGE_PAGES, // DEFERRED_PAGE PagePolicy::HUGE_PAGES, // GENERAL }; @@ -102,7 +107,7 @@ inline constexpr PagePolicy REGION_PAGE_POLICY[] = { /** * @brief Region size configuration based on available DRAM. * - * Uses 75% of SPC__NUMA_NODE_DRAM_MB, divided equally (25%) among 4 regions. + * Uses 75% of SPC__NUMA_NODE_DRAM_MB, divided equally (20%) among 5 regions. */ struct RegionConfig { size_t total_arena_bytes; @@ -113,8 +118,8 @@ struct RegionConfig { 1024ULL * 1024ULL * 3ULL / 4ULL; } - /// Get total size for a region (25% each). - size_t get(ChunkType /*ct*/) const { return total_arena_bytes / 4; } + /// Get total size for a region (20% each). + size_t get(ChunkType /*ct*/) const { return total_arena_bytes / 5; } /// Get total arena size. size_t total() const { return total_arena_bytes; } @@ -450,7 +455,8 @@ class ArenaManager { // Global Instance and Helper // ============================================================================ -/// Global arena manager instance (inline global, constructed at program startup). +/// Global arena manager instance (inline global, constructed at program +/// startup). inline ArenaManager g_arena_manager{}; /// Get thread arena by thread ID. diff --git a/src/analyze_plan.cpp b/src/analyze_plan.cpp index f0ef0a8..f511c60 100644 --- a/src/analyze_plan.cpp +++ b/src/analyze_plan.cpp @@ -135,34 +135,6 @@ compute_base_collection_mode(const std::vector &columns, return join::MatchCollectionMode::BOTH; } -/** - * @brief Collect tracked table IDs from a DeferredNode. 
- */ -std::vector get_tracked_tables(const DeferredNode &node) { - if (const auto *scan = std::get_if(&node)) { - return {scan->base_table_id}; - } - return std::get(node).tracked_table_ids; -} - -/** - * @brief Merge tracked table IDs from two children (sorted, unique). - */ -std::vector merge_table_ids(const DeferredNode &left, - const DeferredNode &right) { - auto left_ids = get_tracked_tables(left); - auto right_ids = get_tracked_tables(right); - - std::vector result; - result.reserve(left_ids.size() + right_ids.size()); - - std::merge(left_ids.begin(), left_ids.end(), right_ids.begin(), - right_ids.end(), std::back_inserter(result)); - - result.erase(std::unique(result.begin(), result.end()), result.end()); - return result; -} - } // anonymous namespace DeferredPlan analyze_plan(const Plan &plan) { @@ -264,11 +236,17 @@ DeferredPlan analyze_plan(const Plan &plan) { djoin.columns.push_back(std::move(info)); } - // Compute collection mode and tracked tables + // Compute collection mode and count deferred columns djoin.base_collection_mode = compute_base_collection_mode(djoin.columns, left_size); - djoin.tracked_table_ids = merge_table_ids( - deferred.nodes[join.left], deferred.nodes[join.right]); + + // Count deferred columns for pre-allocation + djoin.num_deferred_columns = 0; + for (const auto &col : djoin.columns) { + if (col.resolution == ColumnResolution::DEFER) { + ++djoin.num_deferred_columns; + } + } deferred.nodes[node_idx] = std::move(djoin); } @@ -305,6 +283,20 @@ DeferredPlan analyze_plan(const Plan &plan) { } } + // PASS 3: Recount num_deferred_columns after propagation + for (size_t node_idx : post_order) { + auto *djoin = std::get_if(&deferred.nodes[node_idx]); + if (!djoin) + continue; + + djoin->num_deferred_columns = 0; + for (const auto &col : djoin->columns) { + if (col.resolution == ColumnResolution::DEFER) { + ++djoin->num_deferred_columns; + } + } + } + return deferred; } diff --git a/src/execute.cpp b/src/execute.cpp index 
d8ef462..b9d45e5 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -379,54 +379,14 @@ DeferredJoinResult execute_deferred_join_with_mode( const UnchainedHashtable *hash_table, const DeferredInput &build_input, const DeferredInput &probe_input, const BuildProbeConfig &config, const DeferredJoinNode &join_node, io::ColumnarReader &columnar_reader, - const DeferredPlan &deferred_plan, - const std::vector &merged_table_ids, TimingStats &stats) { + const DeferredPlan &deferred_plan, TimingStats &stats) { std::vector> match_buffers; - // Probe phase - need to convert DeferredInput to JoinInput for probing - // For now, handle columnar probe directly if (use_nested_loop) { auto nested_loop_start = std::chrono::high_resolution_clock::now(); - // Nested loop requires JoinInput - create adapter - JoinInput build_ji, probe_ji; - build_ji.node = build_input.node; - probe_ji.node = probe_input.node; - - if (build_input.is_columnar()) { - build_ji.data = std::get(build_input.data); - build_ji.table_id = build_input.table_id; - } else { - // Convert DeferredResult to ExtendedResult for compatibility - // This is a limitation - nested loop path falls back to eager - const auto &dr = std::get(build_input.data); - ExtendedResult er; - er.columns = std::move( - const_cast &>(dr.materialized)); - er.row_ids = std::move( - const_cast &>(dr.row_ids)); - er.table_ids = dr.table_ids; - build_ji.data = std::move(er); - build_ji.table_id = 0; - } - - if (probe_input.is_columnar()) { - probe_ji.data = std::get(probe_input.data); - probe_ji.table_id = probe_input.table_id; - } else { - const auto &dr = std::get(probe_input.data); - ExtendedResult er; - er.columns = std::move( - const_cast &>(dr.materialized)); - er.row_ids = std::move( - const_cast &>(dr.row_ids)); - er.table_ids = dr.table_ids; - probe_ji.data = std::move(er); - probe_ji.table_id = 0; - } - - match_buffers = nested_loop_join( - build_ji, probe_ji, config.build_attr, config.probe_attr); + match_buffers = 
nested_loop_join_deferred( + build_input, probe_input, config.build_attr, config.probe_attr); auto nested_loop_end = std::chrono::high_resolution_clock::now(); stats.nested_loop_join_ms += std::chrono::duration_cast( @@ -508,8 +468,7 @@ DeferredJoinResult execute_deferred_join_with_mode( construct_deferred_from_buffers( match_buffers, build_input, probe_input, join_node, config.remapped_attrs, build_input.output_size(), - config.build_left, columnar_reader, result, merged_table_ids, - deferred_plan); + config.build_left, columnar_reader, result, deferred_plan); } else { result = create_empty_deferred_result(join_node); } @@ -559,15 +518,9 @@ DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, const size_t HASH_TABLE_THRESHOLD = 8; size_t build_rows = build_input.row_count(config.build_attr); - // Nested loop doesn't work with DeferredResult because it only has join - // keys materialized. Force hash join when either side is DeferredResult. - bool use_nested_loop = (build_rows < HASH_TABLE_THRESHOLD) && - build_is_columnar && probe_is_columnar; - - // Merge table IDs - auto build_tables = build_input.tracked_tables(); - auto probe_tables = probe_input.tracked_tables(); - auto merged_table_ids = merge_tracked_tables(build_tables, probe_tables); + // Use nested loop for small build tables - works with both columnar and + // DeferredResult inputs (join keys are always materialized). + bool use_nested_loop = (build_rows < HASH_TABLE_THRESHOLD); io::ColumnarReader columnar_reader; auto setup_end = std::chrono::high_resolution_clock::now(); @@ -575,10 +528,15 @@ DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, setup_end - setup_start) .count(); - // For deferred materialization, we always need BOTH row indices because - // we track provenance from both sides for deferred column resolution. - // The optimization to collect only one side's indices is not safe here. 
- MatchCollectionMode mode = MatchCollectionMode::BOTH; + // Use pre-computed collection mode from plan analysis. + // base_collection_mode assumes build=left; flip if build=right at runtime. + MatchCollectionMode mode = djoin.base_collection_mode; + if (!config.build_left) { + if (mode == MatchCollectionMode::LEFT_ONLY) + mode = MatchCollectionMode::RIGHT_ONLY; + else if (mode == MatchCollectionMode::RIGHT_ONLY) + mode = MatchCollectionMode::LEFT_ONLY; + } // Build hash table if needed std::optional hash_table; @@ -618,22 +576,19 @@ DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, return execute_deferred_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, djoin, columnar_reader, deferred_plan, - merged_table_ids, stats); + probe_input, config, djoin, columnar_reader, deferred_plan, stats); case MatchCollectionMode::LEFT_ONLY: return execute_deferred_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, djoin, columnar_reader, deferred_plan, - merged_table_ids, stats); + probe_input, config, djoin, columnar_reader, deferred_plan, stats); case MatchCollectionMode::RIGHT_ONLY: return execute_deferred_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? 
nullptr : &(*hash_table), build_input, - probe_input, config, djoin, columnar_reader, deferred_plan, - merged_table_ids, stats); + probe_input, config, djoin, columnar_reader, deferred_plan, stats); } return DeferredResult{}; From c7ccae48ab90b6f9da646e04143d8d27ae82de46 Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Thu, 22 Jan 2026 02:19:04 +0200 Subject: [PATCH 04/13] chore: integrating deferred materialization --- CMakeLists.txt | 9 - include/data_model/deferred_intermediate.h | 172 ---- include/data_model/deferred_plan.h | 36 +- include/data_model/intermediate.h | 308 +++---- include/join_execution/hash_join.h | 32 +- include/join_execution/join_setup.h | 268 +------ include/join_execution/nested_loop.h | 220 +---- include/materialization/construct_deferred.h | 649 --------------- .../materialization/construct_intermediate.h | 749 ++++++++++++------ include/materialization/materialize.h | 328 +++++--- .../materialization/materialize_deferred.h | 435 ---------- src/analyze_plan.cpp | 95 ++- src/execute.cpp | 448 ++--------- 13 files changed, 1097 insertions(+), 2652 deletions(-) delete mode 100644 include/data_model/deferred_intermediate.h delete mode 100644 include/materialization/construct_deferred.h delete mode 100644 include/materialization/materialize_deferred.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 2621d56..fc63a09 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,15 +54,6 @@ FetchContent_Declare( FetchContent_MakeAvailable(fmtlib) -set(ENABLE_SANITIZER OFF) -set(ENABLE_UBSAN OFF) - -# Deferred materialization: only materialize join keys, defer other columns -option(USE_DEFERRED_MATERIALIZATION "Enable deferred column materialization" OFF) -if(USE_DEFERRED_MATERIALIZATION) - message(STATUS "Deferred materialization ENABLED") - add_compile_definitions(USE_DEFERRED_MATERIALIZATION) -endif() if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|powerpc|ppc64|ppc64le") message("Disabling jemalloc extension of DuckDB on Power.") 
set(SKIP_EXTENSIONS jemalloc) diff --git a/include/data_model/deferred_intermediate.h b/include/data_model/deferred_intermediate.h deleted file mode 100644 index 8e183d5..0000000 --- a/include/data_model/deferred_intermediate.h +++ /dev/null @@ -1,172 +0,0 @@ -/** - * @file deferred_intermediate.h - * @brief Lightweight intermediate result for deferred materialization. - * - * DeferredResult stores only materialized columns (join keys) plus - * per-deferred-column provenance using 64-bit encoding (table_id, column_idx, - * row_id). Deferred columns are resolved at final materialization by decoding - * the provenance and reading directly from base tables. - * - * @see deferred_plan.h for DeferredJoinNode with column decisions. - * @see construct_deferred.h for building DeferredResult. - * @see materialize_deferred.h for final resolution. - */ -#pragma once - -#include -#include -#include -#include - -#include -#include - -namespace Contest { - -/** - * @brief Lightweight intermediate result with only join keys materialized. - * - * Unlike ExtendedResult which stores all projected columns, DeferredResult - * stores only columns marked MATERIALIZE (typically just the parent's join - * key). All other columns are resolved at final materialization using - * per-column 64-bit provenance (table_id, column_idx, row_id). - * - * Memory savings: For a join projecting N columns where only 1 is a join key, - * DeferredResult uses ~1/N the memory of ExtendedResult for data columns. - * Additionally, we only track provenance for deferred columns (not all tables). - * - * @see DeferredColumnInfo for materialization decisions. - * @see DeferredProvenance for 64-bit encoding scheme. - */ -struct DeferredResult { - /// Only columns marked MATERIALIZE (typically 1 join key). - std::vector materialized; - - /// Map: original column index → index in materialized (nullopt if - /// deferred). 
- std::vector> materialized_map; - - /// Per-deferred-column provenance (64-bit encoded table_id+column_idx+row). - /// One deferred_column_t per DEFER column, stores full provenance per row. - std::vector deferred_columns; - - /// Map: original column index → index in deferred_columns (nullopt if - /// materialized). - std::vector> deferred_map; - - /// Reference to node info for column provenance resolution. - const DeferredJoinNode *node_info = nullptr; - - /// Total row count. - size_t num_rows = 0; - - DeferredResult() = default; - DeferredResult(DeferredResult &&) = default; - DeferredResult &operator=(DeferredResult &&) = default; - DeferredResult(const DeferredResult &) = delete; - DeferredResult &operator=(const DeferredResult &) = delete; - - /** @brief Total row count. */ - size_t row_count() const { return num_rows; } - - /** @brief Check if column was materialized (not deferred). */ - bool is_materialized(size_t orig_idx) const { - return orig_idx < materialized_map.size() && - materialized_map[orig_idx].has_value(); - } - - /** @brief Check if column is deferred. */ - bool is_deferred(size_t orig_idx) const { - return orig_idx < deferred_map.size() && - deferred_map[orig_idx].has_value(); - } - - /** @brief Get materialized column, or nullptr if deferred. */ - const mema::column_t *get_materialized(size_t orig_idx) const { - if (!is_materialized(orig_idx)) - return nullptr; - return &materialized[*materialized_map[orig_idx]]; - } - - /** @brief Get deferred column provenance, or nullptr if materialized. */ - const mema::deferred_column_t *get_deferred(size_t orig_idx) const { - if (!is_deferred(orig_idx)) - return nullptr; - return &deferred_columns[*deferred_map[orig_idx]]; - } - - /** @brief Get mutable deferred column provenance, or nullptr. 
*/ - mema::deferred_column_t *get_deferred_mut(size_t orig_idx) { - if (!is_deferred(orig_idx)) - return nullptr; - return &deferred_columns[*deferred_map[orig_idx]]; - } - - /** @brief Number of deferred columns. */ - size_t num_deferred() const { return deferred_columns.size(); } -}; - -/** - * @brief Input abstraction for deferred execution path. - * - * Similar to JoinInput but works with DeferredResult instead of ExtendedResult. - * Provides uniform interface for columnar (base table) and deferred - * intermediate data sources. - */ -struct DeferredInput { - /// Either base table pointer or owned DeferredResult. - std::variant data; - - /// Original plan node for output_attrs mapping. - const PlanNode *node = nullptr; - - /// Deferred plan node for materialization decisions. - const DeferredNode *deferred_node = nullptr; - - /// Base table ID (for columnar inputs). - uint8_t table_id = 0; - - /** @brief True if data is columnar (base table). */ - bool is_columnar() const { - return std::holds_alternative(data); - } - - /** @brief Row count for join key column. */ - size_t row_count(size_t col_idx) const { - if (is_columnar()) { - const auto *table = std::get(data); - return table->num_rows; - } - return std::get(data).row_count(); - } - - /** @brief Total row count. */ - size_t row_count() const { - if (is_columnar()) { - const auto *table = std::get(data); - return table->num_rows; - } - return std::get(data).row_count(); - } - - /** @brief Number of output columns. */ - size_t output_size() const { - if (node) - return node->output_attrs.size(); - return 0; - } - - /** - * @brief Get deferred column provenance for a column index. - * - * For columnar inputs, returns nullptr (caller must encode fresh). - * For DeferredResult inputs, returns existing provenance column. 
- */ - const mema::deferred_column_t *get_deferred_column(size_t col_idx) const { - if (is_columnar()) - return nullptr; - return std::get(data).get_deferred(col_idx); - } -}; - -} // namespace Contest diff --git a/include/data_model/deferred_plan.h b/include/data_model/deferred_plan.h index abb934f..13be4dd 100644 --- a/include/data_model/deferred_plan.h +++ b/include/data_model/deferred_plan.h @@ -1,14 +1,14 @@ /** * @file deferred_plan.h - * @brief Analyzed plan with materialization decisions for deferred execution. + * @brief Analyzed plan with materialization decisions for execution. * - * DeferredPlan mirrors the original Plan structure but includes pre-computed + * AnalyzedPlan mirrors the original Plan structure but includes pre-computed * decisions about which columns to materialize eagerly (join keys) vs defer - * until final output. Each DeferredJoinNode tracks column provenance back to + * until final output. Each AnalyzedJoinNode tracks column provenance back to * base tables for efficient deferred resolution. * * @see analyze_plan.cpp for the analysis algorithm. - * @see deferred_intermediate.h for the runtime result format. + * @see intermediate.h for the runtime result format. */ #pragma once @@ -42,12 +42,12 @@ struct ColumnProvenance { }; /** - * @brief Complete metadata for an output column in a deferred join. + * @brief Complete metadata for an output column in a join. * * Combines materialization decision, provenance tracking, and child source * information for efficient intermediate construction and final resolution. */ -struct DeferredColumnInfo { +struct AnalyzedColumnInfo { size_t original_idx; ///< Index in node's output_attrs. DataType type; ///< INT32 or VARCHAR. @@ -59,11 +59,11 @@ struct DeferredColumnInfo { }; /** - * @brief Analyzed scan node for deferred execution. + * @brief Analyzed scan node for execution. * * Wraps a ScanNode with output attribute information. 
*/ -struct DeferredScanNode { +struct AnalyzedScanNode { size_t node_idx; ///< Index in original Plan::nodes. uint8_t base_table_id; ///< Index into Plan::inputs. std::vector> output_attrs; ///< Projected cols. @@ -72,13 +72,13 @@ struct DeferredScanNode { /** * @brief Analyzed join node with pre-computed materialization decisions. * - * Contains all information needed for deferred execution: + * Contains all information needed for execution: * - Which columns to materialize eagerly (join keys for parent) * - Column provenance for deferred resolution * - Pre-computed match collection mode * - Number of deferred columns for allocation */ -struct DeferredJoinNode { +struct AnalyzedJoinNode { size_t node_idx; ///< Index in original Plan::nodes. size_t left_child_idx; ///< Left child index in Plan::nodes. @@ -90,7 +90,7 @@ struct DeferredJoinNode { std::vector> output_attrs; /// Per-column materialization decisions and provenance. - std::vector columns; + std::vector columns; /// Pre-computed collection mode (assumes build=left; flip if build=right). join::MatchCollectionMode base_collection_mode; @@ -106,9 +106,9 @@ struct DeferredJoinNode { }; /** - * @brief Plan node variant for deferred execution. + * @brief Plan node variant for execution. */ -using DeferredNode = std::variant; +using AnalyzedNode = std::variant; /** * @brief Analyzed plan with materialization decisions. @@ -117,12 +117,12 @@ using DeferredNode = std::variant; * materialization. The original_plan pointer provides access to base tables * for value resolution. */ -struct DeferredPlan { - std::vector nodes; ///< Analyzed nodes (same indices as Plan). +struct AnalyzedPlan { + std::vector nodes; ///< Analyzed nodes (same indices as Plan). size_t root; ///< Root node index. const Plan *original_plan; ///< Non-owning reference to original plan. 
- const DeferredNode &operator[](size_t idx) const { return nodes[idx]; } + const AnalyzedNode &operator[](size_t idx) const { return nodes[idx]; } }; /** @@ -135,8 +135,8 @@ struct DeferredPlan { * 4. Pre-computed collection mode based on output columns * * @param plan Original query plan. - * @return DeferredPlan with materialization decisions. + * @return AnalyzedPlan with materialization decisions. */ -DeferredPlan analyze_plan(const Plan &plan); +AnalyzedPlan analyze_plan(const Plan &plan); } // namespace Contest diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index f4fa9c8..4a29919 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -1,18 +1,30 @@ /** * @file intermediate.h - * @brief Intermediate join format: VARCHAR as page/offset refs (no string - * copy). + * @brief Intermediate join result types and input abstraction. * - * Base tables must outlive execution. @see plan.h ColumnarTable, - * construct_intermediate.h + * Provides: + * - mema::value_t: 4-byte value encoding (INT32 direct, VARCHAR as page/offset) + * - mema::column_t: 16KB-paged column for materialized values + * - mema::deferred_column_t: 32KB-paged column for 64-bit provenance encoding + * - IntermediateResult: Lightweight result with selective materialization + * - JoinInput: Unified abstraction over columnar tables and intermediate + * results + * + * Base tables must outlive execution. + * + * @see plan.h for ColumnarTable, construct_intermediate.h for building results. + * @see deferred_plan.h for AnalyzedJoinNode with column decisions. */ #pragma once #include #include +#include #include #include +#include #include +#include #include /** @@ -20,13 +32,17 @@ * @brief Compact join intermediate: value_t (4B) + column_t (16KB pages). * * value_t: INT32 direct or VARCHAR page/offset ref. column_t: arena-allocated - * pages with write_at(). @see Contest::ExecuteResult, plan.h ColumnarTable. + * pages with write_at(). 
+ * + * @see Contest::IntermediateResult, plan.h ColumnarTable. */ namespace mema { /** * @brief 4-byte value: INT32 direct, VARCHAR packed (19-bit page + 13-bit - * offset), NULL = INT32_MIN, long string offset = 0x1FFF. Refs valid only while + * offset). + * + * NULL = INT32_MIN, long string offset = 0x1FFF. Refs valid only while * source exists. */ struct alignas(4) value_t { @@ -45,16 +61,18 @@ struct alignas(4) value_t { offset_idx = (static_cast(encoded) >> 19) & 0x1FFF; } - static constexpr int32_t LONG_STRING_OFFSET = - 0x1FFF; /**< Sentinel for long strings. */ - static constexpr int32_t NULL_VALUE = - INT32_MIN; /**< NULL sentinel for both types. */ + /** @brief Sentinel for long strings. */ + static constexpr int32_t LONG_STRING_OFFSET = 0x1FFF; + + /** @brief NULL sentinel for both types. */ + static constexpr int32_t NULL_VALUE = INT32_MIN; /** @brief Check if this value represents NULL. */ inline bool is_null() const { return value == NULL_VALUE; } }; -/** @brief Page size for intermediate results (16KB, larger than ColumnarTable). +/** + * @brief Page size for intermediate results (16KB, larger than ColumnarTable). */ constexpr size_t IR_PAGE_SIZE = 1 << 14; @@ -82,9 +100,12 @@ struct column_t { public: std::vector pages; /**< Pointers to arena-allocated pages. */ - uint8_t source_table = - 0; /**< Base table index for VARCHAR dereferencing. */ - uint8_t source_column = 0; /**< Column index within source table. */ + + /** @brief Base table index for VARCHAR dereferencing. */ + uint8_t source_table = 0; + + /** @brief Column index within source table. */ + uint8_t source_column = 0; public: column_t() = default; @@ -114,8 +135,10 @@ struct column_t { ~column_t() = default; - /** @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. No bounds - * check. */ + /** + * @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. + * @note No bounds check. 
+ */ inline const value_t &operator[](size_t idx) const { return pages[idx >> 12]->data[idx & 0xFFF]; } @@ -152,84 +175,6 @@ struct column_t { /** @brief Alias for a collection of intermediate columns. */ using Columnar = std::vector; -/** - * @brief Row ID column storing encoded global row IDs. - * - * Parallel structure to column_t but stores uint32_t (encoded table_id + - * row_id). One column per base table participating in joins up to this point. - * Uses same page size and arena allocation as column_t. - * - * @see GlobalRowId for encoding scheme, ExtendedResult for usage. - */ -struct rowid_column_t { - /** @brief Page for row ID storage: fixed array of uint32_t entries. */ - struct alignas(IR_PAGE_SIZE) Page { - uint32_t data[CAP_PER_PAGE]; - }; - - std::vector pages; ///< Pointers to arena-allocated pages. - size_t num_values = 0; ///< Total row ID count across all pages. - uint8_t table_id = 0; ///< Which base table this column tracks. - - rowid_column_t() = default; - - rowid_column_t(rowid_column_t &&other) noexcept - : pages(std::move(other.pages)), num_values(other.num_values), - table_id(other.table_id) { - other.pages.clear(); - other.num_values = 0; - } - - rowid_column_t &operator=(rowid_column_t &&other) noexcept { - if (this != &other) { - pages = std::move(other.pages); - num_values = other.num_values; - table_id = other.table_id; - other.pages.clear(); - other.num_values = 0; - } - return *this; - } - - rowid_column_t(const rowid_column_t &) = delete; - rowid_column_t &operator=(const rowid_column_t &) = delete; - - ~rowid_column_t() = default; - - /** @brief O(1) read: idx>>12 for page, idx&0xFFF for offset. */ - inline uint32_t operator[](size_t idx) const { - return pages[idx >> 12]->data[idx & 0xFFF]; - } - - /** @brief Thread-safe write at idx (requires pages to be set up first). */ - inline void write_at(size_t idx, uint32_t val) { - pages[idx >> 12]->data[idx & 0xFFF] = val; - } - - /** @brief Total row ID count. 
*/ - size_t row_count() const { return num_values; } - - /** @brief Set row count without allocation (for assembly pattern). */ - inline void set_row_count(size_t count) { num_values = count; } - - /** @brief Pre-allocate pages from arena. */ - inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, - size_t count) { - static_assert(sizeof(Page) == - Contest::platform::ChunkSize< - Contest::platform::ChunkType::IR_PAGE>::value, - "Page size mismatch with IR_PAGE chunk size"); - size_t pages_needed = (count + CAP_PER_PAGE - 1) / CAP_PER_PAGE; - pages.reserve(pages_needed); - for (size_t i = 0; i < pages_needed; ++i) { - void *ptr = - arena.alloc_chunk(); - pages.push_back(reinterpret_cast(ptr)); - } - num_values = count; - } -}; - /** * @brief 64-bit provenance column for deferred materialization. * @@ -237,7 +182,7 @@ struct rowid_column_t { * DeferredProvenance encoding. Uses 32KB pages with 4096 entries each. * * @see DeferredProvenance for encoding scheme. - * @see deferred_intermediate.h for DeferredResult usage. + * @see IntermediateResult for usage. */ struct deferred_column_t { static constexpr size_t PAGE_SIZE = 1 << 15; // 32KB @@ -312,67 +257,156 @@ struct deferred_column_t { } }; +} // namespace mema + +namespace Contest { + /** - * @brief Convert column_t vector to ColumnarTable. Dereferences VARCHAR refs. - * @see materialize.h + * @brief Lightweight intermediate result with selective materialization. + * + * Stores only columns marked MATERIALIZE (typically just the parent's join + * key). All other columns are resolved at final materialization using + * per-column 64-bit provenance (table_id, column_idx, row_id). + * + * Memory savings: For a join projecting N columns where only 1 is a join key, + * IntermediateResult uses ~1/N the memory for data columns. Additionally, we + * only track provenance for deferred columns (not all tables). + * + * @see AnalyzedColumnInfo for materialization decisions. 
+ * @see DeferredProvenance for 64-bit encoding scheme. */ -ColumnarTable to_columnar(const Columnar &table, const Plan &plan); -} /* namespace mema */ +struct IntermediateResult { + /// Only columns marked MATERIALIZE (typically 1 join key). + std::vector materialized; -/** @namespace Contest @brief Contest API. @see Plan, execute.cpp */ -namespace Contest { -/** @brief Result type for non-root joins (intermediate format). */ -using ExecuteResult = std::vector; + /// Map: original column index -> index in materialized (nullopt if + /// deferred). + std::vector> materialized_map; + + /// Per-deferred-column provenance (64-bit encoded table_id+column_idx+row). + /// One deferred_column_t per DEFER column, stores full provenance per row. + std::vector deferred_columns; + + /// Map: original column index -> index in deferred_columns (nullopt if + /// materialized). + std::vector> deferred_map; + + /// Reference to node info for column provenance resolution. + const AnalyzedJoinNode *node_info = nullptr; + + /// Total row count. + size_t num_rows = 0; + + IntermediateResult() = default; + IntermediateResult(IntermediateResult &&) = default; + IntermediateResult &operator=(IntermediateResult &&) = default; + IntermediateResult(const IntermediateResult &) = delete; + IntermediateResult &operator=(const IntermediateResult &) = delete; + + /** @brief Total row count. */ + size_t row_count() const { return num_rows; } + + /** @brief Check if column was materialized (not deferred). */ + bool is_materialized(size_t orig_idx) const { + return orig_idx < materialized_map.size() && + materialized_map[orig_idx].has_value(); + } + + /** @brief Check if column is deferred. */ + bool is_deferred(size_t orig_idx) const { + return orig_idx < deferred_map.size() && + deferred_map[orig_idx].has_value(); + } + + /** @brief Get materialized column, or nullptr if deferred. 
*/ + const mema::column_t *get_materialized(size_t orig_idx) const { + if (!is_materialized(orig_idx)) + return nullptr; + return &materialized[*materialized_map[orig_idx]]; + } + + /** @brief Get deferred column provenance, or nullptr if materialized. */ + const mema::deferred_column_t *get_deferred(size_t orig_idx) const { + if (!is_deferred(orig_idx)) + return nullptr; + return &deferred_columns[*deferred_map[orig_idx]]; + } + + /** @brief Get mutable deferred column provenance, or nullptr. */ + mema::deferred_column_t *get_deferred_mut(size_t orig_idx) { + if (!is_deferred(orig_idx)) + return nullptr; + return &deferred_columns[*deferred_map[orig_idx]]; + } + + /** @brief Number of deferred columns. */ + size_t num_deferred() const { return deferred_columns.size(); } +}; /** - * @brief Extended intermediate result with row ID tracking. + * @brief Unified abstraction over columnar tables and intermediate results. * - * Wraps ExecuteResult with parallel row ID columns that track - * which original scan rows contributed to each intermediate row. - * One rowid_column_t per base table participating in the join tree. + * Stores ColumnarTable* (base scans) or IntermediateResult (child joins). + * Provides uniform interface for columnar (base table) and intermediate + * data sources. * - * @see GlobalRowId for encoding, construct_intermediate.h for population. + * @see IntermediateResult for intermediate join results. + * @see ColumnarTable for base table storage. */ -struct ExtendedResult { - ExecuteResult columns; ///< Data columns (value_t). - std::vector row_ids; ///< One per participating table. - std::vector table_ids; ///< Which tables are tracked (sorted). +struct JoinInput { + /// Either base table pointer or owned IntermediateResult. + std::variant data; - ExtendedResult() = default; + /// Original plan node for output_attrs mapping. 
+ const PlanNode *node = nullptr; - ExtendedResult(ExtendedResult &&) = default; - ExtendedResult &operator=(ExtendedResult &&) = default; + /// Analyzed plan node for materialization decisions. + const AnalyzedNode *analyzed_node = nullptr; - ExtendedResult(const ExtendedResult &) = delete; - ExtendedResult &operator=(const ExtendedResult &) = delete; + /// Base table ID (for columnar inputs). + uint8_t table_id = 0; - /** @brief Total row count (from first data column). */ - size_t row_count() const { - return columns.empty() ? 0 : columns[0].row_count(); + /** @brief True if data is columnar (base table). */ + bool is_columnar() const { + return std::holds_alternative(data); } - /** @brief Find row ID column index for a specific table, or -1 if not - * found. */ - int find_rowid_index(uint8_t tid) const { - for (size_t i = 0; i < table_ids.size(); ++i) { - if (table_ids[i] == tid) - return static_cast(i); + /** @brief Row count for join key column. */ + size_t row_count(size_t col_idx) const { + if (is_columnar()) { + const auto *table = std::get(data); + return table->num_rows; } - return -1; + return std::get(data).row_count(); } - /** @brief Get row ID column for a specific table, or nullptr if not found. - */ - const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { - int idx = find_rowid_index(tid); - return (idx >= 0) ? &row_ids[idx] : nullptr; + /** @brief Total row count. */ + size_t row_count() const { + if (is_columnar()) { + const auto *table = std::get(data); + return table->num_rows; + } + return std::get(data).row_count(); } - /** @brief Get mutable row ID column for a specific table, or nullptr. */ - mema::rowid_column_t *get_rowid_column_mut(uint8_t tid) { - int idx = find_rowid_index(tid); - return (idx >= 0) ? &row_ids[idx] : nullptr; + /** @brief Number of output columns. 
*/ + size_t output_size() const { + if (node) + return node->output_attrs.size(); + return 0; + } + + /** + * @brief Get deferred column provenance for a column index. + * + * For columnar inputs, returns nullptr (caller must encode fresh). + * For IntermediateResult inputs, returns existing provenance column. + */ + const mema::deferred_column_t *get_deferred_column(size_t col_idx) const { + if (is_columnar()) + return nullptr; + return std::get(data).get_deferred(col_idx); } }; -} /* namespace Contest */ +} // namespace Contest diff --git a/include/join_execution/hash_join.h b/include/join_execution/hash_join.h index b2f1f00..0e75ccf 100644 --- a/include/join_execution/hash_join.h +++ b/include/join_execution/hash_join.h @@ -1,12 +1,3 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - /** * @file hash_join.h * @brief Hash join build and probe operations. @@ -19,6 +10,13 @@ * * @see hashtable.h, match_collector.h */ +#pragma once + +#include +#include +#include +#include +#include /** * @namespace Contest::join @@ -27,8 +25,6 @@ */ namespace Contest::join { -using Contest::ExecuteResult; -using Contest::ExtendedResult; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; @@ -53,16 +49,22 @@ inline UnchainedHashtable build_from_columnar(const JoinInput &input, /** * @brief Build hash table from intermediate results (column_t). * - * Uses join key column from ExecuteResult produced by prior pipeline stages. + * Uses join key column from IntermediateResult produced by prior pipeline + * stages. 
*/ inline UnchainedHashtable build_from_intermediate(const JoinInput &input, size_t attr_idx) { - const auto &result = std::get(input.data); - const auto &column = result.columns[attr_idx]; + const auto &result = std::get(input.data); + // Get the materialized column for the join key + const auto *column = result.get_materialized(attr_idx); + if (!column) { + // This should never happen - join keys must be materialized + std::abort(); + } size_t row_count = input.row_count(attr_idx); UnchainedHashtable hash_table(row_count); - hash_table.build_intermediate(column, 8); + hash_table.build_intermediate(*column, 8); return hash_table; } diff --git a/include/join_execution/join_setup.h b/include/join_execution/join_setup.h index 188873d..217995e 100644 --- a/include/join_execution/join_setup.h +++ b/include/join_execution/join_setup.h @@ -1,87 +1,24 @@ /** * @file join_setup.h - * @brief Join configuration and input abstraction. + * @brief Join configuration and build/probe side selection. * - * Provides JoinInput to abstract over columnar and intermediate data sources, - * and utilities for selecting build/probe sides and preparing output columns. + * Provides utilities for selecting build/probe sides and determining + * which row IDs to collect based on output columns. */ #pragma once -#include #include #include #include #include -#include #include /** * @namespace Contest::join - * @brief JoinInput abstraction, build/probe selection, output column setup. + * @brief Build/probe selection and collection mode determination. */ namespace Contest::join { -using Contest::ExecuteResult; -using Contest::ExtendedResult; -using Contest::io::ColumnarReader; - -/** - * @brief Unified abstraction over columnar tables and intermediate results. - * - * Stores ColumnarTable* (base scans) or ExtendedResult (child joins). Node - * provides output_attrs mapping for column resolution. 
- */ -struct JoinInput { - std::variant data; - const PlanNode *node; /**< Provides output_attrs for column mapping. */ - uint8_t table_id; /**< Source table ID for provenance tracking. */ - - /** @brief True if data is columnar (base table), false if intermediate. */ - bool is_columnar() const { - return std::holds_alternative(data); - } - - /** - * @brief Row count for a given output column. - * @param col_idx Index into node->output_attrs. - */ - size_t row_count(size_t col_idx) const { - if (is_columnar()) { - auto *table = std::get(data); - auto [actual_col_idx, _] = node->output_attrs[col_idx]; - return table->num_rows; - } else { - return std::get(data).columns[col_idx].row_count(); - } - } - - /** @brief Number of output columns. */ - size_t output_size() const { return node->output_attrs.size(); } - - /** - * @brief Get list of tables whose row IDs are tracked in this input. - * - * For columnar input: returns {table_id}. - * For intermediate: returns the tracked table_ids from ExtendedResult. - */ - std::vector tracked_tables() const { - if (is_columnar()) { - return {table_id}; - } - return std::get(data).table_ids; - } - - /** - * @brief Get row ID column for a specific table. - * @return nullptr for columnar inputs (row IDs encoded on-the-fly). - */ - const mema::rowid_column_t *get_rowid_column(uint8_t tid) const { - if (is_columnar()) - return nullptr; - return std::get(data).get_rowid_column(tid); - } -}; - /** * @brief Configuration for build/probe side assignment. * @@ -99,17 +36,6 @@ struct BuildProbeConfig { size_t probe_attr; /**< Join key index in probe's output_attrs. */ }; -/** @brief Resolves global output column index to source input. 
*/ -inline std::tuple -resolve_input_source(size_t global_idx, size_t split_point, - const JoinInput &input_a, const PlanNode &node_a, - const JoinInput &input_b, const PlanNode &node_b) { - if (global_idx < split_point) { - return {input_a, node_a, global_idx}; - } - return {input_b, node_b, global_idx - split_point}; -} - /** * @brief Chooses build/probe sides based on cardinality. * @@ -180,190 +106,4 @@ inline MatchCollectionMode determine_collection_mode( return MatchCollectionMode::BOTH; } -/** - * @brief Creates output columns with provenance metadata from inputs. - */ -inline ExtendedResult initialize_output_columns( - const std::vector> &output_attrs, - const PlanNode &left_node, const PlanNode &right_node, - const JoinInput &left_input, const JoinInput &right_input, - size_t estimated_rows) { - ExtendedResult results; - results.columns.reserve(output_attrs.size()); - size_t left_size = left_input.output_size(); - - auto set_column_metadata = [](mema::column_t &col, const JoinInput &input, - const PlanNode &node, size_t col_idx) { - auto [actual_col_idx, _] = node.output_attrs[col_idx]; - if (input.is_columnar()) { - col.source_table = input.table_id; - col.source_column = actual_col_idx; - } else { - const auto &result = std::get(input.data); - col.source_table = result.columns[col_idx].source_table; - col.source_column = result.columns[col_idx].source_column; - } - }; - - for (size_t i = 0; i < output_attrs.size(); ++i) { - auto [col_idx, _] = output_attrs[i]; - auto [input, node, local_idx] = resolve_input_source( - col_idx, left_size, left_input, left_node, right_input, right_node); - - mema::column_t col; - set_column_metadata(col, input, node, local_idx); - results.columns.push_back(std::move(col)); - } - - return results; -} - -/** - * @brief Join output state and columnar reader. - * - * prepared flag implements lazy PageIndex construction. - */ -struct JoinSetup { - ExtendedResult results; /**< Output columns + row ID columns. 
*/ - ColumnarReader - columnar_reader; /**< Page cursor caching for columnar access. */ - std::vector merged_table_ids; /**< Tables tracked in output. */ - /** - * True after prepare_output_columns called. - */ - bool prepared; - - JoinSetup() : prepared(false) {} -}; - -/** - * @brief Merge tracked table IDs from build and probe (sorted, unique). - * - * Both input vectors must be sorted. Output is sorted and deduplicated. - */ -inline std::vector -merge_tracked_tables(const std::vector &build_tables, - const std::vector &probe_tables) { - std::vector merged; - merged.reserve(build_tables.size() + probe_tables.size()); - - size_t i = 0, j = 0; - while (i < build_tables.size() && j < probe_tables.size()) { - if (build_tables[i] < probe_tables[j]) { - merged.push_back(build_tables[i++]); - } else if (probe_tables[j] < build_tables[i]) { - merged.push_back(probe_tables[j++]); - } else { - merged.push_back(build_tables[i++]); - j++; // Skip duplicate - } - } - while (i < build_tables.size()) - merged.push_back(build_tables[i++]); - while (j < probe_tables.size()) - merged.push_back(probe_tables[j++]); - - return merged; -} - -/** - * @brief Initializes JoinSetup with output columns; call before join execution. - * - * PageIndex construction deferred to prepare_output_columns(). - * Computes merged table IDs from build and probe inputs. 
- */ -inline JoinSetup -setup_join(const JoinInput &build_input, const JoinInput &probe_input, - const PlanNode &build_node, const PlanNode &probe_node, - const PlanNode &left_node, const PlanNode &right_node, - const JoinInput &left_input, const JoinInput &right_input, - const std::vector> &output_attrs, - size_t estimated_rows) { - JoinSetup setup; - - setup.results = - initialize_output_columns(output_attrs, left_node, right_node, - left_input, right_input, estimated_rows); - - // Compute merged table IDs from build and probe sides - auto build_tables = build_input.tracked_tables(); - auto probe_tables = probe_input.tracked_tables(); - setup.merged_table_ids = merge_tracked_tables(build_tables, probe_tables); - - setup.prepared = false; - - return setup; -} - -/** - * @brief Collects Column pointers for needed output columns from columnar - * input. - * - * Unused columns get nullptr to skip PageIndex construction. - */ -inline platform::ArenaVector -collect_needed_columns(const JoinInput &input, const PlanNode &node, - const platform::ArenaVector &needed, - platform::ThreadArena &arena) { - platform::ArenaVector columns(arena); - columns.resize(node.output_attrs.size()); - std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); - auto *table = std::get(input.data); - - for (size_t i = 0; i < node.output_attrs.size(); ++i) { - auto [actual_col_idx, _] = node.output_attrs[i]; - columns[i] = needed[i] ? &table->columns[actual_col_idx] : nullptr; - } - return columns; -} - -/** - * @brief Prepares ColumnarReader with columns needed for materialization. - * - * Triggers lazy PageIndex construction only for projected columns. 
- */ -inline void prepare_output_columns( - ColumnarReader &reader, const JoinInput &build_input, - const JoinInput &probe_input, const PlanNode &build_node, - const PlanNode &probe_node, - const std::vector> &remapped_attrs, - size_t build_size) { - - bool build_is_columnar = build_input.is_columnar(); - bool probe_is_columnar = probe_input.is_columnar(); - - if (!build_is_columnar && !probe_is_columnar) - return; - - auto &arena = Contest::platform::get_arena(0); - - platform::ArenaVector build_needed(arena); - build_needed.resize(build_node.output_attrs.size()); - std::memset(build_needed.data(), 0, build_needed.size()); - - platform::ArenaVector probe_needed(arena); - probe_needed.resize(probe_node.output_attrs.size()); - std::memset(probe_needed.data(), 0, probe_needed.size()); - - for (const auto &[col_idx, dtype] : remapped_attrs) { - if (col_idx < build_size) { - if (build_is_columnar) { - build_needed[col_idx] = 1; - } - } else if (probe_is_columnar) { - probe_needed[col_idx - build_size] = 1; - } - } - - if (build_is_columnar) { - reader.prepare_build(collect_needed_columns(build_input, build_node, - build_needed, arena)); - } - - if (probe_is_columnar) { - reader.prepare_probe(collect_needed_columns(probe_input, probe_node, - probe_needed, arena)); - } -} - } // namespace Contest::join diff --git a/include/join_execution/nested_loop.h b/include/join_execution/nested_loop.h index 8546854..c99a8be 100644 --- a/include/join_execution/nested_loop.h +++ b/include/join_execution/nested_loop.h @@ -13,10 +13,8 @@ #include #include -#include #include #include -#include #include #include #include @@ -29,9 +27,6 @@ */ namespace Contest::join { -using Contest::ExtendedResult; - -using Contest::ExecuteResult; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; @@ -39,6 +34,8 @@ using Contest::platform::worker_pool; * @brief Iterates over non-NULL values in a join input column. * * Abstracts columnar vs intermediate input. 
Handles NULL bitmaps. + * For IntermediateResult, reads from materialized columns (join keys are + * always materialized). * * @tparam Func void(uint32_t row_id, int32_t value). */ @@ -72,57 +69,7 @@ inline void visit_rows(const JoinInput &input, size_t attr_idx, } } } else { - const auto &res = std::get(input.data); - const mema::column_t &col = res.columns[attr_idx]; - size_t count = col.row_count(); - for (size_t i = 0; i < count; i++) { - const mema::value_t &val = col[i]; - if (!val.is_null()) { - visitor(static_cast(i), val.value); - } - } - } -} - -/** - * @brief Iterates over non-NULL values in a deferred input column. - * - * Abstracts columnar vs DeferredResult input. For DeferredResult, reads from - * materialized columns (join keys are always materialized). - * - * @tparam Func void(uint32_t row_id, int32_t value). - */ -template -inline void visit_deferred_rows(const DeferredInput &input, size_t attr_idx, - Func &&visitor) { - if (input.is_columnar()) { - auto *table = std::get(input.data); - auto [col_idx, _] = input.node->output_attrs[attr_idx]; - const Column &col = table->columns[col_idx]; - - uint32_t row_id = 0; - for (auto *page_obj : col.pages) { - auto *page = page_obj->data; - auto num_rows = *reinterpret_cast(page); - auto num_values = *reinterpret_cast(page + 2); - auto *data = reinterpret_cast(page + 4); - - uint16_t val_idx = 0; - for (uint16_t i = 0; i < num_rows; i++) { - if (num_rows == num_values) { - visitor(row_id++, data[i]); - } else { - auto *bitmap = reinterpret_cast( - page + PAGE_SIZE - (num_rows + 7) / 8); - if (bitmap[i / 8] & (1u << (i % 8))) { - visitor(row_id, data[val_idx++]); - } - row_id++; - } - } - } - } else { - const auto &res = std::get(input.data); + const auto &res = std::get(input.data); // Join key must be materialized const mema::column_t *col = res.get_materialized(attr_idx); if (!col) @@ -180,161 +127,6 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, b_vals[i] = INT32_MIN; } 
- const Column *probe_col = nullptr; - platform::ArenaVector page_offsets( - Contest::platform::get_arena(0)); - if (probe_input.is_columnar()) { - auto *table = std::get(probe_input.data); - auto [col_idx, _] = probe_input.node->output_attrs[probe_attr]; - probe_col = &table->columns[col_idx]; - - page_offsets.reserve(probe_col->pages.size() + 1); - uint32_t current = 0; - for (auto *p : probe_col->pages) { - page_offsets.push_back(current); - current += *reinterpret_cast(p->data); - } - page_offsets.push_back(current); - } - std::atomic probe_page_counter{0}; - - worker_pool().execute([&](size_t t_id) { - buffers[t_id] = - ThreadLocalMatchBuffer(Contest::platform::get_arena(t_id)); - auto &local_buffer = buffers[t_id]; - - auto process_value = [&](uint32_t p_id, int32_t p_val) { - simd::eq_scan_build(p_id, p_val, b_vals, b_ids, b_count, - local_buffer); - }; - - if (probe_input.is_columnar()) { - size_t num_pages = probe_col->pages.size(); - - while (true) { - size_t i = - probe_page_counter.fetch_add(1, std::memory_order_relaxed); - - if (i >= num_pages) - break; - auto *page = probe_col->pages[i]->data; - auto num_rows = *reinterpret_cast(page); - auto num_values = *reinterpret_cast(page + 2); - auto *data = reinterpret_cast(page + 4); - uint32_t row_id = page_offsets[i]; - - if (num_rows == num_values) { - // SIMD batch: process multiple probe values at a time - uint16_t j = simd::eq_batch_columnar( - data, num_rows, row_id, b_vals, b_ids, b_count, - local_buffer); - row_id += j; - // Handle remaining elements with scalar - for (; j < num_rows; j++) { - process_value(row_id++, data[j]); - } - } else { - auto *bitmap = reinterpret_cast( - page + PAGE_SIZE - (num_rows + 7) / 8); - uint16_t val_idx = 0; - for (uint16_t j = 0; j < num_rows; j++) { - if (bitmap[j / 8] & (1u << (j % 8))) { - process_value(row_id, data[val_idx++]); - } - row_id++; - } - } - } - } else { - const auto &res = std::get(probe_input.data); - const mema::column_t &col = 
res.columns[probe_attr]; - size_t count = col.row_count(); - size_t start = (t_id * count) / THREAD_COUNT; - size_t end = ((t_id + 1) * count) / THREAD_COUNT; - - constexpr size_t BATCH_SIZE = simd::INTERMEDIATE_BATCH_SIZE; - size_t i = start; - - if constexpr (BATCH_SIZE > 0) { - // SIMD batch processing - for (; i + BATCH_SIZE <= end; i += BATCH_SIZE) { - size_t page_idx = i >> 12; - size_t offset = i & 0xFFF; - - // Only use SIMD if all values are on same page - if (offset + BATCH_SIZE <= mema::CAP_PER_PAGE) { - const int32_t *vals = reinterpret_cast( - &col.pages[page_idx]->data[offset]); - simd::eq_batch_intermediate( - vals, i, b_vals, b_ids, b_count, local_buffer); - } else { - // Cross-page boundary: fall back to scalar - for (size_t j = i; j < i + BATCH_SIZE; j++) { - const mema::value_t &val = col[j]; - if (!val.is_null()) { - process_value(static_cast(j), - val.value); - } - } - } - } - } - - // Handle remaining elements (or all elements if no SIMD) - for (; i < end; i++) { - const mema::value_t &val = col[i]; - if (!val.is_null()) { - process_value(static_cast(i), val.value); - } - } - } - }); - - return buffers; -} - -/** - * @brief Nested loop join for deferred execution path. - * - * Same algorithm as nested_loop_join but works with DeferredInput. - * Supports both columnar and DeferredResult inputs. - * - * @tparam Mode Collection mode (BOTH, LEFT_ONLY, RIGHT_ONLY). - * @return Thread-local match buffers for direct iteration. 
- */ -template -inline std::vector> -nested_loop_join_deferred(const DeferredInput &build_input, - const DeferredInput &probe_input, size_t build_attr, - size_t probe_attr) { - size_t build_rows = build_input.row_count(build_attr); - size_t probe_rows = probe_input.row_count(probe_attr); - - if (build_rows == 0 || probe_rows == 0) - return {}; - - size_t num_threads = THREAD_COUNT; - std::vector> buffers(num_threads); - - constexpr size_t MAX_BUILD_SIZE = 8; - alignas(32) int32_t b_vals[MAX_BUILD_SIZE]; - alignas(16) uint32_t b_ids[MAX_BUILD_SIZE]; - size_t b_count = 0; - - auto collect_build = [&](uint32_t id, int32_t val) { - if (b_count < MAX_BUILD_SIZE) { - b_ids[b_count] = id; - b_vals[b_count] = val; - b_count++; - } - }; - - visit_deferred_rows(build_input, build_attr, collect_build); - - for (size_t i = b_count; i < MAX_BUILD_SIZE; ++i) { - b_vals[i] = INT32_MIN; - } - // Setup for columnar probe (page-based parallel processing) const Column *probe_col = nullptr; platform::ArenaVector page_offsets( @@ -353,10 +145,10 @@ nested_loop_join_deferred(const DeferredInput &build_input, page_offsets.push_back(current); } - // Setup for DeferredResult probe + // Setup for IntermediateResult probe const mema::column_t *probe_mat_col = nullptr; if (!probe_input.is_columnar()) { - const auto &res = std::get(probe_input.data); + const auto &res = std::get(probe_input.data); probe_mat_col = res.get_materialized(probe_attr); if (!probe_mat_col) return {}; // Join key not materialized - should not happen @@ -412,7 +204,7 @@ nested_loop_join_deferred(const DeferredInput &build_input, } } } else { - // DeferredResult probe - use materialized column + // IntermediateResult probe - use materialized column const mema::column_t &col = *probe_mat_col; size_t count = col.row_count(); size_t start = (t_id * count) / THREAD_COUNT; diff --git a/include/materialization/construct_deferred.h b/include/materialization/construct_deferred.h deleted file mode 100644 index bb9a425..0000000 
--- a/include/materialization/construct_deferred.h +++ /dev/null @@ -1,649 +0,0 @@ -/** - * @file construct_deferred.h - * @brief Constructs deferred intermediate results for multi-way joins. - * - * Allocates and populates DeferredResult with only MATERIALIZE columns - * (typically just the parent's join key). Deferred columns store 64-bit - * provenance (table_id, column_idx, row_id) for resolution at final output. - * - * Optimized with: - * - Column-major iteration for cache locality - * - Precomputed source metadata to avoid per-row variant access - * - SIMD provenance encoding (AVX2/NEON) for deferred columns - * - Batch access to match collector chunks - * - * @see construct_intermediate.h for the eager materialization equivalent. - * @see materialize_deferred.h for final resolution of deferred columns. - */ -#pragma once - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#if defined(__x86_64__) -#include -#elif defined(__aarch64__) -#include -#endif - -namespace Contest { -namespace materialize { - -using Contest::io::ColumnarReader; -using Contest::join::MatchCollectionMode; -using Contest::join::ThreadLocalMatchBuffer; -using Contest::platform::THREAD_COUNT; -using Contest::platform::worker_pool; - -// ============================================================================ -// SIMD Provenance Encoding -// ============================================================================ - -namespace simd_provenance { - -#if defined(__x86_64__) && defined(__AVX2__) -inline constexpr size_t BATCH_SIZE = 4; ///< 4 x uint64_t in AVX2 (256-bit) -#elif defined(__aarch64__) -inline constexpr size_t BATCH_SIZE = 2; ///< 2 x uint64_t in NEON (128-bit) -#else -inline constexpr size_t BATCH_SIZE = 0; ///< No SIMD available -#endif - -/** - * @brief Encode provenance for batch of row IDs using SIMD. - * - * Encodes (table_id << 56) | (column_idx << 48) | row_id for each row. 
- * Uses AVX2 on x86_64 or NEON on aarch64, with scalar fallback. - * - * @param dest Destination deferred column - * @param start_idx Starting output index - * @param row_ids Pointer to row IDs (from IndexChunk, contiguous) - * @param count Number of row IDs to process - * @param table_id Base table ID (constant for all rows) - * @param column_idx Base column index (constant for all rows) - * @return Number of rows processed (always == count) - */ -inline size_t encode_provenance_batch(mema::deferred_column_t &dest, - size_t start_idx, const uint32_t *row_ids, - size_t count, uint8_t table_id, - uint8_t column_idx) { - // Precompute constant prefix: (table_id << 56) | (column_idx << 48) - const uint64_t prefix = DeferredProvenance::encode(table_id, column_idx, 0); - - size_t i = 0; - -#if defined(__x86_64__) && defined(__AVX2__) - // AVX2: Process 4 x uint64_t at a time - // Load 4 x uint32_t, zero-extend to 4 x uint64_t, OR with prefix - const __m256i prefix_vec = _mm256_set1_epi64x(static_cast(prefix)); - - for (; i + 4 <= count; i += 4) { - // Load 4 x uint32_t and zero-extend to 4 x uint64_t - __m128i rows_32 = - _mm_loadu_si128(reinterpret_cast(row_ids + i)); - __m256i rows_64 = _mm256_cvtepu32_epi64(rows_32); - - // OR with prefix to create provenance values - __m256i result = _mm256_or_si256(rows_64, prefix_vec); - - // Store to aligned buffer, then write individually (page-safe) - alignas(32) uint64_t out[4]; - _mm256_store_si256(reinterpret_cast<__m256i *>(out), result); - - dest.write_at(start_idx + i, out[0]); - dest.write_at(start_idx + i + 1, out[1]); - dest.write_at(start_idx + i + 2, out[2]); - dest.write_at(start_idx + i + 3, out[3]); - } -#elif defined(__aarch64__) - // NEON: Process 2 x uint64_t at a time - const uint64x2_t prefix_vec = vdupq_n_u64(prefix); - - for (; i + 2 <= count; i += 2) { - // Load 2 x uint32_t and zero-extend to 2 x uint64_t - uint32x2_t rows_32 = vld1_u32(row_ids + i); - uint64x2_t rows_64 = vmovl_u32(rows_32); - - // OR 
with prefix - uint64x2_t result = vorrq_u64(rows_64, prefix_vec); - - // Store individually (page boundary safe) - dest.write_at(start_idx + i, vgetq_lane_u64(result, 0)); - dest.write_at(start_idx + i + 1, vgetq_lane_u64(result, 1)); - } -#endif - - // Scalar remainder - for (; i < count; ++i) { - dest.write_at(start_idx + i, - prefix | static_cast(row_ids[i])); - } - - return count; -} - -/** - * @brief Copy provenance from source column using batch reads. - * - * Copies existing 64-bit provenance values from child intermediate. - * Uses contiguous batch access for better cache behavior. - * - * @param dest Destination deferred column - * @param start_idx Starting output index - * @param src Source deferred column (from child) - * @param row_ids Row indices into source column - * @param count Number of rows to copy - * @return Number of rows processed (always == count) - */ -inline size_t copy_provenance_batch(mema::deferred_column_t &dest, - size_t start_idx, - const mema::deferred_column_t &src, - const uint32_t *row_ids, size_t count) { - for (size_t i = 0; i < count; ++i) { - dest.write_at(start_idx + i, src[row_ids[i]]); - } - return count; -} - -} // namespace simd_provenance - -// ============================================================================ -// Source Precomputation Structures -// ============================================================================ - -/** - * @brief Precomputed metadata for deferred column sources. - * - * Tracks where each deferred column's provenance comes from: - * - For columnar inputs: encode fresh (table_id, column_idx, row_id) - * - For DeferredResult inputs: copy existing provenance from child - */ -struct DeferredColumnSource { - const mema::deferred_column_t *source_col = - nullptr; ///< Source if from intermediate. - uint8_t base_table_id = 0; ///< Base table ID for encoding. - uint8_t base_column_idx = 0; ///< Base column index for encoding. - bool from_build = false; ///< True if from build side. 
- bool needs_encode = false; ///< True if columnar (needs fresh encode). -}; - -/** - * @brief Precomputed metadata for materialized column sources. - * - * Eliminates per-row std::variant access and conditional checks in hot loop. - * Mirrors SourceInfo from construct_intermediate.h but for deferred path. - */ -struct alignas(8) MaterializedColumnSource { - const mema::column_t *intermediate_col = - nullptr; ///< Source if from DeferredResult materialized - const Column *columnar_col = nullptr; ///< Source if from ColumnarTable - const mema::deferred_column_t *deferred_resolve_col = - nullptr; ///< Source if needs deferred resolution - size_t child_output_idx = 0; ///< Index in child's output - size_t mat_col_idx = 0; ///< Index in result.materialized[] - DataType type = DataType::INT32; - uint8_t base_table_id = 0; ///< For VARCHAR source tracking - uint8_t base_column_idx = 0; ///< For VARCHAR source tracking - bool is_columnar = false; ///< True if source is ColumnarTable - bool from_build = false; ///< True if from build side - bool needs_deferred_resolve = false; ///< True if child deferred this column -}; - -// ============================================================================ -// Helper Functions -// ============================================================================ - -/** - * @brief Collect columns needed from a DeferredInput for page index building. 
- */ -inline platform::ArenaVector -collect_deferred_columns(const DeferredInput &input, - const platform::ArenaVector &needed, - platform::ThreadArena &arena) { - platform::ArenaVector columns(arena); - if (!input.node) - return columns; - - columns.resize(input.node->output_attrs.size()); - std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); - - if (!input.is_columnar()) - return columns; - - auto *table = std::get(input.data); - for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { - if (i < needed.size() && needed[i]) { - auto [actual_col_idx, _] = input.node->output_attrs[i]; - columns[i] = &table->columns[actual_col_idx]; - } - } - return columns; -} - -/** - * @brief Prepare ColumnarReader for deferred materialization path. - * - * Sets up page indices for columns that need to be read from columnar inputs. - */ -inline void prepare_deferred_columns( - ColumnarReader &reader, const DeferredInput &build_input, - const DeferredInput &probe_input, const DeferredJoinNode &join_node, - const std::vector> &remapped_attrs, - size_t build_size, bool build_is_left) { - - bool build_is_columnar = build_input.is_columnar(); - bool probe_is_columnar = probe_input.is_columnar(); - - if (!build_is_columnar && !probe_is_columnar) - return; - - auto &arena = Contest::platform::get_arena(0); - - // Determine which columns from each side are needed - platform::ArenaVector build_needed(arena); - if (build_input.node) { - build_needed.resize(build_input.node->output_attrs.size()); - std::memset(build_needed.data(), 0, build_needed.size()); - } - - platform::ArenaVector probe_needed(arena); - if (probe_input.node) { - probe_needed.resize(probe_input.node->output_attrs.size()); - std::memset(probe_needed.data(), 0, probe_needed.size()); - } - - // Mark columns needed based on materialization decisions - for (const auto &col : join_node.columns) { - if (col.resolution == ColumnResolution::MATERIALIZE) { - bool from_build = (col.from_left == 
build_is_left); - if (from_build && col.child_output_idx < build_needed.size()) { - build_needed[col.child_output_idx] = 1; - } else if (!from_build && - col.child_output_idx < probe_needed.size()) { - probe_needed[col.child_output_idx] = 1; - } - } - } - - if (build_is_columnar) { - reader.prepare_build( - collect_deferred_columns(build_input, build_needed, arena)); - } - - if (probe_is_columnar) { - reader.prepare_probe( - collect_deferred_columns(probe_input, probe_needed, arena)); - } -} - -/** - * @brief Create empty deferred result with proper schema. - */ -inline DeferredResult -create_empty_deferred_result(const DeferredJoinNode &node) { - DeferredResult result; - result.node_info = &node; - result.num_rows = 0; - result.materialized_map.resize(node.columns.size(), std::nullopt); - result.deferred_map.resize(node.columns.size(), std::nullopt); - - size_t mat_count = 0; - size_t def_count = 0; - for (const auto &col : node.columns) { - if (col.resolution == ColumnResolution::MATERIALIZE) { - result.materialized_map[col.original_idx] = mat_count++; - } else { - result.deferred_map[col.original_idx] = def_count++; - } - } - result.materialized.resize(mat_count); - result.deferred_columns.resize(def_count); - - return result; -} - -/** - * @brief Prepare deferred column sources for intermediate construction. - */ -inline std::vector -prepare_deferred_sources(const DeferredJoinNode &join_node, - const DeferredInput &build_input, - const DeferredInput &probe_input, bool build_is_left) { - std::vector sources; - sources.reserve(join_node.num_deferred_columns); - - for (const auto &col : join_node.columns) { - if (col.resolution != ColumnResolution::DEFER) - continue; - - DeferredColumnSource src; - src.base_table_id = col.provenance.base_table_id; - src.base_column_idx = col.provenance.base_column_idx; - src.from_build = (col.from_left == build_is_left); - - const auto &src_input = src.from_build ? 
build_input : probe_input; - - if (src_input.is_columnar()) { - src.needs_encode = true; - src.source_col = nullptr; - } else { - const auto *child_def = - src_input.get_deferred_column(col.child_output_idx); - if (child_def) { - src.needs_encode = false; - src.source_col = child_def; - } else { - src.needs_encode = true; - src.source_col = nullptr; - } - } - sources.push_back(src); - } - return sources; -} - -/** - * @brief Precompute materialized column sources for column-major iteration. - * - * For each MATERIALIZE column, determines source type and caches pointers - * to avoid per-row std::variant access in the hot loop. - */ -inline std::vector prepare_materialized_sources( - const DeferredJoinNode &join_node, const DeferredInput &build_input, - const DeferredInput &probe_input, bool build_is_left) { - std::vector sources; - sources.reserve(join_node.columns.size()); - - size_t mat_idx = 0; - for (const auto &col : join_node.columns) { - if (col.resolution != ColumnResolution::MATERIALIZE) - continue; - - MaterializedColumnSource src; - src.mat_col_idx = mat_idx++; - src.child_output_idx = col.child_output_idx; - src.type = col.type; - src.base_table_id = col.provenance.base_table_id; - src.base_column_idx = col.provenance.base_column_idx; - src.from_build = (col.from_left == build_is_left); - - const auto &src_input = src.from_build ? 
build_input : probe_input; - - if (src_input.is_columnar()) { - src.is_columnar = true; - const auto *table = std::get(src_input.data); - auto [actual_idx, _] = - src_input.node->output_attrs[col.child_output_idx]; - src.columnar_col = &table->columns[actual_idx]; - } else { - src.is_columnar = false; - const auto &ir = std::get(src_input.data); - - if (ir.is_materialized(col.child_output_idx)) { - src.intermediate_col = - ir.get_materialized(col.child_output_idx); - } else if (ir.is_deferred(col.child_output_idx)) { - src.needs_deferred_resolve = true; - src.deferred_resolve_col = - ir.get_deferred(col.child_output_idx); - } - } - sources.push_back(src); - } - - return sources; -} - -// ============================================================================ -// Main Construction Function -// ============================================================================ - -/** - * @brief Constructs deferred intermediate result from thread-local buffers. - * - * Optimized with column-major iteration and SIMD provenance encoding. - * Only materializes columns marked MATERIALIZE in the DeferredJoinNode. - * Deferred columns store 64-bit provenance encoding for resolution at final - * output. - * - * @tparam Mode Collection mode for compile-time specialization. - * @param buffers Thread-local match buffers from probe. - * @param build_input Build side data source. - * @param probe_input Probe side data source. - * @param join_node Deferred join node with materialization decisions. - * @param remapped_attrs Output attributes (after build/probe remapping). - * @param build_output_size Number of columns from build side. - * @param build_is_left True if build side is the original left child. - * @param columnar_reader Reader for columnar data access. - * @param out_result Output DeferredResult (populated in-place). - * @param deferred_plan Full deferred plan for base table access. 
- */ -template -void construct_deferred_from_buffers( - std::vector> &buffers, - const DeferredInput &build_input, const DeferredInput &probe_input, - const DeferredJoinNode &join_node, - const std::vector> &remapped_attrs, - size_t build_output_size, bool build_is_left, - ColumnarReader &columnar_reader, DeferredResult &out_result, - const DeferredPlan &deferred_plan) { - - // Count total matches and compute buffer start offsets - size_t total_matches = 0; - std::vector buffer_starts(buffers.size()); - for (size_t i = 0; i < buffers.size(); ++i) { - buffer_starts[i] = total_matches; - total_matches += buffers[i].count(); - } - - if (total_matches == 0) { - out_result = create_empty_deferred_result(join_node); - return; - } - - // Initialize result metadata - out_result.node_info = &join_node; - out_result.num_rows = total_matches; - out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); - out_result.deferred_map.resize(join_node.columns.size(), std::nullopt); - - size_t mat_count = 0; - size_t def_count = 0; - for (const auto &col : join_node.columns) { - if (col.resolution == ColumnResolution::MATERIALIZE) { - out_result.materialized_map[col.original_idx] = mat_count++; - } else { - out_result.deferred_map[col.original_idx] = def_count++; - } - } - - // Precompute sources for column-major iteration - auto mat_sources = prepare_materialized_sources(join_node, build_input, - probe_input, build_is_left); - auto deferred_sources = prepare_deferred_sources( - join_node, build_input, probe_input, build_is_left); - - // Pre-allocate pages - using Page = mema::column_t::Page; - using DeferredPage = mema::deferred_column_t::Page; - size_t mat_pages_needed = - (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; - size_t def_pages_needed = - (total_matches + mema::deferred_column_t::ENTRIES_PER_PAGE - 1) / - mema::deferred_column_t::ENTRIES_PER_PAGE; - - out_result.materialized.resize(mat_count); - for (size_t c = 0; c < mat_count; ++c) { - 
out_result.materialized[c].pages.resize(mat_pages_needed); - out_result.materialized[c].set_row_count(total_matches); - } - - out_result.deferred_columns.resize(def_count); - for (size_t d = 0; d < def_count; ++d) { - out_result.deferred_columns[d].pages.resize(def_pages_needed); - out_result.deferred_columns[d].set_row_count(total_matches); - } - - // Set source metadata for materialized columns - for (const auto &src : mat_sources) { - out_result.materialized[src.mat_col_idx].source_table = - src.base_table_id; - out_result.materialized[src.mat_col_idx].source_column = - src.base_column_idx; - } - - const size_t num_threads = THREAD_COUNT; - - // Parallel page allocation - worker_pool().execute([&](size_t t) { - for (size_t c = 0; c < mat_count; ++c) { - auto &col = out_result.materialized[c]; - for (size_t p = t; p < mat_pages_needed; p += num_threads) { - void *ptr = - Contest::platform::get_arena(t) - .alloc_chunk(); - col.pages[p] = reinterpret_cast(ptr); - } - } - for (size_t d = 0; d < def_count; ++d) { - auto &def_col = out_result.deferred_columns[d]; - for (size_t p = t; p < def_pages_needed; p += num_threads) { - void *ptr = - Contest::platform::get_arena(t) - .alloc_chunk< - Contest::platform::ChunkType::DEFERRED_PAGE>(); - def_col.pages[p] = reinterpret_cast(ptr); - } - } - }); - - // ======================================================================== - // COLUMN-MAJOR PARALLEL POPULATION - // ======================================================================== - worker_pool().execute([&](size_t t) { - if (t >= buffers.size()) - return; - auto &buf = buffers[t]; - size_t my_count = buf.count(); - if (my_count == 0) - return; - - size_t start = buffer_starts[t]; - ColumnarReader::Cursor cursor; - - // ==================================================================== - // Process MATERIALIZED columns (column-major for cache locality) - // ==================================================================== - for (const auto &src : 
mat_sources) { - auto &dest_col = out_result.materialized[src.mat_col_idx]; - - // Get appropriate range based on which side this column comes from - auto range = src.from_build ? buf.left_range() : buf.right_range(); - - if (src.is_columnar) { - // Columnar source - use ColumnarReader with cursor caching - const auto &col = *src.columnar_col; - size_t k = start; - for (uint32_t rid : range) { - dest_col.write_at(k++, - columnar_reader.read_value( - col, src.child_output_idx, rid, - src.type, cursor, src.from_build)); - } - } else if (src.intermediate_col) { - // Intermediate materialized source - direct copy - const auto &vec = *src.intermediate_col; - size_t k = start; - for (uint32_t rid : range) { - dest_col.write_at(k++, vec[rid]); - } - } else if (src.needs_deferred_resolve && src.deferred_resolve_col) { - // Deferred in child - resolve via provenance - const auto &def_col = *src.deferred_resolve_col; - size_t k = start; - for (uint32_t rid : range) { - uint64_t prov = def_col[rid]; - uint8_t base_tid = DeferredProvenance::table(prov); - uint8_t base_col = DeferredProvenance::column(prov); - uint64_t base_row = DeferredProvenance::row(prov); - - if (deferred_plan.original_plan) [[likely]] { - const auto &base_table = - deferred_plan.original_plan->inputs[base_tid]; - mema::value_t val = - columnar_reader.read_value_direct_public( - base_table.columns[base_col], - static_cast(base_row), src.type); - dest_col.write_at(k++, val); - } else { - dest_col.write_at( - k++, mema::value_t{mema::value_t::NULL_VALUE}); - } - } - } - } - - // ==================================================================== - // Process DEFERRED columns (column-major with SIMD batch encoding) - // ==================================================================== - for (size_t d = 0; d < deferred_sources.size(); ++d) { - const auto &def_src = deferred_sources[d]; - auto &dest_def_col = out_result.deferred_columns[d]; - - if (def_src.needs_encode) { - // Fresh encoding from columnar 
input - use SIMD batch - auto batch_reader = def_src.from_build - ? buf.left_batch_reader() - : buf.right_batch_reader(); - - size_t k = start; - while (batch_reader.has_more()) { - size_t batch_count; - // Request larger batches for SIMD efficiency - constexpr size_t MAX_BATCH = - simd_provenance::BATCH_SIZE > 0 ? 64 : 256; - const uint32_t *row_ids = - batch_reader.get_batch(MAX_BATCH, batch_count); - - if (batch_count > 0) { - simd_provenance::encode_provenance_batch( - dest_def_col, k, row_ids, batch_count, - def_src.base_table_id, def_src.base_column_idx); - k += batch_count; - } - } - } else if (def_src.source_col) { - // Copy existing provenance from child intermediate - auto batch_reader = def_src.from_build - ? buf.left_batch_reader() - : buf.right_batch_reader(); - - size_t k = start; - while (batch_reader.has_more()) { - size_t batch_count; - const uint32_t *row_ids = - batch_reader.get_batch(256, batch_count); - - if (batch_count > 0) { - simd_provenance::copy_provenance_batch( - dest_def_col, k, *def_src.source_col, row_ids, - batch_count); - k += batch_count; - } - } - } - } - }); -} - -} // namespace materialize -} // namespace Contest diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index 090863f..ed1834d 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -2,156 +2,345 @@ * @file construct_intermediate.h * @brief Constructs intermediate results for multi-way joins. * - * Allocates and populates ExecuteResult (column_t) from match collectors. - * Templated on MatchCollectionMode for zero-overhead mode selection. + * Allocates and populates IntermediateResult with only MATERIALIZE columns + * (typically just the parent's join key). Deferred columns store 64-bit + * provenance (table_id, column_idx, row_id) for resolution at final output. 
+ * + * Optimized with: + * - Column-major iteration for cache locality + * - Precomputed source metadata to avoid per-row variant access + * - SIMD provenance encoding (AVX2/NEON) for deferred columns + * - Batch access to match collector chunks + * + * @see materialize.h for final resolution of deferred columns. */ #pragma once +#include +#include + #include +#include #include -#include -#include +#include #include #include #include -#include -/** - * @namespace Contest::materialize - * @brief Materialization of join results into columnar format. - * - * @see intermediate.h for column_t/value_t format details. - */ -namespace Contest::materialize { -using Contest::ExecuteResult; -using Contest::ExtendedResult; -using Contest::GlobalRowId; +#if defined(__x86_64__) +#include +#elif defined(__aarch64__) +#include +#endif + +namespace Contest { +namespace materialize { + using Contest::io::ColumnarReader; -using Contest::join::JoinInput; using Contest::join::MatchCollectionMode; using Contest::join::ThreadLocalMatchBuffer; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; +// ============================================================================ +// SIMD Provenance Encoding +// ============================================================================ + +namespace simd_provenance { + +#if defined(__x86_64__) && defined(__AVX2__) +inline constexpr size_t BATCH_SIZE = 4; ///< 4 x uint64_t in AVX2 (256-bit) +#elif defined(__aarch64__) +inline constexpr size_t BATCH_SIZE = 2; ///< 2 x uint64_t in NEON (128-bit) +#else +inline constexpr size_t BATCH_SIZE = 0; ///< No SIMD available +#endif + /** - * @brief Precomputed metadata for resolving an output column's source. + * @brief Encode provenance for batch of row IDs using SIMD. * - * Avoids per-value std::variant accesses and tuple lookups in hot loop. - * 8-byte alignment optimizes struct packing for vector iteration. 
+ * Encodes (table_id << 56) | (column_idx << 48) | row_id for each row. + * Uses AVX2 on x86_64 or NEON on aarch64, with scalar fallback. * - * @see prepare_sources() for precomputation logic. + * @param dest Destination deferred column + * @param start_idx Starting output index + * @param row_ids Pointer to row IDs (from IndexChunk, contiguous) + * @param count Number of row IDs to process + * @param table_id Base table ID (constant for all rows) + * @param column_idx Base column index (constant for all rows) + * @return Number of rows processed (always == count) */ -struct alignas(8) SourceInfo { - const mema::column_t *intermediate_col = - nullptr; /**< Source if intermediate. */ - const Column *columnar_col = nullptr; /**< Source if columnar. */ - size_t remapped_col_idx = 0; /**< Local index within source side. */ - bool is_columnar = false; /**< True if source is columnar table. */ - bool from_build = false; /**< True if from build side, false if probe. */ -}; +inline size_t encode_provenance_batch(mema::deferred_column_t &dest, + size_t start_idx, const uint32_t *row_ids, + size_t count, uint8_t table_id, + uint8_t column_idx) { + // Precompute constant prefix: (table_id << 56) | (column_idx << 48) + const uint64_t prefix = DeferredProvenance::encode(table_id, column_idx, 0); + + size_t i = 0; + +#if defined(__x86_64__) && defined(__AVX2__) + // AVX2: Process 4 x uint64_t at a time + // Load 4 x uint32_t, zero-extend to 4 x uint64_t, OR with prefix + const __m256i prefix_vec = _mm256_set1_epi64x(static_cast(prefix)); + + for (; i + 4 <= count; i += 4) { + // Load 4 x uint32_t and zero-extend to 4 x uint64_t + __m128i rows_32 = + _mm_loadu_si128(reinterpret_cast(row_ids + i)); + __m256i rows_64 = _mm256_cvtepu32_epi64(rows_32); + + // OR with prefix to create provenance values + __m256i result = _mm256_or_si256(rows_64, prefix_vec); + + // Store to aligned buffer, then write individually (page-safe) + alignas(32) uint64_t out[4]; + 
_mm256_store_si256(reinterpret_cast<__m256i *>(out), result); + + dest.write_at(start_idx + i, out[0]); + dest.write_at(start_idx + i + 1, out[1]); + dest.write_at(start_idx + i + 2, out[2]); + dest.write_at(start_idx + i + 3, out[3]); + } +#elif defined(__aarch64__) + // NEON: Process 2 x uint64_t at a time + const uint64x2_t prefix_vec = vdupq_n_u64(prefix); + + for (; i + 2 <= count; i += 2) { + // Load 2 x uint32_t and zero-extend to 2 x uint64_t + uint32x2_t rows_32 = vld1_u32(row_ids + i); + uint64x2_t rows_64 = vmovl_u32(rows_32); + + // OR with prefix + uint64x2_t result = vorrq_u64(rows_64, prefix_vec); + + // Store individually (page boundary safe) + dest.write_at(start_idx + i, vgetq_lane_u64(result, 0)); + dest.write_at(start_idx + i + 1, vgetq_lane_u64(result, 1)); + } +#endif + + // Scalar remainder + for (; i < count; ++i) { + dest.write_at(start_idx + i, + prefix | static_cast(row_ids[i])); + } + + return count; +} /** - * @brief Builds SourceInfo for each output column for fast hot-loop lookup. + * @brief Copy provenance from source column using batch reads. * - * @param remapped_attrs Output column specifications (global indexing). - * @param build_input Build side data (ColumnarTable* or ExtendedResult). - * @param probe_input Probe side data (ColumnarTable* or ExtendedResult). - * @param build_node PlanNode for build side (contains output_attrs). - * @param probe_node PlanNode for probe side (contains output_attrs). - * @param build_size Number of columns from build side. - * @return Vector of SourceInfo, one per output column. + * Copies existing 64-bit provenance values from child intermediate. + * Uses contiguous batch access for better cache behavior. * - * @see SourceInfo for field documentation. - * @see construct_intermediate() for consumption in hot loop. 
+ * @param dest Destination deferred column + * @param start_idx Starting output index + * @param src Source deferred column (from child) + * @param row_ids Row indices into source column + * @param count Number of rows to copy + * @return Number of rows processed (always == count) */ -inline std::vector -prepare_sources(const std::vector> &remapped_attrs, - const JoinInput &build_input, const JoinInput &probe_input, - const PlanNode &build_node, const PlanNode &probe_node, - size_t build_size) { - std::vector sources; - sources.reserve(remapped_attrs.size()); - for (const auto &[col_idx, _] : remapped_attrs) { - SourceInfo info; - info.from_build = (col_idx < build_size); - size_t local_idx = info.from_build ? col_idx : col_idx - build_size; - info.remapped_col_idx = local_idx; - const JoinInput &input = info.from_build ? build_input : probe_input; - const PlanNode &node = info.from_build ? build_node : probe_node; - if (input.is_columnar()) { - info.is_columnar = true; - auto *table = std::get(input.data); - auto [actual_idx, _] = node.output_attrs[local_idx]; - info.columnar_col = &table->columns[actual_idx]; - } else { - info.is_columnar = false; - const auto &res = std::get(input.data); - info.intermediate_col = &res.columns[local_idx]; - } - sources.push_back(info); +inline size_t copy_provenance_batch(mema::deferred_column_t &dest, + size_t start_idx, + const mema::deferred_column_t &src, + const uint32_t *row_ids, size_t count) { + for (size_t i = 0; i < count; ++i) { + dest.write_at(start_idx + i, src[row_ids[i]]); } - return sources; + return count; } +} // namespace simd_provenance + +// ============================================================================ +// Source Precomputation Structures +// ============================================================================ + /** - * @brief Precomputed metadata for resolving a row ID column's source. + * @brief Precomputed metadata for deferred column sources. 
* - * Determines how to populate each output row ID column: - * - For columnar input: encode GlobalRowId on-the-fly from local index - * - For intermediate input: copy from existing rowid_column_t + * Tracks where each deferred column's provenance comes from: + * - For columnar inputs: encode fresh (table_id, column_idx, row_id) + * - For IntermediateResult inputs: copy existing provenance from child + */ +struct DeferredColumnSource { + const mema::deferred_column_t *source_col = + nullptr; ///< Source if from intermediate. + uint8_t base_table_id = 0; ///< Base table ID for encoding. + uint8_t base_column_idx = 0; ///< Base column index for encoding. + bool from_build = false; ///< True if from build side. + bool needs_encode = false; ///< True if columnar (needs fresh encode). +}; + +/** + * @brief Precomputed metadata for materialized column sources. * - * @see prepare_rowid_sources() for precomputation logic. + * Eliminates per-row std::variant access and conditional checks in hot loop. */ -struct alignas(8) RowIdSource { - const mema::rowid_column_t *source_col = - nullptr; /**< Source if from intermediate (else encode). */ - uint8_t table_id = 0; /**< Table ID for encoding/lookup. */ - bool from_build = false; /**< True if from build side, false if probe. */ - bool needs_encode = - false; /**< True if columnar (needs GlobalRowId encode). 
*/ +struct alignas(8) MaterializedColumnSource { + const mema::column_t *intermediate_col = + nullptr; ///< Source if from IntermediateResult materialized + const Column *columnar_col = nullptr; ///< Source if from ColumnarTable + const mema::deferred_column_t *deferred_resolve_col = + nullptr; ///< Source if needs deferred resolution + size_t child_output_idx = 0; ///< Index in child's output + size_t mat_col_idx = 0; ///< Index in result.materialized[] + DataType type = DataType::INT32; + uint8_t base_table_id = 0; ///< For VARCHAR source tracking + uint8_t base_column_idx = 0; ///< For VARCHAR source tracking + bool is_columnar = false; ///< True if source is ColumnarTable + bool from_build = false; ///< True if from build side + bool needs_deferred_resolve = false; ///< True if child deferred this column }; +// ============================================================================ +// Helper Functions +// ============================================================================ + /** - * @brief Builds RowIdSource for each output row ID column. + * @brief Collect columns needed from a JoinInput for page index building. + */ +inline platform::ArenaVector +collect_input_columns(const JoinInput &input, + const platform::ArenaVector &needed, + platform::ThreadArena &arena) { + platform::ArenaVector columns(arena); + if (!input.node) + return columns; + + columns.resize(input.node->output_attrs.size()); + std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); + + if (!input.is_columnar()) + return columns; + + auto *table = std::get(input.data); + for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { + if (i < needed.size() && needed[i]) { + auto [actual_col_idx, _] = input.node->output_attrs[i]; + columns[i] = &table->columns[actual_col_idx]; + } + } + return columns; +} + +/** + * @brief Prepare ColumnarReader for intermediate construction. * - * @param merged_table_ids Sorted, unique table IDs to track in output. 
- * @param build_input Build side data (ColumnarTable* or ExtendedResult). - * @param probe_input Probe side data (ColumnarTable* or ExtendedResult). - * @return Vector of RowIdSource, one per tracked table. + * Sets up page indices for columns that need to be read from columnar inputs. */ -inline std::vector -prepare_rowid_sources(const std::vector &merged_table_ids, - const JoinInput &build_input, - const JoinInput &probe_input) { - std::vector sources; - sources.reserve(merged_table_ids.size()); - - for (uint8_t tid : merged_table_ids) { - RowIdSource src; - src.table_id = tid; - - // Check build side first - auto build_tables = build_input.tracked_tables(); - bool in_build = std::find(build_tables.begin(), build_tables.end(), - tid) != build_tables.end(); - if (in_build) { - src.from_build = true; - if (build_input.is_columnar()) { - src.needs_encode = true; - src.source_col = nullptr; - } else { - src.needs_encode = false; - src.source_col = build_input.get_rowid_column(tid); +inline void prepare_intermediate_columns( + ColumnarReader &reader, const JoinInput &build_input, + const JoinInput &probe_input, const AnalyzedJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_size, bool build_is_left) { + + bool build_is_columnar = build_input.is_columnar(); + bool probe_is_columnar = probe_input.is_columnar(); + + if (!build_is_columnar && !probe_is_columnar) + return; + + auto &arena = Contest::platform::get_arena(0); + + // Determine which columns from each side are needed + platform::ArenaVector build_needed(arena); + if (build_input.node) { + build_needed.resize(build_input.node->output_attrs.size()); + std::memset(build_needed.data(), 0, build_needed.size()); + } + + platform::ArenaVector probe_needed(arena); + if (probe_input.node) { + probe_needed.resize(probe_input.node->output_attrs.size()); + std::memset(probe_needed.data(), 0, probe_needed.size()); + } + + // Mark columns needed based on materialization decisions + for (const auto 
&col : join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && + col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; } + } + } + + if (build_is_columnar) { + reader.prepare_build( + collect_input_columns(build_input, build_needed, arena)); + } + + if (probe_is_columnar) { + reader.prepare_probe( + collect_input_columns(probe_input, probe_needed, arena)); + } +} + +/** + * @brief Create empty intermediate result with proper schema. + */ +inline IntermediateResult +create_empty_intermediate_result(const AnalyzedJoinNode &node) { + IntermediateResult result; + result.node_info = &node; + result.num_rows = 0; + result.materialized_map.resize(node.columns.size(), std::nullopt); + result.deferred_map.resize(node.columns.size(), std::nullopt); + + size_t mat_count = 0; + size_t def_count = 0; + for (const auto &col : node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + result.materialized_map[col.original_idx] = mat_count++; + } else { + result.deferred_map[col.original_idx] = def_count++; + } + } + result.materialized.resize(mat_count); + result.deferred_columns.resize(def_count); + + return result; +} + +/** + * @brief Prepare deferred column sources for intermediate construction. 
+ */ +inline std::vector +prepare_deferred_sources(const AnalyzedJoinNode &join_node, + const JoinInput &build_input, + const JoinInput &probe_input, bool build_is_left) { + std::vector sources; + sources.reserve(join_node.num_deferred_columns); + + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::DEFER) + continue; + + DeferredColumnSource src; + src.base_table_id = col.provenance.base_table_id; + src.base_column_idx = col.provenance.base_column_idx; + src.from_build = (col.from_left == build_is_left); + + const auto &src_input = src.from_build ? build_input : probe_input; + + if (src_input.is_columnar()) { + src.needs_encode = true; + src.source_col = nullptr; } else { - // Must be from probe side - src.from_build = false; - if (probe_input.is_columnar()) { + const auto *child_def = + src_input.get_deferred_column(col.child_output_idx); + if (child_def) { + src.needs_encode = false; + src.source_col = child_def; + } else { src.needs_encode = true; src.source_col = nullptr; - } else { - src.needs_encode = false; - src.source_col = probe_input.get_rowid_column(tid); } } sources.push_back(src); @@ -160,34 +349,93 @@ prepare_rowid_sources(const std::vector &merged_table_ids, } /** - * @brief Constructs intermediate results directly from thread-local buffers. + * @brief Precompute materialized column sources for column-major iteration. * - * Each thread iterates its own buffer, avoiding the merge step. Total matches - * computed by summing buffer counts. Each thread writes its contiguous portion - * of output pages. Also populates row ID columns for provenance tracking. + * For each MATERIALIZE column, determines source type and caches pointers + * to avoid per-row std::variant access in the hot loop. 
+ */ +inline std::vector +prepare_materialized_sources(const AnalyzedJoinNode &join_node, + const JoinInput &build_input, + const JoinInput &probe_input, bool build_is_left) { + std::vector sources; + sources.reserve(join_node.columns.size()); + + size_t mat_idx = 0; + for (const auto &col : join_node.columns) { + if (col.resolution != ColumnResolution::MATERIALIZE) + continue; + + MaterializedColumnSource src; + src.mat_col_idx = mat_idx++; + src.child_output_idx = col.child_output_idx; + src.type = col.type; + src.base_table_id = col.provenance.base_table_id; + src.base_column_idx = col.provenance.base_column_idx; + src.from_build = (col.from_left == build_is_left); + + const auto &src_input = src.from_build ? build_input : probe_input; + + if (src_input.is_columnar()) { + src.is_columnar = true; + const auto *table = std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col.child_output_idx]; + src.columnar_col = &table->columns[actual_idx]; + } else { + src.is_columnar = false; + const auto &ir = std::get(src_input.data); + + if (ir.is_materialized(col.child_output_idx)) { + src.intermediate_col = + ir.get_materialized(col.child_output_idx); + } else if (ir.is_deferred(col.child_output_idx)) { + src.needs_deferred_resolve = true; + src.deferred_resolve_col = + ir.get_deferred(col.child_output_idx); + } + } + sources.push_back(src); + } + + return sources; +} + +// ============================================================================ +// Main Construction Function +// ============================================================================ + +/** + * @brief Constructs intermediate result from thread-local buffers. + * + * Optimized with column-major iteration and SIMD provenance encoding. + * Only materializes columns marked MATERIALIZE in the AnalyzedJoinNode. + * Deferred columns store 64-bit provenance encoding for resolution at final + * output. * * @tparam Mode Collection mode for compile-time specialization. 
- * @param buffers Vector of ThreadLocalMatchBuffer from probe. - * @param build_input Build side data (ColumnarTable* or ExtendedResult). - * @param probe_input Probe side data (ColumnarTable* or ExtendedResult). - * @param remapped_attrs Output column specifications (global indexing). - * @param build_node PlanNode for build side output_attrs mapping. - * @param probe_node PlanNode for probe side output_attrs mapping. - * @param build_size Number of output columns from build side. - * @param columnar_reader ColumnarReader with Cursor caching for page access. - * @param results Pre-initialized ExtendedResult, populated in-place. - * @param merged_table_ids Sorted, unique table IDs to track in output. + * @param buffers Thread-local match buffers from probe. + * @param build_input Build side data source. + * @param probe_input Probe side data source. + * @param join_node Analyzed join node with materialization decisions. + * @param remapped_attrs Output attributes (after build/probe remapping). + * @param build_output_size Number of columns from build side. + * @param build_is_left True if build side is the original left child. + * @param columnar_reader Reader for columnar data access. + * @param out_result Output IntermediateResult (populated in-place). + * @param analyzed_plan Full analyzed plan for base table access. 
*/ template -inline void construct_intermediate_from_buffers( +void construct_intermediate_from_buffers( std::vector> &buffers, const JoinInput &build_input, const JoinInput &probe_input, + const AnalyzedJoinNode &join_node, const std::vector> &remapped_attrs, - const PlanNode &build_node, const PlanNode &probe_node, size_t build_size, - ColumnarReader &columnar_reader, ExtendedResult &results, - const std::vector &merged_table_ids) { + size_t build_output_size, bool build_is_left, + ColumnarReader &columnar_reader, IntermediateResult &out_result, + const AnalyzedPlan &analyzed_plan) { - // Compute total matches and per-buffer start offsets + // Count total matches and compute buffer start offsets size_t total_matches = 0; std::vector buffer_starts(buffers.size()); for (size_t i = 0; i < buffers.size(); ++i) { @@ -195,63 +443,90 @@ inline void construct_intermediate_from_buffers( total_matches += buffers[i].count(); } - if (total_matches == 0) + if (total_matches == 0) { + out_result = create_empty_intermediate_result(join_node); return; + } - auto sources = prepare_sources(remapped_attrs, build_input, probe_input, - build_node, probe_node, build_size); - auto rowid_sources = - prepare_rowid_sources(merged_table_ids, build_input, probe_input); + // Initialize result metadata + out_result.node_info = &join_node; + out_result.num_rows = total_matches; + out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); + out_result.deferred_map.resize(join_node.columns.size(), std::nullopt); - const size_t num_threads = THREAD_COUNT; - const size_t num_cols = sources.size(); - const size_t num_rowid_cols = rowid_sources.size(); + size_t mat_count = 0; + size_t def_count = 0; + for (const auto &col : join_node.columns) { + if (col.resolution == ColumnResolution::MATERIALIZE) { + out_result.materialized_map[col.original_idx] = mat_count++; + } else { + out_result.deferred_map[col.original_idx] = def_count++; + } + } + + // Precompute sources for column-major 
iteration + auto mat_sources = prepare_materialized_sources(join_node, build_input, + probe_input, build_is_left); + auto deferred_sources = prepare_deferred_sources( + join_node, build_input, probe_input, build_is_left); - // Pre-size page vectors for each data column + // Pre-allocate pages using Page = mema::column_t::Page; - using RowIdPage = mema::rowid_column_t::Page; - size_t total_pages_needed = + using DeferredPage = mema::deferred_column_t::Page; + size_t mat_pages_needed = (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; + size_t def_pages_needed = + (total_matches + mema::deferred_column_t::ENTRIES_PER_PAGE - 1) / + mema::deferred_column_t::ENTRIES_PER_PAGE; - for (size_t c = 0; c < num_cols; ++c) { - auto &col = results.columns[c]; - col.pages.resize(total_pages_needed); - col.set_row_count(total_matches); + out_result.materialized.resize(mat_count); + for (size_t c = 0; c < mat_count; ++c) { + out_result.materialized[c].pages.resize(mat_pages_needed); + out_result.materialized[c].set_row_count(total_matches); } - // Setup row ID columns in results - results.table_ids = merged_table_ids; - results.row_ids.resize(num_rowid_cols); - for (size_t r = 0; r < num_rowid_cols; ++r) { - results.row_ids[r].table_id = merged_table_ids[r]; - results.row_ids[r].pages.resize(total_pages_needed); - results.row_ids[r].set_row_count(total_matches); + out_result.deferred_columns.resize(def_count); + for (size_t d = 0; d < def_count; ++d) { + out_result.deferred_columns[d].pages.resize(def_pages_needed); + out_result.deferred_columns[d].set_row_count(total_matches); } - // Parallel page allocation - each thread allocates its own pages + // Set source metadata for materialized columns + for (const auto &src : mat_sources) { + out_result.materialized[src.mat_col_idx].source_table = + src.base_table_id; + out_result.materialized[src.mat_col_idx].source_column = + src.base_column_idx; + } + + const size_t num_threads = THREAD_COUNT; + + // Parallel page 
allocation worker_pool().execute([&](size_t t) { - for (size_t c = 0; c < num_cols; ++c) { - auto &col = results.columns[c]; - for (size_t p = t; p < total_pages_needed; p += num_threads) { + for (size_t c = 0; c < mat_count; ++c) { + auto &col = out_result.materialized[c]; + for (size_t p = t; p < mat_pages_needed; p += num_threads) { void *ptr = Contest::platform::get_arena(t) .alloc_chunk(); col.pages[p] = reinterpret_cast(ptr); } } - // Allocate row ID pages - for (size_t r = 0; r < num_rowid_cols; ++r) { - auto &rid_col = results.row_ids[r]; - for (size_t p = t; p < total_pages_needed; p += num_threads) { + for (size_t d = 0; d < def_count; ++d) { + auto &def_col = out_result.deferred_columns[d]; + for (size_t p = t; p < def_pages_needed; p += num_threads) { void *ptr = Contest::platform::get_arena(t) - .alloc_chunk(); - rid_col.pages[p] = reinterpret_cast(ptr); + .alloc_chunk< + Contest::platform::ChunkType::DEFERRED_PAGE>(); + def_col.pages[p] = reinterpret_cast(ptr); } } }); - // Parallel: each thread processes its own buffer + // ======================================================================== + // COLUMN-MAJOR PARALLEL POPULATION + // ======================================================================== worker_pool().execute([&](size_t t) { if (t >= buffers.size()) return; @@ -261,89 +536,106 @@ inline void construct_intermediate_from_buffers( return; size_t start = buffer_starts[t]; - Contest::ColumnarReader::Cursor cursor; + ColumnarReader::Cursor cursor; - // Process data columns - for (size_t c = 0; c < num_cols; ++c) { - const auto &src = sources[c]; - auto &dest_col = results.columns[c]; + // ==================================================================== + // Process MATERIALIZED columns (column-major for cache locality) + // ==================================================================== + for (const auto &src : mat_sources) { + auto &dest_col = out_result.materialized[src.mat_col_idx]; - auto left_range = buf.left_range(); - 
auto right_range = buf.right_range(); + // Get appropriate range based on which side this column comes from + auto range = src.from_build ? buf.left_range() : buf.right_range(); if (src.is_columnar) { + // Columnar source - use ColumnarReader with cursor caching const auto &col = *src.columnar_col; - if (src.from_build) { - size_t k = start; - for (uint32_t rid : left_range) { - dest_col.write_at(k++, - columnar_reader.read_value( - col, src.remapped_col_idx, rid, - col.type, cursor, true)); - } - } else { - size_t k = start; - for (uint32_t rid : right_range) { - dest_col.write_at(k++, - columnar_reader.read_value( - col, src.remapped_col_idx, rid, - col.type, cursor, false)); - } + size_t k = start; + for (uint32_t rid : range) { + dest_col.write_at(k++, + columnar_reader.read_value( + col, src.child_output_idx, rid, + src.type, cursor, src.from_build)); } - } else { + } else if (src.intermediate_col) { + // Intermediate materialized source - direct copy const auto &vec = *src.intermediate_col; - if (src.from_build) { - size_t k = start; - for (uint32_t rid : left_range) { - dest_col.write_at(k++, vec[rid]); - } - } else { - size_t k = start; - for (uint32_t rid : right_range) { - dest_col.write_at(k++, vec[rid]); + size_t k = start; + for (uint32_t rid : range) { + dest_col.write_at(k++, vec[rid]); + } + } else if (src.needs_deferred_resolve && src.deferred_resolve_col) { + // Deferred in child - resolve via provenance + const auto &def_col = *src.deferred_resolve_col; + size_t k = start; + for (uint32_t rid : range) { + uint64_t prov = def_col[rid]; + uint8_t base_tid = DeferredProvenance::table(prov); + uint8_t base_col = DeferredProvenance::column(prov); + uint64_t base_row = DeferredProvenance::row(prov); + + if (analyzed_plan.original_plan) [[likely]] { + const auto &base_table = + analyzed_plan.original_plan->inputs[base_tid]; + mema::value_t val = + columnar_reader.read_value_direct_public( + base_table.columns[base_col], + static_cast(base_row), 
src.type); + dest_col.write_at(k++, val); + } else { + dest_col.write_at( + k++, mema::value_t{mema::value_t::NULL_VALUE}); } } } } - // Process row ID columns - for (size_t r = 0; r < num_rowid_cols; ++r) { - const auto &rid_src = rowid_sources[r]; - auto &dest_rid_col = results.row_ids[r]; + // ==================================================================== + // Process DEFERRED columns (column-major with SIMD batch encoding) + // ==================================================================== + for (size_t d = 0; d < deferred_sources.size(); ++d) { + const auto &def_src = deferred_sources[d]; + auto &dest_def_col = out_result.deferred_columns[d]; - auto left_range = buf.left_range(); - auto right_range = buf.right_range(); + if (def_src.needs_encode) { + // Fresh encoding from columnar input - use SIMD batch + auto batch_reader = def_src.from_build + ? buf.left_batch_reader() + : buf.right_batch_reader(); - if (rid_src.from_build) { size_t k = start; - if (rid_src.needs_encode) { - // Columnar build: encode GlobalRowId on-the-fly - for (uint32_t local_idx : left_range) { - dest_rid_col.write_at( - k++, - GlobalRowId::encode(rid_src.table_id, local_idx)); - } - } else { - // Intermediate build: copy from source row ID column - const auto &src_col = *rid_src.source_col; - for (uint32_t local_idx : left_range) { - dest_rid_col.write_at(k++, src_col[local_idx]); + while (batch_reader.has_more()) { + size_t batch_count; + // Request larger batches for SIMD efficiency + constexpr size_t MAX_BATCH = + simd_provenance::BATCH_SIZE > 0 ? 64 : 256; + const uint32_t *row_ids = + batch_reader.get_batch(MAX_BATCH, batch_count); + + if (batch_count > 0) { + simd_provenance::encode_provenance_batch( + dest_def_col, k, row_ids, batch_count, + def_src.base_table_id, def_src.base_column_idx); + k += batch_count; } } - } else { + } else if (def_src.source_col) { + // Copy existing provenance from child intermediate + auto batch_reader = def_src.from_build + ? 
buf.left_batch_reader() + : buf.right_batch_reader(); + size_t k = start; - if (rid_src.needs_encode) { - // Columnar probe: encode GlobalRowId on-the-fly - for (uint32_t local_idx : right_range) { - dest_rid_col.write_at( - k++, - GlobalRowId::encode(rid_src.table_id, local_idx)); - } - } else { - // Intermediate probe: copy from source row ID column - const auto &src_col = *rid_src.source_col; - for (uint32_t local_idx : right_range) { - dest_rid_col.write_at(k++, src_col[local_idx]); + while (batch_reader.has_more()) { + size_t batch_count; + const uint32_t *row_ids = + batch_reader.get_batch(256, batch_count); + + if (batch_count > 0) { + simd_provenance::copy_provenance_batch( + dest_def_col, k, *def_src.source_col, row_ids, + batch_count); + k += batch_count; } } } @@ -351,4 +643,5 @@ inline void construct_intermediate_from_buffers( }); } -} // namespace Contest::materialize +} // namespace materialize +} // namespace Contest diff --git a/include/materialization/materialize.h b/include/materialization/materialize.h index 6d4a3be..dca7f49 100644 --- a/include/materialization/materialize.h +++ b/include/materialization/materialize.h @@ -1,67 +1,150 @@ /** * @file materialize.h - * @brief Materialization of join results into ColumnarTable format. + * @brief Final materialization for execution path. * - * Parallel materialization using per-thread page builders and mmap allocation. - * Templated on MatchCollectionMode for zero-overhead mode selection. + * Materializes all output columns at the root join, resolving deferred + * columns by decoding 64-bit provenance (table_id, column_idx, row_id) back + * to base tables. + * + * @see construct_intermediate.h for building IntermediateResult intermediates. 
*/ #pragma once -#include #include +#include +#include +#include + #include +#include #include -#include -#include -#include +#include #include -#include #include +#include #include -#include -#include -/** @namespace Contest::materialize @brief Join result materialization. */ -namespace Contest::materialize { +namespace Contest { +namespace materialize { -using Contest::ExecuteResult; -using Contest::ExtendedResult; using Contest::io::ColumnarReader; -using Contest::join::JoinInput; using Contest::join::MatchCollectionMode; -using Contest::join::resolve_input_source; using Contest::join::ThreadLocalMatchBuffer; using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; -/** @brief Creates empty ColumnarTable with correct column types for zero-match - * case. */ +/** + * @brief Collect columns needed from a JoinInput for final materialization. + */ +inline platform::ArenaVector +collect_final_columns(const JoinInput &input, + const platform::ArenaVector &needed, + platform::ThreadArena &arena) { + platform::ArenaVector columns(arena); + if (!input.node) + return columns; + + columns.resize(input.node->output_attrs.size()); + std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); + + if (!input.is_columnar()) + return columns; + + auto *table = std::get(input.data); + for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { + if (i < needed.size() && needed[i]) { + auto [actual_col_idx, _] = input.node->output_attrs[i]; + columns[i] = &table->columns[actual_col_idx]; + } + } + return columns; +} + +/** + * @brief Prepare ColumnarReader for final materialization at root. + * + * Sets up page indices for ALL output columns (since all need materialization + * at root). 
+ */ +inline void prepare_final_columns( + ColumnarReader &reader, const JoinInput &build_input, + const JoinInput &probe_input, const AnalyzedJoinNode &join_node, + const std::vector> &remapped_attrs, + size_t build_size, bool build_is_left) { + + bool build_is_columnar = build_input.is_columnar(); + bool probe_is_columnar = probe_input.is_columnar(); + + if (!build_is_columnar && !probe_is_columnar) + return; + + auto &arena = Contest::platform::get_arena(0); + + // All output columns needed at root + platform::ArenaVector build_needed(arena); + if (build_input.node) { + build_needed.resize(build_input.node->output_attrs.size()); + std::memset(build_needed.data(), 0, build_needed.size()); + } + + platform::ArenaVector probe_needed(arena); + if (probe_input.node) { + probe_needed.resize(probe_input.node->output_attrs.size()); + std::memset(probe_needed.data(), 0, probe_needed.size()); + } + + // Mark ALL columns needed for final materialization + // from_left refers to original left child + // build_is_left tells us if build side is the left child + for (const auto &col : join_node.columns) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; + } + } + + if (build_is_columnar) { + reader.prepare_build( + collect_final_columns(build_input, build_needed, arena)); + } + + if (probe_is_columnar) { + reader.prepare_probe( + collect_final_columns(probe_input, probe_needed, arena)); + } +} + +/** + * @brief Create empty result for zero-match case. 
+ */ inline ColumnarTable create_empty_result( - const std::vector> &remapped_attrs) { + const std::vector> &output_attrs) { ColumnarTable empty_result; empty_result.num_rows = 0; - for (auto [_, data_type] : remapped_attrs) { + for (auto [_, data_type] : output_attrs) { empty_result.columns.emplace_back(data_type); } return empty_result; } /** - * @brief Parallel materialization of a single output column from thread-local - * buffers. + * @brief Materialize a single column from sources. * - * Each thread processes its own buffer directly without merge overhead. + * Handles three cases: + * 1. COLUMNAR_DIRECT: Input is columnar, read directly via row index + * 2. MATERIALIZED: Column was materialized in IntermediateResult + * 3. DEFERRED: Resolve via 64-bit provenance to base table * - * @tparam Mode Collection mode for compile-time specialization. - * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder. - * @tparam ReaderFunc Callable: (row_id, cursor) -> value_t. + * @tparam Mode Collection mode for compile-time specialization. + * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder. + * @tparam ReaderFunc Callable: (row_idx, cursor) -> value_t. * @tparam InitBuilderFunc Callable: (page_allocator) -> BuilderType. - * @param est_bytes_per_row Average bytes per row (4 for INT32, ~35 for - * VARCHAR). */ template -inline void materialize_column_from_buffers( +inline void materialize_column( Column &dest_col, std::vector> &buffers, size_t total_matches, ReaderFunc &&read_value, InitBuilderFunc &&init_builder, bool from_build, size_t est_bytes_per_row) { @@ -163,133 +246,188 @@ inline void materialize_column_from_buffers( } /** - * @brief Materializes a single output column from thread-local buffers. + * @brief Materialize single output column handling deferred resolution. * - * Dispatcher that determines source location (columnar/intermediate, - * build/probe), selects page builder type, and invokes - * materialize_column_from_buffers<>. 
VARCHAR handling requires source Column - * pointer for string dereferencing. + * For deferred columns, resolves via 64-bit provenance encoding back to + * base table. * * @tparam Mode Collection mode for compile-time specialization. */ template -inline void materialize_single_column_from_buffers( - Column &dest_col, size_t col_idx, size_t build_size, +inline void materialize_single_column( + Column &dest_col, size_t col_idx, size_t build_size, bool build_is_left, std::vector> &buffers, size_t total_matches, const JoinInput &build_input, const JoinInput &probe_input, - const PlanNode &build_node, const PlanNode &probe_node, - ColumnarReader &columnar_reader, const Plan &plan) { - - auto [input, node, local_idx] = resolve_input_source( - col_idx, build_size, build_input, build_node, probe_input, probe_node); - bool from_build = col_idx < build_size; + const AnalyzedJoinNode &join_node, ColumnarReader &columnar_reader, + const AnalyzedPlan &analyzed_plan) { + + // Find column info + const AnalyzedColumnInfo *col_info = nullptr; + for (const auto &col : join_node.columns) { + if (col.original_idx == col_idx) { + col_info = &col; + break; + } + } - const Column *col_source = nullptr; - const mema::column_t *inter_source = nullptr; + if (!col_info) { + // Fallback - shouldn't happen + return; + } - if (input.is_columnar()) { - auto *table = std::get(input.data); - auto [actual_idx, _] = node.output_attrs[local_idx]; - col_source = &table->columns[actual_idx]; + // Determine if this column comes from build or probe side at runtime + bool from_build = (col_info->from_left == build_is_left); + const JoinInput &src_input = from_build ? 
build_input : probe_input; + + // Determine how to read the value + const Column *columnar_source = nullptr; + const mema::column_t *materialized_source = nullptr; + const mema::deferred_column_t *deferred_source = nullptr; + + if (src_input.is_columnar()) { + // Direct columnar read + const auto *table = std::get(src_input.data); + auto [actual_idx, _] = + src_input.node->output_attrs[col_info->child_output_idx]; + columnar_source = &table->columns[actual_idx]; } else { - const auto &res = std::get(input.data); - inter_source = &res.columns[local_idx]; + const auto &ir = std::get(src_input.data); + if (ir.is_materialized(col_info->child_output_idx)) { + // Read from materialized column + materialized_source = + ir.get_materialized(col_info->child_output_idx); + } else if (ir.is_deferred(col_info->child_output_idx)) { + // Deferred - need to resolve via 64-bit provenance + deferred_source = ir.get_deferred(col_info->child_output_idx); + } } - auto reader = [&](uint32_t rid, ColumnarReader::Cursor &cursor, - DataType type) { - if (col_source) { - return columnar_reader.read_value(*col_source, local_idx, rid, type, - cursor, from_build); + // Create reader lambda + auto reader = [&](uint32_t local_row_id, + ColumnarReader::Cursor &cursor) -> mema::value_t { + if (columnar_source) { + return columnar_reader.read_value( + *columnar_source, col_info->child_output_idx, local_row_id, + col_info->type, cursor, from_build); + } else if (materialized_source) { + return (*materialized_source)[local_row_id]; + } else if (deferred_source && analyzed_plan.original_plan) { + // Deferred resolution: decode 64-bit provenance + uint64_t prov = (*deferred_source)[local_row_id]; + uint8_t base_tid = DeferredProvenance::table(prov); + uint8_t base_col = DeferredProvenance::column(prov); + uint64_t base_row = DeferredProvenance::row(prov); + const auto &base_table = + analyzed_plan.original_plan->inputs[base_tid]; + return columnar_reader.read_value( + base_table.columns[base_col], 
base_col, + static_cast(base_row), col_info->type, cursor, true); } - return (*inter_source)[rid]; + return mema::value_t{mema::value_t::NULL_VALUE}; }; + // Materialize based on type if (dest_col.type == DataType::INT32) { auto init = [](std::function alloc) { return Int32PageBuilder(std::move(alloc)); }; - materialize_column_from_buffers( + materialize_column( dest_col, buffers, total_matches, [&](uint32_t rid, ColumnarReader::Cursor &cursor) { - return reader(rid, cursor, DataType::INT32); + return reader(rid, cursor); }, init, from_build, 4); return; } - const Column *str_src_ptr = col_source; - if (!str_src_ptr && inter_source) { - str_src_ptr = &plan.inputs[inter_source->source_table] - .columns[inter_source->source_column]; + // VARCHAR + const Column *str_src_ptr = columnar_source; + if (!str_src_ptr) { + if (materialized_source) { + str_src_ptr = &analyzed_plan.original_plan + ->inputs[materialized_source->source_table] + .columns[materialized_source->source_column]; + } else if (deferred_source && analyzed_plan.original_plan) { + // For deferred VARCHAR, get source from provenance of first row + // All rows in a deferred column share the same base table/column + str_src_ptr = &analyzed_plan.original_plan + ->inputs[col_info->provenance.base_table_id] + .columns[col_info->provenance.base_column_idx]; + } + } + + if (!str_src_ptr) { + // Shouldn't happen, but handle gracefully + return; } auto init = [str_src_ptr](std::function alloc) { return VarcharPageBuilder(*str_src_ptr, std::move(alloc)); }; - materialize_column_from_buffers( + materialize_column( dest_col, buffers, total_matches, [&](uint32_t rid, ColumnarReader::Cursor &cursor) { - return reader(rid, cursor, DataType::VARCHAR); + return reader(rid, cursor); }, init, from_build, 35); } /** - * @brief Materializes all output columns from thread-local buffers into - * ColumnarTable. - * - * Dereferences VARCHAR value_t references into actual string bytes. 
+ * @brief Materialize all output columns from intermediate result.
 *
- * @tparam Mode Collection mode for compile-time specialization.
- * @param buffers Thread-local match buffers from probe.
- * @param build_input Build side data source.
- * @param probe_input Probe side data source.
- * @param remapped_attrs Output projection: (col_idx, DataType) pairs.
- * @param build_node Metadata for build side output_attrs mapping.
- * @param probe_node Metadata for probe side output_attrs mapping.
- * @param build_size Number of columns from build side.
- * @param columnar_reader PageIndex-accelerated reader for Column page access.
- * @param plan Full query plan for VARCHAR dereferencing.
- * @return ColumnarTable with self-contained page data.
+ * For root join. Resolves all deferred columns by decoding 64-bit provenance
+ * to base tables.
 *
- * @see construct_intermediate.h for creating intermediate ExecuteResult.
- * @see page_builders.h for Int32PageBuilder and VarcharPageBuilder.
+ * @tparam Mode Collection mode for compile-time specialization.
+ * @param buffers Thread-local match buffers from probe.
+ * @param build_input Build side input.
+ * @param probe_input Probe side input.
+ * @param join_node Analyzed join node with column info.
+ * @param remapped_attrs Output projection after build/probe remapping.
+ * @param build_size Number of columns from build side; build_is_left flags whether that side is the left child.
+ * @param columnar_reader Reader for columnar data.
+ * @param analyzed_plan Full analyzed plan for base table access.
+ * @return ColumnarTable with final output.
*/ template inline ColumnarTable materialize_from_buffers( std::vector> &buffers, const JoinInput &build_input, const JoinInput &probe_input, + const AnalyzedJoinNode &join_node, const std::vector> &remapped_attrs, - const PlanNode &build_node, const PlanNode &probe_node, size_t build_size, - ColumnarReader &columnar_reader, const Plan &plan) { + size_t build_size, bool build_is_left, ColumnarReader &columnar_reader, + const AnalyzedPlan &analyzed_plan) { - // Compute total_matches + // Compute total matches size_t total_matches = 0; for (const auto &buf : buffers) { total_matches += buf.count(); } - ColumnarTable result; - result.num_rows = total_matches; - if (total_matches == 0) { - for (auto [_, dtype] : remapped_attrs) { - result.columns.emplace_back(dtype); - } - return result; + return create_empty_result(remapped_attrs); } + ColumnarTable result; + result.num_rows = total_matches; + for (size_t out_idx = 0; out_idx < remapped_attrs.size(); ++out_idx) { auto [col_idx, data_type] = remapped_attrs[out_idx]; result.columns.emplace_back(data_type); Column &dest_col = result.columns.back(); - materialize_single_column_from_buffers( - dest_col, col_idx, build_size, buffers, total_matches, build_input, - probe_input, build_node, probe_node, columnar_reader, plan); + + // Pass out_idx (output position) not col_idx (global column index) + // because materialize_single_column searches by original_idx + // which is the output position in join_node.columns + materialize_single_column(dest_col, out_idx, build_size, + build_is_left, buffers, total_matches, + build_input, probe_input, join_node, + columnar_reader, analyzed_plan); } + return result; } -} // namespace Contest::materialize +} // namespace materialize +} // namespace Contest diff --git a/include/materialization/materialize_deferred.h b/include/materialization/materialize_deferred.h deleted file mode 100644 index bd7a2af..0000000 --- a/include/materialization/materialize_deferred.h +++ /dev/null @@ -1,435 +0,0 
@@ -/** - * @file materialize_deferred.h - * @brief Final materialization for deferred execution path. - * - * Materializes all output columns at the root join, resolving deferred - * columns by decoding 64-bit provenance (table_id, column_idx, row_id) back - * to base tables. - * - * @see construct_deferred.h for building DeferredResult intermediates. - * @see materialize.h for the eager materialization equivalent. - */ -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace Contest { -namespace materialize { - -using Contest::io::ColumnarReader; -using Contest::join::MatchCollectionMode; -using Contest::join::ThreadLocalMatchBuffer; -using Contest::platform::THREAD_COUNT; -using Contest::platform::worker_pool; - -/** - * @brief Collect columns needed from a DeferredInput for final materialization. - */ -inline platform::ArenaVector -collect_final_columns(const DeferredInput &input, - const platform::ArenaVector &needed, - platform::ThreadArena &arena) { - platform::ArenaVector columns(arena); - if (!input.node) - return columns; - - columns.resize(input.node->output_attrs.size()); - std::memset(columns.data(), 0, columns.size() * sizeof(const Column *)); - - if (!input.is_columnar()) - return columns; - - auto *table = std::get(input.data); - for (size_t i = 0; i < input.node->output_attrs.size(); ++i) { - if (i < needed.size() && needed[i]) { - auto [actual_col_idx, _] = input.node->output_attrs[i]; - columns[i] = &table->columns[actual_col_idx]; - } - } - return columns; -} - -/** - * @brief Prepare ColumnarReader for final deferred materialization at root. - * - * Sets up page indices for ALL output columns (since all need materialization - * at root). 
- */ -inline void prepare_final_deferred_columns( - ColumnarReader &reader, const DeferredInput &build_input, - const DeferredInput &probe_input, const DeferredJoinNode &join_node, - const std::vector> &remapped_attrs, - size_t build_size, bool build_is_left) { - - bool build_is_columnar = build_input.is_columnar(); - bool probe_is_columnar = probe_input.is_columnar(); - - if (!build_is_columnar && !probe_is_columnar) - return; - - auto &arena = Contest::platform::get_arena(0); - - // All output columns needed at root - platform::ArenaVector build_needed(arena); - if (build_input.node) { - build_needed.resize(build_input.node->output_attrs.size()); - std::memset(build_needed.data(), 0, build_needed.size()); - } - - platform::ArenaVector probe_needed(arena); - if (probe_input.node) { - probe_needed.resize(probe_input.node->output_attrs.size()); - std::memset(probe_needed.data(), 0, probe_needed.size()); - } - - // Mark ALL columns needed for final materialization - // from_left refers to original left child - // build_is_left tells us if build side is the left child - for (const auto &col : join_node.columns) { - bool from_build = (col.from_left == build_is_left); - if (from_build && col.child_output_idx < build_needed.size()) { - build_needed[col.child_output_idx] = 1; - } else if (!from_build && col.child_output_idx < probe_needed.size()) { - probe_needed[col.child_output_idx] = 1; - } - } - - if (build_is_columnar) { - reader.prepare_build( - collect_final_columns(build_input, build_needed, arena)); - } - - if (probe_is_columnar) { - reader.prepare_probe( - collect_final_columns(probe_input, probe_needed, arena)); - } -} - -/** - * @brief Create empty result for zero-match case in deferred path. 
- */ -inline ColumnarTable create_empty_deferred_final( - const std::vector> &output_attrs) { - ColumnarTable empty_result; - empty_result.num_rows = 0; - for (auto [_, data_type] : output_attrs) { - empty_result.columns.emplace_back(data_type); - } - return empty_result; -} - -/** - * @brief Materialize a single column from deferred sources. - * - * Handles three cases: - * 1. COLUMNAR_DIRECT: Input is columnar, read directly via row index - * 2. MATERIALIZED: Column was materialized in DeferredResult - * 3. DEFERRED: Resolve via 64-bit provenance to base table - * - * @tparam Mode Collection mode for compile-time specialization. - * @tparam BuilderType Int32PageBuilder or VarcharPageBuilder. - * @tparam ReaderFunc Callable: (row_idx, cursor) -> value_t. - * @tparam InitBuilderFunc Callable: (page_allocator) -> BuilderType. - */ -template -inline void materialize_deferred_column( - Column &dest_col, std::vector> &buffers, - size_t total_matches, ReaderFunc &&read_value, - InitBuilderFunc &&init_builder, bool from_build, size_t est_bytes_per_row) { - - if (total_matches == 0) - return; - - const int num_threads = THREAD_COUNT; - - size_t matches_per_thread = (total_matches + num_threads - 1) / num_threads; - size_t usable_per_page = PAGE_SIZE - 256; - size_t rows_per_page = std::max(1ul, usable_per_page / est_bytes_per_row); - size_t pages_per_thread = - (matches_per_thread + rows_per_page - 1) / rows_per_page + 10; - size_t total_pages = pages_per_thread * num_threads; - - void *page_memory = - mmap(nullptr, total_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (page_memory == MAP_FAILED) - throw std::bad_alloc(); - - std::vector thread_columns; - thread_columns.reserve(num_threads); - for (int i = 0; i < num_threads; ++i) { - thread_columns.emplace_back(dest_col.type); - } - - worker_pool().execute([&](size_t t) { - if (t >= buffers.size()) - return; - auto &buf = buffers[t]; - size_t my_count = buf.count(); - if (my_count 
== 0) - return; - - Column &local_col = thread_columns[t]; - - size_t thread_page_start = t * pages_per_thread; - size_t thread_page_limit = pages_per_thread; - size_t used_pages = 0; - - ColumnarReader::Cursor cursor; - - auto page_allocator = [&]() -> Page * { - Page *p; - if (used_pages < thread_page_limit) { - p = reinterpret_cast(static_cast(page_memory) + - (thread_page_start + used_pages) * - PAGE_SIZE); - used_pages++; - } else { - p = new Page(); - } - local_col.pages.push_back(p); - return p; - }; - - BuilderType builder = init_builder(page_allocator); - builder.prepare(my_count); - - const size_t check_interval = BuilderType::MIN_ROWS_PER_PAGE_CHECK; - size_t rows_since_check = 0; - - auto range = from_build ? buf.left_range() : buf.right_range(); - - for (uint32_t row_id : range) { - bool flushed = builder.add(read_value(row_id, cursor)); - - if (flushed) { - rows_since_check = 0; - } else { - rows_since_check++; - if (rows_since_check >= check_interval) { - if (builder.should_check_overflow()) { - builder.save_to_page(builder.current_page); - rows_since_check = 0; - } - if (rows_since_check > check_interval * 2) - rows_since_check = 0; - } - } - } - - if (builder.num_rows != 0) { - builder.save_to_page(builder.current_page); - } - }); - - for (auto &thread_col : thread_columns) { - for (auto *page : thread_col.pages) { - dest_col.pages.push_back(page); - } - thread_col.pages.clear(); - } - - auto *mapped_mem = new MappedMemory(page_memory, total_pages * PAGE_SIZE); - dest_col.assign_mapped_memory(mapped_mem); -} - -/** - * @brief Materialize single output column handling deferred resolution. - * - * For deferred columns, resolves via 64-bit provenance encoding back to - * base table. - * - * @tparam Mode Collection mode for compile-time specialization. 
- */ -template -inline void materialize_single_deferred_column( - Column &dest_col, size_t col_idx, size_t build_size, bool build_is_left, - std::vector> &buffers, size_t total_matches, - const DeferredInput &build_input, const DeferredInput &probe_input, - const DeferredJoinNode &join_node, ColumnarReader &columnar_reader, - const DeferredPlan &deferred_plan) { - - // Find column info - const DeferredColumnInfo *col_info = nullptr; - for (const auto &col : join_node.columns) { - if (col.original_idx == col_idx) { - col_info = &col; - break; - } - } - - if (!col_info) { - // Fallback - shouldn't happen - return; - } - - // Determine if this column comes from build or probe side at runtime - bool from_build = (col_info->from_left == build_is_left); - const DeferredInput &src_input = from_build ? build_input : probe_input; - - // Determine how to read the value - const Column *columnar_source = nullptr; - const mema::column_t *materialized_source = nullptr; - const mema::deferred_column_t *deferred_source = nullptr; - - if (src_input.is_columnar()) { - // Direct columnar read - const auto *table = std::get(src_input.data); - auto [actual_idx, _] = - src_input.node->output_attrs[col_info->child_output_idx]; - columnar_source = &table->columns[actual_idx]; - } else { - const auto &ir = std::get(src_input.data); - if (ir.is_materialized(col_info->child_output_idx)) { - // Read from materialized column - materialized_source = - ir.get_materialized(col_info->child_output_idx); - } else if (ir.is_deferred(col_info->child_output_idx)) { - // Deferred - need to resolve via 64-bit provenance - deferred_source = ir.get_deferred(col_info->child_output_idx); - } - } - - // Create reader lambda - auto reader = [&](uint32_t local_row_id, - ColumnarReader::Cursor &cursor) -> mema::value_t { - if (columnar_source) { - return columnar_reader.read_value( - *columnar_source, col_info->child_output_idx, local_row_id, - col_info->type, cursor, from_build); - } else if 
(materialized_source) { - return (*materialized_source)[local_row_id]; - } else if (deferred_source && deferred_plan.original_plan) { - // Deferred resolution: decode 64-bit provenance - uint64_t prov = (*deferred_source)[local_row_id]; - uint8_t base_tid = DeferredProvenance::table(prov); - uint8_t base_col = DeferredProvenance::column(prov); - uint64_t base_row = DeferredProvenance::row(prov); - const auto &base_table = - deferred_plan.original_plan->inputs[base_tid]; - return columnar_reader.read_value( - base_table.columns[base_col], base_col, - static_cast(base_row), col_info->type, cursor, true); - } - return mema::value_t{mema::value_t::NULL_VALUE}; - }; - - // Materialize based on type - if (dest_col.type == DataType::INT32) { - auto init = [](std::function alloc) { - return Int32PageBuilder(std::move(alloc)); - }; - materialize_deferred_column( - dest_col, buffers, total_matches, - [&](uint32_t rid, ColumnarReader::Cursor &cursor) { - return reader(rid, cursor); - }, - init, from_build, 4); - return; - } - - // VARCHAR - const Column *str_src_ptr = columnar_source; - if (!str_src_ptr) { - if (materialized_source) { - str_src_ptr = &deferred_plan.original_plan - ->inputs[materialized_source->source_table] - .columns[materialized_source->source_column]; - } else if (deferred_source && deferred_plan.original_plan) { - // For deferred VARCHAR, get source from provenance of first row - // All rows in a deferred column share the same base table/column - str_src_ptr = &deferred_plan.original_plan - ->inputs[col_info->provenance.base_table_id] - .columns[col_info->provenance.base_column_idx]; - } - } - - if (!str_src_ptr) { - // Shouldn't happen, but handle gracefully - return; - } - - auto init = [str_src_ptr](std::function alloc) { - return VarcharPageBuilder(*str_src_ptr, std::move(alloc)); - }; - - materialize_deferred_column( - dest_col, buffers, total_matches, - [&](uint32_t rid, ColumnarReader::Cursor &cursor) { - return reader(rid, cursor); - }, - init, 
from_build, 35); -} - -/** - * @brief Materialize all output columns from deferred intermediate. - * - * For root join in deferred execution path. Resolves all deferred columns - * by decoding 64-bit provenance to base tables. - * - * @tparam Mode Collection mode for compile-time specialization. - * @param buffers Thread-local match buffers from probe. - * @param build_input Build side deferred input. - * @param probe_input Probe side deferred input. - * @param join_node Deferred join node with column info. - * @param remapped_attrs Output projection after build/probe remapping. - * @param build_size Number of columns from build side. - * @param columnar_reader Reader for columnar data. - * @param deferred_plan Full deferred plan for base table access. - * @return ColumnarTable with final output. - */ -template -inline ColumnarTable materialize_deferred_from_buffers( - std::vector> &buffers, - const DeferredInput &build_input, const DeferredInput &probe_input, - const DeferredJoinNode &join_node, - const std::vector> &remapped_attrs, - size_t build_size, bool build_is_left, ColumnarReader &columnar_reader, - const DeferredPlan &deferred_plan) { - - // Compute total matches - size_t total_matches = 0; - for (const auto &buf : buffers) { - total_matches += buf.count(); - } - - if (total_matches == 0) { - return create_empty_deferred_final(remapped_attrs); - } - - ColumnarTable result; - result.num_rows = total_matches; - - for (size_t out_idx = 0; out_idx < remapped_attrs.size(); ++out_idx) { - auto [col_idx, data_type] = remapped_attrs[out_idx]; - result.columns.emplace_back(data_type); - Column &dest_col = result.columns.back(); - - // Pass out_idx (output position) not col_idx (global column index) - // because materialize_single_deferred_column searches by original_idx - // which is the output position in join_node.columns - materialize_single_deferred_column( - dest_col, out_idx, build_size, build_is_left, buffers, - total_matches, build_input, probe_input, 
join_node, columnar_reader, - deferred_plan); - } - - return result; -} - -} // namespace materialize -} // namespace Contest diff --git a/src/analyze_plan.cpp b/src/analyze_plan.cpp index f511c60..3c7fc83 100644 --- a/src/analyze_plan.cpp +++ b/src/analyze_plan.cpp @@ -6,9 +6,8 @@ * materialized eagerly (join keys needed by parent) vs deferred until final * output. Traces column provenance back to base tables for deferred resolution. * - * @see deferred_plan.h for DeferredPlan structure. + * @see deferred_plan.h for AnalyzedPlan structure. */ -#include #include #include @@ -112,7 +111,7 @@ find_parent_join_key(const Plan &plan, size_t node_idx, * LEFT_ONLY/RIGHT_ONLY. */ join::MatchCollectionMode -compute_base_collection_mode(const std::vector &columns, +compute_base_collection_mode(const std::vector &columns, size_t left_output_size) { bool needs_left = false; bool needs_right = false; @@ -137,11 +136,11 @@ compute_base_collection_mode(const std::vector &columns, } // anonymous namespace -DeferredPlan analyze_plan(const Plan &plan) { - DeferredPlan deferred; - deferred.original_plan = &plan; - deferred.nodes.resize(plan.nodes.size()); - deferred.root = plan.root; +AnalyzedPlan analyze_plan(const Plan &plan) { + AnalyzedPlan analyzed; + analyzed.original_plan = &plan; + analyzed.nodes.resize(plan.nodes.size()); + analyzed.root = plan.root; auto parent_map = build_parent_map(plan); @@ -170,26 +169,26 @@ DeferredPlan analyze_plan(const Plan &plan) { if (const auto *scan = std::get_if(&node.data)) { // Scan node: simple wrapper - DeferredScanNode dscan; - dscan.node_idx = node_idx; - dscan.base_table_id = scan->base_table_id; - dscan.output_attrs = node.output_attrs; - deferred.nodes[node_idx] = std::move(dscan); + AnalyzedScanNode ascan; + ascan.node_idx = node_idx; + ascan.base_table_id = scan->base_table_id; + ascan.output_attrs = node.output_attrs; + analyzed.nodes[node_idx] = std::move(ascan); } else { // Join node: compute materialization decisions const auto 
&join = std::get(node.data); - DeferredJoinNode djoin; - djoin.node_idx = node_idx; - djoin.left_child_idx = join.left; - djoin.right_child_idx = join.right; - djoin.left_join_attr = join.left_attr; - djoin.right_join_attr = join.right_attr; - djoin.output_attrs = node.output_attrs; - djoin.is_root = (node_idx == plan.root); + AnalyzedJoinNode ajoin; + ajoin.node_idx = node_idx; + ajoin.left_child_idx = join.left; + ajoin.right_child_idx = join.right; + ajoin.left_join_attr = join.left_attr; + ajoin.right_join_attr = join.right_attr; + ajoin.output_attrs = node.output_attrs; + ajoin.is_root = (node_idx == plan.root); // Find which column parent needs as join key - djoin.parent_join_key_idx = + ajoin.parent_join_key_idx = find_parent_join_key(plan, node_idx, parent_map); // Get child sizes for determining column source @@ -200,7 +199,7 @@ DeferredPlan analyze_plan(const Plan &plan) { for (size_t i = 0; i < node.output_attrs.size(); ++i) { auto [col_idx, col_type] = node.output_attrs[i]; - DeferredColumnInfo info; + AnalyzedColumnInfo info; info.original_idx = i; info.type = col_type; @@ -220,11 +219,11 @@ DeferredPlan analyze_plan(const Plan &plan) { // Materialization decision: // - At root: ALL columns must be materialized (final output) // - At intermediate: only parent's join key is materialized - if (djoin.is_root) { + if (ajoin.is_root) { // Root node: materialize everything info.resolution = ColumnResolution::MATERIALIZE; - } else if (djoin.parent_join_key_idx.has_value() && - i == *djoin.parent_join_key_idx) { + } else if (ajoin.parent_join_key_idx.has_value() && + i == *ajoin.parent_join_key_idx) { info.resolution = ColumnResolution::MATERIALIZE; } else { info.resolution = ColumnResolution::DEFER; @@ -233,22 +232,22 @@ DeferredPlan analyze_plan(const Plan &plan) { // Trace provenance to base table info.provenance = trace_provenance(plan, node_idx, i); - djoin.columns.push_back(std::move(info)); + ajoin.columns.push_back(std::move(info)); } // Compute 
collection mode and count deferred columns - djoin.base_collection_mode = - compute_base_collection_mode(djoin.columns, left_size); + ajoin.base_collection_mode = + compute_base_collection_mode(ajoin.columns, left_size); // Count deferred columns for pre-allocation - djoin.num_deferred_columns = 0; - for (const auto &col : djoin.columns) { + ajoin.num_deferred_columns = 0; + for (const auto &col : ajoin.columns) { if (col.resolution == ColumnResolution::DEFER) { - ++djoin.num_deferred_columns; + ++ajoin.num_deferred_columns; } } - deferred.nodes[node_idx] = std::move(djoin); + analyzed.nodes[node_idx] = std::move(ajoin); } } @@ -256,28 +255,28 @@ DeferredPlan analyze_plan(const Plan &plan) { // Process in reverse post-order (parents before children) for (auto it = post_order.rbegin(); it != post_order.rend(); ++it) { size_t node_idx = *it; - auto *djoin = std::get_if(&deferred.nodes[node_idx]); - if (!djoin) + auto *ajoin = std::get_if(&analyzed.nodes[node_idx]); + if (!ajoin) continue; // For each column that must be MATERIALIZE, ensure the child also // materializes it - for (const auto &col : djoin->columns) { + for (const auto &col : ajoin->columns) { if (col.resolution != ColumnResolution::MATERIALIZE) continue; // Find which child this column comes from size_t child_idx = - col.from_left ? djoin->left_child_idx : djoin->right_child_idx; + col.from_left ? 
ajoin->left_child_idx : ajoin->right_child_idx; - auto *child_djoin = - std::get_if(&deferred.nodes[child_idx]); - if (!child_djoin) + auto *child_ajoin = + std::get_if(&analyzed.nodes[child_idx]); + if (!child_ajoin) continue; // Child is a scan - always has data // Mark child's column as MATERIALIZE - if (col.child_output_idx < child_djoin->columns.size()) { - child_djoin->columns[col.child_output_idx].resolution = + if (col.child_output_idx < child_ajoin->columns.size()) { + child_ajoin->columns[col.child_output_idx].resolution = ColumnResolution::MATERIALIZE; } } @@ -285,19 +284,19 @@ DeferredPlan analyze_plan(const Plan &plan) { // PASS 3: Recount num_deferred_columns after propagation for (size_t node_idx : post_order) { - auto *djoin = std::get_if(&deferred.nodes[node_idx]); - if (!djoin) + auto *ajoin = std::get_if(&analyzed.nodes[node_idx]); + if (!ajoin) continue; - djoin->num_deferred_columns = 0; - for (const auto &col : djoin->columns) { + ajoin->num_deferred_columns = 0; + for (const auto &col : ajoin->columns) { if (col.resolution == ColumnResolution::DEFER) { - ++djoin->num_deferred_columns; + ++ajoin->num_deferred_columns; } } } - return deferred; + return analyzed; } } // namespace Contest diff --git a/src/execute.cpp b/src/execute.cpp index b9d45e5..29a485c 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -5,12 +5,12 @@ * Traverses plan tree: resolve inputs -> select build/probe -> algorithm * selection -> match collection -> output construction. * - * Flow: execute() -> execute_impl() recursively -> resolve_join_input() for - * ScanNode (ColumnarTable*) or JoinNode (ExecuteResult). Root produces - * ColumnarTable; non-root produces ExecuteResult. + * Flow: execute() -> execute_impl() recursively -> resolve_input() for + * ScanNode (ColumnarTable*) or JoinNode (IntermediateResult). Root produces + * ColumnarTable; non-root produces IntermediateResult. 
* - * Lifetimes: base tables live for query duration; ExecuteResult held on stack - * until parent completes; VARCHAR refs valid via base table lifetime. + * Lifetimes: base tables live for query duration; IntermediateResult held on + * stack until parent completes; VARCHAR refs valid via base table lifetime. * * Row order non-deterministic (work-stealing); semantically correct per SQL. * @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -40,309 +41,54 @@ #include #include -#ifdef USE_DEFERRED_MATERIALIZATION -#include -#include -#include -#include -#endif - namespace Contest { using namespace join; using materialize::construct_intermediate_from_buffers; -using materialize::create_empty_result; +using materialize::create_empty_intermediate_result; using materialize::materialize_from_buffers; /** - * @brief Result variant: ExtendedResult (intermediate, with row ID tracking) or - * ColumnarTable (final output per contest API). + * @brief Result variant: IntermediateResult (non-root) or ColumnarTable (root). */ -using JoinResult = std::variant; +using JoinResult = std::variant; -/** - * @brief Recursive join execution with timing. - * @param plan Query plan with nodes and base tables. - * @param node_idx Current node index in plan.nodes. - * @param is_root True -> ColumnarTable output; false -> ExecuteResult. - * @param stats Timing accumulator. - * @return JoinResult (intermediate or final). - */ -JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, +// Forward declaration +JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, TimingStats &stats); /** * @brief Resolve plan node to JoinInput. * * ScanNode -> non-owning ColumnarTable*; JoinNode -> recursive execution - * returning owned ExtendedResult. Implements depth-first traversal. - * - * @param plan Query plan. - * @param node_idx Node index to resolve. - * @param stats Timing accumulator. 
- * @return JoinInput with data variant and metadata. + * returning owned IntermediateResult. */ -JoinInput resolve_join_input(const Plan &plan, size_t node_idx, - TimingStats &stats) { +JoinInput resolve_input(const AnalyzedPlan &plan, size_t node_idx, + TimingStats &stats) { JoinInput input; - const auto &node = plan.nodes[node_idx]; - input.node = &node; + const auto &anode = plan[node_idx]; + const auto &pnode = plan.original_plan->nodes[node_idx]; + input.node = &pnode; + input.analyzed_node = &anode; - if (const auto *scan = std::get_if(&node.data)) { - input.data = &plan.inputs[scan->base_table_id]; + if (const auto *scan = std::get_if(&anode)) { + input.data = &plan.original_plan->inputs[scan->base_table_id]; input.table_id = scan->base_table_id; } else { auto result = execute_impl(plan, node_idx, false, stats); - input.data = std::get(std::move(result)); + input.data = std::get(std::move(result)); input.table_id = 0; } return input; } /** - * @brief Unified probe + materialize helper templated on collection mode. - * - * Executes probe (nested loop or hash join) and materialization/intermediate - * construction in a single function. Template parameter eliminates runtime - * branching in hot loops. - * - * @tparam Mode Collection mode (BOTH, LEFT_ONLY, RIGHT_ONLY). 
- */ -template -JoinResult execute_join_with_mode( - bool use_nested_loop, bool probe_is_columnar, bool is_root, - const UnchainedHashtable *hash_table, const JoinInput &build_input, - const JoinInput &probe_input, const BuildProbeConfig &config, - const PlanNode &build_node, const PlanNode &probe_node, JoinSetup &setup, - io::ColumnarReader &columnar_reader, const Plan &plan, TimingStats &stats) { - - std::vector> match_buffers; - - if (use_nested_loop) { - auto nested_loop_start = std::chrono::high_resolution_clock::now(); - match_buffers = nested_loop_join( - build_input, probe_input, config.build_attr, config.probe_attr); - auto nested_loop_end = std::chrono::high_resolution_clock::now(); - stats.nested_loop_join_ms += - std::chrono::duration_cast( - nested_loop_end - nested_loop_start) - .count(); - } else { - auto probe_start = std::chrono::high_resolution_clock::now(); - if (probe_is_columnar) { - match_buffers = probe_columnar(*hash_table, probe_input, - config.probe_attr); - } else { - const auto &probe_result = - std::get(probe_input.data); - match_buffers = probe_intermediate( - *hash_table, probe_result.columns[config.probe_attr]); - } - auto probe_end = std::chrono::high_resolution_clock::now(); - stats.hash_join_probe_ms += - std::chrono::duration_cast(probe_end - - probe_start) - .count(); - } - - size_t total_matches = 0; - for (const auto &buf : match_buffers) { - total_matches += buf.count(); - } - - if (is_root) { - auto mat_start = std::chrono::high_resolution_clock::now(); - JoinResult final_result; - if (total_matches == 0) { - final_result = create_empty_result(config.remapped_attrs); - } else { - prepare_output_columns( - columnar_reader, build_input, probe_input, build_node, - probe_node, config.remapped_attrs, build_input.output_size()); - - final_result = materialize_from_buffers( - match_buffers, build_input, probe_input, config.remapped_attrs, - build_node, probe_node, build_input.output_size(), - columnar_reader, plan); - } - auto 
mat_end = std::chrono::high_resolution_clock::now(); - stats.materialize_ms += - std::chrono::duration_cast(mat_end - - mat_start) - .count(); - return final_result; - } else { - auto inter_start = std::chrono::high_resolution_clock::now(); - if (total_matches > 0) { - prepare_output_columns( - columnar_reader, build_input, probe_input, build_node, - probe_node, config.remapped_attrs, build_input.output_size()); - - construct_intermediate_from_buffers( - match_buffers, build_input, probe_input, config.remapped_attrs, - build_node, probe_node, build_input.output_size(), - columnar_reader, setup.results, setup.merged_table_ids); - } - auto inter_end = std::chrono::high_resolution_clock::now(); - stats.intermediate_ms += - std::chrono::duration_cast(inter_end - - inter_start) - .count(); - return std::move(setup.results); - } -} - -/** - * @brief Core recursive join execution. - * - * Phases: resolve L/R inputs -> select build/probe (smaller=build) -> algorithm - * choice -> build/probe -> output construction. - * - * Algorithm: nested loop if build_rows < HASH_TABLE_THRESHOLD (8); else radix- - * partitioned hash join. - * - * Memory: hash table and MatchCollector local (freed on return); child - * ExecuteResults on stack until materialization; setup.results pre-allocated. + * @brief Select build/probe sides for join input. */ -JoinResult execute_impl(const Plan &plan, size_t node_idx, bool is_root, - TimingStats &stats) { - auto &node = plan.nodes[node_idx]; - - if (!std::holds_alternative(node.data)) { - return ExtendedResult{}; - } - - const auto &join = std::get(node.data); - const auto &output_attrs = node.output_attrs; - const auto &left_node = plan.nodes[join.left]; - const auto &right_node = plan.nodes[join.right]; - - JoinInput left_input = resolve_join_input(plan, join.left, stats); - JoinInput right_input = resolve_join_input(plan, join.right, stats); - - /* Build/probe selection: smaller input = build side; remaps output_attrs. 
- */ - auto setup_start = std::chrono::high_resolution_clock::now(); - auto config = - select_build_probe_side(join, left_input, right_input, output_attrs); - const JoinInput &build_input = config.build_left ? left_input : right_input; - const JoinInput &probe_input = config.build_left ? right_input : left_input; - const auto &build_node = config.build_left ? left_node : right_node; - const auto &probe_node = config.build_left ? right_node : left_node; - - bool build_is_columnar = build_input.is_columnar(); - bool probe_is_columnar = probe_input.is_columnar(); - - /* Nested loop for <8 rows (L1-resident, no hash overhead, SIMD). */ - const size_t HASH_TABLE_THRESHOLD = 8; - size_t build_rows = build_input.row_count(config.build_attr); - bool use_nested_loop = (build_rows < HASH_TABLE_THRESHOLD); - - /* Pre-allocate ExecuteResult; ColumnarReader PageIndex built lazily. */ - JoinSetup setup = setup_join(build_input, probe_input, build_node, - probe_node, left_node, right_node, left_input, - right_input, output_attrs, build_rows); - auto setup_end = std::chrono::high_resolution_clock::now(); - auto setup_elapsed = std::chrono::duration_cast( - setup_end - setup_start); - stats.setup_ms += setup_elapsed.count(); - - /* Skip unused-side row IDs if output needs only one side (50% savings). */ - MatchCollectionMode collection_mode = determine_collection_mode( - config.remapped_attrs, config.build_left ? left_input.output_size() - : right_input.output_size()); - - /* Build hash table if needed (before mode dispatch). */ - std::optional hash_table; - if (!use_nested_loop) { - auto build_start = std::chrono::high_resolution_clock::now(); - hash_table = - build_is_columnar - ? 
build_from_columnar(build_input, config.build_attr) - : build_from_intermediate(build_input, config.build_attr); - auto build_end = std::chrono::high_resolution_clock::now(); - stats.hashtable_build_ms += - std::chrono::duration_cast(build_end - - build_start) - .count(); - } - - /* Dispatch based on collection mode - single runtime branch, then - * fully specialized template instantiation with zero branching in hot - * loops. */ - switch (collection_mode) { - case MatchCollectionMode::BOTH: - return execute_join_with_mode( - use_nested_loop, probe_is_columnar, is_root, - use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, build_node, probe_node, setup, - setup.columnar_reader, plan, stats); - - case MatchCollectionMode::LEFT_ONLY: - return execute_join_with_mode( - use_nested_loop, probe_is_columnar, is_root, - use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, build_node, probe_node, setup, - setup.columnar_reader, plan, stats); - - case MatchCollectionMode::RIGHT_ONLY: - return execute_join_with_mode( - use_nested_loop, probe_is_columnar, is_root, - use_nested_loop ? 
nullptr : &(*hash_table), build_input, - probe_input, config, build_node, probe_node, setup, - setup.columnar_reader, plan, stats); - } - - // Should never reach here, but satisfy compiler - return ExtendedResult{}; -} - -#ifdef USE_DEFERRED_MATERIALIZATION -// ============================================================================ -// DEFERRED MATERIALIZATION PATH -// ============================================================================ - -using DeferredJoinResult = std::variant; - -using materialize::construct_deferred_from_buffers; -using materialize::create_empty_deferred_result; -using materialize::materialize_deferred_from_buffers; - -// Forward declaration -DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, - size_t node_idx, bool is_root, - TimingStats &stats); - -/** - * @brief Resolve deferred plan node to DeferredInput. - */ -DeferredInput resolve_deferred_input(const DeferredPlan &deferred_plan, - size_t node_idx, TimingStats &stats) { - DeferredInput input; - const auto &dnode = deferred_plan[node_idx]; - const auto &pnode = deferred_plan.original_plan->nodes[node_idx]; - input.node = &pnode; - input.deferred_node = &dnode; - - if (const auto *dscan = std::get_if(&dnode)) { - input.data = &deferred_plan.original_plan->inputs[dscan->base_table_id]; - input.table_id = dscan->base_table_id; - } else { - auto result = - execute_deferred_impl(deferred_plan, node_idx, false, stats); - input.data = std::get(std::move(result)); - input.table_id = 0; - } - return input; -} - -/** - * @brief Select build/probe sides for deferred input. 
- */ -BuildProbeConfig select_deferred_build_probe_side( - const JoinNode &join, const DeferredInput &left_input, - const DeferredInput &right_input, +BuildProbeConfig select_join_build_probe_side( + const JoinNode &join, const JoinInput &left_input, + const JoinInput &right_input, const std::vector> &output_attrs) { BuildProbeConfig config; @@ -371,21 +117,21 @@ BuildProbeConfig select_deferred_build_probe_side( } /** - * @brief Unified probe + materialize for deferred path. + * @brief Unified probe + materialize helper templated on collection mode. */ template -DeferredJoinResult execute_deferred_join_with_mode( +JoinResult execute_join_with_mode( bool use_nested_loop, bool probe_is_columnar, bool is_root, - const UnchainedHashtable *hash_table, const DeferredInput &build_input, - const DeferredInput &probe_input, const BuildProbeConfig &config, - const DeferredJoinNode &join_node, io::ColumnarReader &columnar_reader, - const DeferredPlan &deferred_plan, TimingStats &stats) { + const UnchainedHashtable *hash_table, const JoinInput &build_input, + const JoinInput &probe_input, const BuildProbeConfig &config, + const AnalyzedJoinNode &join_node, io::ColumnarReader &columnar_reader, + const AnalyzedPlan &plan, TimingStats &stats) { std::vector> match_buffers; if (use_nested_loop) { auto nested_loop_start = std::chrono::high_resolution_clock::now(); - match_buffers = nested_loop_join_deferred( + match_buffers = nested_loop_join( build_input, probe_input, config.build_attr, config.probe_attr); auto nested_loop_end = std::chrono::high_resolution_clock::now(); stats.nested_loop_join_ms += @@ -395,16 +141,11 @@ DeferredJoinResult execute_deferred_join_with_mode( } else { auto probe_start = std::chrono::high_resolution_clock::now(); if (probe_is_columnar) { - // Create JoinInput for columnar probe - JoinInput probe_ji; - probe_ji.node = probe_input.node; - probe_ji.data = std::get(probe_input.data); - probe_ji.table_id = probe_input.table_id; - match_buffers = - 
probe_columnar(*hash_table, probe_ji, config.probe_attr); + match_buffers = probe_columnar(*hash_table, probe_input, + config.probe_attr); } else { const auto &probe_result = - std::get(probe_input.data); + std::get(probe_input.data); // Probe using materialized column (should be the join key) const auto *mat_col = probe_result.get_materialized(config.probe_attr); @@ -433,21 +174,21 @@ DeferredJoinResult execute_deferred_join_with_mode( if (is_root) { auto mat_start = std::chrono::high_resolution_clock::now(); - DeferredJoinResult final_result; + JoinResult final_result; if (total_matches == 0) { final_result = - materialize::create_empty_deferred_final(config.remapped_attrs); + materialize::create_empty_result(config.remapped_attrs); } else { // Prepare page indices for final materialization - materialize::prepare_final_deferred_columns( + materialize::prepare_final_columns( columnar_reader, build_input, probe_input, join_node, config.remapped_attrs, build_input.output_size(), config.build_left); - final_result = materialize_deferred_from_buffers( + final_result = materialize_from_buffers( match_buffers, build_input, probe_input, join_node, config.remapped_attrs, build_input.output_size(), - config.build_left, columnar_reader, deferred_plan); + config.build_left, columnar_reader, plan); } auto mat_end = std::chrono::high_resolution_clock::now(); stats.materialize_ms += @@ -457,20 +198,20 @@ DeferredJoinResult execute_deferred_join_with_mode( return final_result; } else { auto inter_start = std::chrono::high_resolution_clock::now(); - DeferredResult result; + IntermediateResult result; if (total_matches > 0) { // Prepare page indices for intermediate construction - materialize::prepare_deferred_columns( + materialize::prepare_intermediate_columns( columnar_reader, build_input, probe_input, join_node, config.remapped_attrs, build_input.output_size(), config.build_left); - construct_deferred_from_buffers( + construct_intermediate_from_buffers( match_buffers, 
build_input, probe_input, join_node, config.remapped_attrs, build_input.output_size(), - config.build_left, columnar_reader, result, deferred_plan); + config.build_left, columnar_reader, result, plan); } else { - result = create_empty_deferred_result(join_node); + result = create_empty_intermediate_result(join_node); } auto inter_end = std::chrono::high_resolution_clock::now(); stats.intermediate_ms += @@ -482,44 +223,37 @@ DeferredJoinResult execute_deferred_join_with_mode( } /** - * @brief Recursive deferred join execution. + * @brief Recursive join execution. */ -DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, - size_t node_idx, bool is_root, - TimingStats &stats) { - const auto &dnode = deferred_plan[node_idx]; +JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, + TimingStats &stats) { + const auto &anode = plan[node_idx]; - if (std::holds_alternative(dnode)) { - return DeferredResult{}; + if (std::holds_alternative(anode)) { + return IntermediateResult{}; } - const auto &djoin = std::get(dnode); - const auto &plan = *deferred_plan.original_plan; - const auto &pnode = plan.nodes[node_idx]; + const auto &ajoin = std::get(anode); + const auto &original_plan = *plan.original_plan; + const auto &pnode = original_plan.nodes[node_idx]; const auto &join = std::get(pnode.data); // Resolve inputs - DeferredInput left_input = - resolve_deferred_input(deferred_plan, djoin.left_child_idx, stats); - DeferredInput right_input = - resolve_deferred_input(deferred_plan, djoin.right_child_idx, stats); + JoinInput left_input = resolve_input(plan, ajoin.left_child_idx, stats); + JoinInput right_input = resolve_input(plan, ajoin.right_child_idx, stats); // Build/probe selection auto setup_start = std::chrono::high_resolution_clock::now(); - auto config = select_deferred_build_probe_side( - join, left_input, right_input, djoin.output_attrs); - const DeferredInput &build_input = - config.build_left ? 
left_input : right_input; - const DeferredInput &probe_input = - config.build_left ? right_input : left_input; + auto config = select_join_build_probe_side(join, left_input, right_input, + ajoin.output_attrs); + const JoinInput &build_input = config.build_left ? left_input : right_input; + const JoinInput &probe_input = config.build_left ? right_input : left_input; bool build_is_columnar = build_input.is_columnar(); bool probe_is_columnar = probe_input.is_columnar(); const size_t HASH_TABLE_THRESHOLD = 8; size_t build_rows = build_input.row_count(config.build_attr); - // Use nested loop for small build tables - works with both columnar and - // DeferredResult inputs (join keys are always materialized). bool use_nested_loop = (build_rows < HASH_TABLE_THRESHOLD); io::ColumnarReader columnar_reader; @@ -530,7 +264,7 @@ DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, // Use pre-computed collection mode from plan analysis. // base_collection_mode assumes build=left; flip if build=right at runtime. 
- MatchCollectionMode mode = djoin.base_collection_mode; + MatchCollectionMode mode = ajoin.base_collection_mode; if (!config.build_left) { if (mode == MatchCollectionMode::LEFT_ONLY) mode = MatchCollectionMode::RIGHT_ONLY; @@ -543,21 +277,16 @@ DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, if (!use_nested_loop) { auto build_start = std::chrono::high_resolution_clock::now(); if (build_is_columnar) { - JoinInput build_ji; - build_ji.node = build_input.node; - build_ji.data = std::get(build_input.data); - build_ji.table_id = build_input.table_id; - hash_table = build_from_columnar(build_ji, config.build_attr); + hash_table = build_from_columnar(build_input, config.build_attr); } else { - const auto &dr = std::get(build_input.data); - const auto *mat_col = dr.get_materialized(config.build_attr); + const auto &ir = std::get(build_input.data); + const auto *mat_col = ir.get_materialized(config.build_attr); if (!mat_col) { std::fprintf( stderr, "ERROR: build join key not materialized! build_attr=%zu " "mat_map_size=%zu num_rows=%zu\n", - config.build_attr, dr.materialized_map.size(), dr.num_rows); - // Fatal - this should never happen + config.build_attr, ir.materialized_map.size(), ir.num_rows); std::abort(); } hash_table.emplace(mat_col->row_count()); @@ -573,29 +302,27 @@ DeferredJoinResult execute_deferred_impl(const DeferredPlan &deferred_plan, // Dispatch based on collection mode switch (mode) { case MatchCollectionMode::BOTH: - return execute_deferred_join_with_mode( + return execute_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, djoin, columnar_reader, deferred_plan, stats); + probe_input, config, ajoin, columnar_reader, plan, stats); case MatchCollectionMode::LEFT_ONLY: - return execute_deferred_join_with_mode( + return execute_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? 
nullptr : &(*hash_table), build_input, - probe_input, config, djoin, columnar_reader, deferred_plan, stats); + probe_input, config, ajoin, columnar_reader, plan, stats); case MatchCollectionMode::RIGHT_ONLY: - return execute_deferred_join_with_mode( + return execute_join_with_mode( use_nested_loop, probe_is_columnar, is_root, use_nested_loop ? nullptr : &(*hash_table), build_input, - probe_input, config, djoin, columnar_reader, deferred_plan, stats); + probe_input, config, ajoin, columnar_reader, plan, stats); } - return DeferredResult{}; + return IntermediateResult{}; } -#endif // USE_DEFERRED_MATERIALIZATION - /** * @brief Public entry point: execute plan from root, return ColumnarTable. * @param plan Query plan with nodes and base tables. @@ -613,26 +340,17 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, TimingStats stats; -#ifdef USE_DEFERRED_MATERIALIZATION - // Deferred materialization path: analyze plan, then execute with deferred - // intermediate construction + // Analyze plan and execute with deferred intermediate construction auto analyze_start = std::chrono::high_resolution_clock::now(); - DeferredPlan deferred_plan = analyze_plan(plan); + AnalyzedPlan analyzed_plan = analyze_plan(plan); auto analyze_end = std::chrono::high_resolution_clock::now(); stats.analyze_plan_ms = std::chrono::duration_cast(analyze_end - analyze_start) .count(); - auto deferred_result = - execute_deferred_impl(deferred_plan, plan.root, true, stats); - ColumnarTable final_result = - std::get(std::move(deferred_result)); -#else - // Eager materialization path (original) - auto result = execute_impl(plan, plan.root, true, stats); + auto result = execute_impl(analyzed_plan, plan.root, true, stats); ColumnarTable final_result = std::get(std::move(result)); -#endif auto total_end = std::chrono::high_resolution_clock::now(); auto total_elapsed = std::chrono::duration_cast( @@ -640,19 +358,13 @@ ColumnarTable execute(const Plan &plan, void *context, 
TimingStats *stats_out, stats.total_execution_ms = total_elapsed.count(); if (show_detailed_timing) { - int64_t accounted = stats.hashtable_build_ms + - stats.hash_join_probe_ms + - stats.nested_loop_join_ms + stats.materialize_ms + - stats.setup_ms + stats.intermediate_ms; -#ifdef USE_DEFERRED_MATERIALIZATION - accounted += stats.analyze_plan_ms; -#endif + int64_t accounted = + stats.hashtable_build_ms + stats.hash_join_probe_ms + + stats.nested_loop_join_ms + stats.materialize_ms + stats.setup_ms + + stats.intermediate_ms + stats.analyze_plan_ms; int64_t other = stats.total_execution_ms - accounted; -#ifdef USE_DEFERRED_MATERIALIZATION - std::cout << "[DEFERRED] Plan Analysis Time: " << stats.analyze_plan_ms - << " ms\n"; -#endif + std::cout << "Plan Analysis Time: " << stats.analyze_plan_ms << " ms\n"; std::cout << "Hashtable Build Time: " << stats.hashtable_build_ms << " ms\n"; std::cout << "Hash Join Probe Time: " << stats.hash_join_probe_ms From d2bbe70380c222e5c9fa85bcdf1ca23f082acfea Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Thu, 22 Jan 2026 14:18:05 +0200 Subject: [PATCH 05/13] feat: deferred tables --- include/data_model/intermediate.h | 188 +++++---- .../materialization/construct_intermediate.h | 374 ++++++++---------- include/materialization/materialize.h | 50 +-- 3 files changed, 302 insertions(+), 310 deletions(-) diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index 4a29919..6a697fa 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -5,7 +5,7 @@ * Provides: * - mema::value_t: 4-byte value encoding (INT32 direct, VARCHAR as page/offset) * - mema::column_t: 16KB-paged column for materialized values - * - mema::deferred_column_t: 32KB-paged column for 64-bit provenance encoding + * - mema::DeferredTable: 16KB-paged 32-bit row ID storage per base table * - IntermediateResult: Lightweight result with selective materialization * - JoinInput: Unified abstraction over 
columnar tables and intermediate * results @@ -29,10 +29,11 @@ /** * @namespace mema - * @brief Compact join intermediate: value_t (4B) + column_t (16KB pages). + * @brief Compact join intermediate: value_t (4B) + column_t (16KB pages) + + * DeferredTable (32-bit row IDs). * * value_t: INT32 direct or VARCHAR page/offset ref. column_t: arena-allocated - * pages with write_at(). + * pages with write_at(). DeferredTable: 32-bit row ID storage per base table. * * @see Contest::IntermediateResult, plan.h ColumnarTable. */ @@ -176,84 +177,85 @@ struct column_t { using Columnar = std::vector; /** - * @brief 64-bit provenance column for deferred materialization. + * @brief Per-base-table deferred row ID storage with multi-column tracking. * - * Stores encoded (table_id, column_idx, row_id) for each row using - * DeferredProvenance encoding. Uses 32KB pages with 4096 entries each. + * Stores 32-bit row IDs for a single base table. All columns from this + * base table share the same row ID lookup, reducing memory from 8 bytes + * per column to 4 bytes per table. * - * @see DeferredProvenance for encoding scheme. - * @see IntermediateResult for usage. + * Uses 16KB pages (reuses IR_PAGE arena chunk) with 4096 uint32_t entries. 
*/ -struct deferred_column_t { - static constexpr size_t PAGE_SIZE = 1 << 15; // 32KB +struct DeferredTable { + static constexpr size_t PAGE_SIZE = 1 << 14; // 16KB static constexpr size_t ENTRIES_PER_PAGE = - PAGE_SIZE / sizeof(uint64_t); // 4096 + PAGE_SIZE / sizeof(uint32_t); // 4096 static constexpr size_t ENTRY_SHIFT = 12; // log2(4096) static constexpr size_t ENTRY_MASK = ENTRIES_PER_PAGE - 1; struct alignas(PAGE_SIZE) Page { - uint64_t data[ENTRIES_PER_PAGE]; + uint32_t data[ENTRIES_PER_PAGE]; }; std::vector pages; size_t num_values = 0; - deferred_column_t() = default; + /// Base table ID this deferred table references + uint8_t base_table_id = 0; - deferred_column_t(deferred_column_t &&other) noexcept - : pages(std::move(other.pages)), num_values(other.num_values) { + /// True if this deferred table comes from build side (vs probe) + bool from_build = false; + + /// Column indices from this base table that need deferred resolution + std::vector column_indices; + + DeferredTable() = default; + + DeferredTable(DeferredTable &&other) noexcept + : pages(std::move(other.pages)), num_values(other.num_values), + base_table_id(other.base_table_id), from_build(other.from_build), + column_indices(std::move(other.column_indices)) { other.pages.clear(); other.num_values = 0; } - deferred_column_t &operator=(deferred_column_t &&other) noexcept { + DeferredTable &operator=(DeferredTable &&other) noexcept { if (this != &other) { pages = std::move(other.pages); num_values = other.num_values; + base_table_id = other.base_table_id; + from_build = other.from_build; + column_indices = std::move(other.column_indices); other.pages.clear(); other.num_values = 0; } return *this; } - deferred_column_t(const deferred_column_t &) = delete; - deferred_column_t &operator=(const deferred_column_t &) = delete; + DeferredTable(const DeferredTable &) = delete; + DeferredTable &operator=(const DeferredTable &) = delete; - ~deferred_column_t() = default; + ~DeferredTable() = default; - /** 
@brief O(1) read: idx>>12 for page, idx&0xFFF for offset. */ - inline uint64_t operator[](size_t idx) const { + /// O(1) read: idx >> 12 for page, idx & 0xFFF for offset + inline uint32_t operator[](size_t idx) const { return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK]; } - /** @brief Thread-safe write at idx (requires pages to be set up first). */ - inline void write_at(size_t idx, uint64_t val) { - pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK] = val; + /// Thread-safe write at idx (requires pages set up first) + inline void write_at(size_t idx, uint32_t row_id) { + pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK] = row_id; } - /** @brief Total value count. */ size_t row_count() const { return num_values; } + void set_row_count(size_t count) { num_values = count; } - /** @brief Set row count without allocation (for assembly pattern). */ - inline void set_row_count(size_t count) { num_values = count; } - - /** @brief Pre-allocate pages from arena. */ - inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, - size_t count) { - static_assert( - sizeof(Page) == - Contest::platform::ChunkSize< - Contest::platform::ChunkType::DEFERRED_PAGE>::value, - "Page size mismatch with DEFERRED_PAGE chunk size"); - size_t pages_needed = (count + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE; - pages.reserve(pages_needed); - for (size_t i = 0; i < pages_needed; ++i) { - void *ptr = - arena - .alloc_chunk(); - pages.push_back(reinterpret_cast(ptr)); + /// Check if this table tracks a specific base column + bool has_column(uint8_t col_idx) const { + for (uint8_t c : column_indices) { + if (c == col_idx) + return true; } - num_values = count; + return false; } }; @@ -261,19 +263,28 @@ struct deferred_column_t { namespace Contest { +/** + * @brief Reference from a column to its deferred table. 
+ */ +struct DeferredColumnRef { + uint8_t table_idx; ///< Index into IntermediateResult::deferred_tables + uint8_t base_col; ///< Base column index in Plan::inputs[base_table_id] +}; + /** * @brief Lightweight intermediate result with selective materialization. * * Stores only columns marked MATERIALIZE (typically just the parent's join - * key). All other columns are resolved at final materialization using - * per-column 64-bit provenance (table_id, column_idx, row_id). + * key). Deferred columns use per-table 32-bit row ID storage instead of + * per-column 64-bit provenance, achieving up to 10x memory reduction for + * multi-column deferred scenarios. * - * Memory savings: For a join projecting N columns where only 1 is a join key, - * IntermediateResult uses ~1/N the memory for data columns. Additionally, we - * only track provenance for deferred columns (not all tables). + * Memory savings example: For 5 columns from same base table: + * - Old: 5 columns Ɨ 8 bytes = 40 bytes per row + * - New: 1 DeferredTable Ɨ 4 bytes = 4 bytes per row * * @see AnalyzedColumnInfo for materialization decisions. - * @see DeferredProvenance for 64-bit encoding scheme. + * @see DeferredTable for 32-bit row ID storage. */ struct IntermediateResult { /// Only columns marked MATERIALIZE (typically 1 join key). @@ -283,13 +294,15 @@ struct IntermediateResult { /// deferred). std::vector> materialized_map; - /// Per-deferred-column provenance (64-bit encoded table_id+column_idx+row). - /// One deferred_column_t per DEFER column, stores full provenance per row. - std::vector deferred_columns; + /// Per-base-table deferred row ID storage. One DeferredTable per unique + /// (from_build, base_table_id) pair. All columns from same base table share + /// the same row ID lookup. + std::vector deferred_tables; - /// Map: original column index -> index in deferred_columns (nullopt if - /// materialized). 
- std::vector> deferred_map; + /// Map: original column index -> DeferredColumnRef (nullopt if + /// materialized). The ref contains table_idx (into deferred_tables) and + /// base_col for resolution. + std::vector> deferred_map; /// Reference to node info for column provenance resolution. const AnalyzedJoinNode *node_info = nullptr; @@ -325,22 +338,36 @@ struct IntermediateResult { return &materialized[*materialized_map[orig_idx]]; } - /** @brief Get deferred column provenance, or nullptr if materialized. */ - const mema::deferred_column_t *get_deferred(size_t orig_idx) const { + /** @brief Get deferred table for a column, or nullptr if materialized. */ + const mema::DeferredTable *get_deferred_table(size_t orig_idx) const { if (!is_deferred(orig_idx)) return nullptr; - return &deferred_columns[*deferred_map[orig_idx]]; + return &deferred_tables[deferred_map[orig_idx]->table_idx]; } - /** @brief Get mutable deferred column provenance, or nullptr. */ - mema::deferred_column_t *get_deferred_mut(size_t orig_idx) { + /** @brief Get mutable deferred table for a column, or nullptr. */ + mema::DeferredTable *get_deferred_table_mut(size_t orig_idx) { if (!is_deferred(orig_idx)) return nullptr; - return &deferred_columns[*deferred_map[orig_idx]]; + return &deferred_tables[deferred_map[orig_idx]->table_idx]; } - /** @brief Number of deferred columns. */ - size_t num_deferred() const { return deferred_columns.size(); } + /** @brief Get base column index for deferred column. */ + uint8_t get_deferred_base_col(size_t orig_idx) const { + if (!is_deferred(orig_idx)) + return 0; + return deferred_map[orig_idx]->base_col; + } + + /** @brief Get full DeferredColumnRef for a column, or nullptr. */ + const DeferredColumnRef *get_deferred_ref(size_t orig_idx) const { + if (!is_deferred(orig_idx)) + return nullptr; + return &(*deferred_map[orig_idx]); + } + + /** @brief Number of deferred tables (unique base tables). 
*/ + size_t num_deferred_tables() const { return deferred_tables.size(); } }; /** @@ -397,15 +424,40 @@ struct JoinInput { } /** - * @brief Get deferred column provenance for a column index. + * @brief Get deferred table for a column index. + * + * For columnar inputs, returns nullptr (caller must encode fresh). + * For IntermediateResult inputs, returns existing deferred table. + */ + const mema::DeferredTable *get_deferred_table(size_t col_idx) const { + if (is_columnar()) + return nullptr; + return std::get(data).get_deferred_table(col_idx); + } + + /** + * @brief Get base column index for a deferred column. + * + * For columnar inputs, returns 0 (caller must use column metadata). + * For IntermediateResult inputs, returns stored base column index. + */ + uint8_t get_deferred_base_col(size_t col_idx) const { + if (is_columnar()) + return 0; + return std::get(data).get_deferred_base_col( + col_idx); + } + + /** + * @brief Get full DeferredColumnRef for a column index. * * For columnar inputs, returns nullptr (caller must encode fresh). - * For IntermediateResult inputs, returns existing provenance column. + * For IntermediateResult inputs, returns pointer to DeferredColumnRef. */ - const mema::deferred_column_t *get_deferred_column(size_t col_idx) const { + const DeferredColumnRef *get_deferred_ref(size_t col_idx) const { if (is_columnar()) return nullptr; - return std::get(data).get_deferred(col_idx); + return std::get(data).get_deferred_ref(col_idx); } }; diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index ed1834d..99c7d69 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -3,13 +3,13 @@ * @brief Constructs intermediate results for multi-way joins. * * Allocates and populates IntermediateResult with only MATERIALIZE columns - * (typically just the parent's join key). 
Deferred columns store 64-bit - * provenance (table_id, column_idx, row_id) for resolution at final output. + * (typically just the parent's join key). Deferred columns use per-table + * 32-bit row ID storage for memory efficiency. * * Optimized with: * - Column-major iteration for cache locality * - Precomputed source metadata to avoid per-row variant access - * - SIMD provenance encoding (AVX2/NEON) for deferred columns + * - Per-table 32-bit row ID storage (vs per-column 64-bit provenance) * - Batch access to match collector chunks * * @see materialize.h for final resolution of deferred columns. @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -27,12 +28,6 @@ #include #include -#if defined(__x86_64__) -#include -#elif defined(__aarch64__) -#include -#endif - namespace Contest { namespace materialize { @@ -43,135 +38,60 @@ using Contest::platform::THREAD_COUNT; using Contest::platform::worker_pool; // ============================================================================ -// SIMD Provenance Encoding +// Row ID Batch Operations (for 32-bit per-table deferred) // ============================================================================ -namespace simd_provenance { - -#if defined(__x86_64__) && defined(__AVX2__) -inline constexpr size_t BATCH_SIZE = 4; ///< 4 x uint64_t in AVX2 (256-bit) -#elif defined(__aarch64__) -inline constexpr size_t BATCH_SIZE = 2; ///< 2 x uint64_t in NEON (128-bit) -#else -inline constexpr size_t BATCH_SIZE = 0; ///< No SIMD available -#endif +namespace row_id_ops { /** - * @brief Encode provenance for batch of row IDs using SIMD. - * - * Encodes (table_id << 56) | (column_idx << 48) | row_id for each row. - * Uses AVX2 on x86_64 or NEON on aarch64, with scalar fallback. + * @brief Write row IDs directly from columnar input. 
* - * @param dest Destination deferred column - * @param start_idx Starting output index - * @param row_ids Pointer to row IDs (from IndexChunk, contiguous) - * @param count Number of row IDs to process - * @param table_id Base table ID (constant for all rows) - * @param column_idx Base column index (constant for all rows) - * @return Number of rows processed (always == count) + * For columnar inputs, we just write the row_id directly (it's already + * the base table row ID). */ -inline size_t encode_provenance_batch(mema::deferred_column_t &dest, - size_t start_idx, const uint32_t *row_ids, - size_t count, uint8_t table_id, - uint8_t column_idx) { - // Precompute constant prefix: (table_id << 56) | (column_idx << 48) - const uint64_t prefix = DeferredProvenance::encode(table_id, column_idx, 0); - - size_t i = 0; - -#if defined(__x86_64__) && defined(__AVX2__) - // AVX2: Process 4 x uint64_t at a time - // Load 4 x uint32_t, zero-extend to 4 x uint64_t, OR with prefix - const __m256i prefix_vec = _mm256_set1_epi64x(static_cast(prefix)); - - for (; i + 4 <= count; i += 4) { - // Load 4 x uint32_t and zero-extend to 4 x uint64_t - __m128i rows_32 = - _mm_loadu_si128(reinterpret_cast(row_ids + i)); - __m256i rows_64 = _mm256_cvtepu32_epi64(rows_32); - - // OR with prefix to create provenance values - __m256i result = _mm256_or_si256(rows_64, prefix_vec); - - // Store to aligned buffer, then write individually (page-safe) - alignas(32) uint64_t out[4]; - _mm256_store_si256(reinterpret_cast<__m256i *>(out), result); - - dest.write_at(start_idx + i, out[0]); - dest.write_at(start_idx + i + 1, out[1]); - dest.write_at(start_idx + i + 2, out[2]); - dest.write_at(start_idx + i + 3, out[3]); - } -#elif defined(__aarch64__) - // NEON: Process 2 x uint64_t at a time - const uint64x2_t prefix_vec = vdupq_n_u64(prefix); - - for (; i + 2 <= count; i += 2) { - // Load 2 x uint32_t and zero-extend to 2 x uint64_t - uint32x2_t rows_32 = vld1_u32(row_ids + i); - uint64x2_t rows_64 = 
vmovl_u32(rows_32); - - // OR with prefix - uint64x2_t result = vorrq_u64(rows_64, prefix_vec); - - // Store individually (page boundary safe) - dest.write_at(start_idx + i, vgetq_lane_u64(result, 0)); - dest.write_at(start_idx + i + 1, vgetq_lane_u64(result, 1)); - } -#endif - - // Scalar remainder - for (; i < count; ++i) { - dest.write_at(start_idx + i, - prefix | static_cast(row_ids[i])); +inline size_t write_row_ids_direct(mema::DeferredTable &dest, size_t start_idx, + const uint32_t *row_ids, size_t count) { + for (size_t i = 0; i < count; ++i) { + dest.write_at(start_idx + i, row_ids[i]); } - return count; } /** - * @brief Copy provenance from source column using batch reads. - * - * Copies existing 64-bit provenance values from child intermediate. - * Uses contiguous batch access for better cache behavior. + * @brief Copy row IDs from child deferred table. * - * @param dest Destination deferred column - * @param start_idx Starting output index - * @param src Source deferred column (from child) - * @param row_ids Row indices into source column - * @param count Number of rows to copy - * @return Number of rows processed (always == count) + * For intermediate inputs, we look up the base table row ID from the + * child's deferred table and copy it to the parent's deferred table. 
*/ -inline size_t copy_provenance_batch(mema::deferred_column_t &dest, - size_t start_idx, - const mema::deferred_column_t &src, - const uint32_t *row_ids, size_t count) { +inline size_t copy_row_ids_from_child(mema::DeferredTable &dest, + size_t start_idx, + const mema::DeferredTable &src, + const uint32_t *row_ids, size_t count) { for (size_t i = 0; i < count; ++i) { dest.write_at(start_idx + i, src[row_ids[i]]); } return count; } -} // namespace simd_provenance +} // namespace row_id_ops // ============================================================================ // Source Precomputation Structures // ============================================================================ /** - * @brief Precomputed metadata for deferred column sources. + * @brief Precomputed metadata for a deferred table source. * - * Tracks where each deferred column's provenance comes from: - * - For columnar inputs: encode fresh (table_id, column_idx, row_id) - * - For IntermediateResult inputs: copy existing provenance from child + * Groups columns by (from_build, base_table_id) so we only store 32-bit + * row IDs once per unique base table instead of 64-bit provenance per column. */ -struct DeferredColumnSource { - const mema::deferred_column_t *source_col = - nullptr; ///< Source if from intermediate. - uint8_t base_table_id = 0; ///< Base table ID for encoding. - uint8_t base_column_idx = 0; ///< Base column index for encoding. - bool from_build = false; ///< True if from build side. - bool needs_encode = false; ///< True if columnar (needs fresh encode). +struct DeferredTableSource { + const mema::DeferredTable *child_table = + nullptr; ///< Source deferred table from child (if any). + uint8_t base_table_id = 0; ///< Base table ID. + uint8_t dest_table_idx = 0; ///< Index in result.deferred_tables[]. + bool from_build = false; ///< True if from build side. + bool needs_direct = false; ///< True if columnar (write row IDs directly). 
}; /** @@ -183,8 +103,8 @@ struct alignas(8) MaterializedColumnSource { const mema::column_t *intermediate_col = nullptr; ///< Source if from IntermediateResult materialized const Column *columnar_col = nullptr; ///< Source if from ColumnarTable - const mema::deferred_column_t *deferred_resolve_col = - nullptr; ///< Source if needs deferred resolution + const mema::DeferredTable *deferred_table = + nullptr; ///< Source deferred table if needs resolution size_t child_output_idx = 0; ///< Index in child's output size_t mat_col_idx = 0; ///< Index in result.materialized[] DataType type = DataType::INT32; @@ -294,57 +214,93 @@ create_empty_intermediate_result(const AnalyzedJoinNode &node) { result.deferred_map.resize(node.columns.size(), std::nullopt); size_t mat_count = 0; - size_t def_count = 0; for (const auto &col : node.columns) { if (col.resolution == ColumnResolution::MATERIALIZE) { result.materialized_map[col.original_idx] = mat_count++; - } else { - result.deferred_map[col.original_idx] = def_count++; } + // For empty result, we don't need to set up deferred tables } result.materialized.resize(mat_count); - result.deferred_columns.resize(def_count); return result; } /** - * @brief Prepare deferred column sources for intermediate construction. + * @brief Prepare deferred table sources for intermediate construction. + * + * Groups deferred columns by (from_build, base_table_id) to create + * DeferredTable entries. Returns list of sources for populating the tables. 
*/ -inline std::vector -prepare_deferred_sources(const AnalyzedJoinNode &join_node, - const JoinInput &build_input, - const JoinInput &probe_input, bool build_is_left) { - std::vector sources; - sources.reserve(join_node.num_deferred_columns); +inline std::vector +prepare_deferred_table_sources(const AnalyzedJoinNode &join_node, + const JoinInput &build_input, + const JoinInput &probe_input, bool build_is_left, + IntermediateResult &out_result) { + // Map from (from_build << 8 | base_table_id) -> dest_table_idx + std::unordered_map table_key_to_idx; + std::vector sources; for (const auto &col : join_node.columns) { if (col.resolution != ColumnResolution::DEFER) continue; - DeferredColumnSource src; - src.base_table_id = col.provenance.base_table_id; - src.base_column_idx = col.provenance.base_column_idx; - src.from_build = (col.from_left == build_is_left); - - const auto &src_input = src.from_build ? build_input : probe_input; - - if (src_input.is_columnar()) { - src.needs_encode = true; - src.source_col = nullptr; - } else { - const auto *child_def = - src_input.get_deferred_column(col.child_output_idx); - if (child_def) { - src.needs_encode = false; - src.source_col = child_def; + bool from_build = (col.from_left == build_is_left); + uint16_t key = (static_cast(from_build) << 8) | + col.provenance.base_table_id; + + auto it = table_key_to_idx.find(key); + uint8_t dest_idx; + + if (it == table_key_to_idx.end()) { + // New deferred table needed + dest_idx = static_cast(out_result.deferred_tables.size()); + table_key_to_idx[key] = dest_idx; + + mema::DeferredTable dt; + dt.base_table_id = col.provenance.base_table_id; + dt.from_build = from_build; + out_result.deferred_tables.push_back(std::move(dt)); + + // Create source entry + DeferredTableSource src; + src.base_table_id = col.provenance.base_table_id; + src.dest_table_idx = dest_idx; + src.from_build = from_build; + + const auto &src_input = from_build ? 
build_input : probe_input; + if (src_input.is_columnar()) { + src.needs_direct = true; + src.child_table = nullptr; } else { - src.needs_encode = true; - src.source_col = nullptr; + // Find child's deferred table for this base table + const auto *child_ref = + src_input.get_deferred_ref(col.child_output_idx); + if (child_ref) { + src.needs_direct = false; + src.child_table = + src_input.get_deferred_table(col.child_output_idx); + } else { + // Child materialized this, shouldn't happen for DEFER cols + src.needs_direct = true; + src.child_table = nullptr; + } } + sources.push_back(src); + } else { + dest_idx = it->second; } - sources.push_back(src); + + // Add column to deferred table's column list + out_result.deferred_tables[dest_idx].column_indices.push_back( + col.provenance.base_column_idx); + + // Set up deferred_map entry + DeferredColumnRef ref; + ref.table_idx = dest_idx; + ref.base_col = col.provenance.base_column_idx; + out_result.deferred_map[col.original_idx] = ref; } + return sources; } @@ -391,8 +347,9 @@ prepare_materialized_sources(const AnalyzedJoinNode &join_node, ir.get_materialized(col.child_output_idx); } else if (ir.is_deferred(col.child_output_idx)) { src.needs_deferred_resolve = true; - src.deferred_resolve_col = - ir.get_deferred(col.child_output_idx); + src.deferred_table = + ir.get_deferred_table(col.child_output_idx); + // base_column_idx is already set from col.provenance } } sources.push_back(src); @@ -408,10 +365,9 @@ prepare_materialized_sources(const AnalyzedJoinNode &join_node, /** * @brief Constructs intermediate result from thread-local buffers. * - * Optimized with column-major iteration and SIMD provenance encoding. + * Optimized with column-major iteration and per-table 32-bit row ID storage. * Only materializes columns marked MATERIALIZE in the AnalyzedJoinNode. - * Deferred columns store 64-bit provenance encoding for resolution at final - * output. + * Deferred columns share row ID storage per unique base table. 
* * @tparam Mode Collection mode for compile-time specialization. * @param buffers Thread-local match buffers from probe. @@ -454,30 +410,31 @@ void construct_intermediate_from_buffers( out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); out_result.deferred_map.resize(join_node.columns.size(), std::nullopt); + // Count materialized columns and set up maps size_t mat_count = 0; - size_t def_count = 0; for (const auto &col : join_node.columns) { if (col.resolution == ColumnResolution::MATERIALIZE) { out_result.materialized_map[col.original_idx] = mat_count++; - } else { - out_result.deferred_map[col.original_idx] = def_count++; } } - // Precompute sources for column-major iteration + // Prepare deferred table sources (this populates deferred_tables and + // deferred_map) + auto deferred_sources = prepare_deferred_table_sources( + join_node, build_input, probe_input, build_is_left, out_result); + + // Precompute materialized sources auto mat_sources = prepare_materialized_sources(join_node, build_input, probe_input, build_is_left); - auto deferred_sources = prepare_deferred_sources( - join_node, build_input, probe_input, build_is_left); // Pre-allocate pages using Page = mema::column_t::Page; - using DeferredPage = mema::deferred_column_t::Page; + using DeferredPage = mema::DeferredTable::Page; size_t mat_pages_needed = (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; size_t def_pages_needed = - (total_matches + mema::deferred_column_t::ENTRIES_PER_PAGE - 1) / - mema::deferred_column_t::ENTRIES_PER_PAGE; + (total_matches + mema::DeferredTable::ENTRIES_PER_PAGE - 1) / + mema::DeferredTable::ENTRIES_PER_PAGE; out_result.materialized.resize(mat_count); for (size_t c = 0; c < mat_count; ++c) { @@ -485,10 +442,9 @@ void construct_intermediate_from_buffers( out_result.materialized[c].set_row_count(total_matches); } - out_result.deferred_columns.resize(def_count); - for (size_t d = 0; d < def_count; ++d) { - 
out_result.deferred_columns[d].pages.resize(def_pages_needed); - out_result.deferred_columns[d].set_row_count(total_matches); + for (auto &dt : out_result.deferred_tables) { + dt.pages.resize(def_pages_needed); + dt.set_row_count(total_matches); } // Set source metadata for materialized columns @@ -500,6 +456,7 @@ void construct_intermediate_from_buffers( } const size_t num_threads = THREAD_COUNT; + const size_t num_deferred_tables = out_result.deferred_tables.size(); // Parallel page allocation worker_pool().execute([&](size_t t) { @@ -512,14 +469,14 @@ void construct_intermediate_from_buffers( col.pages[p] = reinterpret_cast(ptr); } } - for (size_t d = 0; d < def_count; ++d) { - auto &def_col = out_result.deferred_columns[d]; + for (size_t d = 0; d < num_deferred_tables; ++d) { + auto &dt = out_result.deferred_tables[d]; for (size_t p = t; p < def_pages_needed; p += num_threads) { + // Use IR_PAGE (16KB) for DeferredTable pages void *ptr = Contest::platform::get_arena(t) - .alloc_chunk< - Contest::platform::ChunkType::DEFERRED_PAGE>(); - def_col.pages[p] = reinterpret_cast(ptr); + .alloc_chunk(); + dt.pages[p] = reinterpret_cast(ptr); } } }); @@ -564,23 +521,21 @@ void construct_intermediate_from_buffers( for (uint32_t rid : range) { dest_col.write_at(k++, vec[rid]); } - } else if (src.needs_deferred_resolve && src.deferred_resolve_col) { - // Deferred in child - resolve via provenance - const auto &def_col = *src.deferred_resolve_col; + } else if (src.needs_deferred_resolve && src.deferred_table) { + // Deferred in child - resolve via deferred table + base table + const auto &def_table = *src.deferred_table; size_t k = start; for (uint32_t rid : range) { - uint64_t prov = def_col[rid]; - uint8_t base_tid = DeferredProvenance::table(prov); - uint8_t base_col = DeferredProvenance::column(prov); - uint64_t base_row = DeferredProvenance::row(prov); + uint32_t base_row = def_table[rid]; if (analyzed_plan.original_plan) [[likely]] { const auto &base_table = - 
analyzed_plan.original_plan->inputs[base_tid]; + analyzed_plan.original_plan + ->inputs[src.base_table_id]; mema::value_t val = columnar_reader.read_value_direct_public( - base_table.columns[base_col], - static_cast(base_row), src.type); + base_table.columns[src.base_column_idx], + base_row, src.type); dest_col.write_at(k++, val); } else { dest_col.write_at( @@ -591,52 +546,33 @@ void construct_intermediate_from_buffers( } // ==================================================================== - // Process DEFERRED columns (column-major with SIMD batch encoding) + // Process DEFERRED tables (one pass per unique base table) // ==================================================================== - for (size_t d = 0; d < deferred_sources.size(); ++d) { - const auto &def_src = deferred_sources[d]; - auto &dest_def_col = out_result.deferred_columns[d]; - - if (def_src.needs_encode) { - // Fresh encoding from columnar input - use SIMD batch - auto batch_reader = def_src.from_build - ? buf.left_batch_reader() - : buf.right_batch_reader(); - - size_t k = start; - while (batch_reader.has_more()) { - size_t batch_count; - // Request larger batches for SIMD efficiency - constexpr size_t MAX_BATCH = - simd_provenance::BATCH_SIZE > 0 ? 64 : 256; - const uint32_t *row_ids = - batch_reader.get_batch(MAX_BATCH, batch_count); - - if (batch_count > 0) { - simd_provenance::encode_provenance_batch( - dest_def_col, k, row_ids, batch_count, - def_src.base_table_id, def_src.base_column_idx); - k += batch_count; - } - } - } else if (def_src.source_col) { - // Copy existing provenance from child intermediate - auto batch_reader = def_src.from_build - ? 
buf.left_batch_reader() - : buf.right_batch_reader(); - - size_t k = start; - while (batch_reader.has_more()) { - size_t batch_count; - const uint32_t *row_ids = - batch_reader.get_batch(256, batch_count); - - if (batch_count > 0) { - simd_provenance::copy_provenance_batch( - dest_def_col, k, *def_src.source_col, row_ids, + for (const auto &def_src : deferred_sources) { + auto &dest_table = + out_result.deferred_tables[def_src.dest_table_idx]; + + auto batch_reader = def_src.from_build ? buf.left_batch_reader() + : buf.right_batch_reader(); + + size_t k = start; + while (batch_reader.has_more()) { + size_t batch_count; + const uint32_t *row_ids = + batch_reader.get_batch(256, batch_count); + + if (batch_count > 0) { + if (def_src.needs_direct) { + // Columnar input: write row IDs directly + row_id_ops::write_row_ids_direct(dest_table, k, row_ids, + batch_count); + } else if (def_src.child_table) { + // Intermediate input: copy from child's deferred table + row_id_ops::copy_row_ids_from_child( + dest_table, k, *def_src.child_table, row_ids, batch_count); - k += batch_count; } + k += batch_count; } } } diff --git a/include/materialization/materialize.h b/include/materialization/materialize.h index dca7f49..4425042 100644 --- a/include/materialization/materialize.h +++ b/include/materialization/materialize.h @@ -3,8 +3,7 @@ * @brief Final materialization for execution path. * * Materializes all output columns at the root join, resolving deferred - * columns by decoding 64-bit provenance (table_id, column_idx, row_id) back - * to base tables. + * columns by looking up 32-bit row IDs in DeferredTable back to base tables. * * @see construct_intermediate.h for building IntermediateResult intermediates. */ @@ -248,7 +247,7 @@ inline void materialize_column( /** * @brief Materialize single output column handling deferred resolution. 
* - * For deferred columns, resolves via 64-bit provenance encoding back to + * For deferred columns, resolves via DeferredTable (32-bit row ID) back to * base table. * * @tparam Mode Collection mode for compile-time specialization. @@ -282,7 +281,9 @@ inline void materialize_single_column( // Determine how to read the value const Column *columnar_source = nullptr; const mema::column_t *materialized_source = nullptr; - const mema::deferred_column_t *deferred_source = nullptr; + const mema::DeferredTable *deferred_table = nullptr; + uint8_t deferred_base_col = 0; + uint8_t deferred_base_table = 0; if (src_input.is_columnar()) { // Direct columnar read @@ -297,8 +298,14 @@ inline void materialize_single_column( materialized_source = ir.get_materialized(col_info->child_output_idx); } else if (ir.is_deferred(col_info->child_output_idx)) { - // Deferred - need to resolve via 64-bit provenance - deferred_source = ir.get_deferred(col_info->child_output_idx); + // Deferred - need to resolve via deferred table + base table + deferred_table = ir.get_deferred_table(col_info->child_output_idx); + deferred_base_col = + ir.get_deferred_base_col(col_info->child_output_idx); + // Get base table ID from the deferred table itself + if (deferred_table) { + deferred_base_table = deferred_table->base_table_id; + } } } @@ -311,17 +318,15 @@ inline void materialize_single_column( col_info->type, cursor, from_build); } else if (materialized_source) { return (*materialized_source)[local_row_id]; - } else if (deferred_source && analyzed_plan.original_plan) { - // Deferred resolution: decode 64-bit provenance - uint64_t prov = (*deferred_source)[local_row_id]; - uint8_t base_tid = DeferredProvenance::table(prov); - uint8_t base_col = DeferredProvenance::column(prov); - uint64_t base_row = DeferredProvenance::row(prov); + } else if (deferred_table && analyzed_plan.original_plan) { + // Deferred resolution: look up base table row ID from deferred + // table + uint32_t base_row = 
(*deferred_table)[local_row_id]; const auto &base_table = - analyzed_plan.original_plan->inputs[base_tid]; + analyzed_plan.original_plan->inputs[deferred_base_table]; return columnar_reader.read_value( - base_table.columns[base_col], base_col, - static_cast(base_row), col_info->type, cursor, true); + base_table.columns[deferred_base_col], deferred_base_col, + base_row, col_info->type, cursor, true); } return mema::value_t{mema::value_t::NULL_VALUE}; }; @@ -347,12 +352,11 @@ inline void materialize_single_column( str_src_ptr = &analyzed_plan.original_plan ->inputs[materialized_source->source_table] .columns[materialized_source->source_column]; - } else if (deferred_source && analyzed_plan.original_plan) { - // For deferred VARCHAR, get source from provenance of first row - // All rows in a deferred column share the same base table/column - str_src_ptr = &analyzed_plan.original_plan - ->inputs[col_info->provenance.base_table_id] - .columns[col_info->provenance.base_column_idx]; + } else if (deferred_table && analyzed_plan.original_plan) { + // For deferred VARCHAR, get source from provenance metadata + str_src_ptr = + &analyzed_plan.original_plan->inputs[deferred_base_table] + .columns[deferred_base_col]; } } @@ -376,8 +380,8 @@ inline void materialize_single_column( /** * @brief Materialize all output columns from intermediate result. * - * For root join. Resolves all deferred columns by decoding 64-bit provenance - * to base tables. + * For root join. Resolves all deferred columns by looking up 32-bit row IDs + * in DeferredTable back to base tables. * * @tparam Mode Collection mode for compile-time specialization. * @param buffers Thread-local match buffers from probe. 
From 61e0f2fea5fbf921b5628daed1f8294128c6aa39 Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Thu, 22 Jan 2026 17:54:38 +0200 Subject: [PATCH 06/13] feat: draft propagation --- include/data_model/intermediate.h | 163 +++++- include/join_execution/hash_join.h | 70 +++ include/join_execution/hashtable.h | 81 +++ include/join_execution/join_setup.h | 59 +++ include/join_execution/nested_loop.h | 52 +- .../materialization/construct_intermediate.h | 468 +++++++++++++++++- include/materialization/materialize.h | 20 +- src/execute.cpp | 83 +++- 8 files changed, 943 insertions(+), 53 deletions(-) diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index 6a697fa..53f6418 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -30,15 +30,128 @@ /** * @namespace mema * @brief Compact join intermediate: value_t (4B) + column_t (16KB pages) + - * DeferredTable (32-bit row IDs). + * DeferredTable (32-bit row IDs) + key_row_column_t (8B tuples). * * value_t: INT32 direct or VARCHAR page/offset ref. column_t: arena-allocated * pages with write_at(). DeferredTable: 32-bit row ID storage per base table. + * key_row_column_t: (key, row_id) tuples for join key propagation. * * @see Contest::IntermediateResult, plan.h ColumnarTable. */ namespace mema { +/** + * @brief Join key with associated row ID for tuple-based storage. + * + * For LEFT_ONLY/RIGHT_ONLY modes: row_id is base table row ID (zero + * indirection) For BOTH mode: row_id may be IR index (requires deferred table + * lookup) + * + * 8-byte aligned for efficient memory access and potential SIMD operations. + */ +struct alignas(8) KeyRowPair { + int32_t key; ///< Join key value + uint32_t row_id; ///< Row ID (base table or IR index depending on mode) +}; + +/** + * @brief Column of (key, row_id) tuples for join key storage. 
+ * + * Enables accelerated hashtable build (tuples match internal format) and + * zero-indirection row ID propagation through join chains. Used instead of + * separate column_t for join key columns. + * + * Memory layout: 16KB pages containing 2048 KeyRowPair entries each. + */ +struct key_row_column_t { + static constexpr size_t PAGE_SIZE = 1 << 14; // 16KB + static constexpr size_t PAIRS_PER_PAGE = + PAGE_SIZE / sizeof(KeyRowPair); // 2048 + static constexpr size_t ENTRY_SHIFT = 11; // log2(2048) + static constexpr size_t ENTRY_MASK = PAIRS_PER_PAGE - 1; + + struct alignas(PAGE_SIZE) Page { + KeyRowPair data[PAIRS_PER_PAGE]; + }; + + std::vector pages; + size_t num_values = 0; + + /// Base table ID for row_id component (valid when stores_base_row_ids=true) + uint8_t base_table_id = 0; + + /// Source column in base table (for VARCHAR provenance) + uint8_t source_column = 0; + + /// True if row_id contains base table row IDs, false if IR indices + bool stores_base_row_ids = false; + + key_row_column_t() = default; + + key_row_column_t(key_row_column_t &&other) noexcept + : pages(std::move(other.pages)), num_values(other.num_values), + base_table_id(other.base_table_id), + source_column(other.source_column), + stores_base_row_ids(other.stores_base_row_ids) { + other.pages.clear(); + other.num_values = 0; + } + + key_row_column_t &operator=(key_row_column_t &&other) noexcept { + if (this != &other) { + pages = std::move(other.pages); + num_values = other.num_values; + base_table_id = other.base_table_id; + source_column = other.source_column; + stores_base_row_ids = other.stores_base_row_ids; + other.pages.clear(); + other.num_values = 0; + } + return *this; + } + + key_row_column_t(const key_row_column_t &) = delete; + key_row_column_t &operator=(const key_row_column_t &) = delete; + + ~key_row_column_t() = default; + + /// O(1) read: idx >> 11 for page, idx & 0x7FF for offset + inline KeyRowPair operator[](size_t idx) const { + return pages[idx >> 
ENTRY_SHIFT]->data[idx & ENTRY_MASK]; + } + + /// Thread-safe write at idx (requires pages set up first) + inline void write_at(size_t idx, KeyRowPair pair) { + pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK] = pair; + } + + /// Read only the key at index + inline int32_t key_at(size_t idx) const { + return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK].key; + } + + /// Read only the row_id at index + inline uint32_t row_id_at(size_t idx) const { + return pages[idx >> ENTRY_SHIFT]->data[idx & ENTRY_MASK].row_id; + } + + size_t row_count() const { return num_values; } + void set_row_count(size_t count) { num_values = count; } + + /// Pre-allocate pages from arena + inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, + size_t count) { + size_t pages_needed = (count + PAIRS_PER_PAGE - 1) / PAIRS_PER_PAGE; + pages.reserve(pages_needed); + for (size_t i = 0; i < pages_needed; ++i) { + void *ptr = + arena.alloc_chunk(); + pages.push_back(reinterpret_cast(ptr)); + } + num_values = count; + } +}; + /** * @brief 4-byte value: INT32 direct, VARCHAR packed (19-bit page + 13-bit * offset). @@ -274,29 +387,36 @@ struct DeferredColumnRef { /** * @brief Lightweight intermediate result with selective materialization. * - * Stores only columns marked MATERIALIZE (typically just the parent's join - * key). Deferred columns use per-table 32-bit row ID storage instead of - * per-column 64-bit provenance, achieving up to 10x memory reduction for - * multi-column deferred scenarios. + * Stores join key as (value, row_id) tuples for accelerated hashtable build + * and zero-indirection row ID propagation. Other columns use per-table 32-bit + * row ID storage for deferred resolution. 
* - * Memory savings example: For 5 columns from same base table: - * - Old: 5 columns Ɨ 8 bytes = 40 bytes per row - * - New: 1 DeferredTable Ɨ 4 bytes = 4 bytes per row + * For LEFT_ONLY/RIGHT_ONLY modes: join_key_tuples stores base table row IDs + * For BOTH mode: join_key_tuples may store IR indices + DeferredTable for other + * side * * @see AnalyzedColumnInfo for materialization decisions. + * @see key_row_column_t for tuple storage. * @see DeferredTable for 32-bit row ID storage. */ struct IntermediateResult { - /// Only columns marked MATERIALIZE (typically 1 join key). + /// Join key stored as (value, row_id) tuples for accelerated propagation. + /// Replaces materialized column for join key when present. + std::optional join_key_tuples; + + /// Index of join key column in output (nullopt if root or no tuples). + std::optional join_key_idx; + + /// Other materialized columns (non-join-key columns marked MATERIALIZE). std::vector materialized; /// Map: original column index -> index in materialized (nullopt if - /// deferred). + /// deferred or is join key). std::vector> materialized_map; /// Per-base-table deferred row ID storage. One DeferredTable per unique /// (from_build, base_table_id) pair. All columns from same base table share - /// the same row ID lookup. + /// the same row ID lookup. Used for BOTH mode's non-tracked side. std::vector deferred_tables; /// Map: original column index -> DeferredColumnRef (nullopt if @@ -319,19 +439,38 @@ struct IntermediateResult { /** @brief Total row count. */ size_t row_count() const { return num_rows; } + /** @brief Check if join key is stored as tuples. */ + bool has_join_key_tuples() const { return join_key_tuples.has_value(); } + + /** @brief Check if join key tuples contain base row IDs (vs IR indices). */ + bool join_key_has_base_rows() const { + return join_key_tuples && join_key_tuples->stores_base_row_ids; + } + + /** @brief Get join key tuple at index. 
*/ + mema::KeyRowPair get_join_key_tuple(size_t idx) const { + return join_key_tuples ? (*join_key_tuples)[idx] + : mema::KeyRowPair{0, 0}; + } + /** @brief Check if column was materialized (not deferred). */ bool is_materialized(size_t orig_idx) const { return orig_idx < materialized_map.size() && materialized_map[orig_idx].has_value(); } + /** @brief Check if column is the join key (stored as tuples). */ + bool is_join_key(size_t orig_idx) const { + return join_key_idx.has_value() && *join_key_idx == orig_idx; + } + /** @brief Check if column is deferred. */ bool is_deferred(size_t orig_idx) const { return orig_idx < deferred_map.size() && deferred_map[orig_idx].has_value(); } - /** @brief Get materialized column, or nullptr if deferred. */ + /** @brief Get materialized column, or nullptr if deferred/join key. */ const mema::column_t *get_materialized(size_t orig_idx) const { if (!is_materialized(orig_idx)) return nullptr; diff --git a/include/join_execution/hash_join.h b/include/join_execution/hash_join.h index 0e75ccf..5895f11 100644 --- a/include/join_execution/hash_join.h +++ b/include/join_execution/hash_join.h @@ -236,4 +236,74 @@ probe_columnar(const UnchainedHashtable &hash_table, return local_buffers; } +/** + * @brief Probe hash table with tuple column, returning thread-local buffers. + * + * Uses (key, row_id) tuples from IntermediateResult. The row_id in each + * tuple is propagated to the match buffer, enabling zero-indirection + * resolution when tuples contain base table row IDs. + * + * @tparam Mode Collection mode (BOTH, LEFT_ONLY, RIGHT_ONLY) for compile-time + * specialization of match buffer operations. + * @param hash_table Hash table to probe against. + * @param probe_tuples Tuple column containing (key, row_id) pairs. + * @return Vector of thread-local match buffers. 
+ */ +template +inline std::vector> +probe_tuples(const UnchainedHashtable &hash_table, + const mema::key_row_column_t &probe_tuples) { + + const auto *keys = hash_table.keys(); + const auto *row_ids = hash_table.row_ids(); + const size_t probe_count = probe_tuples.row_count(); + const size_t num_pages = probe_tuples.pages.size(); + + std::vector> local_buffers(THREAD_COUNT); + std::atomic page_counter(0); + + worker_pool().execute([&](size_t thread_id) { + local_buffers[thread_id] = ThreadLocalMatchBuffer( + Contest::platform::get_arena(thread_id)); + auto &local_buf = local_buffers[thread_id]; + + while (true) { + size_t page_idx = page_counter.fetch_add(1); + if (page_idx >= num_pages) + break; + + size_t base = page_idx * mema::key_row_column_t::PAIRS_PER_PAGE; + size_t end = std::min(base + mema::key_row_column_t::PAIRS_PER_PAGE, + probe_count); + + constexpr size_t PREFETCH_DIST = 8; + for (size_t idx = base; idx < end; ++idx) { + // Prefetch future slot + if (idx + PREFETCH_DIST < end) { + hash_table.prefetch_slot( + probe_tuples.key_at(idx + PREFETCH_DIST)); + } + + mema::KeyRowPair pair = probe_tuples[idx]; + + // Skip NULL keys + if (pair.key != mema::value_t::NULL_VALUE) { + auto [start_idx, end_idx] = + hash_table.find_indices(pair.key); + + for (uint64_t i = start_idx; i < end_idx; ++i) { + if (keys[i] == pair.key) { + // row_ids[i] = build side's row ID (base or IR) + // pair.row_id = probe side's row ID (base or IR) + local_buf.add_match(row_ids[i], pair.row_id); + } + } + } + } + } + }); + + return local_buffers; +} + } // namespace Contest::join diff --git a/include/join_execution/hashtable.h b/include/join_execution/hashtable.h index f98ea18..1ea10da 100644 --- a/include/join_execution/hashtable.h +++ b/include/join_execution/hashtable.h @@ -390,6 +390,87 @@ class UnchainedHashtable { }); } + /** + * @brief Build hash table from (key, row_id) tuple column. 
+ * + * Consumes tuples directly - row_ids are already in correct format + * (base table IDs or IR indices depending on how IR was constructed). + * More efficient than build_intermediate() as tuples match internal format. + * + * @param tuples Key-row tuple column from IntermediateResult. + * @param num_threads Thread count hint. + */ + void build_from_tuples(const mema::key_row_column_t &tuples, + int num_threads = 4) { + const size_t row_count = tuples.row_count(); + if (row_count == 0) + return; + + static constexpr size_t PARALLEL_BUILD_THRESHOLD = 10000; + num_threads = Contest::platform::worker_pool().thread_count(); + if (row_count < PARALLEL_BUILD_THRESHOLD) + num_threads = 1; + + const size_t num_slots = directory.size(); + const size_t num_partitions = + compute_num_partitions(row_count, num_threads); + const int partition_bits = __builtin_ctzll(num_partitions); + const size_t slots_per_partition = num_slots / num_partitions; + + std::vector allocators(num_threads); + for (int t = 0; t < num_threads; ++t) + allocators[t].set_arena(Contest::platform::get_arena(t)); + std::vector> thread_parts(num_threads); + for (auto &tp : thread_parts) + tp.resize(num_partitions); + + // Partition phase - 8-byte tuple reads, cache-friendly streaming + size_t batch = (row_count + num_threads - 1) / num_threads; + Contest::platform::worker_pool().execute([&, partition_bits](size_t t) { + size_t start = t * batch; + size_t end = std::min(start + batch, row_count); + if (start >= end) + return; + const int shift = 64 - partition_bits; + for (size_t i = start; i < end; ++i) { + mema::KeyRowPair pair = tuples[i]; + uint64_t h = hash_key(pair.key); + size_t p = (partition_bits == 0) ? 
0 : (h >> shift); + // Direct use of tuple - no conversion needed + thread_parts[t][p].append(allocators[t], + {pair.key, pair.row_id}); + } + }); + + // Compute global offsets from per-thread counts + Contest::platform::ArenaVector global_offsets(*arena_); + global_offsets.resize(num_partitions + 1); + std::memset(global_offsets.data(), 0, + (num_partitions + 1) * sizeof(size_t)); + for (size_t p = 0; p < num_partitions; ++p) { + for (size_t t = 0; t < num_threads; ++t) { + global_offsets[p + 1] += thread_parts[t][p].total_count; + } + global_offsets[p + 1] += global_offsets[p]; + } + + size_t total = global_offsets[num_partitions]; + if (total == 0) + return; + keys_.resize(total); + row_ids_.resize(total); + + // Build partitions in parallel + const int nt = num_threads; + Contest::platform::worker_pool().execute([&, nt](size_t t) { + for (size_t p = t; p < num_partitions; p += nt) { + build_partition( + thread_parts, p, slots_per_partition, global_offsets[p], + global_offsets[p + 1] - global_offsets[p], nt, t); + } + }); + } + /** * @brief Build hash table from ColumnarTable Column. * diff --git a/include/join_execution/join_setup.h b/include/join_execution/join_setup.h index 217995e..f2917f0 100644 --- a/include/join_execution/join_setup.h +++ b/include/join_execution/join_setup.h @@ -107,3 +107,62 @@ inline MatchCollectionMode determine_collection_mode( } } // namespace Contest::join + +namespace Contest { + +// Forward declare AnalyzedJoinNode +struct AnalyzedJoinNode; + +/** + * @brief Tracking info for one side of a join (build or probe). + * + * Determines whether to embed base table row IDs or IR indices in the + * output tuples for this side. + */ +struct SideTrackingInfo { + bool track_base_rows = + false; ///< True to embed base row IDs, false for IR indices + uint8_t base_table_id = 0; ///< Base table to track (if track_base_rows) +}; + +/** + * @brief Tracking configuration for intermediate construction. 
+ * + * Determines what row IDs to embed in join key tuples and whether + * DeferredTables are needed for non-tracked sides. + */ +struct TupleTrackingInfo { + SideTrackingInfo build_tracking; ///< Tracking info for build side + SideTrackingInfo probe_tracking; ///< Tracking info for probe side + bool key_from_build = + true; ///< True if parent join key comes from build side +}; + +/** + * @brief Result of a join execution before intermediate construction. + * + * Contains match buffers and metadata needed for deferred IR construction. + * Allows parent join to decide row ID format based on its cardinality + * requirements before constructing the intermediate result. + * + * @tparam Mode Match collection mode for this join's buffers. + */ +template struct MatchResult { + std::vector> buffers; + size_t total_count = 0; + + /// The inputs that were joined (for resolving row IDs during IR + /// construction) + JoinInput build_input; + JoinInput probe_input; + + /// Join configuration + const AnalyzedJoinNode *join_node = nullptr; + join::BuildProbeConfig config; + + /// Convenience accessors + size_t count() const { return total_count; } + bool empty() const { return total_count == 0; } +}; + +} // namespace Contest diff --git a/include/join_execution/nested_loop.h b/include/join_execution/nested_loop.h index c99a8be..d836409 100644 --- a/include/join_execution/nested_loop.h +++ b/include/join_execution/nested_loop.h @@ -34,8 +34,8 @@ using Contest::platform::worker_pool; * @brief Iterates over non-NULL values in a join input column. * * Abstracts columnar vs intermediate input. Handles NULL bitmaps. - * For IntermediateResult, reads from materialized columns (join keys are - * always materialized). + * For IntermediateResult, reads from join_key_tuples if available, + * otherwise from materialized columns (join keys are always available). * * @tparam Func void(uint32_t row_id, int32_t value). 
*/ @@ -70,10 +70,25 @@ inline void visit_rows(const JoinInput &input, size_t attr_idx, } } else { const auto &res = std::get(input.data); - // Join key must be materialized + + // Check if join key is stored as tuples + if (res.has_join_key_tuples() && res.join_key_idx.has_value() && + *res.join_key_idx == attr_idx) { + const auto &tuples = *res.join_key_tuples; + size_t count = tuples.row_count(); + for (size_t i = 0; i < count; i++) { + mema::KeyRowPair pair = tuples[i]; + if (pair.key != mema::value_t::NULL_VALUE) { + visitor(static_cast(i), pair.key); + } + } + return; + } + + // Fall back to materialized column const mema::column_t *col = res.get_materialized(attr_idx); if (!col) - return; // Should not happen - join keys are always materialized + return; // Should not happen - join keys are always available size_t count = col->row_count(); for (size_t i = 0; i < count; i++) { const mema::value_t &val = (*col)[i]; @@ -145,13 +160,21 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, page_offsets.push_back(current); } - // Setup for IntermediateResult probe + // Setup for IntermediateResult probe - check tuples first, then + // materialized const mema::column_t *probe_mat_col = nullptr; + const mema::key_row_column_t *probe_tuples = nullptr; if (!probe_input.is_columnar()) { const auto &res = std::get(probe_input.data); - probe_mat_col = res.get_materialized(probe_attr); - if (!probe_mat_col) - return {}; // Join key not materialized - should not happen + // Check if join key is stored as tuples + if (res.has_join_key_tuples() && res.join_key_idx.has_value() && + *res.join_key_idx == probe_attr) { + probe_tuples = &(*res.join_key_tuples); + } else { + probe_mat_col = res.get_materialized(probe_attr); + if (!probe_mat_col) + return {}; // Join key not available - should not happen + } } std::atomic probe_page_counter{0}; @@ -203,6 +226,19 @@ nested_loop_join(const JoinInput &build_input, const JoinInput &probe_input, } } } + } else 
if (probe_tuples) { + // IntermediateResult probe - use tuple column + const mema::key_row_column_t &tuples = *probe_tuples; + size_t count = tuples.row_count(); + size_t start = (t_id * count) / THREAD_COUNT; + size_t end = ((t_id + 1) * count) / THREAD_COUNT; + + for (size_t i = start; i < end; i++) { + mema::KeyRowPair pair = tuples[i]; + if (pair.key != mema::value_t::NULL_VALUE) { + process_value(static_cast(i), pair.key); + } + } } else { // IntermediateResult probe - use materialized column const mema::column_t &col = *probe_mat_col; diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index 99c7d69..a5431aa 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -104,7 +105,9 @@ struct alignas(8) MaterializedColumnSource { nullptr; ///< Source if from IntermediateResult materialized const Column *columnar_col = nullptr; ///< Source if from ColumnarTable const mema::DeferredTable *deferred_table = - nullptr; ///< Source deferred table if needs resolution + nullptr; ///< Source deferred table if needs resolution + const mema::key_row_column_t *tuple_col = + nullptr; ///< Source if from child's join_key_tuples size_t child_output_idx = 0; ///< Index in child's output size_t mat_col_idx = 0; ///< Index in result.materialized[] DataType type = DataType::INT32; @@ -113,6 +116,7 @@ struct alignas(8) MaterializedColumnSource { bool is_columnar = false; ///< True if source is ColumnarTable bool from_build = false; ///< True if from build side bool needs_deferred_resolve = false; ///< True if child deferred this column + bool needs_tuple_key_read = false; ///< True if reading key from tuples }; // ============================================================================ @@ -272,6 +276,8 @@ prepare_deferred_table_sources(const AnalyzedJoinNode &join_node, 
src.needs_direct = true; src.child_table = nullptr; } else { + const auto &child_ir = + std::get(src_input.data); // Find child's deferred table for this base table const auto *child_ref = src_input.get_deferred_ref(col.child_output_idx); @@ -279,6 +285,17 @@ prepare_deferred_table_sources(const AnalyzedJoinNode &join_node, src.needs_direct = false; src.child_table = src_input.get_deferred_table(col.child_output_idx); + } else if (child_ir.is_join_key(col.child_output_idx)) { + // Child stored this as tuples - the row_id in tuples + // is an IR index, but we need base table row IDs for + // deferred resolution. This shouldn't happen if the + // join key column is properly excluded from DEFER. + std::fprintf(stderr, + "[BUG] DEFER column %zu is child's " + "join key - this is unexpected!\n", + col.child_output_idx); + src.needs_direct = true; + src.child_table = nullptr; } else { // Child materialized this, shouldn't happen for DEFER cols src.needs_direct = true; @@ -342,7 +359,15 @@ prepare_materialized_sources(const AnalyzedJoinNode &join_node, src.is_columnar = false; const auto &ir = std::get(src_input.data); - if (ir.is_materialized(col.child_output_idx)) { + // Check source type in priority order: + // 1. Tuples (join key stored as key-row pairs) + // 2. Materialized column + // 3. 
Deferred table + if (ir.is_join_key(col.child_output_idx)) { + // Child stored this column as tuples - read key from there + src.needs_tuple_key_read = true; + src.tuple_col = &(*ir.join_key_tuples); + } else if (ir.is_materialized(col.child_output_idx)) { src.intermediate_col = ir.get_materialized(col.child_output_idx); } else if (ir.is_deferred(col.child_output_idx)) { @@ -514,6 +539,14 @@ void construct_intermediate_from_buffers( col, src.child_output_idx, rid, src.type, cursor, src.from_build)); } + } else if (src.needs_tuple_key_read && src.tuple_col) { + // Child stored this column as tuples - read key from there + const auto &tuples = *src.tuple_col; + size_t k = start; + for (uint32_t rid : range) { + int32_t key = tuples.key_at(rid); + dest_col.write_at(k++, mema::value_t{key}); + } } else if (src.intermediate_col) { // Intermediate materialized source - direct copy const auto &vec = *src.intermediate_col; @@ -579,5 +612,436 @@ void construct_intermediate_from_buffers( }); } +// ============================================================================ +// Tuple-Based Intermediate Construction +// ============================================================================ + +/** + * @brief Resolves a row ID to base table row ID if possible. + * + * For columnar inputs: row ID is already base row ID (direct). + * For IR with tuples storing base rows: lookup via key_row_column_t. + * For IR with tuples storing IR indices: lookup via deferred table. + * For IR without tuples: lookup via deferred table. + * + * @param input The JoinInput to resolve from. + * @param row_id The row ID from match buffer. + * @param key_col_idx The join key column index in input's output. + * @return Resolved base table row ID. 
+ */
+inline uint32_t resolve_to_base_row(const JoinInput &input, uint32_t row_id,
+                                    size_t key_col_idx) {
+    if (input.is_columnar()) {
+        // Columnar input: row ID is already base table row
+        return row_id;
+    }
+
+    const auto &ir = std::get(input.data);
+
+    if (ir.has_join_key_tuples() && ir.join_key_has_base_rows()) {
+        // IR stores base row IDs in tuples - one lookup
+        return ir.join_key_tuples->row_id_at(row_id);
+    }
+
+    // IR stores IR indices - need deferred table lookup
+    const auto *def_table = ir.get_deferred_table(key_col_idx);
+    if (def_table) {
+        return (*def_table)[row_id];
+    }
+
+    // Fallback: return as-is (shouldn't happen for correct plans)
+    return row_id;
+}
+
+/**
+ * @brief Populates join key tuples column from match buffers.
+ *
+ * Extracts the join key value for each match from the key-providing side.
+ * The row ID written into every tuple is the OUTPUT IR index (the tuple's
+ * own write position), so the parent join can index back into this IR.
+ *
+ * @tparam Mode Match collection mode.
+ * @param buffers Thread-local match buffers.
+ * @param buffer_starts Per-buffer write offsets.
+ * @param build_input Build side input.
+ * @param probe_input Probe side input.
+ * @param key_from_build True if parent's join key comes from build side.
+ * @param key_child_output_idx Column index in the key input's output.
+ * @param out_tuples Output tuple column (pre-allocated).
+ * @param columnar_reader Reader for columnar access.
+ */
+template 
+void populate_join_key_tuples(
+    std::vector> &buffers,
+    const std::vector &buffer_starts, const JoinInput &build_input,
+    const JoinInput &probe_input, bool key_from_build,
+    size_t key_child_output_idx, mema::key_row_column_t &out_tuples,
+    ColumnarReader &columnar_reader) {
+
+    const JoinInput &key_input = key_from_build ? 
build_input : probe_input; + size_t key_attr = key_child_output_idx; + + worker_pool().execute([&](size_t t) { + if (t >= buffers.size()) + return; + auto &buf = buffers[t]; + size_t my_count = buf.count(); + if (my_count == 0) + return; + + size_t write_pos = buffer_starts[t]; + + // Get the appropriate range based on which side provides the key + auto range = key_from_build ? buf.left_range() : buf.right_range(); + + if (key_input.is_columnar()) { + // Columnar source - read key from base table, but store OUTPUT IR + // index (write_pos) so parent can use it to index into this IR + auto *table = std::get(key_input.data); + auto [actual_col_idx, _] = key_input.node->output_attrs[key_attr]; + const Column &col = table->columns[actual_col_idx]; + + for (uint32_t row_id : range) { + // Use read_value_direct_public since page indices may not be + // prepared for the join key column (it's stored in tuples, + // not as a MATERIALIZE column) + int32_t key = + columnar_reader + .read_value_direct_public(col, row_id, DataType::INT32) + .value; + // Store OUTPUT IR index (write_pos), not base table row_id + // Parent needs IR index to access other columns in this IR + uint32_t output_ir_idx = static_cast(write_pos); + out_tuples.write_at(write_pos++, {key, output_ir_idx}); + } + } else { + // Intermediate source - store OUTPUT IR index + const auto &ir = std::get(key_input.data); + + // Only propagate existing tuples if they contain the column we need + // Otherwise, read from materialized column + if (ir.has_join_key_tuples() && ir.join_key_idx.has_value() && + *ir.join_key_idx == key_attr) { + // IR's tuples contain the column we need - propagate directly + const auto &src_tuples = *ir.join_key_tuples; + + for (uint32_t ir_idx : range) { + mema::KeyRowPair src = src_tuples[ir_idx]; + // Store OUTPUT IR index for parent to index into this IR + uint32_t output_ir_idx = static_cast(write_pos); + out_tuples.write_at(write_pos++, {src.key, output_ir_idx}); + } + } else { + // 
IR's tuples contain a different column, or no tuples exist + // Read from materialized column instead + const auto *mat_col = ir.get_materialized(key_attr); + if (mat_col) { + for (uint32_t ir_idx : range) { + int32_t key = (*mat_col)[ir_idx].value; + // Store OUTPUT IR index for parent to index into this + // IR + uint32_t output_ir_idx = + static_cast(write_pos); + out_tuples.write_at(write_pos++, {key, output_ir_idx}); + } + } + } + } + }); +} + +/** + * @brief Constructs intermediate result with tuple-based join key storage. + * + * Stores join key as (value, row_id) tuples for accelerated hashtable build + * and zero-indirection row ID propagation. Other columns handled normally + * via deferred tables or materialization. + * + * @tparam Mode Collection mode for compile-time specialization. + * @param buffers Thread-local match buffers from probe. + * @param build_input Build side data source. + * @param probe_input Probe side data source. + * @param join_node Analyzed join node with materialization decisions. + * @param config Build/probe configuration. + * @param build_is_left True if build side is the original left child. + * @param parent_key_idx Index of column that will be parent's join key. + * @param columnar_reader Reader for columnar data access. + * @param out_result Output IntermediateResult (populated in-place). + * @param analyzed_plan Full analyzed plan for base table access. 
+ */ +template +void construct_intermediate_with_tuples( + std::vector> &buffers, + const JoinInput &build_input, const JoinInput &probe_input, + const AnalyzedJoinNode &join_node, const join::BuildProbeConfig &config, + bool build_is_left, size_t parent_key_idx, ColumnarReader &columnar_reader, + IntermediateResult &out_result, const AnalyzedPlan &analyzed_plan) { + + // Count total matches and compute buffer start offsets + size_t total_matches = 0; + std::vector buffer_starts(buffers.size()); + for (size_t i = 0; i < buffers.size(); ++i) { + buffer_starts[i] = total_matches; + total_matches += buffers[i].count(); + } + + if (total_matches == 0) { + out_result = create_empty_intermediate_result(join_node); + return; + } + + // Initialize result metadata + out_result.node_info = &join_node; + out_result.num_rows = total_matches; + out_result.materialized_map.resize(join_node.columns.size(), std::nullopt); + out_result.deferred_map.resize(join_node.columns.size(), std::nullopt); + + // Determine if parent's join key comes from build or probe side + // and which base table it traces back to + bool key_from_build = true; + size_t key_child_output_idx = 0; // Column index in child's output + uint8_t key_base_table_id = 0; + uint8_t key_base_column = 0; + + for (const auto &col : join_node.columns) { + if (col.original_idx == parent_key_idx) { + key_from_build = (col.from_left == build_is_left); + key_child_output_idx = col.child_output_idx; + key_base_table_id = col.provenance.base_table_id; + key_base_column = col.provenance.base_column_idx; + break; + } + } + + // Allocate join key tuples column + out_result.join_key_tuples.emplace(); + out_result.join_key_tuples->pre_allocate_from_arena( + Contest::platform::get_arena(0), total_matches); + out_result.join_key_tuples->base_table_id = key_base_table_id; + out_result.join_key_tuples->source_column = key_base_column; + // Always store OUTPUT IR indices (not base row IDs) so parent can + // index into this IR to access 
deferred columns
+    out_result.join_key_tuples->stores_base_row_ids = false;
+    out_result.join_key_idx = parent_key_idx;
+    const JoinInput &key_input = key_from_build ? build_input : probe_input;
+    (void)key_input; // Unused here; populate_join_key_tuples re-derives it
+
+    // Count non-join-key materialized columns and set up maps
+    size_t mat_count = 0;
+    for (const auto &col : join_node.columns) {
+        if (col.resolution == ColumnResolution::MATERIALIZE &&
+            col.original_idx != parent_key_idx) {
+            out_result.materialized_map[col.original_idx] = mat_count++;
+        }
+    }
+
+    // Prepare deferred table sources (unchanged from non-tuple version)
+    auto deferred_sources = prepare_deferred_table_sources(
+        join_node, build_input, probe_input, build_is_left, out_result);
+
+    // Precompute materialized sources (excluding join key)
+    std::vector mat_sources;
+    mat_sources.reserve(join_node.columns.size());
+    size_t mat_idx = 0;
+    for (const auto &col : join_node.columns) {
+        if (col.resolution != ColumnResolution::MATERIALIZE)
+            continue;
+        if (col.original_idx == parent_key_idx)
+            continue; // Skip join key - handled via tuples
+
+        MaterializedColumnSource src;
+        src.mat_col_idx = mat_idx++;
+        src.child_output_idx = col.child_output_idx;
+        src.type = col.type;
+        src.base_table_id = col.provenance.base_table_id;
+        src.base_column_idx = col.provenance.base_column_idx;
+        src.from_build = (col.from_left == build_is_left);
+
+        const auto &src_input = src.from_build ? build_input : probe_input;
+
+        if (src_input.is_columnar()) {
+            src.is_columnar = true;
+            const auto *table = std::get(src_input.data);
+            auto [actual_idx, _] =
+                src_input.node->output_attrs[col.child_output_idx];
+            src.columnar_col = &table->columns[actual_idx];
+        } else {
+            src.is_columnar = false;
+            const auto &ir = std::get(src_input.data);
+
+            // Check source type in priority order:
+            // 1. Tuples (join key stored as key-row pairs)
+            // 2. Materialized column
+            // 3. 
Deferred table + if (ir.is_join_key(col.child_output_idx)) { + // Child stored this column as tuples - read key from there + src.needs_tuple_key_read = true; + src.tuple_col = &(*ir.join_key_tuples); + } else if (ir.is_materialized(col.child_output_idx)) { + src.intermediate_col = + ir.get_materialized(col.child_output_idx); + } else if (ir.is_deferred(col.child_output_idx)) { + src.needs_deferred_resolve = true; + src.deferred_table = + ir.get_deferred_table(col.child_output_idx); + } + } + mat_sources.push_back(src); + } + + // Pre-allocate pages + using Page = mema::column_t::Page; + using DeferredPage = mema::DeferredTable::Page; + size_t mat_pages_needed = + (total_matches + mema::CAP_PER_PAGE - 1) / mema::CAP_PER_PAGE; + size_t def_pages_needed = + (total_matches + mema::DeferredTable::ENTRIES_PER_PAGE - 1) / + mema::DeferredTable::ENTRIES_PER_PAGE; + + out_result.materialized.resize(mat_count); + for (size_t c = 0; c < mat_count; ++c) { + out_result.materialized[c].pages.resize(mat_pages_needed); + out_result.materialized[c].set_row_count(total_matches); + } + + for (auto &dt : out_result.deferred_tables) { + dt.pages.resize(def_pages_needed); + dt.set_row_count(total_matches); + } + + // Set source metadata for materialized columns + for (const auto &src : mat_sources) { + out_result.materialized[src.mat_col_idx].source_table = + src.base_table_id; + out_result.materialized[src.mat_col_idx].source_column = + src.base_column_idx; + } + + const size_t num_threads = THREAD_COUNT; + const size_t num_deferred_tables = out_result.deferred_tables.size(); + + // Parallel page allocation + worker_pool().execute([&](size_t t) { + for (size_t c = 0; c < mat_count; ++c) { + auto &col = out_result.materialized[c]; + for (size_t p = t; p < mat_pages_needed; p += num_threads) { + void *ptr = + Contest::platform::get_arena(t) + .alloc_chunk(); + col.pages[p] = reinterpret_cast(ptr); + } + } + for (size_t d = 0; d < num_deferred_tables; ++d) { + auto &dt = 
out_result.deferred_tables[d]; + for (size_t p = t; p < def_pages_needed; p += num_threads) { + void *ptr = + Contest::platform::get_arena(t) + .alloc_chunk(); + dt.pages[p] = reinterpret_cast(ptr); + } + } + }); + + // Populate join key tuples + populate_join_key_tuples( + buffers, buffer_starts, build_input, probe_input, key_from_build, + key_child_output_idx, *out_result.join_key_tuples, columnar_reader); + + // Populate other materialized columns and deferred tables + // (same logic as construct_intermediate_from_buffers) + worker_pool().execute([&](size_t t) { + if (t >= buffers.size()) + return; + auto &buf = buffers[t]; + size_t my_count = buf.count(); + if (my_count == 0) + return; + + size_t start = buffer_starts[t]; + ColumnarReader::Cursor cursor; + + // Process MATERIALIZED columns (excluding join key) + for (const auto &src : mat_sources) { + auto &dest_col = out_result.materialized[src.mat_col_idx]; + + auto range = src.from_build ? buf.left_range() : buf.right_range(); + + if (src.is_columnar) { + const auto &col = *src.columnar_col; + size_t k = start; + for (uint32_t rid : range) { + mema::value_t val = columnar_reader.read_value( + col, src.child_output_idx, rid, src.type, cursor, + src.from_build); + dest_col.write_at(k++, val); + } + } else if (src.needs_tuple_key_read && src.tuple_col) { + // Child stored this column as tuples - read key from there + const auto &tuples = *src.tuple_col; + size_t k = start; + for (uint32_t rid : range) { + int32_t key = tuples.key_at(rid); + dest_col.write_at(k++, mema::value_t{key}); + } + } else if (src.intermediate_col) { + const auto &vec = *src.intermediate_col; + size_t k = start; + for (uint32_t rid : range) { + mema::value_t val = vec[rid]; + dest_col.write_at(k++, val); + } + } else if (src.needs_deferred_resolve && src.deferred_table) { + const auto &def_table = *src.deferred_table; + size_t k = start; + for (uint32_t rid : range) { + uint32_t base_row = def_table[rid]; + + if 
(analyzed_plan.original_plan) [[likely]] { + const auto &base_table = + analyzed_plan.original_plan + ->inputs[src.base_table_id]; + mema::value_t val = + columnar_reader.read_value_direct_public( + base_table.columns[src.base_column_idx], + base_row, src.type); + dest_col.write_at(k++, val); + } else { + dest_col.write_at( + k++, mema::value_t{mema::value_t::NULL_VALUE}); + } + } + } + } + + // Process DEFERRED tables + for (const auto &def_src : deferred_sources) { + auto &dest_table = + out_result.deferred_tables[def_src.dest_table_idx]; + + auto batch_reader = def_src.from_build ? buf.left_batch_reader() + : buf.right_batch_reader(); + + size_t k = start; + while (batch_reader.has_more()) { + size_t batch_count; + const uint32_t *row_ids = + batch_reader.get_batch(256, batch_count); + + if (batch_count > 0) { + if (def_src.needs_direct) { + row_id_ops::write_row_ids_direct(dest_table, k, row_ids, + batch_count); + } else if (def_src.child_table) { + row_id_ops::copy_row_ids_from_child( + dest_table, k, *def_src.child_table, row_ids, + batch_count); + } + k += batch_count; + } + } + } + }); +} + } // namespace materialize } // namespace Contest diff --git a/include/materialization/materialize.h b/include/materialization/materialize.h index 4425042..4cff7ab 100644 --- a/include/materialization/materialize.h +++ b/include/materialization/materialize.h @@ -281,6 +281,7 @@ inline void materialize_single_column( // Determine how to read the value const Column *columnar_source = nullptr; const mema::column_t *materialized_source = nullptr; + const mema::key_row_column_t *tuple_source = nullptr; const mema::DeferredTable *deferred_table = nullptr; uint8_t deferred_base_col = 0; uint8_t deferred_base_table = 0; @@ -293,7 +294,10 @@ inline void materialize_single_column( columnar_source = &table->columns[actual_idx]; } else { const auto &ir = std::get(src_input.data); - if (ir.is_materialized(col_info->child_output_idx)) { + // Check if column is stored as join key 
tuples + if (ir.is_join_key(col_info->child_output_idx)) { + tuple_source = &(*ir.join_key_tuples); + } else if (ir.is_materialized(col_info->child_output_idx)) { // Read from materialized column materialized_source = ir.get_materialized(col_info->child_output_idx); @@ -312,23 +316,29 @@ inline void materialize_single_column( // Create reader lambda auto reader = [&](uint32_t local_row_id, ColumnarReader::Cursor &cursor) -> mema::value_t { + mema::value_t result; if (columnar_source) { - return columnar_reader.read_value( + result = columnar_reader.read_value( *columnar_source, col_info->child_output_idx, local_row_id, col_info->type, cursor, from_build); + } else if (tuple_source) { + // Read key value from tuple column + result = mema::value_t{tuple_source->key_at(local_row_id)}; } else if (materialized_source) { - return (*materialized_source)[local_row_id]; + result = (*materialized_source)[local_row_id]; } else if (deferred_table && analyzed_plan.original_plan) { // Deferred resolution: look up base table row ID from deferred // table uint32_t base_row = (*deferred_table)[local_row_id]; const auto &base_table = analyzed_plan.original_plan->inputs[deferred_base_table]; - return columnar_reader.read_value( + result = columnar_reader.read_value( base_table.columns[deferred_base_col], deferred_base_col, base_row, col_info->type, cursor, true); + } else { + result = mema::value_t{mema::value_t::NULL_VALUE}; } - return mema::value_t{mema::value_t::NULL_VALUE}; + return result; }; // Materialize based on type diff --git a/src/execute.cpp b/src/execute.cpp index 29a485c..0a9e1fc 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -46,6 +46,7 @@ namespace Contest { using namespace join; using materialize::construct_intermediate_from_buffers; +using materialize::construct_intermediate_with_tuples; using materialize::create_empty_intermediate_result; using materialize::materialize_from_buffers; @@ -146,19 +147,30 @@ JoinResult execute_join_with_mode( } else { const 
auto &probe_result = std::get(probe_input.data); - // Probe using materialized column (should be the join key) - const auto *mat_col = - probe_result.get_materialized(config.probe_attr); - if (!mat_col) { - std::fprintf( - stderr, - "ERROR: probe join key not materialized! probe_attr=%zu " - "mat_map_size=%zu num_rows=%zu\n", - config.probe_attr, probe_result.materialized_map.size(), - probe_result.num_rows); - std::abort(); + + // Use tuple-based probe if available + if (probe_result.has_join_key_tuples() && + probe_result.join_key_idx.has_value() && + *probe_result.join_key_idx == config.probe_attr) { + match_buffers = probe_tuples( + *hash_table, *probe_result.join_key_tuples); + } else { + // Fall back to materialized column probe + const auto *mat_col = + probe_result.get_materialized(config.probe_attr); + if (!mat_col) { + std::fprintf( + stderr, + "ERROR: probe join key not materialized! " + "probe_attr=%zu " + "mat_map_size=%zu num_rows=%zu has_tuples=%d\n", + config.probe_attr, probe_result.materialized_map.size(), + probe_result.num_rows, + probe_result.has_join_key_tuples() ? 
1 : 0); + std::abort(); + } + match_buffers = probe_intermediate(*hash_table, *mat_col); } - match_buffers = probe_intermediate(*hash_table, *mat_col); } auto probe_end = std::chrono::high_resolution_clock::now(); stats.hash_join_probe_ms += @@ -206,10 +218,18 @@ JoinResult execute_join_with_mode( config.remapped_attrs, build_input.output_size(), config.build_left); - construct_intermediate_from_buffers( - match_buffers, build_input, probe_input, join_node, - config.remapped_attrs, build_input.output_size(), - config.build_left, columnar_reader, result, plan); + // Use tuple-based construction if parent needs a join key + if (join_node.parent_join_key_idx.has_value()) { + construct_intermediate_with_tuples( + match_buffers, build_input, probe_input, join_node, config, + config.build_left, *join_node.parent_join_key_idx, + columnar_reader, result, plan); + } else { + construct_intermediate_from_buffers( + match_buffers, build_input, probe_input, join_node, + config.remapped_attrs, build_input.output_size(), + config.build_left, columnar_reader, result, plan); + } } else { result = create_empty_intermediate_result(join_node); } @@ -280,17 +300,28 @@ JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, hash_table = build_from_columnar(build_input, config.build_attr); } else { const auto &ir = std::get(build_input.data); - const auto *mat_col = ir.get_materialized(config.build_attr); - if (!mat_col) { - std::fprintf( - stderr, - "ERROR: build join key not materialized! 
build_attr=%zu " - "mat_map_size=%zu num_rows=%zu\n", - config.build_attr, ir.materialized_map.size(), ir.num_rows); - std::abort(); + + // Use tuple-based build if available and matches build_attr + if (ir.has_join_key_tuples() && ir.join_key_idx.has_value() && + *ir.join_key_idx == config.build_attr) { + hash_table.emplace(ir.join_key_tuples->row_count()); + hash_table->build_from_tuples(*ir.join_key_tuples); + } else { + // Fall back to materialized column build + const auto *mat_col = ir.get_materialized(config.build_attr); + if (!mat_col) { + std::fprintf( + stderr, + "ERROR: build join key not materialized! " + "build_attr=%zu " + "mat_map_size=%zu num_rows=%zu has_tuples=%d\n", + config.build_attr, ir.materialized_map.size(), + ir.num_rows, ir.has_join_key_tuples() ? 1 : 0); + std::abort(); + } + hash_table.emplace(mat_col->row_count()); + hash_table->build_intermediate(*mat_col); } - hash_table.emplace(mat_col->row_count()); - hash_table->build_intermediate(*mat_col); } auto build_end = std::chrono::high_resolution_clock::now(); stats.hashtable_build_ms += From ddae72eb6082f61d8675c09e8350be0a74628421 Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Thu, 22 Jan 2026 18:19:23 +0200 Subject: [PATCH 07/13] fix: columnar reader initialization --- .../materialization/construct_intermediate.h | 37 +++++++++++++++---- src/execute.cpp | 4 +- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index a5431aa..f65218a 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -154,12 +154,15 @@ collect_input_columns(const JoinInput &input, * @brief Prepare ColumnarReader for intermediate construction. * * Sets up page indices for columns that need to be read from columnar inputs. + * If parent_key_idx is provided, also prepares the join key column for tuple + * population. 
*/ inline void prepare_intermediate_columns( ColumnarReader &reader, const JoinInput &build_input, const JoinInput &probe_input, const AnalyzedJoinNode &join_node, const std::vector> &remapped_attrs, - size_t build_size, bool build_is_left) { + size_t build_size, bool build_is_left, + std::optional parent_key_idx = std::nullopt) { bool build_is_columnar = build_input.is_columnar(); bool probe_is_columnar = probe_input.is_columnar(); @@ -195,6 +198,23 @@ inline void prepare_intermediate_columns( } } + // If parent needs a join key via tuples, mark that column as needed too + // This ensures page indices are prepared for efficient tuple population + if (parent_key_idx.has_value()) { + for (const auto &col : join_node.columns) { + if (col.original_idx == *parent_key_idx) { + bool from_build = (col.from_left == build_is_left); + if (from_build && col.child_output_idx < build_needed.size()) { + build_needed[col.child_output_idx] = 1; + } else if (!from_build && + col.child_output_idx < probe_needed.size()) { + probe_needed[col.child_output_idx] = 1; + } + break; + } + } + } + if (build_is_columnar) { reader.prepare_build( collect_input_columns(build_input, build_needed, arena)); @@ -695,19 +715,22 @@ void populate_join_key_tuples( auto range = key_from_build ? 
buf.left_range() : buf.right_range(); if (key_input.is_columnar()) { - // Columnar source - read key from base table, but store OUTPUT IR - // index (write_pos) so parent can use it to index into this IR + // Columnar source - read key from base table using prepared page + // index Store OUTPUT IR index (write_pos) so parent can use it to + // index into this IR auto *table = std::get(key_input.data); auto [actual_col_idx, _] = key_input.node->output_attrs[key_attr]; const Column &col = table->columns[actual_col_idx]; + // Use cursor for efficient sequential/near-sequential access + ColumnarReader::Cursor cursor; for (uint32_t row_id : range) { - // Use read_value_direct_public since page indices may not be - // prepared for the join key column (it's stored in tuples, - // not as a MATERIALIZE column) + // Use read_value with prepared page index (O(1) amortized) + // instead of read_value_direct_public (O(n) per read) int32_t key = columnar_reader - .read_value_direct_public(col, row_id, DataType::INT32) + .read_value(col, key_attr, row_id, DataType::INT32, + cursor, key_from_build) .value; // Store OUTPUT IR index (write_pos), not base table row_id // Parent needs IR index to access other columns in this IR diff --git a/src/execute.cpp b/src/execute.cpp index 0a9e1fc..1cb0e10 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -213,10 +213,12 @@ JoinResult execute_join_with_mode( IntermediateResult result; if (total_matches > 0) { // Prepare page indices for intermediate construction + // Pass parent_join_key_idx so the key column is prepared for tuple + // population materialize::prepare_intermediate_columns( columnar_reader, build_input, probe_input, join_node, config.remapped_attrs, build_input.output_size(), - config.build_left); + config.build_left, join_node.parent_join_key_idx); // Use tuple-based construction if parent needs a join key if (join_node.parent_join_key_idx.has_value()) { From e2abed6497d0f74689e2e181790ce81b749f3d20 Mon Sep 17 00:00:00 2001 
From: Themos Papatheofanous Date: Thu, 22 Jan 2026 18:54:21 +0200 Subject: [PATCH 08/13] feat: improvements --- include/data_access/columnar_reader.h | 162 ++++++++++++++++++ .../materialization/construct_intermediate.h | 81 ++++++++- 2 files changed, 238 insertions(+), 5 deletions(-) diff --git a/include/data_access/columnar_reader.h b/include/data_access/columnar_reader.h index e143c95..39348d0 100644 --- a/include/data_access/columnar_reader.h +++ b/include/data_access/columnar_reader.h @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -266,6 +267,127 @@ class ColumnarReader { global_probe_version.fetch_add(1, std::memory_order_relaxed); } + // ======================================================================== + // Base Table Page Index Methods (for O(1) deferred column resolution) + // ======================================================================== + + /** @brief Reset base table prepared flags for new query. */ + inline void reset_base_tables() { + base_table_prepared_.fill(false); + base_table_version_++; + } + + /** + * @brief Prepare page index for a base table column. + * + * Called once per unique (table_id, col_idx) before deferred resolution. + * Enables O(log P) page lookup instead of O(P) linear scan per read. + * + * @param table_id Base table ID (0-15). + * @param col_idx Column index within base table (0-15). + * @param column The Column to build page index for. + */ + inline void prepare_base_column(uint8_t table_id, uint8_t col_idx, + const Column &column) { + size_t idx = (static_cast(table_id) << BASE_TABLE_SHIFT) | + static_cast(col_idx); + if (idx >= MAX_BASE_TABLE_INDICES) + return; + + if (!base_table_prepared_[idx]) { + auto &arena = platform::get_arena(0); + base_table_indices_[idx] = PageIndex(arena); + base_table_indices_[idx].build(column); + base_table_prepared_[idx] = true; + } + } + + /** @brief Check if base column page index is prepared. 
*/ + inline bool is_base_column_prepared(uint8_t table_id, + uint8_t col_idx) const { + size_t idx = (static_cast(table_id) << BASE_TABLE_SHIFT) | + static_cast(col_idx); + return idx < MAX_BASE_TABLE_INDICES && base_table_prepared_[idx]; + } + + /** + * @brief Read value from base table using prepared page index. + * + * O(1) with cursor caching for sequential access, O(log P) on cache miss. + * Falls back to O(P) linear scan if page index not prepared. + * + * @param column The base table column. + * @param table_id Base table ID. + * @param col_idx Column index within base table. + * @param row_id Row ID within the column. + * @param data_type Data type of the column. + * @param cursor Thread-local cursor for caching. + * @return The value at the specified row. + */ + inline mema::value_t read_base_table_value(const Column &column, + uint8_t table_id, + uint8_t col_idx, uint32_t row_id, + DataType data_type, + Cursor &cursor) const { + size_t idx = (static_cast(table_id) << BASE_TABLE_SHIFT) | + static_cast(col_idx); + + if (idx >= MAX_BASE_TABLE_INDICES || !base_table_prepared_[idx]) { + // Fallback to O(P) linear scan + return read_value_direct(column, row_id, data_type); + } + + const PageIndex &page_index = base_table_indices_[idx]; + + // Dense INT32 fast path: O(1) arithmetic lookup + if (data_type == DataType::INT32 && page_index.is_dense_int32) { + return mema::value_t{read_dense_int32(page_index, row_id)}; + } + + // Check cursor cache (version uses base_table_version_ + idx for + // uniqueness) + uint64_t effective_version = base_table_version_ + idx; + bool cache_hit = + cursor.version == effective_version && cursor.cached_col == idx && + row_id >= cursor.cached_start && row_id < cursor.cached_end; + + if (!cache_hit) { + // Check sequential access optimization + if (cursor.version == effective_version && + cursor.cached_col == idx && row_id == cursor.cached_end) { + size_t next_page = cursor.cached_page + 1; + if (next_page < 
page_index.cumulative_rows.size()) { + load_page_into_cursor_base(column, page_index, next_page, + idx, effective_version, cursor); + } else { + // Past end of data + return mema::value_t{mema::value_t::NULL_VALUE}; + } + } else { + // Binary search for page + size_t page_num = page_index.find_page(row_id); + load_page_into_cursor_base(column, page_index, page_num, idx, + effective_version, cursor); + } + } + + // Now cursor is loaded for the correct page + uint32_t local_row = row_id - cursor.cached_start; + if (SPC_LIKELY(cursor.is_dense)) { + if (data_type == DataType::INT32) { + return mema::value_t{cursor.data_ptr[local_row]}; + } else { + return mema::value_t::encode_string( + cursor.page_idx_val, static_cast(local_row)); + } + } + if (SPC_UNLIKELY(cursor.is_special)) { + return mema::value_t::encode_string( + cursor.page_idx_val, mema::value_t::LONG_STRING_OFFSET); + } + return read_sparse(local_row, data_type, cursor); + } + /** @brief Fast path: check cursor cache, dispatch to appropriate handler. */ template @@ -560,8 +682,48 @@ class ColumnarReader { static_cast(data_idx)); } } + + /** @brief Load page into cursor for base table access. 
*/ + inline void load_page_into_cursor_base(const Column &column, + const PageIndex &page_index, + size_t page_num, size_t col_idx, + uint64_t version, + Cursor &cursor) const { + cursor.version = version; + cursor.cached_col = col_idx; + cursor.cached_page = page_num; + cursor.cached_start = page_index.page_start_row(page_num); + cursor.cached_end = page_index.cumulative_rows[page_num]; + cursor.page_idx_val = static_cast(page_num); + cursor.col_all_dense = page_index.all_dense; + + auto *page_data = column.pages[page_num]->data; + auto num_rows = *reinterpret_cast(page_data); + auto num_values = *reinterpret_cast(page_data + 2); + + cursor.is_special = (num_rows == 0xffff); + cursor.is_dense = (num_rows == num_values); + cursor.data_ptr = reinterpret_cast(page_data + 4); + + if (!cursor.is_dense && !cursor.is_special) { + size_t bitmap_size = (num_rows + 7) / 8; + cursor.bitmap_ptr = reinterpret_cast( + page_data + PAGE_SIZE - bitmap_size); + cursor.prefix_sum_ptr = + page_index.page_prefix_sums[page_num].data(); + } + } + std::vector build_page_indices; std::vector probe_page_indices; + + // Base table page indices for deferred column resolution. + // Index = (table_id << 4) | col_idx, supports 16 tables Ɨ 16 cols = 256. + static constexpr size_t BASE_TABLE_SHIFT = 4; + static constexpr size_t MAX_BASE_TABLE_INDICES = 256; + std::array base_table_indices_; + std::array base_table_prepared_{}; + uint64_t base_table_version_ = 0; }; } // namespace Contest::io diff --git a/include/materialization/construct_intermediate.h b/include/materialization/construct_intermediate.h index f65218a..ec3db86 100644 --- a/include/materialization/construct_intermediate.h +++ b/include/materialization/construct_intermediate.h @@ -48,10 +48,25 @@ namespace row_id_ops { * @brief Write row IDs directly from columnar input. * * For columnar inputs, we just write the row_id directly (it's already - * the base table row ID). + * the base table row ID). 
Optimized with memcpy when batch fits in one page. */ inline size_t write_row_ids_direct(mema::DeferredTable &dest, size_t start_idx, const uint32_t *row_ids, size_t count) { + // Constants for DeferredTable layout + constexpr size_t ENTRY_SHIFT = mema::DeferredTable::ENTRY_SHIFT; + constexpr size_t ENTRY_MASK = mema::DeferredTable::ENTRY_MASK; + + size_t page_idx = start_idx >> ENTRY_SHIFT; + size_t offset = start_idx & ENTRY_MASK; + + // Fast path: entire batch fits in current page + if (offset + count <= mema::DeferredTable::ENTRIES_PER_PAGE) { + std::memcpy(&dest.pages[page_idx]->data[offset], row_ids, + count * sizeof(uint32_t)); + return count; + } + + // Slow path: batch spans pages for (size_t i = 0; i < count; ++i) { dest.write_at(start_idx + i, row_ids[i]); } @@ -226,6 +241,50 @@ inline void prepare_intermediate_columns( } } +/** + * @brief Prepare page indices for base table columns used in deferred + * resolution. + * + * Called before constructing intermediate results to enable O(log P) page + * lookup instead of O(P) linear scan when resolving deferred columns that need + * to materialize values from base tables. + * + * @param reader ColumnarReader to prepare page indices in. + * @param mat_sources Precomputed materialized column sources. + * @param analyzed_plan Full analyzed plan containing base tables. + */ +inline void prepare_deferred_base_tables( + ColumnarReader &reader, + const std::vector &mat_sources, + const AnalyzedPlan &analyzed_plan) { + if (!analyzed_plan.original_plan) + return; + + // NOTE: We do NOT reset base tables here - they persist across joins + // within the same query since the base tables don't change. + // reset_base_tables() should only be called once per query, externally. 
+ + // Prepare page indices for each base table column that needs deferred + // resolve + for (const auto &src : mat_sources) { + if (src.needs_deferred_resolve) { + uint8_t table_id = src.base_table_id; + uint8_t col_idx = src.base_column_idx; + + if (!reader.is_base_column_prepared(table_id, col_idx)) { + if (table_id < analyzed_plan.original_plan->inputs.size()) { + const auto &base_table = + analyzed_plan.original_plan->inputs[table_id]; + if (col_idx < base_table.columns.size()) { + reader.prepare_base_column(table_id, col_idx, + base_table.columns[col_idx]); + } + } + } + } + } +} + /** * @brief Create empty intermediate result with proper schema. */ @@ -310,10 +369,12 @@ prepare_deferred_table_sources(const AnalyzedJoinNode &join_node, // is an IR index, but we need base table row IDs for // deferred resolution. This shouldn't happen if the // join key column is properly excluded from DEFER. +#ifndef NDEBUG std::fprintf(stderr, "[BUG] DEFER column %zu is child's " "join key - this is unexpected!\n", col.child_output_idx); +#endif src.needs_direct = true; src.child_table = nullptr; } else { @@ -472,6 +533,9 @@ void construct_intermediate_from_buffers( auto mat_sources = prepare_materialized_sources(join_node, build_input, probe_input, build_is_left); + // Prepare page indices for base tables used in deferred resolution + prepare_deferred_base_tables(columnar_reader, mat_sources, analyzed_plan); + // Pre-allocate pages using Page = mema::column_t::Page; using DeferredPage = mema::DeferredTable::Page; @@ -539,6 +603,7 @@ void construct_intermediate_from_buffers( size_t start = buffer_starts[t]; ColumnarReader::Cursor cursor; + ColumnarReader::Cursor base_cursor; // For deferred resolution reads // ==================================================================== // Process MATERIALIZED columns (column-major for cache locality) @@ -586,9 +651,10 @@ void construct_intermediate_from_buffers( analyzed_plan.original_plan ->inputs[src.base_table_id]; mema::value_t 
val = - columnar_reader.read_value_direct_public( + columnar_reader.read_base_table_value( base_table.columns[src.base_column_idx], - base_row, src.type); + src.base_table_id, src.base_column_idx, + base_row, src.type, base_cursor); dest_col.write_at(k++, val); } else { dest_col.write_at( @@ -912,6 +978,9 @@ void construct_intermediate_with_tuples( mat_sources.push_back(src); } + // Prepare page indices for base tables used in deferred resolution + prepare_deferred_base_tables(columnar_reader, mat_sources, analyzed_plan); + // Pre-allocate pages using Page = mema::column_t::Page; using DeferredPage = mema::DeferredTable::Page; @@ -982,6 +1051,7 @@ void construct_intermediate_with_tuples( size_t start = buffer_starts[t]; ColumnarReader::Cursor cursor; + ColumnarReader::Cursor base_cursor; // For deferred resolution reads // Process MATERIALIZED columns (excluding join key) for (const auto &src : mat_sources) { @@ -1024,9 +1094,10 @@ void construct_intermediate_with_tuples( analyzed_plan.original_plan ->inputs[src.base_table_id]; mema::value_t val = - columnar_reader.read_value_direct_public( + columnar_reader.read_base_table_value( base_table.columns[src.base_column_idx], - base_row, src.type); + src.base_table_id, src.base_column_idx, + base_row, src.type, base_cursor); dest_col.write_at(k++, val); } else { dest_col.write_at( From d1f2c21ce4bdb669d286655fc868125bad91a59f Mon Sep 17 00:00:00 2001 From: Themos Papatheofanous Date: Thu, 22 Jan 2026 20:53:26 +0200 Subject: [PATCH 09/13] feat: tuple storage --- include/data_model/intermediate.h | 10 ++ include/join_execution/hash_join.h | 32 ++--- include/join_execution/hashtable.h | 192 +++++++++++++++++------------ 3 files changed, 142 insertions(+), 92 deletions(-) diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index 53f6418..0b723d0 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -138,6 +138,16 @@ struct key_row_column_t { size_t 
row_count() const { return num_values; } void set_row_count(size_t count) { num_values = count; } + /// Release ownership of pages for zero-copy transfer to hashtable. + /// After this call, the column is empty (pages cleared, num_values = 0). + /// @return Vector of page pointers (caller takes ownership). + std::vector release_pages() && { + std::vector released = std::move(pages); + pages.clear(); + num_values = 0; + return released; + } + /// Pre-allocate pages from arena inline void pre_allocate_from_arena(Contest::platform::ThreadArena &arena, size_t count) { diff --git a/include/join_execution/hash_join.h b/include/join_execution/hash_join.h index 5895f11..f518df2 100644 --- a/include/join_execution/hash_join.h +++ b/include/join_execution/hash_join.h @@ -82,8 +82,7 @@ template inline std::vector> probe_intermediate(const UnchainedHashtable &hash_table, const mema::column_t &probe_column) { - const auto *keys = hash_table.keys(); - const auto *row_ids = hash_table.row_ids(); + const auto *entries = hash_table.entries(); size_t pool_size = THREAD_COUNT; std::vector> local_buffers(pool_size); @@ -124,8 +123,8 @@ probe_intermediate(const UnchainedHashtable &hash_table, hash_table.find_indices(key_val); for (uint64_t i = start_idx; i < end_idx; ++i) { - if (keys[i] == key_val) { - local_buf.add_match(row_ids[i], + if (entries[i].key == key_val) { + local_buf.add_match(entries[i].row_id, static_cast(idx)); } } @@ -151,8 +150,7 @@ inline std::vector> probe_columnar(const UnchainedHashtable &hash_table, const JoinInput &probe_input, size_t probe_attr) { - const auto *keys = hash_table.keys(); - const auto *row_ids = hash_table.row_ids(); + const auto *entries = hash_table.entries(); auto *table = std::get(probe_input.data); auto [actual_idx_col, _] = probe_input.node->output_attrs[probe_attr]; @@ -200,8 +198,9 @@ probe_columnar(const UnchainedHashtable &hash_table, hash_table.find_indices(key_val); for (uint64_t j = start_idx; j < end_idx; ++j) { - if (keys[j] == 
key_val) { - local_buf.add_match(row_ids[j], probe_row_id); + if (entries[j].key == key_val) { + local_buf.add_match(entries[j].row_id, + probe_row_id); } } probe_row_id++; @@ -222,8 +221,9 @@ probe_columnar(const UnchainedHashtable &hash_table, hash_table.find_indices(key_val); for (uint64_t j = start_idx; j < end_idx; ++j) { - if (keys[j] == key_val) { - local_buf.add_match(row_ids[j], probe_row_id); + if (entries[j].key == key_val) { + local_buf.add_match(entries[j].row_id, + probe_row_id); } } } @@ -254,8 +254,7 @@ inline std::vector> probe_tuples(const UnchainedHashtable &hash_table, const mema::key_row_column_t &probe_tuples) { - const auto *keys = hash_table.keys(); - const auto *row_ids = hash_table.row_ids(); + const auto *entries = hash_table.entries(); const size_t probe_count = probe_tuples.row_count(); const size_t num_pages = probe_tuples.pages.size(); @@ -292,10 +291,11 @@ probe_tuples(const UnchainedHashtable &hash_table, hash_table.find_indices(pair.key); for (uint64_t i = start_idx; i < end_idx; ++i) { - if (keys[i] == pair.key) { - // row_ids[i] = build side's row ID (base or IR) - // pair.row_id = probe side's row ID (base or IR) - local_buf.add_match(row_ids[i], pair.row_id); + if (entries[i].key == pair.key) { + // entries[i].row_id = build side's row ID (base or + // IR) pair.row_id = probe side's row ID (base or + // IR) + local_buf.add_match(entries[i].row_id, pair.row_id); } } } diff --git a/include/join_execution/hashtable.h b/include/join_execution/hashtable.h index 1ea10da..a39c3aa 100644 --- a/include/join_execution/hashtable.h +++ b/include/join_execution/hashtable.h @@ -57,12 +57,18 @@ using Contest::join::BLOOM_TAGS; */ class UnchainedHashtable { public: - /** @brief Key-rowid pair for hash table entries. */ + /** @brief Key-rowid pair for hash table entries (build phase). */ struct alignas(4) Tuple { int32_t key; /**< Join key value. */ uint32_t row_id; /**< Row index in source table. 
*/ }; + /** @brief Fused key-rowid for cache-friendly probe (8-byte aligned). */ + struct alignas(8) Entry { + int32_t key; /**< Join key value. */ + uint32_t row_id; /**< Row index in source table. */ + }; + /** @brief L2-sized chunk for partition buffers. */ static constexpr size_t CHUNK_SIZE = 4096; static constexpr size_t CHUNK_HEADER = 16; @@ -126,28 +132,11 @@ class UnchainedHashtable { nullptr; /**< Arena for hash table allocations. */ Contest::platform::ArenaVector directory; /**< Slot entries: (end_offset << 16) | bloom_tag. */ - Contest::platform::ArenaVector - keys_; /**< Contiguous key storage, indexed by directory. */ - Contest::platform::ArenaVector - row_ids_; /**< Parallel row_id storage, same indexing. */ + Contest::platform::ArenaVector + entries_; /**< Fused key+row_id storage, indexed by directory. */ int shift = 0; /**< Bit shift for slot calculation: slot = hash >> (64-shift). */ - /** - * @brief CRC32-based hash with multiplicative mixing. - * @param key INT32 join key. - * @return 64-bit hash (upper bits index directory slot). - */ - static uint64_t hash_key(int32_t key) noexcept { - constexpr uint64_t k = 0x8648DBDB; -#if defined(__aarch64__) - uint32_t crc = __crc32w(0, static_cast(key)); -#else - uint32_t crc = _mm_crc32_u32(0, static_cast(key)); -#endif - return crc * ((k << 32) + 1); - } - /** * @brief Returns bloom tag from hash. Uses bits 32-42 to index BLOOM_TAGS. 
* @see bloom_tags.h @@ -231,8 +220,7 @@ class UnchainedHashtable { uint64_t h = hash_key(tup.key); size_t local_slot = slot_for(h) - slot_start; uint32_t idx = offsets[local_slot] + counts[local_slot]++; - keys_[idx] = tup.key; - row_ids_[idx] = tup.row_id; + entries_[idx] = {tup.key, tup.row_id}; directory[slot_start + local_slot] |= bloom_tag(h); } } @@ -253,7 +241,7 @@ class UnchainedHashtable { */ explicit UnchainedHashtable(size_t build_size) : arena_(&Contest::platform::get_arena(0)), directory(*arena_), - keys_(*arena_), row_ids_(*arena_) { + entries_(*arena_) { size_t pow2 = 2048; while (pow2 < build_size) pow2 <<= 1; @@ -262,17 +250,29 @@ class UnchainedHashtable { shift = __builtin_ctzll(pow2); } - /** @brief Number of keys in the hash table. */ - size_t size() const noexcept { return keys_.size(); } + /** @brief Number of entries in the hash table. */ + size_t size() const noexcept { return entries_.size(); } /** @brief True if hash table is empty. */ - bool empty() const noexcept { return keys_.empty(); } + bool empty() const noexcept { return entries_.empty(); } - /** @brief Direct access to key array for probe. */ - const int32_t *keys() const noexcept { return keys_.data(); } + /** @brief Direct access to fused entry array for probe. */ + const Entry *entries() const noexcept { return entries_.data(); } - /** @brief Direct access to row_id array for probe. */ - const uint32_t *row_ids() const noexcept { return row_ids_.data(); } + /** + * @brief CRC32-based hash with multiplicative mixing. Public for pre-hash. + * @param key INT32 join key. + * @return 64-bit hash (upper bits index directory slot). + */ + static uint64_t hash_key(int32_t key) noexcept { + constexpr uint64_t k = 0x8648DBDB; +#if defined(__aarch64__) + uint32_t crc = __crc32w(0, static_cast(key)); +#else + uint32_t crc = _mm_crc32_u32(0, static_cast(key)); +#endif + return crc * ((k << 32) + 1); + } /** * @brief Prefetch directory slot for a key to hide memory latency. 
@@ -286,13 +286,24 @@ class UnchainedHashtable { __builtin_prefetch(&directory[slot], 0, 2); } + /** + * @brief Prefetch directory slot using pre-computed hash. + * + * Avoids recomputing hash when already computed for another purpose. + * @param h Pre-computed hash from hash_key(). + */ + void prefetch_slot_prehashed(uint64_t h) const noexcept { + size_t slot = slot_for(h); + __builtin_prefetch(&directory[slot], 0, 2); + } + /** * @brief Find index range for keys matching probe key. * - * @return [start, end) into keys_/row_ids_; (0,0) if bloom rejects. + * @return [start, end) into entries_; (0,0) if bloom rejects. */ std::pair find_indices(int32_t key) const noexcept { - if (keys_.empty()) + if (entries_.empty()) return {0, 0}; uint64_t h = hash_key(key); @@ -308,6 +319,32 @@ class UnchainedHashtable { return {start, end}; } + /** + * @brief Find index range using pre-computed hash (avoids rehashing). + * + * Use when hash was already computed for prefetch or bloom filter check. + * @param key Original key (for comparison in caller). + * @param h Pre-computed hash from hash_key(key). + * @return [start, end) into entries_; (0,0) if bloom rejects. + */ + std::pair + find_indices_prehashed(int32_t key, uint64_t h) const noexcept { + (void)key; // Key used by caller for comparison, not needed here + if (entries_.empty()) + return {0, 0}; + + size_t slot = slot_for(h); + uint64_t entry = directory[slot]; + uint16_t tag = bloom_tag(h); + + if ((entry & tag) != tag) + return {0, 0}; + + uint64_t end = entry >> 16; + uint64_t start = (slot == 0) ? 0 : (directory[slot - 1] >> 16); + return {start, end}; + } + /** * @brief Build hash table from intermediate column_t. 
* @@ -376,8 +413,7 @@ class UnchainedHashtable { size_t total = global_offsets[num_partitions]; if (total == 0) return; - keys_.resize(total); - row_ids_.resize(total); + entries_.resize(total); // Build partitions in parallel const int nt = num_threads; @@ -393,54 +429,59 @@ class UnchainedHashtable { /** * @brief Build hash table from (key, row_id) tuple column. * - * Consumes tuples directly - row_ids are already in correct format - * (base table IDs or IR indices depending on how IR was constructed). - * More efficient than build_intermediate() as tuples match internal format. + * Radix-partitioned parallel build from key_row_column_t. + * Uses page-based work distribution for better cache locality. + * Each thread processes whole pages to avoid cross-page access. * * @param tuples Key-row tuple column from IntermediateResult. - * @param num_threads Thread count hint. + * @param num_threads Thread count hint (unused, uses pool size). */ void build_from_tuples(const mema::key_row_column_t &tuples, - int num_threads = 4) { + int /*num_threads*/ = 4) { const size_t row_count = tuples.row_count(); if (row_count == 0) return; - static constexpr size_t PARALLEL_BUILD_THRESHOLD = 10000; - num_threads = Contest::platform::worker_pool().thread_count(); - if (row_count < PARALLEL_BUILD_THRESHOLD) - num_threads = 1; - + const int pool_threads = Contest::platform::worker_pool().thread_count(); const size_t num_slots = directory.size(); const size_t num_partitions = - compute_num_partitions(row_count, num_threads); + compute_num_partitions(row_count, pool_threads); const int partition_bits = __builtin_ctzll(num_partitions); const size_t slots_per_partition = num_slots / num_partitions; - std::vector allocators(num_threads); - for (int t = 0; t < num_threads; ++t) + // Thread-local partitions for lock-free parallel partitioning + std::vector allocators(pool_threads); + for (int t = 0; t < pool_threads; ++t) allocators[t].set_arena(Contest::platform::get_arena(t)); - 
std::vector> thread_parts(num_threads); + std::vector> thread_parts(pool_threads); for (auto &tp : thread_parts) tp.resize(num_partitions); - // Partition phase - 8-byte tuple reads, cache-friendly streaming - size_t batch = (row_count + num_threads - 1) / num_threads; - Contest::platform::worker_pool().execute([&, partition_bits](size_t t) { - size_t start = t * batch; - size_t end = std::min(start + batch, row_count); - if (start >= end) - return; - const int shift = 64 - partition_bits; - for (size_t i = start; i < end; ++i) { - mema::KeyRowPair pair = tuples[i]; - uint64_t h = hash_key(pair.key); - size_t p = (partition_bits == 0) ? 0 : (h >> shift); - // Direct use of tuple - no conversion needed - thread_parts[t][p].append(allocators[t], - {pair.key, pair.row_id}); - } - }); + // Page-based partition phase - each thread processes whole pages + constexpr size_t PAIRS_PER_PAGE = mema::key_row_column_t::PAIRS_PER_PAGE; + const size_t num_pages = tuples.pages.size(); + + Contest::platform::worker_pool().execute( + [&, partition_bits, pool_threads](size_t t) { + const int shift = 64 - partition_bits; + const size_t stride = static_cast(pool_threads); + for (size_t pg = t; pg < num_pages; pg += stride) { + // Prefetch next page + if (pg + stride < num_pages) { + __builtin_prefetch(tuples.pages[pg + stride]->data, 0, 3); + } + const auto *page_data = tuples.pages[pg]->data; + size_t base = pg * PAIRS_PER_PAGE; + size_t count = std::min(PAIRS_PER_PAGE, row_count - base); + for (size_t i = 0; i < count; ++i) { + const auto &pair = page_data[i]; + uint64_t h = hash_key(pair.key); + size_t p = (partition_bits == 0) ? 
0 : (h >> shift); + thread_parts[t][p].append(allocators[t], + {pair.key, pair.row_id}); + } + } + }); // Compute global offsets from per-thread counts Contest::platform::ArenaVector global_offsets(*arena_); @@ -448,7 +489,7 @@ class UnchainedHashtable { std::memset(global_offsets.data(), 0, (num_partitions + 1) * sizeof(size_t)); for (size_t p = 0; p < num_partitions; ++p) { - for (size_t t = 0; t < num_threads; ++t) { + for (int t = 0; t < pool_threads; ++t) { global_offsets[p + 1] += thread_parts[t][p].total_count; } global_offsets[p + 1] += global_offsets[p]; @@ -457,16 +498,16 @@ class UnchainedHashtable { size_t total = global_offsets[num_partitions]; if (total == 0) return; - keys_.resize(total); - row_ids_.resize(total); + entries_.resize(total); // Build partitions in parallel - const int nt = num_threads; - Contest::platform::worker_pool().execute([&, nt](size_t t) { - for (size_t p = t; p < num_partitions; p += nt) { - build_partition( - thread_parts, p, slots_per_partition, global_offsets[p], - global_offsets[p + 1] - global_offsets[p], nt, t); + Contest::platform::worker_pool().execute([&, pool_threads](size_t t) { + for (size_t p = t; p < num_partitions; + p += static_cast(pool_threads)) { + build_partition(thread_parts, p, slots_per_partition, + global_offsets[p], + global_offsets[p + 1] - global_offsets[p], + pool_threads, t); } }); } @@ -570,8 +611,7 @@ class UnchainedHashtable { size_t total = global_offsets[num_partitions]; if (total == 0) return; - keys_.resize(total); - row_ids_.resize(total); + entries_.resize(total); const int nt = num_threads; Contest::platform::worker_pool().execute([&, nt](size_t t) { From 68ee4082891431d8a0e5290f507e38b9d0956465 Mon Sep 17 00:00:00 2001 From: themosgit Date: Sat, 24 Jan 2026 01:08:26 +0200 Subject: [PATCH 10/13] WIP --- CMakeLists.txt | 2 +- include/platform/hardware.h | 4 +-- src/execute.cpp | 71 +++++++++++++++++++++++++++---------- 3 files changed, 56 insertions(+), 21 deletions(-) diff --git 
a/CMakeLists.txt b/CMakeLists.txt index fc63a09..76039bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,7 +98,7 @@ set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64") add_compile_options(-O3 -mcpu=apple-m1 -flto) else() - add_compile_options(-O3 -march=native -m64 -mcrc32 -fpermissive -flto) + add_compile_options(-O3 -march=skylake -m64 -mcrc32 -fpermissive -flto) endif() add_link_options(-flto) diff --git a/include/platform/hardware.h b/include/platform/hardware.h index 83ef443..0cbb011 100644 --- a/include/platform/hardware.h +++ b/include/platform/hardware.h @@ -10,8 +10,8 @@ */ #pragma once -#define SPC__CORE_COUNT 8 -#define SPC__THREAD_COUNT 16 +#define SPC__CORE_COUNT 6 +#define SPC__THREAD_COUNT 6 #define SPC__LEVEL1_DCACHE_SIZE 32768 #define SPC__LEVEL2_CACHE_SIZE 1048576 #define SPC__LEVEL3_CACHE_SIZE 33554432 diff --git a/src/execute.cpp b/src/execute.cpp index 1cb0e10..ab935d5 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -16,7 +16,13 @@ * * @see plan.h, match_collector.h, materialize.h, construct_intermediate.h */ +#include #include +#include +#include +#include +#include +#include #if defined(__APPLE__) && defined(__aarch64__) #include #elif defined(SPC__USE_BENCHMARKVM_HARDWARE) @@ -45,7 +51,6 @@ namespace Contest { using namespace join; -using materialize::construct_intermediate_from_buffers; using materialize::construct_intermediate_with_tuples; using materialize::create_empty_intermediate_result; using materialize::materialize_from_buffers; @@ -212,26 +217,15 @@ JoinResult execute_join_with_mode( auto inter_start = std::chrono::high_resolution_clock::now(); IntermediateResult result; if (total_matches > 0) { - // Prepare page indices for intermediate construction - // Pass parent_join_key_idx so the key column is prepared for tuple - // population materialize::prepare_intermediate_columns( columnar_reader, build_input, probe_input, join_node, config.remapped_attrs, 
build_input.output_size(), config.build_left, join_node.parent_join_key_idx); - // Use tuple-based construction if parent needs a join key - if (join_node.parent_join_key_idx.has_value()) { - construct_intermediate_with_tuples( - match_buffers, build_input, probe_input, join_node, config, - config.build_left, *join_node.parent_join_key_idx, - columnar_reader, result, plan); - } else { - construct_intermediate_from_buffers( - match_buffers, build_input, probe_input, join_node, - config.remapped_attrs, build_input.output_size(), - config.build_left, columnar_reader, result, plan); - } + construct_intermediate_with_tuples( + match_buffers, build_input, probe_input, join_node, config, + config.build_left, *join_node.parent_join_key_idx, + columnar_reader, result, plan); } else { result = create_empty_intermediate_result(join_node); } @@ -356,6 +350,43 @@ JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, return IntermediateResult{}; } +/** + * + * @brief Prints the plan tree with metadata. + * + * @param the query plan itself. + * @param queue that should contain the root node. 
+ * + **/ +static std::function>)> +print_plan = [](const Plan& plan, std::queue> q) { + if (q.empty()) return; + int initial_size = q.size(); + for (int i = 0; i < initial_size; i++) { + auto [parent_idx, node_idx] = q.front(); + q.pop(); + const auto& node = plan.nodes[node_idx]; + std::cout << "parent: " << parent_idx << ", node: "<< node_idx << " size: " + << node.output_attrs.size() << " pairs: { "; + for (int i = 0; i < node.output_attrs.size(); i++) { + auto [col, type] = node.output_attrs[i]; + if (DataType::INT32 == type) + std::cout << "(" << col << ", INT32)-"; + else + std::cout << "(" << col << ", STR)-"; + } + if (const auto* join = std::get_if(&node.data)) { + std::cout << "left_key: " << join->left_attr; + std::cout << " right_key: " << join->right_attr; + q.emplace(node_idx ,join->left); + q.emplace(node_idx, join->right); + } + std::cout << "}\n"; + } + std::cout << std::endl << std::endl << std::endl << std::endl ; + print_plan(plan, std::move(q)); +}; + /** * @brief Public entry point: execute plan from root, return ColumnarTable. * @param plan Query plan with nodes and base tables. 
@@ -381,9 +412,13 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, std::chrono::duration_cast(analyze_end - analyze_start) .count(); - + /* auto result = execute_impl(analyzed_plan, plan.root, true, stats); ColumnarTable final_result = std::get(std::move(result)); + */ + std::queue> q; + q.emplace(0, plan.root); + print_plan(plan, q); auto total_end = std::chrono::high_resolution_clock::now(); auto total_elapsed = std::chrono::duration_cast( @@ -417,7 +452,7 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, *stats_out = stats; } - return std::move(final_result); + return ColumnarTable(); } void *build_context() { return nullptr; } From 83a1aea639e1ef42b9533b313c1938b68176e0da Mon Sep 17 00:00:00 2001 From: themosgit Date: Sat, 24 Jan 2026 01:10:31 +0200 Subject: [PATCH 11/13] chore: cleanup --- CMakeLists.txt | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 76039bc..53ed07b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,33 +59,6 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|powerpc|ppc64|ppc64le") set(SKIP_EXTENSIONS jemalloc) endif() -# Detect Xeon E5-2680 v3 CPU for benchmark VM hardware configuration -# Requires both: correct CPU AND at least 32GB RAM (benchmark VM has 64GB, CI has 4GB) -set(IS_BENCHMARK_VM_HARDWARE OFF) -if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND EXISTS "/proc/cpuinfo") - file(READ "/proc/cpuinfo" CPUINFO_CONTENT) - if(CPUINFO_CONTENT MATCHES "E5-2680 v3") - # Check available memory to distinguish benchmark VM from CI VM - if(EXISTS "/proc/meminfo") - file(READ "/proc/meminfo" MEMINFO_CONTENT) - string(REGEX MATCH "MemTotal:[ \t]+([0-9]+)" MEM_MATCH "${MEMINFO_CONTENT}") - if(MEM_MATCH) - set(MEM_TOTAL_KB "${CMAKE_MATCH_1}") - math(EXPR MEM_TOTAL_GB "${MEM_TOTAL_KB} / 1024 / 1024") - if(MEM_TOTAL_GB GREATER_EQUAL 32) - message(STATUS "Detected Intel Xeon E5-2680 v3 CPU with ${MEM_TOTAL_GB}GB RAM - using 
benchmark VM hardware configuration") - add_compile_definitions(SPC__USE_BENCHMARKVM_HARDWARE) - set(IS_BENCHMARK_VM_HARDWARE ON) - else() - message(STATUS "Detected Intel Xeon E5-2680 v3 CPU but only ${MEM_TOTAL_GB}GB RAM (need >=32GB) - using generic hardware configuration") - endif() - endif() - endif() - endif() -endif() - -# Include all sources from /src directory. CONFIGURE_DEPENDS can be unreliable. -# Try re-running cmake in case changes are not recognized. file(GLOB ALL_SRC CONFIGURE_DEPENDS "src/*.cpp" @@ -110,13 +83,6 @@ if(NOT CMAKE_SYSTEM_NAME STREQUAL "Windows") target_compile_definitions(faster PRIVATE) target_link_libraries(faster PRIVATE re2 fmt range-v3 nlohmann_json::nlohmann_json sqlparser) target_include_directories(faster PRIVATE include) - - if(IS_BENCHMARK_VM_HARDWARE) - add_executable(leaderboard ${MANOLATES_SRC} tests/read_sql.cpp) - target_compile_definitions(leaderboard PRIVATE) - target_link_libraries(leaderboard PRIVATE re2 fmt range-v3 nlohmann_json::nlohmann_json sqlparser) - target_include_directories(leaderboard PRIVATE include) - endif() endif() if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64") From 4b644e716f43ba3c68409268b0a8f8d56885e7b1 Mon Sep 17 00:00:00 2001 From: themosgit Date: Sat, 24 Jan 2026 15:22:27 +0200 Subject: [PATCH 12/13] fix: migrating workflow --- .github/workflows/benchmark.yml | 29 -------------------- .github/workflows/ci.yml | 31 ++++++++++++++++----- .github/workflows/notifier.yaml | 48 --------------------------------- .gitignore | 2 ++ flake.nix | 11 ++++---- 5 files changed, 32 insertions(+), 89 deletions(-) delete mode 100644 .github/workflows/benchmark.yml delete mode 100644 .github/workflows/notifier.yaml diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml deleted file mode 100644 index d854df3..0000000 --- a/.github/workflows/benchmark.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: Workflow for leaderboard submission - -on: - # push: - # branches: [ main ] - # pull_request: - # 
branches: [ main ] - workflow_dispatch: -jobs: - leaderboard: - name: leaderboard - runs-on: - group: benchmark - steps: - - name: Checkout code - uses: actions/checkout@v5 - - - name: Configure CMake - run: | - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -Wno-dev - - - name: Build project - run: | - cmake --build build -- -j$(nproc) leaderboard - - - name: Run for leaderboard - run: | - leaderboard.sh - diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9ccf48b..643c1de 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,27 +4,44 @@ on: push: branches: - main + - opt pull_request: branches: - main - - workflow_dispatch: - jobs: build_and_test: - runs-on: - group: k23a + runs-on: self-hosted + + env: + CCACHE_DIR: ${{ github.workspace }}/.ccache steps: - name: Checkout repository code uses: actions/checkout@v4 + + - name: Setup cache + uses: actions/cache@v4 + with: + path: .ccache + key: ${{ runner.os }}-ccache-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-ccache- - name: Configure initial build with CMake run: | - cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -Wno-dev + nix develop -c \ + cmake -S . 
-B build -Wno-dev \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache - name: Build all targets - run: cmake --build build -- -j $(nproc) unit_tests + run: | + nix develop -c \ + cmake --build build -- -j $(nproc) unit_tests - name: Run unit tests run: ./build/unit_tests + + - name: Cache stats + run: nix develop -c ccache -s diff --git a/.github/workflows/notifier.yaml b/.github/workflows/notifier.yaml deleted file mode 100644 index 5f9b68b..0000000 --- a/.github/workflows/notifier.yaml +++ /dev/null @@ -1,48 +0,0 @@ -name: GitHub Push Notifier - -on: - push: - branches: - - '**' - -jobs: - notify: - runs-on: - group: k23a - - steps: - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Node.js - uses: actions/setup-node@v3 - with: - node-version: '18' - - - name: Install dependencies - run: npm install discord.js node-fetch dotenv - - - name: Send Discord notification - env: - DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }} - CHANNEL_ID: ${{ secrets.CHANNEL_ID }} - GITHUB_TOKEN: ${{ vars.GH_TOKEN }} - GITHUB_OWNER: ${{ github.repository_owner }} - GITHUB_REPO: ${{ github.event.repository.name }} - run: | - node -e " - require('dotenv').config(); - const { Client, GatewayIntentBits } = require('discord.js'); - const client = new Client({ intents: [GatewayIntentBits.Guilds] }); - - client.once('ready', async () => { - const channel = client.channels.cache.get(process.env.CHANNEL_ID); - const message = \`🚨 New Push to \\\`${{ github.repository }}\\\` Branch \\\`${{ github.ref_name }}\\\`!\nšŸ‘¤ Author: \\\`${{ github.event.pusher.name }}\\\`\nšŸ“ Commit: \\\`${{ github.event.head_commit.message }}\\\`\nšŸ”— View: ${{ github.event.head_commit.url }}\`; - await channel.send(message); - process.exit(0); - }); - - client.login(process.env.DISCORD_TOKEN); - " \ No newline at end of file diff --git a/.gitignore b/.gitignore index f1c8719..c21caa5 100644 --- a/.gitignore 
+++ b/.gitignore @@ -12,6 +12,8 @@ /docs/html /docs/xml .clangd +.cache +.ccache compile_commands.json /env/ script.py diff --git a/flake.nix b/flake.nix index cd6af67..1f459a3 100644 --- a/flake.nix +++ b/flake.nix @@ -23,19 +23,19 @@ buildInputs = with pkgs; [ llvmPackages.libcxxClang llvmPackages.libllvm + ccache doxygen curl git cmake typst ] ++ lib.optionals (system == "x86_64-linux") [ - linuxPackages.perf + perf gef ]; shellHook = '' CLANGD_FILE=".clangd" CPP_STANDARD="c++20" - echo "Generating $CLANGD_FILE from \$ clang++ -v output..." INCLUDE_PATHS=$( @@ -57,9 +57,10 @@ echo " - -I$CLEAN_PATH" >> $CLANGD_FILE done <<< "$INCLUDE_PATHS" - echo " - -O2" >> $CLANGD_FILE - - echo "Generation of $CLANGD_FILE complete." + echo "exporting ccache paths..." + export CCACHE_DIR="$PWD/.ccache" + export PATH="${pkgs.ccache}/bin:$PATH" + echo "done." if command -v fish &> /dev/null; then exec fish From b3355047d1062ac8601082c285820394cf07697b Mon Sep 17 00:00:00 2001 From: themosgit Date: Sat, 24 Jan 2026 22:18:40 +0200 Subject: [PATCH 13/13] WIP --- .github/workflows/ci.yml | 1 - CMakeLists.txt | 4 +- include/data_model/intermediate.h | 4 +- src/execute.cpp | 63 +++++++++++++++++++++---------- 4 files changed, 49 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 643c1de..b66ff8f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,7 +4,6 @@ on: push: branches: - main - - opt pull_request: branches: - main diff --git a/CMakeLists.txt b/CMakeLists.txt index 53ed07b..ab5f3ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,7 +71,7 @@ set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64") add_compile_options(-O3 -mcpu=apple-m1 -flto) else() - add_compile_options(-O3 -march=skylake -m64 -mcrc32 -fpermissive -flto) + add_compile_options(-O0 -march=skylake -m64 -mcrc32 -fpermissive) endif() add_link_options(-flto) @@ -88,7 +88,7 @@ endif() if 
(CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64") target_compile_options(unit_tests PRIVATE -O3 -mcpu=apple-m1 -flto) else() - target_compile_options(unit_tests PRIVATE -O3 -march=native -m64 -fpermissive -flto) + target_compile_options(unit_tests PRIVATE -O0 -march=skylake -m64 -fpermissive) endif() target_compile_definitions(unit_tests PRIVATE) diff --git a/include/data_model/intermediate.h b/include/data_model/intermediate.h index 0b723d0..e8cefe7 100644 --- a/include/data_model/intermediate.h +++ b/include/data_model/intermediate.h @@ -55,6 +55,7 @@ struct alignas(8) KeyRowPair { }; /** + * * @brief Column of (key, row_id) tuples for join key storage. * * Enables accelerated hashtable build (tuples match internal format) and @@ -62,7 +63,8 @@ struct alignas(8) KeyRowPair { * separate column_t for join key columns. * * Memory layout: 16KB pages containing 2048 KeyRowPair entries each. - */ + * + **/ struct key_row_column_t { static constexpr size_t PAGE_SIZE = 1 << 14; // 16KB static constexpr size_t PAIRS_PER_PAGE = diff --git a/src/execute.cpp b/src/execute.cpp index ab935d5..ce81a31 100644 --- a/src/execute.cpp +++ b/src/execute.cpp @@ -16,6 +16,7 @@ * * @see plan.h, match_collector.h, materialize.h, construct_intermediate.h */ +#include "data_model/plan.h" #include #include #include @@ -350,6 +351,7 @@ JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, return IntermediateResult{}; } + /** * * @brief Prints the plan tree with metadata. @@ -358,35 +360,58 @@ JoinResult execute_impl(const AnalyzedPlan &plan, size_t node_idx, bool is_root, * @param queue that should contain the root node. 
* **/ -static std::function>)> -print_plan = [](const Plan& plan, std::queue> q) { +static std::function>, int)> +print_plan = [](const Plan& plan, std::queue> q, int table_id) { if (q.empty()) return; int initial_size = q.size(); for (int i = 0; i < initial_size; i++) { - auto [parent_idx, node_idx] = q.front(); + auto [node_idx, parent_attr] = q.front(); q.pop(); const auto& node = plan.nodes[node_idx]; - std::cout << "parent: " << parent_idx << ", node: "<< node_idx << " size: " - << node.output_attrs.size() << " pairs: { "; + if (std::holds_alternative(node.data)) { + continue; + } + const auto data = std::get(node.data); + + std::cout << " node: "<< node_idx << " size: " + << node.output_attrs.size() << std::endl; + + bool match_left = false; + bool match_right = false; for (int i = 0; i < node.output_attrs.size(); i++) { auto [col, type] = node.output_attrs[i]; - if (DataType::INT32 == type) - std::cout << "(" << col << ", INT32)-"; - else - std::cout << "(" << col << ", STR)-"; - } - if (const auto* join = std::get_if(&node.data)) { - std::cout << "left_key: " << join->left_attr; - std::cout << " right_key: " << join->right_attr; - q.emplace(node_idx ,join->left); - q.emplace(node_idx, join->right); + if (node_idx != plan.root) { + if (i == parent_attr) std::cout << "build->"; + else std::cout << "defer->"; + } + if (col < plan.nodes[data.left].output_attrs.size()) { + std::cout << "left->"; + match_left = true; + } else { + std::cout << "right->"; + match_right = true; + } + if (DataType::INT32 == type) std::cout << "(" << col << ", INT32)"; + else std::cout << "(" << col << ", STR)"; + std::cout << std::endl; + } - std::cout << "}\n"; + std::cout << "===="; + if (match_left && match_right) std::cout << "Match both"; + else if (match_left) std::cout << "Match left"; + else std::cout << "Match right"; + std::cout << "====" << std::endl; + + std::cout << "left_key: " << data.left_attr << " left child: " << data.left; + std::cout << "\nright_key: " << 
data.right_attr << " right child: " << data.right; + q.emplace(data.left, data.left_attr); + q.emplace(data.right, data.right_attr); + std::cout << "\n\n\n\n\n"; } - std::cout << std::endl << std::endl << std::endl << std::endl ; print_plan(plan, std::move(q)); }; + /** * @brief Public entry point: execute plan from root, return ColumnarTable. * @param plan Query plan with nodes and base tables. @@ -416,8 +441,8 @@ ColumnarTable execute(const Plan &plan, void *context, TimingStats *stats_out, auto result = execute_impl(analyzed_plan, plan.root, true, stats); ColumnarTable final_result = std::get(std::move(result)); */ - std::queue> q; - q.emplace(0, plan.root); + std::queue> q; + q.emplace(plan.root, 0); print_plan(plan, q); auto total_end = std::chrono::high_resolution_clock::now();